In [1]:
import pandas as pd

In [2]:
# Read the four input tables in one pass; the order of the paths matches the
# order of the names being assigned.
nba, ncaa, combine, id_map = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/nba_stats_classified.csv',
        '../data/ncaa_summary.csv',
        '../data/combine_stats.csv',
        '../data/id_map.csv',
    )
)

In [3]:
# Keep only the two identifier columns from the id map, and one row per
# distinct (player_id, name, classification) combination from the NBA table.
id_map_ = id_map.loc[:, ['pro_id', 'cbb_id']]
nba_ = nba.loc[:, ['player_id', 'name', 'classification']].drop_duplicates()

# Inner join: only ids present in both tables survive. 'player_id' repeats
# 'pro_id' after the join, so it is removed.
map_nba = (
    id_map_
    .merge(nba_, how='inner', left_on='pro_id', right_on='player_id')
    .drop(columns='player_id')
)

map_nba.head()

Unnamed: 0,pro_id,cbb_id,name,classification
0,martike01,kenyon-martin-1,Kenyon Martin,Starter
1,swiftst01,stromile-swift-1,Stromile Swift,Role Player
2,milesda01,,Darius Miles,Role Player
3,fizerma01,marcus-fizer-1,Marcus Fizer,Role Player
4,millemi01,mike-miller-1,Mike Miller,Starter


In [4]:
# Attach the NCAA summary rows to the mapped NBA players. The inner join drops
# any player whose 'cbb_id' has no matching 'player_id' in the NCAA table.
ncaa_to_nba = (
    map_nba
    .merge(ncaa, how='inner', left_on='cbb_id', right_on='player_id')
    .drop(columns='player_id')
    # add one to 'season' in place (column position is kept) ...
    .assign(season=lambda frame: frame['season'] + 1)
    # ... and expose the shifted value under the name 'draft_season'
    .rename(columns={'season': 'draft_season'})
)

ncaa_to_nba.head()

Unnamed: 0,pro_id,cbb_id,name,classification,draft_season,assist_percentage,assists,block_percentage,blocks,box_plus_minus,...,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,win_shares_per_40_minutes,num_seasons
0,martike01,kenyon-martin-1,Kenyon Martin,Starter,2000,,44.317881,,97.397351,,...,0.592391,13.11457,59.642384,337.298013,0.573662,193.503311,,8.271523,0.366543,4
1,swiftst01,stromile-swift-1,Stromile Swift,Role Player,2000,,28.327079,,86.837953,,...,0.601466,16.787548,75.37484,287.752665,0.606385,179.642644,,7.438934,0.300672,2
2,fizerma01,marcus-fizer-1,Marcus Fizer,Role Player,2000,,39.048448,,34.53931,,...,0.59489,10.562431,74.490862,484.872063,0.562921,276.314476,,8.178155,0.274545,3
3,millemi01,mike-miller-1,Mike Miller,Starter,2000,,83.001074,,13.060866,,...,0.584151,15.081633,70.515217,208.881489,0.562878,117.334765,,4.81826,0.197698,2
4,johnsde03,dermarr-johnson-1,DerMarr Johnson,Role Player,2000,,45.0,,30.0,,...,0.594,12.0,46.0,153.0,0.575,88.0,,4.8,0.218,1


In [5]:
# Work on a trimmed copy (same trailing-underscore convention as `id_map_` and
# `nba_`) instead of dropping 'position' from `combine` in place. The in-place
# drop mutated the frame loaded from disk and made this cell fail with a
# KeyError when run a second time, because 'position' was already gone.
combine_ = combine.drop(columns=['position'])

# Match combine measurements to players on name plus draft year; the inner
# join keeps only players found in both tables.
combined_data = pd.merge(
    ncaa_to_nba,
    combine_,
    left_on=['name', 'draft_season'],
    right_on=['player', 'season'],
    how='inner',
)

# 'player' and 'season' repeat the left-hand join keys after the merge.
combined_data = combined_data.drop(columns=['player', 'season'])

combined_data.head()

Unnamed: 0,pro_id,cbb_id,name,classification,draft_season,assist_percentage,assists,block_percentage,blocks,box_plus_minus,...,standing_vertical,max_vertical,bench_press,body_fat,hand_length,hand_width,height_shoes,standing_reach,weight,wingspan
0,crawfja01,jamal-crawford-1,Jamal Crawford,Starter,2000,,76.0,,16.0,,...,,,0.0,,,,,102.5,175.0,82.0
1,claxtsp01,speedy-claxton-1,Speedy Claxton,Starter,2000,,177.680432,,8.389454,,...,36.0,42.5,6.0,,,,,94.5,166.0,72.0
2,harvedo01,donnell-harvey-1,Donnell Harvey,Role Player,2000,,37.0,,31.0,,...,33.0,32.5,15.0,,,,,105.5,220.0,84.5
3,madsema01,mark-madsen-1,Mark Madsen,Bust,2000,,21.080449,,21.0,,...,30.5,33.5,13.0,,,,,104.5,236.5,84.5
4,langhda01,dan-langhi-1,Dan Langhi,Exclude,2000,,27.106623,,8.368212,,...,31.0,34.5,12.0,,,,,104.0,197.5,80.0


In [6]:
# Number of rows per classification label in the merged table.
combined_data.loc[:, 'classification'].value_counts()

Exclude        288
Role Player    163
Starter        104
All-Star        19
Bust            17
Name: classification, dtype: int64

In [7]:
# Count the missing values of every column once and report them.
num_players = len(combined_data)
missing_counts = combined_data.isna().sum()

for column, n_missing in missing_counts.items():
    print(f'{column}: {n_missing} missing values')

# Columns where at least 25% of the rows are missing get removed.
bad_columns = [
    column
    for column, n_missing in missing_counts.items()
    if n_missing >= 0.25 * num_players
]

combined_data = combined_data.drop(columns=bad_columns)

combined_data.head()

pro_id: 0 missing values
cbb_id: 0 missing values
name: 0 missing values
classification: 0 missing values
draft_season: 0 missing values
assist_percentage: 87 missing values
assists: 0 missing values
block_percentage: 87 missing values
blocks: 0 missing values
box_plus_minus: 300 missing values
conference: 0 missing values
defensive_box_plus_minus: 300 missing values
defensive_rebound_percentage: 260 missing values
defensive_rebounds: 15 missing values
defensive_win_shares: 0 missing values
effective_field_goal_percentage: 0 missing values
field_goal_attempts: 0 missing values
field_goal_percentage: 0 missing values
field_goals: 0 missing values
free_throw_attempt_rate: 0 missing values
free_throw_attempts: 0 missing values
free_throw_percentage: 0 missing values
free_throws: 0 missing values
games_played: 0 missing values
games_started: 14 missing values
minutes_played: 0 missing values
offensive_box_plus_minus: 300 missing values
offensive_rebound_percentage: 260 missing values
offen

Unnamed: 0,pro_id,cbb_id,name,classification,draft_season,assist_percentage,assists,block_percentage,blocks,conference,...,lane_agility,three_quarter_sprint,standing_vertical,max_vertical,bench_press,body_fat,height_shoes,standing_reach,weight,wingspan
0,crawfja01,jamal-crawford-1,Jamal Crawford,Starter,2000,,76.0,,16.0,big-ten,...,,,,,0.0,,,102.5,175.0,82.0
1,claxtsp01,speedy-claxton-1,Speedy Claxton,Starter,2000,,177.680432,,8.389454,america-east,...,10.48,3.06,36.0,42.5,6.0,,,94.5,166.0,72.0
2,harvedo01,donnell-harvey-1,Donnell Harvey,Role Player,2000,,37.0,,31.0,sec,...,11.23,,33.0,32.5,15.0,,,105.5,220.0,84.5
3,madsema01,mark-madsen-1,Mark Madsen,Bust,2000,,21.080449,,21.0,pac-12,...,12.12,3.46,30.5,33.5,13.0,,,104.5,236.5,84.5
4,langhda01,dan-langhi-1,Dan Langhi,Exclude,2000,,27.106623,,8.368212,sec,...,10.85,3.24,31.0,34.5,12.0,,,104.0,197.5,80.0


In [8]:
# replace NAs with mean within player position
#
# Only numeric columns can be averaged, so they are selected explicitly rather
# than trying every column and swallowing whatever exception comes back. The
# previous bare `except` also hid real problems: if the 'position' column were
# absent, no column would have been imputed and nothing would have said so.
# Now a missing 'position' column raises a KeyError from `groupby`.
numeric_columns = combined_data.select_dtypes(include='number').columns
columns_to_fill = [
    column
    for column in numeric_columns
    if column != 'position' and combined_data[column].isna().any()
]

if columns_to_fill:
    # Per-row mean of each column within that row's position group. Rows
    # whose group has no observed value for a column stay NaN.
    position_means = combined_data.groupby('position')[columns_to_fill].transform('mean')

    # Assign the filled columns back to the frame. Calling
    # `fillna(..., inplace=True)` on `combined_data[column]` acts on a column
    # selection, which newer pandas versions warn about and which does not
    # update the frame when Copy-on-Write is enabled.
    combined_data[columns_to_fill] = combined_data[columns_to_fill].fillna(position_means)

In [9]:
# Write the merged table to disk; index=False leaves the row index out of the file.
combined_data.to_csv('../data/nba_draftees.csv', index=False)