In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# load the draftee dataset from disk
nba_draftees = pd.read_csv('../data/nba_draftees.csv')

# list every column name, one per line
print(*nba_draftees.columns, sep='\n')

# preview the first five rows
nba_draftees.head()

pro_id
cbb_id
name
classification
draft_season
assist_percentage
assists
block_percentage
blocks
conference
defensive_rebounds
defensive_win_shares
effective_field_goal_percentage
field_goal_attempts
field_goal_percentage
field_goals
free_throw_attempt_rate
free_throw_attempts
free_throw_percentage
free_throws
games_played
games_started
minutes_played
offensive_rebounds
offensive_win_shares
personal_fouls
points
position
steals
team_abbreviation
three_point_attempt_rate
three_point_attempts
three_point_percentage
three_pointers
total_rebound_percentage
total_rebounds
true_shooting_percentage
turnover_percentage
turnovers
two_point_attempts
two_point_percentage
two_pointers
usage_percentage
win_shares
win_shares_per_40_minutes
num_seasons
lane_agility
three_quarter_sprint
standing_vertical
max_vertical
bench_press
body_fat
height_shoes
standing_reach
weight
wingspan


Unnamed: 0,pro_id,cbb_id,name,classification,draft_season,assist_percentage,assists,block_percentage,blocks,conference,...,lane_agility,three_quarter_sprint,standing_vertical,max_vertical,bench_press,body_fat,height_shoes,standing_reach,weight,wingspan
0,crawfja01,jamal-crawford-1,Jamal Crawford,Starter,2000,22.589956,76.0,1.235886,16.0,big-ten,...,11.109571,3.214851,30.461181,36.434322,0.0,6.339076,76.125,102.5,175.0,82.0
1,claxtsp01,speedy-claxton-1,Speedy Claxton,Starter,2000,22.589956,177.680432,1.235886,8.389454,america-east,...,10.48,3.06,36.0,42.5,6.0,6.339076,76.125,94.5,166.0,72.0
2,harvedo01,donnell-harvey-1,Donnell Harvey,Role Player,2000,8.553282,37.0,7.231304,31.0,sec,...,11.23,3.379818,33.0,32.5,15.0,8.84902,83.25,105.5,220.0,84.5
3,madsema01,mark-madsen-1,Mark Madsen,Bust,2000,11.063979,21.080449,3.982195,21.0,pac-10,...,12.12,3.46,30.5,33.5,13.0,7.669869,80.883562,104.5,236.5,84.5
4,langhda01,dan-langhi-1,Dan Langhi,Exclude,2000,11.063979,27.106623,3.982195,8.368212,sec,...,10.85,3.24,31.0,34.5,12.0,7.669869,80.883562,104.0,197.5,80.0


In [3]:
# partition draftees by draft year: classes after 2014 are set aside for
# validation, 2014 and earlier feed the model
validation_data = (
    nba_draftees
    .loc[nba_draftees['draft_season'] > 2014]
    .reset_index(drop=True)
)

# modelling set: remove rows classified as 'Exclude' (players who did not meet
# the requirements for model data), renumber the rows, then discard draft_season
player_data = (
    nba_draftees
    .loc[nba_draftees['draft_season'] <= 2014]
    .loc[lambda frame: frame['classification'] != 'Exclude']
    .reset_index(drop=True)
    .drop(columns=['draft_season'])
)

In [4]:
# create dummy values for position, conference.
# dtype is pinned to uint8 (the pre-2.0 pandas default) because pandas >= 2.0
# emits bool dummies, and bool columns are NOT matched by
# select_dtypes(include=[np.number]) below -- without the pin every
# position/conference feature would silently vanish from model_data.
player_data = pd.get_dummies(
    player_data,
    columns=['position', 'conference'],
    dtype=np.uint8,
)

# create target variable (the classification label of each player)
target = player_data['classification']

# subset remaining columns with numeric values; this is the feature matrix and
# includes the dummy columns created above, while non-numeric columns stay
# behind in player_data only
model_data = player_data.select_dtypes(include=[np.number])

In [5]:
from sklearn.model_selection import train_test_split

# partition features and labels into train/test subsets; the fixed seed keeps
# the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    model_data,
    target,
    random_state=23,
)

In [6]:
from sklearn.ensemble import RandomForestClassifier

# baseline random forest: 100 trees, fixed seed, every other setting left at
# its default; fit() returns the estimator itself, so the calls can be chained
rf_prelim = RandomForestClassifier(n_estimators=100, random_state=23).fit(X_train, y_train)

# mean accuracy on the held-out test set
rf_prelim.score(X_test, y_test)

0.5211267605633803

In [7]:
# columns of player_data that were not selected into model_data
info_cols = player_data.columns.difference(model_data.columns)
player_info = player_data.loc[:, info_cols]

# repeat the split with the identical seed so these rows line up with the
# train/test partition of the feature matrix
train_info, test_info, train_class, test_class = train_test_split(
    player_info,
    target,
    random_state=23,
)

In [8]:
# one row per test player: (name, class-probability vector, predicted label)
test_names = test_info['name']
class_probabilities = rf_prelim.predict_proba(X_test)
predicted_labels = rf_prelim.predict(X_test)

test_raw = pd.DataFrame(zip(test_names, class_probabilities, predicted_labels))

In [9]:
# unpack test_raw df
# test_raw columns: 0 = player name, 1 = class-probability vector from
# predict_proba, 2 = label returned by predict (i.e. the PREDICTED class).
player = test_raw[0].tolist()
probabilities = test_raw[1].tolist()

# predict_proba orders its columns like rf_prelim.classes_ (sorted labels).
# The positions below assume that order is All-Star, Bust, Role Player, Starter
# -- TODO confirm against rf_prelim.classes_ if the label set ever changes.
asg = [p[0] for p in probabilities]
bust = [p[1] for p in probabilities]
rp = [p[2] for p in probabilities]
st = [p[3] for p in probabilities]

# Model output vs. ground truth. The third column of test_raw holds
# predictions, so it must not be exported as the "true" class; the real labels
# come from test_class, which was split with the same seed as test_info and is
# therefore in the same row order as test_raw.
pClass = test_raw[2].tolist()
tClass = list(test_class)

if len(tClass) != len(player):
    raise ValueError(
        'test_class and test_raw have different lengths; '
        're-run the train/test split cells before exporting'
    )

test_predictions = pd.DataFrame({'Name': player,
                                 'All-Star %': asg,
                                 'Starter %': st,
                                 'Role Player %': rp,
                                 'Bust %': bust,
                                 'True Classification': tClass,
                                 'Predicted Classification': pClass})

test_predictions.to_csv('../data/rf_prelim.csv', index = False)