In [1]:
# import dependencies
from sklearn import tree
import pandas as pd
import numpy as np
import os

In [2]:
# load the combined dataset from disk
final_full = pd.read_csv("../data/final_full.csv")

# list every column name, one per line
print(*final_full.columns, sep="\n")

# keep only the rows that have a conference value, then preview the result
has_conference = final_full['conference'].notna()
final_full = final_full.loc[has_conference]

final_full.head()

year
player
affiliation
overall_pick
position
lane_agility
shuttle_run
sprint
standing_leap
max_leap
bench_press
body_fat
hand_length
hand_width
height_no_shoes
height_shoes
reach
weight
wingspan
assist_percentage
assists
block_percentage
blocks
box_plus_minus
conference
defensive_rebound_percentage
defensive_rebounds
effective_field_goal_percentage
field_goal_attempts
field_goals
free_throw_attempt_rate
free_throw_attempt
free_throw_percentage
free_throws
minutes_played
offensive_rebound_percentage
offensive_rebounds
personal_fouls
points
steal_percentage
steals
three_point_attempt_rate
three_point_attempts
three_point_percentage
three_pointers
total_rebound_percentage
total_rebounds
true_shooting_percentage
turnover_percentage
turnovers
two_point_attempts
two_point_percentage
two_pointers
usage_percentage
win_shares
player_id
classification


Unnamed: 0,year,player,affiliation,overall_pick,position,lane_agility,shuttle_run,sprint,standing_leap,max_leap,...,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,player_id,classification
0,2000,A.J. Guyton,Indiana,32,G,10.55,3.04,3.22,33.0,37.5,...,0.572,12.2,69.0,272,0.485,132,,4.3,aj-guyton-1,Exclude
1,2007,Aaron Brooks,Oregon,26,G,10.57,3.04,3.2,32.5,39.5,...,0.589,14.4,89.0,265,0.502,133,24.4,5.7,aaron-brooks-1,Role Player
2,2014,Aaron Gordon,Arizona,4,F,10.81,2.76,3.27,32.5,39.0,...,0.503,10.5,55.0,337,0.513,173,23.2,5.4,aaron-gordon-1,Starter
3,2007,Aaron Gray,Pittsburgh,49,C,12.07,3.1,3.7,27.0,32.5,...,0.567,11.1,55.0,361,0.565,204,,6.8,aaron-gray-1,Bust
4,2018,Aaron Holiday,California-Los Angeles,23,G,10.96,3.22,3.27,25.5,33.0,...,0.609,18.5,125.0,255,0.486,124,26.7,4.9,aaron-holiday-1,Exclude


In [3]:
# Split the rows into a historical set (year <= 2014) and a future set
# (year > 2014). Rows classified as 'Exclude' are removed from the historical
# set only; the future set keeps every row.
limit_plyrs = final_full.loc[final_full['classification'] != 'Exclude']

# Build the year mask from limit_plyrs itself so the mask and the frame being
# indexed share the same index (the mask previously came from final_full and
# only worked because pandas silently re-aligned it).
past_yrs = limit_plyrs.loc[limit_plyrs['year'] <= 2014]

future_yrs = final_full.loc[final_full['year'] > 2014]

In [4]:
# Build the model-ready feature tables.
#
# Produces:
#   data_clean    - final_full without the unused columns, with position and
#                   conference one-hot encoded
#   past_clean    - imputed features for year <= 2014, 'Exclude' rows removed
#   target        - classification labels for the rows of past_clean
#   feature_names - column labels shared by past_clean and future_clean
#   future_clean  - imputed features for year > 2014
#   imputer       - mean imputer fitted on the historical features

# columns that are not used as model features
unused_columns = [
    'player', 'affiliation', 'overall_pick', 'player_id',
    'defensive_rebounds', 'defensive_rebound_percentage', 'offensive_rebounds', 'offensive_rebound_percentage',
    'effective_field_goal_percentage', 'field_goal_attempts', 'field_goals', 'free_throw_attempt_rate',
    'free_throws', 'steal_percentage', 'three_point_attempt_rate', 'three_pointers', 'true_shooting_percentage',
    'two_pointers', 'shuttle_run', 'hand_length', 'hand_width', 'height_no_shoes', 'standing_leap',
    'weight', 'reach', 'bench_press', 'points', 'assist_percentage', 'usage_percentage', 'box_plus_minus',
]
data_clean = final_full.drop(columns=unused_columns)

# one-hot encode the categorical columns (position dummies first, then
# conference dummies, matching the order of two separate get_dummies calls)
data_clean = pd.get_dummies(data_clean, columns=['position', 'conference'])

# limit past data to '00-'14, filter players who didn't meet min playing time
past_clean = data_clean.loc[data_clean['year'] <= 2014]
past_clean = past_clean.loc[past_clean['classification'] != 'Exclude']
target = past_clean['classification']

past_clean = past_clean.drop(['classification', 'year'], axis=1)
feature_names = past_clean.columns

# limit future data to drafts after '14
future_clean = data_clean.loc[data_clean['year'] > 2014]
future_clean = future_clean.drop(['classification', 'year'], axis=1)

# Replace NAs with column means.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer
# is its replacement and always works column-wise.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit the means on the historical data only and reuse them for the future
# data, so unseen rows are filled with the same values the model was trained
# against instead of statistics of the prediction set itself.
# NOTE(review): the means are still computed before the train/test split in
# the next cell, so held-out rows contribute to them - consider fitting on
# X_train only.
past_clean = pd.DataFrame(
    imputer.fit_transform(past_clean.astype(float)),
    columns=feature_names,
    index=past_clean.index,
)
future_clean = pd.DataFrame(
    imputer.transform(future_clean.astype(float)),
    columns=feature_names,
    index=future_clean.index,
)



In [5]:
from sklearn.model_selection import train_test_split

# Split the model features, the full (un-dropped) player rows and the labels
# in ONE call so that row i of X_test and row i of X_test_full are guaranteed
# to describe the same player. Previously this relied on two separate calls
# happening to use the same random_state.
X_train, X_test, X_train_full, X_test_full, y_train, y_test = train_test_split(
    past_clean, past_yrs, target, random_state=45
)

# kept for backward compatibility: the labels are identical for both views
y_train_full, y_test_full = y_train, y_test

In [6]:
# Fit a single decision tree and report its mean accuracy on the held-out split.
# random_state is fixed so the fitted tree (and the score below) is the same
# on every Restart & Run All.
clf = tree.DecisionTreeClassifier(random_state=45)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.47058823529411764

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 500-tree random forest and report its mean accuracy on the held-out
# split. random_state is fixed so the score, the feature importances and the
# exported probabilities are reproducible between runs.
rf = RandomForestClassifier(n_estimators=500, random_state=45)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.4588235294117647

In [8]:
# rank the features from most to least important according to the forest
importance_pairs = list(zip(rf.feature_importances_, feature_names))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.07131951346383882, 'steals'),
 (0.048471982005231944, 'two_point_attempts'),
 (0.04669240377251225, 'block_percentage'),
 (0.045442569349488655, 'assists'),
 (0.043938693485231245, 'three_point_percentage'),
 (0.041699670606208813, 'sprint'),
 (0.04119437631261427, 'free_throw_attempt'),
 (0.03955089018183935, 'total_rebounds'),
 (0.039168834777448626, 'turnovers'),
 (0.03903185497315265, 'two_point_percentage'),
 (0.03902224985943373, 'height_shoes'),
 (0.03841015358559751, 'body_fat'),
 (0.0383494487744865, 'max_leap'),
 (0.03804197541779123, 'personal_fouls'),
 (0.0378305768757826, 'win_shares'),
 (0.037315211363291474, 'minutes_played'),
 (0.0371163392942764, 'turnover_percentage'),
 (0.03698337551037071, 'lane_agility'),
 (0.03690790632074153, 'total_rebound_percentage'),
 (0.036652647919234664, 'free_throw_percentage'),
 (0.03632329192965204, 'wingspan'),
 (0.033822154158305126, 'blocks'),
 (0.03352945251465987, 'three_point_attempts'),
 (0.008357857845956834, 'conference_big

In [9]:
# Disabled: the same importance ranking as the random forest cell, but for the
# single decision tree `clf`. Uncomment to compare the two models.
# sorted(zip(clf.feature_importances_, feature_names), reverse=True)

In [10]:
# Pair each player name with the forest's class probabilities and predicted
# label. Resulting columns: 0 = player, 1 = probability array, 2 = label.
test_proba = rf.predict_proba(X_test)
test_labels = rf.predict(X_test)
test = pd.DataFrame(list(zip(X_test_full['player'], test_proba, test_labels)))

future_proba = rf.predict_proba(future_clean)
future_labels = rf.predict(future_clean)
future = pd.DataFrame(list(zip(future_yrs['player'], future_proba, future_labels)))

In [11]:
# Export per-class probabilities for the held-out players, alongside their
# original (un-dropped) columns, to ../data/test_data_predictions.csv.

# `test` column 0 holds the player name, column 1 the per-class probability
# array for that player; stack the arrays into an (n_players, n_classes) matrix.
test_proba_matrix = np.vstack(test[1].tolist())

# NOTE(review): the column positions below assume the fitted classes are
# ordered [all-star, bust, role player, starter] - confirm against rf.classes_.
test_preds = pd.DataFrame({"Player": test[0].values,
                           "All-Star %": test_proba_matrix[:, 0],
                           "Starter %": test_proba_matrix[:, 3],
                           "Role Player %": test_proba_matrix[:, 2],
                           "Bust %": test_proba_matrix[:, 1]})

# `test` was built row-by-row from X_test_full, so the two are already aligned
# by position. Attach the remaining columns positionally instead of merging on
# the player name: a name-based outer merge duplicates and mis-pairs rows
# whenever two players share a name.
merge_cols = X_test_full.columns.difference(test_preds.columns)
test_preds = pd.concat([test_preds, X_test_full[merge_cols].reset_index(drop=True)], axis=1)

# keep the name-sorted row order that the previous outer merge produced
test_preds = test_preds.sort_values('Player', kind='mergesort').reset_index(drop=True)
test_preds.to_csv('../data/test_data_predictions.csv', index = False)

In [12]:
# Export per-class probabilities for the post-2014 players, alongside their
# original (un-dropped) columns, to ../data/future_data_predictions.csv.

# `future` column 0 holds the player name, column 1 the per-class probability
# array for that player; stack the arrays into an (n_players, n_classes) matrix.
future_proba_matrix = np.vstack(future[1].tolist())

# NOTE(review): the column positions below assume the fitted classes are
# ordered [all-star, bust, role player, starter] - confirm against rf.classes_.
future_preds = pd.DataFrame({"Player": future[0].values,
                             "All-Star %": future_proba_matrix[:, 0],
                             "Starter %": future_proba_matrix[:, 3],
                             "Role Player %": future_proba_matrix[:, 2],
                             "Bust %": future_proba_matrix[:, 1]})

# `future` was built row-by-row from future_yrs, so the two are already aligned
# by position. Attach the remaining columns positionally instead of merging on
# the player name: a name-based outer merge duplicates and mis-pairs rows
# whenever two players share a name.
merge_cols = future_yrs.columns.difference(future_preds.columns)
future_preds = pd.concat([future_preds, future_yrs[merge_cols].reset_index(drop=True)], axis=1)

# keep the name-sorted row order that the previous outer merge produced
future_preds = future_preds.sort_values('Player', kind='mergesort').reset_index(drop=True)
future_preds.to_csv('../data/future_data_predictions.csv', index = False)