In [1]:
# import dependencies
from sklearn import tree
import pandas as pd
import numpy as np
import os

In [2]:
# read in dataset
final_full = pd.read_csv("../data/final_full.csv")

# print columns
for column in final_full.columns:
    print(column)

# preview dataset
final_full = final_full.dropna(subset = ['conference'])

final_full.head()

year
player
affiliation
overall_pick
position
lane_agility
shuttle_run
sprint
standing_leap
max_leap
bench_press
body_fat
hand_length
hand_width
height_no_shoes
height_shoes
reach
weight
wingspan
assist_percentage
assists
block_percentage
blocks
box_plus_minus
conference
defensive_rebound_percentage
defensive_rebounds
effective_field_goal_percentage
field_goal_attempts
field_goals
free_throw_attempt_rate
free_throw_attempt
free_throw_percentage
free_throws
minutes_played
offensive_rebound_percentage
offensive_rebounds
personal_fouls
points
steal_percentage
steals
three_point_attempt_rate
three_point_attempts
three_point_percentage
three_pointers
total_rebound_percentage
total_rebounds
true_shooting_percentage
turnover_percentage
turnovers
two_point_attempts
two_point_percentage
two_pointers
usage_percentage
win_shares
player_id
classification


Unnamed: 0,year,player,affiliation,overall_pick,position,lane_agility,shuttle_run,sprint,standing_leap,max_leap,...,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,win_shares,player_id,classification
0,2000,A.J. Guyton,Indiana,32,G,10.55,3.04,3.22,33.0,37.5,...,0.572,12.2,69.0,272,0.485,132,,4.3,aj-guyton-1,Exclude
1,2007,Aaron Brooks,Oregon,26,G,10.57,3.04,3.2,32.5,39.5,...,0.589,14.4,89.0,265,0.502,133,24.4,5.7,aaron-brooks-1,Role Player
2,2014,Aaron Gordon,Arizona,4,F,10.81,2.76,3.27,32.5,39.0,...,0.503,10.5,55.0,337,0.513,173,23.2,5.4,aaron-gordon-1,Starter
3,2007,Aaron Gray,Pittsburgh,49,C,12.07,3.1,3.7,27.0,32.5,...,0.567,11.1,55.0,361,0.565,204,,6.8,aaron-gray-1,Bust
4,2018,Aaron Holiday,California-Los Angeles,23,G,10.96,3.22,3.27,25.5,33.0,...,0.609,18.5,125.0,255,0.486,124,26.7,4.9,aaron-holiday-1,Exclude


In [3]:
# remove players who didn't meet playing time qualifications
limit_plyrs = final_full.loc[final_full['classification'] != 'Exclude']

# subset data from '00-'14 and '15 and on
future_yrs = limit_plyrs.loc[final_full['year'] > 2014]
past_yrs = limit_plyrs.loc[final_full['year'] <= 2014]

# check distribution of classifications
# print(limit_yrs['classification'].value_counts())
# print(future_yrs['classification'].value_counts())

In [4]:
# drop unnecessary columns
data_clean = limit_plyrs.drop(['year', 'player', 'affiliation', 'overall_pick', 'player_id',
                            'defensive_rebounds', 'defensive_rebound_percentage', 'offensive_rebounds', 'offensive_rebound_percentage',
                            'effective_field_goal_percentage', 'field_goal_attempts', 'field_goals', 'free_throw_attempt_rate',
                            'free_throws', 'steal_percentage', 'three_point_attempt_rate', 'three_pointers', 'true_shooting_percentage',
                            'two_pointers', 'shuttle_run', 'hand_length', 'hand_width', 'height_no_shoes', 'standing_leap',
                            'weight', 'reach', 'bench_press', 'points', 'assist_percentage', 'usage_percentage', 'box_plus_minus'], axis = 1)

# one-hot encode position
data_clean = pd.get_dummies(data_clean, columns=['position']) 
data_clean = pd.get_dummies(data_clean, columns=['conference'])
    
# limit data to '00-'14
past_clean = data_clean.loc[final_full['year'] <= 2014]
target = past_clean['classification']

past_clean = past_clean.drop(['classification'], axis = 1)
feature_names = past_clean.columns

# replace NAs with imputed values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)

for column in past_clean.columns:
    past_clean[[column]] = imputer.fit_transform(past_clean[[column]])

future_clean = data_clean.loc[final_full['year'] > 2014]
future_clean = future_clean.drop(['classification'], axis = 1)

for column in future_clean.columns:
    future_clean[[column]] = imputer.fit_transform(future_clean[[column]])

past_clean.head()



Unnamed: 0,lane_agility,sprint,max_leap,body_fat,height_shoes,wingspan,assists,block_percentage,blocks,free_throw_attempt,...,conference_ovc,conference_pac-10,conference_pac-12,conference_patriot,conference_sec,conference_southern,conference_summit,conference_sun-belt,conference_wac,conference_wcc
1,10.57,3.2,39.5,2.7,71.75,76.0,149.0,0.3,6.0,136.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.81,3.27,39.0,5.05,80.75,83.75,75.0,3.4,39.0,180.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12.07,3.7,32.5,10.8,85.75,87.25,61.0,2.863445,62.0,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,12.15,3.37,35.5,9.1,81.75,84.75,82.0,2.863445,70.0,202.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
10,11.96,3.16,41.0,5.8,79.0,85.0,24.0,2.7,40.0,210.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(past_clean, target, random_state = 45)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(past_yrs, target, random_state = 45)

In [6]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.43529411764705883

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 500)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.5058823529411764

In [8]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.0712122902733024, 'steals'),
 (0.04992568665471411, 'two_point_attempts'),
 (0.046395589435570365, 'assists'),
 (0.04464067728042981, 'block_percentage'),
 (0.044367161955396216, 'three_point_percentage'),
 (0.04420596209301201, 'sprint'),
 (0.040817254418358116, 'total_rebounds'),
 (0.040732247574375685, 'wingspan'),
 (0.04047954164627938, 'win_shares'),
 (0.03947772435145178, 'free_throw_attempt'),
 (0.03878741323857551, 'minutes_played'),
 (0.038710666615973395, 'turnover_percentage'),
 (0.03856878112273869, 'turnovers'),
 (0.03788572650927737, 'two_point_percentage'),
 (0.03785665993413127, 'max_leap'),
 (0.037732608879367406, 'total_rebound_percentage'),
 (0.03671571411429038, 'free_throw_percentage'),
 (0.03670195729688029, 'height_shoes'),
 (0.036156186500632605, 'lane_agility'),
 (0.03598164914059827, 'personal_fouls'),
 (0.03477794123353377, 'body_fat'),
 (0.034137458269499044, 'three_point_attempts'),
 (0.03197811065496771, 'blocks'),
 (0.008922277953590171, 'conference_b

In [9]:
# sorted(zip(clf.feature_importances_, feature_names), reverse=True)

In [10]:
test = pd.DataFrame(zip(X_test_full['player'], rf.predict_proba(X_test), rf.predict(X_test)))
future = pd.DataFrame(zip(future_yrs['player'], rf.predict_proba(future_clean), rf.predict(future_clean)))

print(test[2].value_counts())
print('')
print(future[2].value_counts())

Role Player    49
Starter        36
Name: 2, dtype: int64

Starter        25
Role Player    15
Name: 2, dtype: int64


In [11]:
player = []
bust = []
rp = []
st = []
asg = []
for index, row in test.iterrows():
    player.append(row[0])
    asg.append(row[1][0])
    st.append(row[1][3])
    rp.append(row[1][2])
    bust.append(row[1][1])

asg_df = pd.DataFrame({"Player": player,
                      "All-Star %": asg,
                      "Starter %": st,
                      "Role Player %": rp,
                      "Bust %": bust})
asg_df.sort_values("Bust %", ascending = False)

Unnamed: 0,Player,All-Star %,Starter %,Role Player %,Bust %
26,Miles Plumlee,0.036,0.372,0.346,0.246
35,Meyers Leonard,0.024,0.404,0.362,0.210
58,Willie Green,0.042,0.244,0.538,0.176
28,David Harrison,0.054,0.152,0.636,0.158
1,Spencer Hawes,0.010,0.354,0.478,0.158
...,...,...,...,...,...
3,Draymond Green,0.096,0.482,0.400,0.022
47,James Johnson,0.014,0.438,0.528,0.020
79,Delonte West,0.078,0.728,0.178,0.016
78,Quincy Pondexter,0.082,0.304,0.600,0.014


In [12]:
player = []
bust = []
rp = []
st = []
asg = []
for index, row in future.iterrows():
    player.append(row[0])
    asg.append(row[1][0])
    st.append(row[1][3])
    rp.append(row[1][2])
    bust.append(row[1][1])

asg_df = pd.DataFrame({"Player": player,
                      "All-Star %": asg,
                      "Starter %": st,
                      "Role Player %": rp,
                      "Bust %": bust})
asg_df.sort_values("Bust %", ascending = False)

Unnamed: 0,Player,All-Star %,Starter %,Role Player %,Bust %
27,Myles Turner,0.122,0.336,0.324,0.218
3,Cheick Diallo,0.076,0.474,0.294,0.156
19,Kelly Oubre Jr.,0.016,0.292,0.548,0.144
2,Buddy Hield,0.086,0.398,0.386,0.13
30,Pat Connaughton,0.044,0.47,0.36,0.126
6,Devin Booker,0.022,0.422,0.44,0.116
33,Sam Dekker,0.052,0.4,0.432,0.116
23,Malcolm Brogdon,0.064,0.26,0.564,0.112
16,Justin Anderson,0.02,0.516,0.354,0.11
0,Bobby Portis,0.026,0.476,0.406,0.092
