In [1]:
# import dependencies
from sklearn import tree
import pandas as pd
import numpy as np
import os

In [2]:
# read in dataset
final_full = pd.read_csv("../data/final_full.csv")

# print columns
for column in final_full.columns:
    print(column)

# preview dataset
final_full = final_full.dropna(subset = ['conference'])

final_full.head()

year
player
affiliation
overall_pick
position
lane_agility
shuttle_run
sprint
standing_leap
max_leap
bench_press
body_fat
hand_length
hand_width
height_no_shoes
height_shoes
reach
weight
wingspan
assist_percentage
assists
block_percentage
blocks
box_plus_minus
conference
defensive_rebound_percentage
defensive_rebounds
effective_field_goal_percentage
field_goal_attempts
field_goals
free_throw_attempt_rate
free_throw_attempt
free_throw_percentage
free_throws
minutes_played
offensive_rebound_percentage
offensive_rebounds
personal_fouls
points
steal_percentage
steals
three_point_attempt_rate
three_point_attempts
three_point_percentage
three_pointers
total_rebound_percentage
total_rebounds
true_shooting_percentage
turnover_percentage
turnovers
two_point_attempts
two_point_percentage
two_pointers
usage_percentage
player_id
classification


Unnamed: 0,year,player,affiliation,overall_pick,position,lane_agility,shuttle_run,sprint,standing_leap,max_leap,...,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,player_id,classification
1,2000,A.J. Guyton,Indiana,32,G,10.55,3.04,3.22,33.0,37.5,...,90,0.572,12.2,69.0,272,0.485,132,,aj-guyton-1,Exclude
3,2007,Aaron Brooks,Oregon,26,G,10.57,3.04,3.2,32.5,39.5,...,149,0.589,14.4,89.0,265,0.502,133,24.4,aaron-brooks-1,Role Player
5,2014,Aaron Gordon,Arizona,4,F,10.81,2.76,3.27,32.5,39.0,...,303,0.503,10.5,55.0,337,0.513,173,23.2,aaron-gordon-1,Starter
7,2007,Aaron Gray,Pittsburgh,49,C,12.07,3.1,3.7,27.0,32.5,...,341,0.567,11.1,55.0,361,0.565,204,,aaron-gray-1,Bust
9,2018,Aaron Holiday,California-Los Angeles,23,G,10.96,3.22,3.27,25.5,33.0,...,121,0.609,18.5,125.0,255,0.486,124,26.7,aaron-holiday-1,Exclude


In [3]:
limit_plyrs = final_full.loc[final_full['classification'] != 'Exclude']

# limit data to '00-'14
limit_yrs = limit_plyrs.loc[final_full['year'] <= 2014]

# check distribution of classifications
limit_yrs['classification'].value_counts()

Role Player    156
Starter        150
Bust            32
All-Star        22
Name: classification, dtype: int64

In [4]:
target = limit_yrs['classification']

# drop unnecessary columns
data_clean = limit_yrs.drop(['year', 'player', 'affiliation', 'overall_pick', 'player_id', 'classification',
                            'defensive_rebounds', 'defensive_rebound_percentage', 'offensive_rebounds', 'offensive_rebound_percentage',
                            'effective_field_goal_percentage', 'field_goal_attempts', 'field_goals', 'free_throw_attempt_rate',
                            'free_throws', 'steal_percentage', 'three_point_attempt_rate', 'three_pointers', 'true_shooting_percentage',
                            'two_pointers', 'shuttle_run', 'hand_length', 'hand_width', 'height_no_shoes', 'standing_leap',
                            'weight', 'reach', 'bench_press', 'points', 'assist_percentage', 'usage_percentage'], axis = 1)

# one-hot encode position
data_clean = pd.get_dummies(data_clean, columns=['position']) 
data_clean = pd.get_dummies(data_clean, columns=['conference'])

# replace NAs with imputed values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)

for column in data_clean.columns:
    data_clean[[column]] = imputer.fit_transform(data_clean[[column]])
    
feature_names = data_clean.columns

data_clean.head()



Unnamed: 0,lane_agility,sprint,max_leap,body_fat,height_shoes,wingspan,assists,block_percentage,blocks,box_plus_minus,...,conference_ovc,conference_pac-10,conference_pac-12,conference_patriot,conference_sec,conference_southern,conference_summit,conference_sun-belt,conference_wac,conference_wcc
3,10.57,3.2,39.5,2.7,71.75,76.0,149.0,0.3,6.0,9.047368,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,10.81,3.27,39.0,5.05,80.75,83.75,75.0,3.4,39.0,9.6,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,12.07,3.7,32.5,10.8,85.75,87.25,61.0,2.863445,62.0,9.047368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,12.15,3.37,35.5,9.1,81.75,84.75,82.0,2.863445,70.0,9.047368,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
21,11.96,3.16,41.0,5.8,79.0,85.0,24.0,2.7,40.0,9.047368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_clean, target, random_state = 23)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(limit_yrs, target, random_state = 23)

In [6]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.43333333333333335

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 400)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.5444444444444444

In [8]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.07130458623702082, 'steals'),
 (0.05422731697892525, 'three_point_percentage'),
 (0.05123943595656077, 'assists'),
 (0.044818299181206, 'total_rebounds'),
 (0.04454911342381298, 'blocks'),
 (0.04314279554430592, 'minutes_played'),
 (0.042837360173091296, 'two_point_attempts'),
 (0.04141760278846474, 'turnovers'),
 (0.040751363709308666, 'free_throw_percentage'),
 (0.0404881563839969, 'two_point_percentage'),
 (0.03878801740939531, 'free_throw_attempt'),
 (0.03859458887906806, 'block_percentage'),
 (0.03824847085986802, 'lane_agility'),
 (0.03803098894314859, 'sprint'),
 (0.03720594029342145, 'three_point_attempts'),
 (0.036187241579422234, 'personal_fouls'),
 (0.03581291841152049, 'body_fat'),
 (0.03571245326041741, 'total_rebound_percentage'),
 (0.03569328410576364, 'turnover_percentage'),
 (0.034879784625896394, 'max_leap'),
 (0.034837483720436324, 'wingspan'),
 (0.031376559664424306, 'height_shoes'),
 (0.0224303184242133, 'box_plus_minus'),
 (0.01496195215652916, 'conference_big

In [9]:
sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.10450552746527797, 'three_point_percentage'),
 (0.09992423244169188, 'turnover_percentage'),
 (0.08955962171543617, 'steals'),
 (0.07579599769867978, 'body_fat'),
 (0.07575461906496962, 'turnovers'),
 (0.0731894307482756, 'two_point_percentage'),
 (0.05897727914462664, 'minutes_played'),
 (0.054319705026059796, 'total_rebounds'),
 (0.053290694020697964, 'personal_fouls'),
 (0.05184671157081571, 'free_throw_percentage'),
 (0.051542316919451936, 'assists'),
 (0.04879020654420862, 'two_point_attempts'),
 (0.041032762241208845, 'block_percentage'),
 (0.033591902407223755, 'blocks'),
 (0.01891835927940817, 'wingspan'),
 (0.01841575212243285, 'total_rebound_percentage'),
 (0.018381868493872713, 'conference_acc'),
 (0.011068651996310451, 'free_throw_attempt'),
 (0.009487415996837526, 'max_leap'),
 (0.007906179997364607, 'position_F'),
 (0.003700765105149401, 'lane_agility'),
 (0.0, 'three_point_attempts'),
 (0.0, 'sprint'),
 (0.0, 'position_G'),
 (0.0, 'position_C'),
 (0.0, 'height_shoes'

In [10]:
df = pd.DataFrame(zip(X_test_full['player'], rf.predict_proba(X_test), rf.predict(X_test)))
df2 = pd.DataFrame(zip(X_test_full['player'], clf.predict_proba(X_test), clf.predict(X_test)))

print(df[2].value_counts())
print(df2[2].value_counts())

Role Player    49
Starter        39
Bust            2
Name: 2, dtype: int64
Role Player    39
Starter        34
Bust           11
All-Star        6
Name: 2, dtype: int64


In [11]:
for index, row in df.iterrows():
    print(f'Player Name: {row[0]}')
    print(f'Model Prediction: {row[2]}')
    print(f'Probability of Bust: {row[1][1]}')
    print(f'Probability of Role Player: {row[1][2]}')
    print(f'Probability of Starter: {row[1][3]}')
    print(f'Probability of All-Star: {row[1][0]}')
    #print(f'Probability of Role Player or Better: {row[1][0] + row[1][2] + row[1][3]}')
    #print(f'Probability of Starter or Better: {row[1][0] + row[1][3]}')
    #print(f'Probability of All-Star: {row[1][0]}')
    print('----------------------------------------------')

Player Name: Kris Humphries
Model Prediction: Role Player
Probability of Bust: 0.115
Probability of Role Player: 0.56
Probability of Starter: 0.31
Probability of All-Star: 0.015
----------------------------------------------
Player Name: Steve Blake
Model Prediction: Starter
Probability of Bust: 0.1425
Probability of Role Player: 0.3625
Probability of Starter: 0.3775
Probability of All-Star: 0.1175
----------------------------------------------
Player Name: Mike Conley
Model Prediction: Starter
Probability of Bust: 0.055
Probability of Role Player: 0.225
Probability of Starter: 0.505
Probability of All-Star: 0.215
----------------------------------------------
Player Name: Fred Jones
Model Prediction: Starter
Probability of Bust: 0.06
Probability of Role Player: 0.3575
Probability of Starter: 0.5025
Probability of All-Star: 0.08
----------------------------------------------
Player Name: Dominic McGuire
Model Prediction: Starter
Probability of Bust: 0.1125
Probability of Role Player: 0

Probability of All-Star: 0.0225
----------------------------------------------
Player Name: Josh Childress
Model Prediction: Role Player
Probability of Bust: 0.1175
Probability of Role Player: 0.59
Probability of Starter: 0.2775
Probability of All-Star: 0.015
----------------------------------------------
Player Name: Kevin Love
Model Prediction: Role Player
Probability of Bust: 0.015
Probability of Role Player: 0.575
Probability of Starter: 0.3025
Probability of All-Star: 0.1075
----------------------------------------------
Player Name: Stromile Swift
Model Prediction: Starter
Probability of Bust: 0.01
Probability of Role Player: 0.09
Probability of Starter: 0.8875
Probability of All-Star: 0.0125
----------------------------------------------
Player Name: Kemba Walker
Model Prediction: Starter
Probability of Bust: 0.15
Probability of Role Player: 0.3
Probability of Starter: 0.355
Probability of All-Star: 0.195
----------------------------------------------
Player Name: Aaron Gray
Mod

Player Name: Ty Lawson
Model Prediction: Starter
Probability of Bust: 0.07
Probability of Role Player: 0.3225
Probability of Starter: 0.46
Probability of All-Star: 0.1475
----------------------------------------------
Player Name: Cory Joseph
Model Prediction: Role Player
Probability of Bust: 0.1725
Probability of Role Player: 0.4875
Probability of Starter: 0.2875
Probability of All-Star: 0.0525
----------------------------------------------
Player Name: Brandon Roy
Model Prediction: Role Player
Probability of Bust: 0.105
Probability of Role Player: 0.4875
Probability of Starter: 0.315
Probability of All-Star: 0.0925
----------------------------------------------
Player Name: Ekpe Udoh
Model Prediction: Starter
Probability of Bust: 0.085
Probability of Role Player: 0.2875
Probability of Starter: 0.57
Probability of All-Star: 0.0575
----------------------------------------------
Player Name: Channing Frye
Model Prediction: Role Player
Probability of Bust: 0.03
Probability of Role Player

In [12]:
for index, row in df2.iterrows():
    print(f'Player Name: {row[0]}')
    print(f'Model Prediction: {row[2]}')
    print(f'Probability of Bust: {row[1][1]}')
    print(f'Probability of Role Player: {row[1][2]}')
    print(f'Probability of Starter: {row[1][3]}')
    print(f'Probability of All-Star: {row[1][0]}')
    #print(f'Probability of Role Player or Better: {row[1][0] + row[1][2] + row[1][3]}')
    #print(f'Probability of Starter or Better: {row[1][0] + row[1][3]}')
    #print(f'Probability of All-Star: {row[1][0]}')
    print('----------------------------------------------')

Player Name: Kris Humphries
Model Prediction: Bust
Probability of Bust: 1.0
Probability of Role Player: 0.0
Probability of Starter: 0.0
Probability of All-Star: 0.0
----------------------------------------------
Player Name: Steve Blake
Model Prediction: Role Player
Probability of Bust: 0.0
Probability of Role Player: 1.0
Probability of Starter: 0.0
Probability of All-Star: 0.0
----------------------------------------------
Player Name: Mike Conley
Model Prediction: Starter
Probability of Bust: 0.0
Probability of Role Player: 0.0
Probability of Starter: 1.0
Probability of All-Star: 0.0
----------------------------------------------
Player Name: Fred Jones
Model Prediction: All-Star
Probability of Bust: 0.0
Probability of Role Player: 0.0
Probability of Starter: 0.0
Probability of All-Star: 1.0
----------------------------------------------
Player Name: Dominic McGuire
Model Prediction: Starter
Probability of Bust: 0.0
Probability of Role Player: 0.0
Probability of Starter: 1.0
Probabil

Probability of Bust: 0.0
Probability of Role Player: 1.0
Probability of Starter: 0.0
Probability of All-Star: 0.0
----------------------------------------------
Player Name: Tayshaun Prince
Model Prediction: Role Player
Probability of Bust: 0.0
Probability of Role Player: 1.0
Probability of Starter: 0.0
Probability of All-Star: 0.0
----------------------------------------------
Player Name: Jared Dudley
Model Prediction: Starter
Probability of Bust: 0.0
Probability of Role Player: 0.0
Probability of Starter: 1.0
Probability of All-Star: 0.0
----------------------------------------------
Player Name: Glen Davis
Model Prediction: Role Player
Probability of Bust: 0.0
Probability of Role Player: 1.0
Probability of Starter: 0.0
Probability of All-Star: 0.0
----------------------------------------------
Player Name: Arron Afflalo
Model Prediction: Role Player
Probability of Bust: 0.0
Probability of Role Player: 1.0
Probability of Starter: 0.0
Probability of All-Star: 0.0
--------------------