In [1]:
# import dependencies
from sklearn import tree
import pandas as pd
import numpy as np
import os

In [2]:
# read in dataset
final_full = pd.read_csv("../data/final_full.csv")

# print columns
for column in final_full.columns:
    print(column)

# preview dataset
final_full.head()

year
player
affiliation
overall_pick
position
lane_agility
shuttle_run
sprint
standing_leap
max_leap
bench_press
body_fat
hand_length
hand_width
height_no_shoes
height_shoes
reach
weight
wingspan
classification
assist_percentage
assists
block_percentage
blocks
defensive_rebound_percentage
defensive_rebounds
effective_field_goal_percentage
field_goal_attempts
field_goals
free_throw_attempt_rate
free_throw_attempt
free_throw_percentage
free_throws
minutes_played
offensive_rebound_percentage
offensive_rebounds
personal_fouls
points
steal_percentage
steals
three_point_attempt_rate
three_point_attempts
three_point_percentage
three_pointers
total_rebound_percentage
total_rebounds
true_shooting_percentage
turnover_percentage
turnovers
two_point_attempts
two_point_percentage
two_pointers
usage_percentage
player_id
classification.1


Unnamed: 0,year,player,affiliation,overall_pick,position,lane_agility,shuttle_run,sprint,standing_leap,max_leap,...,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage,player_id,classification.1
0,2000,A.J. Guyton,Indiana,32,G,10.55,3.04,3.22,33.0,37.5,...,429,0.572,13.6,218.0,958,0.484,464,,aj-guyton-1,Exclude
1,2007,Aaron Brooks,Oregon,26,G,10.57,3.04,3.2,32.5,39.5,...,397,0.565,18.5,303.0,616,0.479,295,23.2,aaron-brooks-1,Starter
2,2014,Aaron Gordon,Arizona,4,F,10.81,2.76,3.27,32.5,39.0,...,303,0.503,10.5,55.0,337,0.513,173,23.2,aaron-gordon-1,Starter
3,2007,Aaron Gray,Pittsburgh,49,C,12.07,3.1,3.7,27.0,32.5,...,790,0.565,15.2,176.0,789,0.55,434,24.9,aaron-gray-1,Bust
4,2018,Aaron Holiday,California-Los Angeles,23,G,10.96,3.22,3.27,25.5,33.0,...,320,0.58,19.6,304.0,631,0.469,296,23.3,aaron-holiday-1,Exclude


In [3]:
limit_plyrs = final_full.loc[final_full['classification'] != 'Exclude']

# limit data to '00-'14
limit_yrs = limit_plyrs.loc[final_full['year'] <= 2014]

# check distribution of classifications
limit_yrs['classification'].value_counts()

Role Player    152
Starter         92
All-Star        67
Bust            56
Name: classification, dtype: int64

In [12]:
target = limit_yrs['classification']

# drop unnecessary columns
data_clean = limit_yrs.drop(['year', 'affiliation', 'player', 'position', 'overall_pick', 'player_id',
                             'classification', 'classification.1'], axis = 1)

# one-hot encode position
# data_clean = pd.get_dummies(data_clean, columns=['position']) 

# replace NAs with imputed values
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)

for column in data_clean.columns:
    data_clean[[column]] = imputer.fit_transform(data_clean[[column]])
    
feature_names = data_clean.columns

data_clean.head()



Unnamed: 0,lane_agility,sprint,standing_leap,max_leap,bench_press,body_fat,height_no_shoes,height_shoes,weight,wingspan,...,three_pointers,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,two_point_attempts,two_point_percentage,two_pointers,usage_percentage
1,10.57,3.2,32.5,39.5,6.0,2.7,70.0,71.75,161.0,76.0,...,205.0,6.6,397.0,0.565,18.5,303.0,616.0,0.479,295.0,23.2
2,10.81,3.27,32.5,39.0,11.36,5.05,79.5,80.75,220.1,83.75,...,16.0,14.9,303.0,0.503,10.5,55.0,337.0,0.513,173.0,23.2
3,12.07,3.7,27.0,32.5,17.0,10.8,84.0,85.75,271.0,87.25,...,0.0,19.4,790.0,0.565,15.2,176.0,789.0,0.55,434.0,24.9
6,11.0,3.22,29.0,34.0,8.0,5.6,74.0,75.5,186.0,78.5,...,114.0,5.7,379.0,0.551,16.5,299.0,979.0,0.502,491.0,25.3
9,12.15,3.37,31.0,35.5,20.0,9.1,80.0,81.75,246.0,84.75,...,0.0,17.0,864.0,0.601,15.6,173.0,720.0,0.589,424.0,18.6


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_clean, target, random_state = 23)
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(limit_yrs, target, random_state = 23)

In [14]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.31521739130434784

In [15]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.34782608695652173

In [16]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.034731778106937894, 'sprint'),
 (0.033279002461611834, 'turnover_percentage'),
 (0.032981000516664916, 'assist_percentage'),
 (0.03230525688039305, 'personal_fouls'),
 (0.032114984310762075, 'usage_percentage'),
 (0.03050694450710742, 'two_point_percentage'),
 (0.02982076007385844, 'total_rebound_percentage'),
 (0.02942750380313259, 'free_throw_attempt_rate'),
 (0.02932764277550598, 'true_shooting_percentage'),
 (0.029256227200587886, 'offensive_rebounds'),
 (0.027996430093717963, 'blocks'),
 (0.027738464222730064, 'defensive_rebounds'),
 (0.027120221083769727, 'steals'),
 (0.027045984247682137, 'lane_agility'),
 (0.02696259625062409, 'assists'),
 (0.026445961582379558, 'effective_field_goal_percentage'),
 (0.026073401473327564, 'total_rebounds'),
 (0.0256467826606525, 'weight'),
 (0.025587209300919834, 'three_point_percentage'),
 (0.02532118522846125, 'three_point_attempts'),
 (0.02470027912027468, 'bench_press'),
 (0.02457744866152776, 'minutes_played'),
 (0.02430071107591374, 'b

In [17]:
df = pd.DataFrame(zip(X_test_full['player'], rf.predict_proba(X_test), rf.predict(X_test)))

df[2].value_counts()

Role Player    55
Starter        25
All-Star        6
Bust            6
Name: 2, dtype: int64

In [18]:
for index, row in df.iterrows():
    print(f'Player Name: {row[0]}')
    print(f'Model Prediction: {row[2]}')
    print(f'Probability of Bust: {row[1][1]}')
    print(f'Probability of Role Player: {row[1][2]}')
    print(f'Probability of Starter: {row[1][3]}')
    print(f'Probability of All-Star: {row[1][0]}')
    #print(f'Probability of Role Player or Better: {row[1][0] + row[1][2] + row[1][3]}')
    #print(f'Probability of Starter or Better: {row[1][0] + row[1][3]}')
    #print(f'Probability of All-Star: {row[1][0]}')
    print('----------------------------------------------')

Player Name: Darius Miller
Model Prediction: Role Player
Probability of Bust: 0.04
Probability of Role Player: 0.6
Probability of Starter: 0.275
Probability of All-Star: 0.085
----------------------------------------------
Player Name: Quentin Richardson
Model Prediction: Starter
Probability of Bust: 0.07
Probability of Role Player: 0.36
Probability of Starter: 0.455
Probability of All-Star: 0.115
----------------------------------------------
Player Name: Mike Scott
Model Prediction: Role Player
Probability of Bust: 0.265
Probability of Role Player: 0.5
Probability of Starter: 0.145
Probability of All-Star: 0.09
----------------------------------------------
Player Name: Will Barton
Model Prediction: All-Star
Probability of Bust: 0.18
Probability of Role Player: 0.25
Probability of Starter: 0.27
Probability of All-Star: 0.3
----------------------------------------------
Player Name: Andrew Nicholson
Model Prediction: Bust
Probability of Bust: 0.425
Probability of Role Player: 0.26
Pro

Player Name: DeJuan Blair
Model Prediction: Role Player
Probability of Bust: 0.24
Probability of Role Player: 0.33
Probability of Starter: 0.2
Probability of All-Star: 0.23
----------------------------------------------
Player Name: Aaron Gray
Model Prediction: Role Player
Probability of Bust: 0.275
Probability of Role Player: 0.345
Probability of Starter: 0.135
Probability of All-Star: 0.245
----------------------------------------------
Player Name: Eddie Griffin
Model Prediction: Role Player
Probability of Bust: 0.15
Probability of Role Player: 0.37
Probability of Starter: 0.3
Probability of All-Star: 0.18
----------------------------------------------
Player Name: Kemba Walker
Model Prediction: Starter
Probability of Bust: 0.18
Probability of Role Player: 0.3
Probability of Starter: 0.32
Probability of All-Star: 0.2
----------------------------------------------
Player Name: Isaiah Canaan
Model Prediction: Starter
Probability of Bust: 0.195
Probability of Role Player: 0.225
Probabi