In [164]:
import sklearn
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Set random seed
np.random.seed(0)

In [165]:
df = pd.read_csv('data/bb_reference/2016to2019Data_merged.csv', encoding="ISO-8859-1")

In [166]:
df = df[df['slug'] == 'curryst01']

In [167]:
df['game_score'] = round(df['game_score'])

In [168]:
df['game_score'].head()

47865     4.0
47866    22.0
47867    14.0
47868    23.0
47869    34.0
Name: game_score, dtype: float64

In [169]:
# Create a new column that for each row, generates a random number between 0 and 1, and
# if that value is less than or equal to .75, then sets the value of that cell as True
# and false otherwise. This is a quick and dirty way of randomly assigning some rows to
# be used as the training data and some as the test data.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# View the top 5 rows
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 0_x,assists,attempted_field_goals,attempted_free_throws,attempted_three_point_field_goals,blocks,date,defensive_rebounds,game_score,...,turnovers,Unnamed: 0_y,player,birthday,year_min,year_max,pos,height,weight,is_train
47865,47865,141,4,6,0,5,0,1/2/2016,1,4.0,...,2,287,Stephen Curry,14-Mar-88,2010,2019,G,3-Jun,190.0,True
47866,47866,6,4,21,1,10,0,1/4/2016,3,22.0,...,2,287,Stephen Curry,14-Mar-88,2010,2019,G,3-Jun,190.0,True
47867,47867,17,6,13,1,8,0,1/5/2016,0,14.0,...,3,287,Stephen Curry,14-Mar-88,2010,2019,G,3-Jun,190.0,True
47868,47868,6,9,18,6,11,0,1/8/2016,3,23.0,...,1,287,Stephen Curry,14-Mar-88,2010,2019,G,3-Jun,190.0,True
47869,47869,1,11,21,7,14,1,1/9/2016,5,34.0,...,4,287,Stephen Curry,14-Mar-88,2010,2019,G,3-Jun,190.0,True


In [170]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]

In [171]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 246
Number of observations in the test data: 74


In [172]:
# Create a list of the feature column's names
features = ['assists', 'made_field_goals', 'attempted_field_goals', 'made_free_throws', 
            'attempted_free_throws', 'made_three_point_field_goals', 'blocks',
            'defensive_rebounds', 'offensive_rebounds', 'personal_fouls', 'turnovers', 'steals']
# View features
features

['assists',
 'made_field_goals',
 'attempted_field_goals',
 'made_free_throws',
 'attempted_free_throws',
 'made_three_point_field_goals',
 'blocks',
 'defensive_rebounds',
 'offensive_rebounds',
 'personal_fouls',
 'turnovers',
 'steals']

In [173]:
y = train['game_score']

# View target
y.head()

47865     4.0
47866    22.0
47867    14.0
47868    23.0
47869    34.0
Name: game_score, dtype: float64

In [174]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier()

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train[features], y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [175]:
# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(test[features])

array([24., 28., 20., 14., 20., 28., 16., 30., 16., 30., 10., 30., 27.,
       25., 13., 14., 22.,  5., 16.,  3., 20., 21., 20., 27., 24., 24.,
       19., 26., 16., 18., 13., 34., 20., 25., 18., 15., 14., 26., 20.,
       17., 25., 21., 12., 22., 16., 18., 22., 16., 20., 32.,  9., 22.,
       14., 20., 14., 25., 24., 14., 23., 18., 28., 18., 13., 20., 18.,
       12., 21.,  0., 34., 19., 18., 12., 33., 23.])

In [176]:
# View the predicted probabilities of the first 10 observations
clf.predict_proba(test[features])[0:10]

array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.1, 0. , 0. , 0.1, 0. , 0. ,
        0. , 0. , 0.1, 0. , 0. , 0.1, 0.1, 0. , 0. , 0. , 0. , 0.3, 0. ,
        0.1, 0. , 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.2, 0. ,
        0. , 0. , 0.5, 0. , 0. , 0. , 0.1, 0. , 0.1, 0. , 0. , 0. , 0. ,
        0. , 0.1, 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0.1, 0. , 0.1, 0.1, 0. , 0.2, 0.3, 0. , 0. , 0.2, 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0.1, 0. , 0. , 0. , 0. , 0.1, 0. , 0. , 0. ,
        0.1, 0.6, 0. , 0. , 0. , 0. , 0. , 0. , 0.1, 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. ,

In [177]:
preds = clf.predict(test[features])

In [178]:
# View the PREDICTED for the first five observations
preds[0:5]

array([24., 28., 20., 14., 20.])

In [179]:
# View the ACTUAL for the first five observations
test['game_score'].head()

47872    18.0
47873    30.0
47875    26.0
47878    18.0
47882    20.0
Name: game_score, dtype: float64

In [180]:
d = {'actual': test['game_score'], 'predicted': preds}

results_df = pd.DataFrame(d)

In [181]:
results_df['error'] = abs(results_df['actual'] - results_df['predicted'])

In [182]:
results_df['error'].describe()

count    74.000000
mean      4.756757
std       4.050404
min       0.000000
25%       2.000000
50%       4.000000
75%       7.000000
max      23.000000
Name: error, dtype: float64

In [183]:
sklearn.metrics.r2_score(results_df['actual'], results_df['predicted'])

0.4470024979184012

In [184]:
# Create confusion matrix
pd.crosstab(test['game_score'], preds, rownames=['Actual Game_Score'], colnames=['Predicted Game_Score'])

Predicted Game_Score,0.0,3.0,5.0,9.0,10.0,12.0,13.0,14.0,15.0,16.0,...,23.0,24.0,25.0,26.0,27.0,28.0,30.0,32.0,33.0,34.0
Actual Game_Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2.0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3.0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5.0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7.0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8.0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10.0,0,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11.0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
12.0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14.0,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [185]:
# View a list of the features and their importance scores
list(zip(train[features], clf.feature_importances_))

[('assists', 0.0838688305427914),
 ('made_field_goals', 0.115243677301469),
 ('attempted_field_goals', 0.1035130356038075),
 ('made_free_throws', 0.09457062405452597),
 ('attempted_free_throws', 0.09407585103564334),
 ('made_three_point_field_goals', 0.08793790501805947),
 ('blocks', 0.024061653412728742),
 ('defensive_rebounds', 0.10140121794359916),
 ('offensive_rebounds', 0.04575613012881422),
 ('personal_fouls', 0.08033635719548621),
 ('turnovers', 0.09146658896507448),
 ('steals', 0.07776812879800067)]