# nba_select_model

### Trains a series of models on 2016-17 and 2017-18 season data

#### Data preparation

In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(suppress=True)
import pandas as pd
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', None)

from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read data
results = pd.read_csv('..\\data\\results.csv')

# Turn binary columns to 0/1
results['teamRslt'] = [1 if x == 'Win' else 0 for x in results['teamRslt']]   # Win = 1, Loss = 0
results['teamLoc'] = [1 if x == 'Home' else 0 for x in results['teamLoc']]  # Home = 1, Away = 0

In [3]:
results.head()

Unnamed: 0,gameID,season,lineupID,teamRslt,teamLoc,gmDateTime,teamAbbr,opptAbbr,opptRslt,opptlineupID,timeSincePrev,city_played,lat,long,distSincePrev,oppt_timeSincePrev,oppt_distSincePrev,team_starting_WS,team_starting_BPM,team_starting_MP_per_game,team_starting_GSpct,oppt_starting_WS,oppt_starting_BPM,oppt_starting_MP_per_game,oppt_starting_GSpct,team_bench_WS,team_bench_BPM,team_bench_MP_per_game,team_bench_GSpct,oppt_bench_WS,oppt_bench_BPM,oppt_bench_MP_per_game,oppt_bench_GSpct,teamRsltCount,teamGameCount,opptRsltCount,opptGameCount,teamWinPct,opptWinPct,PY_teamWinPct,PY_opptWinPct,teamWinPct2,opptWinPct2,diff_starting_WS,diff_starting_BPM,diff_starting_MP_per_game,diff_starting_GSpct,diff_bench_WS,diff_bench_BPM,diff_bench_MP_per_game,diff_bench_GSpct,diff_WinPct2,diff_timeSincePrev,diff_distSincePrev
0,2016-10-25-CLExNY,2017,2016-10-25-CLE,1,1,2016-10-25 20:00:00,CLE,NY,Loss,2016-10-25-NY,10.0,Cleveland,41.48,-81.68,0.0,10.0,0.0,8.635458,3.209214,32.947976,0.855825,4.999187,1.053816,31.155083,0.927616,3.07502,-0.355552,25.099727,0.592429,1.603604,-2.176322,21.188719,0.38839,1,1,1,1,0.0,0.0,0.695,0.39,0.695,0.39,3.636271,2.155398,1.792893,-0.071791,1.471416,1.82077,3.911008,0.20404,0.305,0.0,0.0
1,2016-10-25-CLExNY,2017,2016-10-25-NY,0,0,2016-10-25 20:00:00,NY,CLE,Win,2016-10-25-CLE,10.0,Cleveland,41.48,-81.68,0.0,10.0,0.0,4.999187,1.053816,31.155083,0.927616,8.635458,3.209214,32.947976,0.855825,1.603604,-2.176322,21.188719,0.38839,3.07502,-0.355552,25.099727,0.592429,1,1,1,1,0.0,0.0,0.39,0.695,0.39,0.695,-3.636271,-2.155398,-1.792893,0.071791,-1.471416,-1.82077,-3.911008,-0.20404,-0.305,0.0,0.0
2,2016-10-25-PORxUTA,2017,2016-10-25-POR,1,1,2016-10-25 22:00:00,POR,UTA,Loss,2016-10-25-UTA,10.0,Portland,45.54,-122.66,0.0,10.0,0.0,5.088136,1.097432,25.099608,0.604819,5.353471,1.459727,28.105671,0.686381,3.902201,-0.13352,22.791763,0.251167,2.094429,-1.804555,18.090278,0.278704,1,1,1,1,0.0,0.0,0.537,0.488,0.537,0.488,-0.265335,-0.362295,-3.006062,-0.081561,1.807772,1.671036,4.701485,-0.027536,0.049,0.0,0.0
3,2016-10-25-PORxUTA,2017,2016-10-25-UTA,0,0,2016-10-25 22:00:00,UTA,POR,Win,2016-10-25-POR,10.0,Portland,45.54,-122.66,0.0,10.0,0.0,5.353471,1.459727,28.105671,0.686381,5.088136,1.097432,25.099608,0.604819,2.094429,-1.804555,18.090278,0.278704,3.902201,-0.13352,22.791763,0.251167,1,1,1,1,0.0,0.0,0.488,0.537,0.488,0.537,0.265335,0.362295,3.006062,0.081561,-1.807772,-1.671036,-4.701485,0.027536,-0.049,0.0,0.0
4,2016-10-25-GSxSA,2017,2016-10-25-GS,0,1,2016-10-25 22:30:00,GS,SA,Win,2016-10-25-SA,10.0,Oakland,37.77,-122.22,0.0,10.0,0.0,10.311745,4.977746,31.819002,0.897594,6.850803,2.039911,28.18833,0.833244,3.282493,-0.532698,19.265527,0.191636,3.625329,1.289899,18.43501,0.025048,1,1,1,1,0.0,0.0,0.89,0.817,0.89,0.817,3.460942,2.937835,3.630673,0.06435,-0.342836,-1.822597,0.830517,0.166588,0.073,0.0,0.0


In [4]:
# Set predictors and response
Y = results['teamRslt']
X = results.loc[:, ['teamLoc', 
                    'diff_starting_WS', 
                    'diff_starting_BPM', 
                    'diff_starting_MP_per_game', 
                    'diff_starting_GSpct', 
                    'diff_bench_WS', 
                    'diff_bench_BPM', 
                    'diff_bench_MP_per_game', 
                    'diff_bench_GSpct', 
                    'timeSincePrev', 
                    'distSincePrev']]

In [5]:
# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=None)

In [6]:
# Conduct separate PCA on starters and bench stats
PCA_cols_starting = ['diff_starting_WS', 
                     'diff_starting_MP_per_game', 
                     'diff_starting_GSpct' 
                     ]

PCA_cols_bench = ['diff_bench_WS',
                  'diff_bench_MP_per_game', 
                  'diff_bench_GSpct']

X_train_PCA_cols_starting = X_train.loc[:, PCA_cols_starting]
X_train_PCA_cols_bench = X_train.loc[:, PCA_cols_bench]

X_test_PCA_cols_starting = X_test.loc[:, PCA_cols_starting]
X_test_PCA_cols_bench = X_test.loc[:, PCA_cols_bench]

In [7]:
# Perform PCA on training data and update X_train data frame

# PCA on starter metrics
pca_starters = PCA(n_components=0.75, random_state=1)
pca_starters.fit(X_train_PCA_cols_starting)
X_train_PCA_cols_starting_transformed = pd.Series(pca_starters.transform(X_train_PCA_cols_starting)[:,0])

print('# of principal components: (starters):', pca_starters.n_components_)
print('Explained variance (starters):', round(pca_starters.explained_variance_ratio_[0], 2), '\n')

# PCA on bench metrics
pca_bench = PCA(n_components=0.75, random_state=1)
pca_bench.fit(X_train_PCA_cols_bench)
X_train_PCA_cols_bench_transformed = pd.Series(pca_bench.transform(X_train_PCA_cols_bench)[:,0])

print('# of principal components: (bench):', pca_bench.n_components_)
print('Explained variance (bench):', round(pca_bench.explained_variance_ratio_[0], 2))

# Combine PCA cols with non-PCA cols (training)
X_train_noPCA = X_train.drop(PCA_cols_starting+PCA_cols_bench, axis=1).reset_index()
X_train_new = pd.concat([X_train_noPCA, X_train_PCA_cols_starting_transformed, X_train_PCA_cols_bench_transformed], axis=1)
X_train_new = X_train_new.rename(columns = {0:'PC_starters', 1:'PC_bench'})
X_train_new = X_train_new.drop('index', axis=1)

# of principal components: (starters): 1
Explained variance (starters): 0.92 

# of principal components: (bench): 1
Explained variance (bench): 0.97


In [8]:
# Transform test PCA columns and update X_test data frame
X_test_PCA_cols_starting_transformed = pd.Series(pca_starters.transform(X_test_PCA_cols_starting)[:,0])
X_test_PCA_cols_bench_transformed = pd.Series(pca_bench.transform(X_test_PCA_cols_bench)[:,0])

X_test_noPCA = X_test.drop(PCA_cols_starting+PCA_cols_bench, axis=1).reset_index()
X_test_new = pd.concat([X_test_noPCA, X_test_PCA_cols_starting_transformed, X_test_PCA_cols_bench_transformed], axis=1)
X_test_new = X_test_new.rename(columns = {0:'PC_starters', 1:'PC_bench'})
X_test_new = X_test_new.drop('index', axis=1)

# List of feature names
train_cols = list(X_train_new.columns)

# Logistic regression

In [9]:
# Fit logistic regression model
LR = LogisticRegression(fit_intercept=False)
LR.fit(X_train_new, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Random forest

In [10]:
# Fit random forest model (w/ grid search)
params = {
    'max_depth':[3,4,5],
    'max_features':[3,4,5,6],
    'min_samples_leaf':[7,9,11],
    'n_estimators':[1001]
}

RF_classifier = RandomForestClassifier()

grid_search_RF = GridSearchCV(estimator=RF_classifier, param_grid=params, cv=7, verbose=2, n_jobs=-1)

In [11]:
grid_search_RF.fit(X_train_new, Y_train)

Fitting 7 folds for each of 36 candidates, totalling 252 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 252 out of 252 | elapsed:  2.8min finished


GridSearchCV(cv=7, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 4, 5], 'max_features': [3, 4, 5, 6], 'min_samples_leaf': [7, 9, 11], 'n_estimators': [551]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [12]:
# Determine best random forest parameters
print(grid_search_RF.best_params_)
RF = grid_search_RF.best_estimator_

{'max_depth': 3, 'max_features': 6, 'min_samples_leaf': 7, 'n_estimators': 551}


# Decision tree

In [13]:
# Fit decision tree model (w/ grid search)
params_dt = {
    'max_depth':[3,4],
    'max_features':[2,3,4,5],
    'min_samples_leaf':[3,5,7],
    'min_samples_split':[2,3]
}

DT_classifier = DecisionTreeClassifier()

grid_search_DT = GridSearchCV(estimator=DT_classifier, param_grid=params_dt, cv=5, n_jobs=-1)

In [14]:
grid_search_DT.fit(X_train_new, Y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 4], 'max_features': [2, 3, 4, 5], 'min_samples_leaf': [3, 5, 7], 'min_samples_split': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
# Determine best decision tree parameters
print(grid_search_DT.best_params_)
DT = grid_search_DT.best_estimator_

{'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 7, 'min_samples_split': 2}


# Gradient boosting

In [16]:
# Fit gradient boosting model (w/ grid search)
params_gb = {
    'learning_rate':[0.03,0.05,0.07],
    'max_depth':[4,5],
    'min_samples_leaf':[23,27,31],
    'min_samples_split':[2,3,6],
    'n_estimators':[80,100,120]
}

GB_classifier = GradientBoostingClassifier()

grid_search_GB = GridSearchCV(estimator=GB_classifier, param_grid=params_gb, cv=5, verbose=2, n_jobs=-1)

In [17]:
grid_search_GB.fit(X_train_new, Y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:  1.4min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.03, 0.05, 0.07], 'max_depth': [4, 5], 'min_samples_leaf': [23, 27, 31], 'min_samples_split': [2, 3, 6], 'n_estimators': [80, 100, 120]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [18]:
# Determine best gradient boosting parameters
print(grid_search_GB.best_params_)
GB = grid_search_GB.best_estimator_

{'learning_rate': 0.03, 'max_depth': 4, 'min_samples_leaf': 23, 'min_samples_split': 2, 'n_estimators': 100}


# Model comparison

In [19]:
models = {'Logistic Regression':LR, 'Decision Tree':DT, 'Random Forest':RF, 'Gradient Boosting':GB}

In [20]:
# Function prints relevant metrics for each model
def modelStats(modeltype, model):
    
    print('--------------------\n', modeltype, ':\n--------------------', sep='')
    
    # Confusion matrix
    pred = model.predict(X_test_new)
    ct = pd.crosstab(Y_test, pred)
    print(ct, '\n')
    
    # AUC
    pred_proba_train = model.predict_proba(X_train_new)[:,1]
    pred_proba_test = model.predict_proba(X_test_new)[:,1]
    auc_train = roc_auc_score(Y_train, pred_proba_train)
    auc_test = roc_auc_score(Y_test, pred_proba_test)
    print('Training AUC:', round(100*auc_train,1))
    print('Testing AUC:', round(100*auc_test,1))
    
    # Accuracy & F1 score
    print('Accuracy:', round(100*(ct[0][0] + ct[1][1]) / Y_test.shape[0], 1))
    print('F1 score:', round(100*f1_score(y_pred=pred, y_true=Y_test), 1))
    print('')
    
    # Feature importances/weights
    if modeltype == 'Logistic Regression':
        lr_coef = model.coef_[0].round(2)
        lr_dict = dict(zip(train_cols, lr_coef))
        for col, coef in lr_dict.items():
            print(col + ': ' + str(coef))
    try:
        fi_values = list((100*model.feature_importances_).round(1))
        fi_dict = dict(zip(train_cols, fi_values))
        for col, fi in fi_dict.items():
            print(col + ': ' + str(fi) + '%')
    except:
        pass
    
    print('\n')

In [21]:
# Print metrics for each model
for modeltype, model in models.items():
    modelStats(modeltype, model)

--------------------
Logistic Regression:
--------------------
col_0       0    1
teamRslt          
0         376  252
1         191  411 

Training AUC: 70.8
Testing AUC: 70.1
Accuracy: 64.0
F1 score: 65.0

teamLoc: 0.55
diff_starting_BPM: 0.28
diff_bench_BPM: 0.11
timeSincePrev: -0.05
distSincePrev: -0.01
PC_starters: -0.02
PC_bench: -0.01


--------------------
Decision Tree:
--------------------
col_0       0    1
teamRslt          
0         381  247
1         190  412 

Training AUC: 70.4
Testing AUC: 68.5
Accuracy: 64.5
F1 score: 65.3

teamLoc: 16.1%
diff_starting_BPM: 80.0%
diff_bench_BPM: 2.5%
timeSincePrev: 0.0%
distSincePrev: 0.0%
PC_starters: 1.3%
PC_bench: 0.0%


--------------------
Random Forest:
--------------------
col_0       0    1
teamRslt          
0         403  225
1         205  397 

Training AUC: 72.1
Testing AUC: 70.1
Accuracy: 65.0
F1 score: 64.9

teamLoc: 13.0%
diff_starting_BPM: 72.8%
diff_bench_BPM: 5.2%
timeSincePrev: 0.3%
distSincePrev: 1.0%
PC_starter