# This notebook will use a new machine learning library not used in class to make NBA game predictions

In [49]:
# Initial imports
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier as xgbC
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import MinMaxScaler

In [155]:
# Reading in game data
nba_df = pd.read_csv(Path('final_data_xgb.csv'),parse_dates = ['GAME_DATE'], infer_datetime_format= True)
# Changing date and season column to an integer
nba_df['GAME_DATE'] = nba_df['GAME_DATE'].apply(lambda x: x.toordinal())
# Resetting the index
nba_df = nba_df.reset_index()
# Dropping null values
nba_df = nba_df.dropna()
# Dropping unnecessary columns
nba_df = nba_df.drop(columns= ['index','SEASON_YEAR'])
nba_df = nba_df[nba_df.HOME_WL != 2]
# Checking dtypes
display(nba_df.dtypes.value_counts())
# Displaying dataframe
display(nba_df)

float64    163
int64        2
dtype: int64

Unnamed: 0,GAME_ID,GAME_DATE,HOME_TEAM_ID,HOME_WL,AWAY_AST_PCT,AWAY_AST_PCT_RANK,AWAY_AST_RATIO,AWAY_AST_RATIO_RANK,AWAY_AST_TO,AWAY_AST_TO_RANK,...,HOME_PIE_RANK,HOME_POSS,HOME_REB_PCT,HOME_REB_PCT_RANK,HOME_TM_TOV_PCT,HOME_TM_TOV_PCT_RANK,HOME_TS_PCT,HOME_TS_PCT_RANK,HOME_W_PCT_RANK,HOME_W_RANK
0,20100398.0,730845,1610612746,0.0,0.63930,976.40,16.705,1246.40,1.6525,1107.00,...,862.45,93.25,0.51935,952.25,0.17775,1522.85,0.53330,1051.65,357.70,357.70
1,20100415.0,730848,1610612746,0.0,0.64190,952.40,16.700,1248.10,1.7305,1047.30,...,844.75,93.40,0.52390,890.45,0.17850,1540.20,0.53210,1066.95,357.70,357.70
2,20100433.0,730851,1610612746,1.0,0.63615,1001.15,16.765,1230.60,1.7410,1040.40,...,864.05,93.00,0.52030,948.30,0.17645,1496.45,0.53660,1036.95,357.70,357.70
3,20100467.0,730855,1610612746,1.0,0.63060,1043.20,16.670,1252.85,1.6935,1102.60,...,875.75,92.75,0.51370,1021.40,0.17580,1483.75,0.53460,1068.55,357.70,357.70
4,20100471.0,730856,1610612758,1.0,0.56615,1481.00,15.570,1466.45,1.4775,1382.95,...,570.00,96.05,0.51800,925.85,0.13650,860.90,0.54115,973.80,60.45,60.45
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24617,22201102.0,738603,1610612762,0.0,0.60920,1052.35,18.595,1029.20,2.5770,630.55,...,1105.90,100.80,0.52625,764.80,0.14480,1185.75,0.57170,1222.95,554.00,554.00
24618,22201103.0,738603,1610612744,1.0,0.59315,1166.10,17.440,1248.95,1.8930,1086.95,...,815.75,102.70,0.52160,803.30,0.15430,1353.60,0.60325,914.85,277.50,277.50
24619,22201104.0,738603,1610612757,0.0,0.60940,1049.75,19.430,774.45,2.3645,678.80,...,1343.15,98.35,0.47170,1504.40,0.13345,971.90,0.61570,841.10,664.60,664.60
24620,22201105.0,738603,1610612758,1.0,0.61380,1014.05,19.790,712.35,2.3610,760.70,...,906.10,100.30,0.50300,1066.70,0.12375,824.15,0.63950,472.25,443.40,443.40


In [156]:
# Separate the y variable, the labels
y = nba_df[['HOME_WL']]

# Separate the X variable, the features
X = nba_df.drop(columns= ['HOME_WL']).set_index(['GAME_ID'])
display(y.tail())
display(X.head())

Unnamed: 0,HOME_WL
24617,0.0
24618,1.0
24619,0.0
24620,1.0
24621,1.0


Unnamed: 0_level_0,GAME_DATE,HOME_TEAM_ID,AWAY_AST_PCT,AWAY_AST_PCT_RANK,AWAY_AST_RATIO,AWAY_AST_RATIO_RANK,AWAY_AST_TO,AWAY_AST_TO_RANK,AWAY_DEF_RATING,AWAY_DEF_RATING_RANK,...,HOME_PIE_RANK,HOME_POSS,HOME_REB_PCT,HOME_REB_PCT_RANK,HOME_TM_TOV_PCT,HOME_TM_TOV_PCT_RANK,HOME_TS_PCT,HOME_TS_PCT_RANK,HOME_W_PCT_RANK,HOME_W_RANK
GAME_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20100398.0,730845,1610612746,0.6393,976.4,16.705,1246.4,1.6525,1107.0,104.005,1239.55,...,862.45,93.25,0.51935,952.25,0.17775,1522.85,0.5333,1051.65,357.7,357.7
20100415.0,730848,1610612746,0.6419,952.4,16.7,1248.1,1.7305,1047.3,103.605,1203.95,...,844.75,93.4,0.5239,890.45,0.1785,1540.2,0.5321,1066.95,357.7,357.7
20100433.0,730851,1610612746,0.63615,1001.15,16.765,1230.6,1.741,1040.4,104.295,1223.15,...,864.05,93.0,0.5203,948.3,0.17645,1496.45,0.5366,1036.95,357.7,357.7
20100467.0,730855,1610612746,0.6306,1043.2,16.67,1252.85,1.6935,1102.6,103.37,1156.1,...,875.75,92.75,0.5137,1021.4,0.1758,1483.75,0.5346,1068.55,357.7,357.7
20100471.0,730856,1610612758,0.56615,1481.0,15.57,1466.45,1.4775,1382.95,109.52,1523.2,...,570.0,96.05,0.518,925.85,0.1365,860.9,0.54115,973.8,60.45,60.45


In [160]:
# Scaling the data using MinMax scaler
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [95]:
# Define search space for RandomizedCV search
search_space = [
  {
    'clf__n_estimators': [50, 100, 150, 200],
    'clf__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'clf__max_depth': range(3, 10),
  }
] 

In [96]:
# Define cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

In [97]:
# AUC and accuracy as score
scoring = {'AUC':'roc_auc', 'Accuracy':make_scorer(accuracy_score)}

In [98]:
clf_xgb = xgb.XGBClassifier()

In [99]:
# Define grid search
grid = RandomizedSearchCV(
  estimator=clf_xgb,
  param_distributions=search_space,
  cv=kfold,
  scoring=scoring,
  refit='AUC',
  verbose=1,
  n_jobs=-1
)
# Fit grid search
model = grid.fit(X_train, y_train)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


KeyboardInterrupt: 

In [100]:
predict = model.predict(X_test)
print('Best AUC Score: {}'.format(model.best_score_))
print('Accuracy: {}'.format(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test,predict))

ValueError: Feature shape mismatch, expected: 163, got 164

In [59]:
print(model.best_params_)

{'clf__n_estimators': 100, 'clf__max_depth': 6, 'clf__learning_rate': 0.1}


In [161]:


clf_xgb = xgb.XGBClassifier(objective='binary:logistic', 
                            eval_metric="aucpr",
                            seed=42, 
                            use_label_encoder=False,
                            clf__learning_rate=0.1,
                            clf__max_depth = 6,
                            clf__n_estimators = 100)


In [162]:
clf_xgb.fit(X_train, 
            y_train,
            verbose=True, ## the next three arguments set up early stopping.
            early_stopping_rounds=10,
            eval_set=[(X_train, y_train),(X_test, y_test)])

Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-aucpr:0.78116	validation_1-aucpr:0.75996
[1]	validation_0-aucpr:0.79028	validation_1-aucpr:0.75850
[2]	validation_0-aucpr:0.79632	validation_1-aucpr:0.75983
[3]	validation_0-aucpr:0.80312	validation_1-aucpr:0.76103
[4]	validation_0-aucpr:0.80744	validation_1-aucpr:0.75923
[5]	validation_0-aucpr:0.81240	validation_1-aucpr:0.75773
[6]	validation_0-aucpr:0.81648	validation_1-aucpr:0.75836
[7]	validation_0-aucpr:0.82077	validation_1-aucpr:0.75902
[8]	validation_0-aucpr:0.82410	validation_1-aucpr:0.75787
[9]	validation_0-aucpr:0.82658	validation_1-aucpr:0.75772
[10]	validation_0-aucpr:0.83027	validation_1-aucpr:0.758

XGBClassifier(base_score=0.5, booster='gbtree', clf__learning_rate=0.1,
              clf__max_depth=6, clf__n_estimators=100, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='aucpr', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, ...)

In [163]:

# Making predictions
predictions = clf_xgb.predict(X_test)

In [164]:
# Checking accuracy score
accuracy_score(y_test, predictions)

0.6590318388564003

In [165]:
# Viewing classification report
print(classification_report_imbalanced(y_test, predictions)) 

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.61      0.48      0.78      0.53      0.61      0.36      2519
        1.0       0.68      0.78      0.48      0.73      0.61      0.39      3637

avg / total       0.65      0.66      0.60      0.65      0.61      0.38      6156



In [166]:
print(confusion_matrix(y_test, predictions))

[[1207 1312]
 [ 787 2850]]


In [167]:
clf_xgb.save_model('xgb_nba.json')