# This notebook will use a new machine learning library not used in class to make NBA game predictions

In [1]:
# Initial imports
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier as xgbC
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Reading in game data
nba_df = pd.read_csv(Path('final_data.csv'),parse_dates = ['GAME_DATE'], infer_datetime_format= True)
# Changing date column to an integer
nba_df['GAME_DATE'] = nba_df['GAME_DATE'].apply(lambda x: x.toordinal())
# Resetting the index
nba_df = nba_df.reset_index()
# Dropping null values
nba_df = nba_df.dropna()
# Dropping unnecessary columns
nba_df = nba_df.drop(columns= ['index'])
# Checking dtypes
nba_df.dtypes
# Displaying dataframe
display(nba_df)

Unnamed: 0,HOME_TEAM_MIN,HOME_TEAM_FGM,HOME_TEAM_FGA,HOME_TEAM_FG_PCT,HOME_TEAM_FG3M,HOME_TEAM_FG3A,HOME_TEAM_FG3_PCT,HOME_TEAM_FTM,HOME_TEAM_FTA,HOME_TEAM_FT_PCT,...,AWAY_TEAM_STL,AWAY_TEAM_BLK,AWAY_TEAM_TOV,AWAY_TEAM_PF,AWAY_TEAM_PTS,AWAY_TEAM_PLUS_MINUS,HOME_TEAM_ID,GAME_DATE,AWAY_TEAM_ID,HOME_TEAM_WINS
0,242.50,38.40,83.75,0.46050,6.55,16.15,0.41765,16.20,21.70,0.74035,...,7.35,4.70,15.85,20.40,96.10,-3.75,1.610613e+09,730879,1.610613e+09,0.0
1,242.50,38.15,83.60,0.45835,6.50,15.85,0.41825,16.50,21.80,0.74940,...,7.80,5.00,16.30,20.15,95.85,-2.70,1.610613e+09,730875,1.610613e+09,0.0
2,240.00,37.70,82.15,0.46010,6.25,15.25,0.41615,16.00,21.25,0.74755,...,7.80,5.05,16.40,20.55,96.05,-3.25,1.610613e+09,730871,1.610613e+09,1.0
3,240.00,38.10,82.20,0.46435,6.40,15.15,0.42590,16.10,21.45,0.74475,...,8.00,5.00,16.55,21.20,95.60,-1.50,1.610613e+09,730869,1.610613e+09,0.0
4,240.00,38.10,82.70,0.46140,6.35,15.35,0.41900,15.95,21.05,0.75095,...,7.95,5.10,16.55,21.35,95.45,-1.50,1.610613e+09,730867,1.610613e+09,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24331,241.25,41.10,91.90,0.44930,10.30,33.30,0.30630,18.30,24.45,0.74595,...,7.55,5.05,13.15,19.80,115.20,1.00,1.610613e+09,738466,1.610613e+09,0.0
24332,241.25,40.45,92.05,0.44080,10.40,33.75,0.30475,18.55,24.45,0.75685,...,7.25,5.10,13.30,20.00,116.20,1.00,1.610613e+09,738464,1.610613e+09,0.0
24333,241.25,40.35,92.10,0.43930,10.35,33.20,0.31015,18.10,24.05,0.74675,...,7.40,5.50,13.30,19.85,115.90,2.05,1.610613e+09,738459,1.610613e+09,0.0
24334,242.50,40.55,93.00,0.43720,10.50,33.15,0.31535,18.00,24.05,0.74350,...,7.15,5.45,13.25,20.30,115.90,0.85,1.610613e+09,738457,1.610613e+09,1.0


In [3]:
# Separate the y variable, the labels
y = nba_df[['HOME_TEAM_WINS']]

# Separate the X variable, the features
X = nba_df.drop(columns= ['HOME_TEAM_WINS'])
display(y.head())
display(X.head())

Unnamed: 0,HOME_TEAM_WINS
0,0.0
1,0.0
2,1.0
3,0.0
4,0.0


Unnamed: 0,HOME_TEAM_MIN,HOME_TEAM_FGM,HOME_TEAM_FGA,HOME_TEAM_FG_PCT,HOME_TEAM_FG3M,HOME_TEAM_FG3A,HOME_TEAM_FG3_PCT,HOME_TEAM_FTM,HOME_TEAM_FTA,HOME_TEAM_FT_PCT,...,AWAY_TEAM_AST,AWAY_TEAM_STL,AWAY_TEAM_BLK,AWAY_TEAM_TOV,AWAY_TEAM_PF,AWAY_TEAM_PTS,AWAY_TEAM_PLUS_MINUS,HOME_TEAM_ID,GAME_DATE,AWAY_TEAM_ID
0,242.5,38.4,83.75,0.4605,6.55,16.15,0.41765,16.2,21.7,0.74035,...,22.15,7.35,4.7,15.85,20.4,96.1,-3.75,1610613000.0,730879,1610613000.0
1,242.5,38.15,83.6,0.45835,6.5,15.85,0.41825,16.5,21.8,0.7494,...,21.6,7.8,5.0,16.3,20.15,95.85,-2.7,1610613000.0,730875,1610613000.0
2,240.0,37.7,82.15,0.4601,6.25,15.25,0.41615,16.0,21.25,0.74755,...,21.8,7.8,5.05,16.4,20.55,96.05,-3.25,1610613000.0,730871,1610613000.0
3,240.0,38.1,82.2,0.46435,6.4,15.15,0.4259,16.1,21.45,0.74475,...,21.3,8.0,5.0,16.55,21.2,95.6,-1.5,1610613000.0,730869,1610613000.0
4,240.0,38.1,82.7,0.4614,6.35,15.35,0.419,15.95,21.05,0.75095,...,21.1,7.95,5.1,16.55,21.35,95.45,-1.5,1610613000.0,730867,1610613000.0


In [4]:
# Scaling the data using MinMax scaler
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# Define search space for grid search
search_space = [
  {
    'clf__n_estimators': [50, 100, 150, 200],
    'clf__learning_rate': [0.01, 0.1, 0.2, 0.3],
    'clf__max_depth': range(3, 10),
  }
] 

In [7]:
# Define cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

In [8]:
# AUC and accuracy as score
scoring = {'AUC':'roc_auc', 'Accuracy':make_scorer(accuracy_score)}

In [12]:
# Define grid search
grid = RandomizedSearchCV(
  estimator=clf_xgb,
  param_distributions=search_space,
  cv=kfold,
  scoring=scoring,
  refit='AUC',
  verbose=1,
  n_jobs=-1
)
# Fit grid search
model = grid.fit(X_train, y_train)


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly

In [13]:
predict = model.predict(X_test)
print('Best AUC Score: {}'.format(model.best_score_))
print('Accuracy: {}'.format(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test,predict))

Best AUC Score: 0.6380513665838744
Accuracy: 0.6071663379355687
[[ 306 2179]
 [ 211 3388]]


In [14]:
print(model.best_params_)

{'clf__n_estimators': 200, 'clf__max_depth': 7, 'clf__learning_rate': 0.1}


In [24]:
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', 
                            eval_metric="aucpr",
                            seed=42, 
                            scale_pos_weight=5,
                            use_label_encoder=False,
                            clf__learning_rate=0.1,
                            clf__max_depth = 7,
                            clf__n_estimators = 200)


In [25]:
clf_xgb.fit(X_train, 
            y_train,
            verbose=True, ## the next three arguments set up early stopping.
            early_stopping_rounds=10,
            eval_set=[(X_test, y_test)])

Parameters: { "clf__learning_rate", "clf__max_depth", "clf__n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-aucpr:0.73948
[1]	validation_0-aucpr:0.73985
[2]	validation_0-aucpr:0.74294
[3]	validation_0-aucpr:0.74206
[4]	validation_0-aucpr:0.74080
[5]	validation_0-aucpr:0.74001
[6]	validation_0-aucpr:0.74073
[7]	validation_0-aucpr:0.73781
[8]	validation_0-aucpr:0.73797
[9]	validation_0-aucpr:0.73795
[10]	validation_0-aucpr:0.73682
[11]	validation_0-aucpr:0.73584
[12]	validation_0-aucpr:0.73346


XGBClassifier(base_score=0.5, booster='gbtree', clf__learning_rate=0.1,
              clf__max_depth=7, clf__n_estimators=200, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='aucpr', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=5, seed=42,
              subsample=1, ...)

In [26]:
# Setting XGB classifier instance
#xgb_clf = xgb.XGBClassifier()
# Fitting the data to the model
#xgb_clf = xgb_clf.fit(X_train, y_train)
# Making predictions
predictions = clf_xgb.predict(X_test)

In [27]:
# Checking accuracy score
accuracy_score(y_test, predictions)

0.6000986193293886

In [19]:
# Viewing classification report
print(classification_report_imbalanced(y_test, predictions)) 

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.76      0.03      0.99      0.06      0.17      0.03      2485
        1.0       0.60      0.99      0.03      0.75      0.17      0.03      3599

avg / total       0.66      0.60      0.42      0.47      0.17      0.03      6084

