# This Notebook will run the Logistic Regression models on the NBA data

In [113]:
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import MinMaxScaler

In [128]:
nba_df = pd.read_csv(Path('Resources/nba_game_data.csv'),parse_dates = ['GAME_DATE_EST'], infer_datetime_format= True)
nba_df['GAME_DATE_EST'] = nba_df['GAME_DATE_EST'].apply(lambda x: x.toordinal())
nba_df = nba_df.reset_index()
nba_df = nba_df.dropna()
nba_df = nba_df.drop(columns= ['index', 'Unnamed: 0'])
nba_df.dtypes
display(nba_df)

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,735510,11400001,1610612748,1610612740,2014,1610612748,86.0,0.431,0.679,0.333,18.0,42.0,1610612740,98.0,0.462,0.706,0.438,19.0,42.0,0
1,735511,11400002,1610612761,1610612758,2014,1610612761,99.0,0.440,0.771,0.333,21.0,30.0,1610612758,94.0,0.469,0.725,0.385,18.0,45.0,1
2,735512,11400005,1610612747,1610612743,2014,1610612747,98.0,0.448,0.682,0.500,29.0,45.0,1610612743,95.0,0.387,0.659,0.500,19.0,43.0,1
3,735512,11400004,1610612741,1610612764,2014,1610612741,81.0,0.338,0.719,0.381,18.0,40.0,1610612764,85.0,0.411,0.636,0.267,17.0,47.0,0
4,735512,11400007,1610612737,1610612740,2014,1610612737,93.0,0.419,0.821,0.421,24.0,50.0,1610612740,87.0,0.366,0.643,0.375,17.0,43.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25791,738226,22101009,1610612743,1610612761,2021,1610612743,115.0,0.551,0.750,0.407,32.0,39.0,1610612761,127.0,0.471,0.760,0.387,28.0,50.0,0
25792,738226,22101008,1610612744,1610612749,2021,1610612744,122.0,0.484,0.933,0.400,33.0,55.0,1610612749,109.0,0.413,0.696,0.386,27.0,39.0,1
25793,738226,22101007,1610612759,1610612754,2021,1610612759,108.0,0.412,0.813,0.324,28.0,52.0,1610612754,119.0,0.489,1.000,0.389,23.0,47.0,0
25794,738226,22101006,1610612741,1610612739,2021,1610612741,101.0,0.443,0.933,0.429,20.0,46.0,1610612739,91.0,0.419,0.824,0.208,19.0,40.0,1


In [136]:
# Separate the y variable, the labels
y = nba_df[['HOME_TEAM_WINS']]

# Separate the X variable, the features
X = nba_df.drop(columns= ['HOME_TEAM_WINS'])
display(y.head())
display(X.head())



Unnamed: 0,HOME_TEAM_WINS
0,0
1,1
2,1
3,0
4,1


Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
0,735510,11400001,1610612748,1610612740,2014,1610612748,86.0,0.431,0.679,0.333,18.0,42.0,1610612740,98.0,0.462,0.706,0.438,19.0,42.0
1,735511,11400002,1610612761,1610612758,2014,1610612761,99.0,0.44,0.771,0.333,21.0,30.0,1610612758,94.0,0.469,0.725,0.385,18.0,45.0
2,735512,11400005,1610612747,1610612743,2014,1610612747,98.0,0.448,0.682,0.5,29.0,45.0,1610612743,95.0,0.387,0.659,0.5,19.0,43.0
3,735512,11400004,1610612741,1610612764,2014,1610612741,81.0,0.338,0.719,0.381,18.0,40.0,1610612764,85.0,0.411,0.636,0.267,17.0,47.0
4,735512,11400007,1610612737,1610612740,2014,1610612737,93.0,0.419,0.821,0.421,24.0,50.0,1610612740,87.0,0.366,0.643,0.375,17.0,43.0


In [137]:
# Checking balance of target values
y.value_counts()

HOME_TEAM_WINS
1                 15155
0                 10542
dtype: int64

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
lr_model = LogisticRegression(random_state=1)
lr_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(random_state=1)

In [147]:
predictions = lr_model.predict(X_test)

In [148]:
y_test

Unnamed: 0,HOME_TEAM_WINS
231,1
6443,1
5611,1
10312,0
17395,0
...,...
18003,0
19394,1
1900,0
15909,1


In [149]:
balanced_accuracy_score(y_test, predictions)

0.9707993742900982

In [150]:
print(classification_report_imbalanced(y_test, predictions)) 


                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.96      0.98      0.97      0.97      0.94      2680
          1       0.97      0.98      0.96      0.98      0.97      0.94      3745

avg / total       0.97      0.97      0.97      0.97      0.97      0.94      6425



In [151]:
confusion_matrix(y_test, predictions)

array([[2570,  110],
       [  65, 3680]])