# This Notebook will run the Logistic Regression models on the NBA data

In [1]:
# View imports
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Reading in game data
nba_df = pd.read_csv(Path('final_data.csv'),parse_dates = ['GAME_DATE'], infer_datetime_format= True)
# Changing date column to an integer
nba_df['GAME_DATE'] = nba_df['GAME_DATE'].apply(lambda x: x.toordinal())
# Resetting the index
nba_df = nba_df.reset_index()
# Dropping null values
nba_df = nba_df.dropna()
# Dropping unnecessary columns
nba_df = nba_df.drop(columns= ['index'])
# Checking dtypes
nba_df.dtypes
# Displaying dataframe
display(nba_df)

Unnamed: 0,HOME_TEAM_MIN,HOME_TEAM_FGM,HOME_TEAM_FGA,HOME_TEAM_FG_PCT,HOME_TEAM_FG3M,HOME_TEAM_FG3A,HOME_TEAM_FG3_PCT,HOME_TEAM_FTM,HOME_TEAM_FTA,HOME_TEAM_FT_PCT,...,AWAY_TEAM_STL,AWAY_TEAM_BLK,AWAY_TEAM_TOV,AWAY_TEAM_PF,AWAY_TEAM_PTS,AWAY_TEAM_PLUS_MINUS,HOME_TEAM_ID,GAME_DATE,AWAY_TEAM_ID,HOME_TEAM_WINS
0,242.50,38.40,83.75,0.46050,6.55,16.15,0.41765,16.20,21.70,0.74035,...,7.35,4.70,15.85,20.40,96.10,-3.75,1.610613e+09,730879,1.610613e+09,0.0
1,242.50,38.15,83.60,0.45835,6.50,15.85,0.41825,16.50,21.80,0.74940,...,7.80,5.00,16.30,20.15,95.85,-2.70,1.610613e+09,730875,1.610613e+09,0.0
2,240.00,37.70,82.15,0.46010,6.25,15.25,0.41615,16.00,21.25,0.74755,...,7.80,5.05,16.40,20.55,96.05,-3.25,1.610613e+09,730871,1.610613e+09,1.0
3,240.00,38.10,82.20,0.46435,6.40,15.15,0.42590,16.10,21.45,0.74475,...,8.00,5.00,16.55,21.20,95.60,-1.50,1.610613e+09,730869,1.610613e+09,0.0
4,240.00,38.10,82.70,0.46140,6.35,15.35,0.41900,15.95,21.05,0.75095,...,7.95,5.10,16.55,21.35,95.45,-1.50,1.610613e+09,730867,1.610613e+09,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24331,241.25,41.10,91.90,0.44930,10.30,33.30,0.30630,18.30,24.45,0.74595,...,7.55,5.05,13.15,19.80,115.20,1.00,1.610613e+09,738466,1.610613e+09,0.0
24332,241.25,40.45,92.05,0.44080,10.40,33.75,0.30475,18.55,24.45,0.75685,...,7.25,5.10,13.30,20.00,116.20,1.00,1.610613e+09,738464,1.610613e+09,0.0
24333,241.25,40.35,92.10,0.43930,10.35,33.20,0.31015,18.10,24.05,0.74675,...,7.40,5.50,13.30,19.85,115.90,2.05,1.610613e+09,738459,1.610613e+09,0.0
24334,242.50,40.55,93.00,0.43720,10.50,33.15,0.31535,18.00,24.05,0.74350,...,7.15,5.45,13.25,20.30,115.90,0.85,1.610613e+09,738457,1.610613e+09,1.0


In [7]:
# Separate the y variable, the labels
y = nba_df[['HOME_TEAM_WINS']]

# Separate the X variable, the features
X = nba_df.drop(columns= ['HOME_TEAM_WINS'])
display(y.head())
display(X.head())



Unnamed: 0,HOME_TEAM_WINS
0,0.0
1,0.0
2,1.0
3,0.0
4,0.0


Unnamed: 0,HOME_TEAM_MIN,HOME_TEAM_FGM,HOME_TEAM_FGA,HOME_TEAM_FG_PCT,HOME_TEAM_FG3M,HOME_TEAM_FG3A,HOME_TEAM_FG3_PCT,HOME_TEAM_FTM,HOME_TEAM_FTA,HOME_TEAM_FT_PCT,...,AWAY_TEAM_AST,AWAY_TEAM_STL,AWAY_TEAM_BLK,AWAY_TEAM_TOV,AWAY_TEAM_PF,AWAY_TEAM_PTS,AWAY_TEAM_PLUS_MINUS,HOME_TEAM_ID,GAME_DATE,AWAY_TEAM_ID
0,242.5,38.4,83.75,0.4605,6.55,16.15,0.41765,16.2,21.7,0.74035,...,22.15,7.35,4.7,15.85,20.4,96.1,-3.75,1610613000.0,730879,1610613000.0
1,242.5,38.15,83.6,0.45835,6.5,15.85,0.41825,16.5,21.8,0.7494,...,21.6,7.8,5.0,16.3,20.15,95.85,-2.7,1610613000.0,730875,1610613000.0
2,240.0,37.7,82.15,0.4601,6.25,15.25,0.41615,16.0,21.25,0.74755,...,21.8,7.8,5.05,16.4,20.55,96.05,-3.25,1610613000.0,730871,1610613000.0
3,240.0,38.1,82.2,0.46435,6.4,15.15,0.4259,16.1,21.45,0.74475,...,21.3,8.0,5.0,16.55,21.2,95.6,-1.5,1610613000.0,730869,1610613000.0
4,240.0,38.1,82.7,0.4614,6.35,15.35,0.419,15.95,21.05,0.75095,...,21.1,7.95,5.1,16.55,21.35,95.45,-1.5,1610613000.0,730867,1610613000.0


In [8]:
# Checking balance of target values
y.value_counts()

HOME_TEAM_WINS
1.0               14395
0.0                9941
dtype: int64

In [9]:
# Scaling the data using MinMax scaler
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Creating LogisticRegression model
lr_model = LogisticRegression(random_state=1)
# Fitting data into LR model
lr_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [10]:
# Making predictions
predictions = lr_model.predict(X_test)

In [11]:
# Displaying dataframe
y_test

Unnamed: 0,HOME_TEAM_WINS
10057,1.0
20175,1.0
8911,0.0
21093,0.0
8664,0.0
...,...
10134,0.0
2987,0.0
17507,0.0
5440,1.0


In [12]:
# Checking accuracy score
balanced_accuracy_score(y_test, predictions)

0.6266016813966215

In [13]:
# Viewing classification report
print(classification_report_imbalanced(y_test, predictions)) 


                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.62      0.43      0.82      0.51      0.60      0.34      2488
        1.0       0.68      0.82      0.43      0.74      0.60      0.37      3596

avg / total       0.66      0.66      0.59      0.65      0.60      0.36      6084



In [14]:
# Checking confusion matrix
confusion_matrix(y_test, predictions)

array([[1079, 1409],
       [ 649, 2947]])