# This will run a Random Forest model on NBA data to predict winners of NBA games


In [1]:
# Initial Imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [16]:
nba_df = pd.read_csv(Path('Resources/nba_game_data.csv'),
                    parse_dates = ['GAME_DATE_EST'], 
                    infer_datetime_format= True)  
nba_df.dropna(inplace = True)
nba_df

Unnamed: 0.1,Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,TEAM_ID_away,HOME_TEAM_WINS,PTS_home,...,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
0,18417,2003-10-08,10300011,1610612737,1610612739,2003,1610612737,1610612739,0,106.20,...,0.75900,0.30090,22.25,46.85,108.85,0.44225,0.77540,0.36805,23.35,46.45
1,17706,2004-01-23,20300611,1610612737,1610612752,2003,1610612737,1610612752,0,93.10,...,0.79300,0.36380,20.25,43.20,89.60,0.42610,0.71285,0.29690,19.55,40.45
2,17680,2004-01-27,20300642,1610612737,1610612756,2003,1610612737,1610612756,0,93.95,...,0.77695,0.36315,20.40,42.90,90.35,0.42740,0.71715,0.31035,19.70,40.05
3,17664,2004-01-29,20300658,1610612737,1610612757,2003,1610612737,1610612757,0,92.75,...,0.76265,0.35945,19.90,43.30,90.15,0.42745,0.71715,0.29785,20.05,39.75
4,17646,2004-01-31,20300672,1610612737,1610612749,2003,1610612737,1610612749,1,92.20,...,0.75585,0.35620,20.15,43.55,89.05,0.42365,0.72145,0.28680,20.15,40.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24570,111,2022-02-25,22100896,1610612766,1610612761,2021,1610612766,1610612761,1,106.05,...,0.78575,0.35080,24.25,43.40,108.90,0.47170,0.79275,0.36455,23.45,44.65
24571,99,2022-02-27,22100915,1610612766,1610612765,2021,1610612766,1610612765,0,105.80,...,0.77465,0.35155,24.50,42.65,108.15,0.46880,0.79010,0.36275,23.25,45.60
24572,54,2022-03-05,22100955,1610612766,1610612759,2021,1610612766,1610612759,1,105.00,...,0.77300,0.35155,24.10,42.45,107.65,0.46500,0.79570,0.35785,22.85,45.80
24573,31,2022-03-08,22100975,1610612766,1610612751,2021,1610612766,1610612751,0,105.90,...,0.76050,0.35850,24.45,42.15,109.35,0.46680,0.79430,0.36500,23.20,45.95


In [17]:
nba_df = nba_df.drop(columns = ['Unnamed: 0'])
nba_df['GAME_DATE_EST'] = nba_df['GAME_DATE_EST'].apply(lambda x: x.toordinal())
display(nba_df.dtypes)
display(nba_df)

GAME_DATE_EST        int64
GAME_ID              int64
HOME_TEAM_ID         int64
VISITOR_TEAM_ID      int64
SEASON               int64
TEAM_ID_home         int64
TEAM_ID_away         int64
HOME_TEAM_WINS       int64
PTS_home           float64
FG_PCT_home        float64
FT_PCT_home        float64
FG3_PCT_home       float64
AST_home           float64
REB_home           float64
PTS_away           float64
FG_PCT_away        float64
FT_PCT_away        float64
FG3_PCT_away       float64
AST_away           float64
REB_away           float64
dtype: object

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,TEAM_ID_away,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
0,731496,10300011,1610612737,1610612739,2003,1610612737,1610612739,0,106.20,0.43210,0.75900,0.30090,22.25,46.85,108.85,0.44225,0.77540,0.36805,23.35,46.45
1,731603,20300611,1610612737,1610612752,2003,1610612737,1610612752,0,93.10,0.44685,0.79300,0.36380,20.25,43.20,89.60,0.42610,0.71285,0.29690,19.55,40.45
2,731607,20300642,1610612737,1610612756,2003,1610612737,1610612756,0,93.95,0.45340,0.77695,0.36315,20.40,42.90,90.35,0.42740,0.71715,0.31035,19.70,40.05
3,731609,20300658,1610612737,1610612757,2003,1610612737,1610612757,0,92.75,0.44780,0.76265,0.35945,19.90,43.30,90.15,0.42745,0.71715,0.29785,20.05,39.75
4,731611,20300672,1610612737,1610612749,2003,1610612737,1610612749,1,92.20,0.44615,0.75585,0.35620,20.15,43.55,89.05,0.42365,0.72145,0.28680,20.15,40.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24570,738211,22100896,1610612766,1610612761,2021,1610612766,1610612761,1,106.05,0.44970,0.78575,0.35080,24.25,43.40,108.90,0.47170,0.79275,0.36455,23.45,44.65
24571,738213,22100915,1610612766,1610612765,2021,1610612766,1610612765,0,105.80,0.44800,0.77465,0.35155,24.50,42.65,108.15,0.46880,0.79010,0.36275,23.25,45.60
24572,738219,22100955,1610612766,1610612759,2021,1610612766,1610612759,1,105.00,0.44480,0.77300,0.35155,24.10,42.45,107.65,0.46500,0.79570,0.35785,22.85,45.80
24573,738222,22100975,1610612766,1610612751,2021,1610612766,1610612751,0,105.90,0.44815,0.76050,0.35850,24.45,42.15,109.35,0.46680,0.79430,0.36500,23.20,45.95


In [18]:
y = nba_df[['HOME_TEAM_WINS']]
X = nba_df.drop(columns = ['HOME_TEAM_WINS'])
display(y.head())
display(X.head())

Unnamed: 0,HOME_TEAM_WINS
0,0
1,0
2,0
3,0
4,1


Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,TEAM_ID_away,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
0,731496,10300011,1610612737,1610612739,2003,1610612737,1610612739,106.2,0.4321,0.759,0.3009,22.25,46.85,108.85,0.44225,0.7754,0.36805,23.35,46.45
1,731603,20300611,1610612737,1610612752,2003,1610612737,1610612752,93.1,0.44685,0.793,0.3638,20.25,43.2,89.6,0.4261,0.71285,0.2969,19.55,40.45
2,731607,20300642,1610612737,1610612756,2003,1610612737,1610612756,93.95,0.4534,0.77695,0.36315,20.4,42.9,90.35,0.4274,0.71715,0.31035,19.7,40.05
3,731609,20300658,1610612737,1610612757,2003,1610612737,1610612757,92.75,0.4478,0.76265,0.35945,19.9,43.3,90.15,0.42745,0.71715,0.29785,20.05,39.75
4,731611,20300672,1610612737,1610612749,2003,1610612737,1610612749,92.2,0.44615,0.75585,0.3562,20.15,43.55,89.05,0.42365,0.72145,0.2868,20.15,40.05


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [20]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.fit_transform(X_test)

In [21]:
rf_model = RandomForestClassifier()

In [22]:
rf_model.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier()

In [23]:
predictions = rf_model.predict(X_test)

In [24]:
accuracy_score(y_test, predictions)

0.6002604166666666

In [25]:
confusion_matrix(y_test, predictions)

array([[ 895, 1635],
       [ 821, 2793]])

In [26]:
classification_report(y_test, predictions)


'              precision    recall  f1-score   support\n\n           0       0.52      0.35      0.42      2530\n           1       0.63      0.77      0.69      3614\n\n    accuracy                           0.60      6144\n   macro avg       0.58      0.56      0.56      6144\nweighted avg       0.59      0.60      0.58      6144\n'

In [27]:
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.06315160578429671, 'FT_PCT_away'),
 (0.0625479202158116, 'FT_PCT_home'),
 (0.06235966202566463, 'FG3_PCT_home'),
 (0.0623105303483743, 'FG_PCT_away'),
 (0.06158635660427435, 'FG3_PCT_away'),
 (0.061008481017520294, 'FG_PCT_home'),
 (0.05889385215908903, 'PTS_home'),
 (0.05863119897419642, 'GAME_ID'),
 (0.058583009662041746, 'GAME_DATE_EST'),
 (0.05809367514417547, 'REB_home'),
 (0.058061066502378814, 'PTS_away'),
 (0.0574477513350369, 'AST_away'),
 (0.05678682106482605, 'REB_away'),
 (0.05635529051489823, 'AST_home'),
 (0.04011769375137078, 'VISITOR_TEAM_ID'),
 (0.03927660817939124, 'TEAM_ID_away'),
 (0.03205178404178474, 'TEAM_ID_home'),
 (0.031966614603236396, 'HOME_TEAM_ID'),
 (0.02077007807163237, 'SEASON')]