# This will run a Random Forest model on NBA data to predict winners of NBA games


In [1]:
# Initial Imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [25]:
nba_df = pd.read_csv(Path('Resources/games.csv'),
                    parse_dates = ['GAME_DATE_EST'], 
                    infer_datetime_format= True)  
nba_df.dropna(inplace = True)
nba_df

Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,...,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2022-03-12,22101005,Final,1610612748,1610612750,2021,1610612748,104.0,0.398,0.760,...,23.0,53.0,1610612750,113.0,0.422,0.875,0.357,21.0,46.0,0
1,2022-03-12,22101006,Final,1610612741,1610612739,2021,1610612741,101.0,0.443,0.933,...,20.0,46.0,1610612739,91.0,0.419,0.824,0.208,19.0,40.0,1
2,2022-03-12,22101007,Final,1610612759,1610612754,2021,1610612759,108.0,0.412,0.813,...,28.0,52.0,1610612754,119.0,0.489,1.000,0.389,23.0,47.0,0
3,2022-03-12,22101008,Final,1610612744,1610612749,2021,1610612744,122.0,0.484,0.933,...,33.0,55.0,1610612749,109.0,0.413,0.696,0.386,27.0,39.0,1
4,2022-03-12,22101009,Final,1610612743,1610612761,2021,1610612743,115.0,0.551,0.750,...,32.0,39.0,1610612761,127.0,0.471,0.760,0.387,28.0,50.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25791,2014-10-06,11400007,Final,1610612737,1610612740,2014,1610612737,93.0,0.419,0.821,...,24.0,50.0,1610612740,87.0,0.366,0.643,0.375,17.0,43.0,1
25792,2014-10-06,11400004,Final,1610612741,1610612764,2014,1610612741,81.0,0.338,0.719,...,18.0,40.0,1610612764,85.0,0.411,0.636,0.267,17.0,47.0,0
25793,2014-10-06,11400005,Final,1610612747,1610612743,2014,1610612747,98.0,0.448,0.682,...,29.0,45.0,1610612743,95.0,0.387,0.659,0.500,19.0,43.0,1
25794,2014-10-05,11400002,Final,1610612761,1610612758,2014,1610612761,99.0,0.440,0.771,...,21.0,30.0,1610612758,94.0,0.469,0.725,0.385,18.0,45.0,1


In [26]:
nba_df.drop(columns = ['GAME_STATUS_TEXT'], inplace = True)
nba_df['GAME_DATE_EST'] = nba_df['GAME_DATE_EST'].apply(lambda x: x.toordinal())
display(nba_df.dtypes)
display(nba_df)

GAME_DATE_EST        int64
GAME_ID              int64
HOME_TEAM_ID         int64
VISITOR_TEAM_ID      int64
SEASON               int64
TEAM_ID_home         int64
PTS_home           float64
FG_PCT_home        float64
FT_PCT_home        float64
FG3_PCT_home       float64
AST_home           float64
REB_home           float64
TEAM_ID_away         int64
PTS_away           float64
FG_PCT_away        float64
FT_PCT_away        float64
FG3_PCT_away       float64
AST_away           float64
REB_away           float64
HOME_TEAM_WINS       int64
dtype: object

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,738226,22101005,1610612748,1610612750,2021,1610612748,104.0,0.398,0.760,0.333,23.0,53.0,1610612750,113.0,0.422,0.875,0.357,21.0,46.0,0
1,738226,22101006,1610612741,1610612739,2021,1610612741,101.0,0.443,0.933,0.429,20.0,46.0,1610612739,91.0,0.419,0.824,0.208,19.0,40.0,1
2,738226,22101007,1610612759,1610612754,2021,1610612759,108.0,0.412,0.813,0.324,28.0,52.0,1610612754,119.0,0.489,1.000,0.389,23.0,47.0,0
3,738226,22101008,1610612744,1610612749,2021,1610612744,122.0,0.484,0.933,0.400,33.0,55.0,1610612749,109.0,0.413,0.696,0.386,27.0,39.0,1
4,738226,22101009,1610612743,1610612761,2021,1610612743,115.0,0.551,0.750,0.407,32.0,39.0,1610612761,127.0,0.471,0.760,0.387,28.0,50.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25791,735512,11400007,1610612737,1610612740,2014,1610612737,93.0,0.419,0.821,0.421,24.0,50.0,1610612740,87.0,0.366,0.643,0.375,17.0,43.0,1
25792,735512,11400004,1610612741,1610612764,2014,1610612741,81.0,0.338,0.719,0.381,18.0,40.0,1610612764,85.0,0.411,0.636,0.267,17.0,47.0,0
25793,735512,11400005,1610612747,1610612743,2014,1610612747,98.0,0.448,0.682,0.500,29.0,45.0,1610612743,95.0,0.387,0.659,0.500,19.0,43.0,1
25794,735511,11400002,1610612761,1610612758,2014,1610612761,99.0,0.440,0.771,0.333,21.0,30.0,1610612758,94.0,0.469,0.725,0.385,18.0,45.0,1


In [27]:
y = nba_df[['HOME_TEAM_WINS']]
X = nba_df.drop(columns = ['HOME_TEAM_WINS'])
display(y.head())
display(X.head())

Unnamed: 0,HOME_TEAM_WINS
0,0
1,1
2,0
3,1
4,0


Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
0,738226,22101005,1610612748,1610612750,2021,1610612748,104.0,0.398,0.76,0.333,23.0,53.0,1610612750,113.0,0.422,0.875,0.357,21.0,46.0
1,738226,22101006,1610612741,1610612739,2021,1610612741,101.0,0.443,0.933,0.429,20.0,46.0,1610612739,91.0,0.419,0.824,0.208,19.0,40.0
2,738226,22101007,1610612759,1610612754,2021,1610612759,108.0,0.412,0.813,0.324,28.0,52.0,1610612754,119.0,0.489,1.0,0.389,23.0,47.0
3,738226,22101008,1610612744,1610612749,2021,1610612744,122.0,0.484,0.933,0.4,33.0,55.0,1610612749,109.0,0.413,0.696,0.386,27.0,39.0
4,738226,22101009,1610612743,1610612761,2021,1610612743,115.0,0.551,0.75,0.407,32.0,39.0,1610612761,127.0,0.471,0.76,0.387,28.0,50.0


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [29]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.fit_transform(X_test)

In [30]:
rf_model = RandomForestClassifier()

In [31]:
rf_model.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier()

In [32]:
predictions = rf_model.predict(X_test)

In [33]:
accuracy_score(y_test, predictions)

0.9587548638132296

In [34]:
confusion_matrix(y_test, predictions)

array([[2603,   70],
       [ 195, 3557]])

In [46]:
classification_report(y_test, predictions)


TypeError: 'str' object is not callable

In [44]:
importances = rf_model.feature_importances_
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2347595792978125, 'PTS_home'),
 (0.22202732411498943, 'PTS_away'),
 (0.11761946508373806, 'FG_PCT_home'),
 (0.11682583778870483, 'FG_PCT_away'),
 (0.04034463233992817, 'FG3_PCT_home'),
 (0.03428955728313377, 'FG3_PCT_away'),
 (0.031087689778816464, 'REB_away'),
 (0.030547290411089615, 'AST_home'),
 (0.029788296322294985, 'REB_home'),
 (0.027103528103187224, 'AST_away'),
 (0.018449290699547233, 'FT_PCT_home'),
 (0.016587800026767806, 'FT_PCT_away'),
 (0.015418591382851085, 'GAME_DATE_EST'),
 (0.014575388361216229, 'GAME_ID'),
 (0.010405108663719333, 'HOME_TEAM_ID'),
 (0.010223383332201685, 'VISITOR_TEAM_ID'),
 (0.010117686148791596, 'TEAM_ID_home'),
 (0.010067211116590037, 'TEAM_ID_away'),
 (0.009762339744619935, 'SEASON')]