## NBA Game Predictor

This project predicts the winner of a potential NBA matchup from each team's recent point differential.

First, let us install the `nba_api` package, which is used to fetch the game data.

In [1]:
%pip install nba-api




Now, we shall import the necessary packages

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# for classification
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform

# for metrics
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

import warnings
warnings.filterwarnings(action='ignore')

## Games Predictor

In [3]:
# Fetch team-level game logs through nba_api's LeagueGameFinder endpoint,
# starting from 01/01/2020.
# NOTE(review): league_id '00' is presumably the NBA — confirm against the nba_api docs.
from nba_api.stats.endpoints import leaguegamefinder
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/01/2020', league_id_nullable='00')
# The first returned frame has one row per team per game (each GAME_ID appears
# once for each of the two teams involved).
games = gamefinder.get_data_frames()[0]
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22022,1610612742,DAL,Dallas Mavericks,22200362,2022-12-06,DAL @ DEN,W,239,116,36,72,0.5,17,36,0.472,27,31,0.871,4,31,35,27,10,4,15,32,1.0
1,22022,1610612765,DET,Detroit Pistons,22200361,2022-12-06,DET @ MIA,W,240,116,43,87,0.494,19,41,0.463,11,15,0.733,9,31,40,27,8,1,12,19,20.0
2,22022,1610612739,CLE,Cleveland Cavaliers,22200360,2022-12-06,CLE vs. LAL,W,241,116,48,93,0.516,7,27,0.259,13,17,0.765,9,35,44,25,11,3,7,20,14.0
3,22022,1610612747,LAL,Los Angeles Lakers,22200360,2022-12-06,LAL @ CLE,L,240,102,38,89,0.427,6,36,0.167,20,24,0.833,14,29,43,16,4,6,16,13,-14.0
4,22022,1610612743,DEN,Denver Nuggets,22200362,2022-12-06,DEN vs. DAL,L,241,115,40,78,0.513,13,32,0.406,22,28,0.786,8,28,36,30,8,4,17,28,-1.0


In [4]:
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [5]:
# Keep only the columns the feature pipeline needs. The explicit .copy() makes
# `games` an independent frame, so the column assignments in the following cells
# write to it directly instead of to a slice of the raw API frame (which raises
# pandas' SettingWithCopyWarning, hidden here by the global warnings filter).
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

In [6]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
0,Dallas Mavericks,0022200362,2022-12-06,DAL @ DEN,W,1.0
1,Detroit Pistons,0022200361,2022-12-06,DET @ MIA,W,20.0
2,Cleveland Cavaliers,0022200360,2022-12-06,CLE vs. LAL,W,14.0
3,Los Angeles Lakers,0022200360,2022-12-06,LAL @ CLE,L,-14.0
4,Denver Nuggets,0022200362,2022-12-06,DEN vs. DAL,L,-1.0
...,...,...,...,...,...,...
7441,New York Knicks,0021900505,2020-01-01,NYK vs. POR,W,24.0
7442,Washington Wizards,0021900504,2020-01-01,WAS vs. ORL,L,-21.0
7443,Los Angeles Lakers,0021900507,2020-01-01,LAL vs. PHX,W,10.0
7444,Orlando Magic,0021900504,2020-01-01,ORL @ WAS,W,21.0


In [7]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7446 entries, 0 to 7445
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TEAM_NAME   7446 non-null   object 
 1   GAME_ID     7446 non-null   object 
 2   GAME_DATE   7446 non-null   object 
 3   MATCHUP     7446 non-null   object 
 4   WL          7446 non-null   object 
 5   PLUS_MINUS  7446 non-null   float64
dtypes: float64(1), object(5)
memory usage: 349.2+ KB


In [8]:
# GAME_DATE arrives as strings (object dtype); parse it to datetime64 so the
# chronological sort below operates on real dates.
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

In [9]:
games['GAME_DATE']

0      2022-12-06
1      2022-12-06
2      2022-12-06
3      2022-12-06
4      2022-12-06
          ...    
7441   2020-01-01
7442   2020-01-01
7443   2020-01-01
7444   2020-01-01
7445   2020-01-01
Name: GAME_DATE, Length: 7446, dtype: datetime64[ns]

In [10]:
# Order the games oldest-first; the per-team rolling average computed next
# depends on rows being in chronological order.
games = games.sort_values('GAME_DATE')
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
7445,Portland Trail Blazers,0021900505,2020-01-01,POR @ NYK,L,-24.0
7438,Milwaukee Bucks,0021900506,2020-01-01,MIL vs. MIN,W,2.0
7439,Phoenix Suns,0021900507,2020-01-01,PHX @ LAL,L,-10.0
7440,Minnesota Timberwolves,0021900506,2020-01-01,MIN @ MIL,L,-2.0
7444,Orlando Magic,0021900504,2020-01-01,ORL @ WAS,W,21.0
...,...,...,...,...,...,...
4,Denver Nuggets,0022200362,2022-12-06,DEN vs. DAL,L,-1.0
3,Los Angeles Lakers,0022200360,2022-12-06,LAL @ CLE,L,-14.0
2,Cleveland Cavaliers,0022200360,2022-12-06,CLE vs. LAL,W,14.0
1,Detroit Pistons,0022200361,2022-12-06,DET @ MIA,W,20.0


In [11]:
# Per-team rolling mean of PLUS_MINUS over a 30-game window. closed='left'
# excludes the current row from its own window, so each value is built only
# from that team's earlier games; rows without a full 30-game window are NaN.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

In [12]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
7445,Portland Trail Blazers,0021900505,2020-01-01,POR @ NYK,L,-24.0,
7438,Milwaukee Bucks,0021900506,2020-01-01,MIL vs. MIN,W,2.0,
7439,Phoenix Suns,0021900507,2020-01-01,PHX @ LAL,L,-10.0,
7440,Minnesota Timberwolves,0021900506,2020-01-01,MIN @ MIL,L,-2.0,
7444,Orlando Magic,0021900504,2020-01-01,ORL @ WAS,W,21.0,
...,...,...,...,...,...,...,...
4,Denver Nuggets,0022200362,2022-12-06,DEN vs. DAL,L,-1.0,0.513333
3,Los Angeles Lakers,0022200360,2022-12-06,LAL @ CLE,L,-14.0,-2.466667
2,Cleveland Cavaliers,0022200360,2022-12-06,CLE vs. LAL,W,14.0,3.926667
1,Detroit Pistons,0022200361,2022-12-06,DET @ MIA,W,20.0,-9.066667


In [13]:
games[games['TEAM_NAME']=='Los Angeles Lakers'].head(35)

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
7443,Los Angeles Lakers,21900507,2020-01-01,LAL vs. PHX,W,10.0,
7416,Los Angeles Lakers,21900522,2020-01-03,LAL vs. NOP,W,10.0,
7378,Los Angeles Lakers,21900538,2020-01-05,LAL vs. DET,W,7.0,
7348,Los Angeles Lakers,21900553,2020-01-07,LAL vs. NYK,W,30.0,
7304,Los Angeles Lakers,21900572,2020-01-10,LAL @ DAL,W,15.0,
7297,Los Angeles Lakers,21900580,2020-01-11,LAL @ OKC,W,15.0,
7270,Los Angeles Lakers,21900597,2020-01-13,LAL vs. CLE,W,29.0,
7239,Los Angeles Lakers,21900613,2020-01-15,LAL vs. ORL,L,-1.0,
7183,Los Angeles Lakers,21900633,2020-01-18,LAL @ HOU,W,9.0,
7171,Los Angeles Lakers,21900648,2020-01-20,LAL @ BOS,L,-32.0,


In [14]:
# Split rows by venue: a MATCHUP containing '@' (e.g. 'DAL @ DEN') is the
# visiting team's row; the remaining rows (e.g. 'DEN vs. DAL') are home rows.
msk = games['MATCHUP'].str.contains('@')
games_away = games[msk]
games_home = games[~msk]

In [15]:
games_home.shape

(3723, 7)

In [16]:
games_away.shape

(3723, 7)

In [17]:
games_home

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
7438,Milwaukee Bucks,0021900506,2020-01-01,MIL vs. MIN,W,2.0,
7442,Washington Wizards,0021900504,2020-01-01,WAS vs. ORL,L,-21.0,
7443,Los Angeles Lakers,0021900507,2020-01-01,LAL vs. PHX,W,10.0,
7441,New York Knicks,0021900505,2020-01-01,NYK vs. POR,W,24.0,
7422,Chicago Bulls,0021900511,2020-01-02,CHI vs. UTA,L,-4.0,
...,...,...,...,...,...,...,...
21,Atlanta Hawks,0022200354,2022-12-05,ATL vs. OKC,L,-7.0,0.193333
13,Memphis Grizzlies,0022200357,2022-12-05,MEM vs. MIA,W,8.0,3.000000
5,Miami Heat,0022200361,2022-12-06,MIA vs. DET,L,-20.0,0.533333
4,Denver Nuggets,0022200362,2022-12-06,DEN vs. DAL,L,-1.0,0.513333


In [18]:
games_away

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
7445,Portland Trail Blazers,0021900505,2020-01-01,POR @ NYK,L,-24.0,
7439,Phoenix Suns,0021900507,2020-01-01,PHX @ LAL,L,-10.0,
7440,Minnesota Timberwolves,0021900506,2020-01-01,MIN @ MIL,L,-2.0,
7444,Orlando Magic,0021900504,2020-01-01,ORL @ WAS,W,21.0,
7420,Memphis Grizzlies,0021900515,2020-01-02,MEM @ SAC,L,-5.0,
...,...,...,...,...,...,...,...
19,Phoenix Suns,0022200358,2022-12-05,PHX @ DAL,L,-19.0,5.100000
20,Philadelphia 76ers,0022200356,2022-12-05,PHI @ HOU,L,-9.0,1.533333
3,Los Angeles Lakers,0022200360,2022-12-06,LAL @ CLE,L,-14.0,-2.466667
1,Detroit Pistons,0022200361,2022-12-06,DET @ MIA,W,20.0,-9.066667


In [19]:
# Pair each game's home row with its away row on GAME_ID (pd.merge defaults to
# an inner join); columns present on both sides get _home / _away suffixes.
# NOTE(review): the preview's last index is 3721 while each side has 3723 rows —
# check whether some games fail to pair up.
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away
0,Milwaukee Bucks,0021900506,2020-01-01,MIL vs. MIN,W,2.0,,Minnesota Timberwolves,2020-01-01,MIN @ MIL,L,-2.0,
1,Washington Wizards,0021900504,2020-01-01,WAS vs. ORL,L,-21.0,,Orlando Magic,2020-01-01,ORL @ WAS,W,21.0,
2,Los Angeles Lakers,0021900507,2020-01-01,LAL vs. PHX,W,10.0,,Phoenix Suns,2020-01-01,PHX @ LAL,L,-10.0,
3,New York Knicks,0021900505,2020-01-01,NYK vs. POR,W,24.0,,Portland Trail Blazers,2020-01-01,POR @ NYK,L,-24.0,
4,Chicago Bulls,0021900511,2020-01-02,CHI vs. UTA,L,-4.0,,Utah Jazz,2020-01-02,UTA @ CHI,W,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3718,Atlanta Hawks,0022200354,2022-12-05,ATL vs. OKC,L,-7.0,0.193333,Oklahoma City Thunder,2022-12-05,OKC @ ATL,W,7.0,1.233333
3719,Memphis Grizzlies,0022200357,2022-12-05,MEM vs. MIA,W,8.0,3.000000,Miami Heat,2022-12-05,MIA @ MEM,L,-8.0,1.066667
3720,Miami Heat,0022200361,2022-12-06,MIA vs. DET,L,-20.0,0.533333,Detroit Pistons,2022-12-06,DET @ MIA,W,20.0,-9.066667
3721,Denver Nuggets,0022200362,2022-12-06,DEN vs. DAL,L,-1.0,0.513333,Dallas Mavericks,2022-12-06,DAL @ DEN,W,1.0,2.100000


In [20]:
# Single model feature: the home team's 30-game average margin minus the away
# team's. NaN whenever either side lacks a full 30-game history.
games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [21]:
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away,avg_30_plus_minus_diff
0,Milwaukee Bucks,0021900506,2020-01-01,MIL vs. MIN,W,2.0,,Minnesota Timberwolves,2020-01-01,MIN @ MIL,L,-2.0,,
1,Washington Wizards,0021900504,2020-01-01,WAS vs. ORL,L,-21.0,,Orlando Magic,2020-01-01,ORL @ WAS,W,21.0,,
2,Los Angeles Lakers,0021900507,2020-01-01,LAL vs. PHX,W,10.0,,Phoenix Suns,2020-01-01,PHX @ LAL,L,-10.0,,
3,New York Knicks,0021900505,2020-01-01,NYK vs. POR,W,24.0,,Portland Trail Blazers,2020-01-01,POR @ NYK,L,-24.0,,
4,Chicago Bulls,0021900511,2020-01-02,CHI vs. UTA,L,-4.0,,Utah Jazz,2020-01-02,UTA @ CHI,W,4.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3718,Atlanta Hawks,0022200354,2022-12-05,ATL vs. OKC,L,-7.0,0.193333,Oklahoma City Thunder,2022-12-05,OKC @ ATL,W,7.0,1.233333,-1.040000
3719,Memphis Grizzlies,0022200357,2022-12-05,MEM vs. MIA,W,8.0,3.000000,Miami Heat,2022-12-05,MIA @ MEM,L,-8.0,1.066667,1.933333
3720,Miami Heat,0022200361,2022-12-06,MIA vs. DET,L,-20.0,0.533333,Detroit Pistons,2022-12-06,DET @ MIA,W,20.0,-9.066667,9.600000
3721,Denver Nuggets,0022200362,2022-12-06,DEN vs. DAL,L,-1.0,0.513333,Dallas Mavericks,2022-12-06,DAL @ DEN,W,1.0,2.100000,-1.586667


In [22]:
games_merged[['WL_home', 'avg_30_plus_minus_diff']]

Unnamed: 0,WL_home,avg_30_plus_minus_diff
0,W,
1,L,
2,W,
3,W,
4,L,
...,...,...
3718,L,-1.040000
3719,W,1.933333
3720,L,9.600000
3721,L,-1.586667


In [23]:
# Modelling frame: the label (WL_home) plus the single feature, with every row
# that has a missing value in either column dropped.
games_model = games_merged[['WL_home', 'avg_30_plus_minus_diff']].dropna()
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
445,W,-1.800000
450,L,0.733333
453,L,-6.200000
458,L,-4.433333
459,W,-0.466667
...,...,...
3718,L,-1.040000
3719,W,1.933333
3720,L,9.600000
3721,L,-1.586667


In [24]:
# Encode the label as 1 = home win, 0 = home loss.
# The letters are re-read from games_merged (aligned on the shared index) and
# attached with .assign() instead of being mapped in place. This keeps the cell
# idempotent: mapping an already-encoded 0/1 column through {'W': 1, 'L': 0}
# on a second run would silently turn every label into NaN.
games_model = games_model.assign(
    WL_home=games_merged.loc[games_model.index, 'WL_home'].map({'W': 1, 'L': 0})
)

In [25]:
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
445,1,-1.800000
450,0,0.733333
453,0,-6.200000
458,0,-4.433333
459,1,-0.466667
...,...,...
3718,0,-1.040000
3719,1,1.933333
3720,0,9.600000
3721,0,-1.586667


## Building the Model

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# 80/20 split, stratified on the label so both parts keep the same home-win
# rate; random_state fixes the shuffle for reproducibility.
# NOTE(review): the split is shuffled, so test games are not strictly later than
# training games — confirm this is acceptable for a time-ordered dataset.
df_train, df_test = train_test_split(games_model, stratify=games_model['WL_home'], test_size=0.2, random_state=7)

In [28]:
df_train.shape

(2599, 2)

In [29]:
df_test.shape

(650, 2)

In [30]:
# Separate the label column from the feature column(s) for both splits.
target = 'WL_home'
X_train = df_train.drop(columns=target)
y_train = df_train[target]

X_test = df_test.drop(columns=target)
y_test= df_test[target]

Let us fit an XGBoost classifier as a baseline and then tune its hyperparameters to see which configuration gives the highest accuracy.

In [31]:
# Baseline XGBoost classifier with default hyperparameters and a fixed seed.
# NOTE(review): use_label_encoder is deprecated in recent xgboost releases —
# confirm it is still needed for the installed version.
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, random_state=7)

In [32]:
# Fit the baseline on the training split.
xgb_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=7, ...)

In [33]:
# Hard 0/1 home-win predictions for the held-out games.
y_pred = xgb_clf.predict(X_test)

In [34]:
# Share of held-out games whose winner the baseline predicted correctly.
xgb_score = accuracy_score(y_test, y_pred)
xgb_score

0.5892307692307692

In [35]:
# Search space for the randomized search: learning_rate is sampled
# log-uniformly between 1e-4 and 1; the other parameters are drawn from the
# listed values.
xgb_hyp_params = {'learning_rate': loguniform(0.0001, 1),
              'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
              'subsample': [0.7, 0.8, 0.9, 1.0],
              'n_estimators': [50, 100, 150, 200]}

In [36]:
# Randomized search: 20 sampled configurations, each scored by 7-fold
# cross-validated accuracy; random_state makes the sampling reproducible.
random_hyp = RandomizedSearchCV(estimator=xgb_clf, 
                                param_distributions=xgb_hyp_params, 
                                n_iter=20, 
                                cv=7,
                                scoring='accuracy',
                                random_state=7)

In [37]:
# Run the search on the training split only; the test split stays untouched.
random_hyp.fit(X_train, y_train)

RandomizedSearchCV(cv=7,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=0, gpu_id=-1,
                                           grow_policy='depthwise',
                                           importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300...
                                           monotone_constraints='()',
                                           n_estimators=100, n_jobs=

In [38]:
random_hyp.best_params_

{'learning_rate': 0.0002019450244810229,
 'max_depth': 3,
 'n_estimators': 150,
 'subsample': 1.0}

In [39]:
# The estimator configured with the best parameters found by the search.
model_hyp = random_hyp.best_estimator_

In [40]:
# Held-out predictions from the tuned model.
y_pred_hyp = model_hyp.predict(X_test)

In [41]:
# Held-out accuracy of the tuned model (stored, but not displayed by this cell).
rs_score = accuracy_score(y_test, y_pred_hyp)

In [42]:
# Persist the tuned model to disk and load it straight back; `model_saved` is
# the object the prediction cells below use.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
from joblib import dump, load
dump(model_hyp, 'model_nba.joblib') 
model_saved = load('model_nba.joblib') 

In [43]:
accuracy_score(y_test, model_saved.predict(X_test))

0.64

In [44]:
# Rebuild the matchup features end to end in a single cell: fetch the game
# logs, compute each team's trailing 30-game average margin, pair home and
# away rows, and take the difference.
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd

gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/01/2020', league_id_nullable='00')
games = gamefinder.get_data_frames()[0]

# .copy() gives an independent frame, so the column assignments below write to
# `games` itself rather than to a slice of the raw API frame (which would raise
# pandas' SettingWithCopyWarning).
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Chronological order is required by the rolling window below.
games = games.sort_values('GAME_DATE')

# closed='left' keeps the current game out of its own 30-game average.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

# '@' marks the visiting team's row; every other row is a home row.
msk = games['MATCHUP'].str.contains('@')
games_away = games[msk]
games_home = games[~msk]

games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))

games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [45]:
# Worked example of a single prediction: Toronto at home against Boston.
team_home='Toronto Raptors'
team_away='Boston Celtics'

import numpy as np
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='01/01/2020',
                                           league_id_nullable='00')
games = gamefinder.get_data_frames()[0]
# .copy() so the GAME_DATE assignment below writes to an independent frame
# instead of a slice of the raw API frame.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Average margin over each team's 30 most recent games.
msk_home = (games['TEAM_NAME'] == team_home)
games_30_home = games[msk_home].sort_values('GAME_DATE').tail(30)
home_plus_minus = games_30_home['PLUS_MINUS'].mean()

msk_away = (games['TEAM_NAME'] == team_away)
games_30_away = games[msk_away].sort_values('GAME_DATE').tail(30)
away_plus_minus = games_30_away['PLUS_MINUS'].mean()

games_diff=home_plus_minus - away_plus_minus

# The model expects a 2-D (n_samples, n_features) input: one row, one feature.
predict_home_win=model_saved.predict(np.array([[games_diff]]))[0]
# Column 1 of predict_proba is the probability of class 1 (home win).
predict_winning_probability=model_saved.predict_proba(np.array([[games_diff]]))[0][1]

In [46]:
def predict_games(team_home, team_away, n_games=30, date_from='01/01/2020',
                  games=None, model=None):
    """Predict whether the home team wins a matchup.

    The single model feature is the difference between the two teams' mean
    PLUS_MINUS over their most recent ``n_games`` games (home minus away).

    Parameters
    ----------
    team_home, team_away : str
        Team names exactly as they appear in the TEAM_NAME column
        (e.g. 'Boston Celtics').
    n_games : int, default 30
        Number of most recent games averaged for each team.
    date_from : str, default '01/01/2020'
        Start date passed to LeagueGameFinder when ``games`` is not supplied.
    games : pandas.DataFrame, optional
        Pre-fetched game log with at least TEAM_NAME, GAME_DATE and
        PLUS_MINUS columns. When omitted, the log is fetched through nba_api
        on every call. The frame passed in is not modified.
    model : estimator, optional
        Object exposing ``predict`` and ``predict_proba``. Defaults to the
        notebook-level ``model_saved``.

    Returns
    -------
    tuple
        ``(predict_home_win, predict_winning_probability)``: the predicted
        class (1 = home win) and the predicted probability of class 1.

    Raises
    ------
    ValueError
        If ``n_games`` is smaller than 1, or if either team has no games with
        a recorded PLUS_MINUS in the game log.
    """
    if n_games < 1:
        raise ValueError(f"n_games must be at least 1, got {n_games!r}")

    if games is None:
        gamefinder = leaguegamefinder.LeagueGameFinder(
            date_from_nullable=date_from,
            league_id_nullable='00')
        games = gamefinder.get_data_frames()[0]
    if model is None:
        model = model_saved

    # Work on a fresh frame: rows without a recorded margin are dropped so the
    # window always holds n_games usable games when that many exist, and
    # .assign() avoids writing into a slice of the caller's frame.
    games = games[['TEAM_NAME', 'GAME_DATE', 'PLUS_MINUS']].dropna(
        subset=['PLUS_MINUS'])
    games = games.assign(GAME_DATE=pd.to_datetime(games['GAME_DATE']))

    def _recent_plus_minus(team_name):
        """Mean PLUS_MINUS over the team's most recent n_games games."""
        team_games = games[games['TEAM_NAME'] == team_name]
        if team_games.empty:
            # Without this check the mean would be NaN and the model would
            # silently return a meaningless prediction.
            raise ValueError(f"No games found for team {team_name!r}")
        recent = team_games.sort_values('GAME_DATE').tail(n_games)
        return recent['PLUS_MINUS'].mean()

    games_diff = _recent_plus_minus(team_home) - _recent_plus_minus(team_away)

    # Estimators expect a 2-D (n_samples, n_features) array: one row, one feature.
    features = np.array([[games_diff]])
    predict_home_win = model.predict(features)[0]
    # Column 1 of predict_proba is the probability of class 1 (home win).
    predict_winning_probability = model.predict_proba(features)[0][1]
    return predict_home_win, predict_winning_probability

In [47]:
predict_games('Boston Celtics','Toronto Raptors')

(1, 0.5068216)