## NBA Game Predictor

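This project predicts the winner of a potential NBA matchup from the difference between the two teams' average point differential (plus/minus) over their previous 30 games.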

First, let us install the `nba_api` package, which is used to fetch game data throughout this notebook.

In [1]:
%pip install nba-api

Note: you may need to restart the kernel to use updated packages.


Now, we shall import the necessary packages

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# for classification
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

# for metrics
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

import warnings
warnings.filterwarnings(action='ignore')

## Games Predictor

In [3]:
# Fetch the game log for every game from 10/18/2022 onward. The result has one
# row per team per game, so each GAME_ID appears once for each participant.
# NOTE(review): league_id "00" is presumably the NBA — confirm against the nba_api docs.
from nba_api.stats.endpoints import leaguegamefinder
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable="10/18/2022",league_id_nullable="00")
# The endpoint returns a list of data frames; the first one is the game log.
games = gamefinder.get_data_frames()[0]
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,22023,1610612766,CHA,Charlotte Hornets,22300745,2024-02-09,CHA @ MIL,L,239,84,27,79,0.342,7,29,0.241,23,29,0.793,8,40,48,17,2,2,13,20,-36.0
1,22023,1610612740,NOP,New Orleans Pelicans,22300747,2024-02-09,NOP @ LAL,L,241,122,46,89,0.517,11,29,0.379,19,23,0.826,10,31,41,28,5,7,13,22,-17.0
2,22023,1610612761,TOR,Toronto Raptors,22300744,2024-02-09,TOR vs. HOU,W,240,107,40,95,0.421,10,33,0.303,17,19,0.895,16,25,41,24,14,8,11,17,3.0
3,22023,1610612758,SAC,Sacramento Kings,22300746,2024-02-09,SAC vs. DEN,W,242,135,51,87,0.586,17,34,0.5,16,18,0.889,6,36,42,32,11,1,16,22,29.0
4,22023,1610612745,HOU,Houston Rockets,22300744,2024-02-09,HOU @ TOR,L,240,104,38,85,0.447,15,38,0.395,13,16,0.813,12,36,48,22,5,3,21,20,-3.0


In [4]:
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [5]:
# Keep only the columns the model needs. The explicit .copy() makes `games` an
# independent frame, so the column assignments in later cells modify it
# directly instead of writing into a selection of the raw API result
# (which pandas reports as a SettingWithCopyWarning).
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

In [6]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
0,Charlotte Hornets,0022300745,2024-02-09,CHA @ MIL,L,-36.0
1,New Orleans Pelicans,0022300747,2024-02-09,NOP @ LAL,L,-17.0
2,Toronto Raptors,0022300744,2024-02-09,TOR vs. HOU,W,3.0
3,Sacramento Kings,0022300746,2024-02-09,SAC vs. DEN,W,29.0
4,Houston Rockets,0022300744,2024-02-09,HOU @ TOR,L,-3.0
...,...,...,...,...,...,...
4344,Atlanta Hawks,0022200005,2022-10-19,ATL vs. HOU,W,10.0
4345,Boston Celtics,0022200001,2022-10-18,BOS vs. PHI,W,9.0
4346,Golden State Warriors,0022200002,2022-10-18,GSW vs. LAL,W,14.0
4347,Los Angeles Lakers,0022200002,2022-10-18,LAL @ GSW,L,-14.0


In [7]:
# Check dtypes and missing values before feature engineering; GAME_DATE is
# still stored as strings (object dtype) at this point and is parsed next.
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4349 entries, 0 to 4348
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TEAM_NAME   4349 non-null   object 
 1   GAME_ID     4349 non-null   object 
 2   GAME_DATE   4349 non-null   object 
 3   MATCHUP     4349 non-null   object 
 4   WL          4348 non-null   object 
 5   PLUS_MINUS  4349 non-null   float64
dtypes: float64(1), object(5)
memory usage: 204.0+ KB


In [8]:
# Parse the date strings into datetimes so games can be ordered chronologically.
games = games.assign(GAME_DATE=pd.to_datetime(games['GAME_DATE']))

In [9]:
games['GAME_DATE']

0      2024-02-09
1      2024-02-09
2      2024-02-09
3      2024-02-09
4      2024-02-09
          ...    
4344   2022-10-19
4345   2022-10-18
4346   2022-10-18
4347   2022-10-18
4348   2022-10-18
Name: GAME_DATE, Length: 4349, dtype: datetime64[ns]

In [10]:
# Order rows chronologically: the per-team rolling average computed in the
# next step relies on each team's games appearing oldest-first.
games = games.sort_values('GAME_DATE')
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
4348,Philadelphia 76ers,0022200001,2022-10-18,PHI @ BOS,L,-9.0
4345,Boston Celtics,0022200001,2022-10-18,BOS vs. PHI,W,9.0
4347,Los Angeles Lakers,0022200002,2022-10-18,LAL @ GSW,L,-14.0
4346,Golden State Warriors,0022200002,2022-10-18,GSW vs. LAL,W,14.0
4344,Atlanta Hawks,0022200005,2022-10-19,ATL vs. HOU,W,10.0
...,...,...,...,...,...,...
8,Atlanta Hawks,0022300742,2024-02-09,ATL @ PHI,W,6.0
10,Los Angeles Lakers,0022300747,2024-02-09,LAL vs. NOP,W,17.0
11,Denver Nuggets,0022300746,2024-02-09,DEN @ SAC,L,-29.0
6,Philadelphia 76ers,0022300742,2024-02-09,PHI vs. ATL,L,-6.0


In [11]:
# Per-team rolling mean of PLUS_MINUS over a 30-game window. closed='left'
# leaves the current row out of its own window, so the feature for a game is
# built only from that team's earlier games and never from the game's own
# result. Rows without a full window of prior games come out as NaN.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

In [12]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
4348,Philadelphia 76ers,0022200001,2022-10-18,PHI @ BOS,L,-9.0,
4345,Boston Celtics,0022200001,2022-10-18,BOS vs. PHI,W,9.0,
4347,Los Angeles Lakers,0022200002,2022-10-18,LAL @ GSW,L,-14.0,
4346,Golden State Warriors,0022200002,2022-10-18,GSW vs. LAL,W,14.0,
4344,Atlanta Hawks,0022200005,2022-10-19,ATL vs. HOU,W,10.0,
...,...,...,...,...,...,...,...
8,Atlanta Hawks,0022300742,2024-02-09,ATL @ PHI,W,6.0,-3.933333
10,Los Angeles Lakers,0022300747,2024-02-09,LAL vs. NOP,W,17.0,-2.700000
11,Denver Nuggets,0022300746,2024-02-09,DEN @ SAC,L,-29.0,4.833333
6,Philadelphia 76ers,0022300742,2024-02-09,PHI vs. ATL,L,-6.0,3.400000


In [13]:
# Spot-check one team: its first 30 games have no full window of earlier
# games, so avg_30_plus_minus is expected to be NaN for those rows.
games[games['TEAM_NAME']=='Los Angeles Lakers'].head(35)

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
4347,Los Angeles Lakers,22200002,2022-10-18,LAL @ GSW,L,-14.0,
4317,Los Angeles Lakers,22200016,2022-10-20,LAL vs. LAC,L,-6.0,
4264,Los Angeles Lakers,22200037,2022-10-23,LAL vs. POR,L,-2.0,
4231,Los Angeles Lakers,22200064,2022-10-26,LAL @ DEN,L,-11.0,
4210,Los Angeles Lakers,22200076,2022-10-28,LAL @ MIN,L,-9.0,
4166,Los Angeles Lakers,22200095,2022-10-30,LAL vs. DEN,W,11.0,
4130,Los Angeles Lakers,22200117,2022-11-02,LAL vs. NOP,W,3.0,
4103,Los Angeles Lakers,22200131,2022-11-04,LAL vs. UTA,L,-14.0,
4063,Los Angeles Lakers,22200140,2022-11-06,LAL vs. CLE,L,-14.0,
4051,Los Angeles Lakers,22200157,2022-11-07,LAL @ UTA,L,-23.0,


In [14]:
# MATCHUP reads "AAA @ BBB" on the visiting team's row and "AAA vs. BBB" on the
# host's row, so the '@' marker separates away rows from home rows.
msk = games['MATCHUP'].str.contains('@')
games_home, games_away = games.loc[~msk], games.loc[msk]

In [15]:
games_home.shape

(2175, 7)

In [16]:
games_away.shape

(2174, 7)

In [17]:
games_home

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
4345,Boston Celtics,0022200001,2022-10-18,BOS vs. PHI,W,9.0,
4346,Golden State Warriors,0022200002,2022-10-18,GSW vs. LAL,W,14.0,
4344,Atlanta Hawks,0022200005,2022-10-19,ATL vs. HOU,W,10.0,
4326,Indiana Pacers,0022200004,2022-10-19,IND vs. WAS,L,-7.0,
4327,Memphis Grizzlies,0022200009,2022-10-19,MEM vs. NYK,W,3.0,
...,...,...,...,...,...,...,...
3,Sacramento Kings,0022300746,2024-02-09,SAC vs. DEN,W,29.0,0.593333
5,Milwaukee Bucks,0022300745,2024-02-09,MIL vs. CHA,W,36.0,2.633333
9,Boston Celtics,0022300743,2024-02-09,BOS vs. WAS,W,4.0,10.400000
10,Los Angeles Lakers,0022300747,2024-02-09,LAL vs. NOP,W,17.0,-2.700000


In [18]:
games_away

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
4348,Philadelphia 76ers,0022200001,2022-10-18,PHI @ BOS,L,-9.0,
4347,Los Angeles Lakers,0022200002,2022-10-18,LAL @ GSW,L,-14.0,
4321,Houston Rockets,0022200005,2022-10-19,HOU @ ATL,L,-10.0,
4322,Charlotte Hornets,0022200011,2022-10-19,CHA @ SAS,W,27.0,
4323,Dallas Mavericks,0022200013,2022-10-19,DAL @ PHX,L,-2.0,
...,...,...,...,...,...,...,...
4,Houston Rockets,0022300744,2024-02-09,HOU @ TOR,L,-3.0,-0.866667
7,Washington Wizards,0022300743,2024-02-09,WAS @ BOS,L,-4.0,-10.100000
8,Atlanta Hawks,0022300742,2024-02-09,ATL @ PHI,W,6.0,-3.933333
11,Denver Nuggets,0022300746,2024-02-09,DEN @ SAC,L,-29.0,4.833333


In [19]:
# Pair each game's home row with its away row. pd.merge defaults to an inner
# join, so a GAME_ID that is missing either side is dropped. Columns present
# on both sides get the _home / _away suffixes.
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away
0,Boston Celtics,0022200001,2022-10-18,BOS vs. PHI,W,9.0,,Philadelphia 76ers,2022-10-18,PHI @ BOS,L,-9.0,
1,Golden State Warriors,0022200002,2022-10-18,GSW vs. LAL,W,14.0,,Los Angeles Lakers,2022-10-18,LAL @ GSW,L,-14.0,
2,Atlanta Hawks,0022200005,2022-10-19,ATL vs. HOU,W,10.0,,Houston Rockets,2022-10-19,HOU @ ATL,L,-10.0,
3,Indiana Pacers,0022200004,2022-10-19,IND vs. WAS,L,-7.0,,Washington Wizards,2022-10-19,WAS @ IND,W,7.0,
4,Memphis Grizzlies,0022200009,2022-10-19,MEM vs. NYK,W,3.0,,New York Knicks,2022-10-19,NYK @ MEM,L,-3.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2170,Sacramento Kings,0022300746,2024-02-09,SAC vs. DEN,W,29.0,0.593333,Denver Nuggets,2024-02-09,DEN @ SAC,L,-29.0,4.833333
2171,Milwaukee Bucks,0022300745,2024-02-09,MIL vs. CHA,W,36.0,2.633333,Charlotte Hornets,2024-02-09,CHA @ MIL,L,-36.0,-14.233333
2172,Boston Celtics,0022300743,2024-02-09,BOS vs. WAS,W,4.0,10.400000,Washington Wizards,2024-02-09,WAS @ BOS,L,-4.0,-10.100000
2173,Los Angeles Lakers,0022300747,2024-02-09,LAL vs. NOP,W,17.0,-2.700000,New Orleans Pelicans,2024-02-09,NOP @ LAL,L,-17.0,6.933333


In [20]:
# The single model feature: home team's rolling plus/minus minus the away
# team's. Positive values mean the home team has had the better recent margin.
games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [21]:
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away,avg_30_plus_minus_diff
0,Boston Celtics,0022200001,2022-10-18,BOS vs. PHI,W,9.0,,Philadelphia 76ers,2022-10-18,PHI @ BOS,L,-9.0,,
1,Golden State Warriors,0022200002,2022-10-18,GSW vs. LAL,W,14.0,,Los Angeles Lakers,2022-10-18,LAL @ GSW,L,-14.0,,
2,Atlanta Hawks,0022200005,2022-10-19,ATL vs. HOU,W,10.0,,Houston Rockets,2022-10-19,HOU @ ATL,L,-10.0,,
3,Indiana Pacers,0022200004,2022-10-19,IND vs. WAS,L,-7.0,,Washington Wizards,2022-10-19,WAS @ IND,W,7.0,,
4,Memphis Grizzlies,0022200009,2022-10-19,MEM vs. NYK,W,3.0,,New York Knicks,2022-10-19,NYK @ MEM,L,-3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2170,Sacramento Kings,0022300746,2024-02-09,SAC vs. DEN,W,29.0,0.593333,Denver Nuggets,2024-02-09,DEN @ SAC,L,-29.0,4.833333,-4.240000
2171,Milwaukee Bucks,0022300745,2024-02-09,MIL vs. CHA,W,36.0,2.633333,Charlotte Hornets,2024-02-09,CHA @ MIL,L,-36.0,-14.233333,16.866667
2172,Boston Celtics,0022300743,2024-02-09,BOS vs. WAS,W,4.0,10.400000,Washington Wizards,2024-02-09,WAS @ BOS,L,-4.0,-10.100000,20.500000
2173,Los Angeles Lakers,0022300747,2024-02-09,LAL vs. NOP,W,17.0,-2.700000,New Orleans Pelicans,2024-02-09,NOP @ LAL,L,-17.0,6.933333,-9.633333


In [22]:
games_merged[['WL_home', 'avg_30_plus_minus_diff']]

Unnamed: 0,WL_home,avg_30_plus_minus_diff
0,W,
1,W,
2,W,
3,L,
4,W,
...,...,...
2170,W,-4.240000
2171,W,16.866667
2172,W,20.500000
2173,W,-9.633333


In [23]:
# Keep just the label (WL_home) and the feature, and drop rows where either is
# missing — e.g. games where a team does not yet have 30 earlier games, or a
# row with no W/L recorded.
games_model = games_merged[['WL_home', 'avg_30_plus_minus_diff']].dropna()
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
445,L,10.366667
448,L,-8.666667
451,W,3.213333
454,W,-2.500000
455,W,5.593333
...,...,...
2170,W,-4.240000
2171,W,16.866667
2172,W,20.500000
2173,W,-9.633333


In [24]:
# Encode the label as 1 = home win, 0 = home loss. The mapping also accepts
# already-encoded 1/0 values, so re-running this cell leaves the column intact
# instead of turning every label into NaN. .assign builds a new frame rather
# than writing into the result of the earlier column selection.
games_model = games_model.assign(
    WL_home=games_model['WL_home'].map({'W': 1, 'L': 0, 1: 1, 0: 0})
)

In [25]:
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
445,0,10.366667
448,0,-8.666667
451,1,3.213333
454,1,-2.500000
455,1,5.593333
...,...,...
2170,1,-4.240000
2171,1,16.866667
2172,1,20.500000
2173,1,-9.633333


## Building the Model

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the games for testing. Stratifying on the label keeps the
# home-win rate the same in both parts; the fixed random_state makes the split
# reproducible.
df_train, df_test = train_test_split(games_model, stratify=games_model['WL_home'], test_size=0.2, random_state=7)

In [28]:
df_train.shape

(1361, 2)

In [29]:
df_test.shape

(341, 2)

In [30]:
# Separate the label column from the feature column(s) for each split.
target = 'WL_home'
X_train, y_train = df_train.drop(columns=[target]), df_train[target]
X_test, y_test = df_test.drop(columns=[target]), df_test[target]

We start with an XGBoost classifier using its default hyperparameters as a baseline, then tune it with a randomized search to see whether accuracy improves.

In [31]:
# Baseline classifier: XGBoost with default hyperparameters and a fixed seed.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases —
# confirm whether the installed version still needs it.
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, random_state=7)

In [32]:
xgb_clf.fit(X_train, y_train)

In [33]:
y_pred = xgb_clf.predict(X_test)

In [34]:
# Baseline accuracy on the held-out test set, kept for comparison with the
# tuned model below.
xgb_score = accuracy_score(y_test, y_pred)
xgb_score

0.624633431085044

In [35]:
# Search space for the randomized hyperparameter search.
xgb_hyp_params = dict(
    learning_rate=loguniform(0.0001, 1),    # sampled on a log scale
    max_depth=list(range(2, 10)),           # 2 through 9
    subsample=[0.7, 0.8, 0.9, 1.0],
    n_estimators=list(range(50, 201, 50)),  # 50, 100, 150, 200
)

In [36]:
# Randomized search over xgb_hyp_params: 20 sampled configurations, each scored
# by 7-fold cross-validated accuracy; random_state fixes which ones are drawn.
random_hyp = RandomizedSearchCV(estimator=xgb_clf, 
                                param_distributions=xgb_hyp_params, 
                                n_iter=20, 
                                cv=7,
                                scoring='accuracy',
                                random_state=7)

In [37]:
random_hyp.fit(X_train, y_train)

In [38]:
random_hyp.best_params_

{'learning_rate': 0.004758705642828484,
 'max_depth': 2,
 'n_estimators': 200,
 'subsample': 0.9}

In [39]:
model_hyp = random_hyp.best_estimator_

In [40]:
y_pred_hyp = model_hyp.predict(X_test)

In [41]:
rs_score = accuracy_score(y_test, y_pred_hyp)

In [42]:
# Persist the tuned model and immediately reload it, so the prediction cells
# below use exactly the artifact that is written to disk.
# NOTE: joblib files are pickle-based — only load model files you trust.
from joblib import dump, load
dump(model_hyp, 'model_nba.joblib') 
model_saved = load('model_nba.joblib') 

In [43]:
accuracy_score(y_test, model_saved.predict(X_test))

0.6334310850439883

In [44]:
# Rebuild the matchup feature table from a fresh API pull. These are the same
# steps as the exploratory cells above, collected in one place.
from nba_api.stats.endpoints import leaguegamefinder
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable="10/18/2022", league_id_nullable='00')
games = gamefinder.get_data_frames()[0]

# .copy() makes `games` an independent frame, so the column assignments below
# modify it directly rather than a selection of the raw API result.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

import pandas as pd

games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Oldest-first order is required by the per-team rolling window below.
games = games.sort_values('GAME_DATE')

# closed='left' keeps each game out of its own 30-game average.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

# '@' in MATCHUP marks the visiting team's row.
msk = games['MATCHUP'].str.contains('@')
games_away = games[msk]
games_home = games[~msk]

games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))

games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [45]:
# Walk through a single prediction step by step for one example matchup.
team_home='Toronto Raptors'
team_away='Boston Celtics'

import numpy as np
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='10/18/2022',
                                           league_id_nullable='00')
games = gamefinder.get_data_frames()[0]
# .copy() so the GAME_DATE assignment below acts on an independent frame.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Average plus/minus over each team's 30 most recent games.
msk_home = (games['TEAM_NAME'] == team_home)
games_30_home = games[msk_home].sort_values('GAME_DATE').tail(30)
home_plus_minus = games_30_home['PLUS_MINUS'].mean()

msk_away = (games['TEAM_NAME'] == team_away)
games_30_away = games[msk_away].sort_values('GAME_DATE').tail(30)
away_plus_minus = games_30_away['PLUS_MINUS'].mean()

games_diff=home_plus_minus - away_plus_minus

# The model was fit on a one-column DataFrame, so give it the same layout:
# one row (this matchup) and one column carrying the training feature name,
# rather than a bare 1-D array.
X_new = pd.DataFrame({'avg_30_plus_minus_diff': [games_diff]})

predict_home_win=model_saved.predict(X_new)[0]
# Column 1 of predict_proba is the probability of class 1 (home win).
predict_winning_probability=model_saved.predict_proba(X_new)[0][1]

In [46]:
def _recent_plus_minus(games, team_name, window=30):
    """Return a team's mean PLUS_MINUS over its `window` most recent games.

    Raises:
        ValueError: if `games` holds no rows for `team_name`.
    """
    team_games = games[games['TEAM_NAME'] == team_name]
    if team_games.empty:
        # Without this guard the mean is NaN and the model would still return
        # a (meaningless) prediction.
        raise ValueError(f"No games found for team {team_name!r}")
    return team_games.sort_values('GAME_DATE').tail(window)['PLUS_MINUS'].mean()


def predict_games(team_home, team_away):
    """Predict whether the home team wins a matchup.

    Downloads the game log since 10/18/2022, averages each team's PLUS_MINUS
    over its 30 most recent games and feeds the home-minus-away difference to
    the saved model (`model_saved`).

    Args:
        team_home: full team name of the host, e.g. 'Boston Celtics'.
        team_away: full team name of the visitor.

    Returns:
        A tuple (predict_home_win, predict_winning_probability): the predicted
        label (1 = home win, 0 = home loss) and the model's probability for
        class 1.

    Raises:
        ValueError: if either team name has no games in the downloaded log.
    """
    gamefinder = leaguegamefinder.LeagueGameFinder(
        date_from_nullable='10/18/2022',
        league_id_nullable='00')
    games = gamefinder.get_data_frames()[0]
    # .copy() so the GAME_DATE assignment acts on an independent frame.
    games = games[
        ['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])
    # NOTE(review): a game still in progress may appear here without a W/L and
    # with a partial PLUS_MINUS — consider filtering such rows out.

    home_plus_minus = _recent_plus_minus(games, team_home)
    away_plus_minus = _recent_plus_minus(games, team_away)
    games_diff = home_plus_minus - away_plus_minus

    # The model was fit on a one-column DataFrame; pass the same layout (one
    # row, training column name) rather than a bare 1-D array.
    features = pd.DataFrame({'avg_30_plus_minus_diff': [games_diff]})

    predict_home_win = model_saved.predict(features)[0]
    # Column 1 of predict_proba is the probability of class 1 (home win).
    predict_winning_probability = model_saved.predict_proba(features)[0][1]
    return predict_home_win, predict_winning_probability

In [47]:
# First argument is the home team, second the away team. The result is
# (predicted label, probability): label 1 means a home win, and the
# probability is the model's estimate that the home team wins.
predict_games('Boston Celtics','Toronto Raptors')

(1, 0.698773)