## NBA Game Predictor

This project predicts the winner of a potential NBA matchup from each team's recent point differential.

First, install the `nba_api` package, which is used to download the game data.

In [1]:
%pip install nba-api

Note: you may need to restart the kernel to use updated packages.


Next, import the required packages.

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# for classification
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

# for metrics
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

import warnings
warnings.filterwarnings(action='ignore')

## Games Predictor

In [3]:
# Download every team-game row from 10/22/2024 onward through nba_api.
# NOTE(review): league_id "00" presumably selects the NBA — confirm against the nba_api docs.
from nba_api.stats.endpoints import leaguegamefinder
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable="10/22/2024",league_id_nullable="00")
# The first returned frame has one row per team per game, so each GAME_ID appears
# once for each of the two teams (see the preview below).
games = gamefinder.get_data_frames()[0]
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42024,1610612760,OKC,Oklahoma City Thunder,42400407,2025-06-22,OKC vs. IND,W,240,103,35,87,0.402,11,40,0.275,22,31,0.71,13,27,40,20,14,8,7,23,12.0
1,42024,1610612754,IND,Indiana Pacers,42400407,2025-06-22,IND @ OKC,L,240,91,29,70,0.414,11,28,0.393,22,29,0.759,12,33,45,17,6,4,21,24,-12.0
2,42024,1610612760,OKC,Oklahoma City Thunder,42400406,2025-06-19,OKC @ IND,L,240,91,31,74,0.419,8,30,0.267,21,26,0.808,4,37,41,14,4,4,21,20,-17.0
3,42024,1610612754,IND,Indiana Pacers,42400406,2025-06-19,IND vs. OKC,W,240,108,38,92,0.413,15,42,0.357,17,25,0.68,11,35,46,23,16,5,10,17,17.0
4,42024,1610612754,IND,Indiana Pacers,42400405,2025-06-16,IND @ OKC,L,241,109,37,82,0.451,11,30,0.367,24,30,0.8,18,32,50,23,9,4,22,25,-11.0


In [4]:
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [5]:
# Keep only the identifying columns and the outcome columns used downstream.
# .copy() makes this an independent frame, so the column assignments in later
# cells never write into a slice of the raw API result (warnings are silenced
# globally in this notebook, so such a write would go unnoticed).
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

In [6]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
0,Oklahoma City Thunder,0042400407,2025-06-22,OKC vs. IND,W,12.0
1,Indiana Pacers,0042400407,2025-06-22,IND @ OKC,L,-12.0
2,Oklahoma City Thunder,0042400406,2025-06-19,OKC @ IND,L,-17.0
3,Indiana Pacers,0042400406,2025-06-19,IND vs. OKC,W,17.0
4,Indiana Pacers,0042400405,2025-06-16,IND @ OKC,L,-11.0
...,...,...,...,...,...,...
2651,Milwaukee Bucks,0022400066,2024-10-23,MIL @ PHI,W,15.0
2652,Boston Celtics,0022400061,2024-10-22,BOS vs. NYK,W,23.0
2653,Los Angeles Lakers,0022400062,2024-10-22,LAL vs. MIN,W,7.0
2654,New York Knicks,0022400061,2024-10-22,NYK @ BOS,L,-23.0


In [7]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2656 entries, 0 to 2655
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TEAM_NAME   2656 non-null   object 
 1   GAME_ID     2656 non-null   object 
 2   GAME_DATE   2656 non-null   object 
 3   MATCHUP     2656 non-null   object 
 4   WL          2656 non-null   object 
 5   PLUS_MINUS  2656 non-null   float64
dtypes: float64(1), object(5)
memory usage: 124.6+ KB


In [8]:
# GAME_DATE arrives as strings (object dtype); convert it to datetime64 before
# the chronological sort.
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

In [9]:
games['GAME_DATE']

0      2025-06-22
1      2025-06-22
2      2025-06-19
3      2025-06-19
4      2025-06-16
          ...    
2651   2024-10-23
2652   2024-10-22
2653   2024-10-22
2654   2024-10-22
2655   2024-10-22
Name: GAME_DATE, Length: 2656, dtype: datetime64[ns]

In [10]:
# Order rows oldest-first: the per-team rolling average computed in the next
# cell depends on row order.
games = games.sort_values('GAME_DATE')
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
2655,Minnesota Timberwolves,0022400062,2024-10-22,MIN @ LAL,L,-7.0
2652,Boston Celtics,0022400061,2024-10-22,BOS vs. NYK,W,23.0
2654,New York Knicks,0022400061,2024-10-22,NYK @ BOS,L,-23.0
2653,Los Angeles Lakers,0022400062,2024-10-22,LAL vs. MIN,W,7.0
2651,Milwaukee Bucks,0022400066,2024-10-23,MIL @ PHI,W,15.0
...,...,...,...,...,...,...
4,Indiana Pacers,0042400405,2025-06-16,IND @ OKC,L,-11.0
3,Indiana Pacers,0042400406,2025-06-19,IND vs. OKC,W,17.0
2,Oklahoma City Thunder,0042400406,2025-06-19,OKC @ IND,L,-17.0
1,Indiana Pacers,0042400407,2025-06-22,IND @ OKC,L,-12.0


In [11]:
# Form feature: for every team, the mean PLUS_MINUS over the 30 games played
# before each game. closed='left' leaves the current game out of the window,
# so the feature only uses results that were known before that game.
# Until a team has 30 earlier games the value is NaN (rolling's min_periods
# defaults to the window size).
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

In [12]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
2655,Minnesota Timberwolves,0022400062,2024-10-22,MIN @ LAL,L,-7.0,
2652,Boston Celtics,0022400061,2024-10-22,BOS vs. NYK,W,23.0,
2654,New York Knicks,0022400061,2024-10-22,NYK @ BOS,L,-23.0,
2653,Los Angeles Lakers,0022400062,2024-10-22,LAL vs. MIN,W,7.0,
2651,Milwaukee Bucks,0022400066,2024-10-23,MIL @ PHI,W,15.0,
...,...,...,...,...,...,...,...
4,Indiana Pacers,0042400405,2025-06-16,IND @ OKC,L,-11.0,4.533333
3,Indiana Pacers,0042400406,2025-06-19,IND vs. OKC,W,17.0,2.400000
2,Oklahoma City Thunder,0042400406,2025-06-19,OKC @ IND,L,-17.0,9.966667
1,Indiana Pacers,0042400407,2025-06-22,IND @ OKC,L,-12.0,3.666667


In [13]:
games[games['TEAM_NAME']=='Los Angeles Lakers'].head(35)

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
2653,Los Angeles Lakers,22400062,2024-10-22,LAL vs. MIN,W,7.0,
2623,Los Angeles Lakers,22400085,2024-10-25,LAL vs. PHX,W,7.0,
2588,Los Angeles Lakers,22400096,2024-10-26,LAL vs. SAC,W,4.0,
2553,Los Angeles Lakers,22400111,2024-10-28,LAL @ PHX,L,-4.0,
2531,Los Angeles Lakers,22400118,2024-10-30,LAL @ CLE,L,-24.0,
2503,Los Angeles Lakers,22400137,2024-11-01,LAL @ TOR,W,6.0,
2454,Los Angeles Lakers,22400156,2024-11-04,LAL @ DET,L,-12.0,
2436,Los Angeles Lakers,22400174,2024-11-06,LAL @ MEM,L,-17.0,
2407,Los Angeles Lakers,22400195,2024-11-08,LAL vs. PHI,W,10.0,
2368,Los Angeles Lakers,22400211,2024-11-10,LAL vs. TOR,W,20.0,


In [14]:
# MATCHUP reads "AAA @ BBB" on one team's row and "BBB vs. AAA" on the other's.
# Rows containing '@' are treated as the away side, all others as the home side.
# NOTE(review): the shapes printed below differ (1322 home rows vs 1334 away
# rows), so the split is not exactly one home and one away row per game —
# worth checking which GAME_IDs are unbalanced.
msk = games['MATCHUP'].str.contains('@')
games_away = games[msk]
games_home = games[~msk]

In [15]:
games_home.shape

(1322, 7)

In [16]:
games_away.shape

(1334, 7)

In [17]:
games_home

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
2652,Boston Celtics,0022400061,2024-10-22,BOS vs. NYK,W,23.0,
2653,Los Angeles Lakers,0022400062,2024-10-22,LAL vs. MIN,W,7.0,
2632,Houston Rockets,0022400068,2024-10-23,HOU vs. CHA,L,-5.0,
2637,Philadelphia 76ers,0022400066,2024-10-23,PHI vs. MIL,L,-15.0,
2638,Atlanta Hawks,0022400064,2024-10-23,ATL vs. BKN,W,4.0,
...,...,...,...,...,...,...,...
8,Indiana Pacers,0042400403,2025-06-11,IND vs. OKC,W,9.0,4.966667
7,Indiana Pacers,0042400404,2025-06-13,IND vs. OKC,L,-7.0,4.733333
5,Oklahoma City Thunder,0042400405,2025-06-16,OKC vs. IND,W,11.0,10.300000
3,Indiana Pacers,0042400406,2025-06-19,IND vs. OKC,W,17.0,2.400000


In [18]:
games_away

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
2655,Minnesota Timberwolves,0022400062,2024-10-22,MIN @ LAL,L,-7.0,
2654,New York Knicks,0022400061,2024-10-22,NYK @ BOS,L,-23.0,
2651,Milwaukee Bucks,0022400066,2024-10-23,MIL @ PHI,W,15.0,
2634,Cleveland Cavaliers,0022400067,2024-10-23,CLE @ TOR,W,30.0,
2635,Memphis Grizzlies,0022400070,2024-10-23,MEM @ UTA,W,2.0,
...,...,...,...,...,...,...,...
9,Oklahoma City Thunder,0042400403,2025-06-11,OKC @ IND,L,-9.0,10.966667
6,Oklahoma City Thunder,0042400404,2025-06-13,OKC @ IND,W,7.0,10.600000
4,Indiana Pacers,0042400405,2025-06-16,IND @ OKC,L,-11.0,4.533333
2,Oklahoma City Thunder,0042400406,2025-06-19,OKC @ IND,L,-17.0,9.966667


In [19]:
# Pair the home row and the away row of the same game into a single row per
# GAME_ID; shared column names get the _home / _away suffixes.
# pd.merge defaults to an inner join, so a GAME_ID missing from either side
# is dropped.
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away
0,Boston Celtics,0022400061,2024-10-22,BOS vs. NYK,W,23.0,,New York Knicks,2024-10-22,NYK @ BOS,L,-23.0,
1,Los Angeles Lakers,0022400062,2024-10-22,LAL vs. MIN,W,7.0,,Minnesota Timberwolves,2024-10-22,MIN @ LAL,L,-7.0,
2,Houston Rockets,0022400068,2024-10-23,HOU vs. CHA,L,-5.0,,Charlotte Hornets,2024-10-23,CHA @ HOU,W,5.0,
3,Philadelphia 76ers,0022400066,2024-10-23,PHI vs. MIL,L,-15.0,,Milwaukee Bucks,2024-10-23,MIL @ PHI,W,15.0,
4,Atlanta Hawks,0022400064,2024-10-23,ATL vs. BKN,W,4.0,,Brooklyn Nets,2024-10-23,BKN @ ATL,L,-4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,Indiana Pacers,0042400403,2025-06-11,IND vs. OKC,W,9.0,4.966667,Oklahoma City Thunder,2025-06-11,OKC @ IND,L,-9.0,10.966667
1318,Indiana Pacers,0042400404,2025-06-13,IND vs. OKC,L,-7.0,4.733333,Oklahoma City Thunder,2025-06-13,OKC @ IND,W,7.0,10.600000
1319,Oklahoma City Thunder,0042400405,2025-06-16,OKC vs. IND,W,11.0,10.300000,Indiana Pacers,2025-06-16,IND @ OKC,L,-11.0,4.533333
1320,Indiana Pacers,0042400406,2025-06-19,IND vs. OKC,W,17.0,2.400000,Oklahoma City Thunder,2025-06-19,OKC @ IND,L,-17.0,9.966667


In [20]:
# The single model feature: home team's rolling average minus the away team's.
# It is NaN whenever either side's rolling average is NaN.
games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [21]:
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away,avg_30_plus_minus_diff
0,Boston Celtics,0022400061,2024-10-22,BOS vs. NYK,W,23.0,,New York Knicks,2024-10-22,NYK @ BOS,L,-23.0,,
1,Los Angeles Lakers,0022400062,2024-10-22,LAL vs. MIN,W,7.0,,Minnesota Timberwolves,2024-10-22,MIN @ LAL,L,-7.0,,
2,Houston Rockets,0022400068,2024-10-23,HOU vs. CHA,L,-5.0,,Charlotte Hornets,2024-10-23,CHA @ HOU,W,5.0,,
3,Philadelphia 76ers,0022400066,2024-10-23,PHI vs. MIL,L,-15.0,,Milwaukee Bucks,2024-10-23,MIL @ PHI,W,15.0,,
4,Atlanta Hawks,0022400064,2024-10-23,ATL vs. BKN,W,4.0,,Brooklyn Nets,2024-10-23,BKN @ ATL,L,-4.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,Indiana Pacers,0042400403,2025-06-11,IND vs. OKC,W,9.0,4.966667,Oklahoma City Thunder,2025-06-11,OKC @ IND,L,-9.0,10.966667,-6.000000
1318,Indiana Pacers,0042400404,2025-06-13,IND vs. OKC,L,-7.0,4.733333,Oklahoma City Thunder,2025-06-13,OKC @ IND,W,7.0,10.600000,-5.866667
1319,Oklahoma City Thunder,0042400405,2025-06-16,OKC vs. IND,W,11.0,10.300000,Indiana Pacers,2025-06-16,IND @ OKC,L,-11.0,4.533333,5.766667
1320,Indiana Pacers,0042400406,2025-06-19,IND vs. OKC,W,17.0,2.400000,Oklahoma City Thunder,2025-06-19,OKC @ IND,L,-17.0,9.966667,-7.566667


In [22]:
games_merged[['WL_home', 'avg_30_plus_minus_diff']]

Unnamed: 0,WL_home,avg_30_plus_minus_diff
0,W,
1,W,
2,L,
3,L,
4,W,
...,...,...
1317,W,-6.000000
1318,L,-5.866667
1319,W,5.766667
1320,W,-7.566667


In [23]:
# Modelling table: target (home result) plus the one feature. dropna removes
# every game where the feature (or the result) is missing, i.e. games where
# either team's rolling average is still NaN.
games_model = games_merged[['WL_home', 'avg_30_plus_minus_diff']].dropna()
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
440,W,14.966667
443,L,3.966667
444,W,1.333333
447,W,10.600000
448,L,-3.800000
...,...,...
1317,W,-6.000000
1318,L,-5.866667
1319,W,5.766667
1320,W,-7.566667


In [24]:
# Encode the target as 1 = home win, 0 = home loss.
# replace() leaves values that are already 0/1 untouched, so this cell is safe
# to re-run; .map() on an already-encoded column would turn every label into
# NaN. The explicit int64 cast keeps the dtype stable across pandas versions.
games_model['WL_home'] = games_model['WL_home'].replace({'W': 1, 'L': 0}).astype('int64')

In [25]:
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
440,1,14.966667
443,0,3.966667
444,1,1.333333
447,1,10.600000
448,0,-3.800000
...,...,...
1317,1,-6.000000
1318,0,-5.866667
1319,1,5.766667
1320,1,-7.566667


## Building the Model

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the games for evaluation. stratify keeps the share of home
# wins about the same in both parts; random_state makes the split reproducible.
# NOTE(review): the split is random over games that are ordered in time, so
# test games can pre-date training games — consider a chronological split if
# the goal is forecasting future games.
df_train, df_test = train_test_split(games_model, stratify=games_model['WL_home'], test_size=0.2, random_state=7)

In [28]:
df_train.shape

(689, 2)

In [29]:
df_test.shape

(173, 2)

In [30]:
# Separate the label column from the feature column for both partitions.
target = 'WL_home'

X_train, y_train = df_train.drop(columns=target), df_train[target]
X_test, y_test = df_test.drop(columns=target), df_test[target]

Let us try classification algorithms to identify which gives the highest accuracy, starting with XGBoost.

In [31]:
# Baseline gradient-boosted classifier with default hyperparameters;
# random_state fixes the seed so results are repeatable.
# use_label_encoder is not passed: the option is deprecated and no longer a
# real parameter in current XGBoost releases, and the target is already
# encoded as 0/1 integers.
xgb_clf = xgb.XGBClassifier(random_state=7)

In [32]:
xgb_clf.fit(X_train, y_train)

In [33]:
y_pred = xgb_clf.predict(X_test)

In [34]:
xgb_score = accuracy_score(y_test, y_pred)
xgb_score

0.6069364161849711

In [35]:
# Search space for the randomized hyperparameter search.
xgb_hyp_params = dict(
    learning_rate=loguniform(1e-4, 1),      # drawn log-uniformly between 0.0001 and 1
    max_depth=list(range(2, 10)),           # 2 through 9
    subsample=[0.7, 0.8, 0.9, 1.0],
    n_estimators=list(range(50, 201, 50)),  # 50, 100, 150, 200
)

In [36]:
# Randomized search over xgb_hyp_params: 20 sampled settings, each scored by
# 7-fold cross-validated accuracy on the data passed to fit. random_state
# makes the sampled settings reproducible.
random_hyp = RandomizedSearchCV(estimator=xgb_clf,
                                param_distributions=xgb_hyp_params,
                                n_iter=20,
                                cv=7,
                                scoring='accuracy',
                                random_state=7)

In [37]:
random_hyp.fit(X_train, y_train)

In [38]:
random_hyp.best_params_

{'learning_rate': 0.025246179740589205,
 'max_depth': 2,
 'n_estimators': 150,
 'subsample': 0.7}

In [39]:
model_hyp = random_hyp.best_estimator_

In [40]:
y_pred_hyp = model_hyp.predict(X_test)

In [41]:
rs_score = accuracy_score(y_test, y_pred_hyp)

In [42]:
from joblib import dump, load
# Save the tuned model to disk and read it straight back, so the prediction
# cells below run against the persisted artifact.
dump(model_hyp, 'model_nba.joblib')
# joblib files are pickle-based: only load files that come from a trusted source.
model_saved = load('model_nba.joblib')

In [43]:
accuracy_score(y_test, model_saved.predict(X_test))

0.6647398843930635

In [44]:
# Rebuild the matchup table over a longer history (from 10/18/2022), repeating
# the preparation steps used for the training data earlier in the notebook.
# NOTE(review): no later cell in the visible notebook reads games_merged from
# this cell — confirm it is still needed.
from nba_api.stats.endpoints import leaguegamefinder
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable="10/18/2022", league_id_nullable='00')
games = gamefinder.get_data_frames()[0]

# .copy() gives an independent frame, so the column assignments below never
# write into a slice of the raw API result.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()

import pandas as pd

games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Oldest-first order is required by the rolling window below.
games = games.sort_values('GAME_DATE')

# Mean PLUS_MINUS over each team's previous 30 games; closed='left' excludes
# the current game from its own window.
games['avg_30_plus_minus'] = games.groupby('TEAM_NAME')['PLUS_MINUS'].transform(lambda x: x.rolling(30, closed='left').mean())

# Rows whose MATCHUP contains '@' are treated as the away side.
msk = games['MATCHUP'].str.contains('@')
games_away = games[msk]
games_home = games[~msk]

# One row per GAME_ID with _home / _away suffixed columns.
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))

games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home'] - games_merged['avg_30_plus_minus_away']

In [45]:
# Step-by-step prediction for a single matchup.
team_home='Toronto Raptors'
team_away='Boston Celtics'

import numpy as np
gamefinder = leaguegamefinder.LeagueGameFinder(date_from_nullable='10/18/2022',
                                           league_id_nullable='00')
games = gamefinder.get_data_frames()[0]
# .copy() so the GAME_DATE conversion below does not write into a slice.
games = games[['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']].copy()
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

# Mean PLUS_MINUS over each team's 30 most recent games.
msk_home = (games['TEAM_NAME'] == team_home)
games_30_home = games[msk_home].sort_values('GAME_DATE').tail(30)
home_plus_minus = games_30_home['PLUS_MINUS'].mean()

msk_away = (games['TEAM_NAME'] == team_away)
games_30_away = games[msk_away].sort_values('GAME_DATE').tail(30)
away_plus_minus = games_30_away['PLUS_MINUS'].mean()

games_diff=home_plus_minus - away_plus_minus

# The model was fit on a (n_samples, 1) feature table, so pass an explicit
# 1 x 1 array (one game, one feature) instead of a 1-D array.
features = np.array([[games_diff]])
predict_home_win=model_saved.predict(features)[0]
# Column 1 of predict_proba is the probability of class 1 (home win).
predict_winning_probability=model_saved.predict_proba(features)[0][1]

In [46]:
def predict_games(team_home, team_away, games=None, model=None, window=30):
    """Predict whether the home team wins a matchup.

    The single model feature is the home team's mean PLUS_MINUS over its most
    recent ``window`` games minus the same quantity for the away team.

    Parameters
    ----------
    team_home, team_away : str
        Team names exactly as they appear in the TEAM_NAME column
        (e.g. 'Boston Celtics').
    games : pandas.DataFrame, optional
        Team-game rows with at least TEAM_NAME, GAME_DATE and PLUS_MINUS.
        When omitted, the rows are downloaded with LeagueGameFinder from
        10/18/2022 onward.
    model : optional
        Fitted classifier exposing ``predict`` and ``predict_proba``.
        Defaults to the notebook-level ``model_saved``.
    window : int, default 30
        Number of most recent games averaged per team; 30 matches the
        rolling window used to build the training feature.

    Returns
    -------
    tuple
        ``(predict_home_win, predict_winning_probability)``: the predicted
        label (1 = home win, 0 = home loss) and the predicted probability of
        class 1.

    Raises
    ------
    ValueError
        If either team has no rows in ``games`` (for example a misspelled
        name). Without this check the feature would be NaN and the model
        would still return a prediction.
    """
    if games is None:
        gamefinder = leaguegamefinder.LeagueGameFinder(
            date_from_nullable='10/18/2022',
            league_id_nullable='00')
        games = gamefinder.get_data_frames()[0]
    if model is None:
        model = model_saved

    # Work on an independent copy so the date conversion neither writes into
    # a slice nor modifies a frame supplied by the caller.
    games = games[['TEAM_NAME', 'GAME_DATE', 'PLUS_MINUS']].copy()
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

    def _recent_plus_minus(team_name):
        # Mean PLUS_MINUS over the team's `window` most recent games.
        team_games = games[games['TEAM_NAME'] == team_name]
        if team_games.empty:
            raise ValueError(f"No games found for team {team_name!r}")
        return team_games.sort_values('GAME_DATE').tail(window)['PLUS_MINUS'].mean()

    games_diff = _recent_plus_minus(team_home) - _recent_plus_minus(team_away)

    # One sample with one feature: shape (1, 1), matching the training table.
    features = np.array([[games_diff]])
    predict_home_win = model.predict(features)[0]
    # Column 1 of predict_proba is the probability of class 1 (home win).
    predict_winning_probability = model.predict_proba(features)[0][1]
    return predict_home_win, predict_winning_probability

In [47]:
predict_games('Boston Celtics','Toronto Raptors')

(1, 0.8107539)