In [1]:
%pip install nba_api

Note: you may need to restart the kernel to use updated packages.


In [2]:
from nba_api.stats.endpoints import leaguegamefinder

In [3]:
# Query the game finder for league_id '00', starting from 31 Jan 2020
# (the date string is MM/DD/YYYY).
gamefinder = leaguegamefinder.LeagueGameFinder(
    date_from_nullable='01/31/2020',
    league_id_nullable='00',
)
# get_data_frames() yields a sequence of frames; only the first is used here.
# It holds one row per team per game.
games = gamefinder.get_data_frames()[0]
games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,PTS,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS
0,42021,1610612738,BOS,Boston Celtics,42100406,2022-06-16,BOS vs. GSW,L,239,90,...,0.917,11,30,41,27,8,8,22,16,-13.0
1,42021,1610612744,GSW,Golden State Warriors,42100406,2022-06-16,GSW @ BOS,W,241,103,...,1.0,15,29,44,27,13,7,15,20,13.0
2,42021,1610612744,GSW,Golden State Warriors,42100405,2022-06-13,GSW vs. BOS,W,240,104,...,0.867,4,35,39,23,9,2,6,28,10.0
3,42021,1610612738,BOS,Boston Celtics,42100405,2022-06-13,BOS @ GSW,L,238,94,...,0.677,8,39,47,18,2,2,18,16,-10.0
4,42021,1610612744,GSW,Golden State Warriors,42100404,2022-06-10,GSW @ BOS,W,241,107,...,0.8,16,39,55,20,10,5,16,21,10.0


### 'PLUS_MINUS' is the most important feature for our prediction

# Cleaning and Exploring data

In [4]:
games.columns

Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')

In [5]:
# Keep only the columns used downstream.
# `.copy()` makes the result an independent frame, so the later in-place
# `games['GAME_DATE'] = ...` assignment does not write into a slice of the raw
# API frame (which can trigger pandas' SettingWithCopyWarning).
games = games[
    ['TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'PLUS_MINUS']
].copy()

In [6]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
0,Boston Celtics,0042100406,2022-06-16,BOS vs. GSW,L,-13.0
1,Golden State Warriors,0042100406,2022-06-16,GSW @ BOS,W,13.0
2,Golden State Warriors,0042100405,2022-06-13,GSW vs. BOS,W,10.0
3,Boston Celtics,0042100405,2022-06-13,BOS @ GSW,L,-10.0
4,Golden State Warriors,0042100404,2022-06-10,GSW @ BOS,W,10.0
...,...,...,...,...,...,...
6147,New Orleans Pelicans,0021900723,2020-01-31,NOP vs. MEM,W,28.0
6148,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0
6149,Oklahoma City Thunder,0021900725,2020-01-31,OKC @ PHX,W,4.0
6150,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0


### Final dataframe: 
1. one row for one game
2. two columns:
    1. result of game: target
    2. score stat comparing 2 teams: feature

In [7]:
import pandas as pd

In [8]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6152 entries, 0 to 6151
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   TEAM_NAME   6152 non-null   object 
 1   GAME_ID     6152 non-null   object 
 2   GAME_DATE   6152 non-null   object 
 3   MATCHUP     6152 non-null   object 
 4   WL          6152 non-null   object 
 5   PLUS_MINUS  6152 non-null   float64
dtypes: float64(1), object(5)
memory usage: 288.5+ KB


**We need to convert Dtype of GAME_DATE to datetime**

In [9]:
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

In [10]:
games['GAME_DATE']

0      2022-06-16
1      2022-06-16
2      2022-06-13
3      2022-06-13
4      2022-06-10
          ...    
6147   2020-01-31
6148   2020-01-31
6149   2020-01-31
6150   2020-01-31
6151   2020-01-31
Name: GAME_DATE, Length: 6152, dtype: datetime64[ns]

In [11]:
games = games.sort_values('GAME_DATE')

In [12]:
games

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS
6151,Toronto Raptors,0021900720,2020-01-31,TOR @ DET,W,13.0
6138,Portland Trail Blazers,0021900726,2020-01-31,POR @ LAL,W,8.0
6139,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0
6140,Denver Nuggets,0021900724,2020-01-31,DEN @ MIL,W,12.0
6141,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0
...,...,...,...,...,...,...
4,Golden State Warriors,0042100404,2022-06-10,GSW @ BOS,W,10.0
3,Boston Celtics,0042100405,2022-06-13,BOS @ GSW,L,-10.0
2,Golden State Warriors,0042100405,2022-06-13,GSW vs. BOS,W,10.0
1,Golden State Warriors,0042100406,2022-06-16,GSW @ BOS,W,13.0


### Creating a feature to measure how each team performed before the current game, using **PLUS_MINUS**

**rolling()**: calculates statistics over a moving window of fixed size (30 rows here) <br>
**closed='left'**: the window covers the 30 records before the current game and excludes the current game itself

### The code below calculates, for each team, the mean PLUS_MINUS over its 30 most recent games before the current one

In [13]:
# Per team: mean PLUS_MINUS over the 30 rows preceding each game.
# closed='left' keeps the current game out of its own window, so the feature
# only uses information available before tip-off. The first 30 games of each
# team get NaN because the window is not yet full.
games['avg_30_plus_minus'] = (
    games
    .groupby('TEAM_NAME')['PLUS_MINUS']
    .transform(lambda team_scores: team_scores.rolling(30, closed='left').mean())
)

In [14]:
games[games['TEAM_NAME'] == 'Toronto Raptors'].head(36)

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
6151,Toronto Raptors,21900720,2020-01-31,TOR @ DET,W,13.0,
6115,Toronto Raptors,21900740,2020-02-02,TOR vs. CHI,W,27.0,
6064,Toronto Raptors,21900758,2020-02-05,TOR vs. IND,W,1.0,
6052,Toronto Raptors,21900772,2020-02-07,TOR @ IND,W,9.0,
6023,Toronto Raptors,21900781,2020-02-08,TOR vs. BKN,W,1.0,
6005,Toronto Raptors,21900796,2020-02-10,TOR vs. MIN,W,11.0,
5977,Toronto Raptors,21900809,2020-02-12,TOR @ BKN,L,-10.0,
5917,Toronto Raptors,21900829,2020-02-21,TOR vs. PHX,W,17.0,
5890,Toronto Raptors,21900843,2020-02-23,TOR vs. IND,W,46.0,
5871,Toronto Raptors,21900858,2020-02-25,TOR vs. MIL,L,-11.0,


### We see that 'avg_30_plus_minus' only has values from the 31st row onwards (the first 30 rows are NaN); the first value, 7.1, is the mean of the PLUS_MINUS values of the first 30 rows

# Home advantage: 
## The home team has an advantage over the opposing team
### See the MATCHUP column: 'TOR vs. CHI' means Toronto Raptors is the home team, while 'TOR @ DET' means Toronto is the away team and the opponent is the home team

In [15]:
msk = games['MATCHUP'].str.contains('@')

In [16]:
# `msk` is True for rows whose MATCHUP contains '@' (the visiting side);
# the remaining rows are the hosting side of each game.
games_home = games[~msk]
games_away = games[msk]

In [17]:
games_home.shape

(3076, 7)

In [18]:
games_away.shape

(3076, 7)

In [19]:
games_home

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
6139,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,
6141,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,
6143,Brooklyn Nets,0021900721,2020-01-31,BKN vs. CHI,W,15.0,
6150,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,
6148,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,
...,...,...,...,...,...,...,...
8,Golden State Warriors,0042100402,2022-06-05,GSW vs. BOS,W,19.0,2.533333
6,Boston Celtics,0042100403,2022-06-08,BOS vs. GSW,W,16.0,8.133333
5,Boston Celtics,0042100404,2022-06-10,BOS vs. GSW,L,-10.0,8.366667
2,Golden State Warriors,0042100405,2022-06-13,GSW vs. BOS,W,10.0,3.900000


In [20]:
games_away

Unnamed: 0,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,PLUS_MINUS,avg_30_plus_minus
6151,Toronto Raptors,0021900720,2020-01-31,TOR @ DET,W,13.0,
6138,Portland Trail Blazers,0021900726,2020-01-31,POR @ LAL,W,8.0,
6140,Denver Nuggets,0021900724,2020-01-31,DEN @ MIL,W,12.0,
6142,Chicago Bulls,0021900721,2020-01-31,CHI @ BKN,L,-15.0,
6145,Dallas Mavericks,0021900722,2020-01-31,DAL @ HOU,L,-7.0,
...,...,...,...,...,...,...,...
9,Boston Celtics,0042100402,2022-06-05,BOS @ GSW,L,-19.0,9.433333
7,Golden State Warriors,0042100403,2022-06-08,GSW @ BOS,L,-16.0,3.900000
4,Golden State Warriors,0042100404,2022-06-10,GSW @ BOS,W,10.0,3.433333
3,Boston Celtics,0042100405,2022-06-13,BOS @ GSW,L,-10.0,7.100000


In [21]:
games_merged = pd.merge(games_home, games_away, on='GAME_ID', suffixes=('_home', '_away'))

In [22]:
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away
0,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,,Portland Trail Blazers,2020-01-31,POR @ LAL,W,8.0,
1,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,,Denver Nuggets,2020-01-31,DEN @ MIL,W,12.0,
2,Brooklyn Nets,0021900721,2020-01-31,BKN vs. CHI,W,15.0,,Chicago Bulls,2020-01-31,CHI @ BKN,L,-15.0,
3,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,,Toronto Raptors,2020-01-31,TOR @ DET,W,13.0,
4,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,,Oklahoma City Thunder,2020-01-31,OKC @ PHX,W,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071,Golden State Warriors,0042100402,2022-06-05,GSW vs. BOS,W,19.0,2.533333,Boston Celtics,2022-06-05,BOS @ GSW,L,-19.0,9.433333
3072,Boston Celtics,0042100403,2022-06-08,BOS vs. GSW,W,16.0,8.133333,Golden State Warriors,2022-06-08,GSW @ BOS,L,-16.0,3.900000
3073,Boston Celtics,0042100404,2022-06-10,BOS vs. GSW,L,-10.0,8.366667,Golden State Warriors,2022-06-10,GSW @ BOS,W,10.0,3.433333
3074,Golden State Warriors,0042100405,2022-06-13,GSW vs. BOS,W,10.0,3.900000,Boston Celtics,2022-06-13,BOS @ GSW,L,-10.0,7.100000


In [23]:
games_merged['avg_30_plus_minus_diff'] = games_merged['avg_30_plus_minus_home']-games_merged['avg_30_plus_minus_away']

In [24]:
games_merged

Unnamed: 0,TEAM_NAME_home,GAME_ID,GAME_DATE_home,MATCHUP_home,WL_home,PLUS_MINUS_home,avg_30_plus_minus_home,TEAM_NAME_away,GAME_DATE_away,MATCHUP_away,WL_away,PLUS_MINUS_away,avg_30_plus_minus_away,avg_30_plus_minus_diff
0,Los Angeles Lakers,0021900726,2020-01-31,LAL vs. POR,L,-8.0,,Portland Trail Blazers,2020-01-31,POR @ LAL,W,8.0,,
1,Milwaukee Bucks,0021900724,2020-01-31,MIL vs. DEN,L,-12.0,,Denver Nuggets,2020-01-31,DEN @ MIL,W,12.0,,
2,Brooklyn Nets,0021900721,2020-01-31,BKN vs. CHI,W,15.0,,Chicago Bulls,2020-01-31,CHI @ BKN,L,-15.0,,
3,Detroit Pistons,0021900720,2020-01-31,DET vs. TOR,L,-13.0,,Toronto Raptors,2020-01-31,TOR @ DET,W,13.0,,
4,Phoenix Suns,0021900725,2020-01-31,PHX vs. OKC,L,-4.0,,Oklahoma City Thunder,2020-01-31,OKC @ PHX,W,4.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3071,Golden State Warriors,0042100402,2022-06-05,GSW vs. BOS,W,19.0,2.533333,Boston Celtics,2022-06-05,BOS @ GSW,L,-19.0,9.433333,-6.900000
3072,Boston Celtics,0042100403,2022-06-08,BOS vs. GSW,W,16.0,8.133333,Golden State Warriors,2022-06-08,GSW @ BOS,L,-16.0,3.900000,4.233333
3073,Boston Celtics,0042100404,2022-06-10,BOS vs. GSW,L,-10.0,8.366667,Golden State Warriors,2022-06-10,GSW @ BOS,W,10.0,3.433333,4.933333
3074,Golden State Warriors,0042100405,2022-06-13,GSW vs. BOS,W,10.0,3.900000,Boston Celtics,2022-06-13,BOS @ GSW,L,-10.0,7.100000,-3.200000


In [25]:
# Modelling table: the target (WL_home) plus the single feature.
# dropna() removes games where either side lacks a full 30-game history
# (NaN feature) or where a value is otherwise missing.
# `.copy()` detaches the result from `games_merged`, so assigning a column on
# `games_model` later does not trigger pandas' SettingWithCopyWarning.
games_model = (
    games_merged[['WL_home', 'avg_30_plus_minus_diff']]
    .dropna()
    .copy()
)

In [26]:
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
398,L,-6.233333
399,L,-5.366667
400,W,1.933333
402,W,2.466667
403,L,-8.066667
...,...,...
3071,W,-6.900000
3072,W,4.233333
3073,L,4.933333
3074,W,-3.200000


In [27]:
# Encode the target: home win -> 1, home loss -> 0.
# The frame is rebuilt from `games_merged` rather than overwriting
# `games_model['WL_home']` in place, for two reasons:
#   * re-running the cell stays safe (mapping the already-encoded 1/0 through
#     {'W': 1, 'L': 0} a second time would turn every label into NaN);
#   * no value is written into a slice of another frame, so pandas does not
#     emit SettingWithCopyWarning.
games_model = (
    games_merged[['WL_home', 'avg_30_plus_minus_diff']]
    .dropna()
    .assign(WL_home=lambda frame: frame['WL_home'].map({'W': 1, 'L': 0}))
)

In [28]:
games_model

Unnamed: 0,WL_home,avg_30_plus_minus_diff
398,0,-6.233333
399,0,-5.366667
400,1,1.933333
402,1,2.466667
403,0,-8.066667
...,...,...
3071,1,-6.900000
3072,1,4.233333
3073,0,4.933333
3074,1,-3.200000


# Building the model

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
df_train, df_test = train_test_split(games_model, stratify=games_model['WL_home'], test_size=0.2, random_state=7)

In [31]:
df_train.shape

(2054, 2)

In [32]:
df_test.shape

(514, 2)

In [33]:
target = 'WL_home'

# Feature matrices: every column except the target
# (here that is only avg_30_plus_minus_diff).
X_train, X_test = (frame.drop(columns=target) for frame in (df_train, df_test))

# Label vectors: the encoded WL_home column.
y_train, y_test = (frame[target] for frame in (df_train, df_test))

## We use XGBoost to show the general steps of modelling. XGBoost is a popular gradient boosting library: it is easy to use, fast to run, and often gives good performance

In [34]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [35]:
import xgboost as xgb

In [36]:
clf=xgb.XGBClassifier(use_label_encoder=False, random_state=7)

In [37]:
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=7,
              reg_alpha=0, reg_lambda=1, ...)

In [38]:
from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)

In [39]:
accuracy_score(y_test, y_pred)

0.5778210116731517

## Tune hyperparameters using randomized search. Hyperparameters are values that are set before the learning process begins

In [40]:
from sklearn.model_selection import RandomizedSearchCV
# Import the log-uniform distribution from its public home in SciPy.
# `sklearn.utils.fixes` is a private compatibility module, and newer
# scikit-learn releases no longer provide `loguniform` there.
from scipy.stats import loguniform

In [41]:
# Search space for the randomised search:
#   learning_rate - sampled log-uniformly between 1e-4 and 1
#   the other entries - picked from the listed discrete values
hyp_params = dict(
    learning_rate=loguniform(0.0001, 1),
    max_depth=list(range(2, 10)),
    subsample=[0.7, 0.8, 0.9, 1.0],
    n_estimators=[50, 100, 150, 200],
)

In [42]:
random_hyp = RandomizedSearchCV(estimator=clf, 
                               param_distributions=hyp_params,
                               n_iter=20,
                               cv=7,
                               scoring='accuracy',
                               random_state=7)

In [43]:
random_hyp.fit(X_train, y_train)

RandomizedSearchCV(cv=7,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=0, gpu_id=-1,
                                           grow_policy='depthwise',
                                           importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_bin=256,...
                                           n_estimators=100, n_jobs=0,
                                           num_parallel_tree=1,
   

In [44]:
random_hyp.best_params_

{'learning_rate': 0.014255533717547383,
 'max_depth': 2,
 'n_estimators': 100,
 'subsample': 0.9}

In [45]:
model_hyp = random_hyp.best_estimator_

In [46]:
y_pred_hyp = model_hyp.predict(X_test)

In [47]:
accuracy_score(y_test, y_pred_hyp)

0.6303501945525292

## Dumping and loading model_hyp. model_saved and model_hyp are the same.

In [48]:
from joblib import dump, load

In [49]:
dump(model_hyp, 'model_nba.joblib')

['model_nba.joblib']

In [50]:
model_saved = load('model_nba.joblib')

We find that the accuracy is the same as before

In [51]:
accuracy_score(y_test, model_saved.predict(X_test))

0.6303501945525292

# What should the API do?

## Take a home team and an away team, and return a prediction of the winning team

In [52]:
team_home = 'Toronto Raptors'
team_away = 'Boston Celtics'

In [53]:
def _mean_recent_plus_minus(games, team_name, n_games=30):
    """Return a team's mean PLUS_MINUS over its most recent games.

    Parameters
    ----------
    games : pandas.DataFrame
        Team-game rows with at least TEAM_NAME, GAME_DATE (datetime) and
        PLUS_MINUS columns.
    team_name : str
        Full team name as it appears in the TEAM_NAME column.
    n_games : int, default 30
        Number of most recent games (by GAME_DATE) to average.

    Raises
    ------
    ValueError
        If no row with a PLUS_MINUS value exists for ``team_name``.
    """
    # Rows without a PLUS_MINUS value are dropped first so they do not take
    # up one of the n_games slots.
    team_games = (
        games[games['TEAM_NAME'] == team_name]
        .dropna(subset=['PLUS_MINUS'])
    )
    if team_games.empty:
        # Without this check the mean would be NaN and the model would still
        # return a (meaningless) prediction.
        raise ValueError(f"No games with a PLUS_MINUS value found for team {team_name!r}")
    return team_games.sort_values('GAME_DATE').tail(n_games)['PLUS_MINUS'].mean()


def predict_games(team_home, team_away):
    """Predict whether the home team wins, using the saved model.

    Fetches game records from 31 Jan 2021 onwards, computes each team's mean
    PLUS_MINUS over its 30 most recent games, and feeds the home-minus-away
    difference to the classifier stored in ``model_nba.joblib``.

    Parameters
    ----------
    team_home : str
        Full name of the home team, e.g. ``'Toronto Raptors'``.
    team_away : str
        Full name of the away team, e.g. ``'Boston Celtics'``.

    Returns
    -------
    tuple
        ``(predict_home_win, predict_winning_probability)``: the predicted
        class (1 = home win, 0 = home loss, as encoded when the model was
        trained in this notebook) and the predicted probability of class 1.

    Raises
    ------
    ValueError
        If either team name has no usable games in the fetched data.
    """
    # Imports are local so the function can be lifted out of the notebook
    # without depending on earlier cells.
    import pandas as pd
    from joblib import load
    from nba_api.stats.endpoints import leaguegamefinder

    gamefinder = leaguegamefinder.LeagueGameFinder(
        date_from_nullable='01/31/2021',
        league_id_nullable='00',
    )
    games = gamefinder.get_data_frames()[0]

    # Only these columns are needed. `.copy()` avoids writing into a slice of
    # the API frame when GAME_DATE is converted below.
    games = games[['TEAM_NAME', 'GAME_DATE', 'PLUS_MINUS']].copy()
    games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

    home_plus_minus = _mean_recent_plus_minus(games, team_home)
    away_plus_minus = _mean_recent_plus_minus(games, team_away)
    games_diff = home_plus_minus - away_plus_minus

    # The model was fitted on a one-column DataFrame, so prediction uses the
    # same 2-D shape (one row, one feature) and the same column name instead
    # of a bare 1-D array.
    features = pd.DataFrame({'avg_30_plus_minus_diff': [games_diff]})

    model_saved = load('model_nba.joblib')

    predict_home_win = model_saved.predict(features)[0]
    # Column 1 of predict_proba is the probability of class 1 (home win).
    predict_winning_probability = model_saved.predict_proba(features)[0][1]

    return predict_home_win, predict_winning_probability

In [54]:
predict_games('Toronto Raptors', 'Boston Celtics')

(0, 0.42300433)