# XGBoost Regression Model
This model tries to predict the point difference between the home and the visiting team based on traditional box score stats of the previous matches (15 in this case).

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from nba_analysis import NBAPlayer, NBATeam, NBAGame

pd.set_option("display.max_columns", None)

### Getting data
##### Mainly; Teams, Players, and Matches of the Current Season.

In [2]:
teams = NBATeam.get_teams_list()
players = NBAPlayer.get_players_list(is_active=True)

In [3]:
all_matches = {
    k: team.get_last_matches().dropna()
    for k, team in tqdm(teams.items(), "Teams", len(teams))
}

Teams: 100%|██████████| 30/30 [01:16<00:00,  2.55s/it]


### Data Wrangling

#### Combining stats for both home and visiting team in one row per game.

In [4]:
matches = pd.concat([v for _, v in all_matches.items()])
matches = matches.merge(matches, on=["Game_ID", "GAME_DATE"])
matches = matches[~matches["MATCHUP_x"].str.contains("@")]
matches["GAME_DATE"] = pd.to_datetime(matches["GAME_DATE"])
matches = matches[matches.Team_ID_x != matches.Team_ID_y]
matches = matches.sort_values("GAME_DATE", ascending=False).reset_index(drop=True)
matches

Unnamed: 0,Team_ID_x,Game_ID,GAME_DATE,MATCHUP_x,WL_x,W_x,L_x,W_PCT_x,MIN_x,FGM_x,FGA_x,FG_PCT_x,FG3M_x,FG3A_x,FG3_PCT_x,FTM_x,FTA_x,FT_PCT_x,OREB_x,DREB_x,REB_x,AST_x,STL_x,BLK_x,TOV_x,PF_x,PTS_x,Team_ID_y,MATCHUP_y,WL_y,W_y,L_y,W_PCT_y,MIN_y,FGM_y,FGA_y,FG_PCT_y,FG3M_y,FG3A_y,FG3_PCT_y,FTM_y,FTA_y,FT_PCT_y,OREB_y,DREB_y,REB_y,AST_y,STL_y,BLK_y,TOV_y,PF_y,PTS_y
0,1610612749,0021900615,2020-01-16,MIL vs. BOS,W,37.0,6.0,0.860,240,43,87,0.494,16,31,0.516,26,37,0.703,7,43,50,21,6,6,10,22,128,1610612738,BOS @ MIL,L,27.0,13.0,0.675,240,43,98,0.439,15,49,0.306,22,27,0.815,10,33,43,16,4,7,7,25,123
1,1610612740,0021900616,2020-01-16,NOP vs. UTA,W,16.0,26.0,0.381,265,51,93,0.548,11,26,0.423,25,35,0.714,9,36,45,29,8,9,13,27,138,1610612762,UTA @ NOP,L,28.0,13.0,0.683,265,46,100,0.460,15,39,0.385,25,32,0.781,11,32,43,23,9,6,13,29,132
2,1610612752,0021900614,2020-01-16,NYK vs. PHX,L,11.0,31.0,0.262,240,37,90,0.411,8,26,0.308,16,24,0.667,10,30,40,21,12,3,18,21,98,1610612756,PHX @ NYK,W,17.0,24.0,0.415,240,47,92,0.511,12,30,0.400,15,18,0.833,14,39,53,31,11,4,19,22,121
3,1610612743,0021900610,2020-01-15,DEN vs. CHA,W,28.0,12.0,0.700,240,38,79,0.481,9,28,0.321,15,25,0.600,3,37,40,28,8,7,11,18,100,1610612766,CHA @ DEN,L,15.0,29.0,0.341,240,33,81,0.407,12,35,0.343,8,14,0.571,9,33,42,28,4,6,14,24,86
4,1610612760,0021900609,2020-01-15,OKC vs. TOR,L,23.0,18.0,0.561,240,43,85,0.506,16,42,0.381,19,20,0.950,4,27,31,32,14,3,18,17,121,1610612761,TOR @ OKC,W,26.0,14.0,0.650,240,52,85,0.612,15,27,0.556,11,15,0.733,5,31,36,33,13,1,18,23,130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,1610612762,0021900011,2019-10-23,UTA vs. OKC,W,1.0,0.0,1.000,240,40,90,0.444,6,24,0.250,14,20,0.700,11,38,49,15,4,5,11,31,100,1610612760,OKC @ UTA,L,0.0,1.0,0.000,240,32,83,0.386,10,27,0.370,21,31,0.677,7,41,48,15,5,4,13,23,95
612,1610612755,0021900008,2019-10-23,PHI vs. BOS,W,1.0,0.0,1.000,240,37,85,0.435,7,29,0.241,26,36,0.722,12,50,62,24,6,7,15,34,107,1610612738,BOS @ PHI,L,0.0,1.0,0.000,240,33,90,0.367,7,26,0.269,20,34,0.588,10,31,41,18,4,2,10,29,93
613,1610612753,0021900005,2019-10-23,ORL vs. CLE,W,1.0,0.0,1.000,240,37,86,0.430,9,30,0.300,11,19,0.579,7,39,46,24,12,4,13,18,94,1610612739,CLE @ ORL,L,0.0,1.0,0.000,240,33,88,0.375,9,34,0.265,10,15,0.667,10,44,54,20,8,2,16,15,85
614,1610612746,0021900002,2019-10-22,LAC vs. LAL,W,1.0,0.0,1.000,240,42,81,0.519,11,31,0.355,17,24,0.708,11,34,45,24,8,5,14,25,112,1610612747,LAL @ LAC,L,0.0,1.0,0.000,240,37,85,0.435,13,33,0.394,15,21,0.714,9,32,41,20,4,7,14,24,102


#### Aggregating the last n games (home & road) of each team to predict the score difference of the current game.

In [5]:
last_n_games = 15
result = []
for game in tqdm(sorted(matches.Game_ID)):
    my_game = matches[matches.Game_ID == game]
    team1 = my_game["Team_ID_x"].values[0]
    team2 = my_game["Team_ID_y"].values[0]

    home_team = matches[(matches.Game_ID < game) & ((matches.Team_ID_x == team1))].iloc[
        :last_n_games
    ]
    home_team1 = (
        home_team[home_team.columns[home_team.columns.str.contains("_x")]]
        .drop(["MATCHUP_x", "WL_x"], axis=1)
        .rename(columns=lambda x: x[:-2])
    )
    home_team2 = (
        home_team[home_team.columns[home_team.columns.str.contains("_y")]]
        .drop(["MATCHUP_y", "WL_y"], axis=1)
        .rename(columns=lambda x: x[:-2])
    )
    home_team = pd.concat([home_team1, home_team2])

    road_team = matches[(matches.Game_ID < game) & ((matches.Team_ID_y == team2))].iloc[
        :last_n_games
    ]
    road_team1 = (
        road_team[road_team.columns[road_team.columns.str.contains("_x")]]
        .drop(["MATCHUP_x", "WL_x"], axis=1)
        .rename(columns=lambda x: x[:-2])
    )
    road_team2 = (
        road_team[road_team.columns[road_team.columns.str.contains("_y")]]
        .drop(["MATCHUP_y", "WL_y"], axis=1)
        .rename(columns=lambda x: x[:-2])
    )
    road_team = pd.concat([road_team1, road_team2])

    home_team = home_team.mean()
    road_team = road_team.mean()
    final = home_team - road_team
    final["TARGET"] = my_game["PTS_x"].values[0] - my_game["PTS_y"].values[0]
    final["GAME_ID"] = game
    final["TEAM_ID"] = my_game["Team_ID_x"].values[0]
    result.append(final)

    ## This may seem like a duplication, not sure how much it affects the model to add the match both ways.
    final = road_team - home_team
    final["TARGET"] = my_game["PTS_y"].values[0] - my_game["PTS_x"].values[0]
    final["GAME_ID"] = game
    final["TEAM_ID"] = my_game["Team_ID_y"].values[0]
    result.append(final)

100%|██████████| 616/616 [00:13<00:00, 47.16it/s]


#### Ordering by time

In [6]:
features = pd.DataFrame(result).dropna().drop(columns=["Team_ID","GAME_ID", "W", "L"]).reset_index(drop=True)
features.head()

Unnamed: 0,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,TARGET,TEAM_ID
0,0.0,-25.0,0.0,-8.0,0.0385,2.0,-1.5,0.04,-3.0,-8.0,0.134,3.5,-8.5,-5.0,5.5,-1.5,-2.5,0.0,-7.0,-1.0,-22.0,1610612766
1,0.0,25.0,0.0,8.0,-0.0385,-2.0,1.5,-0.04,3.0,8.0,-0.134,-3.5,8.5,5.0,-5.5,1.5,2.5,0.0,7.0,1.0,22.0,1610612750
2,0.0,25.0,4.0,16.0,-0.0365,5.5,13.0,0.0385,-2.5,0.0,-0.0825,3.5,1.5,5.0,-2.0,-2.5,3.5,-3.0,1.0,11.0,4.0,1610612751
3,0.0,-25.0,-4.0,-16.0,0.0365,-5.5,-13.0,-0.0385,2.5,0.0,0.0825,-3.5,-1.5,-5.0,2.0,2.5,-3.5,3.0,-1.0,-11.0,-4.0,1610612752
4,0.0,-25.0,-3.0,-8.0,0.001,-0.5,4.5,-0.0455,-5.5,-0.5,-0.224,-8.0,8.0,0.0,1.5,1.0,2.0,-5.0,-3.0,-12.0,3.0,1610612745


#### As we have a time-series dataset, we are splitting at a fixed point to make sure that our test set comes after our train set chronologically.

In [7]:
cutoff = 850
features_tr, features_te = features.iloc[:850, :], features.iloc[850:, :]
x_tr, x_te,  = features_tr.iloc[:, :-2], features_te.iloc[:, :-2]
y_tr, y_te, = features_tr.iloc[:, -2], features_te.iloc[:, -2]
groups_tr, groups_te = features_tr.iloc[:, -1], features_te.iloc[:, -1]

#### Fitting an XGBoost Regressor with BayesSearchCV and GroupKFold

In [8]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from skopt import BayesSearchCV
from sklearn.model_selection import GroupKFold
from functools import partial

xgb = BayesSearchCV(
    XGBRegressor(n_estimators=30, random_state=1),
    search_spaces={
        "learning_rate": (0.01, 1.0, "log-uniform"),
        "min_child_weight": (0, 10),
        "max_depth": (0, 3),
        "subsample": (0.01, 1.0, "uniform"),
        "colsample_bytree": (0.01, 1.0, "uniform"),
        "colsample_bylevel": (0.01, 1.0, "uniform"),
        "reg_lambda": (1e-9, 1000, "log-uniform"),
        "reg_alpha": (1e-9, 1.0, "log-uniform"),
        "gamma": (1e-9, 0.5, "log-uniform"),
    },
    cv=GroupKFold(),
    n_jobs=4,
    n_iter=40,
    verbose=0,
    refit=True,
    random_state=1,
)
xgb.fit(x_tr, y_tr, groups=groups_tr)

y_pred = xgb.predict(x_te)

print(mean_absolute_error(y_te, y_pred))
print(r2_score(y_te, y_pred))



10.706859073577784
0.13781037865362855


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


### Conclusion
- We can notice that in this case (although we are addressing the problem in a different way) that the R-squared score is double what we got from the baseline. It means that we can explain 14% of the variation in the dataset. Although this is not good, it is still better than the baseline model.