In [12]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [13]:
df0 = pd.read_pickle('.\\data\\MRegularSeasonDetailedResults.pkl')
df1 = df0.loc[df0['Season'] >= 2021].copy(deep=True)

In [14]:
kp_eff = pd.read_pickle('.\\data\\\kp_eff_stats.pkl')
teams = pd.read_pickle('.\\data\\MTeams.pkl')
spelling = pd.read_pickle('.\\data\\MTeamSpellings_lex.pkl')
kp_height = pd.read_pickle('.\\data\\kp_height.pkl')
kp_four_factors = pd.read_pickle('.\\data\\kp_four_factors.pkl')
kp_point_dist = pd.read_pickle('.\\data\\kp_point_dist.pkl')

In [15]:
kp_eff2 = kp_eff.merge(spelling, how='left', left_on = 'Team', right_on = 'TeamNameSpelling')
kp_height2 = kp_height.merge(spelling, how='left', left_on = 'Team', right_on = 'TeamNameSpelling')
kp_four_factors2 = kp_four_factors.merge(spelling, how='left', left_on = 'Team', right_on = 'TeamNameSpelling')
kp_point_dist2 = kp_point_dist.merge(spelling, how='left', left_on = 'Team', right_on = 'TeamNameSpelling')

In [16]:
#Efficiency Table 

eff_cols = ['TeamID', 'season', 'Tempo-Adj', 'Avg. Poss Length-Offense', 'Avg. Poss Length-Offense.Rank', 
            'Avg. Poss Length-Defense', 'Avg. Poss Length-Defense.Rank', 'Off. Efficiency-Adj', 
            'Off. Efficiency-Adj.Rank', 'Def. Efficiency-Adj', 'Def. Efficiency-Adj.Rank'
]
   
kp_eff3 = kp_eff2.loc[:, eff_cols]
kp_eff3.columns =  ['TeamID', 'season', 'tempo_adj', 'avg_poss_len_off', 'avg_poss_len_off_rnk', 
                    'avg_poss_len_def', 'avg_poss_len_def_rnk', 'off_eff_adj', 'off_eff_adj_rnk',
                    'def_eff_adj', 'def_eff_adj_rnk']
   

In [17]:
#Height Table 
hgt_cols = ['TeamID', 'season', 'AvgHgt', 'AvgHgt.Rank', 'EffHgt', 'EffHgt.Rank', 'C-Hgt', 
            'PF-Hgt', 'SF-Hgt', 'SG-Hgt', 'PG-Hgt', 'Experience', 'Bench', 'Continuity',
           ]
kp_height3 = kp_height2.loc[:, hgt_cols]
kp_height3.columns = ['TeamID', 'season', 'avg_hgt', 'avg_hgt_rank', 'eff_hgt', 'eff_hgt_rank', 'c_hgt', 
                                          'pf_hgt', 'sf_hgt', 'sg_hgt', 'pg_hgt', 'exp', 'bench', 'continuity',]

In [18]:
#Four Factos Table 
ff_cols = ['TeamID', 'season', 'Off-eFG%', 'Off-TO%', 'Off-OR%', 'Off-FTRate', 
           'Def-eFG%', 'Def-TO%', 'Def-OR%', 'Def-FTRate',
]
           
kp_four_factors3 = kp_four_factors2.loc[:, ff_cols]
kp_four_factors3.columns = ['TeamID', 'season', 'off_efg_pct', 'off_to_pct', 'off_or_pct', 'off_ft_pct', 
                                                'def_efg_pct', 'def_to_pct', 'def_or_pct', 'def_ft_pct',]

In [19]:
pd_cols = ['TeamID', 'season', 'Off-FT', 'Off-2P', 'Off-3P', 'Def-FT', 
           'Def-2P', 'Def-3P', ]
kp_point_dist3 = kp_point_dist2.loc[:, pd_cols]
kp_point_dist3.columns = ['TeamID', 'season', 'off_ft', 'off_2p', 'off_3p',
                                              'def_ft', 'def_2p', 'def_3p', ]

In [20]:
kp_master = kp_eff3.merge(kp_height3.merge(kp_four_factors3.merge(kp_point_dist3, how='left', on=['TeamID', 'season']), how='left', on=['TeamID', 'season']), how='left', on = ['TeamID', 'season'])

In [21]:
kp_master.head()

Unnamed: 0,TeamID,season,tempo_adj,avg_poss_len_off,avg_poss_len_off_rnk,avg_poss_len_def,avg_poss_len_def_rnk,off_eff_adj,off_eff_adj_rnk,def_eff_adj,...,def_efg_pct,def_to_pct,def_or_pct,def_ft_pct,off_ft,off_2p,off_3p,def_ft,def_2p,def_3p
0,1164.0,2021,76.8,14.0,1,16.1,13,91.0,334,102.6,...,47.9,20.4,32.5,27.0,21.9,41.9,36.1,17.1,50.3,32.6
1,1184.0,2021,74.8,15.4,16,16.1,17,101.0,197,101.5,...,51.0,24.3,30.2,32.2,15.0,49.4,35.6,18.8,53.2,28.0
2,1376.0,2021,74.6,15.2,12,16.7,64,103.9,139,98.9,...,53.2,22.0,31.3,39.0,16.3,55.7,28.0,21.8,48.0,30.1
3,1154.0,2021,74.2,15.5,18,16.7,62,104.0,136,109.1,...,52.8,13.9,25.7,25.9,18.7,46.6,34.7,16.0,57.1,26.9
4,1284.0,2021,74.2,14.6,4,17.8,271,99.0,237,105.1,...,48.6,20.9,28.5,38.4,21.8,47.9,30.4,21.5,46.4,32.1


In [22]:
df1['game_count'] = 1
df1['WLoc'] = np.where((df1['WLoc'] == 'N') & (df1['WTeamID'] > df1['LTeamID']), 'H', df1['WLoc'] )
df1['WLoc'] = np.where(df1['WLoc'] == 'N', 'A', df1['WLoc'])
df1.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,game_count
92832,2021,23,1101,70,1190,47,A,0,20,49,...,14,17,11,27,5,22,6,2,0,1
92833,2021,23,1104,81,1240,57,H,0,30,77,...,10,17,18,28,10,20,5,2,0,1
92834,2021,23,1111,81,1354,61,A,0,32,62,...,6,11,12,18,12,9,13,0,14,1
92835,2021,23,1113,94,1348,88,A,0,27,61,...,21,35,7,26,14,16,8,2,0,1
92836,2021,23,1114,71,1341,66,A,0,22,51,...,4,4,7,15,14,24,12,1,0,1


In [23]:
df1['home_id'] = np.where(df1['WLoc'] == 'H', df1['WTeamID'], df1['LTeamID'])
df1['away_id'] = np.where(df1['WLoc'] == 'H', df1['LTeamID'], df1['WTeamID'])
df1['home_score'] = np.where(df1['WLoc'] == 'H', df1['WScore'], df1['LScore'])
df1['away_score'] = np.where(df1['WLoc'] == 'H', df1['LScore'], df1['WScore'])
df1['home_win'] = np.where(df1['WLoc'] == 'H', 1, 0)

In [24]:
result_cols = ['Season', 'home_id', 'away_id', 'home_score', 'away_score', 'home_win']
df_results = df1.loc[:, result_cols]
df_results.columns = ['season', 'home_id', 'away_id', 'home_score', 'away_score', 'home_win']

In [25]:
kp_home = kp_master.copy(deep=True)
col_home = list(kp_home.columns)
col_home2 = [f'home_{i}' if _ > 1 else i for _, i in enumerate(col_home)]
kp_home.columns = col_home2

kp_away = kp_master.copy(deep=True)
col_away = list(kp_away.columns)
col_away2 = [f'away_{i}' if _ > 1 else i for _, i in enumerate(col_away)]
kp_away.columns = col_away2

df_results1 = df_results.merge(kp_home, how='left', left_on=['season', 'home_id'], right_on = ['season', 'TeamID'])
df_results2 = df_results1.merge(kp_away, how='left', left_on=['season', 'away_id'], right_on = ['season', 'TeamID'])

In [36]:
df_results2.head()

Unnamed: 0,season,home_id,away_id,home_score,away_score,home_win,TeamID_x,home_tempo_adj,home_avg_poss_len_off,home_avg_poss_len_off_rnk,...,away_def_efg_pct,away_def_to_pct,away_def_or_pct,away_def_ft_pct,away_off_ft,away_off_2p,away_off_3p,away_def_ft,away_def_2p,away_def_3p
0,2021,1190,1101,47,70,0,1190.0,65.4,18.1,256,...,46.0,26.5,27.9,41.1,18.2,50.5,31.4,23.5,50.9,25.5
1,2021,1104,1240,81,57,1,1104.0,73.3,14.2,2,...,48.1,15.7,27.9,25.5,16.9,50.4,32.7,15.3,48.9,35.9
2,2021,1354,1111,61,81,0,1354.0,71.6,16.6,89,...,50.0,19.8,30.2,23.7,20.1,43.8,36.1,14.6,53.0,32.4
3,2021,1348,1113,88,94,0,1348.0,69.0,16.9,132,...,52.5,20.7,30.1,36.1,19.8,51.3,28.9,20.4,49.7,29.9
4,2021,1341,1114,66,71,0,1341.0,69.2,16.7,107,...,50.7,20.1,28.0,27.8,21.0,54.4,24.6,16.6,49.4,33.9


In [29]:
x_cols = list(df_results2.columns)[7:]

X = df_results2.loc[:, x_cols]
y = df_results2.loc[:, 'home_score']

In [39]:

X[x_cols] = X[x_cols].apply(pd.to_numeric, errors='coerce', axis=1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15815 entries, 0 to 15814
Data columns (total 71 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   home_tempo_adj             15770 non-null  float64
 1   home_avg_poss_len_off      15770 non-null  float64
 2   home_avg_poss_len_off_rnk  15770 non-null  float64
 3   home_avg_poss_len_def      15770 non-null  float64
 4   home_avg_poss_len_def_rnk  15770 non-null  float64
 5   home_off_eff_adj           15770 non-null  float64
 6   home_off_eff_adj_rnk       15770 non-null  float64
 7   home_def_eff_adj           15770 non-null  float64
 8   home_def_eff_adj_rnk       15770 non-null  float64
 9   home_avg_hgt               15770 non-null  float64
 10  home_avg_hgt_rank          15770 non-null  float64
 11  home_eff_hgt               15770 non-null  float64
 12  home_eff_hgt_rank          15770 non-null  float64
 13  home_c_hgt                 15770 non-null  flo

In [40]:
xgb_cl = xgb.XGBClassifier()
regressor = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3
)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [42]:
regressor.fit(X_train, y_train)

In [43]:
pd.DataFrame(regressor.feature_importances_.reshape(1, -1), columns=x_cols)

Unnamed: 0,home_tempo_adj,home_avg_poss_len_off,home_avg_poss_len_off_rnk,home_avg_poss_len_def,home_avg_poss_len_def_rnk,home_off_eff_adj,home_off_eff_adj_rnk,home_def_eff_adj,home_def_eff_adj_rnk,home_avg_hgt,...,away_def_efg_pct,away_def_to_pct,away_def_or_pct,away_def_ft_pct,away_off_ft,away_off_2p,away_off_3p,away_def_ft,away_def_2p,away_def_3p
0,0.026467,0.008672,0.067173,0.010534,0.00727,0.104596,0.091761,0.008983,0.00579,0.006085,...,0.007003,0.006319,0.013195,0.004919,0.006216,0.005556,0.007504,0.007673,0.002918,0.00685


In [44]:
y_pred = regressor.predict(X_test)

In [45]:
mean_squared_error(y_test, y_pred)

88.59434427493655

In [50]:
df_home_output = pd.DataFrame({'predicted_home' : y_pred, 
                               'actual_home' : y_test})

In [58]:
#y_away = df_results2.loc[:, 'home_score']


0        47
1        81
2        61
3        88
4        66
         ..
15810    67
15811    69
15812    67
15813    80
15814    74
Name: home_score, Length: 15815, dtype: int64

In [63]:
y_away_train = df_results2.loc[y_train.index, 'away_score']
y_away_test = df_results2.loc[y_test.index, 'away_score']

In [64]:
regressor_away = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=3
)

In [65]:
regressor_away.fit(X_train, y_away_train)
pd.DataFrame(regressor_away.feature_importances_.reshape(1, -1), columns=x_cols)

Unnamed: 0,home_tempo_adj,home_avg_poss_len_off,home_avg_poss_len_off_rnk,home_avg_poss_len_def,home_avg_poss_len_def_rnk,home_off_eff_adj,home_off_eff_adj_rnk,home_def_eff_adj,home_def_eff_adj_rnk,home_avg_hgt,...,away_def_efg_pct,away_def_to_pct,away_def_or_pct,away_def_ft_pct,away_off_ft,away_off_2p,away_off_3p,away_def_ft,away_def_2p,away_def_3p
0,0.047787,0.009354,0.009715,0.006816,0.004587,0.004774,0.008677,0.083548,0.066081,0.00689,...,0.00666,0.004775,0.007418,0.005749,0.003673,0.007639,0.006242,0.005442,0.004787,0.006214


In [66]:
y_away_pred = regressor_away.predict(X_test)

In [67]:
mean_squared_error(y_away_test, y_away_pred)

87.17815719911235

In [69]:
df_away_output = pd.DataFrame({'predicted_away' : y_away_pred, 
                               'actual_away' : y_away_test})

In [72]:
df_test = df_away_output.join(df_home_output)

In [74]:
df_test['pred_home_win'] = np.where(df_test['predicted_home']>df_test['predicted_away'], 1, 0)
df_test['actual_home_win'] = np.where(df_test['actual_home']>df_test['actual_away'], 1, 0)

In [75]:
df_test

Unnamed: 0,predicted_away,actual_away,predicted_home,actual_home,dff,diff_sq,pred_home_win,actual_home_win
1191,67.480240,80,71.422958,73,-1.577042,2.487060,1,0
10351,86.709122,84,74.757790,63,11.757790,138.245617,0,0
15411,79.949928,93,69.785568,79,-9.214432,84.905753,0,0
10388,69.328926,58,67.038567,57,10.038567,100.772819,0,0
9549,75.602951,79,67.532181,71,-3.467819,12.025770,0,0
...,...,...,...,...,...,...,...,...
1017,64.542671,53,78.691574,67,11.691574,136.692905,1,1
8471,74.316528,67,77.873962,69,8.873962,78.747209,1,1
8087,70.187630,53,61.274902,58,3.274902,10.724985,0,1
13332,72.145691,75,77.016045,87,-9.983955,99.679365,1,1


In [76]:
from sklearn.metrics import f1_score

f1_score(df_test['actual_home_win'], df_test['pred_home_win'], average='macro') 

0.7277315000669076

In [104]:
round1 = pd.read_pickle('.\\data\\round1_games.pkl').dropna()
round1['season'] = 2023

In [105]:
round1_0 = round1.merge(kp_home, how='left', left_on=['season', 'home_id'], right_on = ['season', 'TeamID'])
round1_1 = round1_0.merge(kp_away, how='left', left_on=['season', 'away_id'], right_on = ['season', 'TeamID'])

In [106]:
rnd1_x = round1_1.loc[:, x_cols]

rnd1_x[x_cols] = rnd1_x[x_cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [107]:
rnd1_away = regressor_away.predict(rnd1_x)
rnd1_home = regressor.predict(rnd1_x)

In [108]:
rnd1_teams = round1_1[['home_id', 'away_id']]

In [109]:
rnd1_scores = pd.DataFrame({'away_score' : rnd1_away, 
                            'home_score' : rnd1_home})

In [115]:
rnd1_frcst = rnd1_teams.join(rnd1_scores)

In [114]:
teams = pd.read_pickle('.\\data\\MTeams.pkl')
teams['home_team'] = teams['TeamName']
teams['away_team'] = teams['TeamName']
away_teams = teams.loc[:, ['TeamID', 'away_team']].copy()
home_teams = teams.loc[:, ['TeamID', 'home_team']].copy()

In [117]:
rnd1_frcst_1 = rnd1_frcst.merge(away_teams, how='left', left_on = 'away_id', right_on='TeamID')
rnd1_frcst_2 = rnd1_frcst_1.merge(home_teams, how='left', left_on = 'home_id', right_on = 'TeamID')

In [119]:
rnd1_frcst_2['total'] = rnd1_frcst_2['home_score'] + rnd1_frcst_2['away_score']

In [121]:
rnd1_frcst_2.loc[rnd1_frcst_2['away_score'] > rnd1_frcst_2['home_score']]

Unnamed: 0,home_id,away_id,away_score,home_score,TeamID_x,away_team,TeamID_y,home_team,total
12,1281.0,1429.0,85.92308,83.128235,1429,Utah St,1281,Missouri,169.051315
29,1369.0,1394.0,82.48217,77.749962,1394,TAM C. Christi,1369,SE Missouri St,160.232132
