In [1]:
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt

In [21]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [125]:
from interpret import show
from interpret.data import ClassHistogram
from interpret.glassbox import ExplainableBoostingClassifier, ClassificationTree, DecisionListClassifier
from interpret.perf import ROC

In [3]:
connection = sqlite3.connect('data/nba.db')

In [109]:
df=pd.read_sql("select * from LAL", connection)

In [110]:
df.columns

Index(['index', 'SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME',
       'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS',
       'opponent'],
      dtype='object')

In [111]:
df['target']=df.WL.apply(lambda x: 1 if x=='W' else 0)

In [18]:
#Match count
df.groupby('opponent').index.count()

opponent
ATL     71
BAR      2
BKN     19
BOS    105
CHA     36
CHH     29
CHI     77
CLE     76
DAL    171
DEN    194
DET     90
GOS     79
GSW    127
HOU    183
IND     78
KCK     13
LAC    174
MAC      1
MEM     70
MIA     73
MIL     73
MIN    131
NJN     59
NOH     35
NOK      8
NOP     27
NYK     77
OKC     55
ORL     68
PHI     58
PHL     26
PHX    214
POR    209
SAC    183
SAN     69
SAS    124
SDC      6
SEA    135
TOR     52
UTA    131
UTH     64
VAN     23
WAS     72
Name: index, dtype: int64

In [20]:
df.target.value_counts()/len(df)

1    0.597421
0    0.402579
Name: target, dtype: float64

In [17]:
#Win rate by team
(df.groupby('opponent').target.sum()/df.groupby('opponent').index.count()).sort_values(ascending=False)

opponent
KCK    1.000000
MAC    1.000000
VAN    0.956522
NOH    0.771429
NJN    0.745763
PHL    0.730769
CHH    0.724138
ATL    0.690141
MIN    0.671756
DAL    0.666667
SDC    0.666667
SAC    0.650273
ORL    0.647059
GOS    0.645570
LAC    0.632184
WAS    0.625000
NOK    0.625000
GSW    0.622047
DEN    0.608247
MIL    0.602740
SEA    0.600000
IND    0.589744
CHA    0.583333
UTH    0.578125
TOR    0.576923
PHX    0.574766
BOS    0.571429
CHI    0.571429
PHI    0.568966
HOU    0.568306
SAN    0.565217
MIA    0.561644
NYK    0.558442
DET    0.555556
POR    0.555024
BKN    0.526316
MEM    0.514286
CLE    0.513158
BAR    0.500000
UTA    0.488550
NOP    0.481481
SAS    0.467742
OKC    0.454545
dtype: float64

### Baseline

In [22]:
y=df.target

In [23]:
X=df.opponent

In [24]:
X=pd.get_dummies(X)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=117)

In [27]:
baseline_model=LogisticRegression().fit(X_train,y_train)

In [29]:
y_pred=baseline_model.predict(X_test)
y_pred_proba = baseline_model.predict_proba(X_test)[::,1]

In [31]:
print('Test Accuracy: '+str(accuracy_score(y_test, y_pred)))
print('Train Accuracy: '+str(accuracy_score(y_train, y_pred=baseline_model.predict(X_train))))

Test Accuracy: 0.5952380952380952
Train Accuracy: 0.6021731510690501


In [37]:
print('Test ROC: '+str(roc_auc_score(y_test, y_pred_proba)))
print('Train ROC: '+str(roc_auc_score(y_train, baseline_model.predict_proba(X_train)[::,1])))

Test ROC: 0.5167965389451393
Train ROC: 0.5862287919832169


In [35]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [36]:
tn, fp, fn, tp

(35, 241, 48, 390)

### Last N-games

In [112]:
df=df.astype({'GAME_DATE':'datetime64'}).sort_values('GAME_DATE')

In [113]:
df.columns

Index(['index', 'SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME',
       'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS',
       'opponent', 'target'],
      dtype='object')

In [114]:
numerical_features=['MIN', 'PTS', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

In [115]:
final_df=df.copy(deep=True)

In [116]:
for f in numerical_features:
    final_df[f]=df[f].rolling(window=10).mean()

In [117]:
final_df.dropna(inplace=True)

In [118]:
final_df

Unnamed: 0,index,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,...,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,opponent,target
2329,2329,21996,1610612747,LAL,Los Angeles Lakers,0029600126,1996-11-17,LAL @ PHX,W,245.3,...,30.9,42.7,22.5,6.9,6.5,18.3,21.7,4.20,PHX,1
2328,2328,21996,1610612747,LAL,Los Angeles Lakers,0029600136,1996-11-19,LAL @ GSW,W,245.1,...,29.6,41.2,23.3,7.2,6.3,17.3,21.3,3.10,GSW,1
2327,2327,21996,1610612747,LAL,Los Angeles Lakers,0029600144,1996-11-20,LAL vs. UTA,L,245.3,...,28.6,40.4,23.1,7.0,6.2,16.7,21.8,0.90,UTA,0
2326,2326,21996,1610612747,LAL,Los Angeles Lakers,0029600157,1996-11-22,LAL vs. SAS,W,245.1,...,28.1,40.2,23.3,7.7,5.8,16.8,21.6,1.30,SAS,1
2325,2325,21996,1610612747,LAL,Los Angeles Lakers,0029600173,1996-11-24,LAL vs. HOU,L,244.9,...,27.5,39.8,23.1,8.4,5.3,16.5,22.0,1.80,HOU,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,4,42019,1610612747,LAL,Los Angeles Lakers,0041900402,2020-10-02,LAL vs. MIA,W,240.2,...,33.4,43.0,26.8,7.8,5.4,14.1,24.6,9.40,MIA,1
3,3,42019,1610612747,LAL,Los Angeles Lakers,0041900403,2020-10-04,LAL @ MIA,L,240.1,...,33.2,43.0,26.6,8.0,5.0,14.7,25.3,7.30,MIA,0
2,2,42019,1610612747,LAL,Los Angeles Lakers,0041900404,2020-10-06,LAL @ MIA,W,240.1,...,32.4,42.0,26.1,7.6,5.0,14.6,24.0,6.90,MIA,1
1,1,42019,1610612747,LAL,Los Angeles Lakers,0041900405,2020-10-09,LAL vs. MIA,L,239.9,...,30.9,41.1,25.7,8.1,4.8,14.2,23.7,4.26,MIA,0


In [119]:
pd.get_dummies(final_df.opponent).columns

Index(['ATL', 'BAR', 'BKN', 'BOS', 'CHA', 'CHH', 'CHI', 'CLE', 'DAL', 'DEN',
       'DET', 'GSW', 'HOU', 'IND', 'LAC', 'MAC', 'MEM', 'MIA', 'MIL', 'MIN',
       'NJN', 'NOH', 'NOK', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR',
       'SAC', 'SAS', 'SEA', 'TOR', 'UTA', 'VAN', 'WAS'],
      dtype='object')

In [120]:
X=final_df[numerical_features].merge(pd.get_dummies(final_df.opponent), how='left', left_index=True,right_index=True)

In [121]:
X=X.rename(columns={'MIN_x':'minutes','MIN_y':'MIN'})

In [145]:
X=X.shift(1).dropna()#.merge(df.target, how='left', left_index=True, right_index=True).dropna()

In [146]:
y=final_df.target[1:]

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=117)

### EBM

In [151]:
hist = ClassHistogram().explain_data(X_train, y_train, name = 'Train Data')
show(hist)

In [154]:
ebm = ExplainableBoostingClassifier(feature_names=None,
    feature_types=None,
    max_bins=100,
    max_interaction_bins=32,
    binning='uniform',
    mains='all',
    interactions=0,
    outer_bags=16,
    inner_bags=0,
    learning_rate=0.1,
    validation_size=0.1,
    early_stopping_rounds=150,
    early_stopping_tolerance=0.0001,
    max_rounds=5000,
    max_leaves=3,
    min_samples_leaf=2,
    n_jobs=-2,
    random_state=117)
ebm.fit(X_train, y_train)   

ExplainableBoostingClassifier(binning='uniform', early_stopping_rounds=150,
                              feature_names=['minutes', 'PTS', 'FGM', 'FGA',
                                             'FG_PCT', 'FG3M', 'FG3A',
                                             'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
                                             'OREB', 'DREB', 'REB', 'AST',
                                             'STL', 'BLK', 'TOV', 'PF',
                                             'PLUS_MINUS', 'ATL', 'BAR', 'BKN',
                                             'BOS', 'CHA', 'CHH', 'CHI', 'CLE',
                                             'DAL', 'DEN', ...],
                              feature_types=['continuous', 'continuous',
                                             'continuous...
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             

In [155]:
ebm_global = ebm.explain_global()
show(ebm_global)

In [156]:
ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name='EBM')
show(ebm_perf)

In [157]:
y_pred_ebm=ebm.predict(X_test)
y_pred_proba_ebm = ebm.predict_proba(X_test)[::,1]

In [158]:
print('Test Accuracy: '+str(accuracy_score(y_test, y_pred_ebm)))
print('Train Accuracy: '+str(accuracy_score(y_train, y_pred=ebm.predict(X_train))))

Test Accuracy: 0.6263736263736264
Train Accuracy: 0.6800660792951542


In [159]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_ebm).ravel()

In [160]:
tn, fp, fn, tp

(66, 142, 28, 219)