In [1]:
from nba_api.stats.static import teams
import pandas as pd
import sqlite3

In [87]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [90]:
from interpret import show
from interpret.data import ClassHistogram
from interpret.glassbox import ExplainableBoostingClassifier, ClassificationTree, DecisionListClassifier
from interpret.perf import ROC

In [2]:
team_data=teams.get_teams()

In [5]:
connection = sqlite3.connect('data/nba.db')

In [133]:
df=pd.read_sql("select * from TOR", connection)

In [23]:
for idx,t in enumerate(team_data):
    if idx==0:
        continue
    print(t['abbreviation'])
    df=df.append(pd.read_sql("select * from "+str(t['abbreviation']), connection))


BOS
CLE
NOP
CHI
DAL
DEN
GSW
HOU
LAC
LAL
MIA
MIL
MIN
BKN
NYK
ORL
IND
PHI
PHX
POR
SAC
SAS
OKC
TOR
UTA
MEM
WAS
DET
CHA


In [134]:
df

Unnamed: 0,index,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,opponent
0,0,42019,1610612761,TOR,Toronto Raptors,0041900217,2020-09-11,TOR vs. BOS,L,240,...,7,37,44,17,1,6,18,23,-5.0,BOS
1,1,42019,1610612761,TOR,Toronto Raptors,0041900216,2020-09-09,TOR @ BOS,W,289,...,8,38,46,22,10,8,12,26,3.0,BOS
2,2,42019,1610612761,TOR,Toronto Raptors,0041900215,2020-09-07,TOR vs. BOS,L,241,...,10,26,36,19,8,5,13,24,-22.0,BOS
3,3,42019,1610612761,TOR,Toronto Raptors,0041900214,2020-09-05,TOR @ BOS,W,240,...,8,33,41,23,4,5,12,21,7.0,BOS
4,4,42019,1610612761,TOR,Toronto Raptors,0041900213,2020-09-03,TOR @ BOS,W,239,...,6,33,39,23,7,5,13,20,1.0,BOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2292,2292,21995,1610612761,TOR,Toronto Raptors,0029500065,1995-11-10,TOR vs. PHX,L,240,...,12,31,43,21,5,7,16,29,,PHX
2293,2293,21995,1610612761,TOR,Toronto Raptors,0029500046,1995-11-08,TOR vs. SAC,L,240,...,20,27,47,10,7,13,18,28,,SAC
2294,2294,21995,1610612761,TOR,Toronto Raptors,0029500035,1995-11-07,TOR @ CHI,L,240,...,7,25,32,25,6,3,19,26,,CHI
2295,2295,21995,1610612761,TOR,Toronto Raptors,0029500017,1995-11-04,TOR @ IND,L,240,...,18,19,37,20,10,6,11,29,,IND


In [135]:
df['target']=df.WL.apply(lambda x: 1 if x=='W' else 0)

In [136]:
(df.target.value_counts()/len(df))[1]

0.48802786242925555

In [137]:
df=df.astype({'GAME_DATE':'datetime64'}).sort_values('GAME_DATE')

In [138]:
df=df.rename(columns={'MIN':'minutes'})

In [139]:
numerical_features=['minutes', 'PTS', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS']

In [140]:
for f in numerical_features:
    df[f]=df[f].rolling(window=10).mean()

In [141]:
df

Unnamed: 0,index,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,minutes,...,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,opponent,target
2296,2296,21995,1610612761,TOR,Toronto Raptors,0029500012,1995-11-03,TOR vs. NJN,W,,...,,,,,,,,,NJN,1
2295,2295,21995,1610612761,TOR,Toronto Raptors,0029500017,1995-11-04,TOR @ IND,L,,...,,,,,,,,,IND,0
2294,2294,21995,1610612761,TOR,Toronto Raptors,0029500035,1995-11-07,TOR @ CHI,L,,...,,,,,,,,,CHI,0
2293,2293,21995,1610612761,TOR,Toronto Raptors,0029500046,1995-11-08,TOR vs. SAC,L,,...,,,,,,,,,SAC,0
2292,2292,21995,1610612761,TOR,Toronto Raptors,0029500065,1995-11-10,TOR vs. PHX,L,,...,,,,,,,,,PHX,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,4,42019,1610612761,TOR,Toronto Raptors,0041900213,2020-09-03,TOR @ BOS,W,239.7,...,38.0,47.7,27.1,8.2,5.4,12.6,22.7,8.2,BOS,1
3,3,42019,1610612761,TOR,Toronto Raptors,0041900214,2020-09-05,TOR @ BOS,W,239.8,...,37.4,46.3,27.0,7.5,5.3,12.5,22.1,8.1,BOS,1
2,2,42019,1610612761,TOR,Toronto Raptors,0041900215,2020-09-07,TOR vs. BOS,L,239.9,...,36.4,45.2,26.1,7.5,5.1,12.6,22.2,5.5,BOS,0
1,1,42019,1610612761,TOR,Toronto Raptors,0041900216,2020-09-09,TOR @ BOS,W,244.7,...,36.4,44.7,25.6,7.7,5.4,11.9,22.4,5.0,BOS,1


In [142]:
df.dropna(inplace=True)

In [143]:
df

Unnamed: 0,index,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,minutes,...,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,opponent,target
2205,2205,21996,1610612761,TOR,Toronto Raptors,0029600147,1996-11-21,TOR vs. CLE,L,240.0,...,26.1,39.1,18.7,10.2,5.5,18.4,26.9,-3.7,CLE,0
2204,2204,21996,1610612761,TOR,Toronto Raptors,0029600162,1996-11-23,TOR vs. ATL,L,239.9,...,25.5,39.0,18.3,10.1,5.5,17.6,25.5,-3.2,ATL,0
2203,2203,21996,1610612761,TOR,Toronto Raptors,0029600182,1996-11-26,TOR vs. SAC,L,239.8,...,26.6,40.3,18.1,9.9,5.5,18.3,25.4,-3.2,SAC,0
2202,2202,21996,1610612761,TOR,Toronto Raptors,0029600189,1996-11-27,TOR vs. CHH,W,239.9,...,26.2,40.2,18.1,10.1,5.1,17.7,24.4,-3.2,CHH,1
2201,2201,21996,1610612761,TOR,Toronto Raptors,0029600215,1996-11-30,TOR @ MIN,L,240.1,...,25.5,39.8,17.0,9.9,4.8,17.7,23.9,-4.2,MIN,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,4,42019,1610612761,TOR,Toronto Raptors,0041900213,2020-09-03,TOR @ BOS,W,239.7,...,38.0,47.7,27.1,8.2,5.4,12.6,22.7,8.2,BOS,1
3,3,42019,1610612761,TOR,Toronto Raptors,0041900214,2020-09-05,TOR @ BOS,W,239.8,...,37.4,46.3,27.0,7.5,5.3,12.5,22.1,8.1,BOS,1
2,2,42019,1610612761,TOR,Toronto Raptors,0041900215,2020-09-07,TOR vs. BOS,L,239.9,...,36.4,45.2,26.1,7.5,5.1,12.6,22.2,5.5,BOS,0
1,1,42019,1610612761,TOR,Toronto Raptors,0041900216,2020-09-09,TOR @ BOS,W,244.7,...,36.4,44.7,25.6,7.7,5.4,11.9,22.4,5.0,BOS,1


In [144]:
pd.get_dummies(df.opponent)

Unnamed: 0,ATL,BKN,BOS,CHA,CHH,CHI,CLE,DAL,DEN,DET,...,RMD,SAC,SAS,SEA,SLA,TOR,UTA,VAN,WAS,ZAK
2205,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2204,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2203,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2202,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [145]:
t=df[numerical_features].shift(1)

In [146]:
t['target']=df.target

In [147]:
t.dropna(inplace=True)

In [148]:
t

Unnamed: 0,minutes,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,target
2204,240.0,95.5,34.3,80.3,0.4285,8.7,21.9,0.4045,18.2,23.7,...,13.0,26.1,39.1,18.7,10.2,5.5,18.4,26.9,-3.7,0
2203,239.9,94.4,34.4,80.7,0.4276,8.1,21.6,0.3838,17.5,21.9,...,13.5,25.5,39.0,18.3,10.1,5.5,17.6,25.5,-3.2,0
2202,239.8,93.3,33.7,79.9,0.4231,8.0,21.6,0.3792,17.9,22.3,...,13.7,26.6,40.3,18.1,9.9,5.5,18.3,25.4,-3.2,1
2201,239.9,92.5,33.3,80.1,0.4175,8.1,21.6,0.3842,17.8,22.5,...,14.0,26.2,40.2,18.1,10.1,5.1,17.7,24.4,-3.2,0
2200,240.1,90.2,32.2,79.5,0.4057,7.5,21.0,0.3619,18.3,22.8,...,14.3,25.5,39.8,17.0,9.9,4.8,17.7,23.9,-4.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,239.8,116.2,40.8,88.6,0.4592,15.5,40.1,0.3842,19.1,25.5,...,9.6,38.2,47.8,27.3,9.2,5.6,13.8,23.0,9.0,1
3,239.7,115.8,41.5,90.3,0.4579,15.3,40.2,0.3782,17.5,23.8,...,9.7,38.0,47.7,27.1,8.2,5.4,12.6,22.7,8.2,1
2,239.8,114.4,40.7,89.2,0.4541,15.8,40.6,0.3868,17.2,23.1,...,8.9,37.4,46.3,27.0,7.5,5.3,12.5,22.1,8.1,0
1,239.9,110.8,40.0,88.7,0.4485,15.2,40.3,0.3749,15.6,20.4,...,8.8,36.4,45.2,26.1,7.5,5.1,12.6,22.2,5.5,1


In [85]:
for idx,team in enumerate(team_data):
    if idx==0:
        continue
    
    df=pd.read_sql("select * from "+str(team['abbreviation']), connection)
    df=df.astype({'GAME_DATE':'datetime64'}).sort_values('GAME_DATE')
    df=df.rename(columns={'MIN':'minutes'})
    df['target']=df.WL.apply(lambda x: 1 if x=='W' else 0)
    print(team['abbreviation'],(df.target.value_counts()/len(df))[1])
    df.dropna(inplace=True)
    temp=df[numerical_features].shift(1)
    temp['target']=df.target
    temp.dropna(inplace=True)
    t=t.append(temp)



BOS 0.5579399141630901
CLE 0.494026284348865
NOP 0.4589082183563287
CHI 0.5310850439882698
DAL 0.5120285120285121
DEN 0.4848851269649335
GSW 0.47944377267230953
HOU 0.5703399765533411
LAC 0.4206471494607088
LAL 0.5974208017942249
MIA 0.5229846768820786
MIL 0.4797460701330109
MIN 0.4035608308605341
BKN 0.42110091743119266
NYK 0.4639423076923077
ORL 0.4813473379210431
IND 0.5173951828724354
PHI 0.4612146722790138
PHX 0.5304295942720764
POR 0.5503256364712847
SAC 0.42383900928792567
SAS 0.6132533561839475
OKC 0.5553254437869822
TOR 0.48802786242925555
UTA 0.5728070175438597
MEM 0.42679127725856697
WAS 0.42014210688909487
DET 0.5322959483264826
CHA 0.43735676088617265


In [116]:
t

Unnamed: 0,minutes,PTS,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PLUS_MINUS,target
2163,240.4,87.2,30.2,74.2,0.4061,7.8,21.1,0.3560,19.0,25.6,...,12.4,27.4,39.8,16.3,8.1,5.1,16.6,19.0,-2.2,0
2162,240.3,85.4,30.0,74.8,0.4000,8.1,21.8,0.3614,17.3,23.9,...,12.4,27.5,39.9,16.6,7.8,5.1,15.9,19.0,-1.9,1
2161,240.3,84.9,29.9,74.5,0.4003,7.4,20.5,0.3372,17.7,24.8,...,12.2,27.9,40.1,16.4,8.1,5.6,15.4,19.4,0.1,1
2160,240.3,84.6,29.9,73.3,0.4087,6.9,20.3,0.3125,17.9,25.1,...,12.0,27.7,39.7,16.5,7.6,5.2,16.0,19.4,-1.4,1
2159,240.0,83.0,29.7,73.7,0.4038,6.1,19.3,0.2994,17.5,25.1,...,12.2,28.9,41.1,16.8,7.3,5.7,16.0,18.7,-1.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,244.9,122.0,42.9,92.1,0.4667,13.8,37.7,0.3658,22.4,26.4,...,9.8,33.5,43.3,27.1,7.4,5.2,13.1,21.8,-1.4,0
3,244.8,120.1,42.6,93.4,0.4583,13.4,37.3,0.3582,21.5,25.4,...,10.2,32.8,43.0,27.0,7.3,5.2,13.4,21.8,-4.8,0
2,239.5,117.3,42.3,91.2,0.4664,13.2,36.5,0.3607,19.5,23.5,...,9.7,31.9,41.6,26.6,7.3,4.8,13.7,21.9,-5.9,0
1,239.6,114.8,40.8,91.9,0.4467,12.9,37.7,0.3440,20.3,24.6,...,10.7,32.4,43.1,25.6,7.7,4.5,13.9,22.5,-6.7,1


In [155]:
X=t[numerical_features]

In [156]:
y=t.target

In [157]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=117)

In [158]:

ebm = ExplainableBoostingClassifier(feature_names=None,
    feature_types=None,
    max_bins=100,
    max_interaction_bins=32,
    binning='uniform',
    mains='all',
    interactions=0,
    outer_bags=16,
    inner_bags=0,
    learning_rate=0.1,
    validation_size=0.1,
    early_stopping_rounds=150,
    early_stopping_tolerance=0.0001,
    max_rounds=5000,
    max_leaves=3,
    min_samples_leaf=2,
    n_jobs=-2,
    random_state=117)
ebm.fit(X_train, y_train)   

ExplainableBoostingClassifier(binning='uniform', early_stopping_rounds=150,
                              feature_names=['minutes', 'PTS', 'FGM', 'FGA',
                                             'FG_PCT', 'FG3M', 'FG3A',
                                             'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
                                             'OREB', 'DREB', 'REB', 'AST',
                                             'STL', 'BLK', 'TOV', 'PF',
                                             'PLUS_MINUS'],
                              feature_types=['continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'continuous', 'continuous',
                                             'conti

In [159]:
ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name='EBM')
show(ebm_perf)

In [160]:
ebm_global = ebm.explain_global()
show(ebm_global)

In [161]:
y_pred_ebm=ebm.predict(X_test)
y_pred_proba_ebm = ebm.predict_proba(X_test)[::,1]

In [162]:
print('Test Accuracy: '+str(accuracy_score(y_test, y_pred_ebm)))
print('Train Accuracy: '+str(accuracy_score(y_train, y_pred=ebm.predict(X_train))))

Test Accuracy: 0.6014492753623188
Train Accuracy: 0.6592009685230025


In [189]:
baseline_model=LogisticRegression(max_iter=10000, solver='liblinear',penalty='l1',C=10).fit(X_train,y_train)

In [190]:
y_pred=baseline_model.predict(X_test)
y_pred_proba = baseline_model.predict_proba(X_test)[::,1]

In [191]:
print('Test Accuracy: '+str(accuracy_score(y_test, y_pred)))
print('Train Accuracy: '+str(accuracy_score(y_train, y_pred=baseline_model.predict(X_train))))

Test Accuracy: 0.6304347826086957
Train Accuracy: 0.5962469733656174


In [192]:
print('Test ROC: '+str(roc_auc_score(y_test, y_pred_proba)))
print('Train ROC: '+str(roc_auc_score(y_train, baseline_model.predict_proba(X_train)[::,1])))

Test ROC: 0.6525
Train ROC: 0.6321515102213116


In [193]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [194]:
tn, fp, fn, tp

(135, 79, 74, 126)