In [1]:
import pandas as pd
import sqlite3
import numpy as np
import matplotlib.pyplot as plt

In [21]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [3]:
# Location of the SQLite database, relative to the notebook's working directory.
DB_PATH = 'data/nba.db'
# NOTE: sqlite3.connect creates an empty database file when the path does not exist,
# so a wrong path only shows up later as a "no such table" error.
connection = sqlite3.connect(DB_PATH)

In [6]:
# Load every row and column of the LAL table into a DataFrame.
lal_query = "select * from LAL"
df = pd.read_sql(lal_query, con=connection)

In [11]:
# List the columns of the loaded frame.
# NOTE(review): the recorded output below already contains 'target', which is only created
# by the following cell (In [10]); this cell (In [11]) was executed after it. On a fresh
# top-to-bottom run 'target' will presumably be absent here unless the LAL table itself
# stores it -- confirm and reorder the cells if needed.
df.columns

Index(['index', 'SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME',
       'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA',
       'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB',
       'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS',
       'opponent', 'target'],
      dtype='object')

In [10]:
# Binary label: 1 when the game was won (WL == 'W'), 0 for any other value.
# Vectorised comparison instead of a per-row Python lambda; the dtype is pinned to
# int64 so the column looks the same on every platform.
df['target'] = df['WL'].eq('W').astype('int64')

In [18]:
# Number of games recorded against each opponent (non-null 'index' entries per group).
# Bracket selection is used because 'index' as an attribute is easy to confuse with
# the frame's own index.
df.groupby('opponent')['index'].count()

opponent
ATL     71
BAR      2
BKN     19
BOS    105
CHA     36
CHH     29
CHI     77
CLE     76
DAL    171
DEN    194
DET     90
GOS     79
GSW    127
HOU    183
IND     78
KCK     13
LAC    174
MAC      1
MEM     70
MIA     73
MIL     73
MIN    131
NJN     59
NOH     35
NOK      8
NOP     27
NYK     77
OKC     55
ORL     68
PHI     58
PHL     26
PHX    214
POR    209
SAC    183
SAN     69
SAS    124
SDC      6
SEA    135
TOR     52
UTA    131
UTH     64
VAN     23
WAS     72
Name: index, dtype: int64

In [20]:
# Class balance: count of each target value divided by the total number of rows.
df['target'].value_counts().div(len(df))

1    0.597421
0    0.402579
Name: target, dtype: float64

In [17]:
# Win rate against each opponent, highest first:
# wins (sum of the 0/1 target) divided by games played (count of 'index').
opp_groups = df.groupby('opponent')
opp_wins = opp_groups['target'].sum()
opp_games = opp_groups['index'].count()
(opp_wins / opp_games).sort_values(ascending=False)

opponent
KCK    1.000000
MAC    1.000000
VAN    0.956522
NOH    0.771429
NJN    0.745763
PHL    0.730769
CHH    0.724138
ATL    0.690141
MIN    0.671756
DAL    0.666667
SDC    0.666667
SAC    0.650273
ORL    0.647059
GOS    0.645570
LAC    0.632184
WAS    0.625000
NOK    0.625000
GSW    0.622047
DEN    0.608247
MIL    0.602740
SEA    0.600000
IND    0.589744
CHA    0.583333
UTH    0.578125
TOR    0.576923
PHX    0.574766
BOS    0.571429
CHI    0.571429
PHI    0.568966
HOU    0.568306
SAN    0.565217
MIA    0.561644
NYK    0.558442
DET    0.555556
POR    0.555024
BKN    0.526316
MEM    0.514286
CLE    0.513158
BAR    0.500000
UTA    0.488550
NOP    0.481481
SAS    0.467742
OKC    0.454545
dtype: float64

### Baseline

In [22]:
# Label vector: the binary 'target' column.
y = df['target']

In [23]:
# Only raw feature for the baseline: the opponent code.
X = df['opponent']

In [24]:
# One-hot encode X: one indicator column per distinct value.
X = pd.get_dummies(data=X)

In [25]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=117,
)

In [27]:
# Baseline classifier: logistic regression with default settings, fitted on the training split.
baseline_model = LogisticRegression()
# fit() returns the estimator itself; assigning it keeps the cell free of output.
baseline_model = baseline_model.fit(X_train, y_train)

In [29]:
# Hard class predictions for the held-out rows.
y_pred = baseline_model.predict(X_test)
# Second column of predict_proba for every row (the probability of the second class).
y_pred_proba = baseline_model.predict_proba(X_test)[:, 1]

In [31]:
# Accuracy on the held-out split, then on the training split for comparison.
# The !s conversion applies str() explicitly, so the printed text matches string concatenation.
print(f'Test Accuracy: {accuracy_score(y_test, y_pred)!s}')
print(f'Train Accuracy: {accuracy_score(y_train, baseline_model.predict(X_train))!s}')

Test Accuracy: 0.5952380952380952
Train Accuracy: 0.6021731510690501


In [37]:
# ROC AUC from the second-class probabilities: held-out split first, then the training split.
# The !s conversion applies str() explicitly, so the printed text matches string concatenation.
print(f'Test ROC: {roc_auc_score(y_test, y_pred_proba)!s}')
print(f'Train ROC: {roc_auc_score(y_train, baseline_model.predict_proba(X_train)[:, 1])!s}')

Test ROC: 0.5167965389451393
Train ROC: 0.5862287919832169


In [35]:
# Flatten the confusion matrix (rows = true class, columns = predicted class) into
# true negatives, false positives, false negatives and true positives.
# labels=[0, 1] pins the matrix to 2x2 even when one class is missing from
# y_test / y_pred; without it the four-way unpacking raises on such a split.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

In [36]:
# Show the four confusion-matrix counts as one tuple.
(tn, fp, fn, tp)

(35, 241, 48, 390)

### Last N-games

In [43]:
# Parse GAME_DATE into timestamps and order the games chronologically.
# The unit is spelled out because pandas 2.x rejects the unit-less 'datetime64'
# in astype with a TypeError; 'datetime64[ns]' works on old and new versions.
df = df.astype({'GAME_DATE': 'datetime64[ns]'}).sort_values('GAME_DATE')

In [47]:
# Boolean mask over the columns: True where the column dtype is int64.
df.dtypes.eq('int64')

index                 True
SEASON_ID            False
TEAM_ID               True
TEAM_ABBREVIATION    False
TEAM_NAME            False
GAME_ID              False
GAME_DATE            False
MATCHUP              False
WL                   False
MIN                   True
PTS                   True
FGM                   True
FGA                   True
FG_PCT               False
FG3M                  True
FG3A                  True
FG3_PCT              False
FTM                   True
FTA                   True
FT_PCT               False
OREB                  True
DREB                  True
REB                   True
AST                   True
STL                   True
BLK                   True
TOV                   True
PF                    True
PLUS_MINUS           False
opponent             False
target                True
dtype: bool