In [1]:
import pickle
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix

from sklearn.linear_model import LogisticRegression
from sklearn .linear_model import Lasso
from sklearn .linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [2]:
games = pd.read_pickle('game_data/games_2017.pkl')

In [3]:
'''Shuffle DataFrame'''
games = games.sample(frac=1).reset_index(drop=True)

I don't need: Date, Ws, Tm, OPTm, ID, count, or matchup...

In [4]:
Xy = games[['W', 'Wp', 'ppg', 'pApg', 'FGp', '3Pp', 'FTp', 'ORBpg', 'RBpg', 
            'ASTpg', 'STLpg', 'BLKpg', 'TOpg', 'PFpg', 'sos', 'OPppg', 
            'OPpApg', 'OPFGp', 'OP3Pp', 'OPFTp', 'OPORBpg', 'OPRBpg', 
            'OPASTpg', 'OPSTLpg', 'OPBLKpg', 'OPTOpg', 'OPPFpg', 'OPsos']]

In [5]:
# Set up features and targets
X = Xy.iloc[:, 1:]
y = Xy.iloc[:, 0]

In [6]:
'''Train test split'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [7]:
'''Standardize Data'''
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [8]:
'''Fit model on training data'''
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
'''predict on testing data'''
y_hat = model.predict(X_test)

In [11]:
accuracy = metrics.accuracy_score(y_test, y_hat)
ave_precision = metrics.average_precision_score(y_test, y_hat)
f1 = metrics.f1_score(y_test, y_hat)
log_loss = metrics.log_loss(y_test, y_hat)
precision = metrics.precision_score(y_test, y_hat)
recall = metrics.recall_score(y_test, y_hat)
# roc_auc_score = metrics.roc_auc_score(y_test, y_hat)
print('Accuracy: {:.2f} (% predicted correctly)'.format(accuracy))
print('Precision: {:.2f} (predicted positives % correct)'.format(precision))
print('Ave. Precision: {:.2f} (predicted positives % correct)'.format(ave_precision))
print('Recall: {:.2f} (% of positives predicted correctly)'.format(recall))

Accuracy: 0.80 (% predicted correctly)
Precision: 0.80 (predicted positives % correct)
Ave. Precision: 0.73 (predicted positives % correct)
Recall: 0.80 (% of positives predicted correctly)


Next Steps:
- Standardize!
    - right after train test split
    - before cross-validation
- Cross-validation (KFolds)
- Regularization
    - Ridge
    - Lasso
    - Elastinet
        - optimize alpha parameters for each
- ROC threshold optimization


Future:
- Train on data from multiple years.
    