In [2]:
import pickle
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [3]:
games = pd.read_pickle('game_data/games_four_years.pkl')
g2018 = pd.read_pickle('game_data/games_2018.pkl')

In [57]:
# g2018.info()

In [17]:
'''Shuffle DataFrames'''
games = games.sample(frac=1).reset_index(drop=True)
g2018 = g2018.sample(frac=1).reset_index(drop=True)

In [18]:
Xy_train = games[['W', 'Wp', 'ppg', 'pApg', 'FGp', '3Pp', 'FTp', 'ORBpg', 'RBpg', 
            'ASTpg', 'STLpg', 'BLKpg', 'TOpg', 'PFpg', 'sos', 'OPppg', 
            'OPpApg', 'OPFGp', 'OP3Pp', 'OPFTp', 'OPORBpg', 'OPRBpg', 
            'OPASTpg', 'OPSTLpg', 'OPBLKpg', 'OPTOpg', 'OPPFpg', 'OPsos']]

Xy_test = g2018[['W', 'Wp', 'ppg', 'pApg', 'FGp', '3Pp', 'FTp', 'ORBpg', 'RBpg', 
            'ASTpg', 'STLpg', 'BLKpg', 'TOpg', 'PFpg', 'sos', 'OPppg', 
            'OPpApg', 'OPFGp', 'OP3Pp', 'OPFTp', 'OPORBpg', 'OPRBpg', 
            'OPASTpg', 'OPSTLpg', 'OPBLKpg', 'OPTOpg', 'OPPFpg', 'OPsos']]

In [28]:
Xy_test['W'].value_counts()

0    616
1    587
Name: W, dtype: int64

In [19]:
# Set up features and targets
X_train = Xy_train.iloc[:, 1:].as_matrix()
y_train = Xy_train.iloc[:, 0].as_matrix()

In [20]:
# Set up features and targets
X_test = Xy_test.iloc[:, 1:].as_matrix()
y_test = Xy_test.iloc[:, 0].as_matrix()

In [21]:
'''Standardize Data'''
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [31]:
'''Fit model on training data'''
lg = LogisticRegression()
lg.fit(X_train, y_train)
lg_predict = lg.predict(X_test)
lg_predict_proba = lg.predict_proba(X_test)

In [42]:
lg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [33]:
lg_predict_proba

array([[ 0.77482879,  0.22517121],
       [ 0.76544544,  0.23455456],
       [ 0.54638219,  0.45361781],
       ..., 
       [ 0.3049869 ,  0.6950131 ],
       [ 0.9964017 ,  0.0035983 ],
       [ 0.93803814,  0.06196186]])

In [35]:
y_score = lg_predict_proba[:, [0]]

In [25]:
lg_accuracy = metrics.accuracy_score(y_test, lg_predict)
lg_precision = metrics.precision_score(y_test, lg_predict)
lg_recall = metrics.recall_score(y_test, lg_predict)
lg_f1 = metrics.f1_score(y_test, lg_predict)
print('Accuracy: {:.2f} (% predicted correctly)'.format(lg_accuracy))
print('Precision: {:.2f} (predicted positives % correct)'.format(lg_precision))
print('Recall: {:.2f} (% of positives predicted correctly)'.format(lg_recall))
print('f1 Score: {:.2f} (weighted average of Precision and Recall)'.format(lg_f1))

Accuracy: 0.78 (% predicted correctly)
Precision: 0.78 (predicted positives % correct)
Recall: 0.77 (% of positives predicted correctly)
f1 Score: 0.78 (weighted average of Precision and Recall)


In [36]:
metrics.roc_curve(y_test, y_score, pos_label=None, sample_weight=None, drop_intermediate=True)

(array([ 0.        ,  0.00162338,  0.10876623,  0.10876623,  0.13149351,
         0.13149351,  0.18993506,  0.18993506,  0.20616883,  0.20616883,
         0.21590909,  0.21590909,  0.21753247,  0.21753247,  0.24350649,
         0.24350649,  0.26623377,  0.26623377,  0.27597403,  0.27597403,
         0.28896104,  0.28896104,  0.29707792,  0.29707792,  0.31331169,
         0.31331169,  0.32954545,  0.32954545,  0.34253247,  0.34253247,
         0.36038961,  0.36038961,  0.36363636,  0.36363636,  0.36850649,
         0.36850649,  0.37337662,  0.37337662,  0.38311688,  0.38311688,
         0.3961039 ,  0.3961039 ,  0.40746753,  0.40746753,  0.41720779,
         0.41720779,  0.42207792,  0.42207792,  0.42532468,  0.42532468,
         0.42694805,  0.42694805,  0.43993506,  0.43993506,  0.44480519,
         0.44480519,  0.46428571,  0.46428571,  0.47564935,  0.47564935,
         0.47727273,  0.47727273,  0.48376623,  0.48376623,  0.48701299,
         0.48701299,  0.49188312,  0.49188312,  0.4

In [40]:
metrics.roc_auc_score(y_test, y_score)

0.13960762406247926

In [33]:
np.isnan(y_train)[np.isnan(y_train) == False].size

3664

In [25]:
def cross_val_logistic(X, y, model, n_folds=5, random_seed=154):
    """Estimate the in- and out-of-sample error of a model using cross
    validation.
 
    Parameters
    ----------
 
    X: np.array
      Matrix of predictors, standardized
 
    y: np.array
      Target array, standardized
 
    model: sklearn model object.
      The estimator to fit.  Must have fit and predict methods.
 
    n_folds: int
      The number of folds in the cross validation.
 
    random_seed: int
      A seed for the random number generator, for repeatability.
 
    Returns
    -------
    mean accuracy across folds
    """
    kf = KFold(n_folds)
    accuracy_list = []
    cfs = []
    for train_index,test_index in kf.split(X):
        # define variables
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        print(y_test)
        # fit model
#         model.fit(X_train, y_train)
#         y_hat_train = model.predict(X_train)
#         y_hat_test = model.predict(X_test) 
#         # evaluate model
#         accuracy_test = metrics.accuracy_score(y_test, y_hat_test)
#         errorlist.append(accuracy_test)
#         # store coefficients
#         cfs.append (model.coef_)
#     # select best coefficients 
#     accuracies = np.asarray(accuracy_list)
#     idx_max_test_accuracy = accuracies.argmax()   
#     cfs_best = cfs[idx_min_test_accuracy]
    
#     return(errors)

In [53]:
logcv = LogisticRegression()
ridgecv = Ridge(alpha=.5)
lassocv = Lasso(alpha=.5)
enetcv = ElasticNet(alpha=.5)
log_cv_acc = cross_validate(logcv, X_train, y_train, scoring='accuracy', cv=5, return_train_score=False)
# ridge_cv_acc = cross_validate(ridgecv, X_train, y_train, scoring='accuracy', cv=5, return_train_score=False)
# lasso_cv_acc = cross_validate(lassocv, X_train, y_train, scoring='accuracy', cv=5, return_train_score=False)
# enet_cv_acc = cross_validate(enetcv, X_train, y_train, scoring='accuracy', cv=5, return_train_score=False)
print(log_cv_acc['test_score'].mean())
# print(ridge_cv_acc['test_score'].mean())

0.786570862186


In [None]:
model = Ridge(alpha=.5)
X = X_train
y = y_train
cv_accuracy = cross_val_logistic(X, y, model)