In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
root_dir = os.getcwd()
ctmc_dir = 'ctmc'
glicko_dir = 'glicko'
curseason_dir = 'curseason'
scorepreds_dir = 'scorepreds'
ultimate_dir = 'ultimate'
scores_pe_dir = 'scores_pe'
bcs_dir = 'BCS'
conf_dir = 'conferences'
data_dir = os.path.join(root_dir, "data")

##### IJH Data
file = os.path.join(data_dir, ultimate_dir, "ultimate_2.csv")
scores_pe_df = pd.read_csv(file)
scores_pe_df = scores_pe_df.set_index(['HomeID', 'VisID', 'Season', 'Week'])
scores_pe_df['target_margin'] = scores_pe_df['HomeFinal'] - scores_pe_df['VisFinal']
scores_pe_df = scores_pe_df[['HomeElo', 'HomeEloProb','HomeLuck','HomePrevLuck',
                            'HomePythPct','HomePythWins','HomeWinPct','VisElo',
                            'VisEloProb','VisLuck', 'VisPrevLuck','VisPythPct',
                            'VisPythWins','VisWinPct','SpreadElo','HomeConf_NotMajor',
                            'VisConf_NotMajor','target_margin','HomeFinal','VisFinal']]
scores_pe_df = scores_pe_df.drop_duplicates()

######## SWC Data
file = os.path.join(data_dir, ctmc_dir, "scores_ctmc_ultimate_data_final.csv")
scores_ctmc_df = pd.read_csv(file,index_col=[0,1,2,3]).drop_duplicates()
scores_ctmc_df.index.names = ['HomeID', 'VisID', 'Season', 'Week']
file = os.path.join(data_dir, glicko_dir, "glicko_ultimate_data_final.csv")
glicko_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()
glicko_df.index.names = ['HomeID', 'VisID', 'Season', 'Week']
file = os.path.join(data_dir, curseason_dir, "curseason.csv")
curseason_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()
file = os.path.join(data_dir, scorepreds_dir, "scorepreds.csv")
scorepreds_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()

# Join SC Data
data_final = scores_pe_df.join(scores_ctmc_df, how='left')
data_final = data_final.join(glicko_df)
data_final = data_final.join(curseason_df)
data_final = data_final.join(scorepreds_df, how='left')

##### CDR Data
file = os.path.join(data_dir, bcs_dir, "BCS-SOS.csv")
bcs_df = pd.read_csv(file).drop_duplicates()
bcs_df = bcs_df.set_index(['HomeID', 'VisID', 'Season', 'Week'])

# Join CDR Data
data_final = data_final.join(bcs_df)

# Join Conference Data
file = os.path.join(data_dir, conf_dir, "mergedConferences.csv")
conf_df = pd.read_csv(file).drop_duplicates()
data_final = data_final.reset_index().merge(conf_df,
                                            left_on=['HomeID', 'Season'],
                                            right_on=['ID','Year'],
                                            suffixes=('','Home'))
data_final = data_final.reset_index().merge(conf_df,
                                            left_on=['VisID', 'Season'],
                                            right_on=['ID','Year'],
                                            suffixes=('','Vis'))
data_final = data_final.set_index(['HomeID', 'VisID', 'Season', 'Week'])
data_final = data_final.drop(['ID','Year','IDVis','index'],1)

# Impute HomeConf Data
data_final['HomeConf_NotMajor'] = data_final['HomeConf_NotMajor'].fillna(0)

################################################################################
# Train - Val - Test Splits
################################################################################

X_train = data_final[(data_final.index.get_level_values(2)<2016) & \
                     (data_final['Conf']!='NotMajor') &
                     (data_final.index.get_level_values(3)>4)].\
                                       drop(['target_margin'], axis=1).\
                                       fillna(data_final.mean())
y_train = data_final[(data_final.index.get_level_values(2)<2016) & \
                     (data_final['Conf']!='NotMajor') & \
                     (data_final.index.get_level_values(3)>4)]['target_margin']

X_val = data_final[(data_final.index.get_level_values(2)==2016) & \
                   (data_final['Conf']!='NotMajor') & \
                   (data_final.index.get_level_values(3)>4)].\
                                     drop(['target_margin'], axis=1).\
                                     fillna(data_final.mean())
y_val = data_final[(data_final.index.get_level_values(2)==2016) & \
                   (data_final['Conf']!='NotMajor') & \
                   (data_final.index.get_level_values(3)>4)]['target_margin']

X_test = data_final[(data_final.index.get_level_values(2)==2017) & \
                    (data_final['Conf']!='NotMajor') & \
                    (data_final.index.get_level_values(3)>4)].\
                                      drop(['target_margin'], axis=1).\
                                      fillna(data_final.mean())
y_test = data_final[(data_final.index.get_level_values(2)==2017) & \
                    (data_final['Conf']!='NotMajor') & \
                    (data_final.index.get_level_values(3)>4)]['target_margin']

################################################################################
# Final Feature Selection
################################################################################

base_featurestouse = [col for col in data_final.columns if \
                      col.find('InSeason') > -1]

swc_featurestouse = ['Glicko_Rating_Home', 'Glicko_Rating_Away',
                     'Glicko_Rating_Deviance_Home', 'Glicko_Rating_Deviance_Away',
                     'Glicko_Sigma_Home', 'Glicko_Sigma_Away',
                     'CTMC_Rating_Home', 'CTMC_Rating_Away','HomePredFinal',
                     'VisPredFinal','CurSeason_HH_VA_mean_margin']

ijh_featurestouse = ['HomeElo', 'HomeEloProb','HomeLuck','HomePrevLuck',
                    'HomePythPct','HomePythWins','HomeWinPct','VisElo',
                    'VisEloProb','VisLuck', 'VisPrevLuck','VisPythPct',
                    'VisPythWins','VisWinPct','SpreadElo','HomeConf_NotMajor',
                    'VisConf_NotMajor']

cdr_featurestouse = bcs_df.columns.values.tolist()

# featurestouse = base_featurestouse + swc_featurestouse + ijh_featurestouse + cdr_featurestouse
featurestouse = swc_featurestouse + ijh_featurestouse + cdr_featurestouse

################################################################################
# Baseline Model
################################################################################

# Current season average margins diff
X_train['CurSeason_HH_VA_mean_margin'] = (X_train['SpreadHomeInSeasonAvg'] \
                                    - X_train['SpreadVisInSeasonAvg']*-1)
X_val['CurSeason_HH_VA_mean_margin'] = (X_val['SpreadHomeInSeasonAvg'] \
                                        - X_val['SpreadVisInSeasonAvg']*-1)
X_test['CurSeason_HH_VA_mean_margin'] = (X_test['SpreadHomeInSeasonAvg'] \
                                        - X_test['SpreadVisInSeasonAvg']*-1)

# Train and report baseline model with current season metric
ols = LinearRegression()
ols.fit(X_train['CurSeason_HH_VA_mean_margin'].values.reshape(-1,1), y_train)
ols.score(X_train['CurSeason_HH_VA_mean_margin'].values.reshape(-1,1), y_train)
ols.score(X_val['CurSeason_HH_VA_mean_margin'].values.reshape(-1,1), y_val)
preds = ols.predict(X_val['CurSeason_HH_VA_mean_margin'].values.reshape(-1,1))
mean_squared_error(preds, y_val)

415.4627773651895

In [3]:
standardscaler = StandardScaler()
X_trainscaled = standardscaler.fit_transform(X_train[featurestouse])
X_valscaled = standardscaler.transform(X_val[featurestouse])
X_testscaled = standardscaler.transform(X_test[featurestouse])

In [79]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression #penalty, C, 
from sklearn.linear_model import ElasticNet #alpha, l1_ratio, warm_start, 
from sklearn.svm import SVC #C, kernel, degree-polynomial, gamma, 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier

In [5]:
data_final['HomeWin'] = data_final.apply(lambda x: 1 if x['target_margin'] > 0 else 0, axis=1)
X_trainC = data_final[(data_final.index.get_level_values(2)<2016) & \
                     (data_final['Conf']!='NotMajor') &
                     (data_final.index.get_level_values(3)>4)].\
                                       drop(['target_margin', 'HomeWin'], axis=1).\
                                       fillna(data_final.mean())
y_trainC = data_final[(data_final.index.get_level_values(2)<2016) & \
                     (data_final['Conf']!='NotMajor') & \
                     (data_final.index.get_level_values(3)>4)]['HomeWin']

X_valC = data_final[(data_final.index.get_level_values(2)==2016) & \
                   (data_final['Conf']!='NotMajor') & \
                   (data_final.index.get_level_values(3)>4)].\
                                     drop(['target_margin', 'HomeWin'], axis=1).\
                                     fillna(data_final.mean())
y_valC = data_final[(data_final.index.get_level_values(2)==2016) & \
                   (data_final['Conf']!='NotMajor') & \
                   (data_final.index.get_level_values(3)>4)]['HomeWin']

X_testC = data_final[(data_final.index.get_level_values(2)==2017) & \
                    (data_final['Conf']!='NotMajor') & \
                    (data_final.index.get_level_values(3)>4)].\
                                      drop(['target_margin', 'HomeWin'], axis=1).\
                                      fillna(data_final.mean())
y_testC = data_final[(data_final.index.get_level_values(2)==2017) & \
                    (data_final['Conf']!='NotMajor') & \
                    (data_final.index.get_level_values(3)>4)]['HomeWin']

# Current season average margins diff
X_trainC['CurSeason_HH_VA_mean_margin'] = (X_trainC['SpreadHomeInSeasonAvg'] \
                                    - X_trainC['SpreadVisInSeasonAvg']*-1)
X_valC['CurSeason_HH_VA_mean_margin'] = (X_valC['SpreadHomeInSeasonAvg'] \
                                        - X_valC['SpreadVisInSeasonAvg']*-1)
X_testC['CurSeason_HH_VA_mean_margin'] = (X_testC['SpreadHomeInSeasonAvg'] \
                                        - X_testC['SpreadVisInSeasonAvg']*-1)



standardscaler = StandardScaler()
X_trainscaledC = standardscaler.fit_transform(X_trainC[featurestouse])
X_valscaledC = standardscaler.transform(X_valC[featurestouse])
X_testscaledC = standardscaler.transform(X_testC[featurestouse])


In [None]:
X_trainC2 = X_trainscaledC.copy()
X_valC2 = X_valscaledC.copy()
X_testC2 = X_testscaledC.copy()

In [112]:
def doGridSearchC(X_train, y_train, X_val, y_val, model, param_grid):
    X_train_val = np.vstack((X_train, X_val))
    y_train_val = np.concatenate((y_train, y_val))
    val_fold = [-1]*len(X_train) + [0]*len(X_val) #0 corresponds to validation
    estimator = model.fit(X_train, y_train)
    grid = GridSearchCV(estimator,
                        param_grid,
                        return_train_score=False,
                        cv = PredefinedSplit(test_fold=val_fold),
                        refit = True,
                        scoring = make_scorer(roc_auc_score,
                                              greater_is_better = True))
    grid.fit(X_train_val, y_train_val)
    return grid.best_estimator_

dataDict = {'X_train':X_trainscaledC, 'X_val':X_valscaledC, 
            'y_trainC':y_trainC, 'y_valC':y_valC, 'y_train':y_train, 'y_val':y_val}

def chainGridSearch(name, estimator, param_gridC, param_gridR, dataDict=dataDict):

    X_trainC2 = dataDict['X_train'].copy()
    X_valC2 = dataDict['X_val'].copy()
    
    clssifer = doGridSearchC(X_trainC2, dataDict['y_trainC'], X_valC2, dataDict['y_valC'], estimator, param_gridC)
    clssifer = clssifer.fit(dataDict['X_train'], dataDict['y_trainC'])
    predsTrain = clssifer.predict_proba(X_trainC2)[:,1]
    predsVal = clssifer.predict_proba(X_valC2)[:,1]
    print(name, roc_auc_score(dataDict['y_valC'], predsVal))

    X_trainC2 = np.concatenate((X_trainC2, predsTrain.reshape(-1,1)), 1)
    X_trainC2 = np.concatenate((X_trainC2, np.where(predsTrain >0.5, 1, 0).reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, predsVal.reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, np.where(predsVal >0.5, 1, 0).reshape(-1,1)), 1)

    regrssor = doGridSearchC(X_trainC2, dataDict['y_trainC'], X_valC2, dataDict['y_valC'], KernelRidge(kernel='rbf'), param_gridR)
    regrssor = regrssor.fit(X_trainC2, dataDict['y_train'])
    predsReg = reg.predict(X_valC2)
    print(name, mean_squared_error(dataDict['y_val'], predsReg))
    return(clssifer)

In [107]:
X_trainC2 = dataDict['X_train'].copy()
predsTrain = logit.predict_proba(X_trainC2)[:,1]
np.where(predsTrain >0.5, 1, 0)

array([1, 1, 1, ..., 0, 0, 1])

In [101]:
featurestouse

['Glicko_Rating_Home',
 'Glicko_Rating_Away',
 'Glicko_Rating_Deviance_Home',
 'Glicko_Rating_Deviance_Away',
 'Glicko_Sigma_Home',
 'Glicko_Sigma_Away',
 'CTMC_Rating_Home',
 'CTMC_Rating_Away',
 'HomePredFinal',
 'VisPredFinal',
 'CurSeason_HH_VA_mean_margin',
 'HomeElo',
 'HomeEloProb',
 'HomeLuck',
 'HomePrevLuck',
 'HomePythPct',
 'HomePythWins',
 'HomeWinPct',
 'VisElo',
 'VisEloProb',
 'VisLuck',
 'VisPrevLuck',
 'VisPythPct',
 'VisPythWins',
 'VisWinPct',
 'SpreadElo',
 'HomeConf_NotMajor',
 'VisConf_NotMajor',
 'HomeBCSSOS',
 'HomeRPI',
 'AwayBCSSOS',
 'AwayRPI']

In [111]:
logit = chainGridSearch('Logit', LogisticRegression(), 
                        [{'C':10**np.arange(-5, 3, 0.25)}], 
                        [{'alpha':10**np.arange(-5, 1, 1.0),
                          'gamma':10**np.arange(-5, 1, 1.0)}],
                        dataDict)
logit

Logit 0.8205505913089398


ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 34 while Y.shape[1] == 33

In [92]:
rfc = chainGridSearch('RFC', RandomForestClassifier(criterion='entropy', n_estimators=60, min_samples_split=15), 
                      [{'min_samples_leaf':np.arange(5,50,5)}], 
                      [{'alpha':10**np.arange(-5, 1, 1.0),
                        'gamma':10**np.arange(-5, 1, 1.0)}],
                      dataDict)
rfc

RFC 0.8031795883767144
RFC 295.7245718885387


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=15,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [93]:
gbc = chainGridSearch('GBC', GradientBoostingClassifier(n_estimators=900), 
                      [{'min_samples_split':np.arange(5,50,50)}], 
                      [{'alpha':10**np.arange(-5, 1, 1.0),
                          'gamma':10**np.arange(-5, 1, 1.0)}],
                      dataDict)

GBC 0.8054447671590768
GBC 334.5836944057731
