In [4]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split

from construct_data import load_csvs

%matplotlib inline

In [139]:
root_dir = os.getcwd()
ctmc_dir = 'ctmc'
glicko_dir = 'glicko'
curseason_dir = 'curseason'
scorepreds_dir = 'scorepreds'
snooz_dir = 'snoozle'
bcs_dir = 'BCS'
conf_dir = 'conferences'
odds_dir = 'new_odds'
data_dir = os.path.join(root_dir, "data")

##### IJH Data
file = os.path.join(data_dir, snooz_dir, "snoozle_ijh.csv")
snooz_df = pd.read_csv(file)
snooz_df = snooz_df.set_index(['HomeID', 'VisID', 'Season', 'Week'])
snooz_df['target_margin'] = snooz_df['HomeFinal'] - snooz_df['VisFinal']
snooz_df = snooz_df[['HomeElo', 'HomeEloProb','HomeLuck','HomePrevLuck',
                            'HomePythPct','HomePythWins','HomeWinPct','VisElo',
                            'VisEloProb','VisLuck', 'VisPrevLuck','VisPythPct',
                            'VisPythWins','VisWinPct','SpreadElo',
                            'target_margin','HomeFinal','VisFinal']]
snooz_df = snooz_df.drop_duplicates()

####Odds
file = os.path.join(data_dir, odds_dir, 'new_odds.csv')
odds_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()
odds_df.drop(['target_margin','Conf'], axis=1, inplace=True)

######## SWC Data
file = os.path.join(data_dir, ctmc_dir, "score_ctmc_snoozle.csv")
scores_ctmc_df = pd.read_csv(file,index_col=[0,1,2,3]).drop_duplicates()
scores_ctmc_df.index.names = ['HomeID', 'VisID', 'Season', 'Week']
file = os.path.join(data_dir, glicko_dir, "glicko_snoozle.csv")
glicko_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()
glicko_df.index.names = ['HomeID', 'VisID', 'Season', 'Week']
file = os.path.join(data_dir, curseason_dir, "curseason.csv")
curseason_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()
file = os.path.join(data_dir, scorepreds_dir, "scorepreds.csv")
scorepreds_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()

# Join SC Data
data_final = odds_df.join(snooz_df, how='left')
data_final = data_final.join(scores_ctmc_df, how='left')
data_final = data_final.join(glicko_df, how='left')
data_final = data_final.join(curseason_df, how='left')
data_final = data_final.join(scorepreds_df, how='left')

##### CDR Data
file = os.path.join(data_dir, bcs_dir, "BCS-SOS.csv")
bcs_df = pd.read_csv(file).drop_duplicates()
bcs_df = bcs_df.set_index(['HomeID', 'VisID', 'Season', 'Week'])

# Join CDR Data
data_final = data_final.join(bcs_df)

# Join Conference Data
file = os.path.join(data_dir, conf_dir, "mergedConferences.csv")
conf_df = pd.read_csv(file).drop_duplicates()
data_final = data_final.reset_index().merge(conf_df,
                                            left_on=['HomeID', 'Season'],
                                            right_on=['ID','Year'],
                                            suffixes=('','Home'))
data_final = data_final.reset_index().merge(conf_df,
                                            left_on=['VisID', 'Season'],
                                            right_on=['ID','Year'],
                                            suffixes=('','Vis'))
data_final = data_final.set_index(['HomeID', 'VisID', 'Season', 'Week'])
data_final = data_final.drop(['ID','Year','IDVis','index'],1)


# Impute HomeConf Data
data_final['HomeConf_NotMajor'] = np.where(data_final['Conf'] == 'NotMajor', 1, 0)
data_final['VisConf_NotMajor'] = np.where(data_final['ConfVis'] == 'NotMajor', 1, 0)
data_final['PredSpread'] = data_final['HomePredFinal'] - data_final['VisPredFinal']

glicko_diff = data_final['Glicko_Rating_Home'] - data_final['Glicko_Rating_Away']
data_final['SpreadGlicko'] = (glicko_diff)/25 + 2.6 #predicted spread
data_final['HomeGlickoProb'] = 1/(10**(-glicko_diff/400)+1)
data_final['VisGlickoProb'] = 1/(10**(glicko_diff/400)+1)

################################################################################
# Train - Val - Test Splits
################################################################################

data_final_clean = data_final.copy().dropna()
data_final_clean.reset_index(inplace=True)
data_final_clean.sort_values(['Season','Week'])
#data_final_clean.set_index(['HomeID', 'VisID', 'Season', 'Week'], inplace=True)

X_train = data_final_clean.loc[(data_final_clean['Season']<2016) & \
                     (data_final_clean['Week'] > 4)].\
                                       drop(['target_margin'], axis=1)
    
y_train = data_final_clean.loc[(data_final_clean['Season']<2016) & \
                     (data_final_clean['Week'] > 4), 'target_margin']

X_val = data_final_clean.loc[(data_final_clean['Season']==2016) & \
                     (data_final_clean['Week'] > 4)].\
                                       drop(['target_margin'], axis=1)
    
y_val = data_final_clean.loc[(data_final_clean['Season']==2016) & \
                     (data_final_clean['Week'] > 4), 'target_margin']

#X_test = data_final[(data_final.index.get_level_values(2)==2017) & \
#                    (data_final['Conf']!='NotMajor') & \
#                    (data_final.index.get_level_values(3)>4)].\
#                                      drop(['target_margin'], axis=1).\
#                                      fillna(data_final.mean())
#y_test = data_final[(data_final.index.get_level_values(2)==2017) & \
#                    (data_final['Conf']!='NotMajor') & \
#                    (data_final.index.get_level_values(3)>4)]['target_margin']

# Current season average margins diff
X_train['CurSeason_HH_VA_mean_margin'] = (X_train['SpreadHomeInSeasonAvg'] \
                                    - X_train['SpreadVisInSeasonAvg']*-1)
X_val['CurSeason_HH_VA_mean_margin'] = (X_val['SpreadHomeInSeasonAvg'] \
                                        - X_val['SpreadVisInSeasonAvg']*-1)
#X_test['CurSeason_HH_VA_mean_margin'] = (X_test['SpreadHomeInSeasonAvg'] \
#                                        - X_test['SpreadVisInSeasonAvg']*-1)

################################################################################
# Final Feature Selection
################################################################################

base_features = [col for col in data_final.columns if \
                      col.find('InSeason') > -1]

swc_features = ['Glicko_Rating_Home', 'Glicko_Rating_Away',
                'Glicko_Rating_Deviance_Home', 'Glicko_Rating_Deviance_Away',
                'Glicko_Sigma_Home', 'Glicko_Sigma_Away',
                'CTMC_Rating_Home', 'CTMC_Rating_Away','HomePredFinal',
                'VisPredFinal','CurSeason_HH_VA_mean_margin',
                'HomeOddsHomeInSeasonAvg','HomeOddsVisInSeasonAvg']

ijh_features = ['HomeElo', 'HomeEloProb','HomeLuck','HomePrevLuck',
                'HomePythPct','HomePythWins','HomeWinPct','VisElo',
                'VisEloProb','VisLuck', 'VisPrevLuck','VisPythPct',
                'VisPythWins','VisWinPct','SpreadElo',
                'HomeConf_NotMajor','VisConf_NotMajor']

odds_features= ['Spread_Mirage', 'Spread_Pinnacle', 'Spread_Sportsbet', 
                'Spread_Westgate', 'Spread_Station', 'Spread_SIA',
                'Spread_SBG', 'Spread_BetUS', 'Spread_Med', 'Spread_Mode']

test_features = ['SpreadGlicko', 'HomeGlickoProb', 'VisGlickoProb']

cdr_features = bcs_df.columns.values.tolist()

# featurestouse = base_featurestouse + swc_featurestouse + ijh_featurestouse + cdr_featurestouse
features = swc_features + ijh_features + cdr_features + odds_features + test_features
features.append('PredSpread')

################################################################################
# Standardize data
################################################################################

####### Scale data select features
standardscaler = StandardScaler()
X_trainS = standardscaler.fit_transform(X_train[features])
X_valS = standardscaler.transform(X_val[features])
#X_testscaled = standardscaler.transform(X_test[featurestouse])


In [116]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression #penalty, C, 
from sklearn.linear_model import ElasticNet #alpha, l1_ratio, warm_start, 
from sklearn.svm import SVC #C, kernel, degree-polynomial, gamma, 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import SGDRegressor

In [130]:
data_final_clean['HomeWin'] = data_final_clean.apply(lambda x: 1 if x['target_margin'] > 0 else 0, axis=1)

X_trainC = data_final_clean.loc[(data_final_clean['Season']<2016) & \
                     (data_final_clean['Week'] > 4)].\
                                       drop(['target_margin', 'HomeWin'], axis=1)
    
y_trainC = data_final_clean.loc[(data_final_clean['Season']<2016) & \
                     (data_final_clean['Week'] > 4), 'HomeWin']

X_valC = data_final_clean.loc[(data_final_clean['Season']==2016) & \
                     (data_final_clean['Week'] > 4)].\
                                       drop(['target_margin', 'HomeWin'], axis=1)
    
y_valC = data_final_clean.loc[(data_final_clean['Season']==2016) & \
                     (data_final_clean['Week'] > 4), 'HomeWin']

# Current season average margins diff
X_trainC['CurSeason_HH_VA_mean_margin'] = (X_trainC['SpreadHomeInSeasonAvg'] \
                                    - X_trainC['SpreadVisInSeasonAvg']*-1)
X_valC['CurSeason_HH_VA_mean_margin'] = (X_valC['SpreadHomeInSeasonAvg'] \
                                        - X_valC['SpreadVisInSeasonAvg']*-1)


standardscaler = StandardScaler()
X_trainscaledC = standardscaler.fit_transform(X_trainC[odds_featurestouse])
X_valscaledC = standardscaler.transform(X_valC[odds_featurestouse])


dataDict = {'X_train':X_trainscaledC, 'X_val':X_valscaledC, 
            'y_trainC':y_trainC, 'y_valC':y_valC, 'y_train':y_train, 'y_val':y_val}


In [119]:
def doGridSearchC(X_train, y_train, X_val, y_val, estimator, param_grid, scorer):
    X_train_val = np.vstack((X_train, X_val))
    y_train_val = np.concatenate((y_train, y_val))
    val_fold = [-1]*len(X_train) + [0]*len(X_val) #0 corresponds to validation
    grid = GridSearchCV(estimator,
                        param_grid,
                        return_train_score=False,
                        cv = PredefinedSplit(test_fold=val_fold),
                        refit = True,
                        scoring = scorer)
    grid.fit(X_train_val, y_train_val)
    return grid.best_estimator_

def chainClassiferGridSearch(nameC, nameR, estimatorC, estimatorR, paramC, dataDict=dataDict):

    scorer = make_scorer(roc_auc_score, greater_is_better = True)
    
    X_trainC2 = dataDict['X_train'].copy()
    X_valC2 = dataDict['X_val'].copy()
    
    classifer = doGridSearchC(X_trainC2, dataDict['y_trainC'], X_valC2, dataDict['y_valC'], estimatorC, paramC, scorer)
    classifer = classifer.fit(X_trainC2, dataDict['y_trainC'])
    predsTrain = classifer.predict_proba(X_trainC2)[:,1]
    predsVal = classifer.predict_proba(X_valC2)[:,1]
    print(nameC, roc_auc_score(dataDict['y_valC'], predsVal))

    X_trainC2 = np.concatenate((X_trainC2, predsTrain.reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, predsVal.reshape(-1,1)), 1)

    regressor = estimatorR.fit(X_trainC2, dataDict['y_train'])
    predsReg = regressor.predict(X_valC2)
    print(nameR, mean_squared_error(dataDict['y_val'], predsReg))
    return(classifer)

def chainRegressorGridSearch(nameC, nameR, estimatorC, estimatorR, paramR, dataDict=dataDict):

    scorer = make_scorer(mean_squared_error, greater_is_better = False)
    
    X_trainC2 = dataDict['X_train'].copy()
    X_valC2 = dataDict['X_val'].copy()
    
    classifer = estimatorC.fit(X_trainC2, dataDict['y_trainC'])
    predsTrain = classifer.predict_proba(X_trainC2)[:,1]
    predsVal = classifer.predict_proba(X_valC2)[:,1]
    print(nameC, roc_auc_score(dataDict['y_valC'], predsVal))

    X_trainC2 = np.concatenate((X_trainC2, predsTrain.reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, predsVal.reshape(-1,1)), 1)
    
    
    regressor = doGridSearchC(X_trainC2, dataDict['y_train'], X_valC2, dataDict['y_val'], estimatorR, paramR, scorer)
    regressor = regressor.fit(X_trainC2, dataDict['y_train'])
    predsReg = regressor.predict(X_valC2)
    print(nameR, mean_squared_error(dataDict['y_val'], predsReg))
    return(regressor)


def chainDoubleGridSearch(nameC, nameR, estimatorC, estimatorR, paramC, paramR, dataDict=dataDict):

    scorerC = make_scorer(roc_auc_score, greater_is_better = True)
    scorerR = make_scorer(mean_squared_error, greater_is_better = False)    
    
    X_trainC2 = dataDict['X_train'].copy()
    X_valC2 = dataDict['X_val'].copy()
    
    classifer = doGridSearchC(X_trainC2, dataDict['y_trainC'], X_valC2, dataDict['y_valC'], estimatorC, paramC, scorerC)
    classifer = classifer.fit(X_trainC2, dataDict['y_trainC'])
    predsTrain = classifer.predict_proba(X_trainC2)[:,1]
    predsVal = classifer.predict_proba(X_valC2)[:,1]
    print(nameC, roc_auc_score(dataDict['y_valC'], predsVal))

    X_trainC2 = np.concatenate((X_trainC2, predsTrain.reshape(-1,1)), 1)
    X_trainC2 = np.concatenate((X_trainC2, np.where(predsTrain >0.5, 1, 0).reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, predsVal.reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, np.where(predsVal >0.5, 1, 0).reshape(-1,1)), 1)

    regressor = doGridSearchC(X_trainC2, dataDict['y_train'], X_valC2, dataDict['y_val'], estimatorR, paramR, scorerR)
    regrsregressorsor = regressor.fit(X_trainC2, dataDict['y_train'])
    predsReg = regressor.predict(X_valC2)
    print(nameR, mean_squared_error(dataDict['y_val'], predsReg))
    return(classifer, regressor)

def chainGridSearchWrapper(nameC, nameR, clssfier, estimatorC, estimatorR, params, dataDict=dataDict):
    if len(params) == 2:
        return chainDoubleGridSearch(nameC, nameR, estimatorC, estimatorR, params[0], params[1], dataDict)
    elif clssfier:
        return chainClassiferGridSearch(nameC, nameR, estimatorC, estimatorR, params, dataDict)
    else:
        return chainRegressorGridSearch(nameC, nameR, estimatorC, estimatorR, params, dataDict)

In [132]:
rfc_sgd = chainGridSearchWrapper('RFC', 'SGDR', True,
                          RandomForestClassifier(), 
                          SGDRegressor(loss='epsilon_insensitive', penalty='l1', max_iter=10000,
                                       alpha=0.001, eta0=0.0001, learning_rate='invscaling'),
                          [{'min_samples_leaf':np.arange(2, 20, 1),
                            'min_samples_split':np.arange(2, 20, 1)}], 
                          dataDict)



RFC 0.6493172860360361
SGDR 277.4708015019177
