In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split

%matplotlib inline

In [135]:
root_dir = os.getcwd()
ctmc_dir = 'ctmc'
glicko_dir = 'glicko'
curseason_dir = 'curseason'
scorepreds_dir = 'scorepreds'
ultimate_dir = 'ultimate'
scores_pe_dir = 'scores_pe'
bcs_dir = 'BCS'
conf_dir = 'conferences'
data_dir = os.path.join(root_dir, "data")

##### IJH Data
file = os.path.join(data_dir, ultimate_dir, "ultimate_2.csv")
scores_pe_df = pd.read_csv(file)
scores_pe_df = scores_pe_df.set_index(['HomeID', 'VisID', 'Season', 'Week'])
scores_pe_df['target_margin'] = scores_pe_df['HomeFinal'] - scores_pe_df['VisFinal']
scores_pe_df = scores_pe_df[['HomeElo', 'HomeEloProb','HomeLuck','HomePrevLuck',
                            'HomePythPct','HomePythWins','HomeWinPct','VisElo',
                            'VisEloProb','VisLuck', 'VisPrevLuck','VisPythPct',
                            'VisPythWins','VisWinPct','SpreadElo','HomeConf_NotMajor',
                            'VisConf_NotMajor','target_margin','HomeFinal','VisFinal']]
scores_pe_df = scores_pe_df.drop_duplicates()

######## SWC Data
file = os.path.join(data_dir, ctmc_dir, "scores_ctmc_ultimate_data_final.csv")
scores_ctmc_df = pd.read_csv(file,index_col=[0,1,2,3]).drop_duplicates()
scores_ctmc_df.index.names = ['HomeID', 'VisID', 'Season', 'Week']
file = os.path.join(data_dir, glicko_dir, "glicko_ultimate_data_final.csv")
glicko_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()
glicko_df.index.names = ['HomeID', 'VisID', 'Season', 'Week']
file = os.path.join(data_dir, curseason_dir, "curseason.csv")
curseason_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()
file = os.path.join(data_dir, scorepreds_dir, "scorepreds.csv")
scorepreds_df = pd.read_csv(file, index_col=[0,1,2,3]).drop_duplicates()

# Join SC Data
data_final = scores_pe_df.join(scores_ctmc_df, how='left')
data_final = data_final.join(glicko_df)
data_final = data_final.join(curseason_df)
data_final = data_final.join(scorepreds_df, how='left')

##### CDR Data
file = os.path.join(data_dir, bcs_dir, "BCS-SOS.csv")
bcs_df = pd.read_csv(file).drop_duplicates()
bcs_df = bcs_df.set_index(['HomeID', 'VisID', 'Season', 'Week'])

# Join CDR Data
data_final = data_final.join(bcs_df)

# Join Conference Data
file = os.path.join(data_dir, conf_dir, "mergedConferences.csv")
conf_df = pd.read_csv(file).drop_duplicates()
data_final = data_final.reset_index().merge(conf_df,
                                            left_on=['HomeID', 'Season'],
                                            right_on=['ID','Year'],
                                            suffixes=('','Home'))
data_final = data_final.reset_index().merge(conf_df,
                                            left_on=['VisID', 'Season'],
                                            right_on=['ID','Year'],
                                            suffixes=('','Vis'))
data_final = data_final.set_index(['HomeID', 'VisID', 'Season', 'Week'])
data_final = data_final.drop(['ID','Year','IDVis','index'],1)

# Impute HomeConf Data
data_final['HomeConf_NotMajor'] = data_final['HomeConf_NotMajor'].fillna(0)
data_final['PredSpread'] = data_final['HomePredFinal'] - data_final['VisPredFinal']

################################################################################
# Train - Val - Test Splits
################################################################################

X_train = data_final[(data_final.index.get_level_values(2)<2016) & \
                     (data_final['Conf']!='NotMajor') &
                     (data_final.index.get_level_values(3)>4)].\
                                       drop(['target_margin'], axis=1).\
                                       fillna(data_final.mean())
y_train = data_final[(data_final.index.get_level_values(2)<2016) & \
                     (data_final['Conf']!='NotMajor') & \
                     (data_final.index.get_level_values(3)>4)]['target_margin']

X_val = data_final[(data_final.index.get_level_values(2)==2016) & \
                   (data_final['Conf']!='NotMajor') & \
                   (data_final.index.get_level_values(3)>4)].\
                                     drop(['target_margin'], axis=1).\
                                     fillna(data_final.mean())
y_val = data_final[(data_final.index.get_level_values(2)==2016) & \
                   (data_final['Conf']!='NotMajor') & \
                   (data_final.index.get_level_values(3)>4)]['target_margin']

X_test = data_final[(data_final.index.get_level_values(2)==2017) & \
                    (data_final['Conf']!='NotMajor') & \
                    (data_final.index.get_level_values(3)>4)].\
                                      drop(['target_margin'], axis=1).\
                                      fillna(data_final.mean())
y_test = data_final[(data_final.index.get_level_values(2)==2017) & \
                    (data_final['Conf']!='NotMajor') & \
                    (data_final.index.get_level_values(3)>4)]['target_margin']

################################################################################
# Final Feature Selection
################################################################################

base_featurestouse = [col for col in data_final.columns if \
                      col.find('InSeason') > -1]

swc_featurestouse = ['Glicko_Rating_Home', 'Glicko_Rating_Away',
                     'Glicko_Rating_Deviance_Home', 'Glicko_Rating_Deviance_Away',
                     'Glicko_Sigma_Home', 'Glicko_Sigma_Away',
                     'CTMC_Rating_Home', 'CTMC_Rating_Away','HomePredFinal',
                     'VisPredFinal','CurSeason_HH_VA_mean_margin']

ijh_featurestouse = ['HomeElo', 'HomeEloProb','HomeLuck','HomePrevLuck',
                    'HomePythPct','HomePythWins','HomeWinPct','VisElo',
                    'VisEloProb','VisLuck', 'VisPrevLuck','VisPythPct',
                    'VisPythWins','VisWinPct','SpreadElo','HomeConf_NotMajor',
                    'VisConf_NotMajor']

cdr_featurestouse = bcs_df.columns.values.tolist()

# featurestouse = base_featurestouse + swc_featurestouse + ijh_featurestouse + cdr_featurestouse
featurestouse = swc_featurestouse + ijh_featurestouse + cdr_featurestouse
featurestouse.append('PredSpread')

################################################################################
# Baseline Model
################################################################################

# Current season average margins diff
X_train['CurSeason_HH_VA_mean_margin'] = (X_train['SpreadHomeInSeasonAvg'] \
                                    - X_train['SpreadVisInSeasonAvg']*-1)
X_val['CurSeason_HH_VA_mean_margin'] = (X_val['SpreadHomeInSeasonAvg'] \
                                        - X_val['SpreadVisInSeasonAvg']*-1)
X_test['CurSeason_HH_VA_mean_margin'] = (X_test['SpreadHomeInSeasonAvg'] \
                                        - X_test['SpreadVisInSeasonAvg']*-1)

# Train and report baseline model with current season metric
ols = LinearRegression()
ols.fit(X_train['CurSeason_HH_VA_mean_margin'].values.reshape(-1,1), y_train)
ols.score(X_train['CurSeason_HH_VA_mean_margin'].values.reshape(-1,1), y_train)
ols.score(X_val['CurSeason_HH_VA_mean_margin'].values.reshape(-1,1), y_val)
preds = ols.predict(X_val['CurSeason_HH_VA_mean_margin'].values.reshape(-1,1))
mean_squared_error(preds, y_val)

################################################################################
# Standardize data
################################################################################

####### Scale data select features
standardscaler = StandardScaler()
X_trainscaled = standardscaler.fit_transform(X_train[featurestouse])
X_valscaled = standardscaler.transform(X_val[featurestouse])
X_testscaled = standardscaler.transform(X_test[featurestouse])


In [20]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression #penalty, C, 
from sklearn.linear_model import ElasticNet #alpha, l1_ratio, warm_start, 
from sklearn.svm import SVC #C, kernel, degree-polynomial, gamma, 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

In [136]:
data_final['HomeWin'] = data_final.apply(lambda x: 1 if x['target_margin'] > 0 else 0, axis=1)
X_trainC = data_final[(data_final.index.get_level_values(2)<2016) & \
                     (data_final['Conf']!='NotMajor') &
                     (data_final.index.get_level_values(3)>4)].\
                                       drop(['target_margin', 'HomeWin'], axis=1).\
                                       fillna(data_final.mean())
y_trainC = data_final[(data_final.index.get_level_values(2)<2016) & \
                     (data_final['Conf']!='NotMajor') & \
                     (data_final.index.get_level_values(3)>4)]['HomeWin']

X_valC = data_final[(data_final.index.get_level_values(2)==2016) & \
                   (data_final['Conf']!='NotMajor') & \
                   (data_final.index.get_level_values(3)>4)].\
                                     drop(['target_margin', 'HomeWin'], axis=1).\
                                     fillna(data_final.mean())
y_valC = data_final[(data_final.index.get_level_values(2)==2016) & \
                   (data_final['Conf']!='NotMajor') & \
                   (data_final.index.get_level_values(3)>4)]['HomeWin']

X_testC = data_final[(data_final.index.get_level_values(2)==2017) & \
                    (data_final['Conf']!='NotMajor') & \
                    (data_final.index.get_level_values(3)>4)].\
                                      drop(['target_margin', 'HomeWin'], axis=1).\
                                      fillna(data_final.mean())
y_testC = data_final[(data_final.index.get_level_values(2)==2017) & \
                    (data_final['Conf']!='NotMajor') & \
                    (data_final.index.get_level_values(3)>4)]['HomeWin']

# Current season average margins diff
X_trainC['CurSeason_HH_VA_mean_margin'] = (X_trainC['SpreadHomeInSeasonAvg'] \
                                    - X_trainC['SpreadVisInSeasonAvg']*-1)
X_valC['CurSeason_HH_VA_mean_margin'] = (X_valC['SpreadHomeInSeasonAvg'] \
                                        - X_valC['SpreadVisInSeasonAvg']*-1)
X_testC['CurSeason_HH_VA_mean_margin'] = (X_testC['SpreadHomeInSeasonAvg'] \
                                        - X_testC['SpreadVisInSeasonAvg']*-1)


featurestouse2 = ['Glicko_Rating_Home','Glicko_Rating_Away','Glicko_Rating_Deviance_Home','Glicko_Rating_Deviance_Away',
 'Glicko_Sigma_Home','Glicko_Sigma_Away','CTMC_Rating_Home','CTMC_Rating_Away',
 'CurSeason_HH_VA_mean_margin', 'PredSpread',
 'HomeElo','HomeEloProb','HomeLuck','HomePrevLuck','HomePythPct','HomePythWins','HomeWinPct',
 'VisElo','VisEloProb','VisLuck','VisPrevLuck','VisPythPct','VisPythWins','VisWinPct',
 'SpreadElo','HomeConf_NotMajor','VisConf_NotMajor',
 'HomeBCSSOS','HomeRPI','AwayBCSSOS','AwayRPI']

standardscaler = StandardScaler()
X_trainscaledC = standardscaler.fit_transform(X_trainC[featurestouse])
X_valscaledC = standardscaler.transform(X_valC[featurestouse])
X_testscaledC = standardscaler.transform(X_testC[featurestouse])

X_trainscaledC2 = standardscaler.fit_transform(X_trainC[featurestouse2])
X_valscaledC2 = standardscaler.transform(X_valC[featurestouse2])
X_testscaledC2 = standardscaler.transform(X_testC[featurestouse2])

dataDict = {'X_train':X_trainscaledC, 'X_val':X_valscaledC, 'X_train2':X_trainscaledC2, 'X_val2':X_valscaledC2,
            'y_trainC':y_trainC, 'y_valC':y_valC, 'y_train':y_train, 'y_val':y_val}


In [100]:
def chainedRegression(nameC, nameR, estimatorC, estimatorR, dataDict=dataDict, feats=False, est=False):
    
    if feats:
        X_trainC2 = dataDict['X_train2'].copy()
        X_valC2 = dataDict['X_val2'].copy()
    else:
        X_trainC2 = dataDict['X_train'].copy()
        X_valC2 = dataDict['X_val'].copy()
    
    classifer = estimatorC.fit(X_trainC2, dataDict['y_trainC'])
    predsTrain = classifer.predict_proba(X_trainC2)[:,1]
    predsVal = classifer.predict_proba(X_valC2)[:,1]
    print(nameC, roc_auc_score(dataDict['y_valC'], predsVal))
    X_trainC2 = np.concatenate((X_trainC2, predsTrain.reshape(-1,1)), 1)
    
    X_valC2 = np.concatenate((X_valC2, predsVal.reshape(-1,1)), 1)
    
    if est:
        X_trainC2 = np.concatenate((X_trainC2, np.where(predsTrain >0.5, 1, 0).reshape(-1,1)), 1)
        X_valC2 = np.concatenate((X_valC2, np.where(predsVal >0.5, 1, 0).reshape(-1,1)), 1)
    
    regressor = estimatorR.fit(X_trainC2, dataDict['y_train'])
    predsReg = regressor.predict(X_valC2)
    print(nameR, mean_squared_error(dataDict['y_val'], predsReg))
    
    return regressor

In [137]:
logit_kr1 = chainedRegression('Logit1', 'KernelRidge1',
                              LogisticRegression(C=0.1), KernelRidge(alpha=0.0001, gamma=1e-5),
                              dataDict, False, False)

logit_kr2 = chainedRegression('Logit2', 'KernelRidge1',
                              LogisticRegression(C=0.1), KernelRidge(alpha=0.0001, gamma=1e-5),
                              dataDict, True, False)

logit_kr3 = chainedRegression('Logit1', 'KernelRidge2',
                              LogisticRegression(C=0.1), KernelRidge(alpha=0.0001, gamma=1e-5),
                              dataDict, False, True)

logit_kr4 = chainedRegression('Logit2', 'KernelRidge2',
                              LogisticRegression(C=0.1), KernelRidge(alpha=0.0001, gamma=1e-5),
                              dataDict, True, True)

Logit1 0.8203282424714073
KernelRidge1 277.11463569267886
Logit2 0.8201336872385663
KernelRidge1 277.15381310803093
Logit1 0.8203282424714073
KernelRidge2 277.0286763213798
Logit2 0.8201336872385663
KernelRidge2 277.05406977076245


In [143]:
mean_squared_error(dataDict['y_val'], KernelRidge(alpha=0.0001, gamma=1e-5).fit(dataDict['X_train'][:,9].reshape(-1, 1), dataDict['y_train']).predict(dataDict['X_val'][:,9].reshape(-1, 1)))

338.0486320337523

In [138]:
rfr = RandomForestRegressor(min_samples_split=10).fit(dataDict['X_train'], dataDict['y_train'])
print(mean_squared_error(dataDict['y_val'], rfr.predict(dataDict['X_val'])))

importances = rfr.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfr.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(dataDict['X_train'].shape[1]):
    print("%d. %s (%f)" % (f + 1, featurestouse[indices[f]], importances[indices[f]]))

314.50065455164804
Feature ranking:
1. PredSpread (0.512126)
2. CurSeason_HH_VA_mean_margin (0.033038)
3. HomePredFinal (0.030247)
4. VisPredFinal (0.027358)
5. VisPythWins (0.024368)
6. VisPythPct (0.024131)
7. Glicko_Sigma_Home (0.023550)
8. Glicko_Rating_Away (0.023321)
9. CTMC_Rating_Away (0.021645)
10. VisElo (0.021572)
11. Glicko_Sigma_Away (0.020074)
12. HomePythWins (0.019631)
13. Glicko_Rating_Home (0.019098)
14. HomePythPct (0.019096)
15. CTMC_Rating_Home (0.018839)
16. HomeElo (0.018345)
17. Glicko_Rating_Deviance_Home (0.018263)
18. Glicko_Rating_Deviance_Away (0.018207)
19. HomeBCSSOS (0.015169)
20. AwayBCSSOS (0.013260)
21. AwayRPI (0.010767)
22. VisEloProb (0.010357)
23. VisWinPct (0.010041)
24. SpreadElo (0.009716)
25. HomeEloProb (0.009478)
26. HomeRPI (0.009321)
27. HomeWinPct (0.009034)
28. HomePrevLuck (0.002649)
29. HomeLuck (0.002631)
30. VisPrevLuck (0.002356)
31. VisLuck (0.002307)
32. VisConf_NotMajor (0.000003)
33. HomeConf_NotMajor (0.000000)


In [139]:
rfr = RandomForestRegressor(min_samples_split=10).fit(dataDict['X_train2'], dataDict['y_train'])
print(mean_squared_error(dataDict['y_val'], rfr.predict(dataDict['X_val2'])))

importances = rfr.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfr.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(dataDict['X_train2'].shape[1]):
    print("%d. %s (%f)" % (f + 1, featurestouse2[indices[f]], importances[indices[f]]))

299.45737193475327
Feature ranking:
1. PredSpread (0.534885)
2. CurSeason_HH_VA_mean_margin (0.033658)
3. VisPythPct (0.025309)
4. HomePythPct (0.025081)
5. CTMC_Rating_Home (0.024776)
6. CTMC_Rating_Away (0.024658)
7. Glicko_Rating_Away (0.024461)
8. Glicko_Rating_Home (0.023823)
9. HomePythWins (0.022531)
10. Glicko_Sigma_Home (0.022097)
11. VisPythWins (0.020667)
12. VisElo (0.020454)
13. Glicko_Sigma_Away (0.020385)
14. Glicko_Rating_Deviance_Away (0.018933)
15. Glicko_Rating_Deviance_Home (0.018644)
16. HomeElo (0.017466)
17. HomeBCSSOS (0.015095)
18. AwayBCSSOS (0.014990)
19. VisWinPct (0.013045)
20. AwayRPI (0.011996)
21. VisEloProb (0.011470)
22. HomeEloProb (0.011155)
23. HomeWinPct (0.010838)
24. HomeRPI (0.010537)
25. SpreadElo (0.010086)
26. HomeLuck (0.003515)
27. HomePrevLuck (0.003252)
28. VisLuck (0.003063)
29. VisPrevLuck (0.003012)
30. VisConf_NotMajor (0.000118)
31. HomeConf_NotMajor (0.000000)


In [97]:
def doGridSearchC(X_train, y_train, X_val, y_val, model, param_grid, scorer):
    X_train_val = np.vstack((X_train, X_val))
    y_train_val = np.concatenate((y_train, y_val))
    val_fold = [-1]*len(X_train) + [0]*len(X_val) #0 corresponds to validation
    estimator = model.fit(X_train, y_train)
    grid = GridSearchCV(estimator,
                        param_grid,
                        return_train_score=False,
                        cv = PredefinedSplit(test_fold=val_fold),
                        refit = True,
                        scoring = scorer)
    grid.fit(X_train_val, y_train_val)
    return grid.best_estimator_

def chainClassiferGridSearch(nameC, nameR, estimatorC, estimatorR, paramC, dataDict=dataDict, feats=False, est=False):

    scorer = make_scorer(roc_auc_score, greater_is_better = True)
    
    if feats:
        X_trainC2 = dataDict['X_train2'].copy()
        X_valC2 = dataDict['X_val2'].copy()
    else:
        X_trainC2 = dataDict['X_train'].copy()
        X_valC2 = dataDict['X_val'].copy()
    
    classifer = doGridSearchC(X_trainC2, dataDict['y_trainC'], X_valC2, dataDict['y_valC'], estimatorC, paramC, scorer)
    classifer = classifer.fit(X_trainC2, dataDict['y_trainC'])
    predsTrain = classifer.predict_proba(X_trainC2)[:,1]
    predsVal = classifer.predict_proba(X_valC2)[:,1]
    print(nameC, roc_auc_score(dataDict['y_valC'], predsVal))

    X_trainC2 = np.concatenate((X_trainC2, predsTrain.reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, predsVal.reshape(-1,1)), 1)
    
    if est:
        X_trainC2 = np.concatenate((X_trainC2, np.where(predsTrain >0.5, 1, 0).reshape(-1,1)), 1)
        X_valC2 = np.concatenate((X_valC2, np.where(predsVal >0.5, 1, 0).reshape(-1,1)), 1)

    regressor = estimatorR.fit(X_trainC2, dataDict['y_train'])
    predsReg = regressor.predict(X_valC2)
    print(nameR, mean_squared_error(dataDict['y_val'], predsReg))
    return(clssifer)

def chainRegressorGridSearch(nameC, nameR, estimatorC, estimatorR, paramR, dataDict=dataDict, feats=False, est=False):

    scorer = make_scorer(mean_squared_error, greater_is_better = False)
    
    if feats:
        X_trainC2 = dataDict['X_train2'].copy()
        X_valC2 = dataDict['X_val2'].copy()
    else:
        X_trainC2 = dataDict['X_train'].copy()
        X_valC2 = dataDict['X_val'].copy()
    
    classifer = estimatorC.fit(X_trainC2, dataDict['y_trainC'])
    predsTrain = classifer.predict_proba(X_trainC2)[:,1]
    predsVal = classifer.predict_proba(X_valC2)[:,1]
    print(nameC, roc_auc_score(dataDict['y_valC'], predsVal))

    X_trainC2 = np.concatenate((X_trainC2, predsTrain.reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, predsVal.reshape(-1,1)), 1)
    
    if est:
        X_trainC2 = np.concatenate((X_trainC2, np.where(predsTrain >0.5, 1, 0).reshape(-1,1)), 1)
        X_valC2 = np.concatenate((X_valC2, np.where(predsVal >0.5, 1, 0).reshape(-1,1)), 1)
    
    regressor = doGridSearchC(X_trainC2, dataDict['y_train'], X_valC2, dataDict['y_val'], estimatorR, paramR, scorer)
    regressor = regressor.fit(X_trainC2, dataDict['y_train'])
    predsReg = regressor.predict(X_valC2)
    print(nameR, mean_squared_error(dataDict['y_val'], predsReg))
    return(regressor)


def chainDoubleGridSearch(nameC, nameR, estimatorC, estimatorR, paramC, paramR, dataDict=dataDict, feats=False, est=False):

    scorerC = make_scorer(roc_auc_score, greater_is_better = True)
    scorerR = make_scorer(mean_squared_error, greater_is_better = False)    
    
    if feats:
        X_trainC2 = dataDict['X_train2'].copy()
        X_valC2 = dataDict['X_val2'].copy()
    else:
        X_trainC2 = dataDict['X_train'].copy()
        X_valC2 = dataDict['X_val'].copy()
    
    classifer = doGridSearchC(X_trainC2, dataDict['y_trainC'], X_valC2, dataDict['y_valC'], estimatorC, paramC, scorerC)
    classifer = classifer.fit(X_trainC2, dataDict['y_trainC'])
    predsTrain = classifer.predict_proba(X_trainC2)[:,1]
    predsVal = classifer.predict_proba(X_valC2)[:,1]
    print(nameC, roc_auc_score(dataDict['y_valC'], predsVal))

    X_trainC2 = np.concatenate((X_trainC2, predsTrain.reshape(-1,1)), 1)
    X_trainC2 = np.concatenate((X_trainC2, np.where(predsTrain >0.5, 1, 0).reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, predsVal.reshape(-1,1)), 1)
    X_valC2 = np.concatenate((X_valC2, np.where(predsVal >0.5, 1, 0).reshape(-1,1)), 1)

    regressor = doGridSearchC(X_trainC2, dataDict['y_train'], X_valC2, dataDict['y_val'], estimatorR, paramR, scorerR)
    regrsregressorsor = regressor.fit(X_trainC2, dataDict['y_train'])
    predsReg = regressor.predict(X_valC2)
    print(nameR, mean_squared_error(dataDict['y_val'], predsReg))
    return(classifer, regressor)

def chainGridSearchWrapper(nameC, nameR, clssfier, estimatorC, estimatorR, params, dataDict=dataDict, feats=False, est=False):
    if len(params) == 2:
        return chainDoubleGridSearch(nameC, nameR, estimatorC, estimatorR, params[0], params[1], dataDict, feats, est)
    elif clssfier:
        return chainClassiferGridSearch(nameC, nameR, estimatorC, estimatorR, params, dataDict, feats, est)
    else:
        return chainRegressorGridSearch(nameC, nameR, estimatorC, estimatorR, params, dataDict, feats, est)

In [98]:
logit_kr1 = chainGridSearchWrapper('Logit1', 'KernelRidge1', False,
                          LogisticRegression(), KernelRidge(),
                          [[{'C':10**np.arange(-5, 1, 1.0)}],
                          [{'alpha':10**np.arange(-5, 1, 1.0),
                           'gamma':10**np.arange(-5, 1, 1.0)}]], 
                          dataDict, False, False)

logit_kr2 = chainGridSearchWrapper('Logit2', 'KernelRidge1', False,
                          LogisticRegression(), KernelRidge(),
                          [[{'C':10**np.arange(-5, 1, 1.0)}],
                          [{'alpha':10**np.arange(-5, 1, 1.0),
                           'gamma':10**np.arange(-5, 1, 1.0)}]], 
                          dataDict, True, False)

logit_kr3 = chainGridSearchWrapper('Logit1', 'KernelRidge2', False,
                          LogisticRegression(), KernelRidge(),
                          [[{'C':10**np.arange(-5, 1, 1.0)}],
                          [{'alpha':10**np.arange(-5, 1, 1.0),
                           'gamma':10**np.arange(-5, 1, 1.0)}]], 
                          dataDict, False, True)

logit_kr4 = chainGridSearchWrapper('Logit2', 'KernelRidge2', False,
                          LogisticRegression(), KernelRidge(),
                          [[{'C':10**np.arange(-5, 1, 1.0)}],
                          [{'alpha':10**np.arange(-5, 1, 1.0),
                           'gamma':10**np.arange(-5, 1, 1.0)}]], 
                          dataDict, True, True)

Logit1 0.820036409622146


KeyboardInterrupt: 