In [2]:
import os
import itertools
import glob
import datetime
import numpy as np
import pandas as pd
from datetime import *

In [9]:
# Directory layout. Everything is resolved relative to the directory the
# notebook is started from.
root_dir = os.getcwd()
data_dir = os.path.join(root_dir, "data")  # input CSVs are looked up below this folder
conf_dir = 'conferences'                   # sub-folder of data_dir with the conference table


def load_csvs(file_names):
    """Load several CSV files and concatenate them into one DataFrame.

    Parameters
    ----------
    file_names : iterable
        Paths (or anything else ``pd.read_csv`` accepts) of the files to
        read, in the order their rows should appear in the result.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked vertically. Every file keeps its own
        row index (nothing is re-indexed). An empty DataFrame is returned
        when ``file_names`` is empty.
    """
    # Read everything first and concatenate once: growing a DataFrame with
    # pd.concat inside the loop re-copies all earlier rows on every pass.
    frames = [pd.read_csv(each_file) for each_file in file_names]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(frames)

def join_data(scores_df, stats_df, odds_df):
    """
    Join the scores, stats and odds tables into one row per game.

    Games are matched on the date the game was played plus the home and
    away abbreviated names (not all data sets have a HomeID and AwayID).

    Parameters
    ----------
    scores_df : pandas.DataFrame
        Needs 'WeekDate' ("%Y-%m-%d" strings), 'Home', 'Visiter',
        'HomeFinal' and 'VisFinal'.
    stats_df : pandas.DataFrame
        Needs 'Start' ("%Y-%m-%d %H:%M:%S" strings), 'Home', 'Away', and the
        columns 'Season' and 'Week', which are dropped before joining.
    odds_df : pandas.DataFrame
        Needs 'DATE(date)' ("%Y-%m-%d" strings), 'Home', 'Away',
        'Spread_Mirage', and the columns 'HomeScore' and 'AwayScore', which
        are dropped before joining.

    Returns
    -------
    pandas.DataFrame
        Inner join of the three tables with the added columns 'Year',
        'Month', 'Day', 'target_margin' (HomeFinal - VisFinal) and
        'D1_Match' (True where 'Spread_Mirage' is not null).

    Notes
    -----
    The input frames are not modified; the date-part join keys are added to
    copies.
    """

    def _with_date_parts(frame, column, fmt):
        # Parse the date column once and expose its parts as join keys.
        # assign() returns a new frame, so the caller's frame stays untouched.
        parsed = pd.to_datetime(frame[column], format=fmt)
        return frame.assign(Year=parsed.dt.year,
                            Month=parsed.dt.month,
                            Day=parsed.dt.day)

    # Add dates to join on
    scores = _with_date_parts(scores_df, 'WeekDate', "%Y-%m-%d")
    stats = _with_date_parts(stats_df, 'Start', "%Y-%m-%d %H:%M:%S")
    odds = _with_date_parts(odds_df, 'DATE(date)', "%Y-%m-%d")

    # The scores table calls the away team 'Visiter'; the other two use 'Away'.
    left_keys = ['Year', 'Month', 'Day', 'Home', 'Visiter']
    right_keys = ['Year', 'Month', 'Day', 'Home', 'Away']

    # Join Data
    data = scores.merge(
        stats.drop(['Season', 'Start', 'Week'], axis=1),
        left_on=left_keys,
        right_on=right_keys)
    data = data.merge(
        odds.drop(['DATE(date)', 'HomeScore', 'AwayScore'], axis=1),
        left_on=left_keys,
        right_on=right_keys)

    # Target feature
    data['target_margin'] = data['HomeFinal'] - data['VisFinal']

    # Other features
    data['D1_Match'] = data['Spread_Mirage'].notnull()

    return data

# Load data locations (sub-folders of data_dir; the odds file sits directly
# in data_dir, hence the empty string)
scores_dir = 'scores_pe'
stats_dir = 'stats'
odds_dir = ''

# glob returns matches in arbitrary order, so sort them: the files are
# concatenated in list order and the row order should be the same on every run.
# data_dir is already an absolute path, so it is not prefixed with root_dir again.
scores_names = sorted(glob.glob(os.path.join(data_dir, scores_dir, "scores_pythElo201?.csv")))
stats_names = sorted(glob.glob(os.path.join(data_dir, stats_dir, "ncaastats201?.csv")))
odds_names = [os.path.join(data_dir, odds_dir, "NCAAF_Odds.csv")]

# Import data and join
scores_df = load_csvs(scores_names)
stats_df = load_csvs(stats_names)
odds_df = load_csvs(odds_names)
data = join_data(scores_df, stats_df, odds_df)

# Per-game spread table: one row per (HomeID, VisID, Season, Week) holding
# every "Spread_*" column plus the realised margin.
# Missing spreads are left as NaN here; rows with gaps are dropped later,
# per feature set, when the models are fitted.
game_keys = ['HomeID', 'VisID', 'Season', 'Week']
data_by_game = data.set_index(game_keys)
spreads = data_by_game.filter(regex="Spread_")
# NOTE(review): the join aligns on the game index; if a key occurs more than
# once it yields every combination of the matching rows - confirm the keys
# are unique.
spreads = spreads.join(pd.DataFrame(data_by_game['target_margin']))

# Join Conference Data
file = os.path.join(data_dir, conf_dir, "mergedConferences.csv")
conf_df = pd.read_csv(file).drop_duplicates()
# Home team: the conference columns come in under their own names.
spreads = spreads.reset_index().merge(conf_df,
                                      left_on=['HomeID', 'Season'],
                                      right_on=['ID', 'Year'],
                                      suffixes=('', 'Home'))
# Visiting team: conference columns that clash with the ones added above get
# the 'Vis' suffix.
spreads = spreads.merge(conf_df,
                        left_on=['VisID', 'Season'],
                        right_on=['ID', 'Year'],
                        suffixes=('', 'Vis'))
spreads['Week'] = spreads['Week'].astype(int)
# NOTE(review): 2016 week numbers are shifted down by one - presumably to line
# them up with the other seasons; confirm against the source data.
spreads['Week'] = np.where(spreads['Season'] == 2016, spreads['Week'] - 1, spreads['Week'])
# Remove the merge helper columns. `axis` is passed by keyword because
# pandas 2.x no longer accepts it positionally.
spreads = spreads.drop(['ID', 'Year', 'IDVis', 'Team', 'TeamVis', 'ConfVis', 'YearVis'],
                       axis=1)


In [12]:
# Spread columns that are summarised row by row below.
casinos = [
    'Spread_Mirage', 'Spread_Pinnacle', 'Spread_Sportsbet',
    'Spread_Westgate', 'Spread_Station', 'Spread_SIA',
    'Spread_SBG', 'Spread_BetUS',
]

casino_spreads = spreads[casinos]
# Row-wise median across the listed columns.
spreads['SpreadMed'] = casino_spreads.median(axis=1)
# mode(axis=1) returns one column per tied value; column 0 keeps the first one.
spreads['SpreadMode'] = casino_spreads.mode(axis=1)[0]
# NOTE(review): the modelling cells use 'Spread_Med' / 'Spread_Mode' (with an
# underscore) - presumably those come from the post-R files; confirm.

# SPLIT FOR USE IN R

In [13]:
# Write one CSV per season and week position for the R step. The first file
# of a season (position 5) bundles the first five week groups; every later
# position gets only its own week group. Positions count the distinct 'Week'
# groups of the season starting at 1; they are not the 'Week' values themselves.
# The target folders must already exist.
pre_template = 'data/new_odds/pre/pre_{}/odds{}_{}.csv'
for season, season_games in spreads.groupby('Season'):
    weekly = [week_games for _, week_games in season_games.groupby('Week')]
    for position, week_games in enumerate(weekly, start=1):
        if position < 5:
            continue  # these weeks are only exported as part of the position-5 bundle
        chunk = pd.concat(weekly[:position]) if position == 5 else week_games
        chunk.to_csv(pre_template.format(season, season, position), index=False)

# RECOMBINE AFTER R

In [14]:
# Read back the files produced by the R step. For each variant n in 2..5 the
# four seasons 2013-2016 are stacked into one frame, and the row-wise median
# and first mode over the casino columns are added. new_nnList[k] holds the
# frame for n = k + 2.
post_template = 'data/new_odds/post/post_{}/odds{}_{}_{}.csv'
seasons = range(2013, 2017)
# The file name carries the largest 'Week' value of the season.
last_week = {yr: spreads.loc[spreads['Season'] == yr, 'Week'].max() for yr in seasons}

new_nnList = []
for n in range(2, 6):
    combined = pd.concat(
        [pd.read_csv(post_template.format(yr, yr, last_week[yr], n)) for yr in seasons]
    )
    combined['Spread_Med2'] = combined[casinos].median(axis=1)
    combined['Spread_Mode2'] = combined[casinos].mode(axis=1)[0]
    new_nnList.append(combined)

    

# MODELING

In [15]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split

from sklearn.linear_model import *
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor

In [16]:
# Candidate feature sets. The larger sets are composed from the smaller ones
# so that every column name is written only once; each `+` builds a new list,
# so no two sets share a list object.

# Individual casino spread columns.
orig_casinos = [
    'Spread_Mirage', 'Spread_Pinnacle', 'Spread_Sportsbet',
    'Spread_Westgate', 'Spread_Station', 'Spread_SIA',
    'Spread_SBG', 'Spread_BetUS',
]

# Summary columns.
orig_med = ['Spread_Med']
orig_mode = ['Spread_Mode']
orig_summaries = orig_med + orig_mode
orig_total = orig_casinos + orig_summaries

# Columns added after the R step.
new_med = ['Spread_Med2']
new_mode = ['Spread_Mode2']
new_rec = ['Spread_Rec']
new_summaries = new_med + new_mode
# Casinos + original summaries + 'Spread_Rec' (without the "2" summaries).
new_total = orig_total + new_rec

# Everything: casinos, both generations of summaries, and 'Spread_Rec'.
new_total_total = orig_total + new_summaries + new_rec

# (label, column list) pairs, in the order in which they are evaluated.
features = [
    ('orig_casinos', orig_casinos),
    ('orig_summaries', orig_summaries),
    ('orig_total', orig_total),
    ('orig_med', orig_med),
    ('orig_mode', orig_mode),
    ('new_med', new_med),
    ('new_mode', new_mode),
    ('new_summaries', new_summaries),
    ('new_total', new_total),
    ('new_rec', new_rec),
    ('new_total_total', new_total_total),
]



# What's the best nn?

In [17]:
# Compare the four "nn" variants: for every feature set, fit a plain linear
# regression on the seasons before 2016 and print the mean squared error on
# the 2016 season.
for i, nn in enumerate(range(2, 6)):
    print('Results for nn: {}'.format(nn))
    nn_frame = new_nnList[i]
    for name, feats in features:
        # Keep only rows that are complete in every column used here.
        needed_cols = feats + ['Season', 'target_margin']
        complete_rows = nn_frame[needed_cols].dropna()

        is_train = complete_rows['Season'] < 2016
        is_val = complete_rows['Season'] == 2016
        X_train = complete_rows.loc[is_train, feats]
        y_train = complete_rows.loc[is_train, 'target_margin']
        X_val = complete_rows.loc[is_val, feats]
        y_val = complete_rows.loc[is_val, 'target_margin']

        # The scaler is fitted on the training seasons only and then applied
        # unchanged to the validation season.
        standardscaler = StandardScaler().fit(X_train)
        X_trainS = standardscaler.transform(X_train)
        X_valS = standardscaler.transform(X_val)

        linear_model = LinearRegression().fit(X_trainS, y_train)
        val_mse = mean_squared_error(y_val, linear_model.predict(X_valS))
        print('{}: {}'.format(name, val_mse))
    print('*****************************************************')

Results for nn: 2
orig_casinos: 283.2016754426179
orig_summaries: 272.3366434836036
orig_total: 290.2088879050677
orig_med: 272.2493773623063
orig_mode: 272.2827716719263
new_med: 317.0148668191828
new_mode: 297.9693020273856
new_summaries: 300.43638899094105
new_total: 270.90237413115915
new_rec: 425.5254904407057
new_total_total: 272.3969770544182
*****************************************************
Results for nn: 3
orig_casinos: 288.8386342518825
orig_summaries: 272.3366434836036
orig_total: 294.3823559019624
orig_med: 272.2493773623063
orig_mode: 272.2827716719263
new_med: 317.119491003821
new_mode: 299.74308140732523
new_summaries: 301.26496411682405
new_total: 283.5560140094089
new_rec: 430.96058421973464
new_total_total: 285.96907191419
*****************************************************
Results for nn: 4
orig_casinos: 289.7095097342245
orig_summaries: 272.3366434836036
orig_total: 294.05749100560746
orig_med: 272.2784528139681
orig_mode: 272.2827716719263
new_med: 317.23120

In [18]:
# Continue with the first variant (index 0, i.e. nn = 2) and save it for reuse.
# new_spreads is the same object as new_nnList[0], not a copy.
new_spreads = new_nnList[0]
new_spreads.to_csv(path_or_buf='data/new_odds/new_odds.csv', index=False)

# Test Out-of-the-box Models

In [20]:
# Baseline: KernelRidge with default settings, one fit per feature set.
# Trained on the seasons before 2016, scored by MSE on the 2016 season.
for name, feats in features:
    # Keep only rows that are complete in every column used here.
    needed_cols = feats + ['Season', 'target_margin']
    complete_rows = new_spreads[needed_cols].dropna()

    is_train = complete_rows['Season'] < 2016
    is_val = complete_rows['Season'] == 2016
    X_train = complete_rows.loc[is_train, feats]
    y_train = complete_rows.loc[is_train, 'target_margin']
    X_val = complete_rows.loc[is_val, feats]
    y_val = complete_rows.loc[is_val, 'target_margin']

    # The scaler is fitted on the training seasons only.
    standardscaler = StandardScaler().fit(X_train)
    X_trainS = standardscaler.transform(X_train)
    X_valS = standardscaler.transform(X_val)

    ridge_model = KernelRidge().fit(X_trainS, y_train)
    val_mse = mean_squared_error(y_val, ridge_model.predict(X_valS))
    print('{}: {}'.format(name, val_mse))
    print('*****************************************************')

orig_casinos: 293.6657070410407
*****************************************************
orig_summaries: 323.91335619412985
*****************************************************
orig_total: 293.9583650617588
*****************************************************
orig_med: 323.8073677617688
*****************************************************
orig_mode: 323.6316464408711
*****************************************************
new_med: 355.20891382311106
*****************************************************
new_mode: 344.66858318887574
*****************************************************
new_summaries: 344.41061926852336
*****************************************************
new_total: 275.39117511175635
*****************************************************
new_rec: 440.86547681091815
*****************************************************
new_total_total: 277.2248937946552
*****************************************************


# Grid Search

In [36]:
def oddsGridSearch(new_spreads, estimators):
    """Grid-search each estimator and print its best validation MSE.

    For every entry the rows of ``new_spreads`` that are complete in the
    requested columns are split by season: seasons before 2016 form the
    training set and season 2016 the single validation fold. Features are
    standardised with a scaler fitted on the training rows only.

    Parameters
    ----------
    new_spreads : pandas.DataFrame
        Must contain 'Season', 'target_margin' and every feature column named
        in ``estimators``.
    estimators : iterable of (name, feats, est, params)
        ``name`` is the label to print, ``feats`` the list of feature columns,
        ``est`` an unfitted scikit-learn estimator and ``params`` the parameter
        grid handed to ``GridSearchCV``.

    Returns
    -------
    None
        The name, the best parameters and the validation MSE are printed.
    """
    # Local import: clone is only needed here to build the final model.
    from sklearn.base import clone

    # Negated MSE, because GridSearchCV maximises its score.
    neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

    for name, feats, est, params in estimators:
        needed_cols = list(feats) + ['Season', 'target_margin']
        complete_rows = new_spreads[needed_cols].dropna()

        is_train = complete_rows['Season'] < 2016
        is_val = complete_rows['Season'] == 2016
        X_train = complete_rows.loc[is_train, feats]
        X_val = complete_rows.loc[is_val, feats]
        y_train = complete_rows.loc[is_train, 'target_margin']
        y_val = complete_rows.loc[is_val, 'target_margin']

        standardscaler = StandardScaler()
        X_trainS = standardscaler.fit_transform(X_train)
        X_valS = standardscaler.transform(X_val)

        # GridSearchCV gets train and validation rows stacked together;
        # PredefinedSplit then tells it which rows are which.
        X_train_val = np.vstack((X_trainS, X_valS))
        y_train_val = np.concatenate((y_train, y_val))
        # -1: never used for validation (always training); 0: validation fold.
        val_fold = [-1] * len(X_trainS) + [0] * len(X_valS)
        grid = GridSearchCV(est,
                            params,
                            return_train_score=False,
                            cv=PredefinedSplit(test_fold=val_fold),
                            # No refit: it would train on train+validation
                            # rows, and that model was thrown away anyway.
                            refit=False,
                            scoring=neg_mse)
        grid.fit(X_train_val, y_train_val)

        # Fresh copy of the estimator with the winning parameters, trained on
        # the training seasons only so that the reported MSE is out-of-sample.
        bestimator = clone(est).set_params(**grid.best_params_)
        print(name)
        print(grid.best_params_)
        print('MSE: {}'.format(mean_squared_error(y_val, bestimator.fit(X_trainS, y_train).predict(X_valS))))
        print('*****************************************************')

In [37]:
# BayesianRidge: the same hyper-parameter grid is searched for both feature
# sets. Each parameter takes 12 log-spaced values (step 0.25 in the exponent).
bayes_ridge_grid = [{
    'alpha_1': 10**np.arange(0, 3, 0.25),
    'alpha_2': 10**np.arange(-10, -7, 0.25),
    'lambda_1': 10**np.arange(-10, -7, 0.25),
    'lambda_2': 10**np.arange(0, 3, 0.25),
}]

oddsGridSearch(
    new_spreads,
    [
        ('orig_summaries', orig_summaries, BayesianRidge(), bayes_ridge_grid),
        ('new_total', new_total, BayesianRidge(), bayes_ridge_grid),
    ],
)


orig_summaries
{'alpha_1': 1.0, 'alpha_2': 1e-10, 'lambda_1': 5.6234132519034905e-08, 'lambda_2': 1.0}
MSE: 272.2890741636246
*****************************************************
new_total
{'alpha_1': 100.0, 'alpha_2': 5.623413251903491e-09, 'lambda_1': 3.1622776601683795e-09, 'lambda_2': 316.22776601683796}
MSE: 268.38235593292285
*****************************************************


In [35]:
# Re-fit BayesianRidge on the `new_total` features with fixed hyper-parameters
# and show the MSE on the 2016 season.
new_feats = new_total + ['Season', 'target_margin']
new_spreadTemp = new_spreads[new_feats].copy().dropna()

# Select the feature columns with `new_total` explicitly. This cell used to
# index with `feats`, a loop variable left over from whichever cell ran last,
# so the result depended on execution order and broke on Restart & Run All.
X_train = new_spreadTemp.loc[new_spreadTemp['Season']<2016, new_total]
X_val = new_spreadTemp.loc[new_spreadTemp['Season']==2016, new_total]
y_train = new_spreadTemp.loc[new_spreadTemp['Season']<2016,'target_margin']
y_val = new_spreadTemp.loc[new_spreadTemp['Season']==2016,'target_margin']

# The scaler is fitted on the training seasons only.
standardscaler = StandardScaler()
X_trainS = standardscaler.fit_transform(X_train)
X_valS = standardscaler.transform(X_val)

br = BayesianRidge(alpha_1=100.0, alpha_2=5.623413251903491e-09, 
              lambda_1=3.1622776601683795e-09, lambda_2=316.22776601683796).fit(X_trainS, y_train)
mean_squared_error(y_val, br.predict(X_valS))

268.38235593292285

In [38]:
# HuberRegressor: the same grid over epsilon and alpha is searched for both
# feature sets.
huber_grid = [{
    'epsilon': np.arange(1.0, 10, 0.25),
    'alpha': 10**np.arange(-7, 3, 0.25),
}]

oddsGridSearch(
    new_spreads,
    [
        ('orig_summaries', orig_summaries, HuberRegressor(), huber_grid),
        ('new_total', new_total, HuberRegressor(), huber_grid),
    ],
)


orig_summaries
{'alpha': 17.78279410038923, 'epsilon': 9.5}
MSE: 271.1672110862091
*****************************************************
new_total
{'alpha': 0.1778279410038923, 'epsilon': 1.75}
MSE: 267.57427540354036
*****************************************************


In [41]:
# KernelRidge: the two feature sets use different kernels. The RBF kernel is
# tuned over alpha and gamma, the linear kernel over alpha only.
kernel_alphas = 10**np.arange(-7, 3, 0.5)
kernel_gammas = 10**np.arange(-7, 3, 0.5)

oddsGridSearch(
    new_spreads,
    [
        ('orig_summaries', orig_summaries, KernelRidge(kernel='rbf'),
         [{'alpha': kernel_alphas, 'gamma': kernel_gammas}]),
        ('new_total', new_total, KernelRidge(kernel='linear'),
         [{'alpha': kernel_alphas}]),
    ],
)


orig_summaries
{'alpha': 3.1622776601683795, 'gamma': 0.01}
MSE: 271.0657360782656
*****************************************************
new_total
{'alpha': 3.1622776601683795}
MSE: 275.2552739053091
*****************************************************
