In [1]:
import numpy as np
import pandas as pd
from datetime import *

In [20]:
# Odds table is read here but not used anywhere in this cell.
odds = pd.read_csv('data/oddsAdjusted.csv')

# One slot per season file (2013 through 2016), filled in order below.
scores_list = [0] * 4

for i, year in enumerate(range(2013, 2017)):
    season = pd.read_csv('data/scores_pe/scores_pythElo{}.csv'.format(year))
    # SpreadMed = row-wise median over the eight per-casino spread columns.
    season['SpreadMed'] = season[['SpreadMirage','SpreadPinnacle','SpreadSportsbet','SpreadWestgate','SpreadStation','SpreadSIA','SpreadSBG', 'SpreadWag']].median(axis=1)
    scores_list[i] = season

# First three seasons are stacked into the training frame (original row labels
# are kept, so labels repeat across seasons); the fourth season is the test
# frame. `test` is the same object as scores_list[3], not a copy.
train = pd.concat(scores_list[:3])
test = scores_list[3]

In [23]:
def _pyth_matchup_prob(own, opp):
    """Return (own - own*opp) / (own + opp - 2*own*opp), element-wise.

    NOTE(review): this has the shape of the "log5" head-to-head formula --
    confirm that is the intent. A zero denominator (e.g. both inputs 0 or
    both 1) is not guarded against.
    """
    return (own - (own * opp)) / \
        (own + opp - 2 * (own * opp))


# Add both matchup-probability columns to the training frame, then to the test frame.
for frame in (train, test):
    frame['HomePythProb'] = _pyth_matchup_prob(frame['HomePythPct'], frame['VisPythPct'])
    frame['VisPythProb'] = _pyth_matchup_prob(frame['VisPythPct'], frame['HomePythPct'])

In [None]:
# Scratch list of column names. The list is a bare expression: it is not bound
# to any variable, and this cell has no execution count.
# NOTE(review): presumably a note of candidate feature columns -- confirm or delete.
['HomePtsF','HomePtsA','HomeWins','HomeGames','HomeWinPct','HomePtsFPG','HomePtsAPG','HomeDiffOD',
 'VisPtsF','VisPtsA','VisWins','VisGames','VisWinPct','VisPtsFPG','VisPtsAPG','VisDiffOD',
 'DiffPtsFPG','DiffPtsAPG']

In [41]:
# Per-week mean of HomePtsFG over the training frame; left as the last
# expression so the notebook displays it.
train.groupby('Week')['HomePtsFG'].agg('mean')

Week
1.0      0.000000
2.0     15.398058
3.0     26.305373
4.0     26.790685
5.0     27.885241
6.0     27.767751
7.0     27.150230
8.0     27.378685
9.0     27.860027
10.0    27.635616
11.0    27.797824
12.0    28.315481
13.0    31.118653
14.0    33.334579
15.0    39.053956
16.0    39.149930
17.0    34.825725
18.0    33.419872
19.0    33.948413
20.0    35.485561
21.0    45.000000
Name: HomePtsFG, dtype: float64

In [25]:
from sklearn.linear_model import *
from sklearn.kernel_ridge import * 
from sklearn.metrics import mean_squared_error

# Keep only rows where both probability features and Spread are non-null.
train2Sk = train.dropna(subset=['HomePythProb','VisPythProb','Spread'])[['HomePythProb','VisPythProb','Spread']]
test2Sk = test.dropna(subset=['HomePythProb','VisPythProb','Spread'])[['HomePythProb','VisPythProb','Spread']]

print('train size:', len(train2Sk))
print('test size:', len(test2Sk))

# Ordinary least squares: Spread ~ HomePythProb + VisPythProb, scored by test-set MSE.
model = LinearRegression()
model.fit(train2Sk[['HomePythProb','VisPythProb']], train2Sk[['Spread']])
preds = model.predict(test2Sk[['HomePythProb','VisPythProb']])
# NOTE(review): the label says 'Filled Median' but the features here are the
# two Pyth probabilities; label kept unchanged so the recorded output still matches.
print('Filled Median: ', mean_squared_error(test2Sk[['Spread']], preds))

train size: 10437
test size: 3481
Filled Median:  425.36342118725423


In [40]:
def team_spreadBias(team, prev_weeks):
    """Return the mean of (SpreadMed - Spread) over one team's games.

    Parameters
    ----------
    team : value compared against the HomeID / VisID columns.
    prev_weeks : DataFrame with HomeID, VisID, SpreadMed and Spread columns.
        It is not modified.

    Returns
    -------
    Mean of SpreadMed - Spread over every row in which `team` appears as
    HomeID or VisID, with the sign flipped on rows where `team` is the VisID.
    NaN when the team has no rows.
    """
    involved = (prev_weeks['HomeID'] == team) | (prev_weeks['VisID'] == team)
    games = prev_weeks.loc[involved].reset_index(drop=True)

    as_visitor = games['VisID'] == team
    gap = games['SpreadMed'] - games['Spread']

    # Negating both columns on visitor rows is the same as negating their difference.
    return gap.mask(as_visitor, -gap).mean()

def update_teamSpreadBiases(teams, prev_weeks):
    """Recompute the AvgErr and PlsMns columns of `teams` from `prev_weeks`.

    `teams` is indexed by team id. It is modified in place and also returned.
    AvgErr is team_spreadBias(team, prev_weeks) for each index value; PlsMns is
    AvgErr minus the mean AvgErr across all teams.
    """
    def bias_for(team_id):
        return team_spreadBias(team_id, prev_weeks)

    team_ids = teams.index.to_series()
    teams['AvgErr'] = team_ids.apply(bias_for)
    league_mean = teams['AvgErr'].mean()
    teams['PlsMns'] = teams['AvgErr'] - league_mean
    return teams

In [36]:
def spreads(teams, ind, teams_data):
    """Look up the AvgErr / PlsMns values for the teams of one game.

    Parameters
    ----------
    teams : iterable of team ids; the first is treated as 'Home', every later
        one as 'Vis'.
    ind : labels of the returned Series, each pre-filled with 0.0.
    teams_data : DataFrame indexed by team id with AvgErr and PlsMns columns.

    Returns
    -------
    Series labelled by `ind`, with '<pos>AvgErr' and '<pos>PlsMns' set from
    `teams_data` for each team.
    """
    game = pd.Series([0.0] * len(ind), index=ind)

    for slot, team in enumerate(teams):
        side = 'Vis' if slot else 'Home'
        for stat in ('AvgErr', 'PlsMns'):
            game[side + stat] = teams_data.loc[team, stat]

    return game

def add_spreadBiases(scores):
    """Add per-team spread-bias columns to every game of one frame.

    The frame is split by 'Week'. For each week group after the first, the
    team biases are recomputed from all earlier week groups with
    update_teamSpreadBiases, and each game in that week receives HomeAvgErr,
    HomePlsMns, VisAvgErr and VisPlsMns from spreads(). Games in the first
    week group keep the initial value 0.

    Parameters
    ----------
    scores : DataFrame with HomeID, VisID, Week, Spread and the eight
        per-casino Spread* columns.

    Returns
    -------
    A new DataFrame rebuilt by concatenating the week groups in groupby order.

    Notes
    -----
    * Side effect kept from the original: the SpreadMed column is written onto
      the frame passed in by the caller.
    * groupby('Week') uses pandas defaults, so rows whose Week is NaN do not
      end up in any group and are absent from the result.
    """
    teams = pd.DataFrame(np.union1d(scores['HomeID'].unique(), scores['VisID'].unique()), columns=['Team'])
    teams['AvgErr'] = np.nan
    teams.set_index('Team', inplace=True)

    scores['SpreadMed'] = scores[['SpreadMirage','SpreadPinnacle','SpreadSportsbet','SpreadWestgate','SpreadStation','SpreadSIA','SpreadSBG', 'SpreadWag']].median(axis=1)
    scores = scores.assign(HomeAvgErr=0,HomePlsMns=0,
                           VisAvgErr=0,VisPlsMns=0) #init features in df
    ind = ['HomeAvgErr','HomePlsMns','VisAvgErr','VisPlsMns'] #features to add

    # Copy each week group so the column assignment below writes to an
    # independent frame rather than to a slice produced by groupby.
    scores_list = [group.copy() for _, group in scores.groupby('Week')] #split df by week

    if not scores_list:
        # No week groups (empty input): nothing to compute, and pd.concat([]) would raise.
        return scores

    for i in range(1, len(scores_list)): #iterate through weeks, ignore first week
        prev_weeks = pd.concat(scores_list[:i]) #get df of prev weeks
        teams_data = update_teamSpreadBiases(teams, prev_weeks)
        scores_list[i][ind] = scores_list[i].apply(lambda x: spreads((x['HomeID'],x['VisID']),ind,teams_data), axis = 1)
    scores = pd.concat(scores_list) #rebuild df

    return scores

In [38]:
# NOTE(review): `train2013` is not defined in any cell visible here -- confirm
# where it comes from before relying on this cell after a kernel restart.
# Build a frame indexed by every team id that appears as HomeID or VisID, then
# fill its AvgErr / PlsMns columns from all of train2013.
team_ids = np.union1d(train2013['HomeID'].unique(), train2013['VisID'].unique())
teams = pd.DataFrame({'Team': team_ids, 'AvgErr': np.nan}).set_index('Team')
teams = update_teamSpreadBiases(teams, train2013)

In [99]:
odds = pd.read_csv('data/oddsAdjusted.csv')

# One slot per season file (2013 through 2016), filled in order below.
scores_list = [0] * 4

for i, year in enumerate(range(2013, 2017)):
    season = pd.read_csv('data/scores_pe/scores_pythElo{}.csv'.format(year))
    snoozle_odds = pd.read_csv('data/snoozle/odds_fixed/odds{}.csv'.format(year))
    snoozle_stats = pd.read_csv('data/snoozle/stats_fixed/stats{}.csv'.format(year))

    # Left-join the two snoozle tables on team ids + date; their Home/Visiter
    # name columns are dropped before the join.
    for extra in (snoozle_odds, snoozle_stats):
        season = pd.merge(season, extra.drop(['Home','Visiter'], axis=1), on=['HomeID', 'VisID', 'Month', 'Day', 'Year'], how='left')

    # The odds table is joined on date + the Home/Visiter columns instead of ids.
    season = pd.merge(season, odds, on=['Year','Month','Day','Home','Visiter'], how='left')

    # SpreadMed = row-wise median over the eight per-casino spread columns.
    season['SpreadMed'] = season[['SpreadMirage','SpreadPinnacle','SpreadSportsbet','SpreadWestgate','SpreadStation','SpreadSIA','SpreadSBG', 'SpreadWag']].median(axis=1)
    scores_list[i] = season



In [47]:
# Columns exported per game: the 'User' key followed by one column per casino.
# Disabled variant that also exported EloSpread:
#before_ind = ['User','SpreadMirage','SpreadPinnacle','SpreadSportsbet','SpreadWestgate','SpreadStation','SpreadSIA','SpreadSBG','SpreadWag','EloSpread']
spreads_ind = ['SpreadMirage','SpreadPinnacle','SpreadSportsbet','SpreadWestgate','SpreadStation','SpreadSIA','SpreadSBG','SpreadWag']
before_ind = ['User'] + spreads_ind

# Disabled variant of the key that included month/day and ordered the teams:
#def format_user(month, day, home, vis, home_pyth, vis_pyth):
#    if home_pyth - vis_pyth >= 0:
#        return '{}-{}-{}|{}'.format(month, day, home, vis)
#    else:
#        return '{}-{}-{}|{}'.format(month, day, vis, home)

# Write one long-format (User, Item, value) CSV per season.
for i, year in enumerate(range(2013, 2017)):
    season = scores_list[i]
    #user_key = season.apply(lambda x: format_user(x['Month'], x['Day'], x['HomeID'],x['VisID'],x['HomePythWins'],x['VisPythWins']), axis=1)
    user_key = season.apply(lambda row: '{}|{}'.format(row['HomeID'], row['VisID']), axis=1)
    before_rec = (
        season.assign(User=user_key)[before_ind]
        .replace(0, np.nan)   # zeros in the exported columns are turned into NaN
        .melt(id_vars='User', var_name='Item')
    )
    #before_rec['value'] = before_rec['value'].abs()
    before_rec.to_csv('data/rec/before_dubIndex/data{}.csv'.format(year), index=False)


In [49]:
# One slot per season for the pivoted ratings (rec_list) and for the merged frames (c_list).
rec_list, c_list = [0] * 4, [0] * 4
casinos = ['Spread' + casino for casino in ['Mirage','Pinnacle','Sportsbet','Westgate','Station','SIA','SBG','Wag']]
#casinos.append('EloSpread')
#after_ind = ['Month', 'Day', 'HomeID', 'VisID']
after_ind = ['HomeID', 'VisID']

'''def unformat_user(user, value, scores):
    split1 = user.split('-')
    split2 = split1[2].split('|')
    month, day = int(float(split1[0])), int(float(split1[1]))
    tm1, tm2 = int(float(split2[0])), int(float(split2[1]))
    value = int(value)
    dates = scores.loc[(scores['Month']==month)&(scores['Day']==day)]
    if dates[(dates['HomeID']==tm1)&(dates['VisID']==tm2)].shape[0] == 0:
        return pd.Series([month, day, tm2, tm1, value], index=['Month', 'Day', 'VisID', 'HomeID', 'rating'])
    else:
        return pd.Series([month, day, tm1, tm2, -value], index=['Month', 'Day', 'VisID', 'HomeID', 'rating'])
'''

for i, year in enumerate(range(2013, 2017)):
    train_recs = pd.read_csv('data/rec/after_dubIndex/data{}.csv'.format(year))
    #train_recs = train_recs.assign(Month=0, Day=0, HomeID=0, VisID=0)
    #train_recs[['Month','Day','HomeID','VisID','rating']] = train_recs.apply(lambda x: unformat_user(x['user'], x['rating'], scores_list[i]), axis=1)
    #train_recs.drop('user', axis=1)

    # Split the 'home|vis' key in the `user` column back into two integer id columns.
    home_ids = train_recs.apply(lambda x: int(x['user'].split('|')[0]), axis=1)
    vis_ids = train_recs.apply(lambda x: int(x['user'].split('|')[1]), axis=1)
    train_recs = train_recs.assign(HomeID=home_ids, VisID=vis_ids)

    # Long -> wide: one row per (HomeID, VisID), one column per `item`.
    wide = train_recs.pivot_table(values='rating', index=after_ind, columns='item')
    rec = pd.DataFrame(wide.to_records())
    rec['SpreadRec'] = rec[casinos].median(axis=1)
    rec_list[i] = rec.drop(casinos, axis=1)

    # Left-join onto the season frame; SpreadRec fills SpreadMed only where SpreadMed is NaN.
    # NOTE(review): the join key is (HomeID, VisID) only, with no date -- confirm
    # a pairing cannot occur twice in one season file.
    merged = pd.merge(scores_list[i], rec_list[i], on=after_ind, how='left')
    merged['SpreadMed'] = merged['SpreadMed'].fillna(merged['SpreadRec'])
    c_list[i] = merged

train2 = pd.concat(c_list[:3])
test2 = c_list[3]


In [86]:
# SpreadPyth = 2 * (HomePythWins - VisPythWins) + 3.5, added to both frames.
# NOTE(review): the factor 2 and the offset 3.5 are unexplained constants --
# presumably a points-per-win scale and a home-field adjustment; confirm.
for frame in (train2, test2):
    frame['SpreadPyth'] = (frame['HomePythWins'] - frame['VisPythWins']) * 2 + 3.5

model = LinearRegression()
model.fit(train2[['SpreadPyth']], train2[['Spread']])
preds = model.predict(test2[['SpreadPyth']])
# NOTE(review): the label says 'Filled Median' but the single feature is
# SpreadPyth; label kept unchanged so the recorded output still matches.
print('Filled Median: ', mean_squared_error(test2[['Spread']], preds))

Filled Median:  431.8453921968406


In [59]:
from sklearn.linear_model import *
from sklearn.kernel_ridge import * 
from sklearn.metrics import mean_squared_error

# Keep only rows where all three candidate features and Spread are non-null,
# so every model below is fitted and scored on the same rows.
train2Sk = train2.dropna(subset=['SpreadRec','SpreadMed','EloSpread','Spread'])[['SpreadRec','SpreadMed','EloSpread','Spread']]
test2Sk = test2.dropna(subset=['SpreadRec','SpreadMed','EloSpread','Spread'])[['SpreadRec','SpreadMed','EloSpread','Spread']]

print('train size:', len(train2Sk))
print('test size:', len(test2Sk))

# One single-feature linear regression per candidate; print the test-set MSE of each.
for label, col in (('Filled Median: ', 'SpreadMed'),
                   ('Filled Elo: ', 'EloSpread'),
                   ('Rec: ', 'SpreadRec')):
    model = LinearRegression().fit(train2Sk[[col]], train2Sk[['Spread']])
    preds = model.predict(test2Sk[[col]])
    print(label, mean_squared_error(test2Sk[['Spread']], preds))

train size: 2525
test size: 868
Filled Median:  275.76358859569933
Filled Elo:  317.42894488097085
Rec:  278.11026681974096


In [51]:
# NOTE(review): neither `scores2013` nor `add_ptsWinsGames` is defined in the
# cells visible here, and the cell overwrites its own input (x = f(x)), so it
# may not survive Restart & Run All -- confirm where both names come from.
scores2013 = add_ptsWinsGames(scores2013)