In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pymc3 as pm
import numpy as np
import pandas as pd
import scipy.stats

import RatingsHelper as helper

from scipy import optimize
import theano as thno
import theano.tensor as T

# data retrieval helper module
from general.DB import DB
import util

import statsmodels.api as sm

# plotting libraries
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns
%matplotlib inline

In [60]:
# get the data we need from the database
year = 2015
all_teams = pd.read_sql("SELECT ncaa, ncaaid, kenpom, conf FROM teams", DB.conn)
games_df, stacked_games_df, teams = util.get_data(year)
teams['i_team'] = np.arange(teams.shape[0])
games_df = games_df.sort('dt').reset_index().drop('index', 1)  # sort by date ascending
stacked_games_df = stacked_games_df.sort('dt').reset_index().drop('index', 1)
kenpom = pd.read_sql("SELECT team, adjo, adjd FROM kenpom_ranks WHERE year = %s" % year, DB.conn)
teams = teams.merge(all_teams, left_on='team_id', right_on='ncaaid')
teams = teams.merge(kenpom, how='left', left_on='kenpom', right_on='team').drop(['team', 'kenpom', 'ncaaid'], 1)
num_teams = teams.shape[0]
print("Got data for %s games and %s teams, between %s and %s" % (games_df.shape[0], num_teams,
                                                   games_df['dt'].min(), games_df['dt'].max()))

Got data for 5279 games and 351 teams, between 2014-11-16 and 2015-04-06


In [61]:
def get_indices(unstacked, approx_burn_games, approx_interval):
    date_counts = unstacked.groupby('dt').count()['game_id']
    cum_indices = np.cumsum(date_counts).values
    next_ = cum_indices[-1]
    indices = []
    for gp in cum_indices[::-1]:
        if gp <= approx_burn_games:
            break
        if gp <= next_:
            indices.append(gp)
            next_ -= approx_interval
    return indices[::-1]

In [62]:
def get_home(neutral, teamid, homeid):
    if neutral:
        return 0
    elif teamid == homeid:
        return 1
    else:
        return -1
stacked_games_df['home'] = stacked_games_df.apply(lambda row: get_home(row.neutral, row.team_id, row.hteam_id), 1)
feature_df = stacked_games_df[['dt', 'i_team', 'i_opp', 'home', 'ppp']]
off_dummies = pd.get_dummies(feature_df['i_team'].astype(int), prefix='off')
def_dummies = pd.get_dummies(feature_df['i_opp'].astype(int), prefix='def')
glm_df = pd.concat([feature_df, helper.constrained_dummies(off_dummies), 
                    helper.constrained_dummies(def_dummies)], 1)
glm_df['const'] = 1

In [63]:
feature_columns = ['off_%s' % i for i in range(num_teams - 1)] + ['def_%s' % i for i in range(num_teams - 1)] + \
['home'] + ['const']
X = glm_df[feature_columns].values
y = glm_df['ppp'].values
res = sm.GLM(y, X).fit()

In [74]:
glm_df.head()

Unnamed: 0,dt,i_team,i_opp,home,ppp,off_0,off_1,off_2,off_3,off_4,...,def_341,def_342,def_343,def_344,def_345,def_346,def_347,def_348,def_349,const
0,2014-11-16,226,339.0,1,1.104338,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2014-11-16,77,173.0,1,1.022654,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2014-11-16,111,71.0,-1,0.658579,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,2014-11-16,122,259.0,-1,0.905874,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,2014-11-16,252,155.0,1,0.968858,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [64]:
home_factor = res.params[-2]

In [76]:
indices = get_indices(games_df, 800, 200)
prev_idx = indices[0]
results = []
all_home_indices = games_df['i_hteam'].values
all_away_indices = games_df['i_ateam'].values
all_home_off_ratings = np.ones(games_df.shape[0]) * -1
all_home_def_ratings = np.ones(games_df.shape[0]) * -1
all_away_off_ratings = np.ones(games_df.shape[0]) * -1
all_away_def_ratings = np.ones(games_df.shape[0]) * -1
all_intercepts = np.ones(games_df.shape[0]) * -1
all_home_factors = np.ones(games_df.shape[0]) * -1
for i, idx in enumerate(indices[1:]):
    # TODO: should not be using the home column here
    _X = X[:prev_idx * 2]
    _y = y[:prev_idx * 2]
    if i == 0:
        res = sm.GLM(_y, _X).fit()
    else:
        res = sm.GLM(_y, _X).fit(results[-1].params)
    results.append(res)
    params = helper.extract_coefs(res, num_teams)
    _home_indices = all_home_indices[prev_idx:idx]
    _away_indices = all_away_indices[prev_idx:idx]
    all_home_off_ratings[prev_idx:idx] = params['coefs']['off'][_home_indices]
    all_away_off_ratings[prev_idx:idx] = params['coefs']['off'][_away_indices]
    all_home_def_ratings[prev_idx:idx] = params['coefs']['def'][_home_indices]
    all_away_def_ratings[prev_idx:idx] = params['coefs']['def'][_away_indices]
    all_intercepts[prev_idx:idx] = params['coefs']['intercept']
    all_home_factors[prev_idx:idx] = params['coefs']['home']
    prev_idx = idx
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


In [66]:
games_df.head()

Unnamed: 0,game_id,dt,hteam,hteam_id,ateam,ateam_id,hpts,hposs,apts,aposs,i_hteam,i_ateam,hppp,appp,neutral,home_outcome,poss,season
0,3509779,2014-11-16,Miami (OH),414.0,Southern Utah,667.0,76,71.15,63,67.5,156,268,1.068166,0.933333,False,True,69.325,2015
1,3512375,2014-11-16,East Carolina,196.0,UNC Asheville,456.0,79,77.25,83,76.15,77,173,1.022654,1.089954,False,False,76.7,2015
2,3512373,2014-11-16,UCF,128.0,Stetson,678.0,64,62.825,55,64.125,49,274,1.018703,0.8577,False,True,63.475,2015
3,3510544,2014-11-16,Seton Hall,635.0,Mercer,406.0,63,65.025,47,63.75,252,155,0.968858,0.737255,False,True,64.3875,2015
4,3512441,2014-11-16,Virginia,746.0,Norfolk St.,485.0,67,58.075,39,56.275,308,189,1.153681,0.693025,False,True,57.175,2015


In [77]:
games_df['home_ortg'] = all_home_off_ratings
games_df['home_drtg'] = all_home_def_ratings
games_df['away_ortg'] = all_away_off_ratings
games_df['away_drtg'] = all_away_def_ratings
games_df['intercept'] = all_intercepts
games_df['home_factor'] = all_home_factors
games_df['hpredict'] = games_df['home_ortg'] + games_df['away_drtg'] + games_df['intercept'] + games_df['home_factor']
games_df['apredict'] = games_df['home_drtg'] + games_df['away_ortg'] + games_df['intercept'] + games_df['home_factor']

In [78]:
games_df['predict_outcome'] = (games_df['hpredict'] > games_df['apredict'])
games_df['correct'] = games_df['predict_outcome'] == games_df['home_outcome']

In [82]:
games_df['month'] = games_df.dt.map(lambda d: d.month)
gb = games_df[games_df['home_ortg'] != -1].groupby('month')

In [84]:
gb.mean()['correct']

month
1     0.681788
2     0.697134
3     0.686244
4     0.500000
12    0.703523
Name: correct, dtype: float64

In [68]:
home_off_ratings = np.ones(games_df.shape[0]) * -1
away_off_ratings = np.ones(games_df.shape[0]) * -1
home_def_ratings = np.ones(games_df.shape[0]) * -1
away_def_ratings = np.ones(games_df.shape[0]) * -1
intercepts = np.ones(games_df.shape[0]) * -1
home_factors = np.ones(games_df.shape[0]) * -1
for i, (lb, ub) in enumerate(list(zip(np.roll(indices, 1), indices))[1:]):
    params = helper.extract_coefs(results[i], num_teams)
    home_idx = games_df['i_hteam'].values[lb:ub]
    away_idx = games_df['i_ateam'].values[lb:ub]
    home_off_ratings[lb:ub] = params['coefs']['off'][home_idx]
    away_off_ratings[lb:ub] = params['coefs']['off'][away_idx]
    home_def_ratings[lb:ub] = params['coefs']['def'][home_idx]
    away_def_ratings[lb:ub] = params['coefs']['def'][away_idx]
    intercepts[lb:ub] = params['coefs']['intercept']
    home_factors[lb:ub] = params['coefs']['home']

In [70]:
games_df.tail()

Unnamed: 0,game_id,dt,hteam,hteam_id,ateam,ateam_id,hpts,hposs,apts,aposs,...,poss,season,home_ortg,home_drtg,away_ortg,away_drtg,intercept,home_factor,hpredict,apredict
5274,3837146,2015-04-02,Evansville,219.0,Northern Ariz.,501.0,71,65.825,65,66.6,...,66.2125,2015,0.009847,-0.028561,-0.018779,0.005113,1.020779,0.025062,1.060801,0.998501
5275,3837131,2015-04-02,Stanford,674.0,Miami (FL),415.0,66,70.025,64,68.45,...,69.2375,2015,0.102858,-0.055981,0.095772,-0.067714,1.020779,0.025062,1.080985,1.085631
5276,3839536,2015-04-04,Kentucky,334.0,Wisconsin,796.0,64,58.75,71,56.45,...,57.6,2015,0.181895,-0.23476,0.244855,-0.117121,1.020779,0.025062,1.110615,1.055935
5277,3839464,2015-04-04,Duke,193.0,Michigan St.,416.0,81,68.575,61,67.6,...,68.0875,2015,0.22752,-0.102068,0.130871,-0.111987,1.020779,0.025062,1.161375,1.074643
5278,3841113,2015-04-06,Duke,193.0,Wisconsin,796.0,68,59.5,63,59.75,...,59.625,2015,0.22752,-0.102068,0.244855,-0.117121,1.020779,0.025062,1.15624,1.188627


In [73]:
gb['correct']

month
1     0.681788
2     0.697134
3     0.686244
4     0.500000
12    0.703523
Name: correct, dtype: float64