In [1]:
%load_ext autoreload
%autoreload 2

In [11]:
import pymc3 as pm
import numpy as np
import pandas as pd
import scipy.stats
import time

import RatingsHelper as helper

from scipy import optimize
import theano as thno
import theano.tensor as T

# data retrieval helper module
from general.DB import DB
import util

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, SGDRegressor

# plotting libraries
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns
%matplotlib inline

In [3]:
# get the data we need from the database
year = 2015
all_teams = pd.read_sql("SELECT ncaa, ncaaid, kenpom, conf FROM teams", DB.conn)
games_df, stacked_games_df, teams = util.get_data(year)
teams['i_team'] = np.arange(teams.shape[0])
games_df = games_df.sort('dt').reset_index().drop('index', 1)  # sort by date ascending
stacked_games_df = stacked_games_df.sort('dt').reset_index().drop('index', 1)
kenpom = pd.read_sql("SELECT team, adjo, adjd FROM kenpom_ranks WHERE year = %s" % year, DB.conn)
teams = teams.merge(all_teams, left_on='team_id', right_on='ncaaid')
teams = teams.merge(kenpom, how='left', left_on='kenpom', right_on='team').drop(['team', 'kenpom', 'ncaaid'], 1)
num_teams = teams.shape[0]
print("Got data for %s games and %s teams, between %s and %s" % (games_df.shape[0], num_teams,
                                                   games_df['dt'].min(), games_df['dt'].max()))

Got data for 5279 games and 351 teams, between 2014-11-16 and 2015-04-06


In [88]:
def get_home(neutral, teamid, homeid):
    if neutral:
        return 0
    elif teamid == homeid:
        return 1
    else:
        return -1
stacked_games_df['home'] = stacked_games_df.apply(lambda row: get_home(row.neutral, row.team_id, row.hteam_id), 1)
feature_df = stacked_games_df[['dt', 'i_team', 'i_opp', 'home', 'ppp']]
off_dummies = pd.get_dummies(feature_df['i_team'].astype(int), prefix='off')
def_dummies = pd.get_dummies(feature_df['i_opp'].astype(int), prefix='def')
glm_df = pd.concat([feature_df, helper.constrained_dummies(off_dummies), 
                    helper.constrained_dummies(def_dummies)], 1)
glm_df['const'] = 1
glm_df.set_index(pd.DatetimeIndex(glm_df['dt']), inplace=True)

In [89]:
feature_columns = ['off_%s' % i for i in range(num_teams - 1)] + ['def_%s' % i for i in range(num_teams - 1)] + \
['home'] + ['const']
X = glm_df[feature_columns].values
y = glm_df['ppp'].values


In [97]:
glm_df.loc[glm_df.index < '2014-11-30'].shape

(1410, 706)

In [17]:
print(t1 - t0)

14.929063081741333


In [59]:
sgd = SGDRegressor(n_iter=500, penalty='none', shuffle=True, 
                   )
sgd.fit(X, y)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=500, penalty='none', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

In [60]:
sgd.intercept_

array([ 0.51113257])

In [21]:
lr = LinearRegression()
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [44]:
lr.intercept_

1.0215353429856175

In [61]:
def obj(beta, X, y):
    return np.sum((np.dot(X, beta) - y)**2)

In [77]:
beta = np.zeros(X.shape[1])
beta[-1] = 1
beta_hat = optimize.fmin_powell(obj, lr.coef_, args=(X[:1200], y[:1200]), maxiter=5)



In [78]:
beta_hat

array([ -7.77126808e-02,   2.87230414e-02,  -2.89218370e-01,
        -1.07737888e-01,   1.53388575e-01,  -1.06978829e-01,
         9.35160443e-02,  -1.08783557e-01,  -5.31662583e-02,
        -5.31801918e-02,  -4.67064509e-02,   7.91963243e-02,
        -1.55709590e-01,   3.18093195e-01,  -9.77453723e-02,
         1.31005640e-02,  -8.64398906e-02,   5.30447807e-02,
        -1.62646048e-01,   1.72253572e-01,  -2.01107309e-01,
        -1.41703332e-01,   1.06030910e-01,   4.60581321e-02,
         9.58505689e-02,  -1.32527510e-01,  -7.44707847e-02,
         1.61860242e-01,  -1.11282563e-01,  -1.01706008e-01,
        -2.24109094e-02,   1.39389453e-01,   1.87868280e-01,
         5.92224320e-02,  -1.76624362e-01,  -1.09692280e-02,
        -9.55277356e-02,  -6.73982329e-03,  -4.42304631e-02,
        -7.04598826e-02,   1.64937047e-01,   1.35386311e-01,
         5.42256593e-02,   4.56280554e-02,   1.73682858e-01,
        -2.15847042e-01,  -9.45746522e-02,  -9.42633312e-02,
        -1.91690952e-01,

In [79]:
lr = LinearRegression(fit_intercept=False)
lr.fit(X[:1200], y[:1200])

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [80]:
lr.coef_

array([ -1.47663517e-01,  -1.73098756e-02,  -2.82342981e-01,
        -1.30149389e-01,   1.43796090e-01,  -8.75234995e-02,
         1.14256594e-01,  -8.38241203e-02,  -2.14550011e-02,
        -4.26574747e-02,  -4.87006788e-02,   1.08282060e-01,
        -1.31633796e-01,   2.98117224e-01,  -6.29995449e-02,
        -3.88859528e-03,  -1.02967205e-01,   4.73493736e-02,
        -2.11331943e-01,   1.39538259e-01,  -2.01175520e-01,
        -1.68754316e-01,   8.15249466e-02,   5.74414820e-02,
         1.60127445e-01,  -2.18471484e-01,  -1.14072520e-01,
         1.62527347e-01,  -1.45942423e-01,  -4.20241918e-02,
        -1.23087835e-02,   5.26232936e-02,   1.70998473e-01,
         4.49143914e-02,  -1.87616679e-01,  -3.68693703e-02,
        -1.49567093e-01,  -1.18620579e-02,  -2.39653487e-02,
         4.72256247e-02,   1.91527280e-01,   1.66490486e-01,
         7.87258511e-02,  -1.06653748e-02,   1.41663340e-01,
        -2.17918271e-01,  -1.09633002e-01,  -1.02266466e-01,
        -2.02626272e-01,