In [36]:
import numpy as np
import pandas as pd
from scipy.stats import norm, zscore
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import *
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics import log_loss
import itertools
import utils.thutil

# Seed NumPy's global generator so the random draws made later in the
# notebook are repeatable from run to run.
np.random.seed(1)

# Base names of the per-team statistics used as model inputs. Each one is
# later expanded into a "<name> Winner" / "<name> Loser" column pair.
V2U = [
    'Seed',
    'Wins Last 10 Games',
    'Adj Def Efficiency',
    'Adj Off Efficiency',
    'Turnovers per game',
]

# Kernel hyperparameters. sig1/sig2 and ws are squared into kernel amplitudes,
# ls1/ls2 are the RBF length scales, and wl is squared into the white-noise
# level when the kernel is built.
# NOTE(review): the long decimals look like points of an evenly spaced search
# grid (multiples of 1/19) -- confirm where they came from.
sig1 = sig2 = 3.2
ls1, ls2 = 93 * 0.631578947368421, 93 * 0.3157894736842105
ws, wl = 1.8421052631578947, 2.3684210526315788

# Load all the game data: one CSV per season (2013-2014 through 2017-2018),
# stacked into a single frame with one row per game.
season_frames = []
for end_year in range(2014, 2018 + 1):
    year = f'{end_year - 1}-{end_year}'
    rep = pd.read_csv(f'data/{year}_combo.csv')
    # Normalise column names: trim them and collapse internal runs of
    # whitespace to a single space.
    rep = rep.rename(columns=lambda x: ' '.join(x.split()))
    # NOTE(review): a z-score of the numeric columns used to be computed here,
    # but its result was never assigned, so the frame was left unchanged. The
    # dead call is removed so the data stays exactly as the model has always
    # seen it. If standardisation is actually wanted, apply it to the feature
    # columns only (not the points columns) and re-check the kernel length
    # scales -- confirm with the author.
    # Regression target: point differential from the "Winner" side's view.
    rep['pdiff'] = rep['Winner Points'] - rep['Loser points']
    # Tag team names with the season so rows can be filtered by season later.
    rep['Winner'] = rep['Winner'] + '_' + year
    rep['Loser'] = rep['Loser'] + '_' + year
    season_frames.append(rep)
# ignore_index=True gives every game a unique row label. Without it each
# season keeps its own 0..n-1 labels, and label-based access such as
# game_data.at[i, col] would address one row from every season at once.
game_data = pd.concat(season_frames, ignore_index=True)

# Collect the base statistic names from the header of one season's raw team
# table, trimmed of surrounding whitespace. 'Team Name' is an identifier, not
# a statistic, so it is dropped from the list.
header = pd.read_csv('data/2014-2015_data.csv').columns
raw_names = [column.strip() for column in header]
raw_names.remove('Team Name')

# Flip some of the rows: for roughly half of the games, swap every
# "<stat> Winner" / "<stat> Loser" value pair and negate pdiff, so the target
# takes both signs. Only the statistics listed in raw_names and pdiff are
# touched; the team-name and points columns are left as they are.
#
# The swap is done positionally with a boolean mask instead of iterrows() and
# .at[label, col]. Label-based access hits every row that shares a label when
# the index is not unique (which happens after concatenating the per-season
# frames), so the old loop flipped groups of rows together.
# One uniform draw per row, taken in row order.
flip_mask = np.random.random(len(game_data)) > 0.5
for name in raw_names:
    vname_w = name + ' Winner'
    vname_l = name + ' Loser'
    # Snapshot both sides before writing so the swap never reads a value it
    # has already overwritten.
    winner_vals = game_data[vname_w].to_numpy(copy=True)
    loser_vals = game_data[vname_l].to_numpy(copy=True)
    game_data[vname_w] = np.where(flip_mask, loser_vals, winner_vals)
    game_data[vname_l] = np.where(flip_mask, winner_vals, loser_vals)
pdiff_vals = game_data['pdiff'].to_numpy(copy=True)
game_data['pdiff'] = np.where(flip_mask, -pdiff_vals, pdiff_vals)

# Expand each selected statistic into its two feature columns, keeping the
# "<stat> Winner" column immediately before the matching "<stat> Loser" one.
v2u_mod = [stat + side for stat in V2U for side in (' Winner', ' Loser')]

In [37]:
# Train the "clean" model
# Games whose season-tagged winner name ends in one of these years are left
# out of the training set.
# NOTE(review): the original applied the '2014' filter twice in a row; the
# second pass removed nothing. If one of them was meant to be a different
# year, add it here -- confirm with the author.
held_out_years = ('2014', '2016', '2018')
winner_names = game_data['Winner']
held_out = np.zeros(len(game_data), dtype=bool)
for year_suffix in held_out_years:
    # na=True treats rows with a missing winner name as held out, which is
    # what the previous `== False` comparison did with them.
    held_out |= winner_names.str.endswith(year_suffix, na=True).to_numpy(dtype=bool)
train = game_data[~held_out]

# Games to generate predictions for. Unlike the training frames, the column
# names of this file are used exactly as they appear in the CSV.
val = pd.read_csv('data/2018_2019_allgames.csv')
# NOTE(review): a z-score of the numeric columns used to be computed here, but
# its result was never assigned, so `val` was left unchanged. The dead call is
# removed. Applying it to every numeric column would presumably also rescale
# the "Kaggle ID ..." columns used to build the submission ids, so if
# standardisation is wanted, restrict it to the feature columns and treat the
# training data the same way -- confirm with the author.

# Model inputs are the paired "<stat> Winner" / "<stat> Loser" columns; the
# training target is the point differential.
X_train, y_train = train.loc[:, v2u_mod], train.loc[:, 'pdiff']
X_val = val.loc[:, v2u_mod]

# Covariance model: two scaled RBF components with different length scales
# plus a scaled white-noise term, added in that order.
amp1, amp2, amp_white = sig1 ** 2, sig2 ** 2, ws ** 2
rbf1 = amp1 * RBF(length_scale=ls1)
rbf2 = amp2 * RBF(length_scale=ls2)
white = amp_white * WhiteKernel(noise_level=wl ** 2)
kernel = (rbf1 + rbf2) + white

# Targets are normalised inside the regressor. Optimizer restarts are left at
# the library default (an earlier n_restarts_optimizer=10 setting is disabled).
# NOTE(review): with the default optimizer the values above presumably act as
# starting points for hyperparameter fitting -- confirm.
noisy_gp = GaussianProcessRegressor(kernel=kernel, normalize_y=True)

# Fit on the training games, then get the predictive mean and standard
# deviation of the point differential for every game in X_val.
noisy_gp.fit(X_train, y_train)
gp_y_pred, gp_sd = noisy_gp.predict(X_val, return_std=True)
# Probability that the point differential is positive, treating each
# prediction as Normal(gp_y_pred, gp_sd). This is the same quantity as
# 1 - norm.cdf(-z), but evaluated directly so that very small probabilities
# are not lost to cancellation when subtracting from 1.
y_pred_probs = norm.cdf(gp_y_pred / gp_sd)

# Build one id per game in the form "2019_<winner id>_<loser id>" and pair it
# with the predicted probability.
game_ids = '2019_' + val["Kaggle ID Winner"].map(str) + '_' + val["Kaggle ID Loser"].map(str)
outdf = pd.DataFrame({'game': game_ids, 'val': y_pred_probs})

# Written without a header row and without the index column.
outdf.to_csv('2019_clean_out.csv', header=False, index=False)