In [None]:
import numpy as np
import pandas as pd
import os
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from scipy.interpolate import UnivariateSpline
import statsmodels.api as sm
import matplotlib.pyplot as plt
import collections

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

In [None]:
# Root of the input mount and the Stage 2 folder that holds the competition CSVs.
data_dir = '../input/'
stage2_dir = data_dir + 'womens-machine-learning-competition-2019/Stage2WDataFiles/'

# Load the three source tables: tournament results, seeds, regular-season results.
tourney_results, seeds, regular_results = (
    pd.read_csv(stage2_dir + csv_name)
    for csv_name in (
        'WNCAATourneyDetailedResults.csv',
        'WNCAATourneySeeds.csv',
        'WRegularSeasonDetailedResults.csv',
    )
)

Discover and visualize the data

In [None]:
regular_results.info()

In [None]:
regular_results.head()

In [None]:
# Per-season averages of the numeric columns.
# numeric_only=True skips the string column WLoc, which pandas >= 2.0 would
# otherwise refuse to average (TypeError).
regular_results.groupby('Season').mean(numeric_only=True)

In [None]:
tourney_results.info()

In [None]:
tourney_results.head()

In [None]:
# Per-season averages of the numeric columns.
# numeric_only=True skips the string column WLoc, which pandas >= 2.0 would
# otherwise refuse to average (TypeError).
tourney_results.groupby('Season').mean(numeric_only=True)

In [None]:
seeds.head()

In [None]:
all(regular_results.columns == tourney_results.columns)

Prepare the data

In [None]:
# Same games with the loser's columns listed first and the winner's second,
# so that the later L -> T1_ / W -> T2_ renaming puts the loser in the T1 role.
swapped_column_order = [
    'Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT',
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']

# Explicit copy: the following cells write into this frame with .loc, which on
# a bare column selection triggers pandas' SettingWithCopyWarning.
regular_results_swap = regular_results[swapped_column_order].copy()

In [None]:
# Flip home/away for the loser's point of view. Both masks come from the
# untouched regular_results, so the second assignment cannot undo the first.
winner_at_home = regular_results['WLoc'] == 'H'
winner_away = regular_results['WLoc'] == 'A'
regular_results_swap.loc[winner_at_home, 'WLoc'] = 'A'
regular_results_swap.loc[winner_away, 'WLoc'] = 'H'

# Rename through the public API. Writing into `columns.values[6]` mutates the
# Index's backing array in place, which pandas does not support (an Index is
# meant to be immutable and caches its lookup tables).
regular_results = regular_results.rename(columns={'WLoc': 'location'})
regular_results_swap = regular_results_swap.rename(columns={'WLoc': 'location'})

In [None]:
regular_results.head()

In [None]:
regular_results_swap.head()

In [None]:
# Winner's view: W* columns become T1_*, L* columns become T2_*.
regular_results.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(regular_results.columns)]
# Loser's view: the mapping is reversed. The names are derived from the swap
# frame's OWN columns; the previous version read regular_results.columns, which
# had just been renamed and only produced the right labels by coincidence.
regular_results_swap.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(regular_results_swap.columns)]

In [None]:
regular_results.head()

In [None]:
regular_results_swap.head()

In [None]:
regular_data = pd.concat([regular_results, regular_results_swap]).sort_index().reset_index(drop = True)

In [None]:
regular_data.head(10)

In [None]:
# Reload the raw tables: the exploratory cells above renamed the columns of
# regular_results, while prepare_data below expects the original W*/L* names.
data_dir = '../input/'
stage2_dir = data_dir + 'womens-machine-learning-competition-2019/Stage2WDataFiles/'

tourney_results, seeds, regular_results = (
    pd.read_csv(stage2_dir + csv_name)
    for csv_name in (
        'WNCAATourneyDetailedResults.csv',
        'WNCAATourneySeeds.csv',
        'WRegularSeasonDetailedResults.csv',
    )
)

def prepare_data(df):
    """Return a symmetric, team-centric version of a detailed-results table.

    Every game in ``df`` appears twice in the result: once with the winner as
    team 1 (``T1_*`` columns) and the loser as team 2 (``T2_*``), and once with
    the roles reversed.

    Parameters
    ----------
    df : pandas.DataFrame
        Detailed results with ``W*`` / ``L*`` columns plus ``Season``,
        ``DayNum``, ``WLoc`` and ``NumOT``. It is not modified, so the function
        can be called repeatedly on the same frame (the previous version
        renamed the caller's columns in place and failed on a second call).

    Returns
    -------
    pandas.DataFrame
        Twice as many rows as ``df`` with a fresh 0..n-1 index, ``location``
        encoded from team 1's point of view (1 = home, -1 = away, 0 = neutral)
        and ``PointDiff`` = ``T1_Score - T2_Score``.
    """
    loser_first = [
        'Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT',
        'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
        'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']

    # Winner's view: rename() returns a new frame, leaving the caller's df intact.
    winner_view = df.rename(columns={'WLoc': 'location'})
    winner_view.columns = [c.replace('W', 'T1_').replace('L', 'T2_') for c in winner_view.columns]

    # Loser's view: explicit copy so the .loc writes below are unambiguous.
    loser_view = df[loser_first].copy()
    # Both masks come from the untouched df, so H->A and A->H cannot chain.
    loser_view.loc[df['WLoc'] == 'H', 'WLoc'] = 'A'
    loser_view.loc[df['WLoc'] == 'A', 'WLoc'] = 'H'
    loser_view = loser_view.rename(columns={'WLoc': 'location'})
    loser_view.columns = [c.replace('L', 'T1_').replace('W', 'T2_') for c in loser_view.columns]

    output = pd.concat([winner_view, loser_view]).reset_index(drop=True)
    # Any code other than N/H/A maps to NaN and makes astype(int) raise.
    output['location'] = output['location'].map({'N': 0, 'H': 1, 'A': -1}).astype(int)

    output['PointDiff'] = output['T1_Score'] - output['T2_Score']

    return output

In [None]:
regular_data = prepare_data(regular_results)
tourney_data = prepare_data(tourney_results)

In [None]:
regular_data.head()

In [None]:
tourney_data.head()

Feature engineering

In [None]:
tourney_data.columns

In [None]:
# Box-score columns whose per-team season averages become model features.
# Of the full box score this leaves out T1_Score, T2_Score, the free-throw and
# defensive-rebound columns of both teams, T1_Blk and T2_PF. (An earlier
# assignment of the full list was dead code: it was overwritten immediately.)
boxscore_cols = [
        'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_OR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_PF',
        'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_OR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk',
        'PointDiff']

# The string alias yields the same 'mean' column level as np.mean did, without
# the FutureWarning newer pandas emits when a NumPy callable is passed to agg.
funcs = ['mean']

In [None]:
season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs)
season_statistics.head()

In [None]:
season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs).reset_index()
season_statistics.head()

In [None]:
season_statistics.columns = [''.join(col).strip() for col in season_statistics.columns.values]
season_statistics.head()

In [None]:
def _team_prefixed(columns, prefix):
    """Relabel season-statistics columns for one side of a matchup.

    ``T1_x`` becomes ``<prefix>x`` (the team's own stat) and ``T2_x`` becomes
    ``<prefix>opponent_x`` (what its opponents did). The first column is the
    shared merge key and is forced back to ``"Season"``.
    """
    renamed = [prefix + name.replace("T1_", "").replace("T2_", "opponent_") for name in columns]
    # Set on the plain list instead of writing into `columns.values[0]`, which
    # mutates the Index's backing array in place (unsupported by pandas).
    renamed[0] = "Season"
    return renamed


season_statistics_T1 = season_statistics.copy()
season_statistics_T2 = season_statistics.copy()

season_statistics_T1.columns = _team_prefixed(season_statistics_T1.columns, "T1_")
season_statistics_T2.columns = _team_prefixed(season_statistics_T2.columns, "T2_")

In [None]:
season_statistics_T1.head()

In [None]:
season_statistics_T2.head()

In [None]:
tourney_data.head()

In [None]:
tourney_data = tourney_data[['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID' ,'T2_Score']]
tourney_data.head()

In [None]:
tourney_data = pd.merge(tourney_data, season_statistics_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, season_statistics_T2, on = ['Season', 'T2_TeamID'], how = 'left')

In [None]:
tourney_data.head()

In [None]:
# Games played after day 118 (presumably the last 14 days of the regular
# season, going by the feature name; confirm against the calendar).
recent_games = regular_data.loc[regular_data.DayNum > 118].reset_index(drop=True)

# Share of those games won by the team in the T1 role.
last14days_stats_T1 = (
    recent_games
    .assign(win=np.where(recent_games['PointDiff'] > 0, 1, 0))
    .groupby(['Season', 'T1_TeamID'])['win']
    .mean()
    .reset_index(name='T1_win_ratio_14d')
)

# Same ratio keyed by the T2 team: T2 wins when PointDiff is negative.
last14days_stats_T2 = (
    recent_games
    .assign(win=np.where(recent_games['PointDiff'] < 0, 1, 0))
    .groupby(['Season', 'T2_TeamID'])['win']
    .mean()
    .reset_index(name='T2_win_ratio_14d')
)

In [None]:
tourney_data = pd.merge(tourney_data, last14days_stats_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, last14days_stats_T2, on = ['Season', 'T2_TeamID'], how = 'left')

In [None]:
# Input for the team-quality GLM: one row per regular-season game orientation,
# reduced to the matchup and its point difference.
regular_season_effects = regular_data[['Season','T1_TeamID','T2_TeamID','PointDiff']].copy()
# IDs become strings, presumably so the GLM formula treats them as categorical
# levels rather than numbers.
regular_season_effects['T1_TeamID'] = regular_season_effects['T1_TeamID'].astype(str)
regular_season_effects['T2_TeamID'] = regular_season_effects['T2_TeamID'].astype(str)
# Binary target: 1 when T1 outscored T2.
regular_season_effects['win'] = np.where(regular_season_effects['PointDiff']>0,1,0)
# Self-join of the seeds table on Season: every ordered pair of seeded teams
# within a season (including a team paired with itself).
march_madness = pd.merge(seeds[['Season','TeamID']],seeds[['Season','TeamID']],on='Season')
march_madness.columns = ['Season', 'T1_TeamID', 'T2_TeamID']
march_madness.T1_TeamID = march_madness.T1_TeamID.astype(str)
march_madness.T2_TeamID = march_madness.T2_TeamID.astype(str)
# Inner merge: keep only regular-season games in which both teams were seeded
# for that season's tournament.
regular_season_effects = pd.merge(regular_season_effects, march_madness, on = ['Season','T1_TeamID','T2_TeamID'])
regular_season_effects.shape

In [None]:
def team_quality(season):
    """Fit a binomial GLM on one season's games and return per-team quality.

    The model regresses ``win`` on the T1 and T2 team identifiers (no
    intercept) using the rows of the notebook-level ``regular_season_effects``
    frame for ``season``.

    Returns a DataFrame with columns ``TeamID`` (int), ``quality`` (the
    exponentiated coefficient) and ``Season``, one row per T1 team term.
    """
    season_games = regular_season_effects.loc[regular_season_effects.Season == season, :]
    fitted = sm.GLM.from_formula(
        formula='win~-1+T1_TeamID+T2_TeamID',
        data=season_games,
        family=sm.families.Binomial(),
    ).fit()

    quality = pd.DataFrame(fitted.params).reset_index()
    quality.columns = ['TeamID', 'quality']
    quality['Season'] = season
    quality['quality'] = np.exp(quality['quality'])

    # Keep only the coefficients of the T1 terms.
    is_t1_term = quality.TeamID.str.contains('T1_')
    quality = quality.loc[is_t1_term].reset_index(drop=True)
    # Characters 10-13 of the term name; presumably the 4-digit team ID inside
    # a name shaped like 'T1_TeamID[3101]' (confirm against glm.params).
    quality['TeamID'] = quality['TeamID'].apply(lambda term: term[10:14]).astype(int)
    return quality

Hard Code for 15th and 16th seed

In [None]:
def _zero_quality_rows(team_id):
    """Return a TeamID/quality/Season frame giving ``team_id`` quality 0 for 2010-2019."""
    rows = [[team_id, 0, season] for season in range(2010, 2020)]
    return pd.DataFrame(np.array(rows), columns=['TeamID', 'quality', 'Season'])


# One hard-coded zero-quality frame per team.
t_3126_quality = _zero_quality_rows(3126)
t_3413_quality = _zero_quality_rows(3413)
t_3352_quality = _zero_quality_rows(3352)
t_3406_quality = _zero_quality_rows(3406)
t_3101_quality = _zero_quality_rows(3101)
t_3273_quality = _zero_quality_rows(3273)
t_3380_quality = _zero_quality_rows(3380)
t_3340_quality = _zero_quality_rows(3340)

In [None]:
# Teams whose fitted GLM estimate is discarded for every season.
_EXCLUDED_TEAM_IDS = [3126, 3413, 3352, 3406, 3101, 3273, 3380, 3340]


def _quality_without_excluded(season):
    """Fit team_quality for ``season`` and drop the rows of the excluded teams."""
    estimates = team_quality(season)
    return estimates[~estimates.TeamID.isin(_EXCLUDED_TEAM_IDS)]


team_quality_2010 = _quality_without_excluded(2010)
team_quality_2011 = _quality_without_excluded(2011)
team_quality_2012 = _quality_without_excluded(2012)
team_quality_2013 = _quality_without_excluded(2013)
team_quality_2014 = _quality_without_excluded(2014)
team_quality_2015 = _quality_without_excluded(2015)
team_quality_2016 = _quality_without_excluded(2016)
team_quality_2017 = _quality_without_excluded(2017)
team_quality_2018 = _quality_without_excluded(2018)
team_quality_2019 = _quality_without_excluded(2019)

In [None]:
# Fitted per-season estimates first, then the hard-coded zero-quality frames.
per_season_quality = [
    team_quality_2010, team_quality_2011, team_quality_2012, team_quality_2013, team_quality_2014,
    team_quality_2015, team_quality_2016, team_quality_2017, team_quality_2018, team_quality_2019,
]
hard_coded_quality = [
    t_3126_quality, t_3413_quality, t_3352_quality, t_3406_quality,
    t_3101_quality, t_3273_quality, t_3380_quality, t_3340_quality,
]

glm_quality = pd.concat(per_season_quality + hard_coded_quality).reset_index(drop=True)
glm_quality.info()

In [None]:
glm_quality_T1 = glm_quality.copy()
glm_quality_T2 = glm_quality.copy()
glm_quality_T1.columns = ['T1_TeamID','T1_quality','Season']
glm_quality_T2.columns = ['T2_TeamID','T2_quality','Season']

In [None]:
tourney_data = pd.merge(tourney_data, glm_quality_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, glm_quality_T2, on = ['Season', 'T2_TeamID'], how = 'left')

In [None]:
seeds.head()

In [None]:
seeds['seed'] = seeds['Seed'].apply(lambda x: int(x[1:3]))
seeds.head()

In [None]:
seeds_T1 = seeds[['Season','TeamID','seed']].copy()
seeds_T2 = seeds[['Season','TeamID','seed']].copy()
seeds_T1.columns = ['Season','T1_TeamID','T1_seed']
seeds_T2.columns = ['Season','T2_TeamID','T2_seed']

In [None]:
tourney_data = pd.merge(tourney_data, seeds_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, seeds_T2, on = ['Season', 'T2_TeamID'], how = 'left')

In [None]:
tourney_data["Seed_diff"] = tourney_data["T1_seed"] - tourney_data["T2_seed"]

Build some models

In [None]:
y = tourney_data['T1_Score'] - tourney_data['T2_Score']
y.describe()

In [None]:
# Feature tables in the order their columns enter the model matrix. The first
# two columns of each are its merge keys (Season and the team ID), so the
# features are everything from the third column on; the open-ended slice
# replaces the arbitrary [2:999] cap.
feature_sources = [
    season_statistics_T1, season_statistics_T2,
    seeds_T1, seeds_T2,
    last14days_stats_T1, last14days_stats_T2,
]
features = [col for frame in feature_sources for col in frame.columns[2:]]
features += ["Seed_diff", "T1_quality", "T2_quality"]

len(features)

In [None]:
X = tourney_data[features].values
dtrain = xgb.DMatrix(X, label = y)

In [None]:
def cauchyobj(preds, dtrain):
    """Custom xgboost objective: gradient and Hessian of a Cauchy-type loss.

    ``preds`` are the current predictions and ``dtrain`` supplies the labels
    through ``get_label()``. Returns ``(grad, hess)`` with respect to the
    prediction, computed from the residual ``preds - labels`` with scale 5000.
    """
    scale_sq = 5000 ** 2
    residual = preds - dtrain.get_label()
    resid_sq = residual ** 2
    grad = residual / (resid_sq / scale_sq + 1)
    hess = -scale_sq * (resid_sq - scale_sq) / (resid_sq + scale_sq) ** 2
    return grad, hess

In [None]:
# xgboost training parameters shared by the CV runs and the final models.
# (The entries were previously not comma-separated, which is a SyntaxError.)
# NOTE(review): recent xgboost releases use `verbosity` rather than `silent`;
# confirm against the installed version.
param = {
    'eval_metric': 'mae',
    'booster': 'gbtree',
    'eta': 0.05,              # change to ~0.02 for final run
    'subsample': 0.35,
    'colsample_bytree': 0.7,
    'num_parallel_tree': 3,   # recommend 10
    'min_child_weight': 40,
    'gamma': 10,
    'max_depth': 3,
    'silent': 1,
}

print(param)

In [None]:
# Repeated 5-fold cross-validation with the custom Cauchy objective. Each
# repeat reshuffles the folds (random_state = i) and stores the history that
# xgb.cv returns; early stopping ends a run after 25 rounds without improvement.
xgb_cv = []
# NOTE(review): the trailing comment recommends 10 repeats but the value is
# 100; confirm which is intended.
repeat_cv = 100 # recommend 10

for i in range(repeat_cv): 
    print(f"Fold repeater {i}")
    xgb_cv.append(
        xgb.cv(
          params = param,
          dtrain = dtrain,
          obj = cauchyobj,
          num_boost_round = 3000,
          folds = KFold(n_splits = 5, shuffle = True, random_state = i),
          early_stopping_rounds = 25,
          verbose_eval = 50
        )
    )

In [None]:
# Row k of an xgb.cv history holds the metric after k + 1 boosting rounds, so
# the best NUMBER of rounds is argmin + 1. Using the bare 0-based index trained
# one round too few, and would request zero rounds if the first row were best.
iteration_counts = [int(np.argmin(x['test-mae-mean'].values)) + 1 for x in xgb_cv]
# Best (lowest) mean validation MAE of each repeat.
val_mae = [np.min(x['test-mae-mean'].values) for x in xgb_cv]
iteration_counts, val_mae

In [None]:
# Out-of-fold predictions: for each CV repeat, every game is predicted by a
# model trained on the other four folds of that repeat.
# NOTE(review): unlike the xgb.cv call, xgb.train is given no custom `obj`, so
# xgboost's default objective is used here; confirm that this is intended.
oof_preds = []
for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    # Float buffer. Writing float predictions into an int64 copy of y relied on
    # silent upcasting, which newer pandas deprecates.
    preds = pd.Series(np.nan, index=y.index, dtype=float)
    kfold = KFold(n_splits = 5, shuffle = True, random_state = i)
    for train_index, val_index in kfold.split(X, y):
        # KFold yields positional indices, so index y by position (.iloc).
        dtrain_i = xgb.DMatrix(X[train_index], label = y.iloc[train_index])
        dval_i = xgb.DMatrix(X[val_index], label = y.iloc[val_index])
        model = xgb.train(
              params = param,
              dtrain = dtrain_i,
              num_boost_round = iteration_counts[i],
              verbose_eval = 50
        )
        preds.iloc[val_index] = model.predict(dval_i)
    # Cap predicted point differences at +/-30 before calibration.
    oof_preds.append(np.clip(preds,-30,30))

In [None]:
plot_df = pd.DataFrame({"pred":oof_preds[0], "label":np.where(y>0,1,0)})
plot_df["pred_int"] = plot_df["pred"].astype(int)
plot_df = plot_df.groupby('pred_int')['label'].mean().reset_index(name='average_win_pct')

plt.figure()
plt.plot(plot_df.pred_int,plot_df.average_win_pct)

In [None]:
# Calibrate point-difference predictions to win probabilities with one spline
# per CV repeat and report the raw log loss.
spline_model = []
win_labels = np.where(y > 0, 1, 0)

for i in range(repeat_cv):
    # Sort by prediction, then collapse to one label per distinct prediction
    # (the last one in sorted order wins); dict keeps the sorted key order.
    pred_to_label = dict(sorted(zip(oof_preds[i], win_labels), key=lambda pair: pair[0]))

    spline_model.append(UnivariateSpline(list(pred_to_label.keys()), list(pred_to_label.values())))
    spline_fit = spline_model[i](oof_preds[i])

    print(f"logloss of cvsplit {i}: {log_loss(win_labels, spline_fit)}")

In [None]:
# Compare the spline's predicted win probability with the observed win rate,
# bucketed by the integer part of the predicted point difference.
plot_df = pd.DataFrame({"pred":oof_preds[0], "label":np.where(y>0,1,0), "spline":spline_model[0](oof_preds[0])})
plot_df["pred_int"] = (plot_df["pred"]).astype(int)
# Select several columns with a list: the tuple form ['spline','label'] was
# removed in pandas 2.0.
plot_df = plot_df.groupby('pred_int')[['spline','label']].mean().reset_index()

plt.figure()
plt.plot(plot_df.pred_int, plot_df.spline, label='spline')
plt.plot(plot_df.pred_int, plot_df.label, label='observed win rate')
plt.xlabel('predicted point difference (integer part)')
plt.ylabel('win probability')
plt.legend();

In [None]:
# Same calibration as before, with probabilities clipped to [0.025, 0.975]
# before scoring.
spline_model = []
win_labels = np.where(y > 0, 1, 0)

for i in range(repeat_cv):
    # One label per distinct prediction, in ascending prediction order.
    pred_to_label = dict(sorted(zip(oof_preds[i], win_labels), key=lambda pair: pair[0]))
    spline_model.append(UnivariateSpline(list(pred_to_label.keys()), list(pred_to_label.values())))
    spline_fit = np.clip(spline_model[i](oof_preds[i]), 0.025, 0.975)

    print(f"adjusted logloss of cvsplit {i}: {log_loss(win_labels, spline_fit)}")

In [None]:
# Clipped calibration plus a hard override: seeds 1-4 are given probability 1
# against seeds 16-13 respectively (and 0 in the mirrored orientation),
# regardless of the actual result.
spline_model = []
win_labels = np.where(y > 0, 1, 0)

t1_seed, t2_seed = tourney_data.T1_seed, tourney_data.T2_seed
forced_win = pd.Series(False, index=tourney_data.index)
forced_loss = pd.Series(False, index=tourney_data.index)
for strong, weak in ((1, 16), (2, 15), (3, 14), (4, 13)):
    forced_win |= (t1_seed == strong) & (t2_seed == weak)
    forced_loss |= (t1_seed == weak) & (t2_seed == strong)

for i in range(repeat_cv):
    # One label per distinct prediction, in ascending prediction order.
    pred_to_label = dict(sorted(zip(oof_preds[i], win_labels), key=lambda pair: pair[0]))
    spline_model.append(UnivariateSpline(list(pred_to_label.keys()), list(pred_to_label.values())))
    spline_fit = np.clip(spline_model[i](oof_preds[i]), 0.025, 0.975)
    spline_fit[forced_win] = 1.0
    spline_fit[forced_loss] = 0.0

    print(f"adjusted logloss of cvsplit {i}: {log_loss(win_labels, spline_fit)}")

In [None]:
#looking for upsets
pd.concat(
    [tourney_data[(tourney_data.T1_seed==1) & (tourney_data.T2_seed==16) & (tourney_data.T1_Score < tourney_data.T2_Score)],
     tourney_data[(tourney_data.T1_seed==2) & (tourney_data.T2_seed==15) & (tourney_data.T1_Score < tourney_data.T2_Score)],
     tourney_data[(tourney_data.T1_seed==3) & (tourney_data.T2_seed==14) & (tourney_data.T1_Score < tourney_data.T2_Score)],
     tourney_data[(tourney_data.T1_seed==4) & (tourney_data.T2_seed==13) & (tourney_data.T1_Score < tourney_data.T2_Score)],
     tourney_data[(tourney_data.T1_seed==16) & (tourney_data.T2_seed==1) & (tourney_data.T1_Score > tourney_data.T2_Score)],
     tourney_data[(tourney_data.T1_seed==15) & (tourney_data.T2_seed==2) & (tourney_data.T1_Score > tourney_data.T2_Score)],
     tourney_data[(tourney_data.T1_seed==14) & (tourney_data.T2_seed==3) & (tourney_data.T1_Score > tourney_data.T2_Score)],
     tourney_data[(tourney_data.T1_seed==13) & (tourney_data.T2_seed==4) & (tourney_data.T1_Score > tourney_data.T2_Score)]]
)

In [None]:
# Clipped calibration with the seed override applied only to games where the
# favourite actually won. The masks read the true scores, so this log loss is
# computed with knowledge of the outcome.
spline_model = []
win_labels = np.where(y > 0, 1, 0)

t1_seed, t2_seed = tourney_data.T1_seed, tourney_data.T2_seed
t1_won = tourney_data.T1_Score > tourney_data.T2_Score
t1_lost = tourney_data.T1_Score < tourney_data.T2_Score
forced_win = pd.Series(False, index=tourney_data.index)
forced_loss = pd.Series(False, index=tourney_data.index)
for strong, weak in ((1, 16), (2, 15), (3, 14), (4, 13)):
    forced_win |= (t1_seed == strong) & (t2_seed == weak) & t1_won
    forced_loss |= (t1_seed == weak) & (t2_seed == strong) & t1_lost

for i in range(repeat_cv):
    # One label per distinct prediction, in ascending prediction order.
    pred_to_label = dict(sorted(zip(oof_preds[i], win_labels), key=lambda pair: pair[0]))
    spline_model.append(UnivariateSpline(list(pred_to_label.keys()), list(pred_to_label.values())))
    spline_fit = np.clip(spline_model[i](oof_preds[i]), 0.025, 0.975)
    spline_fit[forced_win] = 1.0
    spline_fit[forced_loss] = 0.0

    print(f"adjusted logloss of cvsplit {i}: {log_loss(win_labels, spline_fit)}")

In [None]:
# Final calibration pass: refit the splines (these are the ones later applied
# to the submission), apply the outcome-aware seed override, and collect the
# per-game predictions so that log loss can be reported per season.
val_cv = []
spline_model = []
win_labels = np.where(y > 0, 1, 0)

t1_seed, t2_seed = tourney_data.T1_seed, tourney_data.T2_seed
t1_won = tourney_data.T1_Score > tourney_data.T2_Score
t1_lost = tourney_data.T1_Score < tourney_data.T2_Score
forced_win = pd.Series(False, index=tourney_data.index)
forced_loss = pd.Series(False, index=tourney_data.index)
for strong, weak in ((1, 16), (2, 15), (3, 14), (4, 13)):
    forced_win |= (t1_seed == strong) & (t2_seed == weak) & t1_won
    forced_loss |= (t1_seed == weak) & (t2_seed == strong) & t1_lost

for i in range(repeat_cv):
    # One label per distinct prediction, in ascending prediction order.
    pred_to_label = dict(sorted(zip(oof_preds[i], win_labels), key=lambda pair: pair[0]))
    spline_model.append(UnivariateSpline(list(pred_to_label.keys()), list(pred_to_label.values())))
    spline_fit = np.clip(spline_model[i](oof_preds[i]), 0.025, 0.975)
    spline_fit[forced_win] = 1.0
    spline_fit[forced_loss] = 0.0

    val_cv.append(pd.DataFrame({"y": win_labels, "pred": spline_fit, "season": tourney_data.Season}))
    print(f"adjusted logloss of cvsplit {i}: {log_loss(win_labels, spline_fit)}")

val_cv = pd.concat(val_cv)
val_cv.groupby('season').apply(lambda x: log_loss(x.y, x.pred))

In [None]:
sub = pd.read_csv('../input/womens-machine-learning-competition-2019/WSampleSubmissionStage2.csv')
sub.info()

In [None]:
sub["Season"] = 2019
sub["T1_TeamID"] = sub["ID"].apply(lambda x: x[5:9]).astype(int)
sub["T2_TeamID"] = sub["ID"].apply(lambda x: x[10:14]).astype(int)
sub.info()

In [None]:
# Left joins, as in the training merges: an inner join would silently drop a
# submission row whenever a team has no matching feature row.
sub = pd.merge(sub, season_statistics_T1, on = ['Season', 'T1_TeamID'], how = 'left')
sub = pd.merge(sub, season_statistics_T2, on = ['Season', 'T2_TeamID'], how = 'left')
sub.info()

In [None]:
# Left joins, as in the training merges: an inner join would silently drop a
# submission row for any team without a quality estimate.
sub = pd.merge(sub, glm_quality_T1, on = ['Season', 'T1_TeamID'], how = 'left')
sub = pd.merge(sub, glm_quality_T2, on = ['Season', 'T2_TeamID'], how = 'left')
sub.info()

In [None]:
# Left joins, as in the training merges: an inner join would silently drop a
# submission row for a team with no seed or no games in the late-season window.
sub = pd.merge(sub, seeds_T1, on = ['Season', 'T1_TeamID'], how = 'left')
sub = pd.merge(sub, seeds_T2, on = ['Season', 'T2_TeamID'], how = 'left')
sub = pd.merge(sub, last14days_stats_T1, on = ['Season', 'T1_TeamID'], how = 'left')
sub = pd.merge(sub, last14days_stats_T2, on = ['Season', 'T2_TeamID'], how = 'left')
sub["Seed_diff"] = sub["T1_seed"] - sub["T2_seed"]
sub.info()

In [None]:
Xsub = sub[features].values
dtest = xgb.DMatrix(Xsub)

In [None]:
# Final models: one per CV repeat, trained on the full training matrix with 5%
# more boosting rounds than that repeat's CV-selected count.
# NOTE(review): no custom `obj` is passed here (the xgb.cv call uses
# cauchyobj), so xgboost's default objective applies; confirm this is intended.
sub_models = []
for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    sub_models.append(
        xgb.train(
          params = param,
          dtrain = dtrain,
          num_boost_round = int(iteration_counts[i] * 1.05),
          verbose_eval = 50
        )
    )

In [None]:
# For each repeat: predict the point difference, cap it at +/-30, map it to a
# win probability with that repeat's spline, and clip to [0.025, 0.975].
sub_preds = [
    np.clip(spline_model[i](np.clip(sub_models[i].predict(dtest), -30, 30)), 0.025, 0.975)
    for i in range(repeat_cv)
]

# Average the calibrated probabilities over all repeats.
sub["Pred"] = pd.DataFrame(sub_preds).mean(axis=0)

# Hard overrides for the most lopsided seed pairings.
lopsided_pairings = ((1, 16), (2, 15), (3, 14), (4, 13))
for strong, weak in lopsided_pairings:
    sub.loc[(sub.T1_seed == strong) & (sub.T2_seed == weak), 'Pred'] = 1.0
for strong, weak in lopsided_pairings:
    sub.loc[(sub.T1_seed == weak) & (sub.T2_seed == strong), 'Pred'] = 0.0

sub[['ID','Pred']].to_csv("submission.csv", index = None)