In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

[Original Notebook](https://www.kaggle.com/aburkard/4th-place-solution/output)

In [None]:
def load_data(filename, path='../input/ncaaw-march-mania-2021/WDataFiles_Stage2/', **kwargs):
    """Read one competition CSV into a DataFrame.

    Args:
        filename: Name of the CSV file inside ``path``.
        path: Directory (or URL prefix) holding the data files. A trailing
            slash is optional.
        **kwargs: Forwarded unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame returned by ``pd.read_csv``.
    """
    # Strip trailing slashes so the default path does not produce ".../WDataFiles_Stage2//file.csv".
    base = str(path).rstrip('/')
    return pd.read_csv(f"{base}/{filename}", **kwargs)

In [None]:
# Tag every game with its origin before stacking both result sets into one frame.
reg_df = load_data("WRegularSeasonDetailedResults.csv").assign(is_tourney=False)
tourney_df = load_data("WNCAATourneyDetailedResults.csv").assign(is_tourney=True)

# Regular-season rows come first; the index is renumbered from 0.
df = pd.concat([reg_df, tourney_df], ignore_index=True)

In [None]:
teams = load_data("WTeams.csv")
# TeamID -> TeamName lookup, used to label both sides of every game.
id_to_team = {team_id: name for team_id, name in teams[['TeamID', 'TeamName']].values}
for prefix in ('W', 'L'):
    df[f"{prefix}TeamName"] = df[f"{prefix}TeamID"].map(id_to_team)

In [None]:
seeds = load_data('WNCAATourneySeeds.csv')
# Characters 1-2 of the seed string are parsed as the numeric seed.
seeds['seed_num'] = seeds.Seed.str[1:3].astype(int)

# Attach the numeric seed for the W team, then for the L team (NaN when the team has no seed row).
seed_lookup = seeds[['Season', 'TeamID', 'seed_num']]
for team_col, seed_col in (('WTeamID', 'WTeam_seed'), ('LTeamID', 'LTeam_seed')):
    df = df.merge(
        seed_lookup.rename(columns={'seed_num': seed_col, 'TeamID': team_col}),
        how='left',
        on=['Season', team_col],
    )

### Add conference data

In [None]:
conf = load_data('WTeamConferences.csv')

# Left-join the conference abbreviation for the W team, then for the L team.
for team_col, conf_col in (('WTeamID', 'WTeam_ConfAbbrev'), ('LTeamID', 'LTeam_ConfAbbrev')):
    df = df.merge(
        conf.rename(columns={'ConfAbbrev': conf_col, 'TeamID': team_col}),
        how='left',
        on=['Season', team_col],
    )

In [None]:
# Conference abbreviations that are treated as "power" conferences.
power_confs = ['acc', 'big_east', 'sec', 'big_ten', 'big_twelve', 'pac_ten', 'pac_twelve']
df['WTeam_power_conf'] = df.WTeam_ConfAbbrev.isin(power_confs)
df['LTeam_power_conf'] = df.LTeam_ConfAbbrev.isin(power_confs)

# +1: only the W team is in a power conference, -1: only the L team, 0: both or neither.
# Cast with the builtin int: the np.int alias was deprecated in NumPy 1.20 and removed in 1.24,
# so astype(np.int) raises AttributeError on current NumPy.
df['diff_power_conf'] = df.WTeam_power_conf.astype(int) - df.LTeam_power_conf.astype(int)
df.diff_power_conf.value_counts()

### Get Round numbers

In [None]:
# Womens version
def get_round(row):
    """Map a game's season and day number to a round index.

    Args:
        row: Mapping (e.g. a DataFrame row) with 'Season' and 'DayNum' entries.

    Returns:
        The index (0-6) of the first day-number cutoff that ``DayNum`` does not
        exceed for the matching season range, or None when the season is
        outside every range or the day lies past the last cutoff.
    """
    season = row['Season']
    day_num = row['DayNum']

    # (first season, last season or None for open-ended, upper day bound for rounds 0..6)
    schedules = (
        (2017, None, (136, 138, 140, 145, 147, 151, 153)),
        (2015, 2016, (136, 138, 140, 145, 147, 153, 155)),
        (2003, 2014, (137, 139, 141, 146, 148, 153, 155)),
        (1998, 2002, (136, 138, 140, 145, 147, 151, 153)),
    )

    for first_season, last_season, cutoffs in schedules:
        in_range = season >= first_season and (last_season is None or season <= last_season)
        if not in_range:
            continue
        for round_num, last_day in enumerate(cutoffs):
            if day_num <= last_day:
                return round_num
    return None
    
# Only tournament rows receive a round; every other row keeps the None placeholder.
tourney_rows = df.is_tourney
df['round_num'] = None
df.loc[tourney_rows, 'round_num'] = df[tourney_rows].apply(get_round, axis=1)

## Recreate KenPom

In [None]:
# Constant flag: every row of df is marked as a win for the team in the W* columns.
df['WTeam_won'] = True

In [None]:
# https://kenpom.com/blog/the-possession/
# https://kenpom.com/blog/ratings-glossary/
Y = 0.475  # weight on free-throw attempts in the possession estimate

# Estimated possessions = (FGA - OR) + TO + Y * FTA, computed for the W side and then the L side.
for prefix in ('W', 'L'):
    fga, oreb, tov, fta = (df[f'{prefix}{stat}'] for stat in ('FGA', 'OR', 'TO', 'FTA'))
    df[f'{prefix}_est_possesions'] = (fga - oreb) + tov + (Y * fta)


In [None]:
# Column means of the two possession estimates across all games.
df[['W_est_possesions', 'L_est_possesions']].mean(axis=0)

In [None]:
df['score_margin'] = df.WScore - df.LScore
# Venue from the W team's side, as encoded by WLoc: H -> +1, N -> 0, A -> -1.
df['home_court'] = df.WLoc.map({'H': 1.0, 'N': 0.0, 'A': -1.0})

# Normalise the margin by the mean of both teams' estimated possessions in that game.
avg_possessions = df[['W_est_possesions', 'L_est_possesions']].mean(axis=1)
df['score_margin_per_poss'] = df.score_margin / avg_possessions
df['log_score_margin_per_poss'] = np.log(df.score_margin_per_poss)
df.head()

In [None]:
# Points per estimated possession for each side.
for prefix in ('W', 'L'):
    df[f'{prefix}_scoring_off'] = df[f'{prefix}Score'] / df[f'{prefix}_est_possesions']

# The denominator is stored separately; the ratings regression uses it as the sample weight.
for prefix in ('W', 'L'):
    df[f'{prefix}_scoring_off_denom'] = df[f'{prefix}_est_possesions']


In [None]:
# log(WOR) = log(W_coeff) + log(L_coeff)

# Rebound chances at each team's offensive end: its own offensive boards plus the
# opponent's defensive boards.
w_reb_chances = df.WOR + df.LDR
l_reb_chances = df.WDR + df.LOR

# Offensive rebound rate = offensive rebounds / chances.
df['W_reb_off'] = df.WOR / w_reb_chances
df['L_reb_off'] = df.LOR / l_reb_chances

df['W_reb_off_denom'] = w_reb_chances
df['L_reb_off_denom'] = l_reb_chances

In [None]:
# Three-point attempts are the denominator of the three-point rate.
for prefix in ('W', 'L'):
    df[f'{prefix}_3pt_off_denom'] = df[f'{prefix}FGA3']

# Makes / attempts; NaN results (e.g. 0 makes on 0 attempts) are stored as 0.
for prefix in ('W', 'L'):
    df[f'{prefix}_3pt_off'] = (df[f'{prefix}FGM3'] / df[f'{prefix}_3pt_off_denom']).fillna(0)

In [None]:
# Both sides share one denominator: the sum of the two possession estimates.
total_possessions = df.W_est_possesions + df.L_est_possesions

# Fouls drawn by a team are the personal fouls charged to its opponent.
df['W_draw_foul_off'] = df.LPF / total_possessions
df['L_draw_foul_off'] = df.WPF / total_possessions

df['W_draw_foul_off_denom'] = total_possessions
df['L_draw_foul_off_denom'] = total_possessions

In [None]:
# Weighted decay param

# Per-day decay rate; reused by the '*_weighted' metrics in the ratings loop.
lam = .01
# Days remaining until day 133 for games played on days 0, 66, 126 and 133.
arr= 133 - np.array([0, 66, 126, 133])
# Preview of the resulting weights exp(-lam * days_remaining).
np.exp(-lam * arr)

In [None]:
# Fit opponent-adjusted offense/defense ratings for every metric and season.
# Each regular-season game contributes two rows (one per team on offense). A no-intercept
# linear regression on one-hot offense/defense indicators plus home court is fit with
# per-row sample weights; its coefficients become the ratings.
from sklearn.linear_model import LinearRegression, SGDRegressor, BayesianRidge, ElasticNet
from sklearn.model_selection import train_test_split

# kenpom_ratings[metric][season]: '<team>_off' / '<team>_def' -> mean-centred coefficient,
# plus 'home_court' -> home-court coefficient.
kenpom_ratings = {}
# consistency_ratings[metric][season]: '<team>_off' / '<team>_def' -> mean squared residual.
consistency_ratings = {}
all_metrics = ['reb', '3pt', 'draw_foul', 'scoring', 'scoring_weighted']

for metric in all_metrics:
    print(metric)
    # '*_weighted' metrics reuse the base metric's columns and add recency weighting below.
    weighted = metric.endswith('_weighted')
    kenpom_ratings[metric] = {}
    consistency_ratings[metric] = {}
    for season in range(2010, 2022):
        print(season)

        # Regular-season games of this season only.
        season_df = df[(~df.is_tourney) & (df.Season==season)]
        win_one_hot = pd.get_dummies(season_df.WTeamID)
        loss_one_hot = pd.get_dummies(season_df.LTeamID)
        # Union of team ids seen on either side; both one-hot frames are aligned to it.
        team_ids = list(set(win_one_hot.columns).union(loss_one_hot.columns))
        
        win_one_hot = win_one_hot.reindex(columns=team_ids, fill_value=0)
        loss_one_hot = loss_one_hot.reindex(columns=team_ids, fill_value=0)
        
        
        # a1: the W team is on offense, the L team on defense.
        a1 = pd.concat([
                win_one_hot.rename(columns={c: f"{c}_off" for c in win_one_hot.columns}),
                loss_one_hot.rename(columns={c: f"{c}_def" for c in loss_one_hot.columns})
            ], axis=1)
        a1['offense_home_court'] = season_df.home_court
        a1['offense_won'] = True
        a1['offense_team'] = season_df.WTeamID
        a1['defense_team'] = season_df.LTeamID
        
        # a2: the mirrored view (L team on offense), with the home-court sign flipped.
        a2 = pd.concat([
                loss_one_hot.rename(columns={c: f"{c}_off" for c in loss_one_hot.columns}),
                win_one_hot.rename(columns={c: f"{c}_def" for c in win_one_hot.columns})
            ], axis=1)
        a2['offense_home_court'] = -season_df.home_court
        a2['offense_won'] = False
        a2['offense_team'] = season_df.LTeamID
        a2['defense_team'] = season_df.WTeamID
        
        # Every game index appears twice in one_hot (once from a1, once from a2), so the
        # join below yields two rows per game.
        one_hot = pd.concat([a1, a2], axis=0)

        season_df = season_df.join(one_hot)
        num_teams = len(team_ids)

        # Column order: all offense indicators, all defense indicators, then home court.
        # The coefficient slicing further down relies on this order.
        features = [f"{team_id}_off" for team_id in team_ids] + [f"{team_id}_def" for team_id in team_ids] + ['offense_home_court']
        X = season_df[features]
        y = np.zeros(len(X))
        
        # Target: the offense's value of the metric (W_* when the offense won, else L_*).
        metric_name = metric.replace('_weighted', '')
        y[season_df.offense_won] = season_df.loc[season_df.offense_won, f'W_{metric_name}_off']
        y[~season_df.offense_won] = season_df.loc[~season_df.offense_won, f'L_{metric_name}_off']
        
        reg = LinearRegression(fit_intercept=False)
        
        # Sample weight: the metric's denominator for the offense in that row.
        weights = np.zeros(len(X))
        weights[season_df.offense_won] = season_df.loc[season_df.offense_won, f'W_{metric_name}_off_denom']
        weights[~season_df.offense_won] = season_df.loc[~season_df.offense_won, f'L_{metric_name}_off_denom']
        
        if weighted:
            # Scale weights by exp(-lam * days before day 133), so later games count more.
            season_end_date = 133
            weights *= np.exp(-lam * (season_end_date - season_df.DayNum)).values.reshape(-1)

        reg.fit(X, y, sample_weight=weights)

        # Centre offense and defense coefficients on their respective means before storing.
        mean_off_strength = reg.coef_[:num_teams].mean()
        mean_def_strength = reg.coef_[num_teams:num_teams*2].mean()
        mean_strengths = np.array([mean_off_strength]*num_teams + [mean_def_strength]*num_teams)
        kenpom_ratings[metric][season] = {
            team_id: score
            for (score, team_id) in zip(reg.coef_[:num_teams*2] - mean_strengths, features[:num_teams*2])
        }
        # The last coefficient belongs to 'offense_home_court'.
        kenpom_ratings[metric][season]['home_court'] = reg.coef_[-1]
        
        # Consistency ratings = avg sq residual when trying to predict performance on off or def
        # Higher rating = less consistent / harder to predict
        residuals = (y - reg.predict(X))
        season_df['sq_residual'] = residuals * residuals
        
        consistency_ratings[metric][season] = {
            **{f'{k}_off': v for k,v in season_df.groupby("offense_team").sq_residual.mean().to_dict().items()},
            **{f'{k}_def': v for k,v in season_df.groupby("defense_team").sq_residual.mean().to_dict().items()}
}


In [None]:
def _team_rating_lookup(ratings, metric, id_col, side):
    """Per-game lookup of ``ratings[metric][season]['<team id>_<side>']`` for the team in ``id_col``."""
    return df.apply(lambda row: ratings[metric][row.Season][f"{row[id_col]}_{side}"], axis=1)


for metric in all_metrics:
    print(metric)
    for team in ['WTeam', 'LTeam']:
        id_col = f'{team}ID'
        for side in ['off', 'def']:
            df[f'{team}_{metric}_{side}_efficiency'] = _team_rating_lookup(
                kenpom_ratings, metric, id_col, side)
            df[f'{team}_{metric}_{side}_consistency'] = _team_rating_lookup(
                consistency_ratings, metric, id_col, side)

    w_prefix, l_prefix = f'WTeam_{metric}', f'LTeam_{metric}'
    # Positive values = advantage Winning team offense has vs Losing team defense
    df[f'W_off_L_def_{metric}_efficiency_diff'] = (
        df[f'{w_prefix}_off_efficiency'] + df[f'{l_prefix}_def_efficiency'])
    # Positive values = advantage Losing team offense has vs Winning team defense
    df[f'W_def_L_off_{metric}_efficiency_diff'] = (
        df[f'{w_prefix}_def_efficiency'] + df[f'{l_prefix}_off_efficiency'])
    

In [None]:
# Tournament games with the largest W-offense vs L-defense scoring edge.
display_cols = ['W_off_L_def_scoring_efficiency_diff', 'W_def_L_off_scoring_efficiency_diff',
                'WTeamName', 'LTeamName', 'WTeam_won']
tourney_matchups = df.loc[df.is_tourney, display_cols]
tourney_matchups.sort_values('W_off_L_def_scoring_efficiency_diff', ascending=False).head()

In [None]:
for metric in all_metrics:
    # Overall efficiency = offensive rating minus defensive rating, per team.
    for team in ('WTeam', 'LTeam'):
        df[f'{team}_{metric}_overall_efficiency'] = (
            df[f'{team}_{metric}_off_efficiency'] - df[f'{team}_{metric}_def_efficiency']
        )
    # Game-level feature: W team's overall efficiency minus the L team's.
    df[f'diff_{metric}_overall_efficiency'] = (
        df[f'WTeam_{metric}_overall_efficiency'] - df[f'LTeam_{metric}_overall_efficiency']
    )
    

In [None]:
# Fitted home-court coefficient of the 'scoring' model for each season 2010-2021.
[kenpom_ratings['scoring'][i]['home_court'] for i in range(2010, 2022)]

In [None]:
# Top 100 team-seasons ranked by scoring efficiency (unweighted first, recency-weighted as tie-break).
metrics = ['WTeam_scoring_overall_efficiency', 'WTeam_scoring_weighted_overall_efficiency']
summary_cols = ['Season', 'WTeamName', *metrics, 'score_margin_per_poss', 'WTeam_seed']
team_season_means = df.loc[df.Season <= 2021, summary_cols].groupby(['Season', 'WTeamName']).mean()
team_season_means.sort_values(metrics, ascending=False).reset_index().head(100)

In [None]:
# For every metric, plot both matchup differentials against the per-possession margin
# of tournament games.
for metric in all_metrics:
    for matchup in ('W_off_L_def', 'W_def_L_off'):
        df[df.is_tourney].plot.scatter(f'{matchup}_{metric}_efficiency_diff', 'score_margin_per_poss', s=10)

In [None]:
# Calculate team FT percentages

def _ft_side(prefix):
    """Free-throw attempts and makes for the W or L side, under side-neutral column names."""
    renames = {f'{prefix}TeamID': 'TeamID', f'{prefix}FTA': 'FTA', f'{prefix}FTM': 'FTM'}
    return df[['Season', *renames]].copy().rename(columns=renames)


def _season_ft_pct(id_col):
    """Per-game lookup of the season FT% of the team whose id is in ``id_col``."""
    return df.apply(lambda row: ft_pcts.loc[row.Season, row[id_col]], axis=1)


ft_temp = _ft_side('W')
ft_temp2 = _ft_side('L')

# Season FT% = total makes / total attempts over every game the team appears in, on either side.
ft_totals = pd.concat([ft_temp, ft_temp2]).groupby(['Season', 'TeamID'])[['FTM', 'FTA']].sum()
ft_pcts = ft_totals.FTM / ft_totals.FTA

df['WTeam_ft_pct'] = _season_ft_pct('WTeamID')
df['LTeam_ft_pct'] = _season_ft_pct('LTeamID')
df['diff_ft_pct'] = df.WTeam_ft_pct - df.LTeam_ft_pct


In [None]:
# 100-bin histograms of the season FT% attached to the W side and to the L side of each game.
df.WTeam_ft_pct.hist(bins=100)
df.LTeam_ft_pct.hist(bins=100)

### Normalize Seed diff

In [None]:
# One row per seeded (Season, LTeamID), then the mean scoring efficiency for each seed number.
seeded_losers = df[df.LTeam_seed.notnull()].drop_duplicates(subset=['Season', 'LTeamID'])
seed_strengths = seeded_losers.groupby('LTeam_seed').LTeam_scoring_overall_efficiency.mean().to_dict()

# Replace each seed by its mean strength and take the W-minus-L difference.
w_strength, l_strength = (df[col].map(seed_strengths) for col in ('WTeam_seed', 'LTeam_seed'))
df['seed_strength_diff'] = w_strength - l_strength

## Train Model

In [None]:
# Preview of the decay factor exp(-round_decay_lam * round) for rounds 0-7.
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7])
round_decay_lam = .02
np.exp(-round_decay_lam * arr)

# round_num is created as an object column (initialised with None), and np.exp cannot
# operate on object arrays. Coerce it to a numeric dtype explicitly instead of relying on
# fillna() to downcast object data, which pandas has deprecated.
# Rows without a round are treated as round 0, i.e. no decay.
round_num = pd.to_numeric(df['round_num']).fillna(0)
df['round_decayed_diff_scoring_overall_efficiency'] = df['diff_scoring_overall_efficiency'] * np.exp(
    -round_decay_lam * round_num
)


In [None]:
# (new column, left factor, right factor). Column names are kept exactly as originally
# spelled, including the doubled 'y' in the second one.
interaction_specs = [
    ('interaction__diff_3pt_overall_efficiency__diff_reb_overall_efficiency',
     'diff_reb_overall_efficiency', 'diff_3pt_overall_efficiency'),
    ('interaction__diff_scoring_weighted_overall_efficiencyy__diff_3pt_overall_efficiency',
     'diff_scoring_weighted_overall_efficiency', 'diff_3pt_overall_efficiency'),
    ('interaction__diff_scoring_weighted_overall_efficiency__diff_reb_overall_efficiency',
     'diff_scoring_weighted_overall_efficiency', 'diff_reb_overall_efficiency'),
]

for new_col, left, right in interaction_specs:
    df[new_col] = df[left] * df[right]

In [None]:
# Two-stage model:
#   1. `pipe`  - linear regression predicting the score margin from efficiency differentials.
#   2. `pipe2` - logistic regression turning the predicted margin (raw, and divided by the
#                teams' mean consistency rating) into a win probability.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import log_loss

features = [
    'diff_scoring_weighted_overall_efficiency',
    'diff_3pt_overall_efficiency',
    'diff_draw_foul_overall_efficiency'
]




X = df.loc[:, features].copy()
y = df.loc[:, 'score_margin'].copy()


# Pick a reproducible random half of the rows (second half of the seeded shuffle).
idxes = np.array(df.index.copy())
np.random.seed(42)
np.random.shuffle(idxes)
idxes = idxes[int(len(idxes)/2): ]

# Flip the sign of the differential features and of the margin for that half, so the
# training data contains both positive and negative margins.
negate_features = [f for f in features if f.startswith('diff') or f.endswith('diff')]
X.loc[idxes, negate_features] = -X.loc[idxes, negate_features]
y.loc[idxes] = -y.loc[idxes]



# Only tournament games are used for fitting and evaluation.
X_train = X.loc[df.is_tourney]
y_train = y.loc[df.is_tourney]

# Eval against 2019 only
# (disabled: the `if False` branch never runs, so the random 90/10 split below is used)
if False:
    year = 2019
    X_train = X.loc[(df.is_tourney) & (df.Season!=year)]
    y_train = y.loc[(df.is_tourney) & (df.Season!=year)]
    X_test = X.loc[(df.is_tourney) & (df.Season==year)]
    y_test = y.loc[(df.is_tourney) & (df.Season==year)]
    
else:
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=41)

# Stage 1: margin regression (no intercept) on mean-imputed, standardised features.
clf = LinearRegression(fit_intercept=False)

pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
    ('clf', clf)
])

pipe.fit(X_train, y_train)

# Stage 2: win/loss classifier on top of the stage-1 predictions.
pipe2 = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=42))
])


pred_train= pipe.predict(X_train)
# Mean of the four weighted-scoring consistency ratings (both teams, offense and defense).
consistency_train = df.loc[X_train.index, ['WTeam_scoring_weighted_off_consistency', 'WTeam_scoring_weighted_def_consistency',
                       'LTeam_scoring_weighted_off_consistency', 'LTeam_scoring_weighted_def_consistency'
                      ]].mean(axis=1).values

# Stage-2 inputs: predicted margin, and predicted margin divided by the consistency rating.
preds_with_consistency = np.concatenate([
    pred_train.reshape(-1, 1),
    (pred_train / (consistency_train**1)).reshape(-1, 1)
], axis=1)

# Target is True when the (possibly sign-flipped) margin is positive.
pipe2.fit(preds_with_consistency, y_train > 0)


In [None]:
# Score the held-out games with the two-stage model.
pred_test = pipe.predict(X_test)

consistency_cols = [
    'WTeam_scoring_weighted_off_consistency', 'WTeam_scoring_weighted_def_consistency',
    'LTeam_scoring_weighted_off_consistency', 'LTeam_scoring_weighted_def_consistency',
]
consistency_test = df.loc[X_test.index, consistency_cols].mean(axis=1).values

# Two columns: predicted margin, and predicted margin divided by the consistency rating.
preds_with_consistency = np.column_stack([pred_test, pred_test / (consistency_test**1)])

pred_probs = pipe2.predict_proba(preds_with_consistency)

log_loss(y_test > 0, pred_probs, labels=[False, True])



In [None]:
# Alternative probability estimate: treat the predicted margin as the centre of an error
# distribution and take the CDF at the prediction.
from scipy import stats

pred_train= pipe.predict(X_train)
pred_test = pipe.predict(X_test)

# Residual standard deviation on the held-out games, using n - 2 in the denominator.
sum_errs = np.sum((y_test - pred_test)**2)
stdev = np.sqrt(1 / (len(y_test) - 2) * sum_errs)
#sum_errs = np.sum((y_train - pred_train)**2)
#stdev = np.sqrt(1 / (len(y_train) - 2) * sum_errs)

# NOTE: the normal-CDF result is overwritten immediately by the Student-t version below.
pred_probs = stats.norm.cdf(pred_test, loc=0, scale=stdev)
pred_probs = stats.t.cdf(pred_test, df=len(y_test)-1, loc=0, scale=stdev)

#pred_probs[1]= 0
log_loss(y_test > 0, pred_probs, labels=[False, True])

In [None]:
# Margin-model coefficients paired with their feature names, largest absolute value first.
#sorted(list(zip(features, pipe.named_steps['clf'].coef_[0])), key=lambda x: -abs(x[1]))
sorted(list(zip(features, pipe.named_steps['clf'].coef_)), key=lambda x: -abs(x[1]))

In [None]:
# Placeholder for the previous cross-validation scores; the CV cell prints it and then overwrites it.
old_scores = None

In [None]:
# Cross-validate the margin regression on tournament games for the chosen feature subset,
# printing the previous run's scores next to the new ones for comparison.
from sklearn.model_selection import cross_validate
from sklearn.linear_model import Lasso

print(f'Original scores: {old_scores}')

# Candidate features; comment lines in or out to experiment.
cv_features = [
# 'diff_scoring_overall_efficiency',
 'diff_scoring_weighted_overall_efficiency',
# 'diff_reb_overall_efficiency',
 'diff_3pt_overall_efficiency',
 'diff_draw_foul_overall_efficiency',
#    'diff_power_conf'
# 'diff_ft_pct',
# 'round_decayed_diff_scoring_overall_efficiency',
# 'interaction__diff_3pt_overall_efficiency__diff_reb_overall_efficiency',
# 'interaction__diff_scoring_weighted_overall_efficiencyy__diff_3pt_overall_efficiency',
# 'interaction__diff_scoring_weighted_overall_efficiency__diff_reb_overall_efficiency'
]


clf = LinearRegression(fit_intercept=False)
#clf = Lasso(alpha=0.075, fit_intercept=False)
#clf = ElasticNet(alpha=0.05, l1_ratio=.5, fit_intercept=False, random_state=0)
#clf = GradientBoostingRegressor(n_estimators=500, learning_rate=1.0,
#    max_depth=3, random_state=0)

# NOTE: this rebinds the global `pipe`; later cells fit and use this new pipeline.
pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
    ('clf', clf)
])

# Not used anywhere in this cell.
feat_cv = {}

    
# The classification scorers are overwritten immediately by the regression scorers.
scoring_funcs = ['accuracy', 'neg_log_loss']
scoring_funcs = ['explained_variance', 'neg_root_mean_squared_error',
                 'neg_mean_absolute_error', 'neg_mean_squared_error']

# One fold per tournament game, i.e. every test fold holds a single row.
# NOTE(review): per-fold explained variance on a single sample may not be meaningful - confirm.
folds = X.loc[df.is_tourney].shape[0]
cv_results = cross_validate(pipe, X.loc[df.is_tourney, cv_features], y.loc[df.is_tourney],
                                        cv=folds, scoring=(scoring_funcs))
old_scores = (cv_results['test_neg_mean_squared_error'].mean(),
             cv_results['test_neg_root_mean_squared_error'].mean(),
             cv_results['test_explained_variance'].mean())
print(f'New score: {old_scores}')

In [None]:
# Train production model
# Refit both stages on every tournament game (no hold-out).
X_train = X.loc[df.is_tourney]
y_train = y.loc[df.is_tourney]
pipe.fit(X_train, y_train)

pred_train = pipe.predict(X_train)

consistency_cols = [
    'WTeam_scoring_weighted_off_consistency', 'WTeam_scoring_weighted_def_consistency',
    'LTeam_scoring_weighted_off_consistency', 'LTeam_scoring_weighted_def_consistency',
]
consistency_train = df.loc[X_train.index, consistency_cols].mean(axis=1).values

# Two columns: predicted margin, and predicted margin divided by the consistency rating.
preds_with_consistency = np.column_stack([pred_train, pred_train / (consistency_train**1)])

pipe2.fit(preds_with_consistency, y_train > 0)

In [None]:
# Probability the model gave to the outcome that actually happened in each held-out game:
# pred_probs where the margin label is positive, 1 - pred_probs otherwise.
ppreds = np.array([
    prob if actual > 0 else 1 - prob
    for prob, actual in zip(pred_probs, y_test)
])

pred_df = df.loc[X_test.index, ['Season', 'WTeamName', 'LTeamName', 'WTeam_seed', 'LTeam_seed']].copy()
pred_df['seed_diff'] = pred_df.WTeam_seed - pred_df.LTeam_seed
pred_df['pred'] = ppreds
# Lowest values first.
pred_df.sort_values('pred')

# Prediction time

In [None]:
slots = load_data('WNCAATourneySlots.csv')
#slots = slots[slots.Season==2021]
seeds = load_data('WNCAATourneySeeds.csv')
seeds = seeds[seeds.Season==2021]

teams = load_data('WTeams.csv')

slot_roles = ('Strong', 'Weak')

# First attach the 2021 team id behind each seed label (Strong side, then Weak side) ...
for role in slot_roles:
    slots = slots.merge(
        seeds.rename(columns={'Seed': f'{role}Seed', 'TeamID': f'{role}TeamID'}),
        how='left', on=[f'{role}Seed'])

# ... then the team names, in the same Strong/Weak order.
team_names = teams[['TeamID', 'TeamName']]
for role in slot_roles:
    slots = slots.merge(
        team_names.rename(columns={'TeamID': f'{role}TeamID', 'TeamName': f'{role}TeamName'}),
        how='left', on=[f'{role}TeamID'])

# TODO: see who wins the first four
#slots.loc[[0, 5, 8, 13], ['WeakTeamID']] = [1291, 1277, 1313, 1179]

# Slot names are in order since it's WvX and YvZ in the final four and stronger seeds don't go to double digits
slots.shape

In [None]:
# Build the submission frame: one row per matchup ID, with both teams' ratings attached.
sub_df = load_data('WSampleSubmissionStage2.csv')
# ID strings are sliced positionally: chars 0-3 season, 5-8 first team id, 10-13 second team id.
season = int(sub_df.ID.iloc[0][:4])
sub_df['WTeamID'] = sub_df.ID.apply(lambda x: int(x[5:9]))
sub_df['LTeamID'] = sub_df.ID.apply(lambda x: int(x[10:14]))

metric_cols = [f'WTeam_{metric}_overall_efficiency' for metric in all_metrics]
metric_cols += [f'WTeam_{metric}_{side}_consistency' for metric in all_metrics for side in ['off', 'def']]

# NOTE: the first merge_cols assignment is overwritten by the next line and has no effect.
merge_cols = ['WTeamName', 'WTeam_lat', 'WTeam_lng', 'WTeam_preseason_ap_score', 'WTeam_preseason_ap_score_exp'] + metric_cols
merge_cols = ['WTeamName'] + metric_cols
# Per-team ratings, taken from this season's rows where the team appears in the W columns.
# NOTE(review): a team that never appears as WTeamID this season has no lookup row and its
# matchups are dropped by the inner merges below - confirm every submission team is covered.
lookup_df = df.loc[df.Season==season, ['WTeamID'] + merge_cols].drop_duplicates()
lookup_df.head()

# Attach the ratings for the first team (W* columns), then the same lookup renamed to L*
# for the second team.
sub_df = sub_df.merge(lookup_df, on='WTeamID').merge(
    lookup_df.rename(columns={c: f'L{c[1:]}' for c in lookup_df.columns}), on='LTeamID'
)

sub_df.head()

In [None]:
# Same differential features as in training: first team minus second team.
for metric in all_metrics:
    w_col = f'WTeam_{metric}_overall_efficiency'
    l_col = f'LTeam_{metric}_overall_efficiency'
    sub_df[f'diff_{metric}_overall_efficiency'] = sub_df[w_col] - sub_df[l_col]
    

In [None]:
# Preview the submission frame after the differential columns were added.
sub_df.head()

#### Figure out round teams would meet in

In [None]:
# seed label => list of slots that seed passes through, earliest first
slots_for_seed = {}

strong_idx_df = slots.set_index('StrongSeed')
weak_idx_df = slots.set_index('WeakSeed')


def _slots_from(seed):
    """Follow ``seed`` from slot to slot until no slot lists the current label as an entrant."""
    path = []
    current = seed
    while current is not None:
        if current in strong_idx_df.index:
            current = strong_idx_df.loc[current].Slot
        elif current in weak_idx_df.index:
            current = weak_idx_df.loc[current].Slot
        else:
            break
        path.append(current)
    return path


# Only seeds that resolved to a team (no missing id or name) are traced.
strong_seeds = slots[['StrongTeamID', 'StrongTeamName', 'StrongSeed']].dropna().StrongSeed.tolist()
weak_seeds = slots[['WeakTeamID', 'WeakTeamName', 'WeakSeed']].dropna().WeakSeed.tolist()

for seed in strong_seeds + weak_seeds:
    slots_for_seed[seed] = _slots_from(seed)
    

In [None]:
# (Bare expression; it is not the cell's last statement.)
slots_for_seed
# Team id -> seed label, collected from both the Strong and the Weak columns of `slots`.
team_id_to_seed = {**dict(slots.dropna(subset=['StrongTeamName'])[['StrongTeamID', 'StrongSeed']].values),
                   **dict(slots.dropna(subset=['WeakTeamName'])[['WeakTeamID', 'WeakSeed']].values)}
# Team id (cast to int) -> the slot list recorded for that team's seed.
team_id_to_slots = {int(k): slots_for_seed[v] for k,v in team_id_to_seed.items()}

In [None]:
def determine_slot(row):
    """Return the smallest slot name shared by the slot lists of the row's two teams.

    Args:
        row: Mapping with 'WTeamID' and 'LTeamID' keys present in ``team_id_to_slots``.

    Returns:
        The minimum (by string order) of the slots common to both teams.
    """
    w_slots = team_id_to_slots[row['WTeamID']]
    l_slots = team_id_to_slots[row['LTeamID']]
    shared = set(w_slots).intersection(l_slots)
    # relies on fact slots are named like R3W2
    return min(shared)

In [None]:
sub_df['slot'] = sub_df.apply(determine_slot, axis=1)
# The second character of the slot name is parsed as the round number.
sub_df['round_num'] = sub_df.slot.str[1].astype(int)

In [None]:
# Fraction of missing values per model feature in the submission frame.
sub_df[features].isnull().mean()

In [None]:
# Stage 1: predicted margins for the training games and for every submission matchup.
pred_train = pipe.predict(X_train)
pred_eval = pipe.predict(sub_df[features])

# Residual spread on the training games, kept for the CDF-based alternative noted below.
sum_errs = np.sum((y_train - pred_train)**2)
stdev = np.sqrt(1 / (len(y_train) - 2) * sum_errs)

# Alternatives that were tried:
#pred_probs = stats.norm.cdf(pred_eval, loc=0, scale=stdev)
#pred_probs_old = stats.t.cdf(pred_eval, df=len(pred_train)-1, loc=0, scale=stdev)

consistency_cols = [
    'WTeam_scoring_weighted_off_consistency', 'WTeam_scoring_weighted_def_consistency',
    'LTeam_scoring_weighted_off_consistency', 'LTeam_scoring_weighted_def_consistency',
]
consistency_eval = sub_df[consistency_cols].mean(axis=1).values

# Stage 2 inputs: predicted margin, and predicted margin divided by the consistency rating.
preds_with_consistency = np.column_stack([pred_eval, pred_eval / (consistency_eval**1)])

# Column 1 is the probability of the True class.
pred_probs = pipe2.predict_proba(preds_with_consistency)[:, 1]



#pred_probs = pipe2.predict_proba(pred_eval.reshape(-1, 1))[:, 1]



In [None]:
# Store the win probability and the predicted margin for every submission matchup.
sub_df['Pred'] = pred_probs
sub_df['pred_margin'] = pred_eval

In [None]:
# Mirror every matchup so each team appears in the WTeamName column for all of its
# pairings: swap the two name columns, complement the probability, negate the margin.
temp = sub_df[['WTeamName', 'LTeamName', 'Pred', 'pred_margin']].copy()
temp2 = temp.rename(columns={'WTeamName': 'LTeamName', 'LTeamName': 'WTeamName'}).assign(
    Pred=lambda mirrored: 1 - mirrored.Pred,
    pred_margin=lambda mirrored: -mirrored.pred_margin,
)

temp = pd.concat([temp, temp2], sort=False)

In [None]:
# Avg margin across possible tourney games
new_means2 = temp.groupby('WTeamName').pred_margin.mean().sort_values(ascending=False).to_dict()
new_means2

In [None]:
#sorted([(k, new_means2[k]-old_means[k]) for k in new_means2 ], key=lambda x: -x[1])

In [None]:
# Spot-check one matchup. `temp` holds each sub_df index label twice (original and mirrored),
# so the row shown from sub_df may list the two teams in the opposite order.
idx = temp[(temp.WTeamName=="Stanford") & (temp.LTeamName=='Connecticut')].index[0]
sub_df.loc[idx, ['WTeamName', 'LTeamName', 'Pred', 'pred_margin']]

In [None]:
# First 2021 game row in which each team is in the W columns (kept for ad-hoc inspection).
team1 = df[(df.Season==2021) & (df.WTeamName=='Georgetown')].iloc[0]
team2 = df[(df.Season==2021) & (df.WTeamName=='Colorado')].iloc[0]

In [None]:
def vegas_to_prob(odds):
    """Convert a moneyline-style odds value to a probability.

    Args:
        odds: Numeric odds. Values of -100 or lower use ``odds / (odds - 100)``;
            all other values use ``100 / (odds + 100)``.

    Returns:
        The converted probability as a float.
    """
    if odds > -100:
        return 100 / (odds + 100)
    return odds / (odds - 100)

def avg_odds(odds, other):
    """Average two probability estimates for the same side of a bet.

    Args:
        odds: Odds quoted for this side.
        other: Odds quoted for the opposing side.

    Returns:
        The mean of this side's converted probability and the complement of
        the opposing side's converted probability.
    """
    direct = vegas_to_prob(odds)
    complement = 1 - vegas_to_prob(other)
    return (direct + complement) / 2


In [None]:

# Score spread prediction contest
# Predicted margins are written under the 'Pred' header.
sub_df[['ID', 'pred_margin']].rename(columns={'pred_margin': 'Pred'}).to_csv('submission_spread.csv', index=False)

# SUBMIT!!!!!!!!!!!!!!!!!!!!!!!!!!
# Two probability submissions that differ only in the row labelled 1678, which is forced
# to 0.0 in one file and 1.0 in the other.
# NOTE(review): 1678 is a hard-coded positional label of the merged sub_df - verify it still
# points at the intended matchup whenever the input data or merges change.
sub_df2 = sub_df.copy()
sub_df2.loc[1678, 'Pred'] = 0.0 # VT vs. Marquette
sub_df2[['ID', 'Pred']].to_csv('submission.csv', index=False)

sub_df_risky = sub_df.copy()
sub_df_risky.loc[1678, 'Pred'] = 1.0  # VT vs. Marquette
sub_df_risky[['ID', 'Pred']].to_csv('submission_risky.csv', index=False)




# Simulation

In [None]:
# Preview the slot table with team ids and names attached.
slots.head()

In [None]:
# (Bare expression; it is not the cell's last statement.)
slots

# Strong-seed labels listed region by region (W, X, Y, Z) in the seed order 1, 8, 5, 4, 6, 3, 7, 2.
bracket_order = [f'{region}0{num}' for region in ['W', 'X', 'Y', 'Z'] for num in [1, 8, 5, 4, 6, 3, 7, 2]]
# Flattened (strong, weak) team ids in that order: positions 2k and 2k+1 come from the same slot row.
team_ids = slots.set_index("StrongSeed").loc[bracket_order,
                                  ['StrongTeamID', 'WeakTeamID']].values.flatten()


In [None]:
def _stored_pred(low_id, high_id):
    """Pred of the first sub_df row whose WTeamID is ``low_id`` and LTeamID is ``high_id``."""
    return sub_df[(sub_df.WTeamID==low_id) & (sub_df.LTeamID==high_id)].iloc[0].Pred


# preds_matrix[i, j] = probability that the team at bracket position i beats the one at j.
# sub_df is looked up with the smaller id first, so the reverse direction is 1 - Pred.
preds_matrix = np.zeros((64, 64))

for i in range(64):
    team_i = team_ids[i]
    for j in range(64):
        team_j = team_ids[j]
        if team_i < team_j:
            preds_matrix[i, j] = _stored_pred(team_i, team_j)
        elif team_i > team_j:
            preds_matrix[i, j] = 1- _stored_pred(team_j, team_i)
        else:
            preds_matrix[i, j] = 0

In [None]:
def get_opp_idxes(round_idx, team_idx):
    """Bracket positions a team could face in a given round.

    Positions are grouped into blocks of ``2**round_idx``; the possible
    opponents are the half of the team's block that the team is not in.

    Args:
        round_idx: Round number (1 = first round).
        team_idx: The team's position in bracket order.

    Returns:
        Array of the ``2**(round_idx-1)`` opponent positions.
    """
    block_size = 2**round_idx
    half_size = 2**(round_idx-1)
    block_start = team_idx // block_size * block_size
    in_first_half = team_idx - block_start < half_size
    # Opponents sit in the other half of the block.
    opp_start = block_start + half_size if in_first_half else block_start
    return np.arange(half_size) + opp_start

get_opp_idxes(6, 1)

In [None]:
# arr[t, r] = probability that the team at bracket position t wins round r.
# Column 0 is the starting state: every team is in the field with probability 1.
arr = np.zeros((64, 7))
arr[:, 0] = 1

for round_idx in range(1, 7):
    reached = arr[:, round_idx-1]  # view of the previous round's column (not written in this pass)
    for team_idx in range(len(arr)):
        opp_idxes = get_opp_idxes(round_idx, team_idx)

        opp_preds = preds_matrix[team_idx, opp_idxes]
        prob_opp = reached[opp_idxes]

        # Prob(team is in round) * sum(Prob(opp_i is in round) * Prob(team beats opp_i))
        arr[team_idx, round_idx] = reached[team_idx] * sum(opp_preds * prob_opp)


# Column totals across all teams, one per round.
arr.sum(axis=0)

In [None]:
# Fraction of missing values per model feature in the submission frame.
sub_df[features].isna().mean()

In [None]:
# Label the round-by-round probabilities with team names.
round_cols = [f"prob_win_round_{i}" for i in range(7)]
sim_df = pd.DataFrame(arr, columns=round_cols)
sim_df['TeamID'] = team_ids.astype(int)
sim_df = sim_df.merge(teams[['TeamID', 'TeamName']], on='TeamID')
# Name first, then rounds 1-6 and TeamID; the round-0 column is dropped.
sim_df = sim_df[['TeamName', *round_cols[1:], 'TeamID']]

pd.options.display.float_format = "{:,.6f}".format
sim_df.sort_values('prob_win_round_6', ascending=False)