In [1]:
import os
import re
import sklearn
import numpy as np 
import pandas as pd

from collections import Counter
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
#pd.set_option('display.max_columns', None)

## Data Preparation

### Seeds

In [2]:

df_seeds = pd.read_csv("data/2022_Stage1/MNCAATourneySeeds.csv")
df_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


### Season Results

In [3]:
df_season_results = pd.read_csv("data/2022_Stage1/MRegularSeasonCompactResults.csv")
df_season_results.drop(['WLoc', 'NumOT'], axis=1, inplace=True)

In [4]:
df_season_results['ScoreDiff'] = df_season_results['WScore'] - df_season_results['LScore']

In [5]:
df_season_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,ScoreDiff
0,1985,20,1228,81,1328,64,17
1,1985,25,1106,77,1354,70,7
2,1985,25,1112,63,1223,56,7
3,1985,25,1165,70,1432,54,16
4,1985,25,1192,86,1447,74,12


#### Features

For each team for each season, we need to compute:

<li>Number of wins</li>
<li>Number of losses</li>
<li>Average score gap of wins</li>
<li>Average score gap of losses</li>

In order to calculate the following features: 
<li>Win Percentage</li>
<li>Average margin of victory/loss</li>


In [6]:
win_count = df_season_results.groupby(['Season', 'WTeamID']).count()
win_count = win_count.reset_index()[['Season', 'WTeamID', 'DayNum']].rename(columns={"DayNum": "WinCount", "WTeamID": "TeamID"})
win_count

Unnamed: 0,Season,TeamID,WinCount
0,1985,1102,5
1,1985,1103,9
2,1985,1104,21
3,1985,1106,10
4,1985,1108,19
...,...,...,...
12276,2022,1468,9
12277,2022,1469,7
12278,2022,1470,8
12279,2022,1471,8


In [7]:
loss_count = df_season_results.groupby(['Season', 'LTeamID']).count()
loss_count = loss_count.reset_index()[['Season', 'LTeamID', 'DayNum']].rename(columns={"DayNum": "LossCount", "LTeamID": "TeamID"})
loss_count

Unnamed: 0,Season,TeamID,LossCount
0,1985,1102,19
1,1985,1103,14
2,1985,1104,9
3,1985,1106,14
4,1985,1108,6
...,...,...,...
12290,2022,1468,11
12291,2022,1469,12
12292,2022,1470,13
12293,2022,1471,12


In [8]:
win_margin = df_season_results.groupby(['Season', 'WTeamID']).mean().reset_index()
win_margin = win_margin[['Season', 'WTeamID', 'ScoreDiff']].rename(columns={"ScoreDiff": "AverageWinMargin", "WTeamID": "TeamID"})
win_margin

Unnamed: 0,Season,TeamID,AverageWinMargin
0,1985,1102,10.000000
1,1985,1103,7.555556
2,1985,1104,13.190476
3,1985,1106,9.500000
4,1985,1108,13.842105
...,...,...,...
12276,2022,1468,10.666667
12277,2022,1469,11.428571
12278,2022,1470,11.125000
12279,2022,1471,10.125000


In [9]:
loss_margin = df_season_results.groupby(['Season', 'LTeamID']).mean().reset_index()
loss_margin = loss_margin[['Season', 'LTeamID', 'ScoreDiff']].rename(columns={"ScoreDiff": "AverageLossMargin", "LTeamID": "TeamID"})
loss_margin

Unnamed: 0,Season,TeamID,AverageLossMargin
0,1985,1102,9.947368
1,1985,1103,9.857143
2,1985,1104,4.777778
3,1985,1106,13.285714
4,1985,1108,10.666667
...,...,...,...
12290,2022,1468,16.545455
12291,2022,1469,19.333333
12292,2022,1470,11.769231
12293,2022,1471,13.416667


In [10]:
df_features_season_wins = df_season_results.groupby(['Season', 'WTeamID']).count().reset_index()[['Season', 'WTeamID']].rename(columns={"WTeamID": "TeamID"})
df_features_season_wins

Unnamed: 0,Season,TeamID
0,1985,1102
1,1985,1103
2,1985,1104
3,1985,1106
4,1985,1108
...,...,...
12276,2022,1468
12277,2022,1469
12278,2022,1470
12279,2022,1471


In [11]:
df_features_season_losses = df_season_results.groupby(['Season', 'LTeamID']).count().reset_index()[['Season', 'LTeamID']].rename(columns={"LTeamID": "TeamID"})
df_features_season_losses

Unnamed: 0,Season,TeamID
0,1985,1102
1,1985,1103
2,1985,1104
3,1985,1106
4,1985,1108
...,...,...
12290,2022,1468
12291,2022,1469
12292,2022,1470
12293,2022,1471


In [12]:
df_features_season = pd.concat([df_features_season_wins, df_features_season_losses], axis=0).drop_duplicates().sort_values(['Season', 'TeamID']).reset_index(drop=True)

In [13]:
df_features_season = df_features_season.merge(win_count, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(loss_count, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(win_margin, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(loss_margin, on=['Season', 'TeamID'], how='left')
df_features_season.fillna(0, inplace=True)  
df_features_season

Unnamed: 0,Season,TeamID,WinCount,LossCount,AverageWinMargin,AverageLossMargin
0,1985,1102,5.0,19.0,10.000000,9.947368
1,1985,1103,9.0,14.0,7.555556,9.857143
2,1985,1104,21.0,9.0,13.190476,4.777778
3,1985,1106,10.0,14.0,9.500000,13.285714
4,1985,1108,19.0,6.0,13.842105,10.666667
...,...,...,...,...,...,...
12294,2022,1468,9.0,11.0,10.666667,16.545455
12295,2022,1469,7.0,12.0,11.428571,19.333333
12296,2022,1470,8.0,13.0,11.125000,11.769231
12297,2022,1471,8.0,12.0,10.125000,13.416667


In [14]:
df_features_season['WinPercentage'] = df_features_season['WinCount'] / (df_features_season['WinCount'] + df_features_season['LossCount'])
df_features_season['GapAvg'] = (
    (df_features_season['WinCount'] * df_features_season['AverageWinMargin'] - 
    df_features_season['LossCount'] * df_features_season['AverageLossMargin'])
    / (df_features_season['WinCount'] + df_features_season['LossCount'])
)

In [15]:
df_features_season.drop(['WinCount', 'LossCount', 'AverageWinMargin', 'AverageLossMargin'], axis=1, inplace=True)

In [16]:
df_tourney_results = pd.read_csv("data/2022_Stage1/MNCAATourneyCompactResults.csv")
df_tourney_results.drop(['NumOT', 'WLoc'], axis=1, inplace=True)

In [17]:
df_538 = pd.read_csv("data/538ratingsMen.csv")
df_538.drop('TeamName', axis=1, inplace=True)
df_538

Unnamed: 0,Season,TeamID,538rating
0,2016,1242,94.46
1,2016,1314,93.94
2,2016,1438,92.46
3,2016,1277,91.84
4,2016,1328,89.96
...,...,...,...
403,2022,1168,71.78
404,2022,1136,71.55
405,2022,1313,71.39
406,2022,1411,71.14


## Feature Engineering

### Training data

In [18]:
df = df_tourney_results.copy()
df = df[df['Season'] >= 2016].reset_index(drop=True)
df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore
0,2016,134,1195,96,1192,65
1,2016,134,1455,70,1435,50
2,2016,135,1221,59,1380,55
3,2016,135,1276,67,1409,62
4,2016,136,1114,85,1345,83
...,...,...,...,...,...,...
329,2021,148,1211,85,1425,66
330,2021,148,1417,51,1276,49
331,2021,152,1124,78,1222,59
332,2021,152,1211,93,1417,90


In [19]:
df = pd.merge(
    df, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'WTeamID'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedW'})

In [20]:
df = pd.merge(
    df, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'LTeamID'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedL'})

In [21]:
#Remove region and play in tournament marker from seed
def treat_seed(seed):
    return int(re.sub("[^0-9]", "", seed))

In [22]:
df['SeedW'] = df['SeedW'].apply(treat_seed)
df['SeedL'] = df['SeedL'].apply(treat_seed)

In [23]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL
0,2016,134,1195,96,1192,65,16,16
1,2016,134,1455,70,1435,50,11,11
2,2016,135,1221,59,1380,55,16,16
3,2016,135,1276,67,1409,62,11,11
4,2016,136,1114,85,1345,83,12,5


In [24]:
df = pd.merge(
    df,
    df_features_season,
    how='left',
    left_on=['Season', 'WTeamID'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'WinCount': 'WinCountW',
    'LossCount': 'LossCountW',
    'AverageWinMargin': 'AverageWinMarginW',
    'AverageLossMargin': 'AverageLossMarginW',
    'WinPercentage': 'WinPercentageW',
    'GapAvg': 'GapAvgW',
}).drop(columns='TeamID', axis=1)


In [25]:
df = pd.merge(
    df,
    df_features_season,
    how='left',
    left_on=['Season', 'LTeamID'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'WinCount': 'WinCountL',
    'LossCount': 'LossCountL',
    'AverageWinMargin': 'AverageWinMarginL',
    'AverageLossMargin': 'AverageLossMarginL',
    'WinPercentage': 'WinPercentageL',
    'GapAvg': 'GapAvgL',
}).drop(columns='TeamID', axis=1)


In [26]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL,WinPercentageW,GapAvgW,WinPercentageL,GapAvgL
0,2016,134,1195,96,1192,65,16,16,0.551724,2.724138,0.548387,-1.677419
1,2016,134,1455,70,1435,50,11,11,0.741935,13.709677,0.59375,9.40625
2,2016,135,1221,59,1380,55,16,16,0.424242,-4.333333,0.612903,1.870968
3,2016,135,1276,67,1409,62,11,11,0.636364,6.242424,0.645161,4.322581
4,2016,136,1114,85,1345,83,12,5,0.870968,9.935484,0.764706,13.147059


In [27]:
df = pd.merge(
    df,
    df_538,
    how='left',
    left_on=['Season', 'WTeamID'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'538rating': '538ratingW'})

In [28]:
df = pd.merge(
    df, 
    df_538, 
    how='left', 
    left_on=['Season', 'LTeamID'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'538rating': '538ratingL'})

In [29]:
df.head()


Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL,WinPercentageW,GapAvgW,WinPercentageL,GapAvgL,538ratingW,538ratingL
0,2016,134,1195,96,1192,65,16,16,0.551724,2.724138,0.548387,-1.677419,71.41,66.72
1,2016,134,1455,70,1435,50,11,11,0.741935,13.709677,0.59375,9.40625,86.59,85.59
2,2016,135,1221,59,1380,55,16,16,0.424242,-4.333333,0.612903,1.870968,66.85,67.96
3,2016,135,1276,67,1409,62,11,11,0.636364,6.242424,0.645161,4.322581,79.57,79.93
4,2016,136,1114,85,1345,83,12,5,0.870968,9.935484,0.764706,13.147059,78.9,88.68


In [30]:
def add_loosing_matches(df):
    win_rename = {
        "WTeamID": "TeamIdA", 
        "WScore" : "ScoreA", 
        "LTeamID" : "TeamIdB",
        "LScore": "ScoreB",
     }
    win_rename.update({c : c[:-1] + "A" for c in df.columns if c.endswith('W')})
    win_rename.update({c : c[:-1] + "B" for c in df.columns if c.endswith('L')})
    
    lose_rename = {
        "WTeamID": "TeamIdB", 
        "WScore" : "ScoreB", 
        "LTeamID" : "TeamIdA",
        "LScore": "ScoreA",
    }
    lose_rename.update({c : c[:-1] + "B" for c in df.columns if c.endswith('W')})
    lose_rename.update({c : c[:-1] + "A" for c in df.columns if c.endswith('L')})
    
    win_df = df.copy()
    lose_df = df.copy()
    
    win_df = win_df.rename(columns=win_rename)
    lose_df = lose_df.rename(columns=lose_rename)
    
    return pd.concat([win_df, lose_df], axis=0, sort=False)


In [31]:
df = add_loosing_matches(df)

In [32]:
df.head()


Unnamed: 0,Season,DayNum,TeamIdA,ScoreA,TeamIdB,ScoreB,SeedA,SeedB,WinPercentageA,GapAvgA,WinPercentageB,GapAvgB,538ratingA,538ratingB
0,2016,134,1195,96,1192,65,16,16,0.551724,2.724138,0.548387,-1.677419,71.41,66.72
1,2016,134,1455,70,1435,50,11,11,0.741935,13.709677,0.59375,9.40625,86.59,85.59
2,2016,135,1221,59,1380,55,16,16,0.424242,-4.333333,0.612903,1.870968,66.85,67.96
3,2016,135,1276,67,1409,62,11,11,0.636364,6.242424,0.645161,4.322581,79.57,79.93
4,2016,136,1114,85,1345,83,12,5,0.870968,9.935484,0.764706,13.147059,78.9,88.68


#### Feature Differences

In [33]:
cols_to_diff = ['Seed', 'WinPercentage', 'GapAvg', '538rating']

for col in cols_to_diff:
    df[col + 'Diff'] = df[col + 'A'] - df[col + 'B']

In [34]:
df.head()

Unnamed: 0,Season,DayNum,TeamIdA,ScoreA,TeamIdB,ScoreB,SeedA,SeedB,WinPercentageA,GapAvgA,WinPercentageB,GapAvgB,538ratingA,538ratingB,SeedDiff,WinPercentageDiff,GapAvgDiff,538ratingDiff
0,2016,134,1195,96,1192,65,16,16,0.551724,2.724138,0.548387,-1.677419,71.41,66.72,0,0.003337,4.401557,4.69
1,2016,134,1455,70,1435,50,11,11,0.741935,13.709677,0.59375,9.40625,86.59,85.59,0,0.148185,4.303427,1.0
2,2016,135,1221,59,1380,55,16,16,0.424242,-4.333333,0.612903,1.870968,66.85,67.96,0,-0.188661,-6.204301,-1.11
3,2016,135,1276,67,1409,62,11,11,0.636364,6.242424,0.645161,4.322581,79.57,79.93,0,-0.008798,1.919844,-0.36
4,2016,136,1114,85,1345,83,12,5,0.870968,9.935484,0.764706,13.147059,78.9,88.68,7,0.106262,-3.211575,-9.78


### Test Data
##### Preparing

In [35]:
df_test = pd.read_csv("data/2022_Stage1/MSampleSubmissionStage1.csv")

In [36]:
df_test['Season'] = df_test['ID'].apply(lambda x: int(x.split('_')[0]))
df_test['TeamIdA'] = df_test['ID'].apply(lambda x: int(x.split('_')[1]))
df_test['TeamIdB'] = df_test['ID'].apply(lambda x: int(x.split('_')[2]))

In [37]:
df_test.head()


Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB
0,2016_1112_1114,0.5,2016,1112,1114
1,2016_1112_1122,0.5,2016,1112,1122
2,2016_1112_1124,0.5,2016,1112,1124
3,2016_1112_1138,0.5,2016,1112,1138
4,2016_1112_1139,0.5,2016,1112,1139


##### Seeds

In [38]:
df_test = pd.merge(
    df_test,
    df_seeds,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedA'})

In [39]:
df_test = pd.merge(
    df_test, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'TeamIdB'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedB'})


In [40]:
df_test['SeedA'] = df_test['SeedA'].apply(treat_seed)
df_test['SeedB'] = df_test['SeedB'].apply(treat_seed)

In [41]:
df_test = pd.merge(
    df_test,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'WinCount': 'WinCountA',
    'LossCount': 'LossCountA',
    'AverageWinMargin': 'AverageWinMarginA',
    'AverageLossMargin': 'AverageLossMarginA',
    'WinPercentage': 'WinPercentageA',
    'GapAvg': 'GapAvgA',
}).drop(columns='TeamID', axis=1)

In [42]:
df_test = pd.merge(
    df_test,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdB'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'WinCount': 'WinCountB',
    'LossCount': 'LossCountB',
    'AverageWinMargin': 'AverageWinMarginB',
    'AverageLossMargin': 'AverageLossMarginB',
    'WinPercentage': 'WinPercentageB',
    'GapAvg': 'GapAvgB',
}).drop(columns='TeamID', axis=1)

In [43]:
df_test = pd.merge(
    df_test,
    df_538,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'538rating': '538ratingA'})

In [44]:
df_test = pd.merge(
    df_test,
    df_538,
    how='left',
    left_on=['Season', 'TeamIdB'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'538rating': '538ratingB'})

In [45]:
for col in cols_to_diff:
    df_test[col + 'Diff'] = df_test[col + 'A'] - df_test[col + 'B']

In [46]:
df_test.head()

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB,SeedA,SeedB,WinPercentageA,GapAvgA,WinPercentageB,GapAvgB,538ratingA,538ratingB,SeedDiff,WinPercentageDiff,GapAvgDiff,538ratingDiff
0,2016_1112_1114,0.5,2016,1112,1114,6,12,0.757576,12.212121,0.870968,9.935484,89.04,78.9,-6,-0.113392,2.276637,10.14
1,2016_1112_1122,0.5,2016,1112,1122,6,16,0.757576,12.212121,0.484848,-2.363636,89.04,68.83,-10,0.272727,14.575758,20.21
2,2016_1112_1124,0.5,2016,1112,1124,6,5,0.757576,12.212121,0.65625,6.6875,89.04,85.47,1,0.101326,5.524621,3.57
3,2016_1112_1138,0.5,2016,1112,1138,6,14,0.757576,12.212121,0.575758,0.666667,89.04,75.66,-8,0.181818,11.545455,13.38
4,2016_1112_1139,0.5,2016,1112,1139,6,9,0.757576,12.212121,0.677419,9.419355,89.04,84.17,-3,0.080156,2.792766,4.87


In [47]:
df['ScoreDiff'] = df['ScoreA'] - df['ScoreB']
df['WinA'] = (df['ScoreDiff'] > 0).astype(int)

## Modeling

In [48]:
features = ['SeedDiff', '538ratingDiff', 'WinPercentageDiff', 'GapAvgDiff']

In [49]:
def rescale(features, df_train, df_val, df_test=None):
    min_ = df_train[features].min()
    max_ = df_train[features].max()
    
    df_train[features] = (df_train[features] - min_) / (max_ - min_)
    df_val[features] = (df_val[features] - min_) / (max_ - min_)
    
    if df_test is not None:
        df_test[features] = (df_test[features] - min_) / (max_ - min_)
        
    return df_train, df_val, df_test

In [50]:
#cross-validates data using k-fold method
def kfold(df, df_test_=None, verbose=0, mode="reg"):
    seasons = df['Season'].unique()
    cvs = []
    pred_tests = []
    target = "ScoreDiff" if mode == "reg" else "WinA"
    
    for season in seasons[1:]:
        if verbose:
            print(f'\nValidating on season {season}')
        
        df_train = df[df['Season'] < season].reset_index(drop=True).copy()
        df_val = df[df['Season'] == season].reset_index(drop=True).copy()
        df_test = df_test_.copy()
        
        df_train, df_val, df_test = rescale(features, df_train, df_val, df_test)
        
        if mode == "reg":
            model = LinearRegression()
        else:
            model = LogisticRegression()

        model.fit(df_train[features], df_train[target])
        
        if mode == "reg":
            pred = model.predict(df_val[features])
            pred = (pred - pred.min()) / (pred.max() - pred.min())
        else:
            pred = model.predict_proba(df_val[features])[:, 1]
        
        if df_test is not None:
            if mode == "reg":
                pred_test = model.predict(df_test[features])
                pred_test = (pred_test - pred_test.min()) / (pred_test.max() - pred_test.min())
            else:
                pred_test = model.predict_proba(df_test[features])[:, 1]
                
            pred_tests.append(pred_test)
        
        loss = log_loss(df_val['WinA'].values, pred)
        cvs.append(loss)

        if verbose:
            print(f'\t -> Scored {loss:.3f}')
        
    print(f'\n Local CV is {np.mean(cvs):.3f}')
    
    return pred_tests


In [51]:
kfold(df, df_test, verbose=1, mode="reg")


Validating on season 2017
	 -> Scored 0.580

Validating on season 2018
	 -> Scored 0.596

Validating on season 2019
	 -> Scored 0.530

Validating on season 2021
	 -> Scored 0.607

 Local CV is 0.578


[array([0.62478154, 0.81479344, 0.58875994, ..., 0.51302119, 0.35757375,
        0.33261713]),
 array([0.62448221, 0.81227274, 0.57421116, ..., 0.52080849, 0.37812071,
        0.34731643]),
 array([0.63194109, 0.80572819, 0.57204608, ..., 0.53704631, 0.36180679,
        0.31228843]),
 array([0.64537069, 0.81608679, 0.57372421, ..., 0.52211733, 0.34601792,
        0.3112136 ])]

In [52]:
kfold(df, df_test, verbose=1, mode="cls")


Validating on season 2017
	 -> Scored 0.547

Validating on season 2018
	 -> Scored 0.593

Validating on season 2019
	 -> Scored 0.509

Validating on season 2021
	 -> Scored 0.614

 Local CV is 0.566


[array([0.65226462, 0.87431487, 0.58948947, ..., 0.38841155, 0.41896034,
        0.531695  ]),
 array([0.70947213, 0.89055729, 0.5691628 , ..., 0.45258916, 0.37988681,
        0.42560058]),
 array([0.71131317, 0.88705453, 0.56642754, ..., 0.46859532, 0.3770147 ,
        0.40697168]),
 array([0.73795047, 0.91141592, 0.58063971, ..., 0.46685325, 0.35170628,
        0.38254376])]

In [53]:
pred_tests = kfold(df, df_test, verbose=1, mode="cls")
pred_test = np.mean(pred_tests, 0)


Validating on season 2017
	 -> Scored 0.547

Validating on season 2018
	 -> Scored 0.593

Validating on season 2019
	 -> Scored 0.509

Validating on season 2021
	 -> Scored 0.614

 Local CV is 0.566


In [54]:
submission = df_test[['ID', 'Season', 'Pred', 'TeamIdA', 'TeamIdB', 'SeedA', 'SeedB']].copy()
submission['Pred'] = pred_test


In [55]:
df_teams = pd.read_csv("data/2022_Stage1/MTeams.csv")
submission = submission.merge(df_teams, left_on="TeamIdA", right_on="TeamID").drop('TeamID', axis=1).rename(columns={"TeamName": "TeamA"})
submission = submission.merge(df_teams, left_on="TeamIdB", right_on="TeamID").drop('TeamID', axis=1).rename(columns={"TeamName": "TeamB"})


In [56]:
df_seeds['Seed'] = df_seeds['Seed'].apply(lambda x:x[0])

submission = submission.merge(df_seeds, left_on=["TeamIdA", "Season"], right_on=["TeamID", "Season"]).drop('TeamID', axis=1).rename(columns={"Seed": "RegionA"})
submission = submission.merge(df_seeds, left_on=["TeamIdB", "Season"], right_on=["TeamID", "Season"]).drop('TeamID', axis=1).rename(columns={"Seed": "RegionB"})
submission

Unnamed: 0,ID,Season,Pred,TeamIdA,TeamIdB,SeedA,SeedB,TeamA,FirstD1Season_x,LastD1Season_x,TeamB,FirstD1Season_y,LastD1Season_y,RegionA,RegionB
0,2016_1112_1114,2016,0.702750,1112,1114,6,12,Arizona,1985,2022,Ark Little Rock,1985,2022,Y,X
1,2016_1112_1122,2016,0.890836,1112,1122,6,16,Arizona,1985,2022,Austin Peay,1985,2022,Y,Y
2,2016_1114_1122,2016,0.775771,1114,1122,12,16,Ark Little Rock,1985,2022,Austin Peay,1985,2022,X,Y
3,2016_1112_1124,2016,0.576430,1112,1124,6,5,Arizona,1985,2022,Baylor,1985,2022,Y,Z
4,2016_1114_1124,2016,0.365190,1114,1124,12,5,Ark Little Rock,1985,2022,Baylor,1985,2022,X,Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11385,2018_1438_1460,2018,0.884500,1438,1460,1,14,Virginia,1985,2022,Wright St,1988,2022,Y,Y
11386,2018_1439_1460,2018,0.700369,1439,1460,8,14,Virginia Tech,1985,2022,Wright St,1988,2022,W,Y
11387,2018_1452_1460,2018,0.797535,1452,1460,5,14,West Virginia,1985,2022,Wright St,1988,2022,W,Y
11388,2018_1455_1460,2018,0.812199,1455,1460,4,14,Wichita St,1985,2022,Wright St,1988,2022,W,Y


In [57]:
submission.to_csv("full_results_firstmodel.csv")

In [58]:
final_submission = submission[['ID', 'Pred']].copy()

In [59]:
final_submission.to_csv("results_firstmodel.csv")