In [1]:
import os
import re
import sklearn
import numpy as np 
import pandas as pd
from collections import Counter
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Data Preparation

### Team ID

In [2]:
team_ids = pd.read_csv("../data/2022_Stage1/MTeams.csv")
team_ids = team_ids[["TeamID", "TeamName"]]
team_ids

Unnamed: 0,TeamID,TeamName
0,1101,Abilene Chr
1,1102,Air Force
2,1103,Akron
3,1104,Alabama
4,1105,Alabama A&M
...,...,...
367,1468,Bellarmine
368,1469,Dixie St
369,1470,Tarleton St
370,1471,UC San Diego


### Seeds

In [3]:
df_seeds = pd.read_csv("../data/2022_Stage1/MNCAATourneySeeds.csv")
df_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


### Regular Season Results

In [4]:
df_season_results = pd.read_csv("../data/2022_Stage1/MRegularSeasonCompactResults.csv")
df_season_results.drop(['WLoc', 'NumOT'], axis=1, inplace=True)

In [5]:
df_season_results['ScoreDiff'] = df_season_results['WScore'] - df_season_results['LScore']
df_season_results = df_season_results[df_season_results['Season'] >= 2002].reset_index(drop=True)

In [6]:
df_season_results

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,ScoreDiff
0,2002,10,1112,71,1268,67,4
1,2002,10,1196,72,1396,64,8
2,2002,11,1112,75,1196,71,4
3,2002,11,1268,82,1396,74,8
4,2002,14,1116,64,1263,47,17
...,...,...,...,...,...,...,...
104973,2022,98,1400,79,1242,76,3
104974,2022,98,1411,66,1126,63,3
104975,2022,98,1422,68,1441,49,19
104976,2022,98,1438,69,1181,68,1


### Tournamet Results

In [7]:
df_tourney_results = pd.read_csv("../data/2022_Stage1/MNCAATourneyCompactResults.csv")
df_tourney_results.drop(['NumOT', 'WLoc'], axis=1, inplace=True)
#Remove results before 2002 since Pomeroy ratings don't start until 2002
df_tourney_results = df_tourney_results[df_tourney_results['Season'] >= 2002].reset_index(drop=True)
df_tourney_results

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore
0,2002,134,1373,81,1108,77
1,2002,136,1104,86,1194,78
2,2002,136,1112,86,1364,81
3,2002,136,1181,84,1457,37
4,2002,136,1231,75,1428,56
...,...,...,...,...,...,...
1240,2021,148,1211,85,1425,66
1241,2021,148,1417,51,1276,49
1242,2021,152,1124,78,1222,59
1243,2021,152,1211,93,1417,90


### Pomeroy Ratings

In [8]:
df_pomeroy = pd.read_csv("../data/pomeroy_data.csv")
df_pomeroy = df_pomeroy.drop(['Unnamed: 0', "W-L", "Conf", "Seed"], axis=1)
team_ids_pomeroy = team_ids.copy()
df_pomeroy_nan = df_pomeroy.merge(team_ids_pomeroy, on=['TeamName'], how='outer')

In [9]:
teamId_nan = df_pomeroy_nan[df_pomeroy_nan['TeamID'].isna()]
nan_teams_pomeroy = teamId_nan["TeamName"].unique()
nan_teams_pomeroy = sorted(nan_teams_pomeroy)

In [10]:
indexes_to_change_pomeroy = [0,5,6,7,9,10,12,13,14,25,28,30,66,67,68,47,40,48,55,56,57,60,63,74,89,83,85,91,93,94,98,100,102,103,108,352,135,126,128,131,134,137,139,142,144,152,317,159,169,176,191,353,179,189,182,183,185,186,187,190,192,200,207,212,198,199,194,193,196,221,225,228,232,235,239,240,285,286,287,288,257,260,265,253,254,267,279,255,281,282,283,284,271,293,310,326,328,349,350,342,343,354,359]
for i in range(len(indexes_to_change_pomeroy)):
    team_ids_pomeroy.at[indexes_to_change_pomeroy[i],"TeamName"] = nan_teams_pomeroy[i]

In [11]:
df_pomeroy = df_pomeroy.merge(team_ids_pomeroy, on=['TeamName'], how='right')
df_pomeroy = df_pomeroy.dropna()

In [12]:
df_pomeroy["Year"] = df_pomeroy["Year"].astype(int)
df_pomeroy["AdjO Rank"] = df_pomeroy["AdjO Rank"].astype(int)
df_pomeroy["AdjD Rank"] = df_pomeroy["AdjD Rank"].astype(int)
df_pomeroy["AdjT Rank"] = df_pomeroy["AdjT Rank"].astype(int)
df_pomeroy["Luck Rank"] = df_pomeroy["Luck Rank"].astype(int)
df_pomeroy["AdjEM Rank"] = df_pomeroy["AdjEM Rank"].astype(int)
df_pomeroy["OppO Rank"] = df_pomeroy["OppO Rank"].astype(int)
df_pomeroy["OppD Rank"] = df_pomeroy["OppD Rank"].astype(int)
df_pomeroy["AdjEM Rank.1"] = df_pomeroy["AdjEM Rank.1"].astype(int)
df_pomeroy = df_pomeroy.rename(columns={"Year": "Season",
                                        "AdjO": "AdjO Eff",
                                        "AdjO Rank": "AdjO Eff Rank",
                                        "AdjD": "AdjD Eff",
                                        "AdjD Rank": "AdjD Eff Rank",
                                        "AdjT": "AdjTempo",
                                        "AdjT Rank": "AdjTempo Rank",
                                        "AdjEM.1": "Opp AdjEM",
                                        "AdjEM Rank": "Opp AdjEM Rank",
                                        "AdjEM.2": "NCSOS AdjEM",
                                        "AdjEM Rank.1": "NCSOS AdjEM Rank"})

In [13]:
df_pomeroy

Unnamed: 0,Rank,TeamName,AdjEM,AdjO Eff,AdjO Eff Rank,AdjD Eff,AdjD Eff Rank,AdjTempo,AdjTempo Rank,Luck,...,Opp AdjEM,Opp AdjEM Rank,OppO,OppO Rank,OppD,OppD Rank,NCSOS AdjEM,NCSOS AdjEM Rank,Season,TeamID
0,152.0,Abilene Christian,1.35,103.4,196,102.0,121,66.1,256,0.123,...,-8.67,341,100.0,326,108.7,336,-4.75,295,2019,1101
1,84.0,Abilene Christian,10.11,101.7,184,91.6,26,69.3,115,0.009,...,-5.43,298,99.4,287,104.8,300,4.93,137,2021,1101
2,36.0,Air Force,15.42,111.4,47,96.0,43,51.9,326,0.025,...,0.58,143,103.3,170,102.8,138,-10.49,318,2004,1102
3,40.0,Air Force,14.87,113.2,21,98.3,89,54.7,332,-0.001,...,0.57,147,102.8,176,102.2,137,-6.63,300,2006,1102
4,20.0,Air Force,20.20,116.5,20,96.3,45,57.5,332,-0.021,...,3.24,99,106.5,95,103.2,117,-1.91,200,2007,1102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1877,15.0,Xavier,21.51,120.1,8,98.6,57,70.5,62,0.091,...,9.35,35,109.9,37,100.6,29,1.84,108,2018,1462
1878,65.0,Xavier,10.60,111.1,56,100.5,102,65.2,302,-0.004,...,7.48,55,108.7,45,101.2,61,1.04,124,2019,1462
1879,47.0,Yale,14.33,110.4,88,96.0,34,66.0,275,0.026,...,-0.85,169,105.5,122,106.4,209,-1.00,193,2016,1463
1880,77.0,Yale,8.79,111.6,51,102.8,141,71.0,41,0.041,...,-1.10,167,103.0,224,104.2,140,-2.09,222,2019,1463


### Betting Odds Data

In [14]:
betting_odds_df = pd.read_csv("../data/betting_data/betting_data.csv")
betting_odds_df = betting_odds_df.drop(['Unnamed: 0'], axis=1)
betting_odds_df["Championship Win Probability"] = 100 / (betting_odds_df["Round 1 Odds"].str[1:].astype(int) + 100)
betting_odds_df = betting_odds_df.rename(columns={"Team": "TeamName", "Year": "Season", "Round 1 Odds": "Championship Moneyline Odds Before Round 1"})
betting_odds_df["TeamName"] = betting_odds_df["TeamName"].str.strip()
betting_odds_df = betting_odds_df[betting_odds_df["TeamName"] != "FIELD"]

In [15]:
team_ids_betting_odds = team_ids.copy()
df_betting_nan = betting_odds_df.merge(team_ids_betting_odds, on=['TeamName'], how='outer')

In [16]:
teamId_nan = df_betting_nan[df_betting_nan['TeamID'].isna()]
nan_teams_betting = teamId_nan["TeamName"].unique()
nan_teams_betting = sorted(nan_teams_betting)

In [17]:
indexes_to_change_betting = [0,5,6,9,10,12,13,26,28,30,30,67,47,315,40,57,55,56,60,63,74,89,83,85,91,93,94,98,100,104,102,108,352,126,131,134,137,139,142,144,152,153,317,159,169,173,174,176,191,353,179,189,183,185,186,187,190,192,322,207,212,194,193,196,221,225,228,178,232,235,234,239,240,257,260,262,253,254,267,279,255,285,287,288,271,271,293,293,326,310,306,168,320,325,328,349,350,342,343,354,356,359]
for i in range(len(indexes_to_change_betting)):
    team_ids_betting_odds.at[indexes_to_change_betting[i],"TeamName"] = nan_teams_betting[i]

In [18]:
betting_odds_df = betting_odds_df.merge(team_ids_betting_odds, on=['TeamName'], how='right')
betting_odds_df = betting_odds_df.dropna()
betting_odds_df

Unnamed: 0,TeamName,Championship Moneyline Odds Before Round 1,Season,Championship Win Probability,TeamID
0,Abilene Christian,+500000,2019.0,0.000200,1101
1,Abilene Christian,+100000,2021.0,0.000999,1101
2,Air Force,+75000,2004.0,0.001332,1102
3,Air Force,+50000,2006.0,0.001996,1102
4,Akron,+250000,2009.0,0.000400,1103
...,...,...,...,...,...
1336,Xavier,+52000,2017.0,0.001919,1462
1337,Xavier,+2450,2018.0,0.039216,1462
1338,Yale,+100000,2016.0,0.000999,1463
1339,Yale,+150000,2019.0,0.000666,1463


## Game Result Engineering

For each team for each season, we need to compute:

<li>Number of wins</li>
<li>Number of losses</li>
<li>Average margin of victory</li>
<li>Average margin of losses</li>

In order to calculate the following features: 
<li>Win Percentage</li>
<li>Average margin of victory/loss</li>

In [19]:
win_count = df_season_results.groupby(['Season', 'WTeamID']).count()
win_count = win_count.reset_index()[['Season', 'WTeamID', 'DayNum']].rename(columns={"DayNum": "WinCount", "WTeamID": "TeamID"})
win_count

Unnamed: 0,Season,TeamID,WinCount
0,2002,1102,9
1,2002,1103,9
2,2002,1104,26
3,2002,1105,15
4,2002,1106,18
...,...,...,...
7199,2022,1468,9
7200,2022,1469,7
7201,2022,1470,8
7202,2022,1471,8


In [20]:
loss_count = df_season_results.groupby(['Season', 'LTeamID']).count()
loss_count = loss_count.reset_index()[['Season', 'LTeamID', 'DayNum']].rename(columns={"DayNum": "LossCount", "LTeamID": "TeamID"})
loss_count

Unnamed: 0,Season,TeamID,LossCount
0,2002,1102,19
1,2002,1103,21
2,2002,1104,7
3,2002,1105,10
4,2002,1106,11
...,...,...,...
7205,2022,1468,11
7206,2022,1469,12
7207,2022,1470,13
7208,2022,1471,12


In [21]:
win_margin = df_season_results.groupby(['Season', 'WTeamID']).mean().reset_index()
win_margin = win_margin[['Season', 'WTeamID', 'ScoreDiff']].rename(columns={"ScoreDiff": "AverageWinMargin", "WTeamID": "TeamID"})
win_margin

Unnamed: 0,Season,TeamID,AverageWinMargin
0,2002,1102,6.888889
1,2002,1103,8.111111
2,2002,1104,15.923077
3,2002,1105,15.266667
4,2002,1106,9.222222
...,...,...,...
7199,2022,1468,10.666667
7200,2022,1469,11.428571
7201,2022,1470,11.125000
7202,2022,1471,10.125000


In [22]:
loss_margin = df_season_results.groupby(['Season', 'LTeamID']).mean().reset_index()
loss_margin = loss_margin[['Season', 'LTeamID', 'ScoreDiff']].rename(columns={"ScoreDiff": "AverageLossMargin", "LTeamID": "TeamID"})
loss_margin

Unnamed: 0,Season,TeamID,AverageLossMargin
0,2002,1102,9.421053
1,2002,1103,10.904762
2,2002,1104,12.142857
3,2002,1105,11.700000
4,2002,1106,12.727273
...,...,...,...
7205,2022,1468,16.545455
7206,2022,1469,19.333333
7207,2022,1470,11.769231
7208,2022,1471,13.416667


In [23]:
df_features_season_wins = df_season_results.groupby(['Season', 'WTeamID']).count().reset_index()[['Season', 'WTeamID']].rename(columns={"WTeamID": "TeamID"})
df_features_season_wins

Unnamed: 0,Season,TeamID
0,2002,1102
1,2002,1103
2,2002,1104
3,2002,1105
4,2002,1106
...,...,...
7199,2022,1468
7200,2022,1469
7201,2022,1470
7202,2022,1471


In [24]:
df_features_season_losses = df_season_results.groupby(['Season', 'LTeamID']).count().reset_index()[['Season', 'LTeamID']].rename(columns={"LTeamID": "TeamID"})
df_features_season_losses

Unnamed: 0,Season,TeamID
0,2002,1102
1,2002,1103
2,2002,1104
3,2002,1105
4,2002,1106
...,...,...
7205,2022,1468
7206,2022,1469
7207,2022,1470
7208,2022,1471


In [25]:
df_features_season = pd.concat([df_features_season_wins, df_features_season_losses], axis=0).drop_duplicates().sort_values(['Season', 'TeamID']).reset_index(drop=True)

In [26]:
#Join all of the dataframes into one dataframe
df_features_season = df_features_season.merge(win_count, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(loss_count, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(win_margin, on=['Season', 'TeamID'], how='left')
df_features_season = df_features_season.merge(loss_margin, on=['Season', 'TeamID'], how='left')
df_features_season.fillna(0, inplace=True)  
df_features_season

Unnamed: 0,Season,TeamID,WinCount,LossCount,AverageWinMargin,AverageLossMargin
0,2002,1102,9.0,19.0,6.888889,9.421053
1,2002,1103,9.0,21.0,8.111111,10.904762
2,2002,1104,26.0,7.0,15.923077,12.142857
3,2002,1105,15.0,10.0,15.266667,11.700000
4,2002,1106,18.0,11.0,9.222222,12.727273
...,...,...,...,...,...,...
7208,2022,1468,9.0,11.0,10.666667,16.545455
7209,2022,1469,7.0,12.0,11.428571,19.333333
7210,2022,1470,8.0,13.0,11.125000,11.769231
7211,2022,1471,8.0,12.0,10.125000,13.416667


In [27]:
#Calculate win percentage from win and loss count
df_features_season['WinPercentage'] = df_features_season['WinCount'] / (df_features_season['WinCount'] + df_features_season['LossCount'])

In [28]:
#Calculate average margin of victory/defeat 
df_features_season['GapAvg'] = (
    (df_features_season['WinCount'] * df_features_season['AverageWinMargin'] - 
    df_features_season['LossCount'] * df_features_season['AverageLossMargin'])
    / (df_features_season['WinCount'] + df_features_season['LossCount'])
)

In [29]:
#Drop values we dont need after calculating win percentage and average margin of victory/defeat
df_features_season.drop(['WinCount', 'LossCount', 'AverageWinMargin', 'AverageLossMargin'], axis=1, inplace=True)

## Feature Engineering

### Training Data

In [30]:
df = df_tourney_results.copy()
df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore
0,2002,134,1373,81,1108,77
1,2002,136,1104,86,1194,78
2,2002,136,1112,86,1364,81
3,2002,136,1181,84,1457,37
4,2002,136,1231,75,1428,56
...,...,...,...,...,...,...
1240,2021,148,1211,85,1425,66
1241,2021,148,1417,51,1276,49
1242,2021,152,1124,78,1222,59
1243,2021,152,1211,93,1417,90


In [31]:
#Add SeedW column
df = pd.merge(
    df, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'WTeamID'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedW'})


In [32]:
#Add SeedL column
df = pd.merge(
    df, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'LTeamID'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedL'})


In [33]:
#Remove region and play in tournament marker from seed (Convert seed into ints)
def seed_string_to_int(seed):
    return int(re.sub("[^0-9]", "", seed))

In [34]:
df['SeedW'] = df['SeedW'].apply(seed_string_to_int)
df['SeedL'] = df['SeedL'].apply(seed_string_to_int)

In [35]:
df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL
0,2002,134,1373,81,1108,77,16,16
1,2002,136,1104,86,1194,78,2,15
2,2002,136,1112,86,1364,81,3,14
3,2002,136,1181,84,1457,37,1,16
4,2002,136,1231,75,1428,56,5,12
...,...,...,...,...,...,...,...,...
1240,2021,148,1211,85,1425,66,1,6
1241,2021,148,1417,51,1276,49,11,1
1242,2021,152,1124,78,1222,59,1,2
1243,2021,152,1211,93,1417,90,1,11


In [36]:
#Add WinPercentageW and GapAvgW columns
df = pd.merge(
    df,
    df_features_season,
    how='left',
    left_on=['Season', 'WTeamID'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'WinCount': 'WinCountW',
    'LossCount': 'LossCountW',
    'AverageWinMargin': 'AverageWinMarginW',
    'AverageLossMargin': 'AverageLossMarginW',
    'WinPercentage': 'WinPercentageW',
    'GapAvg': 'GapAvgW',
}).drop(columns='TeamID', axis=1)

In [37]:
#Add WinPercentageL and GapAvgL columns
df = pd.merge(
    df,
    df_features_season,
    how='left',
    left_on=['Season', 'LTeamID'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'WinCount': 'WinCountL',
    'LossCount': 'LossCountL',
    'AverageWinMargin': 'AverageWinMarginL',
    'AverageLossMargin': 'AverageLossMarginL',
    'WinPercentage': 'WinPercentageL',
    'GapAvg': 'GapAvgL',
}).drop(columns='TeamID', axis=1)

In [38]:
df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL,WinPercentageW,GapAvgW,WinPercentageL,GapAvgL
0,2002,134,1373,81,1108,77,16,16,0.470588,1.352941,0.689655,3.137931
1,2002,136,1104,86,1194,78,2,15,0.787879,9.969697,0.642857,1.714286
2,2002,136,1112,86,1364,81,3,14,0.709677,5.129032,0.655172,5.965517
3,2002,136,1181,84,1457,37,1,16,0.906250,19.937500,0.576923,3.538462
4,2002,136,1231,75,1428,56,5,12,0.633333,7.266667,0.703704,6.407407
...,...,...,...,...,...,...,...,...,...,...,...,...
1240,2021,148,1211,85,1425,66,1,6,1.000000,23.000000,0.758621,9.655172
1241,2021,148,1417,51,1276,49,11,1,0.653846,4.346154,0.833333,10.875000
1242,2021,152,1124,78,1222,59,1,2,0.916667,17.958333,0.884615,18.000000
1243,2021,152,1211,93,1417,90,1,11,1.000000,23.000000,0.653846,4.346154


In [39]:
#Add PomeroyW columns
df = pd.merge(
    df,
    df_pomeroy,
    how='left',
    left_on=['Season', 'WTeamID'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={"Rank": "RankW",
                                         "AdjEM": "AdjEMW",
                                         "AdjO Eff": "AdjO EffW",
                                         "AdjO Eff Rank": "AdjO Eff RankW",
                                         "AdjD Eff": "AdjD EffW",
                                         "AdjD Eff Rank": "AdjD Eff RankW",
                                         "AdjTempo": "AdjTempoW",
                                         "AdjTempo Rank": "AdjTempo RankW",
                                         "Luck": "LuckW",
                                         "Luck Rank": "Luck RankW",
                                         "Opp AdjEM": "Opp AdjEMW",
                                         "Opp AdjEM Rank": "Opp AdjEM RankW",
                                         "OppO": "OppOW",
                                         "OppO Rank": "OppO RankW",
                                         "OppD": "OppDW",
                                         "OppD Rank": "OppD RankW",
                                         "NCSOS AdjEM": "NCSOS AdjEMW",
                                         "NCSOS AdjEM Rank": "NCSOS AdjEM RankW"})

In [40]:
#Add PomeroyL columns
df = pd.merge(
    df, 
    df_pomeroy, 
    how='left', 
    left_on=['Season', 'LTeamID'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={"Rank": "RankL",
                                         "AdjEM": "AdjEML",
                                         "AdjO Eff": "AdjO EffL",
                                         "AdjO Eff Rank": "AdjO Eff RankL",
                                         "AdjD Eff": "AdjD EffL",
                                         "AdjD Eff Rank": "AdjD Eff RankL",
                                         "AdjTempo": "AdjTempoL",
                                         "AdjTempo Rank": "AdjTempo RankL",
                                         "Luck": "LuckL",
                                         "Luck Rank": "Luck RankL",
                                         "Opp AdjEM": "Opp AdjEML",
                                         "Opp AdjEM Rank": "Opp AdjEM RankL",
                                         "OppO": "OppOL",
                                         "OppO Rank": "OppO RankL",
                                         "OppD": "OppDL",
                                         "OppD Rank": "OppD RankL",
                                         "NCSOS AdjEM": "NCSOS AdjEML",
                                         "NCSOS AdjEM Rank": "NCSOS AdjEM RankL"})

In [41]:
df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL,WinPercentageW,GapAvgW,...,LuckL,Luck RankL,Opp AdjEML,Opp AdjEM RankL,OppOL,OppO RankL,OppDL,OppD RankL,NCSOS AdjEML,NCSOS AdjEM RankL
0,2002,134,1373,81,1108,77,16,16,0.470588,1.352941,...,0.097,8,-14.53,325,95.0,325,109.6,325,2.59,101
1,2002,136,1104,86,1194,78,2,15,0.787879,9.969697,...,0.060,38,-4.94,247,101.9,213,106.9,289,-4.52,267
2,2002,136,1112,86,1364,81,3,14,0.709677,5.129032,...,-0.058,284,0.45,146,103.1,175,102.6,134,3.37,82
3,2002,136,1181,84,1457,37,1,16,0.906250,19.937500,...,0.038,79,-10.73,311,97.0,313,107.7,304,-0.98,188
4,2002,136,1231,75,1428,56,5,12,0.633333,7.266667,...,0.010,140,5.38,73,106.2,74,100.8,75,1.57,123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1240,2021,148,1211,85,1425,66,1,6,1.000000,23.000000,...,0.001,171,16.27,21,111.1,19,94.8,28,6.28,111
1241,2021,148,1417,51,1276,49,11,1,0.653846,4.346154,...,0.007,156,16.44,19,111.7,9,95.3,39,2.51,178
1242,2021,152,1124,78,1222,59,1,2,0.916667,17.958333,...,0.018,133,9.21,90,105.7,105,96.5,67,2.11,187
1243,2021,152,1211,93,1417,90,1,11,1.000000,23.000000,...,0.028,110,17.38,11,111.3,14,93.9,6,9.87,66


In [42]:
#Add Betting Odds W column
df = pd.merge(
    df, 
    betting_odds_df, 
    how='left', 
    left_on=['Season', 'WTeamID'], 
    right_on=['Season', 'TeamID']
).drop(["TeamName", "Championship Moneyline Odds Before Round 1", 'TeamID'], axis=1).rename(columns={"Championship Win Probability": "Championship Win ProbabilityW"})
df["Championship Win ProbabilityW"] = df["Championship Win ProbabilityW"].fillna(0)

In [43]:
#Add Betting Odds L column
df = pd.merge(
    df, 
    betting_odds_df, 
    how='left', 
    left_on=['Season', 'LTeamID'], 
    right_on=['Season', 'TeamID']
).drop(["TeamName", "Championship Moneyline Odds Before Round 1", 'TeamID'], axis=1).rename(columns={"Championship Win Probability": "Championship Win ProbabilityL"})
df["Championship Win ProbabilityL"] = df["Championship Win ProbabilityL"].fillna(0)

In [44]:
df

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,SeedW,SeedL,WinPercentageW,GapAvgW,...,Opp AdjEML,Opp AdjEM RankL,OppOL,OppO RankL,OppDL,OppD RankL,NCSOS AdjEML,NCSOS AdjEM RankL,Championship Win ProbabilityW,Championship Win ProbabilityL
0,2002,134,1373,81,1108,77,16,16,0.470588,1.352941,...,-14.53,325,95.0,325,109.6,325,2.59,101,0.000200,0.000000
1,2002,136,1104,86,1194,78,2,15,0.787879,9.969697,...,-4.94,247,101.9,213,106.9,289,-4.52,267,0.038462,0.000200
2,2002,136,1112,86,1364,81,3,14,0.709677,5.129032,...,0.45,146,103.1,175,102.6,134,3.37,82,0.062500,0.000999
3,2002,136,1181,84,1457,37,1,16,0.906250,19.937500,...,-10.73,311,97.0,313,107.7,304,-0.98,188,0.285714,0.000000
4,2002,136,1231,75,1428,56,5,12,0.633333,7.266667,...,5.38,73,106.2,74,100.8,75,1.57,123,0.013158,0.009901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1240,2021,148,1211,85,1425,66,1,6,1.000000,23.000000,...,16.27,21,111.1,19,94.8,28,6.28,111,0.333333,0.027778
1241,2021,148,1417,51,1276,49,11,1,0.653846,4.346154,...,16.44,19,111.7,9,95.3,39,2.51,178,0.007937,0.125000
1242,2021,152,1124,78,1222,59,1,2,0.916667,17.958333,...,9.21,90,105.7,105,96.5,67,2.11,187,0.166667,0.058824
1243,2021,152,1211,93,1417,90,1,11,1.000000,23.000000,...,17.38,11,111.3,14,93.9,6,9.87,66,0.333333,0.007937


In [45]:
def concact_win_loss_df(df):
    #Convert win team to team A, loss team to team B 
    win_df = df.copy()
    rename_win = {
        "WTeamID": "TeamIdA", 
        "WScore" : "ScoreA", 
        "LTeamID" : "TeamIdB",
        "LScore": "ScoreB",
     }
    
    rename_win.update({col : col[:-1] + "A" for col in df.columns if col.endswith('W')})
    rename_win.update({col : col[:-1] + "B" for col in df.columns if col.endswith('L')})
    
    win_df = win_df.rename(columns=rename_win)
    
    loss_df = df.copy()

    rename_loss = {
        "WTeamID": "TeamIdB", 
        "WScore" : "ScoreB", 
        "LTeamID" : "TeamIdA",
        "LScore": "ScoreA",
    }

    rename_loss.update({col : col[:-1] + "B" for col in df.columns if col.endswith('W')})
    rename_loss.update({col : col[:-1] + "A" for col in df.columns if col.endswith('L')})
    
    loss_df = loss_df.rename(columns=rename_loss)

    
    return pd.concat([win_df, loss_df], axis=0, sort=False)

In [46]:
df = concact_win_loss_df(df)
df

Unnamed: 0,Season,DayNum,TeamIdA,ScoreA,TeamIdB,ScoreB,SeedA,SeedB,WinPercentageA,GapAvgA,...,Opp AdjEMB,Opp AdjEM RankB,OppOB,OppO RankB,OppDB,OppD RankB,NCSOS AdjEMB,NCSOS AdjEM RankB,Championship Win ProbabilityA,Championship Win ProbabilityB
0,2002,134,1373,81,1108,77,16,16,0.470588,1.352941,...,-14.53,325,95.0,325,109.6,325,2.59,101,0.000200,0.000000
1,2002,136,1104,86,1194,78,2,15,0.787879,9.969697,...,-4.94,247,101.9,213,106.9,289,-4.52,267,0.038462,0.000200
2,2002,136,1112,86,1364,81,3,14,0.709677,5.129032,...,0.45,146,103.1,175,102.6,134,3.37,82,0.062500,0.000999
3,2002,136,1181,84,1457,37,1,16,0.906250,19.937500,...,-10.73,311,97.0,313,107.7,304,-0.98,188,0.285714,0.000000
4,2002,136,1231,75,1428,56,5,12,0.633333,7.266667,...,5.38,73,106.2,74,100.8,75,1.57,123,0.013158,0.009901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1240,2021,148,1425,66,1211,85,6,1,0.758621,9.655172,...,11.35,72,108.1,70,96.8,74,6.21,112,0.027778,0.333333
1241,2021,148,1276,49,1417,51,1,11,0.833333,10.875000,...,17.38,11,111.3,14,93.9,6,9.87,66,0.125000,0.007937
1242,2021,152,1222,59,1124,78,2,1,0.884615,18.000000,...,13.25,54,108.7,64,95.5,44,-2.83,258,0.058824,0.166667
1243,2021,152,1417,90,1211,93,11,1,0.653846,4.346154,...,11.35,72,108.1,70,96.8,74,6.21,112,0.007937,0.333333


#### Feature Differences

In [47]:
df['ScoreDiff'] = df['ScoreA'] - df['ScoreB']
df['WinA'] = (df['ScoreDiff'] > 0).astype(int)

In [48]:
diff_cols = ['Seed',
             'WinPercentage',
             'GapAvg',
             "Rank",
             "AdjEM",
                                         "AdjO Eff",
                                         "AdjO Eff Rank",
                                         "AdjD Eff",
                                         "AdjD Eff Rank",
                                         "AdjTempo",
                                         "AdjTempo Rank",
                                         "Luck",
                                         "Luck Rank",
                                         "Opp AdjEM",
                                         "Opp AdjEM Rank",
                                         "OppO",
                                         "OppO Rank",
                                         "OppD",
                                         "OppD Rank",
                                         "NCSOS AdjEM",
                                         "NCSOS AdjEM Rank",
                                         "Championship Win Probability"]
#Compute difference between team A and B for each feature
for col in diff_cols:
    df[col + 'Diff'] = df[col + 'A'] - df[col + 'B']

In [49]:
df

Unnamed: 0,Season,DayNum,TeamIdA,ScoreA,TeamIdB,ScoreB,SeedA,SeedB,WinPercentageA,GapAvgA,...,Luck RankDiff,Opp AdjEMDiff,Opp AdjEM RankDiff,OppODiff,OppO RankDiff,OppDDiff,OppD RankDiff,NCSOS AdjEMDiff,NCSOS AdjEM RankDiff,Championship Win ProbabilityDiff
0,2002,134,1373,81,1108,77,16,16,0.470588,1.352941,...,288,10.39,-88,5.1,-79,-5.4,-126,-3.01,66,0.000200
1,2002,136,1104,86,1194,78,2,15,0.787879,9.969697,...,-17,13.53,-218,5.2,-167,-8.4,-274,2.96,-61,0.038262
2,2002,136,1112,86,1364,81,3,14,0.709677,5.129032,...,-269,13.77,-145,8.2,-174,-5.5,-131,14.19,-81,0.061501
3,2002,136,1181,84,1457,37,1,16,0.906250,19.937500,...,144,20.60,-293,12.1,-300,-8.5,-273,7.64,-154,0.285714
4,2002,136,1231,75,1428,56,5,12,0.633333,7.266667,...,130,8.73,-71,4.6,-72,-4.1,-73,11.96,-119,0.003257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1240,2021,148,1425,66,1211,85,6,1,0.758621,9.655172,...,44,4.92,-51,3.0,-51,-2.0,-46,0.07,-1,-0.305556
1241,2021,148,1276,49,1417,51,1,11,0.833333,10.875000,...,46,-0.94,8,0.4,-5,1.4,33,-7.36,112,0.117063
1242,2021,152,1222,59,1124,78,2,1,0.884615,18.000000,...,53,-4.04,36,-3.0,41,1.0,23,4.94,-71,-0.107843
1243,2021,152,1417,90,1211,93,11,1,0.653846,4.346154,...,-17,6.03,-61,3.2,-56,-2.9,-68,3.66,-46,-0.325397


### Test Data

In [50]:
df_test = pd.read_csv("../data/2022_Stage1/MSampleSubmissionStage1.csv")
df_test

Unnamed: 0,ID,Pred
0,2016_1112_1114,0.5
1,2016_1112_1122,0.5
2,2016_1112_1124,0.5
3,2016_1112_1138,0.5
4,2016_1112_1139,0.5
...,...,...
11385,2021_1452_1457,0.5
11386,2021_1452_1458,0.5
11387,2021_1455_1457,0.5
11388,2021_1455_1458,0.5


In [51]:
df_test['Season'] = df_test['ID'].apply(lambda x: int(x.split('_')[0]))
df_test['TeamIdA'] = df_test['ID'].apply(lambda x: int(x.split('_')[1]))
df_test['TeamIdB'] = df_test['ID'].apply(lambda x: int(x.split('_')[2]))

In [52]:
df_test

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB
0,2016_1112_1114,0.5,2016,1112,1114
1,2016_1112_1122,0.5,2016,1112,1122
2,2016_1112_1124,0.5,2016,1112,1124
3,2016_1112_1138,0.5,2016,1112,1138
4,2016_1112_1139,0.5,2016,1112,1139
...,...,...,...,...,...
11385,2021_1452_1457,0.5,2021,1452,1457
11386,2021_1452_1458,0.5,2021,1452,1458
11387,2021_1455_1457,0.5,2021,1455,1457
11388,2021_1455_1458,0.5,2021,1455,1458


In [53]:
#Add SeedA column
df_test = pd.merge(
    df_test,
    df_seeds,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedA'})
df_test

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB,SeedA
0,2016_1112_1114,0.5,2016,1112,1114,Y06
1,2016_1112_1122,0.5,2016,1112,1122,Y06
2,2016_1112_1124,0.5,2016,1112,1124,Y06
3,2016_1112_1138,0.5,2016,1112,1138,Y06
4,2016_1112_1139,0.5,2016,1112,1139,Y06
...,...,...,...,...,...,...
11385,2021_1452_1457,0.5,2021,1452,1457,Y03
11386,2021_1452_1458,0.5,2021,1452,1458,Y03
11387,2021_1455_1457,0.5,2021,1455,1457,X11b
11388,2021_1455_1458,0.5,2021,1455,1458,X11b


In [54]:
#Add SeedB column
df_test = pd.merge(
    df_test, 
    df_seeds, 
    how='left', 
    left_on=['Season', 'TeamIdB'], 
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={'Seed': 'SeedB'})
df_test

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB,SeedA,SeedB
0,2016_1112_1114,0.5,2016,1112,1114,Y06,X12
1,2016_1112_1122,0.5,2016,1112,1122,Y06,Y16
2,2016_1112_1124,0.5,2016,1112,1124,Y06,Z05
3,2016_1112_1138,0.5,2016,1112,1138,Y06,Y14
4,2016_1112_1139,0.5,2016,1112,1139,Y06,X09
...,...,...,...,...,...,...,...
11385,2021_1452_1457,0.5,2021,1452,1457,Y03,Z12
11386,2021_1452_1458,0.5,2021,1452,1458,Y03,Z09
11387,2021_1455_1457,0.5,2021,1455,1457,X11b,Z12
11388,2021_1455_1458,0.5,2021,1455,1458,X11b,Z09


In [55]:
#Convert seed strings into ints
df_test['SeedA'] = df_test['SeedA'].apply(seed_string_to_int)
df_test['SeedB'] = df_test['SeedB'].apply(seed_string_to_int)

In [56]:
#Add WinPercentageA and GapAvgA columns
df_test = pd.merge(
    df_test,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'WinCount': 'WinCountA',
    'LossCount': 'LossCountA',
    'AverageWinMargin': 'AverageWinMarginA',
    'AverageLossMargin': 'AverageLossMarginA',
    'WinPercentage': 'WinPercentageA',
    'GapAvg': 'GapAvgA',
}).drop(columns='TeamID', axis=1)
df_test

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB,SeedA,SeedB,WinPercentageA,GapAvgA
0,2016_1112_1114,0.5,2016,1112,1114,6,12,0.757576,12.212121
1,2016_1112_1122,0.5,2016,1112,1122,6,16,0.757576,12.212121
2,2016_1112_1124,0.5,2016,1112,1124,6,5,0.757576,12.212121
3,2016_1112_1138,0.5,2016,1112,1138,6,14,0.757576,12.212121
4,2016_1112_1139,0.5,2016,1112,1139,6,9,0.757576,12.212121
...,...,...,...,...,...,...,...,...,...
11385,2021_1452_1457,0.5,2021,1452,1457,3,12,0.666667,5.185185
11386,2021_1452_1458,0.5,2021,1452,1458,3,9,0.666667,5.185185
11387,2021_1455_1457,0.5,2021,1455,1457,11,12,0.736842,2.631579
11388,2021_1455_1458,0.5,2021,1455,1458,11,9,0.736842,2.631579


In [57]:
#Add WinPercentageB and GapAvgB columns
df_test = pd.merge(
    df_test,
    df_features_season,
    how='left',
    left_on=['Season', 'TeamIdB'],
    right_on=['Season', 'TeamID']
).rename(columns={
    'WinCount': 'WinCountB',
    'LossCount': 'LossCountB',
    'AverageWinMargin': 'AverageWinMarginB',
    'AverageLossMargin': 'AverageLossMarginB',
    'WinPercentage': 'WinPercentageB',
    'GapAvg': 'GapAvgB',
}).drop(columns='TeamID', axis=1)
df_test

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB,SeedA,SeedB,WinPercentageA,GapAvgA,WinPercentageB,GapAvgB
0,2016_1112_1114,0.5,2016,1112,1114,6,12,0.757576,12.212121,0.870968,9.935484
1,2016_1112_1122,0.5,2016,1112,1122,6,16,0.757576,12.212121,0.484848,-2.363636
2,2016_1112_1124,0.5,2016,1112,1124,6,5,0.757576,12.212121,0.656250,6.687500
3,2016_1112_1138,0.5,2016,1112,1138,6,14,0.757576,12.212121,0.575758,0.666667
4,2016_1112_1139,0.5,2016,1112,1139,6,9,0.757576,12.212121,0.677419,9.419355
...,...,...,...,...,...,...,...,...,...,...,...
11385,2021_1452_1457,0.5,2021,1452,1457,3,12,0.666667,5.185185,0.958333,12.750000
11386,2021_1452_1458,0.5,2021,1452,1458,3,9,0.666667,5.185185,0.586207,5.310345
11387,2021_1455_1457,0.5,2021,1455,1457,11,12,0.736842,2.631579,0.958333,12.750000
11388,2021_1455_1458,0.5,2021,1455,1458,11,9,0.736842,2.631579,0.586207,5.310345


In [58]:
#Add PomeroyA columns
df_test = pd.merge(
    df_test,
    df_pomeroy,
    how='left',
    left_on=['Season', 'TeamIdA'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={"Rank": "RankA",
                                         "AdjEM": "AdjEMA",
                                         "AdjO Eff": "AdjO EffA",
                                         "AdjO Eff Rank": "AdjO Eff RankA",
                                         "AdjD Eff": "AdjD EffA",
                                         "AdjD Eff Rank": "AdjD Eff RankA",
                                         "AdjTempo": "AdjTempoA",
                                         "AdjTempo Rank": "AdjTempo RankA",
                                         "Luck": "LuckA",
                                         "Luck Rank": "Luck RankA",
                                         "Opp AdjEM": "Opp AdjEMA",
                                         "Opp AdjEM Rank": "Opp AdjEM RankA",
                                         "OppO": "OppOA",
                                         "OppO Rank": "OppO RankA",
                                         "OppD": "OppDA",
                                         "OppD Rank": "OppD RankA",
                                         "NCSOS AdjEM": "NCSOS AdjEMA",
                                         "NCSOS AdjEM Rank": "NCSOS AdjEM RankA"})
df_test

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB,SeedA,SeedB,WinPercentageA,GapAvgA,WinPercentageB,...,LuckA,Luck RankA,Opp AdjEMA,Opp AdjEM RankA,OppOA,OppO RankA,OppDA,OppD RankA,NCSOS AdjEMA,NCSOS AdjEM RankA
0,2016_1112_1114,0.5,2016,1112,1114,6,12,0.757576,12.212121,0.870968,...,-0.038,272,6.66,63,107.9,67,101.3,54,-3.66,269
1,2016_1112_1122,0.5,2016,1112,1122,6,16,0.757576,12.212121,0.484848,...,-0.038,272,6.66,63,107.9,67,101.3,54,-3.66,269
2,2016_1112_1124,0.5,2016,1112,1124,6,5,0.757576,12.212121,0.656250,...,-0.038,272,6.66,63,107.9,67,101.3,54,-3.66,269
3,2016_1112_1138,0.5,2016,1112,1138,6,14,0.757576,12.212121,0.575758,...,-0.038,272,6.66,63,107.9,67,101.3,54,-3.66,269
4,2016_1112_1139,0.5,2016,1112,1139,6,9,0.757576,12.212121,0.677419,...,-0.038,272,6.66,63,107.9,67,101.3,54,-3.66,269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11385,2021_1452_1457,0.5,2021,1452,1457,3,12,0.666667,5.185185,0.958333,...,-0.056,294,15.18,31,109.6,45,94.4,20,13.77,29
11386,2021_1452_1458,0.5,2021,1452,1458,3,9,0.666667,5.185185,0.586207,...,-0.056,294,15.18,31,109.6,45,94.4,20,13.77,29
11387,2021_1455_1457,0.5,2021,1455,1457,11,12,0.736842,2.631579,0.958333,...,0.089,23,10.47,81,106.4,96,96.0,57,13.71,30
11388,2021_1455_1458,0.5,2021,1455,1458,11,9,0.736842,2.631579,0.586207,...,0.089,23,10.47,81,106.4,96,96.0,57,13.71,30


In [59]:
#Add PomeroyB columns
df_test = pd.merge(
    df_test,
    df_pomeroy,
    how='left',
    left_on=['Season', 'TeamIdB'],
    right_on=['Season', 'TeamID']
).drop('TeamID', axis=1).rename(columns={"Rank": "RankB",
                                         "AdjEM": "AdjEMB",
                                         "AdjO Eff": "AdjO EffB",
                                         "AdjO Eff Rank": "AdjO Eff RankB",
                                         "AdjD Eff": "AdjD EffB",
                                         "AdjD Eff Rank": "AdjD Eff RankB",
                                         "AdjTempo": "AdjTempoB",
                                         "AdjTempo Rank": "AdjTempo RankB",
                                         "Luck": "LuckB",
                                         "Luck Rank": "Luck RankB",
                                         "Opp AdjEM": "Opp AdjEMB",
                                         "Opp AdjEM Rank": "Opp AdjEM RankB",
                                         "OppO": "OppOB",
                                         "OppO Rank": "OppO RankB",
                                         "OppD": "OppDB",
                                         "OppD Rank": "OppD RankB",
                                         "NCSOS AdjEM": "NCSOS AdjEMB",
                                         "NCSOS AdjEM Rank": "NCSOS AdjEM RankB"})

In [60]:
#Add Betting OddsA column
df_test = pd.merge(
    df_test, 
    betting_odds_df, 
    how='left', 
    left_on=['Season', 'TeamIdA'], 
    right_on=['Season', 'TeamID']
).drop(["TeamName", "Championship Moneyline Odds Before Round 1", 'TeamID'], axis=1).rename(columns={"Championship Win Probability": "Championship Win ProbabilityA"})
df_test["Championship Win ProbabilityA"] = df_test["Championship Win ProbabilityA"].fillna(0)

In [61]:
#Add Betting OddsB column
df_test = pd.merge(
    df_test, 
    betting_odds_df, 
    how='left', 
    left_on=['Season', 'TeamIdB'], 
    right_on=['Season', 'TeamID']
).drop(["TeamName", "Championship Moneyline Odds Before Round 1", 'TeamID'], axis=1).rename(columns={"Championship Win Probability": "Championship Win ProbabilityB"})
df_test["Championship Win ProbabilityB"] = df_test["Championship Win ProbabilityB"].fillna(0)

In [62]:
df_test

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB,SeedA,SeedB,WinPercentageA,GapAvgA,WinPercentageB,...,Opp AdjEMB,Opp AdjEM RankB,OppOB,OppO RankB,OppDB,OppD RankB,NCSOS AdjEMB,NCSOS AdjEM RankB,Championship Win ProbabilityA,Championship Win ProbabilityB
0,2016_1112_1114,0.5,2016,1112,1114,6,12,0.757576,12.212121,0.870968,...,-0.77,167,104.2,188,104.9,155,-0.20,171,0.016393,0.000571
1,2016_1112_1122,0.5,2016,1112,1122,6,16,0.757576,12.212121,0.484848,...,-2.92,213,105.1,149,108.0,292,-0.19,170,0.016393,0.000100
2,2016_1112_1124,0.5,2016,1112,1124,6,5,0.757576,12.212121,0.656250,...,11.21,11,109.7,34,98.4,2,-3.12,257,0.016393,0.011628
3,2016_1112_1138,0.5,2016,1112,1138,6,14,0.757576,12.212121,0.575758,...,0.97,131,106.5,95,105.6,177,2.00,107,0.016393,0.000200
4,2016_1112_1139,0.5,2016,1112,1139,6,9,0.757576,12.212121,0.677419,...,8.10,47,108.5,62,100.4,25,-4.10,282,0.016393,0.003623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11385,2021_1452_1457,0.5,2021,1452,1457,3,12,0.666667,5.185185,0.958333,...,-5.62,302,99.6,281,105.2,311,4.18,149,0.047619,0.000000
11386,2021_1452_1458,0.5,2021,1452,1458,3,9,0.666667,5.185185,0.586207,...,18.13,6,112.1,7,93.9,8,1.24,199,0.047619,0.024390
11387,2021_1455_1457,0.5,2021,1455,1457,11,12,0.736842,2.631579,0.958333,...,-5.62,302,99.6,281,105.2,311,4.18,149,0.009901,0.000000
11388,2021_1455_1458,0.5,2021,1455,1458,11,9,0.736842,2.631579,0.586207,...,18.13,6,112.1,7,93.9,8,1.24,199,0.009901,0.024390


In [63]:
#Compute difference between team A and B for each feature
for col in diff_cols:
    df_test[col + 'Diff'] = df_test[col + 'A'] - df_test[col + 'B']

## Modeling

In [64]:
features = ['SeedDiff',
       'WinPercentageDiff', 'GapAvgDiff', 'RankDiff', 'AdjEMDiff',
       'AdjO EffDiff', 'AdjO Eff RankDiff', 'AdjD EffDiff',
       'AdjD Eff RankDiff', 'AdjTempoDiff', 'AdjTempo RankDiff', 'LuckDiff',
       'Luck RankDiff', 'Opp AdjEMDiff', 'Opp AdjEM RankDiff', 'OppODiff',
       'OppO RankDiff', 'OppDDiff', 'OppD RankDiff', 'NCSOS AdjEMDiff',
       'NCSOS AdjEM RankDiff', "Championship Win ProbabilityDiff"]
len(features)

22

In [65]:
#Rescale train, test, and val data w/ features
def rescale(features, df_train, df_val, df_test=None):
    min_value = df_train[features].min()
    max_value = df_train[features].max()
    
    df_train[features] = (df_train[features] - min_value) / (max_value - min_value)
    df_val[features] = (df_val[features] - min_value) / (max_value - min_value)
    
    if df_test is not None:
        df_test[features] = (df_test[features] - min_value) / (max_value - min_value)
        
    return df_test, df_val, df_train

In [66]:
#Cross-validates data using k-fold method
def kfold_validation(df, df_test=None, verbose=0, mode="reg"):
    seasons = df['Season'].unique()
    log_loss_scores = []
    pred_win_probabilities = []
    target_y = "ScoreDiff" if mode == "reg" else "WinA"
    
    for season in seasons[1:]:        
        #Training data uses the previous season
        df_train = df[df['Season'] < season].reset_index(drop=True).copy()
        
        #Validation data uses the current season
        df_val = df[df['Season'] == season].reset_index(drop=True).copy()
        df_test = df_test.copy()
        
        df_test, df_val, df_train = rescale(features, df_train, df_val, df_test)
        
        if mode == "reg":
            model = LinearRegression()
        elif mode == "cls":
            model = LogisticRegression()

        model.fit(df_train[features], df_train[target_y])
        
        if mode == "reg":
            pred = model.predict(df_val[features])
            pred = (pred - pred.min()) / (pred.max() - pred.min())
        elif mode == "cls":
            pred = model.predict_proba(df_val[features])[:, 1]
        
        if df_test is not None:
            if mode == "reg":
                pred_win_probability = model.predict(df_test[features])
                pred_win_probability = (pred_win_probability - pred_win_probability.min()) / (pred_win_probability.max() - pred_win_probability.min())
            elif mode == "cls":
                pred_win_probability = model.predict_proba(df_test[features])[:, 1]
                
            pred_win_probabilities.append(pred_win_probability)
        
        log_loss_score = log_loss(df_val['WinA'].values, pred)
        log_loss_scores.append(log_loss_score)
        
        #Log loss score for the current season, prints if verbose value truthy
        if verbose:
            print(f'Log loss score for season {season}: {round(log_loss_score, 3)}')
    #Prints summary results if verbose value truthy
    if verbose:
        print(f'Average log loss score: {np.mean(log_loss_scores):.3f}')

    return pred_win_probabilities

In [67]:
kfold_validation(df, df_test, verbose=1, mode="reg")

Log loss score for season 2003: 0.545
Log loss score for season 2004: 0.561
Log loss score for season 2005: 0.532
Log loss score for season 2006: 0.505
Log loss score for season 2007: 0.505
Log loss score for season 2008: 0.504
Log loss score for season 2009: 0.476
Log loss score for season 2010: 0.545
Log loss score for season 2011: 0.568
Log loss score for season 2012: 0.55
Log loss score for season 2013: 0.518
Log loss score for season 2014: 0.528
Log loss score for season 2015: 0.507
Log loss score for season 2016: 0.521
Log loss score for season 2017: 0.509
Log loss score for season 2018: 0.534
Log loss score for season 2019: 0.46
Log loss score for season 2021: 0.516
Average log loss score: 0.521


[array([0.48234188, 0.54134693, 0.44235622, ..., 0.55314024, 0.42071646,
        0.38206959]),
 array([0.76931588, 0.62872656, 0.53037421, ..., 0.63400233, 0.24883075,
        0.16854765]),
 array([0.75109833, 0.48075957, 0.49517579, ..., 0.61581862, 0.21625158,
        0.13200574]),
 array([0.75921819, 0.66137939, 0.57882852, ..., 0.52317683, 0.26613617,
        0.30370247]),
 array([0.74620529, 0.68603786, 0.58562362, ..., 0.49432866, 0.27376671,
        0.33679851]),
 array([0.74144045, 0.73908617, 0.60757424, ..., 0.46788875, 0.29266547,
        0.38679429]),
 array([0.7409723 , 0.74362151, 0.60943162, ..., 0.46553524, 0.29427471,
        0.39112397]),
 array([0.74048578, 0.74394099, 0.609419  , ..., 0.46467448, 0.2943396 ,
        0.39184244]),
 array([0.74051788, 0.74834403, 0.61137609, ..., 0.46313526, 0.2959539 ,
        0.39560219]),
 array([0.74074107, 0.74884246, 0.61166708, ..., 0.46329744, 0.29616011,
        0.39582765]),
 array([0.25435323, 0.23945164, 0.38160514, ..., 0

In [68]:
kfold_validation(df, df_test, verbose=1, mode="cls")

Log loss score for season 2003: 0.491
Log loss score for season 2004: 0.461
Log loss score for season 2005: 0.46
Log loss score for season 2006: 0.49
Log loss score for season 2007: 0.408
Log loss score for season 2008: 0.423
Log loss score for season 2009: 0.417
Log loss score for season 2010: 0.472
Log loss score for season 2011: 0.56
Log loss score for season 2012: 0.513
Log loss score for season 2013: 0.471
Log loss score for season 2014: 0.437
Log loss score for season 2015: 0.452
Log loss score for season 2016: 0.476
Log loss score for season 2017: 0.392
Log loss score for season 2018: 0.481
Log loss score for season 2019: 0.378
Log loss score for season 2021: 0.479
Average log loss score: 0.459


[array([0.79104321, 0.93974718, 0.52535675, ..., 0.75685526, 0.23042193,
        0.08774811]),
 array([0.78429056, 0.81170216, 0.80879371, ..., 0.82513621, 0.84866561,
        0.8352699 ]),
 array([0.97763354, 0.97396932, 0.98940226, ..., 0.99647354, 0.99858054,
        0.99692776]),
 array([0.99998952, 0.99998614, 0.99999943, ..., 0.99999998, 1.        ,
        1.        ]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.]),
 array([1., 1., 1., ..., 1., 1., 1.])]

In [69]:
pred_tests = kfold_validation(df, df_test, verbose=0, mode="cls")
pred_test = np.mean(pred_tests, 0)

In [70]:
submission = df_test[['ID', 'Season', 'Pred', 'TeamIdA', 'TeamIdB', 'SeedA', 'SeedB']].copy()
submission['Pred'] = pred_test

In [71]:
df_teams = pd.read_csv("../data/2022_Stage1/MTeams.csv")
submission = submission.merge(df_teams, left_on="TeamIdA", right_on="TeamID").drop('TeamID', axis=1).rename(columns={"TeamName": "TeamA"})
submission = submission.merge(df_teams, left_on="TeamIdB", right_on="TeamID").drop('TeamID', axis=1).rename(columns={"TeamName": "TeamB"})

In [72]:
df_seeds['Seed'] = df_seeds['Seed'].apply(lambda x:x[0])

submission = submission.merge(df_seeds, left_on=["TeamIdA", "Season"], right_on=["TeamID", "Season"]).drop('TeamID', axis=1).rename(columns={"Seed": "RegionA"})
submission = submission.merge(df_seeds, left_on=["TeamIdB", "Season"], right_on=["TeamID", "Season"]).drop('TeamID', axis=1).rename(columns={"Seed": "RegionB"})
submission

Unnamed: 0,ID,Season,Pred,TeamIdA,TeamIdB,SeedA,SeedB,TeamA,FirstD1Season_x,LastD1Season_x,TeamB,FirstD1Season_y,LastD1Season_y,RegionA,RegionB
0,2016_1112_1114,2016,0.975164,1112,1114,6,12,Arizona,1985,2022,Ark Little Rock,1985,2022,Y,X
1,2016_1112_1122,2016,0.984745,1112,1122,6,16,Arizona,1985,2022,Austin Peay,1985,2022,Y,Y
2,2016_1114_1122,2016,0.979486,1114,1122,12,16,Ark Little Rock,1985,2022,Austin Peay,1985,2022,X,Y
3,2016_1112_1124,2016,0.962420,1112,1124,6,5,Arizona,1985,2022,Baylor,1985,2022,Y,Z
4,2016_1114_1124,2016,0.947498,1114,1124,12,5,Ark Little Rock,1985,2022,Baylor,1985,2022,X,Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11385,2018_1438_1460,2018,0.989775,1438,1460,1,14,Virginia,1985,2022,Wright St,1988,2022,Y,Y
11386,2018_1439_1460,2018,0.979162,1439,1460,8,14,Virginia Tech,1985,2022,Wright St,1988,2022,W,Y
11387,2018_1452_1460,2018,0.983506,1452,1460,5,14,West Virginia,1985,2022,Wright St,1988,2022,W,Y
11388,2018_1455_1460,2018,0.981427,1455,1460,4,14,Wichita St,1985,2022,Wright St,1988,2022,W,Y


In [73]:
final_submission = submission[['ID', 'Pred']].copy()

In [74]:
final_submission

Unnamed: 0,ID,Pred
0,2016_1112_1114,0.975164
1,2016_1112_1122,0.984745
2,2016_1114_1122,0.979486
3,2016_1112_1124,0.962420
4,2016_1114_1124,0.947498
...,...,...
11385,2018_1438_1460,0.989775
11386,2018_1439_1460,0.979162
11387,2018_1452_1460,0.983506
11388,2018_1455_1460,0.981427


In [75]:
final_submission.to_csv("../submission/BettingDataModel_submission.csv")