In [2]:
import pandas as pd
import numpy as np
import itertools


In [3]:
data_dir = "../data/"

In [4]:
# Read seeds df
seeds_df = pd.read_csv(data_dir + "MNCAATourneySeeds.csv").sample(frac=1)
# Shuffle the order to avoid bias in TeamA vs TeamB later
seeds_copy = seeds_df.copy().sample(frac=1) 
print(seeds_df.head(5))

      Season  Seed  TeamID
307     1989   Z04    1199
2378    2022   X08    1129
2001    2015   Z05    1116
2458    2023   Y03    1462
1071    2001  Y16a    1322


In [5]:
years = np.sort(seeds_df.Season.unique())
df_list = []
for x in years:
    year_df = seeds_df.loc[seeds_df["Season"]==x]
    teams = year_df.TeamID.unique()
    np.random.shuffle(teams)
    combos = list(itertools.combinations(teams, 2))
    temp_df = pd.DataFrame(combos, columns=["TeamID_A", "TeamID_B"])
    temp_df["Season"] = x
    df_list.append(temp_df)
all_matchups = pd.concat(df_list)
all_matchups.head()

Unnamed: 0,TeamID_A,TeamID_B,Season
0,1192,1396,1985
1,1192,1276,1985
2,1192,1229,1985
3,1192,1130,1985
4,1192,1345,1985


In [6]:
# Rejoin in seeds 
seeds_A = all_matchups.merge(seeds_df, left_on=["Season", "TeamID_A"], right_on=["Season","TeamID"]).drop(columns="TeamID").rename(columns={"Seed":"Seed_A"})
all_seeds = seeds_A.merge(seeds_df, left_on=["Season", "TeamID_B"], right_on=["Season","TeamID"]).drop(columns="TeamID").rename(columns={"Seed":"Seed_B"})
all_seeds

Unnamed: 0,TeamID_A,TeamID_B,Season,Seed_A,Seed_B
0,1192,1396,1985,Z16,W08
1,1192,1276,1985,Z16,Z01
2,1192,1229,1985,Z16,Y09
3,1192,1130,1985,Z16,Y11
4,1192,1345,1985,Z16,Z06
...,...,...,...,...,...
80387,1277,1272,2023,W07,W08
80388,1277,1158,2023,W07,X12
80389,1233,1272,2023,Z13,W08
80390,1233,1158,2023,Z13,X12


In [7]:
# Read in compact tourney results
results_df = pd.read_csv(data_dir + "MNCAATourneyCompactResults.csv")
results_df.head(5)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [8]:
all_seeds["seed_set"] = all_seeds[["TeamID_A","TeamID_B","Season"]].values.tolist()
all_seeds["seed_set"] = all_seeds.apply(lambda row: frozenset(row["seed_set"]),axis=1)
results_df["res_set"] = results_df[["WTeamID","LTeamID","Season"]].values.tolist()
results_df["res_set"] = results_df.apply(lambda row: frozenset(row["res_set"]),axis=1)

full_df = all_seeds.merge(results_df, left_on="seed_set", right_on="res_set").rename(columns={"Season_y":"Season"})
print(full_df.head())
full_df['y'] = np.where(full_df["TeamID_A"] == full_df["WTeamID"], 1, 0)
X = full_df[["Season", "Seed_A", "Seed_B"]]

y = full_df[["y"]]


   TeamID_A  TeamID_B  Season_x Seed_A Seed_B            seed_set  Season  \
0      1192      1276      1985    Z16    Z01  (1192, 1985, 1276)    1985   
1      1396      1207      1985    W08    W01  (1985, 1396, 1207)    1985   
2      1396      1439      1985    W08    W09  (1985, 1396, 1439)    1985   
3      1276      1437      1985    Z01    Z08  (1985, 1276, 1437)    1985   
4      1229      1328      1985    Y09    Y01  (1328, 1985, 1229)    1985   

   DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT             res_set  
0     137     1276      59     1192      55    N      0  (1192, 1985, 1276)  
1     138     1207      63     1396      46    N      0  (1985, 1396, 1207)  
2     136     1396      60     1439      57    N      0  (1985, 1396, 1439)  
3     139     1437      59     1276      55    N      0  (1985, 1276, 1437)  
4     138     1328      75     1229      69    N      0  (1328, 1985, 1229)  


In [9]:
X

Unnamed: 0,Season,Seed_A,Seed_B
0,1985,Z16,Z01
1,1985,W08,W01
2,1985,W08,W09
3,1985,Z01,Z08
4,1985,Y09,Y01
...,...,...,...
2446,2023,Z02,Z03
2447,2023,Z02,Z15
2448,2023,X07,X15
2449,2023,X08,X09


In [None]:
X.to_csv("./X_train.csv")
y.to_csv("./y_train.csv")