# Step 1: Imports

In [1]:
from typing import List, Tuple
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)

def load_raw_data(filename: str) -> pd.DataFrame:
    mens_filepath = f"/kaggle/input/warmup-round-march-machine-learning-mania-2023/M{filename}.csv"
    weomens_filepath = f"/kaggle/input/warmup-round-march-machine-learning-mania-2023/W{filename}.csv"
    df_mens = pd.read_csv(mens_filepath)
    df_mens["Gender"] = "M"
    df_weomens = pd.read_csv(weomens_filepath)
    df_weomens["Gender"] = "W"
    return pd.concat([df_mens, df_weomens])

# Step 2: Load the data

In [2]:
RegularSeasonDetailedResults = load_raw_data("RegularSeasonDetailedResults")
RegularSeasonDetailedResults.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,Gender
70002,2023,113,3268,96,3234,68,H,0,34,71,14,26,14,15,11,23,19,11,15,1,14,23,66,11,38,11,15,18,19,14,24,5,3,14,W
70003,2023,113,3385,69,3163,64,A,0,23,63,9,18,14,18,12,26,14,12,4,6,18,22,62,8,21,12,15,11,25,14,12,5,7,19,W
70004,2023,113,3416,57,3396,53,H,0,19,60,4,17,15,27,13,32,9,17,3,7,14,16,70,7,26,14,17,16,19,5,9,10,9,20,W
70005,2023,113,3437,67,3177,64,H,0,24,67,8,23,11,15,10,31,19,14,7,9,10,26,77,4,19,8,11,21,26,13,12,9,2,14,W
70006,2023,113,3466,75,3146,42,A,0,27,62,12,32,9,11,10,27,15,10,11,3,16,15,51,1,5,11,21,8,19,6,15,3,3,12,W


In [3]:
NCAATourneyDetailedResults = load_raw_data("NCAATourneyDetailedResults")
NCAATourneyDetailedResults.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,Gender
755,2022,147,3163,91,3301,87,N,2,37,77,5,21,12,20,12,23,10,7,5,2,16,32,66,7,23,16,19,6,30,20,13,4,7,16,W
756,2022,147,3257,62,3276,50,N,0,25,58,5,15,7,9,6,20,12,11,15,4,17,16,46,3,14,15,20,10,24,9,21,6,2,12,W
757,2022,151,3163,63,3390,58,N,0,21,57,5,14,16,20,12,30,14,19,5,2,16,23,66,4,23,8,13,11,23,10,11,11,3,16,W
758,2022,151,3376,72,3257,59,N,0,27,57,6,17,12,17,8,24,19,14,11,4,11,27,63,1,8,4,7,11,18,5,15,13,2,17,W
759,2022,153,3376,64,3163,49,N,0,22,60,3,16,17,26,18,23,9,14,6,4,11,22,54,4,16,1,4,3,16,14,14,4,5,21,W


In [4]:
NCAATourneySeeds = load_raw_data("NCAATourneySeeds")
NCAATourneySeeds.tail()

Unnamed: 0,Season,Seed,TeamID,Gender
1535,2022,Z12,3125,W
1536,2022,Z13,3138,W
1537,2022,Z14,3110,W
1538,2022,Z15,3218,W
1539,2022,Z16,3107,W


# Step 3: Prepare the data



In [5]:
def process_detailed_results(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.copy()
    df = clean_detailed_results(df)
    df = aggregate_detailed_results(df)
    df = compute_percentages(df)
    return df

def clean_detailed_results(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop(["WLoc", "DayNum"], axis=1)

def reshape_detailed_results(df: pd.DataFrame) -> pd.DataFrame:
    winner_columns, looser_columns = split_winner_and_looser_columns(df)
    df_winner = df.copy()
    df_winner = df_winner[winner_columns]
    df_winner.columns = clean_column_names(df_winner)
    df_winner["Win"] = 1
    df_looser = df.copy()
    df_looser = df_looser[looser_columns]
    df_looser.columns = clean_column_names(df_looser)
    df_looser["Win"] = 0
    return pd.concat([df_winner, df_looser], ignore_index=True)

def aggregate_detailed_results(df: pd.DataFrame) -> pd.DataFrame:
    df = reshape_detailed_results(df)
    df_agg = df.groupby(["Season", "TeamID"]).agg("mean")
    return df_agg.reset_index()

def compute_percentages(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.copy()
    df["FGP"] =  df["FGM"] / df["FGA"]
    df["FGP3"] =  df["FGM3"] / df["FGA3"]
    df["FTP"] =  df["FTM"] / df["FTA"]
    return df

def split_winner_and_looser_columns(df: pd.DataFrame) -> Tuple[List[str], List[str]]:
    winner_columns = [name for name in df.columns if not name.startswith("L")]
    looser_columns = [name for name in df.columns if not name.startswith("W")]
    return winner_columns, looser_columns

def clean_column_names(df: pd.DataFrame) -> List[str]:
    column_names = [
        name[1:] if 
        name.startswith("L") or name.startswith("W")
        else name 
        for name in df.columns
    ]
    return column_names

# Test data
test_df = pd.DataFrame([
    {"Season": 1, "WTeamID": "A", "LTeamID": "B", "stat1": 1, "Wstat2": 2, "Lstat2": 3 },
    {"Season": 1, "WTeamID": "A", "LTeamID": "B", "stat1": 4, "Wstat2": 5, "Lstat2": 6 },
])
expected_column_names = [
    "Season", "TeamID", "TeamID", "stat1", "stat2", "stat2"
]
expected_column_split = (
    ["Season", "WTeamID", "stat1", "Wstat2"], 
    ["Season", "LTeamID", "stat1", "Lstat2"]
)
expected_reshaped_df = pd.DataFrame([
    { "Season": 1, "TeamID": "A", "stat1": 1, "stat2": 2, "Win": 1 },
    { "Season": 1, "TeamID": "A", "stat1": 4, "stat2": 5, "Win": 1 },
    { "Season": 1, "TeamID": "B", "stat1": 1, "stat2": 3, "Win": 0 },
    { "Season": 1, "TeamID": "B", "stat1": 4, "stat2": 6, "Win": 0 },
    
])
expected_aggregated_df = pd.DataFrame([
    {"Season": 1, "TeamID": "A", "stat1": 2.5, "stat2": 3.5, "Win": 1.0 },
    {"Season": 1, "TeamID": "B","stat1": 2.5, "stat2": 4.5, "Win": 0.0 },
])
test_df_copy = test_df.copy()

# Tests
assert clean_column_names(test_df) == expected_column_names, "Function clean_column_names failed."
assert split_winner_and_looser_columns(test_df) == expected_column_split, "Function split_winner_and_looser_columns failed."
assert expected_reshaped_df.equals(reshape_detailed_results(test_df)), "Function reshape_detailed_results failed."
assert expected_aggregated_df.equals(aggregate_detailed_results(test_df)), "Function aggregate_detailed_results failed."

In [6]:
ProcessedRegularSeasonDetailedResults = process_detailed_results(
    RegularSeasonDetailedResults
)
ProcessedRegularSeasonDetailedResults.tail()

Unnamed: 0,Season,TeamID,Score,NumOT,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,Win,FGP,FGP3,FTP
12130,2023,3473,54.521739,0.0,19.73913,50.434783,6.391304,21.347826,8.652174,13.086957,6.26087,19.086957,11.913043,19.304348,6.347826,2.043478,16.086957,0.043478,0.391379,0.299389,0.66113
12131,2023,3474,56.913043,0.0,20.26087,57.956522,4.565217,16.565217,11.826087,18.043478,8.521739,20.130435,6.434783,14.652174,7.304348,2.173913,21.521739,0.217391,0.349587,0.275591,0.655422
12132,2023,3475,62.416667,0.041667,21.708333,55.416667,4.375,15.75,14.625,19.041667,8.916667,23.916667,13.375,20.083333,7.708333,2.875,18.916667,0.333333,0.391729,0.277778,0.768053
12133,2023,3476,58.576923,0.038462,21.423077,55.846154,6.115385,20.153846,9.615385,13.5,8.615385,22.269231,13.884615,14.807692,4.0,2.576923,15.307692,0.269231,0.383609,0.303435,0.712251
12134,2023,3477,65.115385,0.0,23.423077,59.615385,3.884615,14.961538,14.384615,21.576923,11.307692,23.961538,11.923077,16.923077,8.038462,3.423077,15.769231,0.423077,0.392903,0.25964,0.666667


In [7]:
ProcessedNCAATourneyDetailedResults = process_detailed_results(
    NCAATourneyDetailedResults
)
ProcessedNCAATourneyDetailedResults.tail()

Unnamed: 0,Season,TeamID,Score,NumOT,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,Win,FGP,FGP3,FTP
2034,2022,3426,71.0,0.0,27.0,62.0,7.0,16.0,10.0,16.0,7.0,26.0,11.0,9.0,6.0,5.0,20.0,0.0,0.435484,0.4375,0.625
2035,2022,3428,74.0,0.0,27.5,53.5,10.5,25.5,8.5,12.0,6.0,25.5,16.5,17.5,3.0,1.5,14.5,0.5,0.514019,0.411765,0.708333
2036,2022,3437,55.0,0.0,20.5,58.5,6.0,23.5,8.0,12.0,7.5,19.5,11.0,10.0,8.5,5.0,18.0,0.5,0.350427,0.255319,0.666667
2037,2022,3439,81.0,0.0,30.0,60.0,6.0,17.0,15.0,20.0,7.0,28.0,13.0,10.0,1.0,1.0,13.0,0.0,0.5,0.352941,0.75
2038,2022,3450,40.0,0.0,14.0,56.0,3.0,22.0,9.0,14.0,8.0,31.0,5.0,17.0,2.0,2.0,23.0,0.0,0.25,0.136364,0.642857


In [8]:
def process_seeds(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.copy()
    mask = df["Season"] > 2002
    df = df[mask]
    df["Seed"] = df["Seed"].str.replace(r"\D+","")
    df["Seed"] = df["Seed"].astype(int)
    return df

In [9]:
ProcessedNCAATourneySeeds = process_seeds(NCAATourneySeeds)
ProcessedNCAATourneySeeds.tail()

  """


Unnamed: 0,Season,Seed,TeamID,Gender
1535,2022,12,3125,W
1536,2022,13,3138,W
1537,2022,14,3110,W
1538,2022,15,3218,W
1539,2022,16,3107,W


## Merge features

In [10]:
features = pd.merge(
    ProcessedRegularSeasonDetailedResults,
    ProcessedNCAATourneyDetailedResults,
    how="inner",
    on=["Season", "TeamID"],
    suffixes=("Reg", "Tou")
)

features = features.merge(
    ProcessedNCAATourneySeeds,
    how="inner",
    on=["Season", "TeamID"]
)

features.tail()

Unnamed: 0,Season,TeamID,ScoreReg,NumOTReg,FGMReg,FGAReg,FGM3Reg,FGA3Reg,FTMReg,FTAReg,ORReg,DRReg,AstReg,TOReg,StlReg,BlkReg,PFReg,WinReg,FGPReg,FGP3Reg,FTPReg,ScoreTou,NumOTTou,FGMTou,FGATou,FGM3Tou,FGA3Tou,FTMTou,FTATou,ORTou,DRTou,AstTou,TOTou,StlTou,BlkTou,PFTou,WinTou,FGPTou,FGP3Tou,FTPTou,Seed,Gender
2034,2022,3426,69.962963,0.074074,25.740741,60.185185,3.962963,14.333333,14.518519,20.888889,9.222222,24.814815,15.555556,13.740741,7.925926,3.555556,16.444444,0.740741,0.427692,0.276486,0.695035,71.0,0.0,27.0,62.0,7.0,16.0,10.0,16.0,7.0,26.0,11.0,9.0,6.0,5.0,20.0,0.0,0.435484,0.4375,0.625,14,W
2035,2022,3428,76.032258,0.064516,26.032258,60.129032,9.387097,26.612903,14.580645,18.645161,10.645161,23.129032,15.258065,14.0,6.83871,2.806452,16.580645,0.645161,0.43294,0.352727,0.782007,74.0,0.0,27.5,53.5,10.5,25.5,8.5,12.0,6.0,25.5,16.5,17.5,3.0,1.5,14.5,0.5,0.514019,0.411765,0.708333,7,W
2036,2022,3437,65.580645,0.032258,23.741935,58.677419,7.419355,23.774194,10.677419,13.935484,8.419355,23.096774,15.419355,12.387097,7.645161,3.677419,14.516129,0.741935,0.404618,0.312076,0.766204,55.0,0.0,20.5,58.5,6.0,23.5,8.0,12.0,7.5,19.5,11.0,10.0,8.5,5.0,18.0,0.5,0.350427,0.255319,0.666667,11,W
2037,2022,3439,70.5,0.03125,24.90625,56.34375,8.59375,23.65625,12.09375,15.9375,7.5625,25.125,14.28125,12.53125,4.46875,3.5,14.3125,0.71875,0.442041,0.363276,0.758824,81.0,0.0,30.0,60.0,6.0,17.0,15.0,20.0,7.0,28.0,13.0,10.0,1.0,1.0,13.0,0.0,0.5,0.352941,0.75,5,W
2038,2022,3450,60.758621,0.034483,22.655172,58.344828,6.931034,21.551724,8.517241,11.896552,9.103448,21.689655,13.344828,13.482759,7.896552,4.172414,15.482759,0.655172,0.388298,0.3216,0.715942,40.0,0.0,14.0,56.0,3.0,22.0,9.0,14.0,8.0,31.0,5.0,17.0,2.0,2.0,23.0,0.0,0.25,0.136364,0.642857,8,W


## Build Dataset

In [11]:
def get_outcomes(df):
    input_rows = df.to_records()
    output_rows = [parse_row(input_row) for input_row in input_rows]
    out_df = pd.DataFrame(output_rows)
    out_df = out_df[out_df["Season"] < 2017]
    return out_df

def parse_row(row):
    season = row['Season']
    winning_team_id = row['WTeamID']
    losing_team_id = row['LTeamID']
    if winning_team_id < losing_team_id:
        small_id = winning_team_id
        big_id = losing_team_id
        outcome = 1
    elif losing_team_id < winning_team_id:
        small_id = losing_team_id
        big_id = winning_team_id
        outcome = 0
    record = {
        "ID": f"{season}_{small_id}_{big_id}",
        'Season': season,
        'LowID': small_id,
        'HighID': big_id,
        'Win': outcome
    }
    return record

In [12]:
outcomes = get_outcomes(NCAATourneyDetailedResults)
print(outcomes.shape)
outcomes.tail()

(1355, 5)


Unnamed: 0,ID,Season,LowID,HighID,Win
1684,2016_3163_3400,2016,3163,3400,1
1685,2016_3124_3333,2016,3124,3333,0
1686,2016_3163_3333,2016,3163,3333,1
1687,2016_3393_3449,2016,3393,3449,1
1688,2016_3163_3393,2016,3163,3393,1


In [13]:
def merge_outcomes_with_features(outcomes: pd.DataFrame, features: pd.DataFrame) -> pd.DataFrame:
    data = pd.merge(
        outcomes, 
        features, 
        how="left", 
        left_on=["Season", "HighID"], 
        right_on=["Season", "TeamID"]
    )
    data = pd.merge(
        data, 
        features, 
        how="left", 
        left_on=["Season", "LowID"], 
        right_on=["Season", "TeamID"],
        suffixes=("High", "Low")
    )
    data.drop(
        ["Season", "HighID", "LowID","TeamIDHigh","TeamIDLow"], 
        axis=1, 
        inplace=True
    )
    data.set_index("ID", inplace=True)
    return data

In [14]:
data = merge_outcomes_with_features(outcomes, features)
print(data.shape)
data.tail()

(1355, 81)


Unnamed: 0_level_0,Win,ScoreRegHigh,NumOTRegHigh,FGMRegHigh,FGARegHigh,FGM3RegHigh,FGA3RegHigh,FTMRegHigh,FTARegHigh,ORRegHigh,DRRegHigh,AstRegHigh,TORegHigh,StlRegHigh,BlkRegHigh,PFRegHigh,WinRegHigh,FGPRegHigh,FGP3RegHigh,FTPRegHigh,ScoreTouHigh,NumOTTouHigh,FGMTouHigh,FGATouHigh,FGM3TouHigh,FGA3TouHigh,FTMTouHigh,FTATouHigh,ORTouHigh,DRTouHigh,AstTouHigh,TOTouHigh,StlTouHigh,BlkTouHigh,PFTouHigh,WinTouHigh,FGPTouHigh,FGP3TouHigh,FTPTouHigh,SeedHigh,GenderHigh,ScoreRegLow,NumOTRegLow,FGMRegLow,FGARegLow,FGM3RegLow,FGA3RegLow,FTMRegLow,FTARegLow,ORRegLow,DRRegLow,AstRegLow,TORegLow,StlRegLow,BlkRegLow,PFRegLow,WinRegLow,FGPRegLow,FGP3RegLow,FTPRegLow,ScoreTouLow,NumOTTouLow,FGMTouLow,FGATouLow,FGM3TouLow,FGA3TouLow,FTMTouLow,FTATouLow,ORTouLow,DRTouLow,AstTouLow,TOTouLow,StlTouLow,BlkTouLow,PFTouLow,WinTouLow,FGPTouLow,FGP3TouLow,FTPTouLow,SeedLow,GenderLow
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
2016_3163_3400,1,70.75,0.0,26.15625,59.90625,4.4375,13.5,14.0,20.15625,13.5625,27.71875,14.09375,16.15625,7.78125,5.34375,18.90625,0.875,0.43662,0.328704,0.694574,74.0,0.0,28.75,68.0,6.25,18.25,10.25,14.75,16.75,27.5,14.0,13.75,10.25,3.5,18.75,0.75,0.422794,0.342466,0.694915,2,W,87.5625,0.0,34.34375,65.4375,7.5625,20.75,11.3125,14.21875,12.875,28.1875,21.625,11.90625,12.1875,6.4375,11.25,1.0,0.524833,0.364458,0.795604,90.666667,0.0,33.333333,59.833333,8.833333,18.5,15.166667,18.5,11.0,28.0,21.5,11.833333,9.5,5.833333,10.166667,1.0,0.557103,0.477477,0.81982,1,W
2016_3124_3333,0,66.84375,0.0,24.84375,56.5625,6.5625,18.1875,10.59375,14.4375,12.0,31.5625,15.375,15.4375,5.09375,6.375,14.125,0.875,0.439227,0.360825,0.733766,67.2,0.0,24.8,59.0,7.2,22.2,10.4,14.4,14.4,30.6,16.2,14.6,5.6,5.8,12.2,0.8,0.420339,0.324324,0.722222,2,W,77.735294,0.0,30.0,61.617647,3.470588,9.352941,14.264706,21.529412,14.617647,31.176471,20.794118,15.382353,8.705882,6.088235,14.794118,0.970588,0.486874,0.371069,0.662568,77.0,0.0,30.0,65.25,5.25,14.0,11.75,17.0,15.0,28.25,19.75,12.5,9.25,8.0,14.0,0.75,0.45977,0.375,0.691176,1,W
2016_3163_3333,1,66.84375,0.0,24.84375,56.5625,6.5625,18.1875,10.59375,14.4375,12.0,31.5625,15.375,15.4375,5.09375,6.375,14.125,0.875,0.439227,0.360825,0.733766,67.2,0.0,24.8,59.0,7.2,22.2,10.4,14.4,14.4,30.6,16.2,14.6,5.6,5.8,12.2,0.8,0.420339,0.324324,0.722222,2,W,87.5625,0.0,34.34375,65.4375,7.5625,20.75,11.3125,14.21875,12.875,28.1875,21.625,11.90625,12.1875,6.4375,11.25,1.0,0.524833,0.364458,0.795604,90.666667,0.0,33.333333,59.833333,8.833333,18.5,15.166667,18.5,11.0,28.0,21.5,11.833333,9.5,5.833333,10.166667,1.0,0.557103,0.477477,0.81982,1,W
2016_3393_3449,1,71.0625,0.0,24.84375,60.1875,7.0625,20.9375,14.3125,18.25,10.1875,28.875,11.5,13.0625,7.21875,3.53125,13.75,0.6875,0.412773,0.337313,0.784247,73.6,0.0,26.0,60.6,9.0,25.2,12.6,16.8,6.8,30.6,14.2,10.4,7.2,4.6,13.0,0.8,0.429043,0.357143,0.75,7,W,72.0625,0.0,25.46875,68.78125,8.875,30.3125,12.25,18.34375,17.28125,23.1875,13.4375,14.1875,13.0625,4.875,17.75,0.78125,0.370286,0.292784,0.667802,74.833333,0.0,26.666667,67.333333,8.333333,27.0,13.166667,19.333333,16.833333,22.666667,10.833333,11.666667,9.833333,3.5,17.5,0.833333,0.39604,0.308642,0.681034,4,W
2016_3163_3393,1,72.0625,0.0,25.46875,68.78125,8.875,30.3125,12.25,18.34375,17.28125,23.1875,13.4375,14.1875,13.0625,4.875,17.75,0.78125,0.370286,0.292784,0.667802,74.833333,0.0,26.666667,67.333333,8.333333,27.0,13.166667,19.333333,16.833333,22.666667,10.833333,11.666667,9.833333,3.5,17.5,0.833333,0.39604,0.308642,0.681034,4,W,87.5625,0.0,34.34375,65.4375,7.5625,20.75,11.3125,14.21875,12.875,28.1875,21.625,11.90625,12.1875,6.4375,11.25,1.0,0.524833,0.364458,0.795604,90.666667,0.0,33.333333,59.833333,8.833333,18.5,15.166667,18.5,11.0,28.0,21.5,11.833333,9.5,5.833333,10.166667,1.0,0.557103,0.477477,0.81982,1,W


## Train Test Split 

In [15]:
# For splitting data
from sklearn.model_selection import train_test_split

# Create train, validate, and test sets.
X = data.copy().drop("Win", axis=1)
y = data["Win"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

X_train.shape

(1016, 80)

In [16]:
def partition_features(df: pd.DataFrame, cardnality_threshold: int) -> Tuple[pd.DataFrame]:
    cat_cols = [name for name, data_type in df.dtypes.items() if data_type == object]
    num_cols = list(set(df.columns) - set(cat_cols))
    col_cardnality = {col_name: X_train[col_name].nunique() for col_name in cat_cols}
    cat_cols_high = []
    cat_cols_low = []
    for name, cardnality in col_cardnality.items():
        if cardnality > cardnality_threshold:
            cat_cols_high.append(name)
        else:
            cat_cols_low.append(name)
    return num_cols, cat_cols_high, cat_cols_low

In [17]:
num_cols, cat_cols_high, cat_cols_low = partition_features(X_train, 10)
print(num_cols)
print(cat_cols_high)
print(cat_cols_low)

['TORegHigh', 'FGARegLow', 'FTPRegHigh', 'TORegLow', 'FGMTouLow', 'ORRegLow', 'DRRegLow', 'ScoreRegHigh', 'FTPTouLow', 'DRTouLow', 'PFRegHigh', 'FTMRegHigh', 'AstTouHigh', 'ORTouLow', 'AstRegHigh', 'FGM3TouHigh', 'BlkTouHigh', 'SeedLow', 'BlkRegLow', 'FGA3RegHigh', 'ORRegHigh', 'PFTouLow', 'FTATouHigh', 'StlRegHigh', 'NumOTTouHigh', 'StlRegLow', 'SeedHigh', 'WinTouHigh', 'FTATouLow', 'FGATouLow', 'FGM3RegHigh', 'FGMTouHigh', 'ScoreTouHigh', 'FGA3TouHigh', 'PFTouHigh', 'WinTouLow', 'BlkRegHigh', 'WinRegHigh', 'TOTouHigh', 'ScoreTouLow', 'NumOTRegHigh', 'StlTouLow', 'FGM3RegLow', 'StlTouHigh', 'FGMRegLow', 'DRTouHigh', 'NumOTTouLow', 'DRRegHigh', 'FGATouHigh', 'FTMTouHigh', 'FGP3RegHigh', 'ScoreRegLow', 'FGA3RegLow', 'FTPRegLow', 'FGA3TouLow', 'BlkTouLow', 'FGM3TouLow', 'FTARegHigh', 'FGP3RegLow', 'FGPRegHigh', 'ORTouHigh', 'WinRegLow', 'FTARegLow', 'AstTouLow', 'FGARegHigh', 'FTMRegLow', 'FTMTouLow', 'FGMRegHigh', 'NumOTRegLow', 'FGP3TouHigh', 'FGPRegLow', 'PFRegLow', 'FGPTouHigh', 'FGP

### Data preperation pipelines

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy="median")

# Preprocessing for high cardnality categorical data
cat_high_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OrdinalEncoder(handle_unknown="ignore"))
])

# Preprocessing for low cardnality categorical data
cat_low_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, num_cols),
        ("cat_high", cat_high_transformer, cat_cols_high),
        ("cat_low", cat_low_transformer, cat_cols_low)
    ])

### Preprocess Data

In [19]:
X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)

# Step 4: Train a model


### Setup Hyperparameter Tuning
See https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning

In [20]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import mean_squared_error, log_loss, brier_score_loss
from xgboost import XGBClassifier
import numpy as np
import warnings
warnings.filterwarnings("ignore",category=Warning)

In [21]:
space={
    "n_estimators": hp.quniform("n_estimators", 1000, 2000, 250),
    "learning_rate": hp.uniform("learning_rate", 0.1, 0.2),
    "max_depth": hp.quniform("max_depth", 2, 8, 1),
    "reg_lambda": hp.uniform("reg_lambda", 50, 150),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
    "tree_method": "gpu_hist",
    "random_state": 42
}

def objective(space):
    regressor=XGBClassifier(
        n_estimators = int(space["n_estimators"]),
        learning_rate = space["learning_rate"],
        max_depth = int(space["max_depth"]),
        reg_lambda = space["reg_lambda"],
        colsample_bytree = space["colsample_bytree"],
        tree_method = "gpu_hist",
        random_state = 42,
    )
    evaluation = [(X_valid_processed, y_valid)]
    regressor.fit(
        X_train_processed, y_train,
        eval_set=evaluation,
        eval_metric="rmse",
        early_stopping_rounds=10,
        verbose=False
    )
    preds = regressor.predict(X_valid_processed)
    score = brier_score_loss(y_valid, preds)
    return {"loss": score, "status": STATUS_OK }

In [22]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

100%|██████████| 100/100 [01:36<00:00,  1.04trial/s, best loss: 0.0029498525073746312]


In [23]:
# Define model
model = XGBClassifier(
    n_estimators = int(best_hyperparams["n_estimators"]),
    max_depth = int(best_hyperparams["max_depth"]),
    learning_rate = best_hyperparams["learning_rate"],
    colsample_bytree = best_hyperparams["colsample_bytree"],
    reg_lambda = best_hyperparams["reg_lambda"], 
    tree_method = "gpu_hist",
    random_state = 42
)
model.fit(X_train_processed, y_train,
          early_stopping_rounds=10, 
          eval_set=[(X_valid_processed, y_valid)],
          verbose=False)
preds = model.predict(X_valid_processed)
print("RMSE:", mean_squared_error(y_valid, preds, squared=False))

RMSE: 0.05431254465935684


In the code cell above, we set `squared=False` to get the root mean squared error (RMSE) on the validation data.

# Step 5: Submit to the competition

We"ll begin by using the trained model to generate predictions, which we"ll save to a CSV file.

In [24]:
SampleSubmissionWarmup = pd.read_csv("/kaggle/input/warmup-round-march-machine-learning-mania-2023/SampleSubmissionWarmup.csv")

print(SampleSubmissionWarmup.shape)
SampleSubmissionWarmup.tail()

(614319, 2)


Unnamed: 0,ID,Pred
614314,2022_3469_3471,0.5
614315,2022_3469_3472,0.5
614316,2022_3470_3471,0.5
614317,2022_3470_3472,0.5
614318,2022_3471_3472,0.5


In [25]:
def get_submission_outcomes(sample_submission: pd.DataFrame) -> pd.DataFrame:
    df = sample_submission.copy()
    df.drop("Pred", axis=1, inplace=True)
    df[["Season", "LowID", "HighID"]] = df["ID"].str.split("_", expand=True)
    df[["Season", "LowID", "HighID"]] = df[["Season", "LowID", "HighID"]].astype(int)
    return df

In [26]:
submission_outcomes = get_submission_outcomes(SampleSubmissionWarmup)
print(submission_outcomes.shape)
submission_outcomes.tail()

(614319, 4)


Unnamed: 0,ID,Season,LowID,HighID
614314,2022_3469_3471,2022,3469,3471
614315,2022_3469_3472,2022,3469,3472
614316,2022_3470_3471,2022,3470,3471
614317,2022_3470_3472,2022,3470,3472
614318,2022_3471_3472,2022,3471,3472


In [27]:
X_submission = merge_outcomes_with_features(submission_outcomes, features)
print(X_submission.shape)
X_submission.tail()

(614319, 80)


Unnamed: 0_level_0,ScoreRegHigh,NumOTRegHigh,FGMRegHigh,FGARegHigh,FGM3RegHigh,FGA3RegHigh,FTMRegHigh,FTARegHigh,ORRegHigh,DRRegHigh,AstRegHigh,TORegHigh,StlRegHigh,BlkRegHigh,PFRegHigh,WinRegHigh,FGPRegHigh,FGP3RegHigh,FTPRegHigh,ScoreTouHigh,NumOTTouHigh,FGMTouHigh,FGATouHigh,FGM3TouHigh,FGA3TouHigh,FTMTouHigh,FTATouHigh,ORTouHigh,DRTouHigh,AstTouHigh,TOTouHigh,StlTouHigh,BlkTouHigh,PFTouHigh,WinTouHigh,FGPTouHigh,FGP3TouHigh,FTPTouHigh,SeedHigh,GenderHigh,ScoreRegLow,NumOTRegLow,FGMRegLow,FGARegLow,FGM3RegLow,FGA3RegLow,FTMRegLow,FTARegLow,ORRegLow,DRRegLow,AstRegLow,TORegLow,StlRegLow,BlkRegLow,PFRegLow,WinRegLow,FGPRegLow,FGP3RegLow,FTPRegLow,ScoreTouLow,NumOTTouLow,FGMTouLow,FGATouLow,FGM3TouLow,FGA3TouLow,FTMTouLow,FTATouLow,ORTouLow,DRTouLow,AstTouLow,TOTouLow,StlTouLow,BlkTouLow,PFTouLow,WinTouLow,FGPTouLow,FGP3TouLow,FTPTouLow,SeedLow,GenderLow
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
2022_3469_3471,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2022_3469_3472,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2022_3470_3471,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2022_3470_3472,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2022_3471_3472,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [28]:
X_submission_processed = preprocessor.transform(X_submission)

In [29]:
# Use the model to generate predictions

predictions = model.predict(X_submission_processed)

# Save the predictions to a CSV file
output = pd.DataFrame({"ID": X_submission.index,
                       "Pred": predictions})
output.to_csv("submission.csv", index=False)
output.shape

(614319, 2)