# Step 1: Imports

In [1]:
from typing import List, Tuple
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)

def load_raw_data(filename: str) -> pd.DataFrame:
    mens_filepath = f"/kaggle/input/warmup-round-march-machine-learning-mania-2023/M{filename}.csv"
    weomens_filepath = f"/kaggle/input/warmup-round-march-machine-learning-mania-2023/W{filename}.csv"
    df_mens = pd.read_csv(mens_filepath)
    df_mens["Gender"] = 0
    df_weomens = pd.read_csv(weomens_filepath)
    df_weomens["Gender"] = 1
    return pd.concat([df_mens, df_weomens])

def process_detailed_results(detailed_results: pd.DataFrame) -> pd.DataFrame:
    df = detailed_results.copy()
    df = clean_detailed_results(df)
    df = game_to_team_conversion(df)
    df = enrich_team_results(df)
    df = transform_team_results(df)
    return df

def clean_detailed_results(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop(["WLoc", "DayNum", "Gender"], axis=1)

def game_to_team_conversion(game_results: pd.DataFrame) -> pd.DataFrame:
    winners = rename_columns(game_results, "W")
    loosers = rename_columns(game_results, "L")
    team_results = pd.concat((winners, loosers))
    team_results.drop(["TeamIDOpp"], axis=1, inplace=True)
    return team_results

def enrich_team_results(df: pd.DataFrame) -> pd.DataFrame:
    df["Dur"] = 40 + 5 * df["NumOT"]
    df.drop(["NumOT"], axis=1, inplace=True)
    return df

def transform_team_results(df: pd.DataFrame) -> pd.DataFrame:
    df = df.groupby(["Season", "TeamID"]).median()
    return df.reset_index()
    
def rename_columns(df: pd.DataFrame, team_prefix: str) -> pd.DataFrame:
    df = df.copy()
    df.columns =  (rename_column(column_name, team_prefix) for column_name in df.columns)
    return df

def rename_column(column_name: str, team_prefix: str) -> pd.DataFrame:
    if team_prefix == "W":
        opponent_prefix = "L"
    elif team_prefix == "L":
        opponent_prefix = "W"
    else:
        raise ValueError
    if column_name.startswith(team_prefix):
        column_name = column_name.lstrip(team_prefix)
    elif column_name.startswith(opponent_prefix):
        column_name = f"{column_name.lstrip(opponent_prefix)}Opp"
    return column_name

def split_winner_and_looser_columns(df: pd.DataFrame) -> Tuple[List[str], List[str]]:
    winner_columns = [name for name in df.columns if not name.startswith("L")]
    looser_columns = [name for name in df.columns if not name.startswith("W")]
    return winner_columns, looser_columns

def clean_column_names(df: pd.DataFrame) -> List[str]:
    column_names = [
        name[1:] if 
        name.startswith("L") or name.startswith("W")
        else name 
        for name in df.columns
    ]
    return column_names

def process_seeds(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.copy()
    mask = df["Season"] > 2002
    df = df[mask]
    df["Seed"] = df["Seed"].str.replace(r"\D+","", regex=True)
    df["Seed"] = df["Seed"].astype(int)
    return df

def merge_features(season_features: pd.DataFrame, tournament_features: pd.DataFrame, seed_features: pd.DataFrame) -> pd.DataFrame:
    features = pd.merge(
        season_features,
        tournament_features,
        how="inner",
        on=["Season", "TeamID"],
        suffixes=("Reg", "Tou")
    )
    features = features.merge(
        seed_features,
        how="inner",
        on=["Season", "TeamID"]
    )
    return features

def get_outcomes(df):
    input_rows = df.to_records()
    output_rows = []
    for input_row in input_rows:
        output_rows.extend(parse_row(input_row))
    out_df = pd.DataFrame(output_rows)
    return out_df

def parse_row(row):
    season = row['Season']
    winning_team_id = row['WTeamID']
    losing_team_id = row['LTeamID']
    if winning_team_id < losing_team_id:
        small_id = winning_team_id
        big_id = losing_team_id
        outcome = True
    elif losing_team_id < winning_team_id:
        small_id = losing_team_id
        big_id = winning_team_id
        outcome = False
    records = [
        {
            "ID": f"{season}_{small_id}_{big_id}",
            'Season': season,
            'LowID': small_id,
            'HighID': big_id,
            'Win': outcome
        },
        {
            "ID": f"{season}_{big_id}_{small_id}",
            'Season': season,
            'LowID': big_id,
            'HighID': small_id,
            'Win': not outcome
        },
    ]
    return records

def merge_outcomes_with_features(outcomes: pd.DataFrame, features: pd.DataFrame, how: str = "inner") -> pd.DataFrame:
    data = pd.merge(
        outcomes, 
        features, 
        how=how, 
        left_on=["Season", "HighID"], 
        right_on=["Season", "TeamID"]
    )
    data = pd.merge(
        data, 
        features, 
        how=how, 
        left_on=["Season", "LowID"], 
        right_on=["Season", "TeamID"],
        suffixes=("High", "Low")
    )
    data.drop(
        ["Season", "HighID", "LowID","TeamIDHigh","TeamIDLow"], 
        axis=1, 
        inplace=True
    )
    data.set_index("ID", inplace=True)
    return data

# Step 2: Load the data

## Season Detailed Results

In [2]:
RegularSeasonDetailedResults = load_raw_data("RegularSeasonDetailedResults")
RegularSeasonDetailedResults.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,Gender
70673,2023,127,3415,63,3142,54,N,0,20,56,2,10,21,30,11,23,9,14,5,2,22,17,43,4,13,16,21,1,21,5,15,7,2,24,1
70674,2023,127,3424,71,3361,68,H,0,23,55,2,12,23,33,10,23,12,14,7,6,16,28,58,5,10,7,10,7,24,12,13,4,1,26,1
70675,2023,127,3455,65,3378,53,A,0,24,51,6,13,11,13,2,28,14,11,7,2,12,19,60,4,22,11,13,8,22,11,12,2,6,12,1
70676,2023,127,3461,65,3161,56,H,0,25,57,5,17,10,16,13,35,15,14,3,2,13,21,55,9,24,5,8,1,20,12,8,8,4,17,1
70677,2023,127,3477,65,3230,62,A,0,23,50,3,13,16,19,12,20,10,9,6,0,11,22,51,8,22,10,12,8,15,12,9,5,1,13,1


In [3]:
season_features = process_detailed_results(RegularSeasonDetailedResults)
season_features.tail()

Unnamed: 0,Season,TeamID,Score,ScoreOpp,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGMOpp,FGAOpp,FGM3Opp,FGA3Opp,FTMOpp,FTAOpp,OROpp,DROpp,AstOpp,TOOpp,StlOpp,BlkOpp,PFOpp,Dur
12130,2023,3473,56.0,71.0,21.0,51.0,7.0,21.0,8.0,13.0,6.0,19.0,12.0,19.0,6.0,2.0,16.0,27.0,61.0,5.0,17.0,11.0,17.0,10.0,21.0,13.0,13.0,11.0,2.0,17.0,40.0
12131,2023,3474,57.5,73.0,20.0,58.5,4.0,15.0,12.5,17.5,8.0,20.0,7.0,14.0,7.0,2.0,21.5,24.5,55.5,5.0,14.5,16.0,24.0,9.5,28.0,14.0,17.0,6.0,3.0,18.5,40.0
12132,2023,3475,59.5,63.0,21.0,55.5,4.0,16.0,15.0,19.0,9.0,24.0,13.0,20.5,7.0,2.0,19.0,22.0,56.0,7.0,22.5,14.0,21.5,9.0,20.5,12.0,16.0,11.0,3.0,20.0,40.0
12133,2023,3476,61.0,66.0,22.0,56.5,6.0,21.0,9.5,12.0,9.0,21.0,13.5,14.5,3.0,3.0,14.0,26.0,57.5,5.0,14.0,10.5,15.0,8.0,22.0,12.0,11.0,7.0,3.0,15.5,40.0
12134,2023,3477,65.0,71.0,22.0,58.0,4.0,14.0,14.0,21.0,11.0,23.0,11.0,16.0,7.0,3.0,15.0,27.0,60.0,7.0,19.0,10.0,13.0,9.0,24.0,15.0,16.0,8.0,4.0,19.0,40.0


## Tournament Detailed Results

In [4]:
NCAATourneyDetailedResults = load_raw_data("NCAATourneyDetailedResults")
NCAATourneyDetailedResults.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,Gender
755,2022,147,3163,91,3301,87,N,2,37,77,5,21,12,20,12,23,10,7,5,2,16,32,66,7,23,16,19,6,30,20,13,4,7,16,1
756,2022,147,3257,62,3276,50,N,0,25,58,5,15,7,9,6,20,12,11,15,4,17,16,46,3,14,15,20,10,24,9,21,6,2,12,1
757,2022,151,3163,63,3390,58,N,0,21,57,5,14,16,20,12,30,14,19,5,2,16,23,66,4,23,8,13,11,23,10,11,11,3,16,1
758,2022,151,3376,72,3257,59,N,0,27,57,6,17,12,17,8,24,19,14,11,4,11,27,63,1,8,4,7,11,18,5,15,13,2,17,1
759,2022,153,3376,64,3163,49,N,0,22,60,3,16,17,26,18,23,9,14,6,4,11,22,54,4,16,1,4,3,16,14,14,4,5,21,1


In [5]:
tournament_features = process_detailed_results(NCAATourneyDetailedResults)
tournament_features["Season"] += 1
tournament_features.tail()

Unnamed: 0,Season,TeamID,Score,ScoreOpp,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF,FGMOpp,FGAOpp,FGM3Opp,FGA3Opp,FTMOpp,FTAOpp,OROpp,DROpp,AstOpp,TOOpp,StlOpp,BlkOpp,PFOpp,Dur
2034,2023,3426,71.0,78.0,27.0,62.0,7.0,16.0,10.0,16.0,7.0,26.0,11.0,9.0,6.0,5.0,20.0,28.0,65.0,7.0,20.0,15.0,26.0,11.0,27.0,6.0,7.0,2.0,6.0,14.0,40.0
2035,2023,3428,74.0,73.5,27.5,53.5,10.5,25.5,8.5,12.0,6.0,25.5,16.5,17.5,3.0,1.5,14.5,31.5,67.0,3.5,14.5,7.0,12.5,9.5,18.5,10.5,8.5,9.0,4.5,12.0,40.0
2036,2023,3437,55.0,60.5,20.5,58.5,6.0,23.5,8.0,12.0,7.5,19.5,11.0,10.0,8.5,5.0,18.0,22.5,55.5,4.0,15.0,11.5,16.5,11.5,29.5,12.5,15.0,7.0,1.0,12.5,40.0
2037,2023,3439,81.0,84.0,30.0,60.0,6.0,17.0,15.0,20.0,7.0,28.0,13.0,10.0,1.0,1.0,13.0,30.0,64.0,15.0,38.0,9.0,11.0,2.0,25.0,19.0,3.0,7.0,1.0,18.0,40.0
2038,2023,3450,40.0,50.0,14.0,56.0,3.0,22.0,9.0,14.0,8.0,31.0,5.0,17.0,2.0,2.0,23.0,14.0,53.0,3.0,14.0,19.0,22.0,6.0,30.0,7.0,11.0,6.0,3.0,11.0,40.0


## Tournement Seeds

In [6]:
NCAATourneySeeds = load_raw_data("NCAATourneySeeds")
NCAATourneySeeds.tail()

Unnamed: 0,Season,Seed,TeamID,Gender
1535,2022,Z12,3125,1
1536,2022,Z13,3138,1
1537,2022,Z14,3110,1
1538,2022,Z15,3218,1
1539,2022,Z16,3107,1


In [7]:
seed_features = process_seeds(NCAATourneySeeds)
seed_features.tail()

Unnamed: 0,Season,Seed,TeamID,Gender
1535,2022,12,3125,1
1536,2022,13,3138,1
1537,2022,14,3110,1
1538,2022,15,3218,1
1539,2022,16,3107,1


## Merge features

In [8]:
features = merge_features(season_features, tournament_features, seed_features)
features.tail()

Unnamed: 0,Season,TeamID,ScoreReg,ScoreOppReg,FGMReg,FGAReg,FGM3Reg,FGA3Reg,FTMReg,FTAReg,ORReg,DRReg,AstReg,TOReg,StlReg,BlkReg,PFReg,FGMOppReg,FGAOppReg,FGM3OppReg,FGA3OppReg,FTMOppReg,FTAOppReg,OROppReg,DROppReg,AstOppReg,TOOppReg,StlOppReg,BlkOppReg,PFOppReg,DurReg,ScoreTou,ScoreOppTou,FGMTou,FGATou,FGM3Tou,FGA3Tou,FTMTou,FTATou,ORTou,DRTou,AstTou,TOTou,StlTou,BlkTou,PFTou,FGMOppTou,FGAOppTou,FGM3OppTou,FGA3OppTou,FTMOppTou,FTAOppTou,OROppTou,DROppTou,AstOppTou,TOOppTou,StlOppTou,BlkOppTou,PFOppTou,DurTou,Seed,Gender
977,2022,3397,68.0,58.0,24.0,64.0,4.0,15.0,12.0,20.0,13.0,29.0,13.0,16.0,6.0,6.0,15.0,22.0,64.0,6.0,23.0,11.0,15.0,9.0,20.0,12.0,13.0,9.0,3.0,19.0,40.0,71.0,66.0,27.0,62.5,2.0,13.0,15.0,19.0,17.0,29.0,10.0,18.0,7.5,7.0,15.5,23.0,64.0,8.0,26.5,12.0,14.5,7.5,18.5,12.5,12.0,10.0,0.5,13.5,40.0,4,1
978,2022,3400,70.0,57.5,25.5,61.0,4.0,13.5,12.5,19.0,11.5,21.0,13.0,13.0,9.0,3.0,18.0,19.0,49.5,4.0,14.0,13.0,18.5,8.0,21.0,9.5,19.0,5.5,2.0,19.5,40.0,67.5,62.0,24.5,59.5,6.0,16.0,12.5,17.0,9.5,22.0,9.5,10.0,5.5,2.0,14.0,22.5,58.5,4.0,18.0,12.0,15.5,10.5,24.0,11.5,12.5,4.5,6.0,19.5,40.0,2,1
979,2022,3416,61.0,47.0,23.0,56.0,3.0,10.5,11.0,16.0,11.0,21.5,13.5,14.0,10.5,3.5,13.0,18.0,52.0,5.5,19.0,6.0,11.0,8.5,18.0,11.0,18.0,7.5,3.0,15.0,40.0,51.0,62.0,21.0,57.0,3.0,13.0,6.0,12.0,13.0,22.0,10.0,19.0,5.0,2.0,22.0,22.0,43.0,3.0,10.0,15.0,21.0,2.0,19.0,19.0,13.0,8.0,7.0,14.0,40.0,7,1
980,2022,3439,70.5,59.5,24.0,55.0,8.5,23.0,10.0,15.5,8.0,24.5,14.5,12.0,5.0,4.0,15.0,21.0,60.0,5.0,14.5,9.0,13.0,8.5,21.5,9.5,13.0,6.0,3.0,17.0,40.0,59.0,76.5,21.0,56.0,6.5,20.0,10.5,12.5,6.5,19.5,14.0,14.0,5.5,1.5,18.0,30.5,62.0,5.0,12.0,10.5,15.5,11.0,27.0,16.5,11.5,7.0,6.5,15.5,40.0,5,1
981,2022,3450,62.0,59.0,22.0,58.0,7.0,21.0,9.0,12.0,9.0,22.0,12.0,13.0,8.0,4.0,16.0,22.0,57.0,6.0,18.0,10.0,14.0,9.0,24.0,11.0,14.0,6.0,3.0,14.0,40.0,53.0,57.0,17.0,55.0,8.0,25.0,11.0,12.0,8.0,24.0,11.0,11.0,6.0,4.0,15.0,22.0,58.0,5.0,17.0,8.0,10.0,10.0,27.0,7.0,8.0,4.0,3.0,11.0,40.0,8,1


## Build Dataset

In [9]:
from sklearn.model_selection import train_test_split

data = load_raw_data("NCAATourneyCompactResults")
data_train, data_valid = train_test_split(data, random_state=0)

outcomes_train = get_outcomes(data_train)
outcomes_valid = get_outcomes(data_valid)

In [10]:
features_train = merge_outcomes_with_features(outcomes_train, features)
features_valid = merge_outcomes_with_features(outcomes_valid, features)
print(features_train.shape)
features_train.tail()

(1100, 121)


Unnamed: 0_level_0,Win,ScoreRegHigh,ScoreOppRegHigh,FGMRegHigh,FGARegHigh,FGM3RegHigh,FGA3RegHigh,FTMRegHigh,FTARegHigh,ORRegHigh,DRRegHigh,AstRegHigh,TORegHigh,StlRegHigh,BlkRegHigh,PFRegHigh,FGMOppRegHigh,FGAOppRegHigh,FGM3OppRegHigh,FGA3OppRegHigh,FTMOppRegHigh,FTAOppRegHigh,OROppRegHigh,DROppRegHigh,AstOppRegHigh,TOOppRegHigh,StlOppRegHigh,BlkOppRegHigh,PFOppRegHigh,DurRegHigh,ScoreTouHigh,ScoreOppTouHigh,FGMTouHigh,FGATouHigh,FGM3TouHigh,FGA3TouHigh,FTMTouHigh,FTATouHigh,ORTouHigh,DRTouHigh,AstTouHigh,TOTouHigh,StlTouHigh,BlkTouHigh,PFTouHigh,FGMOppTouHigh,FGAOppTouHigh,FGM3OppTouHigh,FGA3OppTouHigh,FTMOppTouHigh,FTAOppTouHigh,OROppTouHigh,DROppTouHigh,AstOppTouHigh,TOOppTouHigh,StlOppTouHigh,BlkOppTouHigh,PFOppTouHigh,DurTouHigh,SeedHigh,GenderHigh,ScoreRegLow,ScoreOppRegLow,FGMRegLow,FGARegLow,FGM3RegLow,FGA3RegLow,FTMRegLow,FTARegLow,ORRegLow,DRRegLow,AstRegLow,TORegLow,StlRegLow,BlkRegLow,PFRegLow,FGMOppRegLow,FGAOppRegLow,FGM3OppRegLow,FGA3OppRegLow,FTMOppRegLow,FTAOppRegLow,OROppRegLow,DROppRegLow,AstOppRegLow,TOOppRegLow,StlOppRegLow,BlkOppRegLow,PFOppRegLow,DurRegLow,ScoreTouLow,ScoreOppTouLow,FGMTouLow,FGATouLow,FGM3TouLow,FGA3TouLow,FTMTouLow,FTATouLow,ORTouLow,DRTouLow,AstTouLow,TOTouLow,StlTouLow,BlkTouLow,PFTouLow,FGMOppTouLow,FGAOppTouLow,FGM3OppTouLow,FGA3OppTouLow,FTMOppTouLow,FTAOppTouLow,OROppTouLow,DROppTouLow,AstOppTouLow,TOOppTouLow,StlOppTouLow,BlkOppTouLow,PFOppTouLow,DurTouLow,SeedLow,GenderLow
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1
2012_1462_1323,True,65.0,61.0,23.0,53.0,6.0,19.0,13.0,19.0,10.0,24.0,15.0,10.0,5.0,3.0,14.0,23.0,55.0,5.0,15.0,9.0,13.0,11.0,23.0,13.0,10.0,5.0,3.0,17.0,40.0,63.0,63.5,20.0,54.5,7.0,23.5,16.0,22.5,10.5,29.0,14.0,11.0,4.5,5.0,18.0,22.5,56.0,8.0,20.0,10.5,15.5,8.5,27.5,12.5,10.0,4.0,6.5,19.5,40.0,7,0,71.0,66.0,25.0,56.0,5.0,14.0,14.0,21.0,10.0,26.0,12.0,13.0,6.0,4.0,19.0,22.0,56.0,6.0,20.0,16.0,22.0,10.0,24.0,13.0,13.0,6.0,3.0,19.0,40.0,55.0,66.0,21.0,51.0,2.0,13.0,11.0,15.0,11.0,19.0,11.0,15.0,3.0,2.0,19.0,24.0,45.0,5.0,12.0,13.0,19.0,5.0,21.0,13.0,15.0,6.0,2.0,18.0,40.0,10,0
2015_1455_1242,True,72.0,63.0,24.0,53.0,6.0,16.0,17.0,24.0,11.0,26.0,11.0,14.0,6.0,5.0,18.0,23.5,58.0,6.0,18.0,13.5,20.0,12.0,22.0,10.0,12.0,6.5,4.5,19.0,40.0,68.5,64.5,25.5,55.5,2.5,11.5,15.0,21.0,13.5,25.5,12.0,14.0,5.5,4.5,18.0,21.5,52.0,6.0,20.0,15.5,21.0,8.0,20.0,10.0,11.5,9.0,2.5,18.0,40.0,2,0,68.0,55.0,25.0,54.0,6.0,18.0,13.0,20.0,12.0,23.0,14.0,9.0,7.0,4.0,16.0,20.0,49.0,5.0,14.0,12.0,18.0,9.0,22.0,9.0,13.0,4.0,3.0,18.0,40.0,70.0,57.5,25.0,49.5,8.0,20.0,12.0,19.5,7.5,27.0,15.5,10.5,5.0,4.5,19.0,19.5,54.0,6.5,23.0,12.0,20.0,11.5,21.5,8.0,8.5,6.5,2.0,16.0,40.0,7,0
2015_1308_1242,False,72.0,63.0,24.0,53.0,6.0,16.0,17.0,24.0,11.0,26.0,11.0,14.0,6.0,5.0,18.0,23.5,58.0,6.0,18.0,13.5,20.0,12.0,22.0,10.0,12.0,6.5,4.5,19.0,40.0,68.5,64.5,25.5,55.5,2.5,11.5,15.0,21.0,13.5,25.5,12.0,14.0,5.5,4.5,18.0,21.5,52.0,6.0,20.0,15.5,21.0,8.0,20.0,10.0,11.5,9.0,2.5,18.0,40.0,2,0,70.0,61.0,23.0,50.0,4.0,13.0,15.0,22.0,11.0,23.0,12.0,13.0,6.0,4.0,16.0,23.0,50.0,3.0,12.0,10.0,16.0,10.0,19.0,11.0,13.0,5.0,1.0,19.0,40.0,69.0,73.0,26.0,65.0,5.0,16.0,12.0,20.0,14.0,27.0,13.0,13.0,4.0,4.0,22.0,23.0,59.0,6.0,17.0,21.0,27.0,10.0,30.0,9.0,11.0,8.0,9.0,18.0,45.0,15,0
2015_1326_1433,True,71.0,65.0,25.0,59.0,8.0,23.0,15.0,22.0,12.0,23.0,12.0,10.0,9.0,4.0,18.0,22.0,53.0,6.0,19.0,13.0,20.0,11.0,26.0,13.0,16.0,5.0,3.0,19.0,40.0,75.0,77.0,30.0,65.0,8.0,21.0,7.0,11.0,11.0,18.0,13.0,11.0,11.0,5.0,21.0,27.0,51.0,6.0,16.0,17.0,23.0,10.0,25.0,16.0,17.0,6.0,2.0,16.0,45.0,7,0,74.0,61.0,27.0,57.0,6.0,18.0,14.0,21.0,11.0,24.0,16.0,11.0,8.0,5.0,16.0,22.0,55.0,7.0,21.0,10.0,14.0,11.0,21.0,13.0,14.0,5.0,2.0,18.0,40.0,59.0,60.0,24.0,50.0,3.0,12.0,8.0,12.0,3.0,25.0,12.0,14.0,10.0,4.0,16.0,22.0,49.0,3.0,13.0,13.0,17.0,4.0,24.0,12.0,13.0,10.0,0.0,14.0,40.0,10,0
2015_1433_1326,False,74.0,61.0,27.0,57.0,6.0,18.0,14.0,21.0,11.0,24.0,16.0,11.0,8.0,5.0,16.0,22.0,55.0,7.0,21.0,10.0,14.0,11.0,21.0,13.0,14.0,5.0,2.0,18.0,40.0,59.0,60.0,24.0,50.0,3.0,12.0,8.0,12.0,3.0,25.0,12.0,14.0,10.0,4.0,16.0,22.0,49.0,3.0,13.0,13.0,17.0,4.0,24.0,12.0,13.0,10.0,0.0,14.0,40.0,10,0,71.0,65.0,25.0,59.0,8.0,23.0,15.0,22.0,12.0,23.0,12.0,10.0,9.0,4.0,18.0,22.0,53.0,6.0,19.0,13.0,20.0,11.0,26.0,13.0,16.0,5.0,3.0,19.0,40.0,75.0,77.0,30.0,65.0,8.0,21.0,7.0,11.0,11.0,18.0,13.0,11.0,11.0,5.0,21.0,27.0,51.0,6.0,16.0,17.0,23.0,10.0,25.0,16.0,17.0,6.0,2.0,16.0,45.0,7,0


In [11]:
y_train = features_train["Win"]
X_train = features_train.drop("Win", axis=1)
y_valid = features_valid["Win"]
X_valid = features_valid.drop("Win", axis=1)

# Step 4: Train a model


### Setup Hyperparameter Tuning
See https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning

In [12]:
from sklearn.metrics import brier_score_loss, roc_auc_score, confusion_matrix
import lightgbm

model = lightgbm.LGBMClassifier(objective="binary", min_split_gain=5)
model.fit(X_train, y_train)

preds_train = model.predict(X_train)
preds_valid = model.predict(X_valid)

print("Training Score:", model.score(X_train, y_train))
print("Validation Score:", model.score(X_valid, y_valid))
print("Brier Score Validation:", brier_score_loss(y_valid, preds_valid))
print("ROC AUC Validation:", roc_auc_score(y_valid, preds_valid))
print("Parameters")
print(*(f"- {key}: {value}" for key, value in model.get_params(deep=True).items()), sep="\n")
print("Features")
print(*(f"- {name}: {imp}" for name, imp in sorted(zip(model.feature_name_, model.feature_importances_), key=lambda x: x[1], reverse=True)), sep="\n")


Training Score: 0.8254545454545454
Validation Score: 0.73224043715847
Brier Score Validation: 0.2677595628415301
ROC AUC Validation: 0.73224043715847
Parameters
- boosting_type: gbdt
- class_weight: None
- colsample_bytree: 1.0
- importance_type: split
- learning_rate: 0.1
- max_depth: -1
- min_child_samples: 20
- min_child_weight: 0.001
- min_split_gain: 5
- n_estimators: 100
- n_jobs: -1
- num_leaves: 31
- objective: binary
- random_state: None
- reg_alpha: 0.0
- reg_lambda: 0.0
- silent: warn
- subsample: 1.0
- subsample_for_bin: 200000
- subsample_freq: 0
Features
- SeedHigh: 42
- SeedLow: 40
- FGMRegHigh: 15
- FGMRegLow: 13
- AstRegLow: 13
- AstRegHigh: 11
- OROppTouHigh: 5
- DROppRegLow: 5
- AstTouLow: 5
- FGMOppRegHigh: 4
- DROppTouHigh: 4
- FGMOppRegLow: 4
- FGA3OppTouLow: 4
- OROppTouLow: 4
- DROppRegHigh: 3
- FGMTouHigh: 3
- AstTouHigh: 3
- FGARegLow: 3
- FTAOppRegLow: 3
- OROppRegLow: 3
- DROppTouLow: 3
- FGARegHigh: 2
- FTAOppRegHigh: 2
- OROppRegHigh: 2
- AstOppRegHigh: 2


# Step 5: Submit to the competition

We"ll begin by using the trained model to generate predictions, which we"ll save to a CSV file.

In [13]:
SampleSubmissionWarmup = pd.read_csv("/kaggle/input/warmup-round-march-machine-learning-mania-2023/SampleSubmissionWarmup.csv")

print(SampleSubmissionWarmup.shape)
SampleSubmissionWarmup.tail()

(614319, 2)


Unnamed: 0,ID,Pred
614314,2022_3469_3471,0.5
614315,2022_3469_3472,0.5
614316,2022_3470_3471,0.5
614317,2022_3470_3472,0.5
614318,2022_3471_3472,0.5


In [14]:
def get_submission_outcomes(sample_submission: pd.DataFrame) -> pd.DataFrame:
    df = sample_submission.copy()
    df.drop("Pred", axis=1, inplace=True)
    df[["Season", "LowID", "HighID"]] = df["ID"].str.split("_", expand=True)
    df[["Season", "LowID", "HighID"]] = df[["Season", "LowID", "HighID"]].astype(int)
    return df

In [15]:
submission_outcomes = get_submission_outcomes(SampleSubmissionWarmup)
print(submission_outcomes.shape)
submission_outcomes.tail()

(614319, 4)


Unnamed: 0,ID,Season,LowID,HighID
614314,2022_3469_3471,2022,3469,3471
614315,2022_3469_3472,2022,3469,3472
614316,2022_3470_3471,2022,3470,3471
614317,2022_3470_3472,2022,3470,3472
614318,2022_3471_3472,2022,3471,3472


In [16]:
X_submission = merge_outcomes_with_features(submission_outcomes, features, how="left").fillna(0)
print(X_submission.shape)
X_submission.tail()

(614319, 120)


Unnamed: 0_level_0,ScoreRegHigh,ScoreOppRegHigh,FGMRegHigh,FGARegHigh,FGM3RegHigh,FGA3RegHigh,FTMRegHigh,FTARegHigh,ORRegHigh,DRRegHigh,AstRegHigh,TORegHigh,StlRegHigh,BlkRegHigh,PFRegHigh,FGMOppRegHigh,FGAOppRegHigh,FGM3OppRegHigh,FGA3OppRegHigh,FTMOppRegHigh,FTAOppRegHigh,OROppRegHigh,DROppRegHigh,AstOppRegHigh,TOOppRegHigh,StlOppRegHigh,BlkOppRegHigh,PFOppRegHigh,DurRegHigh,ScoreTouHigh,ScoreOppTouHigh,FGMTouHigh,FGATouHigh,FGM3TouHigh,FGA3TouHigh,FTMTouHigh,FTATouHigh,ORTouHigh,DRTouHigh,AstTouHigh,TOTouHigh,StlTouHigh,BlkTouHigh,PFTouHigh,FGMOppTouHigh,FGAOppTouHigh,FGM3OppTouHigh,FGA3OppTouHigh,FTMOppTouHigh,FTAOppTouHigh,OROppTouHigh,DROppTouHigh,AstOppTouHigh,TOOppTouHigh,StlOppTouHigh,BlkOppTouHigh,PFOppTouHigh,DurTouHigh,SeedHigh,GenderHigh,ScoreRegLow,ScoreOppRegLow,FGMRegLow,FGARegLow,FGM3RegLow,FGA3RegLow,FTMRegLow,FTARegLow,ORRegLow,DRRegLow,AstRegLow,TORegLow,StlRegLow,BlkRegLow,PFRegLow,FGMOppRegLow,FGAOppRegLow,FGM3OppRegLow,FGA3OppRegLow,FTMOppRegLow,FTAOppRegLow,OROppRegLow,DROppRegLow,AstOppRegLow,TOOppRegLow,StlOppRegLow,BlkOppRegLow,PFOppRegLow,DurRegLow,ScoreTouLow,ScoreOppTouLow,FGMTouLow,FGATouLow,FGM3TouLow,FGA3TouLow,FTMTouLow,FTATouLow,ORTouLow,DRTouLow,AstTouLow,TOTouLow,StlTouLow,BlkTouLow,PFTouLow,FGMOppTouLow,FGAOppTouLow,FGM3OppTouLow,FGA3OppTouLow,FTMOppTouLow,FTAOppTouLow,OROppTouLow,DROppTouLow,AstOppTouLow,TOOppTouLow,StlOppTouLow,BlkOppTouLow,PFOppTouLow,DurTouLow,SeedLow,GenderLow
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1
2022_3469_3471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022_3469_3472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022_3470_3471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022_3470_3472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022_3471_3472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Use the model to generate predictions

predictions = model.predict(X_submission)

# Save the predictions to a CSV file
output = pd.DataFrame({"ID": X_submission.index,
                       "Pred": predictions})
output["Pred"] = output["Pred"].astype(int)
output.to_csv("submission.csv", index=False)
print(output.shape)
output.describe()

(614319, 2)


Unnamed: 0,Pred
count,614319.0
mean,0.9247
std,0.263874
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0
