# Step 1: Imports

In [1]:
from typing import List, Tuple
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)

def load_raw_data(filename: str) -> pd.DataFrame:
    mens_filepath = f"/kaggle/input/warmup-round-march-machine-learning-mania-2023/M{filename}.csv"
    weomens_filepath = f"/kaggle/input/warmup-round-march-machine-learning-mania-2023/W{filename}.csv"
    df_mens = pd.read_csv(mens_filepath)
    df_mens["Gender"] = 0
    df_weomens = pd.read_csv(weomens_filepath)
    df_weomens["Gender"] = 1
    return pd.concat([df_mens, df_weomens])

def process_detailed_results(detailed_results: pd.DataFrame) -> pd.DataFrame:
    df = detailed_results.copy()
    df = clean_detailed_results(df)
    df = game_to_team_conversion(df)
    df = enrich_team_results(df)
    df = transform_team_results(df)
    return df

def clean_detailed_results(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop(["WLoc", "DayNum", "Gender"], axis=1)

def game_to_team_conversion(game_results: pd.DataFrame) -> pd.DataFrame:
    winners = rename_columns(game_results, "W")
    loosers = rename_columns(game_results, "L")
    team_results = pd.concat((winners, loosers))
    team_results.drop(["TeamIDOpp"], axis=1, inplace=True)
    return team_results

def enrich_team_results(df: pd.DataFrame) -> pd.DataFrame:
    df["Dur"] = 40 + 5 * df["NumOT"]
    df.drop(["NumOT"], axis=1, inplace=True)
    return df

def transform_team_results(df: pd.DataFrame) -> pd.DataFrame:
    df = df.groupby(["Season", "TeamID"]).sum()
    df["ScorePM"] = df["Score"] / df["Dur"]
    df["ScorePMOpp"] = df["ScoreOpp"] / df["Dur"]
    df["FGAPM"] = df["FGA"] / df["Dur"]
    df["FGAPMOpp"] = df["FGAOpp"] / df["Dur"]
    df["FGA3PM"] = df["FGA3"] / df["Dur"]
    df["FGA3PMOpp"] = df["FGA3Opp"] / df["Dur"]
    df["FTAPM"] = df["FTA"] / df["Dur"]
    df["FTAPMOpp"] = df["FTAOpp"] / df["Dur"]
    df["ORPM"] = df["OR"] / df["Dur"]
    df["ORPMOpp"] = df["OROpp"] / df["Dur"]
    df["DRPM"] = df["DR"] / df["Dur"]
    df["DRPMOpp"] = df["DROpp"] / df["Dur"]
    df["AstPM"] = df["Ast"] / df["Dur"]
    df["AstPMOpp"] = df["AstOpp"] / df["Dur"]
    df["TOPM"] = df["TO"] / df["Dur"]
    df["TOPMOpp"] = df["TOOpp"] / df["Dur"]
    df["StlPM"] = df["Stl"] / df["Dur"]
    df["StlPMOpp"] = df["StlOpp"] / df["Dur"]
    df["BlkPM"] = df["Blk"] / df["Dur"]
    df["BlkPMOpp"] = df["BlkOpp"] / df["Dur"]
    df["PFPM"] = df["PF"] / df["Dur"]
    df["PFPMOpp"] = df["PFOpp"] / df["Dur"]
    df["FGSR"] = df["FGM"] / df["FGA"]
    df["FGSROpp"] = df["FGMOpp"] / df["FGAOpp"]
    df["FG3SR"] = df["FGM3"] / df["FGA3"]
    df["FG3SROpp"] = df["FGM3Opp"] / df["FGA3Opp"]
    df["FTSR"] = df["FTM"] / df["FTA"]
    df["FTSROpp"] = df["FTMOpp"] / df["FTAOpp"]
    drop_columns = ["Score", "FGM", "FGA", "FGM3", "FGA3", "FTM", "FTA", "OR", "DR", "Ast", "TO", "Stl", "Blk", "PF"]
    drop_columns += [f"{name}Opp" for name in drop_columns]
    drop_columns.append("Dur")
    df.drop(drop_columns, axis=1, inplace=True)
    return df.reset_index()
    
def rename_columns(df: pd.DataFrame, team_prefix: str) -> pd.DataFrame:
    df = df.copy()
    df.columns =  (rename_column(column_name, team_prefix) for column_name in df.columns)
    return df

def rename_column(column_name: str, team_prefix: str) -> pd.DataFrame:
    if team_prefix == "W":
        opponent_prefix = "L"
    elif team_prefix == "L":
        opponent_prefix = "W"
    else:
        raise ValueError
    if column_name.startswith(team_prefix):
        column_name = column_name.lstrip(team_prefix)
    elif column_name.startswith(opponent_prefix):
        column_name = f"{column_name.lstrip(opponent_prefix)}Opp"
    return column_name

def split_winner_and_looser_columns(df: pd.DataFrame) -> Tuple[List[str], List[str]]:
    winner_columns = [name for name in df.columns if not name.startswith("L")]
    looser_columns = [name for name in df.columns if not name.startswith("W")]
    return winner_columns, looser_columns

def clean_column_names(df: pd.DataFrame) -> List[str]:
    column_names = [
        name[1:] if 
        name.startswith("L") or name.startswith("W")
        else name 
        for name in df.columns
    ]
    return column_names

def process_seeds(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.copy()
    mask = df["Season"] > 2002
    df = df[mask]
    df["Seed"] = df["Seed"].str.replace(r"\D+","", regex=True)
    df["Seed"] = df["Seed"].astype(int)
    return df

def merge_features(season_features: pd.DataFrame, tournament_features: pd.DataFrame, seed_features: pd.DataFrame) -> pd.DataFrame:
    features = pd.merge(
        season_features,
        tournament_features,
        how="inner",
        on=["Season", "TeamID"],
        suffixes=("Reg", "Tou")
    )
    features = features.merge(
        seed_features,
        how="inner",
        on=["Season", "TeamID"]
    )
    return features

def get_outcomes(df):
    input_rows = df.to_records()
    output_rows = []
    for input_row in input_rows:
        output_rows.extend(parse_row(input_row))
    out_df = pd.DataFrame(output_rows)
    out_df = out_df[out_df["Season"] < 2017]
    return out_df

def parse_row(row):
    season = row['Season']
    winning_team_id = row['WTeamID']
    losing_team_id = row['LTeamID']
    if winning_team_id < losing_team_id:
        small_id = winning_team_id
        big_id = losing_team_id
        outcome = True
    elif losing_team_id < winning_team_id:
        small_id = losing_team_id
        big_id = winning_team_id
        outcome = False
    records = [
        {
            "ID": f"{season}_{small_id}_{big_id}",
            'Season': season,
            'LowID': small_id,
            'HighID': big_id,
            'Win': outcome
        },
        {
            "ID": f"{season}_{big_id}_{small_id}",
            'Season': season,
            'LowID': big_id,
            'HighID': small_id,
            'Win': not outcome
        },
    ]
    return records

def merge_outcomes_with_features(outcomes: pd.DataFrame, features: pd.DataFrame, how: str = "inner") -> pd.DataFrame:
    data = pd.merge(
        outcomes, 
        features, 
        how=how, 
        left_on=["Season", "HighID"], 
        right_on=["Season", "TeamID"]
    )
    data = pd.merge(
        data, 
        features, 
        how=how, 
        left_on=["Season", "LowID"], 
        right_on=["Season", "TeamID"],
        suffixes=("High", "Low")
    )
    data.drop(
        ["Season", "HighID", "LowID","TeamIDHigh","TeamIDLow"], 
        axis=1, 
        inplace=True
    )
    data.set_index("ID", inplace=True)
    return data

# Step 2: Load the data

## Season Detailed Results

In [2]:
RegularSeasonDetailedResults = load_raw_data("RegularSeasonDetailedResults")
RegularSeasonDetailedResults.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,Gender
70673,2023,127,3415,63,3142,54,N,0,20,56,2,10,21,30,11,23,9,14,5,2,22,17,43,4,13,16,21,1,21,5,15,7,2,24,1
70674,2023,127,3424,71,3361,68,H,0,23,55,2,12,23,33,10,23,12,14,7,6,16,28,58,5,10,7,10,7,24,12,13,4,1,26,1
70675,2023,127,3455,65,3378,53,A,0,24,51,6,13,11,13,2,28,14,11,7,2,12,19,60,4,22,11,13,8,22,11,12,2,6,12,1
70676,2023,127,3461,65,3161,56,H,0,25,57,5,17,10,16,13,35,15,14,3,2,13,21,55,9,24,5,8,1,20,12,8,8,4,17,1
70677,2023,127,3477,65,3230,62,A,0,23,50,3,13,16,19,12,20,10,9,6,0,11,22,51,8,22,10,12,8,15,12,9,5,1,13,1


In [3]:
season_features = process_detailed_results(RegularSeasonDetailedResults)
season_features.tail()

Unnamed: 0,Season,TeamID,ScorePM,ScorePMOpp,FGAPM,FGAPMOpp,FGA3PM,FGA3PMOpp,FTAPM,FTAPMOpp,ORPM,ORPMOpp,DRPM,DRPMOpp,AstPM,AstPMOpp,TOPM,TOPMOpp,StlPM,StlPMOpp,BlkPM,BlkPMOpp,PFPM,PFPMOpp,FGSR,FGSROpp,FG3SR,FG3SROpp,FTSR,FTSROpp
12130,2023,3473,1.366,1.801,1.258,1.492,0.525,0.421,0.329,0.4,0.159,0.266,0.481,0.551,0.297,0.353,0.482,0.343,0.158,0.27,0.048,0.073,0.401,0.41,0.392687,0.463807,0.304762,0.287411,0.662614,0.74
12131,2023,3474,1.427885,1.793269,1.451923,1.374038,0.417308,0.383654,0.463462,0.585577,0.211538,0.235577,0.513462,0.675,0.163462,0.352885,0.378846,0.400962,0.180769,0.164423,0.058654,0.083654,0.529808,0.444231,0.345033,0.456963,0.264977,0.325815,0.680498,0.704433
12132,2023,3475,1.552153,1.638278,1.374163,1.395215,0.386603,0.517703,0.465072,0.5311,0.223923,0.230622,0.591388,0.538756,0.323445,0.30622,0.495694,0.430622,0.18756,0.277512,0.069856,0.079426,0.461244,0.487081,0.394847,0.396433,0.284653,0.310536,0.76749,0.699099
12133,2023,3476,1.472889,1.628444,1.389333,1.454222,0.504,0.393778,0.338667,0.36,0.216,0.218667,0.561778,0.571556,0.348444,0.293333,0.361778,0.279111,0.096889,0.196444,0.067556,0.08,0.372444,0.401778,0.386436,0.430929,0.305115,0.316027,0.724409,0.696296
12134,2023,3477,1.620968,1.748387,1.466935,1.533871,0.36129,0.495968,0.556452,0.366129,0.287097,0.237097,0.592742,0.58629,0.295968,0.389516,0.415323,0.396774,0.201613,0.222581,0.080645,0.104032,0.389516,0.466935,0.393073,0.432702,0.254464,0.331707,0.675362,0.700441


## Tournament Detailed Results

In [4]:
NCAATourneyDetailedResults = load_raw_data("NCAATourneyDetailedResults")
NCAATourneyDetailedResults.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,Gender
755,2022,147,3163,91,3301,87,N,2,37,77,5,21,12,20,12,23,10,7,5,2,16,32,66,7,23,16,19,6,30,20,13,4,7,16,1
756,2022,147,3257,62,3276,50,N,0,25,58,5,15,7,9,6,20,12,11,15,4,17,16,46,3,14,15,20,10,24,9,21,6,2,12,1
757,2022,151,3163,63,3390,58,N,0,21,57,5,14,16,20,12,30,14,19,5,2,16,23,66,4,23,8,13,11,23,10,11,11,3,16,1
758,2022,151,3376,72,3257,59,N,0,27,57,6,17,12,17,8,24,19,14,11,4,11,27,63,1,8,4,7,11,18,5,15,13,2,17,1
759,2022,153,3376,64,3163,49,N,0,22,60,3,16,17,26,18,23,9,14,6,4,11,22,54,4,16,1,4,3,16,14,14,4,5,21,1


In [5]:
tournament_features = process_detailed_results(NCAATourneyDetailedResults)
tournament_features["Season"] += 1
tournament_features.tail()

Unnamed: 0,Season,TeamID,ScorePM,ScorePMOpp,FGAPM,FGAPMOpp,FGA3PM,FGA3PMOpp,FTAPM,FTAPMOpp,ORPM,ORPMOpp,DRPM,DRPMOpp,AstPM,AstPMOpp,TOPM,TOPMOpp,StlPM,StlPMOpp,BlkPM,BlkPMOpp,PFPM,PFPMOpp,FGSR,FGSROpp,FG3SR,FG3SROpp,FTSR,FTSROpp
2034,2023,3426,1.775,1.95,1.55,1.625,0.4,0.5,0.4,0.65,0.175,0.275,0.65,0.675,0.275,0.15,0.225,0.175,0.15,0.05,0.125,0.15,0.5,0.35,0.435484,0.430769,0.4375,0.35,0.625,0.576923
2035,2023,3428,1.85,1.8375,1.3375,1.675,0.6375,0.3625,0.3,0.3125,0.15,0.2375,0.6375,0.4625,0.4125,0.2625,0.4375,0.2125,0.075,0.225,0.0375,0.1125,0.3625,0.3,0.514019,0.470149,0.411765,0.241379,0.708333,0.56
2036,2023,3437,1.375,1.5125,1.4625,1.3875,0.5875,0.375,0.3,0.4125,0.1875,0.2875,0.4875,0.7375,0.275,0.3125,0.25,0.375,0.2125,0.175,0.125,0.025,0.45,0.3125,0.350427,0.405405,0.255319,0.266667,0.666667,0.69697
2037,2023,3439,2.025,2.1,1.5,1.6,0.425,0.95,0.5,0.275,0.175,0.05,0.7,0.625,0.325,0.475,0.25,0.075,0.025,0.175,0.025,0.025,0.325,0.45,0.5,0.46875,0.352941,0.394737,0.75,0.818182
2038,2023,3450,1.0,1.25,1.4,1.325,0.55,0.35,0.35,0.55,0.2,0.15,0.775,0.75,0.125,0.175,0.425,0.275,0.05,0.15,0.05,0.075,0.575,0.275,0.25,0.264151,0.136364,0.214286,0.642857,0.863636


## Tournement Seeds

In [6]:
NCAATourneySeeds = load_raw_data("NCAATourneySeeds")
NCAATourneySeeds.tail()

Unnamed: 0,Season,Seed,TeamID,Gender
1535,2022,Z12,3125,1
1536,2022,Z13,3138,1
1537,2022,Z14,3110,1
1538,2022,Z15,3218,1
1539,2022,Z16,3107,1


In [7]:
seed_features = process_seeds(NCAATourneySeeds)
seed_features.tail()

Unnamed: 0,Season,Seed,TeamID,Gender
1535,2022,12,3125,1
1536,2022,13,3138,1
1537,2022,14,3110,1
1538,2022,15,3218,1
1539,2022,16,3107,1


## Merge features

In [8]:
features = merge_features(season_features, tournament_features, seed_features)
features.tail()

Unnamed: 0,Season,TeamID,ScorePMReg,ScorePMOppReg,FGAPMReg,FGAPMOppReg,FGA3PMReg,FGA3PMOppReg,FTAPMReg,FTAPMOppReg,ORPMReg,ORPMOppReg,DRPMReg,DRPMOppReg,AstPMReg,AstPMOppReg,TOPMReg,TOPMOppReg,StlPMReg,StlPMOppReg,BlkPMReg,BlkPMOppReg,PFPMReg,PFPMOppReg,FGSRReg,FGSROppReg,FG3SRReg,FG3SROppReg,FTSRReg,FTSROppReg,ScorePMTou,ScorePMOppTou,FGAPMTou,FGAPMOppTou,FGA3PMTou,FGA3PMOppTou,FTAPMTou,FTAPMOppTou,ORPMTou,ORPMOppTou,DRPMTou,DRPMOppTou,AstPMTou,AstPMOppTou,TOPMTou,TOPMOppTou,StlPMTou,StlPMOppTou,BlkPMTou,BlkPMOppTou,PFPMTou,PFPMOppTou,FGSRTou,FGSROppTou,FG3SRTou,FG3SROppTou,FTSRTou,FTSROppTou,Seed,Gender
977,2022,3397,1.7344,1.4928,1.5768,1.5968,0.3584,0.5856,0.492,0.3632,0.3416,0.228,0.728,0.5168,0.3632,0.284,0.4136,0.3184,0.156,0.2192,0.148,0.0776,0.3848,0.4648,0.415525,0.339679,0.303571,0.277322,0.64065,0.676211,1.775,1.65,1.5625,1.6,0.325,0.6625,0.475,0.3625,0.425,0.1875,0.725,0.4625,0.25,0.3125,0.45,0.3,0.1875,0.25,0.175,0.0125,0.3875,0.3375,0.432,0.359375,0.153846,0.301887,0.789474,0.827586,4,1
978,2022,3400,1.794595,1.4,1.530502,1.248649,0.334363,0.369884,0.498069,0.457915,0.296525,0.192278,0.522008,0.516602,0.314286,0.253282,0.342857,0.501931,0.247876,0.176834,0.088803,0.080309,0.457915,0.47722,0.433401,0.385281,0.357968,0.281837,0.699225,0.728499,1.5625,1.54375,1.48125,1.45625,0.41875,0.40625,0.4125,0.4125,0.3,0.2875,0.55625,0.6375,0.24375,0.26875,0.26875,0.3125,0.13125,0.1125,0.05,0.18125,0.38125,0.44375,0.379747,0.390558,0.328358,0.215385,0.727273,0.772727,2,1
979,2022,3416,1.516964,1.186607,1.392857,1.277679,0.276786,0.483929,0.395536,0.289286,0.292857,0.225,0.538393,0.466964,0.34375,0.292857,0.373214,0.465179,0.277679,0.188393,0.086607,0.071429,0.341964,0.408036,0.411538,0.333333,0.316129,0.300738,0.715576,0.654321,1.275,1.55,1.425,1.075,0.325,0.25,0.3,0.525,0.325,0.05,0.55,0.475,0.25,0.475,0.475,0.325,0.125,0.2,0.05,0.175,0.55,0.35,0.368421,0.511628,0.230769,0.3,0.5,0.714286,7,1
980,2022,3439,1.755642,1.435019,1.403113,1.476265,0.589105,0.402335,0.396887,0.320623,0.188327,0.215564,0.625681,0.537743,0.355642,0.250584,0.312062,0.319066,0.111284,0.150973,0.08716,0.070039,0.35642,0.438911,0.442041,0.363205,0.363276,0.313346,0.758824,0.737864,1.475,1.9125,1.4,1.55,0.5,0.3,0.3125,0.3875,0.1625,0.275,0.4875,0.675,0.35,0.4125,0.35,0.2875,0.1375,0.175,0.0375,0.1625,0.45,0.3875,0.375,0.491935,0.325,0.416667,0.84,0.677419,5,1
981,2022,3450,1.512446,1.502146,1.452361,1.391416,0.536481,0.437768,0.296137,0.346781,0.226609,0.218026,0.539914,0.584549,0.332189,0.278112,0.335622,0.35794,0.196567,0.175966,0.103863,0.079828,0.385408,0.357082,0.388298,0.395435,0.3216,0.329412,0.715942,0.742574,1.325,1.425,1.375,1.45,0.625,0.425,0.3,0.25,0.2,0.25,0.6,0.675,0.275,0.175,0.275,0.2,0.15,0.1,0.1,0.075,0.375,0.275,0.309091,0.37931,0.32,0.294118,0.916667,0.8,8,1


## Build Dataset

In [9]:
from sklearn.model_selection import train_test_split

data = load_raw_data("NCAATourneyCompactResults")
data_train, data_valid = train_test_split(data, random_state=0)

outcomes_train = get_outcomes(data_train)
outcomes_valid = get_outcomes(data_valid)

In [10]:
features_train = merge_outcomes_with_features(outcomes_train, features)
features_valid = merge_outcomes_with_features(outcomes_valid, features)
print(features_train.shape)
features_train.tail()

(794, 117)


Unnamed: 0_level_0,Win,ScorePMRegHigh,ScorePMOppRegHigh,FGAPMRegHigh,FGAPMOppRegHigh,FGA3PMRegHigh,FGA3PMOppRegHigh,FTAPMRegHigh,FTAPMOppRegHigh,ORPMRegHigh,ORPMOppRegHigh,DRPMRegHigh,DRPMOppRegHigh,AstPMRegHigh,AstPMOppRegHigh,TOPMRegHigh,TOPMOppRegHigh,StlPMRegHigh,StlPMOppRegHigh,BlkPMRegHigh,BlkPMOppRegHigh,PFPMRegHigh,PFPMOppRegHigh,FGSRRegHigh,FGSROppRegHigh,FG3SRRegHigh,FG3SROppRegHigh,FTSRRegHigh,FTSROppRegHigh,ScorePMTouHigh,ScorePMOppTouHigh,FGAPMTouHigh,FGAPMOppTouHigh,FGA3PMTouHigh,FGA3PMOppTouHigh,FTAPMTouHigh,FTAPMOppTouHigh,ORPMTouHigh,ORPMOppTouHigh,DRPMTouHigh,DRPMOppTouHigh,AstPMTouHigh,AstPMOppTouHigh,TOPMTouHigh,TOPMOppTouHigh,StlPMTouHigh,StlPMOppTouHigh,BlkPMTouHigh,BlkPMOppTouHigh,PFPMTouHigh,PFPMOppTouHigh,FGSRTouHigh,FGSROppTouHigh,FG3SRTouHigh,FG3SROppTouHigh,FTSRTouHigh,FTSROppTouHigh,SeedHigh,GenderHigh,ScorePMRegLow,ScorePMOppRegLow,FGAPMRegLow,FGAPMOppRegLow,FGA3PMRegLow,FGA3PMOppRegLow,FTAPMRegLow,FTAPMOppRegLow,ORPMRegLow,ORPMOppRegLow,DRPMRegLow,DRPMOppRegLow,AstPMRegLow,AstPMOppRegLow,TOPMRegLow,TOPMOppRegLow,StlPMRegLow,StlPMOppRegLow,BlkPMRegLow,BlkPMOppRegLow,PFPMRegLow,PFPMOppRegLow,FGSRRegLow,FGSROppRegLow,FG3SRRegLow,FG3SROppRegLow,FTSRRegLow,FTSROppRegLow,ScorePMTouLow,ScorePMOppTouLow,FGAPMTouLow,FGAPMOppTouLow,FGA3PMTouLow,FGA3PMOppTouLow,FTAPMTouLow,FTAPMOppTouLow,ORPMTouLow,ORPMOppTouLow,DRPMTouLow,DRPMOppTouLow,AstPMTouLow,AstPMOppTouLow,TOPMTouLow,TOPMOppTouLow,StlPMTouLow,StlPMOppTouLow,BlkPMTouLow,BlkPMOppTouLow,PFPMTouLow,PFPMOppTouLow,FGSRTouLow,FGSROppTouLow,FG3SRTouLow,FG3SROppTouLow,FTSRTouLow,FTSROppTouLow,SeedLow,GenderLow
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1
2012_1462_1323,True,1.637313,1.514925,1.306716,1.394776,0.480597,0.391791,0.495522,0.344776,0.252985,0.276866,0.585075,0.571642,0.356716,0.30597,0.247015,0.266418,0.127612,0.134328,0.084328,0.072388,0.338806,0.442537,0.431753,0.411985,0.329193,0.310476,0.707831,0.707792,1.575,1.5875,1.3625,1.4,0.5875,0.5,0.5625,0.3875,0.2625,0.2125,0.725,0.6875,0.35,0.3125,0.275,0.25,0.1125,0.1,0.125,0.1625,0.45,0.4875,0.366972,0.401786,0.297872,0.4,0.711111,0.677419,7,0,1.749813,1.667416,1.371536,1.417978,0.345318,0.506367,0.559551,0.516105,0.268165,0.274157,0.63221,0.576779,0.331086,0.32809,0.314607,0.314607,0.150562,0.159551,0.092884,0.08764,0.465918,0.478652,0.452758,0.397781,0.349241,0.306213,0.692102,0.744557,1.375,1.65,1.275,1.125,0.325,0.3,0.375,0.475,0.275,0.125,0.475,0.525,0.275,0.325,0.375,0.375,0.075,0.15,0.05,0.05,0.475,0.45,0.411765,0.533333,0.153846,0.416667,0.733333,0.684211,10,0
2015_1455_1242,True,1.773626,1.610989,1.369231,1.448352,0.386813,0.462271,0.589011,0.486447,0.294505,0.304029,0.650549,0.550183,0.330403,0.279121,0.318681,0.291575,0.162637,0.158242,0.125275,0.127473,0.438828,0.483516,0.439807,0.394537,0.375,0.309033,0.720149,0.668675,1.7125,1.6125,1.3875,1.3,0.2875,0.5,0.525,0.525,0.3375,0.2,0.6375,0.5,0.3,0.25,0.35,0.2875,0.1375,0.225,0.1125,0.0625,0.45,0.45,0.459459,0.413462,0.217391,0.3,0.714286,0.738095,2,0,1.7008,1.3832,1.3512,1.1984,0.4768,0.356,0.4792,0.4416,0.288,0.2136,0.5768,0.5408,0.3384,0.2144,0.232,0.3296,0.1752,0.0984,0.0936,0.0864,0.412,0.4384,0.443458,0.400534,0.35906,0.34382,0.691152,0.681159,1.75,1.4375,1.2375,1.35,0.5,0.575,0.4875,0.5,0.1875,0.2875,0.675,0.5375,0.3875,0.2,0.2625,0.2125,0.125,0.1625,0.1125,0.05,0.475,0.4,0.505051,0.361111,0.4,0.282609,0.615385,0.6,7,0
2015_1308_1242,False,1.773626,1.610989,1.369231,1.448352,0.386813,0.462271,0.589011,0.486447,0.294505,0.304029,0.650549,0.550183,0.330403,0.279121,0.318681,0.291575,0.162637,0.158242,0.125275,0.127473,0.438828,0.483516,0.439807,0.394537,0.375,0.309033,0.720149,0.668675,1.7125,1.6125,1.3875,1.3,0.2875,0.5,0.525,0.525,0.3375,0.2,0.6375,0.5,0.3,0.25,0.35,0.2875,0.1375,0.225,0.1125,0.0625,0.45,0.45,0.459459,0.413462,0.217391,0.3,0.714286,0.738095,2,0,1.6856,1.4816,1.2608,1.2968,0.3184,0.3048,0.5656,0.4216,0.3056,0.2496,0.5632,0.4648,0.292,0.2736,0.344,0.3224,0.1568,0.1336,0.0904,0.0544,0.4,0.476,0.463832,0.423812,0.364322,0.293963,0.707214,0.694497,1.533333,1.622222,1.444444,1.311111,0.355556,0.377778,0.444444,0.6,0.311111,0.222222,0.6,0.666667,0.288889,0.2,0.288889,0.244444,0.088889,0.177778,0.088889,0.2,0.488889,0.4,0.4,0.389831,0.3125,0.352941,0.6,0.777778,15,0
2015_1326_1433,True,1.774126,1.603497,1.451748,1.283217,0.579021,0.453846,0.544056,0.485315,0.301399,0.251049,0.565734,0.648951,0.309091,0.315385,0.260839,0.395804,0.236364,0.143357,0.105594,0.081818,0.470629,0.458042,0.420039,0.432698,0.341787,0.343606,0.655527,0.694524,1.666667,1.711111,1.444444,1.133333,0.466667,0.355556,0.244444,0.511111,0.244444,0.222222,0.4,0.555556,0.288889,0.355556,0.244444,0.377778,0.244444,0.133333,0.111111,0.044444,0.466667,0.355556,0.461538,0.529412,0.380952,0.375,0.636364,0.73913,7,0,1.888302,1.553962,1.423396,1.384151,0.456604,0.523774,0.495094,0.378868,0.280755,0.279245,0.612075,0.541887,0.384151,0.321509,0.281509,0.365283,0.193962,0.125283,0.126792,0.070943,0.399245,0.447547,0.485684,0.405671,0.371901,0.318444,0.678354,0.697211,1.475,1.5,1.25,1.225,0.3,0.325,0.3,0.425,0.075,0.1,0.625,0.6,0.3,0.3,0.35,0.325,0.25,0.25,0.1,0.0,0.4,0.35,0.48,0.44898,0.25,0.230769,0.666667,0.764706,10,0
2015_1433_1326,False,1.888302,1.553962,1.423396,1.384151,0.456604,0.523774,0.495094,0.378868,0.280755,0.279245,0.612075,0.541887,0.384151,0.321509,0.281509,0.365283,0.193962,0.125283,0.126792,0.070943,0.399245,0.447547,0.485684,0.405671,0.371901,0.318444,0.678354,0.697211,1.475,1.5,1.25,1.225,0.3,0.325,0.3,0.425,0.075,0.1,0.625,0.6,0.3,0.3,0.35,0.325,0.25,0.25,0.1,0.0,0.4,0.35,0.48,0.44898,0.25,0.230769,0.666667,0.764706,10,0,1.774126,1.603497,1.451748,1.283217,0.579021,0.453846,0.544056,0.485315,0.301399,0.251049,0.565734,0.648951,0.309091,0.315385,0.260839,0.395804,0.236364,0.143357,0.105594,0.081818,0.470629,0.458042,0.420039,0.432698,0.341787,0.343606,0.655527,0.694524,1.666667,1.711111,1.444444,1.133333,0.466667,0.355556,0.244444,0.511111,0.244444,0.222222,0.4,0.555556,0.288889,0.355556,0.244444,0.377778,0.244444,0.133333,0.111111,0.044444,0.466667,0.355556,0.461538,0.529412,0.380952,0.375,0.636364,0.73913,7,0


In [11]:
y_train = features_train["Win"]
X_train = features_train.drop("Win", axis=1)
y_valid = features_valid["Win"]
X_valid = features_valid.drop("Win", axis=1)

# Step 4: Train a model


### Setup Hyperparameter Tuning
See https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning

In [12]:
from sklearn.metrics import brier_score_loss, roc_auc_score, confusion_matrix
import lightgbm

model = lightgbm.LGBMClassifier(objective="binary", n_estimators=500)
model.fit(X_train, y_train)

preds_train = model.predict(X_train)
preds_valid = model.predict(X_valid)

print("Training Score:", model.score(X_train, y_train))
print("Validation Score:", model.score(X_valid, y_valid))
print("Brier Score Validation:", brier_score_loss(y_valid, preds_valid))
print("ROC AUC Validation:", roc_auc_score(y_valid, preds_valid))
print("Parameters")
print(*(f"- {key}: {value}" for key, value in model.get_params(deep=True).items()), sep="\n")
print("Features")
print(*(f"- {name}: {imp}" for name, imp in sorted(zip(model.feature_name_, model.feature_importances_), key=lambda x: x[1], reverse=True)), sep="\n")


Training Score: 1.0
Validation Score: 0.609375
Brier Score Validation: 0.390625
ROC AUC Validation: 0.609375
Parameters
- boosting_type: gbdt
- class_weight: None
- colsample_bytree: 1.0
- importance_type: split
- learning_rate: 0.1
- max_depth: -1
- min_child_samples: 20
- min_child_weight: 0.001
- min_split_gain: 0.0
- n_estimators: 500
- n_jobs: -1
- num_leaves: 31
- objective: binary
- random_state: None
- reg_alpha: 0.0
- reg_lambda: 0.0
- silent: warn
- subsample: 1.0
- subsample_for_bin: 200000
- subsample_freq: 0
Features
- SeedLow: 333
- SeedHigh: 330
- ScorePMRegHigh: 189
- FTSRRegHigh: 175
- ScorePMRegLow: 167
- FTSRRegLow: 155
- ORPMOppRegLow: 151
- ORPMOppRegHigh: 138
- ScorePMOppRegHigh: 136
- FG3SROppTouHigh: 132
- FGAPMRegHigh: 125
- AstPMOppRegHigh: 124
- ScorePMOppRegLow: 124
- FGA3PMOppRegLow: 121
- FG3SROppTouLow: 121
- StlPMRegHigh: 118
- FGA3PMOppTouHigh: 117
- AstPMOppRegLow: 117
- DRPMOppTouHigh: 114
- StlPMRegLow: 113
- BlkPMOppRegLow: 110
- StlPMOppRegHigh: 10

# Step 5: Submit to the competition

We"ll begin by using the trained model to generate predictions, which we"ll save to a CSV file.

In [13]:
SampleSubmissionWarmup = pd.read_csv("/kaggle/input/warmup-round-march-machine-learning-mania-2023/SampleSubmissionWarmup.csv")

print(SampleSubmissionWarmup.shape)
SampleSubmissionWarmup.tail()

(614319, 2)


Unnamed: 0,ID,Pred
614314,2022_3469_3471,0.5
614315,2022_3469_3472,0.5
614316,2022_3470_3471,0.5
614317,2022_3470_3472,0.5
614318,2022_3471_3472,0.5


In [14]:
def get_submission_outcomes(sample_submission: pd.DataFrame) -> pd.DataFrame:
    df = sample_submission.copy()
    df.drop("Pred", axis=1, inplace=True)
    df[["Season", "LowID", "HighID"]] = df["ID"].str.split("_", expand=True)
    df[["Season", "LowID", "HighID"]] = df[["Season", "LowID", "HighID"]].astype(int)
    return df

In [15]:
submission_outcomes = get_submission_outcomes(SampleSubmissionWarmup)
print(submission_outcomes.shape)
submission_outcomes.tail()

(614319, 4)


Unnamed: 0,ID,Season,LowID,HighID
614314,2022_3469_3471,2022,3469,3471
614315,2022_3469_3472,2022,3469,3472
614316,2022_3470_3471,2022,3470,3471
614317,2022_3470_3472,2022,3470,3472
614318,2022_3471_3472,2022,3471,3472


In [16]:
X_submission = merge_outcomes_with_features(submission_outcomes, features, how="left")
print(X_submission.shape)
X_submission.describe()

(614319, 116)


Unnamed: 0,ScorePMRegHigh,ScorePMOppRegHigh,FGAPMRegHigh,FGAPMOppRegHigh,FGA3PMRegHigh,FGA3PMOppRegHigh,FTAPMRegHigh,FTAPMOppRegHigh,ORPMRegHigh,ORPMOppRegHigh,DRPMRegHigh,DRPMOppRegHigh,AstPMRegHigh,AstPMOppRegHigh,TOPMRegHigh,TOPMOppRegHigh,StlPMRegHigh,StlPMOppRegHigh,BlkPMRegHigh,BlkPMOppRegHigh,PFPMRegHigh,PFPMOppRegHigh,FGSRRegHigh,FGSROppRegHigh,FG3SRRegHigh,FG3SROppRegHigh,FTSRRegHigh,FTSROppRegHigh,ScorePMTouHigh,ScorePMOppTouHigh,FGAPMTouHigh,FGAPMOppTouHigh,FGA3PMTouHigh,FGA3PMOppTouHigh,FTAPMTouHigh,FTAPMOppTouHigh,ORPMTouHigh,ORPMOppTouHigh,DRPMTouHigh,DRPMOppTouHigh,AstPMTouHigh,AstPMOppTouHigh,TOPMTouHigh,TOPMOppTouHigh,StlPMTouHigh,StlPMOppTouHigh,BlkPMTouHigh,BlkPMOppTouHigh,PFPMTouHigh,PFPMOppTouHigh,FGSRTouHigh,FGSROppTouHigh,FG3SRTouHigh,FG3SROppTouHigh,FTSRTouHigh,FTSROppTouHigh,SeedHigh,GenderHigh,ScorePMRegLow,ScorePMOppRegLow,FGAPMRegLow,FGAPMOppRegLow,FGA3PMRegLow,FGA3PMOppRegLow,FTAPMRegLow,FTAPMOppRegLow,ORPMRegLow,ORPMOppRegLow,DRPMRegLow,DRPMOppRegLow,AstPMRegLow,AstPMOppRegLow,TOPMRegLow,TOPMOppRegLow,StlPMRegLow,StlPMOppRegLow,BlkPMRegLow,BlkPMOppRegLow,PFPMRegLow,PFPMOppRegLow,FGSRRegLow,FGSROppRegLow,FG3SRRegLow,FG3SROppRegLow,FTSRRegLow,FTSROppRegLow,ScorePMTouLow,ScorePMOppTouLow,FGAPMTouLow,FGAPMOppTouLow,FGA3PMTouLow,FGA3PMOppTouLow,FTAPMTouLow,FTAPMOppTouLow,ORPMTouLow,ORPMOppTouLow,DRPMTouLow,DRPMOppTouLow,AstPMTouLow,AstPMOppTouLow,TOPMTouLow,TOPMOppTouLow,StlPMTouLow,StlPMOppTouLow,BlkPMTouLow,BlkPMOppTouLow,PFPMTouLow,PFPMOppTouLow,FGSRTouLow,FGSROppTouLow,FG3SRTouLow,FG3SROppTouLow,FTSRTouLow,FTSROppTouLow,SeedLow,GenderLow
count,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49144.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0,49499.0
mean,1.858486,1.590076,1.490356,1.445211,0.507434,0.498032,0.4629,0.400293,0.285958,0.251438,0.646332,0.566554,0.373576,0.296915,0.317045,0.355298,0.179419,0.152058,0.098286,0.075875,0.410129,0.444144,0.451721,0.397679,0.352146,0.318204,0.719876,0.703846,1.702026,1.746,1.48138,1.463001,0.492698,0.483937,0.414851,0.440886,0.274206,0.263663,0.599813,0.610691,0.321294,0.334457,0.306497,0.2997,0.151111,0.152585,0.092464,0.090276,0.428744,0.413527,0.421098,0.431677,0.32723,0.347176,0.718022,0.726483,6.509849,0.512921,1.875173,1.589861,1.49646,1.451773,0.512861,0.491211,0.459217,0.400281,0.284569,0.25374,0.648357,0.563671,0.388164,0.295589,0.321129,0.360194,0.181053,0.151218,0.099914,0.074257,0.407894,0.441469,0.455768,0.397157,0.351937,0.31632,0.718453,0.702696,1.730575,1.758024,1.484696,1.469402,0.490962,0.468326,0.418485,0.444262,0.269837,0.267539,0.59444,0.606425,0.335586,0.333594,0.309411,0.308687,0.157737,0.157772,0.087903,0.089159,0.428777,0.413374,0.427727,0.435992,0.330491,0.335625,0.716384,0.724614,6.861351,0.55884
std,0.149846,0.151692,0.095091,0.098173,0.105734,0.075884,0.068625,0.06757,0.059898,0.042734,0.056294,0.048347,0.049103,0.041122,0.047731,0.067508,0.041488,0.024845,0.026382,0.014119,0.046836,0.040566,0.027855,0.029885,0.029059,0.025045,0.037541,0.025312,0.272351,0.24376,0.14381,0.148658,0.142127,0.126595,0.12924,0.147228,0.090745,0.082141,0.117728,0.125696,0.089528,0.0897,0.087327,0.087007,0.057467,0.066017,0.0478,0.048683,0.08747,0.081758,0.070211,0.062782,0.09026,0.085059,0.105032,0.086549,4.283826,0.499838,0.158058,0.146308,0.088055,0.089952,0.117763,0.078457,0.070211,0.068374,0.054796,0.038442,0.062073,0.049908,0.058408,0.038795,0.044502,0.06641,0.041881,0.024915,0.02836,0.013706,0.04607,0.040183,0.028748,0.029876,0.029046,0.024283,0.035587,0.024746,0.266675,0.244481,0.141467,0.138853,0.139793,0.125899,0.132887,0.144124,0.091486,0.086918,0.111028,0.117531,0.098203,0.087046,0.08699,0.088363,0.0621,0.062614,0.046428,0.043586,0.086714,0.079476,0.064051,0.06154,0.085555,0.088847,0.108093,0.086616,4.446288,0.496531
min,1.500444,1.152344,1.259398,1.186923,0.208594,0.272932,0.29465,0.219531,0.165079,0.154887,0.4696,0.439844,0.250193,0.205469,0.209639,0.223016,0.089157,0.086861,0.035206,0.046324,0.236948,0.354506,0.368984,0.312268,0.267516,0.248588,0.615238,0.634615,0.775,1.2,1.0,1.022222,0.175,0.075,0.075,0.166667,0.075,0.05,0.225,0.25,0.075,0.175,0.1,0.111111,0.025,0.0,0.0,0.0,0.225,0.125,0.162162,0.298507,0.0,0.0,0.3,0.444444,1.0,0.0,1.500444,1.152344,1.259398,1.186923,0.208594,0.272932,0.29465,0.219531,0.165079,0.154887,0.4696,0.439844,0.250193,0.205469,0.209639,0.223016,0.089157,0.086861,0.035206,0.046324,0.236948,0.354506,0.368984,0.312268,0.267516,0.248588,0.615238,0.634615,0.775,1.2,1.0,1.022222,0.175,0.075,0.075,0.166667,0.075,0.05,0.225,0.25,0.075,0.175,0.1,0.111111,0.025,0.0,0.0,0.0,0.225,0.125,0.162162,0.298507,0.0,0.0,0.3,0.444444,1.0,0.0
25%,1.762278,1.502146,1.427237,1.375875,0.4392,0.438222,0.416912,0.348529,0.236364,0.216461,0.604633,0.532353,0.342187,0.267188,0.283268,0.306349,0.149398,0.135606,0.079715,0.0656,0.383936,0.417557,0.434196,0.375833,0.33274,0.30102,0.695761,0.6841,1.55,1.575,1.3875,1.377778,0.4,0.4,0.325,0.344,0.2125,0.208,0.525,0.525,0.25,0.275,0.25,0.241667,0.116667,0.108333,0.05,0.058333,0.375,0.3625,0.377358,0.386957,0.269231,0.292683,0.676923,0.666667,3.0,0.0,1.775806,1.472794,1.438843,1.383658,0.447809,0.437363,0.404979,0.353846,0.242182,0.22735,0.607812,0.525283,0.348663,0.266909,0.293182,0.310145,0.153676,0.135606,0.080833,0.064493,0.382258,0.4125,0.436238,0.377642,0.333333,0.298755,0.693069,0.686047,1.536,1.583333,1.4,1.3875,0.3875,0.3875,0.341667,0.344,0.2125,0.2,0.5125,0.525,0.275,0.273171,0.25,0.25,0.116667,0.115,0.05,0.058333,0.36875,0.36,0.38785,0.390244,0.283019,0.285714,0.666667,0.666667,3.0,0.0
50%,1.84812,1.589272,1.486466,1.452885,0.515018,0.501832,0.464057,0.397701,0.286777,0.253488,0.638545,0.565625,0.367969,0.293976,0.313534,0.344856,0.171984,0.151562,0.095312,0.075445,0.406867,0.444,0.451276,0.399369,0.352884,0.318015,0.71848,0.704348,1.733333,1.715152,1.475,1.475,0.5,0.4875,0.4125,0.41875,0.26875,0.25,0.6125,0.6,0.325,0.313725,0.291667,0.291667,0.15,0.142857,0.0875,0.075,0.425,0.4125,0.426471,0.426087,0.328358,0.340206,0.729412,0.727273,6.0,1.0,1.879537,1.594422,1.493436,1.456818,0.516129,0.49927,0.458333,0.398496,0.288889,0.2576,0.638545,0.560606,0.3792,0.292248,0.315909,0.352896,0.175104,0.151004,0.09781,0.073387,0.406867,0.440496,0.454295,0.401226,0.353116,0.314554,0.718009,0.704274,1.75,1.75,1.4875,1.475,0.5,0.469388,0.425,0.425,0.2625,0.25,0.6125,0.6,0.325,0.31875,0.296,0.3,0.15,0.15,0.075,0.082927,0.422222,0.40625,0.431818,0.429245,0.333333,0.323529,0.730769,0.72,6.0,1.0
75%,1.970588,1.696667,1.5424,1.512351,0.568498,0.543939,0.50989,0.444531,0.327419,0.282731,0.688281,0.597727,0.398535,0.3275,0.3464,0.403101,0.206818,0.167078,0.114801,0.084127,0.438168,0.472453,0.469867,0.419441,0.37037,0.334672,0.746154,0.724026,1.9,1.9,1.575,1.55,0.575,0.566667,0.495833,0.525,0.325,0.3125,0.6875,0.675,0.375,0.4,0.35,0.35,0.18,0.175,0.125,0.1125,0.475,0.466667,0.468468,0.467742,0.390625,0.4,0.785714,0.78125,10.0,1.0,1.99144,1.696667,1.546875,1.52342,0.571984,0.543966,0.504839,0.448387,0.320588,0.278571,0.6875,0.6,0.412851,0.325,0.346988,0.407469,0.212448,0.164315,0.1168,0.082625,0.438168,0.472453,0.473931,0.41895,0.374092,0.333333,0.743151,0.722488,1.925,1.908333,1.5875,1.55,0.5875,0.557576,0.50625,0.525,0.325,0.325,0.675,0.675,0.391667,0.3875,0.3625,0.3625,0.195833,0.196,0.11875,0.108333,0.49375,0.466667,0.473404,0.473684,0.387097,0.384615,0.777778,0.777778,11.0,1.0
max,2.246875,1.978832,1.840833,1.657576,0.850442,0.694208,0.658608,0.598333,0.493333,0.345455,0.845455,0.681275,0.583594,0.398387,0.490833,0.555,0.295833,0.2375,0.2,0.125292,0.594167,0.582418,0.531798,0.4819,0.420103,0.382883,0.841035,0.762864,2.345455,2.575,1.9,1.925,1.0,0.844444,0.766667,1.075,0.55,0.6,0.875,1.275,0.725,0.658333,0.725,0.575,0.35,0.525,0.2375,0.325,0.825,0.65,0.581994,0.618182,0.5,0.75,1.0,1.0,16.0,1.0,2.246875,1.978832,1.840833,1.657576,0.850442,0.694208,0.658608,0.598333,0.493333,0.345455,0.845455,0.681275,0.583594,0.398387,0.490833,0.555,0.295833,0.2375,0.2,0.125292,0.594167,0.582418,0.531798,0.4819,0.420103,0.382883,0.841035,0.762864,2.345455,2.575,1.9,1.925,1.0,0.844444,0.766667,1.075,0.55,0.6,0.875,1.275,0.725,0.658333,0.725,0.575,0.35,0.525,0.2375,0.325,0.825,0.65,0.581994,0.618182,0.5,0.75,1.0,1.0,16.0,1.0


In [17]:
# Use the model to generate predictions

predictions = model.predict(X_submission)

# Save the predictions to a CSV file
output = pd.DataFrame({"ID": X_submission.index,
                       "Pred": predictions})
output.to_csv("submission.csv", index=False)
print(output.shape)
output.describe()

(614319, 2)


Unnamed: 0,ID,Pred
count,614319,614319
unique,614319,2
top,2017_1101_1102,True
freq,1,569524
