# Imports and Utilities

In [1]:
from typing import List, Tuple
import numpy as np
import pandas as pd
from IPython.display import Markdown
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, brier_score_loss
from sklearn.model_selection import cross_val_score

COMPETITION_NAME = "march-machine-learning-mania-2023"
VERBOSE = False
SCORING = "neg_brier_score"

pd.set_option("display.max_columns", None)

def show_df(df: pd.DataFrame, name: str = "DataFrame", verbose: bool = VERBOSE):
    if verbose:
        display(Markdown(f"## {name}"))
        display(Markdown("### Head"))
        display(df.head())
        display(Markdown("### Tail"))
        display(df.tail())
        display(Markdown("### Description"))
        display(df.describe())


def extract_data(filename: str, competition_name:str = COMPETITION_NAME) -> pd.DataFrame:
    mens_filepath = f"/kaggle/input/{competition_name}/M{filename}.csv"
    womens_filepath = f"/kaggle/input/{competition_name}/W{filename}.csv"
    try:
        df_mens = pd.read_csv(mens_filepath)
    except FileNotFoundError:
        df_mens = None
    try:
        df_womens = pd.read_csv(womens_filepath)
    except FileNotFoundError:
        df_womens = None
    df = pd.concat([df_mens, df_womens])
    show_df(df, filename)
    return df


def get_team_features(detailed_results: pd.DataFrame) -> pd.DataFrame:
    df = detailed_results.copy()
    df = clean_detailed_results(df)
    df = transform_game_to_team(df)
    df = transform_team_results(df)
    show_df(df)
    return df


def get_seed_features(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.copy()
    mask = df["Season"] > 2002
    df = df[mask]
    df["Seed"] = df["Seed"].str.replace(r"\D+","", regex=True)
    df["Seed"] = df["Seed"].astype(int)
    show_df(df)
    return df


def get_ranking_features(df_in:pd.DataFrame) -> pd.DataFrame:
    df = df_in.copy()
    mask = df["RankingDayNum"] == df["RankingDayNum"].max()
    df = df[mask]
    df.drop(["SystemName", "RankingDayNum"], axis=1, inplace=True)
    df = df.groupby(["Season", "TeamID"]).agg("median")
    df = df.reset_index()
    show_df(df)
    return df


def get_game_outcomes(df):
    input_rows = df.to_records()
    output_rows = []
    for input_row in input_rows:
        output_rows.extend(parse_row(input_row))
    out_df = pd.DataFrame(output_rows)
    return out_df

def parse_row(row):
    season = row['Season']
    winning_team_id = row['WTeamID']
    losing_team_id = row['LTeamID']
    if winning_team_id < losing_team_id:
        small_id = winning_team_id
        big_id = losing_team_id
        outcome = True
    elif losing_team_id < winning_team_id:
        small_id = losing_team_id
        big_id = winning_team_id
        outcome = False
    records = [
        {
            "ID": f"{season}_{small_id}_{big_id}",
            'Season': season,
            'LowID': small_id,
            'HighID': big_id,
            'Win': outcome
        },
        {
            "ID": f"{season}_{big_id}_{small_id}",
            'Season': season,
            'LowID': big_id,
            'HighID': small_id,
            'Win': not outcome
        },
    ]
    return records


def clean_detailed_results(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop(["WLoc", "DayNum"], axis=1)


def transform_game_to_team(game_results: pd.DataFrame) -> pd.DataFrame:
    winners = rename_columns(game_results, "W")
    loosers = rename_columns(game_results, "L")
    team_results = pd.concat((winners, loosers))
    team_results.drop(["TeamIDOpp"], axis=1, inplace=True)
    return team_results


def transform_team_results(df: pd.DataFrame) -> pd.DataFrame:
    df = df.groupby(["Season", "TeamID"]).median()
    df["FGP"] =  df["FGM"] / df["FGA"]
    df["FGP3"] =  df["FGM3"] / df["FGA3"]
    df["FTP"] =  df["FTM"] / df["FTA"]
    df["FGPOpp"] =  df["FGMOpp"] / df["FGAOpp"]
    df["FGP3Opp"] =  df["FGM3Opp"] / df["FGA3Opp"]
    df["FTPOpp"] =  df["FTMOpp"] / df["FTAOpp"]
    return df.reset_index()
    
    
def rename_columns(df: pd.DataFrame, team_prefix: str) -> pd.DataFrame:
    df = df.copy()
    df.columns =  (rename_column(column_name, team_prefix) for column_name in df.columns)
    return df


def rename_column(column_name: str, team_prefix: str) -> pd.DataFrame:
    if team_prefix == "W":
        opponent_prefix = "L"
    elif team_prefix == "L":
        opponent_prefix = "W"
    else:
        raise ValueError
    if column_name.startswith(team_prefix):
        column_name = column_name.lstrip(team_prefix)
    elif column_name.startswith(opponent_prefix):
        column_name = f"{column_name.lstrip(opponent_prefix)}Opp"
    return column_name


def split_winner_and_looser_columns(df: pd.DataFrame) -> Tuple[List[str], List[str]]:
    winner_columns = [name for name in df.columns if not name.startswith("L")]
    looser_columns = [name for name in df.columns if not name.startswith("W")]
    return winner_columns, looser_columns


def merge_features(
    season_features: pd.DataFrame, 
    tournament_features: pd.DataFrame, 
    seed_features: pd.DataFrame, 
    ranking_features: pd.DataFrame
) -> pd.DataFrame:
    features = pd.merge(
        season_features,
        tournament_features,
        how="inner",
        on=["Season", "TeamID"],
        suffixes=("Reg", "Tou")
    )
    features = features.merge(
        seed_features,
        how="inner",
        on=["Season", "TeamID"]
    )
    features = features.merge(
        ranking_features,
        how="left",
        on=["Season", "TeamID"],
    )
    show_df(features)
    return features


def merge_outcomes_with_features(outcomes: pd.DataFrame, features: pd.DataFrame, how: str = "inner") -> pd.DataFrame:
    feature_names = [name for name in features.columns if name not in ["Season", "TeamID", "Gender"]]
    data = pd.merge(
        outcomes, 
        features, 
        how=how, 
        left_on=["Season", "HighID"], 
        right_on=["Season", "TeamID"]
    )
    data = pd.merge(
        data, 
        features, 
        how=how, 
        left_on=["Season", "LowID"], 
        right_on=["Season", "TeamID"],
        suffixes=("High", "Low")
    )
    for name in feature_names:
        data[f"{name}Diff"] = data[f"{name}High"] - data[f"{name}Low"]
        data.drop([f"{name}High", f"{name}Low"], axis=1, inplace=True)
    data.drop(
        ["Season", "HighID", "LowID","TeamIDHigh","TeamIDLow"], 
        axis=1, 
        inplace=True
    )
    data.set_index("ID", inplace=True)
    show_df(data)
    return data


def get_submission_outcomes() -> pd.DataFrame:
    sample_submission = pd.read_csv(f"/kaggle/input/{COMPETITION_NAME}/SampleSubmission2023.csv")
    df = sample_submission.copy()
    df.drop("Pred", axis=1, inplace=True)
    df[["Season", "LowID", "HighID"]] = df["ID"].str.split("_", expand=True)
    df[["Season", "LowID", "HighID"]] = df[["Season", "LowID", "HighID"]].astype(int)
    show_df(df)
    return df


def objective(trial: optuna.Trial, X_train, y_train, scoring = SCORING):
    params = {
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if params["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        params["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        params["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if params["booster"] == "dart":
        params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    model = train_model(params, X_train, y_train, scoring)
    score = np.mean(cross_val_score(model, X_train, y_train, scoring=scoring, cv=5))
    return score


def run_study(X_train, y_train):
    study = optuna.create_study(direction="maximize")
    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=100)
    return study


def train_model(params, X, y, scoring):
    model = XGBClassifier(
        objective="binary:logistic",
        tree_method="exact",
        verbosity=0,
        boosting_type="gbdt",
        **params
    )
    model = model.fit(X, y)
    return model

# Load Data

In [2]:
season_results = extract_data("RegularSeasonDetailedResults")
tournament_results = extract_data("NCAATourneyDetailedResults")
seeds = extract_data("NCAATourneySeeds")
rankings = extract_data("MasseyOrdinals_thru_Season2023_Day128")

# Transform Data

In [3]:
season_team_features = get_team_features(season_results)

tournament_results["Season"] += 1
tournament_team_features = get_team_features(tournament_results)

seed_features = get_seed_features(seeds)

ranking_features = get_ranking_features(rankings)

## Merge features

In [4]:
features = merge_features(season_team_features, tournament_team_features, seed_features, ranking_features)

## Build Datasets

In [5]:
from sklearn.model_selection import train_test_split

data = extract_data("NCAATourneyCompactResults")
data_train, data_valid = train_test_split(data, random_state=0, test_size=0.1)

outcomes = get_game_outcomes(data)
outcomes_train = get_game_outcomes(data_train)
outcomes_valid = get_game_outcomes(data_valid)
outcomes_submission = get_submission_outcomes()

In [6]:
features_train = merge_outcomes_with_features(outcomes_train, features)
features_valid = merge_outcomes_with_features(outcomes_valid, features)
features_full = merge_outcomes_with_features(outcomes, features)

In [7]:
y_train = features_train["Win"]
X_train = features_train.drop("Win", axis=1)
y_valid = features_valid["Win"]
X_valid = features_valid.drop("Win", axis=1)
y = features_full["Win"]
X = features_full.drop("Win", axis=1)
X_submission = merge_outcomes_with_features(outcomes_submission, features, how="left").fillna(0)

# Step 4: Train a model


In [8]:
study = run_study(X_train, y_train)

[32m[I 2023-03-15 21:51:19,104][0m A new study created in memory with name: no-name-3d67da4c-7ed9-4a85-b898-53dc2dec9322[0m
[32m[I 2023-03-15 21:51:23,387][0m Trial 0 finished with value: -0.18434372681507988 and parameters: {'booster': 'gblinear', 'lambda': 1.3541307200264502e-05, 'alpha': 1.107874791885793e-06, 'subsample': 0.3030260263800013, 'colsample_bytree': 0.5034369751333039}. Best is trial 0 with value: -0.18434372681507988.[0m
[32m[I 2023-03-15 21:51:25,614][0m Trial 1 finished with value: -0.2499778317549481 and parameters: {'booster': 'gbtree', 'lambda': 1.022943167233302e-08, 'alpha': 0.0008950981811377106, 'subsample': 0.28083315977553913, 'colsample_bytree': 0.9356577484033892, 'max_depth': 5, 'min_child_weight': 3, 'eta': 1.9793084278476034e-06, 'gamma': 2.3453182411241426e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: -0.18434372681507988.[0m
[32m[I 2023-03-15 21:51:26,133][0m Trial 2 finished with value: -0.1843581482900654 and parameters: {

In [9]:
params = study.best_params
model = train_model(params, X_train, y_train, SCORING)
preds = model.predict_proba(X_valid)[:,1]
print("Brier Score:", brier_score_loss(y_valid, preds))
print("ROC AUC:", roc_auc_score(y_valid, preds))
print("Parameters")
print(*(f"- {key}: {value}" for key, value in model.get_params(deep=True).items()), sep="\n")

Brier Score: 0.140594972609948
ROC AUC: 0.8809799382716048
Parameters
- objective: binary:logistic
- use_label_encoder: False
- base_score: 0.5
- booster: gblinear
- callbacks: None
- colsample_bylevel: None
- colsample_bynode: None
- colsample_bytree: 0.3763472673228348
- early_stopping_rounds: None
- enable_categorical: False
- eval_metric: None
- gamma: None
- gpu_id: -1
- grow_policy: None
- importance_type: None
- interaction_constraints: None
- learning_rate: 0.5
- max_bin: None
- max_cat_to_onehot: None
- max_delta_step: None
- max_depth: None
- max_leaves: None
- min_child_weight: None
- missing: nan
- monotone_constraints: None
- n_estimators: 100
- n_jobs: 0
- num_parallel_tree: None
- predictor: None
- random_state: 0
- reg_alpha: 0
- reg_lambda: 0
- sampling_method: None
- scale_pos_weight: 1
- subsample: 0.24105156550168522
- tree_method: exact
- validate_parameters: 1
- verbosity: 0
- boosting_type: gbdt
- lambda: 0.0004491704713556265
- alpha: 4.673747608662968e-05


# Submit to the competition

We"ll begin by using the trained model to generate predictions, which we"ll save to a CSV file.

In [10]:
# Use the model to generate predictions
model.fit(X, y)
predictions = model.predict_proba(X_submission)[:,1]

# Save the predictions to a CSV file
output = pd.DataFrame({"ID": X_submission.index,
                       "Pred": predictions})
output.to_csv("submission.csv", index=False)
print(output.shape)
output.describe()

(130683, 2)


Unnamed: 0,Pred
count,130683.0
mean,0.499975
std,0.028806
min,0.006626
25%,0.5
50%,0.5
75%,0.5
max,0.996123
