# 1. Download

In [None]:
import numpy as np
import pandas as pd
# Let DataFrame displays show up to 100 columns so wide feature tables are not truncated.
pd.set_option("display.max_columns", 100)

In [None]:
# Load the stage-1 sample submission. The code below treats each ID as three
# integers joined by underscores and exposes them as Season / TeamA / TeamB.
test = pd.read_csv("../input/ncaam-march-mania-2021/MSampleSubmissionStage1.csv")

for id_position, id_column in enumerate(["Season", "TeamA", "TeamB"]):
    # Bind the position as a default so each lambda reads its own field.
    test[id_column] = test["ID"].apply(
        lambda raw_id, pos=id_position: int(raw_id.split("_")[pos])
    )

print(test.shape)
test.head()

# 2. Tourney

In [None]:
# Load the compact tournament results file, then show its shape and first rows.
tourney = pd.read_csv("../input/ncaam-march-mania-2021/MNCAATourneyCompactResults.csv")
print(tourney.shape)
tourney.head()

In [None]:
# Show the last rows of the tournament results.
tourney.tail()

In [None]:
def create_all_df(df):
    """Return every game twice, once from each team's point of view.

    ``df`` must provide ``Season``, ``WTeamID``, ``WScore``, ``LTeamID`` and
    ``LScore``. In the first copy the winner becomes team A and the loser
    team B; in the second copy the roles are swapped. All other columns are
    carried over unchanged in both copies.

    Two columns are added:

    * ``ScoreGap`` -- ``ScoreA - ScoreB``
    * ``WinA``     -- 1 (int8) when ``ScoreGap`` is positive, otherwise 0

    The result is sorted by ``Season``, ``TeamA``, ``TeamB`` and has a fresh
    0..n-1 index. The input frame is not modified.
    """
    winner_as_a = {
        "WTeamID": "TeamA",
        "WScore": "ScoreA",
        "LTeamID": "TeamB",
        "LScore": "ScoreB",
    }
    loser_as_a = {
        "WTeamID": "TeamB",
        "WScore": "ScoreB",
        "LTeamID": "TeamA",
        "LScore": "ScoreA",
    }

    # rename() returns a new frame, and assign() adds the gap on that new frame.
    perspectives = [
        df.rename(columns=mapping).assign(
            ScoreGap=lambda games: games["ScoreA"] - games["ScoreB"]
        )
        for mapping in (winner_as_a, loser_as_a)
    ]

    # concat aligns by column name, so the output keeps the winner-view column order.
    stacked = pd.concat(perspectives, axis=0)
    stacked["WinA"] = (stacked["ScoreGap"] > 0).astype("int8")

    ordered = stacked.sort_values(by=["Season", "TeamA", "TeamB"])
    return ordered.reset_index(drop=True)

# Expand the tournament results into one row per (game, team perspective).
all_df = create_all_df(tourney)

# 3. RegularFeatures

In [None]:
# Load the compact regular-season results file, then show its shape and first rows.
regular = pd.read_csv("../input/ncaam-march-mania-2021/MRegularSeasonCompactResults.csv")
print(regular.shape)
regular.head()

In [None]:
# Show the last rows of the regular-season results.
regular.tail()

In [None]:
# Regular-season games won and lost per (Season, team). Counting the DayNum
# column is just a way of counting rows in each group.
win_df = (
    regular.groupby(["Season", "WTeamID"], as_index=False)["DayNum"]
    .count()
    .rename(columns={"DayNum": "WCount", "WTeamID": "TeamID"})
)
lose_df = (
    regular.groupby(["Season", "LTeamID"], as_index=False)["DayNum"]
    .count()
    .rename(columns={"DayNum": "LCount", "LTeamID": "TeamID"})
)

# One row per (Season, TeamID) that appears as a winner or as a loser.
features_df = (
    pd.concat([win_df[["Season", "TeamID"]], lose_df[["Season", "TeamID"]]], axis=0)
    .drop_duplicates()
    .sort_values(by=["Season", "TeamID"])
    .reset_index(drop=True)
)

# Left-join both counts; a team absent from one side gets 0 for that count.
features_df = (
    features_df
    .merge(win_df, on=["Season", "TeamID"], how="left")
    .merge(lose_df, on=["Season", "TeamID"], how="left")
    .fillna(0)
)

In [None]:
# Total and mean points scored per (Season, team), computed separately over
# the games the team won (WScore) and the games it lost (LScore).
win_df = (
    regular.groupby(["Season", "WTeamID"])["WScore"]
    .agg(["sum", "mean"])
    .reset_index()
    .rename(columns={"WTeamID": "TeamID", "sum": "WScoreSum", "mean": "WScoreMean"})
)
lose_df = (
    regular.groupby(["Season", "LTeamID"])["LScore"]
    .agg(["sum", "mean"])
    .reset_index()
    .rename(columns={"LTeamID": "TeamID", "sum": "LScoreSum", "mean": "LScoreMean"})
)

# Teams with no wins (or no losses) get 0 for the corresponding aggregates.
features_df = (
    features_df
    .merge(win_df, on=["Season", "TeamID"], how="left")
    .merge(lose_df, on=["Season", "TeamID"], how="left")
    .fillna(0)
)

# Season-wide points: total over all games, and that total per game played.
features_df["SeasonScoreSum"] = features_df["WScoreSum"].add(features_df["LScoreSum"])
features_df["SeasonScoreMean"] = features_df["SeasonScoreSum"].div(
    features_df["WCount"].add(features_df["LCount"])
)

In [None]:
# Previous-season features: relabel each row with the following season and
# suffix its columns with "_shift1", so that joining on (Season, TeamID)
# attaches a team's stats from the season before.
dummy = features_df.assign(Season=features_df["Season"] + 1).rename(
    columns={
        "WCount": "WCount_shift1",
        "LCount": "LCount_shift1",
        "WScoreSum": "WScoreSum_shift1",
        "WScoreMean": "WScoreMean_shift1",
        "LScoreSum": "LScoreSum_shift1",
        "LScoreMean": "LScoreMean_shift1",
        "SeasonScoreSum": "SeasonScoreSum_shift1",
        "SeasonScoreMean": "SeasonScoreMean_shift1",
    }
)

# Teams without a row for the previous season get 0 for every *_shift1 column.
features_df = features_df.merge(dummy, on=["Season", "TeamID"], how="left").fillna(0)

In [None]:
# Display the assembled per-(Season, TeamID) feature table.
features_df

# 4. Preprocessing

In [None]:
import re
def treat_seed(seed):
    """Extract the numeric part of a seed code as an integer.

    Every non-digit character is removed and the remaining digits are parsed
    with ``int``, e.g. ``"W01"`` -> ``1`` and ``"X16a"`` -> ``16``.

    A missing seed (``None`` or a float NaN, which is what a left join yields
    for a team with no seed row) is returned as NaN instead of raising, so
    the function can be applied to a merged column that contains gaps.

    Raises:
        TypeError: if ``seed`` is neither a string nor a missing value.
        ValueError: if ``seed`` is a string containing no digits.
    """
    # NaN is the only float that is not equal to itself.
    if seed is None or (isinstance(seed, float) and seed != seed):
        return float("nan")
    return int(re.sub("[^0-9]", "", seed))

def preprocess(df):
    """Attach season features and seeds for both teams of every row.

    ``df`` must contain ``Season``, ``TeamA`` and ``TeamB``. The module-level
    ``features_df`` is left-joined once per team on (Season, team id) and its
    feature columns are suffixed with ``A`` or ``B``. The seeds CSV is then
    read from disk (on every call) and joined the same way, producing
    ``SeedA`` and ``SeedB``, which are converted with ``treat_seed``.

    Returns the enriched frame; the frame passed in is not modified.
    """
    team_a_names = {
        "WCount": "WCountA",
        "LCount": "LCountA",
        "WScoreSum": "WScoreSumA",
        "WScoreMean": "WScoreMeanA",
        "LScoreSum": "LScoreSumA",
        "LScoreMean": "LScoreMeanA",
        "SeasonScoreSum": "SeasonScoreSumA",
        "SeasonScoreMean": "SeasonScoreMeanA",
        "WCount_shift1": "WCount_shift1A",
        "LCount_shift1": "LCount_shift1A",
        "WScoreSum_shift1": "WScoreSum_shift1A",
        "WScoreMean_shift1": "WScoreMean_shift1A",
        "LScoreSum_shift1": "LScoreSum_shift1A",
        "LScoreMean_shift1": "LScoreMean_shift1A",
        "SeasonScoreSum_shift1": "SeasonScoreSum_shift1A",
        "SeasonScoreMean_shift1": "SeasonScoreMean_shift1A",
    }
    team_b_names = {
        "WCount": "WCountB",
        "LCount": "LCountB",
        "WScoreSum": "WScoreSumB",
        "WScoreMean": "WScoreMeanB",
        "LScoreSum": "LScoreSumB",
        "LScoreMean": "LScoreMeanB",
        "SeasonScoreSum": "SeasonScoreSumB",
        "SeasonScoreMean": "SeasonScoreMeanB",
        "WCount_shift1": "WCount_shift1B",
        "LCount_shift1": "LCount_shift1B",
        "WScoreSum_shift1": "WScoreSum_shift1B",
        "WScoreMean_shift1": "WScoreMean_shift1B",
        "LScoreSum_shift1": "LScoreSum_shift1B",
        "LScoreMean_shift1": "LScoreMean_shift1B",
        "SeasonScoreSum_shift1": "SeasonScoreSum_shift1B",
        "SeasonScoreMean_shift1": "SeasonScoreMean_shift1B",
    }

    def join_for_team(frame, table, team_col, new_names):
        # Left-join `table` on (Season, team id), rename its columns for this
        # team, and drop the TeamID key that the join brings along.
        joined = pd.merge(
            frame,
            table,
            left_on=["Season", team_col],
            right_on=["Season", "TeamID"],
            how="left",
        )
        return joined.rename(columns=new_names).drop(columns="TeamID")

    df = join_for_team(df, features_df, "TeamA", team_a_names)
    df = join_for_team(df, features_df, "TeamB", team_b_names)

    seed = pd.read_csv("../input/ncaam-march-mania-2021/MNCAATourneySeeds.csv")
    df = join_for_team(df, seed, "TeamA", {"Seed": "SeedA"})
    df = join_for_team(df, seed, "TeamB", {"Seed": "SeedB"})

    # Convert the raw seed codes only after both joins are done.
    for seed_col in ("SeedA", "SeedB"):
        df[seed_col] = df[seed_col].apply(treat_seed)
    return df

In [None]:
# Enrich the tournament rows with both teams' season features and seeds, then display them.
all_df = preprocess(all_df)
all_df

In [None]:
# Apply the same enrichment to the submission rows so they carry the same feature columns.
test = preprocess(test)
test

# 5. Training

In [None]:
# List every column of the training frame before choosing the model inputs.
print(list(all_df.columns))

In [None]:
# Model inputs = every training column except the ones listed here.
# WinA is the label, and ScoreA / ScoreB / ScoreGap are what it is computed from.
# NOTE(review): DayNum, WLoc and NumOT presumably do not exist for the
# submission rows -- confirm against the columns of `test`.
# Season, TeamA and TeamB stay in the list and are therefore used as features.
excluded_cols = ['DayNum', 'ScoreA', 'ScoreB', 'WLoc', 'NumOT', 'ScoreGap', 'WinA']

use_cols = all_df.columns.tolist()
print(len(use_cols))
for excluded_col in excluded_cols:
    # list.remove raises ValueError if a column is unexpectedly absent.
    use_cols.remove(excluded_col)
print(len(use_cols))

In [None]:
import lightgbm as lgb
from sklearn.metrics import log_loss

# Binary classification of WinA, evaluated with log loss; -1 silences LightGBM's own logging.
params = {
    "objective" : "binary",
    "metric" : "binary_logloss",
    "verbosity" : -1
}

# Walk-forward validation: for each season 2015..2019, fit on all earlier
# seasons and validate on that season. One model is kept per validation
# season, in season order, so models[i] belongs to season 2015 + i.
models = []
for season in range(2015, 2020):
    train = all_df.loc[all_df["Season"] < season]
    valid = all_df.loc[all_df["Season"] == season]
    X_train = train[use_cols]
    X_valid = valid[use_cols]
    y_train = train["WinA"]
    y_valid = valid["WinA"]
    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_valid, y_valid)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train
    # were removed in LightGBM 4.0 and raise TypeError there. The callbacks
    # below are available from LightGBM 3.3 onwards.
    model = lgb.train(
        params = params,
        train_set = train_set,
        valid_sets = [train_set, valid_set],
        num_boost_round = 100,
        callbacks = [
            lgb.early_stopping(stopping_rounds = 10),
            lgb.log_evaluation(period = 20),
        ],
    )
    models.append(model)

    # Log loss on the held-out season. The same season also drove early
    # stopping, so this figure is somewhat optimistic.
    score = log_loss(y_true = y_valid, y_pred = model.predict(X_valid))
    print(f"season{season} logloss : {round(score, 3)}")
    print("=" * 100)

In [None]:
import matplotlib.pyplot as plt
# NOTE(review): the "seaborn-white" style name may not exist in recent
# Matplotlib releases -- confirm against the installed version.
plt.style.use("seaborn-white")
import shap
shap.initjs()
# Explain the last model in `models`.
explainer = shap.TreeExplainer(models[-1])
# X_valid is the variable left behind by the training loop, i.e. the
# validation features of its final iteration.
shap_values = explainer.shap_values(X = X_valid)
shap.summary_plot(shap_values, X_valid)

# 6. Submit

In [None]:
# Show the first rows of the enriched submission frame.
test.head()

In [None]:
# Show the last rows of the enriched submission frame.
test.tail()

In [None]:
# Predict each season's rows with the model that was validated on that season
# (models[i] <-> season 2015 + i), so no model scores a season it trained on.
#
# Predictions are written into a pre-sized array through a row mask instead
# of being concatenated season by season. Concatenation only lines up with
# `test` when its rows happen to be grouped by ascending season; the mask
# keeps every prediction on its own row regardless of row order.
submit_preds = np.full(len(test), np.nan)
for season_model, season in zip(models, range(2015, 2020)):
    season_rows = (test["Season"] == season).to_numpy()
    if not season_rows.any():
        # Nothing to score for this season.
        continue
    X_test = test.loc[season_rows, use_cols]
    preds = season_model.predict(X_test)
    submit_preds[season_rows] = preds

# Rows still NaN belong to a season that no model covers; fail here rather
# than write an incomplete submission.
if np.isnan(submit_preds).any():
    raise ValueError("some test rows have no prediction (season without a trained model)")

In [None]:
# Build the two-column submission (ID, Pred), write it without the index,
# and preview the first rows.
submit = pd.DataFrame({"ID": test["ID"], "Pred": submit_preds})
submit.to_csv("submission.csv", index = False)
submit.head()