In [None]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 100)

In [None]:
# Sample submission: each ID is "<Season>_<TeamA>_<TeamB>"; split it into three
# integer key columns so the matchups can be joined against team-level tables.
test = pd.read_csv(
    "../input/ncaam-march-mania-2021/MDataFiles_Stage2/MSampleSubmissionStage2.csv"
).assign(
    Season = lambda frame: frame["ID"].str.split("_").str[0].astype("int64"),
    TeamA = lambda frame: frame["ID"].str.split("_").str[1].astype("int64"),
    TeamB = lambda frame: frame["ID"].str.split("_").str[2].astype("int64"),
)

print(test.shape)
test

In [None]:
# Tournament results in compact form. Later cells reference its Season, DayNum,
# WTeamID, WScore, LTeamID, LScore, WLoc and NumOT columns.
tourney = pd.read_csv("../input/ncaam-march-mania-2021/MDataFiles_Stage2/MNCAATourneyCompactResults.csv")
print(tourney.shape)
tourney.head()

In [None]:
def create_all_df(df):
    """Expand winner/loser game rows into a symmetric TeamA-vs-TeamB table.

    Each input game yields two output rows: one where the winner is TeamA and
    one where the loser is TeamA. ``ScoreGap`` is ``ScoreA - ScoreB`` and
    ``WinA`` is 1 (int8) when that gap is positive, else 0.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Season``, ``WTeamID``, ``WScore``, ``LTeamID`` and
        ``LScore``; any other columns are carried through unchanged.

    Returns
    -------
    pd.DataFrame
        Twice as many rows as ``df``, sorted by ``Season``, ``TeamA``,
        ``TeamB`` with a fresh 0..n-1 index. ``df`` itself is not modified.
    """
    winner_is_a = {
        "WTeamID" : "TeamA",
        "WScore" : "ScoreA",
        "LTeamID" : "TeamB",
        "LScore" : "ScoreB",
    }
    loser_is_a = {
        "WTeamID" : "TeamB",
        "WScore" : "ScoreB",
        "LTeamID" : "TeamA",
        "LScore" : "ScoreA",
    }

    views = []
    for column_map in (winner_is_a, loser_is_a):
        view = df.rename(columns = column_map).copy()
        view["ScoreGap"] = view["ScoreA"] - view["ScoreB"]
        views.append(view)

    # The winner-perspective frame comes first, so it fixes the column order.
    stacked = pd.concat(views, axis = 0)
    stacked["WinA"] = (stacked["ScoreGap"] > 0).astype("int8")
    return stacked.sort_values(by = ["Season", "TeamA", "TeamB"]).reset_index(drop = True)

# Two rows per tournament game (one from each team's perspective), with the WinA label.
all_df = create_all_df(tourney)

In [None]:
# Regular-season results in compact form; the feature cell aggregates its DayNum,
# WTeamID, WScore, LTeamID and LScore columns per Season.
regular = pd.read_csv("../input/ncaam-march-mania-2021/MDataFiles_Stage2/MRegularSeasonCompactResults.csv")
print(regular.shape)
regular.head()

In [None]:
# --- Regular-season win / loss counts per (Season, TeamID) -------------------
win_df = (
    regular.groupby(["Season", "WTeamID"], as_index = False)["DayNum"]
    .count()
    .rename(columns = {"DayNum" : "WCount", "WTeamID" : "TeamID"})
)
lose_df = (
    regular.groupby(["Season", "LTeamID"], as_index = False)["DayNum"]
    .count()
    .rename(columns = {"DayNum" : "LCount", "LTeamID" : "TeamID"})
)

# One row for every team-season that shows up as a winner or a loser.
features_df = (
    pd.concat([win_df[["Season", "TeamID"]], lose_df[["Season", "TeamID"]]], axis = 0)
    .drop_duplicates()
    .sort_values(by = ["Season", "TeamID"])
    .reset_index(drop = True)
)

# A team missing from one of the count tables gets 0 for that count.
features_df = (
    features_df
    .merge(win_df, on = ["Season", "TeamID"], how = "left")
    .merge(lose_df, on = ["Season", "TeamID"], how = "left")
    .fillna(0)
)

# --- Points scored in games won / games lost ---------------------------------
win_df = (
    regular.groupby(["Season", "WTeamID"])["WScore"]
    .agg(["sum", "mean"])
    .reset_index(drop = False)
    .rename(columns = {"WTeamID" : "TeamID", "sum" : "WScoreSum", "mean" : "WScoreMean"})
)
lose_df = (
    regular.groupby(["Season", "LTeamID"])["LScore"]
    .agg(["sum", "mean"])
    .reset_index(drop = False)
    .rename(columns = {"LTeamID" : "TeamID", "sum" : "LScoreSum", "mean" : "LScoreMean"})
)

features_df = (
    features_df
    .merge(win_df, on = ["Season", "TeamID"], how = "left")
    .merge(lose_df, on = ["Season", "TeamID"], how = "left")
    .fillna(0)
)

# --- Whole-season scoring totals and per-game average ------------------------
features_df = features_df.assign(
    SeasonScoreSum = lambda frame: frame["WScoreSum"] + frame["LScoreSum"],
    SeasonScoreMean = lambda frame: frame["SeasonScoreSum"] / (frame["WCount"] + frame["LCount"]),
)

# --- Previous-season copies of every feature ---------------------------------
# Relabelling season S as S + 1 makes the merge below attach last year's numbers
# to this year's row under a "_shift1" suffix; rows with no previous season get 0.
dummy = features_df.assign(Season = features_df["Season"] + 1)
dummy = dummy.rename(columns = {
    name : name + "_shift1" for name in dummy.columns if name not in ("Season", "TeamID")
})

features_df = features_df.merge(dummy, on = ["Season", "TeamID"], how = "left").fillna(0)

In [None]:
# Inspect the per-(Season, TeamID) feature table, including the *_shift1 previous-season columns.
features_df

In [None]:
import re
def treat_seed(seed):
    """Return the numeric part of a tournament seed label.

    All characters other than 0-9 are stripped before conversion, so "W01"
    becomes 1 and "X16a" becomes 16.

    Parameters
    ----------
    seed : str or missing value
        Seed label. None/NaN (what a left merge leaves behind when a team has
        no seed row) is accepted.

    Returns
    -------
    int or float
        The seed number, or NaN when ``seed`` is missing.

    Raises
    ------
    ValueError
        If ``seed`` is a string that contains no digits.
    """
    # Without this guard re.sub raises TypeError on a NaN coming out of the merge.
    if pd.isna(seed):
        return float("nan")
    return int(re.sub("[^0-9]", "", seed))

def preprocess(df):
    """Attach team features and tournament seeds for both sides of each matchup.

    For TeamA and then TeamB, the module-level ``features_df`` is left-joined on
    (Season, team id) and its feature columns receive an "A" / "B" suffix. The
    seeds CSV is then joined the same way to give ``SeedA`` / ``SeedB``, which
    are converted with ``treat_seed``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Season``, ``TeamA`` and ``TeamB``.

    Returns
    -------
    pd.DataFrame
        A new frame with the suffixed feature and seed columns appended.
    """
    season_stats = [
        "WCount", "LCount",
        "WScoreSum", "WScoreMean",
        "LScoreSum", "LScoreMean",
        "SeasonScoreSum", "SeasonScoreMean",
    ]
    feature_names = season_stats + [name + "_shift1" for name in season_stats]
    sides = ("A", "B")

    # Team features: same join for each side, only the key column and suffix differ.
    for side in sides:
        suffixed = {name : name + side for name in feature_names}
        df = pd.merge(
            df, features_df, left_on = ["Season", "Team" + side], right_on = ["Season", "TeamID"], how = "left"
        ).rename(columns = suffixed).drop(columns = "TeamID")

    # Seeds: joined per side, then reduced to their numeric part.
    seed = pd.read_csv("../input/ncaam-march-mania-2021/MDataFiles_Stage2/MNCAATourneySeeds.csv")
    for side in sides:
        df = pd.merge(
            df, seed, left_on = ["Season", "Team" + side], right_on = ["Season", "TeamID"], how = "left"
        ).rename(columns = {"Seed" : "Seed" + side}).drop(columns = "TeamID")

    for side in sides:
        seed_col = "Seed" + side
        df[seed_col] = df[seed_col].apply(treat_seed)
    return df

In [None]:
# Attach team features and seeds to the tournament training frame.
# NOTE: not idempotent. The result overwrites all_df, so running this cell again feeds
# the already-merged frame back into preprocess(); rebuild all_df with create_all_df first.
all_df = preprocess(all_df)
all_df

In [None]:
# Attach the same team features and seeds to the submission matchups.
# NOTE: not idempotent. The result overwrites test, so reload test before re-running this cell.
test = preprocess(test)
test

In [None]:
# Full column list of the training frame, used to decide which columns to exclude below.
print(list(all_df.columns))

In [None]:
# Columns handed to PyCaret for training: everything in all_df except the per-game
# outcome columns removed below.
use_cols = list(all_df.columns)
print(len(use_cols))
# 'WinA' is deliberately NOT removed here: the setup() cell passes this frame as `data`
# and names 'WinA' as the target, so the label has to stay in it.
for col in ['DayNum', 'ScoreA', 'ScoreB', 'WLoc', 'NumOT', 'ScoreGap']:
    # list.remove raises ValueError if a column is absent, which surfaces schema drift early.
    use_cols.remove(col)
print(len(use_cols))

In [None]:
# Training frame for PyCaret: selected feature columns plus the WinA target.
X_train = all_df[use_cols]
# Disabled alternative: season-by-season hold-out split (train on earlier seasons,
# validate on `season`). Kept for reference; not executed.
#for season in range(2015, 2021):
#    train = all_df.loc[all_df["Season"] < season]
#    valid = all_df.loc[all_df["Season"] == season]
#    X_train = train[use_cols]
#    X_valid = valid[use_cols]
#    y_train = train["WinA"]
#    y_valid = valid["WinA"]

In [None]:
!pip install pycaret
import pycaret

In [None]:
%%time
# Configure the PyCaret classification experiment on X_train.
# X_train still carries the 'WinA' column because setup() is given the target by name.
# NOTE(review): X_train also contains Season, TeamA and TeamB (only DayNum, ScoreA, ScoreB,
# WLoc, NumOT and ScoreGap were dropped) - confirm these identifiers are meant to be inputs.
# The star import presumably supplies setup, models, add_metric, create_model, blend_models,
# tune_model, finalize_model and predict_model used in this and later cells.
from pycaret.classification import *
target = 'WinA'
clf1 = setup(data = X_train, #test_data = X_valid, 
             target = target, 
             # Fixed session id; experiment logging off; robust normalisation on.
             session_id=123, log_experiment=False, experiment_name='experiment1',normalize=True,normalize_method='robust', 
             fix_imbalance = False, 
             # Iterative imputation with lightgbm for both categorical and numeric columns, 5 iterations.
             imputation_type="iterative", categorical_iterative_imputer="lightgbm", numeric_iterative_imputer="lightgbm", iterative_imputation_iters = 5,
             # Generated features: trigonometric, polynomial (degree 3), interactions and ratios.
             trigonometry_features = True, polynomial_features = True, polynomial_degree = 3, polynomial_threshold = 0.1,
             feature_interaction = True, feature_ratio = True, interaction_threshold = 0.01,
             # Outlier removal, multicollinearity removal and PCA are all switched off; their
             # threshold/method arguments are presumably inert while the switch is False - TODO confirm.
             remove_outliers = False, outliers_threshold = 0.05, remove_multicollinearity = False, multicollinearity_threshold = 0.8, 
             remove_perfect_collinearity = False, pca = False, pca_method='linear', pca_components = .95, 
             # Boruta feature selection; train_size=.8; non-interactive (silent); GPU requested; all cores.
             feature_selection = True, feature_selection_threshold=.8, feature_selection_method='boruta', train_size = .8, silent=True, use_gpu=True, n_jobs=-1
             #,ignore_features= ['']
             )

In [None]:
# Show each available estimator's name alongside its 'GPU Enabled' flag.
print(models(internal=True)[['Name', 'GPU Enabled']])

In [None]:
# Register log loss as a custom PyCaret metric (lower is better).
# log_loss is only meaningful on predicted probabilities. add_metric's default
# target ('pred') would pass hard class labels instead, producing a clipped 0/1
# "log loss" that just mirrors the error rate - and tune_model() later optimises
# this metric. target='pred_proba' makes PyCaret score on predict_proba output.
from sklearn.metrics import log_loss
add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False, target='pred_proba')

In [None]:
#compare_models(n_select=1, turbo=True, cross_validation=False, sort='LogLoss')
#compare_models(n_select=1, fold=10, turbo=True, cross_validation=True, sort='LogLoss')

In [None]:
# Fit the three base estimators that are blended in the next cell, each with 10-fold CV.
# Model ids presumably map to Extra Trees ('et'), Linear Discriminant Analysis ('lda') and
# Naive Bayes ('nb') - confirm against the models() listing.
selected1 = create_model('et', fold=10,  cross_validation=True)
selected2 = create_model('lda', fold=10,  cross_validation=True)
# Disabled candidate:
#selected3 = create_model('lightgbm', fold=10,  cross_validation=False)
selected4 = create_model('nb', fold=10,  cross_validation=True)
# Disabled candidate:
#selected5 = create_model('catboost', fold=10,  cross_validation=False)

# Earlier line-up, kept for reference; not executed.
#selected1 = create_model('et', fold=10,  cross_validation=True)
#selected2 = create_model('lda', fold=10,  cross_validation=True)
#selected3 = create_model('rf', fold=10,  cross_validation=True)
#selected3 = create_model('catboost', fold=2,  cross_validation=True)


In [None]:
# Blend the three fitted models with method='soft' (presumably averaging their predicted
# probabilities - confirm against PyCaret docs), evaluated with 10-fold CV.
blender = blend_models(estimator_list = [selected1, selected2, selected4], method = 'soft', fold=10)

In [None]:
# Tune the blend over 10 search iterations with 10-fold CV, optimising the custom
# 'LogLoss' metric registered via add_metric above.
tuned = tune_model(blender, fold=10, optimize='LogLoss', n_iter=10)

In [None]:
# Produce the final model used for the submission predictions.
# NOTE(review): finalize_model presumably refits on all data including the hold-out split - confirm.
finalized = finalize_model(tuned)

In [None]:
#calibrated = calibrate_model(finalized)

In [None]:
# Rebuild the column list for inference: the same exclusions as for training, plus the
# 'WinA' target, which only exists on the training frame (it is created in create_all_df).
use_cols = list(all_df.columns)
print(len(use_cols))
for col in ['DayNum', 'ScoreA', 'ScoreB', 'WLoc', 'NumOT', 'ScoreGap', 'WinA']:
    use_cols.remove(col)
print(len(use_cols))

In [None]:
# Score the submission matchups with the finalized model.
X_test = test[use_cols]
preds = predict_model(finalized, data=X_test)
# 'Score' is read here as the probability of the predicted class ('Label'), so it is
# flipped to P(TeamA wins) whenever the predicted class is 0.
# NOTE(review): 'Label' may come back as a string rather than an int depending on the
# PyCaret release; a plain `== 0` would then never match and losing-side scores would
# be submitted unflipped. pd.to_numeric makes the comparison work for either dtype.
preds["ScoreWinA"] = np.where(pd.to_numeric(preds["Label"]) == 0, 1 - preds["Score"], preds["Score"])

In [None]:
# Inspect the predictions, including the derived ScoreWinA column.
preds

In [None]:
# Assemble the submission: the ID is rebuilt as "<Season>_<TeamA>_<TeamB>" from the key
# columns carried through prediction, and Pred is the TeamA win probability.
submit = pd.DataFrame({
    "ID" : preds["Season"].astype(str) + "_" + preds["TeamA"].astype(str) + "_" + preds["TeamB"].astype(str),
    "Pred" : preds["ScoreWinA"],
})
submit.to_csv("submission.csv", index = False)
submit