In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import re

# EDA

In [None]:
# List the files available in the Stage 2 data directory
os.listdir('../input/mens-march-mania-2022/MDataFiles_Stage2/')

In [None]:
# Load the sample submission file to inspect the expected submission format
test = pd.read_csv('../input/mens-march-mania-2022/MDataFiles_Stage2/MSampleSubmissionStage2.csv')
test

In [None]:
# Load the detailed regular-season game results and display them
SeasonDetailedResults = pd.read_csv('../input/mens-march-mania-2022/MDataFiles_Stage2/MRegularSeasonDetailedResults.csv')
SeasonDetailedResults

In [None]:
# Summary statistics for the regular-season results
SeasonDetailedResults.describe()

In [None]:
# Column dtypes and non-null counts for the regular-season results
SeasonDetailedResults.info()

In [None]:
# Load the detailed NCAA tournament game results and display them
TourneyDetailedResults = pd.read_csv('../input/mens-march-mania-2022/MDataFiles_Stage2/MNCAATourneyDetailedResults.csv')
TourneyDetailedResults

In [None]:
# Column names of the tournament results
TourneyDetailedResults.columns

In [None]:
# Summary statistics for the tournament results
TourneyDetailedResults.describe()

* There are other features that can be analyzed here to show the difference between tournament and regular season data

In [None]:
# Overlay the raw value counts of a few features for the regular season and the tournament.
# There are far fewer tournament games, so on this absolute scale the difference is not obvious.
features = ['WScore','LScore','WFGM','LFGM']
SeasonDetailedResults_count = SeasonDetailedResults.shape[0]
TourneyDetailedResults_count = TourneyDetailedResults.shape[0]
for feature in features:
    SeasonDetailedResults[feature].value_counts().sort_index().plot()
    TourneyDetailedResults[feature].value_counts().sort_index().plot()
    plt.xlabel(feature)
    # Legend entries follow plotting order: regular season first, tournament second.
    plt.legend(['SeasonDetailedResults','TourneyDetailedResults'])
    plt.ylabel('count')
    plt.show()

In [None]:
# Compare the regular-season and tournament distributions as ratios: each value count is
# divided by the number of games in its own dataset so the two curves share a scale.
features = ['WScore','LScore','WFGM','LFGM']
SeasonDetailedResults_count = SeasonDetailedResults.shape[0]
TourneyDetailedResults_count = TourneyDetailedResults.shape[0]
for feature in features:
    (SeasonDetailedResults[feature].value_counts().sort_index()/SeasonDetailedResults_count).plot()
    (TourneyDetailedResults[feature].value_counts().sort_index()/TourneyDetailedResults_count).plot()
    plt.xlabel(feature)
    # Legend entries follow plotting order: regular season first, tournament second.
    plt.legend(['SeasonDetailedResults','TourneyDetailedResults'])
    plt.ylabel('ratio')
    plt.show()

* Detailed regular-season statistics are of limited value for predicting the tournament (the charts above show how the two distributions differ). By analogy with the NBA, detailed regular-season numbers (rebounds, assists, turnovers, etc.) look very different from playoff numbers, yet teams with strong regular seasons still tend to go deep. So here we only extract some basic data from the regular season (team winning percentage, number of wins, and average point difference).

In [None]:
# Load each team's ordinal ranking under the different ranking systems; these rankings are
# important for showing the overall strength of a team
MMasseyOrdinals = pd.read_csv('../input/mens-march-mania-2022/MDataFiles_Stage2/MMasseyOrdinals_thruDay128.csv')
MMasseyOrdinals

In [None]:
# Load the team coaching records.
# Open question: does a coaching change indicate that the team is in poor form?
MTeamCoaches = pd.read_csv('../input/mens-march-mania-2022/MDataFiles_Stage2/MTeamCoaches.csv')
MTeamCoaches

# Features

* Here I convert the W/L (winner/loser) columns into A/B columns, where the team with the smaller TeamID is team A, and the A_win feature records whether team A won (1 means victory, 0 means loss). Finally, I convert the per-matchup data into per-game data for each team.

In [None]:
# Remove the game-context columns. drop(..., errors='ignore') keeps this cell re-runnable:
# a plain `del` raises KeyError the second time because the columns are already gone.
TourneyDetailedResults = TourneyDetailedResults.drop(columns=['WLoc', 'NumOT'], errors='ignore')
# 1 when the winner has the smaller TeamID, 0 otherwise (cast the boolean directly to int).
TourneyDetailedResults['TeamIDcoompare'] = (
    TourneyDetailedResults['WTeamID'] < TourneyDetailedResults['LTeamID']
).astype(int)
# Independent copies, so relabelling their columns later never touches the source frame.
df1 = TourneyDetailedResults[TourneyDetailedResults['TeamIDcoompare'] == 1].copy()
df2 = TourneyDetailedResults[TourneyDetailedResults['TeamIDcoompare'] == 0].copy()
TourneyDetailedResults

In [None]:
# "A" can be read as the team itself, "B" as its opponent.
# In df1 the winner becomes A and the loser B; in df2 it is the other way round.
# The first two columns are assumed to be Season and DayNum and are named explicitly.
winner_is_a = {'W': 'A', 'L': 'B'}
winner_is_b = {'W': 'B', 'L': 'A'}
df1.columns = ['Season', 'DayNum'] + [
    winner_is_a.get(name[0], name[0]) + name[1:] for name in df1.columns.values.tolist()[2:]
]
df2.columns = ['Season', 'DayNum'] + [
    winner_is_b.get(name[0], name[0]) + name[1:] for name in df2.columns.values.tolist()[2:]
]

# Stack both halves (columns are aligned by name) and derive the target from team A's margin.
df = pd.concat([df1, df2])
df['ScoreDiff'] = df['AScore'] - df['BScore']
df['A_win'] = (df['ScoreDiff'] > 0).astype(str).map({'True':1, 'False':0})
del df['TeamIDcoompare']
df

In [None]:
# Team B's margin is the negation of team A's margin.
df['ScoreDiff_B'] = -df['ScoreDiff']

# Box-score statistics shared by both sides, in the order they should appear.
box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO',
             'Stl', 'Blk', 'PF']
A_cols = ['Season', 'ATeamID'] + ['A' + stat for stat in box_stats] + ['ScoreDiff']
B_cols = ['Season', 'BTeamID'] + ['B' + stat for stat in box_stats] + ['ScoreDiff_B']

# Side-neutral names: strip the leading 'A' (Season and ScoreDiff are left untouched).
new_cols = [name[1:] if name[0] == 'A' else name for name in A_cols]

A_df = df[A_cols]
B_df = df[B_cols]
# Both views get the same neutral column names so they can be stacked later.
A_df.columns = new_cols
B_df.columns = new_cols

In [None]:
# Stack the team-A rows on top of the team-B rows: one row per team per game.
# The original index labels are kept, so every label appears twice in the result.
tournament_season_df = pd.concat([A_df, B_df])
tournament_season_df

* Construct the training and test datasets. (Note that tournament data cannot be used as features: it does not include 2022, so those features could not be built for the test dataset.) I still construct some tournament features here, but I do not use them.

In [None]:
# Training rows: the matchup keys plus the target, copied so later edits never touch df.
train = df[['Season', 'ATeamID', 'BTeamID', 'A_win']].copy()
train

In [None]:
# Each submission ID is split on '_' into three parts, used here as Season, ATeamID, BTeamID.
# The placeholder Pred column is not carried over.
sample_ids = pd.read_csv(
    '../input/mens-march-mania-2022/MDataFiles_Stage2/MSampleSubmissionStage2.csv'
)['ID'].str.split('_', expand=True)
test = pd.DataFrame({
    'Season': sample_ids[0].astype(int),
    'ATeamID': sample_ids[1].astype(int),
    'BTeamID': sample_ids[2].astype(int),
})
test

* Add features

In [None]:
# Columns available for feature construction
tournament_season_df.columns

In [None]:
# Derive per-game ratio features in place, then remove the raw counts they were built from.
# NOTE(review): a zero denominator (e.g. no three-point attempts or no assists) produces
# inf/NaN in the ratio columns - confirm that downstream aggregation tolerates that.
game_stats = tournament_season_df  # alias only; every assignment below modifies the same frame
game_stats['shot_percentage'] = game_stats['FGM'] / game_stats['FGA']
game_stats['3_shot_percentage'] = game_stats['FGM3'] / game_stats['FGA3']
game_stats['all_rebounds'] = game_stats['OR'] + game_stats['DR']
game_stats['off_rebounds_percentage'] = game_stats['OR'] / game_stats['all_rebounds']
game_stats['def_rebounds_percentage'] = game_stats['DR'] / game_stats['all_rebounds']
# NOTE(review): despite its name this is turnovers divided by assists - confirm the direction.
game_stats['ast_to_percentage'] = game_stats['TO'] / game_stats['Ast']
game_stats['def_data'] = game_stats['Stl'] + game_stats['Blk']
# 'TO' is kept; every other raw box-score column is removed.
game_stats.drop(
    columns=['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'PF', 'OR', 'DR', 'Ast', 'Stl', 'Blk'],
    inplace=True,
)
tournament_season_df

In [None]:
def judge_win(x):
    """Return 1 when the score difference ``x`` is strictly positive, otherwise 0."""
    return 1 if x > 0 else 0
# Flag each team-game row as a win (1) or not (0) from the sign of its score difference.
tournament_season_df['win'] = tournament_season_df['ScoreDiff'].apply(judge_win)
tournament_season_df

* Tournament Basic Statistics

In [None]:
team_keys = ['Season', 'TeamID']

# Mean winning margin per team and season, broadcast to every row of that team.
# The margin is masked to NaN on non-winning rows *before* grouping, so the result stays
# aligned row-for-row with the frame. Computing the mean on a win-only subset and assigning
# it back relies on index labels, and this frame carries each label twice (it is a concat
# of the A and B views), so a losing row would pick up the winner's value instead of its own.
tournament_season_df['tournament_win_scorediff_mean'] = (
    tournament_season_df
    .assign(_win_margin=tournament_season_df['ScoreDiff'].where(tournament_season_df['win'] == 1))
    .groupby(team_keys)['_win_margin']
    .transform('mean')  # NaN margins are skipped; a team without wins gets NaN
)
# Number of tournament wins per team and season.
tournament_season_df['tournament_win_sum'] = (
    tournament_season_df.groupby(team_keys)['win'].transform('sum')
)

# One row per team and season; drop_duplicates returns a new frame, so nothing is
# modified in place on a slice of tournament_season_df.
tournament_start_fea_df = (
    tournament_season_df[team_keys + ['tournament_win_scorediff_mean', 'tournament_win_sum']]
    .drop_duplicates(team_keys)
)
tournament_start_fea_df

* Regular Basic Statistics

In [None]:
def generate_basicdata(df):
    """Reshape winner/loser game rows into one row per team per game.

    Parameters
    ----------
    df : pandas.DataFrame
        Detailed game results with ``Season``, ``WTeamID``, ``LTeamID``, ``WScore``,
        ``LScore``, ``WLoc``, ``NumOT`` and the ``W*``/``L*`` box-score columns.
        The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Season``, ``TeamID``, the side-neutral box-score columns,
        ``ScoreDiff`` (margin from that team's point of view) and ``win`` (1/0).
        The rows of team A (the smaller TeamID of each game) come first, followed by
        the rows of team B; the input index labels are kept, so each appears twice.

    Raises
    ------
    KeyError
        If ``WLoc``, ``NumOT`` or any required game column is missing.
    """
    stat_names = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR',
                  'Ast', 'TO', 'Stl', 'Blk', 'PF']

    # drop() returns a new frame, so the caller's DataFrame keeps all of its columns.
    games = df.drop(columns=['WLoc', 'NumOT'])
    winner_is_a = games['WTeamID'] < games['LTeamID']

    def _relabel(frame, winner_tag, loser_tag):
        """Replace the leading W/L of every column name with the given A/B tags."""
        tags = {'W': winner_tag, 'L': loser_tag}
        return frame.rename(columns={
            name: tags[name[0]] + name[1:]
            for name in frame.columns if name[0] in tags
        })

    # A is the team itself, B is its opponent.
    wide = pd.concat([
        _relabel(games[winner_is_a], 'A', 'B'),
        _relabel(games[~winner_is_a], 'B', 'A'),
    ])
    wide['ScoreDiff'] = wide['AScore'] - wide['BScore']

    def _team_rows(side, sign):
        """Select one side's columns under neutral names, with the margin seen from that side."""
        rows = wide[['Season', side + 'TeamID'] + [side + stat for stat in stat_names]].copy()
        rows.columns = ['Season', 'TeamID'] + stat_names
        rows['ScoreDiff'] = sign * wide['ScoreDiff']
        return rows

    long_df = pd.concat([_team_rows('A', 1), _team_rows('B', -1)])
    long_df['win'] = (long_df['ScoreDiff'] > 0).astype(int)

    return long_df

# Reshape the regular-season results to one row per team per game.
# NOTE: this rebinds SeasonDetailedResults to the reshaped frame, which no longer has the
# W*/L* input columns, so re-running this cell requires reloading the raw CSV first.
SeasonDetailedResults = generate_basicdata(SeasonDetailedResults)
SeasonDetailedResults

In [None]:
team_keys = ['Season', 'TeamID']


def _masked_team_mean(frame, mask):
    """Per-team, per-season mean of ScoreDiff over the rows where ``mask`` holds, for every row.

    The margin is masked to NaN *before* grouping so the result stays aligned row-for-row
    with ``frame``. Computing the mean on a filtered subset and assigning it back relies on
    index labels, and this frame carries each label twice (A and B views of the same game),
    so rows outside the subset would receive the opponent's value instead of their own.
    """
    return (
        frame.assign(_masked_diff=frame['ScoreDiff'].where(mask))
        .groupby(team_keys)['_masked_diff']
        .transform('mean')  # NaN entries are skipped; a team with no matching games gets NaN
    )


SeasonDetailedResults['season_win_scorediff_mean'] = _masked_team_mean(
    SeasonDetailedResults, SeasonDetailedResults['win'] == 1)
SeasonDetailedResults['season_lose_scorediff_mean'] = _masked_team_mean(
    SeasonDetailedResults, SeasonDetailedResults['win'] == 0)

# Wins, games played and win ratio per team and season.
SeasonDetailedResults['season_win_sum'] = SeasonDetailedResults.groupby(team_keys)['win'].transform('sum')
SeasonDetailedResults['season_game_sum'] = SeasonDetailedResults.groupby(team_keys)['TeamID'].transform('count')
SeasonDetailedResults['season_win_ratio'] = SeasonDetailedResults['season_win_sum'] / SeasonDetailedResults['season_game_sum']

# One row per team and season; drop_duplicates returns a new frame, so nothing is
# modified in place on a slice of SeasonDetailedResults.
Season_start_fea_df = SeasonDetailedResults[
    team_keys + ['season_win_sum', 'season_win_ratio', 'season_game_sum',
                 'season_win_scorediff_mean', 'season_lose_scorediff_mean']
].drop_duplicates(team_keys)
Season_start_fea_df

* Agg detail data

In [None]:
%%time
def aggregate_transactions(df_, title):
    """Aggregate per-game features to the mean and std per (Season, TeamID).

    Parameters
    ----------
    df_ : pandas.DataFrame
        One row per team per game, containing ``Season``, ``TeamID`` and the
        feature columns listed below. It is not modified.
    title : str
        Prefix put in front of every aggregated column name.

    Returns
    -------
    pandas.DataFrame
        ``Season``, ``TeamID`` and ``<title><feature>_<mean|std>`` columns;
        missing aggregates (e.g. the std of a single game) are filled with 0.
    """
    stat_columns = [
        'TO', 'ScoreDiff', 'shot_percentage', '3_shot_percentage', 'all_rebounds',
        'off_rebounds_percentage', 'def_rebounds_percentage', 'ast_to_percentage',
        'def_data',
    ]
    spec = {name: ['mean', 'std'] for name in stat_columns}

    summary = df_.groupby(['Season', 'TeamID']).agg(spec).fillna(0)
    # Flatten the (feature, statistic) column pairs into single prefixed names.
    summary.columns = [title + '_'.join(pair).strip() for pair in summary.columns.values]

    return summary.reset_index()

# Mean/std of every per-game tournament feature per team and season, prefixed with 'tour_'
tournament_base_stat  = aggregate_transactions(tournament_season_df, title='tour_')
tournament_base_stat

In [None]:
# Mean and std of a team's ordinal rank over all of its ranking rows in a season.
rank_mean = MMasseyOrdinals.groupby(['Season', 'TeamID'])['OrdinalRank'].transform('mean')
rank_std = MMasseyOrdinals.groupby(['Season', 'TeamID'])['OrdinalRank'].transform('std')
MMasseyOrdinals['OrdinalRank_season_mean'] = rank_mean
MMasseyOrdinals['OrdinalRank_season_std'] = rank_std
# Remove the per-ranking detail in place, then collapse to the distinct remaining rows.
MMasseyOrdinals.drop(columns=['RankingDayNum', 'SystemName', 'OrdinalRank'], inplace=True)
MMasseyOrdinals = MMasseyOrdinals.drop_duplicates()
MMasseyOrdinals

* Seed team rankings are determined based on the team's ranking for the season

In [None]:
# Load the tournament seeds and keep only the digits of each seed string as an integer.
df_seeds = pd.read_csv('../input/mens-march-mania-2022/MDataFiles_Stage2/MNCAATourneySeeds.csv')
df_seeds['Seed'] = df_seeds['Seed'].str.replace("[^0-9]", "", regex=True).astype(int)
# Keep seasons from 2003 onwards only.
df_seeds = df_seeds.loc[df_seeds['Season'] >= 2003]
df_seeds

In [None]:
# Keep seasons from 2003 onwards. .copy() yields an independent frame, so the column
# assignment below does not write into a slice of the original table.
MTeamCoaches = MTeamCoaches[MTeamCoaches['Season'] >= 2003].copy()
# Number of coach records a team has within a season.
MTeamCoaches['Coach_season_count'] = MTeamCoaches.groupby(['Season', 'TeamID'])['CoachName'].transform('count')
# Drop the per-coach detail and collapse to one row per team and season.
# (The tenure length LastDayNum - FirstDayNum used to be computed here and then deleted
# without ever being used, so it is no longer calculated.)
MTeamCoaches = MTeamCoaches.drop(columns=['FirstDayNum', 'LastDayNum', 'CoachName']).drop_duplicates()
MTeamCoaches

In [None]:
%%time
print(train.shape)
print(test.shape)
## Merge the per-team feature tables into the training and test sets
print('#_____基础统计特征')


def merge_team_features(frame, team_features):
    """Left-join ``team_features`` onto ``frame`` for team A and then for team B.

    ``team_features`` must have ``Season`` and ``TeamID`` columns plus feature columns.
    Team A's feature columns end in ``_littleID`` and team B's in ``_bigID``. The result
    also contains one ``TeamID_littleID`` and one ``TeamID_bigID`` join-key column.
    """
    # Join-key columns left over from a previous table would collide with the suffixed
    # names created below; duplicate column names from suffixes are rejected by newer
    # pandas releases, so the stale keys are removed first.
    frame = frame.drop(columns=['TeamID_littleID', 'TeamID_bigID'], errors='ignore')
    # The first pass has no overlapping columns, so the suffixes only take effect on the
    # second pass, where team A's columns (left) meet team B's columns (right).
    for id_col in ('ATeamID', 'BTeamID'):
        frame = pd.merge(frame, team_features,
                         left_on=['Season', id_col], right_on=['Season', 'TeamID'],
                         how='left', suffixes=('_littleID', '_bigID'))
    return frame


# The tournament-derived tables (tournament_base_stat, tournament_start_fea_df) are
# deliberately not merged: their merges were disabled in the original notebook.
for team_features in (Season_start_fea_df, df_seeds, MMasseyOrdinals, MTeamCoaches):
    train = merge_team_features(train, team_features)
    test = merge_team_features(test, team_features)
print(train.shape)
print(test.shape)

In [None]:
# Remove the join-key columns that the merges left behind in both datasets.
merge_key_cols = ['TeamID_bigID', 'TeamID_littleID']
train.drop(columns=merge_key_cols, inplace=True)
test.drop(columns=merge_key_cols, inplace=True)

# Modeling

In [None]:
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
import lightgbm as lgb
from hyperopt import hp, fmin, tpe
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import time
import optuna
from lightgbm import LGBMClassifier
from sklearn.preprocessing import PowerTransformer

* hyperopt

In [None]:
# def params_append(params):
#     params['objective'] = 'binary'
#     params['metric'] = 'binary_logloss'
#     params['bagging_seed'] = 2022
    
#     return params
    
# def lgb_param_hyperopt(train):
#     label = 'A_win'
#     features = [f for f in train.columns if f not in ['A_win']]
#     train_data = lgb.Dataset(train[features], train[label], silent=True)
#     def hyperopt_objective(params):
#         params = params_append(params)
#         print(params)
#         res = lgb.cv(params, train_data, num_boost_round=1000, nfold=5, stratified=False, shuffle=True, 
#                      metrics='binary_logloss', early_stopping_rounds=20, verbose_eval=False, seed=2022
#                      )
#         return min(res['binary_logloss-mean'])
#     params_space = {
#         'learning_rate': hp.uniform('learning_rate', 1e-2, 5e-1),
#         'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),
#         'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
#         'num_leaves': hp.choice('num_leaves', list(range(10, 300, 10))),
#         'reg_alpha': hp.randint('reg_alpha', 0, 10),
#         'reg_lambda': hp.uniform('reg_lambda', 0, 10),
#         'bagging_freq': hp.randint('bagging_freq', 1, 10),
#         'min_child_samples': hp.choice('min_child_samples', list(range(1, 30, 5))),
#         'min_data_in_leaf': 20
#     }
#     params_best = fmin(
#         hyperopt_objective,
#         space=params_space,
#         algo=tpe.suggest,
#         max_evals=30)
    
#     return params_best

# params_best = lgb_param_hyperopt(train)

* optuna

In [None]:
# def objective(trial, model_type='xgb'):
    
#     X_train, X_test, y_train, y_test=train_test_split(
#         train[[col for col in train.columns if col not in ['A_win']]], train['A_win'], train_size=0.25)# 数据集划分
#     if model_type == 'lgb':
#         param = {
#                 'n_estimators': 20000,
#                 'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
#                 'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
#                 'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
#                 'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
#                 'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
#                 'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17, 20, 50]),
#                 'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
#                 'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
#                 'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100)     
#         }

#         lgbm = LGBMClassifier(**param)
#         lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)
#         pred_lgb=lgbm.predict(X_test)
#         logloss = log_loss(y_test, pred_lgb)
    
#     if model_type == 'xgb':
#         xgb_params = dict(
#             max_depth=trial.suggest_int("max_depth", 1, 100),
#             learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#             n_estimators=trial.suggest_int("n_estimators", 100, 200),
#             min_child_weight=trial.suggest_int("min_child_weight", 1, 20),
#             colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
#             subsample=trial.suggest_float("subsample", 0.2, 1.0),
#             reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
#             reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True)
#         )
#         xgbm = xgb.XGBClassifier(**xgb_params, objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
#         xgbm.fit(X_train, y_train)
#         pred_xgb = xgbm.predict(X_test)
#         logloss = log_loss(y_test, pred_xgb)
    
#     return logloss

# study=optuna.create_study(direction='minimize')
# n_trials=50 # try50次
# study.optimize(objective, n_trials=n_trials)
# params = study.best_params

In [None]:
def train_model(X, X_test, y, params, folds, model_type='lgb', eval_type='regression'):
    """Train one booster per cross-validation fold and average the test predictions.

    Parameters
    ----------
    X, y : numpy arrays
        Training features and targets; both are indexed with the fold index arrays.
    X_test : numpy array
        Features to predict; its predictions are averaged over all folds.
    params : dict
        Parameters passed straight to ``lgb.train`` / ``xgb.train``.
    folds : splitter
        Object providing ``split(X, y)`` and ``n_splits`` (e.g. ``KFold``).
    model_type : {'lgb', 'xgb'}
        Which library to train with. Only these two are implemented.
    eval_type : {'regression', 'binary'}
        Per-fold score: RMSE for 'regression', log loss for 'binary'.

    Returns
    -------
    (oof, predictions, scores)
        Out-of-fold predictions for ``X``, fold-averaged predictions for ``X_test``
        and the list of per-fold scores.

    Raises
    ------
    ValueError
        If ``model_type`` or ``eval_type`` is not supported. Without this check such a
        call would train nothing and silently return all-zero predictions.
    """
    if model_type not in ('lgb', 'xgb'):
        raise ValueError("model_type must be 'lgb' or 'xgb', got {!r}".format(model_type))
    if eval_type not in ('regression', 'binary'):
        raise ValueError("eval_type must be 'regression' or 'binary', got {!r}".format(eval_type))

    # np.zeros(n) creates a 1-D array of length n.
    oof = np.zeros(X.shape[0])
    predictions = np.zeros(X_test.shape[0])
    scores = []
    # NOTE(review): the early-stopping / verbosity keyword arguments and ntree_limit below
    # are the ones the original notebook used; confirm they are still accepted by the
    # installed LightGBM / XGBoost versions.
    for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())

        if model_type == 'lgb':
            trn_data = lgb.Dataset(X[trn_idx], y[trn_idx])
            val_data = lgb.Dataset(X[val_idx], y[val_idx])
            clf = lgb.train(params, trn_data, num_boost_round=20000,
                            valid_sets=[trn_data, val_data],
                            verbose_eval=100, early_stopping_rounds=300)
            # Predict with the best iteration found when training stopped early.
            oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
        else:  # model_type == 'xgb'
            trn_data = xgb.DMatrix(X[trn_idx], y[trn_idx])
            val_data = xgb.DMatrix(X[val_idx], y[val_idx])
            watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
            clf = xgb.train(dtrain=trn_data, num_boost_round=20000,
                            evals=watchlist, early_stopping_rounds=200,
                            verbose_eval=100, params=params)
            oof[val_idx] = clf.predict(xgb.DMatrix(X[val_idx]), ntree_limit=clf.best_ntree_limit)
            predictions += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits

        if eval_type == 'regression':
            scores.append(mean_squared_error(y[val_idx], oof[val_idx])**0.5)
        else:  # eval_type == 'binary'
            scores.append(log_loss(y[val_idx], oof[val_idx]))

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    return oof, predictions, scores

In [None]:
# Every column of train except the target is used as a model feature.
fea_cols = train.columns[train.columns != 'A_win'].tolist()

In [None]:
# LightGBM parameters for the binary win/loss classifier.
# 'min_child_samples' is an alias of 'min_data_in_leaf'; the original dict set both
# (30 and 20). Only the primary name is kept so the minimum leaf size is unambiguous.
lgb_params = {'num_leaves': 60,
              'min_data_in_leaf': 30,
              'objective': 'binary',
              'max_depth': -1,
              'learning_rate': 0.01,
              "boosting": "gbdt",
              "feature_fraction": 0.95,
              "bagging_freq": 1,
              "bagging_fraction": 0.95,
              "bagging_seed": 11,
              "metric": 'binary_logloss',
              "lambda_l1": 0.1,
              "verbosity": -1}
# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
folds = KFold(n_splits=5, shuffle=True, random_state=2022)
X_train = train[fea_cols].values
X_test = test[fea_cols].values
y_train = train['A_win'].values
print('='*10,'分类模型','='*10)
oof_lgb, predictions_lgb, scores_lgb = train_model(X_train, X_test, y_train,
                                                   params=lgb_params, folds=folds, model_type='lgb',
                                                   eval_type='binary')

In [None]:
# xgb_params = {'max_depth': 73,
#              'learning_rate': 0.020717623786209245,
#              'n_estimators': 155,
#              'min_child_weight': 1,
#              'colsample_bytree': 0.5643575367200632,
#              'subsample': 0.95,
#              'reg_alpha': 0.5,
#              'reg_lambda': 0.5,
#               'objective':'binary:logistic'}
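# # eta: learning rate; subsample: fraction of rows randomly sampled each time a tree is grown; colsample_bytree: fraction of
# # columns randomly sampled each time a tree is grown; min_child_weight: minimum sum of sample weights in a leaf node. If a newly split
# # node's sum of sample weights is below min_child_weight, splitting stops; nthread: number of threads; max_bin: maximum number of histogram bins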
# folds = KFold(n_splits=10, shuffle=True, random_state=2022)
# oof_xgb , predictions_xgb , scores_xgb  = train_model(X_train , X_test, y_train , params=xgb_params, folds=folds, model_type='xgb', eval_type='binary')

In [None]:
# def train_model2(X, X_test_, y, params, model_type='lgb', eval_type='regression'):
#     oof = np.zeros(X.shape[0])
#     # np.zeros生成的是1行n列的数组
#     predictions = np.zeros(X_test_.shape[0])
#     seasons = X['Season'].unique().astype(int).tolist()
#     scores = []
#     for season in seasons[12:]:
#         print(f'\nValidating on season {season}', 'started at', time.ctime())
        
#         X_train = X[X['Season'] < season].copy()
#         X_val = X[X['Season'] == season].copy()
#         y_train = y[y['Season'] < season]['A_win'].copy()
#         y_val = y[y['Season'] == season]['A_win'].copy()
#         X_test = X_test_.copy()
#         X_train, X_val, X_test = standard_scale(X_train, X_val, X_test)
        
#         if model_type == 'lgb':
#             df_train = lgb.Dataset(X_train, y_train)
#             df_val = lgb.Dataset(X_val, y_val)
#             clf = lgb.train(params, df_train, num_boost_round=20000, 
#                             valid_sets=[df_train, df_val], 
#                             verbose_eval=100, early_stopping_rounds=300)
#             # num_iteration=clf.best_iteration参数，如果模型提前停止，给予这个模型最好的参数
#             predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / len(seasons[12:])
#         if model_type == 'xgb':
#             trn_data = xgb.DMatrix(X_train, y_train)
#             val_data = xgb.DMatrix(X_val, y_val)
#             watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
#             clf = xgb.train(dtrain=trn_data, num_boost_round=20000, 
#                             evals=watchlist, early_stopping_rounds=200, 
#                             verbose_eval=100, params=params)
#             predictions += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / len(seasons[12:])
    
#     return predictions

In [None]:
# lgb_params ={'num_leaves': 60,
#              'min_data_in_leaf': 30, 
#              'objective':'binary',
#              'max_depth': -1,
#              'learning_rate': 0.01,
#              "min_child_samples": 20,
#              "boosting": "gbdt",
#              "feature_fraction": 0.95,
#              "bagging_freq": 1,
#              "bagging_fraction": 0.95,
#              "bagging_seed": 11,
#              "metric": 'binary_logloss',
#              "lambda_l1": 0.1,
#              "verbosity": -1}
# X_train = train[fea_cols]
# X_test = test[fea_cols]
# y_train = train[['Season', 'A_win']]
# print('='*10,'分类模型','='*10)
# predictions_lgb = train_model2(X_train , X_test, y_train, 
#                                                       params=lgb_params, model_type='lgb', 
#                                                       eval_type='binary')

In [None]:
# Fill the sample submission's Pred column with the fold-averaged LightGBM predictions and save it.
sub_df = pd.read_csv(
    '../input/mens-march-mania-2022/MDataFiles_Stage2/MSampleSubmissionStage2.csv'
).assign(Pred=predictions_lgb)
sub_df.to_csv('submission.csv', index=False)
sub_df