パラメータチューニング時に参考にしたサイト：<br>
https://qiita.com/R1ck29/items/50ba7fa5afa49e334a8f<br>
https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html<br>
https://hackerdemy.com/2020/09/15/lightgbm-classification/<br>
https://zenn.dev/mosamosa/articles/07d0076c9292136a3639<br>

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna.integration.lightgbm as lgbo
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Directory holding the competition CSV files read by the cells below.
DIR_PATH = "../kaggle/MDataFiles_Stage2"

In [None]:
# Load the training table and the stage-1 test table.
train = pd.read_csv(f"{DIR_PATH}/final_regular_result.csv")
test = pd.read_csv(f"{DIR_PATH}/stage1_test.csv")

# Target column and the season column (later passed as CV groups).
y = train["result"]
s = train["Season"]

# Drop identifier/target columns, then one-hot encode what remains.
X = pd.get_dummies(
    train.drop(columns=["Season", "TeamID1", "TeamID2", "result"])
)
X_test = pd.get_dummies(
    test.drop(columns=["ID", "Season", "TeamID1", "TeamID2"])
)

In [None]:
# Align the one-hot encoded train and test frames on their shared columns.
x_column = X.columns
test_column = X_test.columns

# Columns that exist in train but not in test (variable names kept as-is).
no_train_feat = [col for col in x_column if col not in test_column]
# Columns that exist in test but not in train.
no_test_feat = [col for col in test_column if col not in x_column]

X = X.drop(columns=no_train_feat)
X_test = X_test.drop(columns=no_test_feat)

# Having the same column *set* is not enough: force the test frame into the
# training column order so feature positions line up at prediction time.
X_test = X_test[X.columns]

# Refresh the column snapshots so they describe the aligned frames.
x_column = X.columns
test_column = X_test.columns

使用モデル：LightGBM<br>
CV： GroupKFold(n_splits=4)、Seasonでグループ化<br>
アンサンブル： foldごとにtestデータの確率予測を行い、fold数で単純平均を行う<br>

In [None]:
# Parameter tuning reference: https://qiita.com/R1ck29/items/50ba7fa5afa49e334a8f
# ------------------- Best parameter sets found so far ------------------
# Candidate 1: cv mean 0.4949, features: all (2015-2021), kfold=6
best_lgb_params1 = dict(
    objective='binary',
    metric='binary_logloss',
    boosting='gbdt',
    num_leaves=128,
    feature_fraction=0.6,
    bagging_fraction=0.6,
    bagging_freq=5,
    learning_rate=0.05,
    max_bin=600,
    seed=2021,
    num_iterations=10000,
)

# Candidate 3: cv mean 0.4925, kfold=6
best_lgb_params3 = dict(
    objective='binary',
    metric='binary_logloss',
    boosting='gbdt',
    num_leaves=1000,
    max_depth=15,
    feature_fraction=0.3,
    bagging_fraction=0.6,
    bagging_freq=5,
    learning_rate=0.01,
    max_bin=125,
    seed=2021,
    min_data_in_leaf=13,
    lambda_l1=2.5,
    extra_trees=True,
    num_iterations=10000,
)

# -------------------色々パラメータで遊んで精度見てみる場所------------------
# lgb_params = {'objective': 'binary',
#               'metric': 'binary_logloss',
#               'boosting': 'gbdt',
#               'num_leaves': 23,
#               'max_depth' : 5,
#               'max_bin' : 600,
#               'min_sum_hessian_in_leaf' : 1e-5,
#               'feature_fraction': 0.8,
#               'bagging_fraction': 0.6,
#               'bagging_freq': 5,
#               'learning_rate': 0.05,
#               'num_iterations' : 5000,
#               'random_state' : 42
#              }

# cv mean: 0.492 -> all features, seasons up to 2019;  0.4813 -> all features, all seasons
# cv mean: (not recorded) -> all features, seasons 2015-2021
# lgb_params = {'objective': 'binary',
#               'metric': 'binary_logloss',
#               'boosting': 'gbdt',
#               'num_leaves': 1000,
#               'max_depth': 15,
#               'feature_fraction': 0.1,
#               'bagging_fraction': 0.6,
#               'bagging_freq': 5,
#               'learning_rate': 0.01,
#               'max_bin':125,
#               'seed':2021,
#               'min_data_in_leaf':13,
#               'lambda_l1': 2.5,
#               'extra_trees': True,
#               'num_iterations':10000
# }

# Parameter set actually passed to model_training below.
# NOTE(review): this uses a regression objective with MAE and DART boosting,
# unlike the binary-logloss candidates above -- presumably a deliberate
# experiment; confirm before reusing.
lgb_params = dict(
    objective='regression',
    metric='mae',
    boosting='dart',
    num_leaves=32,
    min_data_in_leaf=40,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=3,
    learning_rate=0.02,
    num_iterations=300,
    max_depth=5,
)

def _plot_learning_curves(train_evals, valid_evals, metric, n_cols=2):
    """Plot the per-fold training/validation metric curves on a grid of axes."""
    n_folds = len(train_evals)
    n_cols = max(1, min(n_cols, n_folds))
    n_rows = -(-n_folds // n_cols)  # ceiling division

    # squeeze=False keeps `axs` two-dimensional even for a single row/column.
    # With 4 folds this reproduces the original 2x2 grid at figsize (9, 20).
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(9, 10 * n_rows),
                            squeeze=False)

    for i, ax in enumerate(axs.flatten()):
        if i >= n_folds:
            # Hide unused cells when the fold count does not fill the grid.
            ax.set_visible(False)
            continue
        ax.plot(train_evals[i], label='training')
        ax.plot(valid_evals[i], label='validation')
        ax.set(xlabel='interations', ylabel=f'{metric}')
        ax.set_title(f'fold {i+1}', fontsize=12)
        ax.legend(loc='upper right', prop={'size': 9})
    fig.tight_layout()
    plt.show()


def model_training(X, y, cv, groups, params, metric, early_stopping=10,
                   plt_iter=True, X_test=None, cat_features=None):
    """Train one LGBMClassifier per CV fold and average the test predictions.

    Parameters
    ----------
    X, y : training features (DataFrame) and target (Series); rows are
        selected positionally with ``.iloc``.
    cv : splitter exposing ``split(X, y, groups)``.
    groups : group labels forwarded to ``cv.split``.
    params : keyword arguments for ``lgb.LGBMClassifier``.
    metric : key used to read scores from ``best_score_`` / ``evals_result_``
        (the caller in this notebook passes ``'l1'``).
    early_stopping : value passed as ``early_stopping_rounds`` to ``fit``.
    plt_iter : when true, plot the per-fold learning curves.
    X_test : optional test features; when given and non-empty, column 1 of
        ``predict_proba`` is averaged over the folds.
    cat_features : currently unused; kept for interface compatibility.

    Returns
    -------
    feature_importance : DataFrame with the mean importance per feature over
        the folds, sorted in descending order.
    test_pred : fold-averaged test predictions; returned as the second element
        of a tuple only when ``X_test`` was given and non-empty.

    Raises
    ------
    ValueError
        If ``cv`` yields no folds.
    """
    # None replaces the former mutable-list defaults; an explicitly passed
    # empty container is still treated as "no test data".
    has_test = X_test is not None and len(X_test) > 0

    fold_importances = []
    val_scores = []
    train_evals = []
    valid_evals = []
    test_pred = np.zeros(len(X_test)) if has_test else None

    for idx, (train_index, val_index) in enumerate(cv.split(X, y, groups)):

        print("###### fold %d ######" % (idx+1))
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model = lgb.LGBMClassifier(**params)

        # NOTE: `early_stopping_rounds` and `verbose` were removed from
        # fit() in LightGBM 4.0; switch to the `callbacks` argument
        # (lgb.early_stopping / lgb.log_evaluation) when upgrading.
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  early_stopping_rounds=early_stopping,
                  verbose=20
                  )

        # With two eval sets the code reads them back as 'training'
        # and 'valid_1'.
        val_scores.append(model.best_score_['valid_1'][metric])
        train_evals.append(model.evals_result_['training'][metric])
        valid_evals.append(model.evals_result_['valid_1'][metric])

        if has_test:
            test_pred = test_pred + model.predict_proba(X_test)[:, 1]

        fold_importances.append(pd.DataFrame({
            "feature": X_train.columns,
            "importance": model.feature_importances_,
            "fold": idx+1,
        }))

    # Use the number of folds that actually ran instead of
    # cv.get_n_splits(X), which fails for splitters that need `groups`.
    n_folds = len(val_scores)
    if n_folds == 0:
        raise ValueError("cv produced no folds")

    if plt_iter:
        _plot_learning_curves(train_evals, valid_evals, metric)

    print('### CV scores by fold ###')
    for fold_no, score in enumerate(val_scores, start=1):
        print(f'fold {fold_no}: {score:.4f}')
    print('CV mean score: {0:.4f}, std: {1:.4f}.'
          .format(np.mean(val_scores), np.std(val_scores)))

    # Concatenate once after the loop, then average importance per feature.
    feature_importance = pd.concat(fold_importances, axis=0)
    feature_importance = (
        feature_importance[["feature", "importance"]]
        .groupby("feature").mean()
        .sort_values(by="importance", ascending=False)
    )
    feature_importance.reset_index(inplace=True)

    if has_test:
        test_pred = test_pred / n_folds
        return feature_importance, test_pred
    else:
        return feature_importance

In [None]:
# 学習・推定
%%time
group_kfold = GroupKFold(n_splits=4)

feature_importance, test_pred = \
    model_training(X, y, group_kfold, s, lgb_params, 
    'l1', early_stopping=50, plt_iter=True, X_test=X_test)

In [None]:
# Bar chart of the fold-averaged feature importances.
plt.figure(figsize=(10, 10))
ax = sns.barplot(data=feature_importance, x="importance", y="feature")
ax.set_title('Feature Importnace')

In [None]:
# Write the final predictions to the submission file.
MSampleSubmission = pd.read_csv(DIR_PATH+'/MSampleSubmissionStage2.csv')

# Flip the second half of the predictions on a copy: mutating `test_pred`
# in place would flip it back again whenever this cell is re-run.
# NOTE(review): presumably the second half of `test` repeats the same IDs
# with the two teams swapped, hence 1 - p -- confirm against how
# stage1_test.csv is built.
final_pred = test_pred.copy()
idx = final_pred.shape[0] // 2
final_pred[idx:] = 1 - final_pred[idx:]

# Average all predictions sharing an ID (the unnamed Series becomes
# column 0 in the concatenated frame).
pred = (
    pd.concat([test.ID, pd.Series(final_pred)], axis=1)
    .groupby('ID')[0]
    .mean()
    .reset_index()
    .rename(columns={0: 'Pred'})
)
sub = MSampleSubmission.drop(['Pred'], axis=1).merge(pred, on='ID')
sub.to_csv('small_mae_score.csv', index=False)
sub.head()

In [None]:
# Check the distribution of the submitted predictions: histogram plus the
# number of rows above and below 0.5.
sub['Pred'].hist(bins=100)
print((sub['Pred'] > 0.5).sum())
print((sub['Pred'] < 0.5).sum())