In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import optuna
import warnings
from tqdm import tqdm_notebook as tqdm
warnings.simplefilter('ignore')

In [2]:
# 設定値
class CFG:
    # 変更するパラメータ
    n_folds = 5 # 公差検証の分割数(多くて20)
    n_trials = 20 # ハイパーパラメータチューニングの試行回数(100)
    device_type = "cpu"
    # device_type = "cuda"
    boosting_type = "gbdt"
    # boosting_type = "dart"
    
    
    # その他設定値
    learning_rate = 0.01
    seed = 3407 
    target_col = 'Class'
    num_boost_round = 50500
    early_stopping_round = 300
    verbose_eval = 0  # この数字を1にすると学習時のスコア推移がコマンドライン表示される

In [3]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
test_df[CFG.target_col] = -1
submission_df = pd.read_csv("../data/sample_submission.csv")
all_df = pd.concat([train_df, test_df])

BC, CLはいらんかも

In [4]:
numerical_features = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
       'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
       'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
       'EB', 'EE', 'EG', 'EH', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
       'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL']
categorical_features = ['EJ']
features = numerical_features + categorical_features

### balanced loglossの計算（学習で使う？）

In [5]:
# 前処理
def Preprocessing(input_df: pd.DataFrame)->pd.DataFrame:
    output_df = input_df.copy()
    output_df['EJ'] = input_df['EJ'].replace({'A': 0, 'B': 1})
    return output_df

all_df = Preprocessing(all_df)

train_df = all_df[all_df[CFG.target_col] != -1].copy()
test_df = all_df[all_df[CFG.target_col] == -1].copy()

In [14]:
# 評価基準
def balanced_log_loss(y_true, y_pred):
    N = len(y_true)

    # Nc is the number of observations
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    # In order to avoid the extremes of the log function, each predicted probability 𝑝 is replaced with max(min(𝑝,1−10−15),10−15)
    y_pred = np.maximum(np.minimum(y_pred, 1 - 1e-15), 1e-15)

    # balanced logarithmic loss
    loss_numerator = - (1/N_0) * np.sum((1 - y_true) * np.log(1-y_pred)) - (1/N_1) * np.sum(y_true * np.log(y_pred))

    return loss_numerator / 2

"""
# 重み計算
def calc_log_loss_weight(y_true):
    nc = np.bincount(y_true)
    w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])
    return w0, w1
"""

'\n# 重み計算\ndef calc_log_loss_weight(y_true):\n    nc = np.bincount(y_true)\n    w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])\n    return w0, w1\n'

In [17]:
def objective(trial):
    # light-gbm設定値
    lgb_params = {
        # 探索するパラメータ
        'verbosity': -1, # 学習途中の情報を表示するかどうか
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        # "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        
        # 固定値
        "boosting_type": CFG.boosting_type,
        "device_type": CFG.device_type,
        "objective": "binary",
        "learning_rate": CFG.learning_rate,
        "metric": "binary_logloss",
        'seed': CFG.seed,
        'n_jobs': -1, # -1でコア数をマックスで使う
        'is_unbalance':True, # 不均衡データの場合にTrueにする
    }
    
    scores = []
    # K-分割交差検証(層化抽出法)
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    pbar = tqdm(enumerate(kfold.split(train_df, train_df[CFG.target_col])))
    for fold, (train_index, valid_index) in pbar:
    # for fold, (train_index, valid_index) in enumerate(kfold.split(train_df, train_df[CFG.target_col])):
        # 現在の試行回数を出力
        pbar.set_description("[train] trials {}".format(trial.number+1))
        
        x_train = train_df[features].iloc[train_index]
        y_train = train_df[CFG.target_col].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df[CFG.target_col].iloc[valid_index]
        
        # 訓練データの重みを計算
        # train_w0, train_w1 = calc_log_loss_weight(y_train)
        # 検証データの重みを計算
        # valid_w0, valid_w1 = calc_log_loss_weight(y_valid)
        # 訓練データをlgb用に変換
        # lgb_train = lgb.Dataset(x_train, y_train, weight=y_train.map({0: train_w0, 1: train_w1}), categorical_feature=categorical_features)
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
        # 検証データをlgb用に変換
        # lgb_valid = lgb.Dataset(x_valid, y_valid, weight=y_valid.map({0: valid_w0, 1: valid_w1}), categorical_feature=categorical_features)
        lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)
        
        model = lgb.train(
                    params = lgb_params,
                    train_set = lgb_train,
                    num_boost_round = CFG.num_boost_round,
                    valid_sets = [lgb_train, lgb_valid],
                    early_stopping_rounds = CFG.early_stopping_round,
                    verbose_eval = CFG.verbose_eval,
                    # 学習段階でbalanced_log_lossを使う場合はコメントアウト外す
                    # feval = lgb_metric,
                )
        # 予測
        preds = model.predict(x_valid)
        # 予測値をラベルに変換
        # pred_labels = np.rint(preds)
        # 評価
        # val_score = balanced_log_loss(y_valid, pred_labels)
        val_score = balanced_log_loss(y_valid, preds)
        
        scores.append(val_score)
    # クロスバリデーションの平均値を計算
    mean_score = np.mean(scores)
    
    return mean_score

In [18]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=CFG.n_trials)

[32m[I 2023-05-27 16:01:58,052][0m A new study created in memory with name: no-name-3e58ba28-ba89-45fd-be68-8696f92526c2[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:03,435][0m Trial 0 finished with value: 0.29290165828829473 and parameters: {'lambda_l1': 0.000372703234329152, 'lambda_l2': 2.1822312843412873e-08, 'num_leaves': 112, 'feature_fraction': 0.28702606645245266, 'bagging_fraction': 0.9648588149307604, 'min_child_samples': 86}. Best is trial 0 with value: 0.29290165828829473.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:09,985][0m Trial 1 finished with value: 0.2843394327753753 and parameters: {'lambda_l1': 1.1443975964581281e-06, 'lambda_l2': 0.8863748768257927, 'num_leaves': 45, 'feature_fraction': 0.7140779339192966, 'bagging_fraction': 0.9239846456268686, 'min_child_samples': 26}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:13,242][0m Trial 2 finished with value: 0.29368730102566437 and parameters: {'lambda_l1': 4.16659979545921e-05, 'lambda_l2': 4.322224889113739e-06, 'num_leaves': 145, 'feature_fraction': 0.5667953162263044, 'bagging_fraction': 0.2891835421077626, 'min_child_samples': 55}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:17,387][0m Trial 3 finished with value: 0.29198482430990685 and parameters: {'lambda_l1': 2.022422844031056, 'lambda_l2': 0.21173268730411626, 'num_leaves': 141, 'feature_fraction': 0.7210277110943939, 'bagging_fraction': 0.43958747814461074, 'min_child_samples': 13}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:21,568][0m Trial 4 finished with value: 0.28451494803231886 and parameters: {'lambda_l1': 2.650280652948881, 'lambda_l2': 3.104631206139215, 'num_leaves': 230, 'feature_fraction': 0.4798902544638362, 'bagging_fraction': 0.49431761387752726, 'min_child_samples': 48}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:25,212][0m Trial 5 finished with value: 0.2869062161643786 and parameters: {'lambda_l1': 3.424624511646802e-06, 'lambda_l2': 8.481672481407634e-07, 'num_leaves': 87, 'feature_fraction': 0.671352279310308, 'bagging_fraction': 0.8502972013789423, 'min_child_samples': 83}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:33,832][0m Trial 6 finished with value: 0.28446040641581927 and parameters: {'lambda_l1': 8.697129805230756e-05, 'lambda_l2': 5.060364353922726, 'num_leaves': 43, 'feature_fraction': 0.3988383196446328, 'bagging_fraction': 0.24059025848893373, 'min_child_samples': 60}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:37,566][0m Trial 7 finished with value: 0.28883592647399003 and parameters: {'lambda_l1': 0.09443285299031148, 'lambda_l2': 2.445745505897436e-06, 'num_leaves': 186, 'feature_fraction': 0.25011480143823595, 'bagging_fraction': 0.5300555858137639, 'min_child_samples': 57}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:40,934][0m Trial 8 finished with value: 0.42860257010152303 and parameters: {'lambda_l1': 0.013617127866837676, 'lambda_l2': 0.0006936696197731677, 'num_leaves': 34, 'feature_fraction': 0.24494322873350802, 'bagging_fraction': 0.5606333026937805, 'min_child_samples': 14}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:43,991][0m Trial 9 finished with value: 0.2846888597162505 and parameters: {'lambda_l1': 1.6762824749264735e-07, 'lambda_l2': 5.3850291108527945e-06, 'num_leaves': 80, 'feature_fraction': 0.7496997058249917, 'bagging_fraction': 0.41365485147555114, 'min_child_samples': 67}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:49,135][0m Trial 10 finished with value: 0.28830267336919385 and parameters: {'lambda_l1': 2.244749805397705e-08, 'lambda_l2': 0.05852243897637206, 'num_leaves': 15, 'feature_fraction': 0.9454981857316376, 'bagging_fraction': 0.751813385963885, 'min_child_samples': 32}. Best is trial 1 with value: 0.2843394327753753.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:02:59,945][0m Trial 11 finished with value: 0.28413675987046405 and parameters: {'lambda_l1': 3.9934767562612766e-06, 'lambda_l2': 9.987860799477788, 'num_leaves': 51, 'feature_fraction': 0.4331545478737735, 'bagging_fraction': 0.7031145671668497, 'min_child_samples': 32}. Best is trial 11 with value: 0.28413675987046405.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:03:03,600][0m Trial 12 finished with value: 0.29049164400193545 and parameters: {'lambda_l1': 8.862772485604057e-07, 'lambda_l2': 0.042311030190225016, 'num_leaves': 57, 'feature_fraction': 0.535901279134162, 'bagging_fraction': 0.7005435517112131, 'min_child_samples': 32}. Best is trial 11 with value: 0.28413675987046405.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:03:14,740][0m Trial 13 finished with value: 0.2836099591110607 and parameters: {'lambda_l1': 1.477027203108106e-08, 'lambda_l2': 8.235082641629747, 'num_leaves': 20, 'feature_fraction': 0.4205511988676821, 'bagging_fraction': 0.9753292634384617, 'min_child_samples': 31}. Best is trial 13 with value: 0.2836099591110607.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:03:21,766][0m Trial 14 finished with value: 0.3897418834286434 and parameters: {'lambda_l1': 1.7587232285684382e-08, 'lambda_l2': 7.1552099379881575, 'num_leaves': 2, 'feature_fraction': 0.39736541292878996, 'bagging_fraction': 0.9876432295767531, 'min_child_samples': 41}. Best is trial 13 with value: 0.2836099591110607.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:03:25,414][0m Trial 15 finished with value: 0.31767289006605504 and parameters: {'lambda_l1': 1.1023220239037546e-08, 'lambda_l2': 0.00482507291794402, 'num_leaves': 82, 'feature_fraction': 0.3948162816410289, 'bagging_fraction': 0.6642548621892002, 'min_child_samples': 23}. Best is trial 13 with value: 0.2836099591110607.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:03:30,658][0m Trial 16 finished with value: 0.28887938145105535 and parameters: {'lambda_l1': 1.8190022074463324e-07, 'lambda_l2': 0.38942278847803746, 'num_leaves': 3, 'feature_fraction': 0.474521768684922, 'bagging_fraction': 0.878243454885192, 'min_child_samples': 5}. Best is trial 13 with value: 0.2836099591110607.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:03:37,564][0m Trial 17 finished with value: 0.2968690869463586 and parameters: {'lambda_l1': 8.627645505497526e-06, 'lambda_l2': 8.74508927086069, 'num_leaves': 115, 'feature_fraction': 0.32697861148566093, 'bagging_fraction': 0.8121878441660212, 'min_child_samples': 40}. Best is trial 13 with value: 0.2836099591110607.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:03:43,864][0m Trial 18 finished with value: 0.29848787852640135 and parameters: {'lambda_l1': 1.7470309516308746e-07, 'lambda_l2': 0.015825713110714804, 'num_leaves': 178, 'feature_fraction': 0.20885989199022764, 'bagging_fraction': 0.7804120009260685, 'min_child_samples': 98}. Best is trial 13 with value: 0.2836099591110607.[0m


0it [00:00, ?it/s]

[32m[I 2023-05-27 16:03:48,339][0m Trial 19 finished with value: 0.2968203969226896 and parameters: {'lambda_l1': 0.0009239122737725872, 'lambda_l2': 0.5428707367596065, 'num_leaves': 69, 'feature_fraction': 0.3426008679130671, 'bagging_fraction': 0.627039737454987, 'min_child_samples': 43}. Best is trial 13 with value: 0.2836099591110607.[0m


In [None]:
best_params = study.best_trial.params
best_value = study.best_value
print("best_param: {}\n\nbest_value: {}".format(best_params, best_value))

In [None]:
"""
# best_paramsには固定値が保存されないので改めて設定
best_params["boosting_type"] = CFG.boosting_type
best_params["device_type"] = CFG.device_type
best_params["seed"] = CFG.seed
best_params["n_jobs"] = -1
best_params["is_unbalance"] = True
best_params["objective"] = "binary"
best_params["learning_rate"] = 0.005
best_params["metric"] = "binary_logloss"
best_params["verbosity"] = -1

# 設定したハイパーパラメータを基にモデルを作成
model = lgb.LGBMClassifier(**best_params)
# 学習
model.fit(train_df[features], train_df[CFG.target_col])
"""

In [None]:
"""
# 予測
prediction = model.predict_proba(test_df.drop(["Id", "Class"], axis=1))
# 提出用に値を変換
prediction = max(min(prediction, 1-10**(-15)), 10**(-15))
submission = pd.DataFrame(columns = submission_df.columns)
submission['Id'] = test_df['Id']
submission[['class_0','class_1']] = prediction
submission.to_csv('submission.csv',index=False)
submission
"""