In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import optuna
import warnings
from tqdm import tqdm_notebook as tqdm
warnings.simplefilter('ignore')

In [2]:
# Experiment configuration
class CFG:
    """Central place for every tunable value used by the notebook.

    The attributes are read as plain class attributes (``CFG.n_folds`` etc.);
    the class is never instantiated.
    """

    # --- Parameters that are expected to be changed between runs ---
    n_folds = 5  # number of cross-validation splits (at most ~20)
    n_trials = 20  # number of hyper-parameter tuning trials (e.g. 100)
    device_type = "cpu"  # alternative: "cuda"
    boosting_type = "gbdt"  # alternative: "dart"

    # --- Other settings ---
    learning_rate = 0.01
    seed = 3407
    target_col = 'Class'
    num_boost_round = 50500
    early_stopping_round = 300
    verbose_eval = 0  # set to 1 to print the score progression during training

    # --- LightGBM parameters ---
    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
    # effect when bagging_freq is non-zero; no bagging_freq is set here —
    # confirm whether bagging is actually intended.
    lgb_params = dict(
        verbosity=-1,  # controls whether information is printed while training
        lambda_l1=2,
        lambda_l2=4,
        num_leaves=5,
        feature_fraction=0.50,
        bagging_fraction=0.80,
        min_child_samples=0,
        boosting_type=boosting_type,
        device_type=device_type,
        objective="binary",
        learning_rate=learning_rate,
        metric="binary_logloss",
        seed=seed,
        n_jobs=-1,  # -1 is meant to use as many cores as possible
        is_unbalance=True,  # set to True for imbalanced data
    )

In [3]:
# Load the raw competition files.
train_df = pd.read_csv("../data/train.csv")
# Test rows carry no label; tag them with the sentinel -1 so they can be
# separated from the training rows again after the shared preprocessing.
test_df = pd.read_csv("../data/test.csv").assign(**{CFG.target_col: -1})
submission_df = pd.read_csv("../data/sample_submission.csv")
# Stack train and test so preprocessing is applied to both in one pass.
all_df = pd.concat((train_df, test_df), axis=0)

メモ: 特徴量 BC と CL は不要かもしれない（要検証）。

In [4]:
# Numerical input columns. Some names end with a trailing space
# ('BD ', 'CD ', 'CW ', 'FD '); they are written exactly as used for column
# selection, so the spaces must be kept.
numerical_features = [
    'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ',
    'BC', 'BD ', 'BN', 'BP', 'BQ', 'BR', 'BZ',
    'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU', 'CW ',
    'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
    'EB', 'EE', 'EG', 'EH', 'EL', 'EP', 'EU',
    'FC', 'FD ', 'FE', 'FI', 'FL', 'FR', 'FS',
    'GB', 'GE', 'GF', 'GH', 'GI', 'GL',
]
# Columns handed to LightGBM as categorical.
categorical_features = ['EJ']
# Full model input: numerical columns first, categorical columns last.
features = [*numerical_features, *categorical_features]

### balanced loglossの計算（学習で使う？）

In [5]:
# Preprocessing
def Preprocessing(input_df: pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``input_df`` with the 'EJ' column encoded numerically.

    'A' becomes 0 and 'B' becomes 1; any other value in 'EJ' is left as it is.
    The frame passed in is not modified.
    """
    ej_codes = {'A': 0, 'B': 1}
    encoded_ej = input_df['EJ'].replace(ej_codes)
    return input_df.assign(EJ=encoded_ej)

# Encode the combined frame, then separate it again using the sentinel target:
# rows whose target is -1 are test rows, all others are training rows.
all_df = Preprocessing(all_df)

train_df = all_df.loc[all_df[CFG.target_col].ne(-1)].copy()
test_df = all_df.loc[all_df[CFG.target_col].eq(-1)].copy()

In [6]:
# Evaluation metric
def balanced_log_loss(y_true, y_pred):
    """Balanced logarithmic loss for binary labels.

    The log loss is averaged separately over the class-0 and class-1
    observations, and the two per-class averages are then averaged, so both
    classes contribute equally regardless of how many samples each has.

    Args:
        y_true: 1-D array-like (list, ndarray, Series) of labels in {0, 1}.
        y_pred: 1-D array-like of predicted probabilities of class 1, with the
            same number of elements as ``y_true``.

    Returns:
        The balanced log loss as a float.

    Raises:
        ValueError: if the inputs differ in length, or if either class is
            missing from ``y_true`` (the per-class average would be a division
            by zero and would otherwise silently yield NaN/inf).
    """
    # Work on flat float arrays so lists, Series and ndarrays behave the same.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
        )

    # Number of observations in each class.
    N_1 = np.sum(y_true == 1)
    N_0 = np.sum(y_true == 0)
    if N_0 == 0 or N_1 == 0:
        raise ValueError("balanced_log_loss requires at least one sample of each class in y_true")

    # To avoid the extremes of the log function, each predicted probability p
    # is replaced with max(min(p, 1 - 1e-15), 1e-15).
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # Sum of the two per-class mean log losses.
    loss_numerator = (
        - (1 / N_0) * np.sum((1 - y_true) * np.log(1 - y_pred))
        - (1 / N_1) * np.sum(y_true * np.log(y_pred))
    )

    return loss_numerator / 2

In [7]:
def lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features):
    """Train one LightGBM booster with early stopping and predict the validation split.

    Args:
        x_train: Training feature frame.
        y_train: Training labels.
        x_valid: Validation feature frame.
        y_valid: Validation labels.
        features: Not used inside this function; kept so existing call sites
            keep working unchanged.
        categorical_features: Column names passed to LightGBM as categorical.

    Returns:
        A ``(model, valid_pred)`` tuple: the trained booster and its
        predictions for ``x_valid``.
    """
    # Wrap the training and validation splits as LightGBM datasets.
    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorical_features)
    lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=categorical_features)

    # Early stopping and evaluation logging are configured through callbacks:
    # the `early_stopping_rounds` / `verbose_eval` keyword arguments of
    # lgb.train were deprecated in LightGBM 3.3 and removed in 4.0.
    # log_evaluation with period 0 prints nothing, matching verbose_eval = 0.
    callbacks = [
        lgb.early_stopping(
            stopping_rounds=CFG.early_stopping_round,
            verbose=bool(CFG.verbose_eval),
        ),
        lgb.log_evaluation(period=CFG.verbose_eval),
    ]

    model = lgb.train(
        params=CFG.lgb_params,
        train_set=lgb_train,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[lgb_train, lgb_valid],
        callbacks=callbacks,
        # To monitor a custom metric during training, pass it via `feval=`.
    )

    # Predict the validation split with the trained booster.
    valid_pred = model.predict(x_valid)
    return model, valid_pred

In [42]:
# Sum of the test-set predictions over all folds (averaged after the loop).
preds = np.zeros(len(test_df))
# Running sum of the per-fold validation scores (averaged after the loop).
scores = 0

kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
for fold, (train_index, valid_index) in enumerate(kfold.split(train_df, train_df[CFG.target_col])):
    print('training fold {}'.format(fold + 1))

    x_train = train_df[features].iloc[train_index]
    y_train = train_df[CFG.target_col].iloc[train_index]
    x_valid = train_df[features].iloc[valid_index]
    y_valid = train_df[CFG.target_col].iloc[valid_index]

    # Train on this fold (the LightGBM datasets are built inside the helper).
    model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
    # Score the validation split.
    val_score = balanced_log_loss(y_valid, valid_pred)
    # Predict the test set. Columns are selected with `features` so the model
    # sees exactly the same columns, in the same order, as during training;
    # dropping "Id"/"Class" instead would keep the frame's own column order,
    # which is not guaranteed to match `features`.
    pred = model.predict(test_df[features])

    # Accumulate predictions and score.
    preds += pred
    scores += val_score

# Average over folds.
test_pred = preds / CFG.n_folds
cv_score = scores / CFG.n_folds
print(f'our out of folds CV score is {cv_score}')

training fold 1
training fold 2
training fold 3
training fold 4
training fold 5
our out of folds CV score is 0.284627902958425


In [39]:
# Assemble the submission frame: start from the sample submission's columns and
# fill in the test Ids together with the averaged class probabilities.
submission = pd.DataFrame(columns=submission_df.columns).assign(
    Id=test_df['Id'],
    class_0=1 - test_pred,
    class_1=test_pred,
)
# submission.to_csv('submission.csv',index=False)
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.731868,0.268132
1,010ebe33f668,0.731868,0.268132
2,02fa521e1838,0.731868,0.268132
3,040e15f562a2,0.731868,0.268132
4,046e85c7cc7f,0.731868,0.268132
