# Kaggleで磨く 機械学習の実践力
# 第7章 2値分類のコンペ (Home Credit Default Risk)

# 7.3: ベースライン作成

## 7.3.2 データ前処理
#### スクリプト7-1: ライブラリの読み込み

In [1]:
import numpy as np
import pandas as pd
import re
import pickle
import gc

# scikit-learn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

#### スクリプト7-2: ファイルの読み込み・データ確認

In [2]:
# Load the training table from the competition input directory
application_train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
# Show (rows, columns), then preview the first rows
print(application_train.shape)
application_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


#### スクリプト7-3: メモリ削減のための関数

In [3]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to the narrowest fitting dtype.

    Signed-integer columns are narrowed to int8/int16/int32/int64 and floating
    columns to float16/float32 (float64 when neither fits), based on the
    column's minimum and maximum. Columns of any other dtype (object, bool,
    unsigned integer, category, datetime, pandas extension dtypes, ...) are
    left untouched.

    The memory usage before and after, and the relative reduction, are printed.

    Note:
        float16/float32 hold fewer significant digits than float64, so values
        in floating columns may be rounded by the conversion.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain numpy signed-int / float columns are safe to downcast here;
        # everything else (bool, unsigned, category, datetime, ...) is skipped.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ("i", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == "i":
            # Bounds are inclusive: a value equal to the dtype's limit still fits.
            for candidate in int_types:
                info = np.iinfo(candidate)
                if info.min <= c_min and c_max <= info.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # If min/max are NaN (all-missing column) or infinite, no candidate
            # matches and the column ends up as float64.
            target = np.float64
            for candidate in float_types:
                info = np.finfo(candidate)
                if info.min <= c_min and c_max <= info.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

#### スクリプト7-4: メモリ削減の実行

In [4]:
# Downcast the numeric columns of the training table (prints before/after memory)
application_train = reduce_mem_usage(application_train)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%


## 7.3.3 データセット作成
#### スクリプト7-5: データセットの作成

In [5]:
# Features: every column except the target and the row identifier
x_train = application_train.drop(columns=["TARGET", "SK_ID_CURR"])
# Target column
y_train = application_train["TARGET"]
# Identifier, kept as a one-column DataFrame
id_train = application_train[["SK_ID_CURR"]]

#### スクリプト7-6: カテゴリ変数をcategory型に変換 

In [6]:
# Convert every object-dtype column to the pandas category dtype
for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

## 7.3.4 バリデーション設計
#### スクリプト7-7: 1の割合とそれぞれの件数を確認

In [7]:
# Share of rows with TARGET == 1, then the row count of each class
print("mean: {:.4f}".format(y_train.mean()))
y_train.value_counts()

mean: 0.0807


0    282686
1     24825
Name: TARGET, dtype: int64

#### スクリプト7-8: バリデーションのindexリスト作成

In [8]:
# Build the list of (train indices, valid indices) pairs from a stratified 5-fold split
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

# Inspect the indices: training part of fold 0
print("index(train):", cv[0][0])

# Inspect the indices: validation part of fold 0
print("index(valid):", cv[0][1])

index(train): [     0      1      3 ... 307508 307509 307510]
index(valid): [     2     11     22 ... 307488 307495 307497]


## 7.3.5 モデル学習
#### スクリプト7-9: 学習データと検証データに分離

In [9]:
# Build the list of per-fold (train indices, valid indices) pairs
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

# Take the index arrays of fold 0
nfold = 0
idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

# Split into training and validation parts.
# StratifiedKFold.split yields positional indices, so rows are selected with
# .iloc; label-based .loc only gives the same rows on a default RangeIndex.
x_tr, y_tr, id_tr = x_train.iloc[idx_tr, :], y_train.iloc[idx_tr], id_train.iloc[idx_tr, :]
x_va, y_va, id_va = x_train.iloc[idx_va, :], y_train.iloc[idx_va], id_train.iloc[idx_va, :]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)

(246008, 120) (246008,) (246008, 1)
(61503, 120) (61503,) (61503, 1)


#### スクリプト7-10: モデル学習

In [10]:
# Hyperparameters passed to LGBMClassifier
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# Train the model.
# Early stopping and periodic logging are supplied as callbacks: the fit()
# keywords `early_stopping_rounds` and `verbose` were removed in LightGBM 4.0,
# while these callbacks are available from LightGBM 3.3 onwards.
model = lgb.LGBMClassifier(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr, y_tr), (x_va, y_va)],
          callbacks=[
              lgb.early_stopping(stopping_rounds=100),
              lgb.log_evaluation(period=100),
          ],
         )

# Save the fitted model
with open("model_lgb_fold0.pickle", "wb") as f:
    pickle.dump(model, f, protocol=4)

[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774


#### スクリプト7-11: モデル評価

In [11]:
# Predicted probability of the positive class for the training and validation rows
y_tr_pred = model.predict_proba(x_tr)[:, 1]
y_va_pred = model.predict_proba(x_va)[:, 1]

# ROC AUC on each part
metric_tr = roc_auc_score(y_tr, y_tr_pred)
metric_va = roc_auc_score(y_va, y_va_pred)

# Start the per-fold score list (first fold only) with this fold's row:
# [fold number, training AUC, validation AUC]
metrics = [[nfold, metric_tr, metric_va]]

# Show the result
print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

[auc] tr:0.8126, va:0.7586


#### スクリプト7-12: OOFデータの推論値取得

In [12]:
# Create the array that holds out-of-fold predictions (one slot per training row, initially 0)
train_oof = np.zeros(len(x_train))

# Store this fold's validation predictions at the validation row positions
train_oof[idx_va] = y_va_pred

#### スクリプト7-13: 説明変数の重要度取得

In [13]:
# Collect the feature importances of this fold's model
imp_fold = pd.DataFrame({"col":x_train.columns, "imp":model.feature_importances_, "nfold":nfold})
# Check: the 10 features with the highest importance
display(imp_fold.sort_values("imp", ascending=False)[:10])

# Create the DataFrame that accumulates importances across the folds
imp = pd.DataFrame()
# Append this fold's importances to the accumulator
imp = pd.concat([imp, imp_fold])

Unnamed: 0,col,imp,nfold
41,EXT_SOURCE_3,66225.020483,0
40,EXT_SOURCE_2,52568.833805,0
38,ORGANIZATION_TYPE,20218.523523,0
39,EXT_SOURCE_1,19776.252288,0
6,AMT_CREDIT,8111.321247,0
8,AMT_GOODS_PRICE,7120.960365,0
15,DAYS_BIRTH,7042.223005,0
7,AMT_ANNUITY,6992.551795,0
16,DAYS_EMPLOYED,5236.51412,0
26,OCCUPATION_TYPE,4376.651746,0


#### スクリプト7-14: モデル評価（全foldのサマリ）

In [14]:
# Convert the list of per-fold rows to an array (columns: fold, train AUC, valid AUC)
metrics = np.array(metrics)
print(metrics)

# Mean and standard deviation of the training / validation scores over the folds
print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
    metrics[:,1].mean(), metrics[:,1].std(),
    metrics[:,2].mean(), metrics[:,2].std(),
))

# AUC of the out-of-fold predictions over all training rows
# (rows of folds that have not been predicted yet still hold 0)
print("[oof] {:.4f}".format(
    roc_auc_score(y_train, train_oof)
))

[[0.         0.81257796 0.75859528]]
[cv] tr:0.8126+-0.0000, va:0.7586+-0.0000
[oof] 0.5103


#### スクリプト7-15: OOFデータの推論値取得（全foldのサマリ）

In [15]:
# Put the id, the true label and the out-of-fold prediction side by side
# (pd.concat with axis=1 aligns the pieces on their index)
train_oof = pd.concat([
    id_train,
    pd.DataFrame({"true": y_train, "pred": train_oof}),
], axis=1)
train_oof.head()

Unnamed: 0,SK_ID_CURR,true,pred
0,100002,1,0.0
1,100003,0,0.0
2,100004,0,0.031866
3,100006,0,0.0
4,100007,0,0.0


#### スクリプト7-16: 説明変数の重要度取得（全foldのサマリ）

In [16]:
# Per-feature mean and standard deviation of the importance across the stored folds
# (with a single fold stored the standard deviation is NaN)
imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
imp.columns = ["col", "imp", "imp_std"]
imp.head()

Unnamed: 0,col,imp,imp_std
0,AMT_ANNUITY,6992.551795,
1,AMT_CREDIT,8111.321247,
2,AMT_GOODS_PRICE,7120.960365,
3,AMT_INCOME_TOTAL,1595.740609,
4,AMT_REQ_CREDIT_BUREAU_DAY,128.842901,


#### スクリプト7-17: 学習関数の定義

In [17]:
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=[0,1,2,3,4],
              n_splits=5,
             ):
    """Train and evaluate one LightGBM classifier per selected cross-validation fold.

    The rows are split with ``StratifiedKFold(n_splits, shuffle=True,
    random_state=123)``. For every fold number in ``list_nfold`` a model is
    fitted with early stopping (100 rounds), pickled to
    ``model_lgb_fold{nfold}.pickle`` in the working directory, and scored with
    ROC AUC on its training and validation parts. Progress and a summary are
    printed.

    Args:
        input_x: Feature DataFrame.
        input_y: Target Series, row-aligned with ``input_x``.
        input_id: DataFrame of identifier column(s), row-aligned with ``input_x``.
        params: Keyword arguments passed to ``lgb.LGBMClassifier``.
        list_nfold: Fold numbers to train; each indexes the list of
            ``n_splits`` folds. The list is only read, never modified.
        n_splits: Number of folds of the stratified split.

    Returns:
        Tuple ``(train_oof, imp, metrics)``:
            train_oof: ``input_id`` with an added ``pred`` column holding the
                out-of-fold predictions (rows of folds not in ``list_nfold``
                keep the initial 0).
            imp: DataFrame with columns ``col``, ``imp`` (mean importance over
                the trained folds) and ``imp_std``.
            metrics: array with one row ``[nfold, train AUC, valid AUC]`` per
                trained fold.
    """
    train_oof = np.zeros(len(input_x))
    metrics = []
    imp = pd.DataFrame()

    # cross-validation
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)

        # make dataset
        # split() yields positional indices, so rows are selected with .iloc;
        # label-based selection is only equivalent on a default RangeIndex.
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = input_x.iloc[idx_tr, :], input_y.iloc[idx_tr]
        x_va, y_va = input_x.iloc[idx_va, :], input_y.iloc[idx_va]
        print(x_tr.shape, x_va.shape)

        # train
        # Early stopping and logging are passed as callbacks: the fit()
        # keywords `early_stopping_rounds` / `verbose` were removed in
        # LightGBM 4.0, while the callbacks exist from LightGBM 3.3 onwards.
        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=100),
                      lgb.log_evaluation(period=100),
                  ],
                 )
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "wb") as f:
            pickle.dump(model, f, protocol=4)

        # evaluate
        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

        # oof: train_oof is a plain array, so positional indices are correct here
        train_oof[idx_va] = y_va_pred

        # imp
        _imp = pd.DataFrame({"col":input_x.columns, "imp":model.feature_importances_, "nfold":nfold})
        imp = pd.concat([imp, _imp])

    print("-"*20, "result", "-"*20)
    # metric
    metrics = np.array(metrics)
    print(metrics)
    print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))
    print("[oof] {:.4f}".format(
        roc_auc_score(input_y, train_oof)
    ))

    # oof
    # Give the prediction frame the ids' index so that the column-wise concat
    # pairs rows by position even when input_id has a non-default index.
    train_oof = pd.concat([
        input_id,
        pd.DataFrame({"pred":train_oof}, index=input_id.index)
    ], axis=1)

    # importance
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
    imp.columns = ["col", "imp", "imp_std"]

    return train_oof, imp, metrics

#### スクリプト7-18: 学習処理の実行

In [18]:
# Hyperparameter settings
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# Run the training on all five folds
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 120) (61503, 120)
[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774
[auc] tr:0.8126, va:0.7586
-------------------- 1 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.782531	valid_1's auc: 0.756239
[200]	training's auc: 0.808862	valid_1's auc: 0.758924
[300]	training's auc: 0.829564	valid_1's auc: 0.758779
[auc] tr:0.8170, va:0.7590
-------------------- 2 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.782101	valid_1's auc: 0.758221
[200]	training's auc: 0.809587	valid_1's auc: 0.760104
[300]	training's auc: 0.830474	valid_1's auc: 0.760275
[400]	training's auc: 0.847388	valid_1's auc: 0.759875
[auc] tr:0.8362, va:0.7604
-------------------- 3 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.783853	valid_1's auc: 0.754567
[200]	training's auc: 0.811501

#### スクリプト7-19: 説明変数の重要度の確認

In [19]:
# Ten features with the highest mean importance
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
38,EXT_SOURCE_3,65353.907478,1558.201212
37,EXT_SOURCE_2,54545.388309,1251.798934
102,ORGANIZATION_TYPE,21441.917474,1450.24619
36,EXT_SOURCE_1,20051.934248,685.852224
1,AMT_CREDIT,8263.228728,410.384434
22,DAYS_BIRTH,7645.58911,689.458833
2,AMT_GOODS_PRICE,7263.054566,405.837031
0,AMT_ANNUITY,6762.95364,479.302045
23,DAYS_EMPLOYED,5810.288375,552.93773
101,OCCUPATION_TYPE,5502.675859,831.872392


## 7.3.6 モデル推論
#### スクリプト7-20: 推論用データセットの作成

In [20]:
# Load the test table and downcast its numeric columns
application_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")
application_test = reduce_mem_usage(application_test)

# Build the dataset: features (everything except the id) and the id frame
x_test = application_test.drop(columns=["SK_ID_CURR" ])
id_test = application_test[["SK_ID_CURR"]]

# Convert object-dtype columns to the category dtype
for col in x_test.columns:
    if x_test[col].dtype=="O":
        x_test[col] = x_test[col].astype("category")

Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 14.60 MB
Decreased by 67.6%


#### スクリプト7-21: 学習済モデルの読み込み

In [21]:
# Load the fold-0 model from its pickle file
# (pickle.load can execute arbitrary code, so only load files you trust)
with open("model_lgb_fold0.pickle", "rb") as f:
    model = pickle.load(f)

#### スクリプト7-22: モデルを用いた推論

In [22]:
# Predict the positive-class probability for the test rows
test_pred_fold = model.predict_proba(x_test)[:,1]

# Create the array that holds the predictions: one column per fold (5 columns, initially 0)
test_pred = np.zeros((len(x_test), 5))

# Store this model's predictions in the first column
test_pred[:, 0] = test_pred_fold

#### スクリプト7-23: 推論用データセットの推論値算出

In [23]:
# Average the predictions across the fold columns.
# NOTE: test_pred has 5 columns; any column that has not been filled yet is
# still 0 and pulls this mean down accordingly.
test_pred_mean = test_pred.mean(axis=1)

# Build the prediction DataFrame: id next to the averaged prediction
df_test_pred = pd.concat([
        id_test,
        pd.DataFrame({"pred": test_pred_mean}),
    ], axis=1)
df_test_pred.head()

Unnamed: 0,SK_ID_CURR,pred
0,100001,0.006572
1,100005,0.023874
2,100013,0.004233
3,100028,0.008966
4,100038,0.030794


#### スクリプト7-24: 推論関数の定義

In [24]:
def predict_lgb(input_x,
                input_id,
                list_nfold=[0,1,2,3,4],
               ):
    """Average the predictions of the pickled per-fold models.

    For each fold number in ``list_nfold`` the model stored in
    ``model_lgb_fold{nfold}.pickle`` (working directory) is loaded and its
    positive-class probability for ``input_x`` is computed. The result is the
    row-wise mean over the loaded models.

    Args:
        input_x: Feature DataFrame to predict on.
        input_id: DataFrame of identifier column(s), row-aligned with ``input_x``.
        list_nfold: Fold numbers whose models are loaded. Any subset or order
            is accepted. The list is only read, never modified.

    Returns:
        DataFrame made of ``input_id`` plus a ``pred`` column with the averaged
        probability.
    """
    pred = np.zeros((len(input_x), len(list_nfold)))
    # Fill columns by position in list_nfold, not by fold number, so that
    # selections such as [1, 3] or [4] do not index past the array width.
    for i, nfold in enumerate(list_nfold):
        print("-"*20, nfold, "-"*20)
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
        with open(fname_lgb, "rb") as f:
            model = pickle.load(f)
        pred[:, i] = model.predict_proba(input_x)[:,1]

    # Reuse the ids' index so the column-wise concat pairs rows by position
    # even when input_id does not carry a default RangeIndex.
    pred = pd.concat([
        input_id,
        pd.DataFrame({"pred": pred.mean(axis=1)}, index=input_id.index),
    ], axis=1)

    print("Done.")

    return pred

#### スクリプト7-25: 推論処理の実行

In [25]:
# Predict the test rows with the five saved fold models and average them
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


#### スクリプト7-26: 提出ファイルの作成

In [26]:
# Rename the prediction column to TARGET for the submission file
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())

# Write the file (without the index column)
df_submit.to_csv("submission_baseline.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.04181
1,100005,0.1264
2,100013,0.022495
3,100028,0.03968
4,100038,0.156628


# 7.4 特徴量エンジニアリング
## 7.4.1 特徴量エンジニアリング: application_train.csv

#### スクリプト7-27: データの確認

In [27]:
# Frequency of each DAYS_EMPLOYED value, then the share and count of positive values
display(application_train["DAYS_EMPLOYED"].value_counts())
print("正の値の割合: {:.4f}".format((application_train["DAYS_EMPLOYED"]>0).mean()))
print("正の値の個数: {}".format((application_train["DAYS_EMPLOYED"]>0).sum()))
# -> About 18% of the values are positive, and the count of positive values equals
#    the count of the single value 365243, i.e. every positive value is 365243.
#    Days since starting work are recorded as negative numbers, so this value is
#    judged to be a placeholder for "missing".

 365243    55374
-200         156
-224         152
-230         151
-199         151
           ...  
-13961         1
-11827         1
-10176         1
-9459          1
-8694          1
Name: DAYS_EMPLOYED, Length: 12574, dtype: int64

正の値の割合: 0.1801
正の値の個数: 55374


#### スクリプト7-28: 欠損値の対処（nullに変換）

In [28]:
# Replace the placeholder value 365243 with NaN so it is treated as missing
application_train["DAYS_EMPLOYED"] = application_train["DAYS_EMPLOYED"].replace(365243, np.nan)

#### スクリプト7-29: 仮説に基づく特徴量生成

In [29]:
# Feature 1: total income divided by the number of family members
application_train["INCOME_div_PERSON"] = (
    application_train["AMT_INCOME_TOTAL"] / application_train["CNT_FAM_MEMBERS"]
)

# Feature 2: total income divided by the employment period
application_train["INCOME_div_EMPLOYED"] = (
    application_train["AMT_INCOME_TOTAL"] / application_train["DAYS_EMPLOYED"]
)

# Feature 3: row-wise statistics of the three external scores
_ext_source = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]]
application_train["EXT_SOURCE_mean"] = _ext_source.mean(axis=1)
application_train["EXT_SOURCE_max"] = _ext_source.max(axis=1)
application_train["EXT_SOURCE_min"] = _ext_source.min(axis=1)
application_train["EXT_SOURCE_std"] = _ext_source.std(axis=1)
# number of external scores that are present (not null) in the row
application_train["EXT_SOURCE_count"] = _ext_source.notnull().sum(axis=1)

# Feature 4: employment period divided by age in days (share of life spent employed)
application_train["DAYS_EMPLOYED_div_BIRTH"] = (
    application_train["DAYS_EMPLOYED"] / application_train["DAYS_BIRTH"]
)

# Feature 5: annuity amount divided by total income
application_train["ANNUITY_div_INCOME"] = (
    application_train["AMT_ANNUITY"] / application_train["AMT_INCOME_TOTAL"]
)

# Feature 6: annuity amount divided by the credit amount
application_train["ANNUITY_div_CREDIT"] = (
    application_train["AMT_ANNUITY"] / application_train["AMT_CREDIT"]
)

#### スクリプト7-30: データセットの作成

In [30]:
# Rebuild features / target / id from the table that now holds the new features
x_train = application_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = application_train["TARGET"]
id_train = application_train[["SK_ID_CURR"]]

# Convert object-dtype columns to the category dtype
for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

#### スクリプト7-31: モデル学習

In [31]:
# Train on all five folds with the extended feature set
# (this overwrites the model_lgb_fold*.pickle files written by earlier runs)
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 130) (61503, 130)
[100]	training's auc: 0.787817	valid_1's auc: 0.760032
[200]	training's auc: 0.816788	valid_1's auc: 0.763696
[300]	training's auc: 0.838351	valid_1's auc: 0.764008
[400]	training's auc: 0.856611	valid_1's auc: 0.764045
[500]	training's auc: 0.871304	valid_1's auc: 0.764075
[auc] tr:0.8585, va:0.7641
-------------------- 1 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.788378	valid_1's auc: 0.763077
[200]	training's auc: 0.816816	valid_1's auc: 0.766784
[300]	training's auc: 0.838169	valid_1's auc: 0.767287
[400]	training's auc: 0.856163	valid_1's auc: 0.767434
[auc] tr:0.8471, va:0.7675
-------------------- 2 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.787655	valid_1's auc: 0.764182
[200]	training's auc: 0.817121	valid_1's auc: 0.767566
[300]	training's auc: 0.837872	valid_1's auc: 0.767677
[400]	training's auc: 0.855451	valid_1's auc: 0.76783
[auc] tr:0.8519, va

#### スクリプト7-32: 説明変数の重要度の確認

In [32]:
# Ten features with the highest mean importance
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
44,EXT_SOURCE_mean,114005.214702,1381.645644
10,ANNUITY_div_CREDIT,23720.30155,805.397477
112,ORGANIZATION_TYPE,22660.210567,1372.230448
41,EXT_SOURCE_3,12046.854638,886.653726
24,DAYS_BIRTH,8108.684084,578.972393
45,EXT_SOURCE_min,7727.391587,314.203161
39,EXT_SOURCE_1,7155.619219,472.422492
2,AMT_GOODS_PRICE,6148.167858,364.159044
0,AMT_ANNUITY,6091.80521,581.9879
46,EXT_SOURCE_std,5830.39069,679.963947


#### スクリプト7-33: 推論用データのデータセット作成

In [33]:
# Replace the placeholder value 365243 with NaN
application_test["DAYS_EMPLOYED"] = application_test["DAYS_EMPLOYED"].replace(365243, np.nan)

# Generate the features (same definitions and same order as for the training table)
application_test["INCOME_div_PERSON"] = (
    application_test["AMT_INCOME_TOTAL"] / application_test["CNT_FAM_MEMBERS"]
)
application_test["INCOME_div_EMPLOYED"] = (
    application_test["AMT_INCOME_TOTAL"] / application_test["DAYS_EMPLOYED"]
)
_ext_source_test = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]]
application_test["EXT_SOURCE_mean"] = _ext_source_test.mean(axis=1)
application_test["EXT_SOURCE_max"] = _ext_source_test.max(axis=1)
application_test["EXT_SOURCE_min"] = _ext_source_test.min(axis=1)
application_test["EXT_SOURCE_std"] = _ext_source_test.std(axis=1)
application_test["EXT_SOURCE_count"] = _ext_source_test.notnull().sum(axis=1)
application_test["DAYS_EMPLOYED_div_BIRTH"] = (
    application_test["DAYS_EMPLOYED"] / application_test["DAYS_BIRTH"]
)
application_test["ANNUITY_div_INCOME"] = (
    application_test["AMT_ANNUITY"] / application_test["AMT_INCOME_TOTAL"]
)
application_test["ANNUITY_div_CREDIT"] = (
    application_test["AMT_ANNUITY"] / application_test["AMT_CREDIT"]
)

# Build the dataset: id frame and feature matrix
id_test = application_test[["SK_ID_CURR"]]
x_test = application_test.drop(columns=["SK_ID_CURR"])

# Convert object-dtype columns to the category dtype
for col in x_test.columns:
    if x_test[col].dtype == "O":
        x_test[col] = x_test[col].astype("category")



#### スクリプト7-34: 推論処理

In [34]:
# Predict the test rows with the five saved fold models and average them
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


#### スクリプト7-35: 提出ファイルの作成

In [35]:
# Rename the prediction column to TARGET, check it, and write the submission file
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("submission_FeatureEngineering1.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.029002
1,100005,0.121782
2,100013,0.022668
3,100028,0.044435
4,100038,0.18194


## 7.4.2 特徴量エンジニアリング: POS_CASH_balance.csv

#### スクリプト7-36: ファイル読み込み

In [36]:
# Load the POS_CASH_balance table and downcast its numeric columns
pos = pd.read_csv("../input/home-credit-default-risk/POS_CASH_balance.csv")
pos = reduce_mem_usage(pos)
print(pos.shape)
pos.head()

Memory usage of dataframe is 610.43 MB
Memory usage after optimization is: 238.45 MB
Decreased by 60.9%
(10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


#### スクリプト7-37: ①カテゴリ変数をone-hot-encodingで数値に変換

In [37]:
# One-hot encode the contract status, with an extra indicator column for missing values
pos_ohe = pd.get_dummies(pos, columns=["NAME_CONTRACT_STATUS"], dummy_na=True)
# Names of the columns the encoding added, in sorted order
col_ohe = sorted(pos_ohe.columns.difference(pos.columns))
print(len(col_ohe))
col_ohe

10


['NAME_CONTRACT_STATUS_Active',
 'NAME_CONTRACT_STATUS_Amortized debt',
 'NAME_CONTRACT_STATUS_Approved',
 'NAME_CONTRACT_STATUS_Canceled',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Demand',
 'NAME_CONTRACT_STATUS_Returned to the store',
 'NAME_CONTRACT_STATUS_Signed',
 'NAME_CONTRACT_STATUS_XNA',
 'NAME_CONTRACT_STATUS_nan']

#### スクリプト7-38: ②SK_ID_CURRをキーに集約処理

In [38]:
# Aggregate the rows of each SK_ID_CURR into a single row
pos_ohe_agg = pos_ohe.groupby("SK_ID_CURR").agg(
    {
        # numeric columns: mean / std / min / max
        **{
            name: ["mean", "std", "min", "max"]
            for name in [
                "MONTHS_BALANCE",
                "CNT_INSTALMENT",
                "CNT_INSTALMENT_FUTURE",
                "SK_DPD",
                "SK_DPD_DEF",
            ]
        },
        # one-hot encoded status columns: mean of the indicator
        **{
            name: ["mean"]
            for name in [
                "NAME_CONTRACT_STATUS_Active",
                "NAME_CONTRACT_STATUS_Amortized debt",
                "NAME_CONTRACT_STATUS_Approved",
                "NAME_CONTRACT_STATUS_Canceled",
                "NAME_CONTRACT_STATUS_Completed",
                "NAME_CONTRACT_STATUS_Demand",
                "NAME_CONTRACT_STATUS_Returned to the store",
                "NAME_CONTRACT_STATUS_Signed",
                "NAME_CONTRACT_STATUS_XNA",
                "NAME_CONTRACT_STATUS_nan",
            ]
        },
        # number of records and number of distinct previous-loan ids
        "SK_ID_PREV": ["count", "nunique"],
    }
)

# Flatten the (column, statistic) pairs into "<column>_<statistic>" names
pos_ohe_agg.columns = ["_".join(pair) for pair in pos_ohe_agg.columns]
# Turn the group key back into a regular column
pos_ohe_agg = pos_ohe_agg.reset_index()

print(pos_ohe_agg.shape)
pos_ohe_agg.head()

(337252, 33)


Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_mean,MONTHS_BALANCE_std,MONTHS_BALANCE_min,MONTHS_BALANCE_max,CNT_INSTALMENT_mean,CNT_INSTALMENT_std,CNT_INSTALMENT_min,CNT_INSTALMENT_max,CNT_INSTALMENT_FUTURE_mean,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100001,-72.555556,20.863312,-96,-53,4.0,0.0,4.0,4.0,1.444336,...,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,9,2
1,100002,-10.0,5.627314,-19,-1,24.0,0.0,24.0,24.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,1
2,100003,-43.785714,24.640162,-77,-18,10.109375,2.806597,6.0,12.0,5.785156,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28,3
3,100004,-25.5,1.290994,-27,-24,3.75,0.5,3.0,4.0,2.25,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4,1
4,100005,-20.0,3.316625,-25,-15,11.703125,0.948683,9.0,12.0,7.199219,...,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,11,1


#### スクリプト7-39: ③SK_ID_CURRをキーにして結合

In [39]:
# Left-join the aggregated POS_CASH features onto the training table by SK_ID_CURR
# (training rows without a match get NaN in the joined columns)
df_train = pd.merge(application_train, pos_ohe_agg, on="SK_ID_CURR", how="left")
print(df_train.shape)
df_train.head()

(307511, 164)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28.0,3.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4.0,1.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.095238,0.0,0.047619,0.0,0.0,0.0,21.0,3.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.045455,0.0,0.0,0.015152,0.0,0.0,66.0,5.0


#### スクリプト7-40: データセット作成

In [40]:
# Build features / target / id from the merged training table
x_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = df_train["TARGET"]
id_train = df_train[["SK_ID_CURR"]]

# Convert object-dtype columns to the category dtype
for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

#### スクリプト7-41: モデル学習

In [41]:
# Train on all five folds with the POS_CASH features included
# (this overwrites the model_lgb_fold*.pickle files written by earlier runs)
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 162) (61503, 162)
[100]	training's auc: 0.794548	valid_1's auc: 0.76534
[200]	training's auc: 0.825571	valid_1's auc: 0.77062
[300]	training's auc: 0.848468	valid_1's auc: 0.771422
[400]	training's auc: 0.866087	valid_1's auc: 0.771575
[auc] tr:0.8578, va:0.7719
-------------------- 1 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.794349	valid_1's auc: 0.769227
[200]	training's auc: 0.824921	valid_1's auc: 0.774694
[300]	training's auc: 0.847296	valid_1's auc: 0.775643
[400]	training's auc: 0.864781	valid_1's auc: 0.775882
[500]	training's auc: 0.880069	valid_1's auc: 0.775861
[auc] tr:0.8744, va:0.7761
-------------------- 2 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.794977	valid_1's auc: 0.768857
[200]	training's auc: 0.82562	valid_1's auc: 0.773055
[300]	training's auc: 0.847297	valid_1's auc: 0.773516
[400]	training's auc: 0.865995	valid_1's auc: 0.774559
[500]	training's auc:

#### スクリプト7-42: 説明変数の重要度の確認

In [42]:
# Ten features with the highest mean importance
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
52,EXT_SOURCE_mean,112438.907936,1217.139287
134,ORGANIZATION_TYPE,21573.968751,1044.080966
10,ANNUITY_div_CREDIT,18349.279658,1039.471604
49,EXT_SOURCE_3,10710.855987,490.719084
53,EXT_SOURCE_min,7021.835349,444.955386
32,DAYS_BIRTH,6666.389282,814.801948
47,EXT_SOURCE_1,6605.474412,601.782028
21,CNT_INSTALMENT_FUTURE_mean,6289.278576,365.694448
0,AMT_ANNUITY,5563.190447,368.625974
108,MONTHS_BALANCE_std,5340.370365,466.201881


#### スクリプト7-43: 推論用のデータセット作成

In [43]:
# Join the aggregated POS_CASH features onto the test table
df_test = pd.merge(application_test, pos_ohe_agg, on="SK_ID_CURR", how="left")

# Build the dataset: features and id frame
x_test = df_test.drop(columns=["SK_ID_CURR"])
id_test = df_test[["SK_ID_CURR"]]

# Convert object-dtype columns to the category dtype
for col in x_test.columns:
    if x_test[col].dtype=="O":
        x_test[col] = x_test[col].astype("category")

#### スクリプト7-44: 推論用データセットを用いた推論処理

In [44]:
# Predict the test rows with the five saved fold models and average them
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


#### スクリプト7-45: 提出ファイルの作成

In [45]:
# Rename the prediction column to TARGET, check it, and write the submission file
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("submission_FeatureEngineering2.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.032163
1,100005,0.1044
2,100013,0.025425
3,100028,0.047522
4,100038,0.210907


# 7.5 モデルチューニング
- LightGBMのハイパーパラメータのチューニング

#### スクリプト7-46: 重要度を用いて絞り込んだ特徴量リストの作成（以降では利用しない）

In [46]:
# Names of the 100 features with the highest mean importance, sorted alphabetically
col_filter = sorted(imp.sort_values("imp", ascending=False)["col"].head(100))
# (evaluate `col_filter` on its own to inspect the list)

## 7.5.1 optunaによる自動チューニングの実行

#### スクリプト7-47: optunaライブラリのインポート

In [47]:
import optuna

#### スクリプト7-48: 学習用のデータセット作成

In [48]:
# Build features / target / id from the merged training table for the tuning runs
x_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = df_train["TARGET"]
id_train = df_train[["SK_ID_CURR"]]

# Convert object-dtype columns to the category dtype
for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

#### スクリプト7-49: 目的関数の定義

In [49]:
# Hyperparameters that are held fixed during the search
params_base = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "learning_rate": 0.05,
    "n_estimators": 100000,
    "bagging_freq": 1,
    "random_state": 123,
}

# Definition of the objective function
def objective(trial):
    """Return the validation AUC of a LightGBM model built from the trial's parameters.

    The searched hyperparameters are drawn from ``trial`` and combined with
    the module-level ``params_base``. The model is trained on the module-level
    ``x_train`` / ``y_train`` using fold 0 of a stratified 5-fold split
    (``random_state=123``) with early stopping after 100 rounds.

    Args:
        trial: Optuna trial object used to suggest hyperparameter values.

    Returns:
        Mean validation ROC AUC over the evaluated folds (only fold 0 here).
    """
    # Hyperparameters to search
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e+2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e+2, log=True),
    }
    params_tuning.update(params_base)

    # Train and evaluate the model
    list_metrics = []
    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))
    list_fold = [0]  # only the first fold, to keep each trial fast
    for nfold in list_fold:
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        # split() yields positional indices, so rows are selected with .iloc
        x_tr, y_tr = x_train.iloc[idx_tr, :], y_train.iloc[idx_tr]
        x_va, y_va = x_train.iloc[idx_va, :], y_train.iloc[idx_va]
        model = lgb.LGBMClassifier(**params_tuning)
        # Silent early stopping via callbacks: the fit() keywords
        # `early_stopping_rounds` / `verbose` were removed in LightGBM 4.0.
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=100, verbose=False),
                      lgb.log_evaluation(period=0),
                  ],
                 )
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_va = roc_auc_score(y_va, y_va_pred) # AUC is the evaluation metric
        list_metrics.append(metric_va)

    # Compute the score returned to Optuna
    metrics = np.mean(list_metrics)

    return metrics

#### スクリプト7-50: 最適化処理（探索の実行）
- 注意点
    - optunaで試行を並列実行（n_jobs>1）すると，samplerのシードを固定しても探索結果は再現されません。このため，これ以降は書籍と結果が異なります。
    - 再現性を求める場合は，並列化なし（n_jobs=1）としてください。

In [50]:
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=123)
# The objective returns AUC, so the study maximizes it.
study = optuna.create_study(direction="maximize", sampler=sampler)
# Run 50 trials with n_jobs=5.
study.optimize(func=objective, n_trials=50, n_jobs=5)

[32m[I 2022-05-01 11:35:18,366][0m A new study created in memory with name: no-name-cfd9a5ae-7a01-4419-9f35-49a328f319f5[0m




[32m[I 2022-05-01 11:38:26,740][0m Trial 4 finished with value: 0.7714917756924314 and parameters: {'num_leaves': 64, 'min_child_samples': 130, 'min_sum_hessian_in_leaf': 0.0024978957455850945, 'feature_fraction': 0.7817759970376179, 'bagging_fraction': 0.6566551679807331, 'lambda_l1': 1.3606218340577327, 'lambda_l2': 0.5395629428659839}. Best is trial 4 with value: 0.7714917756924314.[0m




[32m[I 2022-05-01 11:38:48,511][0m Trial 3 finished with value: 0.7701907337709434 and parameters: {'num_leaves': 131, 'min_child_samples': 136, 'min_sum_hessian_in_leaf': 1.884544735324778e-05, 'feature_fraction': 0.5583668235175054, 'bagging_fraction': 0.9193457471700364, 'lambda_l1': 0.04293976246058936, 'lambda_l2': 0.4771555111756843}. Best is trial 4 with value: 0.7714917756924314.[0m




[32m[I 2022-05-01 11:40:08,390][0m Trial 2 finished with value: 0.7676205510454036 and parameters: {'num_leaves': 248, 'min_child_samples': 37, 'min_sum_hessian_in_leaf': 6.643565959251743e-05, 'feature_fraction': 0.8732478098491906, 'bagging_fraction': 0.9480596402659909, 'lambda_l1': 0.08444478709976956, 'lambda_l2': 0.027348269010253605}. Best is trial 4 with value: 0.7714917756924314.[0m




[32m[I 2022-05-01 11:42:18,147][0m Trial 0 finished with value: 0.7711608519176489 and parameters: {'num_leaves': 189, 'min_child_samples': 191, 'min_sum_hessian_in_leaf': 1.4893993455624866e-05, 'feature_fraction': 0.5608343915309344, 'bagging_fraction': 0.7693391574060586, 'lambda_l1': 53.157306712049554, 'lambda_l2': 0.011461518173978315}. Best is trial 4 with value: 0.7714917756924314.[0m
[32m[I 2022-05-01 11:42:21,300][0m Trial 1 finished with value: 0.7685363464517638 and parameters: {'num_leaves': 177, 'min_child_samples': 17, 'min_sum_hessian_in_leaf': 0.000305495240860746, 'feature_fraction': 0.9074145102964661, 'bagging_fraction': 0.7984389964609526, 'lambda_l1': 57.55033459202042, 'lambda_l2': 0.013197845007375216}. Best is trial 4 with value: 0.7714917756924314.[0m




[32m[I 2022-05-01 11:43:05,114][0m Trial 5 finished with value: 0.7705684957246268 and parameters: {'num_leaves': 170, 'min_child_samples': 95, 'min_sum_hessian_in_leaf': 2.4986512403836488e-05, 'feature_fraction': 0.6254847716390584, 'bagging_fraction': 0.9352527002338837, 'lambda_l1': 22.4678211062519, 'lambda_l2': 3.436390729654449}. Best is trial 4 with value: 0.7714917756924314.[0m




[32m[I 2022-05-01 11:43:40,144][0m Trial 6 finished with value: 0.771672730372646 and parameters: {'num_leaves': 61, 'min_child_samples': 76, 'min_sum_hessian_in_leaf': 0.0014872264297663706, 'feature_fraction': 0.8810665415372644, 'bagging_fraction': 0.7553580838947347, 'lambda_l1': 1.8766809822422164, 'lambda_l2': 0.05190585268212917}. Best is trial 6 with value: 0.771672730372646.[0m




[32m[I 2022-05-01 11:43:51,580][0m Trial 7 finished with value: 0.7723731371288146 and parameters: {'num_leaves': 22, 'min_child_samples': 168, 'min_sum_hessian_in_leaf': 0.0012352012463534097, 'feature_fraction': 0.8036398835865824, 'bagging_fraction': 0.5620129755217046, 'lambda_l1': 8.015797731747192, 'lambda_l2': 0.4816836755166696}. Best is trial 7 with value: 0.7723731371288146.[0m




[32m[I 2022-05-01 11:45:51,691][0m Trial 10 finished with value: 0.7723517806576774 and parameters: {'num_leaves': 57, 'min_child_samples': 195, 'min_sum_hessian_in_leaf': 7.880210279614931e-05, 'feature_fraction': 0.7223054722484546, 'bagging_fraction': 0.8555659156023624, 'lambda_l1': 0.21748802015758123, 'lambda_l2': 0.018854374353022223}. Best is trial 7 with value: 0.7723731371288146.[0m




[32m[I 2022-05-01 11:46:18,985][0m Trial 9 finished with value: 0.77122372080883 and parameters: {'num_leaves': 168, 'min_child_samples': 143, 'min_sum_hessian_in_leaf': 0.0003429694515197618, 'feature_fraction': 0.650959297991036, 'bagging_fraction': 0.6742040549965194, 'lambda_l1': 10.282173253460472, 'lambda_l2': 0.2785365194399337}. Best is trial 7 with value: 0.7723731371288146.[0m




[32m[I 2022-05-01 11:46:56,474][0m Trial 8 finished with value: 0.7685805662809928 and parameters: {'num_leaves': 236, 'min_child_samples': 144, 'min_sum_hessian_in_leaf': 0.00012128928247209661, 'feature_fraction': 0.982578543804355, 'bagging_fraction': 0.6364447321861115, 'lambda_l1': 18.601419059826505, 'lambda_l2': 0.3630333486716112}. Best is trial 7 with value: 0.7723731371288146.[0m




[32m[I 2022-05-01 11:47:26,133][0m Trial 12 finished with value: 0.7723097053815136 and parameters: {'num_leaves': 84, 'min_child_samples': 105, 'min_sum_hessian_in_leaf': 1.7012333146629243e-05, 'feature_fraction': 0.8727436453909863, 'bagging_fraction': 0.9790729986645993, 'lambda_l1': 0.015860523075659168, 'lambda_l2': 2.0525610433062016}. Best is trial 7 with value: 0.7723731371288146.[0m




[32m[I 2022-05-01 11:48:26,687][0m Trial 13 finished with value: 0.7700091984227062 and parameters: {'num_leaves': 65, 'min_child_samples': 191, 'min_sum_hessian_in_leaf': 1.3787511983449289e-05, 'feature_fraction': 0.8906916850136228, 'bagging_fraction': 0.7412083060096379, 'lambda_l1': 0.022482002960242487, 'lambda_l2': 1.3292467530819916}. Best is trial 7 with value: 0.7723731371288146.[0m
[32m[I 2022-05-01 11:48:29,160][0m Trial 11 finished with value: 0.771075593464984 and parameters: {'num_leaves': 177, 'min_child_samples': 50, 'min_sum_hessian_in_leaf': 6.003404604846925e-05, 'feature_fraction': 0.6508058130840766, 'bagging_fraction': 0.9783887014658836, 'lambda_l1': 19.621218910713292, 'lambda_l2': 0.16837391395308382}. Best is trial 7 with value: 0.7723731371288146.[0m




[32m[I 2022-05-01 11:49:15,739][0m Trial 14 finished with value: 0.7725676181678128 and parameters: {'num_leaves': 24, 'min_child_samples': 165, 'min_sum_hessian_in_leaf': 0.009359835012461381, 'feature_fraction': 0.7780819739431071, 'bagging_fraction': 0.5095784491849814, 'lambda_l1': 0.010374599370054695, 'lambda_l2': 40.67599612066364}. Best is trial 14 with value: 0.7725676181678128.[0m




[32m[I 2022-05-01 11:51:42,651][0m Trial 16 finished with value: 0.7736781902907532 and parameters: {'num_leaves': 10, 'min_child_samples': 200, 'min_sum_hessian_in_leaf': 0.006487348172813453, 'feature_fraction': 0.7533122568560118, 'bagging_fraction': 0.5255992767496774, 'lambda_l1': 0.1826408327783265, 'lambda_l2': 12.16477025999942}. Best is trial 16 with value: 0.7736781902907532.[0m
[32m[I 2022-05-01 11:51:42,915][0m Trial 15 finished with value: 0.7736601112096821 and parameters: {'num_leaves': 9, 'min_child_samples': 196, 'min_sum_hessian_in_leaf': 0.009709598106486705, 'feature_fraction': 0.7529917860005172, 'bagging_fraction': 0.5292724536235864, 'lambda_l1': 0.2177333111314291, 'lambda_l2': 94.51987086551506}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:52:04,130][0m Trial 18 finished with value: 0.7727465244792361 and parameters: {'num_leaves': 12, 'min_child_samples': 169, 'min_sum_hessian_in_leaf': 0.0074450391759240085, 'feature_fraction': 0.7737368318435336, 'bagging_fraction': 0.5035091058406657, 'lambda_l1': 0.22307424705890072, 'lambda_l2': 25.806357989971495}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:52:43,783][0m Trial 17 finished with value: 0.772148325982183 and parameters: {'num_leaves': 18, 'min_child_samples': 171, 'min_sum_hessian_in_leaf': 0.008338079979723863, 'feature_fraction': 0.754030735241468, 'bagging_fraction': 0.5374274003238141, 'lambda_l1': 0.18812831719012948, 'lambda_l2': 73.6798663423486}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:52:52,637][0m Trial 19 finished with value: 0.773296556029459 and parameters: {'num_leaves': 18, 'min_child_samples': 169, 'min_sum_hessian_in_leaf': 0.009661397813490845, 'feature_fraction': 0.764981976128038, 'bagging_fraction': 0.5047118197240547, 'lambda_l1': 0.4583396175806724, 'lambda_l2': 34.18592793732849}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:54:53,155][0m Trial 20 finished with value: 0.7728608448320742 and parameters: {'num_leaves': 18, 'min_child_samples': 170, 'min_sum_hessian_in_leaf': 0.009545779875764413, 'feature_fraction': 0.7155776320136286, 'bagging_fraction': 0.5108460605522516, 'lambda_l1': 0.2534994645432852, 'lambda_l2': 35.71649364229052}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:55:12,822][0m Trial 21 finished with value: 0.7727797116160359 and parameters: {'num_leaves': 10, 'min_child_samples': 167, 'min_sum_hessian_in_leaf': 0.0064174027192839475, 'feature_fraction': 0.7284563550773234, 'bagging_fraction': 0.5010807424753472, 'lambda_l1': 0.31608344347530143, 'lambda_l2': 78.1541397851394}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:55:25,660][0m Trial 22 finished with value: 0.7707033247020417 and parameters: {'num_leaves': 107, 'min_child_samples': 199, 'min_sum_hessian_in_leaf': 0.004133195241429847, 'feature_fraction': 0.7088508288843905, 'bagging_fraction': 0.6082112449332547, 'lambda_l1': 0.42637734956219364, 'lambda_l2': 8.81143285534599}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:55:50,741][0m Trial 23 finished with value: 0.7697068378148259 and parameters: {'num_leaves': 113, 'min_child_samples': 120, 'min_sum_hessian_in_leaf': 0.0032283133664017154, 'feature_fraction': 0.7031573150071807, 'bagging_fraction': 0.597507797262484, 'lambda_l1': 0.6394333573768488, 'lambda_l2': 16.407511193233738}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:56:04,897][0m Trial 24 finished with value: 0.7685516183769958 and parameters: {'num_leaves': 122, 'min_child_samples': 198, 'min_sum_hessian_in_leaf': 0.0035763432729662543, 'feature_fraction': 0.6980503747817011, 'bagging_fraction': 0.5901847808823646, 'lambda_l1': 3.380179651859003, 'lambda_l2': 6.089080442967356}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:57:58,895][0m Trial 27 finished with value: 0.7705938206876485 and parameters: {'num_leaves': 41, 'min_child_samples': 200, 'min_sum_hessian_in_leaf': 0.0036035561534540268, 'feature_fraction': 0.8302430831366436, 'bagging_fraction': 0.5744595134306741, 'lambda_l1': 0.7257163869813893, 'lambda_l2': 11.47588663134806}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:58:03,416][0m Trial 26 finished with value: 0.7700740978707757 and parameters: {'num_leaves': 102, 'min_child_samples': 198, 'min_sum_hessian_in_leaf': 0.002664082937994302, 'feature_fraction': 0.8253622979330543, 'bagging_fraction': 0.5904077337210729, 'lambda_l1': 0.6733154758493748, 'lambda_l2': 9.592029275625382}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:58:58,856][0m Trial 25 finished with value: 0.7706634189156064 and parameters: {'num_leaves': 102, 'min_child_samples': 200, 'min_sum_hessian_in_leaf': 0.00359319041190123, 'feature_fraction': 0.827263806818863, 'bagging_fraction': 0.5901072224782853, 'lambda_l1': 0.623327025754389, 'lambda_l2': 9.35101587113855}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:59:19,869][0m Trial 28 finished with value: 0.7717690892029698 and parameters: {'num_leaves': 44, 'min_child_samples': 183, 'min_sum_hessian_in_leaf': 0.0012029532753087611, 'feature_fraction': 0.8273254325163798, 'bagging_fraction': 0.5684420361833098, 'lambda_l1': 0.07228631900555492, 'lambda_l2': 7.793103881809226}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 11:59:45,906][0m Trial 29 finished with value: 0.7716823203009698 and parameters: {'num_leaves': 37, 'min_child_samples': 154, 'min_sum_hessian_in_leaf': 0.0011639087722019085, 'feature_fraction': 0.8261331236622983, 'bagging_fraction': 0.6991152132614744, 'lambda_l1': 0.08162522053364786, 'lambda_l2': 13.362526464200625}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 12:01:55,087][0m Trial 31 finished with value: 0.7718521603540037 and parameters: {'num_leaves': 42, 'min_child_samples': 158, 'min_sum_hessian_in_leaf': 0.0007725307045602569, 'feature_fraction': 0.9472446755405861, 'bagging_fraction': 0.7043685989738626, 'lambda_l1': 0.09319350275344146, 'lambda_l2': 92.21762031695367}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 12:02:29,473][0m Trial 30 finished with value: 0.7729985272762747 and parameters: {'num_leaves': 42, 'min_child_samples': 150, 'min_sum_hessian_in_leaf': 0.0012501165270182345, 'feature_fraction': 0.837696514612162, 'bagging_fraction': 0.6852129041426301, 'lambda_l1': 0.09765139093978645, 'lambda_l2': 93.35780996278153}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 12:03:24,262][0m Trial 34 finished with value: 0.7715305557666267 and parameters: {'num_leaves': 82, 'min_child_samples': 182, 'min_sum_hessian_in_leaf': 0.0003863078218838296, 'feature_fraction': 0.6733497995777573, 'bagging_fraction': 0.5400280555887712, 'lambda_l1': 0.13694554646248658, 'lambda_l2': 51.464628429983996}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 12:03:35,550][0m Trial 32 finished with value: 0.7721522624126429 and parameters: {'num_leaves': 42, 'min_child_samples': 156, 'min_sum_hessian_in_leaf': 0.0009786946009003562, 'feature_fraction': 0.607897101099848, 'bagging_fraction': 0.6942587934616335, 'lambda_l1': 0.08963604637974677, 'lambda_l2': 56.3068220784748}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 12:03:57,246][0m Trial 33 finished with value: 0.773263732255471 and parameters: {'num_leaves': 43, 'min_child_samples': 151, 'min_sum_hessian_in_leaf': 0.0006572892814627545, 'feature_fraction': 0.6021561465670731, 'bagging_fraction': 0.7130734560434281, 'lambda_l1': 0.0779321828521109, 'lambda_l2': 97.11829116150953}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 12:05:27,699][0m Trial 36 finished with value: 0.7725992022333846 and parameters: {'num_leaves': 31, 'min_child_samples': 176, 'min_sum_hessian_in_leaf': 0.00540755829463685, 'feature_fraction': 0.5138157602047937, 'bagging_fraction': 0.54767143109193, 'lambda_l1': 0.1345798694027298, 'lambda_l2': 46.005885073524325}. Best is trial 16 with value: 0.7736781902907532.[0m




[32m[I 2022-05-01 12:06:23,710][0m Trial 35 finished with value: 0.773913852448408 and parameters: {'num_leaves': 8, 'min_child_samples': 182, 'min_sum_hessian_in_leaf': 0.005648102844029995, 'feature_fraction': 0.6231293151037325, 'bagging_fraction': 0.5400643721950571, 'lambda_l1': 2.3145977966055207, 'lambda_l2': 33.505733342004284}. Best is trial 35 with value: 0.773913852448408.[0m




[32m[I 2022-05-01 12:06:54,926][0m Trial 38 finished with value: 0.7736320930869975 and parameters: {'num_leaves': 25, 'min_child_samples': 183, 'min_sum_hessian_in_leaf': 0.005027715444940295, 'feature_fraction': 0.748845019303335, 'bagging_fraction': 0.8228943442677358, 'lambda_l1': 0.03522551287537929, 'lambda_l2': 23.26357516965278}. Best is trial 35 with value: 0.773913852448408.[0m




[32m[I 2022-05-01 12:07:20,523][0m Trial 37 finished with value: 0.7740694999775035 and parameters: {'num_leaves': 29, 'min_child_samples': 183, 'min_sum_hessian_in_leaf': 0.006214375855002281, 'feature_fraction': 0.7514566059816202, 'bagging_fraction': 0.8022663475328027, 'lambda_l1': 0.036490510273135365, 'lambda_l2': 28.040356568227118}. Best is trial 37 with value: 0.7740694999775035.[0m




[32m[I 2022-05-01 12:08:16,904][0m Trial 39 finished with value: 0.7739951139101449 and parameters: {'num_leaves': 27, 'min_child_samples': 181, 'min_sum_hessian_in_leaf': 0.005612161110241938, 'feature_fraction': 0.5371443300284879, 'bagging_fraction': 0.6307012532543521, 'lambda_l1': 0.039891656370535315, 'lambda_l2': 23.99974380142292}. Best is trial 37 with value: 0.7740694999775035.[0m




[32m[I 2022-05-01 12:09:14,322][0m Trial 40 finished with value: 0.7698194767240649 and parameters: {'num_leaves': 148, 'min_child_samples': 185, 'min_sum_hessian_in_leaf': 0.0020712237089454844, 'feature_fraction': 0.5959387515411809, 'bagging_fraction': 0.6298458983085963, 'lambda_l1': 0.040897283829341886, 'lambda_l2': 23.61762245836251}. Best is trial 37 with value: 0.7740694999775035.[0m




[32m[I 2022-05-01 12:10:53,705][0m Trial 41 finished with value: 0.770463804486298 and parameters: {'num_leaves': 146, 'min_child_samples': 7, 'min_sum_hessian_in_leaf': 0.002007016686903535, 'feature_fraction': 0.7513067889436882, 'bagging_fraction': 0.6167844354306887, 'lambda_l1': 2.391777566789162, 'lambda_l2': 23.314713834317562}. Best is trial 37 with value: 0.7740694999775035.[0m




[32m[I 2022-05-01 12:11:07,704][0m Trial 42 finished with value: 0.7710120405967457 and parameters: {'num_leaves': 144, 'min_child_samples': 185, 'min_sum_hessian_in_leaf': 0.0019953966893219, 'feature_fraction': 0.5849979625084633, 'bagging_fraction': 0.8041142026980888, 'lambda_l1': 0.042220064682153204, 'lambda_l2': 24.225175373478233}. Best is trial 37 with value: 0.7740694999775035.[0m




[32m[I 2022-05-01 12:11:39,913][0m Trial 43 finished with value: 0.7725871257634671 and parameters: {'num_leaves': 145, 'min_child_samples': 14, 'min_sum_hessian_in_leaf': 0.0022827757433597456, 'feature_fraction': 0.5723896784726493, 'bagging_fraction': 0.8791233711371605, 'lambda_l1': 2.6006634286646846, 'lambda_l2': 4.1907772179650165}. Best is trial 37 with value: 0.7740694999775035.[0m




[32m[I 2022-05-01 12:12:14,459][0m Trial 44 finished with value: 0.7700645079424521 and parameters: {'num_leaves': 219, 'min_child_samples': 122, 'min_sum_hessian_in_leaf': 0.001955060288282266, 'feature_fraction': 0.567773430851432, 'bagging_fraction': 0.7949940237379882, 'lambda_l1': 0.042960950500772525, 'lambda_l2': 4.4036662899056545}. Best is trial 37 with value: 0.7740694999775035.[0m




[32m[I 2022-05-01 12:15:52,022][0m Trial 47 finished with value: 0.7744062126206093 and parameters: {'num_leaves': 56, 'min_child_samples': 127, 'min_sum_hessian_in_leaf': 0.0053812109322359785, 'feature_fraction': 0.5092583278396488, 'bagging_fraction': 0.8814860573993653, 'lambda_l1': 1.3919237532030029, 'lambda_l2': 3.3075452323627004}. Best is trial 47 with value: 0.7744062126206093.[0m
[32m[I 2022-05-01 12:16:02,595][0m Trial 45 finished with value: 0.7741580393826153 and parameters: {'num_leaves': 8, 'min_child_samples': 186, 'min_sum_hessian_in_leaf': 0.0018040061000251064, 'feature_fraction': 0.5453317434947169, 'bagging_fraction': 0.8159066475441707, 'lambda_l1': 1.8194645331219828, 'lambda_l2': 3.237260332175004}. Best is trial 47 with value: 0.7744062126206093.[0m
[32m[I 2022-05-01 12:16:16,451][0m Trial 49 finished with value: 0.7730881852688655 and parameters: {'num_leaves': 53, 'min_child_samples': 60, 'min_sum_hessian_in_leaf': 0.005561537937464585, 'feature_frac

#### スクリプト7-51: 探索結果の確認

In [51]:
# Show the best trial found by the search.
trial = study.best_trial
# The objective returns ROC AUC (not accuracy), so label the value as AUC.
print("auc(best)={:.4f}".format(trial.value))
display(trial.params)

acc(best)=0.7753


{'num_leaves': 8,
 'min_child_samples': 188,
 'min_sum_hessian_in_leaf': 0.005372519889325684,
 'feature_fraction': 0.5040706766176062,
 'bagging_fraction': 0.8906803370906399,
 'lambda_l1': 0.02422415508286258,
 'lambda_l2': 3.8031957598067416}

#### スクリプト7-52: ベストなハイパーパラメータの取得

In [52]:
# Merge the best searched hyperparameters with the fixed ones. A new dict is
# built (same keys and order as before) instead of calling .update() on the
# object returned by trial.params, so the trial's own params are not modified.
params_best = {**trial.params, **params_base}
display(params_best)

{'num_leaves': 8,
 'min_child_samples': 188,
 'min_sum_hessian_in_leaf': 0.005372519889325684,
 'feature_fraction': 0.5040706766176062,
 'bagging_fraction': 0.8906803370906399,
 'lambda_l1': 0.02422415508286258,
 'lambda_l2': 3.8031957598067416,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'verbosity': -1,
 'learning_rate': 0.05,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'random_state': 123}

#### スクリプト7-53: ベストなハイパーパラメータを用いたモデル学習

In [53]:
# Call train_lgb with the tuned parameters for folds 0-4 of a 5-way split.
train_oof, imp, metrics = train_lgb(
    x_train,
    y_train,
    id_train,
    list_nfold=list(range(5)),
    n_splits=5,
    params=params_best,
)

-------------------- 0 --------------------
(246008, 162) (61503, 162)
[100]	training's auc: 0.762984	valid_1's auc: 0.756262
[200]	training's auc: 0.775454	valid_1's auc: 0.765933
[300]	training's auc: 0.782744	valid_1's auc: 0.769901
[400]	training's auc: 0.788169	valid_1's auc: 0.771682
[500]	training's auc: 0.793047	valid_1's auc: 0.773021
[600]	training's auc: 0.797187	valid_1's auc: 0.773595
[700]	training's auc: 0.801366	valid_1's auc: 0.773865
[800]	training's auc: 0.805038	valid_1's auc: 0.774357
[900]	training's auc: 0.808634	valid_1's auc: 0.774459
[1000]	training's auc: 0.811895	valid_1's auc: 0.774566
[1100]	training's auc: 0.815212	valid_1's auc: 0.77464
[1200]	training's auc: 0.818422	valid_1's auc: 0.774841
[1300]	training's auc: 0.821748	valid_1's auc: 0.774856
[1400]	training's auc: 0.824722	valid_1's auc: 0.775093
[1500]	training's auc: 0.827718	valid_1's auc: 0.775251
[auc] tr:0.8266, va:0.7753
-------------------- 1 --------------------
(246009, 162) (61502, 162)
[

#### スクリプト7-54: 推論データ作成とモデル推論

In [54]:
# Build the inference dataset: features without the ID column, and the IDs.
x_test = df_test.drop(columns=["SK_ID_CURR"])
id_test = df_test[["SK_ID_CURR"]]

# Convert categorical (object-dtype) columns to the "category" dtype.
for col in x_test.columns:
    if x_test[col].dtype == "O":
        x_test[col] = x_test[col].astype("category")

# Predict for folds 0-4.
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0, 1, 2, 3, 4],
                       )

# Make the submission file: rename `pred` to `TARGET`, preview, and save.
df_submit = test_pred.rename(columns={"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head())
# `index` is documented as a bool; pass False explicitly instead of relying
# on None being falsy, so the row index is not written to the file.
df_submit.to_csv("submission_HyperParameterTuning.csv", index=False)

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.
(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.039234
1,100005,0.130036
2,100013,0.027832
3,100028,0.0455
4,100038,0.194294
