# 1 ベースライン作成

## 1-1ライブラリの読み込み等の初期処理

In [1]:
import numpy as np
import pandas as pd
import re
import pickle
import gc

# scikit-learn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [2]:
# ファイルの読み込み・データ確認
application_train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
print(application_train.shape)
application_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# メモリ削減用関数　（ファイルサイズが大きいとき）データ型を最適化する
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            pass

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
#試しで使用
application_train = reduce_mem_usage(application_train)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%


## 1-2 データセットの作成

In [4]:
#データセットの作成
x_train = application_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = application_train["TARGET"]
id_train = application_train[["SK_ID_CURR"]]
#カテゴリ変数をcategory型へ変換（lightGBMに使うため）
for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

## 1-3 バリデーション設計

In [5]:
#1の割合とそれぞれの件数を確認
print("mean: {:.4f}".format(y_train.mean()))
y_train.value_counts()

mean: 0.0807


0    282686
1     24825
Name: TARGET, dtype: int64

In [6]:
'''
バリデーションのindexeリスト作成
分割の割合が一定になるように層化分割を行う（StratifidKFold）
'''
# 層化分割したバリデーションのindexのリスト作成
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

# indexの確認：fold=0のtrainデータ
print("index(train):", cv[0][0])

# indexの確認：fold=0のvalidデータ
print("index(valid):", cv[0][1])

index(train): [     0      1      3 ... 307508 307509 307510]
index(valid): [     2     11     22 ... 307488 307495 307497]


## 1-4 モデル学習

In [7]:
'''学習データと検証データに分類'''
# foldごとのindexのリスト作成
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

# 0fold目のindexのリスト取得
nfold = 0
idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

# 学習データと検証データに分離
x_tr, y_tr, id_tr = x_train.loc[idx_tr, :], y_train[idx_tr], id_train.loc[idx_tr, :]
x_va, y_va, id_va = x_train.loc[idx_va, :], y_train[idx_va], id_train.loc[idx_va, :]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)

(246008, 120) (246008,) (246008, 1)
(61503, 120) (61503,) (61503, 1)


In [8]:
# モデルの学習
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

model = lgb.LGBMClassifier(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr, y_tr), (x_va, y_va)],
          early_stopping_rounds=100,
          verbose=100
         )

# モデルの保存
with open("model_lgb_fold0.pickle", "wb") as f:
    pickle.dump(model, f, protocol=4)

[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774


In [9]:
'''モデル評価'''
# 学習データの推論値取得とROC計算
y_tr_pred = model.predict_proba(x_tr)[:,1]
metric_tr = roc_auc_score(y_tr, y_tr_pred)

# 検証データの推論値取得とROC計算
y_va_pred = model.predict_proba(x_va)[:,1]
metric_va = roc_auc_score(y_va, y_va_pred)

# 評価値を入れる変数の作成（最初のfoldのときのみ）
metrics = []

# 評価値を格納
metrics.append([nfold, metric_tr, metric_va])

# 結果の表示
print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

[auc] tr:0.8126, va:0.7586


In [10]:
'''OOFデータの推定値取得(out of fold)'''
#推論値や誤差の分布確認やアンサンブル時のモデルごとの重みを決めるのに有効
# oofの予測値を入れる変数の作成
train_oof = np.zeros(len(x_train))

# validデータのindexに予測値を格納
train_oof[idx_va] = y_va_pred

In [11]:
'''説明変数の重要度取得（学習したモデルから説明変数の重要度を取得）'''
# 重要度の取得
imp_fold = pd.DataFrame({"col":x_train.columns, "imp":model.feature_importances_, "nfold":nfold})
# 確認（重要度の上位10個）
display(imp_fold.sort_values("imp", ascending=False)[:10])

# 重要度を格納する5fold用データフレームの作成
imp = pd.DataFrame()
# imp_foldを5fold用データフレームに結合
imp = pd.concat([imp, imp_fold])

Unnamed: 0,col,imp,nfold
41,EXT_SOURCE_3,66225.020483,0
40,EXT_SOURCE_2,52568.833805,0
38,ORGANIZATION_TYPE,20218.523523,0
39,EXT_SOURCE_1,19776.252288,0
6,AMT_CREDIT,8111.321247,0
8,AMT_GOODS_PRICE,7120.960365,0
15,DAYS_BIRTH,7042.223005,0
7,AMT_ANNUITY,6992.551795,0
16,DAYS_EMPLOYED,5236.51412,0
26,OCCUPATION_TYPE,4376.651746,0


In [12]:
'''モデル評価（全foldのサマリ）'''
# リスト型をarray型に変換
metrics = np.array(metrics)
print(metrics)

# 学習/検証データの評価値の平均値と標準偏差を算出
print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
    metrics[:,1].mean(), metrics[:,1].std(),
    metrics[:,2].mean(), metrics[:,2].std(),
))

# oofの評価値を算出
print("[oof] {:.4f}".format(
    roc_auc_score(y_train, train_oof)
))

[[0.         0.81257796 0.75859528]]
[cv] tr:0.8126+-0.0000, va:0.7586+-0.0000
[oof] 0.5103


In [13]:
#OOFデータの推論値取得
train_oof = pd.concat([
    id_train,
    pd.DataFrame({"true": y_train, "pred": train_oof}),
], axis=1)
train_oof.head()

Unnamed: 0,SK_ID_CURR,true,pred
0,100002,1,0.0
1,100003,0,0.0
2,100004,0,0.031866
3,100006,0,0.0
4,100007,0,0.0


In [14]:
#説明変数の重要度取得
imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
imp.columns = ["col", "imp", "imp_std"]
imp.head()

Unnamed: 0,col,imp,imp_std
0,AMT_ANNUITY,6992.551795,
1,AMT_CREDIT,8111.321247,
2,AMT_GOODS_PRICE,7120.960365,
3,AMT_INCOME_TOTAL,1595.740609,
4,AMT_REQ_CREDIT_BUREAU_DAY,128.842901,


In [15]:
#モデル学習
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

#学習関数の定義
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=[0,1,2,3,4],
              n_splits=5,
             ):
    train_oof = np.zeros(len(input_x))
    metrics = []
    imp = pd.DataFrame()

    # cross-validation
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)
        
        # make dataset
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr, id_tr = input_x.loc[idx_tr, :], input_y[idx_tr], input_id.loc[idx_tr, :]
        x_va, y_va, id_va = input_x.loc[idx_va, :], input_y[idx_va], input_id.loc[idx_va, :]
        print(x_tr.shape, x_va.shape)
        
        # train
        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  early_stopping_rounds=100,
                  verbose=100
                 )
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "wb") as f:
            pickle.dump(model, f, protocol=4)
        
        # evaluate
        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))
        
        # oof
        train_oof[idx_va] = y_va_pred
        
        # imp
        _imp = pd.DataFrame({"col":input_x.columns, "imp":model.feature_importances_, "nfold":nfold})
        imp = pd.concat([imp, _imp])
      
    print("-"*20, "result", "-"*20)
    # metric
    metrics = np.array(metrics)
    print(metrics)
    print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))
    print("[oof] {:.4f}".format(
        roc_auc_score(input_y, train_oof)
    ))
    
    # oof
    train_oof = pd.concat([
        input_id,
        pd.DataFrame({"pred":train_oof})
    ], axis=1)
    
    # importance
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
    imp.columns = ["col", "imp", "imp_std"]
    
    return train_oof, imp, metrics

In [16]:
# ハイパーパラメータの設定
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# 学習の実行
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 120) (61503, 120)
[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774
[auc] tr:0.8126, va:0.7586
-------------------- 1 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.782531	valid_1's auc: 0.756239
[200]	training's auc: 0.808862	valid_1's auc: 0.758924
[300]	training's auc: 0.829564	valid_1's auc: 0.758779
[auc] tr:0.8170, va:0.7590
-------------------- 2 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.782101	valid_1's auc: 0.758221
[200]	training's auc: 0.809587	valid_1's auc: 0.760104
[300]	training's auc: 0.830474	valid_1's auc: 0.760275
[400]	training's auc: 0.847388	valid_1's auc: 0.759875
[auc] tr:0.8362, va:0.7604
-------------------- 3 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.783853	valid_1's auc: 0.754567
[200]	training's auc: 0.811501

In [17]:
#説明変数の重要度の確認
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
38,EXT_SOURCE_3,65353.907478,1558.201212
37,EXT_SOURCE_2,54545.388309,1251.798934
102,ORGANIZATION_TYPE,21441.917474,1450.24619
36,EXT_SOURCE_1,20051.934248,685.852224
1,AMT_CREDIT,8263.228728,410.384434
22,DAYS_BIRTH,7645.58911,689.458833
2,AMT_GOODS_PRICE,7263.054566,405.837031
0,AMT_ANNUITY,6762.95364,479.302045
23,DAYS_EMPLOYED,5810.288375,552.93773
101,OCCUPATION_TYPE,5502.675859,831.872392


## 1-5 モデル推論

In [18]:
'''推論用データセットの作成'''
# ファイルの読み込み
application_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")
application_test = reduce_mem_usage(application_test)

# データセットの作成
x_test = application_test.drop(columns=["SK_ID_CURR" ])
id_test = application_test[["SK_ID_CURR"]]

# カテゴリ変数をcategory型に変換
for col in x_test.columns:
    if x_test[col].dtype=="O":
        x_test[col] = x_test[col].astype("category")

Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 14.60 MB
Decreased by 67.6%


In [19]:
#学習済みモデル読み込み
with open("model_lgb_fold0.pickle", "rb") as f:
    model = pickle.load(f)

In [20]:
'''モデルを用いた推論'''
# 推論
test_pred_fold = model.predict_proba(x_test)[:,1]

# 推論値を格納する変数を作成
test_pred = np.zeros((len(x_test), 5))

# 1fold目の予測値を格納
test_pred[:, 0] = test_pred_fold

In [21]:
'''推論用データセットの推論値算出'''
# 各foldの推論値の平均値を算出
test_pred_mean = test_pred.mean(axis=1)

# 推論値のデータフレームを作成
df_test_pred = pd.concat([
        id_test,
        pd.DataFrame({"pred": test_pred_mean}),
    ], axis=1)
df_test_pred.head()

Unnamed: 0,SK_ID_CURR,pred
0,100001,0.006572
1,100005,0.023874
2,100013,0.004233
3,100028,0.008966
4,100038,0.030794


In [22]:
#推論関数の定義
def predict_lgb(input_x,
                input_id,
                list_nfold=[0,1,2,3,4],
               ):
    pred = np.zeros((len(input_x), len(list_nfold)))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "rb") as f:
            model = pickle.load(f)
        pred[:, nfold] = model.predict_proba(input_x)[:,1]
    
    pred = pd.concat([
        input_id,
        pd.DataFrame({"pred": pred.mean(axis=1)}),
    ], axis=1)
    
    print("Done.")
    
    return pred

In [23]:
#推論処理の実行
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [24]:
#提出ファイルの作成
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())

# ファイル出力
df_submit.to_csv("submission_baseline.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.04181
1,100005,0.1264
2,100013,0.022495
3,100028,0.03968
4,100038,0.156628


# 2 特徴量エンジニアリング

## 2-1 特徴量エンジニアリング:application_train.csv

In [25]:
#データの確認
display(application_train["DAYS_EMPLOYED"].value_counts())
print("正の値の割合: {:.4f}".format((application_train["DAYS_EMPLOYED"]>0).mean()))
print("正の値の個数: {}".format((application_train["DAYS_EMPLOYED"]>0).sum()))
# -> 正の値が18%。しかもすべて8割が365243と同一値。働き始めてからの日数をマイナス表記しているためこれは欠損と判断。

 365243    55374
-200         156
-224         152
-230         151
-199         151
           ...  
-13961         1
-11827         1
-10176         1
-9459          1
-8694          1
Name: DAYS_EMPLOYED, Length: 12574, dtype: int64

正の値の割合: 0.1801
正の値の個数: 55374


In [26]:
#欠損値をnullに変換
application_train["DAYS_EMPLOYED"] = application_train["DAYS_EMPLOYED"].replace(365243, np.nan)

In [27]:
'''仮説に基づく特徴量生成'''
# 特徴量1: 総所得金額を世帯人数で割った値
# [仮説1]　所得金額が同じでも家族人数が多いほうが経済的な負担が大きいため貸し倒れリスクが高そう
application_train['INCOME_div_PERSON'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']

# 特徴量2: 総所得金額を就労期間で割った値
# [仮説2]　所得金額が同じでも就労期間が短い方が優秀そうで貸し倒れリスクが低そう
application_train['INCOME_div_EMPLOYED'] = application_train['AMT_INCOME_TOTAL'] / application_train['DAYS_EMPLOYED']

# 特徴量3: 外部スコアの平均値など
# [仮説3]　外部機関によるスコアが平均的に高い方が貸し倒れリスクが低そう
application_train["EXT_SOURCE_mean"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_train["EXT_SOURCE_max"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis=1)
application_train["EXT_SOURCE_min"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis=1)
application_train["EXT_SOURCE_std"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_train["EXT_SOURCE_count"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)

# 特徴量4: 就労期間を年齢で割った値 (年齢に占める就労期間の割合)
# [仮説4]　念連に占める就労期間が長いほうが貸し倒れリスクが低そう（離職リスクが少ないと考えるため）
application_train['DAYS_EMPLOYED_div_BIRTH'] = application_train['DAYS_EMPLOYED'] / application_train['DAYS_BIRTH']

# 特徴量5: 年金支払額を所得金額で割った値
# [仮説5]　所得金額に占める年金支払額が少ないほうが貸し倒れリスクが低そう
application_train['ANNUITY_div_INCOME'] = application_train['AMT_ANNUITY'] / application_train['AMT_INCOME_TOTAL']

# 特徴量6: 年金支払額を借入金で割った値
# [仮説6]　借入金に占める年金支払額が少ないほうが貸し倒れリスクが低そう
application_train['ANNUITY_div_CREDIT'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']

In [28]:
#データセットの作成
x_train = application_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = application_train["TARGET"]
id_train = application_train[["SK_ID_CURR"]]

for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

In [29]:
#モデルの学習
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 130) (61503, 130)
[100]	training's auc: 0.787817	valid_1's auc: 0.760032
[200]	training's auc: 0.816788	valid_1's auc: 0.763696
[300]	training's auc: 0.838351	valid_1's auc: 0.764008
[400]	training's auc: 0.856611	valid_1's auc: 0.764045
[500]	training's auc: 0.871304	valid_1's auc: 0.764075
[auc] tr:0.8585, va:0.7641
-------------------- 1 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.788378	valid_1's auc: 0.763077
[200]	training's auc: 0.816816	valid_1's auc: 0.766784
[300]	training's auc: 0.838169	valid_1's auc: 0.767287
[400]	training's auc: 0.856163	valid_1's auc: 0.767434
[auc] tr:0.8471, va:0.7675
-------------------- 2 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.787655	valid_1's auc: 0.764182
[200]	training's auc: 0.817121	valid_1's auc: 0.767566
[300]	training's auc: 0.837872	valid_1's auc: 0.767677
[400]	training's auc: 0.855451	valid_1's auc: 0.76783
[auc] tr:0.8519, va

In [30]:
#説明変数の重要度の確認
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
44,EXT_SOURCE_mean,114005.214702,1381.645644
10,ANNUITY_div_CREDIT,23720.30155,805.397477
112,ORGANIZATION_TYPE,22660.210567,1372.230448
41,EXT_SOURCE_3,12046.854638,886.653726
24,DAYS_BIRTH,8108.684084,578.972393
45,EXT_SOURCE_min,7727.391587,314.203161
39,EXT_SOURCE_1,7155.619219,472.422492
2,AMT_GOODS_PRICE,6148.167858,364.159044
0,AMT_ANNUITY,6091.80521,581.9879
46,EXT_SOURCE_std,5830.39069,679.963947


In [31]:
#推論用データのデータセット作成
# nullに置き換え
application_test["DAYS_EMPLOYED"] = application_test["DAYS_EMPLOYED"].replace(365243, np.nan)

# 特徴量の生成
application_test['INCOME_div_PERSON'] = application_test['AMT_INCOME_TOTAL'] / application_test['CNT_FAM_MEMBERS']
application_test['INCOME_div_EMPLOYED'] = application_test['AMT_INCOME_TOTAL'] / application_test['DAYS_EMPLOYED']
application_test["EXT_SOURCE_mean"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_test["EXT_SOURCE_max"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis=1)
application_test["EXT_SOURCE_min"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis=1)
application_test["EXT_SOURCE_std"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_test["EXT_SOURCE_count"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)
application_test['DAYS_EMPLOYED_div_BIRTH'] = application_test['DAYS_EMPLOYED'] / application_test['DAYS_BIRTH']
application_test['ANNUITY_div_INCOME'] = application_test['AMT_ANNUITY'] / application_test['AMT_INCOME_TOTAL']
application_test['ANNUITY_div_CREDIT'] = application_test['AMT_ANNUITY'] / application_test['AMT_CREDIT']

# データセット作成
x_test = application_test.drop(columns=["SK_ID_CURR"])
id_test = application_test[["SK_ID_CURR"]]

# カテゴリ変数をcategory型へ変換
for col in x_test.columns:
    if x_test[col].dtype=="O":
        x_test[col] = x_test[col].astype("category")


In [32]:
#推論処理
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [33]:
#提出ファイルの作成
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("submission_FeatureEngineering1.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.029002
1,100005,0.121782
2,100013,0.022668
3,100028,0.044435
4,100038,0.18194


## 2-2 特徴量エンジニアリング：POS_CASH_balance.csv

In [34]:
#ファイル読み込み
pos = pd.read_csv("../input/home-credit-default-risk/POS_CASH_balance.csv")
pos = reduce_mem_usage(pos)
print(pos.shape)
pos.head()

Memory usage of dataframe is 610.43 MB
Memory usage after optimization is: 238.45 MB
Decreased by 60.9%
(10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [35]:
'''カテゴリ変数への変換～キー結合（１対多であるため）'''
#①カテゴリ変数をone-hot-encodingで数値に変換
pos_ohe = pd.get_dummies(pos, columns=["NAME_CONTRACT_STATUS"], dummy_na=True)
col_ohe = sorted(list(set(pos_ohe.columns) - set(pos.columns)))
print(len(col_ohe))
col_ohe

#②SK_ID_CURRをキーに集約処理
pos_ohe_agg = pos_ohe.groupby("SK_ID_CURR").agg(
    {
        # 数値の集約
        "MONTHS_BALANCE": ["mean", "std", "min", "max"],
        "CNT_INSTALMENT": ["mean", "std", "min", "max"],
        "CNT_INSTALMENT_FUTURE": ["mean", "std", "min", "max"],
        "SK_DPD": ["mean", "std", "min", "max"],
        "SK_DPD_DEF": ["mean", "std", "min", "max"],
        # カテゴリ変数をone-hot-encodingした値の集約
        "NAME_CONTRACT_STATUS_Active": ["mean"],
        "NAME_CONTRACT_STATUS_Amortized debt": ["mean"],
        "NAME_CONTRACT_STATUS_Approved": ["mean"],
        "NAME_CONTRACT_STATUS_Canceled": ["mean"],
        "NAME_CONTRACT_STATUS_Completed": ["mean"],
        "NAME_CONTRACT_STATUS_Demand": ["mean"],
        "NAME_CONTRACT_STATUS_Returned to the store": ["mean"],
        "NAME_CONTRACT_STATUS_Signed": ["mean"],
        "NAME_CONTRACT_STATUS_XNA": ["mean"],
        "NAME_CONTRACT_STATUS_nan": ["mean"],
        # IDのユニーク数をカウント (ついでにレコード数もカウント)
        "SK_ID_PREV":["count", "nunique"],
    }
)

# カラム名の付与
pos_ohe_agg.columns = [i + "_" + j for i,j in pos_ohe_agg.columns]
pos_ohe_agg = pos_ohe_agg.reset_index(drop=False)

print(pos_ohe_agg.shape)
pos_ohe_agg.head()

#③SK_ID?CURRをキーにして結合
df_train = pd.merge(application_train, pos_ohe_agg, on="SK_ID_CURR", how="left")
print(df_train.shape)
df_train.head()

10
(337252, 33)
(307511, 164)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28.0,3.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4.0,1.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.095238,0.0,0.047619,0.0,0.0,0.0,21.0,3.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.045455,0.0,0.0,0.015152,0.0,0.0,66.0,5.0


In [36]:
#データセットの作成
x_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = df_train["TARGET"]
id_train = df_train[["SK_ID_CURR"]]

for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

In [37]:
#モデル学習
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 162) (61503, 162)
[100]	training's auc: 0.794548	valid_1's auc: 0.76534
[200]	training's auc: 0.825571	valid_1's auc: 0.77062
[300]	training's auc: 0.848468	valid_1's auc: 0.771422
[400]	training's auc: 0.866087	valid_1's auc: 0.771575
[auc] tr:0.8578, va:0.7719
-------------------- 1 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.794349	valid_1's auc: 0.769227
[200]	training's auc: 0.824921	valid_1's auc: 0.774694
[300]	training's auc: 0.847296	valid_1's auc: 0.775643
[400]	training's auc: 0.864781	valid_1's auc: 0.775882
[500]	training's auc: 0.880069	valid_1's auc: 0.775861
[auc] tr:0.8744, va:0.7761
-------------------- 2 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.794977	valid_1's auc: 0.768857
[200]	training's auc: 0.82562	valid_1's auc: 0.773055
[300]	training's auc: 0.847297	valid_1's auc: 0.773516
[400]	training's auc: 0.865995	valid_1's auc: 0.774559
[500]	training's auc:

In [38]:
#説明変数の重要度の確認
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
52,EXT_SOURCE_mean,112438.907936,1217.139287
134,ORGANIZATION_TYPE,21573.968751,1044.080966
10,ANNUITY_div_CREDIT,18349.279658,1039.471604
49,EXT_SOURCE_3,10710.855987,490.719084
53,EXT_SOURCE_min,7021.835349,444.955386
32,DAYS_BIRTH,6666.389282,814.801948
47,EXT_SOURCE_1,6605.474412,601.782028
21,CNT_INSTALMENT_FUTURE_mean,6289.278576,365.694448
0,AMT_ANNUITY,5563.190447,368.625974
108,MONTHS_BALANCE_std,5340.370365,466.201881


In [39]:
#推論用のデータセット作成
# テーブル結合
df_test = pd.merge(application_test, pos_ohe_agg, on="SK_ID_CURR", how="left")

# データセット作成
x_test = df_test.drop(columns=["SK_ID_CURR"])
id_test = df_test[["SK_ID_CURR"]]

# カテゴリ変数をcategory型へ変換
for col in x_test.columns:
    if x_test[col].dtype=="O":
        x_test[col] = x_test[col].astype("category")

In [40]:
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [41]:
#提出ファイルの作成
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("submission_FeatureEngineering2.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.032163
1,100005,0.1044
2,100013,0.025425
3,100028,0.047522
4,100038,0.210907


# 3 モデルチューニング

In [42]:
#重要度を用いて絞り込んだ特徴量リストの作成
col_filter = sorted(list(imp.sort_values("imp", ascending=False)[:100]["col"]))
# col_filter

## 3-1 optunaによる自動チューニングの実行

In [43]:
import optuna

#学習用のデータセット作成
x_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = df_train["TARGET"]
id_train = df_train[["SK_ID_CURR"]]

for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

In [44]:
#目的関数の定義
# 探索しないハイパーパラメータ
params_base = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "learning_rate": 0.05,
    "n_estimators": 100000,
    "bagging_freq": 1,
    "random_state": 123,
}

# 目的関数の定義
def objective(trial):
    # 探索するハイパーパラメータ
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e+2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e+2, log=True),
    }
    params_tuning.update(params_base)
    
    # モデル学習・評価
    list_metrics = []
    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))
    list_fold = [0]  # 処理高速化のために1つめのfoldのみとする。
    for nfold in list_fold:
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = x_train.loc[idx_tr, :], y_train[idx_tr]
        x_va, y_va = x_train.loc[idx_va, :], y_train[idx_va]
        model = lgb.LGBMClassifier(**params_tuning)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  early_stopping_rounds=100,
                  verbose=0,
                 )
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_va = roc_auc_score(y_va, y_va_pred) # 評価指標をAUCにする
        list_metrics.append(metric_va)
    
    # 評価指標の算出
    metrics = np.mean(list_metrics)
    
    return metrics

In [45]:
#最適化処理（探索の実行）
sampler = optuna.samplers.TPESampler(seed=123)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=50, n_jobs=5)

[32m[I 2023-04-30 14:26:45,253][0m A new study created in memory with name: no-name-b7441575-3b69-4e32-93de-1a3ba7cd9ab9[0m




[32m[I 2023-04-30 14:30:50,539][0m Trial 3 finished with value: 0.7727685008045815 and parameters: {'num_leaves': 39, 'min_child_samples': 99, 'min_sum_hessian_in_leaf': 2.9983948823347098e-05, 'feature_fraction': 0.5850376878696498, 'bagging_fraction': 0.7159694334795093, 'lambda_l1': 0.023068077831079503, 'lambda_l2': 0.020336781644361635}. Best is trial 3 with value: 0.7727685008045815.[0m




[32m[I 2023-04-30 14:32:08,385][0m Trial 2 finished with value: 0.7693193754990227 and parameters: {'num_leaves': 188, 'min_child_samples': 5, 'min_sum_hessian_in_leaf': 0.0002407931541331706, 'feature_fraction': 0.5281797830802837, 'bagging_fraction': 0.5279289777706112, 'lambda_l1': 3.057219878560467, 'lambda_l2': 60.36744583378966}. Best is trial 3 with value: 0.7727685008045815.[0m




[32m[I 2023-04-30 14:32:14,615][0m Trial 1 finished with value: 0.7670490098416817 and parameters: {'num_leaves': 256, 'min_child_samples': 61, 'min_sum_hessian_in_leaf': 0.003498325557175782, 'feature_fraction': 0.7915560266087379, 'bagging_fraction': 0.8662663534366088, 'lambda_l1': 0.03078325894658214, 'lambda_l2': 0.05298577618076196}. Best is trial 3 with value: 0.7727685008045815.[0m




[32m[I 2023-04-30 14:32:36,687][0m Trial 4 finished with value: 0.773745284165215 and parameters: {'num_leaves': 10, 'min_child_samples': 115, 'min_sum_hessian_in_leaf': 0.00011943022057432947, 'feature_fraction': 0.649035392367971, 'bagging_fraction': 0.6688701597563487, 'lambda_l1': 2.963292598591806, 'lambda_l2': 0.6715752053425988}. Best is trial 4 with value: 0.773745284165215.[0m




[32m[I 2023-04-30 14:32:56,745][0m Trial 0 finished with value: 0.769876834612602 and parameters: {'num_leaves': 208, 'min_child_samples': 112, 'min_sum_hessian_in_leaf': 1.4201095790918764e-05, 'feature_fraction': 0.7128939008365752, 'bagging_fraction': 0.9295691553493768, 'lambda_l1': 2.6646728406458804, 'lambda_l2': 0.24593204506532299}. Best is trial 4 with value: 0.773745284165215.[0m




[32m[I 2023-04-30 14:35:00,330][0m Trial 5 finished with value: 0.769288375663854 and parameters: {'num_leaves': 43, 'min_child_samples': 96, 'min_sum_hessian_in_leaf': 4.714028673031651e-05, 'feature_fraction': 0.9209435355185196, 'bagging_fraction': 0.500240166436195, 'lambda_l1': 1.3930408779822414, 'lambda_l2': 3.690943573970376}. Best is trial 4 with value: 0.773745284165215.[0m




[32m[I 2023-04-30 14:36:26,473][0m Trial 7 finished with value: 0.7696995313724067 and parameters: {'num_leaves': 119, 'min_child_samples': 125, 'min_sum_hessian_in_leaf': 0.0034196552085086806, 'feature_fraction': 0.5367125816883797, 'bagging_fraction': 0.6221672971653607, 'lambda_l1': 0.10620890279781825, 'lambda_l2': 0.16634517707532664}. Best is trial 4 with value: 0.773745284165215.[0m




[32m[I 2023-04-30 14:37:52,320][0m Trial 8 finished with value: 0.7739668677951077 and parameters: {'num_leaves': 14, 'min_child_samples': 64, 'min_sum_hessian_in_leaf': 2.1822354176257006e-05, 'feature_fraction': 0.6517588538325736, 'bagging_fraction': 0.6403897018708236, 'lambda_l1': 0.34848479086760753, 'lambda_l2': 43.085601228865194}. Best is trial 8 with value: 0.7739668677951077.[0m




[32m[I 2023-04-30 14:38:38,891][0m Trial 10 finished with value: 0.7691424712454441 and parameters: {'num_leaves': 107, 'min_child_samples': 123, 'min_sum_hessian_in_leaf': 0.009308766200595049, 'feature_fraction': 0.9521308259778436, 'bagging_fraction': 0.6571120601206357, 'lambda_l1': 0.035904044425813825, 'lambda_l2': 0.48676577678967453}. Best is trial 8 with value: 0.7739668677951077.[0m




[32m[I 2023-04-30 14:38:54,463][0m Trial 6 finished with value: 0.7702918376921019 and parameters: {'num_leaves': 104, 'min_child_samples': 84, 'min_sum_hessian_in_leaf': 0.0006312414094763962, 'feature_fraction': 0.8336215842654653, 'bagging_fraction': 0.7058401885578065, 'lambda_l1': 37.85113452982476, 'lambda_l2': 13.738770122552033}. Best is trial 8 with value: 0.7739668677951077.[0m




[32m[I 2023-04-30 14:39:39,973][0m Trial 9 finished with value: 0.7666111434040904 and parameters: {'num_leaves': 237, 'min_child_samples': 44, 'min_sum_hessian_in_leaf': 2.454020514522036e-05, 'feature_fraction': 0.8247545803043157, 'bagging_fraction': 0.6656417865118325, 'lambda_l1': 2.513540205784352, 'lambda_l2': 0.37793969184757337}. Best is trial 8 with value: 0.7739668677951077.[0m




[32m[I 2023-04-30 14:41:52,805][0m Trial 12 finished with value: 0.7703638975249898 and parameters: {'num_leaves': 53, 'min_child_samples': 162, 'min_sum_hessian_in_leaf': 0.0001376821385007092, 'feature_fraction': 0.599607362486716, 'bagging_fraction': 0.6270658496386041, 'lambda_l1': 0.018400401686804124, 'lambda_l2': 0.058288996558707055}. Best is trial 8 with value: 0.7739668677951077.[0m




[32m[I 2023-04-30 14:43:35,455][0m Trial 15 finished with value: 0.7737438556506319 and parameters: {'num_leaves': 22, 'min_child_samples': 180, 'min_sum_hessian_in_leaf': 8.958773624199919e-05, 'feature_fraction': 0.6629758419075759, 'bagging_fraction': 0.7879720462589559, 'lambda_l1': 0.33487443727854393, 'lambda_l2': 2.024470981502225}. Best is trial 8 with value: 0.7739668677951077.[0m




[32m[I 2023-04-30 14:44:14,264][0m Trial 13 finished with value: 0.7741755057342393 and parameters: {'num_leaves': 15, 'min_child_samples': 7, 'min_sum_hessian_in_leaf': 0.0001285105525128105, 'feature_fraction': 0.6600242808843753, 'bagging_fraction': 0.691086331902068, 'lambda_l1': 4.17127173745554, 'lambda_l2': 0.07170186177257691}. Best is trial 13 with value: 0.7741755057342393.[0m




[32m[I 2023-04-30 14:45:41,806][0m Trial 14 finished with value: 0.7727634386618815 and parameters: {'num_leaves': 70, 'min_child_samples': 165, 'min_sum_hessian_in_leaf': 1.282154611628788e-05, 'feature_fraction': 0.6948205891209409, 'bagging_fraction': 0.810896984694953, 'lambda_l1': 0.28638741520309485, 'lambda_l2': 95.37325063725282}. Best is trial 13 with value: 0.7741755057342393.[0m




[32m[I 2023-04-30 14:47:06,332][0m Trial 16 finished with value: 0.774228916505175 and parameters: {'num_leaves': 12, 'min_child_samples': 198, 'min_sum_hessian_in_leaf': 7.530342372742158e-05, 'feature_fraction': 0.6835805373858773, 'bagging_fraction': 0.8202328522800826, 'lambda_l1': 0.32414985919792577, 'lambda_l2': 1.8047043072244473}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:47:56,129][0m Trial 11 finished with value: 0.7736821765945403 and parameters: {'num_leaves': 8, 'min_child_samples': 89, 'min_sum_hessian_in_leaf': 0.0006399494987799596, 'feature_fraction': 0.6354710382099644, 'bagging_fraction': 0.8865195753442177, 'lambda_l1': 67.75135561587287, 'lambda_l2': 0.011296561511023241}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:48:33,017][0m Trial 17 finished with value: 0.7724898371518312 and parameters: {'num_leaves': 76, 'min_child_samples': 39, 'min_sum_hessian_in_leaf': 1.2013092374365642e-05, 'feature_fraction': 0.6643598507131919, 'bagging_fraction': 0.7748727031198798, 'lambda_l1': 0.2733071066160774, 'lambda_l2': 4.273776810347577}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:50:51,219][0m Trial 18 finished with value: 0.772834917826747 and parameters: {'num_leaves': 62, 'min_child_samples': 11, 'min_sum_hessian_in_leaf': 1.2437327781977613e-05, 'feature_fraction': 0.7262240661279012, 'bagging_fraction': 0.7910510106231791, 'lambda_l1': 0.5012013227878978, 'lambda_l2': 90.79990629432264}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:52:11,887][0m Trial 19 finished with value: 0.7678671354616918 and parameters: {'num_leaves': 153, 'min_child_samples': 15, 'min_sum_hessian_in_leaf': 1.0209010093448084e-05, 'feature_fraction': 0.7452015657933997, 'bagging_fraction': 0.5785484490682736, 'lambda_l1': 12.534843629969728, 'lambda_l2': 0.012042995902882192}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:52:25,766][0m Trial 20 finished with value: 0.7731770203515593 and parameters: {'num_leaves': 78, 'min_child_samples': 11, 'min_sum_hessian_in_leaf': 0.0004378260995697136, 'feature_fraction': 0.7441407489048526, 'bagging_fraction': 0.9872156359402651, 'lambda_l1': 10.255823037639985, 'lambda_l2': 2.97250426028707}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:53:40,163][0m Trial 22 finished with value: 0.7719443761357984 and parameters: {'num_leaves': 78, 'min_child_samples': 199, 'min_sum_hessian_in_leaf': 5.990514298361231e-05, 'feature_fraction': 0.7199017584540721, 'bagging_fraction': 0.8249298085036878, 'lambda_l1': 11.113573419892143, 'lambda_l2': 1.3091540114832185}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:54:47,001][0m Trial 21 finished with value: 0.7729052926536553 and parameters: {'num_leaves': 159, 'min_child_samples': 12, 'min_sum_hessian_in_leaf': 6.924504875009663e-05, 'feature_fraction': 0.7361736352908776, 'bagging_fraction': 0.9884591175297743, 'lambda_l1': 12.860015493523816, 'lambda_l2': 1.253174738118891}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:56:54,416][0m Trial 23 finished with value: 0.7716236835178307 and parameters: {'num_leaves': 150, 'min_child_samples': 193, 'min_sum_hessian_in_leaf': 5.8257186809470116e-05, 'feature_fraction': 0.7663083841317482, 'bagging_fraction': 0.9487185587054988, 'lambda_l1': 16.644666252358554, 'lambda_l2': 1.0033575166493947}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:57:35,847][0m Trial 25 finished with value: 0.7728433464190255 and parameters: {'num_leaves': 34, 'min_child_samples': 198, 'min_sum_hessian_in_leaf': 5.01878020923559e-05, 'feature_fraction': 0.6154308933856387, 'bagging_fraction': 0.7337970256606691, 'lambda_l1': 0.9476504745465095, 'lambda_l2': 1.08765107501717}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:58:25,105][0m Trial 24 finished with value: 0.7732909417177807 and parameters: {'num_leaves': 88, 'min_child_samples': 145, 'min_sum_hessian_in_leaf': 5.761720029473912e-05, 'feature_fraction': 0.7662919518011788, 'bagging_fraction': 0.9984983222154569, 'lambda_l1': 8.67671528421637, 'lambda_l2': 1.4980219069887775}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 14:59:37,820][0m Trial 27 finished with value: 0.7739057409079945 and parameters: {'num_leaves': 30, 'min_child_samples': 68, 'min_sum_hessian_in_leaf': 4.7782317911216575e-05, 'feature_fraction': 0.5995004070922038, 'bagging_fraction': 0.7277105753287096, 'lambda_l1': 0.7727801382366235, 'lambda_l2': 10.6196733969307}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 15:00:01,547][0m Trial 26 finished with value: 0.7739219782383437 and parameters: {'num_leaves': 31, 'min_child_samples': 67, 'min_sum_hessian_in_leaf': 4.361733252953772e-05, 'feature_fraction': 0.6022136653283279, 'bagging_fraction': 0.7162479767468483, 'lambda_l1': 0.1374110042112386, 'lambda_l2': 11.159826700165997}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 15:01:31,327][0m Trial 28 finished with value: 0.7729079181280887 and parameters: {'num_leaves': 33, 'min_child_samples': 141, 'min_sum_hessian_in_leaf': 2.7153083827321975e-05, 'feature_fraction': 0.6019181514165113, 'bagging_fraction': 0.7416881785053507, 'lambda_l1': 0.7293973822763121, 'lambda_l2': 13.01560695842564}. Best is trial 16 with value: 0.774228916505175.[0m




[32m[I 2023-04-30 15:02:51,773][0m Trial 29 finished with value: 0.774293156912851 and parameters: {'num_leaves': 24, 'min_child_samples': 60, 'min_sum_hessian_in_leaf': 2.9394846124200733e-05, 'feature_fraction': 0.5688914516331409, 'bagging_fraction': 0.7382794606134832, 'lambda_l1': 0.1449294951904485, 'lambda_l2': 9.376345368719733}. Best is trial 29 with value: 0.774293156912851.[0m




[32m[I 2023-04-30 15:03:47,135][0m Trial 30 finished with value: 0.7746074764320922 and parameters: {'num_leaves': 29, 'min_child_samples': 66, 'min_sum_hessian_in_leaf': 0.0001793720298613534, 'feature_fraction': 0.5747013859867727, 'bagging_fraction': 0.7578955936635908, 'lambda_l1': 0.11418385747659184, 'lambda_l2': 9.645952020790984}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:05:30,064][0m Trial 31 finished with value: 0.774368942995749 and parameters: {'num_leaves': 21, 'min_child_samples': 30, 'min_sum_hessian_in_leaf': 0.000203796600641694, 'feature_fraction': 0.6901628996802489, 'bagging_fraction': 0.7669046880302925, 'lambda_l1': 0.1150170019530935, 'lambda_l2': 32.03144159927253}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:05:51,619][0m Trial 33 finished with value: 0.772290525524866 and parameters: {'num_leaves': 54, 'min_child_samples': 36, 'min_sum_hessian_in_leaf': 2.1959666633228807e-05, 'feature_fraction': 0.6863553341067072, 'bagging_fraction': 0.7540892879680635, 'lambda_l1': 1.3873894648560485, 'lambda_l2': 0.19673344936042406}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:08:08,036][0m Trial 32 finished with value: 0.7743927147608697 and parameters: {'num_leaves': 10, 'min_child_samples': 33, 'min_sum_hessian_in_leaf': 2.4717142802937743e-05, 'feature_fraction': 0.684529258024212, 'bagging_fraction': 0.7609177678743813, 'lambda_l1': 0.10845046447215113, 'lambda_l2': 38.00686222712848}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:08:36,373][0m Trial 34 finished with value: 0.7722625857745526 and parameters: {'num_leaves': 54, 'min_child_samples': 33, 'min_sum_hessian_in_leaf': 0.000184631383396415, 'feature_fraction': 0.5548449288658687, 'bagging_fraction': 0.7721929746327284, 'lambda_l1': 0.010394410688998384, 'lambda_l2': 0.2389394037881191}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:10:08,557][0m Trial 36 finished with value: 0.7734291977052429 and parameters: {'num_leaves': 54, 'min_child_samples': 34, 'min_sum_hessian_in_leaf': 0.0002667529359454006, 'feature_fraction': 0.5562816549802805, 'bagging_fraction': 0.7688207591397724, 'lambda_l1': 0.07662245510531593, 'lambda_l2': 29.42402111233435}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:10:20,411][0m Trial 35 finished with value: 0.7732286071836756 and parameters: {'num_leaves': 44, 'min_child_samples': 38, 'min_sum_hessian_in_leaf': 0.00017968250488417282, 'feature_fraction': 0.5564209756266147, 'bagging_fraction': 0.7551259592246963, 'lambda_l1': 0.07861982002366585, 'lambda_l2': 6.965798014407255}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:11:33,853][0m Trial 37 finished with value: 0.7734483383757049 and parameters: {'num_leaves': 52, 'min_child_samples': 28, 'min_sum_hessian_in_leaf': 0.00022173451540202072, 'feature_fraction': 0.5508595468524508, 'bagging_fraction': 0.8462222504708984, 'lambda_l1': 0.07264812265403622, 'lambda_l2': 27.681740004548242}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:13:03,414][0m Trial 38 finished with value: 0.7735664206023578 and parameters: {'num_leaves': 49, 'min_child_samples': 28, 'min_sum_hessian_in_leaf': 0.0002305884257891211, 'feature_fraction': 0.558659369430582, 'bagging_fraction': 0.7579246987197883, 'lambda_l1': 0.06960971993906868, 'lambda_l2': 20.579738760168382}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:14:21,131][0m Trial 39 finished with value: 0.7737994572855793 and parameters: {'num_leaves': 47, 'min_child_samples': 54, 'min_sum_hessian_in_leaf': 0.0002935670103387586, 'feature_fraction': 0.5097390791000265, 'bagging_fraction': 0.8560948831459739, 'lambda_l1': 0.06420510430307676, 'lambda_l2': 29.74289980929732}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:15:47,980][0m Trial 42 finished with value: 0.7702869037951001 and parameters: {'num_leaves': 92, 'min_child_samples': 52, 'min_sum_hessian_in_leaf': 3.450131839355056e-05, 'feature_fraction': 0.5103845186113236, 'bagging_fraction': 0.6974400097938047, 'lambda_l1': 0.1806408347696071, 'lambda_l2': 21.759607508506605}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:16:44,913][0m Trial 40 finished with value: 0.7719074948104132 and parameters: {'num_leaves': 187, 'min_child_samples': 57, 'min_sum_hessian_in_leaf': 3.523935911299695e-05, 'feature_fraction': 0.52831918455964, 'bagging_fraction': 0.8572302109461916, 'lambda_l1': 0.06618063551979977, 'lambda_l2': 26.59757466265051}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:17:16,097][0m Trial 41 finished with value: 0.7705405096633667 and parameters: {'num_leaves': 216, 'min_child_samples': 54, 'min_sum_hessian_in_leaf': 3.2494134267802474e-05, 'feature_fraction': 0.5069165455535134, 'bagging_fraction': 0.8525313306882738, 'lambda_l1': 0.16860164461847502, 'lambda_l2': 23.40127908773455}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:19:22,147][0m Trial 43 finished with value: 0.7736096643393278 and parameters: {'num_leaves': 24, 'min_child_samples': 56, 'min_sum_hessian_in_leaf': 1.8413382785141788e-05, 'feature_fraction': 0.5021702054524212, 'bagging_fraction': 0.6841817185952711, 'lambda_l1': 0.1627132895658912, 'lambda_l2': 48.99745802948103}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:20:25,807][0m Trial 44 finished with value: 0.7736290009407177 and parameters: {'num_leaves': 24, 'min_child_samples': 78, 'min_sum_hessian_in_leaf': 1.7359911715372785e-05, 'feature_fraction': 0.6259991529097793, 'bagging_fraction': 0.6955582519866143, 'lambda_l1': 0.1468263027563057, 'lambda_l2': 47.88640855147094}. Best is trial 30 with value: 0.7746074764320922.[0m




[32m[I 2023-04-30 15:22:00,674][0m Trial 45 finished with value: 0.774470542087798 and parameters: {'num_leaves': 26, 'min_child_samples': 79, 'min_sum_hessian_in_leaf': 9.013490967237669e-05, 'feature_fraction': 0.629992506166484, 'bagging_fraction': 0.8022072827186417, 'lambda_l1': 0.20652556084451001, 'lambda_l2': 51.91666063716048}. Best is trial 30 with value: 0.7746074764320922.[0m
[32m[I 2023-04-30 15:22:12,164][0m Trial 46 finished with value: 0.7743733354109136 and parameters: {'num_leaves': 21, 'min_child_samples': 76, 'min_sum_hessian_in_leaf': 1.6783041267345774e-05, 'feature_fraction': 0.6242181884086013, 'bagging_fraction': 0.8108101973648404, 'lambda_l1': 0.15339984411904464, 'lambda_l2': 50.004513625235234}. Best is trial 30 with value: 0.7746074764320922.[0m
[32m[I 2023-04-30 15:22:41,837][0m Trial 47 finished with value: 0.7743586192170407 and parameters: {'num_leaves': 20, 'min_child_samples': 74, 'min_sum_hessian_in_leaf': 1.6823507200351032e-05, 'feature_fr

In [46]:
#探査結果の確認
trial = study.best_trial
print("acc(best)={:.4f}".format(trial.value))
display(trial.params)

acc(best)=0.7750


{'num_leaves': 12,
 'min_child_samples': 106,
 'min_sum_hessian_in_leaf': 8.623253649320915e-05,
 'feature_fraction': 0.633612863123612,
 'bagging_fraction': 0.8056982211623093,
 'lambda_l1': 0.04043314263083684,
 'lambda_l2': 6.59894543540371}

In [47]:
#ベストなハイパーパラメータ
params_best = trial.params
params_best.update(params_base)
display(params_best)

{'num_leaves': 12,
 'min_child_samples': 106,
 'min_sum_hessian_in_leaf': 8.623253649320915e-05,
 'feature_fraction': 0.633612863123612,
 'bagging_fraction': 0.8056982211623093,
 'lambda_l1': 0.04043314263083684,
 'lambda_l2': 6.59894543540371,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'verbosity': -1,
 'learning_rate': 0.05,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'random_state': 123}

In [48]:
#ベストなハイパーパラメータを用いたモデル学習
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                    params=params_best,
                                   )

-------------------- 0 --------------------
(246008, 162) (61503, 162)
[100]	training's auc: 0.770488	valid_1's auc: 0.761265
[200]	training's auc: 0.783427	valid_1's auc: 0.768502
[300]	training's auc: 0.79259	valid_1's auc: 0.771517
[400]	training's auc: 0.799643	valid_1's auc: 0.772832
[500]	training's auc: 0.805975	valid_1's auc: 0.773612
[600]	training's auc: 0.811629	valid_1's auc: 0.774023
[700]	training's auc: 0.817242	valid_1's auc: 0.774301
[800]	training's auc: 0.822105	valid_1's auc: 0.774613
[900]	training's auc: 0.827035	valid_1's auc: 0.774893
[1000]	training's auc: 0.831656	valid_1's auc: 0.774824
[auc] tr:0.8282, va:0.7750
-------------------- 1 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.769731	valid_1's auc: 0.763308
[200]	training's auc: 0.783171	valid_1's auc: 0.771986
[300]	training's auc: 0.791963	valid_1's auc: 0.775425
[400]	training's auc: 0.799553	valid_1's auc: 0.776997
[500]	training's auc: 0.80583	valid_1's auc: 0.777997
[600]	t

In [49]:
#推論データ作成とモデル推論
# 推論用のデータセット作成
x_test = df_test.drop(columns=["SK_ID_CURR"])
id_test = df_test[["SK_ID_CURR"]]

# カテゴリ変数をcategory型へ変換
for col in x_test.columns:
    if x_test[col].dtype=="O":
        x_test[col] = x_test[col].astype("category")

# predict
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

# make submission-file
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("submission_HyperParameterTuning.csv", index=None)

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.
(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.039667
1,100005,0.13622
2,100013,0.026738
3,100028,0.044876
4,100038,0.198612
