In [1]:
import os, sys
from pathlib import Path

BASE_DIR = Path(Path.home(), "workspace", "services", "credit_model")
DATA_DIR = Path(Path.home(), "workspace", "data")
if BASE_DIR not in sys.path:
    sys.path.insert(0, f"{BASE_DIR}")

In [2]:
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

In [5]:
from util import metric_helper

# 加载数据

In [6]:
fp_data = Path(DATA_DIR, 'tutorial', 'data_woe_result.csv')
df_woe = pd.read_csv(fp_data, index_col=None)

In [7]:
num_cols = ['Collateral_valuation', 'Age', 'Properties_Total', 'Amount', 'Term', 'Historic_Loans', 'Current_Loans', 'Max_Arrears']
cat_cols = ['Region', 'Area', 'Activity', 'Guarantor', 'Collateral', 'Properties_Status']
features = num_cols + cat_cols
label = 'Defaulter'

# Gridsearch

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
params = {
    "learning_rate": 0.1,
    "max_depth": 3,
    "num_leaves": 7,
    "min_child_samples": 800,
    "subsample": 1,
    "subsample_freq": 0,
    "colsample_bytree": 1,
    "reg_alpha": 157,
    "reg_lambda": 500,
}

model = lgb.LGBMClassifier(
    **params,
    n_estimators=200,
    objective="cross_entropy",
    class_weight="balanced",
    importance_type="gain",
    boosting_type="gbdt",
    silent=True,
    n_jobs=8,
    random_state=19910908
)

In [10]:
param_grid = {
    "learning_rate": [0.01, 0.1],
    "max_depth": [3, 4, 5],
    "num_leaves": [15, 31, 63],
    "min_child_samples": [1, 20, 50],
}

grid = GridSearchCV(
    model,
    param_grid,
    verbose=3,
    cv=3,
    scoring={"AUC": "roc_auc"},
    n_jobs=1,
    refit="AUC",
)
result = grid.fit(df_woe.loc[df_woe['sample_type']=='train', features], 
                  df_woe.loc[df_woe['sample_type']=='train', label])

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV 1/3] END learning_rate=0.01, max_depth=3, min_child_samples=1, num_leaves=15; AUC: (test=0.760) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=3, min_child_samples=1, num_leaves=15; AUC: (test=0.755) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=3, min_child_samples=1, num_leaves=15; AUC: (test=0.754) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=3, min_child_samples=1, num_leaves=31; AUC: (test=0.760) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=3, min_child_samples=1, num_leaves=31; AUC: (test=0.755) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=3, min_child_samples=1, num_leaves=31; AUC: (test=0.754) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=3, min_child_samples=1, num_leaves=63; AUC: (test=0.760) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=3, min_child_samples=1, num_leaves=63; AUC: (test=0.755) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=3, min_child_samples=1, num_leaves=63; AUC: (test=0.754) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=3, min_child_samples=20, num_leaves=15; AUC: (test=0.760) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=3, min_child_samples=20, num_leaves=15; AUC: (test=0.755) total time=   0.0s
[CV 3/3] END learning_rate=0.01, max_depth=3, min_child_samples=20, num_leaves=15; AUC: (test=0.754) total time=   0.0s




[CV 1/3] END learning_rate=0.01, max_depth=3, min_child_samples=20, num_leaves=31; AUC: (test=0.760) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=3, min_child_samples=20, num_leaves=31; AUC: (test=0.755) total time=   0.0s
[CV 3/3] END learning_rate=0.01, max_depth=3, min_child_samples=20, num_leaves=31; AUC: (test=0.754) total time=   0.0s




[CV 1/3] END learning_rate=0.01, max_depth=3, min_child_samples=20, num_leaves=63; AUC: (test=0.760) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=3, min_child_samples=20, num_leaves=63; AUC: (test=0.755) total time=   0.0s
[CV 3/3] END learning_rate=0.01, max_depth=3, min_child_samples=20, num_leaves=63; AUC: (test=0.754) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=3, min_child_samples=50, num_leaves=15; AUC: (test=0.760) total time=   0.0s




[CV 2/3] END learning_rate=0.01, max_depth=3, min_child_samples=50, num_leaves=15; AUC: (test=0.755) total time=   0.0s
[CV 3/3] END learning_rate=0.01, max_depth=3, min_child_samples=50, num_leaves=15; AUC: (test=0.754) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=3, min_child_samples=50, num_leaves=31; AUC: (test=0.760) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=3, min_child_samples=50, num_leaves=31; AUC: (test=0.755) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=3, min_child_samples=50, num_leaves=31; AUC: (test=0.754) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=3, min_child_samples=50, num_leaves=63; AUC: (test=0.760) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=3, min_child_samples=50, num_leaves=63; AUC: (test=0.755) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=3, min_child_samples=50, num_leaves=63; AUC: (test=0.754) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=4, min_child_samples=1, num_leaves=15; AUC: (test=0.762) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=4, min_child_samples=1, num_leaves=15; AUC: (test=0.759) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=4, min_child_samples=1, num_leaves=15; AUC: (test=0.758) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=4, min_child_samples=1, num_leaves=31; AUC: (test=0.762) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=4, min_child_samples=1, num_leaves=31; AUC: (test=0.759) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=4, min_child_samples=1, num_leaves=31; AUC: (test=0.758) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=4, min_child_samples=1, num_leaves=63; AUC: (test=0.762) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=4, min_child_samples=1, num_leaves=63; AUC: (test=0.759) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=4, min_child_samples=1, num_leaves=63; AUC: (test=0.758) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=4, min_child_samples=20, num_leaves=15; AUC: (test=0.762) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=4, min_child_samples=20, num_leaves=15; AUC: (test=0.759) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=4, min_child_samples=20, num_leaves=15; AUC: (test=0.758) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=4, min_child_samples=20, num_leaves=31; AUC: (test=0.762) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=4, min_child_samples=20, num_leaves=31; AUC: (test=0.759) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=4, min_child_samples=20, num_leaves=31; AUC: (test=0.758) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=4, min_child_samples=20, num_leaves=63; AUC: (test=0.762) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=4, min_child_samples=20, num_leaves=63; AUC: (test=0.759) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=4, min_child_samples=20, num_leaves=63; AUC: (test=0.758) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=4, min_child_samples=50, num_leaves=15; AUC: (test=0.762) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=4, min_child_samples=50, num_leaves=15; AUC: (test=0.759) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=4, min_child_samples=50, num_leaves=15; AUC: (test=0.758) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=4, min_child_samples=50, num_leaves=31; AUC: (test=0.762) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=4, min_child_samples=50, num_leaves=31; AUC: (test=0.759) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=4, min_child_samples=50, num_leaves=31; AUC: (test=0.758) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=4, min_child_samples=50, num_leaves=63; AUC: (test=0.762) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=4, min_child_samples=50, num_leaves=63; AUC: (test=0.759) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=4, min_child_samples=50, num_leaves=63; AUC: (test=0.758) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=5, min_child_samples=1, num_leaves=15; AUC: (test=0.763) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=5, min_child_samples=1, num_leaves=15; AUC: (test=0.761) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=5, min_child_samples=1, num_leaves=15; AUC: (test=0.759) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=5, min_child_samples=1, num_leaves=31; AUC: (test=0.763) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=5, min_child_samples=1, num_leaves=31; AUC: (test=0.761) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=5, min_child_samples=1, num_leaves=31; AUC: (test=0.759) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=5, min_child_samples=1, num_leaves=63; AUC: (test=0.763) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=5, min_child_samples=1, num_leaves=63; AUC: (test=0.761) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=5, min_child_samples=1, num_leaves=63; AUC: (test=0.759) total time=   0.0s
[CV 1/3] END learning_rate=0.01, max_depth=5, min_child_samples=20, num_leaves=15; AUC: (test=0.763) total time=   0.0s
[CV 2/3] END learning_rate=0.01, max_depth=5, min_child_samples=20, num_leaves=15; AUC: (test=0.761) total time=   0.0s




[CV 3/3] END learning_rate=0.01, max_depth=5, min_child_samples=20, num_leaves=15; AUC: (test=0.759) total time=   0.2s




[CV 1/3] END learning_rate=0.01, max_depth=5, min_child_samples=20, num_leaves=31; AUC: (test=0.763) total time=   0.1s




[CV 2/3] END learning_rate=0.01, max_depth=5, min_child_samples=20, num_leaves=31; AUC: (test=0.761) total time=   0.1s




[CV 3/3] END learning_rate=0.01, max_depth=5, min_child_samples=20, num_leaves=31; AUC: (test=0.759) total time=   0.1s




[CV 1/3] END learning_rate=0.01, max_depth=5, min_child_samples=20, num_leaves=63; AUC: (test=0.763) total time=   0.1s




[CV 2/3] END learning_rate=0.01, max_depth=5, min_child_samples=20, num_leaves=63; AUC: (test=0.761) total time=   0.1s




[CV 3/3] END learning_rate=0.01, max_depth=5, min_child_samples=20, num_leaves=63; AUC: (test=0.759) total time=   0.1s




[CV 1/3] END learning_rate=0.01, max_depth=5, min_child_samples=50, num_leaves=15; AUC: (test=0.763) total time=   0.1s




[CV 2/3] END learning_rate=0.01, max_depth=5, min_child_samples=50, num_leaves=15; AUC: (test=0.760) total time=   0.1s




[CV 3/3] END learning_rate=0.01, max_depth=5, min_child_samples=50, num_leaves=15; AUC: (test=0.759) total time=   0.1s




[CV 1/3] END learning_rate=0.01, max_depth=5, min_child_samples=50, num_leaves=31; AUC: (test=0.763) total time=   0.1s




[CV 2/3] END learning_rate=0.01, max_depth=5, min_child_samples=50, num_leaves=31; AUC: (test=0.760) total time=   0.1s




[CV 3/3] END learning_rate=0.01, max_depth=5, min_child_samples=50, num_leaves=31; AUC: (test=0.759) total time=   0.1s




[CV 1/3] END learning_rate=0.01, max_depth=5, min_child_samples=50, num_leaves=63; AUC: (test=0.763) total time=   0.1s




[CV 2/3] END learning_rate=0.01, max_depth=5, min_child_samples=50, num_leaves=63; AUC: (test=0.760) total time=   0.1s




[CV 3/3] END learning_rate=0.01, max_depth=5, min_child_samples=50, num_leaves=63; AUC: (test=0.759) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=3, min_child_samples=1, num_leaves=15; AUC: (test=0.785) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=3, min_child_samples=1, num_leaves=15; AUC: (test=0.782) total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, min_child_samples=1, num_leaves=15; AUC: (test=0.781) total time=   0.0s




[CV 1/3] END learning_rate=0.1, max_depth=3, min_child_samples=1, num_leaves=31; AUC: (test=0.785) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=3, min_child_samples=1, num_leaves=31; AUC: (test=0.782) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=3, min_child_samples=1, num_leaves=31; AUC: (test=0.781) total time=   0.0s
[CV 1/3] END learning_rate=0.1, max_depth=3, min_child_samples=1, num_leaves=63; AUC: (test=0.785) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=3, min_child_samples=1, num_leaves=63; AUC: (test=0.782) total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, min_child_samples=1, num_leaves=63; AUC: (test=0.781) total time=   0.0s




[CV 1/3] END learning_rate=0.1, max_depth=3, min_child_samples=20, num_leaves=15; AUC: (test=0.785) total time=   0.0s
[CV 2/3] END learning_rate=0.1, max_depth=3, min_child_samples=20, num_leaves=15; AUC: (test=0.782) total time=   0.0s




[CV 3/3] END learning_rate=0.1, max_depth=3, min_child_samples=20, num_leaves=15; AUC: (test=0.781) total time=   0.0s
[CV 1/3] END learning_rate=0.1, max_depth=3, min_child_samples=20, num_leaves=31; AUC: (test=0.785) total time=   0.0s




[CV 2/3] END learning_rate=0.1, max_depth=3, min_child_samples=20, num_leaves=31; AUC: (test=0.782) total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, min_child_samples=20, num_leaves=31; AUC: (test=0.781) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=3, min_child_samples=20, num_leaves=63; AUC: (test=0.785) total time=   0.0s
[CV 2/3] END learning_rate=0.1, max_depth=3, min_child_samples=20, num_leaves=63; AUC: (test=0.782) total time=   0.0s




[CV 3/3] END learning_rate=0.1, max_depth=3, min_child_samples=20, num_leaves=63; AUC: (test=0.781) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=3, min_child_samples=50, num_leaves=15; AUC: (test=0.785) total time=   0.0s




[CV 2/3] END learning_rate=0.1, max_depth=3, min_child_samples=50, num_leaves=15; AUC: (test=0.782) total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, min_child_samples=50, num_leaves=15; AUC: (test=0.781) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=3, min_child_samples=50, num_leaves=31; AUC: (test=0.785) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=3, min_child_samples=50, num_leaves=31; AUC: (test=0.782) total time=   0.0s




[CV 3/3] END learning_rate=0.1, max_depth=3, min_child_samples=50, num_leaves=31; AUC: (test=0.781) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=3, min_child_samples=50, num_leaves=63; AUC: (test=0.785) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=3, min_child_samples=50, num_leaves=63; AUC: (test=0.782) total time=   0.0s
[CV 3/3] END learning_rate=0.1, max_depth=3, min_child_samples=50, num_leaves=63; AUC: (test=0.781) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=4, min_child_samples=1, num_leaves=15; AUC: (test=0.786) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=4, min_child_samples=1, num_leaves=15; AUC: (test=0.784) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=4, min_child_samples=1, num_leaves=15; AUC: (test=0.782) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=4, min_child_samples=1, num_leaves=31; AUC: (test=0.786) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=4, min_child_samples=1, num_leaves=31; AUC: (test=0.784) total time=   0.1s
[CV 3/3] END learning_rate=0.1, max_depth=4, min_child_samples=1, num_leaves=31; AUC: (test=0.782) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=4, min_child_samples=1, num_leaves=63; AUC: (test=0.786) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=4, min_child_samples=1, num_leaves=63; AUC: (test=0.784) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=4, min_child_samples=1, num_leaves=63; AUC: (test=0.782) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=4, min_child_samples=20, num_leaves=15; AUC: (test=0.786) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=4, min_child_samples=20, num_leaves=15; AUC: (test=0.784) total time=   0.1s
[CV 3/3] END learning_rate=0.1, max_depth=4, min_child_samples=20, num_leaves=15; AUC: (test=0.782) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=4, min_child_samples=20, num_leaves=31; AUC: (test=0.786) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=4, min_child_samples=20, num_leaves=31; AUC: (test=0.784) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=4, min_child_samples=20, num_leaves=31; AUC: (test=0.782) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=4, min_child_samples=20, num_leaves=63; AUC: (test=0.786) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=4, min_child_samples=20, num_leaves=63; AUC: (test=0.784) total time=   0.1s
[CV 3/3] END learning_rate=0.1, max_depth=4, min_child_samples=20, num_leaves=63; AUC: (test=0.782) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=4, min_child_samples=50, num_leaves=15; AUC: (test=0.786) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=4, min_child_samples=50, num_leaves=15; AUC: (test=0.785) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=4, min_child_samples=50, num_leaves=15; AUC: (test=0.782) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=4, min_child_samples=50, num_leaves=31; AUC: (test=0.786) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=4, min_child_samples=50, num_leaves=31; AUC: (test=0.785) total time=   0.1s
[CV 3/3] END learning_rate=0.1, max_depth=4, min_child_samples=50, num_leaves=31; AUC: (test=0.782) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=4, min_child_samples=50, num_leaves=63; AUC: (test=0.786) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=4, min_child_samples=50, num_leaves=63; AUC: (test=0.785) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=4, min_child_samples=50, num_leaves=63; AUC: (test=0.782) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=5, min_child_samples=1, num_leaves=15; AUC: (test=0.786) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=5, min_child_samples=1, num_leaves=15; AUC: (test=0.784) total time=   0.1s
[CV 3/3] END learning_rate=0.1, max_depth=5, min_child_samples=1, num_leaves=15; AUC: (test=0.782) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=5, min_child_samples=1, num_leaves=31; AUC: (test=0.786) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=5, min_child_samples=1, num_leaves=31; AUC: (test=0.784) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=5, min_child_samples=1, num_leaves=31; AUC: (test=0.782) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=5, min_child_samples=1, num_leaves=63; AUC: (test=0.786) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=5, min_child_samples=1, num_leaves=63; AUC: (test=0.784) total time=   0.1s
[CV 3/3] END learning_rate=0.1, max_depth=5, min_child_samples=1, num_leaves=63; AUC: (test=0.782) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=5, min_child_samples=20, num_leaves=15; AUC: (test=0.786) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=5, min_child_samples=20, num_leaves=15; AUC: (test=0.784) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=5, min_child_samples=20, num_leaves=15; AUC: (test=0.782) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=5, min_child_samples=20, num_leaves=31; AUC: (test=0.786) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=5, min_child_samples=20, num_leaves=31; AUC: (test=0.784) total time=   0.1s
[CV 3/3] END learning_rate=0.1, max_depth=5, min_child_samples=20, num_leaves=31; AUC: (test=0.782) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=5, min_child_samples=20, num_leaves=63; AUC: (test=0.786) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=5, min_child_samples=20, num_leaves=63; AUC: (test=0.784) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=5, min_child_samples=20, num_leaves=63; AUC: (test=0.782) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=5, min_child_samples=50, num_leaves=15; AUC: (test=0.786) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=5, min_child_samples=50, num_leaves=15; AUC: (test=0.784) total time=   0.1s
[CV 3/3] END learning_rate=0.1, max_depth=5, min_child_samples=50, num_leaves=15; AUC: (test=0.782) total time=   0.1s




[CV 1/3] END learning_rate=0.1, max_depth=5, min_child_samples=50, num_leaves=31; AUC: (test=0.786) total time=   0.1s
[CV 2/3] END learning_rate=0.1, max_depth=5, min_child_samples=50, num_leaves=31; AUC: (test=0.784) total time=   0.1s




[CV 3/3] END learning_rate=0.1, max_depth=5, min_child_samples=50, num_leaves=31; AUC: (test=0.782) total time=   0.1s
[CV 1/3] END learning_rate=0.1, max_depth=5, min_child_samples=50, num_leaves=63; AUC: (test=0.786) total time=   0.1s




[CV 2/3] END learning_rate=0.1, max_depth=5, min_child_samples=50, num_leaves=63; AUC: (test=0.784) total time=   0.1s
[CV 3/3] END learning_rate=0.1, max_depth=5, min_child_samples=50, num_leaves=63; AUC: (test=0.782) total time=   0.1s




In [11]:
df_cv = pd.DataFrame(result.cv_results_["params"])
df_cv["mean_test_AUC"] = result.cv_results_["mean_test_AUC"]
print(df_cv)

    learning_rate  max_depth  min_child_samples  num_leaves  mean_test_AUC
0            0.01          3                  1          15       0.756151
1            0.01          3                  1          31       0.756151
2            0.01          3                  1          63       0.756151
3            0.01          3                 20          15       0.756151
4            0.01          3                 20          31       0.756151
5            0.01          3                 20          63       0.756151
6            0.01          3                 50          15       0.756149
7            0.01          3                 50          31       0.756149
8            0.01          3                 50          63       0.756149
9            0.01          4                  1          15       0.759626
10           0.01          4                  1          31       0.759626
11           0.01          4                  1          63       0.759626
12           0.01        

# 贝叶斯优化

In [12]:
import ast
import csv
from timeit import default_timer as timer
import lightgbm as lgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import StratifiedKFold

In [13]:
class BO:
    def __init__(self, fp_path, **kwargs):
        self.fp_path = fp_path
        self.iter = 0
        self.train_set = None

        self.kfold = kwargs.get("kfold", 3)

        csv_conn = open(self.fp_path, "w")
        writer = csv.writer(csv_conn)
        writer.writerow(["loss", "auc_train", "auc_valid", "params", "iteration", "train_time"])
        csv_conn.close()

    def load_data(self, df_xtrain, df_ytrain):
        self.df_xtrain = df_xtrain.reset_index(drop=True)
        self.df_ytrain = df_ytrain.reset_index(drop=True)

    def objective(self, params):
        self.iter += 1

        start = timer()
        model = lgb.LGBMClassifier(
            **params,
            n_estimators=200,
            objective="cross_entropy",
            class_weight="balanced",
            importance_type="gain",
            boosting_type="gbdt",
            n_jobs=1,
            random_state=19910908
        )

        lst_auc_train, lst_auc_valid = list(), list()
        kf = StratifiedKFold(n_splits=self.kfold, shuffle=False)
        for itrain, ivalid in kf.split(self.df_xtrain, self.df_ytrain):
            df_xtrain, df_ytrain = (
                self.df_xtrain.loc[itrain, :],
                self.df_ytrain.loc[itrain],
            )
            df_xvalid, df_yvalid = (
                self.df_xtrain.loc[ivalid, :],
                self.df_ytrain.loc[ivalid],
            )

            eval_set = [(df_xtrain, df_ytrain), (df_xvalid, df_yvalid)]
            model.fit(df_xtrain, df_ytrain, 
                      eval_set=eval_set, eval_metric="auc",
                      callbacks=[lgb.log_evaluation(0)],)

            auc_train = metric_helper.Metric.get_auc(df_ytrain, model.predict(df_xtrain))
            auc_valid = metric_helper.Metric.get_auc(df_yvalid, model.predict(df_xvalid))
            lst_auc_train.append(auc_train)
            lst_auc_valid.append(auc_valid)

        run_time = timer() - start

        auc_train_avg = np.mean(lst_auc_train)
        auc_valid_avg = np.mean(lst_auc_valid)
        loss = -np.mean(lst_auc_valid)

        csv_conn = open(self.fp_path, "a")
        writer = csv.writer(csv_conn)
        writer.writerow([loss, auc_train_avg, auc_valid_avg, params, self.iter, run_time])

        res = {
            "loss": loss,
            "auc_train": auc_train_avg,
            "auc_valid": auc_valid_avg,
            "params": params,
            "iteration": self.iter,
            "train_time": run_time,
            "status": STATUS_OK,
        }
        print(res)

        return res

    def optimize(self, max_evals):
        self.iter = 0

        space = {
            "learning_rate": hp.choice("learning_rate", [0.01, 0.1]),
            "max_depth": hp.choice("max_depth", [3, 4, 5]),
            "num_leaves": hp.choice("num_leaves", [15, 31, 63]),
            "min_child_samples": hp.choice("min_child_samples", [1, 20, 50]),
        }

        best = fmin(
            fn=self.objective,
            space=space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=Trials(),
            max_queue_len=10,
            show_progressbar=True,
            rstate=np.random.default_rng(199198),
        )

        return best

In [14]:
fp_path = Path(DATA_DIR, "bo.csv")
bo = BO(fp_path)
bo.load_data(df_woe.loc[df_woe['sample_type']=='train', features], 
             df_woe.loc[df_woe['sample_type']=='train', label])
bo.optimize(80)

{'loss': -0.7291371735408757, 'auc_train': 0.7582412837874494, 'auc_valid': 0.7291371735408757, 'params': {'learning_rate': 0.1, 'max_depth': 4, 'min_child_samples': 50, 'num_leaves': 31}, 'iteration': 1, 'train_time': 5.547456199987209, 'status': 'ok'}
{'loss': -0.7272398918549515, 'auc_train': 0.7583559117962616, 'auc_valid': 0.7272398918549515, 'params': {'learning_rate': 0.1, 'max_depth': 4, 'min_child_samples': 1, 'num_leaves': 15}, 'iteration': 2, 'train_time': 5.491355100006331, 'status': 'ok'}
{'loss': -0.7104573465863293, 'auc_train': 0.7190461077331465, 'auc_valid': 0.7104573465863293, 'params': {'learning_rate': 0.01, 'max_depth': 5, 'min_child_samples': 50, 'num_leaves': 15}, 'iteration': 3, 'train_time': 5.452411399994162, 'status': 'ok'}
{'loss': -0.7083906756270414, 'auc_train': 0.716844605466831, 'auc_valid': 0.7083906756270414, 'params': {'learning_rate': 0.01, 'max_depth': 4, 'min_child_samples': 1, 'num_leaves': 63}, 'iteration': 4, 'train_time': 5.613233699987177, '

{'loss': -0.7104573465863293, 'auc_train': 0.7190461077331465, 'auc_valid': 0.7104573465863293, 'params': {'learning_rate': 0.01, 'max_depth': 5, 'min_child_samples': 50, 'num_leaves': 15}, 'iteration': 65, 'train_time': 5.310110599995824, 'status': 'ok'}
{'loss': -0.7293101315152555, 'auc_train': 0.7869310055659571, 'auc_valid': 0.7293101315152555, 'params': {'learning_rate': 0.1, 'max_depth': 5, 'min_child_samples': 50, 'num_leaves': 31}, 'iteration': 66, 'train_time': 6.058748200011905, 'status': 'ok'}
{'loss': -0.7293101315152555, 'auc_train': 0.7869310055659571, 'auc_valid': 0.7293101315152555, 'params': {'learning_rate': 0.1, 'max_depth': 5, 'min_child_samples': 50, 'num_leaves': 31}, 'iteration': 67, 'train_time': 6.104465800002799, 'status': 'ok'}
{'loss': -0.7293101315152555, 'auc_train': 0.7869310055659571, 'auc_valid': 0.7293101315152555, 'params': {'learning_rate': 0.1, 'max_depth': 5, 'min_child_samples': 50, 'num_leaves': 31}, 'iteration': 68, 'train_time': 6.141981699998

{'learning_rate': 1, 'max_depth': 2, 'min_child_samples': 2, 'num_leaves': 1}

In [15]:
df_cv = pd.read_csv(fp_path)
df_cv = df_cv.sort_values(by=["auc_valid", "auc_train"], ascending=[False, False]).reset_index(drop=True)
best_param = df_cv.iloc[0]["params"]
print(ast.literal_eval(best_param))
print(df_cv.iloc[0])

{'learning_rate': 0.1, 'max_depth': 5, 'min_child_samples': 50, 'num_leaves': 31}
loss                                                   -0.72931
auc_train                                              0.786931
auc_valid                                               0.72931
params        {'learning_rate': 0.1, 'max_depth': 5, 'min_ch...
iteration                                                    49
train_time                                             5.513026
Name: 0, dtype: object
