## 集成学习算法  

集成学习通过构建多个学习器，并结合多个学习器的结果进行预测，通常可以获得比单一学习器更高的准确性和泛化能力。  
运行时选择【**风控专用镜像**】  

**1. 随机森林（Random Forest）**  
**2. GBDT（Gradient Boosting Decision Tree）**  
**3. XGBoost（eXtreme Gradient Boosting）**  
**4. LightGBM（Light Gradient Boosting Machine）**  


## 1. 随机森林

In [1]:
# 随机森林
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from utils import data_utils

# Load the numeric sample data; the project helper is asked for a 0.2 test
# rate and its result is unpacked into train / test features and labels.
train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)

# Random forest hyper-parameters, collected in one place.
rf_params = {
    'n_estimators': 200,
    'criterion': 'gini',
    'max_depth': 6,
    'min_samples_leaf': 15,
    'bootstrap': True,
    'oob_score': True,
    'random_state': 88,
}

# Train the random forest model.
clf = RandomForestClassifier(**rf_params)
clf.fit(train_x, train_y)

# Score the test set with the second column of predict_proba and report AUC.
rf_test_scores = clf.predict_proba(test_x)[:, 1]
auc_score = roc_auc_score(test_y, rf_test_scores)
print("随机森林模型 AUC: ", auc_score)

随机森林模型 AUC:  0.7455357142857143


  "X does not have valid feature names, but"


## 2. GBDT

In [2]:
# GBDT
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from utils import data_utils

# Load the numeric sample data; the project helper is asked for a 0.2 test
# rate and its result is unpacked into train / test features and labels.
train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)

# Train the GBDT model.
clf = GradientBoostingClassifier(n_estimators=100,
                                 learning_rate=0.1,
                                 subsample=0.9,
                                 max_depth=4,
                                 min_samples_leaf=20,
                                 random_state=88)
clf.fit(train_x, train_y)

# Score the test set with the second column of predict_proba and report AUC.
auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
print("GBDT模型 AUC: ", auc_score)

GBDT模型 AUC:  0.7826140873015873


## 3. XGBoost

In [3]:
#import shap
import numpy as np
import pandas as pd
import xgboost as xgb
import bayes_opt as bo
import sklearn.model_selection as sk_ms
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import roc_auc_score
from utils import data_utils

In [4]:
# Determine the optimal number of trees (boosting rounds)
def xgb_cv(param, x, y, num_boost_round=10000):
    """
    Run ``xgb.cv`` and return the length of its evaluation history.

    :param param: XGBoost parameter dict, passed to ``xgb.cv`` unchanged
    :param x: feature data used to build the ``DMatrix``
    :param y: labels used to build the ``DMatrix``
    :param num_boost_round: upper bound on boosting rounds given to ``xgb.cv``
    :return: number of rows (``shape[0]``) of the result returned by
        ``xgb.cv``, which is run with ``early_stopping_rounds=30``
    """
    cv_history = xgb.cv(
        param,
        xgb.DMatrix(x, label=y),
        num_boost_round=num_boost_round,
        early_stopping_rounds=30,
    )
    return cv_history.shape[0]

In [5]:
def train_xgb(params, x_train, y_train, x_test=None, y_test=None, num_boost_round=10000, early_stopping_rounds=30, verbose_eval=50):
    """
    Train an XGBoost model with ``xgb.train``.

    Two modes, selected by ``x_test``:

    * ``x_test`` given: ``(x_test, y_test)`` is used as the evaluation set
      named ``'test'`` and ``early_stopping_rounds`` is forwarded to
      ``xgb.train``.
    * ``x_test`` is None: the number of rounds is taken from ``xgb_cv`` on
      the training data, no evaluation set is used and early stopping is
      disabled.

    :param params: XGBoost parameter dict
    :param x_train: training features
    :param y_train: training labels
    :param x_test: optional evaluation features
    :param y_test: optional evaluation labels
    :param num_boost_round: maximum number of boosting rounds; in the
        no-test-set mode it caps the cross-validation run
    :param early_stopping_rounds: forwarded to ``xgb.train`` when a test set
        is given
    :param verbose_eval: forwarded to ``xgb.train``
    :return: the object returned by ``xgb.train``
    """
    dtrain = xgb.DMatrix(x_train, label=y_train)
    if x_test is None:
        # Forward the caller's cap; previously xgb_cv always ran with its own
        # default and silently ignored num_boost_round.
        num_boost_round = xgb_cv(params, x_train, y_train, num_boost_round=num_boost_round)
        # Without an evaluation set there is nothing to early-stop on.
        early_stopping_rounds = None
        eval_sets = ()
    else:
        dtest = xgb.DMatrix(x_test, label=y_test)
        eval_sets = [(dtest, 'test')]
    model = xgb.train(params, dtrain, num_boost_round, evals=eval_sets, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval)
    return model

In [6]:
def xgboost_grid_search(params_space, x_train, y_train, x_test=None, y_test=None, num_boost_round=10000):
    """
    Grid search over ``params_space`` and return the best parameter dict.

    Every combination produced by ``ParameterGrid(params_space)`` is trained
    with ``train_xgb`` and scored by ``roc_auc_score`` on the test data. A
    table with one row per combination (AUC first) is printed.

    ``eval_metric='auc'`` and ``random_state=1`` are written into every
    candidate dict, so the returned dict contains them as well.

    NOTE(review): this function does not set ``objective``; unless
    ``params_space`` supplies one, XGBoost's default objective is used --
    confirm that this is intended for the classification use case.

    :param params_space: mapping of parameter name -> list of candidate values
    :param x_train: training features
    :param y_train: training labels
    :param x_test: optional test features; when None, 20% of the training
        data is split off with ``train_test_split(..., random_state=1)``
    :param y_test: optional test labels
    :param num_boost_round: maximum boosting rounds per candidate
    :return: the candidate parameter dict with the highest test AUC
    """
    # Split off a hold-out set when the caller did not provide one.
    if x_test is None:
        x_train, x_test, y_train, y_test = sk_ms.train_test_split(x_train, y_train, test_size=0.2, random_state=1)
    # The prediction matrix does not depend on the candidate; build it once.
    dtest = xgb.DMatrix(x_test)
    score_list = []
    test_params = list(ParameterGrid(params_space))
    for params_try in test_params:
        params_try['eval_metric'] = "auc"
        params_try['random_state'] = 1
        clf_obj = train_xgb(params_try, x_train, y_train, x_test, y_test, num_boost_round=num_boost_round,
                            early_stopping_rounds=30, verbose_eval=0)
        score_list.append(roc_auc_score(y_test, clf_obj.predict(dtest)))
    # One row per candidate. Keying the table by score (as before) dropped
    # every candidate whose AUC tied with another one.
    result = pd.DataFrame(test_params)
    result.insert(0, 'auc', score_list)
    print(result)
    # Pick the parameter combination that performs best on the test set.
    params = test_params[np.array(score_list).argmax()]
    return params

In [7]:
def xgboost_bayesian_optimization(params_space, x_train, y_train, x_test=None, y_test=None, num_boost_round=10000, nfold=5, init_points=2, n_iter=5, verbose_eval=0, early_stopping_rounds=30):
    """
    Tune XGBoost hyper-parameters with ``bo.BayesianOptimization``.

    The optimizer maximizes an AUC value: the last ``test-auc-mean`` entry of
    ``xgb.cv`` when no test set is given, otherwise ``roc_auc_score`` on
    ``(x_test, y_test)`` for a model trained with ``train_xgb``.

    :param params_space: bounds handed to ``bo.BayesianOptimization``; the
        objective expects the names eta, gamma, max_depth, min_child_weight,
        subsample and colsample_bytree
    :param x_train: training features
    :param y_train: training labels
    :param x_test: optional test features
    :param y_test: optional test labels
    :param num_boost_round: maximum boosting rounds per trial
    :param nfold: number of folds for ``xgb.cv`` (no-test-set mode only)
    :param init_points: forwarded to ``maximize``
    :param n_iter: forwarded to ``maximize``
    :param verbose_eval: forwarded to ``train_xgb`` (test-set mode only)
    :param early_stopping_rounds: early-stopping patience for each trial
    :return: best parameter dict, with max_depth / min_child_weight cast to
        int and the fixed settings (metric, booster, objective, seed) added
    """
    # Settings that are identical for every trial and for the final result.
    fixed_params = {
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'seed': 1,
    }

    # Objective handed to the optimizer: maps one sampled point to an AUC.
    def xgboost_cv_for_bo(eta, gamma, max_depth, min_child_weight,
                          subsample, colsample_bytree):
        trial_params = dict(
            fixed_params,
            eta=eta,
            gamma=gamma,
            max_depth=int(max_depth),  # sampled as float, cast to int
            min_child_weight=int(min_child_weight),
            subsample=subsample,
            colsample_bytree=colsample_bytree,
        )
        if x_test is not None:
            booster = train_xgb(trial_params, x_train, y_train, x_test, y_test,
                                num_boost_round=num_boost_round,
                                early_stopping_rounds=early_stopping_rounds,
                                verbose_eval=verbose_eval)
            return roc_auc_score(y_test, booster.predict(xgb.DMatrix(x_test)))

        cv_history = xgb.cv(trial_params,
                            xgb.DMatrix(x_train, label=y_train),
                            nfold=nfold,
                            metrics='auc',
                            early_stopping_rounds=early_stopping_rounds,
                            num_boost_round=num_boost_round)
        return cv_history['test-auc-mean'].iloc[-1]

    # Run the search over the value ranges given in params_space.
    optimizer = bo.BayesianOptimization(xgboost_cv_for_bo, params_space, random_state=1)
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    best_params = optimizer.max['params']
    for int_name in ('max_depth', 'min_child_weight'):
        best_params[int_name] = int(best_params[int_name])
    best_params.update(fixed_params)
    return best_params


In [8]:
# Load the numeric sample data; the project helper is asked for a 0.2 test
# rate and its result is unpacked into train / test features and labels.
train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)

# Baseline: hand-picked ("empirical") parameters.
exp_params = {
    'eval_metric': 'auc',
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eta': 0.1,
    'gamma': 0.01,
    'max_depth': 4,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'seed': 1
}
final_xgb_model = train_xgb(exp_params, train_x, train_y, test_x, test_y)
auc_score = roc_auc_score(test_y, final_xgb_model.predict(xgb.DMatrix(test_x)))
print("经验参数模型AUC: ", auc_score)

# Hyper-parameter tuning.
# NOTE(review): the printed labels below say "random search", but the tuner
# actually used is the Bayesian or the grid search selected here.
choose_tuner = 'bayesian'  # bayesian grid_search
if choose_tuner == 'grid_search':
    params_test = {
        # Same objective as the empirical and Bayesian paths; the grid-search
        # helper does not set one itself.
        'objective': ['binary:logistic'],
        'learning_rate': [0.1, 0.15],
        'gamma': [0.01, 0],
        'max_depth': [4, 3],
        'min_child_weight': [1, 2],
        'subsample': [0.95, 1],
        'colsample_bytree': [1]
    }
    optimal_params = xgboost_grid_search(params_test, train_x, train_y, test_x, test_y)
elif choose_tuner == 'bayesian':
    # Bayesian tuning: each entry is a (low, high) range.
    params_test = {'eta': (0.05, 0.2),
                   'gamma': (0.005, 0.05),
                   'max_depth': (3, 5),
                   'min_child_weight': (0, 3),
                   'subsample': (0.9, 1.0),
                   'colsample_bytree': (0.9, 1.0)}
    optimal_params = xgboost_bayesian_optimization(params_test, train_x, train_y, test_x, test_y, init_points=5, n_iter=8)
else:
    # Fail here with a clear message instead of a NameError on
    # optimal_params further down.
    raise ValueError("choose_tuner must be 'grid_search' or 'bayesian', got %r" % (choose_tuner,))

print("随机搜索调参最优参数: ", optimal_params)

# Retrain with the tuned parameters and report the test AUC.
final_xgb_model = train_xgb(optimal_params, train_x, train_y, test_x, test_y)
auc_score = roc_auc_score(test_y, final_xgb_model.predict(xgb.DMatrix(test_x)))
print("随机搜索调参模型AUC: ", auc_score)

# Saving and loading models with pickle
def save_model_as_pkl(model, path):
    """
    Pickle ``model`` (protocol 2) and write it to the file at ``path``.

    :param model: the trained model to persist
    :param path: destination file path; opened in binary write mode
    """
    import pickle
    with open(path, 'wb') as out_file:
        out_file.write(pickle.dumps(model, protocol=2))

# Save the final model to disk
save_model_as_pkl(final_xgb_model, "./data/xgb_model.pkl")

# SHAP computation (disabled; the shap import is commented out as well)
#explainer = shap.TreeExplainer(final_xgb_model)
#shap_values = explainer.shap_values(train_x)
# SHAP visualization
#shap.summary_plot(shap_values, train_x, max_display=5)


[0]	test-auc:0.71255
[50]	test-auc:0.76736
[100]	test-auc:0.78212
[150]	test-auc:0.79377
[200]	test-auc:0.79836
[249]	test-auc:0.80109
经验参数模型AUC:  0.8002232142857143
|   iter    |  target   | colsam... |    eta    |   gamma   | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
| 1         | 0.8073    | 0.9417    | 0.158     | 0.005005  | 3.605     | 0.4403    | 0.9092    |
| 2         | 0.8025    | 0.9186    | 0.1018    | 0.02285   | 4.078     | 1.258     | 0.9685    |
| 3         | 0.7852    | 0.9204    | 0.1817    | 0.006232  | 4.341     | 1.252     | 0.9559    |
| 4         | 0.7953    | 0.914     | 0.07972   | 0.04103   | 4.937     | 0.9403    | 0.969

## 4. LightGBM

In [9]:
import lightgbm as lgb
from lightgbm import early_stopping 
from sklearn.metrics import roc_auc_score
from utils import data_utils

# Load the numeric sample data; the project helper is asked for a 0.2 test
# rate and its result is unpacked into train / test features and labels.
train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)

# LightGBM classifier settings, collected in one place.
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'max_depth': 3,
    'n_estimators': 1000,
    'subsample': 1,
    'colsample_bytree': 1,
    'n_jobs': 4,
}
clf = lgb.LGBMClassifier(**lgb_params)

# Early-stopping callback with a patience of 30 rounds.
callbacks = [early_stopping(stopping_rounds=30)]

# Fit with the test split as the evaluation set, tracking AUC.
lgb_model = clf.fit(
    train_x,
    train_y,
    eval_set=[(test_x, test_y)],
    eval_metric='auc',
    callbacks=callbacks,
)

# Score the test set with the second column of predict_proba and report AUC.
lgb_test_scores = lgb_model.predict_proba(test_x)[:, 1]
auc_score = roc_auc_score(test_y, lgb_test_scores)
print("LightGBM模型 AUC: ", auc_score)


[LightGBM] [Info] Number of positive: 244, number of negative: 556
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 396
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.305000 -> initscore=-0.823600
[LightGBM] [Info] Start training from score -0.823600
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[95]	valid_0's auc: 0.800223	valid_0's binary_logloss: 0.481291
LightGBM模型 AUC:  0.8002232142857142
