#### 针对直播中提到的模型融合方法（stacking 和 blending）的使用说明
直播链接：[Datawhale 零基础入门金融风控 建模调参&模型融合](https://tianchi.aliyun.com/course/live?liveId=41206)
> 优点：可以直接使用xgb、lgb原生库进行建模，效率远高于sklearn接口下的建模

### 使用heamy模块进行模型在线融合

In [1]:
import pandas as pd
import numpy as np
import warnings
import os
warnings.filterwarnings('ignore')

In [2]:
# import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Seaborn / matplotlib display settings.
#
# The theme is configured in ONE call on purpose: every sns.set() call
# re-applies the default style ("darkgrid") and context ("notebook") unless
# told otherwise, so a trailing sns.set(font=...) would silently undo an
# earlier set_style("whitegrid") / set_context("talk").
#   styles:   darkgrid (default), whitegrid, dark, white, ticks
#   contexts: paper, notebook (default), talk, poster (smallest to largest)
sns.set(context='talk', style='whitegrid', font='SimHei')
# Chinese font (SimHei). Applied after sns.set() so the theme cannot
# overwrite it (NOTE(review): seaborn's style dict is expected to carry its
# own font.sans-serif list - confirm against the installed version).
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign '-' from rendering as a box when figures are saved.
plt.rcParams['axes.unicode_minus'] = False

### 数据预操作 

In [3]:
# reduce_mem_usage 函数通过调整数据类型，帮助我们减少数据在内存中占用的空间
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Column handling:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min and
      max (bounds are inclusive, so e.g. -128..127 fits ``int8``).
    * Float columns are cast to ``float16`` or ``float32`` when min and max
      fit that type's range, otherwise to ``float64``. Only the range is
      checked, so the narrower float types can lose precision.
    * Every other dtype (bool, datetime, timedelta, categorical, pandas
      extension dtypes) is left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``df`` object, after the dtype changes.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types, narrowest first, keyed by numpy dtype kind.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns can be range-checked below;
        # bool, datetime, categorical and extension dtypes are skipped so
        # they are neither corrupted nor cause a comparison error.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (all-missing column).
                df[col] = df[col].astype(np.float64)
        else:
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
# Load the GBK-encoded CSV and pass it straight through reduce_mem_usage.
df_data = reduce_mem_usage(
    pd.read_csv("dataset/data_20200922_V1.csv", encoding='gbk')
)

Memory usage of dataframe is 328.06 MB
Memory usage after optimization is: 80.11 MB
Decreased by 75.6%


## 模型融合 

In [5]:
"""建立模型：【模型参数：xgb-->鱼佬baseline，lgb --> 贝叶斯调参】"""
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb

def xgb_model(X_train, y_train, X_test, y_test=None):
    """Train an XGBoost model through the native API and predict on ``X_test``.

    20% of the supplied training data is held out as a validation set. It
    drives early stopping and its AUC is printed. The split uses the same
    fixed seed as the booster so repeated runs give the same result.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict on.
        y_test: Unused; presumably present to match the estimator-function
            signature heamy calls - verify against the caller.

    Returns:
        Predictions for ``X_test`` from the best early-stopped iteration.
    """
    # Imported locally so the function does not rely on a later notebook
    # cell having already imported it.
    from sklearn.model_selection import train_test_split

    seed = 2020
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=seed)
    train_matrix = xgb.DMatrix(X_train_split, label=y_train_split)
    valid_matrix = xgb.DMatrix(X_val, label=y_val)
    test_matrix = xgb.DMatrix(X_test)

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.04,
        'tree_method': 'exact',
        'seed': seed,
        'n_jobs': -1,
        "silent": True,
    }
    # The last entry of the watchlist ('eval') is the one used for early stopping.
    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

    model = xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist,
                      verbose_eval=200, early_stopping_rounds=200)

    # Score on the held-out validation set.
    # NOTE(review): newer xgboost releases replace ntree_limit /
    # best_ntree_limit with iteration_range - confirm against the installed version.
    val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后xgboost单模型在验证集上的AUC：{}'.format(roc_auc))

    # Predict on the test set.
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

    return test_pred
    

def lgb_model(X_train, y_train, X_test, y_test=None):
    """Train a LightGBM model through the native API and predict on ``X_test``.

    20% of the supplied training data is held out as a validation set. It
    drives early stopping and its AUC is printed. The split uses the same
    fixed seed as the booster so repeated runs give the same result.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict on.
        y_test: Unused; presumably present to match the estimator-function
            signature heamy calls - verify against the caller.

    Returns:
        Predictions for ``X_test`` from the best early-stopped iteration.
    """
    # Imported locally so the function does not rely on a later notebook
    # cell having already imported it.
    from sklearn.model_selection import train_test_split

    seed = 2020
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=seed)
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    # Tuned parameters (the notebook attributes them to Bayesian optimisation).
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.01,
        'min_child_weight': 0.32,
        'num_leaves': 14,
        'max_depth': 4,
        'feature_fraction': 0.81,
        'bagging_fraction': 0.61,
        'bagging_freq': 9,
        'min_data_in_leaf': 13,
        'min_split_gain': 0.27,
        'reg_alpha': 9.58,
        'reg_lambda': 4.62,
        'seed': seed,
        'n_jobs': -1,
        'silent': True,
        'verbose': -1,
    }

    # NOTE(review): newer LightGBM releases expect early stopping and logging
    # to be passed as callbacks instead of these keyword arguments - confirm
    # against the installed version.
    model = lgb.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
                      verbose_eval=500, early_stopping_rounds=500)

    # Score on the held-out validation set.
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
    roc_auc = metrics.auc(fpr, tpr)
    print('调参后lightgbm单模型在验证集上的AUC：{}'.format(roc_auc))

    # Predict on the test set.
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    return test_pred

#### 基于模型层面的融合 

In [6]:
"""对训练集数据进行划分，分成训练集和验证集，并进行相应的操作"""
from sklearn.model_selection import train_test_split

# Dataset setup: split the combined frame back into train/test features
# (identifier, date, label and split-marker columns removed) and the train label.
X_train = df_data[df_data['sample'] == 'train'].drop(
    columns=['id', 'issueDate', 'isDefault', 'sample'])
X_test = df_data[df_data['sample'] == 'test'].drop(
    columns=['id', 'issueDate', 'isDefault', 'sample'])
y_train = df_data[df_data['sample'] == 'train']['isDefault']
# 数据集划分
# X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)


In [7]:
from heamy.dataset import Dataset
from heamy.estimator import Classifier

# Bundle the train features/labels and test features into one heamy Dataset
# shared by both base learners.
model_dataset = Dataset(X_train=X_train, y_train=y_train, X_test=X_test)
# Wrap the two estimator functions as named heamy classifiers; both are
# created with use_cache=False.
model_xgb = Classifier(dataset=model_dataset, estimator=xgb_model, name='xgb', use_cache=False)
model_lgb = Classifier(dataset=model_dataset, estimator=lgb_model, name='lgb', use_cache=False)

#### 使用stacking 方法进行模型融合 

In [8]:
from heamy.pipeline import ModelsPipeline

# Group both base learners into a single pipeline; the bare name on the last
# line makes the notebook display the object's repr.
pipeline = ModelsPipeline(model_xgb, model_lgb)
pipeline

<heamy.pipeline.ModelsPipeline at 0x211f2896cc8>

In [9]:
# Build the first-layer features. k (default 5) is the number of
# cross-validation folds; with full_test=True the base learners are trained on
# the whole training set and then predict the test set to produce its new features.
stack_ds = pipeline.stack(k=5, seed=111, full_test=True)

[0]	train-auc:0.69834	eval-auc:0.696377
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.734398	eval-auc:0.726436
[400]	train-auc:0.742841	eval-auc:0.730239
[600]	train-auc:0.748613	eval-auc:0.731679
[800]	train-auc:0.753368	eval-auc:0.732525
[1000]	train-auc:0.757552	eval-auc:0.732917
[1200]	train-auc:0.761406	eval-auc:0.733174
[1400]	train-auc:0.764981	eval-auc:0.733355
[1600]	train-auc:0.768341	eval-auc:0.73344
[1800]	train-auc:0.771698	eval-auc:0.73353
[2000]	train-auc:0.774873	eval-auc:0.733519
Stopping. Best iteration:
[1825]	train-auc:0.772115	eval-auc:0.733588

调参后xgboost单模型在验证集上的AUC：0.733587776147621
[0]	train-auc:0.6978	eval-auc:0.694898
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.734141	eval-auc:0.727192
[400]	train-auc:0.742663	eval-auc:0.730949
[600]

In [10]:
from sklearn.linear_model import LogisticRegression
# Second layer: logistic regression fitted on the stacked first-layer features.
# heamy receives the estimator class plus its parameters, so no standalone
# LogisticRegression instance needs to be created here.
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
# Predictions for the test set
test_pred = stacker.predict()
test_pred

array([0.08922319, 0.30949224, 0.6613638 , ..., 0.12711087, 0.22912402,
       0.0694135 ])

In [11]:
"""生成提交格式的DataFrame"""
# Build the submission frame: the ids of the test rows paired with the
# stacking predictions held in test_pred.
df_result = pd.DataFrame({'id': df_data.loc[df_data['sample']=='test', 'id'].values, 'isDefault': test_pred})
# Preview the first 20 rows ordered by id (display only; df_result itself is not reordered).
df_result.sort_values(by='id').head(20)

Unnamed: 0,id,isDefault
0,800000,0.089223
1,800001,0.309492
2,800002,0.661364
3,800003,0.273962
4,800004,0.364396
5,800005,0.068242
6,800006,0.269167
7,800007,0.071079
8,800008,0.746882
9,800009,0.074709


In [12]:
"""保存数据用于预测建模"""
# Write the stacking submission to CSV, GBK-encoded and without the index column.
df_result.to_csv('dataset/submission_data_stacking_model_20200924_V1_5folds.csv', encoding='gbk', index=False)

#### 使用blending方法进行模型融合

In [13]:
# Build the first-layer features: split the training set 8:2, using 80% to
# train the base learners and the remaining 20% to build the new features.
blend_ds = pipeline.blend(proportion=0.2,seed=111)
# Second layer: logistic regression fitted on the blended features.
blender = Classifier(dataset=blend_ds, estimator=LogisticRegression, parameters={'solver': 'lbfgs'})
# Predictions for the test set
test_pred = blender.predict()
test_pred

[0]	train-auc:0.696733	eval-auc:0.696829
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.733312	eval-auc:0.729499
[400]	train-auc:0.74144	eval-auc:0.73372
[600]	train-auc:0.746949	eval-auc:0.73543
[800]	train-auc:0.751826	eval-auc:0.736586
[1000]	train-auc:0.756271	eval-auc:0.737348
[1200]	train-auc:0.760104	eval-auc:0.737686
[1400]	train-auc:0.763704	eval-auc:0.737969
[1600]	train-auc:0.767129	eval-auc:0.738104
[1800]	train-auc:0.770353	eval-auc:0.738215
[2000]	train-auc:0.773465	eval-auc:0.738211
Stopping. Best iteration:
[1844]	train-auc:0.771048	eval-auc:0.73825

调参后xgboost单模型在验证集上的AUC：0.7382499110600509
Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.72468	valid_1's auc: 0.723239
[1000]	training's auc: 0.730817	valid_1's auc: 0.728304
[1500]	training's auc: 0.734481	valid_1's auc: 0.730946
[2000]	training's auc: 0.737013	valid_1's au

array([0.0923473 , 0.28176227, 0.59130493, ..., 0.1341528 , 0.23243946,
       0.06736474])

In [14]:
"""生成提交格式的DataFrame"""
# Build the submission frame: the ids of the test rows paired with the
# blending predictions held in test_pred.
df_result = pd.DataFrame({'id': df_data.loc[df_data['sample']=='test', 'id'].values, 'isDefault': test_pred})
# Preview the first rows ordered by id (display only; df_result itself is not reordered).
df_result.sort_values(by='id').head()

Unnamed: 0,id,isDefault
0,800000,0.092347
1,800001,0.281762
2,800002,0.591305
3,800003,0.23581
4,800004,0.368891


In [15]:
"""保存数据用于预测建模"""
# Write the blending submission to CSV, GBK-encoded and without the index column.
df_result.to_csv('dataset/submission_data_blending_model_20200924_V1.csv', encoding='gbk', index=False)