In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
                            confusion_matrix, f1_score, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
%matplotlib inline
plt.rc('font', family='SimHei', size=14)
plt.rcParams['axes.unicode_minus']=False
%config InlineBackend.figure_format = 'retina'


# Load the pre-processed feature table.
# The header holds a few Chinese column names; the garbled names in this cell's
# recorded preview are what GBK bytes look like when decoded as Latin-1, which
# is what encoding='unicode_escape' does. gb18030 is a superset of GBK, decodes
# those names correctly, and leaves the ASCII columns unchanged.
data_del = pd.read_csv('data_processed_2.csv', encoding='gb18030')
data_del.head()

Unnamed: 0,custid,low_volume_percent,middle_volume_percent,take_amount_in_later_12_month_highest,trans_amount_increase_rate_lately,trans_activity_month,trans_activity_day,transd_mcc,trans_days_interval_filter,trans_days_interval,...,ÃÃ¤ÃÃ»Â³ÃÃÃ,ÃÃ½ÃÃÂ³ÃÃÃ,ÃÂ»ÃÃÂ³ÃÃÃ,latest_query_time_year,latest_query_time_month,latest_query_time_weekday,loans_latest_time_year,loans_latest_time_month,loans_latest_time_weekday,status
0,2791858,0.01,0.99,0,0.9,0.55,0.313,17.0,27.0,26.0,...,0,0,1,2018,4,2,2018,4,3,1
1,534047,0.02,0.94,2000,1.28,1.0,0.458,19.0,30.0,14.0,...,0,0,1,2018,5,3,2018,5,5,0
2,2849787,0.04,0.96,0,1.0,1.0,0.114,13.0,68.0,22.0,...,0,0,1,2018,5,5,2018,5,1,1
3,1809708,0.0,0.96,2000,0.13,0.57,0.777,22.0,14.0,6.0,...,0,1,0,2018,5,5,2018,5,3,0
4,2499829,0.01,0.99,0,0.46,1.0,0.175,13.0,66.0,42.0,...,0,0,1,2018,4,6,2018,1,6,1


In [2]:
# Hold out 30% of the rows for testing; every column except 'status' is a
# feature and 'status' is the label. The fixed seed keeps the split repeatable.
# NOTE(review): the preview of data_del lists 'Unnamed: 0' and 'custid' columns
# and only 'status' is dropped here, so both remain in the feature matrix --
# confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    data_del.drop(columns=['status']).values,
    data_del['status'].values,
    test_size=0.3,
    random_state=2018,
)

# Show the shapes of the four arrays as a quick sanity check.
[part.shape for part in (X_train, y_train, X_test, y_test)]

[(3327, 92), (3327,), (1427, 92), (1427,)]

模型融合(Stacking)   
  
参考资料：https://blog.csdn.net/bear507/article/details/86726962  
https://blog.csdn.net/wstcjf/article/details/77989963  

In [3]:
def get_stacking_data(models, X_train, y_train, X_test, y_test, k=5, use_proba=False):
    '''Build the inputs of the next (meta) model from K-fold predictions.

    For every model, the training rows are split into ``k`` shuffled folds
    (fixed seed 2018). The model is fitted on k-1 folds and predicts the
    held-out fold, so each training row gets a prediction from a model that
    never saw it. The same fitted model also predicts the whole test set; the
    ``k`` test predictions are averaged.

    models: list of estimators exposing ``fit`` and ``predict`` (and
        ``predict_proba`` when ``use_proba`` is True). They are fitted in
        place, so each one is left fitted on its last fold when this returns.
    X_train: training features, array-like of shape (n_train, n_features)
    y_train: training labels, array-like of shape (n_train,)
    X_test: test features, array-like of shape (n_test, n_features)
    y_test: test labels; accepted for call compatibility but never read
    k: number of cross-validation folds
    use_proba: when True, stack column 1 of ``predict_proba`` (the second
        class in ``model.classes_``, meant for binary problems) instead of
        hard labels. The default False keeps the hard-label behaviour.
    return: next_train: array (n_train, len(models)) of out-of-fold predictions
            next_test: array (n_test, len(models)) of fold-averaged predictions
    '''
    # Positional indexing below requires ndarrays; DataFrame/Series input would
    # otherwise be indexed by label and fail.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    def _predict(fitted, X):
        # Single switch between hard labels and class-1 probabilities.
        if use_proba:
            return fitted.predict_proba(X)[:, 1]
        return fitted.predict(X)

    kfold = KFold(n_splits=k, random_state=2018, shuffle=True)
    next_train = np.zeros((X_train.shape[0], len(models)))
    next_test = np.zeros((X_test.shape[0], len(models)))

    for j, model in enumerate(models):
        # One column per fold; averaged after the fold loop.
        next_test_temp = np.zeros((X_test.shape[0], k))
        for i, (train_index, val_index) in enumerate(kfold.split(X_train)):
            model.fit(X_train[train_index], y_train[train_index])
            next_train[val_index, j] = _predict(model, X_train[val_index])
            next_test_temp[:, i] = _predict(model, X_test)
        next_test[:, j] = next_test_temp.mean(axis=1)

    return next_train, next_test

In [4]:
# Candidate estimators, all seeded with 2018 so results are repeatable.

# Linear and kernel models
log = LogisticRegression(random_state=2018, max_iter=1000)
svc = SVC(random_state=2018, probability=True)

# Single decision tree
tree = DecisionTreeClassifier(random_state=2018)

# Tree ensembles: bagging and boosting
rnd_clf = RandomForestClassifier(random_state=2018)
gbdt = GradientBoostingClassifier(random_state=2018)
xgb = XGBClassifier(random_state=2018)
lgbm = LGBMClassifier(random_state=2018)

In [6]:
# First-level learners whose predictions become the meta-model's features
# (lgbm is left out of the list for now).
base_models = [
    rnd_clf,
    gbdt,
    log,
]
next_train, next_test = get_stacking_data(
    models=base_models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    k=10,
)

In [7]:
# Second-level (meta) learner. It is fitted on the out-of-fold predictions for
# the TRAINING rows and their labels; next_test / y_test stay unseen so they
# remain a valid hold-out for evaluating the stacked model.
stacking_model = XGBClassifier(random_state=2018)
stacking_model.fit(next_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=2018, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)