In [1]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings

warnings.filterwarnings('ignore')

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# 自定义模块
from telcoFunc import *

# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc

In [2]:
# 读取数据
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# 标注连续/离散字段
# 离散字段
category_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# 连续字段
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# 标签
target = 'Churn'

# ID列
ID_col = 'customerID'

# 验证是否划分能完全
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# 连续字段转化
tcc['TotalCharges'] = (
    tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# 缺失值填补
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# 标签值手动转化
tcc['Churn'].replace(to_replace='Yes', value=1, inplace=True)
tcc['Churn'].replace(to_replace='No', value=0, inplace=True)

In [3]:
features = tcc.drop(columns=[ID_col, target]).copy()
labels = tcc['Churn'].copy()

In [4]:
# 划分训练集和测试集
train, test = train_test_split(tcc, random_state=22)

X_train = train.drop(columns=[ID_col, target]).copy()
X_test = test.drop(columns=[ID_col, target]).copy()

y_train = train['Churn'].copy()
y_test = test['Churn'].copy()

X_train_seq = pd.DataFrame()
X_test_seq = pd.DataFrame()

# 年份衍生
X_train_seq['tenure_year'] = ((72 - X_train['tenure']) // 12) + 2014
X_test_seq['tenure_year'] = ((72 - X_test['tenure']) // 12) + 2014

# 月份衍生
X_train_seq['tenure_month'] = (72 - X_train['tenure']) % 12 + 1
X_test_seq['tenure_month'] = (72 - X_test['tenure']) % 12 + 1

# 季度衍生
X_train_seq['tenure_quarter'] = ((X_train_seq['tenure_month'] - 1) // 3) + 1
X_test_seq['tenure_quarter'] = ((X_test_seq['tenure_month'] - 1) // 3) + 1

# 独热编码
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

seq_new = list(X_train_seq.columns)

# 创建带有列名称的独热编码之后的df
X_train_seq = pd.DataFrame(
    enc.transform(X_train_seq).toarray(), columns=cate_colName(enc, seq_new, drop=None)
)

X_test_seq = pd.DataFrame(
    enc.transform(X_test_seq).toarray(), columns=cate_colName(enc, seq_new, drop=None)
)

# 调整index
X_train_seq.index = X_train.index
X_test_seq.index = X_test.index

In [5]:
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])

X_train_OE = pd.DataFrame(
    ord_enc.transform(X_train[category_cols]), columns=category_cols
)
X_train_OE.index = X_train.index
X_train_OE = pd.concat([X_train_OE, X_train[numeric_cols]], axis=1)

X_test_OE = pd.DataFrame(
    ord_enc.transform(X_test[category_cols]), columns=category_cols
)
X_test_OE.index = X_test.index
X_test_OE = pd.concat([X_test_OE, X_test[numeric_cols]], axis=1)

In [6]:
# 本节新增第三方库
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [7]:
class VotingClassifier_threshold(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, estimators, voting="hard", weights=None, thr=0.5):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        self.thr = thr

    def fit(self, X, y):
        VC = VotingClassifier(
            estimators=self.estimators, voting=self.voting, weights=self.weights
        )

        VC.fit(X, y)
        self.clf = VC

        return self

    def predict_proba(self, X):
        if self.voting == "soft":
            res_proba = self.clf.predict_proba(X)
        else:
            res_proba = None
        return res_proba

    def predict(self, X):
        if self.voting == "soft":
            res = (self.clf.predict_proba(X)[:, 1] >= self.thr) * 1
        else:
            res = self.clf.predict(X)
        return res

    def score(self, X, y):
        acc = accuracy_score(self.predict(X), y)
        return acc

In [8]:
# 实例化KFold评估器
kf = KFold(n_splits=5, random_state=12, shuffle=True)

# 重置训练集和测试集的index
X_train_OE = X_train_OE.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

train_part_index_l = []
eval_index_l = []

for train_part_index, eval_index in kf.split(X_train_OE, y_train):
    train_part_index_l.append(train_part_index)
    eval_index_l.append(eval_index)

# 训练集特征
X_train1 = X_train_OE.loc[train_part_index_l[0]]
X_train2 = X_train_OE.loc[train_part_index_l[1]]
X_train3 = X_train_OE.loc[train_part_index_l[2]]
X_train4 = X_train_OE.loc[train_part_index_l[3]]
X_train5 = X_train_OE.loc[train_part_index_l[4]]

# 验证集特征
X_eval1 = X_train_OE.loc[eval_index_l[0]]
X_eval2 = X_train_OE.loc[eval_index_l[1]]
X_eval3 = X_train_OE.loc[eval_index_l[2]]
X_eval4 = X_train_OE.loc[eval_index_l[3]]
X_eval5 = X_train_OE.loc[eval_index_l[4]]

# 训练集标签
y_train1 = y_train.loc[train_part_index_l[0]]
y_train2 = y_train.loc[train_part_index_l[1]]
y_train3 = y_train.loc[train_part_index_l[2]]
y_train4 = y_train.loc[train_part_index_l[3]]
y_train5 = y_train.loc[train_part_index_l[4]]

# 验证集标签
y_eval1 = y_train.loc[eval_index_l[0]]
y_eval2 = y_train.loc[eval_index_l[1]]
y_eval3 = y_train.loc[eval_index_l[2]]
y_eval4 = y_train.loc[eval_index_l[3]]
y_eval5 = y_train.loc[eval_index_l[4]]

train_set = [
    (X_train1, y_train1),
    (X_train2, y_train2),
    (X_train3, y_train3),
    (X_train4, y_train4),
    (X_train5, y_train5),
]

eval_set = [
    (X_eval1, y_eval1),
    (X_eval2, y_eval2),
    (X_eval3, y_eval3),
    (X_eval4, y_eval4),
    (X_eval5, y_eval5),
]

In [9]:
# 随机森林模型组
grid_RF_1 = load("./model/grid_RF_1.joblib")
grid_RF_2 = load("./model/grid_RF_2.joblib")
grid_RF_3 = load("./model/grid_RF_3.joblib")
grid_RF_4 = load("./model/grid_RF_4.joblib")
grid_RF_5 = load("./model/grid_RF_5.joblib")

RF_1 = grid_RF_1.best_estimator_
RF_2 = grid_RF_2.best_estimator_
RF_3 = grid_RF_3.best_estimator_
RF_4 = grid_RF_4.best_estimator_
RF_5 = grid_RF_5.best_estimator_

RF_l = [RF_1, RF_2, RF_3, RF_4, RF_5]

# 决策树模型组
grid_tree_1 = load("./model/grid_tree_1.joblib")
grid_tree_2 = load("./model/grid_tree_2.joblib")
grid_tree_3 = load("./model/grid_tree_3.joblib")
grid_tree_4 = load("./model/grid_tree_4.joblib")
grid_tree_5 = load("./model/grid_tree_5.joblib")

tree_1 = grid_tree_1.best_estimator_
tree_2 = grid_tree_2.best_estimator_
tree_3 = grid_tree_3.best_estimator_
tree_4 = grid_tree_4.best_estimator_
tree_5 = grid_tree_5.best_estimator_

tree_l = [tree_1, tree_2, tree_3, tree_4, tree_5]

# 逻辑回归模型组
grid_lr_1 = load("./model/grid_lr_1.joblib")
grid_lr_2 = load("./model/grid_lr_2.joblib")
grid_lr_3 = load("./model/grid_lr_3.joblib")
grid_lr_4 = load("./model/grid_lr_4.joblib")
grid_lr_5 = load("./model/grid_lr_5.joblib")

lr_1 = grid_lr_1.best_estimator_
lr_2 = grid_lr_2.best_estimator_
lr_3 = grid_lr_3.best_estimator_
lr_4 = grid_lr_4.best_estimator_
lr_5 = grid_lr_5.best_estimator_

lr_l = [lr_1, lr_2, lr_3, lr_4, lr_5]

In [10]:
eval1_predict_proba_RF = pd.Series(
    RF_l[0].predict_proba(X_eval1)[:, 1], index=X_eval1.index
)
eval2_predict_proba_RF = pd.Series(
    RF_l[1].predict_proba(X_eval2)[:, 1], index=X_eval2.index
)
eval3_predict_proba_RF = pd.Series(
    RF_l[2].predict_proba(X_eval3)[:, 1], index=X_eval3.index
)
eval4_predict_proba_RF = pd.Series(
    RF_l[3].predict_proba(X_eval4)[:, 1], index=X_eval4.index
)
eval5_predict_proba_RF = pd.Series(
    RF_l[4].predict_proba(X_eval5)[:, 1], index=X_eval5.index
)

eval_predict_proba_RF = pd.concat(
    [
        eval1_predict_proba_RF,
        eval2_predict_proba_RF,
        eval3_predict_proba_RF,
        eval4_predict_proba_RF,
        eval5_predict_proba_RF,
    ]
).sort_index()

eval1_predict_proba_tree = pd.Series(
    tree_l[0].predict_proba(X_eval1)[:, 1], index=X_eval1.index
)
eval2_predict_proba_tree = pd.Series(
    tree_l[1].predict_proba(X_eval2)[:, 1], index=X_eval2.index
)
eval3_predict_proba_tree = pd.Series(
    tree_l[2].predict_proba(X_eval3)[:, 1], index=X_eval3.index
)
eval4_predict_proba_tree = pd.Series(
    tree_l[3].predict_proba(X_eval4)[:, 1], index=X_eval4.index
)
eval5_predict_proba_tree = pd.Series(
    tree_l[4].predict_proba(X_eval5)[:, 1], index=X_eval5.index
)

eval_predict_proba_tree = pd.concat(
    [
        eval1_predict_proba_tree,
        eval2_predict_proba_tree,
        eval3_predict_proba_tree,
        eval4_predict_proba_tree,
        eval5_predict_proba_tree,
    ]
).sort_index()

eval1_predict_proba_lr = pd.Series(
    lr_l[0].predict_proba(X_eval1)[:, 1], index=X_eval1.index
)
eval2_predict_proba_lr = pd.Series(
    lr_l[1].predict_proba(X_eval2)[:, 1], index=X_eval2.index
)
eval3_predict_proba_lr = pd.Series(
    lr_l[2].predict_proba(X_eval3)[:, 1], index=X_eval3.index
)
eval4_predict_proba_lr = pd.Series(
    lr_l[3].predict_proba(X_eval4)[:, 1], index=X_eval4.index
)
eval5_predict_proba_lr = pd.Series(
    lr_l[4].predict_proba(X_eval5)[:, 1], index=X_eval5.index
)

eval_predict_proba_lr = pd.concat(
    [
        eval1_predict_proba_lr,
        eval2_predict_proba_lr,
        eval3_predict_proba_lr,
        eval4_predict_proba_lr,
        eval5_predict_proba_lr,
    ]
).sort_index()

In [11]:
test_predict_proba_RF = []

for i in range(5):
    test_predict_proba_RF.append(RF_l[i].predict_proba(X_test_OE)[:, 1])

test_predict_proba_RF = np.array(test_predict_proba_RF)
test_predict_proba_RF = test_predict_proba_RF.mean(0)

test_predict_proba_tree = []

for i in range(5):
    test_predict_proba_tree.append(tree_l[i].predict_proba(X_test_OE)[:, 1])

test_predict_proba_tree = np.array(test_predict_proba_tree)
test_predict_proba_tree = test_predict_proba_tree.mean(0)

test_predict_proba_lr = []

for i in range(5):
    test_predict_proba_lr.append(lr_l[i].predict_proba(X_test_OE)[:, 1])

test_predict_proba_lr = np.array(test_predict_proba_lr)
test_predict_proba_lr = test_predict_proba_lr.mean(0)

In [12]:
logistic_search = load("./model/grid_lr_1.joblib")
tree_model = load("./model/tree_model.joblib")
RF_0 = load("./model/RF_0.joblib")

# 训练集上的预测概率
train_prediction1_proba = logistic_search.best_estimator_.predict_proba(X_train_OE)[
    :, 1
]
train_prediction2_proba = tree_model.predict_proba(X_train_OE)[:, 1]
train_prediction3_proba = RF_0.predict_proba(X_train_OE)[:, 1]

# 测试集上的预测概率
test_prediction1_proba = logistic_search.best_estimator_.predict_proba(X_test_OE)[:, 1]
test_prediction2_proba = tree_model.predict_proba(X_test_OE)[:, 1]
test_prediction3_proba = RF_0.predict_proba(X_test_OE)[:, 1]

In [13]:
train_prediction1_proba

array([0.01128869, 0.54147942, 0.14173826, ..., 0.67394159, 0.04530194,
       0.00242249])

In [14]:
train_stack = np.vstack(
    [[train_prediction1_proba], [train_prediction2_proba], [train_prediction3_proba]]
).T

In [15]:
test_stack = np.vstack(
    [[test_prediction1_proba], [test_prediction2_proba], [test_prediction3_proba]]
).T

In [16]:
lr_final = LogisticRegression().fit(train_stack, y_train)
lr_final.score(train_stack, y_train), lr_final.score(test_stack, y_test)

(0.8775085195001894, 0.7927314026121521)

In [17]:
# 训练集上的预测结果
train_prediction1 = logistic_search.best_estimator_.predict(X_train_OE)
train_prediction2 = tree_model.predict(X_train_OE)
train_prediction3 = RF_0.predict(X_train_OE)

# 测试集上的预测结果
test_prediction1 = logistic_search.best_estimator_.predict(X_test_OE)
test_prediction2 = tree_model.predict(X_test_OE)
test_prediction3 = RF_0.predict(X_test_OE)

In [18]:
train_stack_hard = np.vstack(
    [[train_prediction1], [train_prediction2], [train_prediction3]]
).T

In [19]:
test_stack_hard = np.vstack(
    [[test_prediction1], [test_prediction2], [test_prediction3]]
).T

In [20]:
lr_final = LogisticRegression().fit(train_stack_hard, y_train)
lr_final.score(train_stack_hard, y_train), lr_final.score(test_stack_hard, y_test)

(0.8483528966300644, 0.7955706984667802)

In [21]:
# 元学习器训练集
train_stack = pd.DataFrame(
    {
        "train_stack_RF": eval_predict_proba_RF,
        "train_stack_lr": eval_predict_proba_lr,
        "train_stack_tree": eval_predict_proba_tree,
    }
)

train_stack

Unnamed: 0,train_stack_RF,train_stack_lr,train_stack_tree
0,0.044787,0.011289,0.037669
1,0.572187,0.543190,0.787986
2,0.161815,0.151200,0.222819
3,0.250871,0.273393,0.259434
4,0.122533,0.158399,0.107345
...,...,...,...
5277,0.082653,0.062756,0.062959
5278,0.346562,0.346367,0.222819
5279,0.551481,0.688556,0.438538
5280,0.049011,0.050627,0.066419


In [22]:
# 元学习器测试集
test_stack = pd.DataFrame(
    {
        "test_stack_RF": test_predict_proba_RF,
        "test_stack_lr": test_predict_proba_lr,
        "test_stack_tree": test_predict_proba_tree,
    }
)

test_stack

Unnamed: 0,test_stack_RF,test_stack_lr,test_stack_tree
0,0.029220,0.039438,0.046473
1,0.311980,0.238789,0.158900
2,0.016244,0.005101,0.046473
3,0.025769,0.031015,0.046473
4,0.035476,0.059170,0.051573
...,...,...,...
1756,0.193367,0.177800,0.212513
1757,0.048821,0.034432,0.046473
1758,0.145346,0.130264,0.158900
1759,0.530686,0.500205,0.437401


In [23]:
lr_final = LogisticRegression().fit(train_stack, y_train)
lr_final.score(train_stack, y_train), lr_final.score(test_stack, y_test)

(0.8176826959485044, 0.7950028392958546)

In [24]:
from sklearn.ensemble import StackingClassifier

In [25]:
estimators = [
    ("lr", logistic_search.best_estimator_),
    ("tree", tree_model),
    ("rf", RF_0),
]

In [26]:
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

In [27]:
clf.fit(X_train_OE, y_train)

In [28]:
clf.estimators_

[Pipeline(steps=[('columntransformer',
                  ColumnTransformer(transformers=[('cat',
                                                   OneHotEncoder(drop='if_binary'),
                                                   ['gender', 'SeniorCitizen',
                                                    'Partner', 'Dependents',
                                                    'PhoneService',
                                                    'MultipleLines',
                                                    'InternetService',
                                                    'OnlineSecurity',
                                                    'OnlineBackup',
                                                    'DeviceProtection',
                                                    'TechSupport', 'StreamingTV',
                                                    'StreamingMovies',
                                                    'Contract',
                            

In [29]:
clf.final_estimator_

In [30]:
clf.final_estimator_.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [31]:
clf.score(X_train_OE, y_train), clf.score(X_test_OE, y_test)

(0.8256342294585385, 0.7898921067575241)

In [32]:
clf = StackingClassifier(estimators=estimators, stack_method="predict").fit(
    X_train_OE, y_train
)

In [33]:
clf.score(X_train_OE, y_train), clf.score(X_test_OE, y_test)

(0.829420673987126, 0.7876206700738216)

In [35]:
start = time.time()

parameter_space = {
    "cv": range(2, 11),
    "stack_method": ["predict_proba", "decision_function", "predict"],
    "passthrough": [True, False],
}

clf = StackingClassifier(estimators=estimators)
stack_grid = GridSearchCV(clf, parameter_space, n_jobs=-1)

stack_grid.fit(X_train_OE, y_train)

print(time.time() - start)

38.824546098709106


In [36]:
stack_grid.best_params_

{'cv': 2, 'passthrough': True, 'stack_method': 'predict_proba'}

In [37]:
stack_grid.best_score_

0.8065119620997103

In [38]:
stack_grid.score(X_train_OE, y_train), stack_grid.score(X_test_OE, y_test)

(0.8089738735327527, 0.7842135150482681)

In [40]:
start = time.time()

# 设置超参数空间
clf1 = DecisionTreeClassifier()
clf2 = LogisticRegression()
clf3 = RandomForestClassifier()

parameter_space = {
    "cv": range(2, 11),
    "stack_method": ["predict_proba", "decision_function", "predict"],
    "final_estimator": [clf1, clf2, clf3],
    "passthrough": [True, False],
}

# 实例化Stacking评估器
clf = StackingClassifier(estimators=estimators)
stack_grid = GridSearchCV(clf, parameter_space, n_jobs=15)

# 模型训练
stack_grid.fit(X_train_OE, y_train)

print(time.time() - start)

117.01866936683655


In [41]:
stack_grid.best_estimator_

In [42]:
stack_grid.best_params_

{'cv': 2,
 'final_estimator': DecisionTreeClassifier(),
 'passthrough': True,
 'stack_method': 'predict_proba'}

In [43]:
stack_grid.best_score_

0.7370289340901922

In [44]:
stack_grid.score(X_train_OE, y_train), stack_grid.score(X_test_OE, y_test)

(0.8159787959106399, 0.7257240204429302)

In [45]:
final_lr = LogisticRegression(penalty="l1", solver="saga")

In [46]:
clf = StackingClassifier(estimators=estimators, final_estimator=final_lr).fit(
    X_train_OE, y_train
)

In [47]:
clf.score(X_train_OE, y_train), clf.score(X_test_OE, y_test)

(0.8265808405906854, 0.7881885292447472)

In [48]:
final_tree = DecisionTreeClassifier(max_depth=2)

In [49]:
clf = StackingClassifier(estimators=estimators, final_estimator=final_tree).fit(
    X_train_OE, y_train
)

In [50]:
clf.score(X_train_OE, y_train), clf.score(X_test_OE, y_test)

(0.807080651268459, 0.7796706416808632)

In [53]:
class Stacking_tree_Cascade(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(
        self,
        estimators,
        cv=None,
        passthrough=False,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_leaf_nodes=None,
    ):
        self.estimators = estimators
        self.cv = cv
        self.passthrough = passthrough
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.final_estimator = DecisionTreeClassifier(
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            max_leaf_nodes=self.max_leaf_nodes,
        )

    def fit(self, X, y):
        SC = StackingClassifier(
            estimators=self.estimators,
            final_estimator=self.final_estimator,
            cv=self.cv,
            passthrough=self.passthrough,
        )

        SC.fit(X, y)
        self.clf = SC
        self.classes_ = pd.Series(y).unique()
        return self

    def predict_proba(self, X):
        res_proba = self.clf.predict_proba(X)
        return res_proba

    def predict(self, X):
        res = self.clf.predict(X)
        return res

    def score(self, X, y):
        acc = accuracy_score(self.predict(X), y)
        return acc

In [54]:
STC = Stacking_tree_Cascade(estimators).fit(X_train_OE, y_train)

In [55]:
STC.score(X_train_OE, y_train), STC.score(X_test_OE, y_test)

(0.7464975388110564, 0.7081203861442362)

In [56]:
STC = Stacking_tree_Cascade(estimators, max_depth=2).fit(X_train_OE, y_train)

In [57]:
STC.score(X_train_OE, y_train), STC.score(X_test_OE, y_test)

(0.807080651268459, 0.7796706416808632)

In [58]:
start = time.time()

parameter_space = {
    "max_depth": np.arange(2, 7, 1).tolist(),
    "min_samples_split": np.arange(2, 7, 1).tolist(),
    "min_samples_leaf": np.arange(2, 7, 1).tolist(),
    "max_leaf_nodes": np.arange(4, 10, 1).tolist(),
    "cv": np.arange(2, 6, 1).tolist(),
}

# 实例化Stacking评估器
STC = Stacking_tree_Cascade(estimators)
STC_grid = GridSearchCV(STC, parameter_space, n_jobs=-1)

# 模型训练
STC_grid.fit(X_train_OE, y_train)

print(time.time() - start)

1600.2967023849487


In [59]:
STC_grid.best_params_

{'cv': 4,
 'max_depth': 6,
 'max_leaf_nodes': 7,
 'min_samples_leaf': 6,
 'min_samples_split': 2}

In [60]:
STC_grid.score(X_train_OE, y_train), STC_grid.score(X_test_OE, y_test)

(0.8135176069670579, 0.7904599659284497)

In [61]:
# 先实例化一级元学习器
tree_final = DecisionTreeClassifier()
RF_final = RandomForestClassifier()

# 然后构建两层元学习器之间的Stacking评估器
final_layer = StackingClassifier(
    estimators=[("tree_final", tree_final), ("RF_final", RF_final)],
    final_estimator=LogisticRegression(penalty="l1", solver="saga"),
)

# 然后构建一级学习器
multi_layer = StackingClassifier(estimators=estimators, final_estimator=final_layer)

In [62]:
multi_layer.fit(X_train_OE, y_train)

In [63]:
multi_layer.score(X_train_OE, y_train), multi_layer.score(X_test_OE, y_test)

(0.8121923513820523, 0.7666098807495741)