In [65]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings

warnings.filterwarnings("ignore")

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# 自定义模块
from telcoFunc import *
from manual_ensemble import *
# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe, Trials
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [3]:
# 读取数据
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# 标注连续/离散字段
# 离散字段
category_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# 连续字段
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# 标签
target = 'Churn'

# ID列
ID_col = 'customerID'

# 验证是否划分能完全
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# 连续字段转化
tcc['TotalCharges'] = (
    tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# 缺失值填补
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# 标签值手动转化
tcc['Churn'].replace(to_replace='Yes', value=1, inplace=True)
tcc['Churn'].replace(to_replace='No', value=0, inplace=True)

In [4]:
features = tcc.drop(columns=[ID_col, target]).copy()
labels = tcc['Churn'].copy()

In [5]:
# 划分训练集和测试集
train, test = train_test_split(tcc, random_state=22)

X_train = train.drop(columns=[ID_col, target]).copy()
X_test = test.drop(columns=[ID_col, target]).copy()

y_train = train['Churn'].copy()
y_test = test['Churn'].copy()

X_train_seq = pd.DataFrame()
X_test_seq = pd.DataFrame()

# 年份衍生
X_train_seq['tenure_year'] = ((72 - X_train['tenure']) // 12) + 2014
X_test_seq['tenure_year'] = ((72 - X_test['tenure']) // 12) + 2014

# 月份衍生
X_train_seq['tenure_month'] = (72 - X_train['tenure']) % 12 + 1
X_test_seq['tenure_month'] = (72 - X_test['tenure']) % 12 + 1

# 季度衍生
X_train_seq['tenure_quarter'] = ((X_train_seq['tenure_month'] - 1) // 3) + 1
X_test_seq['tenure_quarter'] = ((X_test_seq['tenure_month'] - 1) // 3) + 1

# 独热编码
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

seq_new = list(X_train_seq.columns)

# 创建带有列名称的独热编码之后的df
X_train_seq = pd.DataFrame(
    enc.transform(X_train_seq).toarray(), columns=cate_colName(enc, seq_new, drop=None)
)

X_test_seq = pd.DataFrame(
    enc.transform(X_test_seq).toarray(), columns=cate_colName(enc, seq_new, drop=None)
)

# 调整index
X_train_seq.index = X_train.index
X_test_seq.index = X_test.index

In [6]:
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])

X_train_OE = pd.DataFrame(
    ord_enc.transform(X_train[category_cols]), columns=category_cols
)
X_train_OE.index = X_train.index
X_train_OE = pd.concat([X_train_OE, X_train[numeric_cols]], axis=1)

X_test_OE = pd.DataFrame(
    ord_enc.transform(X_test[category_cols]), columns=category_cols
)
X_test_OE.index = X_test.index
X_test_OE = pd.concat([X_test_OE, X_test[numeric_cols]], axis=1)

In [7]:
# 本节新增第三方库
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [8]:
class VotingClassifier_threshold(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, estimators, voting="hard", weights=None, thr=0.5):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        self.thr = thr

    def fit(self, X, y):
        VC = VotingClassifier(
            estimators=self.estimators, voting=self.voting, weights=self.weights
        )

        VC.fit(X, y)
        self.clf = VC

        return self

    def predict_proba(self, X):
        if self.voting == "soft":
            res_proba = self.clf.predict_proba(X)
        else:
            res_proba = None
        return res_proba

    def predict(self, X):
        if self.voting == "soft":
            res = (self.clf.predict_proba(X)[:, 1] >= self.thr) * 1
        else:
            res = self.clf.predict(X)
        return res

    def score(self, X, y):
        acc = accuracy_score(self.predict(X), y)
        return acc

In [9]:
# 实例化KFold评估器
kf = KFold(n_splits=5, random_state=12, shuffle=True)

# 重置训练集和测试集的index
X_train_OE = X_train_OE.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

train_part_index_l = []
eval_index_l = []

for train_part_index, eval_index in kf.split(X_train_OE, y_train):
    train_part_index_l.append(train_part_index)
    eval_index_l.append(eval_index)

# 训练集特征
X_train1 = X_train_OE.loc[train_part_index_l[0]]
X_train2 = X_train_OE.loc[train_part_index_l[1]]
X_train3 = X_train_OE.loc[train_part_index_l[2]]
X_train4 = X_train_OE.loc[train_part_index_l[3]]
X_train5 = X_train_OE.loc[train_part_index_l[4]]

# 验证集特征
X_eval1 = X_train_OE.loc[eval_index_l[0]]
X_eval2 = X_train_OE.loc[eval_index_l[1]]
X_eval3 = X_train_OE.loc[eval_index_l[2]]
X_eval4 = X_train_OE.loc[eval_index_l[3]]
X_eval5 = X_train_OE.loc[eval_index_l[4]]

# 训练集标签
y_train1 = y_train.loc[train_part_index_l[0]]
y_train2 = y_train.loc[train_part_index_l[1]]
y_train3 = y_train.loc[train_part_index_l[2]]
y_train4 = y_train.loc[train_part_index_l[3]]
y_train5 = y_train.loc[train_part_index_l[4]]

# 验证集标签
y_eval1 = y_train.loc[eval_index_l[0]]
y_eval2 = y_train.loc[eval_index_l[1]]
y_eval3 = y_train.loc[eval_index_l[2]]
y_eval4 = y_train.loc[eval_index_l[3]]
y_eval5 = y_train.loc[eval_index_l[4]]

train_set = [
    (X_train1, y_train1),
    (X_train2, y_train2),
    (X_train3, y_train3),
    (X_train4, y_train4),
    (X_train5, y_train5),
]

eval_set = [
    (X_eval1, y_eval1),
    (X_eval2, y_eval2),
    (X_eval3, y_eval3),
    (X_eval4, y_eval4),
    (X_eval5, y_eval5),
]

In [10]:
# 随机森林模型组
grid_RF_1 = load("./model/grid_RF_1.joblib")
grid_RF_2 = load("./model/grid_RF_2.joblib")
grid_RF_3 = load("./model/grid_RF_3.joblib")
grid_RF_4 = load("./model/grid_RF_4.joblib")
grid_RF_5 = load("./model/grid_RF_5.joblib")

RF_1 = grid_RF_1.best_estimator_
RF_2 = grid_RF_2.best_estimator_
RF_3 = grid_RF_3.best_estimator_
RF_4 = grid_RF_4.best_estimator_
RF_5 = grid_RF_5.best_estimator_

RF_l = [RF_1, RF_2, RF_3, RF_4, RF_5]

# 决策树模型组
grid_tree_1 = load("./model/grid_tree_1.joblib")
grid_tree_2 = load("./model/grid_tree_2.joblib")
grid_tree_3 = load("./model/grid_tree_3.joblib")
grid_tree_4 = load("./model/grid_tree_4.joblib")
grid_tree_5 = load("./model/grid_tree_5.joblib")

tree_1 = grid_tree_1.best_estimator_
tree_2 = grid_tree_2.best_estimator_
tree_3 = grid_tree_3.best_estimator_
tree_4 = grid_tree_4.best_estimator_
tree_5 = grid_tree_5.best_estimator_

tree_l = [tree_1, tree_2, tree_3, tree_4, tree_5]

# 逻辑回归模型组
grid_lr_1 = load("./model/grid_lr_1.joblib")
grid_lr_2 = load("./model/grid_lr_2.joblib")
grid_lr_3 = load("./model/grid_lr_3.joblib")
grid_lr_4 = load("./model/grid_lr_4.joblib")
grid_lr_5 = load("./model/grid_lr_5.joblib")

lr_1 = grid_lr_1.best_estimator_
lr_2 = grid_lr_2.best_estimator_
lr_3 = grid_lr_3.best_estimator_
lr_4 = grid_lr_4.best_estimator_
lr_5 = grid_lr_5.best_estimator_

lr_l = [lr_1, lr_2, lr_3, lr_4, lr_5]

In [11]:
eval1_predict_proba_RF = pd.Series(
    RF_l[0].predict_proba(X_eval1)[:, 1], index=X_eval1.index
)
eval2_predict_proba_RF = pd.Series(
    RF_l[1].predict_proba(X_eval2)[:, 1], index=X_eval2.index
)
eval3_predict_proba_RF = pd.Series(
    RF_l[2].predict_proba(X_eval3)[:, 1], index=X_eval3.index
)
eval4_predict_proba_RF = pd.Series(
    RF_l[3].predict_proba(X_eval4)[:, 1], index=X_eval4.index
)
eval5_predict_proba_RF = pd.Series(
    RF_l[4].predict_proba(X_eval5)[:, 1], index=X_eval5.index
)

eval_predict_proba_RF = pd.concat(
    [
        eval1_predict_proba_RF,
        eval2_predict_proba_RF,
        eval3_predict_proba_RF,
        eval4_predict_proba_RF,
        eval5_predict_proba_RF,
    ]
).sort_index()

eval1_predict_proba_tree = pd.Series(
    tree_l[0].predict_proba(X_eval1)[:, 1], index=X_eval1.index
)
eval2_predict_proba_tree = pd.Series(
    tree_l[1].predict_proba(X_eval2)[:, 1], index=X_eval2.index
)
eval3_predict_proba_tree = pd.Series(
    tree_l[2].predict_proba(X_eval3)[:, 1], index=X_eval3.index
)
eval4_predict_proba_tree = pd.Series(
    tree_l[3].predict_proba(X_eval4)[:, 1], index=X_eval4.index
)
eval5_predict_proba_tree = pd.Series(
    tree_l[4].predict_proba(X_eval5)[:, 1], index=X_eval5.index
)

eval_predict_proba_tree = pd.concat(
    [
        eval1_predict_proba_tree,
        eval2_predict_proba_tree,
        eval3_predict_proba_tree,
        eval4_predict_proba_tree,
        eval5_predict_proba_tree,
    ]
).sort_index()

eval1_predict_proba_lr = pd.Series(
    lr_l[0].predict_proba(X_eval1)[:, 1], index=X_eval1.index
)
eval2_predict_proba_lr = pd.Series(
    lr_l[1].predict_proba(X_eval2)[:, 1], index=X_eval2.index
)
eval3_predict_proba_lr = pd.Series(
    lr_l[2].predict_proba(X_eval3)[:, 1], index=X_eval3.index
)
eval4_predict_proba_lr = pd.Series(
    lr_l[3].predict_proba(X_eval4)[:, 1], index=X_eval4.index
)
eval5_predict_proba_lr = pd.Series(
    lr_l[4].predict_proba(X_eval5)[:, 1], index=X_eval5.index
)

eval_predict_proba_lr = pd.concat(
    [
        eval1_predict_proba_lr,
        eval2_predict_proba_lr,
        eval3_predict_proba_lr,
        eval4_predict_proba_lr,
        eval5_predict_proba_lr,
    ]
).sort_index()

In [12]:
test_predict_proba_RF = []

for i in range(5):
    test_predict_proba_RF.append(RF_l[i].predict_proba(X_test_OE)[:, 1])

test_predict_proba_RF = np.array(test_predict_proba_RF)
test_predict_proba_RF = test_predict_proba_RF.mean(0)

test_predict_proba_tree = []

for i in range(5):
    test_predict_proba_tree.append(tree_l[i].predict_proba(X_test_OE)[:, 1])

test_predict_proba_tree = np.array(test_predict_proba_tree)
test_predict_proba_tree = test_predict_proba_tree.mean(0)

test_predict_proba_lr = []

for i in range(5):
    test_predict_proba_lr.append(lr_l[i].predict_proba(X_test_OE)[:, 1])

test_predict_proba_lr = np.array(test_predict_proba_lr)
test_predict_proba_lr = test_predict_proba_lr.mean(0)

In [13]:
import manual_ensemble as me

In [14]:
me?

[1;31mType:[0m        module
[1;31mString form:[0m <module 'manual_ensemble' from 'f:\\study\\python\\code6\\More advanced machine learning\\03_模型融合\\manual_ensemble.py'>
[1;31mFile:[0m        f:\study\python\code6\more advanced machine learning\03_模型融合\manual_ensemble.py
[1;31mDocstring:[0m   自动模型融合模块

In [33]:
tree_params_space = {
    "tree_max_depth": hp.choice("tree_max_depth", np.arange(2, 20).tolist()),
    "tree_min_samples_split": hp.choice(
        "tree_min_samples_split", np.arange(2, 15).tolist()
    ),
    "tree_min_samples_leaf": hp.choice(
        "tree_min_samples_leaf", np.arange(1, 15).tolist()
    ),
    "tree_max_leaf_nodes": hp.choice("tree_max_leaf_nodes", np.arange(2, 51).tolist()),
}

In [34]:
def hyperopt_tree(params, train=True):
    # 读取参数
    if train == True:
        max_depth = params["tree_max_depth"]
        min_samples_split = params["tree_min_samples_split"]
        min_samples_leaf = params["tree_min_samples_leaf"]
        max_leaf_nodes = params["tree_max_leaf_nodes"]
    else:
        max_depth = params["tree_max_depth"] + 2
        min_samples_split = params["tree_min_samples_split"] + 2
        min_samples_leaf = params["tree_min_samples_leaf"] + 1
        max_leaf_nodes = params["tree_max_leaf_nodes"] + 2
    tree = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )

    if train == True:
        res = -cross_val_score(tree, X_train_OE, y_train).mean()
    else:
        res = tree.fit(X_train_OE, y_train)

    return res

In [35]:
def param_hyperopt_tree(max_evals):
    return fmin(
        fn=hyperopt_tree,
        space=tree_params_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        rstate=np.random.default_rng(9),
    )

In [36]:
tree_params_best = param_hyperopt_tree(1000)

100%|██████████| 1000/1000 [00:39<00:00, 25.06trial/s, best loss: -0.7962873770820791]


In [37]:
tree_params_best

{'tree_max_depth': 3,
 'tree_max_leaf_nodes': 44,
 'tree_min_samples_leaf': 10,
 'tree_min_samples_split': 3}

In [38]:
hyperopt_tree(tree_params_best, train=False)

In [39]:
clf = hyperopt_tree(tree_params_best, train=False)

In [40]:
clf.score(X_test_OE, y_test)

0.7773992049971608

而训练状态和测试状态的重要区别，就在于参数的导入。对于hyperOPT来说，hp.choice的搜索结果其实是原始参数取值列表的索引值，例如max_depth：3，其实代表的是原始参数空间中'tree_max_depth': hp.choice('RF_max_depth', np.arange(2, 20).tolist())的第3个值，也就是2+3=5:

因此目标函数在定义train=False的代码时，对于整数列表的数值提取，只需要用得到的索引值+列表初始值即可。再比如max_leaf_nodes的最佳值索引是27,则真实值为2+27=29。当然，对于字符串列表，则需要直接把字符串完整列表带入进行索引。

In [44]:
class tree_cascade(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, tree_params_space, max_evals=1000):
        self.tree_params_space = tree_params_space
        self.max_evals = max_evals

    def fit(self, X, y):
        def hyperopt_tree(params, train=True):
            # 读取参数
            if train == True:
                max_depth = params["tree_max_depth"]
                min_samples_split = params["tree_min_samples_split"]
                min_samples_leaf = params["tree_min_samples_leaf"]
                max_leaf_nodes = params["tree_max_leaf_nodes"]
            else:
                max_depth = params["tree_max_depth"] + 2
                min_samples_split = params["tree_min_samples_split"] + 2
                min_samples_leaf = params["tree_min_samples_leaf"] + 1
                max_leaf_nodes = params["tree_max_leaf_nodes"] + 2

            # 实例化模型
            tree = DecisionTreeClassifier(
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                max_leaf_nodes=max_leaf_nodes,
                random_state=12,
            )

            if train == True:
                res = -cross_val_score(tree, X, y).mean()
            else:
                res = tree.fit(X, y)

            return res

        def param_hyperopt_tree(max_evals):
            params_best = fmin(
                fn=hyperopt_tree,
                space=self.tree_params_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                rstate=np.random.default_rng(9),
            )

            return params_best

        tree_params_best = param_hyperopt_tree(self.max_evals)
        self.clf = hyperopt_tree(tree_params_best, train=False)
        return self

    def predict_proba(self, X):
        return self.clf.predict_proba(X)

    def predict(self, X):
        return self.clf.predict(X)

    def score(self, X, y):
        return self.clf.score(X, y)

In [45]:
tree_hyper = tree_cascade(tree_params_space)

In [46]:
tree_hyper.fit(X_train_OE, y_train)

100%|██████████| 1000/1000 [00:40<00:00, 24.83trial/s, best loss: -0.7962873770820791]


In [47]:
tree_hyper.predict(X_test_OE)

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [48]:
tree_hyper.score(X_test_OE, y_test)

0.7768313458262351

In [49]:
tree_hyper = tree_cascade(tree_params_space, max_evals=2000).fit(X_train_OE, y_train)

100%|██████████| 2000/2000 [01:26<00:00, 23.13trial/s, best loss: -0.7962873770820791]


In [50]:
tree_hyper.score(X_test_OE, y_test)

0.7768313458262351

In [51]:
RF_params_space = {
    "RF_min_samples_leaf": hp.choice("RF_min_samples_leaf", np.arange(1, 20).tolist()),
    "RF_min_samples_split": hp.choice(
        "RF_min_samples_split", np.arange(2, 20).tolist()
    ),
    "RF_max_depth": hp.choice("RF_max_depth", np.arange(2, 20).tolist()),
    "RF_max_leaf_nodes": hp.choice("RF_max_leaf_nodes", np.arange(20, 200).tolist()),
    "RF_n_estimators": hp.choice("RF_n_estimators", np.arange(20, 200).tolist()),
    "RF_max_samples": hp.uniform("RF_max_samples", 0.2, 0.8),
}

In [52]:
class RF_cascade(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, RF_params_space, max_evals=500):
        self.RF_params_space = RF_params_space
        self.max_evals = max_evals

    def fit(self, X, y):
        def hyperopt_RF(params, train=True):
            # 读取参数
            if train == True:
                min_samples_leaf = params["RF_min_samples_leaf"]
                min_samples_split = params["RF_min_samples_split"]
                max_depth = params["RF_max_depth"]
                max_leaf_nodes = params["RF_max_leaf_nodes"]
                n_estimators = params["RF_n_estimators"]
                max_samples = params["RF_max_samples"]
            else:
                min_samples_leaf = params["RF_min_samples_leaf"] + 1
                min_samples_split = params["RF_min_samples_split"] + 2
                max_depth = params["RF_max_depth"] + 2
                max_leaf_nodes = params["RF_max_leaf_nodes"] + 20
                n_estimators = params["RF_n_estimators"] + 20
                max_samples = params["RF_max_samples"]
            # 实例化模型
            RF = RandomForestClassifier(
                min_samples_leaf=min_samples_leaf,
                min_samples_split=min_samples_split,
                max_depth=max_depth,
                max_leaf_nodes=max_leaf_nodes,
                n_estimators=n_estimators,
                max_samples=max_samples,
            )
            if train == True:
                res = -cross_val_score(RF, X, y).mean()
            else:
                res = RF.fit(X, y)

            return res

        def param_hyperopt_RF(max_evals):
            params_best = fmin(
                fn=hyperopt_RF,
                space=self.RF_params_space,
                algo=tpe.suggest,
                max_evals=max_evals,
            )

            return params_best

        RF_params_best = param_hyperopt_RF(self.max_evals)
        self.clf = hyperopt_RF(RF_params_best, train=False)
        return self

    def predict_proba(self, X):
        return self.clf.predict_proba(X)

    def predict(self, X):
        return self.clf.predict(X)

    def score(self, X, y):
        return self.clf.score(X, y)

In [53]:
RF_hyper = RF_cascade(RF_params_space)
RF_hyper.fit(X_train_OE, y_train)

100%|██████████| 500/500 [06:14<00:00,  1.34trial/s, best loss: -0.8099199779249447]


In [54]:
RF_hyper.score(X_test_OE, y_test)

0.7893242475865985

In [55]:
RF_hyper = RF_cascade(RF_params_space, max_evals=1000).fit(X_train_OE, y_train)

100%|██████████| 1000/1000 [11:37<00:00,  1.43trial/s, best loss: -0.8099194403830164]


In [56]:
RF_hyper.score(X_test_OE, y_test)

0.7842135150482681

In [57]:
lr_params_space = {
    "lr_C": hp.uniform("lr_C", 0, 1),
    "lr_penalty": hp.choice("lr_penalty", ["l1", "l2"]),
    "lr_thr": hp.uniform("lr_thr", 0, 1),
}

In [60]:
class lr_cascade(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, lr_params_space, max_evals=20):
        self.lr_params_space = lr_params_space
        self.max_evals = max_evals

    def fit(self, X, y):
        def hyperopt_lr(params, train=True):
            # 读取参数
            if train == True:
                C = params["lr_C"]
                penalty = params["lr_penalty"]
                thr = params["lr_thr"]
            else:
                C = params["lr_C"]
                penalty = ["l1", "l2"][params["lr_penalty"]]
                thr = params["lr_thr"]
            # 实例化模型
            lr = logit_threshold(
                C=C, thr=thr, penalty=penalty, solver="saga", max_iter=int(1e6)
            )

            if train == True:
                res = -cross_val_score(lr, X, y).mean()
            else:
                res = lr.fit(X, y)

            return res

        def param_hyperopt_lr(max_evals):
            params_best = fmin(
                fn=hyperopt_lr,
                space=self.lr_params_space,
                algo=tpe.suggest,
                max_evals=max_evals,
                rstate=np.random.default_rng(9),
            )

            return params_best

        lr_params_best = param_hyperopt_lr(self.max_evals)
        self.clf = hyperopt_lr(lr_params_best, train=False)
        return self

    def predict_proba(self, X):
        return self.clf.predict_proba(X)

    def predict(self, X):
        return self.clf.predict(X)

    def score(self, X, y):
        return self.clf.score(X, y)

In [61]:
lr_hyper = lr_cascade(lr_params_space).fit(X_train_OE, y_train)

100%|██████████| 20/20 [01:54<00:00,  5.74s/trial, best loss: -0.7890945285398928]


In [62]:
lr_hyper.score(X_test_OE, y_test)

0.7717206132879046

In [63]:
lr_hyper = lr_cascade(lr_params_space)
tree_hyper = tree_cascade(tree_params_space)
RF_hyper = RF_cascade(RF_params_space)

estimators = [('lr', lr_hyper), ('tree', tree_hyper), ('rf', RF_hyper)]

In [66]:
train_oof, test_predict = train_cross(
    X_train_OE, y_train, X_test_OE, estimators=estimators
)

100%|██████████| 20/20 [01:44<00:00,  5.20s/trial, best loss: -0.7874556213017752]
100%|██████████| 20/20 [01:42<00:00,  5.11s/trial, best loss: -0.7886390532544378]
100%|██████████| 20/20 [01:40<00:00,  5.02s/trial, best loss: -0.7884507672723713]
100%|██████████| 20/20 [01:41<00:00,  5.10s/trial, best loss: -0.792946969379048] 
100%|██████████| 20/20 [01:39<00:00,  4.97s/trial, best loss: -0.7865567166058165]
100%|██████████| 1000/1000 [00:34<00:00, 28.90trial/s, best loss: -0.7983431952662723]
100%|██████████| 1000/1000 [00:33<00:00, 29.54trial/s, best loss: -0.7919526627218935]
100%|██████████| 1000/1000 [00:33<00:00, 29.66trial/s, best loss: -0.7986249248115043]
100%|██████████| 1000/1000 [00:35<00:00, 28.43trial/s, best loss: -0.8033578133087135]
100%|██████████| 1000/1000 [00:35<00:00, 28.12trial/s, best loss: -0.7905781470756921]
100%|██████████| 500/500 [04:42<00:00,  1.77trial/s, best loss: -0.8049704142011833]
100%|██████████| 500/500 [04:37<00:00,  1.80trial/s, best loss: -

In [85]:
# 设置超参数空间
logistic_param = [
    {
        "thr": np.arange(0.1, 1, 0.1).tolist(),
        "penalty": ["l1"],
        "C": np.arange(0.1, 1.1, 0.1).tolist(),
        "solver": ["saga"],
    },
    {
        "thr": np.arange(0.1, 1, 0.1).tolist(),
        "penalty": ["l2"],
        "C": np.arange(0.1, 1.1, 0.1).tolist(),
        "solver": ["lbfgs", "newton-cg", "sag", "saga"],
    },
]

# 实例化相关评估器
logistic_final = logit_threshold(max_iter=int(1e6))

# 执行网格搜索
lfg = GridSearchCV(
    estimator=logistic_final, param_grid=logistic_param, scoring="accuracy", n_jobs=15
).fit(train_oof.iloc[:, :3], y_train)

lfg.score(train_oof.iloc[:, :3], y_train), lfg.score(test_predict, y_test)

(0.8065126845891708, 0.787052810902896)

In [68]:
def final_model_opt(final_model_l, param_space_l, X, y, test_predict):
    """
    Stacking元学习器自动优化与预测函数

    :param final_model_l: 备选元学习器组成的列表
    :param param_space_l: 备选元学习器各自超参数搜索空间组成的列表
    :param X: oof_train训练集特征
    :param y: oof_train训练集标签
    :param test_predict: 一级评估器输出的测试集预测结果

    :return：多组元学习器在oof_train上的最佳评分，以及最佳元学习器在test_predict上的预测结果
    """

    # 不同组元学习器结果存储列表
    # res_l用于存储模型在训练集上的评分
    res_l = np.zeros(len(final_model_l)).tolist()
    # test_predict_l用于存储模型在测试集test_predict上的预测结果
    test_predict_l = np.zeros(len(final_model_l)).tolist()

    for i, model in enumerate(final_model_l):
        # 输出元学习器单模预测结果
        # 执行网格搜索
        model_grid = GridSearchCV(
            estimator=model, param_grid=param_space_l[i], scoring="accuracy", n_jobs=15
        )
        model_grid.fit(X, y)
        # 记录单模最佳模型，方便后续作为Bagging的基础评估器
        res1_best_model = model_grid.best_estimator_
        # 测试在训练oof数据集上的准确率
        res1 = model_grid.score(X, y)
        # 输出单模在test_predict上的预测结果
        res1_test_predict = model_grid.predict_proba(test_predict)[:, 1]

        # 输出元学习器交叉训练预测结果
        res2_temp = np.zeros(y.shape[0])
        res2_test_predict = np.zeros(test_predict.shape[0])
        # 交叉训练过程附带网格搜索以提升精度
        folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=12)
        for trn_idx, val_idx in folds.split(X, y):
            model_grid = GridSearchCV(
                estimator=model,
                param_grid=param_space_l[i],
                scoring="accuracy",
                n_jobs=15,
            )
            model_grid.fit(X.loc[trn_idx], y.loc[trn_idx])
            res2_temp += model_grid.predict_proba(X)[:, 1] / 10
            # 记录测试集上的预测结果
            res2_test_predict += model_grid.predict_proba(test_predict)[:, 1] / 10
        # 交叉训练模型组评分
        res2 = accuracy_score((res2_temp >= 0.5) * 1, y)

        # 元学习器的Bagging过程
        bagging_param_space = {
            "n_estimators": range(10, 21),
            "max_samples": np.arange(0.1, 1.1, 0.1).tolist(),
        }

        bagging_final = BaggingClassifier(res1_best_model)
        BG = GridSearchCV(bagging_final, bagging_param_space, n_jobs=15).fit(X, y)
        # Bagging元学习器评分
        res3 = BG.score(X, y)
        # Bagging元学习器在测试集上评分
        res3_test_predict = BG.predict_proba(test_predict)[:, 1]

        # 三组模型评分组成列表
        res_l_temp = [res1, res2, res3]
        # 三组模型在测试集上预测结果组成列表
        test_predict_l_temp = [res1_test_predict, res2_test_predict, res3_test_predict]
        # 挑选评分最高模型
        best_res = np.max(res_l_temp)
        # 挑选评分最高模型输出的测试集概率预测结果
        best_test_predict = test_predict_l_temp[np.argmax(res_l_temp)]
        # 将最佳模型写入res_l对应位置
        res_l[i] = best_res
        # 将最佳模型在测试集上的评分写入test_predict_l
        test_predict_l[i] = best_test_predict

    # 再从res_l中选取训练集上最佳评分
    best_res_final = np.max(res_l)
    # 根据训练集上的最佳评分，选取挑选最佳测试集预测结果
    best_test_predict_final = test_predict_l[np.argmax(res_l)]

    return best_res_final, best_test_predict_final

In [69]:
lr = logit_threshold()
tree = DecisionTreeClassifier()
final_model_l = [lr, tree]

In [88]:
lr_final_param = [
    {
        "thr": np.arange(0.1, 1.1, 0.1).tolist(),
        "penalty": ["l1"],
        "C": np.arange(0.1, 1.1, 0.1).tolist(),
        "solver": ["saga"],
    },
    {
        "thr": np.arange(0.1, 1.1, 0.1).tolist(),
        "penalty": ["l2"],
        "C": np.arange(0.1, 1.1, 0.1).tolist(),
        "solver": ["lbfgs", "newton-cg", "sag", "saga"],
    },
]

tree_final_param = {
    "max_depth": np.arange(2, 16, 1).tolist(),
    "min_samples_split": np.arange(2, 5, 1).tolist(),
    "min_samples_leaf": np.arange(1, 4, 1).tolist(),
    "max_leaf_nodes": np.arange(6, 30, 1).tolist(),
}

param_space_l = [lr_final_param, tree_final_param]

In [89]:
best_res_final, best_test_predict_final = final_model_opt(
    final_model_l, param_space_l, train_oof.iloc[:, :3], y_train, test_predict
)

In [90]:
accuracy_score((best_test_predict_final >= 0.5) * 1, y_test)

0.7921635434412265

In [91]:
lr_hyper = lr_cascade(lr_params_space, max_evals=50)
tree_hyper = tree_cascade(tree_params_space)
RF_hyper = RF_cascade(RF_params_space, max_evals=1000)

estimators = [("lr", lr_hyper), ("tree", tree_hyper), ("rf", RF_hyper)]

In [None]:
train_oof, test_predict = train_cross(
    X_train_OE, y_train, X_test_OE, estimators=estimators
)

In [None]:
lr = logit_threshold()
tree = DecisionTreeClassifier()
final_model_l = [lr, tree]

In [None]:
best_res_final, best_test_predict_final = final_model_opt(
    final_model_l, param_space_l, train_oof.iloc[:, :3], y_train, test_predict
)

In [None]:
accuracy_score((best_test_predict_final >= 0.5) * 1, y_test)