In [1]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings

warnings.filterwarnings("ignore")

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# 自定义模块
from telcoFunc import *
from manual_ensemble import *

# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe, Trials
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [2]:
# Load the raw Telco customer-churn table.
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Column roles.
# Discrete (categorical) feature columns.
category_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Continuous (numeric) feature columns.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Label column.
target = 'Churn'

# Identifier column.
ID_col = 'customerID'

# Sanity check: the two feature lists plus the ID and label columns must
# account for every column of the table.
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# Numeric conversion: a single-space string in 'TotalCharges' is treated as a
# missing value before the column is cast to float.
tcc['TotalCharges'] = (
    tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# Missing 'TotalCharges' values are filled with 0.
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# Encode the label as integers (Yes -> 1, No -> 0).
# The result is assigned back to the frame: calling an inplace method on the
# `tcc['Churn']` selection is chained assignment, which does not update `tcc`
# once pandas Copy-on-Write is active. Any label other than 'Yes'/'No' maps to
# NaN and makes the integer cast fail here instead of surfacing later.
tcc[target] = tcc[target].map({'Yes': 1, 'No': 0}).astype(int)

In [3]:
# Full feature matrix (ID and label columns removed) and the matching labels.
features = tcc.drop([ID_col, target], axis=1).copy()
labels = tcc['Churn'].copy()

In [4]:
# Hold out a test set; the fixed seed keeps the split reproducible.
train, test = train_test_split(tcc, random_state=22)

# Feature frames and label vectors of both partitions.
X_train, X_test = (
    part.drop(columns=[ID_col, target]).copy() for part in (train, test)
)
y_train, y_test = (part['Churn'].copy() for part in (train, test))

# Year / month features derived from `tenure` with the fixed offsets 72 and 2014.
# NOTE(review): presumably tenure is counted in months with 72 as its maximum
# and 2014 as the earliest year -- confirm against the data description.
X_train_seq = pd.DataFrame(
    {
        'tenure_year': ((72 - X_train['tenure']) // 12) + 2014,
        'tenure_month': (72 - X_train['tenure']) % 12 + 1,
    }
)
X_test_seq = pd.DataFrame(
    {
        'tenure_year': ((72 - X_test['tenure']) // 12) + 2014,
        'tenure_month': (72 - X_test['tenure']) % 12 + 1,
    }
)

# Quarter feature, computed from the derived month.
X_train_seq['tenure_quarter'] = ((X_train_seq['tenure_month'] - 1) // 3) + 1
X_test_seq['tenure_quarter'] = ((X_test_seq['tenure_month'] - 1) // 3) + 1

# One-hot encoder fitted on the training partition only.
enc = preprocessing.OneHotEncoder().fit(X_train_seq)

seq_new = X_train_seq.columns.tolist()

# Replace both frames by their one-hot encoded versions, with readable column
# names from `cate_colName` and the row index of the source partition.
X_train_seq = pd.DataFrame(
    enc.transform(X_train_seq).toarray(),
    columns=cate_colName(enc, seq_new, drop=None),
    index=X_train.index,
)

X_test_seq = pd.DataFrame(
    enc.transform(X_test_seq).toarray(),
    columns=cate_colName(enc, seq_new, drop=None),
    index=X_test.index,
)

In [5]:
# Ordinal encoder for the categorical columns, fitted on the training partition.
ord_enc = OrdinalEncoder().fit(X_train[category_cols])

# Training features: encoded categoricals followed by the untouched numerics.
X_train_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_train[category_cols]),
            columns=category_cols,
            index=X_train.index,
        ),
        X_train[numeric_cols],
    ],
    axis=1,
)

# Test features, transformed with the encoder fitted on the training data.
X_test_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_test[category_cols]),
            columns=category_cols,
            index=X_test.index,
        ),
        X_test[numeric_cols],
    ],
    axis=1,
)

In [6]:
# 本节新增第三方库
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [7]:
class VotingClassifier_threshold(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Voting ensemble with a tunable decision threshold for soft voting.

    Wraps ``sklearn.ensemble.VotingClassifier``. With ``voting="soft"`` a
    sample is assigned the second class (column 1 of ``predict_proba``) when
    its averaged probability is ``>= thr``; the threshold logic therefore
    targets binary problems. With ``voting="hard"`` the wrapped classifier's
    own majority vote is returned and ``thr`` is ignored.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Passed unchanged to ``VotingClassifier``.
    voting : {"hard", "soft"}, default="hard"
        Voting mode passed to ``VotingClassifier``.
    weights : sequence or None, default=None
        Per-estimator weights passed to ``VotingClassifier``.
    thr : float, default=0.5
        Probability threshold used by ``predict`` under soft voting.
    """

    def __init__(self, estimators, voting="hard", weights=None, thr=0.5):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        self.thr = thr

    def fit(self, X, y):
        """Fit the wrapped ``VotingClassifier`` on ``(X, y)`` and return ``self``."""
        VC = VotingClassifier(
            estimators=self.estimators, voting=self.voting, weights=self.weights
        )

        VC.fit(X, y)
        self.clf = VC
        # Expose the fitted class labels the way scikit-learn classifiers do,
        # so scorers and other utilities that look up `classes_` keep working.
        self.classes_ = VC.classes_

        return self

    def predict_proba(self, X):
        """Return class probabilities under soft voting, ``None`` under hard voting."""
        if self.voting == "soft":
            return self.clf.predict_proba(X)
        # Hard voting has no probabilities; None is kept for existing callers.
        return None

    def predict(self, X):
        """Predict class labels for ``X``.

        Under soft voting the thresholded decision is mapped back to the
        fitted class labels, so label sets other than ``{0, 1}`` are returned
        correctly; for ``0/1`` labels the output is unchanged.
        """
        if self.voting == "soft":
            positive = (self.clf.predict_proba(X)[:, 1] >= self.thr) * 1
            return np.asarray(self.clf.classes_)[positive]
        return self.clf.predict(X)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``."""
        # accuracy_score expects (y_true, y_pred).
        return accuracy_score(y, self.predict(X))

In [8]:
# Five-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=12)

# Renumber the training rows 0..n-1 so that the positional indices produced by
# `kf.split` are also valid `.loc` labels below.
X_train_OE = X_train_OE.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Row indices of the training part and of the evaluation part of every fold.
train_part_index_l, eval_index_l = [], []
for train_part_index, eval_index in kf.split(X_train_OE, y_train):
    train_part_index_l.append(train_part_index)
    eval_index_l.append(eval_index)

# (features, labels) pairs, one per fold.
train_set = [(X_train_OE.loc[rows], y_train.loc[rows]) for rows in train_part_index_l]
eval_set = [(X_train_OE.loc[rows], y_train.loc[rows]) for rows in eval_index_l]

# Every fold is also exposed under its own numbered name.
(
    (X_train1, y_train1),
    (X_train2, y_train2),
    (X_train3, y_train3),
    (X_train4, y_train4),
    (X_train5, y_train5),
) = train_set

(
    (X_eval1, y_eval1),
    (X_eval2, y_eval2),
    (X_eval3, y_eval3),
    (X_eval4, y_eval4),
    (X_eval5, y_eval5),
) = eval_set

In [9]:
# NOTE(review): joblib.load unpickles arbitrary objects -- only load model files
# from a trusted source.

# Random-forest group: five persisted search objects and their best estimators.
grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5 = (
    load(path)
    for path in (
        "./model/grid_RF_1.joblib",
        "./model/grid_RF_2.joblib",
        "./model/grid_RF_3.joblib",
        "./model/grid_RF_4.joblib",
        "./model/grid_RF_5.joblib",
    )
)
RF_l = [
    grid.best_estimator_
    for grid in (grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5)
]
RF_1, RF_2, RF_3, RF_4, RF_5 = RF_l

# Decision-tree group.
grid_tree_1, grid_tree_2, grid_tree_3, grid_tree_4, grid_tree_5 = (
    load(path)
    for path in (
        "./model/grid_tree_1.joblib",
        "./model/grid_tree_2.joblib",
        "./model/grid_tree_3.joblib",
        "./model/grid_tree_4.joblib",
        "./model/grid_tree_5.joblib",
    )
)
tree_l = [
    grid.best_estimator_
    for grid in (grid_tree_1, grid_tree_2, grid_tree_3, grid_tree_4, grid_tree_5)
]
tree_1, tree_2, tree_3, tree_4, tree_5 = tree_l

# Logistic-regression group.
grid_lr_1, grid_lr_2, grid_lr_3, grid_lr_4, grid_lr_5 = (
    load(path)
    for path in (
        "./model/grid_lr_1.joblib",
        "./model/grid_lr_2.joblib",
        "./model/grid_lr_3.joblib",
        "./model/grid_lr_4.joblib",
        "./model/grid_lr_5.joblib",
    )
)
lr_l = [
    grid.best_estimator_
    for grid in (grid_lr_1, grid_lr_2, grid_lr_3, grid_lr_4, grid_lr_5)
]
lr_1, lr_2, lr_3, lr_4, lr_5 = lr_l

In [10]:
# Evaluation-fold feature frames, in fold order; the i-th model of every group
# is scored on the i-th evaluation fold.
X_eval_folds = (X_eval1, X_eval2, X_eval3, X_eval4, X_eval5)

# Random forest: positive-class probability per evaluation fold, indexed by the
# fold's row labels.
(
    eval1_predict_proba_RF,
    eval2_predict_proba_RF,
    eval3_predict_proba_RF,
    eval4_predict_proba_RF,
    eval5_predict_proba_RF,
) = [
    pd.Series(model.predict_proba(X_fold)[:, 1], index=X_fold.index)
    for model, X_fold in zip(RF_l, X_eval_folds)
]

# Stitch the folds together and restore the original row order.
eval_predict_proba_RF = pd.concat(
    [
        eval1_predict_proba_RF,
        eval2_predict_proba_RF,
        eval3_predict_proba_RF,
        eval4_predict_proba_RF,
        eval5_predict_proba_RF,
    ]
).sort_index()

# Decision tree.
(
    eval1_predict_proba_tree,
    eval2_predict_proba_tree,
    eval3_predict_proba_tree,
    eval4_predict_proba_tree,
    eval5_predict_proba_tree,
) = [
    pd.Series(model.predict_proba(X_fold)[:, 1], index=X_fold.index)
    for model, X_fold in zip(tree_l, X_eval_folds)
]

eval_predict_proba_tree = pd.concat(
    [
        eval1_predict_proba_tree,
        eval2_predict_proba_tree,
        eval3_predict_proba_tree,
        eval4_predict_proba_tree,
        eval5_predict_proba_tree,
    ]
).sort_index()

# Logistic regression.
(
    eval1_predict_proba_lr,
    eval2_predict_proba_lr,
    eval3_predict_proba_lr,
    eval4_predict_proba_lr,
    eval5_predict_proba_lr,
) = [
    pd.Series(model.predict_proba(X_fold)[:, 1], index=X_fold.index)
    for model, X_fold in zip(lr_l, X_eval_folds)
]

eval_predict_proba_lr = pd.concat(
    [
        eval1_predict_proba_lr,
        eval2_predict_proba_lr,
        eval3_predict_proba_lr,
        eval4_predict_proba_lr,
        eval5_predict_proba_lr,
    ]
).sort_index()

In [11]:
# Test-set positive-class probability of each model group: every one of the
# five models in a group scores the full test set and the five probability
# vectors are averaged element-wise.
test_predict_proba_RF = np.array(
    [model.predict_proba(X_test_OE)[:, 1] for model in RF_l]
).mean(0)

test_predict_proba_tree = np.array(
    [model.predict_proba(X_test_OE)[:, 1] for model in tree_l]
).mean(0)

test_predict_proba_lr = np.array(
    [model.predict_proba(X_test_OE)[:, 1] for model in lr_l]
).mean(0)

In [16]:
# Persisted single decision-tree and random-forest models.
# NOTE(review): binding the name `tree` here shadows the `sklearn.tree` module
# imported at the top of the notebook.
tree, RF = (
    load(path) for path in ("./model/tree_model.joblib", "./model/RF_0.joblib")
)

# The persisted "./model/logistic_search.joblib" is not loaded; the first
# logistic-regression model of the group loaded earlier is used instead.
lr = lr_1

In [17]:
# (name, model) pairs handed to `train_cross` as first-level learners.
estimators = list(zip(("lr", "tree", "RF"), (lr, tree, RF)))

In [18]:
# Time one blending pass of the first-level learners.
start = time.time()

train_oof_blending, test_predict_blending = train_cross(
    X_train_OE, y_train, X_test_OE, estimators, blending=True
)

elapsed = time.time() - start
print(elapsed)

1.5392773151397705


In [20]:
# Meta-learner: logistic regression on the blending frame, whose last column
# is used as the label and whose remaining columns are used as features.
X_oof = train_oof_blending.iloc[:, :-1]
y_oof = train_oof_blending.iloc[:, -1]

lr = LogisticRegression().fit(X_oof, y_oof)

# Accuracy on the data the meta-learner was fitted on, and on the test set.
print("The results of LR-final:")
print(
    "Train2-Accuracy: %f, Test-Accuracy: %f"
    % (lr.score(X_oof, y_oof), lr.score(test_predict_blending, y_test))
)

The results of LR-final:
Train2-Accuracy: 0.824976, Test-Accuracy: 0.792164


In [21]:
# hyperopt search space: `test_size` drawn uniformly from [0.1, 0.5].
test_size_dist = hp.uniform("test_size", 0.1, 0.5)
split_space = {"test_size": test_size_dist}

In [22]:
def split_res(params, train=True):
    """Run one blending pass for a given ``test_size``.

    Reads the notebook-level ``X_train_OE``, ``y_train``, ``X_test_OE`` and
    ``estimators`` and forwards them to ``train_cross`` with ``blending=True``.

    Parameters
    ----------
    params : dict
        Must contain the key ``"test_size"``, which is passed to ``train_cross``.
    train : bool, default=True
        If true, return a loss for the hyper-parameter search; otherwise
        return the blending data itself.

    Returns
    -------
    float or tuple
        ``train=True``: the negated accuracy of a ``LogisticRegression``
        fitted and scored on the blending frame (last column = label, other
        columns = features). Note that this is an in-sample score.
        ``train=False``: ``(train_oof_blending, test_predict_blending)`` as
        returned by ``train_cross``.
    """
    test_size = params["test_size"]
    train_oof_blending, test_predict_blending = train_cross(
        X_train_OE, y_train, X_test_OE, estimators, blending=True, test_size=test_size
    )

    # Only the search objective needs a fitted meta-learner; skip the fit when
    # the caller just wants the blending data.
    if not train:
        return train_oof_blending, test_predict_blending

    X_oof = train_oof_blending.iloc[:, :-1]
    y_oof = train_oof_blending.iloc[:, -1]
    lr = LogisticRegression().fit(X_oof, y_oof)
    # hyperopt minimises, so the accuracy is negated.
    return -lr.score(X_oof, y_oof)

In [23]:
def param_split_res(max_evals):
    """Search ``split_space`` for the ``test_size`` minimising ``split_res``.

    Runs hyperopt's ``fmin`` with the TPE algorithm for ``max_evals`` trials,
    seeded with ``np.random.default_rng(11)``, and returns what ``fmin`` returns.
    """
    search_kwargs = dict(
        fn=split_res,
        space=split_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        rstate=np.random.default_rng(11),
    )
    return fmin(**search_kwargs)

In [24]:
# Run the `test_size` search for 100 trials.
best_split = param_split_res(max_evals=100)

100%|██████████| 100/100 [02:23<00:00,  1.44s/trial, best loss: -0.8296007789678675]


In [25]:
# Display the result returned by the `test_size` search.
best_split

{'test_size': 0.19433909363294544}

In [26]:
# Rebuild the blending data with the best `test_size` found by the search.
best_blending = split_res(best_split, train=False)
train_oof_blending, test_predict_blending = best_blending

In [27]:
# Meta-learner: logistic regression on the blending frame, whose last column
# is used as the label and whose remaining columns are used as features.
X_oof = train_oof_blending.iloc[:, :-1]
y_oof = train_oof_blending.iloc[:, -1]

lr = LogisticRegression().fit(X_oof, y_oof)

# Accuracy on the data the meta-learner was fitted on, and on the test set.
print("The results of LR-final:")
print(
    "Train-oof-Accuracy: %f, Test-Accuracy: %f"
    % (lr.score(X_oof, y_oof), lr.score(test_predict_blending, y_test))
)

The results of LR-final:
Train-oof-Accuracy: 0.829601, Test-Accuracy: 0.792164


In [28]:
# First-level learners, each built from its own parameter space.
lr_hyper = lr_cascade(lr_params_space)
tree_hyper = tree_cascade(tree_params_space)
RF_hyper = RF_cascade(RF_params_space)

# (name, learner) pairs in the order lr, tree, rf.
estimators = list(zip(("lr", "tree", "rf"), (lr_hyper, tree_hyper, RF_hyper)))

In [29]:
# Cross-train the first-level learners in blending mode.
# NOTE(review): test_size=0.19433 is hard-coded; it looks like the truncated
# value found by the `test_size` search -- confirm.
blending_outputs = train_cross(
    X_train_OE,
    y_train,
    X_test_OE,
    blending=True,
    test_size=0.19433,
    estimators=estimators,
)
train_oof_blending, test_predict_blending = blending_outputs

100%|██████████| 20/20 [01:38<00:00,  4.92s/trial, best loss: -0.789663125161959] 
100%|██████████| 20/20 [01:32<00:00,  4.62s/trial, best loss: -0.7726220091560854]
100%|██████████| 20/20 [01:30<00:00,  4.52s/trial, best loss: -0.7658624859635484]
100%|██████████| 20/20 [01:35<00:00,  4.79s/trial, best loss: -0.7934831994471796]
100%|██████████| 20/20 [01:33<00:00,  4.66s/trial, best loss: -0.7923054331864904]
100%|██████████| 1000/1000 [00:30<00:00, 32.35trial/s, best loss: -0.7993534594454521]
100%|██████████| 1000/1000 [00:31<00:00, 31.91trial/s, best loss: -0.7973041375140365]
100%|██████████| 1000/1000 [00:30<00:00, 32.74trial/s, best loss: -0.7975995508335492]
100%|██████████| 1000/1000 [00:30<00:00, 32.91trial/s, best loss: -0.7987691111686965]
100%|██████████| 1000/1000 [00:30<00:00, 32.50trial/s, best loss: -0.7990658201606633]
100%|██████████| 500/500 [04:23<00:00,  1.90trial/s, best loss: -0.8064114191932278]
100%|██████████| 500/500 [04:44<00:00,  1.76trial/s, best loss: -

In [30]:
# Search spaces for the meta-learners.

# Grid starting at 0.1 with step 0.1 and (exclusive) stop bound 1.1; used for
# both the decision threshold `thr` and the regularisation strength `C`.
decile_grid = np.arange(0.1, 1.1, 0.1).tolist()

# Logistic regression: one sub-grid per penalty, each with the solvers listed
# for it.
lr_final_param = [
    {
        "thr": list(decile_grid),
        "penalty": [penalty],
        "C": list(decile_grid),
        "solver": solvers,
    }
    for penalty, solvers in (
        ("l1", ["saga"]),
        ("l2", ["lbfgs", "newton-cg", "sag", "saga"]),
    )
]

# Decision tree: integer ranges (upper bounds exclusive).
tree_final_param = {
    "max_depth": list(range(2, 16)),
    "min_samples_split": list(range(2, 5)),
    "min_samples_leaf": list(range(1, 4)),
    "max_leaf_nodes": list(range(6, 30)),
}

# Same order as the meta-learner list: logistic regression, then tree.
param_space_l = [lr_final_param, tree_final_param]

In [31]:
# Candidate meta-learners: thresholded logistic regression and a decision tree.
# NOTE(review): `tree` also names the `sklearn.tree` module imported at the top
# of the notebook; this binding shadows it.
lr, tree = logit_threshold(), DecisionTreeClassifier()
final_model_l = [lr, tree]

In [32]:
# Train and tune the meta-learners on the blending frame (last column is
# passed as the label, the other columns as features).
X_oof = train_oof_blending.iloc[:, :-1]
y_oof = train_oof_blending.iloc[:, -1]

best_res_final, best_test_predict_final = final_model_opt(
    final_model_l, param_space_l, X_oof, y_oof, test_predict_blending
)

In [33]:
# Display the first value returned by `final_model_opt`
# (presumably the best meta-learner score -- confirm in manual_ensemble).
best_res_final

0.8481012658227848

In [34]:
# Test accuracy of the tuned meta-learner: its test-set output is cut at 0.5.
# accuracy_score expects (y_true, y_pred), so the true labels go first.
accuracy_score(y_test, (best_test_predict_final >= 0.5) * 1)

0.7853492333901193

In [35]:
# Stage 1: fresh first-level learners, cross-trained in blending mode with
# test_size=0.1.
lr_hyper = lr_cascade(lr_params_space)
tree_hyper = tree_cascade(tree_params_space)
RF_hyper = RF_cascade(RF_params_space)

estimators = list(zip(("lr", "tree", "rf"), (lr_hyper, tree_hyper, RF_hyper)))

train_oof_blending, test_predict_blending = train_cross(
    X_train_OE, y_train, X_test_OE, blending=True, test_size=0.1, estimators=estimators
)

# Stage 2: train and tune the meta-learners on the blending frame (last column
# is passed as the label, the other columns as features).
lr, tree = logit_threshold(), DecisionTreeClassifier()
final_model_l = [lr, tree]

X_oof = train_oof_blending.iloc[:, :-1]
y_oof = train_oof_blending.iloc[:, -1]

best_res_final1, best_test_predict_final1 = final_model_opt(
    final_model_l, param_space_l, X_oof, y_oof, test_predict_blending
)

100%|██████████| 20/20 [01:44<00:00,  5.24s/trial, best loss: -0.7919527629849921]
100%|██████████| 20/20 [01:54<00:00,  5.75s/trial, best loss: -0.7893204924268622]
100%|██████████| 20/20 [01:47<00:00,  5.37s/trial, best loss: -0.7646012863960163]
100%|██████████| 20/20 [01:37<00:00,  4.86s/trial, best loss: -0.796736288816654] 
100%|██████████| 20/20 [01:32<00:00,  4.60s/trial, best loss: -0.7956840030430874]
100%|██████████| 1000/1000 [00:32<00:00, 30.67trial/s, best loss: -0.7958990939899024]
100%|██████████| 1000/1000 [00:32<00:00, 30.97trial/s, best loss: -0.7861650183276853]
100%|██████████| 1000/1000 [00:32<00:00, 30.84trial/s, best loss: -0.7887986721073379]
100%|██████████| 1000/1000 [00:32<00:00, 30.60trial/s, best loss: -0.8006867694861333]
100%|██████████| 1000/1000 [00:32<00:00, 30.48trial/s, best loss: -0.7996331004910436]
100%|██████████| 500/500 [04:15<00:00,  1.96trial/s, best loss: -0.8129932913756137]
100%|██████████| 500/500 [04:28<00:00,  1.86trial/s, best loss: -

In [36]:
# Stage 1: fresh first-level learners, cross-trained in blending mode with
# test_size=0.2.
lr_hyper = lr_cascade(lr_params_space)
tree_hyper = tree_cascade(tree_params_space)
RF_hyper = RF_cascade(RF_params_space)

estimators = list(zip(("lr", "tree", "rf"), (lr_hyper, tree_hyper, RF_hyper)))

train_oof_blending, test_predict_blending = train_cross(
    X_train_OE, y_train, X_test_OE, blending=True, test_size=0.2, estimators=estimators
)

# Stage 2: train and tune the meta-learners on the blending frame (last column
# is passed as the label, the other columns as features).
lr, tree = logit_threshold(), DecisionTreeClassifier()
final_model_l = [lr, tree]

X_oof = train_oof_blending.iloc[:, :-1]
y_oof = train_oof_blending.iloc[:, -1]

best_res_final2, best_test_predict_final2 = final_model_opt(
    final_model_l, param_space_l, X_oof, y_oof, test_predict_blending
)

100%|██████████| 20/20 [01:38<00:00,  4.93s/trial, best loss: -0.7914201183431953]
100%|██████████| 20/20 [01:25<00:00,  4.30s/trial, best loss: -0.792603550295858] 
100%|██████████| 20/20 [01:33<00:00,  4.67s/trial, best loss: -0.7837278106508876]
100%|██████████| 20/20 [01:36<00:00,  4.83s/trial, best loss: -0.7884615384615385]
100%|██████████| 20/20 [01:34<00:00,  4.70s/trial, best loss: -0.7902366863905326]
100%|██████████| 1000/1000 [00:30<00:00, 33.26trial/s, best loss: -0.7973372781065089]
100%|██████████| 1000/1000 [00:30<00:00, 32.99trial/s, best loss: -0.8041420118343193]
100%|██████████| 1000/1000 [00:30<00:00, 32.50trial/s, best loss: -0.7899408284023668]
100%|██████████| 1000/1000 [00:30<00:00, 32.38trial/s, best loss: -0.8023668639053254]
100%|██████████| 1000/1000 [00:30<00:00, 32.97trial/s, best loss: -0.7911242603550297]
100%|██████████| 500/500 [04:45<00:00,  1.75trial/s, best loss: -0.8038461538461539]
100%|██████████| 500/500 [04:45<00:00,  1.75trial/s, best loss: -

In [37]:
# Stage 1: fresh first-level learners, cross-trained in blending mode with
# test_size=0.3.
lr_hyper = lr_cascade(lr_params_space)
tree_hyper = tree_cascade(tree_params_space)
RF_hyper = RF_cascade(RF_params_space)

estimators = list(zip(("lr", "tree", "rf"), (lr_hyper, tree_hyper, RF_hyper)))

train_oof_blending, test_predict_blending = train_cross(
    X_train_OE, y_train, X_test_OE, blending=True, test_size=0.3, estimators=estimators
)

# Stage 2: train and tune the meta-learners on the blending frame (last column
# is passed as the label, the other columns as features).
lr, tree = logit_threshold(), DecisionTreeClassifier()
final_model_l = [lr, tree]

X_oof = train_oof_blending.iloc[:, :-1]
y_oof = train_oof_blending.iloc[:, -1]

best_res_final3, best_test_predict_final3 = final_model_opt(
    final_model_l, param_space_l, X_oof, y_oof, test_predict_blending
)

100%|██████████| 20/20 [01:28<00:00,  4.43s/trial, best loss: -0.788297434490328] 
100%|██████████| 20/20 [01:29<00:00,  4.50s/trial, best loss: -0.7855907303242328]
100%|██████████| 20/20 [01:31<00:00,  4.57s/trial, best loss: -0.7829657703388668]
100%|██████████| 20/20 [01:26<00:00,  4.34s/trial, best loss: -0.782290094663191]
100%|██████████| 20/20 [01:48<00:00,  5.45s/trial, best loss: -0.789044564869438]
100%|██████████| 1000/1000 [00:28<00:00, 34.73trial/s, best loss: -0.8038642703617324]
100%|██████████| 1000/1000 [00:28<00:00, 35.08trial/s, best loss: -0.7994632322678008]
100%|██████████| 1000/1000 [00:28<00:00, 34.91trial/s, best loss: -0.8015559976219876]
100%|██████████| 1000/1000 [00:29<00:00, 34.26trial/s, best loss: -0.7866825581927104]
100%|██████████| 1000/1000 [00:28<00:00, 34.58trial/s, best loss: -0.8123713815338182]
100%|██████████| 500/500 [04:19<00:00,  1.93trial/s, best loss: -0.8079120364018841]
100%|██████████| 500/500 [03:48<00:00,  2.19trial/s, best loss: -0.

In [38]:
# Stage 1: fresh first-level learners, cross-trained in blending mode with
# test_size=0.4.
lr_hyper = lr_cascade(lr_params_space)
tree_hyper = tree_cascade(tree_params_space)
RF_hyper = RF_cascade(RF_params_space)

estimators = list(zip(("lr", "tree", "rf"), (lr_hyper, tree_hyper, RF_hyper)))

train_oof_blending, test_predict_blending = train_cross(
    X_train_OE, y_train, X_test_OE, blending=True, test_size=0.4, estimators=estimators
)

# Stage 2: train and tune the meta-learners on the blending frame (last column
# is passed as the label, the other columns as features).
lr, tree = logit_threshold(), DecisionTreeClassifier()
final_model_l = [lr, tree]

X_oof = train_oof_blending.iloc[:, :-1]
y_oof = train_oof_blending.iloc[:, -1]

best_res_final4, best_test_predict_final4 = final_model_opt(
    final_model_l, param_space_l, X_oof, y_oof, test_predict_blending
)

100%|██████████| 20/20 [01:17<00:00,  3.89s/trial, best loss: -0.7609467455621302]
100%|██████████| 20/20 [01:22<00:00,  4.11s/trial, best loss: -0.7893491124260354]
100%|██████████| 20/20 [01:21<00:00,  4.08s/trial, best loss: -0.778698224852071] 
100%|██████████| 20/20 [01:20<00:00,  4.01s/trial, best loss: -0.7834319526627219]
100%|██████████| 20/20 [01:25<00:00,  4.28s/trial, best loss: -0.7886447995775676]
100%|██████████| 1000/1000 [00:27<00:00, 36.79trial/s, best loss: -0.7893491124260356]
100%|██████████| 1000/1000 [00:26<00:00, 37.12trial/s, best loss: -0.7964497041420119]
100%|██████████| 1000/1000 [00:26<00:00, 38.24trial/s, best loss: -0.791715976331361]
100%|██████████| 1000/1000 [00:26<00:00, 37.95trial/s, best loss: -0.7952662721893491]
100%|██████████| 1000/1000 [00:26<00:00, 37.97trial/s, best loss: -0.7981099256084113]
100%|██████████| 500/500 [03:47<00:00,  2.20trial/s, best loss: -0.8039447731755424]
100%|██████████| 500/500 [04:07<00:00,  2.02trial/s, best loss: -0

KeyboardInterrupt: 

In [None]:
# Stage 1: fresh first-level learners, cross-trained in blending mode with
# test_size=0.5.
lr_hyper = lr_cascade(lr_params_space)
tree_hyper = tree_cascade(tree_params_space)
RF_hyper = RF_cascade(RF_params_space)

estimators = list(zip(("lr", "tree", "rf"), (lr_hyper, tree_hyper, RF_hyper)))

train_oof_blending, test_predict_blending = train_cross(
    X_train_OE, y_train, X_test_OE, blending=True, test_size=0.5, estimators=estimators
)

# Stage 2: train and tune the meta-learners on the blending frame (last column
# is passed as the label, the other columns as features).
lr, tree = logit_threshold(), DecisionTreeClassifier()
final_model_l = [lr, tree]

X_oof = train_oof_blending.iloc[:, :-1]
y_oof = train_oof_blending.iloc[:, -1]

best_res_final5, best_test_predict_final5 = final_model_opt(
    final_model_l, param_space_l, X_oof, y_oof, test_predict_blending
)

In [None]:
# One column per blending run: the test-set output of each run's best
# meta-learner.
Blending_res = pd.DataFrame(
    dict(
        zip(
            ["res1", "res2", "res3", "res4", "res5"],
            [
                best_test_predict_final1,
                best_test_predict_final2,
                best_test_predict_final3,
                best_test_predict_final4,
                best_test_predict_final5,
            ],
        )
    )
)

In [None]:
# Write the per-run test predictions to a local CSV file (row index omitted).
Blending_res.to_csv("Blending_res.csv", index=False)

In [None]:
# Test accuracy of the equal-weight fusion (row-wise mean of the five runs).
# - accuracy_score expects (y_true, y_pred), so the true labels go first.
# - The cut uses `>= 0.5`, matching the threshold rule applied to the single
#   meta-learner output and in VotingClassifier_threshold.predict.
# - The boolean decision is cast to int so it has the same type as the labels.
accuracy_score(y_test, (Blending_res.mean(axis=1) >= 0.5).astype(int))

In [None]:
# Side-by-side view of the first value returned by `final_model_opt` in each
# of the five blending runs.
pd.Series(
    {
        "best_res_final1": best_res_final1,
        "best_res_final2": best_res_final2,
        "best_res_final3": best_res_final3,
        "best_res_final4": best_res_final4,
        "best_res_final5": best_res_final5,
    }
)

In [None]:
# Weighted-average fusion of the five runs; the weights sum to 15.
fusion_weights = {"res1": 4, "res2": 5, "res3": 2, "res4": 3, "res5": 1}

weighted_total = sum(
    Blending_res[column] * weight for column, weight in fusion_weights.items()
)
Blending_res1 = weighted_total / sum(fusion_weights.values())

In [None]:
# Test accuracy of the weighted fusion.
# - accuracy_score expects (y_true, y_pred), so the true labels go first.
# - The cut uses `>= 0.5`, matching the threshold rule applied to the single
#   meta-learner output and in VotingClassifier_threshold.predict.
# - The boolean decision is cast to int so it has the same type as the labels.
accuracy_score(y_test, (Blending_res1 >= 0.5).astype(int))