In [10]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings

warnings.filterwarnings('ignore')

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# 自定义模块
from telcoFunc import *

# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc

In [11]:
# Load the raw Telco customer-churn table
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Column roles
# Categorical feature columns
category_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Continuous feature columns
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Label column
target = 'Churn'

# Identifier column
ID_col = 'customerID'

# Every column must be accounted for: features + label + ID
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# TotalCharges: a single-space string is turned into NaN before the cast to float
tcc['TotalCharges'] = (
    tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# Fill the missing TotalCharges entries with 0
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# Encode the label as 1 (Yes) / 0 (No).
# The result is assigned back to the frame instead of calling
# `tcc['Churn'].replace(..., inplace=True)`: an in-place call on a column
# selection does not update `tcc` once pandas Copy-on-Write is active.
churn_codes = tcc['Churn'].map({'Yes': 1, 'No': 0})
# Fail loudly if the column holds anything other than 'Yes' / 'No'
assert churn_codes.notna().all(), "unexpected value in 'Churn' column"
tcc['Churn'] = churn_codes.astype(int)

In [12]:
# Full feature matrix (ID and label columns removed) and the label vector,
# both taken as copies of the cleaned table
features = tcc.drop([ID_col, target], axis=1).copy()
labels = tcc.loc[:, 'Churn'].copy()

In [13]:
# Split into training and test sets with a fixed seed
train, test = train_test_split(tcc, random_state=22)

_non_feature_cols = [ID_col, target]
X_train = train.drop(columns=_non_feature_cols).copy()
X_test = test.drop(columns=_non_feature_cols).copy()

y_train = train['Churn'].copy()
y_test = test['Churn'].copy()


def _tenure_calendar(tenure):
    """Build year / month / quarter columns from a tenure column.

    The offset ``72 - tenure`` is split into whole years (added to 2014) and a
    1-based month; the quarter is then derived from that month.
    """
    offset = 72 - tenure
    calendar = pd.DataFrame()
    # Year feature
    calendar['tenure_year'] = (offset // 12) + 2014
    # Month feature (1..12)
    calendar['tenure_month'] = offset % 12 + 1
    # Quarter feature (1..4)
    calendar['tenure_quarter'] = ((calendar['tenure_month'] - 1) // 3) + 1
    return calendar


X_train_seq = _tenure_calendar(X_train['tenure'])
X_test_seq = _tenure_calendar(X_test['tenure'])

# One-hot encode the calendar features; the encoder is fitted on the training split only
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

seq_new = list(X_train_seq.columns)

# Rebuild labelled frames from the encoded matrices, keeping the original row index
X_train_seq = pd.DataFrame(
    enc.transform(X_train_seq).toarray(),
    columns=cate_colName(enc, seq_new, drop=None),
    index=X_train.index,
)

X_test_seq = pd.DataFrame(
    enc.transform(X_test_seq).toarray(),
    columns=cate_colName(enc, seq_new, drop=None),
    index=X_test.index,
)

In [14]:
# Ordinal-encode the categorical columns (encoder fitted on the training split
# only) and append the untouched numeric columns, preserving each split's index
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])

X_train_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_train[category_cols]),
            columns=category_cols,
            index=X_train.index,
        ),
        X_train[numeric_cols],
    ],
    axis=1,
)

X_test_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_test[category_cols]),
            columns=category_cols,
            index=X_test.index,
        ),
        X_test[numeric_cols],
    ],
    axis=1,
)

In [15]:
# 本节新增第三方库
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [16]:
class VotingClassifier_threshold(ClassifierMixin, TransformerMixin, BaseEstimator):
    """Voting ensemble with a tunable decision threshold for soft voting.

    Wraps ``sklearn.ensemble.VotingClassifier``. With ``voting="soft"`` the
    prediction is made by comparing the probability in column 1 of
    ``predict_proba`` against ``thr`` (a binary problem is assumed); with any
    other ``voting`` value the wrapped classifier's own ``predict`` is used.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Passed through to ``VotingClassifier``.
    voting : str, default="hard"
        Passed through to ``VotingClassifier``.
    weights : sequence of float or None, default=None
        Passed through to ``VotingClassifier``.
    thr : float, default=0.5
        Threshold applied to the column-1 probability under soft voting.

    Attributes
    ----------
    clf : VotingClassifier
        The fitted wrapped ensemble (set by ``fit``).
    classes_ : ndarray
        Class labels seen by the wrapped ensemble (set by ``fit``).
    """

    # The mixins are listed before BaseEstimator, as the scikit-learn developer
    # guide requires for a correct method-resolution order.

    def __init__(self, estimators, voting="hard", weights=None, thr=0.5):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        self.thr = thr

    def fit(self, X, y):
        """Fit a fresh ``VotingClassifier`` on ``X``/``y`` and return ``self``."""
        VC = VotingClassifier(
            estimators=self.estimators, voting=self.voting, weights=self.weights
        )

        VC.fit(X, y)
        self.clf = VC
        # Expose the fitted labels: scikit-learn helpers that inspect a fitted
        # classifier read this attribute.
        self.classes_ = VC.classes_

        return self

    def predict_proba(self, X):
        """Return class probabilities under soft voting, otherwise ``None``."""
        return self.clf.predict_proba(X) if self.voting == "soft" else None

    def predict(self, X):
        """Predict labels; under soft voting apply ``thr`` to column 1."""
        if self.voting != "soft":
            return self.clf.predict(X)

        above_thr = self.clf.predict_proba(X)[:, 1] >= self.thr
        # Map 0/1 positions back to the fitted labels. For labels [0, 1] this is
        # exactly the 0/1 array the plain threshold produces.
        return self.clf.classes_[above_thr.astype(int)]

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against ``y``."""
        return accuracy_score(y, self.predict(X))

In [17]:
# KFold is not imported by name in the cells above (it could only arrive through
# a star import), so import it explicitly where it is used.
from sklearn.model_selection import KFold

# Instantiate the 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, random_state=12, shuffle=True)

# Reset the training features/labels to a 0..n-1 index so the positional
# indices produced by KFold line up with the row labels
X_train_OE = X_train_OE.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

train_part_index_l = []
eval_index_l = []

for train_part_index, eval_index in kf.split(X_train_OE, y_train):
    train_part_index_l.append(train_part_index)
    eval_index_l.append(eval_index)

# KFold yields positions, so select with .iloc (identical to .loc here because
# the index was just reset)
train_set = [
    (X_train_OE.iloc[positions], y_train.iloc[positions])
    for positions in train_part_index_l
]

eval_set = [
    (X_train_OE.iloc[positions], y_train.iloc[positions])
    for positions in eval_index_l
]

# Per-fold training features and labels
(
    (X_train1, y_train1),
    (X_train2, y_train2),
    (X_train3, y_train3),
    (X_train4, y_train4),
    (X_train5, y_train5),
) = train_set

# Per-fold validation features and labels
(
    (X_eval1, y_eval1),
    (X_eval2, y_eval2),
    (X_eval3, y_eval3),
    (X_eval4, y_eval4),
    (X_eval5, y_eval5),
) = eval_set

In [18]:
# NOTE: joblib.load unpickles its input, so these paths must only point at
# trusted model files.

# Random-forest group: load the five saved search objects, then take each one's
# best estimator
grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5 = map(
    load,
    (
        "./model/grid_RF_1.joblib",
        "./model/grid_RF_2.joblib",
        "./model/grid_RF_3.joblib",
        "./model/grid_RF_4.joblib",
        "./model/grid_RF_5.joblib",
    ),
)

RF_l = [
    search.best_estimator_
    for search in (grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5)
]
RF_1, RF_2, RF_3, RF_4, RF_5 = RF_l

# Decision-tree group
grid_tree_1, grid_tree_2, grid_tree_3, grid_tree_4, grid_tree_5 = map(
    load,
    (
        "./model/grid_tree_1.joblib",
        "./model/grid_tree_2.joblib",
        "./model/grid_tree_3.joblib",
        "./model/grid_tree_4.joblib",
        "./model/grid_tree_5.joblib",
    ),
)

tree_l = [
    search.best_estimator_
    for search in (grid_tree_1, grid_tree_2, grid_tree_3, grid_tree_4, grid_tree_5)
]
tree_1, tree_2, tree_3, tree_4, tree_5 = tree_l

# Logistic-regression group
grid_lr_1, grid_lr_2, grid_lr_3, grid_lr_4, grid_lr_5 = map(
    load,
    (
        "./model/grid_lr_1.joblib",
        "./model/grid_lr_2.joblib",
        "./model/grid_lr_3.joblib",
        "./model/grid_lr_4.joblib",
        "./model/grid_lr_5.joblib",
    ),
)

lr_l = [
    search.best_estimator_
    for search in (grid_lr_1, grid_lr_2, grid_lr_3, grid_lr_4, grid_lr_5)
]
lr_1, lr_2, lr_3, lr_4, lr_5 = lr_l

In [19]:
# For every model family, model k scores validation fold k; the five per-fold
# Series (column-1 probabilities, indexed like the fold) are then stitched back
# together in row order.
_eval_folds = (X_eval1, X_eval2, X_eval3, X_eval4, X_eval5)

# Random forest
(
    eval1_predict_proba_RF,
    eval2_predict_proba_RF,
    eval3_predict_proba_RF,
    eval4_predict_proba_RF,
    eval5_predict_proba_RF,
) = [
    pd.Series(model.predict_proba(X_fold)[:, 1], index=X_fold.index)
    for model, X_fold in zip(RF_l, _eval_folds)
]

eval_predict_proba_RF = pd.concat(
    [
        eval1_predict_proba_RF,
        eval2_predict_proba_RF,
        eval3_predict_proba_RF,
        eval4_predict_proba_RF,
        eval5_predict_proba_RF,
    ]
).sort_index()

# Decision tree
(
    eval1_predict_proba_tree,
    eval2_predict_proba_tree,
    eval3_predict_proba_tree,
    eval4_predict_proba_tree,
    eval5_predict_proba_tree,
) = [
    pd.Series(model.predict_proba(X_fold)[:, 1], index=X_fold.index)
    for model, X_fold in zip(tree_l, _eval_folds)
]

eval_predict_proba_tree = pd.concat(
    [
        eval1_predict_proba_tree,
        eval2_predict_proba_tree,
        eval3_predict_proba_tree,
        eval4_predict_proba_tree,
        eval5_predict_proba_tree,
    ]
).sort_index()

# Logistic regression
(
    eval1_predict_proba_lr,
    eval2_predict_proba_lr,
    eval3_predict_proba_lr,
    eval4_predict_proba_lr,
    eval5_predict_proba_lr,
) = [
    pd.Series(model.predict_proba(X_fold)[:, 1], index=X_fold.index)
    for model, X_fold in zip(lr_l, _eval_folds)
]

eval_predict_proba_lr = pd.concat(
    [
        eval1_predict_proba_lr,
        eval2_predict_proba_lr,
        eval3_predict_proba_lr,
        eval4_predict_proba_lr,
        eval5_predict_proba_lr,
    ]
).sort_index()

In [20]:
# Test-set probability per family: stack the five models' column-1
# probabilities and take the plain (unweighted) mean across models
test_predict_proba_RF = np.array(
    [model.predict_proba(X_test_OE)[:, 1] for model in RF_l]
).mean(axis=0)

test_predict_proba_tree = np.array(
    [model.predict_proba(X_test_OE)[:, 1] for model in tree_l]
).mean(axis=0)

test_predict_proba_lr = np.array(
    [model.predict_proba(X_test_OE)[:, 1] for model in lr_l]
).mean(axis=0)

In [21]:
# Search space: one decision threshold in [0.4, 0.6] and three blend weights in [0, 1]
params_space = {
    "thr": hp.uniform("thr", 0.4, 0.6),
    **{
        label: hp.uniform(label, 0, 1)
        for label in ("weight1", "weight2", "weight3")
    },
}

In [22]:
# Objective function
def hyperopt_objective_weight(params):
    """Negative accuracy of a three-family weighted blend on the test split.

    ``params`` supplies ``thr`` and ``weight1``/``weight2``/``weight3``, which
    weight the logistic-regression, decision-tree and random-forest test
    probabilities respectively. The weighted average is thresholded at ``thr``
    and scored against ``y_test``.

    NOTE(review): the score is computed on the test split itself, so the best
    loss found by a search over this function is an optimistic figure —
    confirm this is meant only as an upper-bound illustration.
    """
    thr = params["thr"]
    w_lr, w_tree, w_rf = params["weight1"], params["weight2"], params["weight3"]

    weighted_total = (
        test_predict_proba_lr * w_lr
        + test_predict_proba_tree * w_tree
        + test_predict_proba_RF * w_rf
    )
    blended = weighted_total / (w_lr + w_tree + w_rf)

    hard_labels = (blended >= thr) * 1

    # hyperopt minimises, hence the sign flip
    return -accuracy_score(hard_labels, y_test)

In [23]:
# Optimisation driver
def param_hyperopt_weight(max_evals):
    """Run a seeded TPE search and return the best parameter dict.

    ``hyperopt_objective_weight`` and ``params_space`` are looked up when this
    function is called, so the most recently executed definitions are used.
    """
    search_result = fmin(
        fn=hyperopt_objective_weight,
        space=params_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        rstate=np.random.default_rng(17),
    )
    return search_result

In [24]:
# Run 5000 search trials with the currently defined objective and search space
params_best = param_hyperopt_weight(5000)

100%|██████████| 5000/5000 [01:26<00:00, 57.65trial/s, best loss: -0.8006814310051107] 


In [25]:
# Display the threshold and weights returned by the search
params_best

{'thr': 0.47024504133638434,
 'weight1': 0.0012022354762377982,
 'weight2': 0.0054382475309307995,
 'weight3': 0.6089296721079599}

In [26]:
# Search space: one threshold in [0.4, 0.6] plus an independent [0, 1] weight
# for each of the 15 models (5 folds x lr / tree / RF)
params_space = {
    "thr": hp.uniform("thr", 0.4, 0.6),
    **{
        label: hp.uniform(label, 0, 1)
        for label in (
            "weight_lr1",
            "weight_lr2",
            "weight_lr3",
            "weight_lr4",
            "weight_lr5",
            "weight_tree1",
            "weight_tree2",
            "weight_tree3",
            "weight_tree4",
            "weight_tree5",
            "weight_RF1",
            "weight_RF2",
            "weight_RF3",
            "weight_RF4",
            "weight_RF5",
        )
    },
}

In [27]:
# Objective function
def hyperopt_objective_weight(params):
    """Negative accuracy of per-fold weighted blends on the validation folds.

    For fold k the column-1 probabilities of ``lr_k``, ``tree_k`` and ``RF_k``
    on ``X_evalk`` are averaged with the weights ``weight_lrk``,
    ``weight_treek`` and ``weight_RFk`` (normalised by their sum). The five
    fold results are concatenated, put back in row order, thresholded at
    ``thr`` and scored against ``y_train``.

    Note that every call recomputes all 15 ``predict_proba`` results.
    """
    thr = params["thr"]

    # One row per fold: held-out features, then (model, weight key) for lr / tree / RF
    fold_specs = (
        (X_eval1, (lr_1, "weight_lr1"), (tree_1, "weight_tree1"), (RF_1, "weight_RF1")),
        (X_eval2, (lr_2, "weight_lr2"), (tree_2, "weight_tree2"), (RF_2, "weight_RF2")),
        (X_eval3, (lr_3, "weight_lr3"), (tree_3, "weight_tree3"), (RF_3, "weight_RF3")),
        (X_eval4, (lr_4, "weight_lr4"), (tree_4, "weight_tree4"), (RF_4, "weight_RF4")),
        (X_eval5, (lr_5, "weight_lr5"), (tree_5, "weight_tree5"), (RF_5, "weight_RF5")),
    )

    fold_blends = []
    for X_fold, (lr_model, lr_key), (tree_model, tree_key), (rf_model, rf_key) in fold_specs:
        w_lr, w_tree, w_rf = params[lr_key], params[tree_key], params[rf_key]

        proba_lr = pd.Series(lr_model.predict_proba(X_fold)[:, 1], index=X_fold.index)
        proba_tree = pd.Series(tree_model.predict_proba(X_fold)[:, 1], index=X_fold.index)
        proba_rf = pd.Series(rf_model.predict_proba(X_fold)[:, 1], index=X_fold.index)

        fold_blends.append(
            (proba_lr * w_lr + proba_tree * w_tree + proba_rf * w_rf)
            / (w_lr + w_tree + w_rf)
        )

    eval_predict_proba_weight = pd.concat(fold_blends).sort_index()

    eval_predict = (eval_predict_proba_weight >= thr) * 1

    # hyperopt minimises, hence the sign flip
    return -accuracy_score(eval_predict, y_train)

In [28]:
def param_hyperopt_weight(max_evals):
    """Run a seeded TPE search and return the best parameter dict.

    ``hyperopt_objective_weight`` and ``params_space`` are looked up when this
    function is called, so the most recently executed definitions are used.
    """
    search_result = fmin(
        fn=hyperopt_objective_weight,
        space=params_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        rstate=np.random.default_rng(2),
    )
    return search_result

In [29]:
# Run 5000 search trials over the 15 per-model weights and the threshold
best_params = param_hyperopt_weight(5000)

100%|██████████| 5000/5000 [10:08<00:00,  8.22trial/s, best loss: -0.8244982960999622]


In [30]:
# Display the threshold and per-model weights returned by the search
best_params

{'thr': 0.47419191141454525,
 'weight_RF1': 0.13388324321898987,
 'weight_RF2': 0.809291476570488,
 'weight_RF3': 0.8180431149306414,
 'weight_RF4': 0.449932107868016,
 'weight_RF5': 0.7528134149351241,
 'weight_lr1': 0.9985040011202883,
 'weight_lr2': 0.39454849420821103,
 'weight_lr3': 6.261034632367574e-05,
 'weight_lr4': 0.7783270953869424,
 'weight_lr5': 0.8593342729626122,
 'weight_tree1': 0.2252489955409815,
 'weight_tree2': 0.16220323052327767,
 'weight_tree3': 0.0004439727975010005,
 'weight_tree4': 0.09031007610111841,
 'weight_tree5': 0.2592556306534197}

In [31]:
def muti_weight_test_acc(params):
    """Test-split accuracy of the 15-model weighted blend.

    Each model's column-1 probability on ``X_test_OE`` is multiplied by its
    weight from ``params``; the sum is divided by the total of all 15 weights,
    thresholded at ``params["thr"]`` and scored against ``y_test``.

    Returns
    -------
    float
        Accuracy on the test split.
    """
    thr = params["thr"]

    lr_keys = ("weight_lr1", "weight_lr2", "weight_lr3", "weight_lr4", "weight_lr5")
    tree_keys = (
        "weight_tree1",
        "weight_tree2",
        "weight_tree3",
        "weight_tree4",
        "weight_tree5",
    )
    rf_keys = ("weight_RF1", "weight_RF2", "weight_RF3", "weight_RF4", "weight_RF5")

    # Per-family weight totals, combined in lr -> tree -> RF order
    weight_lr_sum = np.array([params[key] for key in lr_keys]).sum()
    weight_tree_sum = np.array([params[key] for key in tree_keys]).sum()
    weight_RF_sum = np.array([params[key] for key in rf_keys]).sum()
    weight_total = weight_lr_sum + weight_tree_sum + weight_RF_sum

    members = (
        list(zip((lr_1, lr_2, lr_3, lr_4, lr_5), lr_keys))
        + list(zip((tree_1, tree_2, tree_3, tree_4, tree_5), tree_keys))
        + list(zip((RF_1, RF_2, RF_3, RF_4, RF_5), rf_keys))
    )

    # Accumulate the weighted probabilities in lr -> tree -> RF order
    weighted_total = None
    for model, key in members:
        contribution = model.predict_proba(X_test_OE)[:, 1] * params[key]
        weighted_total = (
            contribution if weighted_total is None else weighted_total + contribution
        )

    test_predict_proba = weighted_total / weight_total

    test_predict = (test_predict_proba >= thr) * 1

    return accuracy_score(test_predict, y_test)

In [32]:
# Test-split accuracy of the 15-model blend using the weights found above
muti_weight_test_acc(best_params)

0.7927314026121521

In [33]:
# (name, estimator) pairs for the five random-forest models
estimators_RF = list(
    zip(
        ("RF_1", "RF_2", "RF_3", "RF_4", "RF_5"),
        (RF_1, RF_2, RF_3, RF_4, RF_5),
    )
)

In [34]:
# Search space: one decision threshold in [0.4, 0.6] and five model weights in [0, 1]
params_space = {
    "thr": hp.uniform("thr", 0.4, 0.6),
    **{
        label: hp.uniform(label, 0, 1)
        for label in ("weight1", "weight2", "weight3", "weight4", "weight5")
    },
}

In [35]:
# Objective function
def hyperopt_objective_weight(params):
    """Negative mean 5-fold CV accuracy of the weighted random-forest ensemble.

    Builds a ``VotingClassifier_threshold`` over ``estimators_RF`` with soft
    voting, the five weights ``weight1..weight5`` and threshold ``thr`` from
    ``params``, then cross-validates it on ``X_train_OE`` / ``y_train``.
    """
    thr = params["thr"]
    weights = [
        params[key]
        for key in ("weight1", "weight2", "weight3", "weight4", "weight5")
    ]

    # Thresholded soft-voting ensemble for this trial
    ensemble = VotingClassifier_threshold(
        estimators=estimators_RF, weights=weights, voting="soft", thr=thr
    )

    # Accuracy on each of the five validation folds
    fold_scores = cross_val_score(
        ensemble, X_train_OE, y_train, scoring="accuracy", n_jobs=-1, cv=5
    )

    # hyperopt minimises, hence the sign flip
    return -fold_scores.mean()

In [36]:
# Optimisation driver
def param_hyperopt_weight(max_evals):
    """Run a seeded TPE search and return the best parameter dict.

    ``hyperopt_objective_weight`` and ``params_space`` are looked up when this
    function is called, so the most recently executed definitions are used.
    """
    search_result = fmin(
        fn=hyperopt_objective_weight,
        space=params_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        rstate=np.random.default_rng(2),
    )
    return search_result

In [37]:
# Run 500 search trials for the random-forest ensemble weights and threshold
best_params = param_hyperopt_weight(500)

100%|██████████| 500/500 [05:23<00:00,  1.55trial/s, best loss: -0.8080260385310055]


In [38]:
def weights_extract(best_params):
    """Return the ``weight1..weightK`` entries of ``best_params`` scaled to sum to 1.

    Weights are read in numeric order starting from ``weight1`` and stopping at
    the first missing index, so any number of consecutive weights is accepted
    (five for the per-family searches in this notebook). Other keys, such as
    ``thr``, are ignored.

    Parameters
    ----------
    best_params : dict
        Mapping that contains ``weight1``, ``weight2``, ... as numbers.

    Returns
    -------
    list
        The weights divided by their sum, in ``weight1..weightK`` order.

    Raises
    ------
    KeyError
        If ``weight1`` is absent.
    ZeroDivisionError
        If the weights sum to zero, since no normalisation exists.
    """
    raw_weights = []
    position = 1
    while f"weight{position}" in best_params:
        raw_weights.append(best_params[f"weight{position}"])
        position += 1

    if not raw_weights:
        raise KeyError("weight1")

    # Plain left-to-right accumulation keeps the result bit-identical to
    # writing out `weight1 + weight2 + ...` by hand.
    weights_sum = 0.0
    for weight in raw_weights:
        weights_sum += weight

    # Without this check numpy floats would silently produce NaN weights.
    if weights_sum == 0:
        raise ZeroDivisionError("all ensemble weights are zero; cannot normalise")

    return [weight / weights_sum for weight in raw_weights]

In [39]:
# Normalised weights for the five random-forest models
RF_weights = weights_extract(best_params)

In [40]:
# Weighted blend of the five random-forest models' column-1 probabilities on the
# training features.
# NOTE(review): every model scores all of X_train_OE here; if model k was fitted
# on the training part of fold k (fitting is not shown), most of these are
# in-sample predictions rather than held-out ones — confirm this is intended.
eval_predict_proba_RF = sum(
    model.predict_proba(X_train_OE)[:, 1] * weight
    for model, weight in zip(RF_l, RF_weights)
)

eval_predict_proba_RF

array([0.03915926, 0.54221843, 0.12445596, ..., 0.56028362, 0.03539781,
       0.01897568])

In [41]:
# Weighted blend of the five random-forest models' column-1 probabilities on the
# test features
test_predict_proba_RF = sum(
    model.predict_proba(X_test_OE)[:, 1] * weight
    for model, weight in zip(RF_l, RF_weights)
)

test_predict_proba_RF

array([0.03260294, 0.30040562, 0.01954919, ..., 0.14389084, 0.52681528,
       0.10989063])

In [42]:
# (name, estimator) pairs for the five logistic-regression models
estimators_lr = list(
    zip(
        ("lr_1", "lr_2", "lr_3", "lr_4", "lr_5"),
        (lr_1, lr_2, lr_3, lr_4, lr_5),
    )
)

In [43]:
# Search space: one decision threshold in [0.4, 0.6] and five model weights in [0, 1]
params_space = {
    "thr": hp.uniform("thr", 0.4, 0.6),
    **{
        label: hp.uniform(label, 0, 1)
        for label in ("weight1", "weight2", "weight3", "weight4", "weight5")
    },
}

In [44]:
# Objective function
def hyperopt_objective_weight(params):
    """Negative mean 5-fold CV accuracy of the weighted logistic-regression ensemble.

    Builds a ``VotingClassifier_threshold`` over ``estimators_lr`` with soft
    voting, the five weights ``weight1..weight5`` and threshold ``thr`` from
    ``params``, then cross-validates it on ``X_train_OE`` / ``y_train``.
    """
    thr = params["thr"]
    weights = [
        params[key]
        for key in ("weight1", "weight2", "weight3", "weight4", "weight5")
    ]

    # Thresholded soft-voting ensemble for this trial
    ensemble = VotingClassifier_threshold(
        estimators=estimators_lr, weights=weights, voting="soft", thr=thr
    )

    # Accuracy on each of the five validation folds
    fold_scores = cross_val_score(
        ensemble, X_train_OE, y_train, scoring="accuracy", n_jobs=15, cv=5
    )

    # hyperopt minimises, hence the sign flip
    return -fold_scores.mean()

In [45]:
# Optimisation driver
def param_hyperopt_weight(max_evals):
    """Run a seeded TPE search and return the best parameter dict.

    ``hyperopt_objective_weight`` and ``params_space`` are looked up when this
    function is called, so the most recently executed definitions are used.
    """
    return fmin(
        fn=hyperopt_objective_weight,
        space=params_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        rstate=np.random.default_rng(17),
    )

In [46]:
# Run 300 search trials for the logistic-regression ensemble weights and threshold
best_params = param_hyperopt_weight(300)

100%|██████████| 300/300 [02:02<00:00,  2.45trial/s, best loss: -0.8116230899343482]


In [47]:
# Normalised weights for the five logistic-regression models
lr_weights = weights_extract(best_params)

In [48]:
# Weighted blend of the five logistic-regression models' column-1 probabilities
# on the training features.
# NOTE(review): every model scores all of X_train_OE here; if model k was fitted
# on the training part of fold k (fitting is not shown), most of these are
# in-sample predictions rather than held-out ones — confirm this is intended.
eval_predict_proba_lr = sum(
    model.predict_proba(X_train_OE)[:, 1] * weight
    for model, weight in zip(lr_l, lr_weights)
)

eval_predict_proba_lr

array([0.01259023, 0.55030369, 0.14584096, ..., 0.68102564, 0.05046448,
       0.00386074])

In [49]:
# Weighted blend of the five logistic-regression models' column-1 probabilities
# on the test features
test_predict_proba_lr = sum(
    model.predict_proba(X_test_OE)[:, 1] * weight
    for model, weight in zip(lr_l, lr_weights)
)

test_predict_proba_lr

array([0.04146178, 0.23631037, 0.00483358, ..., 0.13061298, 0.50063975,
       0.06713644])

In [50]:
# (name, estimator) pairs for the five decision-tree models
estimators_tree = list(
    zip(
        ("tree_1", "tree_2", "tree_3", "tree_4", "tree_5"),
        (tree_1, tree_2, tree_3, tree_4, tree_5),
    )
)

In [51]:
# Search space: one decision threshold in [0.4, 0.6] and five model weights in [0, 1]
params_space = {
    "thr": hp.uniform("thr", 0.4, 0.6),
    **{
        label: hp.uniform(label, 0, 1)
        for label in ("weight1", "weight2", "weight3", "weight4", "weight5")
    },
}

In [52]:
# Objective function
def hyperopt_objective_weight(params):
    """Negative mean 5-fold CV accuracy of the weighted decision-tree ensemble.

    Builds a ``VotingClassifier_threshold`` over ``estimators_tree`` with soft
    voting, the five weights ``weight1..weight5`` and threshold ``thr`` from
    ``params``, then cross-validates it on ``X_train_OE`` / ``y_train``.
    """
    thr = params["thr"]
    weights = [
        params[key]
        for key in ("weight1", "weight2", "weight3", "weight4", "weight5")
    ]

    # Thresholded soft-voting ensemble for this trial
    ensemble = VotingClassifier_threshold(
        estimators=estimators_tree, weights=weights, voting="soft", thr=thr
    )

    # Accuracy on each of the five validation folds
    fold_scores = cross_val_score(
        ensemble, X_train_OE, y_train, scoring="accuracy", n_jobs=15, cv=5
    )

    # hyperopt minimises, hence the sign flip
    return -fold_scores.mean()

In [53]:
# Optimisation driver
def param_hyperopt_weight(max_evals):
    """Run a seeded TPE search and return the best parameter dict.

    ``hyperopt_objective_weight`` and ``params_space`` are looked up when this
    function is called, so the most recently executed definitions are used.
    """
    return fmin(
        fn=hyperopt_objective_weight,
        space=params_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        rstate=np.random.default_rng(17),
    )

In [54]:
# Run 300 search trials for the decision-tree ensemble weights and threshold
best_params = param_hyperopt_weight(300)

100%|██████████| 300/300 [00:14<00:00, 20.93trial/s, best loss: -0.797234167598406]


In [55]:
# Normalised weights for the five decision-tree models
tree_weights = weights_extract(best_params)

In [56]:
# Weighted blend of the five decision-tree models' column-1 probabilities on the
# training features.
# NOTE(review): every model scores all of X_train_OE here; if model k was fitted
# on the training part of fold k (fitting is not shown), most of these are
# in-sample predictions rather than held-out ones — confirm this is intended.
eval_predict_proba_tree = sum(
    model.predict_proba(X_train_OE)[:, 1] * weight
    for model, weight in zip(tree_l, tree_weights)
)

eval_predict_proba_tree

array([0.04365352, 0.74609312, 0.16049057, ..., 0.39059284, 0.04624722,
       0.04365352])

In [57]:
# Weighted blend of the five decision-tree models' column-1 probabilities on the
# test features
test_predict_proba_tree = sum(
    model.predict_proba(X_test_OE)[:, 1] * weight
    for model, weight in zip(tree_l, tree_weights)
)

test_predict_proba_tree

array([0.04365352, 0.1870317 , 0.04365352, ..., 0.1870317 , 0.45025595,
       0.17253686])

In [58]:
# Search space: one decision threshold in [0.4, 0.6] and three family weights in [0, 1]
params_space = {
    "thr": hp.uniform("thr", 0.4, 0.6),
    **{
        label: hp.uniform(label, 0, 1)
        for label in ("weight1", "weight2", "weight3")
    },
}

In [59]:
# Objective function
def hyperopt_objective_weight(params):
    """Negative accuracy of the three-family blend against the training labels.

    ``weight1``/``weight2``/``weight3`` weight ``eval_predict_proba_lr``,
    ``eval_predict_proba_tree`` and ``eval_predict_proba_RF`` respectively; the
    weighted average is thresholded at ``thr`` and scored against ``y_train``.

    NOTE(review): the score is only a fair validation estimate if those three
    arrays hold held-out predictions — confirm how they were last computed.
    """
    thr = params["thr"]
    w_lr, w_tree, w_rf = params["weight1"], params["weight2"], params["weight3"]

    weighted_total = (
        eval_predict_proba_lr * w_lr
        + eval_predict_proba_tree * w_tree
        + eval_predict_proba_RF * w_rf
    )
    blended = weighted_total / (w_lr + w_tree + w_rf)

    hard_labels = (blended >= thr) * 1

    # hyperopt minimises, hence the sign flip
    return -accuracy_score(hard_labels, y_train)

In [60]:
# Optimisation driver
def param_hyperopt_weight(max_evals):
    """Run a seeded TPE search and return the best parameter dict.

    ``hyperopt_objective_weight`` and ``params_space`` are looked up when this
    function is called, so the most recently executed definitions are used.
    """
    return fmin(
        fn=hyperopt_objective_weight,
        space=params_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        rstate=np.random.default_rng(2),
    )

In [61]:
# Run 5000 search trials for the three family-level weights and the threshold
params_best = param_hyperopt_weight(5000)

100%|██████████| 5000/5000 [01:22<00:00, 60.70trial/s, best loss: -0.8345323741007195] 


In [62]:
# Display the threshold and family weights returned by the search
params_best

{'thr': 0.4462784417468528,
 'weight1': 0.020607599276297635,
 'weight2': 0.03222821458037668,
 'weight3': 0.8780164658743584}

In [63]:
def test_acc(params_best):
    """Print the test-split accuracy of the three-family weighted blend.

    ``weight1``/``weight2``/``weight3`` weight ``test_predict_proba_lr``,
    ``test_predict_proba_tree`` and ``test_predict_proba_RF`` respectively; the
    weighted average is thresholded at ``thr`` and compared with ``y_test``.
    The accuracy is printed, not returned.
    """
    thr = params_best["thr"]
    w_lr = params_best["weight1"]
    w_tree = params_best["weight2"]
    w_rf = params_best["weight3"]

    weighted_total = (
        test_predict_proba_lr * w_lr
        + test_predict_proba_tree * w_tree
        + test_predict_proba_RF * w_rf
    )
    blended = weighted_total / (w_lr + w_tree + w_rf)

    # 0/1 labels after applying the threshold
    test_labels = (blended >= thr) * 1

    print(accuracy_score(test_labels, y_test))

In [64]:
# Print the test-split accuracy for the family weights found above
test_acc(params_best)

0.7938671209540034
