In [2]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings

warnings.filterwarnings("ignore")

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# 自定义模块
from telcoFunc import *
from manual_ensemble import *

# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe, Trials
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [3]:
# Load the raw customer-churn table from the CSV file in the working directory.
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Column roles: discrete (categorical) fields, continuous fields, label, ID.
# The discrete fields are grouped by theme and concatenated in dataset order.
_demographic_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents']
_phone_cols = ['PhoneService', 'MultipleLines']
_internet_cols = [
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
_billing_cols = ['Contract', 'PaperlessBilling', 'PaymentMethod']

# Discrete fields
category_cols = _demographic_cols + _phone_cols + _internet_cols + _billing_cols

# Continuous fields
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Label column and ID column
target, ID_col = 'Churn', 'customerID'

# Sanity check: the declared column roles must account for every column of the
# frame, both by count and by name.
declared_cols = category_cols + numeric_cols + [target, ID_col]
assert len(declared_cols) == tcc.shape[1]
assert set(declared_cols) == set(tcc.columns), "column roles do not match the dataset"

# Continuous fields: blank / whitespace-only TotalCharges entries are treated as
# missing; any other non-numeric text still raises instead of being hidden.
tcc['TotalCharges'] = pd.to_numeric(
    tcc['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# Missing-value imputation
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# Encode the label as 1 (Yes) / 0 (No).
# The result is assigned back to the column rather than calling
# `tcc['Churn'].replace(..., inplace=True)`: that form mutates a temporary under
# pandas Copy-on-Write and leaves `tcc` unchanged, and the resulting warning is
# hidden by the global warnings filter. The 0/1 keys keep the cell safe to re-run.
churn_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
tcc['Churn'] = tcc['Churn'].map(churn_codes)
assert tcc['Churn'].notna().all(), "unexpected value in the Churn column"
tcc['Churn'] = tcc['Churn'].astype(int)

In [4]:
# Feature matrix: every column except the customer ID and the label.
features = tcc.drop([ID_col, target], axis=1).copy()
# Label vector, copied so later edits do not touch the source frame.
labels = tcc.loc[:, 'Churn'].copy()

In [5]:
# Split into training and test partitions (default proportions, fixed seed).
train, test = train_test_split(tcc, random_state=22)

_non_feature_cols = [ID_col, target]
X_train = train.drop(columns=_non_feature_cols).copy()
X_test = test.drop(columns=_non_feature_cols).copy()

y_train = train['Churn'].copy()
y_test = test['Churn'].copy()


def _tenure_calendar(tenure):
    """Derive year / month / quarter columns from a tenure Series.

    Tenure is counted backwards from 72, with 2014 as the base year. The
    returned frame keeps the index of ``tenure``.
    """
    months_back = 72 - tenure
    month = months_back % 12 + 1
    return pd.DataFrame(
        {
            'tenure_year': months_back // 12 + 2014,
            'tenure_month': month,
            'tenure_quarter': (month - 1) // 3 + 1,
        }
    )


X_train_seq = _tenure_calendar(X_train['tenure'])
X_test_seq = _tenure_calendar(X_test['tenure'])

# One-hot encode the derived fields; the encoder is fitted on the training part only.
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

seq_new = list(X_train_seq.columns)


def _one_hot_frame(raw, index):
    """Transform ``raw`` with the fitted encoder into a labelled dense DataFrame.

    Column names come from the project helper ``cate_colName``; ``index`` is
    attached so the rows stay aligned with the source frame.
    """
    return pd.DataFrame(
        enc.transform(raw).toarray(),
        columns=cate_colName(enc, seq_new, drop=None),
        index=index,
    )


X_train_seq = _one_hot_frame(X_train_seq, X_train.index)
X_test_seq = _one_hot_frame(X_test_seq, X_test.index)

In [6]:
# Ordinal-encode the discrete fields; the encoder is fitted on the training part only.
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])


def _ordinal_encode(frame):
    """Return ``frame`` with ordinal-encoded discrete columns followed by the numeric columns.

    The encoded block reuses ``frame``'s index so the column-wise concat lines
    up row by row.
    """
    encoded = pd.DataFrame(
        ord_enc.transform(frame[category_cols]),
        columns=category_cols,
        index=frame.index,
    )
    return pd.concat([encoded, frame[numeric_cols]], axis=1)


X_train_OE = _ordinal_encode(X_train)
X_test_OE = _ordinal_encode(X_test)

In [7]:
# 本节新增第三方库
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [8]:
class VotingClassifier_threshold(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Voting ensemble with an adjustable decision threshold for soft voting.

    Wraps ``sklearn.ensemble.VotingClassifier``. With ``voting="soft"`` the
    prediction is made by comparing the averaged probability of the second
    class (``classes_[1]``) with ``thr``; this only makes sense for binary
    problems. With ``voting="hard"`` the wrapped classifier's majority vote is
    returned unchanged and ``thr`` is ignored.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Passed through to ``VotingClassifier``.
    voting : {"hard", "soft"}, default="hard"
        Voting strategy.
    weights : array-like or None, default=None
        Per-estimator weights, passed through to ``VotingClassifier``.
    thr : float, default=0.5
        Probability threshold applied to the second class under soft voting.

    Attributes
    ----------
    clf : VotingClassifier
        The fitted wrapped ensemble (set by ``fit``).
    classes_ : ndarray
        Class labels seen during ``fit``.
    """

    def __init__(self, estimators, voting="hard", weights=None, thr=0.5):
        # Store constructor arguments untouched so get_params/clone keep working.
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        self.thr = thr

    def fit(self, X, y):
        """Fit the wrapped ``VotingClassifier`` on ``X``, ``y`` and return ``self``."""
        VC = VotingClassifier(
            estimators=self.estimators, voting=self.voting, weights=self.weights
        )

        VC.fit(X, y)
        self.clf = VC
        # Expose the label set like other scikit-learn classifiers do.
        self.classes_ = VC.classes_

        return self

    def predict_proba(self, X):
        """Return class probabilities under soft voting, ``None`` under hard voting."""
        if self.voting == "soft":
            return self.clf.predict_proba(X)
        # Hard voting has no probability output; None is kept for
        # backward compatibility with existing callers.
        return None

    def predict(self, X):
        """Predict class labels for ``X``.

        Soft voting returns ``classes_[1]`` where the averaged probability of
        that class is at least ``thr`` and ``classes_[0]`` otherwise. For 0/1
        labels this equals the previous ``(proba >= thr) * 1`` result, while
        other label encodings now get real labels back, consistent with the
        hard-voting branch.
        """
        if self.voting == "soft":
            is_positive = self.clf.predict_proba(X)[:, 1] >= self.thr
            classes = self.clf.classes_
            return np.where(is_positive, classes[1], classes[0])
        return self.clf.predict(X)

    def score(self, X, y):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``."""
        # accuracy_score takes (y_true, y_pred).
        return accuracy_score(y, self.predict(X))

In [9]:
# KFold is not imported by name in this notebook's import cell (presumably it
# leaked in through one of the star imports); import it explicitly so this cell
# does not depend on what those modules happen to re-export.
from sklearn.model_selection import KFold

# Instantiate the 5-fold splitter (shuffled, fixed seed).
kf = KFold(n_splits=5, random_state=12, shuffle=True)

# Reset the index of the training features and labels so that the positional
# indices produced by kf.split can be used with .loc below.
# (The test set is not touched here.)
X_train_OE = X_train_OE.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

train_part_index_l = []
eval_index_l = []

for train_part_index, eval_index in kf.split(X_train_OE, y_train):
    train_part_index_l.append(train_part_index)
    eval_index_l.append(eval_index)

# One (features, labels) pair per fold: the training part ...
train_set = [
    (X_train_OE.loc[fold_idx], y_train.loc[fold_idx])
    for fold_idx in train_part_index_l
]

# ... and the matching validation part.
eval_set = [
    (X_train_OE.loc[fold_idx], y_train.loc[fold_idx])
    for fold_idx in eval_index_l
]

# Keep the per-fold names used elsewhere in the notebook; they refer to the
# very same objects stored in train_set / eval_set.
(
    (X_train1, y_train1),
    (X_train2, y_train2),
    (X_train3, y_train3),
    (X_train4, y_train4),
    (X_train5, y_train5),
) = train_set

(
    (X_eval1, y_eval1),
    (X_eval2, y_eval2),
    (X_eval3, y_eval3),
    (X_eval4, y_eval4),
    (X_eval5, y_eval5),
) = eval_set

In [10]:
# NOTE(review): joblib.load unpickles the files; only load model files you trust.

# Random forest model group: one saved grid search per fold.
grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5 = [
    load(path)
    for path in (
        "./model/grid_RF_1.joblib",
        "./model/grid_RF_2.joblib",
        "./model/grid_RF_3.joblib",
        "./model/grid_RF_4.joblib",
        "./model/grid_RF_5.joblib",
    )
]

RF_l = [
    search.best_estimator_
    for search in (grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5)
]
RF_1, RF_2, RF_3, RF_4, RF_5 = RF_l

# Decision tree model group.
grid_tree_1, grid_tree_2, grid_tree_3, grid_tree_4, grid_tree_5 = [
    load(path)
    for path in (
        "./model/grid_tree_1.joblib",
        "./model/grid_tree_2.joblib",
        "./model/grid_tree_3.joblib",
        "./model/grid_tree_4.joblib",
        "./model/grid_tree_5.joblib",
    )
]

tree_l = [
    search.best_estimator_
    for search in (grid_tree_1, grid_tree_2, grid_tree_3, grid_tree_4, grid_tree_5)
]
tree_1, tree_2, tree_3, tree_4, tree_5 = tree_l

# Logistic regression model group.
grid_lr_1, grid_lr_2, grid_lr_3, grid_lr_4, grid_lr_5 = [
    load(path)
    for path in (
        "./model/grid_lr_1.joblib",
        "./model/grid_lr_2.joblib",
        "./model/grid_lr_3.joblib",
        "./model/grid_lr_4.joblib",
        "./model/grid_lr_5.joblib",
    )
]

lr_l = [
    search.best_estimator_
    for search in (grid_lr_1, grid_lr_2, grid_lr_3, grid_lr_4, grid_lr_5)
]
lr_1, lr_2, lr_3, lr_4, lr_5 = lr_l

In [11]:
def _fold_eval_proba(fold_models):
    """Score each validation fold with its own fold model.

    Returns one Series per fold holding the probability of the second class
    (column 1 of ``predict_proba``), indexed like the fold's feature frame.
    """
    eval_frames = (X_eval1, X_eval2, X_eval3, X_eval4, X_eval5)
    return [
        pd.Series(model.predict_proba(X_eval)[:, 1], index=X_eval.index)
        for model, X_eval in zip(fold_models, eval_frames)
    ]


# Random forest: per-fold validation probabilities, then one Series ordered by row index.
(
    eval1_predict_proba_RF,
    eval2_predict_proba_RF,
    eval3_predict_proba_RF,
    eval4_predict_proba_RF,
    eval5_predict_proba_RF,
) = _fold_eval_proba(RF_l)

eval_predict_proba_RF = pd.concat(
    [
        eval1_predict_proba_RF,
        eval2_predict_proba_RF,
        eval3_predict_proba_RF,
        eval4_predict_proba_RF,
        eval5_predict_proba_RF,
    ]
).sort_index()

# Decision tree.
(
    eval1_predict_proba_tree,
    eval2_predict_proba_tree,
    eval3_predict_proba_tree,
    eval4_predict_proba_tree,
    eval5_predict_proba_tree,
) = _fold_eval_proba(tree_l)

eval_predict_proba_tree = pd.concat(
    [
        eval1_predict_proba_tree,
        eval2_predict_proba_tree,
        eval3_predict_proba_tree,
        eval4_predict_proba_tree,
        eval5_predict_proba_tree,
    ]
).sort_index()

# Logistic regression.
(
    eval1_predict_proba_lr,
    eval2_predict_proba_lr,
    eval3_predict_proba_lr,
    eval4_predict_proba_lr,
    eval5_predict_proba_lr,
) = _fold_eval_proba(lr_l)

eval_predict_proba_lr = pd.concat(
    [
        eval1_predict_proba_lr,
        eval2_predict_proba_lr,
        eval3_predict_proba_lr,
        eval4_predict_proba_lr,
        eval5_predict_proba_lr,
    ]
).sort_index()

In [12]:
def _mean_test_proba(fold_models):
    """Average the second-class test-set probability over the five fold models."""
    per_fold = [fold_models[i].predict_proba(X_test_OE)[:, 1] for i in range(5)]
    return np.array(per_fold).mean(0)


# Test-set probability per model family, averaged across the fold models.
test_predict_proba_RF = _mean_test_proba(RF_l)
test_predict_proba_tree = _mean_test_proba(tree_l)
test_predict_proba_lr = _mean_test_proba(lr_l)

In [13]:
# Blending split: 80% of the training data for the first-level models and 20%
# held out for the meta-learner.
# NOTE: this rebinds X_train1 / X_train2 / y_train1 / y_train2, which held
# K-fold partitions earlier in the notebook.
blend_split = train_test_split(X_train_OE, y_train, random_state=12, test_size=0.2)
X_train1, X_train2, y_train1, y_train2 = blend_split

In [14]:
# (rows, columns) of the 80% partition the first-level blending models are fitted on.
X_train1.shape

(4225, 19)

In [16]:
# (rows, columns) of the 20% hold-out partition used to build the meta-learner's features.
X_train2.shape

(1057, 19)

In [18]:
start = time.time()

RF_blending = RandomForestClassifier(random_state=12)

# Search grid for the first-level random forest.
# NOTE(review): with min_samples_leaf >= 7, min_samples_split values of 2-3
# look non-binding, so that axis may only double the grid. Confirm before pruning.
parameter_space = {
    "min_samples_leaf": range(7, 10),
    "min_samples_split": range(2, 4),
    "max_depth": range(5, 8),
    "max_leaf_nodes": [None, *range(32, 49, 2)],
    "n_estimators": range(9, 12),
    "max_features": ["sqrt", "log2", *range(4, 8)],
    "max_samples": [None, 0.55, 0.6, 0.65],
}

grid_RF_blending = GridSearchCV(
    estimator=RF_blending, param_grid=parameter_space, n_jobs=-1
)
grid_RF_blending.fit(X_train1, y_train1)

# Wall-clock seconds spent on the whole search.
print(time.time() - start)

95.50578761100769


In [19]:
# Hyper-parameter combination selected by the random forest grid search.
grid_RF_blending.best_params_

{'max_depth': 6,
 'max_features': 6,
 'max_leaf_nodes': None,
 'max_samples': 0.6,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'n_estimators': 10}

In [20]:
# Score of the tuned random forest on: fitting part, blending hold-out, test set.
tuple(
    grid_RF_blending.score(X_part, y_part)
    for X_part, y_part in (
        (X_train1, y_train1),
        (X_train2, y_train2),
        (X_test_OE, y_test),
    )
)

(0.8191715976331361, 0.8136234626300851, 0.7830777967064169)

In [21]:
# Preprocessing: one-hot encode the discrete fields, pass the numeric fields through.
logistic_pre = ColumnTransformer(
    transformers=[
        ("cat", preprocessing.OneHotEncoder(drop="if_binary"), category_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

# Alternatives the grid search may substitute for the "num" transformer.
num_pre = [
    "passthrough",
    preprocessing.StandardScaler(),
    preprocessing.KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="kmeans"),
]

# logit_threshold comes from the project modules; presumably a logistic
# regression with a tunable decision threshold `thr` - confirm against its source.
logistic_blending = logit_threshold(max_iter=int(1e8))

logistic_pipe = make_pipeline(logistic_pre, logistic_blending)

cw_l = [None, "balanced"]


def _lr_grid(penalty, solvers):
    """Build one sub-grid for the given penalty and its compatible solvers."""
    return {
        "columntransformer__num": num_pre,
        "logit_threshold__thr": np.arange(0.1, 1, 0.1).tolist(),
        "logit_threshold__penalty": [penalty],
        "logit_threshold__C": np.arange(0.1, 1.1, 0.1).tolist(),
        "logit_threshold__solver": solvers,
        "logit_threshold__class_weight": cw_l,
    }


logistic_param = [
    _lr_grid("l1", ["saga"]),
    _lr_grid("l2", ["lbfgs", "newton-cg", "sag", "saga"]),
]

# Instantiate the grid-search estimator.
grid_lr_blending = GridSearchCV(
    logistic_pipe, logistic_param, scoring="accuracy", n_jobs=-1
)

s = time.time()
grid_lr_blending.fit(X_train1, y_train1)
print(time.time() - s, "s")

447.3558382987976 s


In [22]:
# Mean cross-validated accuracy of the best logistic-regression configuration.
grid_lr_blending.best_score_

0.8073372781065089

In [23]:
# Accuracy of the tuned logistic pipeline on: fitting part, blending hold-out, test set.
lr_score_inputs = [(X_train1, y_train1), (X_train2, y_train2), (X_test_OE, y_test)]
tuple(grid_lr_blending.score(X_part, y_part) for X_part, y_part in lr_score_inputs)

(0.8073372781065089, 0.8183538315988647, 0.7825099375354913)

In [24]:
# Search grid for the first-level decision tree.
tree_param = {
    "max_depth": list(range(2, 16)),
    "min_samples_split": list(range(2, 5)),
    "min_samples_leaf": list(range(1, 4)),
    "max_leaf_nodes": list(range(6, 30)),
}

tree_model = DecisionTreeClassifier(random_state=12)

# NOTE(review): n_jobs is pinned to 12 here while the other searches use -1;
# confirm whether the fixed worker count is intentional.
grid_tree_blending = GridSearchCV(tree_model, tree_param, n_jobs=12)
grid_tree_blending.fit(X_train1, y_train1)

In [25]:
# Mean cross-validated score of the best decision-tree configuration.
grid_tree_blending.best_score_

0.8011834319526627

In [26]:
# Score of the tuned decision tree on: fitting part, blending hold-out, test set.
(
    grid_tree_blending.score(*(X_train1, y_train1)),
    grid_tree_blending.score(*(X_train2, y_train2)),
    grid_tree_blending.score(*(X_test_OE, y_test)),
)

(0.8137278106508876, 0.7994323557237465, 0.7717206132879046)

In [27]:
# Meta-learner training features: second-class probability predicted by each
# first-level model on the hold-out partition.
# The hold-out index is carried over so the frame stays row-aligned with
# y_train2, which keeps the shuffled index from train_test_split. A fresh
# RangeIndex would silently misalign any index-based join or concat.
train_oof_blending = pd.DataFrame(
    {
        "lr_oof_blending": grid_lr_blending.predict_proba(X_train2)[:, 1],
        "RF_oof_blending": grid_RF_blending.predict_proba(X_train2)[:, 1],
        "tree_oof_blending": grid_tree_blending.predict_proba(X_train2)[:, 1],
    },
    index=X_train2.index,
)

In [28]:
# Meta-learner test features: second-class probability predicted by each
# first-level model on the test set. Column names match the training frame so
# the meta-learner sees the same feature names at fit and predict time.
# The test index is carried over so the frame stays row-aligned with y_test.
test_predict_blending = pd.DataFrame(
    {
        "lr_oof_blending": grid_lr_blending.predict_proba(X_test_OE)[:, 1],
        "RF_oof_blending": grid_RF_blending.predict_proba(X_test_OE)[:, 1],
        "tree_oof_blending": grid_tree_blending.predict_proba(X_test_OE)[:, 1],
    },
    index=X_test_OE.index,
)

In [29]:
# Estimator families used only by this comparison cell. They are imported up
# front so a missing package fails before any model is trained.
from sklearn import neighbors
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# One default-configured meta-learner per family.
# NOTE(review): none of the stochastic learners gets a random_state, so their
# scores will vary between runs.
# NOTE(review): `tree` rebinds the name of the `sklearn.tree` module imported
# at the top of the notebook; kept because a later cell reuses the name.
lr = LogisticRegression()  # logistic regression
tree = DecisionTreeClassifier()  # decision tree
KNN = neighbors.KNeighborsClassifier()  # k-nearest neighbours
SVM = svm.SVC()  # support vector machine
gnb = GaussianNB()  # Gaussian naive Bayes
bagging = BaggingClassifier()  # bagging
RFC = RandomForestClassifier()  # random forest
ABC = AdaBoostClassifier()  # AdaBoost
GBC = GradientBoostingClassifier()  # GBDT
XGB = XGBClassifier()  # XGBoost

# (label shown in the report, estimator), in the order they are fitted.
final_candidates = [
    ("LR", lr),
    ("tree", tree),
    ("KNN", KNN),
    ("SVM", SVM),
    ("GaussianNB", gnb),
    ("Bagging", bagging),
    ("RandomForest", RFC),
    ("AdaBoost", ABC),
    ("GBDT", GBC),
    ("XGB", XGB),
]

# Fit each meta-learner on the hold-out predictions, then report its accuracy
# on that same hold-out data and on the test set.
for label, model in final_candidates:
    model.fit(train_oof_blending, y_train2)
    print("The results of %s-final:" % label)
    print(
        "Train2-Accuracy: %f, Test-Accuracy: %f"
        % (
            model.score(train_oof_blending, y_train2),
            model.score(test_predict_blending, y_test),
        )
    )

The results of LR-final:
Train2-Accuracy: 0.818354, Test-Accuracy: 0.788189
The results of tree-final:
Train2-Accuracy: 1.000000, Test-Accuracy: 0.727428
The results of KNN-final:
Train2-Accuracy: 0.850520, Test-Accuracy: 0.766042
The results of SVM-final:
Train2-Accuracy: 0.826868, Test-Accuracy: 0.787053
The results of GaussianNB-final:
Train2-Accuracy: 0.807001, Test-Accuracy: 0.780239
The results of Bagging-final:
Train2-Accuracy: 0.977294, Test-Accuracy: 0.752413
The results of RandomForest-final:
Train2-Accuracy: 1.000000, Test-Accuracy: 0.764906
The results of AdaBoost-final:
Train2-Accuracy: 0.838221, Test-Accuracy: 0.779671
The results of GBDT-final:
Train2-Accuracy: 0.886471, Test-Accuracy: 0.781374
The results of XGB-final:
Train2-Accuracy: 0.965941, Test-Accuracy: 0.767746


In [30]:
# (name, fitted best estimator) pairs for the three first-level models.
estimators = [
    (name, search.best_estimator_)
    for name, search in (
        ("lr", grid_lr_blending),
        ("tree", grid_tree_blending),
        ("rf", grid_RF_blending),
    )
]

In [31]:
# train_cross is a project helper; with blending=True it presumably builds the
# meta-learner training frame and the test frame - confirm against its source.
blending_frames = train_cross(
    X_train_OE,
    y_train,
    X_test_OE,
    estimators,
    blending=True,
)
train_oof_blending, test_predict_blending = blending_frames

In [34]:
# The last column of train_oof_blending is used as the label and all other
# columns as features (presumably the layout train_cross returns - confirm).
meta_features = train_oof_blending.iloc[:, :-1]
meta_labels = train_oof_blending.iloc[:, -1]

lr = LogisticRegression()
lr.fit(meta_features, meta_labels)

print("The results of LR-final:")
train_acc = lr.score(meta_features, meta_labels)
test_acc = lr.score(test_predict_blending, y_test)
print("Train2-Accuracy: %f, Test-Accuracy: %f" % (train_acc, test_acc))

The results of LR-final:
Train2-Accuracy: 0.822138, Test-Accuracy: 0.784214


In [35]:
# Candidate meta-learners for the tuned final stage: the project's threshold
# logistic regression and a decision tree.
lr, tree = logit_threshold(), DecisionTreeClassifier()
final_model_l = [lr, tree]

In [36]:
def _tenth_steps():
    """Return a fresh list of the values 0.1, 0.2, ..., 1.0."""
    return np.arange(0.1, 1.1, 0.1).tolist()


# Logistic-regression search space: one sub-grid per penalty with its solvers.
# NOTE(review): `thr` runs up to 1.0 here, whereas the first-level grid stops
# at 0.9; confirm that the top value is intended.
lr_final_param = [
    {
        "thr": _tenth_steps(),
        "penalty": [penalty],
        "C": _tenth_steps(),
        "solver": solvers,
    }
    for penalty, solvers in (
        ("l1", ["saga"]),
        ("l2", ["lbfgs", "newton-cg", "sag", "saga"]),
    )
]

# Decision-tree search space.
tree_final_param = {
    "max_depth": list(range(2, 16)),
    "min_samples_split": list(range(2, 5)),
    "min_samples_leaf": list(range(1, 4)),
    "max_leaf_nodes": list(range(6, 30)),
}

# Same order as final_model_l: logistic regression first, decision tree second.
param_space_l = [lr_final_param, tree_final_param]

In [37]:
# Features are all but the last column of train_oof_blending; the last column
# is passed as the label.
oof_features = train_oof_blending.iloc[:, :-1]
oof_labels = train_oof_blending.iloc[:, -1]

# final_model_opt is a project helper; presumably it tunes each candidate on
# the OOF data and returns the best result plus its test prediction - confirm.
best_res_final, best_test_predict_final = final_model_opt(
    final_model_l, param_space_l, oof_features, oof_labels, test_predict_blending
)

In [38]:
# Test accuracy of the tuned final model, after cutting its output at 0.5.
# accuracy_score takes (y_true, y_pred); the value is unchanged for accuracy,
# but the documented order avoids surprises if the metric is swapped later.
accuracy_score(y_test, (best_test_predict_final >= 0.5) * 1)

0.7830777967064169

In [39]:
# First-level estimators that run their own search when fitted. The *_cascade
# classes and *_params_space objects come from the project modules
# (presumably hyperopt-based - confirm against their source).
lr_hyper = lr_cascade(lr_params_space, max_evals=50)
tree_hyper = tree_cascade(tree_params_space)
RF_hyper = RF_cascade(RF_params_space, max_evals=1000)

estimators = list(zip(("lr", "tree", "rf"), (lr_hyper, tree_hyper, RF_hyper)))

In [None]:
# Rebuild the blending frames with the self-tuning first-level estimators.
cross_options = {"estimators": estimators, "blending": True}
train_oof_blending, test_predict_blending = train_cross(
    X_train_OE, y_train, X_test_OE, **cross_options
)

In [None]:
# Re-run the final-stage tuning on the new blending frames: all but the last
# column are features, the last column is the label.
final_opt_args = (
    final_model_l,
    param_space_l,
    train_oof_blending.iloc[:, :-1],
    train_oof_blending.iloc[:, -1],
    test_predict_blending,
)
best_res_final, best_test_predict_final = final_model_opt(*final_opt_args)

In [None]:
# Test accuracy of the re-tuned final model, after cutting its output at 0.5.
# accuracy_score takes (y_true, y_pred); the value is unchanged for accuracy,
# but the documented order avoids surprises if the metric is swapped later.
accuracy_score(y_test, (best_test_predict_final >= 0.5) * 1)

极限效果测试下，一次运行约需要1个半小时。并且，最终得到的融合结果略好于单模建模结果，略差于上一小节的自动Stacking融合结果。至此，我们就完整执行了手动和自动Blending融合及优化各流程。