In [1]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings

warnings.filterwarnings("ignore")

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# 自定义模块
from telcoFunc import *

# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc

In [2]:
# 读取数据
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# 标注连续/离散字段
# 离散字段
category_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# 连续字段
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# 标签
target = 'Churn'

# ID列
ID_col = 'customerID'

# 验证是否划分能完全
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# 连续字段转化
tcc['TotalCharges'] = (
    tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# 缺失值填补
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# 标签值手动转化
tcc['Churn'].replace(to_replace='Yes', value=1, inplace=True)
tcc['Churn'].replace(to_replace='No', value=0, inplace=True)

In [3]:
features = tcc.drop(columns=[ID_col, target]).copy()
labels = tcc['Churn'].copy()

In [4]:
# 划分训练集和测试集
train, test = train_test_split(tcc, random_state=22)

X_train = train.drop(columns=[ID_col, target]).copy()
X_test = test.drop(columns=[ID_col, target]).copy()

y_train = train['Churn'].copy()
y_test = test['Churn'].copy()

X_train_seq = pd.DataFrame()
X_test_seq = pd.DataFrame()

# 年份衍生
X_train_seq['tenure_year'] = ((72 - X_train['tenure']) // 12) + 2014
X_test_seq['tenure_year'] = ((72 - X_test['tenure']) // 12) + 2014

# 月份衍生
X_train_seq['tenure_month'] = (72 - X_train['tenure']) % 12 + 1
X_test_seq['tenure_month'] = (72 - X_test['tenure']) % 12 + 1

# 季度衍生
X_train_seq['tenure_quarter'] = ((X_train_seq['tenure_month'] - 1) // 3) + 1
X_test_seq['tenure_quarter'] = ((X_test_seq['tenure_month'] - 1) // 3) + 1

# 独热编码
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

seq_new = list(X_train_seq.columns)

# 创建带有列名称的独热编码之后的df
X_train_seq = pd.DataFrame(
    enc.transform(X_train_seq).toarray(), columns=cate_colName(enc, seq_new, drop=None)
)

X_test_seq = pd.DataFrame(
    enc.transform(X_test_seq).toarray(), columns=cate_colName(enc, seq_new, drop=None)
)

# 调整index
X_train_seq.index = X_train.index
X_test_seq.index = X_test.index

In [5]:
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])

X_train_OE = pd.DataFrame(
    ord_enc.transform(X_train[category_cols]), columns=category_cols
)
X_train_OE.index = X_train.index
X_train_OE = pd.concat([X_train_OE, X_train[numeric_cols]], axis=1)

X_test_OE = pd.DataFrame(
    ord_enc.transform(X_test[category_cols]), columns=category_cols
)
X_test_OE.index = X_test.index
X_test_OE = pd.concat([X_test_OE, X_test[numeric_cols]], axis=1)

In [6]:
# 本节新增第三方库
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [7]:
class VotingClassifier_threshold(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, estimators, voting="hard", weights=None, thr=0.5):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        self.thr = thr

    def fit(self, X, y):
        VC = VotingClassifier(
            estimators=self.estimators, voting=self.voting, weights=self.weights
        )

        VC.fit(X, y)
        self.clf = VC

        return self

    def predict_proba(self, X):
        return self.clf.predict_proba(X) if self.voting == "soft" else None

    def predict(self, X):
        return (
            (self.clf.predict_proba(X)[:, 1] >= self.thr) * 1
            if self.voting == "soft"
            else self.clf.predict(X)
        )

    def score(self, X, y):
        return accuracy_score(self.predict(X), y)

In [8]:
# 实例化KFold评估器
kf = KFold(n_splits=5, random_state=12, shuffle=True)

# 重置训练集和测试集的index
X_train_OE = X_train_OE.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

train_part_index_l = []
eval_index_l = []

for train_part_index, eval_index in kf.split(X_train_OE, y_train):
    train_part_index_l.append(train_part_index)
    eval_index_l.append(eval_index)

# 训练集特征
X_train1 = X_train_OE.loc[train_part_index_l[0]]
X_train2 = X_train_OE.loc[train_part_index_l[1]]
X_train3 = X_train_OE.loc[train_part_index_l[2]]
X_train4 = X_train_OE.loc[train_part_index_l[3]]
X_train5 = X_train_OE.loc[train_part_index_l[4]]

# 验证集特征
X_eval1 = X_train_OE.loc[eval_index_l[0]]
X_eval2 = X_train_OE.loc[eval_index_l[1]]
X_eval3 = X_train_OE.loc[eval_index_l[2]]
X_eval4 = X_train_OE.loc[eval_index_l[3]]
X_eval5 = X_train_OE.loc[eval_index_l[4]]

# 训练集标签
y_train1 = y_train.loc[train_part_index_l[0]]
y_train2 = y_train.loc[train_part_index_l[1]]
y_train3 = y_train.loc[train_part_index_l[2]]
y_train4 = y_train.loc[train_part_index_l[3]]
y_train5 = y_train.loc[train_part_index_l[4]]

# 验证集标签
y_eval1 = y_train.loc[eval_index_l[0]]
y_eval2 = y_train.loc[eval_index_l[1]]
y_eval3 = y_train.loc[eval_index_l[2]]
y_eval4 = y_train.loc[eval_index_l[3]]
y_eval5 = y_train.loc[eval_index_l[4]]

train_set = [
    (X_train1, y_train1),
    (X_train2, y_train2),
    (X_train3, y_train3),
    (X_train4, y_train4),
    (X_train5, y_train5),
]

eval_set = [
    (X_eval1, y_eval1),
    (X_eval2, y_eval2),
    (X_eval3, y_eval3),
    (X_eval4, y_eval4),
    (X_eval5, y_eval5),
]

In [9]:
# 随机森林模型组
grid_RF_1 = load("./model/grid_RF_1.joblib")
grid_RF_2 = load("./model/grid_RF_2.joblib")
grid_RF_3 = load("./model/grid_RF_3.joblib")
grid_RF_4 = load("./model/grid_RF_4.joblib")
grid_RF_5 = load("./model/grid_RF_5.joblib")

RF_1 = grid_RF_1.best_estimator_
RF_2 = grid_RF_2.best_estimator_
RF_3 = grid_RF_3.best_estimator_
RF_4 = grid_RF_4.best_estimator_
RF_5 = grid_RF_5.best_estimator_

RF_l = [RF_1, RF_2, RF_3, RF_4, RF_5]

# 决策树模型组
grid_tree_1 = load("./model/grid_tree_1.joblib")
grid_tree_2 = load("./model/grid_tree_2.joblib")
grid_tree_3 = load("./model/grid_tree_3.joblib")
grid_tree_4 = load("./model/grid_tree_4.joblib")
grid_tree_5 = load("./model/grid_tree_5.joblib")

tree_1 = grid_tree_1.best_estimator_
tree_2 = grid_tree_2.best_estimator_
tree_3 = grid_tree_3.best_estimator_
tree_4 = grid_tree_4.best_estimator_
tree_5 = grid_tree_5.best_estimator_

tree_l = [tree_1, tree_2, tree_3, tree_4, tree_5]

# 逻辑回归模型组
grid_lr_1 = load("./model/grid_lr_1.joblib")
grid_lr_2 = load("./model/grid_lr_2.joblib")
grid_lr_3 = load("./model/grid_lr_3.joblib")
grid_lr_4 = load("./model/grid_lr_4.joblib")
grid_lr_5 = load("./model/grid_lr_5.joblib")

lr_1 = grid_lr_1.best_estimator_
lr_2 = grid_lr_2.best_estimator_
lr_3 = grid_lr_3.best_estimator_
lr_4 = grid_lr_4.best_estimator_
lr_5 = grid_lr_5.best_estimator_

lr_l = [lr_1, lr_2, lr_3, lr_4, lr_5]

In [10]:
eval1_predict_proba_RF = pd.Series(
    RF_l[0].predict_proba(X_eval1)[:, 1], index=X_eval1.index
)
eval2_predict_proba_RF = pd.Series(
    RF_l[1].predict_proba(X_eval2)[:, 1], index=X_eval2.index
)
eval3_predict_proba_RF = pd.Series(
    RF_l[2].predict_proba(X_eval3)[:, 1], index=X_eval3.index
)
eval4_predict_proba_RF = pd.Series(
    RF_l[3].predict_proba(X_eval4)[:, 1], index=X_eval4.index
)
eval5_predict_proba_RF = pd.Series(
    RF_l[4].predict_proba(X_eval5)[:, 1], index=X_eval5.index
)

eval_predict_proba_RF = pd.concat(
    [
        eval1_predict_proba_RF,
        eval2_predict_proba_RF,
        eval3_predict_proba_RF,
        eval4_predict_proba_RF,
        eval5_predict_proba_RF,
    ]
).sort_index()

eval1_predict_proba_tree = pd.Series(
    tree_l[0].predict_proba(X_eval1)[:, 1], index=X_eval1.index
)
eval2_predict_proba_tree = pd.Series(
    tree_l[1].predict_proba(X_eval2)[:, 1], index=X_eval2.index
)
eval3_predict_proba_tree = pd.Series(
    tree_l[2].predict_proba(X_eval3)[:, 1], index=X_eval3.index
)
eval4_predict_proba_tree = pd.Series(
    tree_l[3].predict_proba(X_eval4)[:, 1], index=X_eval4.index
)
eval5_predict_proba_tree = pd.Series(
    tree_l[4].predict_proba(X_eval5)[:, 1], index=X_eval5.index
)

eval_predict_proba_tree = pd.concat(
    [
        eval1_predict_proba_tree,
        eval2_predict_proba_tree,
        eval3_predict_proba_tree,
        eval4_predict_proba_tree,
        eval5_predict_proba_tree,
    ]
).sort_index()

eval1_predict_proba_lr = pd.Series(
    lr_l[0].predict_proba(X_eval1)[:, 1], index=X_eval1.index
)
eval2_predict_proba_lr = pd.Series(
    lr_l[1].predict_proba(X_eval2)[:, 1], index=X_eval2.index
)
eval3_predict_proba_lr = pd.Series(
    lr_l[2].predict_proba(X_eval3)[:, 1], index=X_eval3.index
)
eval4_predict_proba_lr = pd.Series(
    lr_l[3].predict_proba(X_eval4)[:, 1], index=X_eval4.index
)
eval5_predict_proba_lr = pd.Series(
    lr_l[4].predict_proba(X_eval5)[:, 1], index=X_eval5.index
)

eval_predict_proba_lr = pd.concat(
    [
        eval1_predict_proba_lr,
        eval2_predict_proba_lr,
        eval3_predict_proba_lr,
        eval4_predict_proba_lr,
        eval5_predict_proba_lr,
    ]
).sort_index()

In [11]:
test_predict_proba_RF = [
    RF_l[i].predict_proba(X_test_OE)[:, 1] for i in range(5)
]
test_predict_proba_RF = np.array(test_predict_proba_RF)
test_predict_proba_RF = test_predict_proba_RF.mean(0)

test_predict_proba_tree = [
    tree_l[i].predict_proba(X_test_OE)[:, 1] for i in range(5)
]
test_predict_proba_tree = np.array(test_predict_proba_tree)
test_predict_proba_tree = test_predict_proba_tree.mean(0)

test_predict_proba_lr = [
    lr_l[i].predict_proba(X_test_OE)[:, 1] for i in range(5)
]
test_predict_proba_lr = np.array(test_predict_proba_lr)
test_predict_proba_lr = test_predict_proba_lr.mean(0)

In [12]:
logistic = LogisticRegression()
tree = DecisionTreeClassifier()
RF = RandomForestClassifier()

estimators = [("lr", logistic), ("tree", tree), ("rf", RF)]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

clf.fit(X_train_OE, y_train)

clf.score(X_train_OE, y_train), clf.score(X_test_OE, y_test)

(0.8761832639151836, 0.7864849517319704)

In [13]:
logistic_search = load("./model/grid_lr_1.joblib")
tree_model = load("./model/tree_model.joblib")
RF_0 = load("./model/RF_0.joblib")

estimators = [
    ("lr", logistic_search.best_estimator_),
    ("tree", tree_model),
    ("rf", RF_0),
]

clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

clf.fit(X_train_OE, y_train)

clf.score(X_train_OE, y_train), clf.score(X_test_OE, y_test)

(0.8256342294585385, 0.7898921067575241)

In [14]:
train_stack_oof = pd.DataFrame(
    {
        "lr_oof": eval_predict_proba_lr,
        "tree_oof": eval_predict_proba_tree,
        "RF_oof": eval_predict_proba_RF,
    }
)
train_stack_oof

Unnamed: 0,lr_oof,tree_oof,RF_oof
0,0.011289,0.037669,0.044787
1,0.543190,0.787986,0.572187
2,0.151200,0.222819,0.161815
3,0.273393,0.259434,0.250871
4,0.158399,0.107345,0.122533
...,...,...,...
5277,0.062756,0.062959,0.082653
5278,0.346367,0.222819,0.346562
5279,0.688556,0.438538,0.551481
5280,0.050627,0.066419,0.049011


In [15]:
test_stack = pd.DataFrame(
    {
        "lr_test": test_predict_proba_lr,
        "tree_test": test_predict_proba_tree,
        "RF_test": test_predict_proba_RF,
    }
)

test_stack

Unnamed: 0,lr_test,tree_test,RF_test
0,0.039438,0.046473,0.029220
1,0.238789,0.158900,0.311980
2,0.005101,0.046473,0.016244
3,0.031015,0.046473,0.025769
4,0.059170,0.051573,0.035476
...,...,...,...
1756,0.177800,0.212513,0.193367
1757,0.034432,0.046473,0.048821
1758,0.130264,0.158900,0.145346
1759,0.500205,0.437401,0.530686


In [16]:
lr_final = LogisticRegression().fit(train_stack_oof, y_train)

In [17]:
lr_final.score(train_stack_oof, y_train), lr_final.score(test_stack, y_test)

(0.8176826959485044, 0.7950028392958546)

In [18]:
def train_cross(X_train, y_train, X_test, estimators, n_splits=5, random_state=12):
    X = X_train.reset_index(drop=True)
    y = y_train.reset_index(drop=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    m = X.shape[0]
    n = len(estimators)
    m_test = X_test.shape[0]

    columns = [f'{estimator[0]}_oof' for estimator in estimators]
    train_oof = pd.DataFrame(np.zeros((m, n)), columns=columns)

    columns = [f"{estimator[0]}_predict" for estimator in estimators]
    test_predict = pd.DataFrame(np.zeros((m_test, n)), columns=columns)

    for estimator in estimators:
        model = estimator[1]
        oof_colName = f'{estimator[0]}_oof'
        predict_colName = f"{estimator[0]}_predict"
        for train_part_index, eval_index in kf.split(X, y):
            X_train_part = X.loc[train_part_index]
            y_train_part = y.loc[train_part_index]
            model.fit(X_train_part, y_train_part)

            X_eval_part = X.loc[eval_index]

            train_oof[oof_colName].loc[eval_index] = model.predict_proba(X_eval_part)[
                :, 1
            ]
            test_predict[predict_colName] += (
                model.predict_proba(X_test)[:, 1] / n_splits
            )
    return train_oof, test_predict

In [19]:
estimators = [
    ("lr", logistic_search.best_estimator_),
    ("tree", tree_model),
    ("rf", RF_0),
]

In [20]:
train_oof, test_predict = train_cross(
    X_train_OE,
    y_train,
    X_test_OE,
    estimators=estimators
)

In [21]:
train_oof

Unnamed: 0,lr_oof,tree_oof,rf_oof
0,0.011289,0.065380,0.064009
1,0.538713,0.787986,0.623811
2,0.149283,0.222819,0.186379
3,0.273393,0.234201,0.243126
4,0.158399,0.211248,0.158017
...,...,...,...
5277,0.070529,0.062959,0.096975
5278,0.349486,0.222819,0.257227
5279,0.685251,0.503722,0.478084
5280,0.051630,0.066419,0.058595


In [22]:
test_predict

Unnamed: 0,lr_predict,tree_predict,rf_predict
0,0.040030,0.052015,0.023194
1,0.242061,0.091196,0.307647
2,0.004933,0.052015,0.006359
3,0.031792,0.052015,0.012658
4,0.061342,0.069881,0.049331
...,...,...,...
1756,0.173789,0.223305,0.192447
1757,0.035180,0.052015,0.066831
1758,0.136799,0.091196,0.160348
1759,0.500012,0.484657,0.591940


In [23]:
clf = LogisticRegression().fit(train_oof, y_train)
clf.score(train_oof, y_train), clf.score(test_predict, y_test)

(0.8120030291556228, 0.7881885292447472)

In [24]:
old_colname = test_predict.columns
test_predict.columns = train_oof.columns
test_predict

Unnamed: 0,lr_oof,tree_oof,rf_oof
0,0.040030,0.052015,0.023194
1,0.242061,0.091196,0.307647
2,0.004933,0.052015,0.006359
3,0.031792,0.052015,0.012658
4,0.061342,0.069881,0.049331
...,...,...,...
1756,0.173789,0.223305,0.192447
1757,0.035180,0.052015,0.066831
1758,0.136799,0.091196,0.160348
1759,0.500012,0.484657,0.591940


In [25]:
# 逻辑回归
lr = LogisticRegression().fit(train_oof, y_train)
print("The results of LR-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (lr.score(train_oof, y_train), lr.score(test_predict, y_test))
)

# 决策树
tree = DecisionTreeClassifier().fit(train_oof, y_train)
print("The results of tree-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (tree.score(train_oof, y_train), tree.score(test_predict, y_test))
)

# KNN最近邻分类器
from sklearn import neighbors

KNN = neighbors.KNeighborsClassifier().fit(train_oof, y_train)
print("The results of KNN-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (KNN.score(train_oof, y_train), KNN.score(test_predict, y_test))
)

# SVM支持向量机
from sklearn import svm

SVM = svm.SVC().fit(train_oof, y_train)
print("The results of SVM-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (SVM.score(train_oof, y_train), SVM.score(test_predict, y_test))
)

# 朴素贝叶斯/高斯贝叶斯
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB().fit(train_oof, y_train)
print("The results of GaussianNB-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (gnb.score(train_oof, y_train), gnb.score(test_predict, y_test))
)

# Bagging
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier().fit(train_oof, y_train)
print("The results of Bagging-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (bagging.score(train_oof, y_train), bagging.score(test_predict, y_test))
)

# 随机森林
RFC = RandomForestClassifier().fit(train_oof, y_train)
print("The results of RandomForest-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (RFC.score(train_oof, y_train), RFC.score(test_predict, y_test))
)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

ABC = AdaBoostClassifier().fit(train_oof, y_train)
print("The results of AdaBoost-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (ABC.score(train_oof, y_train), ABC.score(test_predict, y_test))
)

# GBDT
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier().fit(train_oof, y_train)
print("The results of GBDT-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (GBC.score(train_oof, y_train), GBC.score(test_predict, y_test))
)

# XGB
from xgboost import XGBClassifier

XGB = XGBClassifier().fit(train_oof, y_train)
print("The results of XGB-final:")
print(
    "Train-Accuracy: %f, Test-Accuracy: %f"
    % (XGB.score(train_oof, y_train), XGB.score(test_predict, y_test))
)

The results of LR-final:
Train-Accuracy: 0.812003, Test-Accuracy: 0.788189
The results of tree-final:
Train-Accuracy: 0.999432, Test-Accuracy: 0.723453
The results of KNN-final:
Train-Accuracy: 0.854601, Test-Accuracy: 0.762635
The results of SVM-final:
Train-Accuracy: 0.809353, Test-Accuracy: 0.789892
The results of GaussianNB-final:
Train-Accuracy: 0.798372, Test-Accuracy: 0.781942
The results of Bagging-final:
Train-Accuracy: 0.981257, Test-Accuracy: 0.749574
The results of RandomForest-final:
Train-Accuracy: 0.999432, Test-Accuracy: 0.766610
The results of AdaBoost-final:
Train-Accuracy: 0.811814, Test-Accuracy: 0.789892
The results of GBDT-final:
Train-Accuracy: 0.824877, Test-Accuracy: 0.785349
The results of XGB-final:
Train-Accuracy: 0.913858, Test-Accuracy: 0.768881


In [26]:
test_predict.columns = old_colname
test_predict

Unnamed: 0,lr_predict,tree_predict,rf_predict
0,0.040030,0.052015,0.023194
1,0.242061,0.091196,0.307647
2,0.004933,0.052015,0.006359
3,0.031792,0.052015,0.012658
4,0.061342,0.069881,0.049331
...,...,...,...
1756,0.173789,0.223305,0.192447
1757,0.035180,0.052015,0.066831
1758,0.136799,0.091196,0.160348
1759,0.500012,0.484657,0.591940


In [27]:
KNN = neighbors.KNeighborsClassifier(n_neighbors=9).fit(train_oof, y_train)
KNN.score(train_oof, y_train), KNN.score(test_predict, y_test)

(0.8320711851571374, 0.7745599091425327)

In [29]:
logistic_param = [
    {
        "thr": np.arange(0.1, 1, 0.1).tolist(),
        "penalty": ["l1"],
        "C": np.arange(0.1, 1.1, 0.1).tolist(),
        "solver": ['saga'],
    },
    {
        "thr": np.arange(0.1, 1, 0.1).tolist(),
        "penalty": ["l2"],
        "C": np.arange(0.1, 1.1, 0.1).tolist(),
        "solver": ["lbfgs", "newton-cg", "sag", "saga"],
    },
]

In [30]:
logistic_final = logit_threshold(max_iter=int(1e6))

lfg = GridSearchCV(
    estimator=logistic_final,
    param_grid=logistic_param,
    scoring="accuracy",
    n_jobs=-1,
).fit(train_oof, y_train)

In [31]:
lfg.best_score_

0.8123812032338522

In [32]:
lfg.best_params_

{'C': 0.5, 'penalty': 'l1', 'solver': 'saga', 'thr': 0.5}

In [33]:
lfg.score(train_oof, y_train), lfg.score(test_predict, y_test)

(0.8129496402877698, 0.7881885292447472)

In [35]:
tree_final = DecisionTreeClassifier()

tree_param = {
    "max_depth": np.arange(2, 16, 1).tolist(),
    "min_samples_split": np.arange(2, 5, 1).tolist(),
    "min_samples_leaf": np.arange(1, 4, 1).tolist(),
    "max_leaf_nodes": np.arange(6, 30, 1).tolist(),
}

tfg = GridSearchCV(estimator=tree_final, param_grid=tree_param, n_jobs=-1)
tfg.fit(train_oof, y_train)

In [37]:
tfg.best_params_

{'max_depth': 5,
 'max_leaf_nodes': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [38]:
tfg.best_score_

0.8023461913362576

In [39]:
tfg.score(train_oof, y_train), tfg.score(test_predict, y_test)

(0.8129496402877698, 0.7853492333901193)

In [40]:
start = time.time()

# 设置超参数空间
parameter_space = {
    "n_estimators": range(10, 21),
    "max_samples": np.arange(0.1, 1.1, 0.1).tolist(),
}

# 实例化模型与评估器
bagging_final = BaggingClassifier(DecisionTreeClassifier())
BG = GridSearchCV(bagging_final, parameter_space, n_jobs=-1)

# 模型训练
BG.fit(train_oof, y_train)

print(time.time() - start)

2.403291940689087


In [41]:
BG.best_params_

{'max_samples': 0.1, 'n_estimators': 16}

In [43]:
BG.best_score_

0.8031032295519049

In [44]:
BG.score(train_oof, y_train), BG.score(test_predict, y_test)

(0.8282847406285498, 0.7739920499716071)

In [45]:
start = time.time()

# 设置超参数空间
parameter_space = {
    "n_estimators": range(10, 21),
    "max_samples": np.arange(0.1, 1.1, 0.1).tolist(),
}

# 实例化模型与评估器
bagging_final = BaggingClassifier(
    DecisionTreeClassifier(
        max_depth=3, max_leaf_nodes=7, min_samples_leaf=1, min_samples_split=2
    )
)
BG = GridSearchCV(bagging_final, parameter_space, n_jobs=-1)

# 模型训练
BG.fit(train_oof, y_train)

print(time.time() - start)

3.265425443649292


In [46]:
BG.best_params_

{'max_samples': 1.0, 'n_estimators': 15}

In [47]:
BG.best_score_

0.8114353086207391

In [48]:
BG.score(train_oof, y_train), BG.score(test_predict, y_test)

(0.8127603180613404, 0.7876206700738216)

In [49]:
start = time.time()

parameter_space = {
    "n_estimators": range(10, 21),
    "max_samples": np.arange(0.1, 1.1, 0.1).tolist(),
    "max_features": np.arange(0.1, 1.1, 0.1).tolist(),
}

bagging_final = BaggingClassifier(LogisticRegression())
BG = GridSearchCV(bagging_final, parameter_space, n_jobs=-1)

BG.fit(train_oof, y_train)

print(time.time() - start)

21.38026213645935


In [50]:
BG.best_params_

{'max_features': 1.0, 'max_samples': 0.8, 'n_estimators': 17}

In [51]:
BG.best_score_

0.8131385998107852

In [52]:
BG.score(train_oof, y_train), BG.score(test_predict, y_test)

(0.8118137069291935, 0.7876206700738216)

In [53]:
start = time.time()

# 设置超参数空间
parameter_space = {
    "n_estimators": range(10, 21),
    "max_samples": np.arange(0.1, 1.1, 0.1).tolist(),
    "max_features": np.arange(0.1, 1.1, 0.1).tolist(),
}

# 实例化模型与评估器
bagging_final = BaggingClassifier(LogisticRegression(penalty="l1", solver="saga"))
BG = GridSearchCV(bagging_final, parameter_space, n_jobs=15)

# 模型训练
BG.fit(train_oof, y_train)

print(time.time() - start)

35.373154640197754


In [54]:
BG.best_params_

{'max_features': 0.8, 'max_samples': 0.9, 'n_estimators': 17}

In [55]:
BG.best_score_

0.8137064232676815

In [56]:
BG.score(train_oof, y_train), BG.score(test_predict, y_test)

(0.8116243847027641, 0.7859170925610448)

In [57]:
start = time.time()

# 设置超参数空间
parameter_space = {
    "n_estimators": range(10, 101),
    "learning_rate": np.arange(0.01, 0.55, 0.05).tolist(),
    "algorithm": ["SAMME.R", "SAMME"],
}

# 实例化模型与评估器
AB_final = AdaBoostClassifier()
abg = GridSearchCV(AB_final, parameter_space, n_jobs=15)

# 模型训练
abg.fit(train_oof, y_train)

print(time.time() - start)

83.78769421577454


In [58]:
abg.best_params_

{'algorithm': 'SAMME',
 'learning_rate': 0.060000000000000005,
 'n_estimators': 44}

In [59]:
abg.score(train_oof, y_train), abg.score(test_predict, y_test)

(0.8114350624763347, 0.7836456558773425)

In [60]:
start = time.time()

# 设置超参数空间
parameter_space = {
    "n_estimators": range(10, 51),
    "learning_rate": np.arange(0.01, 0.51, 0.05).tolist(),
    "algorithm": ["SAMME.R", "SAMME"],
}

# 实例化模型与评估器
AB_final = AdaBoostClassifier(LogisticRegression())
abg = GridSearchCV(AB_final, parameter_space, n_jobs=-1)

# 模型训练
abg.fit(train_oof, y_train)

print(time.time() - start)

27.763091802597046


In [61]:
abg.best_params_

{'algorithm': 'SAMME', 'learning_rate': 0.26, 'n_estimators': 29}

In [62]:
start = time.time()

# 设置超参数空间
parameter_space = {
    "n_estimators": range(1, 31),
    "learning_rate": np.arange(0.01, 0.51, 0.05).tolist(),
    "algorithm": ["SAMME.R", "SAMME"],
}

# 实例化模型与评估器
AB_final = AdaBoostClassifier(LogisticRegression(penalty="l1", solver="saga"))
abg = GridSearchCV(AB_final, parameter_space, n_jobs=-1)

# 模型训练
abg.fit(train_oof, y_train)

print(time.time() - start)

8.9546480178833


In [63]:
abg.best_params_

{'algorithm': 'SAMME.R', 'learning_rate': 0.01, 'n_estimators': 1}

In [64]:
abg.score(train_oof, y_train), abg.score(test_predict, y_test)

(0.7385460053010223, 0.7228847245883021)

In [66]:
start = time.time()

# 设置超参数空间
parameter_space = {
    "min_samples_leaf": range(2, 6),
    "min_samples_split": range(2, 6),
    "max_depth": range(5, 8),
    "max_leaf_nodes": [None] + list(range(20, 25)),
    "n_estimators": range(6, 11),
    "max_samples": [None, 0.54, 0.55, 0.56],
}

# 实例化模型与评估器
RF_final = RandomForestClassifier(random_state=12)
rfg = GridSearchCV(RF_final, parameter_space, n_jobs=-1)

# 模型训练
rfg.fit(train_oof, y_train)

print(time.time() - start)

32.07269859313965


In [67]:
rfg.best_params_

{'max_depth': 5,
 'max_leaf_nodes': None,
 'max_samples': 0.56,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 10}

In [68]:
rfg.score(train_oof, y_train), rfg.score(test_predict, y_test)

(0.8148428625520636, 0.7887563884156729)

In [69]:
from sklearn.model_selection import RepeatedKFold

In [70]:
res = np.zeros(test_predict.shape[0])

folds = RepeatedKFold(n_splits=5, n_repeats=2)

for trn_idx, val_idx in folds.split(train_oof, y_train):
    lr = LogisticRegression(penalty="l1", solver="saga")
    lr.fit(train_oof.loc[trn_idx], y_train.loc[trn_idx])
    res += lr.predict_proba(test_predict)[:, 1] / 10

In [71]:
accuracy_score((res >= 0.5) * 1, y_test)

0.7887563884156729

In [72]:
res = np.zeros(test_predict.shape[0])

folds = RepeatedKFold(n_splits=5, n_repeats=2)

for trn_idx, val_idx in folds.split(train_oof, y_train):
    tree = DecisionTreeClassifier(
        max_depth=3, max_leaf_nodes=7, min_samples_leaf=1, min_samples_split=2
    )
    tree.fit(train_oof.loc[trn_idx], y_train.loc[trn_idx])
    res += tree.predict_proba(test_predict)[:, 1] / 10

In [73]:
accuracy_score((res >= 0.5) * 1, y_test)

0.7864849517319704

In [74]:
res = np.zeros(test_predict.shape[0])

folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=12)

for trn_idx, val_idx in folds.split(train_oof, y_train):
    lfg = GridSearchCV(
        estimator=logit_threshold(max_iter=int(1e6)),
        param_grid=logistic_param,
        scoring="accuracy",
        n_jobs=-1,
    )
    lfg.fit(train_oof.loc[trn_idx], y_train.loc[trn_idx])
    res += lfg.predict_proba(test_predict)[:, 1] / 10

accuracy_score((res >= 0.5) * 1, y_test)

0.7876206700738216

In [75]:
res = np.zeros(test_predict.shape[0])

folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=12)

for trn_idx, val_idx in folds.split(train_oof, y_train):
    tfg = GridSearchCV(
        estimator=DecisionTreeClassifier(), param_grid=tree_param, n_jobs=-1
    )
    tfg.fit(train_oof.loc[trn_idx], y_train.loc[trn_idx])
    res += tfg.predict_proba(test_predict)[:, 1] / 10

print(accuracy_score((res >= 0.5) * 1, y_test))

0.7864849517319704


In [76]:
# 设置超参数空间
logistic_param = [
    {
        "thr": np.arange(0.1, 1, 0.1).tolist(),
        "penalty": ["l1"],
        "C": np.arange(0.1, 1.1, 0.1).tolist(),
        "solver": ["saga"],
    },
    {
        "thr": np.arange(0.1, 1, 0.1).tolist(),
        "penalty": ["l2"],
        "C": np.arange(0.1, 1.1, 0.1).tolist(),
        "solver": ["lbfgs", "newton-cg", "sag", "saga"],
    },
]

# 实例化相关评估器
logistic_final = logit_threshold(max_iter=int(1e6))

# 执行网格搜索
lfg = GridSearchCV(
    estimator=logistic_final, param_grid=logistic_param, scoring="accuracy", n_jobs=15
).fit(train_stack_oof, y_train)

lfg.score(train_stack_oof, y_train), lfg.score(test_stack, y_test)

(0.8186293070806513, 0.7967064168086314)

In [77]:
lfg.best_params_

{'C': 0.30000000000000004, 'penalty': 'l1', 'solver': 'saga', 'thr': 0.5}

In [79]:
# 实例化决策树评估器
tree_final = DecisionTreeClassifier()

tree_param = {
    "max_depth": np.arange(2, 16, 1).tolist(),
    "min_samples_split": np.arange(2, 5, 1).tolist(),
    "min_samples_leaf": np.arange(1, 4, 1).tolist(),
    "max_leaf_nodes": np.arange(6, 30, 1).tolist(),
}

# 实例化网格搜索评估器
tfg = GridSearchCV(estimator=tree_final, param_grid=tree_param, n_jobs=-1).fit(
    train_stack_oof, y_train
)

tfg.score(train_stack_oof, y_train), tfg.score(test_stack, y_test)

(0.8188186293070806, 0.7904599659284497)

In [80]:
tfg.best_params_

{'max_depth': 4,
 'max_leaf_nodes': 8,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [81]:
# 逻辑回归交叉训练
res = np.zeros(test_predict.shape[0])

folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=12)

for trn_idx, val_idx in folds.split(train_stack_oof, y_train):
    lfg = GridSearchCV(
        estimator=logit_threshold(max_iter=int(1e6)),
        param_grid=logistic_param,
        scoring="accuracy",
        n_jobs=15,
    )
    lfg.fit(train_stack_oof.loc[trn_idx], y_train.loc[trn_idx])
    res += lfg.predict_proba(test_stack)[:, 1] / 10

print(accuracy_score((res >= 0.5) * 1, y_test))

0.797274275979557


In [82]:
# 决策树交叉训练过程
res = np.zeros(test_predict.shape[0])

folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=12)

for trn_idx, val_idx in folds.split(train_stack_oof, y_train):
    tfg = GridSearchCV(
        estimator=DecisionTreeClassifier(), param_grid=tree_param, n_jobs=12
    )
    tfg.fit(train_stack_oof.loc[trn_idx], y_train.loc[trn_idx])
    res += tfg.predict_proba(test_stack)[:, 1] / 10

print(accuracy_score((res >= 0.5) * 1, y_test))

0.7904599659284497


In [83]:
# 设置超参数空间
parameter_space = {
    "n_estimators": range(10, 21),
    "max_samples": np.arange(0.1, 1.1, 0.1).tolist(),
    "max_features": np.arange(0.1, 1.1, 0.1).tolist(),
}

# 实例化模型与评估器
bagging_final = BaggingClassifier(
    LogisticRegression(C=0.3, penalty="l1", solver="saga")
)
BG = GridSearchCV(bagging_final, parameter_space, n_jobs=15).fit(
    train_stack_oof, y_train
)

BG.score(train_stack_oof, y_train), BG.score(test_stack, y_test)

(0.8178720181749337, 0.7967064168086314)

In [90]:
# 设置超参数空间
parameter_space = {
    "n_estimators": range(10, 21),
    "max_samples": np.arange(0.1, 1.1, 0.1).tolist(),
}

# 实例化模型与评估器
bagging_final = BaggingClassifier(
    DecisionTreeClassifier(
        max_depth=4, max_leaf_nodes=8, min_samples_leaf=1, min_samples_split=2
    )
)
BG = GridSearchCV(bagging_final, parameter_space, n_jobs=-1).fit(
    train_stack_oof, y_train
)

BG.score(train_stack_oof, y_train), BG.score(test_stack, y_test)

(0.8199545626656569, 0.8006814310051107)