In [1]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings

warnings.filterwarnings('ignore')

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# 自定义模块
from telcoFunc import *

# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc

In [2]:
# Load the raw Telco customer-churn table
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Column roles
# Discrete (categorical) fields
category_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Continuous fields
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Label column
target = 'Churn'

# ID column
ID_col = 'customerID'

# Sanity check: the listed columns plus the ID and label columns (the "+ 2")
# must account for every column in the table
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# Continuous-field conversion: a single-space string in TotalCharges is
# treated as missing before casting the column to float
tcc['TotalCharges'] = (
    tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# Missing-value imputation: fill missing TotalCharges with 0
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# Encode the label as Yes -> 1, No -> 0.
# The result is assigned back to the frame instead of calling
# `tcc['Churn'].replace(..., inplace=True)`: the in-place call on a selected
# column is a chained assignment, which does not update `tcc` under pandas
# Copy-on-Write, and the accompanying warning is hidden by
# warnings.filterwarnings('ignore') above. The explicit cast keeps the label
# an integer column and fails loudly if a value other than Yes/No appears.
tcc['Churn'] = tcc['Churn'].map({'Yes': 1, 'No': 0}).astype('int64')

In [3]:
# Full-table feature matrix (ID and label columns removed) and label vector,
# each taken as an independent copy of the data in `tcc`
labels = tcc['Churn'].copy()
features = tcc.drop([ID_col, target], axis=1).copy()

In [4]:
# Split the full table into training and test parts (fixed seed)
train, test = train_test_split(tcc, random_state=22)

# Feature matrices: every column except the ID and the label
X_train, X_test = (
    part.drop(columns=[ID_col, target]).copy() for part in (train, test)
)

# Label vectors
y_train, y_test = (part['Churn'].copy() for part in (train, test))


def _tenure_to_calendar(tenure):
    """Derive year / month / quarter columns from a tenure Series.

    The result keeps the index of `tenure`.
    NOTE(review): 72 and 2014 look like the maximum tenure in months and the
    earliest start year -- confirm against the data description.
    """
    months_back = 72 - tenure
    month = months_back % 12 + 1
    return pd.DataFrame(
        {
            # Year derivation
            'tenure_year': months_back // 12 + 2014,
            # Month derivation (1..12)
            'tenure_month': month,
            # Quarter derivation (1..4), computed from the month
            'tenure_quarter': (month - 1) // 3 + 1,
        }
    )


X_train_seq = _tenure_to_calendar(X_train['tenure'])
X_test_seq = _tenure_to_calendar(X_test['tenure'])

# One-hot encoding, fitted on the training part only
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

seq_new = list(X_train_seq.columns)


def _one_hot_frame(raw_seq, index):
    """One-hot encode `raw_seq` with `enc` into a named, re-indexed DataFrame."""
    return pd.DataFrame(
        enc.transform(raw_seq).toarray(),
        columns=cate_colName(enc, seq_new, drop=None),
        index=index,
    )


# Encoded frames carry the row index of the matching feature matrix
X_train_seq = _one_hot_frame(X_train_seq, X_train.index)
X_test_seq = _one_hot_frame(X_test_seq, X_test.index)

In [5]:
# Ordinal-encode the categorical columns; the encoder is fitted on the
# training part only and then applied to both parts
ord_enc = OrdinalEncoder().fit(X_train[category_cols])

# Training part: encoded categorical columns followed by the untouched
# numeric columns, aligned on the original row index
X_train_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_train[category_cols]),
            columns=category_cols,
            index=X_train.index,
        ),
        X_train[numeric_cols],
    ],
    axis=1,
)

# Test part: same layout, using the encoder fitted above
X_test_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_test[category_cols]),
            columns=category_cols,
            index=X_test.index,
        ),
        X_test[numeric_cols],
    ],
    axis=1,
)

In [6]:
# 本节新增第三方库
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [None]:
class VotingClassifier_threshold(BaseEstimator, ClassifierMixin, TransformerMixin):
    """Voting ensemble with a tunable decision threshold for soft voting.

    Thin wrapper around ``sklearn.ensemble.VotingClassifier``. Under soft
    voting, ``predict`` labels a sample with the second class (column 1 of
    ``predict_proba``) when its averaged probability is at least ``thr``;
    under hard voting it defers to the wrapped classifier.

    Parameters
    ----------
    estimators : passed unchanged to ``VotingClassifier(estimators=...)``.
    voting : {"hard", "soft"}, default "hard"
        Passed unchanged to ``VotingClassifier(voting=...)``.
    weights : passed unchanged to ``VotingClassifier(weights=...)``.
    thr : float, default 0.5
        Probability threshold applied to column 1 under soft voting.

    Attributes
    ----------
    clf : the fitted ``VotingClassifier`` (set by ``fit``).
    classes_ : class labels of the fitted ``VotingClassifier`` (set by ``fit``).
    """

    def __init__(self, estimators, voting="hard", weights=None, thr=0.5):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        self.thr = thr

    def fit(self, X, y):
        """Fit a fresh ``VotingClassifier`` on ``(X, y)`` and return ``self``."""
        VC = VotingClassifier(
            estimators=self.estimators, voting=self.voting, weights=self.weights
        )

        VC.fit(X, y)
        self.clf = VC
        # Expose the learned labels on the wrapper as well, as scikit-learn
        # classifiers conventionally do.
        self.classes_ = VC.classes_

        return self

    def predict_proba(self, X):
        """Return class probabilities under soft voting, ``None`` otherwise."""
        if self.voting == "soft":
            res_proba = self.clf.predict_proba(X)
        else:
            # Hard voting produces no probabilities; None is kept as the
            # existing signal for that case.
            res_proba = None
        return res_proba

    def predict(self, X):
        """Predict class labels for ``X``.

        Soft voting: threshold column 1 of ``predict_proba`` at ``thr`` and map
        the resulting 0/1 positions back to the fitted class labels, so the
        output uses the same label values as the training target (identical to
        plain 0/1 when the labels are 0 and 1).
        Hard voting: return the wrapped classifier's prediction.
        """
        if self.voting == "soft":
            above_thr = (self.clf.predict_proba(X)[:, 1] >= self.thr).astype(int)
            res = self.clf.classes_[above_thr]
        else:
            res = self.clf.predict(X)
        return res

    def score(self, X, y, sample_weight=None):
        """Return the accuracy of ``predict(X)`` against the true labels ``y``."""
        # accuracy_score expects (y_true, y_pred)
        acc = accuracy_score(y, self.predict(X), sample_weight=sample_weight)
        return acc

In [None]:
# KFold is imported explicitly so this cell does not depend on a wildcard
# import happening to provide the name
from sklearn.model_selection import KFold

# Instantiate the 5-fold splitter (shuffled, fixed seed)
kf = KFold(n_splits=5, random_state=12, shuffle=True)

# Reset the index of the training features and labels (the test set is not
# touched here)
X_train_OE = X_train_OE.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Per-fold row positions of the training part and the evaluation part
train_part_index_l = []
eval_index_l = []

for train_part_index, eval_index in kf.split(X_train_OE, y_train):
    train_part_index_l.append(train_part_index)
    eval_index_l.append(eval_index)

# KFold.split yields positional indices, so rows are selected with .iloc;
# after the index reset above this picks the same rows as label-based .loc
# did, without relying on the index being 0..n-1.

# Training-part features, one frame per fold
X_train1, X_train2, X_train3, X_train4, X_train5 = (
    X_train_OE.iloc[idx] for idx in train_part_index_l
)

# Evaluation-part features, one frame per fold
X_eval1, X_eval2, X_eval3, X_eval4, X_eval5 = (
    X_train_OE.iloc[idx] for idx in eval_index_l
)

# Training-part labels, one Series per fold
y_train1, y_train2, y_train3, y_train4, y_train5 = (
    y_train.iloc[idx] for idx in train_part_index_l
)

# Evaluation-part labels, one Series per fold
y_eval1, y_eval2, y_eval3, y_eval4, y_eval5 = (
    y_train.iloc[idx] for idx in eval_index_l
)

# (features, labels) pairs per fold
train_set = [
    (X_train1, y_train1),
    (X_train2, y_train2),
    (X_train3, y_train3),
    (X_train4, y_train4),
    (X_train5, y_train5),
]

eval_set = [
    (X_eval1, y_eval1),
    (X_eval2, y_eval2),
    (X_eval3, y_eval3),
    (X_eval4, y_eval4),
    (X_eval5, y_eval5),
]

In [None]:
# NOTE: joblib's load unpickles the files, so only trusted model files
# should be placed under ./model.

# Random-forest model group: saved search objects, one per fold
grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5 = map(
    load,
    (
        "./model/grid_RF_1.joblib",
        "./model/grid_RF_2.joblib",
        "./model/grid_RF_3.joblib",
        "./model/grid_RF_4.joblib",
        "./model/grid_RF_5.joblib",
    ),
)

# Best estimator of each search object, in fold order
RF_l = [
    grid.best_estimator_
    for grid in (grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5)
]
RF_1, RF_2, RF_3, RF_4, RF_5 = RF_l

# Decision-tree model group
grid_tree_1, grid_tree_2, grid_tree_3, grid_tree_4, grid_tree_5 = map(
    load,
    (
        "./model/grid_tree_1.joblib",
        "./model/grid_tree_2.joblib",
        "./model/grid_tree_3.joblib",
        "./model/grid_tree_4.joblib",
        "./model/grid_tree_5.joblib",
    ),
)

tree_l = [
    grid.best_estimator_
    for grid in (grid_tree_1, grid_tree_2, grid_tree_3, grid_tree_4, grid_tree_5)
]
tree_1, tree_2, tree_3, tree_4, tree_5 = tree_l

# Logistic-regression model group
grid_lr_1, grid_lr_2, grid_lr_3, grid_lr_4, grid_lr_5 = map(
    load,
    (
        "./model/grid_lr_1.joblib",
        "./model/grid_lr_2.joblib",
        "./model/grid_lr_3.joblib",
        "./model/grid_lr_4.joblib",
        "./model/grid_lr_5.joblib",
    ),
)

lr_l = [
    grid.best_estimator_
    for grid in (grid_lr_1, grid_lr_2, grid_lr_3, grid_lr_4, grid_lr_5)
]
lr_1, lr_2, lr_3, lr_4, lr_5 = lr_l

In [None]:
def _fold_eval_proba(models):
    """Score each fold's evaluation part with that fold's model.

    Pairs models[0..4] with X_eval1..X_eval5 and returns, per fold, a Series
    holding column 1 of predict_proba, indexed like the evaluation frame.
    """
    eval_frames = (X_eval1, X_eval2, X_eval3, X_eval4, X_eval5)
    return [
        pd.Series(model.predict_proba(frame)[:, 1], index=frame.index)
        for model, frame in zip(models, eval_frames)
    ]


# Random forest: per-fold evaluation probabilities
(
    eval1_predict_proba_RF,
    eval2_predict_proba_RF,
    eval3_predict_proba_RF,
    eval4_predict_proba_RF,
    eval5_predict_proba_RF,
) = _fold_eval_proba(RF_l)

# Stitch the folds together and restore row order
eval_predict_proba_RF = pd.concat(
    [
        eval1_predict_proba_RF,
        eval2_predict_proba_RF,
        eval3_predict_proba_RF,
        eval4_predict_proba_RF,
        eval5_predict_proba_RF,
    ]
).sort_index()

# Decision tree: per-fold evaluation probabilities
(
    eval1_predict_proba_tree,
    eval2_predict_proba_tree,
    eval3_predict_proba_tree,
    eval4_predict_proba_tree,
    eval5_predict_proba_tree,
) = _fold_eval_proba(tree_l)

eval_predict_proba_tree = pd.concat(
    [
        eval1_predict_proba_tree,
        eval2_predict_proba_tree,
        eval3_predict_proba_tree,
        eval4_predict_proba_tree,
        eval5_predict_proba_tree,
    ]
).sort_index()

# Logistic regression: per-fold evaluation probabilities
(
    eval1_predict_proba_lr,
    eval2_predict_proba_lr,
    eval3_predict_proba_lr,
    eval4_predict_proba_lr,
    eval5_predict_proba_lr,
) = _fold_eval_proba(lr_l)

eval_predict_proba_lr = pd.concat(
    [
        eval1_predict_proba_lr,
        eval2_predict_proba_lr,
        eval3_predict_proba_lr,
        eval4_predict_proba_lr,
        eval5_predict_proba_lr,
    ]
).sort_index()

In [None]:
def _mean_test_proba(models):
    """Average column 1 of predict_proba on X_test_OE over models[0..4]."""
    per_model = [models[k].predict_proba(X_test_OE)[:, 1] for k in range(5)]
    return np.array(per_model).mean(0)


# Test-set probabilities per model family, averaged over the five fold models
test_predict_proba_RF = _mean_test_proba(RF_l)
test_predict_proba_tree = _mean_test_proba(tree_l)
test_predict_proba_lr = _mean_test_proba(lr_l)