In [1]:
# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings

warnings.filterwarnings('ignore')

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# 自定义模块
from telcoFunc import *

# 导入特征衍生模块
import features_creation as fc
from features_creation import *

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc

In [2]:
# Load the raw customer-churn table
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Label column and ID column
target = 'Churn'
ID_col = 'customerID'

# Continuous fields
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Discrete fields
category_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# Sanity check: discrete + continuous + (ID, label) must account for every
# column of the loaded frame.
assert len(category_cols) + len(numeric_cols) + len((ID_col, target)) == len(
    tcc.columns
)

# Continuous-field conversion: entries equal to a single space are treated as
# missing and turned into NaN before the column is cast to float.
tcc['TotalCharges'] = (
    tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# Missing-value imputation: fill the NaNs created above with 0.
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# Label encoding: 'Yes' -> 1, 'No' -> 0.
# The result is assigned back explicitly instead of calling
# `tcc['Churn'].replace(..., inplace=True)`: an in-place method on a column
# selection is a chained assignment, which does not update `tcc` when pandas
# Copy-on-Write is active. The cast to int64 fails loudly if any label other
# than 'Yes'/'No' is present (map would turn it into NaN).
tcc['Churn'] = tcc['Churn'].map({'Yes': 1, 'No': 0}).astype('int64')

In [3]:
# Full-dataset label vector and feature matrix (ID and label columns removed).
labels = tcc['Churn'].copy()
features = tcc.drop(columns=[ID_col, target]).copy()

In [4]:
# Split the full table into training and test parts
train, test = train_test_split(tcc, random_state=22)

X_train, X_test = (
    part.drop(columns=[ID_col, target]).copy() for part in (train, test)
)
y_train, y_test = (part['Churn'].copy() for part in (train, test))


def _tenure_to_calendar(tenure):
    """Derive year / month / quarter columns from a tenure Series.

    The offset ``72 - tenure`` is split into a year (``2014 + offset // 12``),
    a month (``offset % 12 + 1``) and the quarter that month falls in.
    NOTE(review): 72 is presumably the maximum tenure in months and 2014 the
    first calendar year covered - confirm against the data description.
    The returned frame keeps the index of ``tenure``.
    """
    offset = 72 - tenure
    month = offset % 12 + 1
    return pd.DataFrame(
        {
            'tenure_year': offset // 12 + 2014,
            'tenure_month': month,
            'tenure_quarter': (month - 1) // 3 + 1,
        }
    )


# Year / month / quarter derivation for both parts
X_train_seq = _tenure_to_calendar(X_train['tenure'])
X_test_seq = _tenure_to_calendar(X_test['tenure'])

# One-hot encoding, fitted on the training part only
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

seq_new = list(X_train_seq.columns)

# Rebuild named DataFrames from the encoded arrays; the original row index is
# attached directly in the constructor. `cate_colName` is not defined in this
# notebook (presumably it comes from the telcoFunc star import) and supplies
# the column names for the encoded output.
X_train_seq = pd.DataFrame(
    enc.transform(X_train_seq).toarray(),
    columns=cate_colName(enc, seq_new, drop=None),
    index=X_train.index,
)

X_test_seq = pd.DataFrame(
    enc.transform(X_test_seq).toarray(),
    columns=cate_colName(enc, seq_new, drop=None),
    index=X_test.index,
)

In [5]:
# Ordinal-encode the discrete columns; the encoder is fitted on the training
# part only and then applied to both parts.
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])

# Encoded discrete columns (original row index attached) side by side with
# the untouched continuous columns.
X_train_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_train[category_cols]),
            columns=category_cols,
            index=X_train.index,
        ),
        X_train[numeric_cols],
    ],
    axis=1,
)

X_test_OE = pd.concat(
    [
        pd.DataFrame(
            ord_enc.transform(X_test[category_cols]),
            columns=category_cols,
            index=X_test.index,
        ),
        X_test[numeric_cols],
    ],
    axis=1,
)

In [6]:
# 本节新增第三方库
from joblib import dump, load
from sklearn.ensemble import VotingClassifier
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.model_selection import cross_val_score

In [7]:
# KFold is imported explicitly: no visible cell imports it by name, so without
# this line the cell only works if one of the `from ... import *` statements
# happens to leak it into the namespace.
from sklearn.model_selection import KFold

# 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, random_state=12, shuffle=True)

# kf.split yields positional indices; resetting to a 0..n-1 index lets those
# positions be used as labels with .loc when the folds are materialised.
X_train_OE = X_train_OE.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [8]:
# Peek at the first split only: training-part indices, then evaluation indices.
train_part_index, eval_index = next(iter(kf.split(X_train_OE, y_train)))
print(train_part_index)
print(eval_index)

[   1    2    5 ... 5279 5280 5281]
[   0    3    4 ... 5271 5274 5275]


In [9]:
# Materialise every split once, then separate the two index arrays per fold.
folds = list(kf.split(X_train_OE, y_train))

train_part_index_l = [train_idx for train_idx, _ in folds]
eval_index_l = [eval_idx for _, eval_idx in folds]

# Leave the loop-style names bound to the last fold, as a plain for-loop would.
train_part_index, eval_index = folds[-1]

In [10]:
# Training-part features, one frame per fold
X_train1, X_train2, X_train3, X_train4, X_train5 = (
    X_train_OE.loc[fold_idx] for fold_idx in train_part_index_l
)

# Evaluation-part features, one frame per fold
X_eval1, X_eval2, X_eval3, X_eval4, X_eval5 = (
    X_train_OE.loc[fold_idx] for fold_idx in eval_index_l
)

# Training-part labels, one Series per fold
y_train1, y_train2, y_train3, y_train4, y_train5 = (
    y_train.loc[fold_idx] for fold_idx in train_part_index_l
)

# Evaluation-part labels, one Series per fold
y_eval1, y_eval2, y_eval3, y_eval4, y_eval5 = (
    y_train.loc[fold_idx] for fold_idx in eval_index_l
)

In [11]:
# (features, labels) pair for the training part of each fold
train_set = list(
    zip(
        (X_train1, X_train2, X_train3, X_train4, X_train5),
        (y_train1, y_train2, y_train3, y_train4, y_train5),
    )
)

In [12]:
# (features, labels) pair for the evaluation part of each fold
eval_set = list(
    zip(
        (X_eval1, X_eval2, X_eval3, X_eval4, X_eval5),
        (y_eval1, y_eval2, y_eval3, y_eval4, y_eval5),
    )
)

In [15]:
# Exhaustive grid search for a random forest on the training part of fold 1.
# The grid below has 3 * 2 * 4 * 4 * 3 * 5 * 4 = 5760 combinations, so this
# cell is slow; its wall-clock time in seconds is printed at the end.
start = time.time()

parameter_space = dict(
    min_samples_leaf=range(4, 7),
    min_samples_split=range(2, 4),
    max_depth=range(7, 11),
    max_leaf_nodes=[None, *range(31, 34)],
    n_estimators=range(93, 96),
    max_features=['sqrt', 'log2', *range(5, 8)],
    max_samples=[None, 0.49, 0.5, 0.51],
)

# Fixed seed on the forest; the search itself uses all available cores.
RF_1 = RandomForestClassifier(random_state=12)
grid_RF_1 = GridSearchCV(estimator=RF_1, param_grid=parameter_space, n_jobs=-1)

grid_RF_1.fit(X_train1, y_train1)

print(time.time() - start)

2870.218293428421


In [16]:
# Best score found by the fold-1 grid search (default scoring is used -
# presumably accuracy for a classifier; confirm against the sklearn docs).
grid_RF_1.best_score_

0.805680473372781

In [17]:
# Hyper-parameter combination selected by the fold-1 grid search.
grid_RF_1.best_params_

{'max_depth': 8,
 'max_features': 6,
 'max_leaf_nodes': 32,
 'max_samples': 0.5,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 94}

In [19]:
# Score of the fold-1 search on: its own training part, its held-out
# evaluation part, and the test set (in that order).
tuple(
    grid_RF_1.score(X_part, y_part)
    for X_part, y_part in (
        (X_train1, y_train1),
        (X_eval1, y_eval1),
        (X_test_OE, y_test),
    )
)

(0.818698224852071, 0.8164616840113529, 0.7819420783645656)

In [21]:
# Restore the five fitted per-fold grid searches from disk.
# NOTE(review): no visible cell writes these files, so a fresh
# Restart & Run All depends on them already existing under ./model/.
# NOTE(review): joblib's `load` is pickle-based - only load files you trust.
grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5 = map(
    load,
    (
        './model/grid_RF_1.joblib',
        './model/grid_RF_2.joblib',
        './model/grid_RF_3.joblib',
        './model/grid_RF_4.joblib',
        './model/grid_RF_5.joblib',
    ),
)

In [22]:
# Pull the best fitted estimator out of each per-fold grid search.
RF_1, RF_2, RF_3, RF_4, RF_5 = (
    search.best_estimator_
    for search in (grid_RF_1, grid_RF_2, grid_RF_3, grid_RF_4, grid_RF_5)
)

In [23]:
# Per-fold best estimators in fold order, so RF_l[i] pairs with fold i + 1.
RF_l = [RF_1, RF_2, RF_3, RF_4, RF_5]

In [24]:
# Score each fold's model on that fold's evaluation part, then average.
# An explicit running total is kept (rather than sum()) so the floating-point
# accumulation order and rounding stay exactly as written.
eval_score = 0

for i, (X, y) in enumerate(eval_set):
    eval_score += RF_l[i].score(X, y)

eval_score / len(eval_set)

0.8173065207419512

In [25]:
# Out-of-fold probabilities: each fold's model predicts column 1 of
# predict_proba for its own evaluation rows; the row index is kept so the
# pieces can be put back in order later.
(
    eval1_predict_proba_RF,
    eval2_predict_proba_RF,
    eval3_predict_proba_RF,
    eval4_predict_proba_RF,
    eval5_predict_proba_RF,
) = (
    pd.Series(fold_model.predict_proba(X_fold)[:, 1], index=X_fold.index)
    for fold_model, X_fold in zip(
        RF_l, (X_eval1, X_eval2, X_eval3, X_eval4, X_eval5)
    )
)

In [26]:
# Stack the five per-fold probability Series, then restore row order.
stacked_fold_probas = pd.concat(
    [
        eval1_predict_proba_RF,
        eval2_predict_proba_RF,
        eval3_predict_proba_RF,
        eval4_predict_proba_RF,
        eval5_predict_proba_RF,
    ]
)
eval_predict_proba_RF = stacked_fold_probas.sort_index()

eval_predict_proba_RF

0       0.044787
1       0.572187
2       0.161815
3       0.250871
4       0.122533
          ...   
5277    0.082653
5278    0.346562
5279    0.551481
5280    0.049011
5281    0.002783
Length: 5282, dtype: float64

In [27]:
# Accuracy of the out-of-fold predictions at a 0.5 probability threshold.
# Arguments are passed by keyword in the documented (y_true, y_pred) roles;
# the positional call had them swapped, which is harmless for accuracy but
# gives wrong numbers as soon as an asymmetric metric is substituted.
accuracy_score(
    y_true=y_train,
    y_pred=(eval_predict_proba_RF >= 0.5).astype(int),
)

0.8173040514956456

In [28]:
# Average test-set score of the five per-fold models.
test_score = sum(fold_model.score(X_test_OE, y_test) for fold_model in RF_l)
test_score / len(RF_l)

0.7904599659284497

In [29]:
# Test-set probabilities (column 1 of predict_proba) from every fold model,
# stacked into one array with a row per model.
test_predict_proba_RF = np.array(
    [fold_model.predict_proba(X_test_OE)[:, 1] for fold_model in RF_l]
)

In [30]:
# Inspect the stacked test-set probabilities (one row per model in RF_l).
test_predict_proba_RF

array([[3.93370391e-02, 2.49403422e-01, 2.83410667e-02, ...,
        1.49452536e-01, 4.98110882e-01, 1.18764226e-01],
       [4.61954631e-02, 2.45875467e-01, 3.00760027e-02, ...,
        1.38993336e-01, 5.02889767e-01, 1.16184501e-01],
       [2.00137393e-03, 4.16662468e-01, 4.90918017e-04, ...,
        1.42216654e-01, 6.08904016e-01, 8.56019321e-02],
       [3.59908839e-02, 3.22891626e-01, 1.62280766e-02, ...,
        1.52682083e-01, 5.04891040e-01, 1.27421886e-01],
       [2.25733214e-02, 3.25064759e-01, 6.08561589e-03, ...,
        1.43386982e-01, 5.38634836e-01, 8.61879770e-02]])

In [31]:
# Average the per-model probabilities for each test row (mean over axis 0).
# The name is rebound to its own reduction, so a plain `.mean(0)` would
# collapse the already-averaged vector to a single scalar if this cell were
# run twice. `np.atleast_2d` views a 1-D input as a single row, so a re-run
# returns the same vector and the cell is idempotent.
test_predict_proba_RF = np.atleast_2d(test_predict_proba_RF).mean(0)
test_predict_proba_RF

array([0.02921962, 0.31197955, 0.01624434, ..., 0.14534632, 0.53068611,
       0.1068321 ])

In [32]:
# Test-set accuracy of the averaged probabilities at a 0.5 threshold.
# Arguments are passed by keyword in the documented (y_true, y_pred) roles;
# the positional call had them swapped, which is harmless for accuracy but
# gives wrong numbers as soon as an asymmetric metric is substituted.
accuracy_score(
    y_true=y_test,
    y_pred=(test_predict_proba_RF >= 0.5).astype(int),
)

0.7915956842703009