In [25]:
# 基础数据科学运算库
import numpy as np
import pandas as pd
from numpy.random import RandomState

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

# 其他辅助模块
import inspect, re
from tqdm import tqdm
import gc
import time
from joblib import dump, load
import warnings

warnings.filterwarnings("ignore")

# hyperopt优化器
from hyperopt import hp, fmin, tpe

from telcoFunc import *
from manual_ensemble import *

In [2]:
from deepforest import CascadeForestClassifier

In [3]:
CascadeForestClassifier?

[1;31mInit signature:[0m
[0mCascadeForestClassifier[0m[1;33m([0m[1;33m
[0m    [0mn_bins[0m[1;33m=[0m[1;36m255[0m[1;33m,[0m[1;33m
[0m    [0mbin_subsample[0m[1;33m=[0m[1;36m200000[0m[1;33m,[0m[1;33m
[0m    [0mbin_type[0m[1;33m=[0m[1;34m'percentile'[0m[1;33m,[0m[1;33m
[0m    [0mmax_layers[0m[1;33m=[0m[1;36m20[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'gini'[0m[1;33m,[0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mn_trees[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0muse_predictor[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mpredictor[0m[1;33m=[0m[1;34m'forest'[0m[1;33m,[0m[1;33m
[0m    [0mpredictor_kwargs[0m

In [4]:
from deepforest import CascadeForestRegressor

In [5]:
CascadeForestRegressor?

[1;31mInit signature:[0m
[0mCascadeForestRegressor[0m[1;33m([0m[1;33m
[0m    [0mn_bins[0m[1;33m=[0m[1;36m255[0m[1;33m,[0m[1;33m
[0m    [0mbin_subsample[0m[1;33m=[0m[1;36m200000[0m[1;33m,[0m[1;33m
[0m    [0mbin_type[0m[1;33m=[0m[1;34m'percentile'[0m[1;33m,[0m[1;33m
[0m    [0mmax_layers[0m[1;33m=[0m[1;36m20[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'mse'[0m[1;33m,[0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mn_trees[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0muse_predictor[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mpredictor[0m[1;33m=[0m[1;34m'forest'[0m[1;33m,[0m[1;33m
[0m    [0mpredictor_kwargs[0m[1

In [6]:
from sklearn.datasets import load_digits

In [7]:
X, y = load_digits(return_X_y=True)

In [9]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, random_state=1)
model = CascadeForestClassifier(random_state=8)
model.fit(X_train1, y_train1)
y_pred = model.predict(X_test1)
acc = accuracy_score(y_test1, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))

[2023-09-06 11:26:56.526] Start to fit the model:
[2023-09-06 11:26:56.526] Fitting cascade layer = 0 
[2023-09-06 11:26:56.951] layer = 0  | Val Acc = 97.921 % | Elapsed = 0.425 s
[2023-09-06 11:26:56.953] Fitting cascade layer = 1 
[2023-09-06 11:26:57.389] layer = 1  | Val Acc = 98.144 % | Elapsed = 0.435 s
[2023-09-06 11:26:57.391] Fitting cascade layer = 2 
[2023-09-06 11:26:57.760] layer = 2  | Val Acc = 97.699 % | Elapsed = 0.369 s
[2023-09-06 11:26:57.760] Early stopping counter: 1 out of 2
[2023-09-06 11:26:57.762] Fitting cascade layer = 3 
[2023-09-06 11:26:58.122] layer = 3  | Val Acc = 96.956 % | Elapsed = 0.359 s
[2023-09-06 11:26:58.122] Early stopping counter: 2 out of 2
[2023-09-06 11:26:58.122] Handling early stopping
[2023-09-06 11:26:58.122] The optimal number of layers: 2
[2023-09-06 11:26:58.123] Start to evalute the model:
[2023-09-06 11:26:58.124] Evaluating cascade layer = 0 
[2023-09-06 11:26:58.140] Evaluating cascade layer = 1 

Testing Accuracy: 98.444 %


In [10]:
model = DecisionTreeClassifier(random_state=8)
model.fit(X_train1, y_train1)
y_pred = model.predict(X_test1)
acc = accuracy_score(y_test1, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))


Testing Accuracy: 84.889 %


In [11]:
model = RandomForestClassifier(random_state=8)
model.fit(X_train1, y_train1)
y_pred = model.predict(X_test1)
acc = accuracy_score(y_test1, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))


Testing Accuracy: 98.222 %


In [12]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [13]:
model = XGBClassifier(random_state=8)
model.fit(X_train1, y_train1)
y_pred = model.predict(X_test1)
acc = accuracy_score(y_test1, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))


Testing Accuracy: 97.111 %


In [15]:
model = LGBMClassifier(random_state=8)
model.fit(
    X_train1,
    y_train1,
    categorical_feature="auto",
    eval_set=(X_test1, y_test1),
    feature_name="auto",
)
y_pred = model.predict(X_test1)
acc = accuracy_score(y_test1, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 832
[LightGBM] [Info] Number of data points in the train set: 1347, number of used features: 53
[LightGBM] [Info] Start training from score -2.377321
[LightGBM] [Info] Start training from score -2.263993
[LightGBM] [Info] Start training from score -2.292980
[LightGBM] [Info] Start training from score -2.330438
[LightGBM] [Info] Start training from score -2.307795
[LightGBM] [Info] Start training from score -2.242791
[LightGBM] [Info] Start training from score -2.278381
[LightGBM] [Info] Start training from score -2.330438
[LightGBM] [Info] Start training from score -2.285654
[LightGBM] [Info] Start training from score -2.322833

Testing Accuracy: 96.444 %


In [18]:
model = CatBoostClassifier(random_state=8)
model.fit(
    X_train1,
    y_train1,
    eval_set=(X_test1, y_test1),
)
y_pred = model.predict(X_test1)
acc = accuracy_score(y_test1, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))

Learning rate set to 0.108621
0:	learn: 2.0582560	test: 2.0750653	best: 2.0750653 (0)	total: 142ms	remaining: 2m 22s
1:	learn: 1.8272224	test: 1.8508357	best: 1.8508357 (1)	total: 145ms	remaining: 1m 12s
2:	learn: 1.6543105	test: 1.6790582	best: 1.6790582 (2)	total: 148ms	remaining: 49.1s
3:	learn: 1.5245092	test: 1.5521527	best: 1.5521527 (3)	total: 151ms	remaining: 37.5s
4:	learn: 1.4037097	test: 1.4356270	best: 1.4356270 (4)	total: 153ms	remaining: 30.5s
5:	learn: 1.2980889	test: 1.3384415	best: 1.3384415 (5)	total: 156ms	remaining: 25.9s
6:	learn: 1.2080853	test: 1.2524339	best: 1.2524339 (6)	total: 159ms	remaining: 22.5s
7:	learn: 1.1245619	test: 1.1759412	best: 1.1759412 (7)	total: 162ms	remaining: 20.1s
8:	learn: 1.0653802	test: 1.1177609	best: 1.1177609 (8)	total: 164ms	remaining: 18.1s
9:	learn: 1.0022834	test: 1.0537456	best: 1.0537456 (9)	total: 166ms	remaining: 16.4s
10:	learn: 0.9413678	test: 0.9977078	best: 0.9977078 (10)	total: 168ms	remaining: 15.1s
11:	learn: 0.8841285

In [19]:
# 读取数据
tcc = pd.read_csv('../03_模型融合/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# 标注连续/离散字段
# 离散字段
category_cols = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# 连续字段
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# 标签
target = 'Churn'

# ID列
ID_col = 'customerID'

# 验证是否划分能完全
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# 连续字段转化
tcc['TotalCharges'] = (
    tcc['TotalCharges'].apply(lambda x: x if x != ' ' else np.nan).astype(float)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# 缺失值填补
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# 标签值手动转化
tcc['Churn'].replace(to_replace='Yes', value=1, inplace=True)
tcc['Churn'].replace(to_replace='No', value=0, inplace=True)

In [20]:
features = tcc.drop(columns=[ID_col, target]).copy()
labels = tcc["Churn"].copy()

In [26]:
# 划分训练集和测试集
train, test = train_test_split(tcc, random_state=22)

X_train = train.drop(columns=[ID_col, target]).copy()
X_test = test.drop(columns=[ID_col, target]).copy()

y_train = train["Churn"].copy()
y_test = test["Churn"].copy()

X_train_seq = pd.DataFrame()
X_test_seq = pd.DataFrame()

# 年份衍生
X_train_seq["tenure_year"] = ((72 - X_train["tenure"]) // 12) + 2014
X_test_seq["tenure_year"] = ((72 - X_test["tenure"]) // 12) + 2014

# 月份衍生
X_train_seq["tenure_month"] = (72 - X_train["tenure"]) % 12 + 1
X_test_seq["tenure_month"] = (72 - X_test["tenure"]) % 12 + 1

# 季度衍生
X_train_seq["tenure_quarter"] = ((X_train_seq["tenure_month"] - 1) // 3) + 1
X_test_seq["tenure_quarter"] = ((X_test_seq["tenure_month"] - 1) // 3) + 1

# 独热编码
enc = preprocessing.OneHotEncoder()
enc.fit(X_train_seq)

seq_new = list(X_train_seq.columns)

# 创建带有列名称的独热编码之后的df
X_train_seq = pd.DataFrame(
    enc.transform(X_train_seq).toarray(), columns=cate_colName(enc, seq_new, drop=None)
)

X_test_seq = pd.DataFrame(
    enc.transform(X_test_seq).toarray(), columns=cate_colName(enc, seq_new, drop=None)
)

# 调整index
X_train_seq.index = X_train.index
X_test_seq.index = X_test.index

In [27]:
ord_enc = OrdinalEncoder()
ord_enc.fit(X_train[category_cols])

X_train_OE = pd.DataFrame(
    ord_enc.transform(X_train[category_cols]), columns=category_cols
)
X_train_OE.index = X_train.index
X_train_OE = pd.concat([X_train_OE, X_train[numeric_cols]], axis=1)

X_test_OE = pd.DataFrame(
    ord_enc.transform(X_test[category_cols]), columns=category_cols
)
X_test_OE.index = X_test.index
X_test_OE = pd.concat([X_test_OE, X_test[numeric_cols]], axis=1)

In [28]:
model = CascadeForestClassifier(random_state=8)
model.fit(X_train_OE.values, y_train.values)
y_pred = model.predict(X_test_OE)
acc = accuracy_score(y_test, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))

[2023-09-06 11:40:56.746] Start to fit the model:
[2023-09-06 11:40:56.746] Fitting cascade layer = 0 
[2023-09-06 11:40:57.615] layer = 0  | Val Acc = 79.137 % | Elapsed = 0.869 s
[2023-09-06 11:40:57.620] Fitting cascade layer = 1 
[2023-09-06 11:40:58.794] layer = 1  | Val Acc = 80.178 % | Elapsed = 1.174 s
[2023-09-06 11:40:58.799] Fitting cascade layer = 2 
[2023-09-06 11:40:59.986] layer = 2  | Val Acc = 80.121 % | Elapsed = 1.187 s
[2023-09-06 11:40:59.986] Early stopping counter: 1 out of 2
[2023-09-06 11:40:59.991] Fitting cascade layer = 3 
[2023-09-06 11:41:01.167] layer = 3  | Val Acc = 80.329 % | Elapsed = 1.175 s
[2023-09-06 11:41:01.172] Fitting cascade layer = 4 
[2023-09-06 11:41:02.346] layer = 4  | Val Acc = 79.799 % | Elapsed = 1.174 s
[2023-09-06 11:41:02.347] Early stopping counter: 1 out of 2
[2023-09-06 11:41:02.351] Fitting cascade layer = 5 
[2023-09-06 11:41:03.517] layer = 5  | Val Acc = 80.140 % | Elapsed = 1.166 s
[2023-09-06 11:41:03.518] Early stopping c

In [29]:
# XGBoost模型预测结果
model = XGBClassifier(random_state=8)
model.fit(X_train_OE, y_train)
y_pred = model.predict(X_test_OE)
acc = accuracy_score(y_test, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))


Testing Accuracy: 77.740 %


In [31]:
# LGBM模型预测结果
model = LGBMClassifier(random_state=8)
model.fit(X_train_OE, y_train, categorical_feature = 'auto',eval_set=(X_test_OE,y_test),feature_name='auto')
y_pred = model.predict(X_test_OE)
acc = accuracy_score(y_test, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))

[LightGBM] [Info] Number of positive: 1381, number of negative: 3901
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 5282, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.261454 -> initscore=-1.038425
[LightGBM] [Info] Start training from score -1.038425

Testing Accuracy: 78.705 %


In [32]:
# CatBoost模型预测结果
categorical_features_indices = np.where(X_train_OE.dtypes != np.float)[0]

model = CatBoostClassifier(verbose=False, random_state=8)
model.fit(
    X_train_OE,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_test_OE, y_test),
)
y_pred = model.predict(X_test_OE)
acc = accuracy_score(y_test, y_pred) * 100
print("\nTesting Accuracy: {:.3f} %".format(acc))


Testing Accuracy: 78.819 %


|Models|test_score|
|:--:|:--:|
|CascadeForest|79.046 %|
|XGBoost|77.740 %|
|LGBM|78.705%|
|CatBoost|78.819 %|