In [1]:
### 不同分类器训练时长

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
import time

In [27]:
# data = pd.read_csv("../../data/process_data.csv")
data = pd.read_csv("../../data/new_features_data_2.csv")

if 'Unnamed: 0' in data.columns:
    data = data.drop(['Unnamed: 0','个人编码'], axis=1)
else:
    data = data.drop('个人编码', axis=1)

In [28]:
# 筛选RES列值为1的数据
data_res_1 = data[data['RES'] == 1]

# 计算RES为1的数据中每列的缺失值数量
missing_values_count_res_1 = data_res_1.isnull().sum()

# 过滤出有缺失值的列
missing_values_res_1 = missing_values_count_res_1[missing_values_count_res_1 > 0]

# 打印有缺失值的列和对应的缺失值数量（当RES列的值为1）
print("当RES列的值为1时，有缺失值的列及其缺失数量：")
print(missing_values_res_1)

# 构建的特征集中含有缺失值，部分分类器对缺失值敏感，删除样本
data = data.dropna()

当RES列的值为1时，有缺失值的列及其缺失数量：
月就诊天数_AVG_病种费用标准差值            3
月就诊医院数_MAX_病种费用标准差值           3
月就诊医院数_AVG_病种费用标准差值           3
就诊次数_SUM_病种费用标准差值             3
月就诊次数_MAX_病种费用标准差值            3
月就诊次数_AVG_病种费用标准差值            3
月统筹金额_MAX_病种费用标准差值            3
月统筹金额_AVG_病种费用标准差值            3
月药品金额_MAX_病种费用标准差值            3
月药品金额_AVG_病种费用标准差值            3
医院_就诊天数_MAX_病种费用标准差值          3
医院_就诊天数_AVG_病种费用标准差值          3
医院_统筹金_MAX_病种费用标准差值           3
医院_统筹金_AVG_病种费用标准差值           3
医院_药品_MAX_病种费用标准差值            3
医院_药品_AVG_病种费用标准差值            3
个人账户金额_SUM_病种费用标准差值           3
统筹支付金额_SUM_病种费用标准差值           3
ALL_SUM_病种费用标准差值              3
可用账户报销金额_SUM_病种费用标准差值         3
药品费发生金额_SUM_病种费用标准差值          3
药品费自费金额_SUM_病种费用标准差值          3
药品费申报金额_SUM_病种费用标准差值          3
贵重药品发生金额_SUM_病种费用标准差值         3
中成药费发生金额_SUM_病种费用标准差值         3
中草药费发生金额_SUM_病种费用标准差值         3
检查费发生金额_SUM_病种费用标准差值          3
检查费自费金额_SUM_病种费用标准差值          3
检查费申报金额_SUM_病种费用标准差值          3
贵重检查费金额_SUM_病种费用标准差值          3
治疗费发生金额_SUM_病种费

In [29]:
X = data.drop('RES', axis=1)
y = data['RES']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# RF

In [6]:
rf_model = RandomForestClassifier(random_state=42)

start_time = time.time()  # 记录训练开始时间

rf_model.fit(X_train, y_train)
end_time = time.time()  # 记录训练结束时间
training_time = end_time - start_time  # 计算训练所花费的时间

print(f"模型训练耗时：{training_time} 秒")

rf_predictions = rf_model.predict(X_test)
rf_probabilities = rf_model.predict_proba(X_test)[:, 1]
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))
print("Random Forest AUC Score:", roc_auc_score(y_test, rf_probabilities))

模型训练耗时：18.429328203201294 秒
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3802
           1       0.82      0.37      0.51       198

    accuracy                           0.96      4000
   macro avg       0.89      0.68      0.75      4000
weighted avg       0.96      0.96      0.96      4000

Random Forest AUC Score: 0.9145306298120606


In [7]:
rf_predictions = rf_model.predict(X_train)
rf_probabilities = rf_model.predict_proba(X_train)[:, 1]
print("Random Forest Classification Report:")
print(classification_report(y_train, rf_predictions))
print("Random Forest AUC Score:", roc_auc_score(y_train, rf_probabilities))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11405
           1       1.00      1.00      1.00       595

    accuracy                           1.00     12000
   macro avg       1.00      1.00      1.00     12000
weighted avg       1.00      1.00      1.00     12000

Random Forest AUC Score: 1.0


# GBDT

In [8]:
gbdt_model = GradientBoostingClassifier(random_state=42)

start_time = time.time()  # 记录训练开始时间

gbdt_model.fit(X_train, y_train)

end_time = time.time()  # 记录训练结束时间
training_time = end_time - start_time  # 计算训练所花费的时间

print(f"模型训练耗时：{training_time} 秒")

gbdt_predictions = gbdt_model.predict(X_test)
gbdt_probabilities = gbdt_model.predict_proba(X_test)[:, 1]
print("GBDT Classification Report:")
print(classification_report(y_test, gbdt_predictions))
print("GBDT AUC Score:", roc_auc_score(y_test, gbdt_probabilities))

模型训练耗时：51.56669878959656 秒
GBDT Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3802
           1       0.78      0.40      0.53       198

    accuracy                           0.96      4000
   macro avg       0.87      0.70      0.76      4000
weighted avg       0.96      0.96      0.96      4000

GBDT AUC Score: 0.9127353227169114


In [9]:
gbdt_predictions = gbdt_model.predict(X_train)
gbdt_probabilities = gbdt_model.predict_proba(X_train)[:, 1]
print("GBDT Classification Report:")
print(classification_report(y_train, gbdt_predictions))
print("GBDT AUC Score:", roc_auc_score(y_train, gbdt_probabilities))

GBDT Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     11405
           1       0.93      0.46      0.62       595

    accuracy                           0.97     12000
   macro avg       0.95      0.73      0.80     12000
weighted avg       0.97      0.97      0.97     12000

GBDT AUC Score: 0.9604886549095745


# XGBoost

In [10]:
xgb_model = XGBClassifier(random_state=42)

start_time = time.time()  # 记录训练开始时间

xgb_model.fit(X_train, y_train)

end_time = time.time()  # 记录训练结束时间
training_time = end_time - start_time  # 计算训练所花费的时间

print(f"模型训练耗时：{training_time} 秒")


xgb_predictions = xgb_model.predict(X_test)
xgb_probabilities = xgb_model.predict_proba(X_test)[:, 1]
print("XGBoost Classification Report:")
print(classification_report(y_test, xgb_predictions))
print("XGBoost AUC Score:", roc_auc_score(y_test, xgb_probabilities))

模型训练耗时：1.9439046382904053 秒
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3802
           1       0.71      0.40      0.51       198

    accuracy                           0.96      4000
   macro avg       0.84      0.70      0.75      4000
weighted avg       0.96      0.96      0.96      4000

XGBoost AUC Score: 0.9354778718271616


In [11]:
xgb_predictions = xgb_model.predict(X_train)
xgb_probabilities = xgb_model.predict_proba(X_train)[:, 1]
print("XGBoost Classification Report:")
print(classification_report(y_train, xgb_predictions))
print("XGBoost AUC Score:", roc_auc_score(y_train, xgb_probabilities))

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11405
           1       1.00      1.00      1.00       595

    accuracy                           1.00     12000
   macro avg       1.00      1.00      1.00     12000
weighted avg       1.00      1.00      1.00     12000

XGBoost AUC Score: 1.0


# lightGBM

In [12]:
lgb_model = LGBMClassifier(random_state=42)

start_time = time.time()  # 记录训练开始时间
lgb_model.fit(X_train, y_train)

end_time = time.time()  # 记录训练结束时间
training_time = end_time - start_time  # 计算训练所花费的时间

print(f"模型训练耗时：{training_time} 秒")

lgb_predictions = lgb_model.predict(X_test)
lgb_probabilities = lgb_model.predict_proba(X_test)[:, 1]
print("LightGBM Classification Report:")
print(classification_report(y_test, lgb_predictions))
print("LightGBM AUC Score:", roc_auc_score(y_test, lgb_probabilities))

[LightGBM] [Info] Number of positive: 595, number of negative: 11405
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011308 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12757
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049583 -> initscore=-2.953246
[LightGBM] [Info] Start training from score -2.953246
模型训练耗时：1.504951000213623 秒
LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3802
           1       0.72      0.38      0.50       198

    accuracy                           0.96      4000
   macro avg       0.85      0.69      0.74      4000
weighted avg       0.96      0.96      0.96      4000

LightGBM AUC Score: 0.9329380071094958


In [13]:
lgb_predictions = lgb_model.predict(X_train)
lgb_probabilities = lgb_model.predict_proba(X_train)[:, 1]
print("LightGBM Classification Report:")
print(classification_report(y_train, lgb_predictions))
print("LightGBM AUC Score:", roc_auc_score(y_train, lgb_probabilities))

LightGBM Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11405
           1       1.00      0.97      0.99       595

    accuracy                           1.00     12000
   macro avg       1.00      0.99      0.99     12000
weighted avg       1.00      1.00      1.00     12000

LightGBM AUC Score: 0.9999988210979263


# 决策树

In [14]:
dt_model = DecisionTreeClassifier(random_state=42)

start_time = time.time()  # 记录训练开始时间

dt_model.fit(X_train, y_train)

end_time = time.time()  # 记录训练结束时间
training_time = end_time - start_time  # 计算训练所花费的时间

print(f"模型训练耗时：{training_time} 秒")


dt_predictions = dt_model.predict(X_test)
dt_probabilities = dt_model.predict_proba(X_test)[:, 1]

print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_predictions))
print("Decision Tree AUC Score:", roc_auc_score(y_test, dt_probabilities))

模型训练耗时：2.8408243656158447 秒
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      3802
           1       0.38      0.46      0.42       198

    accuracy                           0.94      4000
   macro avg       0.68      0.71      0.69      4000
weighted avg       0.94      0.94      0.94      4000

Decision Tree AUC Score: 0.7104660492351181


In [15]:
dt_predictions = dt_model.predict(X_train)
dt_probabilities = dt_model.predict_proba(X_train)[:, 1]

print("Decision Tree Classification Report:")
print(classification_report(y_train, dt_predictions))
print("Decision Tree AUC Score:", roc_auc_score(y_train, dt_probabilities))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11405
           1       1.00      1.00      1.00       595

    accuracy                           1.00     12000
   macro avg       1.00      1.00      1.00     12000
weighted avg       1.00      1.00      1.00     12000

Decision Tree AUC Score: 1.0


# Logistic

In [30]:
log_model = make_pipeline(StandardScaler(), LogisticRegression(random_state=42,max_iter=500))

start_time = time.time()  # 记录训练开始时间
log_model.fit(X_train, y_train)

end_time = time.time()  # 记录训练结束时间
training_time = end_time - start_time  # 计算训练所花费的时间

print(f"模型训练耗时：{training_time} 秒")

log_predictions = log_model.predict(X_test)
log_probabilities = log_model.predict_proba(X_test)[:, 1]

print("Logistic Regression Classification Report:")
print(classification_report(y_test, log_predictions))
print("Logistic Regression AUC Score:", roc_auc_score(y_test, log_probabilities))

模型训练耗时：21.723883152008057 秒
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      3791
           1       0.59      0.38      0.46       209

    accuracy                           0.95      4000
   macro avg       0.78      0.68      0.72      4000
weighted avg       0.95      0.95      0.95      4000

Logistic Regression AUC Score: 0.8443095520869752


In [31]:
log_predictions = log_model.predict(X_train)
log_probabilities = log_model.predict_proba(X_train)[:, 1]

print("Logistic Regression Classification Report:")
print(classification_report(y_train, log_predictions))
print("Logistic Regression AUC Score:", roc_auc_score(y_train, log_probabilities))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     11416
           1       0.93      0.54      0.69       581

    accuracy                           0.98     11997
   macro avg       0.95      0.77      0.84     11997
weighted avg       0.97      0.98      0.97     11997

Logistic Regression AUC Score: 0.9608394535193532
