In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import time

In [2]:
data = pd.read_csv("../../data/process_data.csv")
# data = pd.read_csv("E:\竞赛\服创赛-A08\data\特征集\\new_features.csv")

if 'Unnamed: 0' in data.columns:
    data = data.drop(['Unnamed: 0','个人编码'], axis=1)
else:
    data = data.drop('个人编码', axis=1)

In [3]:
X = data.drop('RES', axis=1)
y = data['RES']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [4]:
# 分类0和分类1的数据
X_train_class0 = X_train[y_train == 0]
X_train_class1 = X_train[y_train == 1]
y_train_class0 = y_train[y_train == 0]
y_train_class1 = y_train[y_train == 1]

In [5]:
# 训练100棵决策树
trees = []
start_time = time.time()  # 记录训练开始时间
for i in range(128):
    # 对类别0进行有放回抽样
    sampled_indices_class0 = np.random.choice(len(X_train_class0), size=len(X_train_class1), replace=True)
    X_sampled_class0 = X_train_class0.iloc[sampled_indices_class0]
    y_sampled_class0 = y_train_class0.iloc[sampled_indices_class0]

    # 对类别1进行有放回抽样，但只抽取60%的样本
    sample_size_class1 = int(len(X_train_class1) * 0.9)
    sampled_indices_class1 = np.random.choice(len(X_train_class1), size=sample_size_class1, replace=True)
    X_sampled_class1 = X_train_class1.iloc[sampled_indices_class1]
    y_sampled_class1 = y_train_class1.iloc[sampled_indices_class1]

    # 合并两个类别的样本
    X_combined = pd.concat([X_sampled_class0, X_sampled_class1], axis=0)
    y_combined = pd.concat([y_sampled_class0, y_sampled_class1], axis=0)

    # 打印每次迭代中各类别抽样的样本数
    print(f"Iteration {i+1}:")
    print(f"Class 0 samples: {len(X_sampled_class0)}, Class 1 samples: {len(X_sampled_class1)}")

    # 对数据进行洗牌
    shuffled_indices = np.random.permutation(len(X_combined))
    X_combined_shuffled = X_combined.iloc[shuffled_indices]
    y_combined_shuffled = y_combined.iloc[shuffled_indices]

    # 训练决策树
    tree = DecisionTreeClassifier(random_state=42,
                                  max_depth=7,
                                  min_samples_leaf=10,
                                  min_samples_split=10,
                                 )
    tree.fit(X_combined_shuffled, y_combined_shuffled)
    trees.append(tree)


Iteration 1:
Class 0 samples: 595, Class 1 samples: 535
Iteration 2:
Class 0 samples: 595, Class 1 samples: 535
Iteration 3:
Class 0 samples: 595, Class 1 samples: 535
Iteration 4:
Class 0 samples: 595, Class 1 samples: 535
Iteration 5:
Class 0 samples: 595, Class 1 samples: 535
Iteration 6:
Class 0 samples: 595, Class 1 samples: 535
Iteration 7:
Class 0 samples: 595, Class 1 samples: 535
Iteration 8:
Class 0 samples: 595, Class 1 samples: 535
Iteration 9:
Class 0 samples: 595, Class 1 samples: 535
Iteration 10:
Class 0 samples: 595, Class 1 samples: 535
Iteration 11:
Class 0 samples: 595, Class 1 samples: 535
Iteration 12:
Class 0 samples: 595, Class 1 samples: 535
Iteration 13:
Class 0 samples: 595, Class 1 samples: 535
Iteration 14:
Class 0 samples: 595, Class 1 samples: 535
Iteration 15:
Class 0 samples: 595, Class 1 samples: 535
Iteration 16:
Class 0 samples: 595, Class 1 samples: 535
Iteration 17:
Class 0 samples: 595, Class 1 samples: 535
Iteration 18:
Class 0 samples: 595, Clas

In [6]:
end_time = time.time()  # 记录训练结束时间
training_time = end_time - start_time  # 计算训练所花费的时间

print(f"模型训练耗时：{training_time} 秒")

模型训练耗时：7.096776962280273 秒


In [7]:
# 集成预测
predictions = np.mean([tree.predict_proba(X_test)[:, 1] for tree in trees], axis=0)

In [8]:
# 将概率转换为类别
threshold = 0.5
class_predictions = np.where(predictions >= threshold, 1, 0)

# 评估模型
print(classification_report(y_test, class_predictions))
print("AUC: ", roc_auc_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.87      0.92      3802
           1       0.25      0.82      0.38       198

    accuracy                           0.87      4000
   macro avg       0.62      0.85      0.65      4000
weighted avg       0.95      0.87      0.90      4000

AUC:  0.9245878033358307


In [9]:
predictions = np.mean([tree.predict_proba(X_train)[:, 1] for tree in trees], axis=0)

# 将概率转换为类别
threshold = 0.5
class_predictions = np.where(predictions >= threshold, 1, 0)

# 评估模型
print(classification_report(y_train, class_predictions))
print("AUC: ", roc_auc_score(y_train, predictions))

              precision    recall  f1-score   support

           0       0.99      0.87      0.93     11405
           1       0.27      0.90      0.42       595

    accuracy                           0.88     12000
   macro avg       0.63      0.89      0.68     12000
weighted avg       0.96      0.88      0.91     12000

AUC:  0.9596905382056374
