In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.utils import resample
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# 加载数据
file_path = 'E:\竞赛\服创赛-A08\data\欺诈手段特征集.xlsx'
data = pd.read_excel(file_path)

# 展示数据的前几行以及基本信息
data.head(), data.info(), data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   就诊次数_SUM     16000 non-null  int64  
 1   药品在总金额中的占比   16000 non-null  float64
 2   个人支付的药品占比    16000 non-null  float64
 3   检查总费用在总金额占比  16000 non-null  float64
 4   治疗费用在总金额占比   16000 non-null  float64
 5   就诊的月数        16000 non-null  int64  
 6   月就诊天数_MAX    16000 non-null  int64  
 7   月就诊天数_AVG    16000 non-null  float64
 8   月就诊医院数_MAX   16000 non-null  int64  
 9   月就诊医院数_AVG   16000 non-null  float64
 10  RES          16000 non-null  int64  
dtypes: float64(6), int64(5)
memory usage: 1.3 MB


(   就诊次数_SUM  药品在总金额中的占比  个人支付的药品占比  检查总费用在总金额占比  治疗费用在总金额占比  就诊的月数  月就诊天数_MAX  \
 0        34    0.939194   0.004262     0.050817    0.007434      6          7   
 1        15    0.955626   0.002982     0.030815    0.013398      6          4   
 2        45    0.783610   0.000332     0.000000    0.195087      6          8   
 3        23    0.458649   0.000184     0.000000    0.541351      6          6   
 4        26    0.983726   0.000316     0.000000    0.016274      6          5   
 
    月就诊天数_AVG  月就诊医院数_MAX  月就诊医院数_AVG  RES  
 0   5.666667           3    2.166667    0  
 1   2.500000           2    1.333333    0  
 2   6.166667           3    2.166667    0  
 3   3.666667           2    1.833333    0  
 4   4.333333           1    1.000000    0  ,
 None,
            就诊次数_SUM    药品在总金额中的占比     个人支付的药品占比   检查总费用在总金额占比    治疗费用在总金额占比  \
 count  16000.000000  16000.000000  16000.000000  16000.000000  16000.000000   
 mean      36.818438      0.854763      0.005500      0.039006      

In [3]:
# 分离特征和目标变量
X = data.drop('RES', axis=1)
y = data['RES']

# 分离类别
data_majority = data[data.RES == 0]
data_minority = data[data.RES == 1]

## 随机森林算法中的重要参数

- n_estimators：森林中树的数量。增加树的数量通常会提高模型的性能，但也会增加计算成本和时间。默认通常是100。

- max_depth：树的最大深度。这个参数可以控制树的复杂度。较深的树可能会捕捉更多数据特性，但也可能导致过拟合。如果设置为None，则节点将扩展到所有叶子都是纯的或直到所有叶子包含少于min_samples_split样本为止。

- min_samples_split：分裂内部节点所需的最小样本数。可以用于控制过拟合。较大的值可以防止模型学习数据中的噪声。

- min_samples_leaf：在叶节点处需要的最小样本数。这个参数同样有助于控制过拟合，特别是对于不平衡的数据集。

- max_features：寻找最佳分割时要考虑的特征数量。这可以影响每棵树的随机性，以及决策树的构建方式。

- bootstrap：是否在构建树时使用bootstrap样本。通常为True，意味着采用有放回的抽样方法。

- oob_score：是否使用袋外样本来估计准确度。这是一种使用训练期间未被某些树看到的数据来评估模型的方法。

- class_weight：类别的权重。对于不平衡的数据集，可以通过调整类别权重来改善模型性能。设置为“balanced”可以根据样本数量自动调整权重。

- random_state：控制森林中树的随机性，以及分裂点的选择。设置一个固定的随机状态可以保证结果的可重复性。

## 欠采样

In [4]:
# 定义一个函数来执行带有不同欠采样比例的交叉验证
def evaluate_downsampling_ratios(X, y, ratios):
    results = {}
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

    for ratio in ratios:
        print(f"\n欠采样比例: {ratio}:1")
        
        for train_index, test_index in skf.split(X, y):
            # 分割数据
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # 分离多数和少数类
            X_train_majority = X_train[y_train == 0]
            X_train_minority = X_train[y_train == 1]

            # 执行欠采样
            majority_sample_size = int(len(X_train_minority) * ratio)
            X_train_majority_downsampled = resample(X_train_majority, 
                                                    replace=False,
                                                    n_samples=majority_sample_size,
                                                    random_state=123)
            y_train_majority_downsampled = y_train.loc[X_train_majority_downsampled.index]

            # 合并数据
            X_train_downsampled = pd.concat([X_train_majority_downsampled, X_train_minority])
            y_train_downsampled = pd.concat([y_train_majority_downsampled, y_train[y_train == 1]])

            # 创建并训练模型
            RF_1 = RandomForestClassifier(n_estimators=100, random_state=123, n_jobs=-1)
            RF_1.fit(X_train_downsampled, y_train_downsampled)

            # 预测
            y_pred = RF_1.predict(X_test)

            # 输出分类报告
            print(classification_report(y_test, y_pred))

# 欠采样比例
ratios = [1, 2, 3, 4, 5]

# 执行评估
evaluate_downsampling_ratios(X, y, ratios)


欠采样比例: 1:1
              precision    recall  f1-score   support

           0       0.98      0.77      0.86      3042
           1       0.14      0.71      0.23       158

    accuracy                           0.77      3200
   macro avg       0.56      0.74      0.55      3200
weighted avg       0.94      0.77      0.83      3200

              precision    recall  f1-score   support

           0       0.98      0.76      0.86      3042
           1       0.14      0.75      0.24       158

    accuracy                           0.76      3200
   macro avg       0.56      0.76      0.55      3200
weighted avg       0.94      0.76      0.83      3200

              precision    recall  f1-score   support

           0       0.98      0.78      0.87      3041
           1       0.15      0.72      0.25       159

    accuracy                           0.78      3200
   macro avg       0.56      0.75      0.56      3200
weighted avg       0.94      0.78      0.84      3200

       

In [5]:
# 首先执行 evaluate_downsampling_ratios 函数以输出所有比例下的模型结果
evaluate_downsampling_ratios(X, y, ratios)

# 然后，定义一个额外的函数或代码块，用于单独训练和返回比例为4的模型
def get_specific_ratio_model(X, y, specific_ratio):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    final_model = None

    for train_index, test_index in skf.split(X, y):
        # 分割数据
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # 分离多数和少数类
        X_train_majority = X_train[y_train == 0]
        X_train_minority = X_train[y_train == 1]

        # 执行欠采样
        majority_sample_size = int(len(X_train_minority) * specific_ratio)
        X_train_majority_downsampled = resample(X_train_majority, 
                                                replace=False,
                                                n_samples=majority_sample_size,
                                                random_state=123)
        y_train_majority_downsampled = y_train.loc[X_train_majority_downsampled.index]

        # 合并数据
        X_train_downsampled = pd.concat([X_train_majority_downsampled, X_train_minority])
        y_train_downsampled = pd.concat([y_train_majority_downsampled, y_train[y_train == 1]])

        # 创建并训练模型
        RF_1 = RandomForestClassifier(n_estimators=100, random_state=123, n_jobs=-1)
        RF_1.fit(X_train_downsampled, y_train_downsampled)

        # 更新最后一个模型
        final_model = RF_1

    return final_model



欠采样比例: 1:1
              precision    recall  f1-score   support

           0       0.98      0.77      0.86      3042
           1       0.14      0.71      0.23       158

    accuracy                           0.77      3200
   macro avg       0.56      0.74      0.55      3200
weighted avg       0.94      0.77      0.83      3200

              precision    recall  f1-score   support

           0       0.98      0.76      0.86      3042
           1       0.14      0.75      0.24       158

    accuracy                           0.76      3200
   macro avg       0.56      0.76      0.55      3200
weighted avg       0.94      0.76      0.83      3200

              precision    recall  f1-score   support

           0       0.98      0.78      0.87      3041
           1       0.15      0.72      0.25       159

    accuracy                           0.78      3200
   macro avg       0.56      0.75      0.56      3200
weighted avg       0.94      0.78      0.84      3200

       

In [6]:
# 获取比例为4:1的模型
specific_ratio = 4
RF_1 = get_specific_ratio_model(X, y, specific_ratio)

# 输出比例为4的模型
RF_1

## 代价敏感学习

In [8]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# 构建随机森林模型
# n_estimators: 决定树的数量
# class_weight: 用于处理不平衡的类别
# random_state: 确保结果的可重复性
RF_2 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=123)

# 训练模型
RF_2.fit(X_train, y_train)

# 进行预测
y_pred = RF_2.predict(X_test)

# 生成分类报告
classification_rep = classification_report(y_test, y_pred)

print(classification_rep)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      4572
           1       0.83      0.20      0.32       228

    accuracy                           0.96      4800
   macro avg       0.90      0.60      0.65      4800
weighted avg       0.96      0.96      0.95      4800



## SMOTE方法

In [10]:
from imblearn.over_sampling import SMOTE

In [21]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# 应用SMOTE
# 创建SMOTE实例
smote = SMOTE(
    sampling_strategy='auto',  # 自动增加少数类样本数量，使其与多数类相等
    random_state=42,          # 控制随机性，保持结果一致性
    k_neighbors=3,             # 用于生成合成样本的最近邻数量
    n_jobs=None                # 使用的CPU核心数，默认为None，即使用1个核心
)

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 创建随机森林模型
RF_3 = RandomForestClassifier(n_estimators=100, random_state=123)

# 训练模型
RF_3.fit(X_train_smote, y_train_smote)

# 进行预测
y_pred = RF_3.predict(X_test)

# 生成分类报告
classification_rep = classification_report(y_test, y_pred)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      4572
           1       0.34      0.38      0.36       228

    accuracy                           0.94      4800
   macro avg       0.65      0.67      0.66      4800
weighted avg       0.94      0.94      0.94      4800



## 阈值移动 RF_1 - - 0.6

In [5]:
# 阈值列表
thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

# 用于存储不同阈值下的分类报告
classification_reports = {}

for thresh in thresholds:
    # 以概率形式获取预测结果
    y_probs = RF_1.predict_proba(X_test)[:, 1] # 获取正类的概率
    # 应用阈值
    y_pred_thresh = np.where(y_probs > thresh, 1, 0)
    # 生成并存储分类报告
    classification_reports[thresh] = classification_report(y_test, y_pred_thresh)

# 输出每个阈值下的分类报告
for thresh, report in classification_reports.items():
    print(f"Threshold: {thresh}\n", report)

NameError: name 'RF_1' is not defined

In [10]:
# 设置阈值为0.6
threshold = 0.6

# 以概率形式获取预测结果
y_probs = RF_1.predict_proba(X_test)[:, 1] # 获取正类的概率

# 应用阈值
y_pred_thresh = np.where(y_probs > threshold, 1, 0)

# 生成分类报告
classification_report_06 = classification_report(y_test, y_pred_thresh)

print(classification_report_06)

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4572
           1       0.72      0.86      0.78       228

    accuracy                           0.98      4800
   macro avg       0.86      0.92      0.89      4800
weighted avg       0.98      0.98      0.98      4800



In [13]:
from sklearn.metrics import roc_auc_score, roc_curve
y_pred_proba = RF_1.predict_proba(X_test)[:, 1]

# 计算AUC
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC: {auc}")

AUC: 0.9640498610919249


## 阈值移动 RF_2 - - 0.5

In [14]:
# 阈值列表
thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

# 用于存储不同阈值下的分类报告
classification_reports = {}

for thresh in thresholds:
    # 以概率形式获取预测结果
    y_probs = RF_2.predict_proba(X_test)[:, 1] # 获取正类的概率
    # 应用阈值
    y_pred_thresh = np.where(y_probs > thresh, 1, 0)
    # 生成并存储分类报告
    classification_reports[thresh] = classification_report(y_test, y_pred_thresh)

# 输出每个阈值下的分类报告
for thresh, report in classification_reports.items():
    print(f"Threshold: {thresh}\n", report)

Threshold: 0.2
               precision    recall  f1-score   support

           0       0.97      0.98      0.97      3046
           1       0.41      0.32      0.36       154

    accuracy                           0.94      3200
   macro avg       0.69      0.65      0.66      3200
weighted avg       0.94      0.94      0.94      3200

Threshold: 0.3
               precision    recall  f1-score   support

           0       0.96      0.99      0.97      3046
           1       0.49      0.24      0.32       154

    accuracy                           0.95      3200
   macro avg       0.73      0.61      0.65      3200
weighted avg       0.94      0.95      0.94      3200

Threshold: 0.4
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      3046
           1       0.69      0.19      0.30       154

    accuracy                           0.96      3200
   macro avg       0.83      0.59      0.64      3200
weighted avg       0.95   

## 阈值移动 RF_3 - - 舍弃

In [15]:
# 阈值列表
thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

# 用于存储不同阈值下的分类报告
classification_reports = {}

for thresh in thresholds:
    # 以概率形式获取预测结果
    y_probs = RF_3.predict_proba(X_test)[:, 1] # 获取正类的概率
    # 应用阈值
    y_pred_thresh = np.where(y_probs > thresh, 1, 0)
    # 生成并存储分类报告
    classification_reports[thresh] = classification_report(y_test, y_pred_thresh)

# 输出每个阈值下的分类报告
for thresh, report in classification_reports.items():
    print(f"Threshold: {thresh}\n", report)

Threshold: 0.2
               precision    recall  f1-score   support

           0       0.97      0.79      0.87      3046
           1       0.12      0.59      0.21       154

    accuracy                           0.78      3200
   macro avg       0.55      0.69      0.54      3200
weighted avg       0.93      0.78      0.84      3200

Threshold: 0.3
               precision    recall  f1-score   support

           0       0.97      0.88      0.92      3046
           1       0.17      0.51      0.25       154

    accuracy                           0.86      3200
   macro avg       0.57      0.69      0.59      3200
weighted avg       0.93      0.86      0.89      3200

Threshold: 0.4
               precision    recall  f1-score   support

           0       0.97      0.92      0.95      3046
           1       0.22      0.43      0.29       154

    accuracy                           0.90      3200
   macro avg       0.59      0.68      0.62      3200
weighted avg       0.93   

## RF_1 与 RF_2 融合  -  简单平均

In [16]:
# 获取每个模型的预测概率
y_probs_RF_1 = RF_1.predict_proba(X_test)[:, 1]  # RF1 的正类概率
y_probs_RF_2 = RF_2.predict_proba(X_test)[:, 1]  # RF2 的正类概率

# 应用各自的阈值
y_pred_RF_1 = np.where(y_probs_RF_1 > 0.6, 1, 0)  # 使用阈值0.6
y_pred_RF_2 = np.where(y_probs_RF_2 > 0.5, 1, 0)  # 使用阈值0.5

# 计算平均预测
y_pred_average = np.round((y_pred_RF_1 + y_pred_RF_2) / 2).astype(int)

In [17]:
# 生成分类报告
report = classification_report(y_test, y_pred_average)

print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3046
           1       0.80      0.16      0.26       154

    accuracy                           0.96      3200
   macro avg       0.88      0.58      0.62      3200
weighted avg       0.95      0.96      0.94      3200



## 神经网络

In [18]:
from keras.models import Sequential
from keras.layers import Dense
import numpy as np

# 假设 RF1 和 RF2 是训练好的模型
# 假设 X_train, y_train, X_val, y_val 是训练和验证数据
X_val = X_test
y_val = y_test

# 使用基模型进行预测
train_probs_RF_1 = RF_1.predict_proba(X_train)[:, 1]
train_probs_RF_2 = RF_2.predict_proba(X_train)[:, 1]
val_probs_RF_1 = RF_1.predict_proba(X_val)[:, 1]
val_probs_RF_2 = RF_2.predict_proba(X_val)[:, 1]

# 准备堆叠特征
stacked_train_features = np.column_stack((train_probs_RF_1, train_probs_RF_2))
stacked_val_features = np.column_stack((val_probs_RF_1, val_probs_RF_2))

# 创建神经网络
model = Sequential()
model.add(Dense(10, input_dim=2, activation='relu'))  # 输入维度为2，因为有两个基模型的预测
model.add(Dense(1, activation='sigmoid'))  # 二分类问题

# 编译模型
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 训练神经网络
model.fit(stacked_train_features, y_train, epochs=50, batch_size=10, validation_data=(stacked_val_features, y_val))

# 使用训练好的模型进行预测和评估




Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50

KeyboardInterrupt: 

In [None]:
# 使用模型进行预测（我们获取概率的第二列，即正类的预测概率）
val_predictions = model.predict(stacked_val_features)

# 将概率转换为类别标签（以0.5为阈值）
val_predictions = (val_predictions > 0.5).astype(int)

# 计算性能指标
accuracy = accuracy_score(y_val, val_predictions)
classification_rep = classification_report(y_val, val_predictions)

print("Accuracy on Validation Set:", accuracy)
print("\nClassification Report:\n", classification_rep)