In [1]:
### Manual undersampling, minority-class weighting, and cost-sensitive learning

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import IsolationForest

from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import class_weight

In [26]:
# Load the processed feature table; the path is relative to this notebook.
# Alternative input (disabled): the new_features.csv feature set.
data = pd.read_csv("../../data/process_data.csv")

# The person identifier ('个人编码') is always removed. The stray index column
# 'Unnamed: 0' is removed as well, but only when the CSV actually contains it.
columns_to_drop = ['个人编码']
if 'Unnamed: 0' in data.columns:
    columns_to_drop = ['Unnamed: 0', '个人编码']
data = data.drop(columns_to_drop, axis=1)

In [27]:
# Keep only the rows whose RES label equals 1.
data_res_1 = data.loc[data['RES'].eq(1)]

# Per-column count of missing values within those RES == 1 rows ...
missing_values_count_res_1 = data_res_1.isna().sum()

# ... restricted to the columns that actually have at least one missing value.
missing_values_res_1 = missing_values_count_res_1.loc[missing_values_count_res_1.gt(0)]

# Report the affected columns and their missing counts (for RES == 1 rows).
print(
    "当RES列的值为1时，有缺失值的列及其缺失数量：",
    missing_values_res_1,
    sep="\n",
)

# The feature set contains missing values and some classifiers are sensitive
# to them, so every row with at least one missing value is removed.
data = data.dropna(axis=0, how='any')

当RES列的值为1时，有缺失值的列及其缺失数量：
Series([], dtype: int64)


In [28]:
# Target is the RES column; every remaining column is used as a feature.
y = data['RES']
X = data.drop(columns='RES')

# Hold out 25% of the rows for testing, with a fixed seed for repeatability.
# NOTE(review): the split is not stratified on y; with a rare positive class,
# consider `stratify=y` (this would change the class counts used downstream).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [29]:
# Number of positive (RES == 1) samples in the training split.
y_train.eq(1).sum()

595

In [7]:
# Manually undersample the majority class (label 0) and keep every minority
# sample (label 1).
#
# The per-class targets are derived from y_train instead of being hard-coded:
#   * the minority target is the actual number of positives, so the cell keeps
#     working if the split or the input data changes (RandomUnderSampler raises
#     when asked for more samples than a class contains);
#   * the majority target is capped at 2500 but never exceeds what is available.
# With the current data this yields exactly {0: 2500, 1: 595}.
n_minority = int((y_train == 1).sum())
n_majority = min(2500, int((y_train == 0).sum()))

rus = RandomUnderSampler(
    random_state=42,
    sampling_strategy={0: n_majority, 1: n_minority},
)
# NOTE: this overwrites X_train / y_train with the resampled training set.
X_train, y_train = rus.fit_resample(X_train, y_train)

In [8]:
# Compute 'balanced' class weights in two equivalent forms, because different
# classifiers accept different parameters:
#   * `weight`      - one weight per training sample (for fit(..., sample_weight=...))
#   * `weight_dict` - one weight per class label   (for class_weight=...)

# Per-sample weights used to re-weight samples during training.
weight = class_weight.compute_sample_weight('balanced', y_train)

# Per-class weights, returned in the same order as `classes`.
classes = np.unique(y_train)
weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)

# Key the mapping by the actual class labels rather than by array position, so
# it stays correct even if the labels are not exactly 0..k-1. `.tolist()`
# converts labels and weights to plain Python scalars.
weight_dict = dict(zip(classes.tolist(), weights.tolist()))
weight_dict

{0: 0.619, 1: 2.600840336134454}

In [9]:
# Display the per-sample weights next to the per-class weights.
(weight, weights)

(array([0.619     , 0.619     , 0.619     , ..., 2.60084034, 2.60084034,
        2.60084034]),
 array([0.619     , 2.60084034]))

# RF

In [10]:
# Random forest with default hyper-parameters; only the seed is fixed.
# Options left disabled in this cell:
#   class_weight=weight_dict, bootstrap=True, max_samples=100
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out test split. The second predict_proba column is used as
# the score passed to the AUC computation.
rf_probabilities = rf_model.predict_proba(X_test)[:, 1]
rf_predictions = rf_model.predict(X_test)

print(
    "Random Forest Classification Report:",
    classification_report(y_test, rf_predictions),
    sep="\n",
)
print("Random Forest AUC Score:", roc_auc_score(y_test, rf_probabilities))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      3802
           1       0.54      0.59      0.56       198

    accuracy                           0.95      4000
   macro avg       0.76      0.78      0.77      4000
weighted avg       0.96      0.95      0.96      4000

Random Forest AUC Score: 0.9196893979245374


In [11]:
# Re-score the fitted forest on the training data, for comparison with the
# test-set report.
# NOTE: rf_predictions / rf_probabilities are reused here, so after this cell
# they hold training-set outputs.
rf_probabilities = rf_model.predict_proba(X_train)[:, 1]
rf_predictions = rf_model.predict(X_train)
print(
    "Random Forest Classification Report:",
    classification_report(y_train, rf_predictions),
    sep="\n",
)
print("Random Forest AUC Score:", roc_auc_score(y_train, rf_probabilities))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2500
           1       1.00      1.00      1.00       595

    accuracy                           1.00      3095
   macro avg       1.00      1.00      1.00      3095
weighted avg       1.00      1.00      1.00      3095

Random Forest AUC Score: 1.0


# GBDT

In [12]:
# Gradient-boosted trees with default hyper-parameters and a fixed seed.
gbdt_model = GradientBoostingClassifier(random_state=42)

# Cost-sensitive fit: each sample carries its 'balanced' weight. `weight` must
# have been computed from this same y_train (one entry per training row).
gbdt_model.fit(X_train, y_train, sample_weight=weight)

# Score the held-out test split; the second predict_proba column feeds the AUC.
gbdt_probabilities = gbdt_model.predict_proba(X_test)[:, 1]
gbdt_predictions = gbdt_model.predict(X_test)

print(
    "GBDT Classification Report:",
    classification_report(y_test, gbdt_predictions),
    sep="\n",
)
print("GBDT AUC Score:", roc_auc_score(y_test, gbdt_probabilities))

GBDT Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.87      0.93      3802
           1       0.24      0.79      0.37       198

    accuracy                           0.87      4000
   macro avg       0.62      0.83      0.65      4000
weighted avg       0.95      0.87      0.90      4000

GBDT AUC Score: 0.9243898745476863


In [13]:
# Re-score the fitted GBDT on the training data, for comparison with the
# test-set report.
# NOTE: gbdt_predictions / gbdt_probabilities are reused here, so after this
# cell they hold training-set outputs.
gbdt_probabilities = gbdt_model.predict_proba(X_train)[:, 1]
gbdt_predictions = gbdt_model.predict(X_train)
print(
    "GBDT Classification Report:",
    classification_report(y_train, gbdt_predictions),
    sep="\n",
)
print("GBDT AUC Score:", roc_auc_score(y_train, gbdt_probabilities))

GBDT Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.91      0.94      2500
           1       0.71      0.95      0.81       595

    accuracy                           0.91      3095
   macro avg       0.85      0.93      0.88      3095
weighted avg       0.93      0.91      0.92      3095

GBDT AUC Score: 0.9769774789915966


# XGBoost

In [14]:
# XGBoost with default hyper-parameters and a fixed seed.
# (A `class_weight=weight_dict` argument is left disabled in this cell; the
# class imbalance is handled through per-sample weights in fit() instead.)
xgb_model = XGBClassifier(random_state=42)

# Cost-sensitive fit: `weight` holds one 'balanced' weight per training row.
xgb_model.fit(X_train, y_train, sample_weight=weight)

# Score the held-out test split; the second predict_proba column feeds the AUC.
xgb_probabilities = xgb_model.predict_proba(X_test)[:, 1]
xgb_predictions = xgb_model.predict(X_test)

print(
    "XGBoost Classification Report:",
    classification_report(y_test, xgb_predictions),
    sep="\n",
)
print("XGBoost AUC Score:", roc_auc_score(y_test, xgb_probabilities))

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95      3802
           1       0.33      0.69      0.45       198

    accuracy                           0.92      4000
   macro avg       0.66      0.81      0.70      4000
weighted avg       0.95      0.92      0.93      4000

XGBoost AUC Score: 0.9310809302918719


In [15]:
# Re-score the fitted XGBoost model on the training data, for comparison with
# the test-set report.
# NOTE: xgb_predictions / xgb_probabilities are reused here, so after this
# cell they hold training-set outputs.
xgb_probabilities = xgb_model.predict_proba(X_train)[:, 1]
xgb_predictions = xgb_model.predict(X_train)
print(
    "XGBoost Classification Report:",
    classification_report(y_train, xgb_predictions),
    sep="\n",
)
print("XGBoost AUC Score:", roc_auc_score(y_train, xgb_probabilities))

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2500
           1       1.00      1.00      1.00       595

    accuracy                           1.00      3095
   macro avg       1.00      1.00      1.00      3095
weighted avg       1.00      1.00      1.00      3095

XGBoost AUC Score: 1.0


# lightGBM

In [16]:
# LightGBM with default hyper-parameters and a fixed seed.
lgb_model = LGBMClassifier(random_state=42)

# Cost-sensitive fit: `weight` holds one 'balanced' weight per training row.
lgb_model.fit(X_train, y_train, sample_weight=weight)

# Score the held-out test split; the second predict_proba column feeds the AUC.
lgb_probabilities = lgb_model.predict_proba(X_test)[:, 1]
lgb_predictions = lgb_model.predict(X_test)

print(
    "LightGBM Classification Report:",
    classification_report(y_test, lgb_predictions),
    sep="\n",
)
print("LightGBM AUC Score:", roc_auc_score(y_test, lgb_probabilities))

[LightGBM] [Info] Number of positive: 595, number of negative: 2500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11535
[LightGBM] [Info] Number of data points in the train set: 3095, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.93      0.95      3802
           1       0.34      0.74      0.47       198

    accuracy                           0.92      4000
   macro avg       0.66      0.83      0.71      4000
weighted avg       0.95      0.92      0.93      4000

LightGBM AUC Score: 0.9310145112354475


In [17]:
# Re-score the fitted LightGBM model on the training data, for comparison with
# the test-set report.
# NOTE: lgb_predictions / lgb_probabilities are reused here, so after this
# cell they hold training-set outputs.
lgb_probabilities = lgb_model.predict_proba(X_train)[:, 1]
lgb_predictions = lgb_model.predict(X_train)
print(
    "LightGBM Classification Report:",
    classification_report(y_train, lgb_predictions),
    sep="\n",
)
print("LightGBM AUC Score:", roc_auc_score(y_train, lgb_probabilities))

LightGBM Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2500
           1       0.99      1.00      1.00       595

    accuracy                           1.00      3095
   macro avg       1.00      1.00      1.00      3095
weighted avg       1.00      1.00      1.00      3095

LightGBM AUC Score: 1.0


# 决策树

In [38]:
# Single decision tree with default hyper-parameters; only the seed is fixed.
# Options left disabled in this cell:
#   max_depth=7, min_samples_leaf=20, min_samples_split=10,
#   class_weight=weight_dict
# NOTE(review): this cell's execution count (38) is out of sequence with the
# surrounding cells, so its saved metrics may have been produced with a
# different X_train than the other models - confirm with Restart & Run All.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Score the held-out test split; the second predict_proba column feeds the AUC.
dt_probabilities = dt_model.predict_proba(X_test)[:, 1]
dt_predictions = dt_model.predict(X_test)

print(
    "Decision Tree Classification Report:",
    classification_report(y_test, dt_predictions),
    sep="\n",
)
print("Decision Tree AUC Score:", roc_auc_score(y_test, dt_probabilities))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      3802
           1       0.38      0.46      0.42       198

    accuracy                           0.94      4000
   macro avg       0.68      0.71      0.69      4000
weighted avg       0.94      0.94      0.94      4000

Decision Tree AUC Score: 0.7104660492351181


In [39]:
# Re-score the fitted decision tree on the training data, for comparison with
# the test-set report.
# NOTE: dt_predictions / dt_probabilities are reused here, so after this cell
# they hold training-set outputs.
# NOTE(review): the saved output of this cell reports 12000 training rows
# (11405 / 595), whereas the other training-set reports list 3095
# (2500 / 595). It appears to have been run without the undersampling step
# applied - re-run the notebook top to bottom for comparable numbers.
dt_probabilities = dt_model.predict_proba(X_train)[:, 1]
dt_predictions = dt_model.predict(X_train)

print(
    "Decision Tree Classification Report:",
    classification_report(y_train, dt_predictions),
    sep="\n",
)
print("Decision Tree AUC Score:", roc_auc_score(y_train, dt_probabilities))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11405
           1       1.00      1.00      1.00       595

    accuracy                           1.00     12000
   macro avg       1.00      1.00      1.00     12000
weighted avg       1.00      1.00      1.00     12000

Decision Tree AUC Score: 1.0


In [20]:
# DISABLED CELL: everything below is commented out and has no effect.
# If enabled, it would reload `data` from a different feature file
# (new_features_data_2.csv, referenced through an absolute local Windows path),
# drop rows with missing values, drop the index / person-id columns, and
# rebuild X, y and the train/test split with the same settings (test_size=0.25,
# random_state=42) - without the undersampling step.
# NOTE(review): either delete this cell or move the path into a configurable,
# relative data directory before re-enabling it.

# data = pd.read_csv("E:\竞赛\服创赛-A08\\data\特征集\\new_features_data_2.csv")
# data = data.dropna()

# if 'Unnamed: 0' in data.columns:
#     data = data.drop(['Unnamed: 0','个人编码'], axis=1)
# else:
#     data = data.drop('个人编码', axis=1)

# X = data.drop('RES', axis=1)
# y = data['RES']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Logistic

In [21]:
# Logistic regression on standardised features. Class imbalance is handled via
# per-class weights (class_weight=weight_dict) rather than per-sample weights.
# The step names are the ones make_pipeline would generate automatically.
log_model = Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("logisticregression", LogisticRegression(random_state=42, class_weight=weight_dict)),
    ]
)
log_model.fit(X_train, y_train)

# Score the held-out test split; the second predict_proba column feeds the AUC.
log_probabilities = log_model.predict_proba(X_test)[:, 1]
log_predictions = log_model.predict(X_test)

print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, log_predictions),
    sep="\n",
)
print("Logistic Regression AUC Score:", roc_auc_score(y_test, log_probabilities))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.86      0.92      3802
           1       0.21      0.68      0.31       198

    accuracy                           0.85      4000
   macro avg       0.59      0.77      0.62      4000
weighted avg       0.94      0.85      0.89      4000

Logistic Regression AUC Score: 0.8724501724238705


In [22]:
# Re-score the fitted logistic-regression pipeline on the training data, for
# comparison with the test-set report.
# NOTE: log_predictions / log_probabilities are reused here, so after this
# cell they hold training-set outputs.
log_probabilities = log_model.predict_proba(X_train)[:, 1]
log_predictions = log_model.predict(X_train)

print(
    "Logistic Regression Classification Report:",
    classification_report(y_train, log_predictions),
    sep="\n",
)
print("Logistic Regression AUC Score:", roc_auc_score(y_train, log_probabilities))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.87      0.89      2500
           1       0.54      0.67      0.60       595

    accuracy                           0.83      3095
   macro avg       0.73      0.77      0.75      3095
weighted avg       0.84      0.83      0.84      3095

Logistic Regression AUC Score: 0.86893243697479
