In [1]:
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import KMeansSMOTE
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline,make_pipeline

1. **Borderline-SMOTE**：重点关注靠近多数类边界的少数类样本。这种方法试图仅在这些区域生成合成样本，因为边界区域的样本更容易被误分类。
2. **ADASYN (Adaptive Synthetic Sampling)**：与SMOTE类似，但更加关注生成难以学习的少数类样本。ADASYN根据少数类样本邻域中多数类样本的数量，动态调整生成合成样本的数量。
4. **SVMSMOTE**：使用支持向量机（SVM）来识别少数类样本和多数类样本之间的边界，并在这个边界附近生成新的样本。
5. **K-Means SMOTE**：结合K-Means聚类和SMOTE，通过首先对少数类样本进行聚类，然后在每个聚类内部进行过采样。
6. **SMOTE-ENN (SMOTE with Edited Nearest Neighbors)**：这是一种组合方法，首先使用SMOTE进行过采样，然后使用编辑的最近邻（ENN）规则来清除那些可能是噪声的合成样本。
7. **SMOTE-Tomek**：结合了SMOTE和Tomek链去除。Tomek链用于清除重叠区域的样本，以提高分类器的性能。

In [2]:
data = pd.read_csv("E:\竞赛\服创赛-A08\data\process_data.csv")
# data = pd.read_csv("E:\竞赛\服创赛-A08\data\特征集\\new_features.csv")

if 'Unnamed: 0' in data.columns:
    data = data.drop(['Unnamed: 0','个人编码'], axis=1)
else:
    data = data.drop('个人编码', axis=1)

In [3]:
# 筛选RES列值为1的数据
data_res_1 = data[data['RES'] == 1]

# 计算RES为1的数据中每列的缺失值数量
missing_values_count_res_1 = data_res_1.isnull().sum()

# 过滤出有缺失值的列
missing_values_res_1 = missing_values_count_res_1[missing_values_count_res_1 > 0]

# 打印有缺失值的列和对应的缺失值数量（当RES列的值为1）
print("当RES列的值为1时，有缺失值的列及其缺失数量：")
print(missing_values_res_1)

# 构建的特征集中含有缺失值，部分分类器对缺失值敏感，删除样本
data = data.dropna()

当RES列的值为1时，有缺失值的列及其缺失数量：
Series([], dtype: int64)


In [4]:
X = data.drop('RES', axis=1)
y = data['RES']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Borderline-SMOTE

In [16]:
smote = BorderlineSMOTE(
        sampling_strategy=0.5,    # 平衡类分布
        k_neighbors=5,            # 使用5个最近邻
        m_neighbors=10,           # 边界样本的邻居数量
        kind='borderline-1',      # 边界类型选择（'borderline-1' 或 'borderline-2'）
        random_state=42           
)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [17]:
(y_train==1).sum()

5702

# ADASYN 

In [5]:
smote = ADASYN(
    sampling_strategy=0.5, 
    n_neighbors=5,            # 使用5个最近邻
    random_state=42           
)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [6]:
(y_train==1).sum()

5675

# SVMSMOTE

In [8]:
smote = SVMSMOTE(
    sampling_strategy=0.5, 
    k_neighbors=5,            # 使用5个最近邻
    m_neighbors=10,           # 边界样本的邻居数量
    svm_estimator=None,       # 使用默认SVM分类器
    random_state=42           
)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [9]:
(y_train==1).sum()

5702

# K-Means SMOTE

In [5]:
smote = KMeansSMOTE(
    sampling_strategy=0.5,
    k_neighbors=10,           # 使用10个最近邻
    # n_clusters=2,             # 使用2个簇中心
    random_state=42          
)
X_train, y_train = smote.fit_resample(X_train, y_train)



In [6]:
(y_train==1).sum()

5702

# SMOTE-ENN 

In [9]:
smote = SMOTEENN(
    sampling_strategy=0.5,
    # smote=SMOTE(),  # 默认使用标准smote计算
    enn=None,                 # 使用默认的Edited Nearest Neighbours
    random_state=42        
)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [10]:
(y_train==1).sum()

5484

In [11]:
(y_train==0).sum()

9829

# SMOTE-Tomek

In [6]:
smote = SMOTETomek(
    sampling_strategy=0.5,
    smote=None,               # 使用默认SMOTE
    tomek=None,               # 使用默认Tomek链
    random_state=42          
)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [7]:
(y_train==1).sum()

5683

# 使用分类器

## Decision Tree

In [8]:
dt_model = DecisionTreeClassifier(random_state=42,
                                 #  max_depth=7,
                                 #  min_samples_leaf=10,
                                 #  min_samples_split=10,
                                 #  class_weight=weight_dict,
                                 )
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)
dt_probabilities = dt_model.predict_proba(X_test)[:, 1]

print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_predictions))
print("Decision Tree AUC Score:", roc_auc_score(y_test, dt_probabilities))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96      3802
           1       0.30      0.49      0.37       198

    accuracy                           0.92      4000
   macro avg       0.64      0.72      0.66      4000
weighted avg       0.94      0.92      0.93      4000

Decision Tree AUC Score: 0.7152282955807416


In [9]:
dt_predictions = dt_model.predict(X_train)
dt_probabilities = dt_model.predict_proba(X_train)[:, 1]

print("Decision Tree Classification Report:")
print(classification_report(y_train, dt_predictions))
print("Decision Tree AUC Score:", roc_auc_score(y_train, dt_probabilities))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11386
           1       1.00      1.00      1.00      5683

    accuracy                           1.00     17069
   macro avg       1.00      1.00      1.00     17069
weighted avg       1.00      1.00      1.00     17069

Decision Tree AUC Score: 1.0


## Logistic

In [10]:
log_model = make_pipeline(StandardScaler(), LogisticRegression(random_state=42,))
log_model.fit(
    X_train,
    y_train
)
log_predictions = log_model.predict(X_test)
log_probabilities = log_model.predict_proba(X_test)[:, 1]

print("Logistic Regression Classification Report:")
print(classification_report(y_test, log_predictions))
print("Logistic Regression AUC Score:", roc_auc_score(y_test, log_probabilities))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95      3802
           1       0.30      0.55      0.39       198

    accuracy                           0.92      4000
   macro avg       0.64      0.74      0.67      4000
weighted avg       0.94      0.92      0.93      4000

Logistic Regression AUC Score: 0.8058677251207499


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
log_predictions = log_model.predict(X_train)
log_probabilities = log_model.predict_proba(X_train)[:, 1]

print("Logistic Regression Classification Report:")
print(classification_report(y_train, log_predictions))
print("Logistic Regression AUC Score:", roc_auc_score(y_train, log_probabilities))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.95      0.89     11386
           1       0.85      0.63      0.73      5683

    accuracy                           0.84     17069
   macro avg       0.85      0.79      0.81     17069
weighted avg       0.84      0.84      0.83     17069

Logistic Regression AUC Score: 0.9093564867332469


## RF

In [12]:
rf_model = RandomForestClassifier(random_state=42,
                                  # class_weight=weight_dict,
                                  # bootstrap=True, 
                                  # max_samples=100
                                 )
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_probabilities = rf_model.predict_proba(X_test)[:, 1]
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))
print("Random Forest AUC Score:", roc_auc_score(y_test, rf_probabilities))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      3802
           1       0.55      0.45      0.50       198

    accuracy                           0.95      4000
   macro avg       0.76      0.72      0.74      4000
weighted avg       0.95      0.95      0.95      4000

Random Forest AUC Score: 0.919087641273333


In [13]:
rf_predictions = rf_model.predict(X_train)
rf_probabilities = rf_model.predict_proba(X_train)[:, 1]
print("Random Forest Classification Report:")
print(classification_report(y_train, rf_predictions))
print("Random Forest AUC Score:", roc_auc_score(y_train, rf_probabilities))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11386
           1       1.00      1.00      1.00      5683

    accuracy                           1.00     17069
   macro avg       1.00      1.00      1.00     17069
weighted avg       1.00      1.00      1.00     17069

Random Forest AUC Score: 1.0


## GBDT

In [14]:
gbdt_model = GradientBoostingClassifier(
    random_state=42,
)


gbdt_model.fit(
    X_train,
    y_train,
              )


gbdt_predictions = gbdt_model.predict(X_test)
gbdt_probabilities = gbdt_model.predict_proba(X_test)[:, 1]
print("GBDT Classification Report:")
print(classification_report(y_test, gbdt_predictions))
print("GBDT AUC Score:", roc_auc_score(y_test, gbdt_probabilities))

GBDT Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      3802
           1       0.43      0.51      0.46       198

    accuracy                           0.94      4000
   macro avg       0.70      0.73      0.72      4000
weighted avg       0.95      0.94      0.94      4000

GBDT AUC Score: 0.8903706980377153


In [15]:
gbdt_predictions = gbdt_model.predict(X_train)
gbdt_probabilities = gbdt_model.predict_proba(X_train)[:, 1]
print("GBDT Classification Report:")
print(classification_report(y_train, gbdt_predictions))
print("GBDT AUC Score:", roc_auc_score(y_train, gbdt_probabilities))

GBDT Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     11386
           1       0.93      0.90      0.91      5683

    accuracy                           0.94     17069
   macro avg       0.94      0.93      0.94     17069
weighted avg       0.94      0.94      0.94     17069

GBDT AUC Score: 0.9861468076891893


## LightGBM

In [16]:
lgb_model = LGBMClassifier(
    random_state=42,
)

lgb_model.fit(
    X_train,
    y_train,
)

lgb_predictions = lgb_model.predict(X_test)
lgb_probabilities = lgb_model.predict_proba(X_test)[:, 1]
print("LightGBM Classification Report:")
print(classification_report(y_test, lgb_predictions))
print("LightGBM AUC Score:", roc_auc_score(y_test, lgb_probabilities))

[LightGBM] [Info] Number of positive: 5683, number of negative: 11386
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14693
[LightGBM] [Info] Number of data points in the train set: 17069, number of used features: 74
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.332943 -> initscore=-0.694905
[LightGBM] [Info] Start training from score -0.694905
LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      3802
           1       0.51      0.45      0.48       198

    accuracy                           0.95      4000
   macro avg       0.74      0.72      0.73      4000
weighted avg       0.95      0.95      0.95      4000

LightGBM AUC Score: 0.9186247004500554


In [17]:
lgb_predictions = lgb_model.predict(X_train)
lgb_probabilities = lgb_model.predict_proba(X_train)[:, 1]
print("LightGBM Classification Report:")
print(classification_report(y_train, lgb_predictions))
print("LightGBM AUC Score:", roc_auc_score(y_train, lgb_probabilities))

LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     11386
           1       0.99      0.98      0.99      5683

    accuracy                           0.99     17069
   macro avg       0.99      0.99      0.99     17069
weighted avg       0.99      0.99      0.99     17069

LightGBM AUC Score: 0.9996284307028902


## XGBoost

In [18]:
xgb_model = XGBClassifier(
    random_state=42,
)


xgb_model.fit(
    X_train, 
    y_train,
)


xgb_predictions = xgb_model.predict(X_test)
xgb_probabilities = xgb_model.predict_proba(X_test)[:, 1]
print("XGBoost Classification Report:")
print(classification_report(y_test, xgb_predictions))
print("XGBoost AUC Score:", roc_auc_score(y_test, xgb_probabilities))

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      3802
           1       0.56      0.44      0.49       198

    accuracy                           0.95      4000
   macro avg       0.76      0.71      0.74      4000
weighted avg       0.95      0.95      0.95      4000

XGBoost AUC Score: 0.9097790105154652


In [19]:
xgb_predictions = xgb_model.predict(X_train)
xgb_probabilities = xgb_model.predict_proba(X_train)[:, 1]
print("XGBoost Classification Report:")
print(classification_report(y_train, xgb_predictions))
print("XGBoost AUC Score:", roc_auc_score(y_train, xgb_probabilities))

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11386
           1       1.00      1.00      1.00      5683

    accuracy                           1.00     17069
   macro avg       1.00      1.00      1.00     17069
weighted avg       1.00      1.00      1.00     17069

XGBoost AUC Score: 0.9999999845456351
