In [60]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, log_loss, f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Seed NumPy's legacy global RNG so NumPy-driven randomness is repeatable.
np.random.seed(42)

# Read features and labels from CSV (original note: the features are assumed
# to be partly standardised already). squeeze() collapses the label frame;
# for a single-column file that is expected to yield a Series.
feature_path = "data/X_train.csv"
label_path = "data/y_train.csv"
X = pd.read_csv(feature_path)
y = pd.read_csv(label_path).squeeze()

# Quick sanity check: dataset shape and the relative frequency of each class.
class_share = y.value_counts(normalize=True)
print("数据形状:", X.shape)
print("类别分布:\n", class_share)

数据形状: (10000, 300)
类别分布:
 label
5     0.4479
10    0.1081
6     0.0553
8     0.0516
12    0.0457
24    0.0385
17    0.0354
26    0.0278
21    0.0269
14    0.0264
4     0.0238
25    0.0184
19    0.0177
20    0.0153
27    0.0107
7     0.0103
11    0.0078
3     0.0065
13    0.0062
18    0.0060
23    0.0041
15    0.0026
9     0.0025
0     0.0018
2     0.0007
22    0.0007
1     0.0007
16    0.0006
Name: proportion, dtype: float64


In [61]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split_parts


In [62]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler learns its statistics from the
# training split only and is then applied unchanged to the validation split.
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train)
val_scaled_values = scaler.transform(X_val)

# Wrap the arrays back into DataFrames so the column names survive.
# Note: the new frames get a fresh default index, not X_train's / X_val's.
X_train_scaled = pd.DataFrame(train_scaled_values, columns=X_train.columns)
X_val_scaled = pd.DataFrame(val_scaled_values, columns=X_val.columns)

# Summarise the per-feature mean and standard deviation after scaling.
scaled_mean_summary = X_train_scaled.mean().describe()
scaled_std_summary = X_train_scaled.std().describe()
print("训练集特征均值（标准化后）:\n", scaled_mean_summary)
print("训练集特征标准差（标准化后）:\n", scaled_std_summary)

# PCA with a fixed budget of 100 components (n_components is hard-coded here,
# not chosen dynamically), fitted on the training split only.
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

# Report the reduced dimensionality and the total explained variance ratio.
n_components_kept = X_train_pca.shape[1]
explained_total = sum(pca.explained_variance_ratio_)
print("PCA 降维后特征数量:", n_components_kept)
print("PCA 解释方差比例:", explained_total)

训练集特征均值（标准化后）:
 count    3.000000e+02
mean    -4.011606e-19
std      1.377425e-17
min     -3.197442e-17
25%     -1.110223e-17
50%     -8.881784e-19
75%      1.065814e-17
max      3.108624e-17
dtype: float64
训练集特征标准差（标准化后）:
 count    3.000000e+02
mean     1.000063e+00
std      2.037097e-15
min      1.000063e+00
25%      1.000063e+00
50%      1.000063e+00
75%      1.000063e+00
max      1.000063e+00
dtype: float64
PCA 降维后特征数量: 100
PCA 解释方差比例: 0.8681524900473957


In [63]:
# Baseline for comparison: distribution of the per-feature mean and standard
# deviation of the training split before any scaling is applied.
raw_mean_summary = X_train.mean().describe()
raw_std_summary = X_train.std().describe()
print("【标准化前】训练集特征均值:\n", raw_mean_summary)
print("【标准化前】训练集特征标准差:\n", raw_std_summary)


【标准化前】训练集特征均值:
 count    300.000000
mean      -0.003894
std        0.053986
min       -0.128020
25%       -0.042983
50%       -0.001708
75%        0.036195
max        0.121386
dtype: float64
【标准化前】训练集特征标准差:
 count    300.000000
mean       0.994337
std        0.014133
min        0.959733
25%        0.986148
50%        0.994049
75%        1.004648
max        1.033039
dtype: float64


In [64]:
from sklearn.preprocessing import StandardScaler

# Standardise: fit the scaler on the training split, reuse it for validation.
# NOTE(review): this repeats the scaling already done in an earlier cell and
# rebinds the same names (scaler, X_train_scaled, X_val_scaled).
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train)
val_scaled_values = scaler.transform(X_val)

# Back to DataFrames so the feature names are kept for later inspection.
X_train_scaled = pd.DataFrame(train_scaled_values, columns=X_train.columns)
X_val_scaled = pd.DataFrame(val_scaled_values, columns=X_val.columns)

# Same summaries as the pre-scaling cell, now on the standardised features.
scaled_mean_summary = X_train_scaled.mean().describe()
scaled_std_summary = X_train_scaled.std().describe()
print("\n【标准化后】训练集特征均值:\n", scaled_mean_summary)
print("【标准化后】训练集特征标准差:\n", scaled_std_summary)



【标准化后】训练集特征均值:
 count    3.000000e+02
mean    -4.011606e-19
std      1.377425e-17
min     -3.197442e-17
25%     -1.110223e-17
50%     -8.881784e-19
75%      1.065814e-17
max      3.108624e-17
dtype: float64
【标准化后】训练集特征标准差:
 count    3.000000e+02
mean     1.000063e+00
std      2.037097e-15
min      1.000063e+00
25%      1.000063e+00
50%      1.000063e+00
75%      1.000063e+00
max      1.000063e+00
dtype: float64


In [65]:
from sklearn.decomposition import PCA


# Reduce to a fixed 100 principal components; the projection is fitted on the
# standardised training split and then applied to the validation split.
# NOTE(review): this repeats the PCA step from an earlier cell and rebinds
# the same names (pca, X_train_pca, X_val_pca).
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

# Report how many components were kept and the share of variance they explain.
n_components_kept = X_train_pca.shape[1]
explained_total = round(sum(pca.explained_variance_ratio_), 4)
print("\nPCA 降维后特征数量:", n_components_kept)
print("PCA 解释方差比例总和:", explained_total)


PCA 降维后特征数量: 100
PCA 解释方差比例总和: 0.8682


In [66]:
# 'balanced' class weights computed from the training labels only.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=train_classes, y=y_train)
weight_dict = dict(zip(train_classes, class_weights))


def _row_weights(labels):
    """Look up the class weight of every label (KeyError for unknown labels)."""
    return np.array([weight_dict[label] for label in labels])


# Per-row weights for both splits, both derived from the training class weights.
sample_weight_train = _row_weights(y_train)
sample_weight_val = _row_weights(y_val)

print("类别权重:", weight_dict)

类别权重: {np.int64(0): np.float64(20.408163265306122), np.int64(1): np.float64(47.61904761904762), np.int64(2): np.float64(47.61904761904762), np.int64(3): np.float64(5.4945054945054945), np.int64(4): np.float64(1.5037593984962405), np.int64(5): np.float64(0.07974163709580957), np.int64(6): np.float64(0.6464124111182935), np.int64(7): np.float64(3.484320557491289), np.int64(8): np.float64(0.6918021445866482), np.int64(9): np.float64(14.285714285714286), np.int64(10): np.float64(0.33030553261767137), np.int64(11): np.float64(4.608294930875576), np.int64(12): np.float64(0.78064012490242), np.int64(13): np.float64(5.714285714285714), np.int64(14): np.float64(1.3540961408259986), np.int64(15): np.float64(13.605442176870747), np.int64(16): np.float64(57.142857142857146), np.int64(17): np.float64(1.0095911155981827), np.int64(18): np.float64(5.9523809523809526), np.int64(19): np.float64(2.0120724346076457), np.int64(20): np.float64(2.34192037470726), np.int64(21): np.float64(1.3289036544850499)

In [67]:
from collections import Counter
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import pandas as pd

# Choose k_neighbors from the rarest training class: one less than its size,
# capped at 5 and floored at 1.
# NOTE(review): with a class of a single sample k is still 1, which SMOTE
# presumably cannot satisfy — confirm against the imbalanced-learn docs.
class_counts = Counter(y_train)
min_count = min(class_counts.values())
k = min(min_count - 1, 5)
if k < 1:
    k = 1
print(f"使用的 k_neighbors = {k}")

# One SMOTE instance, reused for both feature representations.
smote = SMOTE(random_state=42, k_neighbors=k)

# Oversample the standardised original features ...
X_train_smote_orig, y_train_smote_orig = smote.fit_resample(X_train_scaled, y_train)

# ... and, separately, the PCA-reduced features.
X_train_smote_pca, y_train_smote_pca = smote.fit_resample(X_train_pca, y_train)

# Class proportions after resampling the standardised original features.
smote_class_share = pd.Series(y_train_smote_orig).value_counts(normalize=True)
print("SMOTE 后训练集类别分布（原始标准化特征）:")
print(smote_class_share)

使用的 k_neighbors = 4
SMOTE 后训练集类别分布（原始标准化特征）:
label
12    0.035714
4     0.035714
2     0.035714
11    0.035714
1     0.035714
13    0.035714
0     0.035714
26    0.035714
16    0.035714
6     0.035714
7     0.035714
9     0.035714
27    0.035714
17    0.035714
3     0.035714
19    0.035714
14    0.035714
18    0.035714
25    0.035714
21    0.035714
10    0.035714
24    0.035714
23    0.035714
8     0.035714
22    0.035714
20    0.035714
5     0.035714
15    0.035714
Name: proportion, dtype: float64


In [68]:
def evaluate_model(name, model, X_tr, y_tr, X_te, y_te, sw_train=None):
    """Fit ``model`` on the training data and score it on the evaluation data.

    Parameters
    ----------
    name : str
        Label stored under the ``"model"`` key of the result.
    model : estimator
        Object exposing ``fit``, ``predict``, ``predict_proba`` and, once
        fitted, ``classes_``. It is fitted in place, so any previous fit on
        the same object is overwritten.
    X_tr, y_tr
        Training features and labels.
    X_te, y_te
        Evaluation features and labels.
    sw_train : array-like, optional
        Per-sample training weights. When given, they are forwarded to
        ``fit`` as ``sample_weight``; otherwise ``fit`` is called without it.

    Returns
    -------
    dict
        Keys ``"model"``, ``"macro_f1"``, ``"weighted_f1"``, ``"log_loss"``
        and ``"accuracy"``.
    """
    # Only pass sample_weight when the caller actually supplied one.
    fit_kwargs = {} if sw_train is None else {"sample_weight": sw_train}
    model.fit(X_tr, y_tr, **fit_kwargs)

    y_pred = model.predict(X_te)
    y_proba = model.predict_proba(X_te)

    return {
        "model": name,
        "macro_f1": f1_score(y_te, y_pred, average='macro'),
        "weighted_f1": f1_score(y_te, y_pred, average='weighted'),
        # predict_proba has one column per class seen during fit. Passing
        # those classes explicitly keeps the columns aligned even when a rare
        # class is absent from y_te (log_loss would otherwise infer the label
        # set from y_te alone and reject the mismatching column count).
        "log_loss": log_loss(y_te, y_proba, labels=model.classes_),
        "accuracy": accuracy_score(y_te, y_pred)
    }

In [69]:
# Logistic Regression: a plain baseline and a class-weighted variant that
# share every other hyper-parameter.
lr_common_kwargs = dict(max_iter=1000, C=1.0, random_state=42)
lr_base = LogisticRegression(**lr_common_kwargs)
lr_cw = LogisticRegression(class_weight='balanced', **lr_common_kwargs)

# Random Forest: the same baseline / class-weighted pairing.
rf_common_kwargs = dict(n_estimators=200, max_depth=10, random_state=42)
rf_base = RandomForestClassifier(**rf_common_kwargs)
rf_cw = RandomForestClassifier(class_weight='balanced', **rf_common_kwargs)

# XGBoost baseline; num_class is the number of distinct labels in the full y.
xgb_num_classes = len(np.unique(y))
xgb_base = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    objective='multi:softprob',
    num_class=xgb_num_classes,
    eval_metric='mlogloss',
    random_state=42,
)

In [None]:
# Fresh, unweighted estimators for the two SMOTE-resampled training sets.
lr_smote = LogisticRegression(max_iter=1000, C=1.0, random_state=42)
lr_smote_pca = LogisticRegression(max_iter=1000, C=1.0, random_state=42)

# One row per experiment, in evaluation order:
# (name, model, X_tr, y_tr, X_te, y_te, sw_train)
# lr_base / lr_cw appear more than once; each run refits them from scratch.
lr_experiments = [
    # Standardised original features (no PCA)
    ("Logistic (Base, No PCA)", lr_base, X_train_scaled, y_train, X_val_scaled, y_val, None),
    ("Logistic (class_weight, No PCA)", lr_cw, X_train_scaled, y_train, X_val_scaled, y_val, None),
    ("Logistic (sample_weight, No PCA)", lr_base, X_train_scaled, y_train, X_val_scaled, y_val, sample_weight_train),
    ("Logistic (SMOTE, No PCA)", lr_smote, X_train_smote_orig, y_train_smote_orig, X_val_scaled, y_val, None),
    # PCA-reduced features
    ("Logistic (Base, PCA)", lr_base, X_train_pca, y_train, X_val_pca, y_val, None),
    ("Logistic (class_weight, PCA)", lr_cw, X_train_pca, y_train, X_val_pca, y_val, None),
    ("Logistic (sample_weight, PCA)", lr_base, X_train_pca, y_train, X_val_pca, y_val, sample_weight_train),
    ("Logistic (SMOTE, PCA)", lr_smote_pca, X_train_smote_pca, y_train_smote_pca, X_val_pca, y_val, None),
]

# Start the shared results list here; later cells append to it.
results = []
results.extend(evaluate_model(*spec) for spec in lr_experiments)

In [None]:
# Random Forest experiments, appended to the shared `results` list.
# Fresh, unweighted estimators for the two SMOTE-resampled training sets.
rf_smote = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_smote_pca = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)

# One row per experiment, in evaluation order:
# (name, model, X_tr, y_tr, X_te, y_te, sw_train)
# rf_base / rf_cw appear more than once; each run refits them from scratch.
rf_experiments = [
    # Standardised original features (no PCA)
    ("RF (Base, No PCA)", rf_base, X_train_scaled, y_train, X_val_scaled, y_val, None),
    ("RF (class_weight, No PCA)", rf_cw, X_train_scaled, y_train, X_val_scaled, y_val, None),
    ("RF (sample_weight, No PCA)", rf_base, X_train_scaled, y_train, X_val_scaled, y_val, sample_weight_train),
    ("RF (SMOTE, No PCA)", rf_smote, X_train_smote_orig, y_train_smote_orig, X_val_scaled, y_val, None),
    # PCA-reduced features
    ("RF (Base, PCA)", rf_base, X_train_pca, y_train, X_val_pca, y_val, None),
    ("RF (class_weight, PCA)", rf_cw, X_train_pca, y_train, X_val_pca, y_val, None),
    ("RF (sample_weight, PCA)", rf_base, X_train_pca, y_train, X_val_pca, y_val, sample_weight_train),
    ("RF (SMOTE, PCA)", rf_smote_pca, X_train_smote_pca, y_train_smote_pca, X_val_pca, y_val, None),
]

results.extend(evaluate_model(*spec) for spec in rf_experiments)

In [None]:
# XGBoost experiments, appended to the shared `results` list.
# Every variant uses the same hyper-parameters as xgb_base; only the training
# data (or the sample weights) differ, so the settings are written once.
xgb_common_kwargs = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    objective='multi:softprob',
    num_class=len(np.unique(y)),
    eval_metric='mlogloss',
    random_state=42,
)

# A separate estimator object for each variant other than the no-PCA baseline.
xgb_sw = XGBClassifier(**xgb_common_kwargs)
xgb_smote = XGBClassifier(**xgb_common_kwargs)
xgb_base_pca = XGBClassifier(**xgb_common_kwargs)
xgb_sw_pca = XGBClassifier(**xgb_common_kwargs)
xgb_smote_pca = XGBClassifier(**xgb_common_kwargs)

# One row per experiment, in evaluation order:
# (name, model, X_tr, y_tr, X_te, y_te, sw_train)
xgb_experiments = [
    # Standardised original features (no PCA)
    ("XGBoost (Base, No PCA)", xgb_base, X_train_scaled, y_train, X_val_scaled, y_val, None),
    ("XGBoost (sample_weight, No PCA)", xgb_sw, X_train_scaled, y_train, X_val_scaled, y_val, sample_weight_train),
    ("XGBoost (SMOTE, No PCA)", xgb_smote, X_train_smote_orig, y_train_smote_orig, X_val_scaled, y_val, None),
    # PCA-reduced features
    ("XGBoost (Base, PCA)", xgb_base_pca, X_train_pca, y_train, X_val_pca, y_val, None),
    ("XGBoost (sample_weight, PCA)", xgb_sw_pca, X_train_pca, y_train, X_val_pca, y_val, sample_weight_train),
    ("XGBoost (SMOTE, PCA)", xgb_smote_pca, X_train_smote_pca, y_train_smote_pca, X_val_pca, y_val, None),
]

results.extend(evaluate_model(*spec) for spec in xgb_experiments)

In [None]:
# Collect every experiment's metrics into one table and rank by macro F1
# (best first), renumbering the rows from 0.
results_df = pd.DataFrame(results)
results_df_sorted = results_df.sort_values("macro_f1", ascending=False, ignore_index=True)

# Show the ranked table as the cell's rich output.
print("模型性能对比（按 macro_f1 排序）：")
results_df_sorted

模型性能对比（按 macro_f1 排序）：


Unnamed: 0,model,macro_f1,weighted_f1,log_loss,accuracy
0,"XGBoost (SMOTE, No PCA)",0.480869,0.749294,0.872681,0.7585
1,"Logistic (Base, PCA)",0.480402,0.764787,0.88296,0.778
2,"XGBoost (sample_weight, PCA)",0.455843,0.736977,0.887129,0.756
3,"Logistic (Base, No PCA)",0.453989,0.736295,1.302974,0.745
4,"XGBoost (sample_weight, No PCA)",0.444386,0.742683,0.834154,0.763
5,"XGBoost (SMOTE, PCA)",0.443818,0.735381,0.952801,0.747
6,"Logistic (class_weight, No PCA)",0.440973,0.694061,1.538232,0.661
7,"Logistic (sample_weight, No PCA)",0.440973,0.694061,1.538232,0.661
8,"Logistic (SMOTE, No PCA)",0.432726,0.683916,2.550309,0.6525
9,"Logistic (sample_weight, PCA)",0.432649,0.669488,1.361789,0.624


## Top 5 模型结果对比

| 排名 | 模型                                | Macro F1 | Weighted F1 | Accuracy |
|------|-------------------------------------|----------|--------------|----------|
| 1    | XGBoost (SMOTE, No PCA)             | **0.4809** | 0.7493       | 0.7585   |
| 2    | Logistic (Base, PCA)                | **0.4804** | 0.7648       | 0.7780   |
| 3    | XGBoost (sample_weight, PCA)        | 0.4558    | 0.7360       | 0.7560   |
| 4    | Logistic (Base, No PCA)             | 0.4539    | 0.7363       | 0.7450   |
| 5    | XGBoost (sample_weight, No PCA)     | 0.4444    | 0.7427       | 0.7630   |

**XGBoost + SMOTE（不加 PCA）** 拿下了最高的 Macro F1，表现优秀。

**Logistic Regression** 在**不使用类别平衡策略**时，基础表现已经很强，说明特征本身具备较好区分能力。

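类别平衡策略的效果因模型而异：对 Logistic Regression 而言，加入 `sample_weight` 或 `class_weight` 后 Macro F1 反而低于基础模型（No PCA：0.4410 对 0.4540），SMOTE 也没有带来提升（0.4327）；对 XGBoost 而言，在不加 PCA 时 SMOTE（0.4809）明显优于 `sample_weight`（0.4444），但在 PCA 特征上 `sample_weight`（0.4558）反而优于 SMOTE（0.4438）。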
