In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Load the dry-bean spreadsheet and discard every row that contains a missing value.
data = pd.read_excel('Dry_Bean_Dataset.xlsx').dropna()

# Separate the label column ('Class') from the predictor columns.
y = data['Class']
X = data.drop(columns='Class')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Scaling, stage 1: standardize with StandardScaler.
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scaling, stage 2: pass the standardized features through MinMaxScaler,
# again fitted on the training rows only.
min_max_scaler = MinMaxScaler().fit(X_train_scaled)
X_train_normalized = min_max_scaler.transform(X_train_scaled)
X_test_normalized = min_max_scaler.transform(X_test_scaled)

# Fit a 5-nearest-neighbour classifier on the scaled training data and
# predict the held-out rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_normalized, y_train)
y_pred = knn.predict(X_test_normalized)

# Report the confusion matrix and the per-class precision / recall / F1.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Confusion Matrix:
[[351   0  30   0   2   5   7]
 [  0 161   0   0   0   0   0]
 [ 11   0 451   0  11   1   5]
 [  0   0   0 958   1  17  67]
 [  0   0  13   6 555   0  14]
 [  4   0   0  12   0 587  16]
 [  3   0   1  89   7   6 693]]

Classification Report:
              precision    recall  f1-score   support

    BARBUNYA       0.95      0.89      0.92       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.91      0.94      0.93       479
    DERMASON       0.90      0.92      0.91      1043
       HOROZ       0.96      0.94      0.95       588
       SEKER       0.95      0.95      0.95       619
        SIRA       0.86      0.87      0.87       799

    accuracy                           0.92      4084
   macro avg       0.93      0.93      0.93      4084
weighted avg       0.92      0.92      0.92      4084



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report


# --- Random forest ---------------------------------------------------------
# Trained on the unscaled features: 100 trees, depth capped at 20, and
# 'balanced' class weights to deal with the class imbalance.
rf_model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=20,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

print("随机森林模型性能：")
print("准确率:", accuracy_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

# --- Multi-layer perceptron --------------------------------------------------
# Re-fit the shared `scaler` on the training features, then transform both
# splits with it.
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Two hidden layers (64 and 32 units), ReLU activations, Adam optimizer,
# at most 200 iterations, with early stopping enabled.
mlp_model = MLPClassifier(
    early_stopping=True,
    random_state=42,
    max_iter=200,
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(64, 32),
).fit(X_train_scaled, y_train)
mlp_pred = mlp_model.predict(X_test_scaled)

print("\nMLP模型性能：")
print("准确率:", accuracy_score(y_test, mlp_pred))
print(classification_report(y_test, mlp_pred))

随机森林模型性能：
准确率: 0.9233594515181195
              precision    recall  f1-score   support

    BARBUNYA       0.94      0.90      0.92       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.92      0.94      0.93       479
    DERMASON       0.90      0.93      0.92      1043
       HOROZ       0.97      0.95      0.96       588
       SEKER       0.94      0.94      0.94       619
        SIRA       0.88      0.88      0.88       799

    accuracy                           0.92      4084
   macro avg       0.94      0.93      0.93      4084
weighted avg       0.92      0.92      0.92      4084


MLP模型性能：
准确率: 0.925073457394711
              precision    recall  f1-score   support

    BARBUNYA       0.93      0.92      0.92       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.93      0.94      0.94       479
    DERMASON       0.93      0.90      0.91      1043
       HOROZ       0.97      0.95      0.96       588
       SEKE

In [22]:

# 优化尝试3：针对重叠区域的特征处理
from sklearn.preprocessing import PolynomialFeatures

# Expand the normalized features with degree-2 interaction terms.
# interaction_only=True requests products of distinct features only, so no
# squared terms are generated.
poly = PolynomialFeatures(interaction_only=True, degree=2).fit(X_train_normalized)
X_train_poly = poly.transform(X_train_normalized)
X_test_poly = poly.transform(X_test_normalized)

# Larger MLP: three hidden layers (256 / 128 / 64), L2 penalty alpha=0.001,
# mini-batches of 64, 'adaptive' learning-rate setting and early stopping.
mlp_improved = MLPClassifier(
    random_state=42,
    early_stopping=True,
    learning_rate='adaptive',
    batch_size=64,
    alpha=0.001,
    activation='relu',
    hidden_layer_sizes=(256, 128, 64),
).fit(X_train_poly, y_train)

# Evaluate on the expanded test features.
y_pred_mlp = mlp_improved.predict(X_test_poly)
print("改进版MLP性能：", classification_report(y_test, y_pred_mlp), sep="\n")

改进版MLP性能：
              precision    recall  f1-score   support

    BARBUNYA       0.96      0.87      0.91       395
      BOMBAY       1.00      1.00      1.00       161
        CALI       0.90      0.96      0.93       479
    DERMASON       0.91      0.91      0.91      1043
       HOROZ       0.98      0.92      0.95       588
       SEKER       0.92      0.95      0.94       619
        SIRA       0.85      0.87      0.86       799

    accuracy                           0.92      4084
   macro avg       0.93      0.93      0.93      4084
weighted avg       0.92      0.92      0.92      4084



In [None]:
# Optimization attempt 5: oversample the SIRA class with SMOTE.
from imblearn.over_sampling import SMOTE
from sklearn.base import clone

# Bring SIRA up to 2000 training samples; every other class keeps its count.
smote = SMOTE(sampling_strategy={'SIRA': 2000},
              random_state=42,
              k_neighbors=10)

# Resample in the normalized base-feature space and expand afterwards.
# SMOTE builds synthetic rows by linear interpolation; interpolating the
# already-expanded matrix would yield interaction columns that are no longer
# the products of their own base columns. Expanding after resampling keeps
# every synthetic row internally consistent.
X_res_base, y_res = smote.fit_resample(X_train_normalized, y_train)
X_res = poly.transform(X_res_base)

# Train a fresh, unfitted copy that has the same hyper-parameters and seed.
# Calling fit() on `mlp_improved` itself would silently replace the model
# trained on the non-resampled data, leaving that name pointing at a
# different model than the one whose report was printed earlier.
mlp_smote = clone(mlp_improved)
mlp_smote.fit(X_res, y_res)

# Evaluate on the untouched (never resampled) test features.
y_pred_smote = mlp_smote.predict(X_test_poly)
print("SMOTE增强后的MLP性能：")
print(classification_report(y_test, y_pred_smote))