In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Load the feature matrix and the label array from disk.
X = np.load('./data_set/X_train.npy')
Y = np.load('./data_set/y_train.npy')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Keep only column 0 of the label array for both splits.
# NOTE(review): the split above is not stratified — confirm that is intended
# given the class balance of the labels.
Y_train, Y_test = Y_train[:, 0], Y_test[:, 0]

# Column indices of X, grouped by feature type.
binary_features = [
    8, 11, 13, 14, 15, 17, 19, 20, 22, 24, 25, 26, 27, 28, 29, 30,
    38, 39, 40, 41, 42, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58,
    59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
    78, 79, 81, 87, 94, 95, 96, 97, 105, 106, 107, 108, 109, 110,
]
categorical_features = [
    1, 2, 3, 4, 7, 9, 10, 37, 43, 44, 45, 46,
    82, 84, 91, 92, 93, 98, 99, 100, 101, 102, 103, 104,
]
continuous_features = [0, 34, 35, 36, 83, 85, 88, 89, 90]

# Columns labelled "unary" are excluded from every group.
# NOTE(review): presumably these are single-valued columns — confirm.
unary_features = [5, 6, 12, 16, 18, 21, 23, 31, 51, 61, 66, 77, 80, 86, 32, 33]
_unary_lookup = frozenset(unary_features)


def _drop_unary(indices):
    """Return `indices` in their original order, minus any unary column."""
    return [idx for idx in indices if idx not in _unary_lookup]


binary_features = _drop_unary(binary_features)
categorical_features = _drop_unary(categorical_features)
continuous_features = _drop_unary(continuous_features)


# Preprocessing, applied separately to each feature group:
#   * binary      -> fill missing values with the most frequent value
#   * categorical -> same imputation, then one-hot encode
#                    (handle_unknown='ignore' for categories unseen at fit)
#   * continuous  -> fill missing values with the mean, then standardize
binary_imputer = SimpleImputer(strategy='most_frequent')

categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

continuous_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('bin', binary_imputer, binary_features),
        ('cat', categorical_steps, categorical_features),
        ('num', continuous_steps, continuous_features),
    ]
)

# Full model: the preprocessing step followed by a linear-kernel SVM.
linear_svm = SVC(kernel='linear')
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', linear_svm),
])

# Fit on the training split, then predict labels for the held-out split.
Y_pred = svm_pipeline.fit(X_train, Y_train).predict(X_test)

# Print per-class precision/recall/F1 for the held-out split.
print(classification_report(Y_test, Y_pred))


              precision    recall  f1-score   support

           0       0.70      0.86      0.77       138
           1       0.38      0.19      0.26        62

    accuracy                           0.65       200
   macro avg       0.54      0.52      0.51       200
weighted avg       0.60      0.65      0.61       200



