In [1]:
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd

# Load the dataset.
# NOTE: point data_path at wherever the CSV actually lives on your machine.
data_path = './datasets/pima-indians-diabetes.data.csv'
# header=None: the file is read as having no header row, so columns get integer labels.
data = pd.read_csv(filepath_or_buffer=data_path, header=None)

# Separate features from the target:
# every column except the last is a feature, the last column is the label.
X, y = data.iloc[:, :-1].to_numpy(), data.iloc[:, -1].to_numpy()

# Hold out 30% of the rows for testing (70% train); the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Define the SVM classifier factory.
def classifier(kernel='linear', **svc_params):
    """Build a new, unfitted support-vector classifier.

    Parameters
    ----------
    kernel : str, default='linear'
        Kernel name forwarded to ``svm.SVC``. The default preserves the
        original behaviour: ``classifier()`` yields a linear-kernel SVC.
    **svc_params
        Any additional keyword arguments are forwarded unchanged to
        ``svm.SVC`` (for example ``C=...``).

    Returns
    -------
    The object returned by ``svm.SVC(kernel=kernel, **svc_params)``; it has
    not been fitted yet.
    """
    return svm.SVC(kernel=kernel, **svc_params)

# Build the SVM classifier and fit it on the training split
# (fit returns the estimator itself, so the call can be chained).
clf = classifier().fit(X_train, y_train)

# Evaluate the model with 5-fold cross-validation.
# NOTE(review): the folds are drawn from the full X / y, so they also contain
# the rows that were held out as the test split above.
scores = cross_val_score(clf, X, y, cv=5)
print("Cross-validated scores:", scores)
print("Average score:", np.mean(scores))

# Accuracy of the fitted model on the held-out test split.
accuracy_test = clf.score(X_test, y_test)
print("Test set accuracy:", accuracy_test)

# Predict labels for the test split with the fitted model and print them.
y_pred = clf.predict(X_test)
print("Predicted labels:", y_pred)


Cross-validated scores: [0.75974026 0.75324675 0.74025974 0.81045752 0.76470588]
Average score: 0.7656820303879128
Test set accuracy: 0.7445887445887446
Predicted labels: [0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 1 1 1 1 1 1
 0 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 1 1 1 0 0 1 0 0 0
 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0 1 1 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1
 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0 1 1 0
 0 0 0 1 0 0 0 0 0]
