In [1]:
import pandas as pd
data=pd.read_csv('diabete.csv')
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [2]:
# 提取数据
x=data.drop('class',axis=1) #提取自变量,axis=1表示以列删除；默认删除行
y=data['class'] # 提取目标变量
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [3]:
# 标准化
from sklearn.preprocessing import StandardScaler #导入标准化模块
sc = StandardScaler()# 实例化
X = sc.fit_transform(x) # 训练集标准化
X # 查看训练集标准化的结果

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [9]:
from sklearn.model_selection import train_test_split
#85%的数据是训练集，25%是测试集。
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=10)

## 多模型评价

In [10]:
from sklearn.linear_model import LogisticRegression# 逻辑回归
from sklearn.svm import SVC, LinearSVC # 支持向量机
from sklearn.tree import DecisionTreeClassifier # 决策树
from sklearn.naive_bayes import GaussianNB# 朴素贝叶斯
from sklearn.model_selection import cross_val_score # 通过交叉验证评估准确性
from sklearn.model_selection import KFold # k折交叉验证
kf = KFold(shuffle=True, n_splits=5)# 洗牌，折叠五次

In [11]:
models = []
models.append(("逻辑回归",LogisticRegression()))
models.append(("支持向量机",SVC()))
models.append(("线性支持向量机",LinearSVC()))
models.append(("决策树",DecisionTreeClassifier()))
models.append(("高斯朴素贝叶斯", GaussianNB()))

In [12]:
results = []
names = []
for name,model in models:
    result = cross_val_score(model, X, y,  cv=kf)# 依次使用k交叉验证
    names.append(name) # 模型名字放到name列表
    results.append(result) #训练结果放到results列表
# 遍历输出
for i in range(len(names)):
    print(names[i],"准确性：",results[i].mean())# 准确性

逻辑回归 准确性： 0.7682369917664035
支持向量机 准确性： 0.7644087938205585
线性支持向量机 准确性： 0.7721161191749427
决策树 准确性： 0.6913335030982088
高斯朴素贝叶斯 准确性： 0.756480774127833




## bagging（袋装法）

In [13]:
# 导入bagging分类学习器
from sklearn.ensemble import BaggingClassifier
#  base_estimator - 这表示用作基础/弱学习器的算法。我们使用DecisionTreeClassifier算法作为我们的弱/基础学习器。
# n_estimators - 这表示使用的弱学习器的数量。我们将使用 100 个决策树来构建 bagging 模型。
# max_samples - 从训练集中采样的最大数据数。我们使用 80% 的训练数据集进行重采样。
#bootstrap - 允许在不替换的情况下对训练数据集进行重新采样。
# oob_score - 用于在训练后计算模型的准确度得分。
# random_state - 允许我们重现相同的数据集样本。此外，它确保在生成多个子集时使用相同的比率。
bag_model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), 
    n_estimators=100, 
    max_samples=0.8, 
    bootstrap=True,
    oob_score=True,
    random_state=0
)

In [14]:
# 拟合模型
bag_model.fit(X_train, y_train)
# 模型准确率。结果确实比决策树高。准确度得分从0.6913335030982088提高到0.7604166666666666
bag_model.oob_score_

0.7604166666666666

In [15]:
# 我们还可以使用测试数据集检查准确度分数，以确定我们的模型是否过拟合。
# 准确度得分表明我们的模型没有过度拟合。当我们在使用测试数据集时获得较低的准确性时，就会发生过度拟合。
bag_model.score(X_test, y_test)

0.7760416666666666