# K折交叉验证

## 不分层抽样
不使用分层抽样时，各折训练集和测试集中的阳性类（数字 9）比例会偏离原始数据集中的比例，并且每一折的偏离程度都不相同（见下方输出）。

In [7]:
import numpy as np
# keras depends on tensorflow: pip install tensorflow -i https://pypi.tuna.tsinghua.edu.cn/simple
from keras.datasets import mnist

# Use only the MNIST training split: flatten every 28x28 image into a
# 784-element row and divide by 255.0 to bring the pixel values into [0, 1].
(x, y), (_, _) = mnist.load_data()
x = x.reshape(-1, 28*28) / 255.0
# Boolean target vector for the binary task: True where the label is the digit 9.
y = (y == 9)

# Train and test a stochastic gradient descent (SGD) classifier with plain,
# NON-stratified K-fold cross-validation from Scikit-Learn. KFold does not
# look at the labels when splitting, so the share of positives can drift
# away from the original dataset and differ from fold to fold.
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold

kfolds = KFold(n_splits=3)
splits = kfolds.split(x, y)

# The share of positives in the full dataset is the same for every fold,
# so it is computed once here instead of inside the loop. `y` is already
# boolean, hence its mean is directly the fraction of True entries.
original_pct = np.mean(y) * 100

for i, (train_index, test_index) in enumerate(splits):
    x_train = x[train_index]
    y_train = y[train_index]
    x_test  = x[test_index]
    y_test  = y[test_index]
    # A fresh classifier per fold so no state leaks between folds.
    clf = SGDClassifier()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Fraction of test samples whose prediction matches the true label.
    accuracy = np.mean(y_pred == y_test)
    print("[SPLIT %d]"%(i+1))
    print("Percentage of digit 9 in the original dataset: %.2f %%"%original_pct)
    print("Percentage of digit 9 in the training set: %.2f %%"%(np.mean(y_train)*100))
    print("Percentage of digit 9 in the test set: %.2f %%"%(np.mean(y_test)*100))
    print("Accuracy: %.4f"%accuracy)

[SPLIT 1]
Percentage of digit 9 in the original dataset: 9.92 %
Percentage of digit 9 in the training set: 9.84 %
Percentage of digit 9 in the test set: 10.07 %
Accuracy: 0.9640
[SPLIT 2]
Percentage of digit 9 in the original dataset: 9.92 %
Percentage of digit 9 in the training set: 9.87 %
Percentage of digit 9 in the test set: 10.01 %
Accuracy: 0.9556
[SPLIT 3]
Percentage of digit 9 in the original dataset: 9.92 %
Percentage of digit 9 in the training set: 10.04 %
Percentage of digit 9 in the test set: 9.66 %
Accuracy: 0.9628


## 分层抽样

In [6]:
import numpy as np
# keras depends on tensorflow: pip install tensorflow -i https://pypi.tuna.tsinghua.edu.cn/simple
from keras.datasets import mnist

# Only the MNIST training split is analysed; each image is flattened to a
# 784-element row and divided by 255.0 to normalise it to the [0, 1] range.
(x, y), (_, _) = mnist.load_data()
x = x.reshape(-1, 28*28) / 255.0
# Target vector for the binary classification task: True for the digit 9.
y = (y == 9)

# Train and test a stochastic gradient descent (SGD) classifier with
# stratified K-fold cross-validation from Scikit-Learn.
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold

skfolds = StratifiedKFold(n_splits=3)
splits = skfolds.split(x, y)
for fold_no, (train_idx, test_idx) in enumerate(splits, start=1):
    # Slice features and labels of this fold with the index arrays.
    x_train, y_train = x[train_idx], y[train_idx]
    x_test, y_test = x[test_idx], y[test_idx]

    clf = SGDClassifier()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Share of test samples that were classified correctly.
    accuracy = (y_pred == y_test).mean()

    # The labels are boolean, so their mean is the fraction of positives.
    pct_original = y.mean() * 100
    pct_train = y_train.mean() * 100
    pct_test = y_test.mean() * 100

    print("[SPLIT %d]"%fold_no)
    print("Percentage of digit 9 in the original dataset: %.2f %%"%pct_original)
    print("Percentage of digit 9 in the training set: %.2f %%"%pct_train)
    print("Percentage of digit 9 in the test set: %.2f %%"%pct_test)
    print("Accuracy: %.4f"%accuracy)

[SPLIT 1]
Percentage of digit 9 in the original dataset: 9.92 %
Percentage of digit 9 in the training set: 9.92 %
Percentage of digit 9 in the test set: 9.92 %
Accuracy: 0.9627
[SPLIT 2]
Percentage of digit 9 in the original dataset: 9.92 %
Percentage of digit 9 in the training set: 9.92 %
Percentage of digit 9 in the test set: 9.92 %
Accuracy: 0.9636
[SPLIT 3]
Percentage of digit 9 in the original dataset: 9.92 %
Percentage of digit 9 in the training set: 9.92 %
Percentage of digit 9 in the test set: 9.92 %
Accuracy: 0.9575
