## 使用 MNIST 数据集

In [3]:
import numpy as np
from sklearn.datasets import fetch_mldata
# fetch_ 开头的是在线获取数据

In [10]:
mnist = fetch_mldata('MNIST original')



In [11]:
# Any warning printed by the data-loading cell above can be ignored.
# Display the loaded dataset object to inspect its keys and arrays.
mnist

{'DESCR': 'mldata.org dataset: mnist-original',
 'COL_NAMES': ['label', 'data'],
 'target': array([0., 0., 0., ..., 9., 9., 9.]),
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}

In [12]:
# Pull the pixel matrix and the label vector out of the loaded dataset.
X = mnist["data"]
y = mnist["target"]

In [13]:
# MNIST 数据集已经做好 train test split, 前 60000 是训练集

In [16]:
# The dataset is already ordered as train-then-test: rows before index 60000
# are the training set, the remaining rows are the test set.
n_train = 60000
X_train, X_test = (np.array(part, dtype='float') for part in (X[:n_train], X[n_train:]))
y_train, y_test = (np.array(part, dtype='float') for part in (y[:n_train], y[n_train:]))

# Print the shape of every split as a quick sanity check.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(60000, 784)
(60000,)
(10000, 784)
(10000,)


In [None]:
# 784 = 28 ** 2,  是一个 28 x 28 像素的灰度图

## 使用 kNN

In [17]:
# Baseline: a k-nearest-neighbours classifier with default settings, fitted on
# the raw (unreduced) training pixels.
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
# %time reports the CPU and wall-clock time spent on this single statement.
%time knn_clf.fit(X_train, y_train)

CPU times: user 28.6 s, sys: 247 ms, total: 28.8 s
Wall time: 29 s


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [18]:
# Time the evaluation of the baseline classifier on the raw test set; the
# displayed value is the accuracy score.
%time knn_clf.score(X_test, y_test)

CPU times: user 9min 33s, sys: 3.26 s, total: 9min 36s
Wall time: 9min 39s


0.9688

In [19]:
# 使用原始数据，fit 和 predict 都耗时间

In [20]:
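# No feature scaling was applied above: every feature is a grayscale intensity in the range 0~255, so all features already share the same unit and order of magnitude.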

## 使用 PCA 降维

In [27]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) makes PCA keep just enough principal
# components to explain that fraction of the variance (here 90%).
pca = PCA(n_components=0.9).fit(X_train)

# Project both splits using the components learned from the training set only.
X_train_reduction, X_test_reduction = (
    pca.transform(split) for split in (X_train, X_test)
)

In [28]:
# Inspect how many components PCA kept (second entry of the shape).
X_train_reduction.shape

(60000, 87)

In [29]:
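# Reduced from 784 to 87 dimensions while still retaining 90% of the variance (PCA(0.9) keeps the fewest components whose cumulative explained variance ratio reaches 0.9) — a very efficient compression.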

In [30]:
# Fit a fresh default kNN classifier on the PCA-reduced training features.
# NOTE(review): this rebinds knn_clf, so the classifier fitted on the raw
# pixels earlier is no longer reachable under that name.
knn_clf = KNeighborsClassifier()
%time knn_clf.fit(X_train_reduction, y_train)

CPU times: user 289 ms, sys: 4.24 ms, total: 293 ms
Wall time: 292 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [31]:
# Time the evaluation on the PCA-reduced test set; the test data must be the
# projected X_test_reduction, matching the features the classifier was fitted on.
%time knn_clf.score(X_test_reduction, y_test)

CPU times: user 58.9 s, sys: 290 ms, total: 59.2 s
Wall time: 59 s


0.9728

In [None]:
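# Scoring time dropped from 9min 33s to 58.9s of CPU time (wall time: 9min 39s -> 59s), roughly a 10x speed-up.
# Surprisingly, accuracy also improved after the reduction, from 0.9688 to 0.9728. This illustrates another use of PCA: besides reducing dimensionality it can also act as a denoiser, because the discarded low-variance components tend to carry mostly noise.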