In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import operator
import random
import time
import os
import sys
from tensorflow import keras

In [2]:
T1 = time.time()

In [3]:

def gaussian(dist, sigma=10.0):
    """ Input a distance and return it`s weight"""
    weight = np.exp(-dist ** 2 / (2 * sigma ** 2))
    return weight


In [4]:

def knn(x_test, x_data, y_data, k):
    # 计算样本数量
    x_data_size = x_data.shape[0]
    # 复制x_test
    np.tile(x_test, (x_data_size, 1))
    # 计算x_test与每一个样本的差值
    diffMat = np.tile(x_test, (x_data_size, 1)) - x_data
    # 计算差值的平方
    sqDiffMat = diffMat ** 2
    # 求和
    sqDistances = sqDiffMat.sum(axis=1)
    # 开方
    distances = sqDistances ** 0.5
    # 从小到大排序
    sortedDistances = distances.argsort()

    # 求权
    distancesByWeight = gaussian(sortedDistances)

    classCount = {}
    for i in range(k):
        # 获取标签
        votelabel = y_data[sortedDistances[i]]
        # 统计标签数量
        classCount[votelabel] = classCount.get(votelabel, 0) + distancesByWeight[i] * 1
    # 根据operator.itemgetter(1)-第1个值对classCount排序，然后再取倒序
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    # 获取数量最多的标签
    return sortedClassCount[0][0]


In [5]:
# fashion_mnist图像分类数据集
fashion_mnist = keras.datasets.fashion_mnist

(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

y_test=y_test.flatten()
x_train=x_train.reshape(60000,784)
y_train=y_train.flatten()
x_test=x_test.reshape(10000,784)

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)
(60000, 784)
(60000,)
(10000, 784)
(10000,)


In [6]:
predictions = []
for i in range(x_test.shape[0]):
    predictions.append(knn(x_test[i], x_train, y_train, 5))

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.65      0.75      0.70      1000
           1       0.84      0.95      0.89      1000
           2       0.62      0.65      0.63      1000
           3       0.70      0.79      0.74      1000
           4       0.67      0.52      0.58      1000
           5       0.70      0.95      0.81      1000
           6       0.53      0.37      0.44      1000
           7       0.81      0.75      0.78      1000
           8       0.91      0.76      0.83      1000
           9       0.89      0.83      0.86      1000

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000



In [7]:
print(confusion_matrix(y_test, predictions))

[[752  31  14  92   6   8  84   1  12   0]
 [  2 952   4  24   7   3   6   0   2   0]
 [ 82   7 650  31 110  14 101   0   5   0]
 [ 51  83   6 789  26   2  21   0  22   0]
 [ 36  25 215  84 518  12 102   0   8   0]
 [  0   0   0   3   0 951   0  27   3  16]
 [227  36 146  93  99  12 370   0  17   0]
 [  0   0   0   0   0 189   0 747   2  62]
 [ 14   2  16  10  10  92  16  53 764  23]
 [  0   1   0   4   0  74   0  89   1 831]]


In [8]:
T2 = time.time()
print('程序运行时间:%s秒' % ((T2 - T1)))

程序运行时间:1236.0238513946533秒
