In [None]:
"""
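k-nearest-neighbours (kNN) implementation.
kNN is a supervised learning algorithm that decides a sample's label from its k neighbours: find the
training samples "closest" to the test sample, count their labels as votes, and assign the most
frequent label to the test sample.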
"""

import heapq

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Number of fruit classes.
# NOTE(review): assumes the labels are small non-negative integers no larger
# than CLASS_TOTAL -- confirm against the data file.
CLASS_TOTAL = 4
# Load the fruit measurements (pd.read_table parses tab-separated text by default).
fruits_df = pd.read_table('../data/fruit_data_with_colors.txt')
# Four numeric columns are used as features.
fruit_features = fruits_df[["mass", "width", "height", "color_score"]]
# Double brackets keep the labels as a one-column DataFrame rather than a Series.
fruit_label = fruits_df[["fruit_label"]]

# Hold out a quarter of the rows for testing; random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(fruit_features, fruit_label,test_size=1/4,random_state=0)

"""
Keeping track of the k nearest neighbours is best done with a heap.
"""
def predict(testset, trainset, train_labels, k):
    """Classify each test row by majority vote of its k nearest neighbours.

    Distances are Euclidean. For every test row only the k closest training
    samples are retained (``heapq.nsmallest``), so the candidate list is never
    re-sorted after each replacement.

    Tie-breaking:
        * equal distances: the training sample that appears first wins;
        * equal vote counts: the smallest label wins.

    Args:
        testset: Array-like of shape (n_test, n_features).
        trainset: Array-like of shape (n_train, n_features).
        train_labels: Array-like holding one label per training row. Labels
            may be any values ``numpy.unique`` can sort; they are not limited
            to a fixed range of integers.
        k: Number of neighbours that vote. Values larger than the training
            set simply let every training sample vote.

    Returns:
        numpy.ndarray with one predicted label per test row.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the training set is empty.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    testset = np.asarray(testset)
    trainset = np.asarray(trainset)
    # ravel() also accepts labels supplied as a single column of shape (n, 1).
    train_labels = np.asarray(train_labels).ravel()
    if len(trainset) == 0 or len(train_labels) == 0:
        raise ValueError("the training set must contain at least one sample")

    predictions = []
    for test_vec in testset:
        candidates = (
            (np.linalg.norm(train_vec - test_vec), label)
            for train_vec, label in zip(trainset, train_labels)
        )
        # Compare on distance only: nsmallest is stable, so on equal distance
        # the earlier training sample is kept.
        nearest = heapq.nsmallest(k, candidates, key=lambda pair: pair[0])
        neighbour_labels = [label for _, label in nearest]
        # np.unique returns the labels sorted, and argmax returns the first
        # maximum, so a vote tie resolves to the smallest label.
        unique_labels, votes = np.unique(neighbour_labels, return_counts=True)
        predictions.append(unique_labels[np.argmax(votes)])
    return np.array(predictions)


def predict_simple(testset, trainset, train_labels, k):
    """Classify each test row by majority vote of its k nearest neighbours.

    Straightforward reference version: for every test row, compute the
    Euclidean distance to every training sample, sort all of them, and let
    the first k vote.

    Tie-breaking:
        * equal distances: the training sample that appears first wins
          (the sort is stable);
        * equal vote counts: the smallest label wins.

    Args:
        testset: Array-like of shape (n_test, n_features).
        trainset: Array-like of shape (n_train, n_features).
        train_labels: Array-like holding one label per training row. Labels
            may be any values ``numpy.unique`` can sort; they are not limited
            to a fixed range of integers.
        k: Number of neighbours that vote. Values larger than the training
            set simply let every training sample vote.

    Returns:
        list with one predicted label per test row.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the training set is empty.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    testset = np.asarray(testset)
    trainset = np.asarray(trainset)
    # ravel() also accepts labels supplied as a single column of shape (n, 1).
    train_labels = np.asarray(train_labels).ravel()
    if len(trainset) == 0 or len(train_labels) == 0:
        raise ValueError("the training set must contain at least one sample")

    predictions = []
    for test_vec in testset:
        scored = [
            (np.linalg.norm(train_vec - test_vec), label)
            for train_vec, label in zip(trainset, train_labels)
        ]
        # Sort on distance only so equal distances keep training order.
        scored.sort(key=lambda pair: pair[0])
        neighbour_labels = [label for _, label in scored[:k]]
        # np.unique returns the labels sorted, and argmax returns the first
        # maximum, so a vote tie resolves to the smallest label.
        unique_labels, votes = np.unique(neighbour_labels, return_counts=True)
        predictions.append(unique_labels[np.argmax(votes)])
    return predictions


# predict / predict_simple walk their inputs row by row, so hand them plain
# NumPy arrays (iterating a DataFrame would yield column names instead).
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
# The labels are one-column frames; flatten them to 1-D. ravel() is used
# instead of squeeze() because squeeze() would collapse a single-row split
# into a 0-d array that can be neither iterated nor indexed.
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

# Accuracy of both classifiers on the test split for k = 1..9 neighbours.
pic_x = []       # neighbour counts that were evaluated
pic_score1 = []  # accuracy of predict() for each k
pic_score2 = []  # accuracy of predict_simple() for each k

# The loop variable is named `k` (not `iter`) so the builtin `iter` is not
# shadowed at module level for all code that runs afterwards.
for k in range(1, 10):
    test_predict = predict(X_test, X_train, y_train, k)
    score = accuracy_score(y_test, test_predict)
    test_predict2 = predict_simple(X_test, X_train, y_train, k)
    score2 = accuracy_score(y_test, test_predict2)
    pic_x.append(k)
    pic_score1.append(score)
    pic_score2.append(score2)

# Draw the two accuracy curves in stacked panels: predict() on top,
# predict_simple() underneath, both against the evaluated neighbour counts.
for panel, (y_label, scores) in enumerate(
    (("score", pic_score1), ("score2", pic_score2)), start=1
):
    plt.subplot(2, 1, panel)
    plt.xlabel("iter")
    plt.ylabel(y_label)
    plt.plot(pic_x, scores)
plt.show()
