In [1]:
from sklearn.datasets import load_iris

In [2]:
# return_X_y=True makes load_iris hand back the (features, labels) pair directly
# instead of the dictionary-like Bunch object.
X, y = load_iris(return_X_y=True)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps the
# split (and therefore every score below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [5]:
from sklearn.neighbors import KNeighborsClassifier

In [6]:
knn = KNeighborsClassifier()

In [7]:
knn.fit(X=X_train, y=y_train)

In [8]:
y_pred = knn.predict(X=X_test)

In [9]:
# Accuracy: the element-wise comparison yields booleans, and their mean is the
# fraction of test samples whose predicted label equals the true label.
acc = (y_pred == y_test).mean()

In [10]:
acc

np.float64(0.9666666666666667)

### KNN 核心思想
- K Nearest Neighbors K个最近的邻居
- 核心思想：
    - 近朱者赤，近墨者黑
- 如何做一个样本的推理：
    - 分类问题：x0到底属于哪一类呢？
        - 寻找 x0 的 K 个最近的邻居
        - 统计这 K 个邻居中哪个类别出现的最多
        - 这个出现次数最多的类别，就是 x0 的类别
    - 回归问题：x0到底是多少？
        - 寻找 x0 的 K 个最近的邻居
        - 统计这 K 个邻居标签的均值
        - 这个标签的均值就是 x0 的值

- 算法特点：
  - 惰性学习（lazy learning）：几乎没有训练过程，fit 只是把训练数据保存下来，推理时才直接硬计算距离；因此它不属于典型的“先训练出参数、再推理”的人工智能模型！
- 如何实现？
  - 全面模仿 sklearn

In [21]:
import numpy as np
class MyKNeighborsClassifier2(object):
    """
    Scratch version of a KNN classifier used to inspect the neighbour search.

    ``predict`` is deliberately incomplete: it only processes the first query
    sample, prints the indices of its nearest training samples and returns
    that sample's distance vector rather than class predictions.
    """

    def __init__(self, n_neighbors=5):
        # Number of nearest training samples to look up per query sample.
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        # Nothing is learned: the training set is simply kept for predict().
        self.X, self.y = X, y

    def predict(self, X):
        k = self.n_neighbors
        for sample in X:
            # Euclidean distance from this query sample to every training row.
            distance = np.linalg.norm(sample - self.X, axis=1)
            # Positions of the k smallest distances, nearest first.
            nearest = np.argsort(distance)[:k]
            print(nearest)
            # Only the first query sample is examined.
            break
        return distance


### 重要笔记：
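`np.linalg.norm` 中参数 `axis` 的含义：
- 二维矩阵：axis=0 表示压缩第 0 维（行方向），即对每一列计算范数，结果长度等于列数；axis=1 表示压缩第 1 维（列方向），即对每一行计算范数，结果长度等于行数。例如形状为 (120, 4) 的矩阵在 axis=1 下得到长度为 120 的向量（每个样本一个距离）。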
- 高维矩阵：axis 指定沿哪个维度计算范数，计算后该维度被压缩。
- 无论矩阵的维度如何，axis 参数的作用都是指定沿哪个维度计算范数。

In [22]:
# Exercise the scratch classifier. Its predict() prints the nearest-neighbour
# indices of the first test sample only and returns that sample's distance
# vector, so y_pred here is NOT a vector of class predictions.
my_knn2 = MyKNeighborsClassifier2()
my_knn2.fit(X=X_train, y=y_train)
y_pred = my_knn2.predict(X=X_test)
# print(y_pred)

[ 32 107  64  46   3]


In [23]:
from collections import Counter
import numpy as np
class MyKNeighborsClassifier(object):
    """
    Minimal brute-force K-nearest-neighbours classifier that follows the
    scikit-learn ``fit`` / ``predict`` calling convention.

    A query sample receives the label that occurs most often among its
    ``n_neighbors`` closest training samples under Euclidean distance.
    """
    def __init__(self, n_neighbors=5):
        """
        Store the hyper-parameters.

        Args:
            n_neighbors: number of nearest training samples that vote on
                each prediction.
        """
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """
        "Train" the model by memorising the training set (KNN is lazy).

        Args:
            X: training features, array-like of shape [n_samples, num_features].
            y: training labels, array-like of shape [n_samples].

        Returns:
            self, so that calls can be chained as with scikit-learn estimators.
        """
        # Coerce to ndarrays: predict() relies on NumPy broadcasting for the
        # distance computation and on positional fancy indexing for the labels.
        # Plain lists would raise TypeError and a pandas Series would be
        # indexed by label instead of by position.
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        return self

    def predict(self, X):
        """
        Predict a class label for every query sample.

        Args:
            X: query features, array-like of shape [batch_size, num_features].

        Returns:
            np.ndarray with one predicted label per row of ``X``.
        """
        # Iterating an ndarray yields rows; iterating a DataFrame would yield
        # column names, so normalise the input first.
        X = np.asarray(X)

        # Step 1: find the K nearest neighbours of each sample.
        # Step 2: let the labels of those K neighbours vote.
        results = []
        for x in X:
            # Euclidean distance between x and every training sample.
            # self.X is 2-D (all samples) and x is 1-D (one sample), so the
            # subtraction broadcasts x across the rows; sum(axis=1) adds up
            # the squared differences within each row.
            distance = ((self.X - x) ** 2).sum(axis=1) ** 0.5

            # We need to know WHICH training samples are closest, not how far
            # away they are: argsort returns positions in ascending order of
            # distance (sort would return the distances themselves).
            idxes = distance.argsort()[:self.n_neighbors]

            # Look up the labels of those nearest training samples.
            labels = self.y[idxes]

            # Counter maps label -> count; most_common(1) returns the single
            # most frequent (label, count) pair. On a tie the label that was
            # encountered first wins, i.e. the one of the nearest tied neighbour.
            final_label = Counter(labels).most_common(1)[0][0]

            results.append(final_label)
        return np.array(results)

In [14]:
my_knn = MyKNeighborsClassifier(n_neighbors=5)

In [15]:
my_knn.fit(X=X_train, y=y_train)

In [16]:
y_pred = my_knn.predict(X=X_test)

In [17]:
y_pred

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 2, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0])

In [18]:
(y_pred == y_test).mean()

np.float64(0.9666666666666667)

### 回归问题：
- 预测的是一个连续数据

In [19]:
import pandas as pd

In [20]:
# Load the housing table from a CSV located relative to the working directory.
# skiprows=1 drops the file's first line before parsing — presumably a
# non-header metadata row, so the second line supplies the column names;
# TODO confirm against the actual file.
data = pd.read_csv(filepath_or_buffer='boston_house_prices.csv', skiprows=1)

In [21]:
data

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [22]:
# Separate features from the target: drop the MEDV column to get the feature
# matrix, keep MEDV alone as the regression target, and convert both to NumPy.
X = data.drop(columns=["MEDV"]).to_numpy()
y = data["MEDV"].to_numpy()

In [23]:
X.shape

(506, 13)

In [24]:
y.shape

(506,)

In [25]:
y

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [26]:
X

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]], shape=(506, 13))

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [28]:
# KNeighborsRegressor是回归器
from sklearn.neighbors import KNeighborsRegressor

In [29]:
knn = KNeighborsRegressor(n_neighbors=5)

In [30]:
knn.fit(X=X_train, y=y_train)

In [31]:
y_pred = knn.predict(X=X_test)

In [32]:
y_pred

array([20.76, 29.54, 23.08, 11.94, 22.  , 21.4 , 22.96, 24.46, 29.96,
       18.26, 10.6 , 10.8 , 17.82, 10.38, 38.84, 25.34, 21.98, 23.44,
       24.32, 27.28, 23.56, 20.32, 19.02, 31.14, 21.16, 12.7 , 18.48,
       22.46, 23.66, 17.4 , 16.04, 20.16, 19.68, 23.  , 24.6 , 19.04,
       10.38, 20.08, 13.44, 17.92, 25.14, 21.16, 20.18, 19.04, 20.2 ,
       23.9 , 23.26, 21.6 , 17.16, 20.04, 18.26, 22.3 , 28.14, 26.28,
       22.46, 22.42, 23.4 , 19.58, 10.  , 22.3 , 32.58, 20.72, 26.28,
       27.82, 22.52, 34.16, 19.04, 20.62, 13.12, 29.34, 31.16, 21.82,
       24.72, 30.06, 22.52, 10.14, 32.1 , 22.84, 25.04, 20.34, 24.96,
       18.22, 16.78, 32.68, 28.02, 29.34, 23.8 , 12.7 , 37.42, 20.56,
       31.76, 12.7 , 22.74, 25.02, 23.76, 22.62, 10.8 , 25.06,  9.54,
       22.34, 29.42, 19.56])

In [33]:
y_test

array([22.6, 50. , 23. ,  8.3, 21.2, 19.9, 20.6, 18.7, 16.1, 18.6,  8.8,
       17.2, 14.9, 10.5, 50. , 29. , 23. , 33.3, 29.4, 21. , 23.8, 19.1,
       20.4, 29.1, 19.3, 23.1, 19.6, 19.4, 38.7, 18.7, 14.6, 20. , 20.5,
       20.1, 23.6, 16.8,  5.6, 50. , 14.5, 13.3, 23.9, 20. , 19.8, 13.8,
       16.5, 21.6, 20.3, 17. , 11.8, 27.5, 15.6, 23.1, 24.3, 42.8, 15.6,
       21.7, 17.1, 17.2, 15. , 21.7, 18.6, 21. , 33.1, 31.5, 20.1, 29.8,
       15.2, 15. , 27.5, 22.6, 20. , 21.4, 23.5, 31.2, 23.7,  7.4, 48.3,
       24.4, 22.6, 18.3, 23.3, 17.1, 27.9, 44.8, 50. , 23. , 21.4, 10.2,
       23.3, 23.2, 18.9, 13.4, 21.9, 24.8, 11.9, 24.3, 13.8, 24.7, 14.1,
       18.7, 28.1, 19.8])

In [34]:
# MAE - Mean Absolute Error: average of |prediction - truth| over the test set.
abs(y_pred - y_test).mean()

np.float64(4.756078431372549)

In [35]:
# MSE - Mean Squared Error: average of (prediction - truth)^2 over the test set.
((y_pred - y_test) ** 2).mean()

np.float64(51.74387450980392)

In [36]:
class MyKNeighborsRegressor(object):
    """
    Minimal brute-force K-nearest-neighbours regressor that follows the
    scikit-learn ``fit`` / ``predict`` calling convention.

    The prediction for a query sample is the mean target of its
    ``n_neighbors`` closest training samples under Euclidean distance.
    """
    def __init__(self, n_neighbors=5):
        """
        Store the hyper-parameters.

        Args:
            n_neighbors: number of nearest training samples averaged for
                each prediction.
        """
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """
        "Train" the model by memorising the training set (KNN is lazy).

        Args:
            X: training features, array-like of shape [n_samples, num_features].
            y: training targets, array-like of shape [n_samples].

        Returns:
            self, so that calls can be chained as with scikit-learn estimators.
        """
        # Coerce to ndarrays: predict() relies on NumPy broadcasting for the
        # distance computation and on positional fancy indexing for the
        # targets. Plain lists would raise TypeError and a pandas Series would
        # be indexed by label instead of by position.
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        return self

    def predict(self, X):
        """
        Predict a continuous value for every query sample.

        Args:
            X: query features, array-like of shape [batch_size, num_features].

        Returns:
            np.ndarray with one predicted value per row of ``X``.
        """
        # Iterating an ndarray yields rows; iterating a DataFrame would yield
        # column names, so normalise the input first.
        X = np.asarray(X)

        # Step 1: find the K nearest neighbours of each sample.
        # Step 2: average the targets of those K neighbours.
        results = []
        for x in X:
            # Euclidean distance from x to every training sample (x is
            # broadcast across the rows of self.X).
            distance = ((self.X - x) ** 2).sum(axis=1) ** 0.5
            # Positions of the K closest training samples, nearest first.
            idxes = distance.argsort()[:self.n_neighbors]
            labels = self.y[idxes]
            # Average over the neighbour axis. For 1-D targets this is the
            # plain mean; for 2-D (multi-output) targets it keeps one mean
            # per target column instead of collapsing them into one number.
            final_label = labels.mean(axis=0)
            results.append(final_label)
        return np.array(results)

In [37]:
knn = MyKNeighborsRegressor(n_neighbors=5)

In [38]:
knn.fit(X=X_train, y=y_train)

In [39]:
y_pred = knn.predict(X=X_test)

In [40]:
abs(y_pred - y_test).mean()

np.float64(4.756078431372549)

In [41]:
((y_pred - y_test) ** 2).mean()

np.float64(51.74387450980392)