## KNN

### 概览

- KNN用于分类和回归，需要考虑最近的邻居
- 分类就是分组；回归就是预测结果（数字）
- 特征抽取就是将物品（例如水果或用户）转化为一系列可比较的数字
- 能否挑选合适的特征事关KNN算法的成败

### 最近邻分类器(1-nearest neighbor classifier)

In [8]:
from scipy.spatial import distance

class ScrappyKNN():
    """Minimal 1-nearest-neighbour classifier.

    ``fit`` simply memorises the training data; ``predict`` labels each
    query row with the label of the single closest training row, where
    "closest" is measured by Euclidean distance.
    """

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Args:
            X_train: Indexable, sized collection of feature vectors.
            y_train: Labels, indexable in parallel with ``X_train``.
        """
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        return [self.closest(row) for row in X_test]

    def closest(self, row):
        """Return the label of the training sample nearest to ``row``.

        Raises:
            ValueError: If the stored training set is empty.
        """
        # Use the data stored by fit(), not a module-level ``X_train``:
        # the classifier must work regardless of what globals exist.
        n_train = len(self.X_train)
        if n_train == 0:
            raise ValueError("cannot predict: the training set is empty")
        # min() keeps the first minimum it sees, so on a distance tie the
        # sample with the lowest index wins.
        best_index = min(
            range(n_train),
            key=lambda i: self.euc(row, self.X_train[i]),
        )
        return self.y_train[best_index]

    def euc(self, a, b):
        """Return the Euclidean distance between vectors ``a`` and ``b``."""
        return distance.euclidean(a, b)
    
# test
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Smoke test: fit the 1-NN classifier on part of the iris dataset and
# report its accuracy on the remaining part.
iris = load_iris()
X = iris.data    # feature matrix
y = iris.target  # class labels
# No extra arguments are passed, so scikit-learn's default split settings
# apply. NOTE(review): no random_state is given, so the split - and the
# printed accuracy - presumably varies from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

my_classifier = ScrappyKNN()
my_classifier.fit(X_train,y_train)
predictions = my_classifier.predict(X_test)

# Compare the predicted labels with the true test labels.
print(accuracy_score(y_test, predictions))

0.973684210526


### KNN

In [15]:
import numpy as np
import operator
from scipy.spatial import distance

class ScrappyKNN():
    """Minimal k-nearest-neighbours classifier using majority voting.

    ``fit`` memorises the training data and ``k``; ``predict`` labels each
    query row with the most common label among its ``k`` closest training
    rows, where "closest" is measured by Euclidean distance.
    """

    def fit(self, X_train, y_train, k):
        """Store the training data and the number of neighbours to consult.

        Args:
            X_train: Indexable, sized collection of feature vectors.
            y_train: Labels, indexable in parallel with ``X_train``.
            k: Number of nearest neighbours that vote on each prediction.

        Raises:
            ValueError: If ``k`` is smaller than 1 or larger than the
                number of training samples.
        """
        if not 1 <= k <= len(X_train):
            raise ValueError(
                "k must be between 1 and the number of training samples "
                "(%d), got %r" % (len(X_train), k)
            )
        self.X_train = X_train
        self.y_train = y_train
        self.k = k

    def predict(self, X_test):
        """Return a list with one predicted label per row of ``X_test``."""
        return [self.closest_k(row) for row in X_test]

    def closest_k(self, row):
        """Return the majority label among the ``k`` samples nearest to ``row``."""
        # Distance from the query point to every stored training sample.
        # Use the data stored by fit(), not module-level globals.
        distances = np.array(
            [self.euc(row, sample) for sample in self.X_train]
        )
        # Indices of the training samples ordered from nearest to farthest.
        # A stable sort makes equal distances resolve by sample index.
        nearest_first = np.argsort(distances, kind="stable")

        # Tally one vote per neighbour for that neighbour's label.
        class_count = {}
        for idx in nearest_first[:self.k]:
            vote_label = self.y_train[idx]
            class_count[vote_label] = class_count.get(vote_label, 0) + 1

        # Pick the label with the most votes. max() keeps the first maximum
        # in insertion order, so a tied vote goes to the label whose first
        # supporter is nearest to the query.
        winner, _ = max(class_count.items(), key=operator.itemgetter(1))
        return winner

    def euc(self, a, b):
        """Return the Euclidean distance between vectors ``a`` and ``b``."""
        return distance.euclidean(a, b)
    
# test
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Smoke test: fit the k-NN classifier (k=3) on part of the iris dataset
# and report its accuracy on the remaining part.
iris = load_iris()
X = iris.data    # feature matrix
y = iris.target  # class labels
# No extra arguments are passed, so scikit-learn's default split settings
# apply. NOTE(review): no random_state is given, so the split - and the
# printed accuracy - presumably varies from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

my_classifier = ScrappyKNN()
my_classifier.fit(X_train,y_train, k=3)
predictions = my_classifier.predict(X_test)

# Compare the predicted labels with the true test labels.
print(accuracy_score(y_test, predictions))

0.921052631579
