# KNN

In [2]:
import numpy as np

In [45]:
from collections import Counter

In [44]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    The inputs are subtracted element-wise, so they must be mutually
    subtractable (e.g. two NumPy vectors of the same length, or two scalars).
    """
    delta = a - b
    squared = delta ** 2
    total = np.sum(squared)
    return np.sqrt(total)

## Class redefinition

In [72]:
class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Samples and labels may be pandas objects, NumPy arrays or plain (nested)
    sequences; they are converted with ``np.asarray`` so no pandas-specific
    attribute is required.
    """

    def __init__(self, k=3):
        """Create a classifier that votes among the ``k`` closest training samples."""
        self.k = k
        self.X_train = None
        self.Y_train = None

    def fit(self, X, Y):
        """Store the training samples ``X`` and their labels ``Y``.

        Nothing is learned here: the data is only kept for the distance
        computations done at prediction time.
        """
        self.X_train = np.asarray(X)
        self.Y_train = np.asarray(Y)

    def predict(self, X):
        """Return a NumPy array with one predicted label per sample (row) of ``X``."""
        predicted_labels = [self.predict_(x) for x in np.asarray(X)]
        return np.array(predicted_labels)

    def predict_(self, X):
        """Predict the label of a single sample ``X`` (one feature vector)."""
        # Euclidean distance from the sample to every training row at once;
        # broadcasting replaces the per-row Python loop.
        diffs = self.X_train - np.asarray(X)
        # Flatten each sample's differences so single-feature (1-D) training
        # data works the same way as a 2-D feature matrix.
        squared = (diffs ** 2).reshape(len(diffs), -1)
        distances = np.sqrt(squared.sum(axis=1))

        # Indices of the k closest training samples, nearest first.
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_label = [self.Y_train[i] for i in k_indices]

        # Majority vote. On a tie, Counter keeps first-encountered order, so
        # the tied label that appears earliest in the nearest-first list wins.
        most_common = Counter(k_nearest_label).most_common(1)[0][0]

        return most_common

    def score(self, X, Y):
        """Return the accuracy on ``X``: the fraction of predictions equal to ``Y``.

        Labels are compared position by position.
        """
        y_true = np.asarray(Y)
        return (self.predict(X) == y_true).sum() / y_true.size
        

### Counter test

In [3]:
# Small sanity check of Counter, which KNN.predict_ uses for its majority vote.
tab = [1, 1, 2, 1, 2, 4, 1, 7]
from collections import Counter
# Tally how many times each value occurs in tab.
most = Counter(tab)
most

Counter({1: 4, 2: 2, 4: 1, 7: 1})

In [6]:
# With no argument: every (value, count) pair, highest count first.
most.most_common()

[(1, 4), (2, 2), (4, 1), (7, 1)]

In [8]:
# Indexing the Counter by a value returns that value's count.
most[1]

4

In [10]:
# Only the single most frequent (value, count) pair, still wrapped in a list.
most.most_common(1)

[(1, 4)]

In [12]:
# [0] picks the (value, count) pair, the second [0] picks the value itself.
most.most_common(1)[0][0]

1

### Test

In [13]:
import pandas as pd

In [18]:
from sklearn.datasets import load_iris

In [19]:
# Load the Iris dataset bundled with scikit-learn and display what was returned.
iris = load_iris()
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [20]:
# Wrap the feature matrix in a DataFrame, one column per feature name.
data = pd.DataFrame(data=iris.data, columns=iris.feature_names)

In [21]:
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [23]:
# Translate each integer class index in iris.target into its species name.
target = np.array([iris.target_names[i] for i in iris.target])
target

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolo

In [24]:
# Attach the species names to the feature table as a 'target' column.
data['target'] = target

In [25]:
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [28]:
import matplotlib.pyplot as plt

In [29]:
# Features: only the two petal measurements. Labels: the species name column.
X = data[['petal length (cm)', 'petal width (cm)']]
Y = data['target']

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.3,
    random_state = 1,  # fixed seed so the split is reproducible
    stratify = Y  # stratify the split on the class labels
)

In [73]:
# Classifier that votes among the 3 nearest training samples (k=3).
knn = KNN(3)

In [74]:
# Store the training split inside the classifier.
knn.fit(X_train, Y_train)

In [61]:
X_test

Unnamed: 0,petal length (cm),petal width (cm)
148,5.4,2.3
5,1.7,0.4
6,1.4,0.3
106,4.5,1.7
75,4.4,1.4
71,4.0,1.3
108,5.8,1.8
68,4.5,1.5
111,5.3,1.9
16,1.3,0.4


In [69]:
# Shape of the underlying NumPy array: (number of test samples, number of features).
X_test.values.shape

(45, 2)

In [70]:
X_test.values

array([[5.4, 2.3],
       [1.7, 0.4],
       [1.4, 0.3],
       [4.5, 1.7],
       [4.4, 1.4],
       [4. , 1.3],
       [5.8, 1.8],
       [4.5, 1.5],
       [5.3, 1.9],
       [1.3, 0.4],
       [1.5, 0.4],
       [4.9, 1.8],
       [1.4, 0.2],
       [3.3, 1. ],
       [1.4, 0.2],
       [4.8, 1.4],
       [5.1, 2.3],
       [4.5, 1.5],
       [4.3, 1.3],
       [5.2, 2. ],
       [5.9, 2.1],
       [1.4, 0.1],
       [4.4, 1.4],
       [4.9, 2. ],
       [4.2, 1.3],
       [4.7, 1.6],
       [3. , 1.1],
       [5.7, 2.5],
       [1.6, 0.6],
       [5. , 2. ],
       [1.6, 0.2],
       [1.4, 0.2],
       [4. , 1. ],
       [4. , 1.3],
       [6.3, 1.8],
       [5.5, 2.1],
       [1.4, 0.1],
       [1.6, 0.2],
       [1.4, 0.2],
       [4.7, 1.4],
       [5.7, 2.3],
       [5.1, 1.8],
       [4.6, 1.5],
       [1.5, 0.2],
       [1.5, 0.1]])

In [71]:
# Iterating the 2-D values array yields one row (one sample's features) at a
# time; KNN.predict iterates over X.values in the same way.
for val in X_test.values:
    print(val)

[5.4 2.3]
[1.7 0.4]
[1.4 0.3]
[4.5 1.7]
[4.4 1.4]
[4.  1.3]
[5.8 1.8]
[4.5 1.5]
[5.3 1.9]
[1.3 0.4]
[1.5 0.4]
[4.9 1.8]
[1.4 0.2]
[3.3 1. ]
[1.4 0.2]
[4.8 1.4]
[5.1 2.3]
[4.5 1.5]
[4.3 1.3]
[5.2 2. ]
[5.9 2.1]
[1.4 0.1]
[4.4 1.4]
[4.9 2. ]
[4.2 1.3]
[4.7 1.6]
[3.  1.1]
[5.7 2.5]
[1.6 0.6]
[5. 2.]
[1.6 0.2]
[1.4 0.2]
[4. 1.]
[4.  1.3]
[6.3 1.8]
[5.5 2.1]
[1.4 0.1]
[1.6 0.2]
[1.4 0.2]
[4.7 1.4]
[5.7 2.3]
[5.1 1.8]
[4.6 1.5]
[1.5 0.2]
[1.5 0.1]


In [56]:
Y_test

148     virginica
5          setosa
6          setosa
106     virginica
75     versicolor
71     versicolor
108     virginica
68     versicolor
111     virginica
16         setosa
15         setosa
127     virginica
8          setosa
57     versicolor
47         setosa
76     versicolor
141     virginica
66     versicolor
74     versicolor
147     virginica
102     virginica
12         setosa
65     versicolor
121     virginica
94     versicolor
56     versicolor
98     versicolor
144     virginica
43         setosa
113     virginica
11         setosa
0          setosa
62     versicolor
53     versicolor
107     virginica
112     virginica
37         setosa
25         setosa
1          setosa
50     versicolor
120     virginica
149     virginica
54     versicolor
7          setosa
9          setosa
Name: target, dtype: object

In [75]:
# Accuracy: fraction of test samples whose predicted label equals the true label.
knn.score(X_test, Y_test)

0.9777777777777777

In [77]:
# Predicted species for every test sample, in the row order of X_test.
knn.predict(X_test)

array(['virginica', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'virginica', 'versicolor', 'virginica', 'setosa',
       'setosa', 'virginica', 'setosa', 'versicolor', 'setosa',
       'versicolor', 'virginica', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'setosa', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'setosa', 'virginica',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'setosa', 'setosa', 'setosa', 'versicolor',
       'virginica', 'virginica', 'versicolor', 'setosa', 'setosa'],
      dtype='<U10')