In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the Iris dataset: iris_X holds the feature rows, iris_y the integer
# class label of each row.
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
print('Number of classes: %d' % len(np.unique(iris_y)))
print('Number of data points: %d' % len(iris_y))

# Per-class views of the feature matrix (rows whose label is 0, 1, 2).
X0 = iris_X[iris_y == 0, :]
X1 = iris_X[iris_y == 1, :]
X2 = iris_X[iris_y == 2, :]

# Show the first five rows of every class. The heading carries the actual
# class label; previously all three headings said "class 0".
for label, X_class in enumerate((X0, X1, X2)):
    print('\nSample from class %d:\n' % label, X_class[:5])

Number of classes: 3
Number of data points: 150

Sample from class 0:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]

Sample from class 0:
 [[7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.9 1.5]
 [5.5 2.3 4.  1.3]
 [6.5 2.8 4.6 1.5]]

Sample from class 0:
 [[6.3 3.3 6.  2.5]
 [5.8 2.7 5.1 1.9]
 [7.1 3.  5.9 2.1]
 [6.3 2.9 5.6 1.8]
 [6.5 3.  5.8 2.2]]


In [3]:
# Hold out 50 samples for testing and train on the rest. A fixed
# random_state makes the shuffle repeatable, so the split (and every
# accuracy computed from it) is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    iris_X, iris_y, test_size=50, random_state=42
)

print("Train size: %d" % len(y_train))
print("Test size: %d" % len(y_test))

Train size: 100
Test size: 50


In [4]:
# 1-nearest-neighbour classifier; p = 2 selects the 2-norm for distances.
# fit() returns the estimator itself, so construction and training chain.
clf = neighbors.KNeighborsClassifier(n_neighbors=1, p=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compare predictions with the true labels on test samples 20..39.
print("Print results for 20 test data points:")
print("Predicted labels: ", y_pred[20:40])
print("Ground truth:     ", y_test[20:40])

Print results for 20 test data points:
Predicted labels:  [2 1 1 2 2 1 2 1 1 0 1 2 0 2 0 1 0 1 0 0]
Ground truth:      [2 1 1 2 1 1 2 1 1 0 1 2 0 2 0 1 0 1 0 0]


In [5]:
# Report the 1NN accuracy as a percentage with two decimals, the same format
# the 10NN cells use ("%2.f" meant width 2 / zero decimals, a transposed "%.2f").
print("Accuracy of 1NN: %.2f %%" % (100 * accuracy_score(y_test, y_pred)))

Accuracy of 1NN: 98 %


In [6]:
# Compute the accuracy by hand: the fraction of predictions that equal the
# true label, expressed as a percentage.
accuracy = (y_pred == y_test).mean() * 100
# "%.2f" keeps two decimals ("%2.f" rounded to an integer); the message typo
# "chính các" is corrected to "chính xác" (accuracy).
print("Độ chính xác KNN khi k = 1 là: %.2f %%" % accuracy)

Độ chính các KNN khi k = 1 là: 98 %


In [7]:
# Same experiment with k = 10 neighbours and the 2-norm; no `weights`
# argument is given, so the classifier's default voting scheme applies.
clf = neighbors.KNeighborsClassifier(n_neighbors=10, p=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy of 10NN with major voting: %.2f %%"
      % (100 * accuracy_score(y_test, y_pred)))

Accuracy of 10NN with major voting: 98.00 %


In [8]:
# Weighted voting: with weights='distance' the 10 neighbours do not vote
# equally (the printed label below describes this as 1/distance weights).
clf = neighbors.KNeighborsClassifier(
    n_neighbors=10,
    p=2,
    weights='distance',
).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy of 10NN (1/distance weights): %.2f %%"
      % (100 * accuracy_score(y_test, y_pred)))

Accuracy of 10NN (1/distance weights): 98.00 %


In [10]:
# Custom neighbour-weighting function
def myweight(distances, sigma2=0.5):
    """Turn neighbour distances into Gaussian-shaped voting weights.

    Computes ``exp(-distances**2 / sigma2)`` element-wise, so a neighbour at
    distance 0 gets weight 1 and the weight decays towards 0 as the distance
    grows.

    Parameters
    ----------
    distances : array-like
        Distances to the neighbours. Any shape; lists are accepted.
    sigma2 : float, default 0.5
        Bandwidth of the kernel (the divisor of the squared distance). Larger
        values make the weights decay more slowly. Must be positive.

    Returns
    -------
    numpy.ndarray
        Weights with the same shape as ``distances``.

    Raises
    ------
    ValueError
        If ``sigma2`` is not strictly positive.
    """
    if sigma2 <= 0:
        raise ValueError("sigma2 must be positive, got %r" % (sigma2,))
    # asarray lets plain Python sequences through; ndarrays pass unchanged.
    distances = np.asarray(distances)
    return np.exp(-distances**2 / sigma2)

# 10-NN again, this time passing the myweight callable as the `weights`
# argument instead of a built-in weighting name.
clf = neighbors.KNeighborsClassifier(
    n_neighbors=10,
    p=2,
    weights=myweight,
).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy of 10NN (customized weights): %.2f %%"
      % (100 * accuracy_score(y_test, y_pred)))

Accuracy of 10NN (customized weights): 98.00 %


## Note:
#### 1. Dự đoán kết quả của dữ liệu mới rất đơn giản, phù hợp khi có ít dữ liệu, dữ liệu có số chiều thấp (cái này có lẽ khá quan trọng khi có ít dữ liệu)
#### 2. Không cần giả sử về phân phối của các class (cái này khá quan trọng)