In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier
import math

In [5]:
# Load the iris dataset through scikit-learn's load_iris helper.
iris = load_iris()

In [6]:
# The feature matrix is taken from iris.data and the class labels from iris.target.
X = iris.data
y = iris.target

- About data : https://www.kaggle.com/uciml/iris

In [7]:
# Split into train and test sets: test_size=0.3 holds out 30% of the samples for testing,
# and random_state=48 is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=48)

In [8]:
from sklearn.preprocessing import StandardScaler

- Documentation for "StandardScaler" : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [9]:
# Fit the scaler on the training features only, then apply that same fitted
# scaler to the test features (the test split is never used for fitting).
ss = StandardScaler()

X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

In [10]:
# Append the labels as a final column so every row is [features..., label];
# the KNN helpers below read the label with row[-1] and the features with row[:-1].
trainSet=np.column_stack((X_train, y_train))
testSet=np.column_stack((X_test, y_test))

### KNN Classifier Implementation

In [11]:
pd.Series(y).unique() # Shows that the labels take three distinct class values.

array([0, 1, 2], dtype=int64)

In [23]:
from scipy.spatial import distance
from numpy.linalg import inv

# Covariance matrix of the training features, intended for the Mahalanobis distance.
# NOTE(review): getDistance computes the covariance from X_train itself, so nothing
# in the code shown here reads this module-level value.
cov_matrix = np.cov(X_train.T)

def getDistance(p, instance1, instance2):
    """Return the distance between the feature parts of two rows.

    The last element of each row is dropped (``row[:-1]``) before the distance
    is computed, so a trailing label column never influences the result.

    Parameters
    ----------
    p : int
        Metric selector:
        1 -> Manhattan (city-block) distance,
        2 -> Euclidean distance,
        3 -> Mahalanobis distance (covariance taken from the module-level ``X_train``),
        4 -> correlation distance.
    instance1, instance2 : sequence
        Rows whose final element is not a feature.

    Returns
    -------
    float
        The distance under the selected metric.

    Raises
    ------
    ValueError
        If ``p`` is not one of 1, 2, 3 or 4. Previously an unsupported value
        fell through and returned ``None``, which only failed later when the
        distances were sorted.
    """
    features1 = instance1[:-1]
    features2 = instance2[:-1]

    if p == 1:
        # scipy names the Manhattan distance "cityblock".
        return distance.cityblock(features1, features2)

    if p == 2:
        return distance.euclidean(features1, features2)

    if p == 3:
        # scipy's mahalanobis expects the INVERSE covariance matrix.
        # It is rebuilt on every call so it always reflects the current
        # X_train; hoist it out if this ever becomes a bottleneck.
        inverse_covariance = inv(np.cov(X_train.T))
        return distance.mahalanobis(features1, features2, inverse_covariance)

    if p == 4:
        return distance.correlation(features1, features2)

    raise ValueError(
        "Unsupported distance selector p={!r}; expected 1, 2, 3 or 4".format(p)
    )

In [24]:
def getNeighbors(p, trainSet, testInstance, k):
    """Return the ``k`` rows of ``trainSet`` closest to ``testInstance``.

    Each result is an ``(index, distance)`` pair, where ``index`` is the row's
    position in ``trainSet`` and ``distance`` comes from
    ``getDistance(p, row, testInstance)``. Pairs are ordered nearest first.
    """
    # Keep the training index next to each distance so the label can be
    # looked up later.
    scored = [
        (row_index, getDistance(p, train_row, testInstance))
        for row_index, train_row in enumerate(trainSet)
    ]

    # Stable sort by distance, then keep only the k closest.
    scored.sort(key=lambda pair: pair[1])
    return scored[:k]

In [25]:
import operator
from collections import Counter

def getResponse(neighbors, train_set=None):
    """Return the majority class among the given neighbours.

    Parameters
    ----------
    neighbors : sequence
        Items whose first element is a row index into the training set
        (the second element, the distance, is not used here).
    train_set : sequence of rows, optional
        Training rows whose last element is the class label. When omitted,
        the module-level ``trainSet`` is used, which is what the original
        single-argument call did.

    Returns
    -------
    The label that occurs most often among the neighbours. On a tie, the
    label that appears first in ``neighbors`` wins.

    Raises
    ------
    ValueError
        If ``neighbors`` is empty.
    """
    if train_set is None:
        # Backward-compatible fallback for callers that pass only neighbors.
        train_set = trainSet

    # Collect just the class label (last column) of each neighbour.
    labels = [train_set[neighbor[0]][-1] for neighbor in neighbors]
    if not labels:
        raise ValueError("getResponse needs at least one neighbor to vote")

    # max() returns the first maximal element, so ties resolve to the label
    # that appears earliest in the neighbour list.
    counts = Counter(labels)
    return max(labels, key=counts.get)

In [26]:
def getAccuracy(testSet, predictions):
    """Return the fraction of ``testSet`` rows that were predicted correctly.

    ``predictions[i]`` is compared with the last element of ``testSet[i]``.
    The number of matches is divided by ``len(testSet)``, so the result is a
    fraction, not a percentage.
    """
    hits = sum(
        1
        for position, predicted in enumerate(predictions)
        if predicted == testSet[position][-1]
    )
    return hits / len(testSet)

In [27]:
# Settings for the first run: number of neighbours and distance selector
# (see getDistance for the meaning of each p value).
k = 3
p = 2 # Euclidean distance

def KNN(p, trainSet, testSet, k):
    """Classify every row of ``testSet`` with k-nearest-neighbours and report accuracy.

    Parameters
    ----------
    p : int
        Distance selector forwarded to ``getNeighbors``.
    trainSet : sequence of rows
        Rows searched for neighbours.
    testSet : sequence of rows
        Rows to classify; ``getAccuracy`` compares each prediction with the
        row's last element.
    k : int
        Number of neighbours that vote on each prediction.

    Returns
    -------
    float
        Accuracy as a fraction (correct predictions / ``len(testSet)``).
        The printed line shows the same value as a percentage.
    """
    # NOTE(review): getResponse(neighbors) looks the labels up in the
    # module-level trainSet, so pass that same object as the trainSet
    # argument here or the votes will use the wrong rows.
    predictions = []
    for test_row in testSet:
        neighbors = getNeighbors(p, trainSet, test_row, k)
        predictions.append(getResponse(neighbors))

    accuracy = getAccuracy(testSet, predictions)
    # accuracy is a 0-1 fraction; format it as a real percentage instead of
    # appending '%' to the raw fraction.
    print('Accuracy: {:.2%}'.format(accuracy))

    return accuracy

# Run the classifier with the settings above; the returned accuracy is the cell's value.
KNN(p, trainSet, testSet, k)

Accuracy: 0.8888888888888888%


0.8888888888888888

# 거리를 4가지 방법으로 해서 accuracy비교!
### Manhattan 거리의 정확도가 제일 높게 나온다.

In [28]:
# Evaluate all four distance selectors (p = 1..4) with the current k.
# Note: this loop rebinds the notebook-level p, leaving it at 4 afterwards.
for p in range(1, 5):
    KNN(p, trainSet, testSet, k)

Accuracy: 0.9111111111111111%
Accuracy: 0.8888888888888888%
Accuracy: 0.8444444444444444%
Accuracy: 0.7111111111111111%


# 적절한 k 값 찾기
### accuracy가 높았던 Manhattan 거리와 Euclidean 거리를 사용하도록 한다.

In [30]:
# Search k = 1..19 for the two best-performing metrics and print the best k
# for each. One loop replaces the two copy-pasted sections.
# NOTE(review): KNN scores against testSet, so the k chosen here is tuned on
# the test split; use a validation split or cross-validation for an unbiased pick.
for p, metric_label in ((1, "manhatan"), (2, "euclidean")):
    print(metric_label)
    k_value = [KNN(p, trainSet, testSet, candidate_k) for candidate_k in range(1, 20)]
    # list.index returns the first maximum, so ties resolve to the smallest k;
    # +1 converts the 0-based position to the k value.
    print("k :", k_value.index(max(k_value)) + 1)

manhatan
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.9333333333333333%
Accuracy: 0.9111111111111111%
Accuracy: 0.9555555555555556%
Accuracy: 0.9111111111111111%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
k : 13
euclidean
Accuracy: 0.8666666666666667%
Accuracy: 0.8666666666666667%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.9333333333333333%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.91

#### 클래스가 3개라 k=3일 때 정확도가 제일 높을 줄 알았는데, 훈련 데이터에 따라 달라서 k가 다른 값일 때 더 높게 나오기도 함을 볼 수 있다.