In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier
import math

In [2]:
iris = load_iris()

In [3]:
#iris.data -> features, iris.target -> labels
X = iris.data
y = iris.target

- About data : https://www.kaggle.com/uciml/iris

In [4]:
# split into test and train dataset, and use random_state=48
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=48)

In [5]:
from sklearn.preprocessing import StandardScaler

- Documentation for "StandardScaler" : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [6]:
ss = StandardScaler()

X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

In [7]:
trainSet=np.column_stack((X_train, y_train))
testSet=np.column_stack((X_test, y_test))

### KNN Classifier Implementation

In [8]:
pd.Series(y).unique() # 3가지 종류가 있음을 알 수 있다.

array([0, 1, 2], dtype=int64)

In [9]:
from scipy.spatial import distance
from numpy.linalg import inv

# mahalonobis 거리를 활용하기 위한 공분산행렬이다.
cov_matrix = np.cov(X_train.T)

def getDistance(p, instance1, instance2):
    # p=1 : return Manhattan Distance
    # p=2 : return Eucludean Distance
    # p=3 : return Mahalonobis Distance
    # p=4 : return Correlation Distance
    if p == 1:
        return distance.cityblock(instance1[:-1], instance2[:-1]) #cityblock이 manhattan거리이다.

    elif p == 2:
        return distance.euclidean(instance1[:-1], instance2[:-1])

    elif p == 3:    
        cov_matrix = np.cov(X_train.T)
        return distance.mahalanobis(instance1[:-1], instance2[:-1], inv(cov_matrix))
    
    elif p == 4:
        return distance.correlation(instance1[:-1], instance2[:-1])

In [10]:
def getNeighbors(p, trainSet, testInstance, k):
    
    neighbors = list()
    for i in range(len(trainSet)):
        # trainSet의 index와 거리를 같이 저장한다.
        neighbors.append((i, getDistance(p, trainSet[i], testInstance)))
    
    #이웃중에서 거리가 가장 짧은 k개만 뽑는다
    return sorted(neighbors, key=lambda x:x[1])[:k]

In [11]:
import operator
from collections import Counter

def getResponse(neighbors):
    # 0은 인덱스 1은 거리값
    # 정답(class)값만 담고있는 리스트 생성
    result = [trainSet[each[0]][-1] for each in neighbors]

    # 가장 많이나온 정답(class)가 무엇인지 알려주기
    counter = Counter(result)
    vote = max(result, key=counter.get)

    return vote

In [12]:
def getAccuracy(testSet, predictions):
    answer_count = 0
    for i in range(len(predictions)):
        if predictions[i] == testSet[i][-1]:
            answer_count += 1

    accuracy_score = answer_count / len(testSet)
    return accuracy_score

In [13]:
k = 3
p = 2 # Euclidean distance

def KNN(p, trainSet, testSet, k):
    predictions=[]    
    for i in range(len(testSet)):
        neighbors = getNeighbors(p, trainSet, testSet[i], k)
        result = getResponse(neighbors)
        predictions.append(result)
#         print(str(i) + ' > predicted : ' + str(result) + ', actual : ' + str(testSet[i][-1]))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + str(accuracy) + '%')
    
    return accuracy

KNN(p, trainSet, testSet, k)

Accuracy: 0.8888888888888888%


0.8888888888888888

# 거리를 4가지 방법으로 해서 accuracy비교!
### manhatan거리가 정확률이 제일 좋게 나온다.

In [14]:
for p in range(1, 5):
    KNN(p, trainSet, testSet, k)

Accuracy: 0.9111111111111111%
Accuracy: 0.8888888888888888%
Accuracy: 0.8444444444444444%
Accuracy: 0.7111111111111111%


# 적절한 k 값 찾기
### accuracy가 높았던 manhatan거리와 euclidean거리를 사용하도록 한다.

In [15]:
p = 1
print("manhatan")
# for k in range(1, 10):
k_value = [KNN(p, trainSet, testSet, k) for k in range(1, 20)]
print("k :", k_value.index(max(k_value)) + 1)
p = 2
print("euclidean")
k_value = [KNN(p, trainSet, testSet, k) for k in range(1, 20)]
print("k :", k_value.index(max(k_value)) + 1)

manhatan
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.9333333333333333%
Accuracy: 0.9111111111111111%
Accuracy: 0.9555555555555556%
Accuracy: 0.9111111111111111%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
k : 13
euclidean
Accuracy: 0.8666666666666667%
Accuracy: 0.8666666666666667%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.8888888888888888%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.9333333333333333%
Accuracy: 0.9111111111111111%
Accuracy: 0.9111111111111111%
Accuracy: 0.91

#### 3개의 클래스라 3개가 제일 높을 줄 알았는데 훈련데이터에 따라 달라 k가 다른숫자일 때도 높게 나오기도 함을 볼 수 있다.
#### 하지만 이렇게 하면 testSet에 오버피팅 된 하이퍼파라미터 k를 얻기 때문에 cross validation을 사용해본다!

# sklearn 라이브러리 사용한 knn 구현
# 적절한 k 값 찾기 - Cross Validation(cv=5)

In [16]:
knn = KNeighborsClassifier(n_neighbors=3, p=2)
# distance는 default값 : p=2
# minkowski_distance - euclidean distance와 manhattan distance의 일반화
# p=1 이면 manhattan p=2이면 euclidean
knn.fit(X_train, y_train)

knn.score(X_test, y_test) # 내가 구현한 knn과 값이 똑같음을 알 수 있다.

0.8888888888888888

In [17]:
from sklearn.model_selection import cross_val_score

acc_list =[]

for k in range(1, 21):
    knn_cv = KNeighborsClassifier(n_neighbors=k)

    cv_scores = cross_val_score(knn_cv, X_train, y_train, cv=5)

    print("k =", k, cv_scores)
    print('cv_scores mean:{}'.format(np.mean(cv_scores)))
    acc_list.append((k, np.mean(cv_scores)))

k = 1 [0.95454545 1.         0.95238095 0.95238095 1.        ]
cv_scores mean:0.9718614718614719
k = 2 [0.90909091 0.95454545 0.95238095 0.95238095 1.        ]
cv_scores mean:0.9536796536796537
k = 3 [0.95454545 0.95454545 0.95238095 0.95238095 1.        ]
cv_scores mean:0.9627705627705628
k = 4 [0.95454545 0.95454545 0.95238095 0.95238095 1.        ]
cv_scores mean:0.9627705627705628
k = 5 [0.95454545 0.95454545 0.95238095 0.95238095 1.        ]
cv_scores mean:0.9627705627705628
k = 6 [0.95454545 0.95454545 1.         0.95238095 1.        ]
cv_scores mean:0.9722943722943723
k = 7 [0.95454545 0.95454545 0.95238095 0.95238095 1.        ]
cv_scores mean:0.9627705627705628
k = 8 [0.90909091 0.95454545 0.95238095 0.95238095 1.        ]
cv_scores mean:0.9536796536796537
k = 9 [0.95454545 0.95454545 0.95238095 0.95238095 1.        ]
cv_scores mean:0.9627705627705628
k = 10 [0.90909091 0.95454545 1.         0.95238095 1.        ]
cv_scores mean:0.9632034632034632
k = 11 [0.95454545 0.95454545

In [18]:
max(acc_list, key=lambda x:x[1]) # k=6일 떄 accuracy가 제일 높게 나온다!

(6, 0.9722943722943723)

### grid search library 사용하기

In [19]:
from sklearn.model_selection import GridSearchCV
#create new a knn model
knn2 = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 21)}
#use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn2, param_grid, cv=5)
#fit model to data
knn_gscv.fit(X, y)

print(knn_gscv.best_params_, knn_gscv.best_score_) # 내가 구한 값하고 같음을 볼 수 있다.

{'n_neighbors': 6} 0.98


# k=6을 사용하여 예측하기

In [20]:
knn = KNeighborsClassifier(n_neighbors=6, p=2)
# distance는 default값 : p=2
# minkowski_distance - euclidean distance와 manhattan distance의 일반화
# p=1 이면 manhattan p=2이면 euclidean
knn.fit(X_train, y_train)

knn.score(X_test, y_test) # 3일때 보다 더 좋은 accuracy를 갖게 된다.

0.9333333333333333

# Weighted KNN

In [21]:
knn = KNeighborsClassifier(n_neighbors=6, p=2, weights='distance')
# default는 uniform으로 가중치가 다 같다는 전제하에 있고
# distance로 하면 우리ppt에 있는 거리의 역순으로 가중치를 부여하게 된다.

knn.fit(X_train, y_train)

knn.score(X_test, y_test) 

0.8888888888888888

#### 전보다 결과가 안좋은데 이유는 이미 정규화가 되어있고 교육용 데이터라 이상값이 별로 없기 때문이라고 생각한다.