<a href="https://colab.research.google.com/github/seoyeon7/ML/blob/main/KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
from sklearn import datasets
iris = datasets.load_iris()
samples = iris.data
feature_names = iris['feature_names']

In [76]:
import pandas as pd
data = pd.DataFrame(samples, columns=feature_names)
data['target'] = iris['target']
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [77]:
data = data.sample(frac=1).reset_index(drop=True)
data['seq'] = data.index
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,seq
0,5.4,3.4,1.7,0.2,0,0
1,6.1,3.0,4.6,1.4,1,1
2,5.7,3.0,4.2,1.2,1,2
3,4.6,3.6,1.0,0.2,0,3
4,4.6,3.2,1.4,0.2,0,4


In [78]:
#train set(80%)과 test set(20%)으로 분류
train_size = int(data.shape[0]*0.80)
test_size = int(data.shape[0]*0.20)

train = data[:train_size]
test = data[test_size:]

In [79]:
# 거리 계산 함수

In [80]:
# Euclidian distance (유클리드 거리)
import math

def get_eucl(row1, row2):
    return math.sqrt(sum([(x1-x2)**2 for x1,x2 in zip(row1,row2)]))

In [81]:
# Cosine Similarity (코사인 유사도)
def get_cos_sim(row1, row2):
    return math.acos(
        sum([x1*x2 for x1,x2 in zip(row1,row2)])/(sum([i**2 for i in row1]) * sum([i**2 for i in row2]))
    )

In [82]:
# Normalized Euclidean distance (정규화된 데이터에 대한 유클리드 거리를 계산)
columns=['sepal length (cm)', 'sepal width (cm)','petal length (cm)','petal width (cm)', 'target', 'seq']
train_n = pd.DataFrame(columns)

train_n['target'] = train['target'].copy()
train_n['seq'] = train['seq'].copy()

train_n['sepal length (cm)'] = train['sepal length (cm)'].apply(
    lambda x: (x-train['sepal length (cm)'].min())/(train['sepal length (cm)'].max()-train['sepal length (cm)'].min()))

train_n['sepal width (cm)'] = train['sepal width (cm)'].apply(
    lambda x: (x-train['sepal width (cm)'].min())/(train['sepal width (cm)'].max()-train['sepal width (cm)'].min()))

train_n['petal length (cm)'] = train['petal length (cm)'].apply(
    lambda x: (x-train['petal length (cm)'].min())/(train['petal length (cm)'].max()-train['petal length (cm)'].min()))

train_n['petal width (cm)'] = train['petal width (cm)'].apply(
    lambda x: (x-train['petal width (cm)'].min())/(train['petal width (cm)'].max()-train['petal width (cm)'].min())) 


In [83]:
# 거리 계산

In [84]:
train_val = train.values
ntrain_val = train_n.values
eud = []
cosine_sim = []
neud = []
l = len(train_val)
for i in range(l):
    eu_distance = []
    cos_sim = []
    neu_distance = []
    for j in range(l):
        if(i!=j):
            index = train_val[j][5]
            nindex = ntrain_val[j][5]
            ed = get_eucl(train_val[i][:-2], train_val[j][:-2])
            cs = get_cos_sim(train_val[i][:-2], train_val[j][:-2])
            neu = get_eucl(ntrain_val[i][:-2], ntrain_val[j][:-2])
            
            eu_distance.append((ed, index))
            cos_sim.append((cs, index))
            neu_distance.append((neu, nindex))
            
    eu_distance.sort(key= lambda x: x[0])
    cos_sim.sort(key= lambda x: x[0])
    neu_distance.sort(key= lambda x: x[0])
    
    eu_distance = [i[1] for i in eu_distance]
    cos_sim = [i[1] for i in cos_sim]
    neu_distance = [i[1] for i in neu_distance]
    
    eud.append(eu_distance)
    cosine_sim.append(cos_sim)
    neud.append(neu_distance)
    

train['euclidean'] = eud
train['cosine_sim'] = cosine_sim
train['n_euclidean'] = neud

TypeError: ignored

In [None]:
# 가까운 요소 k개 선택

def get_near(row, dis,k):
    return row[dis][:k]

In [None]:
# k개 요소 중 dominant class 선택

def get_dominant_class(df, neighbors):
    classes = df[df['seq'].isin(neighbors)]['target']
    return classes.value_counts().index[0]

In [None]:
# 최적화

k = 1
hyper_params = []
acc = {1: {}}

train['eud_{}'.format(k)] = train.apply(lambda x: get_near(x, 'euclidean',k), axis=1)
train['cosim_{}'.format(k)] = train.apply(lambda x: get_near(x, 'cosine_sim',k), axis=1)
train['neud_{}'.format(k)] = train.apply(lambda x: get_near(x, 'n_euclidean',k), axis=1)
    
train['eud_{}_target'.format(k)] = train['eud_{}'.format(k)].apply(lambda row: get_dominant_class(train, row))
train['cosim_{}_target'.format(k)] = train['cosim_{}'.format(k)].apply(lambda row: get_dominant_class(train, row))
train['neud_{}_target'.format(k)] = train['neud_{}'.format(k)].apply(lambda row: get_dominant_class(train, row))
    
hyper_params.append('eud_{}_target'.format(k))
hyper_params.append('cosim_{}_target'.format(k))
hyper_params.append('neud_{}_target'.format(k))
    
acc[k]['eud'] = train[train['target']==train['eud_{}_target'.format(k)]].shape[0]/train.shape[0]
acc[k]['cosine'] = train[train['target']==train['cosim_{}_target'.format(k)]].shape[0]/train.shape[0]
acc[k]['neud'] = train[train['target']==train['neud_{}_target'.format(k)]].shape[0]/train.shape[0]
    
k+=2

In [None]:
#파라미터 시도

cols = ['target'] + hyper_params
train[cols].head()

In [None]:
# k=3일 때

test['seq'] = test.index
test2 = test.values
test_eud = []
l = len(test)
for i in range(l):
    test_eu_distance = []
    for j in range(len(train)):
        index = train_val[j][5]
        ed = get_eucl(test2[i][:-2], train_val[j][:-2])
        test_eu_distance.append((ed, index))
        
    test_eu_distance.sort(key= lambda x: x[0])
    test_eu_distance = [i[1] for i in test_eu_distance]
    test_eud.append(test_eu_distance)
    
test['euclidean'] = test_eud

In [None]:
test['eu'] = test.apply(lambda x: get_near(x, 'euclidean',3), axis=1)
test[columns+['eu']].head()