### 데이터 간의 유사성을 판단하기 위해서는, 거리를 계산합니다.

주로 사용하는 거리 함수(distance function, metric)로는 다음 3가지가 있습니다.


1. Manhattan Distance (L1 distance)


2. Euclidean Distance (L2 distance)


3. Cosine Distance


위의 3가지 metric을 직접 구현해보고 차이점을 확인해봅니다.

In [1]:
import numpy as np

# Two orthogonal unit vectors (first and second axis).
v1, v2 = np.array([1, 0, 0]), np.array([0, 1, 0])

# Two vectors with larger, mixed-sign components.
v3, v4 = np.array([3, 1, 2]), np.array([3, -1, -1])

In [10]:
def manhattan_distance(x, y):
    """Return the manhattan (L1) distance between x and y.

    x, y : np.array
    return : sum of the absolute element-wise differences of x and y
    """
    gaps = np.abs(x - y)
    return np.sum(gaps)

In [9]:
def euclidean_distance(x, y):
    """Return the euclidean (L2) distance between x and y.

    x, y : np.array
    return : square root of the sum of squared element-wise differences
    """
    delta = x - y
    squared_total = (delta ** 2).sum()
    return squared_total ** 0.5

In [6]:
def cosine_distance(x, y):
    """Return the cosine distance between x and y.

    The result is 1 minus the cosine similarity, where the similarity is the
    dot product of the two vectors divided by the product of their L2 norms.

    x, y : np.array
    return : cosine distance between x and y
    """
    def l2_norm(v):
        # Square root of the sum of squares: the vector's distance from the origin.
        return np.sqrt(np.sum(np.square(v)))

    # x @ y is the inner product; (x * y).sum() and np.dot(x, y) are equivalent.
    similarity = (x @ y) / (l2_norm(x) * l2_norm(y))
    return 1 - similarity

In [7]:
euclidean_distance(v1, v2)

1.4142135623730951

In [11]:
manhattan_distance(v1, v2), euclidean_distance(v1, v2)

(2, 1.4142135623730951)

In [12]:
manhattan_distance(v3, v4), euclidean_distance(v3, v4)

(5, 3.605551275463989)

## 샘플 데이터를 이용한 실습 진행

### - 가장 유사한 고객 찾기!

In [13]:
import pandas as pd

# Load the training CSV from the current working directory and display it.
data = pd.read_csv("./train.csv")
data

Unnamed: 0,Id,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,...,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48,Response
0,2,1,D3,10,0.076923,2,1,1,0.641791,0.581818,...,0,0,0,0,0,0,0,0,0,8
1,5,1,A1,26,0.076923,2,3,1,0.059701,0.600000,...,0,0,0,0,0,0,0,0,0,4
2,6,1,E1,26,0.076923,2,3,1,0.029851,0.745455,...,0,0,0,0,0,0,0,0,0,8
3,7,1,D4,10,0.487179,2,3,1,0.164179,0.672727,...,0,0,0,0,0,0,0,0,0,8
4,8,1,D2,26,0.230769,2,3,1,0.417910,0.654545,...,0,0,0,0,0,0,0,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59376,79142,1,D1,10,0.230769,2,3,1,0.074627,0.709091,...,0,0,0,0,0,0,0,0,0,4
59377,79143,1,D3,26,0.230769,2,3,1,0.432836,0.800000,...,0,0,0,0,0,0,0,0,0,7
59378,79144,1,E1,26,0.076923,2,3,1,0.104478,0.745455,...,0,0,0,0,0,0,0,0,0,8
59379,79145,1,D2,10,0.230769,2,3,1,0.507463,0.690909,...,1,0,0,0,0,0,0,0,0,8


#### 6번 고객과 가장 가까운 고객 id 찾기!

In [23]:
# Keep only the first 15 columns (all rows) of the frame.
data = data.iloc[:, : 15]

In [24]:
data

Unnamed: 0,Id,Product_Info_1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,BMI,Employment_Info_1,Employment_Info_2,Employment_Info_3,Employment_Info_4
0,2,1,10,0.076923,2,1,1,0.641791,0.581818,0.148536,0.323008,0.028,12,1,0.00000
1,5,1,26,0.076923,2,3,1,0.059701,0.600000,0.131799,0.272288,0.000,1,3,0.00000
2,6,1,26,0.076923,2,3,1,0.029851,0.745455,0.288703,0.428780,0.030,9,1,0.00000
3,7,1,10,0.487179,2,3,1,0.164179,0.672727,0.205021,0.352438,0.042,9,1,0.00000
4,8,1,26,0.230769,2,3,1,0.417910,0.654545,0.234310,0.424046,0.027,9,1,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59376,79142,1,10,0.230769,2,3,1,0.074627,0.709091,0.320084,0.519103,0.020,1,3,0.00000
59377,79143,1,26,0.230769,2,3,1,0.432836,0.800000,0.403766,0.551119,0.100,9,1,0.00001
59378,79144,1,26,0.076923,2,3,1,0.104478,0.745455,0.246862,0.360969,0.035,9,1,0.00000
59379,79145,1,10,0.230769,2,3,1,0.507463,0.690909,0.276151,0.462452,0.038,9,1,


In [33]:
# Drop columns that are excluded from the distance computation.
# "Product_Info_2" holds string codes (e.g. "D3") that cannot be subtracted.
# NOTE(review): "Employment_Info_1" is absent from the output shown below, so it
# is dropped here as well; the reason is not recorded in this notebook — confirm.
# errors="ignore" keeps the cell safe to re-run once the columns are gone.
data = data.drop(columns=["Product_Info_2", "Employment_Info_1"], errors="ignore")
data

Unnamed: 0,Product_Info_1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,BMI,Employment_Info_2,Employment_Info_3,Employment_Info_4
0,1,10,0.076923,2,1,1,0.641791,0.581818,0.148536,0.323008,12,1,0.00000
1,1,26,0.076923,2,3,1,0.059701,0.600000,0.131799,0.272288,1,3,0.00000
2,1,26,0.076923,2,3,1,0.029851,0.745455,0.288703,0.428780,9,1,0.00000
3,1,10,0.487179,2,3,1,0.164179,0.672727,0.205021,0.352438,9,1,0.00000
4,1,26,0.230769,2,3,1,0.417910,0.654545,0.234310,0.424046,9,1,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59376,1,10,0.230769,2,3,1,0.074627,0.709091,0.320084,0.519103,1,3,0.00000
59377,1,26,0.230769,2,3,1,0.432836,0.800000,0.403766,0.551119,9,1,0.00001
59378,1,26,0.076923,2,3,1,0.104478,0.745455,0.246862,0.360969,9,1,0.00000
59379,1,10,0.230769,2,3,1,0.507463,0.690909,0.276151,0.462452,9,1,


In [35]:
# Set the customer ids aside and remove them from the feature columns so they
# do not contribute to the distances. The guard keeps the cell re-runnable:
# once "Id" has been dropped, a second run would otherwise raise KeyError.
if "Id" in data.columns:
    ids = data["Id"]
    data = data.drop(columns="Id")
data

Unnamed: 0,Product_Info_1,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,BMI,Employment_Info_2,Employment_Info_3,Employment_Info_4
0,1,10,0.076923,2,1,1,0.641791,0.581818,0.148536,0.323008,12,1,0.00000
1,1,26,0.076923,2,3,1,0.059701,0.600000,0.131799,0.272288,1,3,0.00000
2,1,26,0.076923,2,3,1,0.029851,0.745455,0.288703,0.428780,9,1,0.00000
3,1,10,0.487179,2,3,1,0.164179,0.672727,0.205021,0.352438,9,1,0.00000
4,1,26,0.230769,2,3,1,0.417910,0.654545,0.234310,0.424046,9,1,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59376,1,10,0.230769,2,3,1,0.074627,0.709091,0.320084,0.519103,1,3,0.00000
59377,1,26,0.230769,2,3,1,0.432836,0.800000,0.403766,0.551119,9,1,0.00001
59378,1,26,0.076923,2,3,1,0.104478,0.745455,0.246862,0.360969,9,1,0.00000
59379,1,10,0.230769,2,3,1,0.507463,0.690909,0.276151,0.462452,9,1,


### To-do: Id가 6인 고객(행 인덱스 2)과 가장 가까운 고객을 euclidean_distance 기준으로 찾기

In [37]:
# Select the row at position 2 as the query customer; the table printed right
# after loading the CSV shows this row with Id 6.
target_user = data.iloc[2]
target_user

Product_Info_1        1.000000
Product_Info_3       26.000000
Product_Info_4        0.076923
Product_Info_5        2.000000
Product_Info_6        3.000000
Product_Info_7        1.000000
Ins_Age               0.029851
Ht                    0.745455
Wt                    0.288703
BMI                   0.428780
Employment_Info_2     9.000000
Employment_Info_3     1.000000
Employment_Info_4     0.000000
Name: 2, dtype: float64

In [42]:
# Linear scan for the customer closest to `target_user` by euclidean distance.
# Rows are addressed by index label throughout (iteration, the skip test and
# the `ids` lookup), so the result does not depend on labels matching positions.
target_label = target_user.name  # index label of the query row, skipped below
min_idx = None  # label of the best match so far; set on the first comparison
min_distance = float("inf")  # every real distance compares smaller than this
for label, row in data.iterrows():
    if label == target_label:
        continue
    distance = euclidean_distance(target_user, row)
    if distance < min_distance:  # strict "<" keeps the earliest row on ties
        min_distance = distance
        min_idx = label
print("user_ID: ", ids[min_idx])

user_ID:  24878
