### 데이터 간의 유사성을 판단하기 위해서는, 거리를 계산합니다.

주로 사용하는 거리 함수(distance function, metric)로는 다음 3가지가 있습니다.


1. Manhattan Distance (L1 distance)


2. Euclidean Distance (L2 distance)


3. Cosine Distance


위의 3가지 metric을 직접 구현해보고 차이점을 확인해봅니다.

In [1]:
import numpy as np

# Two orthogonal unit vectors, used below to sanity-check each metric.
v1 = np.array([1, 0, 0])
v2 = np.array([0, 1, 0])

# A second pair with mixed magnitudes and signs
# (not referenced by the cells visible in this section).
v3 = np.array([3, 1, 2])
v4 = np.array([3, -1, -1])

In [2]:
def manhattan_distance(x, y):
    """
    Compute the Manhattan (L1) distance between two vectors.

    x, y : np.array or any array-like (list, tuple, ...) with
           broadcast-compatible shapes
    return : sum of the absolute element-wise differences between x and y
    """
    # np.asarray lets plain Python sequences be passed in; inputs that are
    # already ndarrays are used as-is, without copying.
    diff = np.asarray(x) - np.asarray(y)

    return np.sum(np.abs(diff))

In [3]:
# Sanity check: v1 and v2 differ by 1 in exactly two coordinates, so L1 = 2.
manhattan_distance(v1,v2)

2

In [4]:
def euclidean_distance(x, y):
    """
    Compute the Euclidean (L2) distance between two vectors.

    x, y : np.array or any array-like (list, tuple, ...) with
           broadcast-compatible shapes
    return : square root of the sum of squared element-wise differences
    """
    # np.asarray lets plain Python sequences be passed in; inputs that are
    # already ndarrays are used as-is, without copying.
    diff = np.asarray(x) - np.asarray(y)

    return np.sqrt(np.sum(np.square(diff)))

In [5]:
# Sanity check: same pair under L2 -> sqrt(1**2 + 1**2) = sqrt(2).
euclidean_distance(v1, v2)

1.4142135623730951

In [6]:
def cosine_distance(x, y):
    """
    Compute the cosine distance (1 - cosine similarity) between two vectors.

    x, y : np.array or any array-like (list, tuple, ...) of equal length
    return : 1 - cos(angle between x and y), a value in [0, 2];
             NaN when either vector has zero norm, because the angle is
             undefined in that case
    """
    x = np.asarray(x)
    y = np.asarray(y)

    norms = np.linalg.norm(x) * np.linalg.norm(y)
    if norms == 0:
        # A zero vector has no direction. Return NaN explicitly instead of
        # evaluating 0 / 0, which yields the same NaN plus a RuntimeWarning.
        return np.nan

    similarity = (x @ y) / norms

    # Floating-point rounding can push the similarity marginally outside
    # [-1, 1] (e.g. for parallel vectors), which would produce a tiny
    # negative distance; clip it back into the valid range.
    return 1 - np.clip(similarity, -1.0, 1.0)

In [7]:
# Sanity check: v1 and v2 are orthogonal (dot product 0), so the distance is 1.
cosine_distance(v1,v2)

1.0

### 가장 유사한 메뉴 찾기!

In [8]:
import pandas as pd

# Load the McDonald's menu nutrition table; the path is relative to the
# current working directory.
data = pd.read_csv('data/McDonaldsMenuNutrition.csv')
data

Unnamed: 0,Item,Calories,Calories from\nFat,Total Fat\n(g),Saturated Fat\n(g),Trans Fat\n(g),Cholesterol\n(mg),Sodium \n(mg),Carbs\n(g),Fiber\n(g),Sugars\n(g),Protein\n(g),Weight Watchers\nPnts
0,Hamburger,250.0,80.0,9.0,3.5,0.5,25.0,520.0,31.0,2.0,6.0,12.0,247.5
1,Cheeseburger,300.0,110.0,12.0,6,0.5,40.0,750.0,33.0,2.0,6.0,15.0,297.0
2,Double Cheeseburger,440.0,210.0,23.0,11,1.5,80.0,1150.0,34.0,2.0,7.0,25.0,433.0
3,McDouble,390.0,170.0,19.0,8,1.0,65.0,920.0,33.0,2.0,7.0,22.0,383.0
4,Quarter Pounder® with Cheese,510.0,230.0,26.0,12,1.5,90.0,1190.0,40.0,3.0,9.0,29.0,502.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,Strawberry Banana Smoothie (Medium),260.0,5.0,1.0,0,0.0,5.0,40.0,60.0,3.0,54.0,2.0,312.0
326,Strawberry Banana Smoothie (Small),210.0,5.0,0.5,0,0.0,5.0,35.0,49.0,2.0,44.0,2.0,252.0
327,Wild Berry Smoothie (Large),320.0,10.0,1.0,0.5,0.0,5.0,45.0,75.0,4.0,69.0,3.0,386.5
328,Wild Berry Smoothie (Medium),260.0,5.0,1.0,0,0.0,5.0,35.0,60.0,4.0,55.0,3.0,312.0


In [9]:
# Per column: does it contain at least one missing value?
data.isnull().any(axis=0)

Item                     False
Calories                  True
Calories from\nFat        True
Total Fat\n(g)            True
Saturated Fat\n(g)        True
Trans Fat\n(g)            True
Cholesterol\n(mg)         True
Sodium \n(mg)             True
Carbs\n(g)                True
Fiber\n(g)                True
Sugars\n(g)               True
Protein\n(g)              True
Weight Watchers\nPnts     True
dtype: bool

In [10]:
# Per column: fraction of missing values (mean of the boolean null mask).
data.isnull().mean()

Item                     0.000000
Calories                 0.006061
Calories from\nFat       0.015152
Total Fat\n(g)           0.006061
Saturated Fat\n(g)       0.006061
Trans Fat\n(g)           0.006061
Cholesterol\n(mg)        0.006061
Sodium \n(mg)            0.006061
Carbs\n(g)               0.006061
Fiber\n(g)               0.006061
Sugars\n(g)              0.006061
Protein\n(g)             0.006061
Weight Watchers\nPnts    0.015152
dtype: float64

In [11]:
# Drop the rows labelled 259 and 64 in a single call.
# NOTE(review): presumably these are the two rows that are entirely NaN
# (a null ratio of 0.006061 is 2 out of 330 rows) - confirm against the CSV.
data = data.drop(index=[259, 64])

# Flag entries that are not plain non-negative decimals such as "6" or "3.5".
# The decimal point is escaped so it only matches a literal "."; an unescaped
# "." would accept any stray character (e.g. "5a"). na=False keeps the mask
# strictly boolean and reports missing values as non-matching.
cond = data['Saturated Fat\n(g)'].str.contains(r'^[0-9]+(?:\.[0-9]+)?$', na=False)
cond[~cond]

102    False
Name: Saturated Fat\n(g), dtype: bool

In [12]:
# data['Saturated Fat\n(g)'].apply(float)

In [13]:
# Replace the single malformed entry ('5.5 g', a number with a unit suffix)
# by its numeric value, then convert the whole column to float.
sat_fat_col = 'Saturated Fat\n(g)'
has_unit_suffix = data[sat_fat_col] == '5.5 g'
data.loc[has_unit_suffix, sat_fat_col] = 5.5
data[sat_fat_col] = data[sat_fat_col].map(float)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 328 entries, 0 to 329
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Item                  328 non-null    object 
 1   Calories              328 non-null    float64
 2   Calories from
Fat     325 non-null    float64
 3   Total Fat
(g)         328 non-null    float64
 4   Saturated Fat
(g)     328 non-null    float64
 5   Trans Fat
(g)         328 non-null    float64
 6   Cholesterol
(mg)      328 non-null    float64
 7   Sodium 
(mg)          328 non-null    float64
 8   Carbs
(g)             328 non-null    float64
 9   Fiber
(g)             328 non-null    float64
 10  Sugars
(g)            328 non-null    float64
 11  Protein
(g)           328 non-null    float64
 12  Weight Watchers
Pnts  325 non-null    float64
dtypes: float64(12), object(1)
memory usage: 35.9+ KB


In [14]:
# Keep every column whose dtype is not `object`, i.e. drop the text column(s)
# and retain the numeric nutrition features.
is_numeric_col = data.dtypes != 'object'
numeric_data = data.loc[:, is_numeric_col]
numeric_data

Unnamed: 0,Calories,Calories from\nFat,Total Fat\n(g),Saturated Fat\n(g),Trans Fat\n(g),Cholesterol\n(mg),Sodium \n(mg),Carbs\n(g),Fiber\n(g),Sugars\n(g),Protein\n(g),Weight Watchers\nPnts
0,250.0,80.0,9.0,3.5,0.5,25.0,520.0,31.0,2.0,6.0,12.0,247.5
1,300.0,110.0,12.0,6.0,0.5,40.0,750.0,33.0,2.0,6.0,15.0,297.0
2,440.0,210.0,23.0,11.0,1.5,80.0,1150.0,34.0,2.0,7.0,25.0,433.0
3,390.0,170.0,19.0,8.0,1.0,65.0,920.0,33.0,2.0,7.0,22.0,383.0
4,510.0,230.0,26.0,12.0,1.5,90.0,1190.0,40.0,3.0,9.0,29.0,502.0
...,...,...,...,...,...,...,...,...,...,...,...,...
325,260.0,5.0,1.0,0.0,0.0,5.0,40.0,60.0,3.0,54.0,2.0,312.0
326,210.0,5.0,0.5,0.0,0.0,5.0,35.0,49.0,2.0,44.0,2.0,252.0
327,320.0,10.0,1.0,0.5,0.0,5.0,45.0,75.0,4.0,69.0,3.0,386.5
328,260.0,5.0,1.0,0.0,0.0,5.0,35.0,60.0,4.0,55.0,3.0,312.0


In [15]:
# Summary statistics of the raw numeric columns (before any rescaling).
numeric_data.describe()

Unnamed: 0,Calories,Calories from\nFat,Total Fat\n(g),Saturated Fat\n(g),Trans Fat\n(g),Cholesterol\n(mg),Sodium \n(mg),Carbs\n(g),Fiber\n(g),Sugars\n(g),Protein\n(g),Weight Watchers\nPnts
count,328.0,325.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,325.0
mean,284.618902,88.8,9.987805,4.42378,0.193598,40.228659,338.460366,40.009146,0.881098,28.103659,9.432927,306.426154
std,218.601528,99.605425,11.046254,4.797934,0.47557,79.274382,436.20478,30.97582,1.444554,28.574837,9.41216,232.483311
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,140.0,0.0,0.0,0.0,0.0,5.0,85.0,20.0,0.0,6.0,2.0,153.5
50%,235.0,60.0,7.0,3.5,0.0,20.0,145.0,36.0,0.0,18.0,7.0,267.0
75%,390.0,150.0,16.25,7.0,0.0,40.0,350.0,49.25,1.25,43.0,13.0,395.0
max,1160.0,540.0,60.0,20.0,2.5,575.0,2260.0,203.0,7.0,168.0,48.0,1317.0


In [16]:
# Min-max scaling: map every column onto [0, 1] so that no single feature
# dominates the distance computation because of its unit.
col_min = numeric_data.min()
col_range = numeric_data.max() - col_min
numeric_data = (numeric_data - col_min) / col_range
numeric_data.describe()

Unnamed: 0,Calories,Calories from\nFat,Total Fat\n(g),Saturated Fat\n(g),Trans Fat\n(g),Cholesterol\n(mg),Sodium \n(mg),Carbs\n(g),Fiber\n(g),Sugars\n(g),Protein\n(g),Weight Watchers\nPnts
count,328.0,325.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,328.0,325.0
mean,0.245361,0.164444,0.166463,0.221189,0.077439,0.069963,0.149761,0.197089,0.125871,0.167284,0.196519,0.23267
std,0.18845,0.184454,0.184104,0.239897,0.190228,0.137868,0.193011,0.15259,0.206365,0.170088,0.196087,0.176525
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.12069,0.0,0.0,0.0,0.0,0.008696,0.037611,0.098522,0.0,0.035714,0.041667,0.116553
50%,0.202586,0.111111,0.116667,0.175,0.0,0.034783,0.064159,0.17734,0.0,0.107143,0.145833,0.202733
75%,0.336207,0.277778,0.270833,0.35,0.0,0.069565,0.154867,0.242611,0.178571,0.255952,0.270833,0.299924
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Peek at the first rows of the normalized table.
numeric_data.head()

Unnamed: 0,Calories,Calories from\nFat,Total Fat\n(g),Saturated Fat\n(g),Trans Fat\n(g),Cholesterol\n(mg),Sodium \n(mg),Carbs\n(g),Fiber\n(g),Sugars\n(g),Protein\n(g),Weight Watchers\nPnts
0,0.215517,0.148148,0.15,0.175,0.2,0.043478,0.230088,0.152709,0.285714,0.035714,0.25,0.187927
1,0.258621,0.203704,0.2,0.3,0.2,0.069565,0.331858,0.162562,0.285714,0.035714,0.3125,0.225513
2,0.37931,0.388889,0.383333,0.55,0.6,0.13913,0.50885,0.167488,0.285714,0.041667,0.520833,0.328778
3,0.336207,0.314815,0.316667,0.4,0.4,0.113043,0.40708,0.162562,0.285714,0.041667,0.458333,0.290812
4,0.439655,0.425926,0.433333,0.6,0.6,0.156522,0.526549,0.197044,0.428571,0.053571,0.604167,0.381169


In [18]:
# Use the row at position 1 as the query vector; per the loaded table this
# row is the Cheeseburger.
target = numeric_data.iloc[1]
target

Calories                 0.258621
Calories from\nFat       0.203704
Total Fat\n(g)           0.200000
Saturated Fat\n(g)       0.300000
Trans Fat\n(g)           0.200000
Cholesterol\n(mg)        0.069565
Sodium \n(mg)            0.331858
Carbs\n(g)               0.162562
Fiber\n(g)               0.285714
Sugars\n(g)              0.035714
Protein\n(g)             0.312500
Weight Watchers\nPnts    0.225513
Name: 1, dtype: float64

In [19]:
## TODO: find the menu items most similar to the Cheeseburger.
## NOTE(review): the original task statement asks for euclidean_distance, but
## this cell ranks by manhattan_distance - confirm which metric is intended.
target = numeric_data.iloc[1] # Cheeseburger

# Distance from the target to every row of the normalized table
# (axis=1 passes each row to the lambda as a Series).
data['distance'] = numeric_data.apply(lambda s: manhattan_distance(target.to_numpy(), s.to_numpy()), axis=1)
# Rank rows by ascending distance; the target itself has distance 0.
data['rank'] = data['distance'].rank()
data.head()

Unnamed: 0,Item,Calories,Calories from\nFat,Total Fat\n(g),Saturated Fat\n(g),Trans Fat\n(g),Cholesterol\n(mg),Sodium \n(mg),Carbs\n(g),Fiber\n(g),Sugars\n(g),Protein\n(g),Weight Watchers\nPnts,distance,rank
0,Hamburger,250.0,80.0,9.0,3.5,0.5,25.0,520.0,31.0,2.0,6.0,12.0,247.5,0.511454,2.0
1,Cheeseburger,300.0,110.0,12.0,6.0,0.5,40.0,750.0,33.0,2.0,6.0,15.0,297.0,0.0,1.0
2,Double Cheeseburger,440.0,210.0,23.0,11.0,1.5,80.0,1150.0,34.0,2.0,7.0,25.0,433.0,1.708241,128.0
3,McDouble,390.0,170.0,19.0,8.0,1.0,65.0,920.0,33.0,2.0,7.0,22.0,383.0,0.941149,15.0
4,Quarter Pounder® with Cheese,510.0,230.0,26.0,12.0,1.5,90.0,1190.0,40.0,3.0,9.0,29.0,502.0,2.260757,235.0


In [20]:
# Take the rows ranked below 5, order them by rank, and skip the first one,
# which is the target item itself (distance 0).
closest = data.loc[data['rank'] < 5].sort_values(by='rank')
closest.iloc[1:]

Unnamed: 0,Item,Calories,Calories from\nFat,Total Fat\n(g),Saturated Fat\n(g),Trans Fat\n(g),Cholesterol\n(mg),Sodium \n(mg),Carbs\n(g),Fiber\n(g),Sugars\n(g),Protein\n(g),Weight Watchers\nPnts,distance,rank
0,Hamburger,250.0,80.0,9.0,3.5,0.5,25.0,520.0,31.0,2.0,6.0,12.0,247.5,0.511454,2.0
24,Honey Mustard Snack Wrap® (Crispy),330.0,140.0,16.0,4.5,0.0,30.0,780.0,34.0,1.0,4.0,14.0,324.5,0.655152,3.0
26,Chipotle BBQ Snack Wrap® (Crispy),330.0,140.0,15.0,4.5,0.0,30.0,810.0,35.0,1.0,4.0,14.0,324.5,0.656686,4.0
