In [1]:
# import libraries
import numpy as np
from numpy import ndarray

import unittest

## 1. Interval-scaled variables

$$x_i = \{x_{i1}, x_{i2}, \cdots, x_{ip}\}$$
$$x_j = \{x_{j1}, x_{j2}, \cdots, x_{jp}\}$$
Standardize each variable $f$ to a z-score, then compute the distance on the standardized values:
$$ z_{if} = \frac{x_{if} - m_{f}}{s_f} $$

$$ m_f = \frac{1}{n}\sum\limits_{i=1}^{n}x_{if}$$
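$$ s_f = \frac{1}{n}\sum\limits_{i=1}^{n}|x_{if} - m_f|$$

Here $m_f$ is the mean and $s_f$ is the mean absolute deviation of variable $f$ over the $n$ objects. Note that the code below calls `np.std`, i.e. it uses the standard deviation as the spread.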

In [30]:
def standardize(data: ndarray, spread: str = "std") -> ndarray:
    """Return the z-scores ``(data - m) / s`` computed over all of ``data``.

    Args:
        data: Numeric values to standardize. The mean and the spread are taken
            over every element (no axis is selected).
        spread: How ``s`` is measured. ``"std"`` (the default) uses the
            population standard deviation ``np.std``; ``"mad"`` uses the mean
            absolute deviation ``mean(|x - m|)``.

    Returns:
        Float array with the same shape as ``data``. When the spread is zero
        (all values are equal) an all-zero array is returned instead of
        dividing by zero.

    Raises:
        ValueError: If ``spread`` is neither ``"std"`` nor ``"mad"``.
    """
    values = np.asarray(data)
    centered = values - np.mean(values)

    if spread == "std":
        s = np.std(values)
    elif spread == "mad":
        s = np.mean(np.abs(centered))
    else:
        raise ValueError(f"spread must be 'std' or 'mad', got {spread!r}")

    # A constant input has no spread; every value sits exactly on the mean.
    if s == 0:
        return np.zeros_like(centered)
    return centered / s

def minkowski_distance(data1: ndarray, data2: ndarray, d: int):
    """Minkowski distance of order ``d`` between two vectors.

    Evaluates ``(sum(|data1 - data2| ** d)) ** (1 / d)``; ``d = 1`` gives the
    Manhattan distance and ``d = 2`` the Euclidean distance.
    """
    gaps = np.abs(data1 - data2)
    powered_total = np.sum(np.power(gaps, d))
    return np.power(powered_total, 1 / d)

In [37]:
a = np.array([15, 23, 3, 563, 35])
b = np.array([456, 40, 2, 599, 70])
c = np.array([16, 22, 3, 564, 36])

# distances
# Minkowski order shared by every comparison below (1 = Manhattan distance).
d = 1

# Standardize each vector once instead of once per printed pair.
# NOTE(review): each vector is standardized over its own values, whereas the
# formulas above define m_f and s_f per variable across objects -- confirm
# which of the two is intended.
vectors = {"a": a, "b": b, "c": c}
standardized = {name: standardize(vector) for name, vector in vectors.items()}

# Print the raw and the standardized distance for every pair of vectors.
for first, second in [("a", "b"), ("a", "c"), ("b", "c")]:
    print(f"{first}, {second}: Without standardize:",
          minkowski_distance(vectors[first], vectors[second], d))
    print(f"{first}, {second}: After standardize:",
          minkowski_distance(standardized[first], standardized[second], d))

a, b: Without standardize: 530.0
a, b: After standardize: 2.8504878297421103
a, c: Without standardize: 4.0
a, c: After standardize: 0.013668234218137232
b, c: Without standardize: 528.0
b, c: After standardize: 2.843523992852403


***
## 2. Binary Variable
Contingency Matrix
| Objects | 1 | 0 |
| --- | --- | --- |
| 1 | a | b |
| 0 | c | d |

### For the `symmetric` attributes:
$$ d(i, j) = \frac{b + c}{a+b+c+d}$$
This dissimilarity is based on the simple matching coefficient (it equals $1 - \text{SMC}$, where $\text{SMC} = \frac{a + d}{a+b+c+d}$).
### For the `asymmetric` attributes:
$$ d(i, j) = \frac{b + c}{a+b+c}$$
This dissimilarity is based on Jaccard's coefficient (it equals $1 - J$, where $J = \frac{a}{a+b+c}$).

In [68]:
def contingency_matrix(
    data1,
    data2,
    keys=("fever", "cough", "test_1", "test_2", "test_3", "test_4"),
    positive=("Y", "P"),
):
    """Build the 2x2 contingency matrix of two records over binary attributes.

    Args:
        data1: Mapping holding the first object's attribute values.
        data2: Mapping holding the second object's attribute values.
        keys: Attribute names to compare; both mappings must contain all of
            them. Defaults to the patient attributes used in this notebook.
        positive: Collection of values counted as 1; every other value counts
            as 0.

    Returns:
        Integer array ``[[a, b], [c, d]]`` where

        * ``a`` counts attributes positive in both records,
        * ``b`` counts attributes negative in ``data1`` and positive in ``data2``,
        * ``c`` counts attributes positive in ``data1`` and negative in ``data2``,
        * ``d`` counts attributes negative in both records.

    Raises:
        KeyError: If either mapping lacks one of ``keys``.
    """
    flags_1 = [data1[key] in positive for key in keys]
    flags_2 = [data2[key] in positive for key in keys]
    pairs = list(zip(flags_1, flags_2))

    a = sum(1 for first, second in pairs if first and second)
    b = sum(1 for first, second in pairs if not first and second)
    c = sum(1 for first, second in pairs if first and not second)
    # Whatever is left is negative in both records.
    d = len(pairs) - a - b - c

    return np.array([
        [a, b],
        [c, d]
    ])

def simple_matching_coefficient(cont_matrix: ndarray):
    """Dissimilarity for symmetric binary variables.

    Divides the two off-diagonal entries of the 2x2 contingency matrix (the
    attributes on which the objects disagree) by the sum of all four entries.
    """
    disagreements = cont_matrix[0][1] + cont_matrix[1][0]
    total = np.sum(cont_matrix)
    return disagreements / total

def jaccards_coefficient(cont_matrix: ndarray):
    """Dissimilarity for asymmetric binary variables: ``(b + c) / (a + b + c)``.

    Args:
        cont_matrix: 2x2 contingency matrix ``[[a, b], [c, d]]``.

    Returns:
        The share of disagreeing attributes among those that are positive in
        at least one object. When no attribute is positive in either object
        (``a + b + c == 0``) the result is ``0.0`` rather than the NaN that
        ``0 / 0`` would produce.
    """
    disagreements = cont_matrix[0][1] + cont_matrix[1][0]
    # Joint absences (d) carry no information for asymmetric attributes.
    informative = np.sum(cont_matrix) - cont_matrix[1][1]
    if informative == 0:
        return 0.0
    return disagreements / informative

In [69]:
# Patient records; contingency_matrix treats "Y" and "P" as positive (1)
# and any other value as negative (0).
data = [
    {"name": "Jack", "fever": "Y", "cough": "N", "test_1": "P", "test_2": "N", "test_3": "N", "test_4": "N"},
    {"name": "Mary", "fever": "Y", "cough": "N", "test_1": "P", "test_2": "N", "test_3": "P", "test_4": "N"},
    {"name": "Jim", "fever": "Y", "cough": "P", "test_1": "N", "test_2": "N", "test_3": "N", "test_4": "N"},
]

In [70]:
# One contingency matrix per pair of records: (0, 1), (0, 2) and (1, 2).
c_01, c_02, c_12 = (
    contingency_matrix(data[first], data[second])
    for first, second in ((0, 1), (0, 2), (1, 2))
)

print(c_01, c_02, c_12, sep="\n")

[[2 1]
 [0 3]]
[[1 1]
 [1 3]]
[[1 1]
 [2 2]]


In [71]:
# Asymmetric (Jaccard-based) dissimilarity for each pair of records.
j_01, j_02, j_12 = (
    jaccards_coefficient(matrix) for matrix in (c_01, c_02, c_12)
)

print(j_01, j_02, j_12, sep="\n")

0.3333333333333333
0.6666666666666666
0.75


## 3. Nominal Values

### i) Simple Matching:
$$d(i, j) = \frac{p - m}{p}$$
$p$ : Total number of variables

$m$ : Number of matching variables

### ii) One-Hot Encoding
Create one binary attribute for each (variable, value) pair; the result can then be treated like asymmetric binary attributes.

In [72]:
# Nominal records: three color attributes per object.
# NOTE: this rebinds `data`, replacing the patient records defined earlier.
data = [
    {"color_1": "black", "color_2": "red", "color_3": "brown"},
    {"color_1": "golden", "color_2": "red", "color_3": "black"},
    {"color_1": "yellow", "color_2": "brown", "color_3": "black"},
]

In [96]:
def simple_matching(data1, data2):
    """Simple-matching dissimilarity ``(p - m) / p`` between two nominal records.

    Args:
        data1: Mapping of variable name to nominal value for the first object.
        data2: Mapping of variable name to nominal value for the second object.

    Returns:
        Fraction of the ``p`` variables whose values differ. Values are
        compared per variable name, so the key order of the two mappings does
        not matter; a variable of ``data1`` that is absent from ``data2``
        counts as a mismatch. Two empty records have distance ``0.0``.

    Raises:
        AssertionError: If the records hold a different number of variables.
    """
    assert len(data1) == len(data2), "Length should be same"
    p = len(data1)
    if p == 0:
        # No variables to disagree on; avoid dividing by zero.
        return 0.0

    # Sentinel so that a stored None is not mistaken for a missing key.
    absent = object()
    m = sum(1 for key, value in data1.items() if data2.get(key, absent) == value)
    return (p - m) / p
        
def one_hot_encoding(data1, data2):
    """List the distinct ``"<variable>-<value>"`` features of two records.

    The two mappings are walked in parallel, position by position; at each
    position the feature of ``data1`` is considered before that of ``data2``.
    Each feature is listed once, in order of first appearance.

    Args:
        data1: Mapping of variable name to nominal value for the first object.
        data2: Mapping of variable name to nominal value for the second object.

    Returns:
        List of feature-name strings, i.e. the columns a one-hot encoding of
        the two records would have.
    """
    # TODO: the per-record 0/1 vectors over these features are not built yet.
    features = []
    seen = set()
    for item_1, item_2 in zip(data1.items(), data2.items()):
        for key, value in (item_1, item_2):
            feature = f"{key}-{value}"
            if feature not in seen:
                seen.add(feature)
                features.append(feature)
    return features
    

In [77]:
# Only color_2 ("red") agrees between the first two records, so with p = 3
# and m = 1 the expected distance is (3 - 1) / 3.
simple_matching(data[0], data[1])

1


0.6666666666666666

In [97]:
# Run one_hot_encoding on the first two color records.
one_hot_encoding(data[0], data[1])

['color_1-black',
 'color_1-golden',
 'color_2-red',
 'color_3-brown',
 'color_3-black']