In [33]:
from IPython.display import Image
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, fpmax, association_rules
import numpy as np
import pandas as pd

### Mean
    
$${\displaystyle {\bar {x}}={\frac {1}{n}}\left(\sum_{i=1}^{n}{x_{i}}\right)={\frac {x_{1}+x_{2}+\cdots +x_{n}}{n}}}$$

In [34]:
np.mean([1,2,3,4,5,6,7,8,9,10])

5.5

### Median   

if ${\displaystyle n}$ is odd, $${\displaystyle \mathrm {median} (x)=x_{(n+1)/2}}$$
if ${\displaystyle n}$ is even, $${\displaystyle \mathrm {median} (x)={\frac {x_{(n/2)}+x_{((n/2)+1)}}{2}}}$$
also, 
$$mean − mode ≈ 3(mean − median)$$
$$midrange = (min + max) / 2$$
$$range = max− min$$

In [35]:
np.median([1,2,3,4,5,6,7,8,9,10])

5.5

### IQR = Q3 − Q1

Outliers are values falling at least 1.5 × IQR above the third quartile or below the first quartile.

<div>
<img src="" width="500"/>
</div>


In [64]:
# data = np.array([30,36,47,50,52,52,56,60,63,70,70,110])
data = np.array([20, 22, 30, 33, 33, 35, 35, 35, 35, 36, 40, 41, 42, 51, 54])
print(f'Len : {len(data)}, sorted : {sorted(data)}')
q1 = np.quantile(data, 0.25, method= 'midpoint')
q2 = np.quantile(data, 0.5, method= 'midpoint')
q3 = np.quantile(data, 0.75, method= 'midpoint')
iqr = q3 - q1
minimum = q1 - 1.5 * iqr
maximum = q3 + 1.5 * iqr
print(f'Q1 : {q1}, Median : {q2}, Q3 : {q3}')
print(f'IQR : {iqr}')
print(f'Minimum : {minimum}, Maximum : {maximum}')

Len : 15, sorted : [20, 22, 30, 33, 33, 35, 35, 35, 35, 36, 40, 41, 42, 51, 54]
Q1 : 33.0, Median : 35.0, Q3 : 40.5
IQR : 7.5
Minimum : 21.75, Maximum : 51.75


### 5−number summary

[Minimum, Q1, Median, Q3, Maximum]

In [37]:
Image(url='https://i.imgur.com/0WovyJS.png', width=500)

### Variance

$$\operatorname {Var} (X)=\operatorname {E} \left[(X-\mu )^{2}\right]$$
$$\operatorname {Var} (X)=\operatorname {Cov} (X,X)$$
$${\displaystyle \operatorname {Var} (X) = \operatorname {E} \left[X^{2}\right]-\operatorname {E} [X]^{2}}$$
$${\displaystyle \operatorname {Var} (X)={\frac {1}{n}}\sum _{i=1}^{n}(x_{i}-\mu )^{2}}$$

In [38]:
np.var([1,2,3,4,5,6,7,8,9,10])

8.25

### Standard deviation

$${\displaystyle \sigma = \sqrt{\operatorname {Var} (X)} =\sqrt{{\frac {1}{n}}\sum _{i=1}^{n}(x_{i}-\mu )^{2}}}$$

In [39]:
np.std([1,2,3,4,5,6,7,8,9,10])

2.8722813232690143

### Dissimilarity matrix
1. Nominal variables
    
    $${\displaystyle d(i,j) = \frac{n\_variables - n\_matches}{n\_variables}}$$
    
2. Ordinal variables 
    
    Map the range of all attributes to $[0, 1]$
    
    $${\displaystyle d(i,j) = \sqrt{(\sum_f{(x_{if} - x_{jf})^{2}})}}$$
    
    where $x_i$ and $x_j$ are two scaled values of different rows for the same variable, summed up over all the ordinal variables 
    
3. Numerical variables

    1. Euclidean distance 
        
        $${\displaystyle d(i,j) = \sqrt{(\sum_f{(x_{if} - x_{jf})^{2}})}}$$
        
    2. Manhattan distance 
        
        $${\displaystyle d(i,j) = (\sum_f{|x_{if} - x_{jf}|})}$$
        
    3. Minkowski distance 
        
        $${\displaystyle d(i,j) = (\sum_f{|x_{if} - x_{jf}|^{p}})^{1/p}}$$

        where $x_i$ and $x_j$ are two scaled values of different rows for the same variable, summed up over all the numerical variables
        
    4. Supremum distance, for $p = \infin$
        
        $${\displaystyle d(i,j) = max_f({|x_{if} - x_{jf}|})}$$
        
4. Binary variables
    1. Symmetric
        
        $${\displaystyle d(i,j) = {M_{01} + M_{10} \over M_{00} + M_{01} + M_{10} + M_{11}}}$$
        
    2. Unsymmetric
        
        $${\displaystyle d(i,j) = {M_{01} + M_{10} \over M_{01} + M_{10} + M_{11}}}$$
        
        Jacard coeffecient, $${\displaystyle J = 1 - d(i,j) = {M_{11} \over M_{01} + M_{10} + M_{11}}}$$

In [40]:
Image(url='https://i.imgur.com/DkbRiVQ.png', width=1000)

In [59]:
# dissimilarity measure

# for nominal data
data_col = ['A', 'B', 'A']
dissimilarity_matrix = np.zeros((len(data_col), len(data_col)), dtype=float)
for i in range(len(data_col)):
    for j in range(i):
        if data_col[i] != data_col[j]:
            dissimilarity_matrix[i][j] = 1
print(f'Dissimilarity matrix : \n{dissimilarity_matrix}')

# for ordinal data / numeric data
data_col = [0, 1, 0.5]
dissimilarity_matrix = np.zeros((len(data_col), len(data_col)))
# scale between 0 and 1 
data_col = (data_col - np.min(data_col)) / (np.max(data_col) - np.min(data_col))
dissimilarity_matrix = np.zeros((len(data_col), len(data_col)), dtype=float)
for i in range(len(data_col)):
    for j in range(i):
        dissimilarity_matrix[i][j] = abs(data_col[i] - data_col[j])
print(f'Dissimilarity matrix : \n{dissimilarity_matrix}')

Dissimilarity matrix : 
[[0. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]
Dissimilarity matrix : 
[[0.  0.  0. ]
 [1.  0.  0. ]
 [0.5 0.5 0. ]]


### Cosine similarity

$${\displaystyle S_{C}(A,B):=\cos(\theta )={\mathbf {A} \cdot \mathbf {B}  \over \|\mathbf {A} \|\|\mathbf {B} \|}={\frac {\sum \limits _{i=1}^{n}{A_{i}B_{i}}}{{\sqrt {\sum \limits _{i=1}^{n}{A_{i}^{2}}}}{\sqrt {\sum \limits _{i=1}^{n}{B_{i}^{2}}}}}}}$$

In [42]:
# calculating cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

a = np.array([5, 0, 3, 0, 2, 0, 0, 2, 0,0])
b = np.array([3, 0, 2, 0, 1, 1, 0, 1, 0,1])
print(f"a * b = {a * b}")
a_dot_b = np.dot(a, b)
print(f"a_norm = {a * a}")
a_norm = np.linalg.norm(a)
print(f"b_norm = {b * b}")
b_norm = np.linalg.norm(b)
a_dot_b, a_norm, b_norm, a_dot_b / (a_norm * b_norm)

a * b = [15  0  6  0  2  0  0  2  0  0]
a_norm = [25  0  9  0  4  0  0  4  0  0]
b_norm = [9 0 4 0 1 1 0 1 0 1]


(25, 6.48074069840786, 4.123105625617661, 0.9356014857063997)

### Normalization
1. Min-max normalization
    
    $${\displaystyle x^{\prime}=\frac{x-\min (x)}{\max (x)-\min (x)}}$$
    
    $${\displaystyle x^{\prime}=a+\frac{(x-\min (x))(b-a)}{\max (x)-\min (x)}}$$
    
2. Z-score normalization
    
    $${\displaystyle z=\frac{x-\mu}{\sigma}}$$
    
3. Decimal normalization - normalize to range [−1, +1]

In [43]:
# min-max normalization
data = np.array([1,2,3,4,5,6,7,8,9,10])
data_1 = (data - np.min(data)) / (np.max(data) - np.min(data))
print(f'Min : {np.min(data)}, Max : {np.max(data)}, \ndata_1 : {data_1}')

# z-score normalization
data_2 = (data - np.mean(data)) / np.std(data)
print(f'Mean : {np.mean(data)}, Std : {np.std(data)}, \ndata_2 : {data_2}')

# decimal scaling normalization
data_3 = data / 10
print(f'data_3 : {data_3}')

Min : 1, Max : 10, 
data_1 : [0.         0.11111111 0.22222222 0.33333333 0.44444444 0.55555556
 0.66666667 0.77777778 0.88888889 1.        ]
Mean : 5.5, Std : 2.8722813232690143, 
data_2 : [-1.5666989  -1.21854359 -0.87038828 -0.52223297 -0.17407766  0.17407766
  0.52223297  0.87038828  1.21854359  1.5666989 ]
data_3 : [0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


### Binning 

1. Equal-width binning
    
    $${\displaystyle \Delta = \frac{max - min}{n}}$$
    
    $${\displaystyle x^{\prime} = \frac{x - min}{\Delta}}$$

2. Equal-frequency binning
    
    $${\displaystyle \Delta = \frac{N}{n}}$$
    
    $${\displaystyle x^{\prime} = \frac{rank(x)}{\Delta}}$$

In [44]:
# equal width binning and equal frequency binning

data = np.array([70, 70, 72, 73, 75, 75, 76, 76, 78, 79, 80, 81, 53, 56, 57, 63, 66, 67, 67, 67, 68, 69, 70, 70])
n_bins = 3

# sort data
data = np.sort(data)
print(f'Sorted data : {data}')
print(f'Min : {np.min(data)}, Max : {np.max(data)}, Range : {np.max(data) - np.min(data)}, Count : {len(data)}, Width : {(np.max(data) - np.min(data)) / n_bins}, Depth : {len(data) / n_bins}')

# equal width binning
data_1 = np.array([np.min(data) + (np.max(data) - np.min(data)) / n_bins * i for i in range(n_bins + 1)])
print(f'Equal width binning : {data_1}')
# data split into bins
print(f'Bin 1 : {data[data < data_1[1]]}')
print(f'Bin 2 : {data[(data >= data_1[1]) & (data < data_1[2])]}')
print(f'Bin 3 : {data[data >= data_1[2]]}')

# equal frequency binning
data_2 = np.array([np.quantile(data, i / n_bins) for i in range(n_bins + 1)])
print(f'Equal frequency binning : {data_2}')
# data split into bins
print(f'Bin 1 : {data[data < data_2[1]]}')
print(f'Bin 2 : {data[(data >= data_2[1]) & (data < data_2[2])]}')
print(f'Bin 3 : {data[data >= data_2[2]]}')

Sorted data : [53 56 57 63 66 67 67 67 68 69 70 70 70 70 72 73 75 75 76 76 78 79 80 81]
Min : 53, Max : 81, Range : 28, Count : 24, Width : 9.333333333333334, Depth : 8.0
Equal width binning : [53.         62.33333333 71.66666667 81.        ]
Bin 1 : [53 56 57]
Bin 2 : [63 66 67 67 67 68 69 70 70 70 70]
Bin 3 : [72 73 75 75 76 76 78 79 80 81]
Equal frequency binning : [53.         67.66666667 73.66666667 81.        ]
Bin 1 : [53 56 57 63 66 67 67 67]
Bin 2 : [68 69 70 70 70 70 72 73]
Bin 3 : [75 75 76 76 78 79 80 81]


### Pearson’s correlation coefficient

For a sample, $${\displaystyle r_{x y}=\frac{\sum_{i=1}^n\left(x_i-\bar{x}\right)\left(y_i-\bar{y}\right)}{\sqrt{\sum_{i=1}^n\left(x_i-\bar{x}\right)^2} \sqrt{\sum_{i=1}^n\left(y_i-\bar{y}\right)^2}}}$$

In [45]:
# calculating pearson correlation
x = np.array([20, 10, 23, 5])
y = np.array([30, 5, 29, 10])
x_mean = np.mean(x)
y_mean = np.mean(y)
print(f'x_mean : {x_mean}, y_mean : {y_mean}')
# numerator
print(f'x - x_mean : {x - x_mean}\ny - y_mean : {y - y_mean}')
print(f'(x - x_mean) ** 2 : {(x - x_mean) ** 2}\n(y - y_mean) ** 2 : {(y - y_mean) ** 2}\n(x - x_mean) * (y - y_mean) : {(x - x_mean) * (y - y_mean)}')
numerator = np.sum((x - x_mean) * (y - y_mean))
print(f'numerator : {numerator}')
denominator = np.sqrt(np.sum((x - x_mean) ** 2)) * np.sqrt(np.sum((y - y_mean) ** 2))
print(f'denominator : {denominator}')
print(f'pearson correlation : {numerator / denominator}')

x_mean : 14.5, y_mean : 18.5
x - x_mean : [ 5.5 -4.5  8.5 -9.5]
y - y_mean : [ 11.5 -13.5  10.5  -8.5]
(x - x_mean) ** 2 : [30.25 20.25 72.25 90.25]
(y - y_mean) ** 2 : [132.25 182.25 110.25  72.25]
(x - x_mean) * (y - y_mean) : [63.25 60.75 89.25 80.75]
numerator : 294.0
denominator : 325.36287434186465
pearson correlation : 0.9036064750617149


### $\chi^2$  statistic

The $\chi^2$ statistic is a measure of the difference between expected and observed data in a statistical analysis. It is commonly used to test the null hypothesis that two categorical variables are independent of each other.

In [46]:
# calculating chi-square statistic

data = {'A': {'a': 11, 'b': 5, 'c': 1}, 'B': {'a': 8, 'b': 6, 'c': 8}, 'C': {'a': 3, 'b': 10, 'c': 12}}
data = pd.DataFrame(data).T
data['sum_row'] = data.sum(axis=1)
data.loc['sum_col'] = data.sum(axis=0)
display(data)

print("Expected Values")
data1 = data.copy()
# calculate expected value
for i in data1.index[:-1]:
    for j in data1.columns[:-1]:
        data1.loc[i, j] = data1.loc['sum_col', j] * data1.loc[i, 'sum_row'] / data1.loc['sum_col', 'sum_row']
display(data1)

# calculate chi-square statistic
data2 = data.copy().iloc[:-1, :-1]
for i in data2.index:
    for j in data2.columns:
        data2.loc[i, j] = (data2.loc[i, j] - data1.loc[i, j]) ** 2 / data1.loc[i, j]
display(data2)

chi = data2.sum().sum()
print(f'Chi-squared statistic : {chi}')


Unnamed: 0,a,b,c,sum_row
A,11,5,1,17
B,8,6,8,22
C,3,10,12,25
sum_col,22,21,21,64


Expected Values


Unnamed: 0,a,b,c,sum_row
A,5.84375,5.578125,5.578125,17
B,7.5625,7.21875,7.21875,22
C,8.59375,8.203125,8.203125,25
sum_col,22.0,21.0,21.0,64


Unnamed: 0,a,b,c
A,4.549632,0.059918,3.757397
B,0.02531,0.205763,0.084551
C,3.641023,0.393601,1.757411


Chi-squared statistic : 14.474605180915342


### Decision tree impurity measures

$${\displaystyle \text{Entropy}(t) = -\sum_{c=1}^{C} p(c|t) log_2(p(c|t))}$$

$${\displaystyle Gini(t) = 1 - \sum_{c=1}^{C} [p(c|t)]^2}$$

$$\text{Misclassification error}(t) =  1 - \max_c[p(c|t)]$$

Where $t$ is the current node, $C$ is the number of classes, and $p(c|t)$ is the proportion of the samples that belong to class $c$ at node $t$.

In [47]:
from math import log2
# code for calculating entropy, gini index, and misclassification error

# sample data
data = {'A': 5, 'B': 9}

# calculate entropy
print('\nEntropy')
entropy = 0.0
for c, i in data.items():
    p = i/sum(data.values())
    if p != 0:
        class_entropy = -p*log2(p)
    else:
        class_entropy = 0
    entropy += class_entropy
    print(f'Class : {c}, Count : {i}, Probability : {p:.3f}, Class Entropy : {class_entropy:.3f}')
print(f'Total Entropy : {entropy:.3f}')

# calculate gini index
print('\nGini Index')
gini = 1
for c, i in data.items():
    p = i/sum(data.values())
    gini -= p**2
    print(f'Class : {c}, Count : {i}, Probability : {p:.3f}, Class Gini Index : {p**2:.3f}')
print(f'Total Gini Index : {gini:.3f}')

# calculate misclassification error
print('\nMisclassification Error')
misclassification = 1 - max(data.values())/sum(data.values())
print(f'Misclassification Error : {misclassification:.3f}')




Entropy
Class : A, Count : 5, Probability : 0.357, Class Entropy : 0.531
Class : B, Count : 9, Probability : 0.643, Class Entropy : 0.410
Total Entropy : 0.940

Gini Index
Class : A, Count : 5, Probability : 0.357, Class Gini Index : 0.128
Class : B, Count : 9, Probability : 0.643, Class Gini Index : 0.413
Total Gini Index : 0.459

Misclassification Error
Misclassification Error : 0.357


In [48]:
from collections import Counter

# code for calculating information gain

data = {'a1': {'A': 3, 'B': 2}, 'a2': {'A': 4, 'B': 0}, 'a3': {'A': 2, 'B': 3}}
summed_data = Counter()
for k, v in data.items():
    summed_data.update(v)
summed_data = dict(summed_data)

# calculate entropy
print('\nEntropy before split')
entropy_bef = 0.0
for c, i in summed_data.items():
    p = i/sum(summed_data.values())
    if p != 0:
        class_entropy = -p*log2(p)
    else:
        class_entropy = 0
    entropy_bef += class_entropy
    print(f'Class : {c}, Count : {i}, Probability : {p:.3f}, Class Entropy : {class_entropy:.3f}')
print(f'Total Entropy before split: {entropy_bef:.3f}')

# calculate entropy after split
print('\nEntropy after split')
entropy_af = 0.0
for a, c in data.items():
    print(f'For attribute value {a}')
    entropy = 0.0
    for c, i in c.items():
        p = i/sum(data[a].values())
        if p != 0:
            class_entropy = -p*log2(p)
        else:
            class_entropy = 0
        entropy += class_entropy
        print(f'Class : {c}, Count : {i}, Probability : {p:.3f}, Class Entropy : {class_entropy:.3f}')

    entropy_af += entropy * sum(data[a].values())/sum([sum(data[b].values()) for b in data])
    print(f'Attribute : {a}, Entropy : {entropy:.3f}')
print(f'Total Entropy after split: {entropy_af:.3f}')

# calculate information gain
print(f'Information Gain : {entropy_bef - entropy_af:.3f}')


Entropy before split
Class : A, Count : 9, Probability : 0.643, Class Entropy : 0.410
Class : B, Count : 5, Probability : 0.357, Class Entropy : 0.531
Total Entropy before split: 0.940

Entropy after split
For attribute value a1
Class : A, Count : 3, Probability : 0.600, Class Entropy : 0.442
Class : B, Count : 2, Probability : 0.400, Class Entropy : 0.529
Attribute : a1, Entropy : 0.971
For attribute value a2
Class : A, Count : 4, Probability : 1.000, Class Entropy : -0.000
Class : B, Count : 0, Probability : 0.000, Class Entropy : 0.000
Attribute : a2, Entropy : 0.000
For attribute value a3
Class : A, Count : 2, Probability : 0.400, Class Entropy : 0.529
Class : B, Count : 3, Probability : 0.600, Class Entropy : 0.442
Attribute : a3, Entropy : 0.971
Total Entropy after split: 0.694
Information Gain : 0.247


In [49]:
# code for calculating information gain

data = {'a1': {'A': 3, 'B': 2}, 'a2': {'A': 4, 'B': 0}, 'a3': {'A': 2, 'B': 3}}
summed_data = Counter()
for k, v in data.items():
    summed_data.update(v)
summed_data = dict(summed_data)

# calculate gini index
print('\nGini Index before split')
gini_bef = 1
for c, i in summed_data.items():
    p = i/sum(summed_data.values())
    gini_bef -= p**2
    print(f'Class : {c}, Count : {i}, Probability : {p:.3f}, Class Gini Index : {p**2:.3f}')
print(f'Total Gini Index before split: {gini_bef:.3f}')

# calculate gini index after split
print('\nGini Index after split')
gini_af = 0.0
for a, c in data.items():
    print(f'For attribute value {a}')
    gini = 1
    for c, i in c.items():
        p = i/sum(data[a].values())
        gini -= p**2
        print(f'Class : {c}, Count : {i}, Probability : {p:.3f}, Class Gini Index : {p**2:.3f}')

    gini_af += gini * sum(data[a].values())/sum([sum(data[b].values()) for b in data])
    print(f'Attribute : {a}, Gini Index : {gini:.3f}')
print(f'Total Gini Index after split: {gini_af:.3f}')

# calculate gini gain
print(f'Gini Gain : {gini_bef - gini_af:.3f}')


Gini Index before split
Class : A, Count : 9, Probability : 0.643, Class Gini Index : 0.413
Class : B, Count : 5, Probability : 0.357, Class Gini Index : 0.128
Total Gini Index before split: 0.459

Gini Index after split
For attribute value a1
Class : A, Count : 3, Probability : 0.600, Class Gini Index : 0.360
Class : B, Count : 2, Probability : 0.400, Class Gini Index : 0.160
Attribute : a1, Gini Index : 0.480
For attribute value a2
Class : A, Count : 4, Probability : 1.000, Class Gini Index : 1.000
Class : B, Count : 0, Probability : 0.000, Class Gini Index : 0.000
Attribute : a2, Gini Index : 0.000
For attribute value a3
Class : A, Count : 2, Probability : 0.400, Class Gini Index : 0.160
Class : B, Count : 3, Probability : 0.600, Class Gini Index : 0.360
Attribute : a3, Gini Index : 0.480
Total Gini Index after split: 0.343
Gini Gain : 0.116


In [50]:
Image(url='https://i.imgur.com/0Zn675n.png', width=500)

### Classification Measures

Accuracy = $\frac{TP + TN}{TP + TN + FP + FN}$

Error Rate = $\frac{FP + FN}{TP + TN + FP + FN} = 1 - Accuracy$

Precision = $\frac{TP}{TP + FP}$

Recall / Sensitivity / TPR = $\frac{TP}{TP + FN}$

TNR / Specificity = $\frac{TN}{TN + FP}$

FPR / Fall-out / Type I error = $\frac{FP}{FP + TN} = 1 - TNR$

F1 Score = $\frac{2 * Precision * Recall}{Precision + Recall}$

In [51]:
tp, tn, fp, fn = 10, 20, 30, 40
accuracy = (tp + tn)/(tp + tn + fp + fn)
error_rate = 1 - accuracy
precision = tp/(tp + fp)
recall = tp/(tp + fn)
tnr = tn/(tn + fp)
fpr = fp/(fp + tn)
f1_score = 2 * precision * recall/(precision + recall)
print(f'Accuracy : {accuracy:.3f}, Error Rate : {error_rate:.3f}, Precision : {precision:.3f}, Recall : {recall:.3f}, TNR : {tnr:.3f}, FPR : {fpr:.3f}, F1 Score : {f1_score:.3f}')

Accuracy : 0.300, Error Rate : 0.700, Precision : 0.250, Recall : 0.200, TNR : 0.400, FPR : 0.600, F1 Score : 0.222


In [52]:
# ROC curve

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
data = {'y_true': [1, 0, 1, 0, 0, 0, 1, 0, 1, 1], 'y_pred': [0.25, 0.43, 0.53, 0.76, 0.85, 0.85, 0.85, 0.87, 0.93, 0.95]}
fpr, tpr, thresholds = roc_curve(data['y_true'], data['y_pred'], drop_intermediate=False)
thresholds, tpr, fpr = thresholds.tolist()[::-1], tpr.tolist()[::-1], fpr.tolist()[::-1]
thresholds, tpr, fpr

([0.25, 0.43, 0.53, 0.76, 0.85, 0.87, 0.93, 0.95, 1.95],
 [1.0, 0.8, 0.8, 0.6, 0.6, 0.4, 0.4, 0.2, 0.0],
 [1.0, 1.0, 0.8, 0.8, 0.6, 0.2, 0.0, 0.0, 0.0])

### TF-IDF (Term Frequency - Inverse Document Frequency)

$${\displaystyle \text{tf-idf}(t,d)=\text{tf}(t,d)\cdot \text{idf}(t)}$$

where $t$ is a term and $d$ is a document. The tf-idf value increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus, which helps to adjust for the fact that some words appear more frequently in general.

Here, $$\text{tf}(t,d) = 1 + log(1 + log(f_{t,d}))$$
and $$\text{idf}(t) = log(\frac{1 + |d|}{|d_t|})$$

where $f_{t,d}$ is the frequency of term $t$ in document $d$, and $\sum_{t'} f_{t',d}$ is the total number of terms in document $d$.

### Association Rules

$$\text{support}(A\rightarrow C) = \text{support}(A \cup C), \;\;\; \text{range: } [0, 1]$$

Support is used to measure the abundance or frequency (often interpreted as significance or importance) of an itemset in a database. We refer to an itemset as a "frequent itemset" if you support is larger than a specified minimum-support threshold. Note that in general, due to the downward closure property, all subsets of a frequent itemset are also frequent.

$$\text{confidence}(A\rightarrow C) = \frac{\text{support}(A\rightarrow C)}{\text{support}(A)}, \;\;\; \text{range: } [0, 1]$$

The confidence of a rule A->C is the probability of seeing the consequent in a transaction given that it also contains the antecedent. Note that the metric is not symmetric or directed; for instance, the confidence for A->C is different than the confidence for C->A. The confidence is 1 (maximal) for a rule A->C if the consequent and antecedent always occur together.

$$\text{lift}(A\rightarrow C) = \frac{\text{confidence}(A\rightarrow C)}{\text{support}(C)}, \;\;\; \text{range: } [0, \infty]$$

The lift metric is commonly used to measure how much more often the antecedent and consequent of a rule A->C occur together than we would expect if they were statistically independent. If A and C are independent, the Lift score will be exactly 1.

$$\text{levarage}(A\rightarrow C) = \text{support}(A\rightarrow C) - \text{support}(A) \times \text{support}(C), \;\;\; \text{range: } [-1, 1]$$

Leverage computes the difference between the observed frequency of A and C appearing together and the frequency that would be expected if A and C were independent. A leverage value of 0 indicates independence.

$$\text{conviction}(A\rightarrow C) = \frac{1 - \text{support}(C)}{1 - \text{confidence}(A\rightarrow C)}, \;\;\; \text{range: } [0, \infty]$$

A high conviction value means that the consequent is highly depending on the antecedent. For instance, in the case of a perfect confidence score, the denominator becomes 0 (due to 1 - 1) for which the conviction score is defined as 'inf'. Similar to lift, if items are independent, the conviction is 1.


A **closed** itemset is an itemset for which there exists no proper super-itemset with the same support count. A super-itemset is an itemset that contains all items of another itemset (the subset) and at least one additional item. A **closed frequent** itemset is an itemset that is both closed and frequent in a dataset.

A **maximal frequent** itemset (or max-itemset) is a frequent itemset for which none of its immediate supersets are frequent. In other words, there exists no super-itemset of a maximal frequent itemset that is also frequent in the dataset.



### Apriori and FP Growth algorithms

| Algorithm | Description | Candidate Generation | Pattern Generation | Process | Memory Usage |
| --- | --- | --- | --- | --- | --- |
| Apriori | Generates frequent patterns by making itemsets using pairing such as single, double, triple itemset. | Uses candidate generation. | Generates pattern by pairing the items. | Slower process with exponential increase in runtime as the number of itemsets increases. | Saves a converted version of the database in memory. |
| FP Growth | Generates an FP-Tree for making frequent patterns. | No candidate generation. | Generates pattern by constructing an FP tree. | Faster process with linear increase in runtime as the number of itemsets increases. | Saves a compact version of the conditional FP-Tree for each item in memory. |

In [53]:
# dataset = [
#     ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#     ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#     ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
#     ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
#     ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
# ]
dataset = [
    ['1', '2', '5'],
    ['2', '4'],
    ['2', '3'],
    ['1', '2', '4'],
    ['1', '3'],
    ['2', '3'],
    ['1', '3'],
    ['1', '2', '3', '5'],
    ['1', '2', '3'],
]

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df, min_support=0.22222, use_colnames=True)
# frequent_itemsets = fpgrowth(df, min_support=0.6, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets['count'] = (frequent_itemsets['support'] * len(dataset)).astype(int)
frequent_itemsets
# frequent_itemsets[frequent_itemsets['length'] == 3]
# frequent_itemsets[frequent_itemsets['count'] >= 2]

Unnamed: 0,support,itemsets,length,count
0,0.666667,(1),1,6
1,0.777778,(2),1,7
2,0.666667,(3),1,6
3,0.222222,(4),1,2
4,0.222222,(5),1,2
5,0.444444,"(1, 2)",2,4
6,0.444444,"(3, 1)",2,4
7,0.222222,"(5, 1)",2,2
8,0.444444,"(3, 2)",2,4
9,0.222222,"(4, 2)",2,2


In [54]:
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1),(2),0.666667,0.777778,0.444444,0.666667,0.857143,-0.074074,0.666667
1,(2),(1),0.777778,0.666667,0.444444,0.571429,0.857143,-0.074074,0.777778
2,(3),(1),0.666667,0.666667,0.444444,0.666667,1.0,0.0,1.0
3,(1),(3),0.666667,0.666667,0.444444,0.666667,1.0,0.0,1.0
4,(5),(1),0.222222,0.666667,0.222222,1.0,1.5,0.074074,inf
5,(3),(2),0.666667,0.777778,0.444444,0.666667,0.857143,-0.074074,0.666667
6,(2),(3),0.777778,0.666667,0.444444,0.571429,0.857143,-0.074074,0.777778
7,(4),(2),0.222222,0.777778,0.222222,1.0,1.285714,0.049383,inf
8,(5),(2),0.222222,0.777778,0.222222,1.0,1.285714,0.049383,inf
9,"(3, 1)",(2),0.444444,0.777778,0.222222,0.5,0.642857,-0.123457,0.444444


### K-Means Clustering

In [66]:
import numpy as np

# Define the number of clusters and the number of iterations
K = 2
max_iterations = 3

# Generate some sample data
# data = np.random.randint(0, 10, size=(5, 2))
data = np.array([[1, 1], [2, 2], [2, 3], [1, 2], [5,6], [5, 7], [6, 7], [6, 6]])
print(f"Points: {data}")

# Initialize the centroids by randomly selecting K data points
# centroids = data[np.random.choice(data.shape[0], K, replace=False)]
centroids = np.array([[1, 1], [5, 6]])
centroids = centroids.astype(float)

# Iterate the k-means algorithm
for i in range(max_iterations):
    # Assign each point to the nearest centroid
    distances = np.sqrt(np.sum((data[:, np.newaxis, :] - centroids) ** 2, axis=2))
    labels = np.argmin(distances, axis=1)
    
    # Print the centroids and the distances at each iteration
    print(f"Iteration {i+1}:")
    print(f"Centroids: {centroids}")
    print(f"Distances: {distances}")
    
    # Update the centroids to the mean of the assigned points  
    for k in range(K):
        centroids[k] = np.mean(data[labels == k], axis=0)

Points: [[1 1]
 [2 2]
 [2 3]
 [1 2]
 [5 6]
 [5 7]
 [6 7]
 [6 6]]
Iteration 1:
Centroids: [[1. 1.]
 [5. 6.]]
Distances: [[0.   6.4 ]
 [1.41 5.  ]
 [2.24 4.24]
 [1.   5.66]
 [6.4  0.  ]
 [7.21 1.  ]
 [7.81 1.41]
 [7.07 1.  ]]
Iteration 2:
Centroids: [[1.5 2. ]
 [5.5 6.5]]
Distances: [[1.12 7.11]
 [0.5  5.7 ]
 [1.12 4.95]
 [0.5  6.36]
 [5.32 0.71]
 [6.1  0.71]
 [6.73 0.71]
 [6.02 0.71]]
Iteration 3:
Centroids: [[1.5 2. ]
 [5.5 6.5]]
Distances: [[1.12 7.11]
 [0.5  5.7 ]
 [1.12 4.95]
 [0.5  6.36]
 [5.32 0.71]
 [6.1  0.71]
 [6.73 0.71]
 [6.02 0.71]]


### Heirarchical Clustering

In the context of cluster analysis, linkage measures are used to determine the distance between clusters. There are several common linkage measures used in hierarchical clustering:

Single linkage: The distance between two clusters is defined as the shortest distance between any two points in the different clusters. Mathematically, for clusters $C_i$ and $C_j$, the single linkage distance is given by: $$d(C_i,C_j) = \min_{x \in C_i, y \in C_j} d(x,y)$$

Complete linkage: The distance between two clusters is defined as the longest distance between any two points in the different clusters. Mathematically, for clusters $C_i$ and $C_j$, the complete linkage distance is given by: $$d(C_i,C_j) = \max_{x \in C_i, y \in C_j} d(x,y)$$

Average linkage: The distance between two clusters is defined as the average distance between all pairs of points in the different clusters. Mathematically, for clusters $C_i$ and $C_j$, the average linkage distance is given by: $$d(C_i,C_j) = \frac{1}{|C_i||C_j|} \sum_{x \in C_i} \sum_{y \in C_j} d(x,y)$$

Centroid linkage: The distance between two clusters is defined as the distance between their centroids. Mathematically, for clusters $C_i$ and $C_j$, with centroids $\mu_i$ and $\mu_j$, respectively, the centroid linkage distance is given by: $$d(C_i,C_j) = d(\mu_i,\mu_j)$$

Ward’s method: The distance between two clusters is defined as the increase in the total within-cluster variance that would result from merging them. Mathematically, for clusters $C_i$ and $C_j$, with centroids $\mu_i$ and $\mu_j$, respectively, and sizes $|C_i|$ and $|C_j|$, Ward’s method distance is given by: $$d(C_i,C_j) = \frac{|C_i||C_j|}{|C_i|+|C_j|} d(\mu_i,\mu_j)^2$$

In [63]:
### Linkage Clustering Code
data = np.array([[2, 3], [2, 4], [3, 4], [5, 6], [6, 7], [5, 7], [6,3]])

# Calculate the distance matrix
# set print precision to 2 decimal places
np.set_printoptions(precision=2)
distances = np.sqrt(np.sum((data[:, np.newaxis, :] - data) ** 2, axis=2))
print(f"Distance Matrix: {distances}")

# Initialize the clusters
clusters = [[i] for i in range(len(data))]
print(f"Initial Clusters: {clusters}")

# Iterate the clustering algorithm
while len(clusters) > 1:

    # distance matrix for current clusters
    distance_mat = np.full((len(clusters), len(clusters)), fill_value=0.0, dtype=float)

    # Find the closest clusters
    min_distance = np.inf
    for i in range(len(clusters)):
        for j in range(i+1, len(clusters)):
            # CHOOSE ONE OF THE LINKAGE METHODS
            # manhattan distance
            # distance = np.sum(np.abs(data[clusters[i]] - data[clusters[j]]))

            distance = np.max(distances[clusters[i]][:, clusters[j]]) # complete linkage
            # distance = np.min(distances[clusters[i]][:, clusters[j]]) # single linkage
            # distance = np.mean(distances[clusters[i]][:, clusters[j]]) # average linkage
            distance_mat[i, j] = distance
            if distance < min_distance:
                min_distance = distance
                min_i, min_j = i, j
    
    print(f"Distance matrix: \n{distance_mat}")
    print(f"Closest clusters: {clusters[min_i]} and {clusters[min_j]} (distance: {min_distance:.2f})")

    # Merge the closest clusters
    clusters[min_i] += clusters[min_j]
    del clusters[min_j]
    
    # Print the clusters at each iteration
    print(f"\nClusters: {clusters}")

Distance Matrix: [[0.   1.   1.41 4.24 5.66 5.   4.  ]
 [1.   0.   1.   3.61 5.   4.24 4.12]
 [1.41 1.   0.   2.83 4.24 3.61 3.16]
 [4.24 3.61 2.83 0.   1.41 1.   3.16]
 [5.66 5.   4.24 1.41 0.   1.   4.  ]
 [5.   4.24 3.61 1.   1.   0.   4.12]
 [4.   4.12 3.16 3.16 4.   4.12 0.  ]]
Initial Clusters: [[0], [1], [2], [3], [4], [5], [6]]
Distance matrix: 
[[0.   1.   1.41 4.24 5.66 5.   4.  ]
 [0.   0.   1.   3.61 5.   4.24 4.12]
 [0.   0.   0.   2.83 4.24 3.61 3.16]
 [0.   0.   0.   0.   1.41 1.   3.16]
 [0.   0.   0.   0.   0.   1.   4.  ]
 [0.   0.   0.   0.   0.   0.   4.12]
 [0.   0.   0.   0.   0.   0.   0.  ]]
Closest clusters: [0] and [1] (distance: 1.00)

Clusters: [[0, 1], [2], [3], [4], [5], [6]]
Distance matrix: 
[[0.   1.41 4.24 5.66 5.   4.12]
 [0.   0.   2.83 4.24 3.61 3.16]
 [0.   0.   0.   1.41 1.   3.16]
 [0.   0.   0.   0.   1.   4.  ]
 [0.   0.   0.   0.   0.   4.12]
 [0.   0.   0.   0.   0.   0.  ]]
Closest clusters: [3] and [5] (distance: 1.00)

Clusters: [[0, 1], [

### DBSCAN

* $\epsilon$-neighborhood: The $\epsilon$-neighborhood of a point $p$ is the set of points within a distance $\epsilon$ of $p$.
* MinPts: The minimum number of points required to form a dense region (including the point itself).
* Core point: A point $p$ is a core point if at least MinPts points are within its $\epsilon$-neighborhood.
* Border point: A point $p$ is a border point if it is within the $\epsilon$-neighborhood of a core point, but it is not a core point itself.
* Noise point: A point $p$ is a noise point if it is neither a core point nor a border point.
* For two objects $p$ and $q$, they can be directly density-reachable, density-reachable, or density-connected.
* A density based cluster is a maximal set of density-connected points. A set of points $C \subseteq D$ form a cluster if:
    1. for any two points $p$ and $q$ in $C$, $p$ and $q$ are density-connected.
    2. there does not exist a point $o \in C$ and another object $o' \in D \setminus C$ such that $o$ and $o'$ are density-connected.
* DBSCAN steps:
    1. Find all the neighbor points within $\epsilon$ and identify the core points or visited with more than MinPts neighbors.
    2. For each core point if it is not already assigned to a cluster, create a new cluster.
    3. Find recursively all its density connected points and assign them to the same cluster as the core point.
    4. Iterate through the remaining unvisited points in the dataset. Those points that do not belong to any cluster are noise.

| Method | Pros | Cons |
| --- | --- | --- |
| k-means | - Simple and easy to implement. <br> - Scalable to large datasets. <br> - Can produce tight clusters. | - Assumes spherical clusters of similar size and density. <br> - Sensitive to initial centroid placement. <br> - Requires the number of clusters to be specified in advance. |
| Single linkage | - Can handle non-convex clusters. <br> - Simple and easy to implement. | - Sensitive to noise and outliers. <br> - Can produce elongated or "chained" clusters. |
| Complete linkage | - Less sensitive to noise and outliers than single linkage. <br> - Simple and easy to implement. | - Tends to produce compact clusters of similar size. <br> - Can break large clusters. |
| Average linkage | - Less sensitive to noise and outliers than single linkage. <br> - Simple and easy to implement. | - Tends to produce compact clusters of similar size. |
| DBSCAN | - Can handle clusters of arbitrary shape and size. <br> - Robust to noise and outliers. <br> - Does not require the number of clusters to be specified in advance. | - Sensitive to parameter selection. <br> - Assumes clusters have similar density. |

### Hopkins Statistic

The Hopkins statistic is a measure of cluster tendency. 
- Sample $n$ points $p_1, p_2, \dots, p_n$ from the dataset $D$., for each point $p_i$, find distance to nearest neighbors in $D$. $$ x_i = \min_{v \in D \setminus \{p_i\}} d(p_i, v) $$
- Sample $n$ points $q_1, q_2, \dots, q_n$ from the dataset $D$ uniformly at random, for each point $q_i$, find distance to nearest neighbors in $D - \{q_i\}$. $$ y_i = \min_{v \in D \setminus \{q_i\}} d(q_i, v) $$
- Then the Hopkins statistic is defined as: $$ H = \frac{\sum_{i=1}^n y_i}{\sum_{i=1}^n x_i + \sum_{i=1}^n y_i} $$
- Interpretation:
    1. $H = 0$ if D is highly clustered.
    2. $H = 1$ if D cannot be clustered.
    3. $H = 0.5$ if D is a uniform distribution.
    4. If $H > 0.5$, then D may not have statistically significant clusters.

### Silhouette Coefficient
Calculate the silhouette coefficient for each point $o$ in the dataset $D$:
$$ s(o) = \frac{b(o) - a(o)}{\max(a(o), b(o))} $$
where $a(o)$ is the average distance between $o$ and all other points in the same cluster, and $b(o)$ is the minimum average distance between $o$ and all points in other clusters. 

Silhouette coefficient for cluster $C$ is defined as the average silhouette coefficient of all points in $C$.
Silhouette coefficient for the dataset $D$ is defined as the average silhouette coefficient of all points in $D$.
- Interpretation:
    1. $s(o) = 1$ if $o$ is well clustered.
    2. $s(o) = 0$ if $o$ is on the boundary of two clusters.
    3. $s(o) = -1$ if $o$ is assigned to the wrong cluster.

### Uni-variate Outliers using MLE

* Learn the parameters of the normal distribution $µ$ and $σ$ using Maximum Likelihood Method (MLE).
* Identify the points with low probability as outliers.

In [57]:
# using MLE to get outliers
data = np.array([10, 24.0, 28.9, 28.9, 29.0, 29.1, 29.1, 29.2, 29.2, 29.3, 29.4])
mean = np.mean(data)
std = np.std(data)
print(f"Mean: {mean:.2f}, Std: {std:.2f}, 3*Std: {3*std:.2f}, Mean - 3*Std: {mean - 3*std:.2f}, Mean + 3*Std: {mean + 3*std:.2f}")
print(f"Outliers: {data[(data < mean - 3*std) | (data > mean + 3*std)]}")

Mean: 26.92, Std: 5.55, 3*Std: 16.65, Mean - 3*Std: 10.27, Mean + 3*Std: 43.56
Outliers: [10.]


IDS Calculators

1. Confusion matrix : https://onlineconfusionmatrix.com/
2. Information Gain : https://planetcalc.com/8421/
3. FP Tree: https://planktonfun.github.io/FPTreeSimulator/
4. Single, Complete, and Average linkage with Dendrogram: https://people.revoledu.com/kardi/tutorial/Clustering/Online-Hierarchical-Clustering.html