In [67]:
import numpy as np
from numpy import ndarray

In [97]:
# Two example clustering solutions for the same six samples:
# position i holds the cluster label assigned to sample i.
first_solution = np.array([1, 1, 2, 2, 3, 3])
second_solution = np.array([3, 3, 2, 2, 2, 1])

## 1. Relabeling the Solution

In [38]:
def get_mode(arr: ndarray):
    """Return the most frequent value in ``arr``.

    ``np.unique`` returns the distinct values in sorted order and ``argmax``
    picks the first maximum, so ties are resolved in favour of the smallest
    value.
    """
    distinct, frequency = np.unique(arr, return_counts=True)
    return distinct[frequency.argmax()]

In [44]:
def calculate_similarity(arr_1: ndarray, arr_2: ndarray):
    """Return the fraction of positions at which two label arrays agree.

    Parameters
    ----------
    arr_1, arr_2 : ndarray
        Label arrays of equal length. The comparison is positional, so the
        two arrays must already use the same label names (e.g. after
        relabeling one solution against the other).

    Returns
    -------
    float
        Number of matching positions divided by ``len(arr_1)``.

    Raises
    ------
    AssertionError
        If the arrays differ in length.
    """
    assert len(arr_1) == len(arr_2), "Arrays should be of same size"

    # Compare the arguments themselves rather than notebook-level globals,
    # so the function works for any pair of solutions and on a fresh kernel.
    match_count = np.sum(np.asarray(arr_1) == np.asarray(arr_2))
    return match_count / len(arr_1)

In [36]:
def relabel_solution(ref_arr: ndarray, arr: ndarray) -> ndarray:
    """Rename the labels of ``arr`` using the label names of ``ref_arr``.

    Every distinct label in ``arr`` is replaced by the most frequent
    ``ref_arr`` label found at the positions where that ``arr`` label occurs.

    Parameters
    ----------
    ref_arr : ndarray
        Reference solution whose label names are adopted.
    arr : ndarray
        Solution to relabel; must have the same shape as ``ref_arr``.

    Returns
    -------
    ndarray
        Array shaped like ``arr`` and typed like ``ref_arr``, holding the
        relabeled solution.

    Raises
    ------
    ValueError
        If the two arrays do not have the same shape.
    """
    ref_arr = np.asarray(ref_arr)
    arr = np.asarray(arr)
    if ref_arr.shape != arr.shape:
        raise ValueError("Arrays should be of same size")

    # The output stores labels taken from ref_arr, so it must use ref_arr's
    # dtype; borrowing arr's dtype would truncate or reject those labels
    # whenever the two solutions use different label types.
    new_solution: ndarray = np.zeros_like(arr, dtype=ref_arr.dtype)

    for label in np.unique(arr):
        # Positions at which the current label occurs in arr.
        members = arr == label

        # Most common reference label among those positions.
        new_solution[members] = get_mode(ref_arr[members])

    return new_solution

In [39]:
# Express second_solution with first_solution's label names: each label of
# second_solution becomes the most common first_solution label at the same positions.
new_sol = relabel_solution(first_solution, second_solution)
new_sol

array([1, 1, 2, 2, 2, 3])

In [43]:
# Fraction of positions where the relabeled solution agrees with first_solution.
calculate_similarity(first_solution, new_sol)

0.8333333333333334

---
## 2. Using `Rand Index`

In [131]:
def get_matrix(arr: ndarray):
    """Build the strictly upper-triangular co-membership matrix of a labeling.

    Parameters
    ----------
    arr : ndarray
        One-dimensional array (or array-like) of cluster labels.

    Returns
    -------
    ndarray
        Integer matrix of shape ``(n, n)`` whose entry ``(i, j)`` with
        ``i < j`` is 1 when elements ``i`` and ``j`` carry the same label and
        0 otherwise. The diagonal and the lower triangle are all zeros.

    Raises
    ------
    ValueError
        If ``arr`` is not one-dimensional.
    """
    labels = np.asarray(arr)
    if labels.ndim != 1:
        raise ValueError("arr must be one-dimensional")

    # Broadcasting a column against a row compares every pair of labels.
    same_label = labels[:, np.newaxis] == labels

    # k=1 drops the diagonal together with the lower triangle, so each
    # unordered pair is represented exactly once.
    return np.triu(same_label, k=1).astype('int')

In [132]:
# Co-membership matrices of the two solutions: entry (i, j), i < j, is 1
# when samples i and j share a label within that solution.
a = get_matrix(first_solution)
b = get_matrix(second_solution)

In [133]:
# Co-membership matrix of first_solution.
a

array([[0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0]])

In [134]:
# Co-membership matrix of second_solution.
b

array([[0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [143]:
# Flatten the entries strictly above the diagonal (k=1) so that every
# unordered pair of samples contributes exactly one 0/1 value.
a_values = a[np.triu_indices_from(a, k=1)]
b_values = b[np.triu_indices_from(b, k=1)]

In [152]:
# Pairs that share a label in both solutions.
count = np.sum((a_values == 1) & (b_values == 1))
count

2

In [153]:
# Pairs that are separated in both solutions.
np.sum((a_values == 0) & (b_values == 0))

10

In [154]:
# Pairs grouped together in first_solution but separated in second_solution.
np.sum((a_values == 1) & (b_values == 0))

1

In [157]:
# Pairs separated in first_solution but grouped together in second_solution.
np.sum((a_values == 0) & (b_values == 1))

2

In [196]:
def get_confusion_matrix(sol_1: ndarray, sol_2: ndarray):
    """Count how two clusterings agree on every unordered pair of samples.

    Each pair is tagged ``(x, y)``, where ``x`` is 1 when ``sol_1`` gives both
    samples the same label (0 otherwise) and ``y`` is the same indicator for
    ``sol_2``. The returned matrix holds the pair counts in this layout:

    | (1, 1) | (1, 0) |
    | (0, 1) | (0, 0) |

    Parameters
    ----------
    sol_1, sol_2 : ndarray
        One-dimensional label arrays of equal length.

    Returns
    -------
    ndarray
        Integer array of shape ``(2, 2)`` laid out as shown above.

    Raises
    ------
    ValueError
        If the inputs are not one-dimensional or differ in length.
    """
    # Work on the arguments, not on notebook-level globals, so the result
    # reflects the solutions that were actually passed in.
    sol_1 = np.asarray(sol_1)
    sol_2 = np.asarray(sol_2)
    if sol_1.ndim != 1 or sol_1.shape != sol_2.shape:
        raise ValueError("Arrays should be one-dimensional and of same size")

    # Index pairs (i, j) with i < j: each unordered pair exactly once.
    first, second = np.triu_indices(len(sol_1), k=1)
    together_1 = sol_1[first] == sol_1[second]
    together_2 = sol_2[first] == sol_2[second]

    return np.array(
        [
            [np.count_nonzero(together_1 & together_2),
             np.count_nonzero(together_1 & ~together_2)],
            [np.count_nonzero(~together_1 & together_2),
             np.count_nonzero(~together_1 & ~together_2)],
        ],
        dtype=int,
    )

In [199]:
# 2x2 table of pair counts for the two example solutions.
confusion_matrix = get_confusion_matrix(first_solution, second_solution)
confusion_matrix

array([[ 2,  1],
       [ 2, 10]])

In [201]:
def rand_index(cnf_matrx):
    """Compute the Rand index from a 2x2 pair confusion matrix.

    Parameters
    ----------
    cnf_matrx : array-like
        Square matrix of pair counts whose diagonal holds the pairs on which
        the two clusterings agree.

    Returns
    -------
    float
        Sum of the diagonal divided by the sum of all entries. When the
        matrix contains no pairs at all the result is 1.0, matching the
        convention used by scikit-learn's ``rand_score``.
    """
    cnf_matrx = np.asarray(cnf_matrx)
    total_pairs = np.sum(cnf_matrx)

    # Fewer than two samples produce no pairs; avoid 0 / 0 (nan plus a
    # RuntimeWarning) and report perfect agreement instead.
    if total_pairs == 0:
        return 1.0

    return np.trace(cnf_matrx) / total_pairs

In [208]:
# Rand index: pairs on which both solutions agree (the diagonal) over all pairs.
rand_index(confusion_matrix) 

0.8

____
## Checking the result against scikit-learn

In [None]:
from sklearn.metrics.cluster import rand_score

In [None]:
# Reference value from scikit-learn's rand_score for the same two solutions.
ri = rand_score(first_solution, second_solution)
ri 

0.8