In [None]:
# in this notebook, we load two cluster assignments, create dictionaries using the shared examples only based on their keys, and measure their similarities.

In [1]:
# let's load a few important packages for checking the cluster similarities
import numpy as np
import pandas as pd
import os
import sys
import json
from collections import Counter
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_samples

In [2]:
# let's load the target/ground-truth cluster assignments. it's in `root_id_type_dict.pkl` file.
import pickle
root_id_type_dict = pickle.load(open('root_id_type_dict.pkl', 'rb'))

In [None]:
# let's load the estimated cluster assignments from `cluster_assignment_dict.npy`
cluster_assignment_dict = np.load('pca_cluster_assignment_dict.npy', allow_pickle=True).item()

In [24]:
# first, let's create a dictionary of the keys of `root_id_type_dict` mapped to integer indices.
root_id_type_dict_keys = list(root_id_type_dict.keys())
root_id_type_dict_keys_dict = {root_id_type_dict_keys[i]: i for i in range(len(root_id_type_dict_keys))}

In [None]:
# create a cluster assignment array using the keys of `root_id_type_dict`.
# to do so, first, we should convert string-based values of `root_id_type_dict` into integer-based values.
root_id_type_dict_cluster_strings = list(set(root_id_type_dict.values()))
root_id_type_dict_values_int = dict([(cluster_string, cid) 
                                     for cid, cluster_string 
                                     in enumerate(root_id_type_dict_cluster_strings)])
print(f'there are {len(root_id_type_dict_values_int)} unique cluster assignments in the ground-truth data.')

# create a cluster assignment array using the keys of `root_id_type_dict`.
# since we use python3, we assume `dict` preserves the order of the items.
ground_truth_assignment = np.array([root_id_type_dict_values_int[root_id_type_dict[key]] 
                                         for key in root_id_type_dict_keys])


In [39]:
# now we do it for `cluster_assignment_dict`.
# first, we should convert values of `cluster_assignment_dict` into integer-based values.
cluster_assignment_dict_cluster_strings = list(set(cluster_assignment_dict.values()))
cluster_assignment_dict_values_int = dict([(cluster_string, cid) 
                                           for cid, cluster_string 
                                           in enumerate(cluster_assignment_dict_cluster_strings)])

# create a cluster assignment array using the keys of `cluster_assignment_dict`.
target_assignment = np.array([cluster_assignment_dict_values_int[value]
                              for key, value in cluster_assignment_dict.items()
                              if key in root_id_type_dict_keys_dict])


In [None]:
# now, let's measure the similarity between the two cluster assignments; `ground_truth_assignment` and `target_assignment`.

# we use the following metrics:
# 1. adjusted_rand_score
# 2. normalized_mutual_info_score
# 3. adjusted_mutual_info_score

# 1. adjusted_rand_score
adjusted_rand_score_value = adjusted_rand_score(ground_truth_assignment, target_assignment)
print(f'adjusted_rand_score: {adjusted_rand_score_value}')

# 2. normalized_mutual_info_score
normalized_mutual_info_score_value = normalized_mutual_info_score(ground_truth_assignment, target_assignment)
print(f'normalized_mutual_info_score: {normalized_mutual_info_score_value}')

# 3. adjusted_mutual_info_score
adjusted_mutual_info_score_value = adjusted_mutual_info_score(ground_truth_assignment, target_assignment)
print(f'adjusted_mutual_info_score: {adjusted_mutual_info_score_value}')

adjusted_rand_score: 0.007057830350906706
normalized_mutual_info_score: 0.08665074286324119
adjusted_mutual_info_score: 0.036285943846389515


KeyboardInterrupt: 

In [38]:
ground_truth_assignment.shape

(47529,)