In [None]:
%load_ext autoreload
%autoreload 2

# Analyze sub-clusters to see if they are closer to other clusters

The questions to answer are:

1. Which sub-clusters are closer to sub-clusters in another cluster than to the sub-clusters in their own cluster?
2. Which sub-clusters are really close to sub-clusters in other clusters, and how close are they?
3. **How many bad mistakes is the name-to-vec model making?**

Steps:

1. Compute sub-cluster centroids
2. For each sub-cluster, calculate the similarity between that sub-cluster and all other sub-clusters
3. Report any sub-clusters that
   - are very close to a sub-cluster in another cluster, or
   - are closer to a sub-cluster in another cluster than to the nearest sub-cluster in their own cluster, or
   - have two out of their three closest sub-clusters in another cluster, or
   - are closer to the centroid of another cluster than to the centroid of their own cluster

In [None]:
from collections import defaultdict
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm.auto import tqdm

from src.models.biencoder import BiEncoder
from src.models.tokenizer import get_tokenize_function_and_vocab

In [None]:
# configure

# --- what to analyze -------------------------------------------------------
# Name type; interpolated into every data/model path below.
given_surname = "given"

# --- reporting threshold ---------------------------------------------------
# A sub-cluster in a different cluster is reported as "very similar" when its
# similarity exceeds this value.
other_cluster_similarity_threshold = 0.9

# --- tokenizer settings ----------------------------------------------------
nama_bucket = 'nama-data'
vocab_type = 'f'
subword_vocab_size = 2000
# NOTE(review): unlike the other local paths this one has no "../" prefix;
# presumably it is resolved inside `nama_bucket` -- confirm.
subwords_path = f"data/models/fs-{given_surname}-subword-tokenizer-{subword_vocab_size}{vocab_type}.json"

# --- input files -----------------------------------------------------------
# The distance threshold is only used here to select the sub-clusters file.
distance_threshold = 0.65
sub_clusters_path = f"../data/models/sub_clusters_{given_surname}-{distance_threshold}.json"
model_path = f"../data/models/bi_encoder-{given_surname}.pth"
# Not referenced by any other cell visible in this notebook.
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"

## Load data

In [None]:
# Read the sub-cluster definitions from the JSON file and print how many
# top-level entries (clusters) it contains.
with open(sub_clusters_path, mode='rt') as sub_clusters_file:
    clusters = json.loads(sub_clusters_file.read())
print(len(clusters))

In [None]:
# load model
# NOTE(security): torch.load unpickles the file, and unpickling can execute
# arbitrary code -- only load checkpoints that come from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled model objects; if loading fails there, pass
# weights_only=False explicitly (for trusted files only).
model = torch.load(model_path)
# Put the model in inference mode so that any train-time-only layers
# (dropout, batch-norm, ...) do not make the embeddings non-deterministic.
# Assumes the loaded object is a torch.nn.Module -- confirm.
model.eval();

In [None]:
# load tokenize function
# `tokenize` is later applied to each name before it is passed to the model;
# `tokenizer_vocab` is only used here to display its size.
tokenize, tokenizer_vocab = get_tokenize_function_and_vocab(
    subwords_path=subwords_path,
    nama_bucket=nama_bucket,
)
# cell output: number of entries in the tokenizer vocabulary
len(tokenizer_vocab)

## Compute cluster and sub-cluster centroids

In [None]:
def get_sub_cluster_key(cluster_name, sub_cluster_name):
    """Build the composite ``<cluster>/<sub-cluster>`` key for a sub-cluster."""
    return "{}/{}".format(cluster_name, sub_cluster_name)

def get_cluster_from_key(key):
    """Return the cluster part of a key: everything before the first ``/``.

    A key without any ``/`` is returned unchanged.
    """
    cluster_name, _separator, _remainder = key.partition('/')
    return cluster_name

def get_sub_cluster_from_key(key):
    """Return the sub-cluster part of a key: everything after the first ``/``.

    Splitting only once keeps a sub-cluster name that itself contains ``/``
    intact, so the result round-trips with ``get_sub_cluster_key`` (cluster
    names must still not contain ``/``).

    Raises:
        IndexError: if ``key`` contains no ``/`` separator.
    """
    return key.split('/', 1)[1]

In [None]:
# name -> unit-length embedding
name_embeddings = {}
# cluster name -> unit-length centroid of all name embeddings in the cluster
cluster_centroids = {}
# "<cluster>/<sub-cluster>" key -> unit-length centroid of the sub-cluster's name embeddings
sub_cluster_centroids = {}


def _unit_centroid(embeddings):
    """Return the mean of ``embeddings`` rescaled to unit L2 norm."""
    centroid = np.array(embeddings).mean(axis=0)
    return centroid / np.linalg.norm(centroid)


# `clusters` maps cluster name -> {sub-cluster name -> iterable of names}
for cluster_name, cluster in tqdm(clusters.items()):
    cluster_embeddings = []
    for sub_cluster_name, sub_cluster in cluster.items():
        sub_cluster_embeddings = []
        for name in sub_cluster:
            embedding = model.get_embedding(tokenize(name))
            # normalize embedding (out of place, so the array handed back by
            # the model is never modified)
            embedding = embedding / np.linalg.norm(embedding)
            name_embeddings[name] = embedding
            sub_cluster_embeddings.append(embedding)
            cluster_embeddings.append(embedding)
        sub_cluster_centroids[get_sub_cluster_key(cluster_name, sub_cluster_name)] = _unit_centroid(
            sub_cluster_embeddings
        )
    # Normalize the cluster centroid as well: the sub-cluster centroids are
    # unit length, so without this a dot product against a cluster centroid
    # is scaled by that centroid's norm and is not a cosine similarity.
    cluster_centroids[cluster_name] = _unit_centroid(cluster_embeddings)

In [None]:
# Collect the sub-cluster centroids into one numpy array (one entry per
# sub-cluster, in dict order) together with lookups in both directions.
sub_cluster_keys = list(sub_cluster_centroids)  # map index to key
sub_cluster_indices = {key: ix for ix, key in enumerate(sub_cluster_keys)}  # map key to index
all_sub_cluster_centroids = np.array(
    [sub_cluster_centroids[key] for key in sub_cluster_keys]
)

In [None]:
# Collect the cluster centroids into one numpy array (one entry per cluster,
# in dict order) together with lookups in both directions.
cluster_keys = list(cluster_centroids)  # map index to key
cluster_indices = {key: ix for ix, key in enumerate(cluster_keys)}  # map key to index
all_cluster_centroids = np.array(
    [cluster_centroids[key] for key in cluster_keys]
)

## Check similarities

In [None]:
def find_most_similar_centroids(centroid, all_centroids, k):
    """Find the ``k`` rows of ``all_centroids`` with the largest dot product with ``centroid``.

    Args:
        centroid: query vector.
        all_centroids: candidate vectors, one per row.
        k: number of results wanted. Values larger than the number of
            candidates are clamped; ``k <= 0`` yields empty results.

    Returns:
        ``(top_indices, top_similarities)``: two numpy arrays of equal length,
        ordered from most to least similar. ``top_indices`` indexes rows of
        ``all_centroids`` and ``top_similarities`` holds the matching dot products.
    """
    # Calculate the dot product
    similarities = np.dot(all_centroids, centroid)
    # np.argpartition raises if asked for more elements than exist, so never
    # request more neighbours than there are candidates
    k = min(k, similarities.shape[0])
    if k <= 0:
        # without this guard the slice [-0:] below would select every candidate
        no_indices = np.array([], dtype=int)
        return no_indices, similarities[no_indices]
    # Find the indices of the top k most similar vectors (unordered)
    top_indices = np.argpartition(similarities, -k)[-k:]
    # Sort the top_indices by similarity (descending order)
    top_indices = top_indices[np.argsort(similarities[top_indices])[::-1]]
    # Get the similarities of the top k vectors
    top_similarities = similarities[top_indices]
    return top_indices, top_similarities

def calc_similarity_to_sub_cluster(sub_cluster_key1, sub_cluster_key2):
    """Return the dot product of the stored centroids of two sub-clusters.

    Both arguments are keys into ``sub_cluster_centroids``.
    """
    first_centroid, second_centroid = (
        np.array(sub_cluster_centroids[sub_cluster_key])
        for sub_cluster_key in (sub_cluster_key1, sub_cluster_key2)
    )
    return np.dot(first_centroid, second_centroid)

def calc_similarity_to_cluster(sub_cluster_key, cluster_key):
    """Return the dot product of a sub-cluster centroid and a cluster centroid.

    ``sub_cluster_key`` is looked up in ``sub_cluster_centroids`` and
    ``cluster_key`` in ``cluster_centroids``.
    """
    return np.dot(
        np.array(sub_cluster_centroids[sub_cluster_key]),
        np.array(cluster_centroids[cluster_key]),
    )

In [None]:
# check possible mistakes
def check_mistake(key, other_key):
    """Print a side-by-side view of a sub-cluster and a suspicious neighbour.

    Output, in order:
      * a ``THIS`` line with the sub-cluster ``key``, its names and its
        similarity to its own cluster centroid;
      * one line per sub-cluster of the same cluster (including ``key``
        itself) with the similarity to ``key`` and that sub-cluster's names;
      * an ``OTHER`` line with ``other_key``, its similarity to ``key`` and
        its names.
    """
    own_sub_cluster = get_sub_cluster_from_key(key)
    own_cluster = get_cluster_from_key(key)
    own_cluster_similarity = calc_similarity_to_cluster(key, own_cluster)
    siblings = clusters[own_cluster]
    print('THIS', key, siblings[own_sub_cluster], own_cluster_similarity)

    for sibling_name in siblings:
        sibling_key = get_sub_cluster_key(own_cluster, sibling_name)
        print(sibling_name, calc_similarity_to_sub_cluster(key, sibling_key), siblings[sibling_name])

    foreign_sub_cluster = get_sub_cluster_from_key(other_key)
    foreign_cluster = get_cluster_from_key(other_key)
    foreign_similarity = calc_similarity_to_sub_cluster(key, other_key)
    print('OTHER', other_key, foreign_similarity, clusters[foreign_cluster][foreign_sub_cluster])

In [None]:
# How often do these mistakes happen?
#   - a sub-cluster is far away from its cluster centroid and it isn't a nickname/cognate
#   - a sub-cluster is close to another sub-cluster that has nothing to do with it
# We want to minimize these occurrences.
#
# Open question: why is this sub-cluster centroid so far from the aaron cluster centroid?
#
# Ideas for improving name-to-vec:
#   1. train a cross-encoder from the triplets
#   2. use the cross-encoder to score WANs, name-variants, common-non-negatives and
#      high-freq name pairs, with in-batch negatives, to generate new triplets
#   3. train a bi-encoder with the new triplets
#   4. fine-tune the bi-encoder using the original triplets

check_mistake('aaron/ehren', 'severino/sovren')

In [None]:
# Print 'asta/austie' against the sub-clusters of its own cluster and against 'austacia/austacia'.
check_mistake('asta/austie', 'austacia/austacia')

In [None]:
# Print 'aaron/erin' against the sub-clusters of its own cluster and against 'er/er'.
check_mistake('aaron/erin', 'er/er')

## Report sub-clusters that need to be reviewed

Sub-clusters that are more similar to sub-clusters in another cluster than to sub-clusters in their own cluster should be reviewed: either the sub-cluster should move to the other cluster, or the two clusters should be merged.

In [None]:
# number of nearest neighbours inspected for every sub-cluster
k = 3
# Only the first `max_clusters` clusters are reported, to keep the output
# short; raise the limit to scan more of the data.
max_clusters = 21

# enumerate(tqdm(...)) rather than tqdm(enumerate(...)) so the bar knows the total
for cnt, (cluster_name, cluster) in enumerate(tqdm(clusters.items())):
    if cnt >= max_clusters:
        break
    for sub_cluster_name, sub_cluster in cluster.items():
        # get the key for this sub-cluster
        key = get_sub_cluster_key(cluster_name, sub_cluster_name)
        # get the centroid for this key
        sub_cluster_centroid = np.array(sub_cluster_centroids[key])

        # get closest k+1 sub-clusters (one extra because the sub-cluster
        # itself is expected to be among its own nearest neighbours)
        top_indices, top_similarities = find_most_similar_centroids(
            sub_cluster_centroid,
            all_sub_cluster_centroids,
            k=k+1
        )
        top_indices = top_indices.tolist()
        top_similarities = top_similarities.tolist()
        # remove this sub-cluster from the top results
        sub_cluster_ix = sub_cluster_indices[key]
        if sub_cluster_ix in top_indices:
            ix = top_indices.index(sub_cluster_ix)
            del top_indices[ix]
            del top_similarities[ix]
        else:
            print(f"WARNING {sub_cluster_ix} not found in {top_indices} with similarities {top_similarities}")
            top_indices = top_indices[:k]
            top_similarities = top_similarities[:k]

        # check if this sub-cluster

        # 1. is very close to a sub-cluster in another cluster
        for ix, similarity in zip(top_indices, top_similarities):
            other_key = sub_cluster_keys[ix]
            other_cluster = get_cluster_from_key(other_key)
            if similarity > other_cluster_similarity_threshold and other_cluster != cluster_name:
                print(f"1. Sub-cluster {key} is very similar to {other_key} with similarity {similarity}")

        # 2. is closer to a sub-cluster in another cluster than to the nearest sub-cluster in its own cluster
        if top_indices:
            nearest_key = sub_cluster_keys[top_indices[0]]
            nearest_cluster = get_cluster_from_key(nearest_key)
            if len(cluster) > 1 and nearest_cluster != cluster_name:
                # report the nearest neighbour's own similarity, not the
                # value left over from the loop of check 1
                print(f"2. Sub-cluster {key} is closer to {nearest_key} with similarity {top_similarities[0]} than to a sub-cluster in its own cluster: {list(cluster.keys())}")

        # 3. has more than half of the closest sub-clusters in another cluster
        top_clusters = [get_cluster_from_key(sub_cluster_keys[ix]) for ix in top_indices]
        for top_cluster in set(top_clusters):
            # a majority inside the sub-cluster's own cluster is the expected case, so skip it
            if top_cluster != cluster_name and top_clusters.count(top_cluster) > k / 2:
                print(f"3. Sub-cluster {key} has more than half of its closest sub-clusters in cluster {top_cluster}")

        # 4. is closer to the centroid of another cluster than the centroid of its own cluster
        nearest_cluster_indices, nearest_cluster_similarities = find_most_similar_centroids(
            sub_cluster_centroid, all_cluster_centroids, k=1
        )
        other_cluster = cluster_keys[nearest_cluster_indices[0]]
        if other_cluster != cluster_name:
            print(f"4. Sub-cluster {key} is closer to cluster {other_cluster} with similarity {nearest_cluster_similarities[0]} than its own cluster")

In [None]:
# Count how many similarities are below 0.5.
# NOTE(review): `similarities` is not assigned in any cell visible in this
# notebook (the only other use of the name is a local variable inside
# find_most_similar_centroids), so this cell raises NameError on a fresh
# kernel -- restore the cell that builds it, or define it here.
len([sim for sim in similarities if sim < 0.5])

In [None]:
# NOTE(review): `similarities` is not assigned in any cell visible in this
# notebook; it must be a sequence of similarity values -- confirm where it is built.

# Plot at most 10000 values: random.sample raises ValueError when asked for
# more items than the population holds.
similarity_sample_size = min(10000, len(similarities))
# A dedicated, seeded generator gives the same histogram on every run
# without touching the global random state.
sampled_similarities = random.Random(42).sample(similarities, similarity_sample_size)

plt.figure(figsize=(10, 6))
plt.hist(sampled_similarities, bins=40, label="Name similarity to centroid")
plt.title('Centroid similarities')
plt.xlabel('similarity')
plt.ylabel('Frequency')

# Show the plot
plt.tight_layout()
plt.show()

### Plot number of sub-clusters

In [None]:
# Number of sub-clusters in every cluster, in the iteration order of `clusters`
n_sub_clusters = [len(sub_clusters) for sub_clusters in clusters.values()]
# cell output: how many clusters were counted
len(n_sub_clusters)

In [None]:
# How many clusters have more than 10 sub-clusters
sum(1 for sub_cluster_count in n_sub_clusters if sub_cluster_count > 10)

In [None]:
# Number of sub-clusters in the 'elizabeth' cluster.
# NOTE(review): the cluster name is hard-coded; this raises KeyError when the
# loaded sub-clusters file has no 'elizabeth' cluster.
len(clusters['elizabeth'])

In [None]:
# Histogram of how many sub-clusters each cluster has
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(n_sub_clusters, bins=40, label="Number of Sub-clusters")
ax.set_title('Number of Sub-clusters')
ax.set_xlabel('Number of Sub-clusters')
ax.set_ylabel('Frequency')

# Show the plot
fig.tight_layout()
plt.show()