In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules
# (e.g. the src.models.* imports below) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

# Split buckets into clusters and super-clusters using bi-encoder

Load the tokenizer and trained model, and use a hierarchical agglomerative clustering algorithm to split existing FamilySearch buckets into clusters and super-clusters based upon similarity computed using the bi-encoder model. Each cluster contains the names in the bucket that the model determines are similar to each other, and each super-cluster contains all of the clusters in the bucket.

Each cluster contains:

1. a list of names, 
2. the most-common name as the cluster label (preceded by an underscore), and 
3. a cluster centroid: a vector depicting the center of the cluster. 

Each super-cluster contains:

1. a list of cluster labels
2. the most-common name in the cluster as the super-cluster label

If a bucket has only one cluster, we don't create a super-cluster for the bucket.

When determining which cluster a rare name belongs to, we will choose the cluster with the closest centroid.

The questions to answer are:

1. Should we weight more-common names more than less-common names when computing the clusters, e.g., by $\log_{10}(\text{freq})$?
2. Should we use average or complete linkage?
3. What should the threshold be?

In [None]:
from collections import defaultdict
import json
import math
import os

import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm.auto import tqdm

from src.models.biencoder import BiEncoder
from src.models.tokenizer import get_tokenize_function_and_vocab

In [None]:
# configure
# "given" here is interpolated into every input/output file name below.
given_surname = "given"

# clustering hyper-parameters
linkage = "average"  # average, complete
similarity_threshold = 0.3  # 0.25-0.35 for average; 0.05 for complete
cluster_freq_normalizer = "none"  # log, log10, none

# One tag for this parameter combination; the output file names below reuse it
# so that they can never drift apart from the experiment name.
experiment_name = f"{linkage}-{similarity_threshold}-{cluster_freq_normalizer}"

# inputs
max_tokens = 10
subwords_path = f"../data/models/fs-{given_surname}-subword-tokenizer-2000f.json"
std_path = f"../data/processed/std_{given_surname}-augmented.txt"
model_type = 'cecommon+0+aug-0-1'
model_path = f"../data/models/bi_encoder-{given_surname}-{model_type}.pth"
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"

# outputs
experiment_dir = "../reports/"
clusters_path = f"../data/processed/clusters_{given_surname}-{experiment_name}.json"
super_clusters_path = f"../data/processed/super_clusters_{given_surname}-{experiment_name}.json"

## Load data

In [None]:
def get_cluster_freq(name):
    """Return the integer weight given to `name` when clustering.

    The weight depends on the module-level `cluster_freq_normalizer`:
    "log" and "log10" return the floor of the natural / base-10 logarithm of
    the name's frequency in `name_freq`, never less than 1; any other setting
    returns 1. A name missing from `name_freq` is treated as frequency 0.
    """
    raw_freq = name_freq.get(name, 0)
    log_fn = {"log": math.log, "log10": math.log10}.get(cluster_freq_normalizer)
    if log_fn is None:
        return 1
    # clamp to 1 before the log (avoids log(0)) and again afterwards (minimum weight)
    return max(1, math.floor(log_fn(max(1, raw_freq))))

In [None]:
# load buckets
# Each line of std_path is read as: bucket name (colons removed) followed by
# space-separated names; the bucket name itself is also added as a member.
bucket_names = defaultdict(set)  # bucket name -> names in that bucket
name_buckets = defaultdict(set)  # name -> buckets that contain it
with open(std_path, 'rt') as f:
    for line in f:
        names = line.strip().replace(':', '').split(' ')
        bucket_name = names[0]
        for name in names:
            name = name.strip()
            if name:
                bucket_names[bucket_name].add(name)
                name_buckets[name].add(bucket_name)
print(len(bucket_names), len(name_buckets))

In [None]:
# load pref names
# na_filter=False turns off pandas' missing-value detection, so every value is kept as read.
pref_df = pd.read_csv(pref_path, na_filter=False)
name_freq = dict(zip(pref_df['name'], pref_df['frequency']))
pref_df = None  # drop the reference to the dataframe; only name_freq is used from here on
print(len(name_freq))
# spot check: raw frequency and clustering weight for one name
print('john', name_freq['john'], get_cluster_freq('john'))

In [None]:
# list, then count, the bucket names that have no entry in name_freq
names_without_freq = [name for name in name_buckets if name not in name_freq]
for name in names_without_freq:
    print(name)
cnt = len(names_without_freq)
print(cnt)

In [None]:
def get_most_freq_name(names):
    """Return the name in `names` with the highest frequency in `name_freq`.

    Names missing from `name_freq` count as frequency 0. When several names
    tie for the highest frequency, the first one encountered wins. Returns
    None when `names` is empty.
    """
    return max(names, key=lambda candidate: name_freq.get(candidate, 0), default=None)

In [None]:
# load tokenize function
# `tokenize` is applied to each name before it is passed to model.get_embedding;
# `tokenizer_vocab` is only used in this cell, to display its size.
tokenize, tokenizer_vocab = get_tokenize_function_and_vocab(
    max_tokens=max_tokens,
    subwords_path=subwords_path,
)
# cell output: number of entries in the tokenizer vocabulary
len(tokenizer_vocab)

In [None]:
# load model
# The file holds a whole pickled model object (model.get_embedding is called on
# it later), not just a state dict. Newer PyTorch releases default
# weights_only to True, which refuses to unpickle such objects, so request the
# full unpickler explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load model files
# from a trusted source.
model = torch.load(model_path, weights_only=False)

## Cluster names

### compute embeddings

In [None]:
# name -> embedding produced by the bi-encoder
name_embedding = {}
for names in tqdm(bucket_names.values()):
    for name in names:
        if name in name_embedding:
            # the name also belongs to a bucket handled earlier; don't embed it again
            continue
        embedding = model.get_embedding(tokenize(name))
        if linkage == "ward":
            # scale to unit length; the clusterer is configured with the
            # euclidean metric when linkage is "ward"
            embedding /= np.linalg.norm(embedding)
        name_embedding[name] = embedding

### create clusterer

In [None]:
# The number of clusters is not fixed up front (n_clusters=None); it follows
# from the distance threshold, which is the configured similarity threshold
# expressed as a distance (1 - similarity).
clusterer = AgglomerativeClustering(
    linkage=linkage,
    metric="cosine" if linkage != "ward" else "euclidean",
    distance_threshold=(1 - similarity_threshold),
    n_clusters=None,
)

### test clusterer

In [None]:
# Try the clusterer on a small hand-picked set of names.
test_names = ['abraham','abe','aabraham','ab','abaham','abaraham','abarham','abb','abelarde','abera','aberaham']
# repeat each name get_cluster_freq(name) times so that it carries that much weight
names = [name for name in test_names for _ in range(get_cluster_freq(name))]
X = [name_embedding[name] for name in names]
print(len(X))
clustering = clusterer.fit(X)
print('n_clusters', clustering.n_clusters_)
print('labels', clustering.labels_)
print('names', names)
# gather the names under the cluster label each one was assigned
sub_clusters = [set() for _ in range(clustering.n_clusters_)]
for label, name in zip(clustering.labels_, names):
    sub_clusters[label].add(name)
for sub_cluster in sub_clusters:
    print(sub_cluster)

### run clusterer

In [None]:
# bucket name -> list of clusters, each cluster being a set of names
bucket_clusters = {}
for bucket_name, names in tqdm(bucket_names.items()):
    if len(names) == 1:
        # a single-name bucket is its own cluster; the clusterer is not run
        bucket_clusters[bucket_name] = [names]
        continue
    # repeat each name get_cluster_freq(name) times so that it carries that much weight
    clustered_names = [
        member for member in names for _ in range(get_cluster_freq(member))
    ]
    X = [name_embedding[member] for member in clustered_names]
    clustering = clusterer.fit(X)
    clusters = [set() for _ in range(clustering.n_clusters_)]
    for label, member in zip(clustering.labels_, clustered_names):
        # sets absorb the repeated copies of a name
        clusters[label].add(member)
    bucket_clusters[bucket_name] = clusters

## Evaluate results

In [None]:
# number of buckets, and total number of clusters over all buckets
bucket_cluster_total = sum(map(len, bucket_clusters.values()))
print(len(bucket_clusters), bucket_cluster_total)

In [None]:
# what about the top 100, 1000 names?
# Walk the first 2000 entries of name_freq and, every 100 entries, print the
# number of names found in a bucket so far and their average clusters per bucket.
# NOTE(review): "top" assumes name_freq is ordered by descending frequency -- confirm
# against the source CSV.
total_clusters = 0
total_names = 0
for ix, name in enumerate(name_freq.keys()):
    # skip the report while no name has been counted yet, so the average
    # cannot divide by zero
    if ix % 100 == 0 and ix > 0 and total_names > 0:
        print(total_names, total_clusters / total_names)
    if ix == 2000:
        break
    if name not in name_buckets:
        continue
    # a name may be in several buckets; only one of them is used here
    bucket_name = next(iter(name_buckets[name]))
    total_clusters += len(bucket_clusters[bucket_name])
    total_names += 1

In [None]:
# For the first 10 names in name_freq that belong to a bucket, print the
# clusters of one of their buckets, each labelled with its most frequent name.
for name in list(name_freq)[:10]:
    if name in name_buckets:
        bucket_name = next(iter(name_buckets[name]))
        print('***', name, bucket_name)
        for ix, cluster in enumerate(bucket_clusters[bucket_name]):
            label = get_most_freq_name(cluster)
            members = ' '.join(cluster)
            print(' ', ix, label, ':', members)


### Write experiment report

deprecated

## Save Clusters and Super-Clusters

In [None]:
def get_cluster_centroid(cluster):
    """Return the unit-length centroid of the names in `cluster`.

    Each name's embedding (from `name_embedding`) is added
    `get_cluster_freq(name)` times, and the sum is divided by its norm.
    The stored embeddings are not modified: the sum starts from a copy.
    """
    running_sum = None
    for member in cluster:
        vector = name_embedding[member]
        for _ in range(get_cluster_freq(member)):
            if running_sum is not None:
                running_sum += vector
            else:
                # copy so the in-place additions never touch the stored embedding
                running_sum = vector.copy()
    return running_sum / np.linalg.norm(running_sum)

In [None]:
# spot check: the weight (number of repeats) that one name gets under the current normalizer
get_cluster_freq('richard')

In [None]:
# Sanity check: cosine similarity of one embedding against (in order) another
# name, itself, the raw sum of the two, and the centroid of the two names.
emb1 = name_embedding['richard']
emb2 = name_embedding['dallan']
comparison_vectors = (
    emb2,
    emb1,
    emb1 + emb2,
    get_cluster_centroid(['richard', 'dallan']),
)
for other in comparison_vectors:
    print(cosine_similarity([emb1], [other]))

In [None]:
# all_clusters:   "<bucket>/<most frequent name>" -> {"names": [...], "centroid": [...]}
# super_clusters: bucket name -> its cluster labels (only for buckets with more than one cluster)
all_clusters = {}
super_clusters = {}
for bucket_name, clusters in bucket_clusters.items():
    cluster_names = []
    for cluster in clusters:
        cluster_name = f"{bucket_name}/{get_most_freq_name(cluster)}"
        all_clusters[cluster_name] = {
            "names": list(cluster),
            # converted to a plain list so that it can be written as JSON
            "centroid": get_cluster_centroid(cluster).tolist(),
        }
        cluster_names.append(cluster_name)
    if len(cluster_names) > 1:
        super_clusters[bucket_name] = cluster_names

In [None]:
# number of clusters overall, and number of buckets that were split into more than one cluster
print(len(all_clusters), len(super_clusters))

In [None]:
# show where the two JSON files are about to be written
print(clusters_path, super_clusters_path)

In [None]:
# write the clusters and the super-clusters as indented JSON
outputs = (
    (clusters_path, all_clusters),
    (super_clusters_path, super_clusters),
)
for out_path, payload in outputs:
    with open(out_path, 'wt') as f:
        json.dump(payload, f, indent=2)