In [None]:
%load_ext autoreload
%autoreload 2

# Generate clusters using the trained bi-encoder model

**Deprecated in favor of 241_create_clusters_from_buckets**

Load the tokenizer and trained model, and use a hierarchical agglomerative clustering algorithm to group the most-frequent names into clusters based on the similarity computed by the trained model. Each cluster will contain names that the model determines are similar to each other.

We want to create cohesive clusters, but not too many, because we will ultimately map the existing Buckets to one or more clusters. Each cluster can appear in multiple Buckets, but again we want to limit the number of Buckets that contain the same cluster.

At index time, each name will be mapped to a single cluster and indexed under that cluster. At query time, each name will again be mapped to a single cluster, but we will look up all clusters in the Bucket(s) in which the queried cluster appears.

Each cluster will contain:

1. a list of names, 
2. the most-common name as the cluster label, and 
3. a cluster centroid: a vector depicting the centroid of the cluster. 

## Todo
- try ward, complete linkage
- merge clusters that only have names in the same bucket?
- use buckets to guide merging and splitting clusters

In [None]:
from collections import defaultdict
import json
import math
import os
import re

import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import torch
from tqdm.auto import tqdm

from src.data.utils import read_csv
from src.models.biencoder import BiEncoder
from src.models.tokenizer import get_tokenize_function_and_vocab

In [None]:
# configure
# All tunable settings for this notebook live in this cell.

# Name type; interpolated into the tokenizer, data, bucket, model and output paths below.
given_surname = "given"

# Linkage passed to AgglomerativeClustering. With "ward" the embeddings are also
# L2-normalised and the euclidean metric is used instead of cosine.
linkage = "average"  # ward, average, complete, single
# Converted to a clustering distance threshold as (1.0 - similarity_threshold).
similarity_threshold = 0.73

# Arguments for get_tokenize_function_and_vocab.
max_tokens = 10
subwords_path=f"../data/models/fs-{given_surname}-subword-tokenizer-2000f.json"
# Only the first `num_common_names` rows of the preferred-names file are considered.
num_common_names = 100_000
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
# Bucket definitions, one "<head names>: <tail names>" line per bucket.
std_path = f"../references/std_{given_surname}.txt"
model_type = 'cecommon+0+aug-0-1'
# Checkpoint loaded with torch.load.
model_path = f"../data/models/bi_encoder-{given_surname}-{model_type}.pth"

# NOTE(review): `clusters_path` is not referenced by any other cell in this notebook;
# the final save cell writes to `sub_clusters_path` instead -- confirm which is intended.
clusters_path = f"../data/models/clusters_{given_surname}-{similarity_threshold}.json"

In [None]:
# Report GPU availability and memory usage.
# The device queries raise when no CUDA device is present, so they are only
# issued when CUDA is available; on a CPU-only machine the cell just prints False.
torch.cuda.empty_cache()
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

## Load data

In [None]:
# load buckets
# Each non-blank line of the bucket file has the form "<head names>: <tail names>",
# with names separated by single spaces. All names on a line form one bucket.
bucket_names = []                # bucket index -> set of names in that bucket
name_buckets = defaultdict(set)  # name -> set of indices of the buckets containing it

with open(std_path) as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at EOF) carries no bucket;
            # skip it instead of failing on the head/tail unpack.
            continue
        if line.count(':') != 1:
            raise ValueError(
                f"{std_path}:{line_no}: expected exactly one ':' separator, got {line!r}"
            )
        head_names, tail_names = line.split(':')
        # A set already ignores duplicates, so no explicit membership test is needed.
        names = {
            name
            for part in (head_names.strip(), tail_names.strip())
            for name in part.split(' ')
            if len(name) > 0
        }
        if len(names) < 1:
            continue
        bucket_ix = len(bucket_names)
        for name in names:
            name_buckets[name].add(bucket_ix)
        bucket_names.append(names)
print(len(bucket_names), len(name_buckets), 
      sum(len(names) for names in bucket_names), 
      sum(len(buckets) for buckets in name_buckets.values()))

In [None]:
# load pref names
# From the first `num_common_names` rows keep the names that are longer than one
# character and consist solely of lowercase a-z.
pref_df = read_csv(pref_path)
common_names = []
for candidate in pref_df['name'][:num_common_names].tolist():
    if len(candidate) > 1 and re.fullmatch(r'[a-z]+', candidate):
        common_names.append(candidate)
pref_df = None  # drop the reference so the frame can be garbage-collected
len(common_names)

In [None]:
# Build the ordered, de-duplicated list of names to embed and cluster:
# first the common names (in file order), then any bucket names not already present.
names_to_cluster = []
seen_names = set()
for name in common_names:
    if name not in seen_names:
        names_to_cluster.append(name)
        seen_names.add(name)
for names in bucket_names:
    # Each bucket is a set of strings, whose iteration order changes between
    # kernel restarts (string hash randomisation). Sorting makes the order of
    # `names_to_cluster` -- and therefore any `[:sample_size]` slice of it and the
    # row order given to the clusterer -- reproducible across runs.
    for name in sorted(names):
        if name not in seen_names:
            names_to_cluster.append(name)
            seen_names.add(name)
del seen_names
len(names_to_cluster)

In [None]:
# load tokenize function
# get_tokenize_function_and_vocab returns a pair: `tokenize`, which is later called
# with a single name and whose result is fed to model.get_embedding, and the
# tokenizer vocabulary, whose size is displayed below.
tokenize, tokenizer_vocab = get_tokenize_function_and_vocab(
    max_tokens=max_tokens,
    subwords_path=subwords_path,
)
len(tokenizer_vocab)

In [None]:
# load model
# SECURITY: torch.load unpickles the checkpoint, which can execute arbitrary code;
# only load model files from a trusted source.
# map_location: when CUDA is available the default placement is kept (None); on a
# CPU-only machine the tensors are remapped to the CPU so a checkpoint saved from a
# GPU can still be loaded.
# NOTE(review): assumes model.get_embedding works with a CPU-resident model -- confirm.
model = torch.load(
    model_path,
    map_location=None if torch.cuda.is_available() else torch.device("cpu"),
)
# Switch to inference mode; the model repr is displayed as the cell output.
model.eval()

## Compute embeddings

In [None]:
# Embed every name once and keep the vectors keyed by name.
# Ward linkage is run with the euclidean metric, so for ward each vector is
# scaled in place to unit length first.
normalize_for_ward = linkage == "ward"
name_embedding = {}
for name in tqdm(names_to_cluster):
    vector = model.get_embedding(tokenize(name))
    if normalize_for_ward:
        vector /= np.linalg.norm(vector)
    name_embedding[name] = vector

In [None]:
# Sanity check: print the embedding length and display its first 20 components.
probe_embedding = model.get_embedding(tokenize('dallan'))
print(len(probe_embedding))
probe_embedding[:20]

In [None]:
# test normalize
# Print the raw vector and its norm, scale the vector in place by that norm, then
# recompute the length by hand and show the scaled components.
probe = model.get_embedding(tokenize('dallan'))
print(probe[:20])
probe_norm = np.linalg.norm(probe)
print(probe_norm)
probe /= probe_norm
squared_sum = sum(component * component for component in probe)
print(math.sqrt(squared_sum))
probe[:20]

In [None]:
# test embeddings
# Compare the first name of the list against each remaining name, computing the
# cosine similarity with both torch and scikit-learn and printing the two values.
import torch.nn.functional as F
import torch
from sklearn.metrics.pairwise import cosine_similarity

# aaltje 0.21964864
# altgen 0.45249435
# eltje 0.08212702
# aeltje 0.18246093
# aalken 0.11775353
# aaltjen 0.253144

bucket = ['altgen', 'altgen', 'altgin', 'altino', 'aaltje', 'eltje', 'aeltje', 'aalken', 'aaltjen', ]
anchor_name, *other_names = bucket
anchor_emb = model.get_embedding(tokenize(anchor_name))
print(anchor_name)
for other_name in other_names:
    other_emb = model.get_embedding(tokenize(other_name))
    torch_sim = F.cosine_similarity(torch.Tensor(anchor_emb), torch.Tensor(other_emb), dim=-1)
    sklearn_sim = cosine_similarity([anchor_emb], [other_emb])[0]
    print(other_name, torch_sim, sklearn_sim)

## Create closures

In [None]:
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity
# from collections import defaultdict

# def create_transitive_closures(vectors, threshold):
#     # Function to find the connected components (closures)
#     def find_connected_components(graph):
#         visited = set()
#         components = []

#         def dfs(node, component):
#             visited.add(node)
#             component.add(node)
#             for neighbor in graph[node]:
#                 if neighbor not in visited:
#                     dfs(neighbor, component)

#         for node in graph:
#             if node not in visited:
#                 component = set()
#                 dfs(node, component)
#                 components.append(component)

#         return components

#     # Normalize vectors for cosine similarity calculation
#     vectors = np.array(vectors)
#     norms = np.linalg.norm(vectors, axis=1)
#     normalized_vectors = vectors / norms[:, np.newaxis]

#     # Create an adjacency list for the graph
#     graph = defaultdict(set)

#     # Populate the graph based on the cosine similarity threshold
#     for i in tqdm(range(len(normalized_vectors))):
#         # Compute cosine similarity of vector i with all other vectors
#         similarities = np.dot(normalized_vectors, normalized_vectors[i])
#         for j, similarity in enumerate(similarities):
#             if i != j and similarity > threshold:
#                 graph[i].add(j)
#                 graph[j].add(i)

#     # Find the connected components (transitive closures)
#     closures = find_connected_components(graph)

#     # Convert set to list for each closure
#     closures = [list(closure) for closure in closures]

#     return closures


In [None]:
# %%time
# embeddings_to_cluster = [name_embedding[name] for name in names_to_cluster]
# closures = create_transitive_closures(embeddings_to_cluster, threshold=similarity_threshold)
# print(len(closures))
# for closure in closures:
#     if len(closure) > 50_000:
#         print(len(closure))

## Cluster names

In [None]:
# Agglomerative clusterer with no preset cluster count (n_clusters=None); the number
# of clusters is determined by distance_threshold instead.
# The cosine metric is used for every linkage except "ward", which is given the
# euclidean metric (the embeddings are L2-normalised for ward when they are computed).
# NOTE(review): (1.0 - similarity_threshold) is a cosine distance. With "ward" the
# threshold is compared against euclidean-based merge distances, so the same number
# presumably does not correspond to the same similarity cut-off -- confirm before
# switching linkage to "ward".
clusterer = AgglomerativeClustering(
    n_clusters=None,
    metric="euclidean" if linkage == "ward" else "cosine",
    linkage=linkage,
    distance_threshold=(1.0 - similarity_threshold),
)

In [None]:
# test clusterer
# Smoke test on a handful of names.
# It uses its own variable names and a fresh estimator with the same parameters as
# `clusterer`: fit() refits the estimator in place and returns it, so fitting the
# shared `clusterer` here (or reusing X / clustering / cluster_names) would overwrite
# the results of the real run if this cell is re-executed later.
test_bucket = ['abraham','abe','aabraham','ab','abaham','abaraham','abarham','abb','abelarde','abera','aberaham']
# Only names that were embedded can be clustered; report the rest instead of
# failing with a KeyError.
test_names = [name for name in test_bucket if name in name_embedding]
test_missing = [name for name in test_bucket if name not in name_embedding]
if test_missing:
    print('not embedded, skipped', test_missing)
test_clusterer = AgglomerativeClustering(**clusterer.get_params())
test_clustering = test_clusterer.fit([name_embedding[name] for name in test_names])
test_cluster_names = [set() for _ in range(test_clustering.n_clusters_)]
print('n_clusters', test_clustering.n_clusters_)
print('labels', test_clustering.labels_)
print('names', test_names)
for name, cluster in zip(test_names, test_clustering.labels_):
    test_cluster_names[cluster].add(name)
for ix, members in enumerate(test_cluster_names):
    print(ix, members)

### run clusterer

In [None]:
%%time

# Number of leading names to cluster; a falsy value clusters every name.
sample_size = 80_000

# Names in the order their embeddings are handed to the clusterer, so that
# clustering.labels_ lines up with clustered_names position by position.
clustered_names = list(names_to_cluster[:sample_size] if sample_size else names_to_cluster)
X = [name_embedding[clustered_name] for clustered_name in clustered_names]

clustering = clusterer.fit(X)

In [None]:
# Group the clustered names by their assigned label: cluster_names[label] is the
# set of names in that cluster.
members_by_label = defaultdict(set)
for clustered_name, label in zip(clustered_names, clustering.labels_):
    members_by_label[label].add(clustered_name)
cluster_names = [members_by_label[label] for label in range(clustering.n_clusters_)]

len(cluster_names)

## Evaluate results

In [None]:
# Reverse index: name -> index of the cluster that contains it.
name_cluster = {
    member: cluster_ix
    for cluster_ix, members in enumerate(cluster_names)
    for member in members
}
len(name_cluster)

In [None]:
def get_cluster_buckets(cluster, verbose=False):
    """Collect the distinct bucket indices that the names of a cluster belong to.

    Reads the notebook-level `cluster_names` and `name_buckets`. Names that have
    no entry in `name_buckets` contribute nothing.

    Args:
        cluster: index into `cluster_names`.
        verbose: when true, print each name together with its buckets.

    Returns:
        A list of bucket indices without duplicates.
    """
    members = cluster_names[cluster]
    if verbose:
        for member in members:
            print('  get_cluster_buckets', member, name_buckets.get(member, []))
    found = [
        bucket_ix
        for member in members
        for bucket_ix in name_buckets.get(member, [])
    ]
    return list(set(found))

def get_cluster_lookups(name, verbose=False):
    """Determine which clusters must be looked up when `name` is searched.

    The result contains the name's own cluster plus the cluster of every
    clustered name that shares a bucket with any name of that cluster. Reads the
    notebook-level `name_cluster`, `cluster_names` and `bucket_names`.

    Args:
        name: the searched name; it must be a key of `name_cluster`.
        verbose: when true, print the intermediate clusters and buckets.

    Returns:
        A set of cluster indices.
    """
    # the cluster the searched name belongs to
    own_cluster = name_cluster[name]
    if verbose:
        print('cluster', own_cluster, cluster_names[own_cluster])
    # every bucket that any name of this cluster appears in
    bucket_ids = get_cluster_buckets(own_cluster)
    if verbose:
        print('buckets', bucket_ids)
    # add the cluster of every clustered name found in those buckets
    lookups = {own_cluster}
    for bucket_id in bucket_ids:
        bucket_members = bucket_names[bucket_id]
        if verbose:
            print('> bucket', bucket_id, bucket_members)
        for member in bucket_members:
            if member not in name_cluster:
                continue
            member_cluster = name_cluster[member]
            if verbose:
                print('    bucket name', member, 
                      member_cluster, cluster_names[member_cluster])
            lookups.add(member_cluster)
    return lookups

In [None]:
# average number of buckets per cluster
total_buckets = 0
for cluster_ix in range(len(cluster_names)):
    total_buckets += len(get_cluster_buckets(cluster_ix))
print('total, avg #buckets per cluster', total_buckets, total_buckets / len(cluster_names))

# average number of clusters per bucket
# (bucket names that were not clustered are ignored)
total_clusters = sum(
    len({name_cluster[member] for member in members if member in name_cluster})
    for members in bucket_names
)
print('total, avg #clusters per bucket', total_clusters, total_clusters / len(bucket_names))

In [None]:
# compute the number of lookups for each common name
name_lookups = [
    len(get_cluster_lookups(name))
    for name in (common_names[:sample_size] if sample_size else common_names)
]

# Average over the names actually present in each slice: with a small
# `sample_size` a slice may hold fewer names than its nominal size, and dividing
# by the nominal size would understate the average. Empty slices are skipped
# rather than dividing by zero.
for label, top_n in [('top 100', 100), ('top 1000', 1000), ('top 10000', 10000),
                     ('all', len(name_lookups))]:
    top_lookups = name_lookups[:top_n]
    if top_lookups:
        print(label, sum(top_lookups) / len(top_lookups))

In [None]:
# For a few well-known names, show their buckets and, for every cluster that a
# search would look up, which names are already in those buckets and which are new.
for name in ['richard', 'james', 'susan', 'elizabeth', 'mary', 'john']:
    print('\nNAME', name)
    if name not in name_cluster:
        # e.g. the name falls outside the clustered sample or the configured name type
        print('not clustered; skipping')
        continue

    # print all of the names in the bucket for name
    # (.get() instead of indexing: `name_buckets` is a defaultdict, and indexing a
    # missing name would silently insert an empty entry into it)
    temp_bucket_names = set()
    for bucket in name_buckets.get(name, ()):
        temp_bucket_names |= bucket_names[bucket]
        print('bucket', bucket, bucket_names[bucket])

    # print all of names in each cluster looked up
    all_new_names = set()
    for cluster in get_cluster_lookups(name, verbose=True):
        new_names = cluster_names[cluster] - temp_bucket_names
        all_new_names |= new_names
        old_names = cluster_names[cluster] - new_names
        print('cluster', cluster, 'IN BUCKET', old_names, 'NEW', new_names)
    print('all new names', all_new_names)

In [None]:
# Keep every 25th entry of `bucket_sub_buckets` (positions 0, 25, 50, ...) as the experiment.
# NOTE(review): `bucket_sub_buckets` is not assigned in any cell above -- confirm its source.
experiment = {
    label: sub_buckets
    for position, (label, sub_buckets) in enumerate(bucket_sub_buckets.items())
    if position % 25 == 0
}

In [None]:
def name_sort_key(name):
    """Build a sort key that orders names by frequency, then by name.

    The frequency is read from the notebook-level `name_freq` (0 when the name is
    absent) and right-aligned in 12 columns so that the string comparison follows
    the numeric order.
    """
    count = name_freq.get(name, 0)
    return f"{count:12d}:{name}"

In [None]:
# Assemble the report: two header lines, then for each experiment label its
# sub-buckets, most frequent first, with each sub-bucket's names also ordered by
# descending frequency. The sub-bucket lists are sorted in place.
# NOTE(review): `experiment_name`, `sub_bucket_count` and `get_most_freq_name` are
# not defined in any cell above -- confirm their source.
lines = [
    f"Experiment: {experiment_name}",
    f"sub-buckets={sub_bucket_count}",
]
for label, sub_buckets in experiment.items():
    lines.append(label)
    sub_buckets.sort(key=lambda members: name_sort_key(get_most_freq_name(members)), reverse=True)
    for sub_bucket in sub_buckets:
        sub_bucket.sort(key=name_sort_key, reverse=True)
    lines.extend(
        f"- {get_most_freq_name(sub_bucket)}: {' '.join(sub_bucket)}"
        for sub_bucket in sub_buckets
    )

In [None]:
# Print the report, one entry per line (`lines` always holds at least the two header lines).
print("\n".join(lines))

## Save experiment report

In [None]:
# Display the experiment name, which is used in the report header and the report filename.
# NOTE(review): `experiment_name` is not assigned in any cell above -- confirm where it should come from.
experiment_name

In [None]:
# Write the report lines to "<experiment_dir>/<experiment_name>.txt".
# NOTE(review): `experiment_dir` is not assigned in any cell above -- confirm its source.
experiment_filename = f"{experiment_name}.txt"
# Create the output directory on first use so the write does not fail with
# FileNotFoundError (an empty directory string means the current directory).
if experiment_dir:
    os.makedirs(experiment_dir, exist_ok=True)
# Explicit encoding so the file does not depend on the platform's default.
with open(os.path.join(experiment_dir, experiment_filename), 'wt', encoding='utf-8') as f:
    f.write("\n".join(lines))

## Save sub-clusters

In [None]:
# Nested mapping: cluster label -> {sub-cluster label -> names of that sub-bucket}.
# A cluster is labelled with the most frequent name over all of its sub-buckets and
# each sub-cluster with the most frequent name of its own sub-bucket.
# NOTE(review): entries that resolve to the same label overwrite each other --
# confirm that labels are unique.
clusters = {}
for sub_buckets in bucket_sub_buckets.values():
    flattened = []
    for sub_bucket in sub_buckets:
        flattened.extend(sub_bucket)
    cluster_label = get_most_freq_name(flattened)
    clusters[cluster_label] = {
        get_most_freq_name(sub_bucket): sub_bucket
        for sub_bucket in sub_buckets
    }

In [None]:
# Display the path that the clusters JSON is written to.
# NOTE(review): `sub_clusters_path` is not assigned in any cell above (the config
# cell defines `clusters_path`) -- confirm which path is intended.
sub_clusters_path

In [None]:
# Serialise `clusters` as JSON with a 2-space indent and write it to `sub_clusters_path`.
with open(sub_clusters_path, 'wt') as out_file:
    out_file.write(json.dumps(clusters, indent=2))