In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Split existing clusters into sub-clusters using trained model

Load the parser and trained model, and use a hierarchical agglomerative clustering algorithm to split existing FamilySearch clusters (buckets) into sub-clusters based upon similarity computed using the trained model. Each sub-cluster will contain names that the model determines are similar to each other.

We want to create cohesive sub-clusters, but not too many, because we will eventually need to manually create "wormholes" to combine the sub-clusters back into the original clusters.

Each sub-cluster will contain:

1. a list of names, 
2. the most-common name as the cluster label, and 
3. a cluster centroid: a vector depicting the center of the cluster. 

When determining which sub-cluster a rare name belongs to, we will choose the closest centroid.

The questions to answer are:

1. Should we weight more-common names more than less-common names when computing the clusters?: log_10(freq)? NO
2. Should we use single, average, complete, or ward linkage? AVERAGE
3. What should the threshold be? 0.83
4. Should we do dimensionality reduction, and if so, using umap or PCA or t-sne? NO - umap and PCA @ 10 didn't work
5. Should we weight more-common names more than less-common names when computing the cluster centroid? NO

In [None]:
import json
import math
import os

import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
import torch
import torch.nn.functional as F
from tqdm.auto import tqdm
import umap

from src.models.biencoder import BiEncoder
from src.models.tokenizer import get_tokenize_function_and_vocab

In [None]:
# configure
# Which name type this run processes; it is interpolated into every path below.
given_surname = "given"

# --- clustering hyper-parameters ---
linkage = "average"  # ward, average, complete, single
# NOTE(review): the notebook introduction records 0.83 as the chosen threshold;
# confirm which value is current.
distance_threshold = 0.65
n_dimensions = 0  # 0 disables dimensionality reduction
dim_reduction = ''  # pca or umap

# --- frequency weighting ---
# The normalizer maps a raw name frequency to the number of copies of that name
# fed to the clusterer (the clustering cells clamp the result to at least 1).
# Alternative that was tried:
# freq_normalizer, freq_normalizer_name = lambda x: math.floor(math.log10(max(1,x))), "log10"
def freq_normalizer(x):
    """Return 0 for every frequency, i.e. apply no frequency weighting."""
    return 0


freq_normalizer_name = "none"

# Label used for the experiment report file name and header.
experiment_name = f"{linkage}-{distance_threshold}-{freq_normalizer_name}-{n_dimensions}"

# --- tokenizer ---
vocab_type = 'f'  # tokenizer based upon training name frequency
subword_vocab_size = 2000  # 500, 1000, 1500, 2000

# --- inputs and outputs ---
nama_bucket = 'nama-data'
subwords_path=f"data/models/fs-{given_surname}-subword-tokenizer-{subword_vocab_size}{vocab_type}.json"
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
buckets_path = f"../references/std_{given_surname}.txt"
model_path = f"../data/models/bi_encoder-{given_surname}.pth"
experiment_dir = "../reports/"
sub_clusters_path = f"../data/models/sub_clusters_{given_surname}-{distance_threshold}.json"

## Load data

In [None]:
# load buckets
# Each line of the buckets file becomes one bucket: a list of names with every
# ':' character removed (presumably the ':' marks the bucket's head name — confirm
# against the file format).
buckets = []
with open(buckets_path, 'rt') as f:
    for line in f:
        # split() with no argument collapses runs of whitespace, so repeated
        # spaces never produce empty-string "names"
        names = line.replace(':', '').split()
        if not names:
            # skip blank lines instead of recording a bucket that holds only ''
            continue
        buckets.append(names)
len(buckets)

In [None]:
# load pref names
# keep_default_na=False stops pandas from converting name strings such as
# "NA" or "null" into missing values.
pref_df = pd.read_csv(pref_path, keep_default_na=False)
# name -> frequency lookup used for labelling and sorting further down
name_freq = dict(zip(pref_df['name'], pref_df['frequency']))
# drop the dataframe reference; only the lookup is needed from here on
pref_df = None
print(len(name_freq))
# spot-check one name and what the configured normalizer does to its count
freq = name_freq['john']
print(freq, freq_normalizer(freq))

In [None]:
def get_most_freq_name(names):
    """Return the name in `names` with the highest frequency in `name_freq`.

    Names missing from `name_freq` count as frequency 0. On ties the name that
    appears earliest in `names` wins. `names` must be non-empty; an empty list
    raises IndexError.
    """
    winner = names[0]
    winner_count = name_freq.get(winner, 0)
    for candidate in names[1:]:
        candidate_count = name_freq.get(candidate, 0)
        # strict comparison keeps the earliest name when counts are equal
        if candidate_count > winner_count:
            winner, winner_count = candidate, candidate_count
    return winner

In [None]:
# load model
# NOTE(review): torch.load unpickles the file, so only load model files from a trusted source.
# NOTE(review): presumably this is a pickled BiEncoder (the class is imported at the top
# so unpickling can resolve it) — confirm.
# NOTE(review): no model.eval() call is visible in this notebook; confirm that
# get_embedding runs the model in inference mode.
model = torch.load(model_path)

In [None]:
# load tokenize function
# Returns the tokenize callable together with its vocabulary.
# NOTE(review): subwords_path has no "../" prefix unlike the local paths in the
# config cell — presumably it is a key inside nama_bucket; confirm.
tokenize, tokenizer_vocab = get_tokenize_function_and_vocab(
    subwords_path=subwords_path,
    nama_bucket=nama_bucket,
)
# show the vocabulary size as the cell output
len(tokenizer_vocab)

In [None]:
# sanity check: display the tokenizer output for a sample name
tokenize('dallan')

## Cluster names

### compute embeddings

In [None]:
# Embed every name of every bucket, keyed by name.
# Vectors are scaled to unit length only for the settings that are later
# clustered with euclidean distance (ward linkage, or umap-reduced vectors).
normalize_embeddings = linkage == "ward" or (n_dimensions > 0 and dim_reduction == "umap")
name_embedding = {}
for bucket in tqdm(buckets):
    for name in bucket:
        vector = model.get_embedding(tokenize(name))
        if normalize_embeddings:
            # in-place division, as the model returns a fresh array per call
            # (NOTE(review): assumed — confirm get_embedding does not cache its result)
            vector /= np.linalg.norm(vector)
        name_embedding[name] = vector

### try reducing dimensions

In [None]:
%%time

# Optionally project every embedding down to n_dimensions and store the reduced
# vectors back into name_embedding. Nothing happens when n_dimensions is 0.
if n_dimensions > 0:
    # Build the reducer first so that a bad setting fails before any work is done.
    if dim_reduction == 'umap':
        reducer = umap.UMAP(
            n_neighbors=30,
            min_dist=0.0,
            n_components=n_dimensions,
        )
    elif dim_reduction == 'pca':
        reducer = PCA(n_dimensions)
    else:
        # Fail loudly: merely printing would fall through to fit_transform and
        # either raise NameError or silently reuse a reducer left over from an
        # earlier run of this cell.
        raise ValueError(
            f"dim_reduction must be 'umap' or 'pca' when n_dimensions > 0, got {dim_reduction!r}"
        )
    # Freeze the name order once so inputs and outputs line up row for row.
    names_in_order = list(name_embedding.keys())
    embeddings = [name_embedding[name] for name in names_in_order]
    reduced_embeddings = reducer.fit_transform(embeddings)
    for name, embedding in zip(names_in_order, reduced_embeddings):
        name_embedding[name] = embedding

### create clusterer

In [None]:
# Ward linkage and umap-reduced vectors are clustered with euclidean distance;
# every other configuration uses cosine distance.
needs_euclidean = linkage == "ward" or (n_dimensions > 0 and dim_reduction == "umap")
cluster_metric = "euclidean" if needs_euclidean else "cosine"
clusterer = AgglomerativeClustering(
    n_clusters=None,  # the cluster count is decided by distance_threshold instead
    metric=cluster_metric,
    linkage=linkage,
    distance_threshold=distance_threshold,
)

In [None]:
# Inspect the embedding of a sample name: print its length, then show its first 20 values.
print(len(model.get_embedding(tokenize('dallan'))))
model.get_embedding(tokenize('dallan'))[:20]

In [None]:
# Manual check of unit-normalization on a sample name: show the raw values and
# norm, scale the vector by its norm, then recompute the length by hand.
embedding = model.get_embedding(tokenize('dallan'))
print(embedding[:20])

norm = np.linalg.norm(embedding)
print(norm)

embedding /= norm
length_after = math.sqrt(sum(v*v for v in embedding))
print(length_after)

embedding[:20]

In [None]:
# Try the clusterer on one hand-picked bucket and print how it gets split.
bucket = ['abraham','abe','aabraham','ab','abaham','abaraham','abarham','abb','abelarde','abera','aberaham']
X = []
names = []
for name in bucket:
    vector = name_embedding[name]
    # each name is fed to the clusterer at least once; the normalizer may ask for more copies
    copies = max(1, freq_normalizer(name_freq.get(name, 0)))
    names.extend([name] * copies)
    X.extend([vector] * copies)

clustering = clusterer.fit(X)
sub_clusters = [[] for _ in range(clustering.n_clusters_)]
print('n_clusters', clustering.n_clusters_)
print('labels', clustering.labels_)
print('names', names)

# group the (possibly repeated) names by the label the clusterer assigned
for name, label in zip(names, clustering.labels_):
    sub_clusters[label].append(name)
for sub_cluster in sub_clusters:
    # set() removes the repeated copies added for frequency weighting
    print(list(set(sub_cluster)))

In [None]:
# Spot-check cosine similarity between the first name in the list and each of
# the other names. `torch` and `F` come from the notebook's import cell.
#
# Values noted alongside this check (presumably from an earlier run — confirm):
# aaltje 0.21964864
# altgen 0.45249435
# eltje 0.08212702
# aeltje 0.18246093
# aalken 0.11775353
# aaltjen 0.253144

bucket = ['altino', 'aaltje', 'altgen', 'eltje', 'aeltje', 'aalken', 'aaltjen', ]
# convert the anchor embedding once instead of on every loop iteration
anchor = torch.Tensor(model.get_embedding(tokenize(bucket[0])))
for name in bucket[1:]:
    candidate = torch.Tensor(model.get_embedding(tokenize(name)))
    sim = F.cosine_similarity(anchor, candidate, dim=-1)
    print(name, sim)

### run clusterer

In [None]:
# Split every bucket into sub-clusters. The result maps the first name of each
# bucket to a list of sub-buckets, each a list of unique names.
bucket_sub_buckets = {}
for bucket in tqdm(buckets):
    first_name = bucket[0]
    if len(bucket) == 1:
        # a single name cannot be split further and is not sent to the clusterer
        sub_clusters = [bucket]
    else:
        X = []
        names = []
        for name in bucket:
            embedding = name_embedding[name]
            # each name is fed to the clusterer at least once; the normalizer
            # may ask for additional copies to weight frequent names
            freq = max(1, freq_normalizer(name_freq.get(name, 0)))
            for _ in range(freq):
                names.append(name)
                X.append(embedding)
        clustering = clusterer.fit(X)
        sub_clusters = [[] for _ in range(clustering.n_clusters_)]
        for name, sub_cluster in zip(names, clustering.labels_):
            sub_clusters[sub_cluster].append(name)
    # dict.fromkeys removes the repeated copies while keeping first-seen order,
    # so the stored (and later saved) name order is the same on every run;
    # list(set(...)) would order names by string hash, which varies per process
    bucket_sub_buckets[first_name] = [
        list(dict.fromkeys(sub_cluster)) for sub_cluster in sub_clusters
    ]

## Evaluate results

In [None]:
# total number of sub-buckets across all buckets
sub_bucket_count = sum(map(len, bucket_sub_buckets.values()))
print(sub_bucket_count)

In [None]:
# gather every 25th bucket (positions 0, 25, 50, ...) into an experiment
# The values are the same list objects held by bucket_sub_buckets, not copies.
experiment = {
    label: sub_buckets
    for ix, (label, sub_buckets) in enumerate(bucket_sub_buckets.items())
    if ix % 25 == 0
}

In [None]:
def name_sort_key(name):
    """Return a string sort key that orders names by frequency, then by name.

    The frequency from `name_freq` (0 when the name is unknown) is right-aligned
    in a 12-character field so that string comparison matches numeric order.
    """
    return "{:12d}:{}".format(name_freq.get(name, 0), name)

In [None]:
# Build the text report: two header lines, then for each sampled bucket its
# label followed by one "- label: names" line per sub-bucket.
lines = [
    f"Experiment: {experiment_name}",
    f"sub-buckets={sub_bucket_count}",
]
for label, sub_buckets in experiment.items():
    lines.append(label)
    # sub-buckets ordered by their most frequent name, highest first (sorted in place)
    sub_buckets.sort(key=lambda members: name_sort_key(get_most_freq_name(members)), reverse=True)
    for sub_bucket in sub_buckets:
        # names inside a sub-bucket are also ordered most frequent first (in place)
        sub_bucket.sort(key=name_sort_key, reverse=True)
        top_name = get_most_freq_name(sub_bucket)
        joined_names = ' '.join(sub_bucket)
        lines.append(f"- {top_name}: {joined_names}")

In [None]:
# show the report in the notebook, one line per print call
for line in lines:
    print(line)

## Save experiment report

In [None]:
# display the experiment name that the report file will be named after
experiment_name

In [None]:
# Write the report lines to "<experiment_dir>/<experiment_name>.txt".
experiment_filename = f"{experiment_name}.txt"
# create the report directory on first use so a fresh checkout does not fail here
os.makedirs(experiment_dir, exist_ok=True)
# explicit encoding so the output does not depend on the platform default
with open(os.path.join(experiment_dir, experiment_filename), 'wt', encoding='utf-8') as f:
    f.write("\n".join(lines))

## Save sub-clusters

In [None]:
# Re-key the results for saving: cluster label -> {sub-cluster label -> names},
# where each label is the most frequent name of the cluster / sub-cluster.
# If two clusters resolve to the same label, the later one replaces the earlier.
clusters = {
    get_most_freq_name([name for sub_bucket in sub_buckets for name in sub_bucket]): {
        get_most_freq_name(sub_bucket): sub_bucket for sub_bucket in sub_buckets
    }
    for sub_buckets in bucket_sub_buckets.values()
}

In [None]:
# display the path the sub-clusters JSON will be written to
sub_clusters_path

In [None]:
# Persist the nested cluster mapping as indented JSON.
with open(sub_clusters_path, 'wt') as out_file:
    out_file.write(json.dumps(clusters, indent=2))