In [None]:
%load_ext autoreload
%autoreload 2

# Split buckets into clusters and super-clusters using cross-encoder

Load the parser and trained model, and use a hierarchical agglomerative clustering algorithm to split existing FamilySearch buckets into clusters and super-clusters based upon similarity computed using the cross-encoder model. Each cluster contains the names in the bucket that the model determines are similar to each other, and each super-cluster contains all of the clusters in the bucket.

Each cluster contains:

1. a list of names, 
2. the most-common name as the cluster label, and 
3. a cluster centroid: a vector depicting the center of the cluster using a bi-encoder. 

Each super-cluster contains:

1. a list of cluster labels
2. the most-common name in the cluster as the super-cluster label

If a bucket has only one cluster, we don't create a super-cluster for the bucket.

When determining which cluster a rare name belongs to, we will choose the cluster with the closest centroid.

The questions to answer are:

1. What should the threshold be?

ISSUES
- cross-encoder scores much lower than bi-encoder: why?
    - do we need to combine both scores?
    - is cross-encoder really wrong when it scores near 0?
- why is the cross-encoder score so much lower than the bi-encoder score?
- is taking the average cross-encoder score the right thing to do?
    - what about taking the max score for each name, and then averaging the maximums???
- does the problem go away if we take the average of the two ce scores instead of the harmonic mean?
    - is the bi-encoder closer to the average or the harmonic mean?
- we should try to **graph** correlation between Y=bi-encoder score and X=average vs harmonic mean of ce score
- when you sort the two scores, is the order different?

A larger t-test statistic is better; a smaller Mann-Whitney statistic is better:

```python
t_ttest, _ = ttest_ind(random_pair_scores, non_negative_scores, equal_var=False)
t_mann, _  = mannwhitneyu(random_pair_scores, non_negative_scores, use_continuity=False)
print(int(abs(t_ttest)), int(t_mann/1_000_000))
```

In [None]:
from collections import defaultdict
import json
import math
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from statistics import harmonic_mean
import torch
from tqdm.auto import tqdm

from src.models.biencoder import BiEncoder
from src.models.tokenizer import get_tokenize_function_and_vocab
from src.models.utils import get_cross_encoder_score

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [None]:
# configure
# Which kind of name to process; interpolated into every model, reference and output path below.
given_surname = "given"

# Clustering options.
linkage = "average"  # average, complete
scorer = "ce"  # be (bi-encoder), ce (cross-encoder), or cebe (cross-encoder, falling back to bi-encoder for low scores)
# Converted to a distance threshold (1 - similarity_threshold) when the clusterer is built.
similarity_threshold = 0.1  # be=0.3, ce=0.08 or 0.10 or 0.15, cebe=0.28
# How a name's frequency is turned into a repeat count by get_cluster_freq.
cluster_freq_normalizer = "none"  # log, log10, none

# NOTE(review): unlike the output paths below, this name omits `scorer`; it is not read anywhere else in the visible notebook.
experiment_name = f"{linkage}-{similarity_threshold}-{cluster_freq_normalizer}"

# Tokenizer settings and input locations.
max_tokens = 10  # passed to get_tokenize_function_and_vocab
subwords_path=f"../data/models/fs-{given_surname}-subword-tokenizer-2000f.json"
std_path = f"../references/std_{given_surname}.txt"  # bucket definitions, one bucket per line
tokenizer_max_length = 32  # passed to CrossEncoder as max_length
ce_model_dir = f"../data/models/cross-encoder-{given_surname}-10m-265-same-all"
be_model_type = 'cecommon+0+aug-0-1'
be_model_path = f"../data/models/bi_encoder-{given_surname}-{be_model_type}.pth"
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"  # name frequencies

# Output locations; the file names encode the full clustering configuration.
# experiment_dir = f"../reports/"
clusters_path = f"../data/processed/clusters_{given_surname}-{scorer}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}.json"
super_clusters_path = f"../data/processed/super_clusters_{given_surname}-{scorer}-{linkage}-{similarity_threshold}-{cluster_freq_normalizer}.json"
# display the resolved path as the cell output
clusters_path

## Load data

In [None]:
def get_cluster_freq(name):
    """Return how many times `name` should be repeated when clustering.

    The count is derived from the name's frequency in `name_freq` (0 when the
    name is unknown) according to `cluster_freq_normalizer`:

    - "log":   floor of the natural log of the frequency, at least 1
    - "log10": floor of the base-10 log of the frequency, at least 1
    - anything else: always 1
    """
    count = name_freq.get(name, 0)
    log_fn = {"log": math.log, "log10": math.log10}.get(cluster_freq_normalizer)
    if log_fn is None:
        return 1
    # clamp to 1 before taking the log (log of 0 is undefined) and again afterwards
    return max(1, math.floor(log_fn(max(1, count))))

In [None]:
# load buckets
# The first token on each line is the bucket name; every token on the line
# (including the first) is recorded as a member of that bucket.
bucket_names = defaultdict(set)  # bucket name -> names in the bucket
name_buckets = defaultdict(set)  # name -> buckets that contain it
with open(std_path, 'rt') as f:
    for line in f:
        # ':' and ' ' both act as separators
        names = line.strip().replace(':', ' ').split(' ')
        bucket_name = names[0]
        for name in (token.strip() for token in names):
            if name:
                bucket_names[bucket_name].add(name)
                name_buckets[name].add(bucket_name)
print(len(bucket_names), len(name_buckets))

In [None]:
# load pref names
# na_filter=False keeps every cell as read, so no name is turned into NaN.
pref_df = pd.read_csv(pref_path, na_filter=False)
name_freq = dict(zip(pref_df['name'], pref_df['frequency']))
# drop the reference to the frame now that the lookup table exists
pref_df = None
print(len(name_freq))
# sanity check on one name
print('john', name_freq['john'], get_cluster_freq('john'))

In [None]:
# List (and count) the bucket names that have no entry in the frequency table.
cnt = 0
for name in name_buckets:
    if name in name_freq:
        continue
    cnt += 1
    print(name)
print(cnt)

In [None]:
def get_most_freq_name(names):
    """Return the name in `names` with the highest frequency in `name_freq`.

    Names missing from `name_freq` count as frequency 0. When several names
    share the highest frequency, the first one encountered wins. Returns None
    when `names` is empty.
    """
    return max(names, key=lambda candidate: name_freq.get(candidate, 0), default=None)

In [None]:
# load tokenize function
# `tokenize` is applied to each name before it is passed to the bi-encoder's get_embedding.
tokenize, tokenizer_vocab = get_tokenize_function_and_vocab(
    max_tokens=max_tokens,
    subwords_path=subwords_path,
)
# display the vocabulary size as the cell output
len(tokenizer_vocab)

In [None]:
# load bi-encoder model
# NOTE(review): torch.load unpickles the file, so only load model files from a trusted source.
# NOTE(review): no map_location is passed — presumably the model is restored onto the device it was
# saved from; confirm this works on a CPU-only machine.
be_model = torch.load(be_model_path)
# presumably an nn.Module; eval() is called before any embeddings are computed
be_model.eval()

In [None]:
# load cross encoder model
# ce_model.predict is later called with [name1, name2] pairs and returns one score per pair.
ce_model = CrossEncoder(ce_model_dir, max_length=tokenizer_max_length)

## Compute Bi-Encoder Embeddings

In [None]:
# name -> bi-encoder embedding, for every name that appears in any bucket.
# A name that belongs to several buckets is simply embedded again and overwritten.
name_embedding = {}
for names in tqdm(bucket_names.values()):
    for name in names:
        name_embedding[name] = be_model.get_embedding(tokenize(name))

## Compare Bi-encoder and Cross-encoder scores

In [None]:
# Parallel lists: entry i of each list describes the same sampled name pair.
bi_encoder_scores = []
ce_avg_scores = []
ce_harmonic_scores = []
ce_max_scores = []

for bucket_name, names in tqdm(bucket_names.items()):
    # a single-name bucket has no pair to compare
    if len(names) == 1:
        continue
    # NOTE(review): X is assigned but never used in this cell
    X = []
    # random.sample is given a list rather than the set stored in bucket_names
    names = list(names)
    # Up to 3 names are drawn twice, independently, so each bucket contributes at most
    # 9 ordered pairs (self-pairs are skipped). No random seed is set in the visible
    # notebook, so the sampled pairs differ from run to run.
    for name1 in random.sample(names, min(len(names), 3)):
        for name2 in random.sample(names, min(len(names), 3)):
            if name1 == name2:
                continue
            # compute bi-encoder score
            emb1 = name_embedding[name1]
            emb2 = name_embedding[name2]
            bi_encoder_score = cosine_similarity([emb1], [emb2])[0][0]
            # compute cross-encoder scores (both argument orders, then three ways of combining them)
            ce_scores = ce_model.predict([[name1, name2], [name2, name1]])
            ce_harmonic_score = harmonic_mean([ce_scores[0], ce_scores[1]])
            ce_avg_score = (ce_scores[0]+ce_scores[1])/2
            ce_max_score = max(ce_scores[0], ce_scores[1])
            # save them
            bi_encoder_scores.append(bi_encoder_score)
            ce_harmonic_scores.append(ce_harmonic_score)
            ce_avg_scores.append(ce_avg_score)
            ce_max_scores.append(ce_max_score)

In [None]:
# Print the first 21 (harmonic mean, average, max) cross-encoder score triples for a quick look.
for ix, (h, a, m) in enumerate(zip(ce_harmonic_scores, ce_avg_scores, ce_max_scores)):
    if ix > 20:
        break
    print(h, a, m)

In [None]:
# All four lists should have the same length: one entry per sampled pair.
print(len(bi_encoder_scores), len(ce_harmonic_scores), len(ce_avg_scores), len(ce_max_scores))

In [None]:
# One point per sampled name pair: bi-encoder score vs. harmonic mean of the two cross-encoder scores.
plt.scatter(bi_encoder_scores, ce_harmonic_scores, s=1, alpha=0.1)
# label the figure so it can be told apart from the average/max variants
plt.xlabel("bi-encoder score (cosine similarity)")
plt.ylabel("cross-encoder score (harmonic mean)")
plt.title("Bi-encoder vs. cross-encoder harmonic mean")
plt.show()

In [None]:
# One point per sampled name pair: bi-encoder score vs. average of the two cross-encoder scores.
plt.scatter(bi_encoder_scores, ce_avg_scores, s=1, alpha=0.1)
# label the figure so it can be told apart from the harmonic-mean/max variants
plt.xlabel("bi-encoder score (cosine similarity)")
plt.ylabel("cross-encoder score (average)")
plt.title("Bi-encoder vs. cross-encoder average")
plt.show()

In [None]:
# One point per sampled name pair: bi-encoder score vs. the larger of the two cross-encoder scores.
plt.scatter(bi_encoder_scores, ce_max_scores, s=1, alpha=0.1)
# label the figure so it can be told apart from the harmonic-mean/average variants
plt.xlabel("bi-encoder score (cosine similarity)")
plt.ylabel("cross-encoder score (max)")
plt.title("Bi-encoder vs. cross-encoder max")
plt.show()

In [None]:
# Overlaid score distributions for the sampled pairs: bi-encoder cosine similarity vs.
# the harmonic mean of the two cross-encoder scores.
plt.figure(figsize=(10, 6))
plt.hist(bi_encoder_scores, bins=100, alpha=0.5, label="bi-encoder", color='green')
plt.hist(ce_harmonic_scores, bins=100, alpha=0.5, label="cross-encoder", color='red')
# axis labels and a title so the figure stands on its own
plt.xlabel("similarity score")
plt.ylabel("number of sampled pairs")
plt.title("Score distribution: bi-encoder vs. cross-encoder (harmonic mean)")
plt.legend(loc='upper right')
# Show the plot
plt.tight_layout()
plt.show()

## Cluster names

### create clusterer

In [None]:
def compute_row(name, names, scorer="ce"):  # scorer=ce, be, or cebe
    """Return the distances (1 - similarity) from `name` to every entry of `names`.

    Similarity depends on `scorer`:

    - "be":   cosine similarity of the two bi-encoder embeddings
    - "cebe": cross-encoder score, replaced by the bi-encoder cosine
              similarity whenever the cross-encoder score is below 0.2
    - anything else ("ce"): harmonic mean of the cross-encoder scores for
      (name, other) and (other, name); 1.0 when other == name

    Each distance is repeated get_cluster_freq(other) times, so the row lines
    up with a matrix whose rows are repeated the same way.
    """
    uses_cross_encoder = scorer != "be"
    if uses_cross_encoder:
        # score all pairs, in both argument orders, with a single predict call
        pairs = []
        for other in names:
            if other == name:
                continue
            pairs.extend(([name, other], [other, name]))
        if pairs:
            scores = ce_model.predict(pairs)

    distances = []
    cursor = 0  # position in `scores` of the next (forward, reverse) pair
    for other in names:
        if not uses_cross_encoder:
            similarity = 0.0
        elif other == name:
            similarity = 1.0
        else:
            similarity = harmonic_mean([scores[cursor], scores[cursor + 1]])
            cursor += 2
        if not uses_cross_encoder or (scorer == "cebe" and similarity < 0.2):
            similarity = cosine_similarity([name_embedding[name]], [name_embedding[other]])[0][0]

        # store the distance between name and other, once per repeat of other
        distances.extend([1.0 - similarity] * get_cluster_freq(other))
    return distances

In [None]:
# Agglomerative clustering over a precomputed distance matrix. With n_clusters=None the
# number of clusters is determined by the distance threshold, i.e. 1 - similarity_threshold.
clusterer = AgglomerativeClustering(
    metric="precomputed",
    linkage=linkage,
    n_clusters=None,
    distance_threshold=1 - similarity_threshold,
)

### test clusterer

In [None]:
%%time

# Try the clusterer on a single bucket and print the resulting clusters.
# Alternative hand-picked inputs:
# test_names = ['david', 'dan', 'daniel', 'dave']  # , 'darris', 'darrin', 'daren',
#               'abraham','abe','aabraham','ab','abaham','abaraham','abarham','abb',
#               'abelarde','abera','aberaham']
# test_names = ['maholy', 'malay', 'mauley', 'ma', 'mala']
# NOTE(review): bucket_names is a defaultdict, so a missing bucket silently yields an empty set here
test_names = bucket_names['elizabeth']
print(len(test_names))
# Build the distance matrix; each name's row is repeated get_cluster_freq(name) times,
# matching the repeated columns that compute_row produces.
X = []
names = []
for name in test_names:
    row = compute_row(name, test_names, scorer)
    for _ in range(get_cluster_freq(name)):
        names.append(name)
        X.append(row)
print(len(X))
clustering = clusterer.fit(X)
clusters = [set() for _ in range(clustering.n_clusters_)]
# Keep the result under a scorer-specific name as well; it is the same list object
# as `clusters`, so it sees the names added further down.
if scorer == "be":
    be_clusters = clusters
elif scorer == "ce":
    ce_clusters = clusters
elif scorer == "cebe":
    cebe_clusters = clusters
print('n_clusters', clustering.n_clusters_)
print('labels', clustering.labels_)
print('names', names)
# Repeated names share a label set, so adding to a set collapses the repeats.
for name, cluster in zip(names, clustering.labels_):
    clusters[cluster].add(name)
for cluster in clusters:
    print()
    print('cluster', get_most_freq_name(cluster), ':', cluster)

In [None]:
# Keep the current cross-encoder test clusters under a threshold-specific name and print them.
# NOTE(review): this only aliases whatever `ce_clusters` holds right now. Presumably the config,
# clusterer and test cells were re-run with similarity_threshold=0.15 just before this cell;
# on a straight top-to-bottom run it holds the clusters for the configured threshold instead.
# `ce_clusters` exists only if the test cell ran with scorer == "ce".
ce_15_clusters = ce_clusters
print(len(ce_15_clusters))
for cluster in ce_15_clusters:
    print()
    print('cluster', get_most_freq_name(cluster), ':', cluster)

In [None]:
# Keep the current cross-encoder test clusters under a threshold-specific name and print them.
# NOTE(review): this only aliases whatever `ce_clusters` holds right now. Presumably the config,
# clusterer and test cells were re-run with similarity_threshold=0.20 just before this cell;
# on a straight top-to-bottom run it holds the clusters for the configured threshold instead.
# `ce_clusters` exists only if the test cell ran with scorer == "ce".
ce_20_clusters = ce_clusters
print(len(ce_20_clusters))
for cluster in ce_20_clusters:
    print()
    print('cluster', get_most_freq_name(cluster), ':', cluster)

In [None]:
# Keep the current cross-encoder test clusters under a threshold-specific name and print them.
# NOTE(review): this only aliases whatever `ce_clusters` holds right now. Presumably the config,
# clusterer and test cells were re-run with similarity_threshold=0.25 just before this cell;
# on a straight top-to-bottom run it holds the clusters for the configured threshold instead.
# `ce_clusters` exists only if the test cell ran with scorer == "ce".
ce_25_clusters = ce_clusters
print(len(ce_25_clusters))
for cluster in ce_25_clusters:
    print()
    print('cluster', get_most_freq_name(cluster), ':', cluster)

In [None]:
def get_be_score(name1, name2):
    """Return the cosine similarity of the bi-encoder embeddings of the two names.

    Both names must already have an entry in `name_embedding`.
    """
    similarity_matrix = cosine_similarity([name_embedding[name1]], [name_embedding[name2]])
    # a 1x1 matrix: one vector on each side
    return similarity_matrix[0][0]
           
def get_ce_score(name1, name2):
    """Return the harmonic mean of the cross-encoder scores for both orderings of the pair.

    The two raw scores are printed before they are combined.
    """
    both_orders = [[name1, name2], [name2, name1]]
    scores = ce_model.predict(both_orders)
    print(scores)
    forward, reverse = scores[0], scores[1]
    return harmonic_mean([forward, reverse])

# Spot-check one pair with both scorers, then show the matching one-entry distance row.
# Both names need an entry in name_embedding, i.e. they must appear in some bucket.
name1 = 'mauley'
name2 = 'mala'
print(get_be_score(name1, name2), get_ce_score(name1, name2))
print(compute_row(name1, [name2], scorer="ce"))

### run clusterer

In [None]:
# Echo the configuration that the full clustering run below will use.
print(scorer, linkage, similarity_threshold, clusters_path)

In [None]:
# Rebuild the clusterer from the current configuration for the full run: agglomerative
# clustering over a precomputed distance matrix, cut at distance 1 - similarity_threshold.
clusterer = AgglomerativeClustering(
    metric="precomputed",
    linkage=linkage,
    n_clusters=None,
    distance_threshold=1 - similarity_threshold,
)

In [None]:
# bucket name -> list of clusters (sets of names) found inside that bucket
bucket_clusters = {}
for bucket_name, names in tqdm(bucket_names.items()):
    # a bucket with a single name is its own (only) cluster
    if len(names) == 1:
        bucket_clusters[bucket_name] = [names]
        continue
    # Build the distance matrix X; a name's row is repeated get_cluster_freq(name) times
    # and clustered_names records which name each row belongs to.
    X = []
    clustered_names = []
    for name in names:
        row = compute_row(name, names, scorer)
        repeats = get_cluster_freq(name)
        clustered_names.extend([name] * repeats)
        X.extend([row] * repeats)
    X = np.array(X)
    # cluster
    clustering = clusterer.fit(X)
    # create the clusters; using sets collapses the repeated rows back to one name each
    clusters = [set() for _ in range(clustering.n_clusters_)]
    for name, cluster in zip(clustered_names, clustering.labels_):
        clusters[cluster].add(name)
    bucket_clusters[bucket_name] = clusters
len(bucket_clusters)

In [None]:
# Totals: number of buckets, number of clusters over all buckets,
# and number of names over all clusters.
print(
    len(bucket_clusters), 
    sum(len(bucket_cluster) for bucket_cluster in bucket_clusters.values()),
    sum(sum(len(cluster) for cluster in clusters) for clusters in bucket_clusters.values())  
)

## Evaluate results

In [None]:
# Number of buckets and total number of clusters across all of them.
bucket_cluster_total = sum(len(clusters) for clusters in bucket_clusters.values())
print(len(bucket_clusters), bucket_cluster_total)

In [None]:
# what about the top 100, 1000 names?
total_clusters = 0
total_names = 0
for ix, name in enumerate(name_freq.keys()):
    if ix % 100 == 0 and ix > 0:
        print(total_names, total_clusters / total_names)
    if ix == 2000:
        break
    if name not in name_buckets:
        continue
    bucket_name = next(iter(name_buckets[name]))
    total_clusters += len(bucket_clusters[bucket_name])
    total_names += 1

In [None]:
# Show the clusters of the bucket that each of the first 10 names in name_freq belongs to.
for name in list(name_freq)[:10]:
    if name in name_buckets:
        # a name can belong to several buckets; an arbitrary one of them is shown
        bucket_name = next(iter(name_buckets[name]))
        print('***', name, bucket_name)
        for ix, cluster in enumerate(bucket_clusters[bucket_name]):
            print(' ', ix, get_most_freq_name(cluster), ':', ' '.join(cluster))


### Write experiment report

deprecated

## Save Clusters and Super-Clusters

In [None]:
def get_cluster_centroid(cluster):
    """Return the unit-length centroid of the bi-encoder embeddings of a cluster.

    Each name's embedding (from `name_embedding`) is weighted by
    get_cluster_freq(name); the weighted sum is then scaled to unit length.

    Raises:
        ValueError: if `cluster` is empty, or if the weighted sum has zero
            length and therefore cannot be normalized.
    """
    centroid = None
    for name in cluster:
        # multiplying builds a new array, so the stored embedding is never modified
        weighted = name_embedding[name] * get_cluster_freq(name)
        centroid = weighted if centroid is None else centroid + weighted
    if centroid is None:
        raise ValueError("cannot compute the centroid of an empty cluster")
    norm = np.linalg.norm(centroid)
    if norm == 0:
        # dividing would give a NaN vector, which is not valid JSON once serialized
        raise ValueError("cannot normalize a zero-length centroid")
    return centroid / norm

In [None]:
# Spot-check the repeat count used for one name.
get_cluster_freq('richard')

In [None]:
# Sanity-check the centroid: compare one name's embedding against another name, against
# itself, against the plain sum of both, and against the two-name cluster centroid.
# Both names need an entry in name_embedding.
emb1 = name_embedding['richard']
emb2 = name_embedding['dallan']
print(cosine_similarity([emb1], [emb2]))
print(cosine_similarity([emb1], [emb1]))
print(cosine_similarity([emb1], [emb1+emb2]))
print(cosine_similarity([emb1], [get_cluster_centroid(['richard', 'dallan'])]))

In [None]:
# all_clusters:   "<bucket>/<most frequent name in cluster>" -> {"names", "centroid"}
# super_clusters: bucket name -> cluster labels, only for buckets with more than one cluster
all_clusters = {}
super_clusters = {}
for bucket_name, clusters in bucket_clusters.items():
    cluster_names = []
    for cluster in clusters:
        centroid = get_cluster_centroid(cluster)
        cluster_name = f"{bucket_name}/{get_most_freq_name(cluster)}"
        cluster_names.append(cluster_name)
        all_clusters[cluster_name] = {
            # clusters are sets; sorting keeps the saved file stable from run to run
            "names": sorted(cluster),
            "centroid": centroid.tolist(),
        }
    if len(cluster_names) > 1:
        super_clusters[bucket_name] = cluster_names

In [None]:
# Number of clusters and number of super-clusters about to be saved.
print(len(all_clusters), len(super_clusters))

In [None]:
# Output files that the next cell writes.
print(clusters_path, super_clusters_path)

In [None]:
# Write the clusters and the super-clusters as indented JSON, one file each.
for out_path, payload in ((clusters_path, all_clusters), (super_clusters_path, super_clusters)):
    with open(out_path, 'wt') as f:
        json.dump(payload, f, indent=2)