In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Train a greedy clustering model
Add names to clusters in order from most frequent to least frequent,
using a similarity threshold that varies with name frequency.

In [None]:
from collections import Counter, defaultdict
import math
import random

import boto3
import jellyfish
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm.autonotebook import tqdm

from src.data.normalize import normalize_freq_names
from src.data.filesystem import fopen
from src.eval.freq_metrics import calc_avg_precision_recall
from src.models.cluster import write_clusters
from src.models.swivel import SwivelModel
from src.models.utils import add_padding


In [None]:
# configure
# Which name type this run handles: "given" or "surname".
given_surname = "surname"

# The swivel vocabulary size differs per name type.
if given_surname == "given":
    vocab_size = 610000
else:
    vocab_size = 2100000

# Embedding width; it is also part of the saved model's file name.
embed_dim = 100

# General run settings (not referenced by the cells visible in this notebook).
n_jobs = 8
verbose = True

# Inputs in the familysearch-names bucket.
tree_freq_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"
query_path = f"s3://familysearch-names/processed/query-names-{given_surname}-v2.csv.gz"

# Swivel vocabulary and model weights in the nama-data bucket.
swivel_vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-swivel-vocab-{vocab_size}-augmented.csv"
swivel_model_path = f"s3://nama-data/data/models/fs-{given_surname}-swivel-model-{vocab_size}-{embed_dim}-augmented.pth"

# Nickname list; only loaded when given_surname == "given".
nickname_bucket = "familysearch-names"
nickname_path = "processed/givenname_nicknames.csv"

### Load data

In [None]:
# Use the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Read the aggregated name frequencies; na_filter=False turns off NA detection
# so every name is kept as the literal string found in the file.
freq_df = pd.read_csv(tree_freq_path, na_filter=False)
tree_name_freq = normalize_freq_names(
    freq_df,
    is_surname=(given_surname != "given"),
    add_padding=False,
    dont_return_empty=False,
)
# Drop the reference to the raw frame; only the normalized result is used below.
freq_df = None
# Spot-check one well-known name for the selected name type.
tree_name_freq["mary" if given_surname == "given" else "johnson"]

In [None]:
# Load the swivel vocabulary as a name -> embedding-row-index mapping.
# keep_default_na=False (as in the other reads in this notebook) stops pandas
# from turning names such as "nan", "null" or "NA" into NaN, which would lose
# those names and could merge several of them into a single NaN key.
vocab_df = pd.read_csv(fopen(swivel_vocab_path, "rb"), keep_default_na=False)
swivel_vocab = dict(zip(vocab_df["name"], vocab_df["index"]))

In [None]:
# Build a model sized to the vocabulary, then restore the saved weights onto `device`.
swivel_model = SwivelModel(len(swivel_vocab), embed_dim)
swivel_model.load_state_dict(
    torch.load(fopen(swivel_model_path, "rb"), map_location=device)
)
swivel_model.to(device)
# eval() is the last expression, so its return value is displayed by the cell.
swivel_model.eval()

In [None]:
s3 = boto3.resource('s3')

# Map every name on a nickname line to that line's head name (first field).
# These nicknames include nickname heads going to themselves (e.g., john -> john).
nicknames = defaultdict(set)
if given_surname == "given":
    obj = s3.Object(nickname_bucket, nickname_path)
    contents = obj.get()['Body'].read().decode('utf-8')
    for line in contents.split('\n'):
        line = line.strip()
        if not line:
            # skip blank lines (e.g. after a trailing newline) so the empty
            # string never becomes a nickname entry
            continue
        names = line.split(',')
        headname = names[0]
        for name in names:
            nicknames[name].add(headname)
print(len(nicknames))
# Use .get() for inspection: indexing a defaultdict inserts an empty entry for a
# missing key, which would later make `name in nicknames` true for that name.
print(nicknames.get('zachery', set()))
print(nicknames.get('zachariah', set()))

In [None]:
# TODO review with Richard
# Use .get() so that looking at a missing name does not insert an empty entry
# into the nicknames defaultdict (membership tests on it are used downstream).
print(nicknames.get('joan', set()))
print(nicknames.get('joane', set()))

In [None]:
# Load the query names as a plain list; keep_default_na=False keeps strings such
# as "NA" from being parsed as missing values.
query_names = (
    pd.read_csv(query_path, keep_default_na=False)
    .loc[:, "name"]
    .tolist()
)
print(len(query_names))
query_names[:3]

In [None]:
# Training pairs; the "tree_name" and "record_name" columns are read by eval_clusters.
train_df = pd.read_csv(train_path, keep_default_na=False)
print(train_df.shape)
train_df.iloc[:3]

## Greedy Cluster

In [None]:
def cluster(name_counter, nicknames, swivel_vocab, swivel_model,
            n_to_cluster, high_freq_ix, low_freq_ix, upper, lower):
    """Greedily cluster names, most frequent first, by embedding similarity.

    Each name is compared (cosine similarity) with every name already processed.
    If the best score reaches the name's threshold, the name joins the cluster(s)
    of that nearest name; otherwise it starts a new cluster named after itself.
    When the notebook-level ``given_surname`` is "given", names with nickname
    heads are instead placed in the cluster(s) of those heads.

    The threshold is ``m * log(freq) + b`` clamped to ``[lower, upper]``, where
    the line passes through ``(log(freq at rank high_freq_ix), upper)`` and
    ``(log(freq at rank low_freq_ix), lower)``.

    Args:
        name_counter: Counter mapping name -> frequency.
        nicknames: mapping name -> set of nickname head names.
        swivel_vocab: mapping padded name -> row index of the embedding matrix.
        swivel_model: model whose ``wi.weight`` holds the name embeddings.
        n_to_cluster: how many of the most frequent names to process.
        high_freq_ix: rank (as passed to ``most_common``) whose frequency maps to ``upper``.
        low_freq_ix: rank (as passed to ``most_common``) whose frequency maps to ``lower``.
        upper: threshold applied at and above the high reference frequency.
        lower: threshold applied at and below the low reference frequency.

    Returns:
        ``(name2clusters, cluster2names)``, two ``defaultdict(set)`` mappings.
        Names missing from ``swivel_vocab`` are skipped.
    """
    name2clusters = defaultdict(set)
    cluster2names = defaultdict(set)
    if not name_counter:
        # nothing to cluster; also avoids indexing an empty most_common() list
        return name2clusters, cluster2names

    # Set m and b to fit the points:
    # (log(name_freq@high_freq_ix), upper)
    # (log(name_freq@low_freq_ix), lower)
    log_high_freq = math.log(name_counter.most_common(high_freq_ix)[-1][1])
    log_low_freq = math.log(name_counter.most_common(low_freq_ix)[-1][1])
    if log_low_freq == log_high_freq:
        # both reference ranks have the same frequency: the line is undefined,
        # so fall back to a flat threshold instead of dividing by zero
        m = 0.0
        b = upper
    else:
        m = (lower - upper) / (log_low_freq - log_high_freq)
        b = upper - m * log_high_freq

    clustered_names = []
    clustered_vectors = []
    for count, (name, freq) in enumerate(tqdm(name_counter.most_common(n_to_cluster))):
        # print stats periodically
        if count % 10000 == 0:
            print(count, 'n_clusters', len(cluster2names))

        # calculate threshold = m * log(freq) + b, clamped between upper and lower
        threshold = max(lower, min(upper, m * math.log(freq) + b))

        # get name vector; names without an embedding are skipped entirely
        name_ix = swivel_vocab.get(add_padding(name), -1)
        if name_ix < 0:
            continue
        name_vector = swivel_model.wi.weight.data[name_ix].cpu().numpy()

        # is this a nickname? An empty head set (e.g. left behind by indexing a
        # defaultdict) does not count, otherwise the name would get no cluster.
        nickheads = nicknames.get(name) if given_surname == "given" else None
        if nickheads:
            for nickhead in nickheads:
                if nickhead not in name2clusters:
                    # create a new cluster for nickname head
                    name2clusters[nickhead].add(nickhead)
                    cluster2names[nickhead].add(nickhead)
                # add name to nickhead cluster(s)
                for cluster_id in name2clusters[nickhead]:
                    name2clusters[name].add(cluster_id)
                    cluster2names[cluster_id].add(name)

        # is this the first name?
        elif not clustered_names:
            name2clusters[name].add(name)
            cluster2names[name].add(name)

        # compare name vector to clustered vectors
        else:
            scores = cosine_similarity([name_vector], clustered_vectors)[0]
            # index of the first maximum, computed by numpy rather than a Python loop
            max_score_ix = int(scores.argmax())
            # is name vector within threshold to an existing clustered name?
            if scores[max_score_ix] >= threshold:
                # add the name to the same cluster(s) as the nearest name
                nearest_name = clustered_names[max_score_ix]
                for cluster_id in name2clusters[nearest_name]:
                    name2clusters[name].add(cluster_id)
                    cluster2names[cluster_id].add(name)
            else:
                # otherwise, create a new cluster
                name2clusters[name].add(name)
                cluster2names[name].add(name)

        # add name to clustered names
        # TODO consider only adding cluster heads to clustered names + vectors
        clustered_names.append(name)
        clustered_vectors.append(name_vector)

    return name2clusters, cluster2names

## Hyperparameter Search

In [None]:
# Inspect with .get(): indexing the nicknames defaultdict would insert an empty
# entry for a missing name and change later `name in nicknames` checks.
nicknames.get('ann', set())

In [None]:
# NOTE: stale example - cluster() now takes high_freq_ix/low_freq_ix instead of m/b.
# test_name_counter = Counter({
#     'anna': 1000, 
#     'ann': 100, 
#     'anne': 10, 
#     'nantje': 2,
#     'nanci': 1,
# })
# for name in test_name_counter.keys():
#     print(name, nicknames.get(name))
    
# name2clusters, cluster2names = cluster(test_name_counter, 
#                                        nicknames, 
#                                        swivel_vocab, 
#                                        swivel_model, 
#                                        n_to_cluster=100000, 
#                                        m=0.04, 
#                                        b=0.34, 
#                                        upper=0.95, 
#                                        lower=0.6)
# print(name2clusters)
# print(cluster2names)

In [None]:
# NOTE: stale example - cluster() now takes high_freq_ix/low_freq_ix instead of m/b.
# name2clusters, cluster2names = cluster(name_counter, 
#                                        nicknames, 
#                                        swivel_vocab, 
#                                        swivel_model, 
#                                        n_to_cluster=5000, 
#                                        m=0.04, 
#                                        b=0.34, 
#                                        upper=0.95, 
#                                        lower=0.6)


In [None]:
# name2clusters is produced by the clustering cells further down, so on a fresh
# top-to-bottom run it does not exist yet; guard instead of raising NameError.
# .get() avoids inserting empty entries into the defaultdict while inspecting.
if "name2clusters" in globals():
    print(name2clusters.get('ann', set()))
    print(name2clusters.get('anna', set()))
else:
    print("name2clusters is not defined yet; run a clustering cell first")

In [None]:
# Wrap tree_name_freq (presumably a name -> frequency mapping; confirm against
# normalize_freq_names) in a Counter so most_common() can rank names by frequency.
name_counter = Counter(tree_name_freq)

In [None]:
# Print position and frequency for a handful of names of interest.
# NOTE(review): ix is the position in name_counter's iteration (insertion) order;
# it equals the frequency rank only if tree_name_freq was built in descending
# frequency order - confirm before using it to pick high_freq_ix / low_freq_ix.
for ix, (name, freq) in enumerate(name_counter.items()):
    if name not in {'quass', 'quast', 'phillips', 'chatterton', 'simon', 'lillywhite',
                    'ehat', 'chesworth', 'satter', 'jensen', 'jenson', 'adams'}:
        continue
    print(name, ix, freq)

In [None]:
# Show the last ten entries of the 500 most frequent names, with their counts.
name_counter.most_common(500)[-10:]

### Compute and Eval clusters

In [None]:
def _get_nama_standards(nicknames, name2clusters, name):
    standards = set()
    lookups = set([name])
    if given_surname == "given" and name in nicknames:
        lookups.update(nicknames[name])
    for lookup in lookups:
        if lookup in name2clusters:
            standards.update(name2clusters[lookup])
    return standards

def get_nama_nysiis(nicknames, name2clusters, names):
    """Build query and index code maps from cluster ids plus a NYSIIS fallback.

    Returns ``(name2codes, code2names)``:
      * ``name2codes`` simulates a query: the codes to look up for a name. It
        holds the name's cluster ids and always its NYSIIS code.
      * ``code2names`` simulates the index: the names stored under a code. A
        name is stored under its cluster ids, or under its NYSIIS code only
        when it has no cluster id at all.
    """
    name2codes = defaultdict(set)
    code2names = defaultdict(set)
    for name in tqdm(names):
        standards = _get_nama_standards(nicknames, name2clusters, name)
        nysiis_code = jellyfish.nysiis(name)

        # query side: every cluster id, and always the nysiis code
        name2codes[name] |= standards
        name2codes[name].add(nysiis_code)

        # index side: cluster ids when there are any, else the nysiis bucket
        index_codes = standards if standards else {nysiis_code}
        for code in index_codes:
            code2names[code].add(name)
    return name2codes, code2names

def eval_clusters(nicknames, name2clusters, train_df, query_names):
    """Print index/query statistics and average precision/recall for a clustering.

    Codes are built for every name in the ``tree_name`` and ``record_name``
    columns of ``train_df``; the scores come from ``calc_avg_precision_recall``.
    Nothing is returned.
    """
    train_names = set(train_df["tree_name"]) | set(train_df["record_name"])
    name2codes, code2names = get_nama_nysiis(nicknames, name2clusters, train_names)

    print("total names", len(name2codes))
    print("total index entries", sum(map(len, code2names.values())))
    print("total codes", len(code2names))
    print("total queries", len(query_names))
    # indexing (rather than .get) is kept deliberately: name2codes is a defaultdict
    print("total lookups", sum(len(name2codes[query]) for query in query_names))

    scores = calc_avg_precision_recall(query_names, name2codes, code2names, train_df)
    precision, recall, f1, f2 = scores
    print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

def save_clusters(path, cluster2names):
    """Write the clusters to ``path`` as a two-column CSV (name, cluster).

    One row is written per (cluster id, member name) pair; no index column.
    """
    pairs = [
        (name, cluster_id)
        for cluster_id, names in cluster2names.items()
        for name in names
    ]
    table = pd.DataFrame({
        "name": [name for name, _ in pairs],
        "cluster": [cluster_id for _, cluster_id in pairs],
    })
    table.to_csv(path, index=False)

In [None]:
# Candidate values for each hyperparameter; the grid is their full cross product.
n_to_cluster_values = [300000]
upper_values = [0.71]
lower_values = [0.71]
high_freq_ix_values = [100]  # 100, 500, 2000
low_freq_ix_values = [25000]  # 10000, 25000, 100000

# One dict per combination; the keys match cluster()'s keyword arguments.
hyperparameters = [
    {
        'n_to_cluster': n_to_cluster,
        'upper': upper,
        'lower': lower,
        'high_freq_ix': high_freq_ix,
        'low_freq_ix': low_freq_ix,
    }
    for n_to_cluster in n_to_cluster_values
    for high_freq_ix in high_freq_ix_values
    for low_freq_ix in low_freq_ix_values
    for upper in upper_values
    for lower in lower_values
]
print(len(hyperparameters))

In [None]:
# Run the sweep: for every hyperparameter combination, cluster the names,
# print evaluation metrics, and save the clusters to a CSV whose file name
# encodes the hyperparameter values.
# name2clusters, cluster2names and path stay bound to the last combination
# after the loop ends.
for hyperparameter in hyperparameters:
    print(hyperparameter)
    name2clusters, cluster2names = cluster(name_counter, 
                                           nicknames, 
                                           swivel_vocab, 
                                           swivel_model, 
                                           **hyperparameter)
    eval_clusters(nicknames, name2clusters, train_df, query_names)
    # output path is relative to the notebook's working directory
    path = f"../data/models/fs-{given_surname}-cluster-greedy-{hyperparameter['n_to_cluster']}-upper_{hyperparameter['upper']}-lower_{hyperparameter['lower']}-high_freq_ix_{hyperparameter['high_freq_ix']}-low_freq_ix_{hyperparameter['low_freq_ix']}.csv"
    save_clusters(path, cluster2names)
    print(len(cluster2names), path)

In [None]:
# Sweep cell: for every hyperparameter combination, cluster the names, print
# evaluation metrics, and save the clusters to a CSV named after the values.
# NOTE(review): this code is identical to the other sweep cells in this
# notebook; with an unchanged `hyperparameters` list it repeats the same
# clustering and overwrites the same files. Presumably kept to retain outputs
# of runs with different grid values - confirm, or collapse into one cell.
for hyperparameter in hyperparameters:
    print(hyperparameter)
    name2clusters, cluster2names = cluster(name_counter, 
                                           nicknames, 
                                           swivel_vocab, 
                                           swivel_model, 
                                           **hyperparameter)
    eval_clusters(nicknames, name2clusters, train_df, query_names)
    # output path is relative to the notebook's working directory
    path = f"../data/models/fs-{given_surname}-cluster-greedy-{hyperparameter['n_to_cluster']}-upper_{hyperparameter['upper']}-lower_{hyperparameter['lower']}-high_freq_ix_{hyperparameter['high_freq_ix']}-low_freq_ix_{hyperparameter['low_freq_ix']}.csv"
    save_clusters(path, cluster2names)
    print(len(cluster2names), path)

In [None]:
# Sweep cell: for every hyperparameter combination, cluster the names, print
# evaluation metrics, and save the clusters to a CSV named after the values.
# NOTE(review): this code is identical to the other sweep cells in this
# notebook; with an unchanged `hyperparameters` list it repeats the same
# clustering and overwrites the same files. Presumably kept to retain outputs
# of runs with different grid values - confirm, or collapse into one cell.
for hyperparameter in hyperparameters:
    print(hyperparameter)
    name2clusters, cluster2names = cluster(name_counter, 
                                           nicknames, 
                                           swivel_vocab, 
                                           swivel_model, 
                                           **hyperparameter)
    eval_clusters(nicknames, name2clusters, train_df, query_names)
    # output path is relative to the notebook's working directory
    path = f"../data/models/fs-{given_surname}-cluster-greedy-{hyperparameter['n_to_cluster']}-upper_{hyperparameter['upper']}-lower_{hyperparameter['lower']}-high_freq_ix_{hyperparameter['high_freq_ix']}-low_freq_ix_{hyperparameter['low_freq_ix']}.csv"
    save_clusters(path, cluster2names)
    print(len(cluster2names), path)

In [None]:
# Sweep cell: for every hyperparameter combination, cluster the names, print
# evaluation metrics, and save the clusters to a CSV named after the values.
# NOTE(review): this code is identical to the other sweep cells in this
# notebook; with an unchanged `hyperparameters` list it repeats the same
# clustering and overwrites the same files. Presumably kept to retain outputs
# of runs with different grid values - confirm, or collapse into one cell.
for hyperparameter in hyperparameters:
    print(hyperparameter)
    name2clusters, cluster2names = cluster(name_counter, 
                                           nicknames, 
                                           swivel_vocab, 
                                           swivel_model, 
                                           **hyperparameter)
    eval_clusters(nicknames, name2clusters, train_df, query_names)
    # output path is relative to the notebook's working directory
    path = f"../data/models/fs-{given_surname}-cluster-greedy-{hyperparameter['n_to_cluster']}-upper_{hyperparameter['upper']}-lower_{hyperparameter['lower']}-high_freq_ix_{hyperparameter['high_freq_ix']}-low_freq_ix_{hyperparameter['low_freq_ix']}.csv"
    save_clusters(path, cluster2names)
    print(len(cluster2names), path)

In [None]:
# Sweep cell: for every hyperparameter combination, cluster the names, print
# evaluation metrics, and save the clusters to a CSV named after the values.
# NOTE(review): this code is identical to the other sweep cells in this
# notebook; with an unchanged `hyperparameters` list it repeats the same
# clustering and overwrites the same files. Presumably kept to retain outputs
# of runs with different grid values - confirm, or collapse into one cell.
for hyperparameter in hyperparameters:
    print(hyperparameter)
    name2clusters, cluster2names = cluster(name_counter, 
                                           nicknames, 
                                           swivel_vocab, 
                                           swivel_model, 
                                           **hyperparameter)
    eval_clusters(nicknames, name2clusters, train_df, query_names)
    # output path is relative to the notebook's working directory
    path = f"../data/models/fs-{given_surname}-cluster-greedy-{hyperparameter['n_to_cluster']}-upper_{hyperparameter['upper']}-lower_{hyperparameter['lower']}-high_freq_ix_{hyperparameter['high_freq_ix']}-low_freq_ix_{hyperparameter['low_freq_ix']}.csv"
    save_clusters(path, cluster2names)
    print(len(cluster2names), path)