In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Train a greedy clustering model
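Names are added to clusters greedily, from most frequent to least frequent.
A name joins the cluster(s) of its nearest already-clustered name when their cosine similarity reaches the clustering threshold; otherwise it starts a new cluster. (The `cluster` function below currently applies one fixed threshold rather than a threshold that varies with frequency.)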

In [None]:
from collections import Counter, defaultdict
import math
import random

import boto3
import jellyfish
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm.autonotebook import tqdm

from src.data.normalize import normalize_freq_names
from src.data.filesystem import fopen
from src.eval.freq_metrics import calc_avg_precision_recall
from src.models.cluster import write_clusters
from src.models.swivel import SwivelModel
from src.models.utils import add_padding


In [None]:
# Configuration: which name type to cluster, model dimensions, and data locations.
given_surname = "given"
# the swivel vocabulary size (used in the swivel file names below) depends on the name type
if given_surname == "given":
    vocab_size = 610000
else:
    vocab_size = 2100000
n_jobs = 8

embed_dim = 100
verbose = True

# input files: tree name frequencies, training pairs, swivel vocab + weights, query names
tree_freq_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"
swivel_vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-swivel-vocab-{vocab_size}-augmented.csv"
swivel_model_path = f"s3://nama-data/data/models/fs-{given_surname}-swivel-model-{vocab_size}-{embed_dim}-augmented.pth"
query_path = f"s3://familysearch-names/processed/query-names-{given_surname}-v2.csv.gz"
# the nickname file is fetched through boto3, so it is given as bucket + key rather than a URL
nickname_bucket = "familysearch-names"
nickname_path = "processed/givenname_nicknames.csv"

### Load data

In [None]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Load the tree name frequencies and normalize them into `tree_name_freq`.
# na_filter=False stops pandas from turning strings such as "NA" into NaN.
freq_df = pd.read_csv(tree_freq_path, na_filter=False)
tree_name_freq = normalize_freq_names(
    freq_df,
    is_surname=(given_surname != "given"),
    add_padding=False,
    dont_return_empty=False,
)
# drop the reference to the raw frame so its memory can be reclaimed
freq_df = None
# sanity check: show the frequency of a name that should be common
tree_name_freq["mary" if given_surname == "given" else "johnson"]

In [None]:
# Wrap tree_name_freq (indexed by name in the previous cell) in a Counter so
# names can be ranked by frequency with most_common().
name_counter = Counter(tree_name_freq)

In [None]:
# Sanity check: print the 0-based frequency rank of a known name, if present.
for rank, (candidate, count) in enumerate(name_counter.most_common()):
    if candidate != "vicky":
        continue
    print(rank, candidate, count)
    break

In [None]:
# Load the swivel vocabulary and build a name -> index lookup; cluster() uses
# the index to pick the name's row in the embedding table.
vocab_df = pd.read_csv(fopen(swivel_vocab_path, "rb"), na_filter=False)
swivel_vocab = dict(zip(vocab_df["name"], vocab_df["index"]))

In [None]:
# Build a swivel model sized to the loaded vocabulary and `embed_dim`, then
# restore its saved weights onto the selected device.
swivel_model = SwivelModel(len(swivel_vocab), embed_dim)
# NOTE(review): torch.load unpickles the checkpoint -- only load files from a trusted location.
swivel_model.load_state_dict(
    torch.load(fopen(swivel_model_path, "rb"), map_location=device)
)
swivel_model.to(device)
# the model is only used for embedding lookups here, so put it in eval mode
swivel_model.eval()

In [None]:
s3 = boto3.resource('s3')

# Map every name in the nickname file to the set of nickname heads it belongs to.
# Each line is a comma-separated list whose first entry is the head name.
# these nicknames include nickname heads going to themselves (e.g., john -> john)
nicknames = defaultdict(set)
if given_surname == "given":
    obj = s3.Object(nickname_bucket, nickname_path)
    contents = obj.get()['Body'].read().decode('utf-8')
    for line in contents.split('\n'):
        names = [name.strip() for name in line.strip().split(',')]
        headname = names[0]
        # skip blank lines (e.g. the one after a trailing newline) so that the
        # empty string never becomes a name
        if not headname:
            continue
        for name in names:
            if name:
                nicknames[name].add(headname)
print(len(nicknames))
# use .get() for these spot checks: indexing a defaultdict would insert the key,
# and cluster() treats every key of `nicknames` as a name to cluster
print(nicknames.get('zachery', set()))
print(nicknames.get('zachariah', set()))

In [None]:
# Load the query names (the "name" column) that eval_clusters scores against.
# na_filter=False stops pandas from turning strings such as "NA" into NaN.
query_names = pd.read_csv(query_path, na_filter=False)["name"].tolist()
print(len(query_names))
# preview the first few names
query_names[0:3]

In [None]:
# Load the training data; eval_clusters reads its "tree_name" and
# "record_name" columns and passes the frame to calc_avg_precision_recall.
# na_filter=False stops pandas from turning strings such as "NA" into NaN.
train_df = pd.read_csv(train_path, na_filter=False)
print(train_df.shape)
train_df.head(3)

## Greedy Cluster

In [None]:
def cluster(name_counter, nicknames, swivel_vocab, swivel_model,
            n_to_cluster, threshold):
    """Greedily assign names to clusters, most frequent names first.

    Each name's swivel embedding is compared (cosine similarity) against the
    embeddings of all names placed so far. If the best match scores at least
    ``threshold``, the name joins the cluster(s) of that nearest name;
    otherwise it starts a new cluster identified by the name itself.

    Reads the module-level ``given_surname``: when it is "given", every key of
    ``nicknames`` that is not among the top names is appended with a frequency
    of 1 so that it gets clustered as well.

    Args:
        name_counter: Counter of name -> frequency.
        nicknames: mapping whose keys are extra names to cluster (given names only).
        swivel_vocab: mapping of padded name -> row index in the embedding table.
        swivel_model: model whose ``wi.weight`` holds the name embeddings.
        n_to_cluster: how many of the most frequent names to take from ``name_counter``.
        threshold: minimum cosine similarity required to join an existing cluster.

    Returns:
        ``(name2clusters, cluster2names)``: two ``defaultdict(set)`` mappings,
        name -> cluster ids and cluster id -> member names. Names that are not
        in ``swivel_vocab`` are skipped and appear in neither mapping.
    """
    name2clusters = defaultdict(set)
    cluster2names = defaultdict(set)
    # names placed so far and their vectors, kept in the same order
    clustered_names = []
    clustered_vectors = []

    name_freqs = name_counter.most_common(n_to_cluster)
    # add nicknames to names to cluster
    if given_surname == "given":
        names_to_cluster = {name for name, _ in name_freqs}
        for nickname in nicknames.keys():
            if nickname in names_to_cluster:
                continue
            print("adding", nickname)
            names_to_cluster.add(nickname)
            name_freqs.append((nickname, 1))

    for count, (name, _freq) in enumerate(tqdm(name_freqs)):
        # print stats periodically
        if count % 10000 == 0:
            print(count, 'n_clusters', len(cluster2names))

        # get name vector; names missing from the vocabulary cannot be clustered
        name_ix = swivel_vocab.get(add_padding(name), -1)
        if name_ix < 0:
            continue
        name_vector = swivel_model.wi.weight.data[name_ix].cpu().numpy()

        # find the cluster(s) of the nearest already-clustered name, if close enough
        target_clusters = None
        if clustered_names:
            scores = cosine_similarity([name_vector], clustered_vectors)[0]
            # argmax returns the first maximum, like the max() over indices it
            # replaces, but without a Python-level loop over every score
            nearest_ix = int(scores.argmax())
            if scores[nearest_ix] >= threshold:
                target_clusters = name2clusters[clustered_names[nearest_ix]]

        # the first name, and any name with no match within threshold, heads a new cluster
        if target_clusters is None:
            target_clusters = (name,)

        for cluster_id in target_clusters:
            name2clusters[name].add(cluster_id)
            cluster2names[cluster_id].add(name)

        # add name to clustered names
        # TODO consider only adding cluster heads to clustered names + vectors
        clustered_names.append(name)
        clustered_vectors.append(name_vector)

    return name2clusters, cluster2names

In [None]:
def _get_nama_standards(nicknames, name2clusters, name):
    """Return the cluster ids to use for ``name``.

    The result is the union of the clusters of ``name`` itself and, when the
    module-level ``given_surname`` is "given", the clusters of each of its
    nickname heads. Names without a cluster contribute nothing.
    """
    candidates = {name}
    if given_surname == "given" and name in nicknames:
        candidates.update(nicknames[name])
    return {
        standard
        for candidate in candidates
        if candidate in name2clusters
        for standard in name2clusters[candidate]
    }

def get_nama_nysiis(nicknames, name2clusters, names):
    """Build the query and index mappings for cluster codes backed by NYSIIS.

    Returns ``(name2codes, code2names)``, both ``defaultdict(set)``:

    * ``name2codes`` simulates a query: the codes looked up for a name. These
      are its own cluster codes, its NYSIIS code, and (for given names) the
      cluster codes of its nickname heads.
    * ``code2names`` simulates the index: the names stored under a code. A name
      is indexed under each of its cluster codes, or under its NYSIIS code
      only when it has no cluster code.
    """
    name2codes = defaultdict(set)
    code2names = defaultdict(set)
    for name in names:
        cluster_codes = name2clusters.get(name, set())
        nysiis_code = jellyfish.nysiis(name)

        # query side: cluster codes plus the nysiis code, which is always queried
        query_codes = name2codes[name]
        query_codes.update(cluster_codes)
        query_codes.add(nysiis_code)
        # also query the cluster codes of each nickname head
        if given_surname == "given" and name in nicknames:
            for nickhead in nicknames[name]:
                query_codes.update(name2clusters.get(nickhead, ()))

        # index side: the nysiis bucket is only a fallback for unclustered names
        if len(cluster_codes) == 0:
            code2names[nysiis_code].add(name)
        else:
            for code in cluster_codes:
                code2names[code].add(name)
    return name2codes, code2names

def eval_clusters(nicknames, name2clusters, train_df, query_names):
    """Print index/query statistics and average precision/recall for a clustering.

    Codes are built with ``get_nama_nysiis`` for every name in the
    ``tree_name`` and ``record_name`` columns of ``train_df``; the metrics
    come from ``calc_avg_precision_recall``. Nothing is returned.
    """
    train_names = set(train_df["tree_name"]) | set(train_df["record_name"])
    name2codes, code2names = get_nama_nysiis(nicknames, name2clusters, train_names)

    n_index_entries = sum(len(indexed) for indexed in code2names.values())
    print("total names", len(name2codes))
    print("total index entries", n_index_entries)
    print("total codes", len(code2names))
    print("total queries", len(query_names))
    # name2codes is a defaultdict: indexing a query that is not a training name
    # adds an empty entry for it before the metrics are computed
    n_lookups = sum(len(name2codes[query]) for query in query_names)
    print("total lookups", n_lookups)

    precision, recall, f1, f2 = calc_avg_precision_recall(
        query_names, name2codes, code2names, train_df
    )
    print(f"precision={precision}, recall={recall} f1={f1} f2={f2}")

def save_clusters(path, cluster2names):
    """Write the clusters to ``path`` as a CSV with one row per (name, cluster) pair.

    ``cluster2names`` maps a cluster id to its member names. The output has
    the columns ``name`` and ``cluster`` and no index column.
    """
    all_names, all_clusters = [], []
    for cluster_id, members in cluster2names.items():
        all_names.extend(members)
        # repeat the cluster id once per member so both columns stay aligned
        all_clusters.extend([cluster_id] * len(members))
    pd.DataFrame({"name": all_names, "cluster": all_clusters}).to_csv(path, index=False)

## Hyperparameter Search

In [None]:
# Grid of settings to try: every combination of the two value lists below.
n_to_cluster_values = [100000]
threshold_values = [0.7]

# each entry is passed to cluster() as keyword arguments
hyperparameters = [
    {'n_to_cluster': n_to_cluster_value, 'threshold': threshold_value}
    for n_to_cluster_value in n_to_cluster_values
    for threshold_value in threshold_values
]
print(len(hyperparameters))

In [None]:
# Cluster and evaluate once per hyperparameter setting (results are not saved here).
for hyperparameter in hyperparameters:
    print(hyperparameter)
    name2clusters, cluster2names = cluster(
        name_counter, nicknames, swivel_vocab, swivel_model, **hyperparameter
    )
    eval_clusters(nicknames, name2clusters, train_df, query_names)
    # where this setting's clusters would be written
    path = (
        f"../data/models/fs-{given_surname}-cluster-greedy-"
        f"{hyperparameter['n_to_cluster']}-threshold_{hyperparameter['threshold']}.csv"
    )
    # save_clusters(path, cluster2names)
    print(len(cluster2names), path)

In [None]:
# Cluster, evaluate, and save the clusters once per hyperparameter setting.
for hyperparameter in hyperparameters:
    print(hyperparameter)
    name2clusters, cluster2names = cluster(name_counter, 
                                           nicknames, 
                                           swivel_vocab, 
                                           swivel_model, 
                                           **hyperparameter)
    eval_clusters(nicknames, name2clusters, train_df, query_names)
    # The hyperparameter dicts only carry 'n_to_cluster' and 'threshold'; the
    # previous file name read 'upper', 'lower', 'high_freq_ix' and 'low_freq_ix',
    # which raised KeyError after clustering had finished, so nothing was saved.
    path = f"../data/models/fs-{given_surname}-cluster-greedy-{hyperparameter['n_to_cluster']}-threshold_{hyperparameter['threshold']}.csv"
    save_clusters(path, cluster2names)
    print(len(cluster2names), path)

In [None]:
# NOTE(review): this cell repeats the earlier threshold run (cluster + evaluate,
# saving disabled); consider removing one of the copies.
for hyperparameter in hyperparameters:
    print(hyperparameter)
    name2clusters, cluster2names = cluster(
        name_counter, nicknames, swivel_vocab, swivel_model, **hyperparameter
    )
    eval_clusters(nicknames, name2clusters, train_df, query_names)
    # where this setting's clusters would be written
    path = (
        f"../data/models/fs-{given_surname}-cluster-greedy-"
        f"{hyperparameter['n_to_cluster']}-threshold_{hyperparameter['threshold']}.csv"
    )
    # save_clusters(path, cluster2names)
    print(len(cluster2names), path)