In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import random
import torch
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.cluster import AgglomerativeClustering
from tqdm.auto import tqdm

from src.data.filesystem import fopen
from src.data.utils import load_train_test
from src.data.prepare import normalize
from src.eval.encoder import eval_encoder
from src.models.utils import (
    add_padding,
    remove_padding,
    build_token_idx_maps,
    convert_names_to_model_inputs,
    get_best_matches,
)

### Configure

In [None]:
# 0 (or any value >= the number of names) means "use every name";
# otherwise the number of names drawn at random before encoding
sample_size = 0
# two closures are only merged while their combined size stays within this limit
max_closure_size = 10000
# candidate pairs whose distance is above this value are discarded
max_distance = 0.22
# distance_threshold handed to the clusterer when building clusters
cluster_distance_threshold = 0.155
# distance_threshold handed to the clusterer when building super-clusters
super_cluster_distance_threshold = 0.205
# number of candidates requested per name from get_best_matches
num_candidates = 2000
# lower bound applied to stored distances; distance-matrix entries below it
# are treated as "no distance recorded" when clustering
eps = 0.000001
# serialized model that is loaded with torch.load
model_filename = "../data/models/anc-triplet-bilstm-100-512-40-05.pth"

# Given-name configuration (currently disabled)
# process_nicknames = True
# werelate_names_filename = 'givenname_similar_names.werelate.20210414.tsv'
# nicknames_filename = '../data/models/givenname_nicknames.txt'
# name_freqs_filename = 'given-final.normal.txt'
# clusters_filename = 'givenname_clusters.tsv'
# super_clusters_filename = 'givenname_super_clusters.tsv'

# Surname configuration (active)
werelate_names_filename = "../data/external/surname_similar_names.werelate.20210414.tsv"
# the nickname file is only read when is_surname is False
nicknames_filename = ""
name_freqs_filename = "../data/external/surname-final.normal.txt"
clusters_filename = "../data/models/surname_clusters.tsv"
super_clusters_filename = "../data/models/surname_super_clusters.tsv"
# forwarded to normalize(); also disables nickname handling when True
is_surname = True

### Read WeRelate names into all_names
TODO: later, we'll also want to read frequent FS names into `all_names`.

In [None]:
# TODO rewrite this in just a few lines using pandas
def load_werelate_names(path, is_surname):
    """Read the WeRelate similar-names file into a name -> variants mapping.

    The file is tab-separated and its first line is a header that is skipped.
    Column 0 holds the name, column 1 (when present) a space-separated list of
    confirmed variants, and column 2 (only when the row has exactly three
    columns) a space-separated list of computer-generated variants.  Every name
    and variant is passed through ``normalize(..., is_surname)``.

    :param path: location of the file, opened through ``fopen``
    :param is_surname: forwarded to ``normalize``
    :return: ``defaultdict(set)`` keyed by normalized name piece; a key only
        exists when at least one normalized variant was found for it
    """
    variants_by_name = defaultdict(set)
    with fopen(path, mode="r", encoding="utf-8") as handle:
        for line_no, raw_line in enumerate(handle):
            if line_no == 0:
                # header row
                continue
            columns = raw_line.rstrip().split("\t")
            # normalize should only return a single name piece, but loop just in case
            for piece in normalize(columns[0], is_surname):
                confirmed = columns[1].strip().split(" ") if len(columns) >= 2 else []
                computed = columns[2].strip().split(" ") if len(columns) == 3 else []
                for candidate in confirmed + computed:
                    for candidate_piece in normalize(candidate, is_surname):
                        variants_by_name[piece].add(candidate_piece)
    return variants_by_name

In [None]:
name_variants = load_werelate_names(werelate_names_filename, is_surname)
print(len(name_variants))

# gather every name together with all of its variants, each one padded
all_names = set()
for base_name, variants in name_variants.items():
    all_names.add(add_padding(base_name))
    for variant in variants:
        all_names.add(add_padding(variant))
print(len(all_names), next(iter(all_names)))

# the mapping itself is no longer needed once all_names is built
name_variants = None

### Read nicknames and remove from names

In [None]:
def load_nicknames(path):
    """Read the nickname file into a padded nickname -> padded names mapping.

    Each line is space-separated: the first token is the name and the
    remaining tokens are its nicknames.  All tokens are normalized as given
    names (``normalize(..., False)``) and padded with ``add_padding``.

    :param path: location of the file, opened through ``fopen``
    :return: ``defaultdict(set)`` mapping each padded nickname to the set of
        padded names it is a nickname for
    """
    nickname_to_names = defaultdict(set)
    with fopen(path, mode="r", encoding="utf-8") as handle:
        for raw_line in handle:
            head, *aliases = raw_line.rstrip().split(" ")
            # normalize should only return a single name piece, but loop just in case
            for head_piece in normalize(head, False):
                padded_head = add_padding(head_piece)
                for alias in aliases:
                    for alias_piece in normalize(alias, False):
                        nickname_to_names[add_padding(alias_piece)].add(padded_head)
    return nickname_to_names

In [None]:
# Always define both mappings: name_nicks is handed to write_clusters further
# down regardless of is_surname, so leaving it undefined for surnames would
# raise a NameError there.  Empty mappings mean "no nicknames".
nick_names = defaultdict(set)  # padded nickname -> padded names
name_nicks = defaultdict(set)  # padded name -> padded nicknames
if not is_surname:
    nick_names = load_nicknames(nicknames_filename)
    # invert nickname -> names into name -> nicknames
    for nick, names in nick_names.items():
        for name in names:
            name_nicks[name].add(nick)
    # next(..., None) keeps this diagnostic from failing on an empty nickname file
    print(
        next(iter(nick_names.items()), None),
        "nick_names",
        len(nick_names.keys()),
        "name_nicks",
        len(name_nicks.keys()),
    )
    # nicknames are not clustered themselves; they are re-attached when clusters are written
    all_names -= set(nick_names.keys())
    print(len(all_names))

### Map names to ids

In [None]:
def map_names_to_ids(names):
    """Assign consecutive integer ids (starting at 0) to names in iteration order.

    :param names: sized iterable of names
    :return: tuple ``(name -> id dict, id -> name dict)``
    """
    indexed = list(zip(range(len(names)), names))
    name_to_id = {name: idx for idx, name in indexed}
    id_to_name = {idx: name for idx, name in indexed}
    return name_to_id, id_to_name

In [None]:
# Build the forward (padded name -> id) and reverse (id -> padded name) lookups
# over every name collected in all_names, then show one entry of each.
name_ids, id_names = map_names_to_ids(all_names)
print(next(iter(name_ids.items())), next(iter(id_names.items())))

### Read name frequencies

In [None]:
# TODO rewrite this using pandas too
def load_name_freqs(path, is_surname):
    """Read a tab-separated ``name<TAB>count`` file into a frequency mapping.

    The name in column 0 is normalized with ``normalize(..., is_surname)`` and
    every resulting piece is assigned the integer count from column 1.  When
    the same piece occurs again, the later count replaces the earlier one.

    :param path: location of the file, opened through ``fopen``
    :param is_surname: forwarded to ``normalize``
    :return: ``defaultdict(int)`` mapping normalized name piece to count
    """
    freqs = defaultdict(int)
    with fopen(path, mode="r", encoding="utf-8") as handle:
        for raw_line in handle:
            columns = raw_line.rstrip().split("\t")
            raw_name = columns[0]
            for piece in normalize(raw_name, is_surname):
                freqs[piece] = int(columns[1])
    return freqs

In [None]:
name_freqs = load_name_freqs(name_freqs_filename, is_surname)
# re-key by padded name and keep only the names that are in all_names
name_freqs = {
    padded: freq
    for padded, freq in ((add_padding(name), freq) for name, freq in name_freqs.items())
    if padded in all_names
}
print(len(name_freqs), next(iter(name_freqs.items())))

### Load model

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Load the serialized model and map its tensors onto the selected device.
# NOTE(review): torch.load unpickles the file, so only point model_filename at
# files from a trusted source.
model = torch.load(model_filename, map_location=torch.device(device))

### Encode names

In [None]:
# Passed to convert_names_to_model_inputs when the names are converted below
# (presumably the maximum number of characters kept per name — TODO confirm).
MAX_NAME_LENGTH = 30
# Token lookup tables built by the project helper; only char_to_idx_map is used below.
char_to_idx_map, idx_to_char_map = build_token_idx_maps()

#### Take a sample because encoded names require a lot of memory

In [None]:
if sample_size <= 0 or sample_size >= len(all_names):
    # no sampling requested (or the sample would cover everything): use all names
    names_sample = np.array(list(all_names))
else:
    # random.sample() requires a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11 on.  Sorting also gives the
    # population a stable order, so a seeded run draws the same sample.
    names_sample = np.array(random.sample(sorted(all_names), sample_size))
print(names_sample.shape)

#### Compute encodings

In [None]:
# Convert the sampled names into model inputs; the helper's second return value is not needed here
names_tensor, _ = convert_names_to_model_inputs(names_sample, char_to_idx_map, MAX_NAME_LENGTH)

In [None]:
# Get encodings for the names from the encoder
# (1024 is presumably the evaluation batch size — TODO confirm against eval_encoder)
names_encoded = eval_encoder(model, names_tensor, 1024)
names_encoded.shape

### Compute distances

In [None]:
# This takes an hour for 190k
# name_candidates = (names_sample, candidates, (name text, score))
# The sampled names are matched against themselves, using euclidean distance as the score.
name_candidates = get_best_matches(
    names_encoded, names_encoded, names_sample, num_candidates=num_candidates, metric="euclidean"
)

In [None]:
# first, repeat each name num_candidates times and convert the 1D array to a 2D array
# next, reshape name_candidates to be a 2D array where the columns are (candidate, score)
# finally, stack the two arrays into a 2D array where the columns are (name, candidate, score)
distances = np.hstack((np.repeat(names_sample, num_candidates)[:, np.newaxis], name_candidates.reshape(-1, 2)))
# remove distances > max_distance (the score column is cast to float before comparing)
distances = distances[distances[:, -1].astype("float") <= max_distance]
# sort rows by ascending distance, so the closure pass below sees the closest pairs first
distances = distances[distances[:, -1].astype("float").argsort()]
print(distances.shape)
# drop the reference to the raw candidates array; only distances is used from here on
name_candidates = None

### Compute closures

In [None]:
# iterate over all distances, create closures and save scores
#
# A closure is a group of name ids connected through pairs within max_distance.
# Pairs are visited in ascending distance order, so the closest names are
# grouped first, and two closures are only merged while the merged size stays
# within max_closure_size.
next_closure = 0  # id handed to the next newly created closure
closure_ids = {}  # closure id -> list of name ids in that closure
id_closure = {}  # name id -> closure id
row_ixs = []  # coordinates and values of the sparse distance matrix
col_ixs = []
dists = []
max_size = 0  # size of the largest closure seen so far

for row in tqdm(distances):
    name1 = row[0]
    name2 = row[1]
    id1 = name_ids[name1]
    id2 = name_ids[name2]
    # each distance is in distances twice
    if id1 > id2:
        continue
    # clamp to eps so that a stored distance can be told apart from "no entry" (0)
    distance = max(eps, float(row[2]))
    closure1 = id_closure.get(id1)
    closure2 = id_closure.get(id2)
    # make sure that, if only one side already has a closure, it is side 1
    if closure1 is None and closure2 is not None:
        id1, id2 = id2, id1
        name1, name2 = name2, name1
        closure1, closure2 = closure2, closure1
    # add to distance matrix
    row_ixs.append(id1)
    col_ixs.append(id2)
    dists.append(distance)
    # skip if names are the same
    if id1 == id2:
        continue
    # mirror the entry so the matrix is symmetric
    row_ixs.append(id2)
    col_ixs.append(id1)
    dists.append(distance)
    # create closures
    if closure1 is None:
        # if closure1 is None, then closure2 must be none also due to the above
        # so create a new closure with id1 and id2
        closure1 = next_closure
        next_closure += 1
        id_closure[id1] = closure1
        id_closure[id2] = closure1
        closure_ids[closure1] = [id1, id2]
    elif closure2 is None:
        # put id2 into id1's closure
        id_closure[id2] = closure1
        closure_ids[closure1].append(id2)
    elif closure1 != closure2 and len(closure_ids[closure1]) + len(closure_ids[closure2]) <= max_closure_size:
        # move all ids in closure2 into closure1
        # (loop variable deliberately not called `id`: at cell scope that would
        # rebind the builtin for the rest of the notebook)
        for member_id in closure_ids[closure2]:
            id_closure[member_id] = closure1
            closure_ids[closure1].append(member_id)
        del closure_ids[closure2]
    max_size = max(max_size, len(closure_ids[closure1]))

# create distances matrix; the explicit shape keeps every name id addressable
# and lets construction succeed even when no pair survived the distance filter
dist_matrix = csr_matrix((dists, (row_ixs, col_ixs)), shape=(len(name_ids), len(name_ids)))

print("max closure_size", max_size)
print("number of closures", len(closure_ids), "number of names enclosed", len(id_closure))

### Compute clusters

In [None]:
def _precomputed_clusterer(linkage, distance_threshold):
    """Create an AgglomerativeClustering that consumes a precomputed distance matrix."""
    options = dict(n_clusters=None, linkage=linkage, distance_threshold=distance_threshold)
    try:
        # newer scikit-learn releases call the parameter `metric`
        return AgglomerativeClustering(metric="precomputed", **options)
    except TypeError:
        # older releases only accept the original `affinity` spelling
        return AgglomerativeClustering(affinity="precomputed", **options)


def compute_clusters(closure_ids, id_names, dist_matrix, linkage, distance_threshold, eps, max_dist):
    """Split every closure into clusters with agglomerative clustering.

    :param closure_ids: dict mapping closure id to the list of name ids it contains
    :param id_names: dict mapping name id to name
    :param dist_matrix: sparse matrix of pairwise distances indexed by name id
    :param linkage: linkage criterion passed to AgglomerativeClustering
    :param distance_threshold: distance_threshold passed to AgglomerativeClustering
    :param eps: entries of the dense sub-matrix below this value are replaced by max_dist
    :param max_dist: distance assumed for pairs that have no stored distance
    :return: tuple ``(cluster_names, name_cluster)`` where ``cluster_names`` is a
        ``defaultdict(set)`` mapping ``"<closure>_<label>"`` to its names and
        ``name_cluster`` maps each clustered name to that cluster id
    """
    cluster_names = defaultdict(set)
    name_cluster = {}
    for closure, ids in tqdm(closure_ids.items()):
        clusterer = _precomputed_clusterer(linkage, distance_threshold)
        # toarray() yields a plain ndarray; todense() returns np.matrix, which
        # recent scikit-learn versions refuse as estimator input
        X = dist_matrix[ids][:, ids].toarray()
        # pairs without a stored distance come out of the sparse matrix as 0
        X[X < eps] = max_dist
        labels = clusterer.fit_predict(X)
        for name_id, label in zip(ids, labels):
            name = id_names[name_id]
            cluster = f"{closure}_{label}"
            cluster_names[cluster].add(name)
            name_cluster[name] = cluster
    return cluster_names, name_cluster

In [None]:
# try ward, average, single
cluster_linkage = "average"
# distance substituted for pairs that have no stored distance inside a closure
max_dist = 10.0

# cluster every closure using the (tighter) cluster distance threshold
cluster_names, name_cluster = compute_clusters(
    closure_ids, id_names, dist_matrix, cluster_linkage, cluster_distance_threshold, eps, max_dist
)
print(len(cluster_names))

#### Add unclustered names as singleton clusters

In [None]:
def add_singleton_names(cluster_names, name_cluster, names_sample):
    """Give every name that has no cluster yet a cluster of its own.

    The new cluster id is the name's position in ``names_sample`` rendered as
    a string.  Both mappings are updated in place and returned.

    :param cluster_names: mapping of cluster id to set of names (a ``defaultdict(set)``)
    :param name_cluster: mapping of name to cluster id
    :param names_sample: iterable of all names that must end up in a cluster
    :return: tuple ``(cluster_names, name_cluster)``
    """
    for position, candidate in enumerate(names_sample):
        if candidate in name_cluster:
            continue
        singleton_id = str(position)
        cluster_names[singleton_id].add(candidate)
        name_cluster[candidate] = singleton_id
    return cluster_names, name_cluster

In [None]:
# after this call every name in names_sample belongs to exactly one cluster
cluster_names, name_cluster = add_singleton_names(cluster_names, name_cluster, names_sample)
print(len(cluster_names))

### Evaluate cluster precision/recall over the Ancestry test data

In [None]:
train, test = load_train_test("../data/raw/records25k_data_train.csv", "../data/raw/records25k_data_test.csv")

# only the candidates are needed from the training split
_, _, candidates_train = train
input_names_test, weighted_actual_names_test, candidates_test = test

# candidates from both splits, concatenated into one array
all_candidates = np.concatenate((candidates_train, candidates_test))

In [None]:
def get_precision_recall(
    names_sample, all_candidates, input_names_test, weighted_actual_names_test, cluster_names, name_cluster
):
    """Average per-input precision and recall of the clusters against the test data.

    For every test input name that is part of ``names_sample`` the members of
    its cluster (restricted to names in ``all_candidates``) are compared with
    the expected names of that input:

    * recall is the weight of the expected names found in the cluster divided
      by the total weight of the expected names that are in ``names_sample``;
    * precision is the number of expected names found divided by the number of
      (candidate) names in the cluster, or 1.0 when that cluster is empty.

    Inputs that are not in ``names_sample``, and inputs none of whose expected
    names are in ``names_sample``, are left out of the averages.

    :param names_sample: array-like with ``tolist()`` holding the clustered names
    :param all_candidates: array-like with ``tolist()`` holding all candidate names
    :param input_names_test: iterable of test input names
    :param weighted_actual_names_test: per input, an iterable of
        ``(name, weight, _)`` triples
    :param cluster_names: mapping of cluster id to set of names
    :param name_cluster: mapping of name to cluster id
    :return: ``(avg_precision, avg_recall, number of inputs evaluated, number of
        distinct names not in names_sample)``; both averages are ``nan`` when no
        input could be evaluated
    """
    names_sample_set = set(names_sample.tolist())
    all_candidates_set = set(all_candidates.tolist())

    precisions = []
    recalls = []
    missing = set()
    # TODO if the input_name is not in the names_sample_set, consider looking up nearest neighbor
    for input_name, weighted_actual_names in zip(input_names_test, weighted_actual_names_test):
        if input_name not in names_sample_set:
            missing.add(input_name)
            continue
        cluster_id = name_cluster[input_name]
        # only cluster members that are known candidates count towards precision
        names_in_cluster = cluster_names[cluster_id] & all_candidates_set
        found_recall = 0.0
        total_recall = 0.0
        found_count = 0
        for name, weight, _ in weighted_actual_names:
            if name not in names_sample_set:
                missing.add(name)
                continue
            total_recall += weight
            if name in names_in_cluster:
                found_recall += weight
                found_count += 1
        if total_recall == 0.0:
            # nothing this input could have matched; it says nothing about the clusters
            continue
        precision = found_count / len(names_in_cluster) if len(names_in_cluster) > 0 else 1.0
        recall = found_recall / total_recall
        precisions.append(precision)
        recalls.append(recall)
    if not precisions:
        # no evaluable input: report undefined averages instead of dividing by zero
        return float("nan"), float("nan"), 0, len(missing)
    avg_precision = sum(precisions) / len(precisions)
    avg_recall = sum(recalls) / len(recalls)
    return avg_precision, avg_recall, len(precisions), len(missing)

In [None]:
# evaluate the clusters: averaged precision/recall, number of inputs evaluated,
# and number of distinct test names that are not in names_sample
precision, recall, total, missing = get_precision_recall(
    names_sample, all_candidates, input_names_test, weighted_actual_names_test, cluster_names, name_cluster
)
print(f"Total={total} Precision={precision} Recall={recall} Missing={missing}")

### Write clusters

In [None]:
def write_clusters(path, cluster_names, name_freqs, name_nicks):
    """Write one ``cluster_name<TAB>space-separated names`` line per cluster.

    The cluster name is the member with the highest frequency in
    ``name_freqs`` (names without an entry count as 0).  When ``name_nicks``
    is non-empty, the nicknames of every member are written along with the
    members.  Padding is removed from everything that is written.

    :param path: output location, opened through ``fopen``
    :param cluster_names: mapping of cluster id to set of padded names
    :param name_freqs: mapping of padded name to frequency
    :param name_nicks: mapping of padded name to set of padded nicknames, or a
        falsy value to skip nicknames
    :return: dict mapping cluster id to the chosen (still padded) cluster name
    """
    id_to_label = {}
    with fopen(path, mode="w", encoding="utf-8") as out:
        for cluster_id, members in cluster_names.items():
            # the most frequent member names the cluster
            label = max(members, key=lambda member: name_freqs.get(member, 0))
            id_to_label[cluster_id] = label
            # collect the nicknames of all members
            extras = set()
            if name_nicks:
                for member in members:
                    if member in name_nicks:
                        extras.update(name_nicks[member])
            # strip padding just before writing
            unpadded_label = remove_padding(label)
            unpadded_members = [remove_padding(entry) for entry in members | extras]
            out.write(f'{unpadded_label}\t{" ".join(unpadded_members)}\n')
    return id_to_label

In [None]:
# write the clusters and keep the cluster id -> cluster name mapping for the super-cluster step
cluster_id_name_map = write_clusters(clusters_filename, cluster_names, name_freqs, name_nicks)

### Create super-clusters

In [None]:
# same closures, distance matrix and linkage as for the clusters, but with the
# looser super-cluster distance threshold
super_cluster_names, name_super_cluster = compute_clusters(
    closure_ids, id_names, dist_matrix, cluster_linkage, super_cluster_distance_threshold, eps, max_dist
)
print(len(super_cluster_names))

In [None]:
# names that ended up in no super-cluster become singleton super-clusters
super_cluster_names, name_super_cluster = add_singleton_names(super_cluster_names, name_super_cluster, names_sample)
print(len(super_cluster_names))

In [None]:
# get_precision_recall returns four values (precision, recall, number of inputs
# evaluated, number of missing names); unpacking only three raises ValueError
precision, recall, total, missing = get_precision_recall(
    names_sample, all_candidates, input_names_test, weighted_actual_names_test, super_cluster_names, name_super_cluster
)
print("Total=", total, " Precision=", precision, " Recall=", recall, " Missing=", missing)

In [None]:
# for every super-cluster, collect the names of the clusters its members belong to
super_cluster_clusters = {
    super_id: {cluster_id_name_map[name_cluster[member]] for member in members}
    for super_id, members in super_cluster_names.items()
}

### Write super-clusters

In [None]:
# each super-cluster is written as a list of cluster names; no nickname map is
# passed, and the returned id -> name mapping is not needed
_ = write_clusters(super_clusters_filename, super_cluster_clusters, name_freqs, None)