In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Get Statistics for the Old Clusters/Buckets
The old clusters come from an algorithm I wrote around 15 years ago. It wasn't very good.

In [None]:
from collections import defaultdict, namedtuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import wandb

from src.data.normalize import normalize_freq_names
from src.data.utils import load_dataset
from src.models.cluster import get_validation_results, get_names_to_cluster
from src.models.utils import add_padding, remove_padding

In [None]:
# configure
given_surname = "surname"
# Both size limits depend on whether given names or surnames are being processed.
vocab_size, n_to_cluster = (610000, 200000) if given_surname == "given" else (2100000, 500000)
sample_size = 1000
embed_dim = 100
verbose = True
n_jobs = 1

# Settings that are passed to wandb.init as the run configuration.
Config = namedtuple("Config", "eval_path freq_path embed_dim")
config = Config(
    eval_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    freq_path=f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz",
    embed_dim=embed_dim,
)

In [None]:
# Start a Weights & Biases run for this comparison, grouped by name type.
wandb.init(
    # where the run is recorded
    entity="nama",
    project="nama",
    # how the run is labelled
    group=given_surname,
    name="91_compare_old_algo",
    notes="",
    # settings stored with the run
    config=config._asdict(),
)

### Load data

In [None]:
# Load the evaluation file named in config.eval_path. With is_eval=True the result is
# unpacked into three values: input names, weighted actual names and candidate names.
input_names_eval, weighted_actual_names_eval, candidate_names_eval = load_dataset(config.eval_path, is_eval=True)

In [None]:
# Replace the loaded candidate list with the distinct names that occur as the first
# element of any weighted-actual-name triple, so candidates and actual names coincide.
unique_actual_names = {name for wans in weighted_actual_names_eval for name, _, _ in wans}
candidate_names_eval = np.array(list(unique_actual_names))
del unique_actual_names  # only needed to build the array
print(len(candidate_names_eval))

In [None]:
# Read the aggregated frequency file; na_filter=False keeps strings such as "NA" or
# empty fields as text instead of converting them to NaN.
freq_df = pd.read_csv(config.freq_path, na_filter=False)
name_freq = normalize_freq_names(
    freq_df,
    is_surname=(given_surname != "given"),
    add_padding=True,
)
# Drop the reference to the raw frame now that name_freq has been built.
freq_df = None

In [None]:
# Select the names to cluster from name_freq; n_to_cluster is passed as the second
# argument (presumably the number of names to keep — confirm in src.models.cluster).
names_to_cluster = get_names_to_cluster(name_freq, n_to_cluster)

### Make sure all names to cluster have clusters in the old name-cluster map

In [None]:
# Draw sample_size evaluation rows. train_test_split shuffles before splitting, so
# random_state is pinned: without it every run evaluates a different sample and the
# metrics logged for this run cannot be reproduced or compared across runs.
_, input_names_sample, _, weighted_actual_names_sample = train_test_split(
    input_names_eval,
    weighted_actual_names_eval,
    test_size=sample_size,
    random_state=42,
)
# The full candidate list is used for the sample as well.
candidate_names_sample = candidate_names_eval

print("input_names", len(input_names_sample))
print("weighted_actual_names", len(weighted_actual_names_sample))
print("candidate_names", len(candidate_names_sample))

In [None]:
# read the old cluster map
# Each line is tokenised after turning ':' into a space. The first non-empty token is
# the cluster label, and every token on the line (the label included) maps to it.
with open(f"../std_{given_surname}.txt", "rt") as f:
    lines = f.readlines()
old_name_cluster_map = {}
for raw_line in lines:
    tokens = [tok.strip() for tok in raw_line.replace(':', ' ').strip().split(' ')]
    tokens = [tok for tok in tokens if tok]
    if not tokens:
        # blank line: nothing to record
        continue
    cluster = tokens[0]
    for name in tokens:
        # a name seen on more than one line keeps the label of the last line
        old_name_cluster_map[add_padding(name)] = cluster

In [None]:
# read additional name->cluster assignments
# Every non-blank line must be exactly "<name> <cluster>" separated by one space;
# these entries are added to (and may override) the map built from the old cluster file.
with open(f"../names_not_found_{given_surname}.txt", "rt") as f:
    lines = f.readlines()
for entry in (raw.strip() for raw in lines):
    if entry:
        name, cluster = entry.split(' ')
        old_name_cluster_map[add_padding(name)] = cluster


In [None]:
# Number of names in the old lookup table once both input files have been read.
len(old_name_cluster_map)

In [None]:
# How many names to cluster are not in the lookup table?
# Missing names are collected with their padding removed.
names_not_found = {
    remove_padding(name)
    for name in names_to_cluster
    if name not in old_name_cluster_map
}
# The sampled input names and the candidate names must be covered as well.
names_not_found |= {
    remove_padding(name)
    for name in set(input_names_sample).union(candidate_names_sample)
    if name not in old_name_cluster_map
}
print(len(names_not_found))

In [None]:
# write out names not in the lookup table
# names_not_found is a set of strings, whose iteration order changes between kernel
# sessions; writing it sorted keeps the file identical from run to run.
with open("new_names_not_found.txt", "wt") as f:
    f.writelines(f"{name}\n" for name in sorted(names_not_found))

### Generate clusters

In [None]:
# get clusters for names to cluster
def generate_clusters_from_old_map(names_to_cluster, verbose=False):
    """Look up the old cluster label for every name.

    Labels are read from the module-level ``old_name_cluster_map``.

    Args:
        names_to_cluster: iterable of names, written the same way as the keys of
            ``old_name_cluster_map``.
        verbose: accepted but not used.

    Returns:
        dict mapping each name to its cluster label.

    Raises:
        KeyError: if a name is not a key of ``old_name_cluster_map``.
    """
    return {name: old_name_cluster_map[name] for name in names_to_cluster}

In [None]:
# Look up the old cluster label for every name in names_to_cluster.
name_cluster_old = generate_clusters_from_old_map(names_to_cluster=names_to_cluster,
                                                  verbose=verbose)

In [None]:
# Number of names that received an old cluster label.
len(name_cluster_old)

In [None]:
# Group the names by old cluster label and summarise how many names each cluster holds.
cluster_names = defaultdict(set)
for member, label in name_cluster_old.items():
    cluster_names[label].add(member)
cluster_sizes = [len(members) for members in cluster_names.values()]
cluster_sizes_df = pd.DataFrame(cluster_sizes)
print("names to cluster", len(names_to_cluster))
# cluster_names has one key per distinct label, so its length is the cluster count
print("number of clusters", len(cluster_names))
print("max cluster_size", max(cluster_sizes))
cluster_sizes_df.hist(bins=100)


In [None]:
# Total frequency per cluster: sum name_freq over the names in each cluster
# (a name without a frequency entry contributes 0), and list each cluster's names.
cluster_counts = defaultdict(int)
cluster_names = defaultdict(list)
for member, label in name_cluster_old.items():
    cluster_names[label].append(member)
    cluster_counts[label] = cluster_counts[label] + name_freq.get(member, 0)
cluster_counts_df = pd.DataFrame.from_dict(cluster_counts, orient='index', columns=['counts'])
cluster_counts_df.hist(bins=100)


In [None]:
# The 20 clusters with the largest summed name frequency.
cluster_counts_df.nlargest(20, 'counts')

In [None]:
# For each of the 20 highest-count clusters print its label, total count, number of
# names and the names themselves, followed by a blank line.
top_clusters = cluster_counts_df.nlargest(20, 'counts')
for cluster, count in top_clusters.itertuples(name=None):
    members = cluster_names[cluster]
    print(cluster, count, len(members), members)
    print()

In [None]:
# make sure we've added all names to the lookup table
# The lookup raises KeyError if any name to cluster, sampled input name or candidate
# name is still missing from the old map.
names_to_cluster_old = list(
    set(names_to_cluster) | set(input_names_sample) | set(candidate_names_sample)
)
name_cluster_old = generate_clusters_from_old_map(
    names_to_cluster=names_to_cluster_old,
    verbose=verbose,
)
print(len(name_cluster_old))

In [None]:
# Score the old clusters on the evaluation sample in lookup mode.
results = get_validation_results(
    # evaluation data
    input_names_eval=input_names_sample,
    weighted_actual_names_eval=weighted_actual_names_sample,
    candidate_names_eval=candidate_names_sample,
    # the old name -> cluster assignments being evaluated
    name_cluster=name_cluster_old,
    lookup_mode=True,
    # no frequencies or models are supplied for this run
    name_freq=None,
    swivel_model=None,
    swivel_vocab=None,
    tfidf_vectorizer=None,
    ensemble_model=None,
    # search and validation settings
    num_matches=None,
    max_clusters=None,
    search_threshold=0.5,
    sample_size=sample_size,
    validation_sizes=[0],
    # execution settings
    n_jobs=n_jobs,
    verbose=verbose,
)
print(results)

In [None]:
# End the Weights & Biases run started by wandb.init earlier in the notebook.
wandb.finish()