In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Hyperparameter search for clustering

In [None]:
from collections import defaultdict, namedtuple
import heapq
import math
import pickle

from hyperopt import hp, fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
import jellyfish
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ray
from ray import tune
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.integration.wandb import WandbLoggerCallback
import torch

from src.data.normalize import normalize_freq_names
from src.data.utils import load_dataset, select_frequent_k
from src.data.filesystem import fopen
from src.eval.metrics import (
    avg_precision_at_threshold, 
    avg_weighted_recall_at_threshold,
    precision_weighted_recall_curve_at_threshold,
)
from src.models.cluster import (
    generate_clusters,
    get_clusters,
    get_best_cluster_matches,
    get_names_to_cluster,
    get_distances,
    generate_clusters_from_distances,
    get_validation_results,
)
from src.models.ensemble import get_best_ensemble_matches
from src.models.swivel import SwivelModel, get_swivel_embeddings
from src.models.swivel_encoder import SwivelEncoderModel
from src.models.utils import add_padding, remove_padding

In [None]:
# configure
# W&B API key file; in this notebook only the commented-out Ray Tune cell passes it to WandbLoggerCallback
wandb_api_key_file = "../.wandb-api-key"
# "given" or anything else (treated as surname): selects the vocab size and is interpolated into the paths below
given_surname = "given"
vocab_size = 610000 if given_surname == "given" else 2100000
embed_dim = 100
encoder_layers = 2
# passed as num_matches to get_distances and get_validation_results in later cells
num_matches = 5000
batch_size = 256

# The DEFAULT_* values are referenced only by the commented-out Ray Tune / Hyperopt search spaces in this notebook
DEFAULT_NAMES_TO_CLUSTER = 50000  # TODO tune
DEFAULT_SEARCH_THRESHOLD = 0.0
DEFAULT_REPEAT_FREQ_NAMES = False
DEFAULT_ALGO = "agglomerative"
# agglomerative options
DEFAULT_CLUSTER_THRESHOLD = 0.3
DEFAULT_CLUSTER_LINKAGE = "average"
# optics and hdbscan options
DEFAULT_MIN_SAMPLES = 2
DEFAULT_EPS = 0.2
# optics options
DEFAULT_MAX_EPS = 1.0
DEFAULT_XI = 0.15
# hdbscan options
DEFAULT_SELECTION_METHOD = "eom"
DEFAULT_MIN_CLUSTER_SIZE = 2

# NOTE(review): MAX_NAMES_TO_CLUSTER is not referenced by any visible cell — confirm it is still needed
MAX_NAMES_TO_CLUSTER = 200000

# Immutable bundle of the data/model locations used by the load cells below
Config = namedtuple("Config", [ 
    "eval_path",
    "freq_path",
    "embed_dim",
    "swivel_vocab_path",
    "swivel_model_path",
    "tfidf_path",
    "ensemble_model_path"
])
config = Config(
    eval_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
    freq_path=f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz",
    embed_dim=embed_dim,
    swivel_vocab_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-vocab-{vocab_size}-augmented.csv",
    swivel_model_path=f"s3://nama-data/data/models/fs-{given_surname}-swivel-model-{vocab_size}-{embed_dim}-augmented.pth",
    tfidf_path=f"s3://nama-data/data/models/fs-{given_surname}-tfidf.joblib",
    ensemble_model_path=f"s3://nama-data/data/models/fs-{given_surname}-ensemble-model-{vocab_size}-{embed_dim}-augmented-100.joblib",
)

### Load data

In [None]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"  # force CPU because we want to run multiple trials in parallel
print(device)

In [None]:
input_names_eval, weighted_actual_names_eval, candidate_names_eval = load_dataset(config.eval_path, is_eval=True)

In [None]:
# Replace the loaded candidate list with the distinct names that appear
# in the weighted actual names of the eval set.
candidate_names_eval = np.array(list(
    {name for wans in weighted_actual_names_eval for name, _, _ in wans}
))
print(len(candidate_names_eval))

In [None]:
# Load the aggregated name frequencies and normalize them into name_freq.
# na_filter=False stops pandas from turning name strings such as "NA" into NaN.
freq_df = pd.read_csv(config.freq_path, na_filter=False)
name_freq = normalize_freq_names(freq_df, is_surname=given_surname != "given", add_padding=True)
# drop the reference to the raw frame; no later visible cell uses it
freq_df = None

In [None]:
# Load the swivel vocabulary and build the name -> index lookup.
vocab_df = pd.read_csv(fopen(config.swivel_vocab_path, "rb"))
swivel_vocab = dict(zip(vocab_df["name"], vocab_df["index"]))

In [None]:
# Build the swivel model sized from the loaded vocab and config.embed_dim, then load its saved weights onto `device`.
swivel_model = SwivelModel(len(swivel_vocab), config.embed_dim)
swivel_model.load_state_dict(torch.load(fopen(config.swivel_model_path, "rb"), map_location=torch.device(device)))
swivel_model.to(device)
# presumably switches the torch module to inference mode — TODO confirm SwivelModel is an nn.Module;
# as the last expression of the cell, its return value is displayed
swivel_model.eval()

In [None]:
tfidf_vectorizer = joblib.load(fopen(config.tfidf_path, mode='rb'))

In [None]:
ensemble_model = joblib.load(fopen(config.ensemble_model_path, mode='rb'))

### Optimize hyperparameters

In [None]:
def train_eval(config,
               swivel_model,
               swivel_vocab,
               tfidf_vectorizer,
               ensemble_model,
               name_freq,
               input_names_eval,
               weighted_actual_names_eval,
               candidate_names_eval,
               n_jobs=1,
               verbose=False):
    """Cluster the most frequent names for one hyperparameter setting and validate the result.

    Steps: pick the names to cluster, compute their distances, cluster them from
    those distances, then run get_validation_results on the eval data.

    Args:
        config: mapping of hyperparameters; the keys read here are "n_to_cluster",
            "cluster_algo", "cluster_linkage", "cluster_threshold",
            "search_threshold" and "max_clusters".
        swivel_model, swivel_vocab, tfidf_vectorizer, ensemble_model: model
            artifacts forwarded unchanged to get_distances and get_validation_results.
        name_freq: name frequencies, forwarded to the clustering helpers.
        input_names_eval, weighted_actual_names_eval, candidate_names_eval:
            evaluation data forwarded to get_validation_results.
        n_jobs: parallelism forwarded to every helper.
        verbose: verbosity flag forwarded to every helper.

    Returns:
        Whatever get_validation_results returns (the commented-out tuning cells
        treat it as a dict with metric keys or an 'error' key — TODO confirm).

    Note:
        num_matches is read from the notebook's global configuration, not from `config`.
    """
    # arguments shared by the distance and validation steps
    model_kwargs = dict(
        swivel_model=swivel_model,
        swivel_vocab=swivel_vocab,
        tfidf_vectorizer=tfidf_vectorizer,
        ensemble_model=ensemble_model,
    )
    # arguments shared by every helper that can run in parallel
    run_kwargs = dict(verbose=verbose, n_jobs=n_jobs)

    names_to_cluster = get_names_to_cluster(name_freq, config["n_to_cluster"])

    distances = get_distances(
        name_freq,
        names_to_cluster,
        num_matches=num_matches,
        **model_kwargs,
        **run_kwargs,
    )

    name_cluster = generate_clusters_from_distances(
        cluster_algo=config["cluster_algo"],
        cluster_linkage=config["cluster_linkage"],
        cluster_threshold=config["cluster_threshold"],
        distances=distances,
        names_to_cluster=names_to_cluster,
        **run_kwargs,
    )

    # validate on validation sets of various sizes
    return get_validation_results(
        input_names_eval=input_names_eval,
        weighted_actual_names_eval=weighted_actual_names_eval,
        candidate_names_eval=candidate_names_eval,
        name_freq=name_freq,
        name_cluster=name_cluster,
        search_threshold=config["search_threshold"],
        num_matches=num_matches,
        max_clusters=config["max_clusters"],
        **model_kwargs,
        **run_kwargs,
    )


#### Use Ray Tune

In [None]:
# def ray_training_function(config,
#                           swivel_model,
#                           swivel_vocab,
#                           tfidf_vectorizer,
#                           ensemble_model,
#                           name_freq,
#                           input_names_eval,
#                           weighted_actual_names_eval,
#                           candidate_names_eval,
#                           n_jobs=1,
#                           verbose=False):

#     result = train_eval(config,
#                         swivel_model,
#                         swivel_vocab,
#                         tfidf_vectorizer,
#                         ensemble_model,
#                         name_freq,
#                         input_names_eval,
#                         weighted_actual_names_eval,
#                         candidate_names_eval,
#                         n_jobs=n_jobs,
#                         verbose=verbose)
    
#     if 'error' not in result:
#         # Report the metrics to Ray
#         tune.report(f1=result['f1'],
#                     f2=result['f2'],
#                     f1s=result['f1s'],
#                     f2s=result['f2s'],
#                     precisions=result['precisions'], 
#                     recalls=result['recalls'],
#                    )

In [None]:
# config_params={
#     "cluster_algo": DEFAULT_ALGO,
#     "n_to_cluster": tune.qrandint(100000, 200000, 50000),
#     "search_threshold": 0.0,  # tune.quniform(0.0, 0.6, 0.1),
#     "max_clusters": 10,
#     "repeat_freq_names": False,  # tune.choice([True, False]),
#     "cluster_threshold": tune.quniform(-0.98, -0.78, 0.05),
#     "cluster_linkage": "average",  # tune.choice(["average", "single", "complete"]),
#     "min_samples": DEFAULT_MIN_SAMPLES,
#     "eps": DEFAULT_EPS,
#     "max_eps": DEFAULT_MAX_EPS,
#     "cluster_method": "dbscan",
#     "xi": DEFAULT_XI, 
#     "selection_method": DEFAULT_SELECTION_METHOD,  # tune.choice(["eom", "leaf"]),
#     "min_cluster_size": DEFAULT_MIN_CLUSTER_SIZE,
# }

# current_best_params = [{
#     "cluster_algo": DEFAULT_ALGO,
#     "n_to_cluster": DEFAULT_NAMES_TO_CLUSTER,
#     "search_threshold": DEFAULT_SEARCH_THRESHOLD,
#     "max_clusters": 10,
#     "repeat_freq_names": DEFAULT_REPEAT_FREQ_NAMES,
#     "cluster_threshold": DEFAULT_CLUSTER_THRESHOLD,
#     "cluster_linkage": DEFAULT_CLUSTER_LINKAGE,
#     "min_samples": DEFAULT_MIN_SAMPLES,
#     "eps": DEFAULT_EPS,
#     "max_eps": DEFAULT_MAX_EPS,
#     "cluster_method": "dbscan",
#     "xi": DEFAULT_XI,
#     "selection_method": DEFAULT_SELECTION_METHOD,
#     "min_cluster_size": DEFAULT_MIN_CLUSTER_SIZE,
# }]

In [None]:
# https://docs.ray.io/en/latest/tune/api_docs/suggestion.html#tune-hyperopt

# search_alg = HyperOptSearch(points_to_evaluate=current_best_params)

# ray.shutdown()
# ray.init()

# callbacks = []
# if wandb_api_key_file:
#     callbacks.append(WandbLoggerCallback(
#         project="nama",
#         entity="nama",
#         group="80_cluster_tune_"+given_surname+"_agglomerative",
#         notes="",
#         config=config._asdict(),
#         api_key_file=wandb_api_key_file
#     ))

# result = tune.run(
#     tune.with_parameters(ray_training_function,
#                          swivel_model=swivel_model,
#                          swivel_vocab=swivel_vocab,
#                          tfidf_vectorizer=tfidf_vectorizer,
#                          ensemble_model=ensemble_model,
#                          name_freq=name_freq,
#                          input_names_eval=input_names_eval,
#                          weighted_actual_names_eval=weighted_actual_names_eval,
#                          candidate_names_eval=candidate_names_eval),
#     resources_per_trial={"cpu": 8.0, "gpu": 0.0},
#     max_concurrent_trials=1,
#     config=config_params,
#     search_alg=search_alg,
#     num_samples=100,
#     metric="f2",
#     mode="max",
#     checkpoint_score_attr="f2",
#     time_budget_s=46*3600,
#     progress_reporter=tune.JupyterNotebookReporter(
#         overwrite=False,
#         max_report_frequency=5*60
#     ),
#     callbacks=callbacks
# )

In [None]:
# # Get trial that has the highest F2
# best_trial = result.get_best_trial(metric='f2', mode='max', scope='all')

# # Parameters with the highest F2
# best_trial.config

# print(f"Best trial final train f2: {best_trial.last_result['f2']}")
# print(f"Best trial final train precision: {best_trial.last_result['precision']}")
# print(f"Best trial final train recall: {best_trial.last_result['recall']}")

# # All trials as pandas dataframe
# df = result.results_df

# df

#### Use Hyperopt

In [None]:
# def hyperopt_objective_function(swivel_model,
#                                 swivel_vocab,
#                                 tfidf_vectorizer,
#                                 ensemble_model,
#                                 name_freq,
#                                 input_names_eval,
#                                 weighted_actual_names_eval,
#                                 candidate_names_eval,
#                                 n_jobs=1,
#                                 verbose=False):
#     def objective(config):
#         config['n_to_cluster'] = int(config['n_to_cluster'])
#         if verbose:
#             print("config", datetime.now(), config)
#         result = train_eval(config,
#                             swivel_model,
#                             swivel_vocab,
#                             tfidf_vectorizer,
#                             ensemble_model,
#                             name_freq,
#                             input_names_eval,
#                             weighted_actual_names_eval,
#                             candidate_names_eval,
#                             n_jobs=n_jobs,
#                             verbose=verbose)
#         if verbose:
#             print("result", datetime.now(), result)

#         if 'error' in result:
#             return {
#                 'status': STATUS_FAIL
#             }
#         else:
#             return {
#                 'status': STATUS_OK,
#                 'loss': 1.0 - result['f2'],
#                 'config': config,
#                 'f1': result['f1'],
#                 'f2': result['f2'],
#                 'f1s': result['f1s'],
#                 'f2s': result['f2s'],
#                 'precisions': result['precisions'],
#                 'recalls': result['recalls'],
#             }
#     return objective

In [None]:
# n_jobs = 64

# # HyperOpt search space
# space = {
#     "cluster_algo": DEFAULT_ALGO,
#     "n_to_cluster": 100000,  # hp.quniform('n_to_cluster', 50000, 200000, 50000),
#     "search_threshold": 0.0,  # hp.quniform('search_threshold', 0.0, 0.1, 0.1),
#     "repeat_freq_names": False,  # hp.choice('repeat_freq_names', [True, False]),
#     "cluster_threshold": 0.0,  # hp.quniform('cluster_threshold', -0.5, 0.1, 0.2),
#     "cluster_linkage": "average", # hp.choice('cluster_linkage', ["average", "single"]), 
#     "min_samples": DEFAULT_MIN_SAMPLES,
#     "eps": DEFAULT_EPS,
#     "max_eps": DEFAULT_MAX_EPS,
#     "cluster_method": "dbscan",
#     "xi": DEFAULT_XI, 
#     "selection_method": DEFAULT_SELECTION_METHOD,  # tune.choice(["eom", "leaf"]),
#     "min_cluster_size": DEFAULT_MIN_CLUSTER_SIZE,    
# }
# objective = hyperopt_objective_function(swivel_model,
#                                         swivel_vocab,
#                                         tfidf_vectorizer,
#                                         ensemble_model,
#                                         name_freq,
#                                         input_names_eval,
#                                         weighted_actual_names_eval,
#                                         candidate_names_eval,
#                                         n_jobs=n_jobs,
#                                         verbose=True)
# trials = Trials()

# # minimize the objective over the space
# best = fmin(objective, 
#             space, 
#             algo=tpe.suggest, 
#             trials=trials,
#             max_evals=1)

In [None]:
# print("best", best)
# print("results", trials.results) 

### Manual hyperparameter tuning

In [None]:
# Settings for the manual tuning cells below.
n_jobs = 8
verbose = True
# NOTE(review): 20 names is far below DEFAULT_NAMES_TO_CLUSTER (50000) — looks like a smoke-test value; confirm before a real run
n_to_cluster = 20
# overwritten with 0.5 in the "Generate clusters @ 0.5" cell before it is used
cluster_threshold=0.3
# NOTE(review): no later visible cell reads this variable; the validation call passes search_thresholds instead
search_threshold=0.0
max_clusters=20 
# thresholds passed (as a list) to get_validation_results for the agglomerative clusters
search_thresholds=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]

In [None]:
names_to_cluster = get_names_to_cluster(name_freq, n_to_cluster)

In [None]:
# Compute the distances that generate_clusters_from_distances consumes further down.
distances = get_distances(name_freq, 
                          names_to_cluster,
                          swivel_model=swivel_model,
                          swivel_vocab=swivel_vocab,
                          tfidf_vectorizer=tfidf_vectorizer,
                          ensemble_model=ensemble_model,
                          num_matches=num_matches,
                          verbose=verbose,
                          n_jobs=n_jobs,
                         )

In [None]:
# Cache distances and names_to_cluster on disk (per given/surname) so the next two cells can reload them.
np.savez_compressed(f"distances_{given_surname}.npz", distances=distances)
with open(f"names_to_cluster_{given_surname}.pickle", 'wb') as handle:
    pickle.dump(names_to_cluster, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the cached distances. np.load returns a lazy NpzFile for .npz archives; the
# context manager closes the underlying file once the array has been read into memory.
# NOTE(review): allow_pickle=True is only needed if `distances` is an object array, and
# unpickling can execute arbitrary code — only load files written by the save cell above.
with np.load(f"distances_{given_surname}.npz", allow_pickle=True) as npz_file:
    distances = npz_file["distances"]

In [None]:
# Reload the cached names_to_cluster.
# pickle.load can execute arbitrary code: only load files written by this notebook's save cell.
with open(f"names_to_cluster_{given_surname}.pickle", "rb") as f:
    names_to_cluster = pickle.load(f)

#### How many input and candidate names are not in names to cluster?

In [None]:
names_to_cluster_set = set(names_to_cluster)
print(len(names_to_cluster_set))

In [None]:
# Take a validation subset of the eval data via select_frequent_k; `size` is the k passed to it.
size = 100000
input_names_validate, weighted_actual_names_validate, candidate_names_validate = \
    select_frequent_k(input_names_eval,
                      weighted_actual_names_eval,
                      candidate_names_eval,
                      size)

In [None]:
# Sizes of the validation inputs and candidates, followed by how many of each
# are missing from the set of names to cluster.
print(len(input_names_validate))
print(len(candidate_names_validate))
for validate_names in (input_names_validate, candidate_names_validate):
    print(sum(1 for name in validate_names if name not in names_to_cluster_set))

### Generate clusters @ 0.5

In [None]:
# Cluster with average-linkage agglomerative clustering at threshold 0.5
# (this rebinds the cluster_threshold set in the manual tuning cell).
cluster_threshold = 0.5
name_cluster = generate_clusters_from_distances(cluster_algo="agglomerative",
             cluster_linkage="average",
             cluster_threshold=cluster_threshold,
             distances=distances,
             names_to_cluster=names_to_cluster,
             verbose=verbose,
             n_jobs=n_jobs)

In [None]:
# Per cluster: sum the frequencies of its names (cluster_counts) and collect the names (cluster_names),
# then plot a histogram of the per-cluster totals.
cluster_counts = defaultdict(int)
cluster_names = defaultdict(list)
for name, cluster in name_cluster.items():
    cluster_counts[cluster] += name_freq[name]
    cluster_names[cluster].append(name)
# one row per cluster id, single column 'counts'
cluster_counts_df = pd.DataFrame.from_dict(cluster_counts, 
                                           orient='index',
                                           columns=['counts'],
                                          )
cluster_counts_df.hist(bins=100)


In [None]:
print(next(iter(name_freq.items())))

In [None]:
cluster_counts_df.nlargest(20, 'counts')

In [None]:
# For the 20 clusters with the largest total count, print:
# cluster id, total count, number of names, and the names themselves.
top_clusters = cluster_counts_df.nlargest(20, 'counts')
for cluster, count in top_clusters['counts'].items():
    print(cluster, count, len(cluster_names[cluster]), cluster_names[cluster])
    print()

In [None]:
# Evaluate the agglomerative clusters; note that the list search_thresholds is passed as search_threshold.
results = get_validation_results(input_names_eval=input_names_eval,
                              weighted_actual_names_eval=weighted_actual_names_eval,
                              candidate_names_eval=candidate_names_eval,
                              name_freq=name_freq,
                              name_cluster=name_cluster,
                              swivel_model=swivel_model,
                              swivel_vocab=swivel_vocab,
                              tfidf_vectorizer=tfidf_vectorizer,
                              ensemble_model=ensemble_model,
                              search_threshold=search_thresholds,
                              num_matches=num_matches,
                              max_clusters=max_clusters,
                              n_jobs=n_jobs,
                              verbose=verbose)
print(results)

In [None]:
# Persist the name -> cluster mapping for this cluster_threshold.
# NOTE(review): unlike the distances / names_to_cluster cache files, this filename does not
# include given_surname, so given and surname runs write to the same file — confirm this is intended.
with open(f"name_cluster_{cluster_threshold}.pickle", 'wb') as handle:
    pickle.dump(name_cluster, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Generate clusters from nysiis

In [None]:
def generate_clusters_from_nysiis(names_to_cluster, verbose=False):
    """Assign every name to the cluster identified by its NYSIIS code.

    Args:
        names_to_cluster: iterable of padded names; padding is removed before encoding.
        verbose: accepted for signature parity with the other cluster generators; not used.

    Returns:
        dict mapping each (still padded) name to the NYSIIS code of its unpadded form.
    """
    return {name: jellyfish.nysiis(remove_padding(name)) for name in names_to_cluster}

In [None]:
name_cluster_nysiis = generate_clusters_from_nysiis(names_to_cluster=names_to_cluster,
                                                    verbose=verbose)

In [None]:
# Group names by NYSIIS code, report cluster-size statistics, and plot the size distribution.
cluster_names = defaultdict(set)
for name, cluster in name_cluster_nysiis.items():
    cluster_names[cluster].add(name)
# number of names in each cluster, computed once and reused below
cluster_sizes = [len(names) for names in cluster_names.values()]
cluster_sizes_df = pd.DataFrame(cluster_sizes)
print("names to cluster", len(names_to_cluster))
print("number of clusters", len(set(name_cluster_nysiis.values())))
print("max cluster_size", max(cluster_sizes))
cluster_sizes_df.hist(bins=100)


In [None]:
# Per NYSIIS code: sum the frequencies of its names (cluster_counts) and collect the names
# (cluster_names), then plot a histogram of the per-cluster totals.
cluster_counts = defaultdict(int)
cluster_names = defaultdict(list)
for name, cluster in name_cluster_nysiis.items():
    # .get(): once name_cluster_nysiis is rebuilt with the eval names (lookup-table cell
    # further down) and this cell is re-run, some names may be absent from name_freq;
    # count them as 0, as the old-cluster-map section already does
    cluster_counts[cluster] += name_freq.get(name, 0)
    cluster_names[cluster].append(name)
# one row per NYSIIS code, single column 'counts'
cluster_counts_df = pd.DataFrame.from_dict(cluster_counts,
                                           orient='index',
                                           columns=['counts'],
                                          )
cluster_counts_df.hist(bins=100)


In [None]:
cluster_counts_df.nlargest(20, 'counts')

In [None]:
# For the 20 NYSIIS clusters with the largest total count, print:
# code, total count, number of names, and the names themselves.
top_clusters = cluster_counts_df.nlargest(20, 'counts')
for cluster, count in top_clusters['counts'].items():
    print(cluster, count, len(cluster_names[cluster]), cluster_names[cluster])
    print()

In [None]:
# make sure we've added all names to the lookup table
# make sure we've added all names to the lookup table:
# the names to cluster plus every eval input name and candidate name
names_to_cluster_nysiis = list(
    set(names_to_cluster) | set(input_names_eval) | set(candidate_names_eval)
)
name_cluster_nysiis = generate_clusters_from_nysiis(names_to_cluster=names_to_cluster_nysiis,
                                                    verbose=verbose)
print(len(name_cluster_nysiis))

In [None]:
# Evaluate the NYSIIS codes as a plain lookup table: lookup_mode=True, with the model,
# frequency and match-count arguments passed as None.
# NOTE(review): presumably lookup_mode makes get_validation_results ignore the None arguments — confirm.
results = get_validation_results(input_names_eval=input_names_eval,
                              weighted_actual_names_eval=weighted_actual_names_eval,
                              candidate_names_eval=candidate_names_eval,
                              name_cluster=name_cluster_nysiis,
                              name_freq=None,
                              swivel_model=None,
                              swivel_vocab=None,
                              tfidf_vectorizer=None,
                              ensemble_model=None,
                              num_matches=None,
                              max_clusters=None,
                              search_threshold=0.5,
                              lookup_mode=True,
                              n_jobs=n_jobs,
                              verbose=verbose)
print(results)

### Generate clusters from soundex

In [None]:
def generate_clusters_from_soundex(names_to_cluster, verbose=False):
    """Assign every name to the cluster identified by its Soundex code.

    Args:
        names_to_cluster: iterable of padded names; padding is removed before encoding.
        verbose: accepted for signature parity with the other cluster generators; not used.

    Returns:
        dict mapping each (still padded) name to the Soundex code of its unpadded form.
    """
    return {name: jellyfish.soundex(remove_padding(name)) for name in names_to_cluster}

In [None]:
name_cluster_soundex = generate_clusters_from_soundex(names_to_cluster=names_to_cluster,
                                                      verbose=verbose)

In [None]:
# Group names by Soundex code, report cluster-size statistics, and plot the size distribution.
cluster_names = defaultdict(set)
for name, cluster in name_cluster_soundex.items():
    cluster_names[cluster].add(name)
# number of names in each cluster, computed once and reused below
cluster_sizes = [len(names) for names in cluster_names.values()]
cluster_sizes_df = pd.DataFrame(cluster_sizes)
print("names to cluster", len(names_to_cluster))
print("number of clusters", len(set(name_cluster_soundex.values())))
print("max cluster_size", max(cluster_sizes))
cluster_sizes_df.hist(bins=100)


In [None]:
# Per Soundex code: sum the frequencies of its names (cluster_counts) and collect the names
# (cluster_names), then plot a histogram of the per-cluster totals.
cluster_counts = defaultdict(int)
cluster_names = defaultdict(list)
for name, cluster in name_cluster_soundex.items():
    # .get(): once name_cluster_soundex is rebuilt with the eval names (lookup-table cell
    # further down) and this cell is re-run, some names may be absent from name_freq;
    # count them as 0, as the old-cluster-map section already does
    cluster_counts[cluster] += name_freq.get(name, 0)
    cluster_names[cluster].append(name)
# one row per Soundex code, single column 'counts'
cluster_counts_df = pd.DataFrame.from_dict(cluster_counts,
                                           orient='index',
                                           columns=['counts'],
                                          )
cluster_counts_df.hist(bins=100)

In [None]:
cluster_counts_df.nlargest(20, 'counts')

In [None]:
# For the 20 Soundex clusters with the largest total count, print:
# code, total count, number of names, and the names themselves.
top_clusters = cluster_counts_df.nlargest(20, 'counts')
for cluster, count in top_clusters['counts'].items():
    print(cluster, count, len(cluster_names[cluster]), cluster_names[cluster])
    print()

In [None]:
# make sure we've added all names to the lookup table
# make sure we've added all names to the lookup table:
# the names to cluster plus every eval input name and candidate name
names_to_cluster_soundex = list(
    set(names_to_cluster) | set(input_names_eval) | set(candidate_names_eval)
)
name_cluster_soundex = generate_clusters_from_soundex(names_to_cluster=names_to_cluster_soundex,
                                                      verbose=verbose)
print(len(name_cluster_soundex))

In [None]:
# Evaluate the Soundex codes as a plain lookup table: lookup_mode=True, with the model,
# frequency and match-count arguments passed as None.
# NOTE(review): presumably lookup_mode makes get_validation_results ignore the None arguments — confirm.
results = get_validation_results(input_names_eval=input_names_eval,
                              weighted_actual_names_eval=weighted_actual_names_eval,
                              candidate_names_eval=candidate_names_eval,
                              name_cluster=name_cluster_soundex,
                              name_freq=None,
                              swivel_model=None,
                              swivel_vocab=None,
                              tfidf_vectorizer=None,
                              ensemble_model=None,
                              num_matches=None,
                              max_clusters=None,
                              search_threshold=0.5,
                              lookup_mode=True,
                              n_jobs=n_jobs,
                              verbose=verbose)
print(results)

### Generate clusters from old code

In [None]:
# read the old cluster map
# read the old cluster map
# Each line lists names separated by spaces and/or ':'; the first name on the line
# is the cluster label, and every name on the line (label included) maps to it.
with open(f"std_{given_surname}.txt", "rt") as f:
    lines = f.readlines()
old_name_cluster_map = {}
for line in lines:
    raw_tokens = line.replace(':', ' ').strip().split(' ')
    tokens = [token.strip() for token in raw_tokens]
    tokens = [token for token in tokens if token]
    if not tokens:
        # blank line: nothing to record
        continue
    cluster = tokens[0]
    for name in tokens:
        old_name_cluster_map[add_padding(name)] = cluster

In [None]:
# read additional name->cluster assignments
# read additional name->cluster assignments
# Each non-blank line must hold exactly two whitespace-separated fields: "<name> <cluster>".
with open(f"names_not_found_{given_surname}.txt", "rt") as f:
    lines = f.readlines()
for line_number, line in enumerate(lines, start=1):
    # split on any run of whitespace so repeated spaces or tabs do not break the parse
    fields = line.split()
    if not fields:
        continue
    if len(fields) != 2:
        # same exception type as the original tuple-unpacking failure, but pointing at the bad line
        raise ValueError(
            f"names_not_found_{given_surname}.txt line {line_number}: "
            f"expected '<name> <cluster>', got {line!r}"
        )
    name, cluster = fields
    old_name_cluster_map[add_padding(name)] = cluster


In [None]:
len(old_name_cluster_map)

In [None]:
# How many names to cluster are not in the lookup table?
# How many names to cluster are not in the lookup table?
# Check the names to cluster first, then the eval input and candidate names;
# missing names are collected without padding.
names_not_found = {
    remove_padding(name)
    for names in (names_to_cluster, set(input_names_eval).union(candidate_names_eval))
    for name in names
    if name not in old_name_cluster_map
}
print(len(names_not_found))


In [None]:
# write out names not in the lookup table
# write out names not in the lookup table, one unpadded name per line
# NOTE(review): this file has no given/surname suffix, whereas the cell that reads the
# additional assignments opens names_not_found_{given_surname}.txt — confirm the hand-off.
with open("names_not_found.txt", "wt") as f:
    f.writelines(name + '\n' for name in names_not_found)

In [None]:
# get clusters for names to cluster
def generate_clusters_from_old_map(names_to_cluster, verbose=False):
    """Look up each name's cluster in the notebook-level old_name_cluster_map.

    Args:
        names_to_cluster: iterable of names; each must be a key of old_name_cluster_map.
        verbose: accepted for signature parity with the other cluster generators; not used.

    Returns:
        dict mapping each name to its cluster from old_name_cluster_map.

    Raises:
        KeyError: if a name is missing from old_name_cluster_map.
    """
    return {name: old_name_cluster_map[name] for name in names_to_cluster}

In [None]:
name_cluster_old = generate_clusters_from_old_map(names_to_cluster=names_to_cluster,
                                                  verbose=verbose)

In [None]:
len(name_cluster_old)

In [None]:
# Group names by their old-map cluster, report cluster-size statistics, and plot the size distribution.
cluster_names = defaultdict(set)
for name, cluster in name_cluster_old.items():
    cluster_names[cluster].add(name)
# number of names in each cluster, computed once and reused below
cluster_sizes = [len(names) for names in cluster_names.values()]
cluster_sizes_df = pd.DataFrame(cluster_sizes)
print("names to cluster", len(names_to_cluster))
print("number of clusters", len(set(name_cluster_old.values())))
print("max cluster_size", max(cluster_sizes))
cluster_sizes_df.hist(bins=100)


In [None]:
# Per old-map cluster: sum the frequencies of its names (cluster_counts) and collect the names
# (cluster_names), then plot a histogram of the per-cluster totals.
cluster_counts = defaultdict(int)
cluster_names = defaultdict(list)
for name, cluster in name_cluster_old.items():
    # names without an entry in name_freq contribute 0
    cluster_counts[cluster] += name_freq.get(name, 0)
    cluster_names[cluster].append(name)
# one row per cluster label, single column 'counts'
cluster_counts_df = pd.DataFrame.from_dict(cluster_counts, 
                                           orient='index',
                                           columns=['counts'],
                                          )
cluster_counts_df.hist(bins=100)


In [None]:
cluster_counts_df.nlargest(20, 'counts')

In [None]:
# For the 20 old-map clusters with the largest total count, print:
# label, total count, number of names, and the names themselves.
top_clusters = cluster_counts_df.nlargest(20, 'counts')
for cluster, count in top_clusters['counts'].items():
    print(cluster, count, len(cluster_names[cluster]), cluster_names[cluster])
    print()

In [None]:
# make sure we've added all names to the lookup table
# make sure we've added all names to the lookup table:
# the names to cluster plus every eval input name and candidate name
names_to_cluster_old = list(
    set(names_to_cluster) | set(input_names_eval) | set(candidate_names_eval)
)
name_cluster_old = generate_clusters_from_old_map(names_to_cluster=names_to_cluster_old,
                                                  verbose=verbose)
print(len(name_cluster_old))

In [None]:
# Evaluate the old cluster map as a plain lookup table: lookup_mode=True, with the model,
# frequency and match-count arguments passed as None.
# NOTE(review): presumably lookup_mode makes get_validation_results ignore the None arguments — confirm.
results = get_validation_results(input_names_eval=input_names_eval,
                              weighted_actual_names_eval=weighted_actual_names_eval,
                              candidate_names_eval=candidate_names_eval,
                              name_cluster=name_cluster_old,
                              name_freq=None,
                              swivel_model=None,
                              swivel_vocab=None,
                              tfidf_vectorizer=None,
                              ensemble_model=None,
                              num_matches=None,
                              max_clusters=None,
                              search_threshold=0.5,
                              lookup_mode=True,
                              n_jobs=n_jobs,
                              verbose=verbose)
print(results)