## Hyperparameter searches for clustering

In [None]:
# Reload edited project modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from collections import defaultdict, namedtuple
from datetime import datetime
import math

from hyperopt import hp, fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ray
from ray import tune
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.integration.wandb import WandbLoggerCallback
from sklearn.model_selection import train_test_split
import torch

from src.data.normalize import normalize_freq_names
from src.data.utils import load_dataset, select_frequent_k
from src.data.filesystem import fopen
from src.eval.metrics import (
    avg_precision_at_threshold, 
    avg_weighted_recall_at_threshold,
    precision_weighted_recall_curve_at_threshold,
)
from src.models.cluster import (
    get_sorted_similarities,
    generate_closures,
    generate_clusters,
    get_clusters,
    get_best_cluster_matches,
)
from src.models.ensemble import get_best_ensemble_matches
from src.models.swivel import SwivelModel, get_swivel_embeddings
from src.models.swivel_encoder import SwivelEncoderModel
from src.models.utils import add_padding

### Agglomerative
#### min, max, avg, or avg with extra entries for frequent names? x5
#### How many names to cluster (50k, 100k, 150k, 200k, max)? x5
### Consider other algos
##### at this point we've decided how many names to cluster and whether to add extra entries for frequent names
#### OPTICS vs. HDBSCAN? x2
#### Various parameters x8

## How to evaluate clusters
### How to choose a metric for how far away a cluster is from a name?
#### min, max, avg distance?
#### if avg, weight frequent names more in the average calculation?
## How to create super-clusters?
### Can we cluster the clusters?
### If we re-cluster the names, what happens if a super-cluster splits a cluster?
#### maybe that doesn't happen very often, and is it always bad?
## Fallback
#### use nysiis?

In [None]:
# configure
wandb_api_key_file = "../.wandb-api-key"
given_surname = "surname"
vocab_size = 610000 if given_surname == "given" else 2100000
embed_dim = 100
encoder_layers = 2
num_matches = 5000
batch_size = 256

# Defaults shared by every search space / manual trial in this notebook.
DEFAULT_NAMES_TO_CLUSTER = 50000  # TODO tune
DEFAULT_SEARCH_THRESHOLD = 0.0
DEFAULT_REPEAT_FREQ_NAMES = False
DEFAULT_ALGO = "agglomerative"
# agglomerative options
DEFAULT_CLUSTER_THRESHOLD = 0.3
DEFAULT_CLUSTER_LINKAGE = "average"
# optics and hdbscan options
DEFAULT_MIN_SAMPLES = 2
DEFAULT_EPS = 0.2
# optics options
DEFAULT_MAX_EPS = 1.0
DEFAULT_XI = 0.15
# hdbscan options
DEFAULT_SELECTION_METHOD = "eom"
DEFAULT_MIN_CLUSTER_SIZE = 2

# train_eval refuses to cluster more names than this
MAX_NAMES_TO_CLUSTER = 200000

# Immutable record of the data / model locations used by the loading cells.
Config = namedtuple(
    "Config",
    "eval_path freq_path embed_dim swivel_vocab_path swivel_model_path tfidf_path ensemble_model_path",
)

# Shared path prefixes; the full paths differ only in their suffixes.
_processed_prefix = "s3://familysearch-names/processed"
_model_prefix = f"s3://nama-data/data/models/fs-{given_surname}"

config = Config(
    eval_path=f"{_processed_prefix}/tree-hr-{given_surname}-train.csv.gz",
    freq_path=f"{_processed_prefix}/tree-preferred-{given_surname}-aggr.csv.gz",
    embed_dim=embed_dim,
    swivel_vocab_path=f"{_model_prefix}-swivel-vocab-{vocab_size}-augmented.csv",
    swivel_model_path=f"{_model_prefix}-swivel-model-{vocab_size}-{embed_dim}-augmented.pth",
    tfidf_path=f"{_model_prefix}-tfidf.joblib",
    ensemble_model_path=f"{_model_prefix}-ensemble-model-{vocab_size}-{embed_dim}-augmented-40-40-25.joblib",
)

### Load data

In [None]:
# Device used when loading the Swivel model; GPU auto-detection is kept here, commented out, for reference.
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = "cpu"  # force CPU because we want to run multiple trials in parallel
print(device)

In [None]:
# Load the evaluation dataset (input names, weighted actual names, candidate names) from config.eval_path.
input_names_eval, weighted_actual_names_eval, candidate_names_eval = load_dataset(config.eval_path, is_eval=True)

In [None]:
# Read the name-frequency table; na_filter=False stops pandas from turning strings such as "NA" into NaN.
freq_df = pd.read_csv(config.freq_path, na_filter=False)
# Normalize into name_freq (used below as a name -> frequency mapping).
name_freq = normalize_freq_names(freq_df, is_surname=given_surname != "given", add_padding=True)
# Drop the reference so the raw DataFrame can be garbage-collected.
freq_df = None

In [None]:
# Load the Swivel vocabulary CSV and build a name -> index lookup from its "name" and "index" columns.
vocab_df = pd.read_csv(fopen(config.swivel_vocab_path, "rb"))
swivel_vocab = dict(zip(vocab_df["name"], vocab_df["index"]))

In [None]:
# Build the Swivel model sized to the vocabulary and restore its trained weights.
swivel_model = SwivelModel(len(swivel_vocab), config.embed_dim)
# NOTE(review): the ".40" suffix presumably selects a specific saved checkpoint -- confirm.
# map_location places the restored tensors on `device`.
swivel_model.load_state_dict(torch.load(fopen(config.swivel_model_path+".40", "rb"), map_location=torch.device(device)))
swivel_model.to(device)
# Switch to inference mode (disables training-only behaviour such as dropout).
swivel_model.eval()

In [None]:
# Load the fitted TF-IDF vectorizer. joblib.load unpickles the file, so only load artifacts from trusted storage.
tfidf_vectorizer = joblib.load(fopen(config.tfidf_path, mode='rb'))

In [None]:
# Load the fitted ensemble model. joblib.load unpickles the file, so only load artifacts from trusted storage.
ensemble_model = joblib.load(fopen(config.ensemble_model_path, mode='rb'))

### Optimize hyperparameters

In [None]:
def _f_beta(precision, recall, beta):
    """Return the F-beta score, or 0.0 when precision and recall are both zero."""
    beta_sq = beta * beta
    denominator = beta_sq * precision + recall
    if denominator == 0:
        # Avoid ZeroDivisionError (or NaN for numpy floats) when nothing was matched.
        return 0.0
    return (1 + beta_sq) * (precision * recall) / denominator


def train_eval(config,
               swivel_model,
               swivel_vocab,
               tfidf_vectorizer,
               ensemble_model,
               name_freq,
               input_names_eval,
               weighted_actual_names_eval,
               candidate_names_eval,
               n_jobs=1,
               verbose=False,
               random_state=None):
    """Cluster the first ``n_to_cluster`` names and score the clustering on the eval data.

    The function
      1. takes the first ``config["n_to_cluster"]`` keys of ``name_freq``,
      2. optionally repeats each name ``ceil(log10(freq + 1))`` times,
      3. scores every name against every other with the ensemble model,
      4. builds a dense distance matrix and clusters it with ``generate_clusters``,
      5. evaluates precision / weighted recall / F1 / F2 on several validation subsets.

    It also reads the notebook-level globals ``num_matches``, ``batch_size`` and
    ``MAX_NAMES_TO_CLUSTER``.

    Args:
        config: dict of hyperparameters. Keys read here: n_to_cluster, repeat_freq_names,
            cluster_algo, cluster_linkage, cluster_threshold, min_samples, eps,
            cluster_method, max_eps, xi, selection_method, min_cluster_size,
            search_threshold.
        swivel_model, swivel_vocab, tfidf_vectorizer, ensemble_model: fitted models that
            are passed through to ``get_best_ensemble_matches`` and ``get_clusters``.
        name_freq: mapping of name -> frequency.
        input_names_eval, weighted_actual_names_eval, candidate_names_eval: evaluation data.
        n_jobs: parallelism forwarded to the scoring / clustering helpers.
        verbose: print progress and plot a cluster-size histogram.
        random_state: forwarded to ``train_test_split`` when the validation set is
            sub-sampled. ``None`` (the default) keeps the previous unseeded behaviour;
            pass an int for reproducible trials.

    Returns:
        ``{'error': 'too large'}`` when more than ``MAX_NAMES_TO_CLUSTER`` names would be
        clustered; otherwise a dict with the mean ``f1`` / ``f2`` and the per-validation-size
        lists ``f1s``, ``f2s``, ``precisions`` and ``recalls``.
    """
    # get names to score
    # NOTE(review): assumes name_freq iterates most-frequent-first, so the slice keeps the
    # most frequent names -- confirm against normalize_freq_names.
    names_to_score = np.array(list(name_freq.keys())[:config["n_to_cluster"]])

    # repeat frequent names?
    if config["repeat_freq_names"]:
        names_to_cluster = np.array([
            name
            for name in names_to_score
            for _ in range(math.ceil(math.log10(name_freq[name] + 1)))
        ])
    else:
        names_to_cluster = names_to_score

    # The size check needs nothing from the scoring step, so reject oversized runs
    # before paying for the expensive ensemble scoring.
    if len(names_to_cluster) > MAX_NAMES_TO_CLUSTER:
        return {
            'error': 'too large',
        }

    # get ensemble scores
    if verbose:
        print("get ensemble scores", datetime.now(), len(names_to_score))
    similar_names_scores = get_best_ensemble_matches(
        model=swivel_model,
        vocab=swivel_vocab,
        input_names=names_to_score,
        candidate_names=names_to_score,
        tfidf_vectorizer=tfidf_vectorizer,
        ensemble_model=ensemble_model,
        name_freq=name_freq,
        k=num_matches,
        batch_size=batch_size,
        add_context=True,
        n_jobs=n_jobs,
        verbose=False,
    )

    # create name to index dictionary (a repeated name owns several row indices)
    name_indices = defaultdict(list)
    for ix, name in enumerate(names_to_cluster):
        name_indices[name].append(ix)

    # create distances array
    # names are initially 2.0 apart; similar names are 1.0 - score apart
    if verbose:
        print("create distances array", datetime.now(), len(names_to_cluster))
    distances = np.full((len(names_to_cluster), len(names_to_cluster)), 2.0, dtype=np.float32)
    for name1, names_scores in zip(names_to_score, similar_names_scores):
        name1_ixs = name_indices[name1]
        for name2, score in names_scores:
            name2_ixs = name_indices[name2]
            for name1_ix in name1_ixs:
                for name2_ix in name2_ixs:
                    # write both triangles so the matrix stays symmetric
                    distances[name1_ix, name2_ix] = 1.0 - score
                    distances[name2_ix, name1_ix] = 1.0 - score
    del similar_names_scores

    # generate clusters from distances
    if verbose:
        print("generate clusters", datetime.now())
    clusters = generate_clusters(distances,
                                 cluster_algo=config["cluster_algo"],
                                 # agglomerative options
                                 cluster_linkage=config["cluster_linkage"],
                                 cluster_threshold=config["cluster_threshold"],
                                 # optics or hdbscan options
                                 min_samples=config["min_samples"],
                                 eps=config["eps"],
                                 # optics options
                                 cluster_method=config["cluster_method"],
                                 max_eps=config["max_eps"],
                                 xi=config["xi"],
                                 # hdbscan options
                                 selection_method=config["selection_method"],
                                 min_cluster_size=config["min_cluster_size"],
                                 # other options
                                 n_jobs=n_jobs,
                                 verbose=False,
                                )
    del distances

    # generate cluster->names and name->cluster
    cluster_names = defaultdict(list)
    name_cluster = {}
    max_cluster_size = 0
    max_cluster_id = None
    for _id, cluster in enumerate(clusters):
        clustered_name = names_to_cluster[_id]
        cluster_names[cluster].append(clustered_name)
        if len(cluster_names[cluster]) > max_cluster_size:
            max_cluster_size = len(cluster_names[cluster])
            max_cluster_id = cluster
        # for a repeated name the last assignment wins
        name_cluster[clustered_name] = cluster
    if verbose:
        print("number of clusters", datetime.now(), len(cluster_names))
        print("max cluster size", datetime.now(), max_cluster_size)
        print("max cluster", cluster_names[max_cluster_id])
        cluster_sizes_df = pd.DataFrame([len(names) for names in cluster_names.values()])
        cluster_sizes_df.hist(bins=100)

    # validate on validation sets of various sizes (0 means the full evaluation set)
    validation_sizes = [25000, 50000, 100000, 200000, 0]
    sample_size = 20000
    precisions = []
    recalls = []
    f1s = []
    f2s = []
    for size in validation_sizes:
        if verbose:
            print("validate", datetime.now(), size)
        if size == 0:
            input_names_validate, weighted_actual_names_validate, candidate_names_validate = \
                input_names_eval, weighted_actual_names_eval, candidate_names_eval
        else:
            input_names_validate, weighted_actual_names_validate, candidate_names_validate = \
                select_frequent_k(input_names_eval,
                                  weighted_actual_names_eval,
                                  candidate_names_eval,
                                  size)

        # sample the validation set
        if len(input_names_validate) > sample_size:
            _, input_names_validate, _, weighted_actual_names_validate = \
                train_test_split(input_names_validate,
                                 weighted_actual_names_validate,
                                 test_size=sample_size,
                                 random_state=random_state)
            # The candidate names are deliberately NOT filtered down to the sampled
            # weighted actual names: notebook 90 does not do that either, and keeping
            # them makes the numbers a fair comparison.

        # get validate names
        all_names_validate = list(set(input_names_validate).union(set(candidate_names_validate)))

        # assign all names to clusters
        if verbose:
            print("get_clusters", datetime.now(), len(all_names_validate))
        name2clusters, cluster2names = get_clusters(all_names_validate,
                                                    name_cluster,
                                                    swivel_model,
                                                    swivel_vocab,
                                                    tfidf_vectorizer,
                                                    ensemble_model,
                                                    name_freq,
                                                    max_clusters=1,
                                                    n_jobs=n_jobs,
                                                    verbose=False,
                                                   )

        # get best matches
        if verbose:
            print("get_best_cluster_matches", datetime.now())
        best_matches = get_best_cluster_matches(name2clusters, cluster2names,
                                                input_names_validate)

        # eval f1 / f2 (both are 0.0 when precision and recall are both zero)
        precision = avg_precision_at_threshold(weighted_actual_names_validate, best_matches, config["search_threshold"])
        recall = avg_weighted_recall_at_threshold(weighted_actual_names_validate, best_matches, config["search_threshold"])
        f1 = _f_beta(precision, recall, 1)
        f2 = _f_beta(precision, recall, 2)
        if verbose:
            print("result", datetime.now(), "precision", precision, "recall", recall, "f1", f1, "f2", f2)

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
        f2s.append(f2)

    f1 = (sum(f1s) / len(f1s)) if len(f1s) > 0 else 0
    f2 = (sum(f2s) / len(f2s)) if len(f2s) > 0 else 0

    return {
        'f1': f1,
        'f2': f2,
        'f1s': f1s,
        'f2s': f2s,
        'precisions': precisions,
        'recalls': recalls,
    }

#### Use Ray Tune

In [None]:
def ray_training_function(config,
                          swivel_model,
                          swivel_vocab,
                          tfidf_vectorizer,
                          ensemble_model,
                          name_freq,
                          input_names_eval,
                          weighted_actual_names_eval,
                          candidate_names_eval,
                          n_jobs=1,
                          verbose=False):
    """Ray Tune trainable: run one ``train_eval`` trial and report its metrics.

    All arguments are forwarded unchanged to ``train_eval``. When the result
    contains an ``'error'`` key nothing is reported to Ray; otherwise f1, f2,
    f1s, f2s, precisions and recalls are passed to ``tune.report``.
    """
    result = train_eval(config,
                        swivel_model,
                        swivel_vocab,
                        tfidf_vectorizer,
                        ensemble_model,
                        name_freq,
                        input_names_eval,
                        weighted_actual_names_eval,
                        candidate_names_eval,
                        n_jobs=n_jobs,
                        verbose=verbose)

    # A failed trial has no metrics to report.
    if 'error' in result:
        return

    # Report the metrics to Ray
    reported_keys = ('f1', 'f2', 'f1s', 'f2s', 'precisions', 'recalls')
    tune.report(**{key: result[key] for key in reported_keys})

In [None]:
# Ray Tune search space: only n_to_cluster and cluster_threshold are sampled;
# every other entry is held constant (earlier sampled alternatives kept as comments).
config_params = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=tune.qrandint(100000, 200000, 50000),
    search_threshold=0.0,  # tune.quniform(0.0, 0.6, 0.1),
    repeat_freq_names=False,  # tune.choice([True, False]),
    cluster_threshold=tune.quniform(-0.98, -0.78, 0.05),
    cluster_linkage="average",  # tune.choice(["average", "single", "complete"]),
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,  # tune.choice(["eom", "leaf"]),
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)

# Seed point(s) intended for the searcher's points_to_evaluate.
# NOTE(review): DEFAULT_NAMES_TO_CLUSTER and DEFAULT_CLUSTER_THRESHOLD fall outside the
# ranges sampled above -- confirm the searcher accepts this point before re-enabling the run.
current_best_params = [
    dict(
        cluster_algo=DEFAULT_ALGO,
        n_to_cluster=DEFAULT_NAMES_TO_CLUSTER,
        search_threshold=DEFAULT_SEARCH_THRESHOLD,
        repeat_freq_names=DEFAULT_REPEAT_FREQ_NAMES,
        cluster_threshold=DEFAULT_CLUSTER_THRESHOLD,
        cluster_linkage=DEFAULT_CLUSTER_LINKAGE,
        min_samples=DEFAULT_MIN_SAMPLES,
        eps=DEFAULT_EPS,
        max_eps=DEFAULT_MAX_EPS,
        cluster_method="dbscan",
        xi=DEFAULT_XI,
        selection_method=DEFAULT_SELECTION_METHOD,
        min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
    ),
]

In [None]:
# https://docs.ray.io/en/latest/tune/api_docs/suggestion.html#tune-hyperopt

# search_alg = HyperOptSearch(points_to_evaluate=current_best_params)

# ray.shutdown()
# ray.init()

# callbacks = []
# if wandb_api_key_file:
#     callbacks.append(WandbLoggerCallback(
#         project="nama",
#         entity="nama",
#         group="80_cluster_tune_"+given_surname+"_agglomerative",
#         notes="",
#         config=config._asdict(),
#         api_key_file=wandb_api_key_file
#     ))

# result = tune.run(
#     tune.with_parameters(ray_training_function,
#                          swivel_model=swivel_model,
#                          swivel_vocab=swivel_vocab,
#                          tfidf_vectorizer=tfidf_vectorizer,
#                          ensemble_model=ensemble_model,
#                          name_freq=name_freq,
#                          input_names_eval=input_names_eval,
#                          weighted_actual_names_eval=weighted_actual_names_eval,
#                          candidate_names_eval=candidate_names_eval),
#     resources_per_trial={"cpu": 8.0, "gpu": 0.0},
#     max_concurrent_trials=1,
#     config=config_params,
#     search_alg=search_alg,
#     num_samples=100,
#     metric="f2",
#     mode="max",
#     checkpoint_score_attr="f2",
#     time_budget_s=46*3600,
#     progress_reporter=tune.JupyterNotebookReporter(
#         overwrite=False,
#         max_report_frequency=5*60
#     ),
#     callbacks=callbacks
# )

In [None]:
# # Get trial that has the highest F2
# best_trial = result.get_best_trial(metric='f2', mode='max', scope='all')

# # Parameters with the highest F2
# best_trial.config

# print(f"Best trial final train f2: {best_trial.last_result['f2']}")
# print(f"Best trial final train precision: {best_trial.last_result['precision']}")
# print(f"Best trial final train recall: {best_trial.last_result['recall']}")

# # All trials as pandas dataframe
# df = result.results_df

# df

#### Use Hyperopt

In [None]:
def hyperopt_objective_function(swivel_model,
                                swivel_vocab,
                                tfidf_vectorizer,
                                ensemble_model,
                                name_freq,
                                input_names_eval,
                                weighted_actual_names_eval,
                                candidate_names_eval,
                                n_jobs=1,
                                verbose=False):
    """Build a Hyperopt objective that closes over the models and evaluation data.

    The returned ``objective(config)`` runs ``train_eval`` for one sampled
    configuration and returns a Hyperopt result dict: ``{'status': STATUS_FAIL}``
    when ``train_eval`` reports an error, otherwise ``STATUS_OK`` with
    ``loss = 1.0 - f2`` plus the config and all metrics.
    """
    metric_keys = ('f1', 'f2', 'f1s', 'f2s', 'precisions', 'recalls')

    def objective(config):
        # Sampled values may arrive as floats; n_to_cluster is used as a slice bound.
        config['n_to_cluster'] = int(config['n_to_cluster'])
        if verbose:
            print("config", datetime.now(), config)
        result = train_eval(config,
                            swivel_model,
                            swivel_vocab,
                            tfidf_vectorizer,
                            ensemble_model,
                            name_freq,
                            input_names_eval,
                            weighted_actual_names_eval,
                            candidate_names_eval,
                            n_jobs=n_jobs,
                            verbose=verbose)
        if verbose:
            print("result", datetime.now(), result)

        if 'error' in result:
            return {'status': STATUS_FAIL}

        # Hyperopt minimizes the loss, so maximize F2 by minimizing 1 - F2.
        outcome = {
            'status': STATUS_OK,
            'loss': 1.0 - result['f2'],
            'config': config,
        }
        outcome.update((key, result[key]) for key in metric_keys)
        return outcome

    return objective

In [None]:
# Parallelism forwarded to train_eval for this search.
n_jobs = 64

# HyperOpt search space
# Every entry is currently a constant (the sampled alternatives are kept as comments),
# so with max_evals=1 this evaluates a single fixed configuration.
space = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,  # hp.quniform('n_to_cluster', 50000, 200000, 50000),
    search_threshold=0.0,  # hp.quniform('search_threshold', 0.0, 0.1, 0.1),
    repeat_freq_names=False,  # hp.choice('repeat_freq_names', [True, False]),
    cluster_threshold=0.0,  # hp.quniform('cluster_threshold', -0.5, 0.1, 0.2),
    cluster_linkage="average",  # hp.choice('cluster_linkage', ["average", "single"]),
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,  # tune.choice(["eom", "leaf"]),
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)

# Bind the models and evaluation data into the objective.
objective = hyperopt_objective_function(
    swivel_model=swivel_model,
    swivel_vocab=swivel_vocab,
    tfidf_vectorizer=tfidf_vectorizer,
    ensemble_model=ensemble_model,
    name_freq=name_freq,
    input_names_eval=input_names_eval,
    weighted_actual_names_eval=weighted_actual_names_eval,
    candidate_names_eval=candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
trials = Trials()

# minimize the objective over the space
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=1)

In [None]:
# Show the value fmin returned and the result dict recorded for each trial.
print("best", best)
print("results", trials.results) 

### Manual hyperparameter tuning

In [None]:
# Parallelism forwarded to train_eval by the manual trials below.
n_jobs = 1  # TODO 64

In [None]:
# Manual trial: 50,000 names, average linkage, cluster threshold -0.98.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=50000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.98,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 150,000 names, average linkage, cluster threshold 0.0.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=150000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=0.0,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 100,000 names, single linkage, cluster threshold 0.0.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=0.0,
    cluster_linkage="single",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 100,000 names, complete linkage, cluster threshold 0.0.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=0.0,
    cluster_linkage="complete",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 50,000 names, complete linkage, cluster threshold 0.0.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=50000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=0.0,
    cluster_linkage="complete",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 100,000 names, average linkage, cluster threshold -0.2.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.2,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 100,000 names, average linkage, cluster threshold -0.4.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.4,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 100,000 names, average linkage, cluster threshold -0.6.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.6,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 100,000 names, single linkage, cluster threshold 0.1.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=0.1,
    cluster_linkage="single",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 100,000 names, single linkage, cluster threshold 0.2.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=0.2,
    cluster_linkage="single",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 100,000 names, average linkage, cluster threshold -0.8.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.8,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Manual trial: 100,000 names, average linkage, cluster threshold -0.9.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.9,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Single evaluation run with a larger clustering set: n_to_cluster=200000,
# cluster_threshold=-0.9, "average" linkage.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=200000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.9,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Single evaluation run: n_to_cluster=100000, cluster_threshold=-0.95,
# "average" linkage. Algorithm-specific settings use the DEFAULT_* constants.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.95,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Single evaluation run: n_to_cluster=100000, cluster_threshold=-0.98,
# "average" linkage. Algorithm-specific settings use the DEFAULT_* constants.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.98,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Single evaluation run: n_to_cluster=100000, cluster_threshold=-0.98,
# "average" linkage.
# NOTE(review): every value here matches the preceding -0.98 cell, so this
# looks like an accidental repeat (or a deliberate re-run to check that
# results are stable) -- confirm which, and drop or re-parameterize it.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.98,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

In [None]:
# Single evaluation run: n_to_cluster=100000, cluster_threshold=-0.99,
# "average" linkage. Algorithm-specific settings use the DEFAULT_* constants.
config = dict(
    cluster_algo=DEFAULT_ALGO,
    n_to_cluster=100000,
    search_threshold=0.0,
    repeat_freq_names=False,
    cluster_threshold=-0.99,
    cluster_linkage="average",
    min_samples=DEFAULT_MIN_SAMPLES,
    eps=DEFAULT_EPS,
    max_eps=DEFAULT_MAX_EPS,
    cluster_method="dbscan",
    xi=DEFAULT_XI,
    selection_method=DEFAULT_SELECTION_METHOD,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
)
result = train_eval(
    config,
    swivel_model,
    swivel_vocab,
    tfidf_vectorizer,
    ensemble_model,
    name_freq,
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
    n_jobs=n_jobs,
    verbose=True,
)
print(result)

### Test

In [None]:
# Hand-picked six-name fixture for poking at the clustering helpers.
cluster_names = [
    "<john>",
    "<jonathan>",
    "<mary>",
    "<marie>",
    "<maria>",
    "<george>",
]
# A single closure ("c") holding the index of every fixture name.
closure2ids = {"c": list(range(len(cluster_names)))}
# Look up the swivel embedding of each fixture name and cast to float32.
cluster_embeddings = get_swivel_embeddings(
    swivel_model, swivel_vocab, cluster_names
).astype("float32")

In [None]:
# Run generate_clusters on the fixture closure with several settings and print
# each resulting id -> cluster mapping. The two varying positional arguments
# are presumably (threshold, linkage) -- confirm against generate_clusters.
# NOTE(review): if generate_clusters wraps sklearn's agglomerative clustering,
# "ward" linkage only supports euclidean distances -- verify the last case.
for test_threshold, test_linkage in [
    (0.15, "average"),
    (0.99, "average"),
    (0.01, "average"),
    (0.01, "ward"),
]:
    id2cluster = generate_clusters(
        closure2ids, cluster_embeddings, test_threshold, test_linkage, n_jobs=1
    )
    print(id2cluster)


In [None]:
from sklearn.preprocessing import normalize

# Rescale every embedding row to unit L2 norm (these are normalize()'s
# defaults, spelled out). Re-running the cell is harmless: rows that are
# already unit-length are left effectively unchanged.
cluster_embeddings = normalize(cluster_embeddings, norm="l2", axis=1)

In [None]:
from sklearn.cluster import OPTICS, cluster_optics_dbscan

# OPTICS settings for the fixture; trailing notes give the range and step
# being considered for the search.
min_samples = 2
max_eps = 0.7
xi = 0.05   # 0.01..0.20, 0.01
metric = "cosine"
eps = 0.45  # 0.45..0.70, 0.05

# fit() returns the estimator itself, so construction and fitting can be chained.
clust = OPTICS(
    min_samples=min_samples,
    xi=xi,
    max_eps=max_eps,
    metric=metric,
).fit(cluster_embeddings)

# Extract DBSCAN-style labels at radius `eps` from the fitted OPTICS ordering.
labels = cluster_optics_dbscan(
    reachability=clust.reachability_,
    core_distances=clust.core_distances_,
    ordering=clust.ordering_,
    eps=eps,
)
labels

In [None]:
import hdbscan

# HDBSCAN settings for the fixture (these overwrite min_samples / eps from the
# OPTICS cell).
min_samples = 2
eps = 0.0
selection_method = "leaf"
min_cluster_size = 2

# Euclidean metric is used here; the embeddings were L2-normalized in an
# earlier cell. fit() returns the estimator, so it is chained.
clust = hdbscan.HDBSCAN(
    min_samples=min_samples,
    cluster_selection_epsilon=eps,
    cluster_selection_method=selection_method,
    min_cluster_size=min_cluster_size,
    metric="euclidean",
).fit(cluster_embeddings)

clust.labels_

In [None]:
# Largest cluster id assigned by the fitted clusterer.
max_cluster = np.max(clust.labels_)
max_cluster

In [None]:
# Toy label vector for the relabelling experiment below; the negative entries
# mimic the -1 "noise" label that the clusterers emit.
labels = [0, 1, 0, 1, -1, -1, 0, 1, -1]
# Highest cluster id already in use.
max_cluster = max(labels)

In [None]:
# Give every negative (unclustered) label its own fresh cluster id, numbering
# upward from max_cluster + 1; non-negative labels pass through unchanged.
#
# A local counter is used instead of incrementing `max_cluster` itself, so the
# cell is idempotent: re-running it produces the same `results` rather than
# continuing from wherever the previous run left the counter.
next_cluster = max_cluster
results = []
for label in labels:
    if label < 0:
        next_cluster += 1
        label = next_cluster
    results.append(label)

print(results)