# Analyze name frequencies and codes
* count the number of distinct codes produced by various name coders
* average the frequencies in buckets of 1000 names, taken in order from most to least frequent
* graph by tree-record pair frequency or by preferred tree-name frequency


In [None]:
# IPython setup: load the autoreload extension in mode 2 and draw matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Graph various statistics of names and name-coders

In [None]:
from collections import namedtuple, defaultdict

import jellyfish
import joblib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from mpire import WorkerPool
import numpy as np
import pandas as pd
from pyphonetics import RefinedSoundex
# from rapidfuzz.string_metric import levenshtein
import regex
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.extmath import safe_sparse_dot
import torch
from tqdm import tqdm
from unidecode import unidecode
import wandb

from src.data.filesystem import fopen
from src.data.utils import load_dataset, select_frequent_k, frequent_k_names
from src.eval import metrics
from src.eval.utils import similars_to_ndarray
from src.models.ensemble import get_best_ensemble_matches
from src.models.swivel import SwivelModel, get_best_swivel_matches
from src.models.swivel_encoder import SwivelEncoderModel
from src.models.utils import remove_padding, add_padding

In [None]:
# Notebook configuration.
# `given_surname` selects which name type is analysed; it is interpolated
# into both S3 paths below and also picks the vocabulary size.

given_surname = "given"
if given_surname == "given":
    vocab_size = 610000
else:
    vocab_size = 2100000
sample_size = 10000

# Immutable record holding the input locations used by this notebook.
Config = namedtuple("Config", ["eval_path", "freq_path"])
config = Config(
    freq_path=f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz",
    eval_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz",
)

In [None]:
# Start a Weights & Biases run for this notebook. The run is grouped by the
# selected name type and stores the data paths from `config` as its config.
wandb.init(
    entity="nama",
    project="nama",
    group=given_surname,
    name="72_analyze_names",
    config=config._asdict(),
    notes="",
)

### Load data

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# Only report CUDA memory figures when a GPU was actually selected:
# torch.cuda.get_device_properties(0) raises on CPU-only machines, which
# would abort this cell even though the CPU fallback above is valid.
if device.type == "cuda":
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

In [None]:
# Load the dataset at config.eval_path; load_dataset returns three parallel
# pieces that the cells below refer to by these names.
(
    input_names_eval,
    weighted_actual_names_eval,
    candidate_names_eval,
) = load_dataset(config.eval_path, is_eval=False)

In [None]:
# Read the name-frequency table. na_filter=False keeps every cell as written
# instead of letting pandas turn strings such as "NA" into NaN.
freq_df = pd.read_csv(config.freq_path, na_filter=False)
# Map padded name -> frequency, preserving the row order of the file
# (later cells iterate this dict in that order).
name_freq = dict(zip(map(add_padding, freq_df["name"]), freq_df["frequency"]))
# freq_df is intentionally left in memory here.

### Nysiis codes

In [None]:
# Cut-offs: how many leading names of name_freq each coder is evaluated on.
n_names = [50_000, 100_000, 200_000, 400_000, 610_000]

In [None]:
def clean(name):
    """Return `name` reduced to plain lowercase ASCII letters.

    The name is lowercased, stripped of its padding with `remove_padding`,
    transliterated to ASCII with `unidecode`, lowercased again, and finally
    every character outside a-z is removed.

    Args:
        name: the (possibly padded) name string.

    Returns:
        A string containing only the characters a-z; may be empty.
    """
    # unidecode can emit capital letters even for lowercase input (e.g. CJK
    # characters become capitalised syllables), so lowercase once more after
    # transliterating; otherwise the [^a-z] filter would drop those letters.
    name = unidecode(remove_padding(name.lower())).lower()
    name = regex.sub(r'[^a-z]', "", name)
    return name

In [None]:
# For each cut-off, count the distinct NYSIIS codes produced by the first
# `num` names of name_freq (dict order, i.e. the frequency file's row order).
for num in n_names:
    codes = set()
    for ix, name in enumerate(name_freq):
        # ix is 0-based: stop as soon as exactly `num` names were visited.
        # (Testing `ix > num` would let num + 1 names through.)
        if ix >= num:
            break
        name = clean(name)
        if not name:
            # nothing left after cleaning -> no code to record
            continue
        codes.add(jellyfish.nysiis(name))
    print(num, len(codes))

### Soundex codes

In [None]:
# For each cut-off, count the distinct Soundex codes produced by the first
# `num` names of name_freq (dict order, i.e. the frequency file's row order).
for num in n_names:
    codes = set()
    for ix, name in enumerate(name_freq):
        # ix is 0-based: stop as soon as exactly `num` names were visited.
        # (Testing `ix > num` would let num + 1 names through.)
        if ix >= num:
            break
        name = clean(name)
        if not name:
            # nothing left after cleaning -> no code to record
            continue
        codes.add(jellyfish.soundex(name))
    print(num, len(codes))

### Refined Soundex

In [None]:
refined_soundex = RefinedSoundex()

# For each cut-off, count the distinct Refined Soundex codes produced by the
# first `num` names of name_freq (dict order, i.e. the file's row order).
for num in n_names:
    codes = set()
    for ix, name in enumerate(name_freq):
        # ix is 0-based: stop as soon as exactly `num` names were visited.
        # (Testing `ix > num` would let num + 1 names through.)
        if ix >= num:
            break
        name = clean(name)
        if not name:
            # nothing left after cleaning -> no code to record
            continue
        try:
            codes.add(refined_soundex.phonetics(name))
        except Exception:
            # Best effort: report the name the coder could not handle and
            # carry on. `except Exception` (rather than a bare `except`)
            # keeps Ctrl-C / kernel interrupts working during the long loop.
            print(name)
    print(num, len(codes))

### Graph tree name frequencies

In [None]:
# Average the frequencies of name_freq in consecutive buckets of
# `bucket_size` names (dict order). xs holds the number of names consumed at
# the end of each bucket, ys the mean frequency inside that bucket.
bucket_size = 1000
sum_freq = 0
cnt = 0
xs = []
ys = []
for ix, freq in enumerate(name_freq.values(), start=1):
    sum_freq += freq
    cnt += 1
    if ix % bucket_size == 0:
        xs.append(ix)
        ys.append(sum_freq / cnt)
        sum_freq = 0
        cnt = 0
# Flush the trailing partial bucket; without this the last names would be
# silently left out of the averages.
if cnt > 0:
    xs.append(ix)
    ys.append(sum_freq / cnt)
print(len(xs), len(ys))

In [None]:
# Scatter the bucketed averages (xs, ys) on a single large axes and clip the
# view to the ranges of interest.
figure, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 15))
ax.set_title("Name frequencies")
ax.scatter(xs, ys)
ax.set_ylim(0, 1000)
ax.set_xlim(0, 500000)
plt.show()

### Graph tree-record name frequencies

In [None]:
# Accumulate, for every name, the frequencies of all pairs it takes part in:
# each pair's frequency is added both to the input name and to the paired
# name. The result is a list of (name, total) sorted by total, descending.
pair_totals = defaultdict(int)
for tree_name, paired_entries in zip(input_names_eval, weighted_actual_names_eval):
    for paired_name, _, pair_freq in paired_entries:
        pair_totals[tree_name] += pair_freq
        pair_totals[paired_name] += pair_freq
freqs = sorted(pair_totals.items(), key=lambda kv: kv[1], reverse=True)

In [None]:
# Average the totals in `freqs` (a list of (name, total) pairs) in
# consecutive buckets of `bucket_size` entries. xs holds the number of
# entries consumed at the end of each bucket, ys the mean total inside it.
bucket_size = 1000
sum_freq = 0
cnt = 0
xs = []
ys = []
for ix, (_, freq) in enumerate(freqs, start=1):
    sum_freq += freq
    cnt += 1
    if ix % bucket_size == 0:
        xs.append(ix)
        ys.append(sum_freq / cnt)
        sum_freq = 0
        cnt = 0
# Flush the trailing partial bucket; without this the last entries would be
# silently left out of the averages.
if cnt > 0:
    xs.append(ix)
    ys.append(sum_freq / cnt)
print(len(xs), len(ys))

In [None]:
# Scatter the bucketed pair-frequency averages (xs, ys); only the y range is
# clipped here, the x range is left to matplotlib.
figure, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 15))
ax.set_title("Tree-name <-> Record-name frequencies")
ax.scatter(xs, ys)
ax.set_ylim(0, 1000)
plt.show()

### Graph same-name weights
How likely is a name to be associated with itself?

In [None]:
# For every input name that also appears among its own paired names, record
# (name, frequency from name_freq or 0 if unknown, weight of that self-pair).
# Only the first matching entry per input name is used.
name_freq_weight_triplets = []
for input_name, wans in zip(input_names_eval, weighted_actual_names_eval):
    self_entry = next((entry for entry in wans if entry[0] == input_name), None)
    if self_entry is None:
        continue
    name_freq_weight_triplets.append(
        (input_name, name_freq.get(input_name, 0), self_entry[1])
    )
# Order by frequency, most frequent first (stable, so ties keep input order).
name_freq_weight_triplets.sort(key=lambda triplet: triplet[1], reverse=True)

In [None]:
# Average the self-pair weights of name_freq_weight_triplets in consecutive
# buckets of `bucket_size` entries. xs holds the number of entries consumed
# at the end of each bucket, ys the mean weight inside that bucket.
bucket_size = 10
sum_weight = 0.0
cnt = 0
xs = []
ys = []
for ix, (_, _, weight) in enumerate(name_freq_weight_triplets, start=1):
    sum_weight += weight
    cnt += 1
    if ix % bucket_size == 0:
        xs.append(ix)
        ys.append(sum_weight / cnt)
        sum_weight = 0.0
        cnt = 0
# Flush the trailing partial bucket; without this the last entries would be
# silently left out of the averages.
if cnt > 0:
    xs.append(ix)
    ys.append(sum_weight / cnt)
print(len(xs), len(ys))

In [None]:
# Scatter the bucketed self-pair weight averages (xs, ys) and clip both axes
# to the ranges of interest.
figure, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 15))
ax.set_title("Tree-name -> same record-name average weight")
ax.scatter(xs, ys)
ax.set_ylim(0, 1.0)
ax.set_xlim(0, 250000)
plt.show()