In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Graph aggregate statistics and select similar and dissimilar pairs
We will develop a similarity model for the similar pairs; someone will review the dissimilar pairs to find nicknames, which we will incorporate later.

In [None]:
from collections import namedtuple

import matplotlib.pyplot as plt
import pandas as pd
import wandb

In [None]:
# Name part handled by this run. It is interpolated into every S3 path below
# and selects the minimum pair frequency.
given_surname = "surname"

# Immutable bundle of the notebook's settings; also logged to wandb as a dict.
Config = namedtuple("Config", "in_path min_freq similar_out_path dissimilar_out_path")

config = Config(
    # "surname" uses a threshold of 20; any other value falls back to 5.
    min_freq={"surname": 20}.get(given_surname, 5),
    # Aggregated pair statistics read by this notebook.
    in_path=f"s3://familysearch-names/interim/tree-hr-{given_surname}-aggr.parquet",
    # Destinations for the two pair sets written at the end of the notebook.
    similar_out_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-similar.csv.gz",
    dissimilar_out_path=f"s3://familysearch-names/processed/tree-hr-{given_surname}-dissimilar.csv.gz",
)

In [None]:
# Settings for the experiment-tracking run: runs are grouped by the name part
# being processed, and the notebook configuration is logged as a plain dict.
run_settings = {
    "project": "nama",
    "entity": "nama",
    "name": "40_filter",
    "group": given_surname,
    "notes": "",
    "config": config._asdict(),
}
wandb.init(**run_settings)

In [None]:
# Load the aggregated name-pair table from the configured parquet path.
df = pd.read_parquet(config.in_path)
# Show (rows, columns) as a quick check that the load produced data.
print(df.shape)

In [None]:
# Display 10 randomly chosen rows of the loaded table for a visual sanity check.
df.sample(n=10)

In [None]:
# review low-frequency names: rows where either total frequency column is below 100
low_freq_names = df[(df["total_name_frequency"] < 100) | (df["total_alt_name_frequency"] < 100)]
# Cap the sample size: DataFrame.sample raises ValueError when n exceeds the
# number of rows (sampling without replacement), which would abort Run All.
low_freq_names.sample(n=min(25, len(low_freq_names)))

In [None]:
# One stacked panel per pair-level score column.
fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(16, 16))
ordered_ax, unordered_ax, similarity_ax = axs

# Both probability columns are drawn with a log-scaled count axis.
ordered_ax.hist(df["ordered_prob"], bins=20, log=True)
ordered_ax.set_title("ordered_prob")

unordered_ax.hist(df["unordered_prob"], bins=20, log=True)
unordered_ax.set_title("unordered_prob")

# Similarity keeps the default linear count axis.
similarity_ax.hist(df["similarity"], bins=20)
similarity_ax.set_title("similarity")

In [None]:
# Histogram of total_name_frequency restricted to the range 1..1,000,000,
# with log-scaled counts.
freq_fig, freq_ax = plt.subplots(figsize=(16, 8))
freq_ax.hist(df["total_name_frequency"], bins=100, range=(1, 1_000_000), log=True)
freq_ax.set_title("total_name_frequency")

In [None]:
# Scatter of each pair's similarity (x) against its unordered probability (y).
scatter_fig, scatter_ax = plt.subplots(figsize=(16, 8))
scatter_ax.scatter(x=df["similarity"], y=df["unordered_prob"])
scatter_ax.set_title("similarity vs probability")
scatter_ax.set_xlabel("similarity")
scatter_ax.set_ylabel("unordered_prob")

In [None]:
# Row/column count of the unfiltered table, for comparison with the filtered
# frames printed in the following cells.
print(df.shape)

In [None]:
# remove low similarity and low frequency:
# keep only pairs with similarity above 0.4 and frequency above 1
freq = df[(df["similarity"] > 0.4) & (df["frequency"] > 1)]
print(freq.shape)
# Cap the sample size: DataFrame.sample raises ValueError when n exceeds the
# number of rows (sampling without replacement).
freq.sample(n=min(25, len(freq)))

In [None]:
# consider removing low-similarity, low-frequency, and low-probability pairs
# Pairs below the 0.55 similarity cut-off are candidates for removal ...
is_low_similarity = freq["similarity"] < 0.55
# ... and are treated as noise when they are also infrequent or improbable.
is_rare_or_improbable = (freq["frequency"] < config.min_freq) | (freq["ordered_prob"] < .08)

# The two masks split the low-similarity rows into "drop" and "keep" groups.
low_sim_to_remove_indexes = is_low_similarity & is_rare_or_improbable
low_sim_to_keep_indexes = is_low_similarity & ~is_rare_or_improbable

low_sim_to_remove = freq[low_sim_to_remove_indexes]
low_sim_to_keep = freq[low_sim_to_keep_indexes]

In [None]:
# Size of the low-similarity group slated for removal, plus a random sample for review.
print(low_sim_to_remove.shape)
# Cap the sample size: DataFrame.sample raises ValueError when n exceeds the
# number of rows (sampling without replacement).
low_sim_to_remove.sample(n=min(25, len(low_sim_to_remove)))

In [None]:
# Size of the low-similarity group that is retained, plus a random sample for review.
print(low_sim_to_keep.shape)
# Cap the sample size: DataFrame.sample raises ValueError when n exceeds the
# number of rows (sampling without replacement).
low_sim_to_keep.sample(n=min(50, len(low_sim_to_keep)))

In [None]:
# remove noisy pairs as defined above
sim = freq[~low_sim_to_remove_indexes]
print(sim.shape)
# review suspicious pairs: rows that survived the filter despite similarity below 0.55
suspicious = sim[sim["similarity"] < 0.55]
print(suspicious.shape)
# Cap the sample size: DataFrame.sample raises ValueError when n exceeds the
# number of rows (sampling without replacement).
suspicious.sample(n=min(25, len(suspicious)))

In [None]:
# review dissimilar but frequent pairs:
# similarity at most 0.4 and a combined frequency + reverse_frequency of at least 1000
dis = df[
    (df["similarity"] <= 0.4) & ((df["frequency"] + df["reverse_frequency"]) >= 1000)
]
print(dis.shape)
# Cap the sample size: this filter is strict, and DataFrame.sample raises
# ValueError when n exceeds the number of rows (sampling without replacement).
dis.sample(n=min(25, len(dis)))

In [None]:
# write similar and dissimilar pairs
# Each frame goes to its configured destination without the index column.
for pairs, out_path in (
    (sim, config.similar_out_path),
    (dis, config.dissimilar_out_path),
):
    pairs.to_csv(out_path, index=False)

In [None]:
# Close the wandb run opened near the top of the notebook.
wandb.finish()