In [None]:
%load_ext autoreload
%autoreload 2

# Aggregate pairs and compute probabilities and similarities

In [None]:
from os.path import join

from mpire import WorkerPool
import pandas as pd
from pandarallel import pandarallel

from src.data.filesystem import glob
from src.data.match import levenshtein_similarity

In [None]:
# input location: the loading cell below reads every object matching "part-*" under this prefix
in_path = "s3://familysearch-names/interim/tree-hr-given-pairs/"
# output location: the aggregated dataframe is written here as parquet at the end of the notebook
out_path = "s3://familysearch-names/interim/tree-hr-given-aggr.parquet"

In [None]:
# load every "part-*" file under in_path in parallel worker processes;
# dfs collects one result of pd.read_parquet per file
part_paths = glob(join(in_path, "part-*"))
with WorkerPool() as pool:
    dfs = pool.map(pd.read_parquet, part_paths, progress_bar=True)
# number of part files that were read
print(len(dfs))

In [None]:
# stack the per-file frames into a single dataframe; the row labels of the individual
# part files carry no meaning, so build a fresh 0..n-1 index rather than keeping
# labels that repeat from one file to the next
df = pd.concat(dfs, ignore_index=True)
# drop the reference to the per-file frames so their memory can be reclaimed
del dfs
print(df.shape)
# preview a few rows only (rich display) instead of printing the whole frame
df.head()

In [None]:
# group by name and alt name and calculate frequency
# frequency = number of rows in which the (name, alt_name) pair occurs.
# size() counts the rows of each group directly, so there is no need to add a helper
# column of ones to the (large) input frame, and no other column can be summed by accident.
grouped = df.groupby(["name", "alt_name"]).size().reset_index(name="frequency")
# the raw pairs are no longer needed once they are aggregated
del df
print(grouped.shape)

## Calculate ordered and unordered probabilities

In [None]:
# sum frequency by name
# Aggregate only the "frequency" column. A bare groupby("name").sum() is version dependent:
# since pandas 2.0 it no longer skips non-numeric columns, so the alt_name strings would be
# concatenated into an extra "alt_name" column that then collides in the merges further down.
# Result columns: name, sum_frequency.
sum_name_freq = (
    grouped.groupby("name")["frequency"]
    .sum()
    .reset_index()
    .rename(columns={"frequency": "sum_frequency"})
)

In [None]:
# sum frequency by alt_name
# Aggregate only the "frequency" column. A bare groupby("alt_name").sum() is version dependent:
# since pandas 2.0 it no longer skips non-numeric columns, so the name strings would be
# concatenated into a "name" column, and the rename below would then yield two columns
# called "name".
# The key is renamed to "name" so this table has the same columns as the per-name sums
# (name, sum_frequency) and the two can be stacked.
sum_alt_name_freq = (
    grouped.groupby("alt_name")["frequency"]
    .sum()
    .reset_index()
    .rename(columns={"alt_name": "name", "frequency": "sum_frequency"})
)

In [None]:
# stack the per-name and per-alt_name sums and add them up per name:
# the result counts how often a name occurs on either side of a pair
total_name_freq = (
    pd.concat([sum_name_freq, sum_alt_name_freq])
    .groupby("name", as_index=False)
    .sum()
)

In [None]:
# look every pair up in the same table with the two key columns swapped, so that each
# (name, alt_name) row also carries the frequency of the mirrored pair (alt_name, name)
grouped = grouped.merge(
    grouped,
    how="left",
    left_on=["name", "alt_name"],
    right_on=["alt_name", "name"],
    suffixes=("", "_ignore"),
)
# from the right-hand side keep only the frequency (as reverse_frequency);
# its copies of the key columns are redundant
grouped = grouped.rename(columns={"frequency_ignore": "reverse_frequency"}).drop(
    columns=["name_ignore", "alt_name_ignore"]
)
# pairs without a mirrored counterpart come out of the left join as NaN; store 0 instead
grouped = grouped.reset_index(drop=True).fillna(0)

In [None]:
# attach to every pair the summed frequency of its name (column sum_name_frequency),
# which the ordered probability below divides by
grouped = grouped.merge(
    sum_name_freq.rename(columns={"sum_frequency": "sum_name_frequency"}),
    how="inner",
    on="name",
).reset_index(drop=True)

In [None]:
# attach to every pair the total frequency of its name, counted over both the
# name and the alt_name side (column total_name_frequency)
grouped = grouped.merge(
    total_name_freq.rename(columns={"sum_frequency": "total_name_frequency"}),
    how="inner",
    on="name",
).reset_index(drop=True)

In [None]:
# attach to every pair the total frequency of its alt_name: the same per-name totals,
# re-keyed on alt_name and stored as total_alt_name_frequency
alt_name_totals = total_name_freq.rename(
    columns={"name": "alt_name", "sum_frequency": "total_alt_name_frequency"}
)
grouped = grouped.merge(alt_name_totals, how="inner", on="alt_name").reset_index(
    drop=True
)
del alt_name_totals

In [None]:
# ordered probability: the pair's frequency as a share of the summed frequency of its name
grouped["ordered_prob"] = grouped["frequency"].div(grouped["sum_name_frequency"])

In [None]:
# unordered probability = (frequency + reverse frequency) / (number of times name or alt_name appear anywhere)
#   the denominator starts from total name frequency + total alt name frequency; that sum
#   counts the times the two appear together twice, so (frequency + reverse frequency)
#   is subtracted once
together_freq = grouped["frequency"] + grouped["reverse_frequency"]
either_freq = (
    grouped["total_name_frequency"] + grouped["total_alt_name_frequency"] - together_freq
)
grouped["unordered_prob"] = together_freq / either_freq
# the helper series are only needed for this calculation
del together_freq, either_freq

In [None]:
# score every pair with the levenshtein similarity of its two names
def _pair_similarity(row):
    """Return levenshtein_similarity for the name and alt_name of one row."""
    return levenshtein_similarity(row["name"], row["alt_name"])


# row-wise apply through pandarallel, with a progress bar
pandarallel.initialize(progress_bar=True)
grouped["similarity"] = grouped.parallel_apply(_pair_similarity, axis=1)

In [None]:
# store counts as 32-bit integers and probabilities/similarity as 32-bit floats to save space
count_columns = [
    "frequency",
    "reverse_frequency",
    "sum_name_frequency",
    "total_name_frequency",
    "total_alt_name_frequency",
]
ratio_columns = ["ordered_prob", "unordered_prob", "similarity"]
grouped = grouped.astype(
    {
        **dict.fromkeys(count_columns, "int32"),
        **dict.fromkeys(ratio_columns, "float32"),
    }
)

In [None]:
# write the aggregated pairs to out_path as parquet; index=False leaves the dataframe index out of the file
grouped.to_parquet(out_path, index=False)

In [None]:
# final (rows, columns) of the aggregated table
print(grouped.shape)

In [None]:
# per-column memory footprint in bytes; deep=True also measures the contents of object (string) columns
grouped.memory_usage(deep=True)