In [None]:
# autoreload mode 2: reload imported modules before each cell runs, so edits to
# project code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Aggregate preferred names

In [None]:
from os.path import join

import pandas as pd
from mpire import WorkerPool

from src.data.filesystem import glob

In [None]:
# S3 prefix holding the input parquet files (the read cell matches "part-*" under it)
in_path = "s3://familysearch-names/interim/tree-preferred-surname/"
# destination of the aggregated name/frequency table written by the last cell
out_path = "s3://familysearch-names/interim/tree-preferred-surname-aggr.csv.gz"
# NOTE(review): not referenced by any cell visible here — confirm whether it is still needed
is_surname = True

In [None]:
# read input files into dataframe array
# Resolve the part files up front so an empty match fails with a clear message
# instead of an IndexError on dfs[0] further down.
input_files = list(glob(join(in_path, "part-*")))
if not input_files:
    raise FileNotFoundError(f"no part-* files found under {in_path}")

# one pd.read_parquet call per file, spread across the worker pool
with WorkerPool() as pool:
    dfs = pool.map(pd.read_parquet, input_files, progress_bar=True)

# sanity check: number of frames, then shape and contents of the first one
print(len(dfs))
print(dfs[0].shape)
print(dfs[0])

In [None]:
# process and pre-aggregate each dataframe
def parameterize(dfs):
    """Wrap every dataframe in a two-element ``(df, position)`` task tuple.

    The position is padding only: a second element in the tuple is needed to
    keep mpire from trying to also iterate over the df.
    """
    tasks = []
    for position, frame in enumerate(dfs):
        tasks.append((frame, position))
    return tasks


def process(df, _):
    """Count how often each whitespace-separated name piece occurs in one dataframe.

    Args:
        df: dataframe with a "name" column holding strings. It is left
            unmodified, so the function can safely be called again on the
            same frame.
        _: unused; present only because each task tuple carries a second element.

    Returns:
        A new dataframe with the columns "name" and "frequency": one row per
        distinct name piece, sorted by "name".
    """
    # Split on a derived frame rather than assigning back into `df`: overwriting
    # the caller's "name" column with lists made a second call on the same frame
    # return an empty result.
    pieces = (
        df.assign(name=df["name"].str.split())
        # one row per name piece; a missing or empty name becomes a NaN row
        .explode("name", ignore_index=True)
        # NOTE(review): dropna() looks at every column, so a row with a missing
        # value in any other column is discarded as well — confirm this is intended
        .dropna()["name"]
    )

    # rebuild a frame that holds only the name pieces, dropping any other columns
    counts = pd.DataFrame({"name": pieces.tolist()})

    # group
    counts["frequency"] = 1
    return counts.groupby(["name"]).sum().reset_index()


# Run process over every (df, ix) task in the worker pool.
# NOTE(review): dfs is rebound to the aggregated frames, so running this cell a
# second time would feed the aggregates back through process, which restarts
# every count at 1 — re-run the read cell first.
with WorkerPool() as pool:
    dfs = pool.map(process, parameterize(dfs), progress_bar=True)

In [None]:
# inspect the per-file counts: number of frames, then shape and contents of the first one
print(len(dfs))
print(dfs[0].shape)
print(dfs[0])

In [None]:
%%time
# combine all dataframes into a single dataframe
# (row labels are not reset here, so index values repeat across the source frames)
df = pd.concat(dfs)
# remove the notebook's reference to the list of per-file frames
del dfs
print(df.shape)
print(df)

In [None]:
%%time
# collapse names repeated across files: one row per name with its summed frequency
grouped = df.groupby("name", as_index=False).sum()
# the concatenated frame is not needed once the totals exist
del df
print(grouped.shape)

In [None]:
%%time
# keep only non-empty names that occur more than once, ordered from most to least frequent
grouped = grouped.loc[
    grouped["name"].ne("") & grouped["frequency"].gt(1)
].sort_values(by="frequency", ascending=False)

In [None]:
# per-column memory footprint in bytes; deep=True also measures the contents of object columns
print(grouped.memory_usage(deep=True))
print(grouped)

In [None]:
# write to csv
# index=False leaves the row labels out of the file.
# NOTE(review): pandas' default compression="infer" should gzip the output
# because out_path ends in ".gz" — confirm
grouped.to_csv(out_path, index=False)