In [1]:
%load_ext autoreload
%autoreload 2

# Aggregate preferred names
Split each preferred name into its individual name pieces, then count how often each piece occurs.

In [2]:
from os.path import join

import pandas as pd
from mpire import WorkerPool

from nama.data.filesystem import glob, download_file_from_s3, save_file

In [3]:
# TODO: process both given names and surnames; each run currently handles only the
# single name type selected below.
# The active value is substituted into both S3 paths; swap the commented line to switch.
given_surname = "given"
# given_surname = "surname"

# True only when the surname variant is selected.
# NOTE(review): no later cell visible here reads this flag - confirm it is still needed.
is_surname = given_surname == "surname"

# in_path: S3 prefix holding the interim part files (read as parquet by the load cell).
# out_path: S3 location of the single gzipped CSV written by the final cell.
in_path = f"s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-{given_surname}/"
out_path = f"s3://fs-nama-data/2024/familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"

In [4]:
# read input files into dataframe array
def read_file(filename):
    """Read one parquet part file into a DataFrame.

    Args:
        filename: Either an ``s3://`` URL or a path that ``pd.read_parquet``
            can open directly.

    Returns:
        The DataFrame produced by ``pd.read_parquet``.
    """
    if filename.startswith("s3://"):
        # Whatever download_file_from_s3 returns is handed to read_parquet
        # (presumably a local copy of the object - confirm against nama.data.filesystem).
        filename = download_file_from_s3(filename)
    return pd.read_parquet(filename)

# load every part file under in_path in parallel; pool.map returns one result per file
with WorkerPool() as pool:
    dfs = pool.map(
        read_file,
        glob(join(in_path, "part-*")),
        progress_bar=True,
    )

# quick sanity check: number of frames loaded, then shape and contents of the first one
print(len(dfs))
print(dfs[0].shape, dfs[0], sep="\n")

Exception occurred, terminating ... :  25%|█████████▉                              | 2995/12000 [04:41<07:43, 19.42it/s]

Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-02940.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-03026.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-02998.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-02929.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-03012.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-02950.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-02915.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-03004.parquet from S3: 





Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-02973.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-03096.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-03040.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-02984.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-03072.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-02904.parquet from S3: 
Error downloading file s3://fs-nama-data/2024/familysearch-names/interim/tree-preferred-given/part-03052.parquet from S3: 


KeyboardInterrupt: 

In [None]:
# process and pre-aggregate each dataframe
def parameterize(dfs):
    """Pair each dataframe with its position so it can be passed to ``pool.map``.

    The index itself is not used downstream; the second tuple element exists
    only to keep mpire from trying to iterate over the dataframe.

    Args:
        dfs: Iterable of dataframes.

    Returns:
        A list of ``(dataframe, index)`` tuples in input order.
    """
    work_items = []
    for position, frame in enumerate(dfs):
        work_items.append((frame, position))
    return work_items


def process(df, _):
    """Count how often each individual name piece occurs in one dataframe.

    Each value in the ``name`` column is split on whitespace, the pieces are
    exploded to one row per piece, and the pieces are counted.

    The input frame is left untouched. Previously the ``name`` column was
    overwritten in place with token lists, so calling this twice on the same
    frame (e.g. re-running the cell) silently produced an empty result.

    Args:
        df: DataFrame with a string ``name`` column.
        _: Unused placeholder; see ``parameterize``.

    Returns:
        DataFrame with columns ``name`` (one piece per row, sorted by the
        groupby) and ``frequency`` (occurrence count within ``df``).
    """
    # split on a copy so the caller's frame keeps its original strings
    pieces = (
        df.assign(name=df["name"].str.split())
        .explode("name", ignore_index=True)
        # drops rows with a null in any column; this also removes null names
        # and names that split into no pieces
        .dropna()["name"]
    )

    # rebuild a minimal single-column frame holding only the pieces
    counts = pd.DataFrame(pieces.tolist(), columns=["name"])

    # one occurrence per row, summed per distinct piece
    counts["frequency"] = 1
    return counts.groupby(["name"]).sum().reset_index()


# pre-aggregate every loaded frame in parallel; dfs is rebound to the per-file counts
with WorkerPool() as pool:
    dfs = pool.map(
        process,
        parameterize(dfs),
        progress_bar=True,
    )

In [None]:
# number of per-file aggregates, then shape and contents of the first one
print(len(dfs))
print(dfs[0].shape, dfs[0], sep="\n")

In [None]:
%%time
# stack the per-file aggregates into one frame
df = pd.concat(dfs)
# the list of partial frames is no longer needed; drop the reference
del dfs
print(df.shape, df, sep="\n")

In [None]:
%%time
# total the per-file frequencies for each distinct name piece
grouped = df.groupby(["name"], as_index=False).sum()
# the concatenated frame is no longer needed; drop the reference
del df
print(grouped.shape)

In [None]:
%%time
# keep non-empty names seen more than once, most frequent first
grouped = (
    grouped
    .loc[lambda frame: frame["name"].ne("") & frame["frequency"].gt(1)]
    .sort_values(by="frequency", ascending=False)
)

In [None]:
# per-column memory footprint (deep=True includes the string contents), then the frame
print(grouped.memory_usage(deep=True), grouped, sep="\n")

In [None]:
# write the aggregated counts as CSV (no index column); save_file supplies the path
# that the writer callback receives
save_file(
    out_path,
    lambda target_path: grouped.to_csv(target_path, index=False),
)