In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to project code (e.g. src.data.normalize)
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Sample Query Given Names and Surnames

In [None]:
import os
import random
from urllib.parse import unquote

import pandas as pd

from src.data.normalize import normalize

In [None]:
# Local directory that is scanned for *.csv files containing logged queries.
input_dir = "../data/queries"

freq_threshold = 20  # names must be associated with other names at least this many times
sample_size = 5000  # number of query names sampled for each name type (given / surname)

# Inputs: training pairs with tree_name, record_name and frequency columns.
given_train_path = "s3://familysearch-names/processed/tree-hr-given-train-v2.csv.gz"
surname_train_path = "s3://familysearch-names/processed/tree-hr-surname-train-v2.csv.gz"

# Outputs: the sampled query names are written to these locations.
given_query_path = "s3://familysearch-names/processed/query-names-given-v2.csv.gz"
surname_query_path = "s3://familysearch-names/processed/query-names-surname-v2.csv.gz"

## Read training data

In [None]:
# Load both training sets with the same options. keep_default_na=False stops
# pandas from turning strings such as "NA" or "null" into NaN
# (presumably because these can be legitimate names -- confirm).
given_train_df, surname_train_df = (
    pd.read_csv(train_path, keep_default_na=False)
    for train_path in (given_train_path, surname_train_path)
)

In [None]:
# Drop rows in which a tree name is paired with itself, so that only
# associations with *other* record names contribute to the frequency totals.
given_train_df, surname_train_df = (
    pairs.loc[pairs["tree_name"].ne(pairs["record_name"])]
    for pairs in (given_train_df, surname_train_df)
)

In [None]:
# Keep only the tree names whose total frequency of association with other
# record names is at least `freq_threshold`, so that precision and recall
# computed for them are reliable.
frequent_given_df = (
    given_train_df
    .groupby("tree_name")["frequency"]
    .sum()
    .reset_index()
    .loc[lambda totals: totals["frequency"] >= freq_threshold]
)
print(frequent_given_df.shape)
# set for fast membership tests when filtering query names
frequent_givens = set(frequent_given_df["tree_name"])
frequent_given_df.head(3)

In [None]:
# Keep only the tree surnames whose total frequency of association with other
# record names is at least `freq_threshold`.
frequent_surname_df = (
    surname_train_df
    .groupby("tree_name")["frequency"]
    .sum()
    .reset_index()
    .loc[lambda totals: totals["frequency"] >= freq_threshold]
)
print(frequent_surname_df.shape)
# set for fast membership tests when filtering query names
frequent_surnames = set(frequent_surname_df["tree_name"])
frequent_surname_df.head(3)

## Read queries

In [None]:
def _iter_query_terms(line):
    """Yield ``(key, value)`` pairs parsed from one line of a query file.

    Surrounding whitespace (including the trailing newline) and double quotes
    are removed, the line is percent-decoded, and the result is split on ``&``.
    Terms without an ``=`` (e.g. from a blank line) are skipped, and only the
    first ``=`` separates the key from the value, so malformed terms no longer
    raise ``ValueError``.
    """
    # Strip the newline first; otherwise the closing quote is not the last
    # character and is left attached to the final value.
    decoded = unquote(line.strip().strip('"'))
    # NOTE(review): decoding happens before splitting, as before; if a value can
    # contain an encoded '&' (%26) it will be split here -- confirm the log format.
    for term in decoded.split('&'):
        key, sep, value = term.partition('=')
        if sep:
            yield key, value


# Sorted because os.listdir returns entries in arbitrary order and the order of
# the collected names should not vary between runs.
all_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.csv'))

given_queries = []
surname_queries = []

for file in all_files:
    file_path = os.path.join(input_dir, file)
    with open(file_path, "r") as f:
        next(f, None)  # the first line of every file is skipped (presumably a header row)
        for line in f:
            for q, value in _iter_query_terms(line):
                if q != "q.givenName" and q != "q.surname":
                    continue
                is_surname = (q == "q.surname")
                frequent_names = frequent_surnames if is_surname else frequent_givens
                target_queries = surname_queries if is_surname else given_queries
                for name in normalize(value,
                                      is_surname=is_surname,
                                      handle_patronymics=True,
                                      dont_return_empty=False):
                    # Names shorter than 3 characters are dropped, as are names
                    # that are not frequent enough in the training data.
                    if len(name) >= 3 and name in frequent_names:
                        target_queries.append(name)

print("given names", len(given_queries))
print("surnames", len(surname_queries))

In [None]:
# Draw up to `sample_size` given-name queries without replacement. A dedicated
# seeded generator makes the sample repeatable for the same collected list, and
# clamping k avoids the ValueError random.sample raises when fewer than
# `sample_size` names were collected.
given_queries = random.Random(42).sample(given_queries, k=min(sample_size, len(given_queries)))
given_queries[:20]

In [None]:
# Draw up to `sample_size` surname queries without replacement. A dedicated
# seeded generator makes the sample repeatable for the same collected list, and
# clamping k avoids the ValueError random.sample raises when fewer than
# `sample_size` names were collected.
surname_queries = random.Random(42).sample(surname_queries, k=min(sample_size, len(surname_queries)))
surname_queries[:20]

## Write sample query names

In [None]:
# One-column frame holding the sampled given names.
given_df = pd.DataFrame({"name": given_queries})
print(given_df.shape)
given_df.head(3)

In [None]:
# One-column frame holding the sampled surnames.
surname_df = pd.DataFrame({"name": surname_queries})
print(surname_df.shape)
surname_df.head(3)

In [None]:
# Write the sampled given names without the index column; pandas infers gzip
# compression from the ".gz" suffix of the target path.
given_df.to_csv(path_or_buf=given_query_path, index=False)

In [None]:
# Write the sampled surnames without the index column; pandas infers gzip
# compression from the ".gz" suffix of the target path.
surname_df.to_csv(path_or_buf=surname_query_path, index=False)