In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local source files are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Analyze nama clusters and nicknames
This notebook is just a scratchpad to review nama clusters and nicknames, and to compare the nama nicknames to the existing nicknames.

In [None]:
from collections import defaultdict
import gzip
import json

import boto3
import pandas as pd

In [None]:
# Name type to analyze; it is only used to pick the cluster file below.
# NOTE(review): presumably "given" or "surname" -- confirm the accepted values.
given_surname = "given"

# CSV with `name` and `cluster` columns, read with pandas in the next section.
cluster_path = f"s3://nama-data/data/models/fs-{given_surname}-cluster-names.csv"
# S3 bucket and key of the nickname CSV (first field of each line is the head name).
nickname_bucket = "familysearch-names"
nickname_path = "processed/givenname_nicknames.csv"
# Local text file holding the existing standards.
# NOTE(review): absolute path into one developer's home directory -- anyone else
# running this notebook has to point it at their own checkout.
std_given_path = "/home/dallan/fhd/searchng-standards-wrapper/src/main/resources/std_given.txt"

## Read Clusters

In [None]:
# Load the name/cluster table. na_filter=False turns off missing-value
# detection, so NA-looking strings are kept as text rather than becoming NaN.
df = pd.read_csv(cluster_path, na_filter=False)
# Row count first, then a three-row preview as the cell's displayed value.
print(df.shape[0])
df.head(3)

In [None]:
# Index the table in both directions with a single pass over its rows:
#   name2clusters: name    -> set of clusters the name appears in
#   cluster2names: cluster -> set of names assigned to the cluster
name2clusters, cluster2names = defaultdict(set), defaultdict(set)

for member, label in zip(df['name'], df['cluster']):
    cluster2names[label].add(member)
    name2clusters[member].add(label)

In [None]:
# Spot-check the cluster index with one common name and one misspelling.
for probe in ("john", "johhnn"):
    # .get() instead of [...] so an unknown probe is not inserted into the
    # defaultdict as a side effect of merely looking it up.
    clusters = name2clusters.get(probe, set())
    print(clusters)
    # Guard every probe: next(iter(...)) raises StopIteration on an empty set.
    if clusters:
        # Show the members of one (arbitrary) cluster the probe belongs to.
        print(cluster2names[next(iter(clusters))])

## Read nicknames

In [None]:
# nickname -> set of head names it is listed under in the nickname CSV.
nicknames = defaultdict(set)
if nickname_path:
    # The S3 resource was never created in the original notebook, so this cell
    # raised NameError on a fresh kernel.
    s3 = boto3.resource("s3")
    obj = s3.Object(nickname_bucket, nickname_path)
    contents = obj.get()['Body'].read().decode('utf-8')
    for line in contents.splitlines():
        # First field is the head name; the remaining fields are its nicknames.
        names = [field.strip() for field in line.split(',')]
        headname = names[0]
        if not headname:
            # Blank line (e.g. the trailing newline) or a line with no head name.
            continue
        for name in names[1:]:
            # Skip empty fields (trailing commas) and self-references.
            if name and name != headname:
                nicknames[name].add(headname)

In [None]:
# Spot-check the head names recorded for a few sample names.
for sample in ('ron', 'abigail', 'abby'):
    print(nicknames[sample])

## Look up standards for a name + nicknames

In [None]:
def get_standards(name, nickname_map=None, cluster_map=None):
    """Return the clusters of `name` plus those of every head name it is a nickname of.

    Args:
        name: the name to look up.
        nickname_map: optional mapping of nickname -> collection of head names.
            Defaults to the notebook-level `nicknames`.
        cluster_map: optional mapping of name -> collection of clusters.
            Defaults to the notebook-level `name2clusters`.

    Returns:
        A set holding the union of the clusters found for `name` itself and for
        each of its head names; empty when none of them is in `cluster_map`.
    """
    if nickname_map is None:
        nickname_map = nicknames
    if cluster_map is None:
        cluster_map = name2clusters

    # {name}, not set(name): calling set() on a string splits it into single
    # characters, so the name itself was never looked up.
    lookups = {name}
    # .get() keeps a defaultdict from growing on lookups of unknown names.
    lookups.update(nickname_map.get(name, ()))

    standards = set()
    for lookup in lookups:
        standards.update(cluster_map.get(lookup, ()))
    return standards


In [None]:
# Spot-check the combined name + nickname lookup on a few sample names.
for sample in ('ron', 'abigail', 'abby'):
    print(get_standards(sample))

In [None]:
# Show the clusters containing 'ronald', then the names in cluster 'donald'.
print(name2clusters['ronald'], cluster2names['donald'], sep='\n')

## Read existing standards

In [None]:
# name -> the other names listed with it in the same head section of the file.
std_nicknames = defaultdict(set)
# One set per line whose head section (the text before ':') lists several names.
std_nickname_list = []
if std_given_path:
    with open(std_given_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line with readlines().
        for line in f:
            # Only the part before the first ':' is used. split() with no
            # argument drops the empty tokens that split(' ') produced for
            # doubled or trailing spaces (e.g. "john :..." gave {'john', ''}).
            headnames = set(line.split(':')[0].split())
            if len(headnames) > 1:
                std_nickname_list.append(headnames)
                for headname in headnames:
                    std_nicknames[headname].update(headnames - {headname})


In [None]:
# Sizes of the two structures, then the recorded alternates for two samples.
print(len(std_nickname_list), len(std_nicknames), sep='\n')
for sample in ('alf', 'ron'):
    print(std_nicknames[sample])

## Compare existing nicknames to new nicknames

In [None]:
# `nickname_list` was never defined in this notebook (NameError on a fresh
# kernel). Rebuild it from `nicknames`: one set per head name, holding the head
# name and every nickname recorded for it.
# NOTE(review): the indexes printed below refer to this rebuilt list, not to
# line numbers of the nickname CSV -- confirm this grouping is what was intended.
nickname_groups = defaultdict(set)
for nick, heads in nicknames.items():
    for head in heads:
        nickname_groups[head].update((head, nick))
nickname_list = list(nickname_groups.values())

# Number of existing-standard groups not fully contained in any new group.
total = 0
for std_ix, std_nicks in enumerate(std_nickname_list):
    # Inclusive match: some new group contains every name of the existing group.
    # any() stops at the first hit instead of scanning the whole list.
    if any(bool(std_nicks & nicks) and std_nicks <= nicks for nicks in nickname_list):
        continue
    # Partial matches: new groups sharing at least one name with the existing group.
    found = False
    for ix, nicks in enumerate(nickname_list):
        if std_nicks & nicks:
            found = True
            # Report the names the existing group has that the new group lacks.
            missing = std_nicks - nicks
            if missing:
                print(ix, "nicks=", nicks, " std", std_ix, "=", std_nicks, " new ", missing)
    total += 1
    if not found:
        # No new group shares any name with this existing group.
        print(std_ix, "std_nicks", std_nicks)
print(total)