In [None]:
%load_ext autoreload
%autoreload 2

# Augment standard with preferred names

Load the standard (the name buckets) and the preferred tree names, then determine which common preferred names do not yet appear in the standard.

Try to figure out programmatically which bucket they should go into.

In [None]:
from collections import defaultdict, Counter
import json
import math
import os
import re

import numpy as np
import pandas as pd
from py4j.java_gateway import JavaGateway
from sentence_transformers.cross_encoder import CrossEncoder
from sklearn.cluster import AgglomerativeClustering
import torch
from tqdm.auto import tqdm

from src.data.normalize import normalize
from src.data.utils import read_csv
from src.models.biencoder import BiEncoder
from src.models.tokenizer import get_tokenize_function_and_vocab
from src.models.utils import top_similar_names

In [None]:
# configure
# which kind of name this run handles; compared against 'surname' further down
given_surname = "given"

# preferred names: only names with at least this frequency are treated as common
common_name_threshold = 105
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"

# existing standard and training triplets
std_path = f"../references/std_{given_surname}.txt"
triplets_path = f"../data/processed/tree-hr-{given_surname}-triplets-v2-1000.csv.gz"

# sub-word tokenizer and bi-encoder
max_tokens = 10
subwords_path = f"../data/models/fs-{given_surname}-subword-tokenizer-2000f.json"
model_type = 'cecommon+0+aug-0-1'
model_path = f"../data/models/bi_encoder-{given_surname}-{model_type}.pth"

# cross-encoder (tokenizer_max_length is passed to it as max_length)
tokenizer_max_length = 32
cross_encoder_dir = f"../data/models/cross-encoder-{given_surname}-10m-265-same-all"

# output: the standard with the unseen common names added
std_augmented_path = f"../data/processed/std_{given_surname}-augmented.txt"

In [None]:
# Report the GPU state.
# Device properties and memory counters are only queried when CUDA is present,
# so this cell also runs (and just prints False) on a CPU-only machine.
torch.cuda.empty_cache()
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

## Load data

In [None]:
# read triplets
# (the 'anchor', 'positive' and 'positive_score' columns are the ones used later in this notebook)
triplets_df = read_csv(triplets_path)
print(len(triplets_df))
triplets_df.head(3)

In [None]:
# load buckets
# Each line of the standard has the form "<head names>: <tail names>", with names
# separated by spaces. The first head name is used as the bucket's name.
bucket_names = {}                # bucket name -> set of every name in the bucket (heads and tails)
bucket_head_names = {}           # bucket name -> set of the head names only
name_buckets = defaultdict(set)  # name -> set of bucket names that contain it

with open(std_path) as f:
    for line in f:
        line = line.strip()
        if not line:
            # a blank line would otherwise fail on the two-way unpack below
            continue
        # a line that does not contain exactly one ':' is malformed and still raises ValueError
        head_names, tail_names = line.split(':')
        head_list = [name for name in head_names.strip().split(' ') if len(name) > 0]
        tail_list = [name for name in tail_names.strip().split(' ') if len(name) > 0]
        if len(head_list) == 0:
            # no head name means no bucket name; skip instead of filing names under None
            continue
        bucket_name = head_list[0]
        heads = set(head_list)
        names = heads | set(tail_list)
        for name in names:
            name_buckets[name].add(bucket_name)
        bucket_names[bucket_name] = names
        bucket_head_names[bucket_name] = heads
print(len(bucket_names), len(name_buckets))

In [None]:
# load pref names
# (the 'name' and 'frequency' columns are the ones used later in this notebook)
pref_df = read_csv(pref_path)

In [None]:
# get total frequency, including names w frequency=1 that aren't in pref_df
# The second term adds twice the number of rows whose frequency is exactly 2.
# NOTE(review): presumably this estimates the missing frequency-1 names as about
# 2x the number of frequency-2 names, each occurring once — confirm the intent.
total_freq = sum(pref_df['frequency']) + len(pref_df[pref_df['frequency'] == 2]) * 2
total_freq

In [None]:
# calculate the share of total frequency covered by the first 117000 rows of pref_df
# (printed as a fraction between 0 and 1, not as a percentage)
# NOTE(review): "top N names" assumes pref_df is sorted by descending frequency — confirm
freq = sum(pref_df['frequency'][:117000])
print(freq/total_freq)

In [None]:
# common names: preferred names longer than one character, made up only of the
# letters a-z, whose frequency is >= common_name_threshold
is_lowercase_alpha = re.compile(r'[a-z]+').fullmatch
common_names = [
    name
    for name, freq in zip(pref_df['name'], pref_df['frequency'])
    if len(name) > 1 and is_lowercase_alpha(name) and freq >= common_name_threshold
]
len(common_names)

In [None]:
# load tokenize function
# `tokenize` is applied to a name before it is handed to the bi-encoder (see get_embedding);
# the vocabulary is only used here to display its size
tokenize, tokenizer_vocab = get_tokenize_function_and_vocab(
    max_tokens=max_tokens,
    subwords_path=subwords_path,
)
len(tokenizer_vocab)

In [None]:
# load model
# NOTE: torch.load unpickles the file, which can execute arbitrary code — only load trusted model files.
# When no GPU is present, map the checkpoint to CPU so a model saved on a GPU machine still loads;
# when CUDA is available the default placement (map_location=None) is kept unchanged.
model = torch.load(model_path, map_location=None if torch.cuda.is_available() else "cpu")
model.eval()

In [None]:
# load cross encoder model
# (used by get_cross_encoder_score to score a pair of names in both orders)
ce_model = CrossEncoder(cross_encoder_dir, max_length=tokenizer_max_length)

## Which names are not in the standard?

In [None]:
# Collect the common names that, after normalization, are not in the standard.
# Names that normalize to zero or several pieces are skipped.
print_cnt = 10          # how many example names to print per progress block
unseen_names = []
unseen_names_seen = set()   # guards against two common names normalizing to the same form
for ix, name in enumerate(common_names):
    name_pieces = normalize(name, is_surname=given_surname=='surname', dont_return_empty=False)
    if len(name_pieces) != 1:
        continue
    name = name_pieces[0]
    if ix % 1000 == 0 and len(unseen_names) > 0:
        # periodic progress line; also re-arms the example printing
        print(ix, len(unseen_names))
        print_cnt = 10
    if name in name_buckets or name in unseen_names_seen:
        continue
    # keep each unseen name once, so it is bucketed only once later on
    unseen_names_seen.add(name)
    unseen_names.append(name)
    if print_cnt > 0:
        print('   ', ix, name)
        print_cnt -= 1

In [None]:
print(len(unseen_names))
unseen_names[:10]

### get name embeddings

In [None]:
def get_embedding(name):
    """
    Return the bi-encoder embedding of `name`, divided by its norm.

    The name is tokenized with `tokenize` and passed to `model.get_embedding`.
    """
    embedding = model.get_embedding(tokenize(name)) 
    # scale in place to unit norm
    # NOTE(review): assumes the embedding is a 1-D, non-zero float vector — confirm
    embedding /= np.linalg.norm(embedding)
    return embedding

In [None]:
# Embed every name that is currently in the standard.
# Both sequences follow the iteration order of name_buckets, so position i of
# name_embeddings belongs to name_embeddings_names[i].
# Note: name_embeddings starts out as a plain list; it becomes an ndarray the first
# time np.append is applied to it in the augmentation loop further down.
name_embeddings_names = np.array(list(name_buckets.keys()))
name_embeddings = [get_embedding(name) for name in name_buckets.keys()]

## Figure out which bucket to put the names into

In [None]:
def get_nearest_bi_encoder_names(name, threshold=0.5, limit=20):
    """
    Look up the known names closest to `name` in bi-encoder embedding space.

    Embeds `name` and delegates to `top_similar_names` over the current
    `name_embeddings` / `name_embeddings_names`, forwarding `threshold` and `limit`.
    Returns whatever `top_similar_names` returns (unpacked by callers as names, scores).
    """
    return top_similar_names(
        get_embedding(name),
        name_embeddings,
        name_embeddings_names,
        threshold,
        limit,
    )

def get_bi_encoder_bucket_score(name, other_names, other_scores, limit=1):
    """
    Vote for a bucket using the bi-encoder scores of nearby names.

    Walks `other_names` / `other_scores` in order and, for the first `limit` names
    that are in the standard, adds the name's score to one of that name's buckets.
    `name` itself is not used.

    Returns (bucket, summed score) for the highest-scoring bucket, or (None, None)
    when none of the names is in the standard.
    """
    tally = Counter()
    counted = 0
    for candidate, similarity in zip(other_names, other_scores):
        if counted == limit:
            break
        if candidate not in name_buckets:
            continue
        # a name may sit in several buckets; one of them is taken
        tally[next(iter(name_buckets[candidate]))] += similarity
        counted += 1
    if not tally:
        return None, None
    return tally.most_common(1)[0]

In [None]:
def harmonic_mean(x,y):
    """
    Return the harmonic mean of two scores, 2 / (1/x + 1/y).

    If either score is exactly zero the result is 0.0 (the limit of the formula),
    rather than a ZeroDivisionError for plain Python numbers.
    """
    if x == 0 or y == 0:
        return 0.0
    return 2 / (1/x+1/y)

def get_cross_encoder_score(name, other_name):
    """
    Score how well two names match according to the cross-encoder.

    Identical names score 1.0 without consulting the model. Otherwise the pair is
    scored in both orders and the harmonic mean of the two predictions is returned.
    """
    if name != other_name:
        forward, backward = ce_model.predict([[name, other_name], [other_name, name]])
        return harmonic_mean(forward, backward)
    return 1.0
    
def get_cross_encoder_bucket_score(name, other_names, limit=3):
    """
    Vote for a bucket using cross-encoder scores against nearby names.

    Every name in `other_names` that is in the standard is scored against `name`.
    The `limit` best-scoring names each add their score to one of their buckets.

    Returns (bucket, summed score) for the highest-scoring bucket, or (None, None)
    when none of the names is in the standard.
    """
    scored = [
        (candidate, get_cross_encoder_score(name, candidate))
        for candidate in other_names
        if candidate in name_buckets
    ]
    if not scored:
        return None, None
    # best score first
    scored.sort(key=lambda pair: -pair[1])
    tally = Counter()
    for candidate, candidate_score in scored[:max(limit, 0)]:
        # a name may sit in several buckets; one of them is taken
        tally[next(iter(name_buckets[candidate]))] += candidate_score
    return tally.most_common(1)[0]

In [None]:
# match-spark/pipeline
# java -cp target/spark-pipeline.jar org.familysearch.search.spark.py4j.Py4JGateway
# NOTE(review): presumably the command above has to be running (from the match-spark/pipeline
# project) before this cell is executed, so that the gateway has a JVM to talk to — confirm

gateway = JavaGateway()

def get_fs_bucket_score(name):
    """
    Ask the Java gateway which bucket `name` belongs to and score that choice.

    The score is the best cross-encoder score between `name` and any name already
    in the returned bucket.

    Returns (bucket, score), or (None, None) when the gateway's answer is not a
    known, non-empty bucket, so callers can skip the vote instead of hitting a KeyError.
    """
    bucket = gateway.getClusters(name, given_surname == 'surname')
    members = bucket_names.get(bucket)
    if not members:
        return None, None
    score = max(get_cross_encoder_score(name, bucket_name) for bucket_name in members)
    return bucket, score

In [None]:
def get_triplets_bucket_score(name, threshold = 0.4):
    """
    Vote for a bucket using the triplets data.

    Considers triplet rows where `name` is the anchor or the positive (but not both)
    and whose positive_score exceeds `threshold`, best score first. The first row
    whose other name is in the standard decides the vote.

    Returns (one of that name's buckets, the row's positive_score), or (None, None).
    """
    involves_name = (triplets_df['anchor'] == name) | (triplets_df['positive'] == name)
    candidates = triplets_df[involves_name]
    # drop rows that pair the name with itself
    self_pair = (candidates['anchor'] == name) & (candidates['positive'] == name)
    candidates = candidates[~self_pair]
    candidates = candidates[candidates['positive_score'] > threshold]
    ranked = candidates.sort_values(by='positive_score', ascending=False)
    for position in range(len(ranked)):
        row = ranked.iloc[position]
        # the partner is whichever side of the row is not `name`
        partner = row['positive'] if row['positive'] != name else row['anchor']
        if partner not in name_buckets:
            continue
        return next(iter(name_buckets[partner])), row['positive_score']
    return None, None

In [None]:
# test
# Manual spot check for a single name: print its nearest bi-encoder neighbours, then the
# bucket chosen by the cross-encoder vote and by the bi-encoder vote with limit=3 and limit=1.
name = 'ivanovna'
names, scores = get_nearest_bi_encoder_names(name, limit=20)
print(names, scores)
# (disabled) drop the first neighbour before voting
# names = names[1:]
# scores = scores[1:]
ce_bucket, ce_score = get_cross_encoder_bucket_score(name, names)
print('cross-encoder', ce_bucket, ce_score)
be_bucket, be_score = get_bi_encoder_bucket_score(name, names, scores, limit=3)
print('bi-encoder3', be_bucket, be_score)
be_bucket, be_score = get_bi_encoder_bucket_score(name, names, scores, limit=1)
print('bi-encoder1', be_bucket, be_score)

In [None]:
def sample_names(bucket):
    """
    Return up to eight names from `bucket`, joined by spaces, for debug printing.

    A falsy bucket (None or '') yields an empty string.
    """
    if bucket:
        members = list(bucket_names[bucket])
        return ' '.join(members[:8])
    return ''

In [None]:
# number of buckets and number of names at this point in the notebook (before the augmentation loop)
print(len(bucket_names), len(name_buckets))

In [None]:
# where did these numbers come from?
# they came by looking at individual name scores and fine-tuning by hand
# no machine-learning was harmed in the preparation of these numbers
# only humans were harmed :-)

# For every unseen name, three sources vote for a bucket: the FS gateway, the cross-encoder
# over the nearest bi-encoder neighbours, and the triplets. The name joins the winning bucket
# if the winning score exceeds score_threshold; otherwise it starts a bucket of its own.
# Names bucketed earlier in the loop are visible to later names via the appended embeddings.

testing = False  # True: only print the votes for the first 1000 names, change nothing

fs_weight = 1.65
fs_boost = 0.28
ce_weight = 1.0
triplets_weight = 1.29
unseen_boost = 2.0  # NOTE(review): not referenced below; the unseen-winner boost reuses fs_weight/fs_boost
score_threshold = 1.01

unseen_names_set = set(unseen_names)

for name in unseen_names[:1000] if testing else tqdm(unseen_names):
    # a name that already has a bucket (duplicate entry, or this cell being re-run)
    # must not be bucketed again or have its embedding appended a second time
    if not testing and name in name_buckets:
        continue

    # gather votes
    votes = Counter()
    # reset per name so the testing printout never shows a previous name's values
    ce_bucket, ce_score = None, None

    # get fs vote
    fs_bucket, fs_score = get_fs_bucket_score(name)
    if fs_bucket is not None:
        votes[fs_bucket] += fs_score * fs_weight + fs_boost
    # get nearby names
    names, scores = get_nearest_bi_encoder_names(name)
    if len(names) > 0:
        # get cross-encoder vote
        ce_bucket, ce_score = get_cross_encoder_bucket_score(name, names)
        # don't double-count cross-encoder if it already voted up fs
        if ce_bucket is not None and ce_bucket != fs_bucket:
            votes[ce_bucket] += ce_score * ce_weight
        # (the bi-encoder vote via get_bi_encoder_bucket_score is not used)
    # get triplet vote
    tri_bucket, tri_score = get_triplets_bucket_score(name)
    if tri_bucket is not None:
        votes[tri_bucket] += tri_score * triplets_weight

    # get winner; with no votes at all there is no winner and the name gets its own bucket
    if votes:
        winner, score = votes.most_common(1)[0]
    else:
        winner, score = None, 0.0

    # print stuff if testing
    if testing:
        print()
        print(name)
        print('   fs', fs_bucket, fs_score, sample_names(fs_bucket))
        print('   ce', ce_bucket, ce_score, sample_names(ce_bucket))
        print('  tri', tri_bucket, tri_score, sample_names(tri_bucket))
        print(votes)
        if score > score_threshold:
            print('WINNER', winner, score)
        continue

    # if winning bucket is unseen, then increase score, similar to as if FS had found it
    if winner in unseen_names_set:
        score = score * fs_weight + fs_boost

    # add name to existing bucket, or create a new bucket
    if score > score_threshold:
        name_buckets[name] = {winner}
        bucket_names[winner].add(name)
    else:
        name_buckets[name] = {name}
        bucket_names[name] = {name}
        bucket_head_names[name] = {name}

    # add embedding so that later names can find this one as a neighbour
    # (np.append copies the whole array on every iteration)
    name_embeddings_names = np.append(name_embeddings_names, [name], axis=0)
    name_embeddings = np.append(name_embeddings, [get_embedding(name)], axis=0)


In [None]:
# number of buckets and number of names after the augmentation loop
print(len(bucket_names), len(name_buckets))

## Review names to see if they should be moved to other buckets

**Actually, after looking through a few of these, they are generally better where they are**

### calculate bucket centroids

### for each name, is it closer to another bucket's centroid than its own?

## Save augmented buckets

In [None]:
# Write one line per bucket, in sorted bucket order: "<heads>: <tails>".
# The bucket's own name comes first among the heads; the remaining heads and the tails are sorted.
with open(std_augmented_path, 'wt') as f:
    for bucket in sorted(bucket_names):
        heads = bucket_head_names[bucket]
        other_heads = sorted(head for head in heads if head != bucket)
        tails = sorted(name for name in bucket_names[bucket] if name not in heads)
        head_names = ' '.join([bucket] + other_heads).strip()
        tail_names = ' '.join(tails).strip()
        line = f"{head_names}: {tail_names}".strip()
        f.write(f"{line}\n")

In [None]:
# display the path the augmented standard was written to
std_augmented_path