In [None]:
# Reload imported modules automatically before each cell runs (IPython autoreload, mode 2),
# so edits to the local src package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Generate tree anchor name, pos record name, neg record name triplets

Using the training data generated in notebook 100, generate triplets of the form (anchor, pos, pos_score, neg, neg_score), where the anchor is a tree name and pos/neg are record names.

We decided to use `tree_name_min_freq=1000` going forward.

In [None]:
from collections import Counter, defaultdict
import random

import pandas as pd
from tqdm.auto import tqdm

from src.data.utils import load_dataset_v2

In [None]:
# Config

# Name part to process; interpolated into the file names below
# (presumably "given" or "surname" -- confirm against the available datasets)
given_surname = "given"

# Tree names with fewer total occurrences than this are not used as anchors
tree_name_min_freq = 1000
# Record names with fewer total occurrences than this are not used as candidates
record_name_min_freq = 200
# A candidate pair is kept only if at least one of its two scores reaches this value
pos_threshold = 0.5
# Upper bound on the number of triplets generated for a single tree name
max_triplets_per_tree_name = 2000

# Input: training data
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"

# Output: generated triplets (the file name records tree_name_min_freq)
triplets_path = f"../data/processed/tree-hr-{given_surname}-triplets-v2-{tree_name_min_freq}.csv.gz"

## Load data

In [None]:
# Load the training data. tree_names_train and attached_names_train are iterated in
# parallel below; each attached_names_train entry is a list of (record name, frequency).
(
    tree_names_train,
    attached_names_train,
    record_names_train,
) = load_dataset_v2(train_path)

In [None]:
# Summary statistics for the loaded training data
print("tree_names_train", len(tree_names_train))
# number of distinct (tree name, record name) attachment entries
print("attached_names_train", sum(map(len, attached_names_train)))
# sum of the frequencies over all attachment entries
print("total pairs", sum(freq for attachments in attached_names_train for _, freq in attachments))
print("record_names_train", len(record_names_train))
# distinct names appearing as a tree name, a record name, or both
print("total names", len(set(tree_names_train) | set(record_names_train)))

## Generate triplets

In [None]:
# Total attachment frequency per tree name and per record name.
total_tree_occurs = Counter()
total_record_occurs = Counter()
# zip() has no len(), so pass total= explicitly; otherwise tqdm cannot show
# a percentage or an ETA.
for tree_name, attachments in tqdm(zip(tree_names_train, attached_names_train),
                                   total=len(tree_names_train)):
    # attachments is a list of (record name, frequency)
    for attachment in attachments:
        # include frequency even if a name goes to itself,
        # because if a name usually goes to itself, we want its vector
        # to not be that close to another vector
        total_tree_occurs[tree_name] += attachment[1]
        total_record_occurs[attachment[0]] += attachment[1]

In [None]:
# Map each tree name to its position in tree_names_train, which is also its position
# in attached_names_train. If a name appears more than once, the last position wins.
tree_names_train_ixs = {
    name: position for position, name in enumerate(tree_names_train)
}

In [None]:
def _score(tree_name, record_name):
    """Return the raw (unsmoothed) co-occurrence statistics for a name pair.

    Returns a tuple (freq, tree_co_occur_ratio, record_co_occur_ratio):
      * freq: frequency with which record_name is attached to tree_name
        (0 if it is not among the tree name's attachments)
      * tree_co_occur_ratio: freq divided by the tree name's total occurrences
      * record_co_occur_ratio: freq divided by the record name's total occurrences

    Raises KeyError if tree_name is not in tree_names_train_ixs and
    ZeroDivisionError if either total is 0.
    """
    attached = attached_names_train[tree_names_train_ixs[tree_name]]
    # frequency of the first attachment entry matching record_name, else 0
    freq = next((entry[1] for entry in attached if entry[0] == record_name), 0)
    tree_co_occur_ratio = freq / total_tree_occurs[tree_name]
    record_co_occur_ratio = freq / total_record_occurs[record_name]
    return freq, tree_co_occur_ratio, record_co_occur_ratio

def sample_scores(tree_name, pos, pos_score, neg, neg_score):
    """Occasionally record score details for the positive and negative name of a triplet.

    Each name is kept with probability 0.001, in the bucket int(score * 10) of the
    global score_buckets, as long as that bucket holds fewer than 40 samples.
    The stored record includes the raw statistics returned by _score.
    """
    for record_name, record_score in ((pos, pos_score), (neg, neg_score)):
        bucket = int(record_score * 10)
        # The random draw comes first and the bucket size is only checked when the draw
        # succeeds, so exactly one random number is consumed per name.
        if not (random.random() < 0.001 and len(score_buckets[bucket]) < 40):
            continue
        freq, tree_ratio, record_ratio = _score(tree_name, record_name)
        sample = {
            'tree_name': tree_name,
            'record_name': record_name,
            'score': record_score,
            'tree_co_occur_ratio': tree_ratio,
            'record_co_occur_ratio': record_ratio,
            'freq': freq,
        }
        score_buckets[bucket].append(sample)

In [None]:
def score(tree_name, record_name, smoothing=20, multiplier=0.38):
    """Return the smoothed similarity score of record_name with respect to tree_name.

    The attachment frequency and both occurrence totals are increased by `smoothing`
    to smooth rare names. The larger of the two resulting co-occurrence ratios is then
    moved towards 1.0 by `multiplier`, which raises all scores: we want nearly every
    attachment to score at least 0.4.

    Raises KeyError if tree_name is not in tree_names_train_ixs.
    """
    attached = attached_names_train[tree_names_train_ixs[tree_name]]
    # frequency of the first attachment entry matching record_name, else 0
    freq = next((entry[1] for entry in attached if entry[0] == record_name), 0)
    smoothed_freq = freq + smoothing
    tree_co_occur_ratio = smoothed_freq / (total_tree_occurs[tree_name] + smoothing)
    record_co_occur_ratio = smoothed_freq / (total_record_occurs[record_name] + smoothing)
    max_score = max(tree_co_occur_ratio, record_co_occur_ratio)
    return max_score + multiplier * (1.0 - max_score)

In [None]:
# Sampled score details keyed by int(score * 10); filled in by sample_scores
score_buckets = defaultdict(list)

total_record_candidates = 0
total_tree_names = 0
triplets = []
# zip() has no len(), so pass total= explicitly to get a real progress bar
for tree_name, attachments in tqdm(zip(tree_names_train, attached_names_train),
                                   total=len(tree_names_train)):
    # skip anchors that occur too rarely
    if total_tree_occurs[tree_name] < tree_name_min_freq:
        continue
    # keep only attached record names that are frequent enough overall
    record_candidates = [name_freq for name_freq in attachments
                         if total_record_occurs[name_freq[0]] >= record_name_min_freq]
    # the anchor itself is never used as a positive or a negative
    candidate_names = [name_freq[0] for name_freq in record_candidates
                       if name_freq[0] != tree_name]
    # unordered name pairs already emitted for this anchor; frozensets (rather than
    # comma-joined strings) cannot collide when a name contains a comma
    pairs = set()
    for first_name in candidate_names:
        # this score does not depend on the inner loop, so compute it once
        first_score = score(tree_name, first_name)
        for second_name in candidate_names:
            if first_name == second_name:
                continue
            pair = frozenset((first_name, second_name))
            if pair in pairs:
                continue
            second_score = score(tree_name, second_name)
            # at least one of the two names must score as a positive
            if max(first_score, second_score) < pos_threshold:
                continue
            # Order the pair into separate variables. Reassigning the outer loop's
            # name here would make the remaining inner iterations pair against the
            # wrong candidate.
            if first_score < second_score:
                pos_name, pos_score = second_name, second_score
                neg_name, neg_score = first_name, first_score
            else:
                pos_name, pos_score = first_name, first_score
                neg_name, neg_score = second_name, second_score
            pairs.add(pair)
            triplets.append({
                'anchor': tree_name,
                'positive': pos_name,
                'positive_score': pos_score,
                'negative': neg_name,
                'negative_score': neg_score
            })
            sample_scores(tree_name, pos_name, pos_score, neg_name, neg_score)
            if len(pairs) == max_triplets_per_tree_name:
                break
        if len(pairs) == max_triplets_per_tree_name:
            break
    total_record_candidates += len(record_candidates)
    total_tree_names += 1
print('tree names', total_tree_names)
print('total record candidates for all tree names', total_record_candidates)
# avoid ZeroDivisionError when no tree name reaches tree_name_min_freq
print('avg record candidates per tree name',
      total_record_candidates / total_tree_names if total_tree_names else 0.0)
print('total triplets', len(triplets))

In [None]:
# Distribution of the positive scores over all generated triplets
df = pd.DataFrame(triplets)
df['positive_score'].hist(bins=20)

In [None]:
# Distribution of the negative scores over all generated triplets
df['negative_score'].hist(bins=20)

In [None]:
# Eyeball every 10,000th triplet
triplets[::10000]

In [None]:
# Show the sampled pairs whose score falls in [bucket / 10, (bucket + 1) / 10)
bucket = 4
pd.DataFrame(score_buckets[bucket])

## Save triplets

In [None]:
# Shuffle in place with a dedicated, seeded generator so that the order of the saved
# triplets is reproducible on a fresh Restart & Run All and does not depend on how many
# random draws earlier cells consumed from the global generator.
random.Random(42).shuffle(triplets)

In [None]:
# Rebuild the frame from the shuffled triplets (replaces the unshuffled df built earlier)
df = pd.DataFrame(triplets)

In [None]:
# Write the triplets without the index column. triplets_path ends in .csv.gz, so pandas'
# default compression inference should gzip the file -- verify if the pandas default changes.
df.to_csv(triplets_path, index=False)

In [None]:
# Number of distinct (anchor, positive) pairs among the triplets
anchor_pos_df = df[['anchor', 'positive']].drop_duplicates()
len(anchor_pos_df)

## Review anchor-positive pairs

In [None]:
# Let pandas display up to 200 rows when reviewing frames below
pd.set_option('display.max_rows', 200)

In [None]:
from phonemizer.separator import Separator
from phonemizer.backend import EspeakBackend

# espeak phonemizer backend configured with the 'en-us' language code
espeak = EspeakBackend('en-us')
# Output format: phones separated by a space, words by '|', no syllable separator
separator = Separator(phone=' ', syllable=None, word='|')

In [None]:
# Sanity-check the phonemizer on a single name
print(espeak.phonemize(
    ['john'], 
    separator=separator,
    strip=True
)[0])


In [None]:
# Print a random sample of (anchor, positive, score) rows together with the phonemes of
# both names, for manual review.
# DataFrame.sample raises ValueError when asked for more rows than exist (without
# replacement), so cap the sample size at the number of available rows.
review_sample = df[['anchor', 'positive', 'positive_score']].sample(min(100, len(df)))
for anchor, positive, positive_score in review_sample.values.tolist():
    print(anchor,
          positive,
          positive_score,
          espeak.phonemize([anchor], separator=separator, strip=True)[0],
          espeak.phonemize([positive], separator=separator, strip=True)[0],
         )

In [None]:
import Levenshtein

In [None]:
# Two example strings used by the edit-operation experiments below
name1 = '<abcdefxyij'
name2 = '<abcfxyghik'

In [None]:
# Edit operations from Levenshtein.opcodes for name1 -> name2; the loop below unpacks each
# entry as (opcode, src_start, src_end, tar_start, tar_end)
opcodes = Levenshtein.opcodes(name1, name2)
opcodes

In [None]:
# Split the two names into pieces along the edit operations between them:
# shared and deleted spans come from name1, inserted spans from name2,
# and a replaced span contributes both its name1 and its name2 text.
word_pieces = []
for tag, src_lo, src_hi, tar_lo, tar_hi in opcodes:
    src_piece = name1[src_lo:src_hi]
    tar_piece = name2[tar_lo:tar_hi]
    if tag in ('equal', 'delete'):
        word_pieces.append(src_piece)
    elif tag == 'insert':
        word_pieces.append(tar_piece)
    elif tag == 'replace':
        word_pieces.extend((src_piece, tar_piece))
    else:
        print('Unexpected opcode', tag)
word_pieces

In [None]:
# Per-operation costs used by find_minimum_edit_distance
INS_COST = 1
DEL_COST = 1
SUB_COST = 2


def find_minimum_edit_distance(source_string, target_string):
    """Compute the minimum edit cost from source_string to target_string.

    Returns a two-element list [cost, operations]. `operations` lists the edits in
    left-to-right order as tuples: ('INSERT', target_char), ('DELETE', source_char)
    or ('SUBSTITUTE', source_char, target_char). Matching characters cost nothing
    and are not recorded.
    """
    n_src = len(source_string)
    n_tgt = len(target_string)

    # table[t][s] is the cheapest cost of turning source_string[:s] into target_string[:t];
    # it has (len(target) + 1) rows and (len(source) + 1) columns.
    table = [[0] * (n_src + 1) for _ in range(n_tgt + 1)]
    for t in range(1, n_tgt + 1):
        # empty source prefix: insert every target character
        table[t][0] = table[t - 1][0] + INS_COST
    for s in range(1, n_src + 1):
        # empty target prefix: delete every source character
        table[0][s] = table[0][s - 1] + DEL_COST

    # Fill the table row by row
    for t, tgt_char in enumerate(target_string, start=1):
        for s, src_char in enumerate(source_string, start=1):
            if src_char == tgt_char:
                table[t][s] = table[t - 1][s - 1]
                continue
            table[t][s] = min(
                table[t - 1][s] + INS_COST,
                table[t - 1][s - 1] + SUB_COST,
                table[t][s - 1] + DEL_COST,
            )

    # Walk back from the bottom-right corner, collecting operations in reverse order.
    # Ties are resolved as substitute, then insert, then delete.
    ops = []
    t, s = n_tgt, n_src
    while t > 0 and s > 0:
        tgt_char = target_string[t - 1]
        src_char = source_string[s - 1]
        if tgt_char == src_char:
            # characters match: no operation
            t -= 1
            s -= 1
        elif table[t][s] == table[t - 1][s - 1] + SUB_COST:
            ops.append(('SUBSTITUTE', src_char, tgt_char))
            t -= 1
            s -= 1
        elif table[t][s] == table[t - 1][s] + INS_COST:
            ops.append(('INSERT', tgt_char))
            t -= 1
        else:
            ops.append(('DELETE', src_char))
            s -= 1

    # Top row reached: whatever remains of the source prefix is deleted
    ops.extend(('DELETE', ch) for ch in reversed(source_string[:s]))
    # Left column reached: whatever remains of the target prefix is inserted
    ops.extend(('INSERT', ch) for ch in reversed(target_string[:t]))

    # Operations were collected back to front
    ops.reverse()
    return [table[n_tgt][n_src], ops]


In [None]:
# Edit cost and operation list for the two example names
find_minimum_edit_distance(name1, name2)