In [None]:
%load_ext autoreload
%autoreload 2

# Generate (tree anchor name, positive record name, negative record name) triplets

In [None]:
from collections import Counter
from itertools import combinations

import pandas as pd
from tqdm.auto import tqdm

from src.data.utils import load_dataset

In [None]:
# Config

# Name part being processed; interpolated into both S3 file names below.
given_surname = "given"

# Thresholds and the per-anchor cap used by the triplet-generation cell.
tree_name_min_freq = 1_000
record_name_min_freq = 200
pos_threshold = 0.5
max_triplets_per_tree_name = 2_000

# Input (training pairs) and output (generated triplets) locations.
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train.csv.gz"
triplets_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-triplets-{tree_name_min_freq}.csv.gz"

## Load data

In [None]:
# Read the training split; load_dataset returns three values that are unpacked here.
(
    input_names_train,
    weighted_actual_names_train,
    candidate_names_train,
) = load_dataset(train_path)

In [None]:
# Summary counts for the loaded training data.
print("input_names_train", len(input_names_train))

n_weighted_actual_names = sum(map(len, weighted_actual_names_train))
print("weighted_actual_names_train", n_weighted_actual_names)

# Third element of each weighted-actual-name entry is summed as the pair frequency.
n_total_pairs = sum(freq for wans in weighted_actual_names_train for _, _, freq in wans)
print("total pairs", n_total_pairs)

print("candidate_names_train", len(candidate_names_train))

distinct_names = set(input_names_train) | set(candidate_names_train)
print("total names", len(distinct_names))

## Generate triplets

In [None]:
# Total frequency of every tree (input) name and every record name across all pairs.
total_tree_occurs = Counter()
total_record_occurs = Counter()
# zip() has no length, so pass the total explicitly; without it tqdm can only show
# a running count rather than a progress bar with an ETA.
for input_name, wans in tqdm(zip(input_names_train, weighted_actual_names_train),
                             total=len(input_names_train)):
    for wan in wans:
        record_name, freq = wan[0], wan[2]
        # include co-occurrences even if a name goes to itself,
        # because if a name usually goes to itself, we want its vector
        # to not be that close to another vector
        total_tree_occurs[input_name] += freq
        total_record_occurs[record_name] += freq

In [None]:
# Map each tree name to its position in input_names_train (a repeated name keeps
# its last position), so its weighted-actual-names row can be found by name.
input_names_train_ixs = {
    tree_name: position for position, tree_name in enumerate(input_names_train)
}

In [None]:
def score(tree_name, record_name):
    """Score how strongly a record name is associated with a tree name.

    The co-occurrence frequency of the pair is looked up in the tree name's
    row of ``weighted_actual_names_train`` (0 when the record name is not
    listed there). It is then expressed as a fraction of the tree name's
    total occurrences and as a fraction of the record name's total
    occurrences; the larger of the two fractions is the score.

    Args:
        tree_name: Tree name; must be a key of ``input_names_train_ixs``.
        record_name: Record name to score against ``tree_name``.

    Returns:
        float: The larger of the two co-occurrence ratios. A ratio whose
        total is zero (e.g. a record name that was never counted) is taken
        to be 0.0 instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: If ``tree_name`` is not in ``input_names_train_ixs``.
    """
    tree_ix = input_names_train_ixs[tree_name]
    # Frequency stored on the first row for this record name, or 0 if there is none.
    co_occur = next(
        (row[2] for row in weighted_actual_names_train[tree_ix] if row[0] == record_name),
        0,
    )

    def _ratio(total):
        # A name with no recorded occurrences cannot co-occur with anything.
        return co_occur / total if total else 0.0

    return max(
        _ratio(total_tree_occurs[tree_name]),
        _ratio(total_record_occurs[record_name]),
    )

In [None]:
# Build (anchor, positive, negative) triplets for every frequent tree name.
total_record_candidates = 0
total_tree_names = 0
triplets = []
# zip() has no length, so pass the total explicitly to get a real progress bar.
for input_name, wans in tqdm(zip(input_names_train, weighted_actual_names_train),
                             total=len(input_names_train)):
    # Only sufficiently frequent tree names are used as anchors.
    if total_tree_occurs[input_name] < tree_name_min_freq:
        continue
    # Keep the record names that are frequent enough overall.
    record_candidates = [wan for wan in wans
                         if total_record_occurs[wan[0]] >= record_name_min_freq]
    # Score each candidate against the anchor exactly once; the score does not
    # depend on which other candidate it is paired with. The anchor itself is
    # never used as a positive or negative, and dict.fromkeys drops repeated
    # names while keeping their order.
    scored_candidates = [
        (record_name, score(input_name, record_name))
        for record_name in dict.fromkeys(wan[0] for wan in record_candidates)
        if record_name != input_name
    ]
    anchor_triplet_count = 0
    # combinations() yields each unordered pair of distinct candidates once, so
    # no "already seen" set is needed and names containing commas cannot collide.
    for (name_a, score_a), (name_b, score_b) in combinations(scored_candidates, 2):
        # At least one of the two names must score at or above the threshold.
        if max(score_a, score_b) < pos_threshold:
            continue
        # The higher-scoring name becomes the positive. The ordered pair is bound
        # to fresh names so the choice made for one pair never leaks into the next.
        if score_a < score_b:
            pos_name, pos_score, neg_name, neg_score = name_b, score_b, name_a, score_a
        else:
            pos_name, pos_score, neg_name, neg_score = name_a, score_a, name_b, score_b
        triplets.append({
            'anchor': input_name,
            'positive': pos_name,
            'positive_score': pos_score,
            'negative': neg_name,
            'negative_score': neg_score
        })
        anchor_triplet_count += 1
        # Cap the number of triplets contributed by a single anchor.
        if anchor_triplet_count >= max_triplets_per_tree_name:
            break
    total_record_candidates += len(record_candidates)
    total_tree_names += 1
print('tree names', total_tree_names)
print('total record candidates for all tree names', total_record_candidates)
# Guard the average so the summary still prints when no tree name qualified.
print('avg record candidates per tree name',
      total_record_candidates / total_tree_names if total_tree_names else 0.0)
print('total triplets', len(triplets))

In [None]:
# Eyeball every 10,000th triplet as a quick sanity check of the generated list.
triplets[::10000]

## Save triplets

In [None]:
# Name the columns explicitly so the frame (and the CSV header written from it)
# has the expected columns even when no triplets were generated.
df = pd.DataFrame(
    triplets,
    columns=['anchor', 'positive', 'positive_score', 'negative', 'negative_score'],
)

In [None]:
# Write the triplets to triplets_path, leaving out the DataFrame index column.
df.to_csv(triplets_path, index=False)

In [None]:
# Distinct (anchor, positive) pairs, and how many of them there are.
anchor_pos_df = df.loc[:, ['anchor', 'positive']].drop_duplicates()
anchor_pos_df.shape[0]

## Review anchor-positive pairs

In [None]:
# Let pandas render up to 200 rows when displaying a frame.
pd.options.display.max_rows = 200

In [None]:
from phonemizer.separator import Separator
from phonemizer.backend import EspeakBackend

# Phonemizer backend constructed with 'en-us', reused by the review cells below.
espeak = EspeakBackend('en-us')
# Output formatting: phones separated by a space, words by '|', no syllable separator.
separator = Separator(phone=' ', syllable=None, word='|')

In [None]:
# Smoke test: phonemize a single known name and print its phone string.
john_phonemes = espeak.phonemize(['john'], separator=separator, strip=True)
print(john_phonemes[0])


In [None]:
# Print a random sample of anchor/positive pairs with their phonemizations for
# manual review.
review_columns = df[['anchor', 'positive', 'positive_score']]
# DataFrame.sample raises ValueError when asked for more rows than exist, so cap
# the sample size at the number of available rows.
review_sample = review_columns.sample(min(100, len(review_columns)))
for anchor, positive, positive_score in review_sample.values.tolist():
    print(anchor,
          positive,
          positive_score,
          espeak.phonemize([anchor], separator=separator, strip=True)[0],
          espeak.phonemize([positive], separator=separator, strip=True)[0],
         )

In [None]:
import Levenshtein

In [None]:
# Two synthetic names used to exercise the edit-operation experiments below.
name1, name2 = '<abcdefxyij', '<abcfxyghik'

In [None]:
# Edit operations turning name1 into name2; the next cell unpacks each entry as
# (opcode, src_start, src_end, tar_start, tar_end).
opcodes = Levenshtein.opcodes(name1, name2)
opcodes

In [None]:
# Split the two names into pieces along the edit-operation boundaries:
# source-side text for equal/delete/replace, target-side text for insert/replace.
word_pieces = []
for tag, src_lo, src_hi, tgt_lo, tgt_hi in opcodes:
    if tag not in ('equal', 'delete', 'insert', 'replace'):
        print('Unexpected opcode', tag)
        continue
    if tag != 'insert':
        # equal, delete and replace all contribute the source span
        word_pieces.append(name1[src_lo:src_hi])
    if tag in ('insert', 'replace'):
        # insert and replace contribute the target span (after any source span)
        word_pieces.append(name2[tgt_lo:tgt_hi])
word_pieces

In [None]:
# Per-operation costs used by find_minimum_edit_distance
INS_COST = 1
DEL_COST = 1
SUB_COST = 2


def find_minimum_edit_distance(source_string, target_string):
    """Compute a weighted edit distance and one edit script achieving it.

    Uses the module-level INS_COST, DEL_COST and SUB_COST. Matching
    characters cost nothing.

    Args:
        source_string: String to transform.
        target_string: String to transform ``source_string`` into.

    Returns:
        list: ``[distance, operations]`` where ``operations`` is a list of
        tuples in source-to-target order: ``('INSERT', char)``,
        ``('DELETE', char)`` or ``('SUBSTITUTE', source_char, target_char)``.
    """
    n_rows = len(target_string) + 1
    n_cols = len(source_string) + 1

    # dp[r][c] is the cost of turning source_string[:c] into target_string[:r]
    # (rows follow the target, columns follow the source).
    dp = [[0] * n_cols for _ in range(n_rows)]
    for r in range(1, n_rows):
        dp[r][0] = dp[r - 1][0] + INS_COST
    for c in range(1, n_cols):
        dp[0][c] = dp[0][c - 1] + DEL_COST

    # Fill the table row by row.
    for r in range(1, n_rows):
        tgt_char = target_string[r - 1]
        for c in range(1, n_cols):
            if source_string[c - 1] == tgt_char:
                # Matching characters carry the diagonal cost over unchanged.
                dp[r][c] = dp[r - 1][c - 1]
                continue
            dp[r][c] = min(
                dp[r - 1][c] + INS_COST,
                dp[r - 1][c - 1] + SUB_COST,
                dp[r][c - 1] + DEL_COST,
            )

    # Walk back from the bottom-right corner, collecting operations in reverse.
    # Ties are resolved in the order: substitute, then insert, then delete.
    edits = []
    r, c = n_rows - 1, n_cols - 1
    while r > 0 and c > 0:
        src_char = source_string[c - 1]
        tgt_char = target_string[r - 1]
        if src_char == tgt_char:
            r -= 1
            c -= 1
        elif dp[r][c] == dp[r - 1][c - 1] + SUB_COST:
            edits.append(('SUBSTITUTE', src_char, tgt_char))
            r -= 1
            c -= 1
        elif dp[r][c] == dp[r - 1][c] + INS_COST:
            edits.append(('INSERT', tgt_char))
            r -= 1
        else:
            edits.append(('DELETE', src_char))
            c -= 1

    # Top row reached: whatever is left of the source must be deleted.
    edits.extend(('DELETE', source_string[k]) for k in range(c - 1, -1, -1))
    # Left column reached: whatever is left of the target must be inserted.
    edits.extend(('INSERT', target_string[k]) for k in range(r - 1, -1, -1))

    # Operations were gathered back to front.
    edits.reverse()
    return [dp[-1][-1], edits]


In [None]:
# Show the [distance, operations] result for the two example names.
find_minimum_edit_distance(name1, name2)