In [None]:
%load_ext autoreload
%autoreload 2

# Generate subwords from edits (opcodes) between anchor and positive names

In [None]:
from collections import Counter
import json
import random
from typing import List

import boto3
import Levenshtein
import pandas as pd
from tokenizers import models, Tokenizer, trainers, NormalizedString, PreTokenizedString
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import PreTokenizer, Whitespace
from transformers import PreTrainedTokenizerFast
from tqdm.auto import tqdm

from src.data.filesystem import fopen

In [None]:
# --- Run configuration -------------------------------------------------------
# Which name type to process; it is interpolated into every path below.
given_surname = 'given'

# Target WordPiece vocabulary size.
# run with 500, 1000, 1500, 2000
vocab_size = 1500

# Bucket that receives the trained tokenizers.
nama_bucket = 'nama-data'

# --- Inputs ------------------------------------------------------------------
triplets_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-triplets.csv.gz"
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"

# --- Outputs -----------------------------------------------------------------
# Tokenizer files: used both as a path relative to the repo root and as the S3 key.
tokenizer_path = f"data/models/fs-{given_surname}-subword-tokenizer-{vocab_size}.json"
edit_tokenizer_path = f"data/models/fs-{given_surname}-edit-subword-tokenizer-{vocab_size}.json"

# Subword + bigram vocabularies, written straight to S3.
tokenizer_bigrams_vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-tokenizer_vocab_bigrams-{vocab_size}.json"
edit_tokenizer_bigrams_vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-edit_tokenizer_vocab_bigrams-{vocab_size}.json"

## Load data

In [None]:
# Load the triplets file; the `anchor` and `positive` columns are used below.
# NOTE(review): unlike the train-file load, default NA parsing stays enabled
# here — confirm no name in this file can be read as NaN.
triplets_df = pd.read_csv(triplets_path)
print(triplets_df.shape[0])
triplets_df.head(n=3)

In [None]:
# Load the training pairs. keep_default_na=False keeps every cell as the
# literal string from the file instead of turning NA-like text into NaN.
all_names_df = pd.read_csv(train_path, keep_default_na=False)
n_rows, n_cols = all_names_df.shape
print((n_rows, n_cols))
all_names_df.head(n=3)

In [None]:
# Every distinct name that appears on either side of a training pair.
all_names = set(all_names_df['tree_name']).union(all_names_df['record_name'])
print(len(all_names))
# Peek at one arbitrary member to see what a name looks like.
next(iter(all_names))

## Calculate edit pieces based on anchor-positive pairs

In [None]:
def generate_edit_pieces(src, tar):
    """Cut `src` and `tar` into pieces along their Levenshtein edit operations.

    Each opcode from `Levenshtein.opcodes(src, tar)` contributes the slice it
    covers: 'equal' and 'replace' add a piece to both sides, 'delete' only to
    the source side and 'insert' only to the target side. Any other opcode is
    reported with a print and contributes nothing.

    Returns:
        A `(src_pieces, tar_pieces)` tuple of strings, each the comma-joined
        pieces of the corresponding input.
    """
    src_pieces, tar_pieces = [], []
    for tag, s_lo, s_hi, t_lo, t_hi in Levenshtein.opcodes(src, tar):
        if tag not in ('equal', 'replace', 'delete', 'insert'):
            print('Unexpected opcode', tag)
            continue
        # Everything except a pure insertion consumes characters of the source.
        if tag != 'insert':
            src_pieces.append(src[s_lo:s_hi])
        # Everything except a pure deletion produces characters of the target.
        if tag != 'delete':
            tar_pieces.append(tar[t_lo:t_hi])
    return ','.join(src_pieces), ','.join(tar_pieces)

In [None]:
# Keep each (anchor, positive) combination once; the other triplet columns
# are not needed for edit pieces.
pair_columns = ['anchor', 'positive']
anchor_pos_df = triplets_df.loc[:, pair_columns].drop_duplicates()
len(anchor_pos_df)

In [None]:
# Collect the distinct comma-joined edit-piece strings for every
# anchor/positive pair; these are the training corpus for the edit tokenizer.
edit_names = set()
anchor_positive_pairs = zip(anchor_pos_df['anchor'], anchor_pos_df['positive'])
# zip() has no length, so tell tqdm the total to get a real progress bar.
for src, tar in tqdm(anchor_positive_pairs, total=len(anchor_pos_df)):
    # Drop the first and last character of each name before diffing.
    # NOTE(review): presumably these are begin/end marker characters — confirm.
    src_pieces, tar_pieces = generate_edit_pieces(src[1:-1], tar[1:-1])
    edit_names.update((src_pieces, tar_pieces))
len(edit_names)

## Pre-tokenize by splitting on edit pieces

In [None]:
class EditPiecePreTokenizer:
    """Custom pre-tokenizer that breaks a string at every comma.

    The training strings are comma-joined edit pieces, so each piece is
    handed on as its own unit.
    """

    def split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        """Return one new NormalizedString per comma-separated piece.

        `i` is accepted but not used.
        """
        # str.split needs a plain str, hence the str() conversion first.
        # NOTE(review): the pieces are rebuilt from plain text rather than
        # sliced out of `normalized_string`, so they presumably do not carry
        # the original offsets — confirm that is fine for training only.
        pieces = str(normalized_string).split(',')
        return list(map(NormalizedString, pieces))

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Apply `split` to `pretok` and return whatever `pretok.split` returns."""
        return pretok.split(self.split)

## Generate Subwords

In [None]:
# Special tokens handed to both WordPiece trainers; "[UNK]" is also the
# unknown token of both models.
special_tokens = [
    "[UNK]",
    "[PAD]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]

### from edit pieces

In [None]:
# WordPiece tokenizer whose training input is pre-split on commas by the
# custom EditPiecePreTokenizer.
edit_model = models.WordPiece(unk_token="[UNK]")
edit_tokenizer = Tokenizer(edit_model)
edit_tokenizer.pre_tokenizer = PreTokenizer.custom(EditPiecePreTokenizer())

# Trainer targeting the configured vocabulary size.
trainer = trainers.WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)

In [None]:
# Train the edit tokenizer on the comma-joined edit-piece strings.
def get_edit_names():
    """Yield every string collected in `edit_names`."""
    yield from edit_names

edit_tokenizer.train_from_iterator(get_edit_names(), trainer=trainer)

In [None]:
# Show the vocabulary learned from the edit pieces (the full mapping is displayed).
edit_tokenizer.get_vocab()

In [None]:
# Training is done, so the comma-splitting pre-tokenizer is no longer needed;
# replace it with the stock Whitespace pre-tokenizer for encoding real names.
# NOTE(review): presumably this must also happen before `save()` further down,
# since a Python-defined pre-tokenizer likely cannot be serialized — confirm.
edit_tokenizer.pre_tokenizer = Whitespace()

### from all names

In [None]:
# Baseline WordPiece tokenizer; no pre-tokenizer is set, so it is trained on
# the names exactly as they are passed in.
all_names_model = models.WordPiece(unk_token="[UNK]")
tokenizer = Tokenizer(all_names_model)

# A fresh trainer with the same settings as the edit tokenizer's trainer
# (rebinds the `trainer` name).
trainer = trainers.WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)

In [None]:
# Train the baseline tokenizer on every distinct name (not on edit pieces).
def get_all_names():
    """Yield every name in `all_names`."""
    yield from all_names

tokenizer.train_from_iterator(get_all_names(), trainer=trainer)

In [None]:
# Show the vocabulary learned from all names (the full mapping is displayed).
tokenizer.get_vocab()

In [None]:
# Number of entries actually learned; compare with the requested `vocab_size`.
len(tokenizer.get_vocab())

## Review sample

In [None]:
# Draw a small sample of triplets for eyeballing both tokenizers.
sample_size = 100
# Fixed seed so the reviewed sample is the same after Restart & Run All.
sample_seed = 42
# Cap at the frame size: sampling without replacement raises when asked for
# more rows than the frame has.
sample_df = triplets_df.sample(
    n=min(sample_size, len(triplets_df)),
    random_state=sample_seed,
)

In [None]:
# For each sampled pair print the two names, then how each tokenizer splits them.
for raw_anchor, raw_positive in zip(sample_df['anchor'], sample_df['positive']):
    # Drop the first and last character of each name before encoding.
    pair = (raw_anchor[1:-1], raw_positive[1:-1])
    print(*pair)
    print('edit', *(edit_tokenizer.encode(name).tokens for name in pair))
    print(' all', *(tokenizer.encode(name).tokens for name in pair))

## Calculate subwords, subword-bigrams, and lengths

In [None]:
def _count_subword_stats(tok, names):
    """Count subwords, subword bigrams and sequence lengths over `names`.

    Args:
        tok: object whose `encode(name).tokens` gives the list of subwords.
        names: iterable of strings to encode.

    Returns:
        A `(subword_counts, bigram_counts, length_counts)` tuple of Counters.
        Bigram keys are "previous,current" strings, with 'START' before the
        first subword and 'END' after the last one. Lengths are the number of
        subwords plus one for 'END', which equals the number of bigrams.
    """
    subword_counts = Counter()
    bigram_counts = Counter()
    length_counts = Counter()
    for name in names:
        subwords = tok.encode(name).tokens
        subword_counts.update(subwords)
        # Pad with sentinels so the first and last subword also form a bigram.
        padded = ['START', *subwords, 'END']
        bigram_counts.update(
            f"{previous},{current}" for previous, current in zip(padded, padded[1:])
        )
        # Length includes the trailing 'END' sentinel.
        length_counts[len(subwords) + 1] += 1
    return subword_counts, bigram_counts, length_counts


# Same statistics for the baseline tokenizer and for the edit tokenizer.
subword_counter, subword_bigrams_counter, subword_lengths = _count_subword_stats(
    tokenizer, all_names
)
edit_subword_counter, edit_subword_bigrams_counter, edit_subword_lengths = _count_subword_stats(
    edit_tokenizer, all_names
)
    

In [None]:
# All baseline-tokenizer subwords, most frequent first.
subword_counter.most_common()

In [None]:
# All edit-tokenizer subwords, most frequent first.
edit_subword_counter.most_common()

In [None]:
# Number of distinct baseline-tokenizer bigrams, then the `vocab_size` most
# frequent ones (the same cut that is saved in the bigrams vocabulary below).
print(len(subword_bigrams_counter))
subword_bigrams_counter.most_common(vocab_size)

In [None]:
# Number of distinct edit-tokenizer bigrams, then the `vocab_size` most
# frequent ones (the same cut that is saved in the bigrams vocabulary below).
print(len(edit_subword_bigrams_counter))
edit_subword_bigrams_counter.most_common(vocab_size)

In [None]:
# Distribution of sequence lengths under the edit tokenizer
# (each length was measured after 'END' was appended).
edit_subword_lengths

In [None]:
# Distribution of sequence lengths under the baseline tokenizer
# (each length was measured after 'END' was appended).
subword_lengths

## Save subword tokenizers and vocabularies

In [None]:
s3 = boto3.client('s3')

# Write each tokenizer one directory above the notebook, then upload that
# file to the bucket under the same relative path.
for trained_tokenizer, relative_path in (
    (tokenizer, tokenizer_path),
    (edit_tokenizer, edit_tokenizer_path),
):
    local_path = f"../{relative_path}"
    trained_tokenizer.save(local_path)
    with open(local_path, "rb") as f:
        s3.upload_fileobj(f, nama_bucket, relative_path)

In [None]:
# save tokenizer bigrams vocabulary: subwords first, then the most frequent bigrams
# Number the subwords in the order of the ids the tokenizer gave them, so the
# subword ids do not depend on the iteration order of the get_vocab() dict.
subwords_by_id = sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])
tokenizer_bigrams_vocab = {
    subword: ix for ix, (subword, _) in enumerate(subwords_by_id)
}
# Append the `vocab_size` most frequent bigrams. setdefault never overwrites an
# existing entry, and using len() as the next id keeps the ids free of gaps.
for bigram, _ in subword_bigrams_counter.most_common(vocab_size):
    tokenizer_bigrams_vocab.setdefault(bigram, len(tokenizer_bigrams_vocab))

print(len(tokenizer_bigrams_vocab))

with fopen(tokenizer_bigrams_vocab_path, 'w') as f:
    json.dump(tokenizer_bigrams_vocab, f)


In [None]:
# save edit tokenizer bigrams vocabulary: subwords first, then the most frequent bigrams
# Number the subwords in the order of the ids the tokenizer gave them, so the
# subword ids do not depend on the iteration order of the get_vocab() dict.
edit_subwords_by_id = sorted(edit_tokenizer.get_vocab().items(), key=lambda item: item[1])
edit_tokenizer_bigrams_vocab = {
    subword: ix for ix, (subword, _) in enumerate(edit_subwords_by_id)
}
# Append the `vocab_size` most frequent bigrams. setdefault never overwrites an
# existing entry, and using len() as the next id keeps the ids free of gaps.
for bigram, _ in edit_subword_bigrams_counter.most_common(vocab_size):
    edit_tokenizer_bigrams_vocab.setdefault(bigram, len(edit_tokenizer_bigrams_vocab))

print(len(edit_tokenizer_bigrams_vocab))

with fopen(edit_tokenizer_bigrams_vocab_path, 'w') as f:
    json.dump(edit_tokenizer_bigrams_vocab, f)


### Test loading tokenizer

In [None]:
# Download the uploaded tokenizer over the local copy and load it through the
# transformers wrapper. Uses the `s3` client created in the save cell.
local_tokenizer_path = f"../{tokenizer_path}"
with open(local_tokenizer_path, 'wb') as f:
    s3.download_fileobj(nama_bucket, tokenizer_path, f)
loaded_tokenizer = PreTrainedTokenizerFast(tokenizer_file=local_tokenizer_path)

In [None]:
# Encode a sample name with the reloaded tokenizer and map the ids back to
# tokens to check that the file loads and tokenizes.
loaded_tokenizer.convert_ids_to_tokens(loaded_tokenizer.encode('zacharias'))