In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Generate subword tokenizer

Train a WordPiece subword tokenizer on the tree-hr attachment names, repeating each name in proportion to how often it occurs.


In [17]:
import random

import pandas as pd
from tokenizers import models, Tokenizer, trainers
from transformers import PreTrainedTokenizerFast
from tqdm.auto import tqdm

from nama.data.filesystem import download_file_from_s3, save_file

In [49]:
# Run configuration -- edit these values, then re-run the notebook.

# Name part to train on.
# TODO run both given and surname
given_surname = "given"
# given_surname = "surname"

# Target vocabulary size.
# run with 256, 512, 1024, 1536, 2048
vocab_size = 1536

# Output: where the trained tokenizer JSON is written.
tokenizer_path = f"s3://fs-nama-data/2024/nama-data/data/models/fs-{given_surname}-subword-tokenizer-{vocab_size}.json"

# Input: aggregated tree-hr name frequencies for the selected name part.
frequencies_path = f"s3://fs-nama-data/2024/familysearch-names/interim/tree-hr-{given_surname}-aggr-v2.parquet"

## Load data

In [4]:
# Load the name-pair frequency table, fetching it from S3 first when the
# configured path is remote.
# The downloaded copy gets its own variable so `frequencies_path` keeps
# pointing at the configured source (same pattern as `local_tokenizer_path`
# in the tokenizer load test).
# NOTE(review): assumes download_file_from_s3 returns a local file path -- confirm.
local_frequencies_path = (
    download_file_from_s3(frequencies_path)
    if frequencies_path.startswith("s3://")
    else frequencies_path
)
counts_df = pd.read_parquet(local_frequencies_path)
print(counts_df.shape)
counts_df.head(3)

(25541154, 10)


Unnamed: 0,name,alt_name,frequency,reverse_frequency,sum_name_frequency,total_name_frequency,total_alt_name_frequency,ordered_prob,unordered_prob,similarity
0,a,a,1622927,1622927,2578937,36295683,36295683,0.629301,0.04680698,1.0
1,a,aa,154,139,2578937,36295683,5067,6e-05,8.071524e-06,0.5
2,a,aaa,3,5,2578937,36295683,143,1e-06,2.204111e-07,0.333333


In [5]:
# Keep only the record name (alt_name) and its total frequency, one row per
# distinct (name, frequency) pair.
alt_name_columns = ['alt_name', 'total_alt_name_frequency']
counts_df = counts_df.loc[:, alt_name_columns].drop_duplicates()
print(counts_df.shape)
counts_df.head(3)

(6148634, 2)


Unnamed: 0,alt_name,total_alt_name_frequency
0,a,36295683
1,aa,5067
2,aaa,143


In [6]:
# Sum of the frequency column over all remaining rows.
total_frequency = counts_df['total_alt_name_frequency'].sum()
total_frequency

np.int64(2906726951)

In [7]:
# Expand the frequency table into a flat, shuffled training corpus: every
# record name appears (total_alt_name_frequency // 2) times.
# The shuffle over the full list takes a long time.
SHUFFLE_SEED = 42  # fixed seed so re-running the cell yields the same order

all_names = []
# Iterate the two columns directly instead of materialising one dict per row.
name_counts = zip(counts_df['alt_name'], counts_df['total_alt_name_frequency'])
for name, frequency in tqdm(name_counts, total=len(counts_df)):
    # One list repetition + extend per name replaces one Python-level
    # append() call per copy; the resulting order is unchanged.
    all_names.extend([name] * (int(frequency) // 2))

# A dedicated generator makes the order reproducible without touching the
# global `random` state.
random.Random(SHUFFLE_SEED).shuffle(all_names)
print(len(all_names))

  0%|          | 0/6148634 [00:00<?, ?it/s]

1451132858


## Generate Subwords

In [50]:
# Tokens reserved in the vocabulary ahead of any learned subwords.
special_tokens = [
    "[UNK]",
    "[PAD]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]

In [51]:
# WordPiece model configured with "[UNK]" as its unknown token.
wordpiece_model = models.WordPiece(unk_token="[UNK]")
tokenizer = Tokenizer(wordpiece_model)

# Trainer targeting `vocab_size` entries, with the special tokens included.
trainer = trainers.WordPieceTrainer(
    special_tokens=special_tokens,
    vocab_size=vocab_size,
)

In [52]:
%%time
# train tokenizer from all names
def get_all_names_batch(batch_size=10000):
    """Yield consecutive slices of the global ``all_names`` list.

    Every slice holds ``batch_size`` names, except possibly the last one,
    which holds whatever remains.
    """
    offsets = range(0, len(all_names), batch_size)
    yield from (all_names[lo : lo + batch_size] for lo in offsets)
        
# Feed the corpus to the trainer batch by batch; `length` is the total
# number of names in the corpus.
name_batches = get_all_names_batch()
tokenizer.train_from_iterator(name_batches, trainer=trainer, length=len(all_names))




CPU times: user 1h 2min 42s, sys: 1h 36min 33s, total: 2h 39min 15s
Wall time: 16min 55s


In [53]:
# Display the learned token -> id mapping.
vocab = tokenizer.get_vocab()
vocab

{'##onnie': 1064,
 'cathrine': 1407,
 '##oo': 1418,
 'timothy': 1495,
 '##zia': 1507,
 '##ler': 1122,
 'p': 20,
 'reb': 523,
 '##v': 43,
 'mar': 61,
 'addie': 1040,
 '##ina': 123,
 '##res': 653,
 'joannes': 794,
 '##per': 837,
 'cur': 1005,
 'beatr': 693,
 '##ven': 883,
 '##ier': 835,
 'caro': 349,
 'caroline': 431,
 '##um': 722,
 'marjor': 832,
 'ang': 515,
 'lillie': 659,
 '##art': 889,
 'car': 190,
 '##ona': 656,
 '##dalupe': 1091,
 'naom': 1399,
 'joann': 586,
 'dav': 256,
 'bessie': 526,
 'jona': 928,
 'norma': 1189,
 'fan': 652,
 'ferdin': 1152,
 'die': 1434,
 '##ail': 1215,
 'sophia': 609,
 'wilbur': 996,
 'anne': 354,
 'robert': 154,
 'qu': 1417,
 'ma': 90,
 'burton': 1521,
 'joanna': 1243,
 'mur': 948,
 'winifred': 1202,
 'warren': 801,
 'david': 279,
 'dor': 205,
 'sof': 1259,
 'kathryn': 1046,
 'jan': 189,
 'andre': 957,
 'grace': 399,
 'tim': 1298,
 '##q': 52,
 '##nis': 994,
 'aaron': 1309,
 '##ur': 121,
 '##ima': 1366,
 '##old': 271,
 'irene': 493,
 'urs': 1425,
 'alta': 1

In [54]:
# Number of entries in the trained vocabulary (added tokens included).
tokenizer.get_vocab_size()

1536

## Review sample

In [55]:
# Draw a random set of rows from the frequency table for a manual check.
sample_size = 100
sample_df = counts_df.sample(n=sample_size)

In [56]:
# Print every sampled name next to the subword pieces it is split into.
for sampled_name in sample_df['alt_name']:
    pieces = tokenizer.encode(sampled_name).tokens
    print(sampled_name, pieces)

friedezias ['frie', '##de', '##zia', '##s']
eculaslica ['e', '##c', '##ula', '##s', '##li', '##ca']
winberto ['win', '##bert', '##o']
hylantha ['h', '##yl', '##ant', '##ha']
melkadis ['mel', '##ka', '##dis']
dorcenos ['dor', '##cen', '##os']
stanrod ['stan', '##ro', '##d']
hillyrd ['hil', '##ly', '##r', '##d']
bionel ['b', '##ion', '##el']
romarlo ['ro', '##mar', '##lo']
johniyene ['john', '##i', '##y', '##ene']
howlow ['ho', '##w', '##lo', '##w']
fedillla ['f', '##ed', '##ill', '##la']
amboleene ['amb', '##ol', '##ee', '##ne']
felichian ['felic', '##h', '##ian']
sydalia ['sy', '##dal', '##ia']
grettke ['gre', '##tt', '##ke']
arehy ['ar', '##e', '##h', '##y']
hesckiak ['he', '##s', '##ck', '##ia', '##k']
nielhi ['niel', '##h', '##i']
hamernick ['ham', '##er', '##n', '##ick']
foxcraft ['f', '##o', '##x', '##c', '##ra', '##f', '##t']
vicentede ['vicente', '##de']
skjerhold ['s', '##k', '##je', '##r', '##h', '##old']
maurante ['maur', '##ant', '##e']
buttermieer ['bu', '##tt', '##er', '##

## Save tokenizers

In [57]:
def _save_tokenizer(local_out_path):
    """Serialize the trained tokenizer to the path handed over by save_file."""
    return tokenizer.save(local_out_path)


# save_file receives the destination and a callback that writes the file.
save_file(tokenizer_path, _save_tokenizer)

## Test load tokenizer

In [19]:
# Display the tokenizer location that the load test reads from.
tokenizer_path

's3://fs-nama-data/2024/nama-data/data/models/fs-given-subword-tokenizer-2048.json'

In [20]:
# Resolve the tokenizer file: download it when the path is an S3 URL,
# otherwise use the path as given.
if tokenizer_path.startswith("s3://"):
    local_tokenizer_path = download_file_from_s3(tokenizer_path)
else:
    local_tokenizer_path = tokenizer_path

# Wrap the saved tokenizer file in the transformers fast-tokenizer interface.
loaded_tokenizer = PreTrainedTokenizerFast(tokenizer_file=local_tokenizer_path)

In [21]:
# Encode a couple of names with the reloaded tokenizer and print the tokens.
for probe_name in ['chesworth', 'quass']:
    token_ids = loaded_tokenizer.encode(probe_name)
    print(loaded_tokenizer.convert_ids_to_tokens(token_ids))

['ch', '##es', '##worth']
['qu', '##as', '##s']
