## Import libraries

In [1]:
from hugtokencraft import editor
from transformers import BertTokenizer
import os

## Load a PreTrainedTokenizer
First, let us load a pretrained BERT tokenizer and examine its vocabulary size and maximum token length.

In [2]:
# Load the stock BERT tokenizer that the rest of the notebook modifies.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Remember the starting size so it can be compared with the modified tokenizers.
initial_vocab_size = len(tokenizer)
# The " words" suffix matches the size report printed in the later check cells.
print(f"initial vocab size: {initial_vocab_size} words")
print(f"Model's maximum token length: {tokenizer.model_max_length}")

initial vocab size: 30522 words
Model's maximum token length: 512


## Target vocabulary
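Now let us select a subset of the vocabulary to work with. We take the top 20 tokens returned by `editor.get_top_tokens` for the original tokenizer: the first 10 are the words we keep in the reduced tokenizer, and the remaining 10 are set aside so that we can add them back later when we expand the vocabulary.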


In [3]:
# Ask for 20 tokens up front: the first 10 are kept when the vocabulary is
# reduced, the remaining 10 are added back later when it is expanded.
target_vocab_size = 20
n_keep = 10
selected_20_words = editor.get_top_tokens(tokenizer, target_vocab_size)

# Materialise the result once so that both slices and the query text are built
# from a single, consistent ordering (and so a one-shot iterable is not
# consumed by the first use).
top_tokens = list(selected_20_words)
selected_words = set(top_tokens[:n_keep])
selected_words_add = set(top_tokens[n_keep:])

# Text containing all 20 tokens, used below to probe each modified tokenizer.
query_text = " ".join(top_tokens)
print(selected_words)
print(selected_words_add)

{'thyroid', 'inspections', 'sanjay', 'infused', 'collarbone', 'hideout', 'scandals', 'pudding', 'arsenic', 'wreath'}
{'nitrate', 'stalks', 'necessitated', 'dowry', 'disgrace', 'salamanca', 'genealogical', 'wince', 'tbs', 'leases'}


## Parameters
To get the modified tokenizer, we need two things:
- A location (path) in which to save the tokenizer. This is important, as the modified tokenizer will not function properly without being saved.
- A new value for `model_max_length` (optional). It defaults to `None`, which means no modification.

In [4]:
# New model_max_length to apply when the modified tokenizer is saved.
model_max_length = 128

# The modified tokenizer is written to a "ModifiedTokenizer" folder inside the
# directory the notebook is running from.
current_directory = os.getcwd()
tokenizer_path = os.path.join(current_directory, "ModifiedTokenizer")

## Reduce the vocabulary
This is done in 4 steps.
- Reduce the vocabulary 
- Save the modified tokenizer
- Load the modified tokenizer for use
- A validation check of the integrity of the new tokenizer (automatically done during loading)

In [5]:
# Step 1: reduce the vocabulary to the selected words. In the recorded run the
# result has 15 entries (the 10 selected words plus 5 special tokens).
modified_tokenizer=editor.reduce_vocabulary(tokenizer,selected_words)
# Step 2: write the tokenizer files to disk, passing the new model_max_length.
# The return value replaces tokenizer_path -- presumably the location that was
# actually written; TODO confirm against the hugtokencraft documentation.
tokenizer_path=editor.save_tokenizer(modified_tokenizer,tokenizer_path,model_max_length)
# Step 3: reload from disk using the same class as the original tokenizer.
# NOTE(review): the recorded output reports "Validation: Passed" for this cell;
# the integrity check is expected to run as part of loading.
modified_tokenizer=editor.load_tokenizer(type(tokenizer),tokenizer_path)

Saving vocabulary to c:\Users\fahim\Documents\My Python Public\HugTokenCraft\ModifiedTokenizer\vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!


Saving vocabulary to c:\Users\fahim\Documents\My Python Public\HugTokenCraft\ModifiedTokenizer\vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to c:\Users\fahim\Documents\My Python Public\HugTokenCraft\ModifiedTokenizer\vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to c:\Users\fahim\Documents\My Python Public\HugTokenCraft\ModifiedTokenizer\vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to c:\Users\fahim\Documents\My Python Public\HugTokenCraft\ModifiedTokenizer\vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabulary to c:\Users\fahim\Documents\My Python Public\HugTokenCraft\ModifiedTokenizer\vocab.txt: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!
Saving vocabula


Vocabulary reduction: Done
 Original vocabulary size: 30522
 New vocabulary size: 15

Tokenizer files created

Vocabulary was reduced. Requires json editing
Tokenizer loaded
 Starting to edit the json files...
 Target Special token mapping:
  Special Token: [PAD], Token ID: 0
  Special Token: [UNK], Token ID: 1
  Special Token: [CLS], Token ID: 2
  Special Token: [SEP], Token ID: 3
  Special Token: [MASK], Token ID: 4

 1. Editing added_tokens.json
 2. Editing tokenizer_config.json
 3. Model max length updated: 128
 Editing done. 
Tokenization model creation completed.

Tokenization model Validation: Passed

15


Check

In [6]:
# Inspect the reduced tokenizer: its size, vocabulary, added (special) tokens,
# and how it splits the 20-word query text. Labels match the ones used when the
# expanded tokenizer is inspected, so the two reports can be compared line by line.
new_vocab_size = len(modified_tokenizer)
print(f"new vocab size: {new_vocab_size} words")
print("New vocabulary:")
print(modified_tokenizer.get_vocab())
print("Added words:")
print(modified_tokenizer.get_added_vocab())
print("Tokenized text:")
# In the recorded run, words no longer in the vocabulary come back as [UNK].
tokenized_text = modified_tokenizer.tokenize(query_text)
print(tokenized_text)

new vocab size: 15 words
New vocabulary:
{'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4, 'wreath': 5, 'hideout': 6, 'inspections': 7, 'sanjay': 8, 'infused': 9, 'pudding': 10, 'arsenic': 11, 'collarbone': 12, 'scandals': 13, 'thyroid': 14}
Added words:
{'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}
Tokenized text:
['sanjay', 'collarbone', 'scandals', 'pudding', 'hideout', 'arsenic', 'thyroid', 'inspections', 'infused', 'wreath', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]']


## Expand vocabulary
This is done in 4 steps.
- Expand the vocabulary
- Save the modified tokenizer
- Load the modified tokenizer for use
- A validation check of the integrity of the new tokenizer (automatically done during loading)

In [9]:
# Step 1: add the second group of 10 selected words to the reduced tokenizer.
# NOTE(review): modified_tokenizer is rebound to a result derived from itself,
# so re-running this cell feeds the already-expanded tokenizer back in.
modified_tokenizer=editor.expand_vocabulary(modified_tokenizer,selected_words_add)
# Step 2: save again, this time with isreduced=False because the vocabulary
# grew (the recorded output reports that JSON editing was skipped).
tokenizer_path=editor.save_tokenizer(modified_tokenizer,tokenizer_path,model_max_length,isreduced=False)
# Step 3: reload from disk using the same class as the original tokenizer.
modified_tokenizer=editor.load_tokenizer(type(tokenizer),tokenizer_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer files created

Vocabulary was expanded. Therefore, skipped json editing

Tokenization model Validation: Passed

25


Check

In [11]:
# Re-inspect the tokenizer after expansion, probing it with the same query text.
new_vocab_size = len(modified_tokenizer)
print(f"new vocab size: {new_vocab_size} words")

# Full token -> id mapping.
print("New vocabulary:")
print(modified_tokenizer.get_vocab())

# Entries the tokenizer reports as added tokens.
print("Added words:")
print(modified_tokenizer.get_added_vocab())

# In the recorded run every word of the query text is recognised again.
tokenized_text = modified_tokenizer.tokenize(query_text)
print("Tokenized text:")
print(tokenized_text)

new vocab size: 25 words
New vocabulary:
{'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4, 'tbs': 5, 'wince': 6, 'inspections': 7, 'infused': 8, 'arsenic': 9, 'leases': 10, 'genealogical': 11, 'salamanca': 12, 'thyroid': 13, 'necessitated': 14, 'sanjay': 15, 'collarbone': 16, 'nitrate': 17, 'pudding': 18, 'wreath': 19, 'disgrace': 20, 'hideout': 21, 'stalks': 22, 'dowry': 23, 'scandals': 24}
Added words:
{'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4, 'sanjay': 15, 'collarbone': 16, 'nitrate': 17, 'pudding': 18, 'wreath': 19, 'disgrace': 20, 'hideout': 21, 'stalks': 22, 'dowry': 23, 'scandals': 24}
Tokenized text:
['wince', 'arsenic', 'genealogical', 'necessitated', 'salamanca', 'infused', 'inspections', 'tbs', 'thyroid', 'leases', 'sanjay', 'nitrate', 'collarbone', 'pudding', 'disgrace', 'hideout', 'scandals', 'wreath', 'stalks', 'dowry']
