In [1]:
from datasets import load_dataset
from pprint import pprint 
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE # Byte-Pair Encoding
from tokenizers import Tokenizer
from ipywidgets import interact, IntSlider, Layout
from tokenizers.trainers import BpeTrainer
from tokenizers.tools import EncodingVisualizer
from multiprocessing import cpu_count
import pandas as pd 

print(cpu_count())

16


**Pipeline** 

normalizer (lowercase) -> pre-tokenizer (whitespace) -> model (BPE) -> postprocessor (None)

In [2]:
ds = load_dataset('bookcorpus', split='train', trust_remote_code=True)
pprint(ds)

Dataset({
    features: ['text'],
    num_rows: 74004228
})


In [3]:
num_samples = 6 
for idx, sample in enumerate(ds[0:num_samples]['text']):
    print(f'{idx} : {sample}')

0 : usually , he would be tearing around the living room , playing with his toys .
1 : but just one look at a minion sent him practically catatonic .
2 : that had been megan 's plan when she got him dressed earlier .
3 : he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .
4 : she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .
5 : `` are n't you being a good boy ? ''


In [4]:
model = BPE(ukn_token='[UNK]') # for unknown tokens output UNK
tokenizer = Tokenizer(model=model)

tokenizer.normalizer = Lowercase()
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size = 32000, special_tokens = ['[UNK]', '[PAD]'], 
    continuing_subword_prefix='##'
) #runnning --> run ##ing

In [5]:
def get_examples(batch_size=1000) : 
    for i in range(0, len(ds), batch_size):
        yield ds[i:i+batch_size]['text']

In [6]:
tokenizer.train_from_iterator(
    get_examples(batch_size=10000), trainer=trainer, 
    length=len(ds)
) 

In [7]:
tokenizer.model.save('D:/vscode/DL_practice/NLP/models', prefix='hopper')

['D:/vscode/DL_practice/NLP/models\\hopper-vocab.json',
 'D:/vscode/DL_practice/NLP/models\\hopper-merges.txt']

In [8]:
with open('models/hopper-merges.txt', 'r') as file :
    row, num_lines = 0, 10 
    for line in file.readlines():
        print(line.strip())
        row += 1
        if row == num_lines:
            break

#version: 0.2
##h ##e
t ##he
##i ##n
##e ##r
##e ##d
##o ##u
##n ##d
##in ##g
t ##o


In [9]:
with open('models/hopper-merges.txt', 'r') as file :
    row, num_lines = 0, 10 
    for line in reversed(file.readlines()):
        print(line.strip())
        row += 1
        if row == num_lines:
            break

mel ##anthe
black ##er
ad ##ject
v ##ang
betroth ##al
tiptoe ##ing
restroom ##s
consol ##ing
esp ##ionage
influ ##x


num merges are slightly less than vocab size as merges don't include single character tokens, letters, numbers 

In [19]:
with open('models/hopper-merges.txt', 'r') as file :
    lines = file.readlines()

print(f'Number of merges: {len(lines)}')
print(f'vocab size : {tokenizer.get_vocab_size()}')

Number of merges: 31871
vocab size : 32000


In [34]:
vocab = tokenizer.get_vocab()
vocab_sorted = sorted(vocab.items(), key=lambda item: item[1])

In [37]:
def get_pairs(pair): 
    word, token_id = vocab_sorted[pair]
    print(f"Word: {word}, Token ID: {token_id}")

def display_token_id(id): 
    token = tokenizer.decode([id])
    print(f"Token ID: {id}, Token: {token}")

_ = interact(get_pairs, pair=IntSlider(min=0, max=len(lines)-1, step=1, value=1, layout=Layout(wifth='900px')))
_ = interact(display_token_id, id=IntSlider(min=0, max=31999, step=1, value=130, layout=Layout(wifth='900px')))

interactive(children=(IntSlider(value=1, description='pair', max=31870), Output()), _dom_classes=('widget-inte…

interactive(children=(IntSlider(value=130, description='id', max=31999), Output()), _dom_classes=('widget-inte…

### Encode Decode

In [13]:
sample = ds[0]['text']
print(f'sample: {sample}')
encoding = tokenizer.encode(sample)
print(encoding)

sample: usually , he would be tearing around the living room , playing with his toys .
Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [14]:
token_ids = encoding.ids
tokens = encoding.tokens
type_ids = encoding.type_ids
attention_mask = encoding.attention_mask

visualizer = EncodingVisualizer(tokenizer = tokenizer)
visualizer(text=sample) # everytihng is a token in the vocabulary

In [15]:
out_dict = {'tokens': tokens, 'ids': token_ids, 'type_ids': type_ids, 'attention_mask': attention_mask}
df = pd.DataFrame.from_dict(out_dict)
df

Unnamed: 0,tokens,ids,type_ids,attention_mask
0,usually,2462,0,1
1,",",19,0,1
2,he,149,0,1
3,would,277,0,1
4,be,162,0,1
5,tearing,6456,0,1
6,around,422,0,1
7,the,131,0,1
8,living,1559,0,1
9,room,536,0,1


when encoding a batch of samples we need to pad shorter sequences in the batch, a process known as  `dynamic batching`

In [16]:
samples = ds[0:4]['text']

batch_encoding = tokenizer.encode_batch(samples)
pprint(batch_encoding)

[Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


In [17]:
tokenizer.enable_padding(
    direcction = 'right', pad_id = 0, pad_token = '[PAD]', pad_type_id=0, 
    length = None, # None defaults to the max length in the batch 
    pad_to_multiple_of=None
)
tokenizer.enable_truncation(max_length=512)

In [18]:
batch_encoding = tokenizer.encode_batch(samples)
pprint(batch_encoding)

[Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


In [25]:
tokenizer.decode([1])

''