# Import data

In [1]:
import urllib.request

In [2]:
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
      "the-verdict.txt")
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x78bb0db56000>)

In [3]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print(f"Total number of characters: {len(raw_text)}")
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


# Custom tokenizer

In [4]:
import re

In [5]:
extracted_content = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)

In [6]:
len(extracted_content)

9235

#### number of white spaces

In [7]:
type(extracted_content)

list

In [8]:
len(list(filter(lambda x: x == ' ' or x =='\t', extracted_content)))

3551

#### removing white spaces

In [9]:
x = 'tst '

In [10]:
x.strip()

'tst'

In [11]:
extracted_content =  [item.strip() for item in extracted_content if item.strip()]

In [12]:
len(extracted_content)

4690

In [13]:
extracted_content[:30]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in']

## Assign tokenIDs

In [14]:
# unique tokens
vocab = set(extracted_content)

In [15]:
len(vocab)

1130

In [16]:
# sorting alphabetically
vocab = sorted(vocab)

In [17]:
vocab[:10]

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';']

In [18]:
vocab = {token: integer for integer, token in enumerate(vocab)}

In [19]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i>=10:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)


## Tokenizer:V1

In [20]:
class SimpleTokenizerV1:

    def __init__(self, vocab):

        self.str_to_int =  vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[item] for item in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

##### NOTES

* r'\1': This is a backreference in regex, referring to the first captured group in a pattern.


In [21]:
# tokenizer
simpletokenizer_v1 = SimpleTokenizerV1(vocab)

In [22]:
test = """"It's the last he painted, you know,"
Mrs.Gisburn said with pardonable pride."""
test_ids = simpletokenizer_v1.encode(test)

In [23]:
test_ids

[1,
 56,
 2,
 850,
 988,
 602,
 533,
 746,
 5,
 1126,
 596,
 5,
 1,
 67,
 7,
 38,
 851,
 1108,
 754,
 793,
 7]

In [24]:
simpletokenizer_v1.decode(test_ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [25]:
print(simpletokenizer_v1.decode(test_ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


## Tokenizer:V2

### \<unk> and \<endoftext>

In [26]:
alltokens = sorted(set(extracted_content))

In [27]:
alltokens.extend(["<|endoftext|>", "<|unk|>"])

In [28]:
vocab_dict = {token:integer for integer, token in enumerate(alltokens)}

In [29]:
len(vocab_dict)

1132

In [30]:
# CHECKING
for i, item in enumerate(list(vocab_dict.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [31]:
'your' in vocab_dict

True

In [32]:
1127 in vocab_dict

False

In [33]:
class SimpleTokenizerV2:

    def __init__(self, vocab):

        self.str_to_int =  vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[item] for item in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [34]:
# testing
test1 = "Hello, do you like tea?"

simpletokenizer_v2 = SimpleTokenizerV2(vocab_dict)

In [35]:
output1 = simpletokenizer_v2.encode(test1)

In [36]:
simpletokenizer_v2.decode(output1)

'<|unk|>, do you like tea?'

In [37]:
test2 = "In the sunlit terraces of the palace."
test3 = "With some sugar."
test = " <|endoftext|> ".join([test1, test2, test3])
test

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace. <|endoftext|> With some sugar.'

In [38]:
output2 = simpletokenizer_v2.encode(test)

In [39]:
simpletokenizer_v2.decode(output2)

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>. <|endoftext|> <|unk|> some sugar.'

# Byte-Pair encoding

## Pretrained tokenizer

In [40]:
from tokenizers import Tokenizer

In [41]:
gpt_tokenizer = Tokenizer.from_pretrained('gpt2')

In [42]:
gpt_tokenizer.get_vocab_size()

50257

In [43]:
gpt_vocab = gpt_tokenizer.get_vocab()

In [44]:
gpt_vocab

{'!?"': 42720,
 'anguages': 33213,
 'raising': 32741,
 'ĠInternet': 4455,
 'itent': 48324,
 'ĠEE': 27254,
 'Ġannually': 13844,
 'Ġtuning': 24549,
 'Ġdear': 13674,
 'Whatever': 21875,
 'ĠOften': 18023,
 'cription': 6820,
 'Ġcontact': 2800,
 'Ġinclud': 846,
 'Hash': 26257,
 'UTE': 37780,
 'uristic': 27915,
 'cam': 20991,
 'ourced': 30555,
 'Ġinfant': 11212,
 'Ġnarrowed': 33214,
 'Ġsignal': 6737,
 'ĠFin': 4463,
 'ĠInteresting': 43580,
 'pid': 35317,
 'olkien': 31052,
 'ĠDisorder': 31162,
 'Ġlogistics': 26355,
 'wy': 21768,
 'Ġoverflowing': 43347,
 'Ġinserts': 42220,
 'Ġexpended': 37328,
 'advert': 17904,
 'Ġdinners': 42541,
 'Ġtyp': 2170,
 'Ġ630': 44505,
 'Ġrescued': 19868,
 'iframe': 39621,
 '035': 44215,
 'Ġpenetrated': 41847,
 'ĠExt': 5683,
 'ĠBros': 14266,
 'ĠFresh': 20138,
 'ĠJustin': 10799,
 'Ġweren': 6304,
 'Ġ136': 21056,
 'Ġ98': 9661,
 'Ġstarted': 2067,
 'except': 16341,
 'hist': 10034,
 'Ġsqueezed': 29650,
 'rossover': 23954,
 'ĠPopulation': 20133,
 'ĠCBI': 47970,
 'utton': 21115

In [45]:
list(gpt_vocab)[:5]

['!?"', 'anguages', 'raising', 'ĠInternet', 'itent']

In [46]:
list(gpt_vocab)[-2:]

['ĠPercent', 'ĠNRA']

In [47]:
'<|endoftext|>' in gpt_vocab

True

In [48]:
gpt_vocab['<|endoftext|>']

50256

#### Letter casing

In [49]:
gpt_vocab['a']

64

In [50]:
gpt_vocab['A']

32

### Encode

In [51]:
gpt_encoded = gpt_tokenizer.encode(sequence=test1, pair=test2)

In [52]:
gpt_encoded.ids

[15496,
 11,
 466,
 345,
 588,
 8887,
 30,
 818,
 262,
 4252,
 18250,
 8812,
 2114,
 286,
 262,
 20562,
 13]

In [53]:
gpt_encoded.attention_mask

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [54]:
# sentence1, sentence2
gpt_encoded.sequence_ids

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [55]:
gpt_encoded.word_ids

[0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 2, 3, 3, 4, 5, 6, 7]

In [56]:
gpt_encoded.tokens

['Hello',
 ',',
 'Ġdo',
 'Ġyou',
 'Ġlike',
 'Ġtea',
 '?',
 'In',
 'Ġthe',
 'Ġsun',
 'lit',
 'Ġterr',
 'aces',
 'Ġof',
 'Ġthe',
 'Ġpalace',
 '.']

In [57]:
gpt_encoded.type_ids

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

#### Offsets for alignment tracking

In [58]:
gpt_encoded.offsets

[(0, 5),
 (5, 6),
 (6, 9),
 (9, 13),
 (13, 18),
 (18, 22),
 (22, 23),
 (0, 2),
 (2, 6),
 (6, 10),
 (10, 13),
 (13, 18),
 (18, 22),
 (22, 25),
 (25, 29),
 (29, 36),
 (36, 37)]

In [59]:
sentence =  "".join([test1, test2])
for (i, j) in gpt_encoded.offsets:
    
    print(sentence[i:j])

Hello
,
 do
 you
 like
 tea
?
He
llo,
 do 
you
 like
 tea
?In
 the
 sunlit
 


### Decode

In [60]:
gpt_tokenizer.decode_batch(sequences=[gpt_encoded.ids])

['Hello, do you like tea?In the sunlit terraces of the palace.']

# Data sampling with a sliding window

* For training

## Encode

In [61]:
# encode the data using the pretrained tokenizer
encoded_text = gpt_tokenizer.encode(raw_text)

In [62]:
type(encoded_text.tokens)

list

In [63]:
len(encoded_text.tokens)

5145

## Dataloader

In [75]:
from importlib import reload

In [126]:
import dataloader

In [96]:
# %reload_ext dataloader
reload(dataloader)

<module 'dataloader' from '/home/sameervk/Documents/Training/MachineLearning/LLM_app_huggingface/RnD/Tokenisation-Embedding/dataloader.py'>

In [97]:
dataset = dataloader.GPTDatasetV1(txt=raw_text, 
                                  tokenizer=gpt_tokenizer,
                                  max_length=4, 
                                  stride=1
                                 )

In [112]:
data_loader = dataloader.create_dataloader_v1(dataset=dataset, 
                                              batch_size=1,
                                              shuffle=False, 
                                              drop_last=True, 
                                              num_workers=0
                                             )

### TESTING

In [113]:
data_iterator = iter(data_loader)

In [114]:
first_batch = next(data_iterator)

In [115]:
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [116]:
second_batch = next(data_iterator)

In [117]:
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [118]:
50257*12288

617558016

# Embedding layer

In [119]:
import torch

In [120]:
vocab_size = gpt_tokenizer.get_vocab_size()
vocab_size

50257

In [121]:
embedding_dimension = 256

In [122]:
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, 
                                     embedding_dim=embedding_dimension
                                    )

#### TESTING

In [127]:
del data_loader

In [124]:
del dataset

In [128]:
max_input_length=4
dataset = GPTDatasetV1(txt=raw_text, 
                       tokenizer=gpt_tokenizer, 
                       max_length=max_input_length, 
                       stride=max_input_length 
                      )
data_loader = dataloader.create_dataloader_v1(dataset=dataset, 
                                              batch_size=8,
                                              shuffle=False, 
                                              drop_last=True, 
                                              num_workers=0
                                             )

In [129]:
data_iterator = iter(data_loader)

In [130]:
test_batch = next(data_iterator)
test_batch

[tensor([[   40,   367,  2885,  1464],
         [ 1807,  3619,   402,   271],
         [10899,  2138,   257,  7026],
         [15632,   438,  2016,   257],
         [  922,  5891,  1576,   438],
         [  568,   340,   373,   645],
         [ 1049,  5975,   284,   502],
         [  284,  3285,   326,    11]]),
 tensor([[  367,  2885,  1464,  1807],
         [ 3619,   402,   271, 10899],
         [ 2138,   257,  7026, 15632],
         [  438,  2016,   257,   922],
         [ 5891,  1576,   438,   568],
         [  340,   373,   645,  1049],
         [ 5975,   284,   502,   284],
         [ 3285,   326,    11,   287]])]

In [131]:
test_batch[0].shape

torch.Size([8, 4])

In [132]:
test_batch_embeddings = embedding_layer(test_batch[0])

In [134]:
test_batch_embeddings.shape

torch.Size([8, 4, 256])

# Positional Encoding layer

* This is also a layer with learnable weights

## Context length

In [135]:
context_length = max_input_length

## Layer

In [136]:
pos_embedding_layer = torch.nn.Embedding(context_length, embedding_dimension)

In [137]:
pos_embeddings = pos_embedding_layer(torch.arange(0, context_length))
pos_embeddings.shape

torch.Size([4, 256])

# Embeddings + Positional Encoding

In [138]:
test_input_embeddings = test_batch_embeddings +  pos_embeddings

In [139]:
test_input_embeddings.shape

torch.Size([8, 4, 256])

## NOTE

* The sum of Embeddings and Positional Encoding are the inputs to the LLM.
  