In [1]:
from importlib.metadata import version

# Report the installed versions of the two third-party packages this notebook relies on.
for label, package in (("torch version:", "torch"), ("tiktoken version", "tiktoken")):
    print(label, version(package))

torch version: 2.4.0
tiktoken version 0.7.0


In [2]:
# Load the source text (data/the-verdict.txt) into memory as one string.
# The recorded run of this cell reports 20479 characters.
with open("data/the-verdict.txt","r",encoding="utf-8") as f:
    raw_text = f.read()

# Sanity check: total length, then the first 99 characters of the text.
print("Total no.of characters: ",len(raw_text))
print(raw_text[:99])

Total no.of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
## Split the raw text on whitespace, "--" and common punctuation characters.
## The pattern is wrapped in a capture group, so the separators themselves are
## kept in the result -- whitespace strings therefore show up as tokens too.
import re

split_pattern = re.compile(r'([,.:;?_!"()\']|--|\s)')
preprocessed = split_pattern.split(raw_text)
print(preprocessed[:30])

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '--']


In [4]:
# Size of the token list and of its de-duplicated set.
token_count = len(preprocessed)
unique_token_count = len(set(preprocessed))
print("Number of tokens:", token_count)
print("Number of unique tokens:", unique_token_count)

Number of tokens: 9235
Number of unique tokens: 1133


In [5]:
## The vocabulary is every distinct token found in the input text, in sorted order.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

1133


In [6]:
## Create token ids: each token's id is its position in the sorted all_words list.
vocab = dict(zip(all_words, range(len(all_words))))

## Preview the first 22 (token, id) pairs.
for position, pair in enumerate(vocab.items()):
    print(pair)
    if position >= 21:
        break

('', 0)
('\n', 1)
(' ', 2)
('!', 3)
('"', 4)
("'", 5)
('(', 6)
(')', 7)
(',', 8)
('--', 9)
('.', 10)
(':', 11)
(';', 12)
('?', 13)
('A', 14)
('Ah', 15)
('Among', 16)
('And', 17)
('Are', 18)
('Arrt', 19)
('As', 20)
('At', 21)


### Creating a Simple Tokenizer Class 

In [7]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Whitespace-only and empty pieces are dropped,
    and every remaining piece is looked up in the vocabulary.

    Args:
        vocab: Mapping from token string to integer id.
    """

    # Pattern used to split text into tokens; the capture group keeps the
    # separators (punctuation, "--") in the split result.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Pattern used when decoding to remove the space that " ".join() puts in
    # front of these punctuation characters.
    _SPACE_BEFORE_PUNCT = r'\s+([,.?!"()\'])'

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping (id -> token string) used by decode().
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            List of integer ids, one per non-whitespace token, in order.

        Raises:
            KeyError: If a token is not present in the vocabulary. The message
                names the offending token.
        """
        pieces = (piece.strip() for piece in re.split(self._SPLIT_PATTERN, text))
        tokens = [piece for piece in pieces if piece]

        ids = []
        for token in tokens:
            try:
                ids.append(self.str_to_int[token])
            except KeyError:
                # Same exception type as a plain dict lookup, but with a
                # message that says what went wrong instead of just the key.
                raise KeyError(f"token {token!r} is not in the vocabulary") from None
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Tokens are joined with single spaces, then the space in front of
        ``, . ? ! " ( ) '`` is removed. The result is therefore not guaranteed
        to equal the originally encoded string (e.g. ``--`` stays surrounded
        by spaces).

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The decoded string.

        Raises:
            KeyError: If an id is not present in the vocabulary. The message
                names the offending id.
        """
        tokens = []
        for token_id in ids:
            try:
                tokens.append(self.int_to_str[token_id])
            except KeyError:
                raise KeyError(f"token id {token_id!r} is not in the vocabulary") from None

        text = " ".join(tokens)
        # Remove the spaces before the specified punctuation characters.
        text = re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)
        return text

In [8]:
# Build the simple tokenizer from the vocabulary created above.
# NOTE: the variable is spelled `tokeizer`; the encode/decode cells below use that same spelling.
tokeizer = SimpleTokenizerV1(vocab)

In [9]:
## Encode a short sentence that is also taken from the-verdict.txt,
## so every one of its tokens is present in the vocabulary.
tokeizer.encode('"The height of his glory"--that was what the women called it.')

[4, 96, 541, 725, 552, 499, 4, 9, 990, 1080, 1092, 991, 1115, 245, 588, 10]

In [10]:
## Decode the ids produced by the encode cell back into text.
## The recorded output is not identical to the input sentence: a space remains
## after the opening quote and on both sides of "--".
tokeizer.decode([4, 96, 541, 725, 552, 499, 4, 9, 990, 1080, 1092, 991, 1115, 245, 588, 10])

'" The height of his glory" -- that was what the women called it.'

### Byte Pair Encoding (BPE)

In [11]:
# GPT-2 uses byte pair encoding (BPE) as its tokenizer.
# This notebook uses the tiktoken implementation, which the author describes as
# nearly 3x faster than the original GPT-2 tokenizer (speed not measured here).
# NOTE: `tokenizer` below is a different object from the `tokeizer`
# (SimpleTokenizerV1) created earlier in the notebook.

import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')

In [12]:
# The gpt2 vocabulary has 50257 tokens; with 0-based indexing the last id,
# 50256, is the <|endoftext|> marker.
text = "Hello, I am Neo <|endoftext|> I am from Matrix"

# allowed_special permits "<|endoftext|>" to appear in the input text;
# in the recorded output it is encoded as the single id 50256.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

integers

[15496, 11, 314, 716, 21227, 220, 50256, 314, 716, 422, 24936]

In [13]:
# Decode the BPE ids back into text; the recorded output matches the original string exactly.
strings = tokenizer.decode(integers)

strings

'Hello, I am Neo <|endoftext|> I am from Matrix'

In [27]:
# create_dataloader_v1 is a project-local helper from utils.py (not shown in this notebook).
from utils import create_dataloader_v1

# Batches of 8 rows with 4 token ids each. stride equals max_length here;
# presumably this makes consecutive windows non-overlapping -- confirm in utils.py.
# In the recorded output each input row starts right after the previous row ends.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

# Pull only the first batch for inspection.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
# In the recorded output each targets row is the matching inputs row shifted
# one token to the left (next-token prediction targets).
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


### Check for the Mac M1 GPU (MPS backend)

In [32]:
import torch
# First report whether this PyTorch build was compiled with MPS support,
# then use the "mps" device when it is available and fall back to "cpu" otherwise.
print(torch.backends.mps.is_built())
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(device)

True
mps


In [36]:
# Create a random 3x4 tensor and move it to the selected device.
x = torch.rand(3, 4).to(device)
x

tensor([[0.3826, 0.1148, 0.9627, 0.6304],
        [0.9410, 0.7646, 0.7533, 0.5084],
        [0.5970, 0.3250, 0.8704, 0.5511]], device='mps:0')

In [37]:
# Create a second random 3x4 tensor; it is not moved with .to(device),
# and the recorded output carries no device tag.
y = torch.rand(3, 4)
y

tensor([[0.7342, 0.0219, 0.6956, 0.3475],
        [0.3875, 0.2691, 0.5654, 0.0680],
        [0.1955, 0.3770, 0.6571, 0.3816]])