<a href="https://colab.research.google.com/github/srika16/LLMs-from-scratch-working-copy/blob/main/ch02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import urllib.request
# Location of the-verdict.txt in the LLMs-from-scratch GitHub repository.
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/"
    "refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt"
)
# Local file name the download is saved under.
file_path = "the-verdict.txt"
# Download the file; the returned (filename, headers) tuple is the cell output.
urllib.request.urlretrieve(url, filename=file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x7aeeeaac82d0>)

In [15]:
# Load the whole downloaded text into memory as a single string.
with open("the-verdict.txt", encoding="utf-8") as story_file:
  raw_text = story_file.read()
# Report the size of the text and preview its first 99 characters.
print("Total number of characters: ", len(raw_text))
print(raw_text[:99])

Total number of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [16]:
# Split a sample sentence into a list of words and whitespace characters
# (punctuation still sticks to the preceding word at this stage).
import re
text = "Hello, World. This, is a test."
# The capturing group keeps each whitespace separator as its own list item.
whitespace_pattern = re.compile(r'(\s)')
result = whitespace_pattern.split(text)
print(result)


['Hello,', ' ', 'World.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [17]:
# Split on whitespace (\s) as well as on commas and periods ([,.]);
# the capturing group keeps every separator as its own list item.
separator_pattern = re.compile(r'([,.]|\s)')
result = separator_pattern.split(text)
print(result)

['Hello', ',', '', ' ', 'World', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [18]:
# Drop the empty strings and whitespace-only items left over from the split.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'World', '.', 'This', ',', 'is', 'a', 'test', '.']


In [19]:
# Extend the pattern to further punctuation characters and the "--" dash.
text = "Hello, world. Is this-- a test?"

pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)

# Strip every piece and keep only the non-empty ones.
result = [token for token in map(str.strip, pieces) if token]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [20]:
# Apply the final splitting pattern to the whole story text.
story_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Strip every piece and keep only the non-empty tokens.
preprocessed = [token for token in map(str.strip, story_pieces) if token]
print(len(preprocessed))
# Preview the first 30 tokens.
preprocessed[:30]

4690


['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in']

In [21]:
# First step of converting tokens to IDs: collect the unique tokens in
# sorted order; the number of unique tokens is the vocabulary size.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

1130


In [22]:
# Build the vocabulary: each token maps to its position in the sorted list.
vocab = {word: token_id for token_id, word in enumerate(all_words)}
# Show the first 30 (token, ID) pairs.
list(vocab.items())[:30]

[('!', 0),
 ('"', 1),
 ("'", 2),
 ('(', 3),
 (')', 4),
 (',', 5),
 ('--', 6),
 ('.', 7),
 (':', 8),
 (';', 9),
 ('?', 10),
 ('A', 11),
 ('Ah', 12),
 ('Among', 13),
 ('And', 14),
 ('Are', 15),
 ('Arrt', 16),
 ('As', 17),
 ('At', 18),
 ('Be', 19),
 ('Begin', 20),
 ('Burlington', 21),
 ('But', 22),
 ('By', 23),
 ('Carlo', 24),
 ('Chicago', 25),
 ('Claude', 26),
 ('Come', 27),
 ('Croft', 28),
 ('Destroyed', 29)]

In [23]:
class SimpleTokenizerV1:
  """Word-level tokenizer that maps text to token IDs via a fixed vocabulary.

  Tokens that are missing from the vocabulary are not handled: ``encode``
  raises ``KeyError`` for them.
  """

  def __init__(self, vocab):
    """Store the token -> ID mapping and build the inverse ID -> token mapping.

    Args:
      vocab: dict mapping token strings to integer IDs.
    """
    self.str_to_int = vocab
    self.int_to_str = {i: s for s, i in vocab.items()}

  def encode(self, text):
    """Split ``text`` into tokens and return the list of their IDs.

    The text is split on whitespace, on "--" and on the punctuation
    characters matched by the pattern below; empty and whitespace-only
    pieces are dropped.

    Raises:
      KeyError: if a token is not present in the vocabulary.
    """
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    return [self.str_to_int[token] for token in preprocessed]

  def decode(self, ids):
    """Join the tokens for ``ids`` with single spaces and return the text.

    Raises:
      KeyError: if an ID is not present in the vocabulary.
    """
    # Use `token_id` rather than `id`, which would shadow the builtin.
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    # Remove the space that the join inserted before punctuation. ':' and ';'
    # are included because encode() emits them as separate tokens as well.
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text

In [24]:
# Encode a two-line sample passage with the V1 tokenizer.
tokenizer = SimpleTokenizerV1(vocab)
text = (
    '"It\'s the last he painted, you know,"\n'
    'Mrs. Gisburn said with pardonable pride.'
)
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [25]:
# Turn the token IDs back into text and show the result.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [26]:
# Left commented out: encoding this text raises a KeyError, because "Hello" is not in the vocabulary
# text = "Hello, do you like Tea?"
# ids = tokenizer.encode(text)
# print(ids)

In [27]:
# Extend the vocabulary with two special tokens, placed ahead of the sorted
# story tokens: <|endoftext|> (marks the end of a text) and <|unk|> (unknown word).
special_tokens = ["<|endoftext|>", "<|unk|>"]
all_tokens = special_tokens + sorted(set(preprocessed))
vocab = dict(zip(all_tokens, range(len(all_tokens))))
# Show the last five tokens of the extended vocabulary.
print(list(vocab)[-5:])

['yet', 'you', 'younger', 'your', 'yourself']


In [28]:
class SimpleTokenizerV2:
  """Word-level tokenizer that substitutes "<|unk|>" for out-of-vocabulary tokens."""

  def __init__(self, vocab):
    """Keep the token -> ID mapping and derive the reverse ID -> token mapping."""
    self.str_to_int = vocab
    self.int_to_str = dict((index, token) for token, index in vocab.items())

  def encode(self, text):
    """Return the list of token IDs for ``text``.

    The text is split on whitespace, "--" and punctuation; every token that
    is not in the vocabulary is looked up as "<|unk|>" instead.
    """
    ids = []
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
      token = piece.strip()
      if not token:
        # Skip empty and whitespace-only pieces produced by the split.
        continue
      if token not in self.str_to_int:
        token = "<|unk|>"
      ids.append(self.str_to_int[token])
    return ids

  def decode(self, ids):
    """Return the text for ``ids``: tokens joined by spaces, punctuation re-attached."""
    spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
    # Delete the whitespace that the join placed in front of punctuation.
    return re.sub(r'\s+([,.:;?!"()\'])', r'\1', spaced)



In [29]:
# Join two sample texts with the <|endoftext|> marker between them.
text1 = "Hello, do you like Tea?"
text2 = "In the sunlit terraces of the palace."
separator = " <|endoftext|> "
text = separator.join((text1, text2))
print(text)
# Encode with the V2 tokenizer, which maps out-of-vocabulary words to <|unk|>.
tokenizer = SimpleTokenizerV2(vocab)
ids = tokenizer.encode(text)
print(ids)

Hello, do you like Tea? <|endoftext|> In the sunlit terraces of the palace.
[1, 7, 357, 1128, 630, 1, 12, 0, 57, 990, 958, 986, 724, 990, 1, 9]


In [30]:
# Round trip: encode the text, then decode the IDs back into a string.
roundtrip_ids = tokenizer.encode(text)
print(tokenizer.decode(roundtrip_ids))

<|unk|>, do you like <|unk|>? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [31]:
# Bype Pair Encoding is a more sophisticated tokenization schema based on a
# concept called Bype Pair Encoding (BPE), used to train GPT2, GPT3.

# We will use an implemention called tiktoken
# Bype Pair Encoding can break down unknown words to smaller tokens so it will
# never have unknown tokens.
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [32]:
from importlib.metadata import version
import tiktoken
# Look up the installed tiktoken version in the package metadata and report it.
tiktoken_version = version("tiktoken")
print("tiktoken version: ", tiktoken_version)

tiktoken version:  0.8.0


In [33]:
# Load the GPT-2 encoding from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")
# Permit the <|endoftext|> marker to appear in the text being encoded.
special_tokens_allowed = {"<|endoftext|>"}
integers = tokenizer.encode(text, allowed_special=special_tokens_allowed)
print(integers)
# Decode the IDs back into text.
decoded_text = tokenizer.decode(integers)
print(decoded_text)

[15496, 11, 466, 345, 588, 15777, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 20562, 13]
Hello, do you like Tea? <|endoftext|> In the sunlit terraces of the palace.
