<a href="https://colab.research.google.com/github/srika16/LLMs-from-scratch-working-copy/blob/main/ch02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import urllib.request
# Fetch the story text used throughout this notebook and save it next to the
# notebook; the cell's displayed value is urlretrieve's (local path, headers) pair
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt"
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, filename=file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x79b71003d490>)

In [19]:
# Read the whole downloaded file into one string (text mode is open()'s default),
# then report its length and show the first 99 characters as a sanity check
with open("the-verdict.txt", encoding="utf-8") as story_file:
  raw_text = story_file.read()
print("Total number of characters: ", len(raw_text))
print(raw_text[:99])

Total number of characters:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [20]:
# Split a sample sentence on whitespace. The capture group in the pattern keeps
# every whitespace character as its own list item; punctuation stays glued to words.
import re
text = "Hello, World. This, is a test."
result = re.compile(r'(\s)').split(text)
print(result)


['Hello,', ' ', 'World.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [21]:
# Split on commas and periods ([,.]) as well as whitespace (\s), keeping each
# separator as an item; adjacent separators leave empty strings between them
result = re.compile(r'([,.]|\s)').split(text)
print(result)

['Hello', ',', '', ' ', 'World', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [22]:
# Keep only the items that are non-empty once stripped, which drops both the
# whitespace-only entries and the empty strings
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'World', '.', 'This', ',', 'is', 'a', 'test', '.']


In [23]:
# Extend the separator set: more punctuation marks, the double dash and
# whitespace. Items that are empty after stripping are discarded.
text = "Hello, world. Is this-- a test?"

result = [
  piece.strip()
  for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text)
  if piece.strip()
]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [27]:
# Tokenize the full story with the same pattern, report the token count and
# display the first 30 tokens
preprocessed = [
  piece.strip()
  for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
  if piece.strip()
]
print(len(preprocessed))
preprocessed[:30]

4690


['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in']

In [30]:
# Collect the distinct tokens in sorted order; the number of distinct tokens
# is the vocabulary size
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

1130


In [37]:
# Build the vocabulary: each token maps to its position in the sorted token
# list. Display the first 30 (token, id) pairs.
vocab = dict(zip(all_words, range(len(all_words))))
list(vocab.items())[:30]

[('!', 0),
 ('"', 1),
 ("'", 2),
 ('(', 3),
 (')', 4),
 (',', 5),
 ('--', 6),
 ('.', 7),
 (':', 8),
 (';', 9),
 ('?', 10),
 ('A', 11),
 ('Ah', 12),
 ('Among', 13),
 ('And', 14),
 ('Are', 15),
 ('Arrt', 16),
 ('As', 17),
 ('At', 18),
 ('Be', 19),
 ('Begin', 20),
 ('Burlington', 21),
 ('But', 22),
 ('By', 23),
 ('Carlo', 24),
 ('Chicago', 25),
 ('Claude', 26),
 ('Come', 27),
 ('Croft', 28),
 ('Destroyed', 29)]

In [57]:
class SimpleTokenizerV1:
  """Minimal word-level tokenizer backed by a fixed vocabulary.

  Text is split into words, punctuation marks and double dashes; each token is
  looked up in the vocabulary. Tokens missing from the vocabulary are not
  handled: encode raises KeyError for them.
  """

  # Separators used by encode. The capture group makes re.split keep the
  # punctuation and "--" separators as tokens of their own.
  _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')

  # Whitespace directly in front of one of these marks is removed by decode.
  # ':' and ';' are included because encode emits them as standalone tokens;
  # without them "Hello; world" would decode as "Hello ; world".
  _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

  def __init__(self, vocab):
    """Store the vocabulary and build its inverse.

    Args:
      vocab: mapping from token string to integer ID.
    """
    self.str_to_int = vocab
    self.int_to_str = {i: s for s, i in vocab.items()}

  def encode(self, text):
    """Convert text into a list of token IDs.

    Args:
      text: string to tokenize.

    Returns:
      List of IDs, one per token, in order of appearance.

    Raises:
      KeyError: if a token is not present in the vocabulary.
    """
    pieces = self._SPLIT_PATTERN.split(text)
    # Whitespace separators and the empty strings between adjacent separators
    # carry no information, so they are dropped here.
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [self.str_to_int[token] for token in tokens]

  def decode(self, ids):
    """Convert a sequence of token IDs back into text.

    Tokens are joined with single spaces and the space in front of punctuation
    is then removed. The original spacing is not recovered exactly: a quote or
    apostrophe is attached to the preceding token and still followed by a
    space, so the tokens of "It's" decode to "It' s".

    Args:
      ids: iterable of integer token IDs.

    Returns:
      The decoded string.

    Raises:
      KeyError: if an ID is not present in the vocabulary.
    """
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [58]:
# Build a tokenizer from the story's vocabulary and encode a two-line sample;
# every token of this sample is in the vocabulary, so encode succeeds
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [59]:
# Map the IDs back to text. decode re-joins tokens with single spaces, so the
# result is not character-identical to the input (e.g. the line break is lost)
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [60]:
# "Hello" is not in the vocabulary, so encode raises KeyError. The error is
# caught and printed so the limitation is shown without aborting a full
# top-to-bottom run of the notebook.
text = "Hello, do you like Tea?"
try:
  ids = tokenizer.encode(text)
except KeyError as err:
  print(f"KeyError: {err}")
else:
  print(ids)

KeyError: 'Hello'