In [1]:
from pathlib import Path
import llm_from_scratch.c2


VERDICT_PATH = Path(llm_from_scratch.c2.__file__).with_name("the-verdict.txt")

RAW_VERDICT_TEXT = VERDICT_PATH.read_text()
print("Total number of character:", len(RAW_VERDICT_TEXT))
print(RAW_VERDICT_TEXT[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import re

text = "Hello, world. This, is a test."
result = re.split(r"(\s)", text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [3]:
result = re.split(r"([,.|\s])", text)
print(result)

result = [word for word in result if word.strip()]
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']
['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [4]:
import re

def tokenize(text : str) -> list[str]:
    result = re.split(r'([,.:;?_!"\(\)\']|--|\s)', text)
    result = [item.strip() for item in result]
    result = [item for item in result if item]
    return result

In [5]:
text = "Hello, world. Is this-- a test?"
print(tokenize(text))

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [6]:
VERDICT_TOKENS = tokenize(RAW_VERDICT_TEXT)
print("The verdict has this many tokens:", len(VERDICT_TOKENS))

The verdict has this many tokens: 4690


In [7]:
print(VERDICT_TOKENS[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [8]:
ALL_WORDS = sorted(set(VERDICT_TOKENS))
VOCAB_SIZE = len(ALL_WORDS)
print("Vocabulary size is:", VOCAB_SIZE)

Vocabulary size is: 1130


In [9]:
VOCAB = {word : idx for idx, word in enumerate(ALL_WORDS)}

In [10]:
for item, _ in zip(VOCAB.items(), range(30)):
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)


In [11]:
class SimpleTokenizerV1:
    def __init__(self, vocab : dict[str, int]) -> None:
        self.str_to_int = vocab            #1
        self.int_to_str = {i:s for s,i in vocab.items()}        #2

    def encode(self, text : str) -> list[int]:         #3
        preprocessed = re.split(r'([,.:;?_!\"\(\)\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed]
        preprocessed = [item for item in preprocessed if item]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids : list[int]) -> str:         #4
        text = " ".join([self.int_to_str[i] for i in ids]) 
        text = re.sub(r'\s+([,.?!\"\(\)\'])', r'\1', text)    #5
        return text


In [12]:
TOKENIZER = SimpleTokenizerV1(VOCAB)

In [13]:
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
ids = TOKENIZER.encode(text)

print("The ids are:")
print(ids)

print("The decoded text is:")
print(TOKENIZER.decode(ids))


The ids are:
[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
The decoded text is:
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [14]:
TOKENIZER.encode("Hello, do you like tea?")

KeyError: 'Hello'

In [79]:
ALL_WORDS = sorted(set(VERDICT_TOKENS))
print("Type of ALL_WORDS is", type(ALL_WORDS))
ALL_WORDS += ["<|endoftext|>", "<|unk|>"]
print("Length of all words is", len(ALL_WORDS))
VOCAB = {word : idx for idx, word in enumerate(ALL_WORDS)}

print("Vocab size is:", len(VOCAB))

for i, item in enumerate(list(VOCAB.items())[-5:]):
    print(item)

Type of ALL_WORDS is <class 'list'>
Length of all words is 1132
Vocab size is: 1132
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [80]:
class SimpleTokenizerV2:
    def __init__(self, vocab : dict[str, int]) -> None:
        self.str_to_int = vocab            #1
        self.int_to_str = {i:s for s,i in vocab.items()}        #2
        self.unk_idx = self.str_to_int["<|unk|>"]

    def encode(self, text : str) -> list[int]:         #3
        preprocessed = re.split(r'([,.:;?_!\"\(\)\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed]
        preprocessed = [item for item in preprocessed if item]
        ids = [self.str_to_int.get(s, self.unk_idx) for s in preprocessed]
        return ids

    def decode(self, ids : list[int]) -> str:         #4
        text = " ".join([self.int_to_str[i] for i in ids]) 
        text = re.sub(r'\s+([,.?!\"\(\)\'])', r'\1', text)    #5
        return text


TOKENIZER = SimpleTokenizerV2(VOCAB)

In [81]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

encoded = TOKENIZER.encode(text)
decoded = TOKENIZER.decode(encoded)

print("Encoded text:")
print(encoded)

print("Decoded text:")
print(decoded)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
Encoded text:
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
Decoded text:
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [15]:
import tiktoken
import sentencepiece
from importlib.metadata import version

print("tiktoken version method 1:", version("tiktoken"), tiktoken.__version__)


print("sentencepiece version", version("sentencepiece"), sentencepiece.__version__)



tiktoken version method 1: 0.8.0 0.8.0
sentencepiece version 0.2.0 0.2.0


In [16]:
GPT2_TOKENIZER = tiktoken.get_encoding("gpt2")


NameError: name 'sene' is not defined

In [7]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = GPT2_TOKENIZER.encode(text, allowed_special={"<|endoftext|>"})
print("Encoded text is: ")
print(integers)

decoded = GPT2_TOKENIZER.decode(integers)
print("Decoded text is: ")
print(decoded)

Encoded text is: 
[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Decoded text is: 
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.
