# Import data

In [1]:
import urllib.request

In [4]:
# Download the sample text next to the notebook.
# urlretrieve returns a (local filename, response headers) tuple, which is
# what gets displayed as this cell's output.
file_path = "the-verdict.txt"
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
    "the-verdict.txt"
)
urllib.request.urlretrieve(url, filename=file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x730f0cd918b0>)

In [5]:
# Read the whole file into a single string (text read mode is open()'s default).
with open("the-verdict.txt", encoding="utf-8") as f:
    raw_text = f.read()

# Sanity check: overall size plus the first 99 characters of the text.
print(f"Total number of characters: {len(raw_text)}")
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


# Custom tokenizer

In [6]:
import re

In [29]:
# Split on punctuation, "--" and whitespace. The pattern is wrapped in a
# capture group, so the separators themselves are kept in the resulting list.
extracted_content = re.compile(r'([,.:;?_!"()\']|--|\s)').split(raw_text)

In [30]:
len(extracted_content)

9235

#### Number of whitespace tokens (spaces and tabs only)

In [31]:
type(extracted_content)

list

In [32]:
# Count the pieces that are exactly one space or one tab
# (newline pieces are not included in this count).
sum(1 for piece in extracted_content if piece in (' ', '\t'))

3551

#### removing white spaces

In [33]:
x = 'tst '

In [34]:
x.strip()

'tst'

In [35]:
# Strip each piece once and keep only the ones that are non-empty afterwards;
# this drops the whitespace separators and the empty strings left by re.split.
extracted_content = [piece for piece in map(str.strip, extracted_content) if piece]

In [36]:
len(extracted_content)

4690

In [37]:
extracted_content[:30]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in']

## Assign tokenIDs

In [38]:
# unique tokens
vocab = set(extracted_content)

In [41]:
len(vocab)

1130

In [42]:
# sorting alphabetically
vocab = sorted(vocab)

In [43]:
vocab[:10]

['!', '"', "'", '(', ')', ',', '--', '.', ':', ';']

In [44]:
# Map every token to its position in the sorted vocabulary (token -> id).
vocab = dict(zip(vocab, range(len(vocab))))

In [46]:
# Preview the first 11 (token, id) pairs of the vocabulary.
for item in list(vocab.items())[:11]:
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)


## Tokenizer:V1

In [47]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and single punctuation characters, and
    every resulting token is looked up in ``vocab``. There is no fallback for
    unknown tokens: encoding a token that is missing from the vocabulary raises
    ``KeyError``.
    """

    # Must match the separator set used to build the vocabulary. Without ':'
    # and ';' here, input such as "know;" stays a single token that is not in
    # the vocabulary.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly in front of these marks is removed when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: dict mapping token string -> integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: if any token of ``text`` is not in the vocabulary.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        # Drop whitespace separators and the empty strings produced by split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Tokens are joined with single spaces, then whitespace in front of
        punctuation is removed (``\\1`` re-inserts the matched mark). Only
        whitespace *before* the listed marks is removed, so the result is not
        guaranteed to equal the originally encoded text.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

##### NOTES

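* `r'\1'` is a regex backreference to the first capture group of the pattern. In `decode`, the pattern `\s+([,.?!"()\'])` matches whitespace followed by a punctuation mark, and replacing the match with `\1` keeps only the punctuation mark, i.e. it removes the whitespace in front of it.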


In [48]:
# tokenizer
simpletokenizer_v1 = SimpleTokenizerV1(vocab)

In [51]:
# The literal opens with four quote characters: the first three start the
# triple-quoted string and the fourth is the opening '"' of the quoted speech.
# The text spans two lines, so it also contains a newline character.
test = """"It's the last he painted, you know,"
Mrs.Gisburn said with pardonable pride."""
# Encoding raises KeyError if any token is missing from the vocabulary.
test_ids = simpletokenizer_v1.encode(test)

In [52]:
test_ids

[1,
 56,
 2,
 850,
 988,
 602,
 533,
 746,
 5,
 1126,
 596,
 5,
 1,
 67,
 7,
 38,
 851,
 1108,
 754,
 793,
 7]

In [53]:
simpletokenizer_v1.decode(test_ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [54]:
print(simpletokenizer_v1.decode(test_ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


## Tokenizer:V2

### Special tokens: `<|unk|>` and `<|endoftext|>`

In [55]:
alltokens = sorted(set(extracted_content))

In [56]:
# Append the special tokens only if they are missing, so re-running this cell
# does not add duplicates (duplicates would shift the ids assigned when the
# list is enumerated into the vocabulary dict).
for special_token in ("<|endoftext|>", "<|unk|>"):
    if special_token not in alltokens:
        alltokens.append(special_token)

In [57]:
vocab_dict = {token:integer for integer, token in enumerate(alltokens)}

In [58]:
len(vocab_dict)

1132

In [61]:
# CHECKING: the two special tokens should be the last entries of the vocabulary
for item in list(vocab_dict.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [63]:
'your' in vocab_dict

True

In [78]:
# `in` on a dict tests its keys (the token strings), not its values (the ids),
# so an integer id is reported as not being in the vocabulary.
1127 in vocab_dict

False

In [64]:
class SimpleTokenizerV2:
    """Word-level tokenizer with an unknown-token fallback.

    Text is split on whitespace, ``--`` and single punctuation characters.
    Tokens that are not in ``vocab`` are encoded as ``"<|unk|>"``, so ``vocab``
    needs an entry for ``"<|unk|>"`` whenever out-of-vocabulary text is encoded.
    """

    # Must match the separator set used to build the vocabulary. Without ':'
    # and ';' here, input such as "know;" stays a single token and is silently
    # turned into "<|unk|>".
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly in front of these marks is removed when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: dict mapping token string -> integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Out-of-vocabulary tokens are mapped to the id of ``"<|unk|>"``.

        Raises:
            KeyError: if an unknown token is met and ``"<|unk|>"`` itself is
                not in the vocabulary.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        # Drop whitespace separators and the empty strings produced by split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [
            self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
            for token in tokens
        ]

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Tokens are joined with single spaces, then whitespace in front of
        punctuation is removed (``\\1`` re-inserts the matched mark). Only
        whitespace *before* the listed marks is removed, so the result is not
        guaranteed to equal the originally encoded text.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [66]:
# testing
test1 = "Hello, do you like tea?"

simpletokenizer_v2 = SimpleTokenizerV2(vocab_dict)

In [68]:
output1 = simpletokenizer_v2.encode(test1)

In [69]:
simpletokenizer_v2.decode(output1)

'<|unk|>, do you like tea?'

In [79]:
test2 = "In the sunlit terraces of the palace."
test3 = "With some sugar."
test = " <|endoftext|> ".join([test1, test2, test3])
test

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace. <|endoftext|> With some sugar.'

In [80]:
output2 = simpletokenizer_v2.encode(test)

In [81]:
simpletokenizer_v2.decode(output2)

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>. <|endoftext|> <|unk|> some sugar.'

# Byte-Pair encoding