<a href="https://colab.research.google.com/github/sridhartroy/AIML/blob/main/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download a publicly available text file from a URL (once) and read it into memory.

import os
import urllib.request

url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt")
file_path = ("the-verdict.txt")

# Only hit the network when the file is not already on disk, so re-running this
# cell (or Restart & Run All) does not download the corpus again every time.
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Report the corpus size in characters and preview its first 99 characters.
print("Length of the file is : ", len(text))

print(text[:99])


Length of the file is :  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
# Tokenize the text that was just read with a regular expression, then print the
# number of tokens next to the number of characters in the original text.

import re

# The capture group makes re.split keep the delimiters (punctuation, "--", whitespace)
# as separate items in its result.
pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)

# Strip every piece and drop the ones that end up empty.
preprocessed = []
for piece in pieces:
    token = piece.strip()
    if token:
        preprocessed.append(token)

print(len(preprocessed), len(text))
print(preprocessed[:30])


4690 20479
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [3]:
# Build the vocabulary: remove duplicate tokens, sort them, and assign a unique integer to each one.

all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size, type(all_words))

# A token's position in the sorted list becomes its integer id.
vocab = dict(zip(all_words, range(vocab_size)))

# Preview the first 51 (token, id) pairs.
for item in list(vocab.items())[:51]:
    print(item)


1130 <class 'list'>
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [4]:
# Tokenizer class built on the vocabulary we created. It encodes new text into a list of unique integer ids and decodes a list of ids back into text.
class SimpleTokenizerV1:
    """Word-level tokenizer that maps tokens to integer ids through a fixed vocabulary.

    Args:
        vocab: dictionary mapping each token string to a unique integer id.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab # vocab is a dictionary and hence str_to_int is a dictionary as well
        # Inverse mapping (id -> token), used by decode().
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text): #new input text
        """Split ``text`` into tokens and return the list of their integer ids.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        # ':' and ';' have to be delimiters here too: without them they stay glued
        # to the preceding word (e.g. 'said:'), and that combined token is looked up
        # in the vocabulary instead of 'said' and ':' separately.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed] # creating a list
        return ids

    def decode(self, ids):
        """Turn a list of ids back into text, with tokens separated by single spaces.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])

        # Remove the space that the join put in front of punctuation (now including ':' and ';').
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [5]:
# Instantiate the class above with the vocabulary created earlier from the verdict corpus, then encode and decode sample texts.

def _show_round_trip(tok, sample):
    """Encode ``sample``, print the id count and the ids, print the decoded text, and return the ids."""
    sample_ids = tok.encode(sample)
    print(len(sample_ids), sample_ids)
    print(tok.decode(sample_ids))
    return sample_ids

tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know,"
       Mrs. Gisburn said with pardonable pride."""
ids = _show_round_trip(tokenizer, text)

text1 = """"Mrs. said pride."""
ids1 = _show_round_trip(tokenizer, text1)

21 [1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.
6 [1, 67, 7, 851, 793, 7]
" Mrs. said pride.


In [6]:
# What about words or tokens that are not in the corpus, like below?

text2 = """"Mr. Sridhar said pride."""
try:
    ids2 = tokenizer.encode(text2)
except KeyError as err:
    # The failure is the point of this cell: the tokenizer looks each token up
    # directly in the vocabulary. Catch and display the error so the notebook
    # still runs top to bottom instead of stopping here.
    print("KeyError:", err)
else:
    print(len(ids2), ids2)

    print(tokenizer.decode(ids2))

KeyError: 'Sridhar'

In [7]:
# Add two special tokens to the vocabulary: a. unknown word  b. end of source text

all_tokens = list(set(preprocessed))
all_tokens.sort()
print(len(all_tokens))
all_tokens += ["<|endoftext|>", "<|unk|>"]
print(len(all_tokens))

# A token's position in the list becomes its id, so the special tokens get the two highest ids.
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

# Show the last five entries of the vocabulary.
for item in list(vocab.items())[-5:]:
    print(item)

1130
1132
1132
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [8]:
# Now modify the custom tokenizer class to use the special tokens added above: words that are not in the vocabulary are mapped to <|unk|>.

class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: dictionary mapping each token string to a unique integer id.
            It needs a ``"<|unk|>"`` entry for unknown tokens to be encodable.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab # vocab is a dictionary and hence str_to_int is a dictionary as well
        # Inverse mapping (id -> token), used by decode().
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text): #new input text
        """Split ``text`` into tokens and return the list of their integer ids.

        Tokens missing from the vocabulary are replaced by ``"<|unk|>"``.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``"<|unk|>"`` entry.
        """
        # ':' and ';' have to be delimiters here too: without them they stay glued
        # to the preceding word (e.g. 'said:'), and that combined token is turned
        # into <|unk|> even when 'said' and ':' are both in the vocabulary.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        # Trace of the raw tokens before unknown ones are replaced.
        print("Preprocessed before token check : " , preprocessed)
        # now check for each token in the preprocessed against the vocab.
        preprocessed = [item if item in self.str_to_int
                             else "<|unk|>"
                        for item in preprocessed]

        ids = [self.str_to_int[s] for s in preprocessed] # creating a list
        return ids

    def decode(self, ids):
        """Turn a list of ids back into text, with tokens separated by single spaces.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])

        # Remove the space that the join put in front of punctuation (now including ':' and ';').
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [10]:
# Let's test the new tokenizer class

def _round_trip(tok, sample):
    """Encode ``sample``, print the id count and ids, decode them, print the result; return (ids, decoded text)."""
    token_ids = tok.encode(sample)
    print(len(token_ids), token_ids)
    decoded = tok.decode(token_ids)
    print(decoded)
    return token_ids, decoded

tokenizer = SimpleTokenizerV2(vocab)

# First, a valid text whose tokens all match tokens in the vocab.
text = """"It's the last he painted, you know,"
       Mrs. Gisburn said with pardonable pride."""
ids, text = _round_trip(tokenizer, text)

# Then two unrelated texts, mixed with unknown tokens, joined by the end-of-text marker.
text1 = "the last he painted, Sridhar"
text2 = "Hello, do you like tea?"
text = " <|endoftext|> ".join([text1, text2])

print(text)
ids, text = _round_trip(tokenizer, text)

Preprocessed before token check :  ['"', 'It', "'", 's', 'the', 'last', 'he', 'painted', ',', 'you', 'know', ',', '"', 'Mrs', '.', 'Gisburn', 'said', 'with', 'pardonable', 'pride', '.']
21 [1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.
the last he painted, Sridhar <|endoftext|> Hello, do you like tea?
Preprocessed before token check :  ['the', 'last', 'he', 'painted', ',', 'Sridhar', '<|endoftext|>', 'Hello', ',', 'do', 'you', 'like', 'tea', '?']
14 [988, 602, 533, 746, 5, 1131, 1130, 1131, 5, 355, 1126, 628, 975, 10]
the last he painted, <|unk|> <|endoftext|> <|unk|>, do you like tea?


In [11]:
# using Byte Pair Encoding algorithm for Tokenization
!pip install tiktoken

from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.2 MB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m0.7/1.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0
tiktoken version: 0.8.0


In [12]:
# Switch to tiktoken's "gpt2" encoding and round-trip the same two-part sample text.
tokenizer = tiktoken.get_encoding("gpt2")

text1, text2 = "the last he painted, Sridhar", "Hello, do you like tea?"
text = " <|endoftext|> ".join([text1, text2])

# "<|endoftext|>" is passed in allowed_special so that it may appear in the input text.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
strings = tokenizer.decode(integers)

print(integers)
print(strings)

[1169, 938, 339, 13055, 11, 311, 6058, 9869, 220, 50256, 18435, 11, 466, 345, 588, 8887, 30]
the last he painted, Sridhar <|endoftext|> Hello, do you like tea?


In [22]:
from typing import TextIO
# Exercise 2.1: Byte pair encoding of unknown words
#
# Try the BPE tokenizer from the tiktoken library on the unknown words "Akwirw ier" and
# print the individual token IDs. Then, call the decode function on each of the resulting
# integers in this list to reproduce the mapping shown in figure 2.11. Lastly, call the
# decode method on the token IDs to check whether it can reconstruct the original input,
# "Akwirw ier".
tokenizerR50 = tiktoken.get_encoding("r50k_base")
tokenizerP50 = tiktoken.get_encoding("p50k_base")
tokenizerCl100k = tiktoken.get_encoding("cl100k_base")
tokenizero200k = tiktoken.get_encoding("o200k_base")

text = "Akwirw ier"

# Run the same encode / per-token decode / full decode steps for every encoding.
# The labels are the exact strings printed in front of each id list.
for label, bpe in (
    ("R50 ", tokenizerR50),
    ("P50 ", tokenizerP50),
    ("cl100k ", tokenizerCl100k),
    ("o200k ", tokenizero200k),
):
    integers = bpe.encode(text, allowed_special={"<|endoftext|>"})
    print(label, integers, type(integers))

    # Decode each id on its own to show which piece of text it stands for.
    for i in integers:
        print(bpe.decode([i]), "-->", i)

    print("---------------------------------")

    # Decode the whole id list and check that it reconstructs the original input.
    round_trip = bpe.decode(integers)
    print(round_trip)
    assert round_trip == text, f"{label.strip()} did not reconstruct the input"


R50  [33901, 86, 343, 86, 220, 959] <class 'list'>
Ak --> 33901
w --> 86
ir --> 343
w --> 86
  --> 220
ier --> 959
---------------------------------
Akwirw ier
P50  [33901, 86, 343, 86, 220, 959] <class 'list'>
Ak --> 33901
w --> 86
ir --> 343
w --> 86
  --> 220
ier --> 959
---------------------------------
Akwirw ier
cl100k  [32, 29700, 404, 86, 602, 261] <class 'list'>
A --> 32
kw --> 29700
ir --> 404
w --> 86
 i --> 602
er --> 261
---------------------------------
Akwirw ier
o200k  [32, 9500, 380, 86, 131455] <class 'list'>
A --> 32
kw --> 9500
ir --> 380
w --> 86
 ier --> 131455
---------------------------------
Akwirw ier
