<a href="https://colab.research.google.com/github/saivenkatreddy29/Building-LLM-from-Scratch/blob/main/Chapter_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [2]:
import torch
from importlib.metadata import version

# Print the installed version of each package, one line per package.
for pkg in ('torch', 'tiktoken'):
  print(f'{pkg} version', version(pkg))

torch version 2.5.1+cu121
tiktoken version 0.8.0


In [3]:
import os
import requests
# Local file name and remote location of the short story used as the text corpus.
path = "the-verdict.txt"
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")

# Only hit the network when no local copy exists, so re-running the cell is cheap.
if os.path.exists(path):
  print("The file already exist")
else:
  print("The file doesn't exist downloading.....")
  # requests applies no timeout by default; without one a stalled connection
  # would block this cell forever instead of failing.
  response = requests.get(url, timeout=30)
  if response.status_code == 200:
    print('Download completed')
    # Write the raw bytes untouched; decoding happens when the file is read back.
    with open(path, 'wb') as f:
      f.write(response.content)
  else:
    print('Download Failed')


The file doesn't exist downloading.....
Download completed


In [4]:
import re
# Simplest tokenization: split on every single whitespace character.
# Punctuation stays glued to the neighbouring word.
text = "Hello, world. This, is a test."
whitespace = re.compile(r'\s')
result = whitespace.split(text)
print(result)

['Hello,', 'world.', 'This,', 'is', 'a', 'test.']


In [5]:
# The capture group makes split() keep the delimiters (comma, period,
# whitespace) as separate entries; empty strings appear between adjacent delimiters.
delimiters = re.compile(r'([,.]|\s)')
result = delimiters.split(text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [6]:
# Keep only entries that still contain something after stripping whitespace.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [7]:
text = "Hello, world. Is this-- a test?"

# Wider delimiter set: more punctuation marks plus the double dash.
pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
# Discard empty and whitespace-only pieces.
result = [piece for piece in pieces if piece.strip()]

In [8]:
# Display the filtered token list produced by the previous cell.
result

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']

In [9]:
# Load the whole file into memory as one UTF-8 decoded string.
with open(path, mode="r", encoding="utf-8") as f:
  raw_text = f.read()

In [10]:
# Peek at the first 10 characters to confirm the file was read.
raw_text[:10]

'I HAD alwa'

In [11]:
# Split the full text on punctuation, double dashes and whitespace (delimiters
# are kept via the capture group), strip every piece, then drop the empty ones.
pieces = (piece.strip() for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
preprocessed = [piece for piece in pieces if piece]

In [12]:
# Show the first 20 tokens as a quick sanity check of the split.
print(preprocessed[:20])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was']


In [13]:
# Number of entries currently in `preprocessed`.
len(preprocessed)

4690

In [14]:
# Deduplicate, then sort. NOTE: from here on `preprocessed` holds the sorted
# unique tokens (the vocabulary), no longer the token sequence of the text.
preprocessed = list(set(preprocessed))
preprocessed.sort()
preprocessed[:20]

['!',
 '"',
 "'",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 ';',
 '?',
 'A',
 'Ah',
 'Among',
 'And',
 'Are',
 'Arrt',
 'As',
 'At',
 'Be']

In [15]:
# Forward table: token -> integer id (its position in `preprocessed`).
word_to_in = {word: i for i, word in enumerate(preprocessed)}
# Reverse table: integer id -> token.
in_to_word = dict(enumerate(preprocessed))

In [16]:
# Preview the token -> id table; stop right after printing the entry with id 20.
for token, token_id in word_to_in.items():
  print(f'word:{token},key:{token_id}')
  if token_id >= 20:
    break

word:!,key:0
word:",key:1
word:',key:2
word:(,key:3
word:),key:4
word:,,key:5
word:--,key:6
word:.,key:7
word::,key:8
word:;,key:9
word:?,key:10
word:A,key:11
word:Ah,key:12
word:Among,key:13
word:And,key:14
word:Are,key:15
word:Arrt,key:16
word:As,key:17
word:At,key:18
word:Be,key:19
word:Begin,key:20


In [17]:
# Preview the id -> token table; stop right after printing the entry with id 20.
for token_id, token in in_to_word.items():
  print(f'key:{token_id} word:{token}')
  if token_id >= 20:
    break

key:0 word:!
key:1 word:"
key:2 word:'
key:3 word:(
key:4 word:)
key:5 word:,
key:6 word:--
key:7 word:.
key:8 word::
key:9 word:;
key:10 word:?
key:11 word:A
key:12 word:Ah
key:13 word:Among
key:14 word:And
key:15 word:Are
key:16 word:Arrt
key:17 word:As
key:18 word:At
key:19 word:Be
key:20 word:Begin


In [18]:
class SimpleTokenizerv1():
  """Minimal word-level tokenizer backed by a fixed vocabulary.

  Text is split on punctuation, double dashes and whitespace; every remaining
  piece is looked up in the vocabulary to obtain its integer id.
  """

  def __init__(self,vocabulary):
    """Build the token <-> id lookup tables.

    Args:
      vocabulary: iterable of token strings. A token's id is its position in
        iteration order, so pass an ordered collection (e.g. a sorted list).
    """
    self.word_to_key = {}
    self.key_to_word = {}
    # Use the argument, not a notebook global, so the tokenizer works with any
    # vocabulary and does not depend on hidden kernel state.
    for i,word in enumerate(vocabulary):
      self.word_to_key[word] = i
      self.key_to_word[i] = word

  def encode(self,text):
    """Convert text into a list of token ids.

    Args:
      text: string to tokenize.

    Returns:
      List of integer ids, one per non-empty token.

    Raises:
      KeyError: if a token is not part of the vocabulary.
    """
    # The capture group keeps the delimiters themselves as tokens.
    pro_text = re.split(r'([,.:;?_!"()\']|--|\s)',text)
    # Strip every piece and drop the ones that were empty or pure whitespace.
    pro_text = [item.strip() for item in pro_text if item.strip()]
    return [self.word_to_key[word] for word in pro_text]

  def decode(self,encoded):
    """Convert a sequence of token ids back into text.

    Args:
      encoded: iterable of integer ids produced by `encode`.

    Returns:
      The tokens joined by single spaces, with whitespace removed in front of
      the characters , . ? ! " ( ) and '.

    Raises:
      KeyError: if an id is not part of the vocabulary.
    """
    text = ' '.join(self.key_to_word[key] for key in encoded)
    # Re-attach punctuation to the preceding token.
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text




In [19]:
# Sample passage for the tokenizer: contains quotes, an apostrophe, and a
# newline followed by indentation inside the string.
text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""

In [20]:
# Build the tokenizer from the sorted unique tokens and turn the sample text into ids.
tokenizer = SimpleTokenizerv1(preprocessed)
encoded_text = tokenizer.encode(text)
print(encoded_text)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [21]:
# Map the ids back to text. decode() only removes whitespace *before* punctuation,
# so a space remains after an opening quote or an apostrophe in the printed result.
decoded_text = tokenizer.decode(encoded_text)
print(decoded_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [23]:
# New sample sentence; presumably meant to probe words missing from the
# vocabulary in the section that follows — TODO confirm.
text = "Hello, do you like tea. Is this-- a test?"


Adding Special tokens to the simple tokenizer