<a href="https://colab.research.google.com/github/sridhartroy/AIML/blob/main/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Read a publicly available text file from a URL (downloaded once, then reused from disk).

import os
import urllib.request

url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt"
file_path = "the-verdict.txt"

# Skip the network round-trip when the file is already on disk, so re-running
# the cell (or working offline after the first run) does not download it again.
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

print("Length of the file is : ", len(text))

# Preview the first 99 characters as a quick sanity check.
print(text[:99])


Length of the file is :  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
# Tokenize the text that was just read with a regular expression and compare
# the number of tokens with the number of characters in the raw text.

import re

# The capturing group keeps every delimiter (punctuation, "--", whitespace)
# in the result, so punctuation survives as a token of its own.
raw_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)

# Trim each piece and drop the ones that are empty or whitespace-only.
preprocessed = []
for piece in raw_pieces:
    trimmed = piece.strip()
    if trimmed:
        preprocessed.append(trimmed)

print(len(preprocessed), len(text))

print(preprocessed[:30])


4690 20479
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [4]:
# Build the vocabulary: de-duplicate the tokens, sort them, and give every
# distinct token its position in the sorted order as a unique integer id.

all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size, type(all_words))

vocab = dict(zip(all_words, range(vocab_size)))

# Show the first 51 (token, id) pairs as a sanity check.
for item in list(vocab.items())[:51]:
    print(item)


1130 <class 'list'>
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [5]:
# Tokenizer Class that takes in the vocab that we created. And also, we send a sample new text for tokenization and encoding to an unique integer id and then decode as well.
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Args:
        vocab: dict mapping each token string to a unique integer id.

    Tokens that are missing from ``vocab`` make ``encode`` raise ``KeyError``.
    """

    # Delimiters to split on. This is the same set (including ':' and ';')
    # that was used to tokenize the corpus when the vocabulary was built;
    # using a smaller set here would leave e.g. "word:" unsplit and therefore
    # unknown, even though "word" and ":" are both in the vocabulary.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'

    # Whitespace followed by one of these punctuation marks is collapsed on decode.
    _PUNCT_SPACING_PATTERN = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        self.str_to_int = vocab  # token -> id
        self.int_to_str = {i: s for s, i in vocab.items()}  # id -> token

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        Raises:
            KeyError: if a token is not part of the vocabulary.
        """
        preprocessed = re.split(self._SPLIT_PATTERN, text)
        # Trim the pieces and drop empty / whitespace-only ones.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and return the text.

        The space that the join puts in front of punctuation is removed again.
        """
        text = " ".join([self.int_to_str[i] for i in ids])

        text = re.sub(self._PUNCT_SPACING_PATTERN, r'\1', text)
        return text

In [6]:
# Instantiate the tokenizer with the vocabulary built from the verdict corpus,
# then encode each sample to ids and decode the ids back to text.

tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know,"
       Mrs. Gisburn said with pardonable pride."""
text1 = """"Mrs. said pride."""

for sample in (text, text1):
    sample_ids = tokenizer.encode(sample)
    print(len(sample_ids), sample_ids)

    print(tokenizer.decode(sample_ids))

21 [1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.
6 [1, 67, 7, 851, 793, 7]
" Mrs. said pride.


In [7]:
# What about words or tokens that are not in the corpus, like below?

text2 = """"Mr. Sridhar said pride."""
try:
    ids2 = tokenizer.encode(text2)
    print(len(ids2), ids2)

    print(tokenizer.decode(ids2))
except KeyError as err:
    # The tokenizer has no entry for a token outside its vocabulary, so the
    # lookup fails. The error is the point of this cell: show it, but keep the
    # notebook runnable from top to bottom.
    print(f"KeyError: {err}")

KeyError: 'Sridhar'

In [8]:
# Extend the vocabulary with two special tokens:
#   <|endoftext|>  marks the end of a source text,
#   <|unk|>        stands in for tokens that are not in the vocabulary.

all_tokens = sorted(set(preprocessed))
print(len(all_tokens))
all_tokens += ["<|endoftext|>", "<|unk|>"]
print(len(all_tokens))

vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

# The two special tokens were appended last, so they show up at the end.
for item in list(vocab.items())[-5:]:
    print(item)

1130
1132
1132
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [9]:
# Now need to modify the tokenizer custom class to include above

class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: dict mapping each token string to a unique integer id. It must
            contain ``"<|unk|>"`` for unknown tokens to be encodable.
    """

    # Delimiters to split on. This is the same set (including ':' and ';')
    # that was used to tokenize the corpus when the vocabulary was built;
    # with a smaller set "word:" would stay unsplit and be turned into
    # <|unk|> even though "word" and ":" are both in the vocabulary.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'

    # Whitespace followed by one of these punctuation marks is collapsed on decode.
    _PUNCT_SPACING_PATTERN = r'\s+([,.:;?!"()\'])'

    _UNKNOWN_TOKEN = "<|unk|>"

    def __init__(self, vocab):
        self.str_to_int = vocab  # token -> id
        self.int_to_str = {i: s for s, i in vocab.items()}  # id -> token

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        Tokens that are not in the vocabulary are replaced by ``<|unk|>``
        before the id lookup. The token list is printed before that
        replacement.

        Raises:
            KeyError: if an unknown token occurs and ``<|unk|>`` itself is
                missing from the vocabulary.
        """
        preprocessed = re.split(self._SPLIT_PATTERN, text)
        # Trim the pieces and drop empty / whitespace-only ones.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        print("Preprocessed before token check : " , preprocessed)
        # Check each token against the vocabulary; unknown ones become <|unk|>.
        preprocessed = [item if item in self.str_to_int
                             else self._UNKNOWN_TOKEN
                        for item in preprocessed]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and return the text.

        The space that the join puts in front of punctuation is removed again.
        """
        text = " ".join([self.int_to_str[i] for i in ids])

        text = re.sub(self._PUNCT_SPACING_PATTERN, r'\1', text)
        return text

In [10]:
# Let's test the new tokenizer class


def _round_trip(sample):
    """Encode `sample`, print the ids, decode them again and print the text."""
    sample_ids = tokenizer.encode(sample)
    print(len(sample_ids), sample_ids)

    decoded = tokenizer.decode(sample_ids)
    print(decoded)
    return sample_ids, decoded


tokenizer = SimpleTokenizerV2(vocab)

# 1) Text whose tokens all exist in the vocabulary.
text = """"It's the last he painted, you know,"
       Mrs. Gisburn said with pardonable pride."""
ids, text = _round_trip(text)

# 2) Two unrelated texts, joined by the end-of-text marker and containing
#    tokens that are not in the vocabulary.
text1 = "the last he painted, Sridhar"
text2 = "Hello, do you like tea?"
text = f"{text1} <|endoftext|> {text2}"

print(text)
ids, text = _round_trip(text)

Preprocessed before token check :  ['"', 'It', "'", 's', 'the', 'last', 'he', 'painted', ',', 'you', 'know', ',', '"', 'Mrs', '.', 'Gisburn', 'said', 'with', 'pardonable', 'pride', '.']
21 [1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.
the last he painted, Sridhar <|endoftext|> Hello, do you like tea?
Preprocessed before token check :  ['the', 'last', 'he', 'painted', ',', 'Sridhar', '<|endoftext|>', 'Hello', ',', 'do', 'you', 'like', 'tea', '?']
14 [988, 602, 533, 746, 5, 1131, 1130, 1131, 5, 355, 1126, 628, 975, 10]
the last he painted, <|unk|> <|endoftext|> <|unk|>, do you like tea?


In [11]:
# using Byte Pair Encoding algorithm for Tokenization
!pip install tiktoken

from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0
tiktoken version: 0.8.0


In [12]:
# Load the "gpt2" byte pair encoding that ships with tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")


text1 = "the last he painted, Sridhar"
text2 = "Hello, do you like tea?"
text = f"{text1} <|endoftext|> {text2}"

# <|endoftext|> is passed in allowed_special so that it is encoded as the
# special token rather than treated as ordinary text.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

# Decoding the ids reproduces the input, including the out-of-corpus word.
strings = tokenizer.decode(integers)
print(strings)

[1169, 938, 339, 13055, 11, 311, 6058, 9869, 220, 50256, 18435, 11, 466, 345, 588, 8887, 30]
the last he painted, Sridhar <|endoftext|> Hello, do you like tea?


In [13]:
from typing import TextIO
# Exercise 2.1 Byte pair encoding of unknown words
#
# Try the BPE tokenizer from the tiktoken library on the unknown words
# "Akwirw ier" and print the individual token IDs. Then, call the decode
# function on each of the resulting integers in this list to reproduce the
# mapping shown in figure 2.11. Lastly, call the decode method on the token
# IDs to check whether it can reconstruct the original input, "Akwirw ier."
#
# The same experiment is repeated for four tiktoken encodings to compare how
# each one splits the unknown words.

tokenizerR50 = tiktoken.get_encoding("r50k_base")
tokenizerP50 = tiktoken.get_encoding("p50k_base")
tokenizerCl100k = tiktoken.get_encoding("cl100k_base")
tokenizero200k = tiktoken.get_encoding("o200k_base")

text = "Akwirw ier"


def show_bpe_round_trip(label, encoding, sample):
    """Encode `sample`, print every token with its id, then the decoded text.

    Args:
        label: prefix printed in front of the list of token ids.
        encoding: object with `encode(text, allowed_special=...)` and
            `decode(ids)` methods (here: a tiktoken encoding).
        sample: text to encode.

    Returns:
        The list of token ids produced for `sample`.
    """
    token_ids = encoding.encode(sample, allowed_special={"<|endoftext|>"})
    print(label, token_ids, type(token_ids))

    # Decode one id at a time to see which piece of text each token covers.
    for token_id in token_ids:
        print(encoding.decode([token_id]), "-->", token_id)

    print("---------------------------------")

    # Decoding the full id list should give back the original text.
    print(encoding.decode(token_ids))
    return token_ids


for label, encoding in (
    ("R50 ", tokenizerR50),
    ("P50 ", tokenizerP50),
    ("cl100k ", tokenizerCl100k),
    ("o200k ", tokenizero200k),
):
    integers = show_bpe_round_trip(label, encoding, text)


R50  [33901, 86, 343, 86, 220, 959] <class 'list'>
Ak --> 33901
w --> 86
ir --> 343
w --> 86
  --> 220
ier --> 959
---------------------------------
Akwirw ier
P50  [33901, 86, 343, 86, 220, 959] <class 'list'>
Ak --> 33901
w --> 86
ir --> 343
w --> 86
  --> 220
ier --> 959
---------------------------------
Akwirw ier
cl100k  [32, 29700, 404, 86, 602, 261] <class 'list'>
A --> 32
kw --> 29700
ir --> 404
w --> 86
 i --> 602
er --> 261
---------------------------------
Akwirw ier
o200k  [32, 9500, 380, 86, 131455] <class 'list'>
A --> 32
kw --> 9500
ir --> 380
w --> 86
 ier --> 131455
---------------------------------
Akwirw ier


In [14]:
with open("the-verdict.txt", "r", encoding="utf-8") as verdict_file:
    raw_text = verdict_file.read()

# Encode the whole story with the tokenizer created in an earlier cell.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text), type(enc_text))

# Skip the first 50 tokens and work with the remainder.
enc_sample = enc_text[50:]

# Inputs and targets are the same window, with the targets shifted one token
# to the right: the target at each position is the token that comes next.
context_size = 10
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]
print(f"x: {x}")
print(f"y:      {y}")

# Next-token prediction tasks as token ids: growing context -> next token.
for split_at in range(1, context_size + 1):
    print(f"{enc_sample[:split_at]} ----> {enc_sample[split_at]}")

# The same tasks, decoded back to text.
for split_at in range(1, context_size + 1):
    context_text = tokenizer.decode(enc_sample[:split_at])
    desired_text = tokenizer.decode([enc_sample[split_at]])
    print(f"{context_text} ----> {desired_text}")

5145 <class 'list'>
x: [290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686]
y:      [4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976]
[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257
[290, 4920, 2241, 287, 257] ----> 4489
[290, 4920, 2241, 287, 257, 4489] ----> 64
[290, 4920, 2241, 287, 257, 4489, 64] ----> 319
[290, 4920, 2241, 287, 257, 4489, 64, 319] ----> 262
[290, 4920, 2241, 287, 257, 4489, 64, 319, 262] ----> 34686
[290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686] ----> 41976
 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a
 and established himself in a ---->  vill
 and established himself in a vill ----> a
 and established himself in a villa ---->  on
 and established himself in a villa on ---->  the
 and established himself in a villa on the ---->  Riv
 and established himself in a villa on the Riv ----> iera


In [15]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
pip install torch==2.4.0

Collecting torch==2.4.0
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.4.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.4.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.4.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.4.0)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-many

In [16]:
import torch
torch.__version__
#torch.cuda.is_available()

'2.5.1+cu121'

In [19]:
import torch

# Tensors of rank 0 to 3, built from a Python scalar and (nested) lists.
tensor0d = torch.tensor(1)
tensor1d = torch.tensor([1.1, 2, 3])
tensor2d = torch.tensor([[1, 2, 3], [3, 4, 5]])
tensor3d = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])

for sample_tensor in (tensor0d, tensor1d, tensor2d, tensor3d):
    print(sample_tensor)

# A list that contains a float yields a floating point tensor.
print(tensor1d.dtype)

# NOTE: `tensor0d` is rebound here to a 1-D tensor of Python ints; later
# cells that print `tensor0d` see this tensor, not the scalar from above.
tensor0d = torch.tensor([1, 2, 3])
print(tensor0d.dtype)

# .to() returns a converted copy; the integer tensor itself is unchanged.
tensor0df = tensor0d.to(torch.float32)
print(tensor0df.dtype, tensor0d, tensor0df, sep="\n")

tensor(1)
tensor([1.1000, 2.0000, 3.0000])
tensor([[1, 2, 3],
        [3, 4, 5]])
tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]])
torch.float32
torch.int64
torch.float32
tensor([1, 2, 3])
tensor([1., 2., 3.])


In [20]:
# Print every sample tensor together with its shape.
for sample_tensor in (tensor0d, tensor1d, tensor2d, tensor3d):
    print(sample_tensor, sample_tensor.shape)

# reshape and view both lay the six values out as 3 rows x 2 columns.
for relayout in (tensor2d.reshape, tensor2d.view):
    print(relayout(3, 2))

# .T is the transpose: rows become columns.
print(tensor2d.T)

tensor([1, 2, 3]) torch.Size([3])
tensor([1.1000, 2.0000, 3.0000]) torch.Size([3])
tensor([[1, 2, 3],
        [3, 4, 5]]) torch.Size([2, 3])
tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]]) torch.Size([2, 2, 2])
tensor([[1, 2],
        [3, 3],
        [4, 5]])
tensor([[1, 2],
        [3, 3],
        [4, 5]])
tensor([[1, 3],
        [2, 4],
        [3, 5]])


In [21]:
tensor2d_t = tensor2d.T

print(tensor2d)
print("**")
print(tensor2d_t)
print("MatMul")
# Two spellings of the same matrix product.
print(torch.matmul(tensor2d, tensor2d_t))
print(tensor2d @ tensor2d_t)

tensor([[1, 2, 3],
        [3, 4, 5]])
**
tensor([[1, 3],
        [2, 4],
        [3, 5]])
MatMul
tensor([[14, 26],
        [26, 50]])
tensor([[14, 26],
        [26, 50]])


In [22]:
# Seeing models as computational graphs:
# the forward pass of a logistic regression with a single input feature.

import torch.nn.functional as F

y = torch.tensor([1.0])    # true label
x1 = torch.tensor([1.1])   # input feature
w1 = torch.tensor([2.2])   # weight
b = torch.tensor([0.0])    # bias

z = w1 * x1 + b            # net input
a = z.sigmoid()            # activation / predicted probability
loss = F.binary_cross_entropy(input=a, target=y)

In [24]:
# Computing the gradients of the loss with torch's autograd.

import torch.nn.functional as F
from torch.autograd import grad

y = torch.tensor([1.0])                       # true label
x1 = torch.tensor([1.1])                      # input feature
# requires_grad=True makes autograd track operations on the parameters.
w1 = torch.tensor([2.2], requires_grad=True)  # weight
b = torch.tensor([0.0], requires_grad=True)   # bias

z = w1 * x1 + b
a = z.sigmoid()

loss = F.binary_cross_entropy(input=a, target=y)

# retain_graph=True keeps the graph so it can be differentiated again.
grad_L_w1 = grad(loss, w1, retain_graph=True)
grad_L_b = grad(loss, b, retain_graph=True)

# grad() returns a tuple with one tensor per requested input.
for partial in (grad_L_w1, grad_L_b):
    print(partial)

print('******************************')

loss.backward()
print(w1.grad)
print(b.grad)

(tensor([-0.0898]),)
(tensor([-0.0817]),)
******************************
tensor([-0.0898])
tensor([-0.0817])


In [25]:
# Implement a multi-layer perceptron with 2 hidden layers
import torch.nn as M

class NeuralNetwork(torch.nn.Module):
    """Multi-layer perceptron with two hidden layers of 30 and 20 units.

    Args:
        num_inputs: number of input features.
        num_outputs: number of output units (one logit per unit).
    """

    def __init__(self, num_inputs, num_outputs):
        super().__init__()

        nn = torch.nn
        stack = [
            nn.Linear(num_inputs, 30),   # 1st hidden layer
            nn.ReLU(),
            nn.Linear(30, 20),           # 2nd hidden layer
            nn.ReLU(),
            nn.Linear(20, num_outputs),  # output layer
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw outputs (logits) of the network for `x`."""
        return self.layers(x)

In [26]:
# Instantiate the network defined above: 50 input features, 3 outputs.

model = NeuralNetwork(50, 3)

print(model)

# Count the parameters: the trainable ones (requires_grad) and all of them.
all_parameters = list(model.parameters())

num_params = sum(param.numel() for param in all_parameters if param.requires_grad)
print("Total number of trainable model parameters:", num_params)

total_params = sum(param.numel() for param in all_parameters)
print("Total number of model parameters:", total_params)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)
Total number of trainable model parameters: 2213
Total number of model parameters: 2213


In [27]:
# Inspect the parameters of the first linear layer.
first_layer = model.layers[0]

print("Weight Matrix")
weight = first_layer.weight
print(weight, weight.shape, type(model.layers), type(first_layer), weight.dtype)

print("Bias Vector")
bias = first_layer.bias
print(bias, bias.shape, type(model.layers), type(first_layer), bias.dtype)

Weight Matrix
Parameter containing:
tensor([[ 0.0961,  0.0406,  0.0280,  ..., -0.0892, -0.0472,  0.0694],
        [ 0.1102,  0.0287,  0.1392,  ..., -0.1153, -0.0418,  0.0494],
        [ 0.0796,  0.1122, -0.0747,  ...,  0.1070,  0.1269, -0.0701],
        ...,
        [-0.1360, -0.0838,  0.0347,  ..., -0.0224, -0.0029,  0.1131],
        [ 0.0235,  0.1116,  0.0919,  ...,  0.0821,  0.0876,  0.0159],
        [ 0.0993,  0.0129, -0.0756,  ..., -0.0600, -0.1297,  0.0031]],
       requires_grad=True) torch.Size([30, 50]) <class 'torch.nn.modules.container.Sequential'> <class 'torch.nn.modules.linear.Linear'> torch.float32
Bias Vector
Parameter containing:
tensor([ 0.0095, -0.0167, -0.0445, -0.0265,  0.1330,  0.1158, -0.0488, -0.1341,
         0.0900,  0.0611,  0.0304,  0.0983, -0.1272,  0.0835,  0.0755,  0.1120,
         0.1164, -0.1169, -0.0593,  0.1052,  0.0354,  0.1110,  0.0162, -0.0457,
        -0.1370, -0.0100,  0.0950,  0.0347,  0.1160, -0.1285],
       requires_grad=True) torch.Size([30]

In [28]:
# Seed the random number generator before creating the model so that the
# initial weights are the same on every run.
torch.manual_seed(123)
model = NeuralNetwork(50, 3)
print(model)

# Entries 0, 2 and 4 of the Sequential container are the Linear layers.
for layer_index in (0, 2, 4):
    linear = model.layers[layer_index]
    print(linear.weight.shape, linear.bias.shape)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)
torch.Size([30, 50]) torch.Size([30])
torch.Size([20, 30]) torch.Size([20])
torch.Size([3, 20]) torch.Size([3])


In [32]:
# One random example with 50 features, pushed through the model.
torch.manual_seed(123)
X = torch.rand((1, 50))
print(X)

out1 = model(X)                    # raw logits
out = torch.softmax(out1, dim=1)   # logits turned into probabilities
print(out)
print(out1)

# Entries 0, 2 and 4 of the Sequential container are the Linear layers.
for layer_index in (0, 2, 4):
    linear = model.layers[layer_index]
    print(linear.weight.shape, linear.bias.shape)


tensor([[0.2961, 0.5166, 0.2517, 0.6886, 0.0740, 0.8665, 0.1366, 0.1025, 0.1841,
         0.7264, 0.3153, 0.6871, 0.0756, 0.1966, 0.3164, 0.4017, 0.1186, 0.8274,
         0.3821, 0.6605, 0.8536, 0.5932, 0.6367, 0.9826, 0.2745, 0.6584, 0.2775,
         0.8573, 0.8993, 0.0390, 0.9268, 0.7388, 0.7179, 0.7058, 0.9156, 0.4340,
         0.0772, 0.3565, 0.1479, 0.5331, 0.4066, 0.2318, 0.4545, 0.9737, 0.4606,
         0.5159, 0.4220, 0.5786, 0.9455, 0.8057]])
tensor([[0.3113, 0.3934, 0.2952]], grad_fn=<SoftmaxBackward0>)
tensor([[-0.1262,  0.1080, -0.1792]], grad_fn=<AddmmBackward0>)
torch.Size([30, 50]) torch.Size([30])
torch.Size([20, 30]) torch.Size([20])
torch.Size([3, 20]) torch.Size([3])


In [35]:
# Inference only: inside torch.no_grad() the forward pass is not tracked by
# autograd, so the printed logits carry no grad_fn.
with torch.no_grad():
    out = model(X)
print(out)

tensor([[-0.1262,  0.1080, -0.1792]])


In [36]:
# Same as above, with softmax applied across dim=1 to turn the logits into
# probabilities.
with torch.no_grad():
    out = torch.softmax(model(X), dim=1)
print(out)

tensor([[0.3113, 0.3934, 0.2952]])


In [37]:
# Setting up efficient data loaders and data sets

# Toy training set: five examples with two features each.
X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
])
# One class label (0 or 1) per training example.
y_train = torch.tensor([0, 0, 0, 1, 1])

# Toy test set: two examples with two features each.
X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6],
])
# One class label per test example.
y_test = torch.tensor([0, 1])


In [42]:
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    """Dataset that serves (features, label) pairs from two parallel tensors.

    Args:
        X: features, indexed by example along the first dimension.
        y: labels, one per example.
    """

    def __init__(self, X, y):
        self.features, self.labels = X, y
        print(type(self.features), type(self.labels))

    def __getitem__(self, index):
        """Return the features and the label stored at position `index`."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of examples (size of the labels' first dimension)."""
        return self.labels.shape[0]

# Wrap the toy tensors in datasets.
train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)


# The datasets hold the very tensors they were given.
for source_tensor in (X_train, y_train):
    print(source_tensor, source_tensor.shape)
print(train_ds.features, train_ds.labels, len(train_ds))

<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
tensor([[-1.2000,  3.1000],
        [-0.9000,  2.9000],
        [-0.5000,  2.6000],
        [ 2.3000, -1.1000],
        [ 2.7000, -1.5000]]) torch.Size([5, 2])
tensor([0, 0, 0, 1, 1]) torch.Size([5])
tensor([[-1.2000,  3.1000],
        [-0.9000,  2.9000],
        [-0.5000,  2.6000],
        [ 2.3000, -1.1000],
        [ 2.7000, -1.5000]]) tensor([0, 0, 0, 1, 1]) 5


In [43]:
from torch.utils.data import DataLoader


# Fix the seed so the shuffling of the training data is reproducible.
torch.manual_seed(123)

# Settings shared by both loaders.
loader_kwargs = dict(batch_size=2, num_workers=0)

# Training batches are shuffled; the test batches keep their original order.
train_loader = DataLoader(dataset=train_ds, shuffle=True, **loader_kwargs)

test_loader = DataLoader(dataset=test_ds, shuffle=False, **loader_kwargs)

print(train_ds.features, train_ds.labels)
print("**************************************")

en=enumerate(train_loader)
print(en)
print("**************************************")
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx+1}:", x, y)

tensor([[-1.2000,  3.1000],
        [-0.9000,  2.9000],
        [-0.5000,  2.6000],
        [ 2.3000, -1.1000],
        [ 2.7000, -1.5000]]) tensor([0, 0, 0, 1, 1])
**************************************
<enumerate object at 0x7ef284e84ef0>
**************************************
Batch 1: tensor([[ 2.7000, -1.5000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2: tensor([[ 2.3000, -1.1000],
        [-1.2000,  3.1000]]) tensor([1, 0])
Batch 3: tensor([[-0.5000,  2.6000]]) tensor([0])


In [44]:
# Same training loader as before, but with drop_last=True: the five examples
# do not divide evenly into batches of two, and the incomplete final batch is
# left out.
train_loader = DataLoader(
    train_ds, batch_size=2, shuffle=True, num_workers=0, drop_last=True
)

for batch_number, (x, y) in enumerate(train_loader, start=1):
    print(f"Batch {batch_number}:", x, y)

Batch 1: tensor([[-0.5000,  2.6000],
        [-0.9000,  2.9000]]) tensor([0, 0])
Batch 2: tensor([[-1.2000,  3.1000],
        [ 2.3000, -1.1000]]) tensor([0, 1])


In [45]:
# Now that a dataset and a dataloader are defined, use them to train a freshly
# initialised NeuralNetwork (2 input features, 2 classes) on the toy data.

import torch.nn.functional as F

torch.manual_seed(123)
model = NeuralNetwork(num_inputs=2, num_outputs=2)
optimizer = torch.optim.SGD(
    model.parameters(), lr=0.5
)

num_epochs = 3
for epoch in range(num_epochs):

    model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):

        logits = model(features)

        # cross_entropy takes the raw logits and the integer class labels.
        loss = F.cross_entropy(logits, labels)

        # Clear the gradients of the previous step before computing new ones.
        # Once per iteration is enough: a second reset after step() would
        # only be repeated by this call at the start of the next iteration.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ### LOGGING
        print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
              f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
              f" | Train Loss: {loss:.2f}")

    model.eval()
    # Insert optional model evaluation code

Epoch: 001/003 | Batch 000/002 | Train Loss: 0.75
Epoch: 001/003 | Batch 001/002 | Train Loss: 0.65
Epoch: 002/003 | Batch 000/002 | Train Loss: 0.44
Epoch: 002/003 | Batch 001/002 | Train Loss: 0.13
Epoch: 003/003 | Batch 000/002 | Train Loss: 0.03
Epoch: 003/003 | Batch 001/002 | Train Loss: 0.00


In [47]:

# Count the trainable parameters of the model that was just trained.
num_params = 0
for param in model.parameters():
    if param.requires_grad:
        num_params += param.numel()

print("Total number of trainable model parameters:", num_params)

Total number of trainable model parameters: 752


In [48]:
# The model above was trained on X_train / y_train. As a quick check, run it
# on the entire training set (typically a validation dataset and then a test
# dataset would be used instead).
model.eval()
with torch.no_grad():
    outputs = model(X_train)
print(outputs)


# Apply softmax to turn the logits into probabilities.

torch.set_printoptions(sci_mode=False)
probas = outputs.softmax(dim=1)
print(probas)

# The predicted class is the index of the largest probability ...

predictions = probas.argmax(dim=1)
print(predictions)

# ... and argmax on the raw logits gives the same predictions here.
predictions = outputs.argmax(dim=1)
print(predictions)

tensor([[ 2.8569, -4.1618],
        [ 2.5382, -3.7548],
        [ 2.0944, -3.1820],
        [-1.4814,  1.4816],
        [-1.7176,  1.7342]])
tensor([[    0.9991,     0.0009],
        [    0.9982,     0.0018],
        [    0.9949,     0.0051],
        [    0.0491,     0.9509],
        [    0.0307,     0.9693]])
tensor([0, 0, 0, 1, 1])
tensor([0, 0, 0, 1, 1])


In [55]:
# evaluation: element-wise comparison of the predictions with the labels.
# (Only the last expression of a cell is displayed automatically, so these
# intermediate results are printed explicitly.)

print(predictions == y_train)

# no. of correct predictions

print(torch.sum(predictions == y_train))

# function to compute the prediction accuracy

def compute_accuracy(model, dataloader):
    """Return the fraction of examples in `dataloader` that `model` classifies correctly.

    Args:
        model: module that maps a batch of features to one row of logits per
            example; it is switched to eval mode.
        dataloader: iterable that yields (features, labels) batches.

    Returns:
        The accuracy as a Python float.
    """
    model = model.eval()
    correct = 0.0
    total_examples = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for features, labels in dataloader:
            # The predicted class is the index of the largest logit per row.
            predictions = model(features).argmax(dim=1)
            hits = labels == predictions
            correct += hits.sum()
            total_examples += len(hits)

    accuracy = correct / total_examples
    return accuracy.item()

In [56]:
# Report the prediction accuracy of the trained model, first on the training
# loader and then on the test loader.

for label, loader in (
    ("Training Accuracy : ", train_loader),
    ("Test Accuracy : ", test_loader),
):
    print(label, compute_accuracy(model, loader))

Training Accuracy :  1.0
Test Accuracy :  1.0


In [57]:
#  Continuing from cell 14: we now need to convert the token ids that the BPE tokenizer created into embeddings.

# Instantiate Dataset

import torch
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. Every window of `max_length` ids becomes an
    input, and the same window shifted one position to the right becomes its
    target.

    Args:
        txt: text to tokenize.
        tokenizer: object whose `encode(txt)` returns a list of token ids.
        max_length: number of token ids per window.
        stride: distance between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the entire text in one go.
        token_ids = tokenizer.encode(txt)
        print(len(token_ids))

        # Start positions of all windows; stopping at len - max_length leaves
        # room for the shifted target of the last window.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input, target) tensors of window `idx`."""
        return self.input_ids[idx], self.target_ids[idx]

In [58]:
# following code uses the above dataset to load the inputs in batches via the dataloader

def create_dataloader_v1(txt, batch_size, max_length,
                         stride, shuffle, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding windows of `txt`.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1, which holds one list of input tensors and one list of
    target tensors.

    Args:
        txt: text to tokenize.
        batch_size: number of windows per batch.
        max_length: number of token ids per window.
        stride: distance between the start positions of consecutive windows.
        shuffle: passed on to the DataLoader.
        drop_last: passed on to the DataLoader.
        num_workers: passed on to the DataLoader.

    Returns:
        The DataLoader that yields (inputs, targets) batches.
    """
    dataset = GPTDatasetV1(
        txt, tiktoken.get_encoding("gpt2"), max_length, stride
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [60]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()


# (max_length, stride) combinations to compare: the window length sets how
# many token ids each input holds, the stride how far the window moves
# between two consecutive inputs.
window_settings = [(4, 1), (2, 2), (8, 2)]

for window_length, window_stride in window_settings:
    dataloader = create_dataloader_v1(
        raw_text, batch_size=1, max_length=window_length,
        stride=window_stride, shuffle=False)
    data_iter = iter(dataloader)

    # Each batch is an [inputs, targets] pair; printing the first two batches
    # shows how far the window moved.
    first_batch = next(data_iter)
    print(first_batch)

    second_batch = next(data_iter)
    print(second_batch)

5145
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]
5145
[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]
[tensor([[2885, 1464]]), tensor([[1464, 1807]])]
5145
[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]


In [62]:
# One batch of eight windows of four tokens each.
dataloader = create_dataloader_v1(
    txt=raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)

data_iter = iter(dataloader)
first_pair = next(data_iter)
inputs, targets = first_pair
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

# NOTE: the stride is increased to 4, the window length, so that the data set
# is used fully (not a single word is skipped) while the inputs of different
# rows do not overlap; more overlap could lead to increased overfitting.

5145
Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [68]:
# Next: converting token IDs to token embeddings. Preparation involves tokenizing text, converting text tokens to token IDs, and converting token IDs into embedding vectors. Here, we consider the previously created token IDs to create the token embedding vectors.


# A tiny example: four token ids from a six-token vocabulary, each of which
# is mapped to a three-dimensional embedding vector.
input_ids = torch.tensor([2, 3, 5, 1])
vocab_size = 6  # small corpus
output_dim = 3  # small dimension for each embedding / token
# Note: for comparison, BPE has a vocabulary of 50,257 tokens, and each
# embedding vector has a dimension of 12,288.

# Seed the generator so the randomly initialised embedding weights are
# reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=output_dim
)
print(input_ids)
print("*****************************************************************")
print(embedding_layer.weight)
print("*****************************************************************")
# There is one row for each of the six possible tokens in the vocabulary, and there is one column for each of the three embedding dimensions.


#let’s apply it to a token ID to obtain the embedding vector:

print(embedding_layer(torch.tensor([3])))
print("*****************************************************************")
# In other words, the embedding layer is essentially a lookup operation that retrieves rows from the embedding layer’s weight matrix via a token ID.

print(embedding_layer(input_ids))
print("*****************************************************************")

tensor([2, 3, 5, 1])
*****************************************************************
Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)
*****************************************************************
tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)
*****************************************************************
tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)
*****************************************************************


In [None]:
# Having now created embedding vectors from token IDs, next we’ll add a small modification to these embedding vectors to encode positional information about a token within a text.