In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import torch
import math
import re
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\satvm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import tiktoken

In [3]:
# Read the txt file to inspect it
with open('tiny-shakespeare.txt', 'r') as f:
    text = f.read()

print("Length of dataset:", len(text), "\n")
print(text[:100]) # First 100 characters

Length of dataset: 1115394 

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
chars = sorted(list(set(text))) # Get all unique characters in the text
vocab_size = len(chars)         # Length of the vocabulary (this includes the space character)
print(''.join(chars))
print(f'\nVocabulary size: {vocab_size}')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

Vocabulary size: 65


In [5]:
# This is both encoder and decoder
stoi = { ch:i for i,ch in enumerate(chars) }     # Character to index mapping
itos = { i:ch for i,ch in enumerate(chars) }     # Index to character mapping
encode = lambda s: [stoi[c] for c in s]          # Encode a string to a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # Decode a list of integers to a string

msg = "hii there"
token_list = encode(msg)
print(token_list)
print(decode(token_list))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [6]:
enc = tiktoken.get_encoding('gpt2')

msg = "hii there"
token_list = enc.encode(msg)
print(token_list) # BPE returns fewer tokens than the character encoding
print(enc.decode(enc.encode("hii there")))

print(enc.n_vocab) # total amount of tokens in the vocabulary

[71, 4178, 612]
hii there
50257


In [7]:
# Encode the text into a tensor of integers
data = torch.tensor(encode(text), dtype=torch.long)
print(f'Total size: {data.shape} elements of type {data.dtype}')
print('First 10 tokens from the dataset:', data[:10])

Total size: torch.Size([1115394]) elements of type torch.int64
First 10 tokens from the dataset: tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


In [8]:
n = int(0.9 * len(data)) # 90% of the data will be used for training and 10% for validation
train_data = data[:n]    # 0 to 90-th percentile
val_data = data[n:]   

In [9]:
torch.manual_seed(1337)
batch_size = 32  # number of sequences in a batch / processed in parallel
block_size = 400

In [10]:
def get_batch(split, batch_size):
    # Generate a batch of inputs/prompts x and respective targets y
    # batches are always of shape (batch_size, block_size)
    data = train_data if split == 'train' else val_data
    # Tensor of shape (batch_size,) with random sequence start indices between 0 and len(data) - block_size
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # Accumulate and add each sequence of this batch to form a tensor
    x = torch.stack([data[i:i+block_size] for i in ix])
    # Same as x but shifted by one token
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y # x is (4,8), y is (4,8) too

# Get a batch of inputs and targets
xb, yb = get_batch('train', batch_size)

# Print the shape of the batch and the actual data
print('inputs shape: ', xb.shape)
print(xb,'\n')
print('targets shape: ', yb.shape)
print(yb, '\n')

inputs shape:  torch.Size([32, 400])
tensor([[ 0, 28, 39,  ...,  1, 42, 43],
        [53, 59,  1,  ..., 59, 41, 46],
        [56, 43, 60,  ...,  1, 44, 47],
        ...,
        [ 1, 45, 56,  ..., 56, 11,  0],
        [44, 43, 50,  ..., 43, 12,  0],
        [ 6,  0, 32,  ..., 21, 27, 10]]) 

targets shape:  torch.Size([32, 400])
tensor([[28, 39, 50,  ..., 42, 43, 39],
        [59,  1, 57,  ..., 41, 46, 57],
        [43, 60, 39,  ..., 44, 47, 56],
        ...,
        [45, 56, 39,  ..., 11,  0, 32],
        [43, 50, 50,  ..., 12,  0,  0],
        [ 0, 32, 46,  ..., 27, 10,  0]]) 



In [11]:
# vocab_size = tokenizer.get_vocab_size()  # Number of unique words in the vocabulary
embedding_dim = 256
batch_size = 32 # Increase batch size if resources allow as it bring stabilization, 1 is very noisy
learning_rate = 0.0001 # changed lr because maybe the embedding dim is too low and lr is too high so gradient is just bouncing around and not learning much
heads = 4
epochs = 10
seq_len = 400
device = "cpu"

In [12]:
class WordEmbedder(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, x):
        return self.embeddings(x)

In [13]:
# Now that we have successfully trained our word embedding layer, time to make our positional matrix, which we will make using the formula mentioned in the paper
def positional_encoding(seq_len, embedding_dim, device=device):
        positional_encoding = torch.zeros(seq_len, embedding_dim)
        position = torch.arange(0, seq_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / embedding_dim))
        positional_encoding[:, 0::2] = torch.sin(position * div_term)
        positional_encoding[:, 1::2] = torch.cos(position * div_term)
        return positional_encoding

In [14]:
# Time to make the Multi-head Self Attention block
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, heads, embedding_dim):
        super(MultiHeadSelfAttention, self).__init__()
        self.heads = heads

        # 3 Linear Layers for Q, K and V
        self.w_q = nn.Linear(embedding_dim, embedding_dim)
        self.w_k = nn.Linear(embedding_dim, embedding_dim)
        self.w_v = nn.Linear(embedding_dim, embedding_dim)

        # Since the feature or embedding dimension is typically the last dimension
        self.softmax = nn.Softmax(dim=-1)

        # Last Linear layer for the attention
        self.w_a = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, embedding_vector):
        batch_size, seq_len, embedding_dim = embedding_vector.size()
        # Compute Q, K, and V
        Q = self.w_q(embedding_vector)
        K = self.w_k(embedding_vector)
        V = self.w_v(embedding_vector)

        # Seperate into heads
        head_dim = embedding_dim // self.heads
        Q = Q.view(batch_size, self.heads, seq_len, head_dim)
        K = K.view(batch_size, self.heads, seq_len, head_dim)
        V = V.view(batch_size, self.heads, seq_len, head_dim)

        # Calculate attention
        x= self.softmax((Q @ K.transpose(-1, -2)) / torch.sqrt(torch.tensor(embedding_dim)))
        attention = x @ V

        # Concatenating the attention heads (Transposing for correct concatenation)
        attention = attention.transpose(1, 2).reshape(batch_size, seq_len, embedding_dim)
        output = self.w_a(attention)
        # print("Shape after attention:", output.shape)
        # num_active_elements = torch.gt(output, -1).sum().item()
        # total_elements = output.numel()
        # print(f"active  att: {num_active_elements}/{total_elements}")
        return output

In [15]:
# Time to make the Masked Multi-head Self Attention block
class MaskedMultiHeadSelfAttention(nn.Module):
    def __init__(self, heads, embedding_dim):
        super(MaskedMultiHeadSelfAttention, self).__init__()
        self.heads = heads

        # 3 Linear Layers for Q, K and V
        self.w_q = nn.Linear(embedding_dim, embedding_dim)
        self.w_k = nn.Linear(embedding_dim, embedding_dim)
        self.w_v = nn.Linear(embedding_dim, embedding_dim)

        # Since the feature or embedding dimension is typically the last dimension
        self.softmax = nn.Softmax(dim=-1)

        # Last Linear layer for the attention
        self.w_a = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, embedding_vector):
        batch_size, seq_len, embedding_dim = embedding_vector.size()
        # Compute Q, K, and V
        Q = self.w_q(embedding_vector)
        K = self.w_k(embedding_vector)
        V = self.w_v(embedding_vector)

        # Seperate into heads
        head_dim = embedding_dim // self.heads
        Q = Q.view(batch_size, self.heads, seq_len, head_dim)
        K = K.view(batch_size, self.heads, seq_len, head_dim)
        V = V.view(batch_size, self.heads, seq_len, head_dim)

        # Create a mask for masking the attention score
        mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1).unsqueeze(0).unsqueeze(0).expand(batch_size, self.heads, -1, -1).to(device)
        value_to_fill = float('-inf')

        # Calculate attention (including mask)
        x = self.softmax((Q @  K.transpose(-1, -2)).masked_fill(mask, value_to_fill) / torch.sqrt(torch.tensor(embedding_dim)))
        attention = x @ V
        # Concatenating the attention heads (Transposing for correct concatenation)
        attention = attention.reshape(batch_size, seq_len, embedding_dim)
        output = self.w_a(attention)
        # print("Shape after mask:", output.shape)
        # num_active_elements = torch.gt(output, -1).sum().item()
        # total_elements = output.numel()
        # print(f"active masked: {num_active_elements}/{total_elements}")
        return output

In [16]:
class AddNorm(nn.Module):
    def __init__(self, n_features):
        super(AddNorm, self).__init__()
        # Layer Norm will normalize the last dimension of the matrix
        self.norm = nn.LayerNorm(n_features)

    def forward(self, original, modified):
        return self.norm(original + modified)

In [17]:
class FeedForward(nn.Module):
    def __init__(self, embedding_dim):
        super(FeedForward, self).__init__()
        # Normally nn.Linear(embedding_dim, embedding_dim * 4) for expressiveness, we will change it if we have the resources to do so
        self.lr1 = nn.Linear(embedding_dim, embedding_dim)
        self.relu = nn.ReLU()
        self.lr2 = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        x = self.lr1(x)
        x = self.relu(x)
        x = self.lr2(x)
        # num_active_elements = torch.gt(x, -1).sum().item()
        # total_elements = x.numel()
        # print(f"active feedforward: {num_active_elements}/{total_elements}")
        return x

In [18]:
# Time to build the Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, batch_size, max_length, heads):
        super(Decoder, self).__init__()
        self.word_embedder = WordEmbedder(vocab_size, embedding_dim)
        self.masked_attention = MaskedMultiHeadSelfAttention(heads, embedding_dim)
        self.add_norm1 = AddNorm(embedding_dim)
        self.attention = MultiHeadSelfAttention(heads, embedding_dim)
        self.add_norm2 = AddNorm(embedding_dim)
        self.feed_forward = FeedForward(embedding_dim)
        self.add_norm3 = AddNorm(embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        # print("Shape before word embedder:", x.shape)
        x = self.word_embedder(x)
        # print("Shape after word embedder/before positional:", x.shape)
        # num_active_elements = torch.gt(x, -1).sum().item()
        # total_elements = x.numel()
        # print(f"active word embedder: {num_active_elements}/{total_elements}")
        x += positional_encoding(x.size(1), embedding_dim).unsqueeze(0).expand(x.size(0), -1, -1).to(device)
        # print("Shape after positional:", x.shape)
        # num_active_elements = torch.gt(x, -1).sum().item()
        # total_elements = x.numel()
        # print(f"active positional_encoding: {num_active_elements}/{total_elements}")
        x = self.add_norm1(x, self.masked_attention(x))
        # print("Shape after addnorm1:", x.shape)
        # num_active_elements = torch.gt(x, -1).sum().item()
        # total_elements = x.numel()
        # print(f"active add_norm1: {num_active_elements}/{total_elements}")
        x = self.add_norm2(x, self.attention(x))
        # print("Shape after addnorm2:", x.shape)
        # num_active_elements = torch.gt(x, -1).sum().item()
        # total_elements = x.numel()
        # print(f"active add_nor2: {num_active_elements}/{total_elements}")
        x = self.add_norm3(x, self.feed_forward(x))
        # print("Shape after addnorm3:", x.shape)
        # num_active_elements = torch.gt(x, -1).sum().item()
        # total_elements = x.numel()
        # print(f"active add_norm3: {num_active_elements}/{total_elements}")
        # print("Shape before linear:", x.shape)
        logits = self.linear(x)
        # print("Shape after linear:", logits.shape)
        # num_active_elements = torch.gt(logits, 0).sum().item()
        # total_elements = logits.numel()
        # print(f"active linear: {num_active_elements}/{total_elements}")
        # print(logits.shape)
        return logits

In [19]:
model = Decoder(65, embedding_dim, batch_size, block_size, heads)
model

Decoder(
  (word_embedder): WordEmbedder(
    (embeddings): Embedding(65, 256)
  )
  (masked_attention): MaskedMultiHeadSelfAttention(
    (w_q): Linear(in_features=256, out_features=256, bias=True)
    (w_k): Linear(in_features=256, out_features=256, bias=True)
    (w_v): Linear(in_features=256, out_features=256, bias=True)
    (softmax): Softmax(dim=-1)
    (w_a): Linear(in_features=256, out_features=256, bias=True)
  )
  (add_norm1): AddNorm(
    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (attention): MultiHeadSelfAttention(
    (w_q): Linear(in_features=256, out_features=256, bias=True)
    (w_k): Linear(in_features=256, out_features=256, bias=True)
    (w_v): Linear(in_features=256, out_features=256, bias=True)
    (softmax): Softmax(dim=-1)
    (w_a): Linear(in_features=256, out_features=256, bias=True)
  )
  (add_norm2): AddNorm(
    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (feed_forward): FeedForward(
    (lr1): Linear(in_feat

In [20]:
# Initialize our optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

In [21]:
batch_size = 32 # Increasing the batch size from 4 to 32
losses = []

# Train for 10000 steps/batches
for steps in range(1000):
    xb, yb = get_batch('train', batch_size)
    optimizer.zero_grad()
    probs = model(xb)
    # Reshape input and output to correct format for loss calculation
    B, T, C = probs.shape
    probs = probs.view(B*T, C)
    yb = yb.view(-1)
    loss = loss_fn(probs, yb)
    loss.backward()
    
    optimizer.step()
    # Print the loss every 100 steps
    if steps % 10 == 0:
        print(f'Loss at step {steps}: {loss.item()}')
        losses.append(loss.item())

Loss at step 0: 4.380224704742432
Loss at step 10: 3.030172824859619
Loss at step 20: 2.7086572647094727
Loss at step 30: 2.6185996532440186
Loss at step 40: 2.5912744998931885
Loss at step 50: 2.531022071838379
Loss at step 60: 2.513446092605591
Loss at step 70: 2.498901128768921
Loss at step 80: 2.475170373916626
Loss at step 90: 2.44492769241333
Loss at step 100: 2.45695161819458
Loss at step 110: 2.4411234855651855
Loss at step 120: 2.4021055698394775
Loss at step 130: 2.4001097679138184
Loss at step 140: 2.397204875946045
Loss at step 150: 2.378685712814331
Loss at step 160: 2.3240041732788086
Loss at step 170: 2.3320224285125732
Loss at step 180: 2.3286190032958984
Loss at step 190: 2.30329966545105
Loss at step 200: 2.2928197383880615
Loss at step 210: 2.2707505226135254
Loss at step 220: 2.2857577800750732
Loss at step 230: 2.2415802478790283
Loss at step 240: 2.200723171234131
Loss at step 250: 2.1866893768310547
Loss at step 260: 2.1699914932250977
Loss at step 270: 2.1169717

In [22]:
plt.plot(range(1, epochs + 1), epoch_losses) 
plt.xlabel("Epoch")
plt.ylabel("Training Loss")
plt.title("Training Loss Over Epochs")
plt.show()

NameError: name 'epoch_losses' is not defined

In [None]:
model_path = "model_weights.pth"
torch.save(model.state_dict(), model_path)

In [None]:
model.load_state_dict(torch.load("model_weights.pth"))
model.eval()

In [23]:
def generate(idx, max_new_tokens):
    for _ in range(max_new_tokens):
        logits = model(idx)                              # Forward pass (this is the forward function) with the current sequence of characters idx, results in (B, T, C)
        logits = logits[:, -1, :]                          # Focus on the last token from the logits (B, T, C) -> (B, C)
        probs = F.softmax(logits, dim=-1)                  # Calculate the probability distribution for the next token based on this last token, results in (B, C)
        idx_next = torch.multinomial(probs, num_samples=1) # Sample the next token (B, 1), the token with the highest probability is sampled most likely
        idx = torch.cat((idx, idx_next), dim=1)            # Add the new token to the sequence (B, T+1) for the next iteration
    return idx 

In [25]:
# print(decode(generate(torch.rand((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))

In [43]:
itos[30]

'R'

In [74]:
low = 0
high = 66  # High should be 66 to include 65 in the range (exclusive upper bound)

# Generate a random integer between low and high (inclusive)
random_int = torch.randint(low=low, high=high, size=(1, 1), dtype=torch.long)

print(random_int)
print(decode(random_int[0].tolist()))

tensor([[17]])
E


In [75]:
print(decode(generate(random_int, max_new_tokens=500)[0].tolist()))

EEEEE:SEatEatSEEatSS:aItSalS EYc:
At EEEEEaaUEEaEtEEEEEELE?EEEELE:EEEEEE:EEEEELaEtcEOEEET:Y:
LItrktSpld Gld qupldlplGlGllGlo ETo;ll Tll;
ple; hle; fulix! Art JoneF carBe, ETEY;I le. Vor
Aqutlle, K:
S K:
N Y Qu Q Q Que K:
H S S K:
Noe 'e Y Y H, X QU 
AEEEEEetc: V'
Et K:
 ENEjeatJK:
fo, Eg::hEEf
EfeitinifefeifefIfOfefefifeGeGefefifefeBeGerfemifimeGifIEtEfO
Ene. 
Clen! PArdim Joh, Getsu IGem
Triht GUCAefeyiaO:
Har'OGAO:
HO, nHAyFl'lal:
Fors, s,-sks, H: Nor:rs, Hirk, k'l'lak:
E:'r'H:'r'Ro, b,'s, E:'E
