<a href="https://colab.research.google.com/github/shinnew9/llms_from_scratch/blob/main/llms_from_scracth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!!pip install datasets
!pip install tokenizers



### Step 1: Load dataset

In [4]:
# Step 1: Load the data and separate into train, validation and test data
# Import necessary libraries
import os
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from pathlib import Path
from datasets import load_dataset
from tqdm import tqdm_gui

# Create the output directories for the model checkpoints and both tokenizers.
# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# when the directory is already present (e.g. on a second run of the notebook).
for directory in ("./malaygpt", "./tokenizer_en", "./tokenizer_my"):
  os.makedirs(directory, exist_ok=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# English ("en") - Malay ("ms") translation pairs from the opus-100 dataset.
train_dataset = load_dataset("Helsinki-NLP/opus-100", "en-ms", split='train')
val_dataset = load_dataset("Helsinki-NLP/opus-100", "en-ms", split='validation')

# Limit the number of examples for faster training: keep a random subset of
# 1500 training pairs and 50 validation pairs; the remainder is discarded.
raw_train_dataset, rt_to_skip = random_split(train_dataset, [1500, len(train_dataset)-1500])
raw_val_dataset, vt_to_skip = random_split(val_dataset, [50, len(val_dataset)-50])

### Step 2: Create Tokenizer

In [5]:
# Step 2: Create tokenizers
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

def get_ds_iterator(raw_train_dataset, lang):
  """Lazily yield the `lang` side of every translation pair in the dataset.

  Each item of `raw_train_dataset` is indexed as item['translation'][lang].
  """
  yield from (example['translation'][lang] for example in raw_train_dataset)

# Create Source Tokenizer - English
tokenizer_en = Tokenizer(BPE(unk_token="[UNK]"))
trainer_en = BpeTrainer(min_frequency=2, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
# A pre-tokenizer splits the input into words first. Without it, BPE could learn
# tokens that span several words (for instance a "there is" token, since those
# two words often appear next to each other). With it, no token is bigger than
# a word returned by the pre-tokenizer.
tokenizer_en.pre_tokenizer = Whitespace()
tokenizer_en.train_from_iterator(get_ds_iterator(raw_train_dataset, "en"), trainer=trainer_en)
tokenizer_en.save("./tokenizer_en/tokenizer_en.json")

# Create Target Tokenizer - Malay
# The special tokens are listed in the same order as for the English tokenizer
# so that both tokenizers are trained with an identical special-token list.
tokenizer_my = Tokenizer(BPE(unk_token="[UNK]"))
trainer_my = BpeTrainer(min_frequency=2, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
tokenizer_my.pre_tokenizer = Whitespace()
tokenizer_my.train_from_iterator(get_ds_iterator(raw_train_dataset, "ms"), trainer=trainer_my)
tokenizer_my.save("./tokenizer_my/tokenizer_my.json")

# Reload both tokenizers from the files that were just written.
tokenizer_en = Tokenizer.from_file("./tokenizer_en/tokenizer_en.json")
tokenizer_my = Tokenizer.from_file("./tokenizer_my/tokenizer_my.json")

source_vocab_size = tokenizer_en.get_vocab_size()
target_vocab_size = tokenizer_my.get_vocab_size()

# Calculate the longest tokenized sequence in the training subset, separately
# for the source (English) and the target (Malay) side.
max_seq_len_source = 0
max_seq_len_target = 0

for data in raw_train_dataset:
  enc_ids = tokenizer_en.encode(data['translation']['en']).ids
  # The Malay text must be measured with the Malay tokenizer.
  dec_ids = tokenizer_my.encode(data['translation']['ms']).ids
  max_seq_len_source = max(max_seq_len_source, len(enc_ids))
  max_seq_len_target = max(max_seq_len_target, len(dec_ids))

# The printed values depend on the random subset drawn above.
print(f'max_seqlen_source: {max_seq_len_source}')
print(f'max_seqlen_target: {max_seq_len_target}')


# Fixed sequence length used for training. It has to leave room for the extra
# special tokens (CLS, SEP, PAD) on top of the encoded text.
# NOTE: compare this against the maxima printed above - sentences that tokenize
# to more tokens than this do not fit into max_seq_len.
max_seq_len = 155

max_seqlen_source: 408
max_seqlen_target: 680


### Step 3: Prepare Dataset and DataLoader

In [6]:
# Step 3: Prepare dataset and dataloader

# Transform raw dataset to the encoded dataset that can be processed by the model
class EncodeDataset(Dataset):
  """Turn raw English/Malay translation pairs into fixed-length model inputs.

  Relies on the module-level `tokenizer_en`, `tokenizer_my` and `causal_mask`.

  Args:
    raw_dataset: indexable dataset whose items expose
      item['translation']['en'] and item['translation']['ms'].
    max_seq_len: length every returned sequence is padded (or truncated) to.

  Each item is a dict with:
    encoder_input: [CLS] + source ids + [SEP] + [PAD]...   (max_seq_len,)
    decoder_input: [CLS] + target ids + [PAD]...           (max_seq_len,)
    target_label:  target ids + [SEP] + [PAD]...           (max_seq_len,)
    encoder_mask:  1 for real tokens, 0 for padding        (1, 1, max_seq_len)
    decoder_mask:  padding mask combined with causal_mask(max_seq_len)
    source_text / target_text: the original strings
  """

  def __init__(self, raw_dataset, max_seq_len):
    super().__init__()
    self.raw_dataset = raw_dataset
    self.max_seq_len = max_seq_len

  def __len__(self):
    return len(self.raw_dataset)

  def __getitem__(self, index):
    # Fetch the single example for the given index; it holds both languages.
    raw_text = self.raw_dataset[index]
    source_text = raw_text['translation']['en']
    target_text = raw_text['translation']['ms']

    # Encode the source with the English tokenizer and the target with the
    # Malay tokenizer. Over-long sentences are truncated so that, together with
    # the special tokens, they fit into max_seq_len; otherwise the padding
    # count would go negative and the items could not be stacked into a batch.
    source_ids = tokenizer_en.encode(source_text).ids[:max(self.max_seq_len - 2, 0)]
    target_ids = tokenizer_my.encode(target_text).ids[:max(self.max_seq_len - 1, 0)]

    # Look up the special-token ids. Both trainers list the special tokens in
    # the same order, so either tokenizer can be used for the lookup.
    pad_id = tokenizer_my.token_to_id("[PAD]")
    CLS_ID = torch.tensor([tokenizer_my.token_to_id("[CLS]")], dtype=torch.int64)
    SEP_ID = torch.tensor([tokenizer_my.token_to_id("[SEP]")], dtype=torch.int64)

    source_tensor = torch.tensor(source_ids, dtype=torch.int64)
    target_tensor = torch.tensor(target_ids, dtype=torch.int64)

    # Every sequence must be exactly max_seq_len long. The encoder input carries
    # two special tokens (CLS and SEP); decoder input and label carry one each.
    num_source_padding = self.max_seq_len - len(source_ids) - 2
    num_target_padding = self.max_seq_len - len(target_ids) - 1

    encoder_padding = torch.full((num_source_padding,), pad_id, dtype=torch.int64)
    decoder_padding = torch.full((num_target_padding,), pad_id, dtype=torch.int64)

    # Encoder input: start token, source ids, end token, then padding.
    encoder_input = torch.cat([CLS_ID, source_tensor, SEP_ID, encoder_padding], dim=0)

    # Decoder input: start token, target ids, then padding (no end token).
    decoder_input = torch.cat([CLS_ID, target_tensor, decoder_padding], dim=0)

    # Label used for the loss: target ids, end token, then padding (no start
    # token), i.e. the decoder input shifted left by one position.
    target_label = torch.cat([target_tensor, SEP_ID, decoder_padding], dim=0)

    # Padding positions must not take part in attention. Shape (1, 1, seq_len)
    # so that, once batched to (batch, 1, 1, seq_len), the mask broadcasts over
    # attention scores of shape (batch, num_heads, seq_len, seq_len).
    encoder_mask = (encoder_input != pad_id).unsqueeze(0).unsqueeze(0).int()

    # The decoder additionally must not look at future tokens, so the padding
    # mask is combined with the causal mask.
    decoder_mask = (decoder_input != pad_id).unsqueeze(0).unsqueeze(0).int() & causal_mask(decoder_input.size(0))

    return {
        'encoder_input': encoder_input,
        'decoder_input': decoder_input,
        'target_label': target_label,
        'encoder_mask': encoder_mask,
        'decoder_mask': decoder_mask,
        'source_text': source_text,
        'target_text': target_text
    }

# The causal mask marks every position that comes after the current token.
# Attention scores at masked positions are replaced by a very large negative
# value, which becomes (nearly) zero after the softmax, so the model cannot use
# information from future tokens.
def causal_mask(size):
  """Return a boolean mask of shape (1, size, size).

  Entry [0, i, j] is True when position i may attend to position j (j <= i)
  and False for future positions (j > i).
  """
  # Ones strictly above the main diagonal mark the future positions.
  mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
  return mask == 0

# Wrap the raw train/validation subsets so they yield encoded, padded tensors,
# then build the loaders used for model training and validation.
train_ds = EncodeDataset(raw_dataset=raw_train_dataset, max_seq_len=max_seq_len)
val_ds = EncodeDataset(raw_dataset=raw_val_dataset, max_seq_len=max_seq_len)

# Training runs in shuffled batches of 5; validation goes one example at a time.
train_dataloader = DataLoader(dataset=train_ds, batch_size=5, shuffle=True)
val_dataloader = DataLoader(dataset=val_ds, batch_size=1, shuffle=True)

### Step 4: Input Embedding and Positional Encoding

In [7]:
# Step 4: Input embedding and positional encoding
import torch
import torch.nn as nn
import math

class EmbeddingLayer(nn.Module):
  """Map token ids to embedding vectors scaled by sqrt(d_model).

  Args:
    d_model: size of each embedding vector.
    vocab_size: number of entries in the embedding table (the tokenizer's
      vocabulary size).
  """

  def __init__(self, d_model: int, vocab_size: int):
    super().__init__()
    self.d_model = d_model
    # Lookup table with a weight of shape (vocab_size, d_model).
    self.embedding = nn.Embedding(vocab_size, d_model)

  def forward(self, input):
    """Return the embeddings of the token ids in `input`, times sqrt(d_model)."""
    # The result has the shape of `input` plus a trailing d_model dimension.
    embedding_output = self.embedding(input) * math.sqrt(self.d_model)
    return embedding_output

class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position information to embeddings, then apply dropout.

  Args:
    d_model: size of each embedding vector.
    max_seq_len: number of positions the encoding table is precomputed for.
    dropout_rate: dropout probability applied to the summed output.
  """

  def __init__(self, d_model:int, max_seq_len: int, dropout_rate:float):
    super().__init__()
    self.dropout = nn.Dropout(dropout_rate)
    pe = torch.zeros(max_seq_len, d_model)

    # Column vector of positions: (max_seq_len, 1).
    pos = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
    # 1 / 10000^(2i / d_model), computed through exp/log.
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

    # Sine on the even columns, cosine on the odd columns.
    pe[:, 0::2] = torch.sin(pos * div_term)
    # For an odd d_model there is one odd column fewer than even columns.
    pe[:, 1::2] = torch.cos(pos * div_term[:d_model // 2])

    # Add a leading batch dimension: (1, max_seq_len, d_model).
    pe = pe.unsqueeze(0)
    # A buffer is stored with the module but is not a learnable parameter.
    self.register_buffer('pe', pe)

  def forward(self, input_embedding):
    """Return `input_embedding` (batch, seq_len, d_model) plus its positional encoding."""
    # Only the first seq_len positions of the table are needed; the encoding is
    # fixed, so no gradient is tracked for it.
    input_embedding = input_embedding + (self.pe[:, :input_embedding.shape[1], :]).requires_grad_(False)
    return self.dropout(input_embedding)


### Step 5: Multi-Head Attention


In [None]:
# Step 5: Multihead Attention
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention.

  Args:
    d_model: size of the input/output embedding vectors.
    num_heads: number of attention heads; must divide d_model.
    dropout_rate: dropout probability applied to the attention weights.
  """

  def __init__(self, d_model: int, num_heads:int, dropout_rate:float):
    super().__init__()
    # Dropout to reduce overfitting.
    self.dropout = nn.Dropout(dropout_rate)
    self.num_heads = num_heads
    assert d_model % num_heads == 0, "d_model must be divisible by number of heads"

    # d_k is the dimension handled by each attention head.
    self.d_k = d_model // num_heads

    # Learnable projection matrices for query, key, value and output.
    self.W_q = nn.Linear(d_model, d_model, bias=False)
    self.W_k = nn.Linear(d_model, d_model, bias=False)
    self.W_v = nn.Linear(d_model, d_model, bias=False)
    self.W_o = nn.Linear(d_model, d_model, bias=False)

  def forward(self, q, k, v, encoder_mask):
    """Attend from `q` to `k`/`v` and return a tensor shaped like `q`.

    Args:
      q, k, v: tensors of shape (batch_size, seq_len, d_model).
      encoder_mask: optional mask broadcastable to
        (batch_size, num_heads, seq_len, seq_len); positions where it equals 0
        are excluded from attention. Pass None to attend everywhere.
    """
    # Project the inputs: (batch_size, seq_len, d_model) each.
    query = self.W_q(q)
    key = self.W_k(k)
    value = self.W_v(v)

    # Split d_model into heads:
    # (batch_size, seq_len, d_model) -> (batch_size, seq_len, num_heads, d_k)
    #                                -> (batch_size, num_heads, seq_len, d_k)
    query = query.view(query.shape[0], query.shape[1], self.num_heads, self.d_k).transpose(1, 2)
    key = key.view(key.shape[0], key.shape[1], self.num_heads, self.d_k).transpose(1, 2)
    value = value.view(value.shape[0], value.shape[1], self.num_heads, self.d_k).transpose(1, 2)

    # :: SELF ATTENTION BLOCK STARTS ::

    # Similarity of every query with every key, scaled by sqrt(d_k):
    # (batch_size, num_heads, seq_len, d_k) @ (batch_size, num_heads, d_k, seq_len)
    #   => (batch_size, num_heads, seq_len, seq_len)
    attention_score = (query @ key.transpose(-2, -1)) / math.sqrt(self.d_k)

    # Masked positions get a very large negative score so that the softmax
    # assigns them (nearly) zero weight.
    if encoder_mask is not None:
      attention_score = attention_score.masked_fill(encoder_mask == 0, -1e9)

    # Turn the scores into attention weights that sum to 1 over the keys.
    attention_score = attention_score.softmax(dim=-1)
    attention_score = self.dropout(attention_score)

    # Weighted sum of the values:
    # (batch_size, num_heads, seq_len, seq_len) @ (batch_size, num_heads, seq_len, d_k)
    #   => (batch_size, num_heads, seq_len, d_k)
    attention_output = attention_score @ value

    # :: SELF ATTENTION BLOCK ENDS ::

    # Concatenate the heads back together:
    # (batch_size, num_heads, seq_len, d_k) -> (batch_size, seq_len, num_heads, d_k)
    #                                       -> (batch_size, seq_len, d_model)
    attention_output = attention_output.transpose(1, 2).contiguous().view(attention_output.shape[0], -1, self.num_heads * self.d_k)

    # Final output projection; the result has the same shape as the input
    # embeddings: (batch_size, seq_len, d_model).
    multihead_output = self.W_o(attention_output)

    return multihead_output