In [172]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm

In [173]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# that every later `.to(device)` call still works on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Encoder Decoder Architecture

In [174]:
# Global hyperparameters. The model classes defined below read these
# module-level names directly (several of them as default argument values),
# so this cell has to run before any class-definition cell.
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 2000 # maximum sequence length: sizes the positional-embedding tables and the causal mask
max_iters = 5000 # not referenced by the visible training loop (it iterates over epochs instead)
eval_interval = 500 # not referenced anywhere else in the visible notebook
learning_rate = 3e-4 # NOTE(review): the optimizer cell passes lr=0.01 and never reads this constant
eval_iters = 200 # not referenced anywhere else in the visible notebook
n_embd = 36 # embedding width shared by the encoder and the decoder
n_head = 6 # attention heads per block; the blocks use a per-head width of n_embd // n_head
n_layer = 6 # number of EncoderBlocks and number of DecoderBlocks
dropout = 0.2 # dropout probability used by the attention and feed-forward modules
vocab_size = 1503 # embedding-table / output-layer size; matches the tokenizer vocab lengths printed later (1503)

### Embedding

In [175]:
class InputEmbedding(nn.Module):
    """Token-embedding lookup whose result is multiplied by sqrt(dim_embd)."""

    def __init__(self, vocab_size, dim_embd=n_embd):
        super().__init__()
        self.vocab_size, self.dim_embd = vocab_size, dim_embd
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim_embd)

    def forward(self, x):
        """Look up the embeddings for the ids in ``x`` and rescale them."""
        scale = self.dim_embd ** 0.5
        looked_up = self.embedding(x)
        return looked_up * scale

In [176]:
class PositionalEncoding(nn.Module):
    """Learned positional embedding that is added onto a batch of embeddings.

    Args:
        num_toks: number of rows in the position table, i.e. the longest
            sequence (size of dimension 1 of the input) that can be encoded.
        dim_embd: width of each position vector.
    """

    def __init__(self, num_toks, dim_embd=n_embd):
        super().__init__()
        self.dim_embd = dim_embd
        self.toks_len = num_toks
        self.pos_embedding = nn.Embedding(num_toks, dim_embd)

    def forward(self, x):
        """Return ``x`` plus the position vectors for indices 0..x.shape[1]-1.

        Raises:
            ValueError: if ``x.shape[1]`` exceeds the size of the position table.
        """
        seq_len = x.shape[1]
        if seq_len > self.toks_len:
            # Fail with a readable message instead of an out-of-range
            # embedding lookup.
            raise ValueError(
                f"sequence length {seq_len} exceeds the positional table size {self.toks_len}"
            )
        # Build the indices on the input's own device so the lookup works
        # wherever the module has been moved to.
        positions = torch.arange(seq_len, device=x.device)
        return x + self.pos_embedding(positions)

### Encoder and Decoder Attention

In [177]:
class SelfAttention(nn.Module):
    """One unmasked scaled dot-product attention head.

    When ``enc_out`` is omitted, keys, queries and values are all projected
    from ``x``. When it is given, queries still come from ``x`` while keys and
    values are projected from ``enc_out``.
    """

    def __init__(self, embd_size, dim_embd = n_embd):
        super().__init__()
        # Three bias-free projections from the model width to this head's
        # width, created in the order key, query, value.
        for name in ("key", "query", "value"):
            setattr(self, name, nn.Linear(dim_embd, embd_size, bias=False))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out=None):
        # The unpack doubles as a check that x has exactly three dimensions.
        _batch, _steps, _width = x.shape

        # Source of keys/values: x itself, or the supplied encoder output.
        memory = x if enc_out is None else enc_out
        keys = self.key(memory)
        queries = self.query(x)
        values = self.value(memory)

        # Scale the raw scores by 1/sqrt(head width) before the softmax.
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale
        weights = self.dropout(F.softmax(scores, dim=-1))

        return weights @ values


In [178]:
class MultiHeadAttention(nn.Module):
    """Runs several SelfAttention heads side by side and mixes their outputs.

    The head outputs are concatenated on the last dimension, projected to
    ``n_embd`` features and passed through dropout.
    """

    def __init__(self, num_heads, each_embd_size):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(SelfAttention(each_embd_size))
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(each_embd_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, en_out=None):
        # en_out is handed to every head unchanged (None means the heads
        # attend over x itself).
        per_head = [head(x, en_out) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)


In [179]:
class MaskedSelfAttention(nn.Module):
    """One causal attention head: position t only attends to positions <= t."""

    def __init__(self, embd_size, dim_embd= n_embd):
        super().__init__()
        # Bias-free key, query and value projections, created in that order.
        for name in ("key", "query", "value"):
            setattr(self, name, nn.Linear(dim_embd, embd_size, bias=False))
        # Lower-triangular matrix of ones, block_size x block_size, registered
        # as a buffer named 'tril' on every head.
        lower_triangle = torch.ones(block_size, block_size).tril()
        self.register_buffer('tril', lower_triangle)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _batch, steps, _width = x.shape
        keys, queries, values = self.key(x), self.query(x), self.value(x)

        # Raw scores scaled by 1/sqrt(head width).
        scores = (queries @ keys.transpose(-2, -1)) * (keys.shape[-1] ** -0.5)

        # Entries above the diagonal are set to -inf so that the softmax
        # assigns them zero weight.
        blocked = self.tril[:steps, :steps] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        return weights @ values


In [180]:
class MaskedMultiHeadAttention(nn.Module):
    """Several MaskedSelfAttention heads whose outputs are concatenated,
    projected to ``n_embd`` features and passed through dropout."""

    def __init__(self, num_heads, each_embd_size):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(MaskedSelfAttention(each_embd_size))
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(each_embd_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)


### Feed Forward

In [181]:
class FeedForward(nn.Module):
    """Two-layer MLP: Linear -> ReLU -> Dropout -> Linear.

    The input and output width is ``dim_embd``; the hidden width is
    ``ff_hid_dim``.
    """

    def __init__(self, dim_embd=n_embd, ff_hid_dim=2048):
        super().__init__()
        self.linear1 = nn.Linear(in_features=dim_embd, out_features=ff_hid_dim)
        self.linear2 = nn.Linear(in_features=ff_hid_dim, out_features=dim_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(hidden)

### Block of Encoder-Decoder

In [182]:
class EncoderBlock(nn.Module):
    """Pre-LayerNorm encoder layer: unmasked multi-head attention followed by
    a feed-forward network, each wrapped in a residual connection."""

    def __init__(self, num_heads, dim_embd=n_embd):
        super().__init__()
        # Each head gets an equal integer share of the model width.
        self.mha = MultiHeadAttention(num_heads, dim_embd // num_heads)
        self.ff = FeedForward(dim_embd)
        self.ln1, self.ln2 = nn.LayerNorm(dim_embd), nn.LayerNorm(dim_embd)

    def forward(self, x):
        attended = x + self.mha(self.ln1(x))
        return attended + self.ff(self.ln2(attended))

In [183]:
class DecoderBlock(nn.Module):
    """Pre-LayerNorm decoder layer with three residual sub-layers:
    causal self-attention, attention over ``en_out``, and a feed-forward net.
    """

    def __init__(self, num_heads, dim_embd=n_embd):
        super().__init__()
        per_head = dim_embd // num_heads
        self.mmha = MaskedMultiHeadAttention(num_heads, per_head)
        self.mha = MultiHeadAttention(num_heads, per_head)
        self.ff = FeedForward(dim_embd)
        self.ln1, self.ln2, self.ln3 = (
            nn.LayerNorm(dim_embd),
            nn.LayerNorm(dim_embd),
            nn.LayerNorm(dim_embd),
        )

    def forward(self, x, en_out=None):
        # 1) causal self-attention over the decoder states
        after_self = x + self.mmha(self.ln1(x))
        # 2) attention whose keys/values come from en_out; when en_out is
        #    None the heads attend over the decoder states instead
        after_cross = after_self + self.mha(self.ln2(after_self), en_out=en_out)
        # 3) position-wise feed-forward
        return after_cross + self.ff(self.ln3(after_cross))

# Encoder Decoder Model

In [184]:
class EncoderModel(nn.Module):
    """Transformer encoder.

    Sums token embeddings and learned positional embeddings, runs the result
    through ``n_layer`` EncoderBlocks and applies a final LayerNorm.
    ``forward`` returns those normalized hidden states; no logits are computed.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.positional_encoding = nn.Embedding(block_size, n_embd)

        self.blocks = nn.Sequential(*[EncoderBlock(n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        # forward() never calls this layer. It is kept so that the module's
        # parameter names (and hence the keys of saved state_dicts) stay the same.
        self.linear_head = nn.Linear(n_embd, vocab_size)

        # A single pass is enough; a second identical apply() would only
        # redraw the same weights from the same distribution.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input, targets=None):
        """Encode a batch of token ids.

        Args:
            input: integer tensor of shape (B, T) holding token ids.
            targets: accepted for signature compatibility; not used.

        Returns:
            Tensor of shape (B, T, n_embd) with the final normalized states.

        Raises:
            ValueError: if T exceeds the size of the positional table.
        """
        B, T = input.shape
        max_len = self.positional_encoding.num_embeddings
        if T > max_len:
            raise ValueError(f"sequence length {T} exceeds the positional table size {max_len}")

        tok_emb = self.token_embedding_table(input)  # (B, T, n_embd)
        # Create the position indices on the same device as the input instead
        # of relying on the notebook-global `device`.
        positions = torch.arange(T, device=input.device)
        pos_emb = self.positional_encoding(positions)  # (T, n_embd), broadcast over the batch
        x = tok_emb + pos_emb

        x = self.blocks(x)  # (B, T, n_embd)
        x = self.ln_f(x)    # (B, T, n_embd)

        return x

In [185]:
class DecoderModel(nn.Module):
    """Transformer decoder with a vocabulary-sized output layer.

    Sums token embeddings and learned positional embeddings, runs them through
    ``n_layer`` DecoderBlocks (each of which also receives ``en_out``), applies
    a final LayerNorm and projects to ``vocab_size`` logits.
    """

    def __init__(self):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.positional_encoding = nn.Embedding(block_size, n_embd)

        # A ModuleList rather than nn.Sequential because every block takes
        # two arguments (x and en_out).
        self.blocks = nn.ModuleList([DecoderBlock(n_head) for _ in range(n_layer)])

        self.ln_f = nn.LayerNorm(n_embd)
        self.linear_head = nn.Linear(n_embd, vocab_size)

        # A single pass is enough; a second identical apply() would only
        # redraw the same weights from the same distribution.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input, en_out, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            input: integer tensor of shape (B, T) with the decoder input ids.
            en_out: encoder output handed to every block (may be None, in
                which case the blocks' second attention layer attends over
                the decoder states themselves).
            targets: optional integer tensor with B*T elements holding the
                expected id at every position.

        Returns:
            ``(logits, loss)``. Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy.

        Raises:
            ValueError: if T exceeds the size of the positional table.
        """
        B, T = input.shape
        max_len = self.positional_encoding.num_embeddings
        if T > max_len:
            raise ValueError(f"sequence length {T} exceeds the positional table size {max_len}")

        tok_emb = self.token_embedding_table(input)  # (B, T, n_embd)
        # Create the position indices on the same device as the input instead
        # of relying on the notebook-global `device`.
        positions = torch.arange(T, device=input.device)
        pos_emb = self.positional_encoding(positions)  # (T, n_embd), broadcast over the batch
        x = tok_emb + pos_emb

        for block in self.blocks:
            x = block(x, en_out)

        x = self.ln_f(x)               # (B, T, n_embd)
        logits = self.linear_head(x)   # (B, T, vocab_size)

        if targets is None:
            # Inference: nothing to compare against.
            loss = None
        else:
            B, T, C = logits.shape
            # reshape (rather than view) also accepts non-contiguous tensors.
            logits = logits.reshape(B * T, C)
            targets = targets.reshape(B * T)
            # NOTE(review): no ignore_index is passed, so positions whose
            # target is the padding id still contribute to the loss.
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens, en_out=None):
        """Sample ``max_new_tokens`` additional tokens after the prefix ``idx``.

        Args:
            idx: integer tensor of shape (B, T) with the current context.
            max_new_tokens: number of sampling steps to run.
            en_out: optional encoder output forwarded to ``forward`` at every
                step. The previous code called ``self(idx_cond)`` without it,
                which always raised TypeError because ``forward`` requires
                the argument.

        Returns:
            Tensor of shape (B, T + max_new_tokens): the prefix followed by
            the sampled ids.
        """
        max_len = self.positional_encoding.num_embeddings
        for _ in range(max_new_tokens):
            # Keep only as much context as the positional table can index.
            idx_cond = idx[:, -max_len:]
            logits, _ = self(idx_cond, en_out)
            # Only the distribution at the last position is needed.
            logits = logits[:, -1, :]                           # (B, vocab_size)
            probs = F.softmax(logits, dim=-1)                   # (B, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)             # (B, T+1)
        return idx

# Translation Model

In [186]:
class TranslationModel(nn.Module):
    """Encoder–decoder wrapper: encodes the source ids, then decodes against
    that encoding."""

    def __init__(self):
        super().__init__()
        self.encoder, self.decoder = EncoderModel(), DecoderModel()

    def forward(self, enc_input, dec_input, targets=None):
        """Return the decoder's ``(logits, loss)`` for one batch.

        ``enc_input`` goes to the encoder; its output, ``dec_input`` and
        ``targets`` go to the decoder.
        """
        memory = self.encoder(enc_input)
        logits, loss = self.decoder(dec_input, memory, targets=targets)
        return logits, loss


In [187]:
# Build the encoder–decoder model and move its parameters and buffers to `device`.
model = TranslationModel().to(device)
# NOTE(review): lr is hard-coded to 0.01 here; the `learning_rate = 3e-4`
# constant from the config cell is never used — confirm which value is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training Model

In [188]:
!pip install -q datasets

In [189]:
from datasets import load_dataset

# Load the English–Nepali parallel dataset by its Hugging Face dataset name.
data = load_dataset("CohleM/english-to-nepali")

In [190]:
# Peek at the first sentence pair of the train split: (English, Nepali).
data['train']['en'][0], data['train']['ne'][0]

('Technical committees will be attached to each ministry.',
 'प्रत्येक मन्त्रालय अन्तर्गत शिल्प (टेक्निकल) कमिटीहरु गठन गरिनेछन्')

In [191]:
# Pull the two parallel text columns out of the train split.
eng_data = data['train']['en']
nep_data = data['train']['ne']

# One space-joined string per language. Nothing else in the visible notebook
# reads these (the tokenizers are loaded from pickle files instead);
# presumably they were the training text for those tokenizers — TODO confirm.
eng_corpus = " ".join(eng_data)
nep_corpus = " ".join(nep_data)

In [192]:
class BPETokenizer:
    """Byte-level byte-pair-encoding tokenizer trained on a single text.

    Ids 0-255 are the raw byte values. Every learned merge gets the next free
    id starting at 256.

    Attributes:
        merges: dict mapping a pair of ids ``(left, right)`` to the id that
            replaces the pair, in the order the merges were learned.
        vocab: dict mapping every id to the byte string it stands for.
    """

    def __init__(self, text, vocab_size = 300):
        """Learn up to ``vocab_size - 256`` merges from ``text``.

        Fewer merges are learned when the text runs out of adjacent pairs
        (for example an empty or one-byte text).
        """
        tokens = list(text.encode("utf-8"))
        self.merges = self.create_merges(tokens, vocab_size)
        self.vocab = {idx: bytes([idx]) for idx in range(256)}
        # Merges are stored in creation order, so both halves of a pair are
        # always in `vocab` before the merged id is added.
        for (p0, p1), idx in self.merges.items():
            self.vocab[idx] = self.vocab[p0] + self.vocab[p1]

    def get_stats(self, ids):
        """Return a dict counting each adjacent pair of ids in ``ids``."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with each non-overlapping occurrence of
        ``pair`` (scanning left to right) replaced by ``idx``."""
        newids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
                newids.append(idx)
                i += 2
            else:
                newids.append(ids[i])
                i += 1
        return newids

    def create_merges(self, ids, vocab_size):
        """Greedily merge the most frequent adjacent pair, repeatedly.

        Args:
            ids: list of byte values to train on.
            vocab_size: target vocabulary size; at most ``vocab_size - 256``
                merges are created.

        Returns:
            dict mapping ``(left, right)`` to the new id, in creation order.
        """
        num_merges = vocab_size - 256
        merges = {}
        for i in range(num_merges):
            stats = self.get_stats(ids)
            if not stats:
                # Fewer than two tokens remain, so there is no pair left to
                # merge. Stop early instead of letting max() fail on an
                # empty dict.
                break
            pair = max(stats, key=stats.get)
            idx = 256 + i
            print(f"merging {pair} into a new token {idx}")
            ids = self.merge(ids, pair, idx)
            merges[pair] = idx
        return merges

    def encode(self, text):
        """Encode ``text`` into a list of ids by replaying the learned merges."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            stats = self.get_stats(tokens)
            # Pick the present pair that was learned earliest (lowest merged
            # id); pairs that were never learned rank as infinity.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break # nothing else can be merged
            idx = self.merges[pair]
            tokens = self.merge(tokens, pair, idx)
        return tokens

    def decode(self, ids):
        """Turn a list of ids back into a string.

        Byte sequences that are not valid UTF-8 are rendered with the
        replacement character. Raises KeyError for an id missing from ``vocab``.
        """
        tokens = b"".join(self.vocab[idx] for idx in ids)
        text = tokens.decode("utf-8", errors="replace")
        return text

In [193]:
import pickle

# Load the previously saved English and Nepali tokenizers from pickle files
# in the current working directory.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only open pickles that come from a trusted source.
# Presumably both objects are BPETokenizer instances (later cells use their
# `.vocab` dict), in which case the class must be defined before this cell
# runs — TODO confirm.

with open("eng_tokenizer_50k.pkl", "rb") as file:
    eng_tok = pickle.load(file)

with open("nep_tokenizer_50k.pkl", "rb") as file:
    nep_tok = pickle.load(file)

In [194]:
# Alias for the English tokenizer's vocab table; the only visible use is the
# commented-out inspection code in the next cell.
vocab = eng_tok.vocab

In [195]:
# string_dict = {key: value.decode('utf-8', errors='replace') for key, value in vocab.items()}

# # Print the resulting dictionary
# for key, val in string_dict.items():
#   print(key, val)

In [196]:
# Register the special tokens in both tokenizers' vocab tables:
# 1500 = <pad>, 1501 = <sos>, 1502 = <eos>.
# Only `vocab` is modified here. The ids themselves are inserted by later
# cells (1501/1502 when the decoder sequences are built, 1500 by pad_batch).

nep_tok.vocab[1501] = b'<sos>'
nep_tok.vocab[1502] = b'<eos>'
nep_tok.vocab[1500] = b'<pad>'
eng_tok.vocab[1500] = b'<pad>'
eng_tok.vocab[1501] = b'<sos>'
eng_tok.vocab[1502] = b'<eos>'

In [197]:
# Sanity check: both vocab tables should have as many entries as the model's vocab_size.
len(eng_tok.vocab), len(nep_tok.vocab)

(1503, 1503)

In [198]:
# Colab only: mount Google Drive so that the /content/drive/... paths used by
# the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [199]:
# Load the pre-tokenized English (encoder side) and Nepali (decoder side) data from Drive.
# Later cells concatenate each entry with a Python list, so each entry is
# presumably a list of token ids for one sentence — TODO confirm.
# NOTE(review): torch.load unpickles the file; only load files from a trusted source.
enco_eng_data = torch.load("/content/drive/MyDrive/Models/enco_eng_data.pth")
deco_nep_data = torch.load("/content/drive/MyDrive/Models/deco_nep_data.pth")



  enco_eng_data = torch.load("/content/drive/MyDrive/Models/enco_eng_data.pth")
  deco_nep_data = torch.load("/content/drive/MyDrive/Models/deco_nep_data.pth")


In [200]:
# Alias of the encoder data. It is not referenced again in the visible
# notebook; the Dataset cell uses `enco_eng_data` directly.
eng_data_train = enco_eng_data
# Decoder input: every Nepali sentence prefixed with id 1501 (<sos>).
deco_nep_data_sos = [[1501] + sentence for sentence in deco_nep_data]
# Decoder target: the same sentence followed by id 1502 (<eos>), i.e. the
# decoder input shifted left by one position.
deco_nep_data_eos = [sentence + [1502] for sentence in deco_nep_data]

In [201]:
# Function to pad a batch of sequences
def pad_batch(batch, pad_token=1500):
    """Collate function: right-pad every sequence in the batch to one length.

    Args:
        batch: iterable of ``(input_seq, target_seq, output_seq)`` triples,
            each element a list of token ids.
        pad_token: id appended to sequences that are shorter than the longest
            sequence found anywhere in the batch.

    Returns:
        Tuple of three tensors (inputs, targets, outputs), each of shape
        (len(batch), longest).
    """
    input_seqs, target_seqs, output_seqs = zip(*batch)
    groups = (input_seqs, target_seqs, output_seqs)

    # A single common length for all three groups, as in the batch-wide max.
    longest = max(len(seq) for group in groups for seq in group)

    def fill(seqs):
        """Right-pad each sequence in ``seqs`` up to ``longest``."""
        return [seq + [pad_token] * (longest - len(seq)) for seq in seqs]

    return tuple(torch.tensor(fill(group)) for group in groups)


In [202]:
from torch.utils.data import Dataset, DataLoader

# Dataset class remains the same
class TranslationDataset(Dataset):
    """Map-style dataset over three parallel, index-aligned collections.

    Item ``i`` is the tuple ``(input_data[i], target_data[i], output_data[i])``.
    The reported length is that of ``input_data``.
    """

    def __init__(self, input_data, target_data, output_data):
        self.input_data, self.target_data, self.output_data = (
            input_data,
            target_data,
            output_data,
        )

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        sample = (self.input_data[idx], self.target_data[idx], self.output_data[idx])
        return sample


In [203]:
# Create Dataset: (encoder input, decoder input with <sos>, decoder target with <eos>)
dataset = TranslationDataset(enco_eng_data, deco_nep_data_sos, deco_nep_data_eos)

# Create DataLoader with custom collate_fn (pad_batch pads each batch to its own max length).
# NOTE(review): shuffle=False means every epoch visits the batches in the same order — confirm this is intended.
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=pad_batch)

In [204]:
# Sanity check: print the tensor shapes of the first three batches.
# All three tensors of a batch share one shape because pad_batch pads them to a common length.
for batch_idx, (input_tensor, target_tensor, output_tensor) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}")
    print("Input tensor shape:", input_tensor.shape)
    print("Target tensor shape:", target_tensor.shape)
    print("Output tensor shape:", output_tensor.shape)
    if batch_idx == 2:  # Check only the first few batches
        break

Batch 1
Input tensor shape: torch.Size([16, 129])
Target tensor shape: torch.Size([16, 129])
Output tensor shape: torch.Size([16, 129])
Batch 2
Input tensor shape: torch.Size([16, 125])
Target tensor shape: torch.Size([16, 125])
Output tensor shape: torch.Size([16, 125])
Batch 3
Input tensor shape: torch.Size([16, 125])
Target tensor shape: torch.Size([16, 125])
Output tensor shape: torch.Size([16, 125])


# Training

In [None]:
# Training loop: `num_epochs` full passes over `train_loader`.
# `train_loss` collects one average loss value per completed epoch.
train_loss = []
num_epochs = 5

for epoch in range(num_epochs):
    # Running sum of the per-batch losses for this epoch.
    total_loss = 0

    # Initialize tqdm progress bar for the epoch
    progress_bar = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch_idx, (input_tensor, target_tensor, output_tensor) in enumerate(progress_bar):
        # Move the three padded tensors of this batch to the training device.
        input_tensor, target_tensor, output_tensor = (
            input_tensor.to(device),
            target_tensor.to(device),
            output_tensor.to(device)
        )

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # input_tensor feeds the encoder, target_tensor (the <sos>-prefixed
        # sequence) feeds the decoder, output_tensor (the <eos>-suffixed
        # sequence) is what the loss is computed against.
        logits, loss = model(input_tensor, target_tensor, targets=output_tensor)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Update tqdm bar with loss information
        progress_bar.set_postfix(loss=f"{loss.item():.4f}")

    # Checkpoint after every epoch; the same file is overwritten each time.
    torch.save(model.state_dict(), "/content/drive/MyDrive/Models/translation_model.pth")


    # Compute and store average loss for the epoch
    avg_epoch_loss = total_loss / len(train_loader)
    train_loss.append(avg_epoch_loss)

    # Print final loss after epoch completes
    print(f"\nEpoch [{epoch+1}/{num_epochs}] - Avg Loss: {avg_epoch_loss:.4f}\n")

Epoch 1/5: 100%|██████████| 3125/3125 [08:00<00:00,  6.50it/s, loss=2.7133]



Epoch [1/5] - Avg Loss: 2.6848



Epoch 2/5:  31%|███       | 963/3125 [02:32<06:07,  5.88it/s, loss=1.6487]

In [None]:
import matplotlib.pyplot as plt

# Plot the average training loss recorded at the end of each epoch.
# Epochs are numbered from 1 to match the progress-bar output, and the figure
# carries a title and axis labels so it can be read on its own.
fig, ax = plt.subplots()
ax.plot(range(1, len(train_loss) + 1), train_loss, marker="o")
ax.set(title="Training loss per epoch", xlabel="Epoch", ylabel="Average loss");