In [7]:
pip install torch datasets streamlit sentencepiece

Collecting streamlit
  Downloading streamlit-1.43.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

In [9]:
pip install datasets



In [37]:
from datasets import load_dataset
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
import json

# Load dataset
dataset = load_dataset("opus100", "ar-en")
train_data = dataset["train"].select(range(100000))
val_data = dataset["validation"]
# Build vocabularies
def build_vocab(sentences, max_vocab=10000):
    vocab = Counter()
    for sent in sentences:
        vocab.update(sent.split())
    vocab = ["<pad>", "<sos>", "<eos>", "<unk>"] + [word for word, _ in vocab.most_common(max_vocab)]
    return {word: i for i, word in enumerate(vocab)}

src_sentences = [ex["translation"]["ar"] for ex in train_data]
tgt_sentences = [ex["translation"]["en"].lower() for ex in train_data]

src_vocab = build_vocab(src_sentences)
tgt_vocab = build_vocab(tgt_sentences)

# Save vocabularies
with open("src_vocab.json", "w") as f:
    json.dump(src_vocab, f)
with open("tgt_vocab.json", "w") as f:
    json.dump(tgt_vocab, f)

# Dataset class
class TranslationDataset(Dataset):
    def __init__(self, src_sents, tgt_sents, src_vocab, tgt_vocab):
        self.src = [self.process(s, src_vocab) for s in src_sents]
        self.tgt = [self.process(t, tgt_vocab, is_tgt=True) for t in tgt_sents]

    def process(self, sent, vocab, is_tgt=False):
        tokens = sent.split()
        indices = [vocab.get(tok, vocab["<unk>"]) for tok in tokens]
        if is_tgt:
            return [vocab["<sos>"]] + indices + [vocab["<eos>"]]
        return indices + [vocab["<eos>"]]

    def __len__(self): return len(self.src)

    def __getitem__(self, idx):
        return {"src": self.src[idx], "tgt": self.tgt[idx]}

# Collate function
def collate_fn(batch):
    src = [torch.LongTensor(item["src"]) for item in batch]
    tgt = [torch.LongTensor(item["tgt"]) for item in batch]
    src = torch.nn.utils.rnn.pad_sequence(src, padding_value=src_vocab["<pad>"])
    tgt = torch.nn.utils.rnn.pad_sequence(tgt, padding_value=tgt_vocab["<pad>"])
    return {"src": src.T, "tgt": tgt.T}

# Create dataloaders
train_dataset = TranslationDataset(src_sentences, tgt_sentences, src_vocab, tgt_vocab)
train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=collate_fn, shuffle=True)

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        assert self.head_dim * num_heads == d_model, "d_model must be divisible by num_heads"

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # Linear transformations and split into heads
        Q = self.wq(query).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.wk(key).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.wv(value).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled Dot-Product Attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attention = F.softmax(scores, dim=-1)

        # Concatenate heads and pass through final linear layer
        out = torch.matmul(attention, V)
        out = out.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.fc_out(out)

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, x):
        return x + self.pe[:x.size(1)]

# Feed-Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))

# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)

        self.ffn = FeedForward(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout1(attn_output))

        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout2(ffn_output))
        return x

# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)

        self.enc_dec_attention = MultiHeadAttention(d_model, num_heads)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(dropout)

        self.ffn = FeedForward(d_model, d_ff)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        # Self-attention
        attn_output = self.self_attention(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout1(attn_output))

        # Encoder-Decoder attention
        attn_output = self.enc_dec_attention(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout2(attn_output))

        # Feed-forward network
        ffn_output = self.ffn(x)
        x = self.norm3(x + self.dropout3(ffn_output))
        return x

class Encoder(nn.Module):
    def __init__(self, src_vocab_size, d_model, num_layers, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.d_model = d_model  # Define d_model
        self.embedding = nn.Embedding(src_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

    def forward(self, src, src_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)  # Now self.d_model is defined
        src = self.pos_encoding(src)
        for layer in self.layers:
            src = layer(src, src_mask)
        return src

# Decoder
class Decoder(nn.Module):
    def __init__(self, tgt_vocab_size, d_model, num_layers, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.d_model = d_model  # Define d_model
        self.embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

    def forward(self, tgt, enc_output, src_mask=None, tgt_mask=None):
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)  # Now self.d_model is defined
        tgt = self.pos_encoding(tgt)
        for layer in self.layers:
            tgt = layer(tgt, enc_output, src_mask, tgt_mask)
        return tgt

# Transformer
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_layers=6, num_heads=8, d_ff=2048, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(src_vocab_size, d_model, num_layers, num_heads, d_ff, dropout)
        self.decoder = Decoder(tgt_vocab_size, d_model, num_layers, num_heads, d_ff, dropout)
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        enc_output = self.encoder(src, src_mask)
        dec_output = self.decoder(tgt, enc_output, src_mask, tgt_mask)
        return self.fc_out(dec_output)

In [25]:
torch.cuda.empty_cache()

In [39]:
from tqdm import tqdm  # Import tqdm

# Instantiate the model
model = Transformer(
    src_vocab_size=len(src_vocab),
    tgt_vocab_size=len(tgt_vocab),
    d_model=512,  # Reduced from 512
    num_layers=6,  # Reduced from 6
    num_heads=8,  # Reduced from 8
    d_ff=2048,  # Reduced from 2048
    dropout=0.1
).to(device)

# Training loop
for epoch in range(10):
    model.train()
    total_loss = 0

    # Wrap train_loader with tqdm for a progress bar
    train_loop = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=True, colour="green")

    for batch in train_loop:
        src = batch["src"].to(device)
        tgt = batch["tgt"].to(device)

        # Forward pass
        output = model(src, tgt[:, :-1])  # Pass src and tgt (excluding the last token)

        # Compute loss
        loss = criterion(output.reshape(-1, output.size(-1)), tgt[:, 1:].reshape(-1))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update total loss
        total_loss += loss.item()

        # Update the progress bar description with the current loss
        train_loop.set_postfix(loss=loss.item())

    # Print average loss for the epoch
    print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(train_loader)}")

Epoch 1: 100%|[32m██████████[0m| 6250/6250 [07:24<00:00, 14.07it/s, loss=9.3]


Epoch 1, Average Loss: 9.32076387298584


Epoch 2: 100%|[32m██████████[0m| 6250/6250 [07:24<00:00, 14.05it/s, loss=9.4]


Epoch 2, Average Loss: 9.320353903045655


Epoch 3: 100%|[32m██████████[0m| 6250/6250 [07:27<00:00, 13.98it/s, loss=9.24]


Epoch 3, Average Loss: 9.320411079864503


Epoch 4: 100%|[32m██████████[0m| 6250/6250 [07:23<00:00, 14.08it/s, loss=9.29]


Epoch 4, Average Loss: 9.320286298675537


Epoch 5: 100%|[32m██████████[0m| 6250/6250 [07:24<00:00, 14.07it/s, loss=9.34]


Epoch 5, Average Loss: 9.320531028137207


Epoch 6: 100%|[32m██████████[0m| 6250/6250 [07:26<00:00, 14.00it/s, loss=9.32]


Epoch 6, Average Loss: 9.32074204940796


Epoch 7: 100%|[32m██████████[0m| 6250/6250 [07:26<00:00, 13.99it/s, loss=9.3]


Epoch 7, Average Loss: 9.320411442718505


Epoch 8: 100%|[32m██████████[0m| 6250/6250 [07:25<00:00, 14.02it/s, loss=9.3]


Epoch 8, Average Loss: 9.320808847198487


Epoch 9: 100%|[32m██████████[0m| 6250/6250 [07:23<00:00, 14.09it/s, loss=9.28]


Epoch 9, Average Loss: 9.320587664489747


Epoch 10: 100%|[32m██████████[0m| 6250/6250 [07:22<00:00, 14.13it/s, loss=9.36]

Epoch 10, Average Loss: 9.320846364746094





In [40]:
# Save the model's state dictionary
torch.save(model.state_dict(), "transformer_nmt.pth")
print("Model saved to transformer_nmt.pth")

Model saved to transformer_nmt.pth


In [42]:
import torch
import json

# Load vocabularies
with open("src_vocab.json", "r") as f:
    src_vocab = json.load(f)
with open("tgt_vocab.json", "r") as f:
    tgt_vocab = json.load(f)
tgt_vocab_inv = {v: k for k, v in tgt_vocab.items()}  # Inverse vocabulary for decoding

# Load the trained model
model = Transformer(
    src_vocab_size=len(src_vocab),
    tgt_vocab_size=len(tgt_vocab),
    d_model=512,
    num_layers=6,
    num_heads=8,
    d_ff=2048,
    dropout=0.1
).to(device)
model.load_state_dict(torch.load("transformer_nmt.pth", map_location=device))
model.eval()

# Preprocess input text
def preprocess_input(text, vocab):
    tokens = text.split() + ["<eos>"]
    indices = [vocab.get(tok, vocab["<unk>"]) for tok in tokens]
    return torch.LongTensor(indices).unsqueeze(0).to(device)  # Add batch dimension

# Generate translation
def translate(text, max_len=100):
    # Preprocess input
    src = preprocess_input(text, src_vocab)

    # Initialize target sequence with <sos>
    tgt = torch.LongTensor([[tgt_vocab["<sos>"]]]).to(device)

    # Generate translation token by token
    for _ in range(max_len):
        with torch.no_grad():  # Disable gradient calculation
            output = model(src, tgt)
        next_token = output.argmax(-1)[:, -1:]  # Greedy decoding
        tgt = torch.cat([tgt, next_token], dim=1)

        # Stop if <eos> is generated
        if next_token.item() == tgt_vocab["<eos>"]:
            break

    # Convert indices to tokens
    translation = " ".join([tgt_vocab_inv.get(idx.item(), "") for idx in tgt[0][1:-1]])
    return translation

# Test the function
input_text = "كيف حالك"
translated_text = translate(input_text)
print(f"Input: {input_text}")
print(f"Translation: {translated_text}")

Input: كيف حالك
Translation: maggie leone gun? good? hungry. since. forbid perimeter texts minimal remembered jamahiriya stay else's explicit domestic explanations science alan kid. arrange challenges explicit else's explicit else's explicit aren't firearms remarkable -why explanations so? papers. countries item too ruling mining minimal secretary-general's texts explicit aren't checked tell. left michael serious. stay else's explicit aren't hands. tolerance swing for? adopt the, guinea-bissau, michael serious. stay else's explicit aren't hands. tolerance breathe. else's explicit aren't hands. tolerance swing cambodia breathe. else's explicit aren't else's explicit aren't else's explicit aren't cameras. shocking yesterday explicit aren't cameras. shocking yesterday explicit aren't cameras. shocking yesterday
