In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import pandas as pd
import re
import unicodedata

# Load dataset
data_path = '/kaggle/input/urdu-dataset-20000/final_main_dataset.tsv'
df = pd.read_csv(data_path, sep='\t')  # Tab separator, adjust if needed
print("Columns in dataset:", df.columns.tolist())  # Columns check karo

Columns in dataset: ['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']


In [2]:
# Install necessary libraries if needed (uncomment in Kaggle)
# !pip install urduhack  # For better Urdu tokenization, optional

import pandas as pd
import re
import unicodedata
from collections import Counter
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import numpy as np

# Normalization function for Urdu
def normalize_urdu(text):
    # Remove diacritics
    text = ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')
    # Standardize Alef and Yeh forms
    text = re.sub(r'[آأإ]', 'ا', text)
    text = re.sub(r'[ےۓ]', 'ی', text)
    text = text.strip()
    return text

# Tokenization (simple word split; for better, use urduhack.tokenization.word_tokenize if installed)
def tokenize(text):
    return text.split()  # Or import urduhack and use urduhack.tokenization.word_tokenize(text)

# Load dataset
data_path = '/kaggle/input/urdu-dataset-20000/final_main_dataset.tsv'
df = pd.read_csv(data_path, sep='\t')  # Tab separator, adjust if needed
print("Columns in dataset:", df.columns.tolist())  # Confirm columns

# Clean column names
df.columns = df.columns.str.strip()

# Use 'sentence' as input, create synthetic 'Bot' response (e.g., add "جی ہاں" or rephrase)
df['User'] = df['sentence'].apply(normalize_urdu)
df['Bot'] = df['sentence'].apply(lambda x: normalize_urdu(x) + " - جی ہاں، بالکل!")  # Synthetic response

# Build vocabulary from all text
all_text = ' '.join(df['User'].tolist() + df['Bot'].tolist())
tokens = tokenize(all_text)
vocab_counter = Counter(tokens)
vocab = {word: idx + 4 for idx, (word, _) in enumerate(vocab_counter.most_common())}  # Start from 4 for special tokens
vocab['<PAD>'] = 0
vocab['<SOS>'] = 1
vocab['<EOS>'] = 2
vocab['<UNK>'] = 3
vocab_size = len(vocab)

# Reverse vocab for decoding
idx_to_word = {idx: word for word, idx in vocab.items()}

# Function to convert text to indices
def text_to_indices(text, vocab):
    tokens = tokenize(text)
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

# Add SOS/EOS and convert to tensors
df['user_indices'] = df['User'].apply(lambda x: [vocab['<SOS>']] + text_to_indices(x, vocab) + [vocab['<EOS>']])
df['bot_indices'] = df['Bot'].apply(lambda x: [vocab['<SOS>']] + text_to_indices(x, vocab) + [vocab['<EOS>']])

# Split dataset: 80% train, 10% val, 10% test
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Custom Dataset class
class ChatDataset(Dataset):
    def __init__(self, df):
        self.src = df['user_indices'].tolist()
        self.tgt = df['bot_indices'].tolist()
    
    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, idx):
        return torch.tensor(self.src[idx]), torch.tensor(self.tgt[idx])

# Collate function for padding
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=vocab['<PAD>'])
    tgt_padded = pad_sequence(tgt_batch, batch_first=True, padding_value=vocab['<PAD>'])
    return src_padded, tgt_padded

# Create datasets and loaders
batch_size = 32
train_dataset = ChatDataset(train_df)
val_dataset = ChatDataset(val_df)
test_dataset = ChatDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print("Data preprocessing complete. Vocab size:", vocab_size)

Columns in dataset: ['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accents', 'variant', 'locale', 'segment']
Data preprocessing complete. Vocab size: 10860


In [3]:
import torch
import torch.nn as nn
import math

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, v), attn

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)
        q = self.w_q(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output, attn = self.scaled_dot_product_attention(q, k, v, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.w_o(output)

# Feed Forward Network
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=1024, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.linear2(self.dropout(torch.relu(self.linear1(x))))

# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, src_mask):
        attn_output = self.self_attn(x, x, x, src_mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout(ff_output))

# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        self_attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn_output))
        enc_dec_output = self.enc_dec_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(enc_dec_output))
        ff_output = self.ff(x)
        return self.norm3(x + self.dropout(ff_output))

# Full Transformer
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, num_heads=2, num_layers=2, dropout=0.1, max_len=1000):
        super().__init__()
        self.src_embed = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt, pad_idx=0):
        src_mask = (src != pad_idx).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != pad_idx).unsqueeze(1).unsqueeze(3)
        seq_len = tgt.size(1)
        nopeak_mask = torch.tril(torch.ones(seq_len, seq_len)).bool().to(tgt.device)
        tgt_mask = tgt_mask & nopeak_mask.unsqueeze(0).unsqueeze(0)
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embed = self.dropout(self.pos_enc(self.src_embed(src)))
        tgt_embed = self.dropout(self.pos_enc(self.tgt_embed(tgt)))
        
        enc_output = src_embed
        for layer in self.encoder_layers:
            enc_output = layer(enc_output, src_mask)
        
        dec_output = tgt_embed
        for layer in self.decoder_layers:
            dec_output = layer(dec_output, enc_output, src_mask, tgt_mask)
        
        return self.fc_out(dec_output)

print("Model architecture defined.")

Model architecture defined.


In [5]:
# Install required packages
!pip install sacrebleu
!pip install rouge-score

import torch
import torch.nn as nn
import torch.optim as optim
import sacrebleu
from rouge_score import rouge_scorer
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader

# Set environment variable for better memory management
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Assuming ChatDataset and collate_fn are defined in Cell 1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Custom Tokenizer for Urdu (space-based splitting)
class UrduTokenizer:
    def tokenize(self, text):
        return text.split()

# Evaluation functions
def calculate_bleu(model, loader, idx_to_word):
    model.eval()
    references = []
    hypotheses = []
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            output = output.argmax(dim=-1).cpu().numpy()
            for i in range(output.shape[0]):
                hyp = ' '.join([idx_to_word.get(idx, '<UNK>') for idx in output[i] if idx not in [0, 1, 2]])
                ref = ' '.join([idx_to_word.get(idx, '<UNK>') for idx in tgt[i, 1:].cpu().numpy() if idx not in [0, 1, 2]])
                if hyp and ref:
                    hypotheses.append(hyp.strip())
                    references.append([ref.strip()])
    return sacrebleu.corpus_bleu(hypotheses, references).score if hypotheses and references else 0.0

def calculate_rouge(model, loader, idx_to_word):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False, tokenizer=UrduTokenizer())  # Use custom Urdu tokenizer
    rouge_scores = []
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            output = output.argmax(dim=-1).cpu().numpy()
            for i in range(output.shape[0]):
                hyp = ' '.join([idx_to_word.get(idx, '') for idx in output[i] if idx not in [0, 1, 2]])  # Remove <UNK>
                ref = ' '.join([idx_to_word.get(idx, '') for idx in tgt[i, 1:].cpu().numpy() if idx not in [0, 1, 2]])
                if hyp and ref:
                    hyp = hyp.strip()
                    ref = ref.strip()
                    score = scorer.score(ref, hyp)['rougeL'].fmeasure
                    rouge_scores.append(score)
                    print(f"Debug - Hyp: '{hyp}', Ref: '{ref}', ROUGE-L: {score}")
    return np.mean(rouge_scores) if rouge_scores else 0.0

def calculate_chrf(model, loader, idx_to_word):
    model.eval()
    references = []
    hypotheses = []
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            output = output.argmax(dim=-1).cpu().numpy()
            for i in range(output.shape[0]):
                hyp = ' '.join([idx_to_word.get(idx, '<UNK>') for idx in output[i] if idx not in [0, 1, 2]])
                ref = ' '.join([idx_to_word.get(idx, '<UNK>') for idx in tgt[i, 1:].cpu().numpy() if idx not in [0, 1, 2]])
                if hyp and ref:
                    hypotheses.append(hyp.strip())
                    references.append([ref.strip()])
    return sacrebleu.corpus_chrf(hypotheses, references).score if hypotheses and references else 0.0

def calculate_perplexity(model, loader):
    model.eval()
    total_loss = 0
    total_words = 0
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'], reduction='sum')
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            loss = criterion(output.view(-1, vocab_size), tgt[:, 1:].reshape(-1))
            total_loss += loss.item()
            total_words += tgt[:, 1:].ne(vocab['<PAD>']).sum().item()
    avg_loss = total_loss / total_words if total_words > 0 else 0
    return np.exp(avg_loss) if avg_loss > 0 else float('inf')

def calculate_val_loss(model, loader):
    model.eval()
    val_loss = 0
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'], reduction='sum')
    with torch.no_grad():
        for src, tgt in loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            loss = criterion(output.view(-1, vocab_size), tgt[:, 1:].reshape(-1))
            val_loss += loss.item()
    return val_loss / len(loader)

# Trials configuration
trials = [
    {
        'embedding_dimensions': 256,
        'heads': 2,
        'encoder_layers': 2,
        'decoder_layers': 2,
        'dropout': 0.1,
        'batch_size': 32,
        'learning_rate': 1e-4,
        'model_path': 'temp_model_trial1.pth'
    },
    {
        'embedding_dimensions': 512,
        'heads': 2,
        'encoder_layers': 2,
        'decoder_layers': 2,
        'dropout': 0.3,
        'batch_size': 32,
        'learning_rate': 5e-4,
        'model_path': 'temp_model_trial2.pth'
    }
]

epochs = 20  # Adjust based on time/resources
best_overall_bleu = 0
best_model_state = None
best_trial_config = None

# Run trials
for trial_idx, trial in enumerate(trials, 1):
    print(f"\n=== Starting Trial {trial_idx} ===")
    print(f"Hyperparameters: Embedding Dimensions={trial['embedding_dimensions']}, Heads={trial['heads']}, Encoder Layers={trial['encoder_layers']}, "
          f"Decoder Layers={trial['decoder_layers']}, Dropout={trial['dropout']}, Batch Size={trial['batch_size']}, Learning Rate={trial['learning_rate']}")

    # Reinitialize model for each trial
    model = Transformer(src_vocab_size=10860, tgt_vocab_size=10860, 
                       d_model=trial['embedding_dimensions'], num_heads=trial['heads'], 
                       num_layers=trial['encoder_layers'], dropout=trial['dropout'])
    model.to(device)

    # Reinitialize DataLoader for the trial's batch size
    train_loader = DataLoader(train_dataset, batch_size=trial['batch_size'], shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=trial['batch_size'], shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=trial['batch_size'], shuffle=False, collate_fn=collate_fn)

    optimizer = optim.Adam(model.parameters(), lr=trial['learning_rate'])
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])

    # Training loop
    best_trial_bleu = 0
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for src, tgt in tqdm(train_loader, desc=f"Trial {trial_idx} - Epoch {epoch+1}/{epochs}"):
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])  # Teacher forcing
            loss = criterion(output.view(-1, vocab_size), tgt[:, 1:].reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        # Validation
        val_loss = 0
        model.eval()
        with torch.no_grad():
            for src, tgt in val_loader:
                src, tgt = src.to(device), tgt.to(device)
                output = model(src, tgt[:, :-1])
                loss = criterion(output.view(-1, vocab_size), tgt[:, 1:].reshape(-1))
                val_loss += loss.item()
        val_loss = val_loss / len(val_loader)
        
        val_bleu = calculate_bleu(model, val_loader, idx_to_word)
        print(f"Epoch {epoch+1}/{epochs}: Train Loss = {train_loss / len(train_loader):.4f}, Val Loss = {val_loss:.4f}, Val BLEU = {val_bleu:.2f}")
        
        if val_bleu > best_trial_bleu:
            best_trial_bleu = val_bleu
            torch.save(model.state_dict(), trial['model_path'])
            print(f"Saved best model for Trial {trial_idx} with BLEU: {best_trial_bleu:.2f}")

    # Final evaluation on test set for this trial
    model.load_state_dict(torch.load(trial['model_path']))
    test_bleu = calculate_bleu(model, test_loader, idx_to_word)
    test_rouge = calculate_rouge(model, test_loader, idx_to_word)
    test_chrf = calculate_chrf(model, test_loader, idx_to_word)
    test_perplexity = calculate_perplexity(model, test_loader)
    test_val_loss = calculate_val_loss(model, val_loader)

    print(f"\nTrial {trial_idx} Final Evaluation on Test Set:")
    print(f"BLEU: {test_bleu:.2f}, ROUGE-L: {test_rouge:.2f}, chrF: {test_chrf:.2f}, Perplexity: {test_perplexity:.2f}, Val Loss: {test_val_loss:.4f}")

    # Update best overall model if this trial's best BLEU is higher
    if best_trial_bleu > best_overall_bleu:
        best_overall_bleu = best_trial_bleu
        best_model_state = torch.load(trial['model_path'])
        best_trial_config = trial
        print(f"New overall best BLEU: {best_overall_bleu:.2f} from Trial {trial_idx}")

# Save the overall best model
if best_model_state is not None:
    best_model = Transformer(src_vocab_size=10860, tgt_vocab_size=10860, 
                           d_model=best_trial_config['embedding_dimensions'], num_heads=best_trial_config['heads'], 
                           num_layers=best_trial_config['encoder_layers'], dropout=best_trial_config['dropout'])
    best_model.load_state_dict(best_model_state)
    best_model.to(device)
    torch.save(best_model.state_dict(), 'best_model.pth')
    print(f"\nFinal best model saved as 'best_model.pth' with BLEU: {best_overall_bleu:.2f} from Trial {best_trial_config}")

    # Final evaluation of the best model on test set
    test_bleu = calculate_bleu(best_model, test_loader, idx_to_word)
    test_rouge = calculate_rouge(best_model, test_loader, idx_to_word)
    test_chrf = calculate_chrf(best_model, test_loader, idx_to_word)
    test_perplexity = calculate_perplexity(best_model, test_loader)
    test_val_loss = calculate_val_loss(best_model, val_loader)
    print(f"\nBest Model Final Evaluation on Test Set:")
    print(f"BLEU: {test_bleu:.2f}, ROUGE-L: {test_rouge:.2f}, chrF: {test_chrf:.2f}, Perplexity: {test_perplexity:.2f}, Val Loss: {test_val_loss:.4f}")
    print(f"Best Model Hyperparameters: Embedding Dimensions={best_trial_config['embedding_dimensions']}, Heads={best_trial_config['heads']}, "
          f"Encoder Layers={best_trial_config['encoder_layers']}, Decoder Layers={best_trial_config['decoder_layers']}, "
          f"Dropout={best_trial_config['dropout']}, Batch Size={best_trial_config['batch_size']}, Learning Rate={best_trial_config['learning_rate']}")
else:
    print("No valid model trained. Check dataset or training setup.")

print("\n=== All Trials Completed ===")


=== Starting Trial 1 ===
Hyperparameters: Embedding Dimensions=256, Heads=2, Encoder Layers=2, Decoder Layers=2, Dropout=0.1, Batch Size=32, Learning Rate=0.0001


Trial 1 - Epoch 1/20: 100%|██████████| 500/500 [00:10<00:00, 45.64it/s]


Epoch 1/20: Train Loss = 4.5795, Val Loss = 3.5432, Val BLEU = 24.03
Saved best model for Trial 1 with BLEU: 24.03


Trial 1 - Epoch 2/20: 100%|██████████| 500/500 [00:10<00:00, 49.10it/s]


Epoch 2/20: Train Loss = 3.2077, Val Loss = 2.6394, Val BLEU = 24.90
Saved best model for Trial 1 with BLEU: 24.90


Trial 1 - Epoch 3/20: 100%|██████████| 500/500 [00:10<00:00, 49.05it/s]


Epoch 3/20: Train Loss = 2.4918, Val Loss = 2.0400, Val BLEU = 25.57
Saved best model for Trial 1 with BLEU: 25.57


Trial 1 - Epoch 4/20: 100%|██████████| 500/500 [00:10<00:00, 48.35it/s]


Epoch 4/20: Train Loss = 1.9757, Val Loss = 1.5959, Val BLEU = 23.26


Trial 1 - Epoch 5/20: 100%|██████████| 500/500 [00:10<00:00, 48.26it/s]


Epoch 5/20: Train Loss = 1.5830, Val Loss = 1.2876, Val BLEU = 20.33


Trial 1 - Epoch 6/20: 100%|██████████| 500/500 [00:10<00:00, 47.14it/s]


Epoch 6/20: Train Loss = 1.2703, Val Loss = 1.0472, Val BLEU = 24.90


Trial 1 - Epoch 7/20: 100%|██████████| 500/500 [00:10<00:00, 46.97it/s]


Epoch 7/20: Train Loss = 1.0226, Val Loss = 0.8593, Val BLEU = 27.03
Saved best model for Trial 1 with BLEU: 27.03


Trial 1 - Epoch 8/20: 100%|██████████| 500/500 [00:10<00:00, 47.31it/s]


Epoch 8/20: Train Loss = 0.8227, Val Loss = 0.7248, Val BLEU = 27.03


Trial 1 - Epoch 9/20: 100%|██████████| 500/500 [00:10<00:00, 47.99it/s]


Epoch 9/20: Train Loss = 0.6602, Val Loss = 0.6171, Val BLEU = 41.33
Saved best model for Trial 1 with BLEU: 41.33


Trial 1 - Epoch 10/20: 100%|██████████| 500/500 [00:10<00:00, 47.75it/s]


Epoch 10/20: Train Loss = 0.5350, Val Loss = 0.5235, Val BLEU = 28.48


Trial 1 - Epoch 11/20: 100%|██████████| 500/500 [00:10<00:00, 48.29it/s]


Epoch 11/20: Train Loss = 0.4275, Val Loss = 0.4516, Val BLEU = 42.00
Saved best model for Trial 1 with BLEU: 42.00


Trial 1 - Epoch 12/20: 100%|██████████| 500/500 [00:10<00:00, 48.05it/s]


Epoch 12/20: Train Loss = 0.3433, Val Loss = 0.3889, Val BLEU = 41.33


Trial 1 - Epoch 13/20: 100%|██████████| 500/500 [00:10<00:00, 47.54it/s]


Epoch 13/20: Train Loss = 0.2754, Val Loss = 0.3491, Val BLEU = 42.00


Trial 1 - Epoch 14/20: 100%|██████████| 500/500 [00:10<00:00, 47.32it/s]


Epoch 14/20: Train Loss = 0.2208, Val Loss = 0.3131, Val BLEU = 41.33


Trial 1 - Epoch 15/20: 100%|██████████| 500/500 [00:10<00:00, 47.80it/s]


Epoch 15/20: Train Loss = 0.1816, Val Loss = 0.2951, Val BLEU = 41.33


Trial 1 - Epoch 16/20: 100%|██████████| 500/500 [00:10<00:00, 48.07it/s]


Epoch 16/20: Train Loss = 0.1485, Val Loss = 0.2692, Val BLEU = 42.00


Trial 1 - Epoch 17/20: 100%|██████████| 500/500 [00:10<00:00, 48.21it/s]


Epoch 17/20: Train Loss = 0.1228, Val Loss = 0.2566, Val BLEU = 42.00


Trial 1 - Epoch 18/20: 100%|██████████| 500/500 [00:10<00:00, 47.68it/s]


Epoch 18/20: Train Loss = 0.1010, Val Loss = 0.2514, Val BLEU = 42.00


Trial 1 - Epoch 19/20: 100%|██████████| 500/500 [00:10<00:00, 47.87it/s]


Epoch 19/20: Train Loss = 0.0813, Val Loss = 0.2220, Val BLEU = 41.17


Trial 1 - Epoch 20/20: 100%|██████████| 500/500 [00:10<00:00, 47.70it/s]


Epoch 20/20: Train Loss = 0.0660, Val Loss = 0.2243, Val BLEU = 39.81
Debug - Hyp: 'ہر چیز انتہايی زبردست اور اعلی معیار کی تھی - جی ہاں، بالکل! - - - - - - - - -', Ref: 'ہر چیز انتہايی زبردست اور اعلی معیار کی تھی - جی ہاں، بالکل!', ROUGE-L: 0.7428571428571429
Debug - Hyp: 'اس جمود کو ہیرو ہی۔ - جی ہاں، بالکل! - - - - - - - - - - - - -', Ref: 'اس جمود کو توڑا ہی۔ - جی ہاں، بالکل!', ROUGE-L: 0.5161290322580644
Debug - Hyp: 'انہوںنی ناریل کی چھلکی سی کاربن حاصل کرکی - جی ہاں، بالکل! جی - - - - - - - - -', Ref: 'انہوںنی ناریل کی چھلکی سی کاربن حاصل کرکی - جی ہاں، بالکل!', ROUGE-L: 0.7058823529411764
Debug - Hyp: 'اسی پتہ بھی ہی کہ وہ جھوٹ بول رہا ہی - جی ہاں، بالکل! - - - - - - - -', Ref: 'اسی پتہ بھی ہی کہ وہ جھوٹ بول رہا ہی - جی ہاں، بالکل!', ROUGE-L: 0.7777777777777778
Debug - Hyp: 'افسوس، یہ ہی۔ - جی ہاں، بالکل! - - - - جی جی جی - - - - - - - -', Ref: 'افسوس، یہ ہی۔ - جی ہاں، بالکل!', ROUGE-L: 0.4827586206896552
Debug - Hyp: 'اوراس کی ساتھ اداروں میں تصادم کو بھی روکنا ہی۔ - جی ہاں، 

Trial 2 - Epoch 1/20: 100%|██████████| 500/500 [00:19<00:00, 26.10it/s]


Epoch 1/20: Train Loss = 3.3082, Val Loss = 1.6479, Val BLEU = 22.87
Saved best model for Trial 2 with BLEU: 22.87


Trial 2 - Epoch 2/20: 100%|██████████| 500/500 [00:19<00:00, 26.16it/s]


Epoch 2/20: Train Loss = 1.4832, Val Loss = 0.7094, Val BLEU = 22.21


Trial 2 - Epoch 3/20: 100%|██████████| 500/500 [00:19<00:00, 26.15it/s]


Epoch 3/20: Train Loss = 0.7384, Val Loss = 0.3807, Val BLEU = 43.44
Saved best model for Trial 2 with BLEU: 43.44


Trial 2 - Epoch 4/20: 100%|██████████| 500/500 [00:19<00:00, 26.08it/s]


Epoch 4/20: Train Loss = 0.4233, Val Loss = 0.3028, Val BLEU = 45.86
Saved best model for Trial 2 with BLEU: 45.86


Trial 2 - Epoch 5/20: 100%|██████████| 500/500 [00:19<00:00, 26.07it/s]


Epoch 5/20: Train Loss = 0.2847, Val Loss = 0.2554, Val BLEU = 42.16


Trial 2 - Epoch 6/20: 100%|██████████| 500/500 [00:19<00:00, 25.97it/s]


Epoch 6/20: Train Loss = 0.2103, Val Loss = 0.2444, Val BLEU = 30.16


Trial 2 - Epoch 7/20: 100%|██████████| 500/500 [00:19<00:00, 26.07it/s]


Epoch 7/20: Train Loss = 0.1771, Val Loss = 0.2834, Val BLEU = 42.16


Trial 2 - Epoch 8/20: 100%|██████████| 500/500 [00:19<00:00, 25.95it/s]


Epoch 8/20: Train Loss = 0.1781, Val Loss = 0.2790, Val BLEU = 40.28


Trial 2 - Epoch 9/20: 100%|██████████| 500/500 [00:19<00:00, 25.97it/s]


Epoch 9/20: Train Loss = 0.1392, Val Loss = 0.2969, Val BLEU = 42.00


Trial 2 - Epoch 10/20: 100%|██████████| 500/500 [00:19<00:00, 26.02it/s]


Epoch 10/20: Train Loss = 0.1321, Val Loss = 0.2971, Val BLEU = 32.34


Trial 2 - Epoch 11/20: 100%|██████████| 500/500 [00:19<00:00, 26.04it/s]


Epoch 11/20: Train Loss = 0.1249, Val Loss = 0.2997, Val BLEU = 30.61


Trial 2 - Epoch 12/20: 100%|██████████| 500/500 [00:19<00:00, 26.03it/s]


Epoch 12/20: Train Loss = 0.1263, Val Loss = 0.3163, Val BLEU = 29.62


Trial 2 - Epoch 13/20: 100%|██████████| 500/500 [00:19<00:00, 26.10it/s]


Epoch 13/20: Train Loss = 0.1235, Val Loss = 0.3021, Val BLEU = 41.17


Trial 2 - Epoch 14/20: 100%|██████████| 500/500 [00:19<00:00, 25.98it/s]


Epoch 14/20: Train Loss = 0.1111, Val Loss = 0.3067, Val BLEU = 44.06


Trial 2 - Epoch 15/20: 100%|██████████| 500/500 [00:19<00:00, 25.96it/s]


Epoch 15/20: Train Loss = 0.1075, Val Loss = 0.3236, Val BLEU = 37.49


Trial 2 - Epoch 16/20: 100%|██████████| 500/500 [00:19<00:00, 26.05it/s]


Epoch 16/20: Train Loss = 0.0913, Val Loss = 0.3026, Val BLEU = 31.04


Trial 2 - Epoch 17/20: 100%|██████████| 500/500 [00:19<00:00, 26.00it/s]


Epoch 17/20: Train Loss = 0.0943, Val Loss = 0.3209, Val BLEU = 33.92


Trial 2 - Epoch 18/20: 100%|██████████| 500/500 [00:19<00:00, 26.15it/s]


Epoch 18/20: Train Loss = 0.0884, Val Loss = 0.3386, Val BLEU = 31.65


Trial 2 - Epoch 19/20: 100%|██████████| 500/500 [00:19<00:00, 26.24it/s]


Epoch 19/20: Train Loss = 0.0945, Val Loss = 0.3309, Val BLEU = 42.64


Trial 2 - Epoch 20/20: 100%|██████████| 500/500 [00:19<00:00, 26.08it/s]


Epoch 20/20: Train Loss = 0.0829, Val Loss = 0.3533, Val BLEU = 25.54
Debug - Hyp: 'ہر چیز انتہايی زبردست اور اعلی معیار کی تھی - جی ہاں، بالکل! - کی کی - - - - - تھی', Ref: 'ہر چیز انتہايی زبردست اور اعلی معیار کی تھی - جی ہاں، بالکل!', ROUGE-L: 0.7428571428571429
Debug - Hyp: 'اس جمود کو توڑا ہی۔ - جی ہاں، بالکل! - - - - - - - - - - - -', Ref: 'اس جمود کو توڑا ہی۔ - جی ہاں، بالکل!', ROUGE-L: 0.6
Debug - Hyp: 'انہوںنی ناریل کی چھلکی سی کاربن حاصل کرکی - جی ہاں، بالکل! - حاصل - - - - - - - -', Ref: 'انہوںنی ناریل کی چھلکی سی کاربن حاصل کرکی - جی ہاں، بالکل!', ROUGE-L: 0.7058823529411764
Debug - Hyp: 'اسی پتہ بھی ہی کہ وہ جھوٹ بول رہا ہی - جی ہاں، بالکل! - رہا رہا رہا رہا رہا رہا رہا', Ref: 'اسی پتہ بھی ہی کہ وہ جھوٹ بول رہا ہی - جی ہاں، بالکل!', ROUGE-L: 0.7777777777777778
Debug - Hyp: 'افسوس، یہ ہی۔ - جی ہاں، بالکل! - - ہی۔ ہی۔ - - - - - - - - - - ہی۔', Ref: 'افسوس، یہ ہی۔ - جی ہاں، بالکل!', ROUGE-L: 0.4827586206896552
Debug - Hyp: 'اوراس کی ساتھ اداروں میں تصادم کو بھی روکنا ہی۔ - جی

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np

# Load the best model configuration and state
best_model_path = 'best_model.pth'
best_trial_config = {'d_model': 512, 'num_heads': 2, 'num_layers': 2, 'dropout': 0.3}  # Update this with actual config from Cell 3 if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the Transformer model with the best configuration
model = Transformer(src_vocab_size=10860, tgt_vocab_size=10860, 
                   d_model=best_trial_config['d_model'], num_heads=best_trial_config['num_heads'], 
                   num_layers=best_trial_config['num_layers'], dropout=best_trial_config['dropout'])
model.load_state_dict(torch.load(best_model_path))
model.to(device)
model.eval()

# Assuming vocab and idx_to_word are defined in Cell 1
word_to_idx = {w: i for i, w in idx_to_word.items()}  # Reverse mapping for tokenization

# Function to convert text to tensor
def text_to_tensor(text, max_len=1000):
    tokens = [word_to_idx.get(w, word_to_idx.get('<UNK>', 3)) for w in text.split()]
    tokens = [word_to_idx.get('<SOS>', 1)] + tokens + [word_to_idx.get('<EOS>', 2)]
    if len(tokens) > max_len:
        tokens = tokens[:max_len]
    else:
        tokens.extend([word_to_idx.get('<PAD>', 0)] * (max_len - len(tokens)))
    return torch.tensor(tokens[:max_len], dtype=torch.long).unsqueeze(0).to(device)

# Function to generate response
def generate_response(input_text, max_len=1000):
    with torch.no_grad():
        src_tensor = text_to_tensor(input_text)
        tgt_start = torch.tensor([word_to_idx.get('<SOS>', 1)], dtype=torch.long).to(device)
        generated = [word_to_idx.get('<SOS>', 1)]

        for _ in range(max_len - 1):
            tgt_tensor = torch.tensor(generated, dtype=torch.long).unsqueeze(0).to(device)
            output = model(src_tensor, tgt_tensor)
            next_token = output.argmax(dim=-1)[:, -1].item()
            generated.append(next_token)
            if next_token == word_to_idx.get('<EOS>', 2):
                break

        # Convert tokens back to text
        response = ' '.join([idx_to_word.get(idx, '<UNK>') for idx in generated[1:] if idx not in [word_to_idx.get('<PAD>', 0), word_to_idx.get('<EOS>', 2)]])
        return response

# Interactive loop
print("Chat with the trained model! Type 'exit' to quit.")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        print("Goodbye!")
        break
    response = generate_response(user_input)
    print(f"Model: {response}")

Chat with the trained model! Type 'exit' to quit.


You:  وہ گیا ایسا کی گیا ہی گیا


Model: وہ گیا ایسا ایسا کی گیا ہی - جی ہاں، بالکل!


In [None]:


























,,,,,,,,,,,,,,,,,,,,,