# Assg 02
# Q3

# In [None]:
# English to Urdu Translation using Transformers
# Implementation based on the paper "Attention is All You Need" (Vaswani et al.)

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import MarianMTModel, MarianTokenizer
from sacrebleu import corpus_bleu
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Data Loading and Preprocessing
class ParallelCorpusDataset(Dataset):
    """Dataset of aligned English/Urdu sentence pairs tokenized for seq2seq training.

    Each item is a dict holding ``input_ids`` and ``attention_mask`` for the
    English source and ``labels`` for the Urdu target. Every sequence is padded
    or truncated to ``max_length`` tokens.

    Args:
        english_texts: Indexable sequence of source sentences.
        urdu_texts: Indexable sequence of target sentences, aligned by position
            with ``english_texts``.
        tokenizer: Tokenizer called on the raw text. It must accept the
            ``text_target`` keyword and expose ``pad_token_id``.
        max_length: Fixed sequence length used for padding and truncation.

    Raises:
        ValueError: If the two text sequences differ in length.
    """

    def __init__(self, english_texts, urdu_texts, tokenizer, max_length=128):
        # A misaligned corpus would otherwise fail late (IndexError) or
        # silently ignore the surplus target sentences.
        if len(english_texts) != len(urdu_texts):
            raise ValueError(
                f"english_texts and urdu_texts must have the same length, "
                f"got {len(english_texts)} and {len(urdu_texts)}"
            )
        self.english_texts = english_texts
        self.urdu_texts = urdu_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.english_texts)

    def __getitem__(self, idx):
        """Tokenize and return the sentence pair stored at position ``idx``."""
        english_text = str(self.english_texts[idx])
        urdu_text = str(self.urdu_texts[idx])

        # Tokenize the source sentence
        inputs = self.tokenizer(
            english_text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        # Tokenize the target sentence. ``text_target`` replaces the deprecated
        # ``as_target_tokenizer()`` context manager.
        targets = self.tokenizer(
            text_target=urdu_text,
            return_tensors="pt",
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        # squeeze(0) drops only the batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse the sequence dimension when
        # max_length == 1.
        input_ids = inputs.input_ids.squeeze(0)
        attention_mask = inputs.attention_mask.squeeze(0)
        labels = targets.input_ids.squeeze(0)

        # Replace padding token id with -100 for CrossEntropyLoss to ignore
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

def load_data(filepath):
    """Load parallel English/Urdu sentences from a CSV file.

    The file must contain ``english`` and ``urdu`` columns. Rows in which
    either column is missing are dropped, so the two returned lists stay
    aligned and never contain NaN placeholders (which would otherwise be
    stringified to ``"nan"`` when tokenized).

    Args:
        filepath: Path to the CSV file.

    Returns:
        A ``(english_sentences, urdu_sentences)`` tuple of equal-length lists.

    Raises:
        KeyError: If the ``english`` or ``urdu`` column is absent.
    """
    df = pd.read_csv(filepath)
    pairs = df[['english', 'urdu']].dropna()
    return pairs['english'].tolist(), pairs['urdu'].tolist()

def download_kaggle_dataset(output_path='data/sample_en_ur_corpus.csv'):
    """Write a small built-in English/Urdu sample corpus to a CSV file.

    Despite its name, this function performs no download: it writes five
    hard-coded sentence pairs so the rest of the pipeline has data to run on.

    Args:
        output_path: Destination CSV path. Its parent directory is created
            when it does not exist. Defaults to the location ``main`` reads.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    sample_data = {
        'english': [
            'Hello, how are you?',
            'My name is Saad.',
            'I live in Islamabad.',
            'The weather is nice today.',
            'I love programming.',
        ],
        'urdu': [
            'ہیلو، آپ کیسے ہیں؟',
            'میرا نام سعد ہے۔',
            'میں اسلام آباد میں رہتا ہوں۔',
            'آج موسم اچھا ہے۔',
            'مجھے پروگرامنگ پسند ہے۔',
        ]
    }

    sample_df = pd.DataFrame(sample_data)
    # Explicit UTF-8 so the Urdu text round-trips regardless of platform defaults
    sample_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"Created a sample dataset at {output_path}")


# Training and Evaluation Functions
def train_model(model, train_dataloader, optimizer, device, num_epochs=5):
    """Fine-tune ``model`` on ``train_dataloader`` and return per-epoch losses.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; the returned object must expose a ``loss`` attribute.
        train_dataloader: Iterable of batches, each a dict with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batch tensors are moved to before the forward pass.
        num_epochs: Number of passes over ``train_dataloader``.

    Returns:
        A list with the average batch loss of each epoch.

    Raises:
        ValueError: If an epoch yields no batches, since no average loss
            can be computed.
    """
    model.train()
    training_losses = []

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Clear gradients left over from the previous step
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            loss.backward()

            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Read the scalar once and reuse it for both the total and the bar
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1
            progress_bar.set_postfix({'loss': batch_loss})

        # Count processed batches instead of relying on len(train_dataloader):
        # it also works for loaders without __len__ and turns the empty case
        # into a clear error instead of a ZeroDivisionError.
        if num_batches == 0:
            raise ValueError("train_dataloader yielded no batches; cannot compute an average loss")

        avg_loss = total_loss / num_batches
        training_losses.append(avg_loss)
        print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

    return training_losses

def evaluate_model(model, test_dataloader, tokenizer, device, max_length=128, num_beams=4):
    """Translate every batch of ``test_dataloader`` and score it with corpus BLEU.

    Args:
        model: Model exposing ``generate``.
        test_dataloader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors (labels use -100 at
            padded positions).
        tokenizer: Tokenizer used to decode generated ids and reference labels;
            must expose ``batch_decode`` and ``pad_token_id``.
        device: Device the model inputs are moved to.
        max_length: ``max_length`` passed to ``model.generate``.
        num_beams: ``num_beams`` passed to ``model.generate``.

    Returns:
        A ``(bleu_score, predictions, references)`` tuple, where the last two
        are lists of decoded strings in dataloader order. The score is ``0.0``
        when the dataloader yields no batches.
    """
    model.eval()
    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Generate translations
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )

            # Decode the generated translations
            pred_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

            # Labels are only decoded, never fed to the model, so they are not
            # moved to the device. Replace -100 with pad token ID for decoding.
            labels_for_decode = batch['labels'].clone()
            labels_for_decode[labels_for_decode == -100] = tokenizer.pad_token_id
            ref_texts = tokenizer.batch_decode(labels_for_decode, skip_special_tokens=True)

            predictions.extend(pred_texts)
            references.extend(ref_texts)

    # Nothing to score: report zero rather than calling BLEU on empty lists
    if not predictions:
        return 0.0, predictions, references

    # Calculate BLEU score (a single reference stream, hence the one-element list)
    bleu_score = corpus_bleu(predictions, [references]).score

    return bleu_score, predictions, references

def translate_examples(model, tokenizer, english_texts, device, max_length=128, num_beams=4):
    """Translate English sentences one at a time and return the decoded outputs.

    Args:
        model: Model exposing ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode the source text and decode the
            generated ids.
        english_texts: Iterable of sentences. A single string is treated as
            one sentence rather than iterated character by character.
        device: Device the encoded inputs are moved to.
        max_length: Truncation length for the input and ``max_length`` passed
            to ``model.generate``.
        num_beams: ``num_beams`` passed to ``model.generate``.

    Returns:
        A list with one translation per input sentence, in input order.
    """
    # Iterating a bare string would translate each character separately
    if isinstance(english_texts, str):
        english_texts = [english_texts]

    model.eval()
    translations = []

    with torch.no_grad():
        for text in english_texts:
            # Tokenize input text. Each sentence is encoded alone, so padding
            # it out to max_length would only add masked positions.
            inputs = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True)
            input_ids = inputs.input_ids.to(device)
            attention_mask = inputs.attention_mask.to(device)

            # Generate translation
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )

            # Decode the generated translation
            translation = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            translations.append(translation)

    return translations

# Main execution function
def main():
    """Run the full pipeline: prepare data, fine-tune, evaluate, plot and save.

    Side effects: may create the sample CSV under ``data/``, writes
    ``training_loss.png`` and saves the model and tokenizer to the
    ``en_ur_transformer_model`` directory.
    """
    # Path to your dataset
    data_path = 'data/sample_en_ur_corpus.csv'

    # Step 1: Download/prepare dataset (in practice, download from Kaggle)
    if not os.path.exists(data_path):
        download_kaggle_dataset()

    # Step 2: Load and preprocess data
    try:
        english_texts, urdu_texts = load_data(data_path)
        print(f"Loaded {len(english_texts)} sentence pairs")
    except (OSError, KeyError, ValueError) as e:
        # Covers a missing/unreadable file, absent columns and pandas parse
        # errors (which derive from ValueError) without masking programming bugs.
        print(f"Error loading data: {e}")
        # Fall back to the sample data if loading fails
        english_texts = [
            'Hello, how are you?',
            'My name is Saad.',
            'I live in Islamabad.',
            'The weather is nice today.',
            'I love programming.',
        ]
        # Kept identical to the sample corpus so every target really is the
        # translation of its source sentence.
        urdu_texts = [
            'ہیلو، آپ کیسے ہیں؟',
            'میرا نام سعد ہے۔',
            'میں اسلام آباد میں رہتا ہوں۔',
            'آج موسم اچھا ہے۔',
            'مجھے پروگرامنگ پسند ہے۔',
        ]
        print("Using sample data instead")

    # Step 3: Split the data into training and test sets
    train_en, test_en, train_ur, test_ur = train_test_split(
        english_texts, urdu_texts, test_size=0.2, random_state=42
    )

    # Step 4: Load pre-trained model and tokenizer
    # Using Helsinki-NLP/opus-mt-en-ur from Hugging Face's Transformers
    model_name = "Helsinki-NLP/opus-mt-en-ur"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    # Step 5: Create datasets and dataloaders
    train_dataset = ParallelCorpusDataset(train_en, train_ur, tokenizer)
    test_dataset = ParallelCorpusDataset(test_en, test_ur, tokenizer)

    batch_size = 16
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    # Step 6: Initialize optimizer (constant learning rate; no scheduler is used)
    optimizer = optim.AdamW(model.parameters(), lr=5e-5)

    # Step 7: Train the model
    print("Starting training...")
    num_epochs = 3  # Reduced for demonstration, can be increased for better results
    training_losses = train_model(model, train_dataloader, optimizer, device, num_epochs=num_epochs)

    # Step 8: Evaluate the model
    print("Evaluating model...")
    bleu_score, predictions, references = evaluate_model(model, test_dataloader, tokenizer, device)
    print(f"BLEU score: {bleu_score:.2f}")

    # Step 9: Plot training loss
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, num_epochs + 1), training_losses, marker='o')
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.savefig('training_loss.png')
    # Release the figure once it is on disk so it does not stay in memory
    plt.close()

    # Step 10: Demonstrate some translations
    sample_texts = test_en[:5]
    translations = translate_examples(model, tokenizer, sample_texts, device)

    print("\nSample Translations:")
    for i, (source, target, pred) in enumerate(zip(sample_texts, test_ur[:5], translations)):
        print(f"\nExample {i+1}:")
        print(f"Source (English): {source}")
        print(f"Target (Urdu): {target}")
        print(f"Predicted (Urdu): {pred}")

    # Step 11: Save the model
    model_save_path = 'en_ur_transformer_model'
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"\nModel saved to {model_save_path}")

# Run the full pipeline only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()