# Notebook 5: Translation (Nepali ↔ English)

This notebook implements:
- Using pretrained Helsinki-NLP models for translation
- Nepali to English translation
- English to Nepali translation
- Batch translation support
- Translation quality evaluation

In [None]:
# Import libraries
import json
import os
import warnings
from pathlib import Path

# Silence library warnings before the heavier third-party imports run.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer, pipeline

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print("✓ Libraries imported")

## 1. Configuration

In [None]:
# Paths
def resolve_base_dir(default, env_var='NEWS_PROJECT_DIR'):
    """Return the project root directory as a Path.

    If the environment variable `env_var` is set to a non-blank value, that
    value is used; otherwise `default` is used. This lets the notebook run on
    another machine without editing the hard-coded location.
    """
    override = os.environ.get(env_var, '').strip()
    return Path(override) if override else Path(default)


# Default is the original author's checkout; set NEWS_PROJECT_DIR to relocate.
BASE_DIR = resolve_base_dir(r'c:\Users\sagun\Desktop\news_project')
DATA_DIR = BASE_DIR / 'data' / 'processed'
MODEL_DIR = BASE_DIR / 'models' / 'translator'
RESULTS_DIR = BASE_DIR / 'results' / 'translations'

# Output folders are created up front so later cells can write unconditionally.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Pretrained translation models
# Note: no dedicated Nepali<->English checkpoint is configured here, so the
# multilingual ("mul") OPUS-MT checkpoints are used in both directions.
# NOTE(review): per its model card, the en->mul checkpoint expects a leading
# ">>id<<" target-language token on each source sentence - confirm callers add it.
NE_TO_EN_MODEL = "Helsinki-NLP/opus-mt-mul-en"  # Multilingual to English
EN_TO_NE_MODEL = "Helsinki-NLP/opus-mt-en-mul"  # English to Multilingual

print(f"Nepali→English Model: {NE_TO_EN_MODEL}")
print(f"English→Nepali Model: {EN_TO_NE_MODEL}")

## 2. Load Pretrained Translation Models

In [None]:
def load_translation_model(model_name):
    """Load a Marian tokenizer/model pair for `model_name`.

    The model is moved to the notebook-level `device` and switched to
    evaluation mode. Returns a `(tokenizer, model)` tuple.
    """
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
    model.eval()
    return tokenizer, model


# Load Nepali to English model
print(f"Loading Nepali→English model: {NE_TO_EN_MODEL}...")
ne_to_en_tokenizer, ne_to_en_model = load_translation_model(NE_TO_EN_MODEL)
print("✓ Nepali→English model loaded")

# Load English to Nepali model
print(f"\nLoading English→Nepali model: {EN_TO_NE_MODEL}...")
en_to_ne_tokenizer, en_to_ne_model = load_translation_model(EN_TO_NE_MODEL)
print("✓ English→Nepali model loaded")

## 3. Translation Functions

In [None]:
def _run_translation(text, get_backend, max_length, prefix=""):
    """Shared tokenize -> generate -> decode pipeline for both directions.

    Args:
        text: A single string, or a list/tuple of strings for batch translation.
        get_backend: Zero-argument callable returning `(tokenizer, model)`.
            It is resolved lazily, only when there is something to translate,
            so a missing model is reported like any other translation error.
        max_length: Token limit used both for input truncation and generation.
        prefix: Text prepended to every source sentence (e.g. a language token).

    Returns:
        A string for string input, or a list of strings (same order and length
        as the input) for list/tuple input. Blank or non-string entries, and
        any entries affected by a failure, come back as "".
    """
    is_batch = isinstance(text, (list, tuple))
    items = list(text) if is_batch else [text]
    outputs = [""] * len(items)

    # Only non-blank strings are sent to the model; everything else stays "".
    pending = [
        (position, prefix + item)
        for position, item in enumerate(items)
        if isinstance(item, str) and item.strip()
    ]

    if pending:
        try:
            tokenizer, model = get_backend()

            # Tokenize
            inputs = tokenizer(
                [source for _, source in pending],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            ).to(device)

            # Translate
            model.eval()
            with torch.no_grad():
                generated = model.generate(**inputs, max_length=max_length)

            # Decode
            decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
            for (position, _), translated in zip(pending, decoded):
                outputs[position] = translated
        except Exception as e:
            # Best-effort by design: report the failure and return empty text.
            print(f"Translation error: {e}")

    return outputs if is_batch else outputs[0]


def translate_ne_to_en(text, max_length=512):
    """
    Translate Nepali text to English using pretrained model

    Accepts one string (returns a string) or a list/tuple of strings
    (returns a list). Returns "" for blank input or when translation fails.
    """
    return _run_translation(
        text,
        lambda: (ne_to_en_tokenizer, ne_to_en_model),
        max_length,
    )


def translate_en_to_ne(text, max_length=512, target_lang_token=">>nep<<"):
    """
    Translate English text to Nepali using pretrained model

    Accepts one string (returns a string) or a list/tuple of strings
    (returns a list). Returns "" for blank input or when translation fails.

    `target_lang_token` is prepended to each sentence because the
    English->multilingual checkpoint selects its output language from a
    leading ">>id<<" token. Pass "" or None to send the text unmodified.
    NOTE(review): ">>nep<<" is assumed to be the Nepali code - confirm it
    appears in `en_to_ne_tokenizer.supported_language_codes`.
    """
    prefix = f"{target_lang_token} " if target_lang_token else ""
    return _run_translation(
        text,
        lambda: (en_to_ne_tokenizer, en_to_ne_model),
        max_length,
        prefix=prefix,
    )

# Test translations
print("Testing translation functions:\n")

# Sample sentence pair used as a quick smoke test of both directions.
nepali_text = "काठमाडौंमा आज मौसम राम्रो छ।"
english_text = "The weather is good in Kathmandu today."

print(f"Nepali: {nepali_text}")
print(f"→ English: {translate_ne_to_en(nepali_text)}")
print(f"\nEnglish: {english_text}")
print(f"→ Nepali: {translate_en_to_ne(english_text)}")

## 4. Load Data

In [None]:
# Load test data
with open(DATA_DIR / 'test_data.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

df = pd.DataFrame(test_data)
print(f"✓ Loaded {len(df)} articles")

# Later cells read row['text'] and row['category']; fail here with a clear
# message rather than part-way through the translation loop.
missing_columns = {'text', 'category'} - set(df.columns)
if len(df) and missing_columns:
    raise KeyError(f"test_data.json is missing required fields: {sorted(missing_columns)}")

# Use a sample for demonstration (fixed seed so reruns pick the same articles)
sample_size = min(30, len(df))
sample_df = df.sample(n=sample_size, random_state=42).copy()
print(f"✓ Using {sample_size} articles for translation")

## 5. Translate Articles

In [None]:
# Translate Nepali articles to English
print("Translating Nepali articles to English...\n")

# Only the first N whitespace-separated words of each article are translated,
# for efficiency.
MAX_WORDS_TO_TRANSLATE = 200

TRANSLATION_COLUMNS = [
    'original_nepali',
    'english_translation',
    'category',
    'original_length',
    'translated_source_length',
    'translation_length',
]

translations = []

for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Translating"):
    # Missing/NaN text is treated as an empty article instead of crashing on .split().
    raw_text = row['text']
    article_text = raw_text if isinstance(raw_text, str) else ""

    text_to_translate = ' '.join(article_text.split()[:MAX_WORDS_TO_TRANSLATE])
    english_translation = translate_ne_to_en(text_to_translate)

    translations.append({
        'original_nepali': article_text,
        'english_translation': english_translation,
        'category': row['category'],
        # Length of the full article, in characters.
        'original_length': len(article_text),
        # Length of the excerpt that was actually sent to the model.
        'translated_source_length': len(text_to_translate),
        'translation_length': len(english_translation)
    })

# Explicit columns keep the frame's schema stable even when the sample is empty.
translations_df = pd.DataFrame(translations, columns=TRANSLATION_COLUMNS)
print("\n✓ Translation complete!")

## 6. Analyze Translations

In [None]:
# Calculate statistics
# Note: 'original_length' counts the full article, while only its first 200
# words were translated, so the ratio understates the true per-text expansion.
print("Translation Statistics:\n")
print(f"Total translations: {len(translations_df)}")
print(f"Average original length: {translations_df['original_length'].mean():.0f} chars")
print(f"Average translation length: {translations_df['translation_length'].mean():.0f} chars")

# Zero-length originals would turn the ratio into inf and poison the mean;
# mask them to NaN so .mean() skips them.
nonzero_original_length = translations_df['original_length'].where(
    translations_df['original_length'] > 0
)
ratio_for_stats = translations_df['translation_length'] / nonzero_original_length
print(f"Average length ratio: {ratio_for_stats.mean():.2f}")

## 7. Visualizations

In [None]:
# Visualize translation statistics
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, hist_ax = axes

# Length comparison
longest_original = translations_df['original_length'].max()
scatter_ax.scatter(translations_df['original_length'], translations_df['translation_length'],
                   alpha=0.6, s=50, color='steelblue')
# Diagonal reference: points on it have equal source and translation length.
scatter_ax.plot([0, longest_original], [0, longest_original], 'r--', label='1:1 ratio')
scatter_ax.set_title('Original vs Translation Length', fontweight='bold', fontsize=12)
scatter_ax.set_xlabel('Original Length (Nepali)')
scatter_ax.set_ylabel('Translation Length (English)')
scatter_ax.legend()
scatter_ax.grid(alpha=0.3)

# Length ratio distribution
# Empty originals give inf/NaN ratios, which make hist() raise; drop them.
length_ratio = (
    (translations_df['translation_length'] / translations_df['original_length'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
mean_ratio = length_ratio.mean()
hist_ax.hist(length_ratio, bins=20, color='lightcoral', edgecolor='black')
hist_ax.set_title('Translation Length Ratio Distribution', fontweight='bold', fontsize=12)
hist_ax.set_xlabel('Translation/Original Length Ratio')
hist_ax.set_ylabel('Frequency')
hist_ax.axvline(mean_ratio, color='red', linestyle='--', label=f'Mean: {mean_ratio:.2f}')
hist_ax.legend()

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'translation_statistics.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"✓ Visualization saved to {RESULTS_DIR / 'translation_statistics.png'}")

## 8. Save Results

In [None]:
# Save translations
translations_path = RESULTS_DIR / 'nepali_to_english_translations.json'
translations_output = translations_df.to_dict('records')

# ensure_ascii=False keeps the Nepali text readable in the saved file.
with open(translations_path, 'w', encoding='utf-8') as f:
    json.dump(translations_output, f, ensure_ascii=False, indent=2)

print(f"✓ Translations saved to {translations_path}")

# Save statistics
# Average only finite ratios: an inf (from a zero-length original) would be
# written as the non-standard JSON token `Infinity`.
finite_length_ratio = length_ratio[np.isfinite(length_ratio)]

stats = {
    'total_translations': len(translations_df),
    'avg_original_length': float(translations_df['original_length'].mean()),
    'avg_translation_length': float(translations_df['translation_length'].mean()),
    'avg_length_ratio': float(finite_length_ratio.mean()),
    'models_used': {
        'nepali_to_english': NE_TO_EN_MODEL,
        'english_to_nepali': EN_TO_NE_MODEL
    }
}

statistics_path = RESULTS_DIR / 'translation_statistics.json'
with open(statistics_path, 'w', encoding='utf-8') as f:
    json.dump(stats, f, ensure_ascii=False, indent=2)

print(f"✓ Statistics saved to {statistics_path}")

## 9. Example Translations

In [None]:
# Display example translations
banner = "=" * 80
print(banner)
print("EXAMPLE TRANSLATIONS")
print(banner)

# Show at most the first five translated articles, 300 characters of each side.
for number, (_, row) in enumerate(translations_df.head(5).iterrows(), start=1):
    print(f"\n📰 Article {number} - Category: {row['category']}")
    print("-" * 80)
    print(f"\nOriginal (Nepali):\n{row['original_nepali'][:300]}...\n")
    print(f"Translation (English):\n{row['english_translation'][:300]}...")
    print("\n" + banner)

## 10. Cross-Language Summarization Demo

In [None]:
# Demonstrate cross-language processing with a round trip
# (Translate Nepali → English → translate back to Nepali).
# NOTE: no summarization step is performed in this cell; it only chains the
# two translation directions.

print("Cross-Language Summarization Demo:\n")
print("="*80)

if len(sample_df) == 0:
    # Nothing to demonstrate on; avoid an IndexError from iloc[0].
    print("No articles available for the demo.")
else:
    sample_article = sample_df.iloc[0]['text']

    print("Step 1: Original Nepali Article")
    print(f"{sample_article[:200]}...\n")

    print("Step 2: Translate to English")
    english_version = translate_ne_to_en(sample_article)
    print(f"{english_version[:200]}...\n")

    print("Step 3: Translate back to Nepali")
    back_to_nepali = translate_en_to_ne(english_version)
    print(f"{back_to_nepali[:200]}...\n")

    print("="*80)
    print("This demonstrates the capability for cross-language processing!")

## 11. Summary

In [None]:
# Final recap: models used, headline statistics, and where outputs were written.
summary_rule = "=" * 80

print(summary_rule)
print("TRANSLATION SUMMARY")
print(summary_rule)
print("\n🤖 Pretrained Models:")
print(f"  • Nepali→English: {NE_TO_EN_MODEL}")
print(f"  • English→Nepali: {EN_TO_NE_MODEL}")
print("\n📊 Translation Statistics:")
print(f"  • Total translations: {stats['total_translations']}")
print(f"  • Avg original length: {stats['avg_original_length']:.0f} chars")
print(f"  • Avg translation length: {stats['avg_translation_length']:.0f} chars")
print(f"  • Avg length ratio: {stats['avg_length_ratio']:.2f}")
print("\n💾 Saved Files:")
print(f"  • Translations: {RESULTS_DIR / 'nepali_to_english_translations.json'}")
print(f"  • Statistics: {RESULTS_DIR / 'translation_statistics.json'}")
print(f"  • Visualization: {RESULTS_DIR / 'translation_statistics.png'}")
print("\n✅ Translation completed successfully!")
print(summary_rule)