<a href="https://colab.research.google.com/github/shubhsharma097/Cross-Lingual-Knowledge-Transfer/blob/main/Cross_Lingual_Knowledge_Transfer_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cross-Lingual Knowledge Transfer - Direction 1
!pip install -q transformers datasets accelerate sacrebleu sentencepiece

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import pandas as pd
from sacrebleu import corpus_bleu, corpus_chrf
from tqdm import tqdm
import json
from huggingface_hub import login
from google.colab import userdata
import matplotlib.pyplot as plt

# Settings for the translation experiment; the code below looks values up by key.
CONFIG = {
    # Model under evaluation (Hugging Face Hub identifier)
    "model_name": "google/gemma-2-2b-it",

    # Language pair: English-to-German
    "source_lang": "English",
    "target_lang": "German",

    # Parallel corpus and the language-pair configuration to load
    "dataset_name": "wmt19",
    "dataset_config": "de-en",

    # Number of sentence pairs taken from the dataset
    "num_samples": 50,

    # Generation settings (temperature only matters when generation samples)
    "temperature": 0.1,
    "max_new_tokens": 256,
}

# Retrieve the Hugging Face token from the Colab secret named HF_TOKEN.
# `userdata.get` raises instead of returning an empty value when the secret
# does not exist or the notebook has not been granted access to it, so those
# errors are caught here to keep this step best-effort.
try:
    hf_token = userdata.get('HF_TOKEN')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as e:
    print(f"Could not read HF_TOKEN from Colab secrets: {e}")
    hf_token = None

if hf_token:
    try:
        login(token=hf_token)
    except Exception as e:
        # Report the failure but keep going; a later download will surface
        # the problem if authentication was actually required.
        print(f"Login failed to Hugging Face: {e}")
    else:
        print("Successfully logged in to Hugging Face")
else:
    print("Token not found!")


# Load the tokenizer and the causal language model named in CONFIG
print(f"\nLoading model: {CONFIG['model_name']}...")

tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

# bfloat16 weights, with device placement delegated to device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    CONFIG['model_name'], torch_dtype=torch.bfloat16, device_map="auto"
)

# Load the validation split of the dataset configured in CONFIG
# (WMT19 German-English with the settings above)
dataset = load_dataset(
    CONFIG['dataset_name'],
    CONFIG['dataset_config'],
    split="validation",
)

# Keep only the first CONFIG['num_samples'] sentence pairs
test_data = dataset.select(range(CONFIG['num_samples']))

# Sanity check: show the first source/reference pair
print("\nFirst example:")
print(f"English: {test_data[0]['translation']['en']}")
print(f"German: {test_data[0]['translation']['de']}")


def translate(source_text, source_lang, target_lang, model, tokenizer,
              *, max_new_tokens=None, do_sample=False, temperature=None):
    """Translate ``source_text`` from ``source_lang`` to ``target_lang`` with an LLM.

    Args:
        source_text: Text to translate.
        source_lang: Name of the source language as written in the prompt.
        target_lang: Name of the target language as written in the prompt.
        model: Hugging Face causal language model exposing ``generate`` and ``device``.
        tokenizer: Hugging Face tokenizer matching ``model``.
        max_new_tokens: Upper bound on generated tokens; defaults to
            ``CONFIG['max_new_tokens']`` when omitted.
        do_sample: Whether to sample instead of decoding greedily. Defaults
            to ``False`` so evaluation runs are deterministic.
        temperature: Sampling temperature, used only when ``do_sample`` is
            true; defaults to ``CONFIG['temperature']`` when omitted.

    Returns:
        The generated continuation of the prompt as a stripped string.
    """
    # Prompt to feed into the model
    prompt = f"""Translate the following {source_lang} text to {target_lang}. Provide only the translation, no explanations.


{source_lang}: {source_text}
{target_lang}:"""

    if max_new_tokens is None:
        max_new_tokens = CONFIG['max_new_tokens']

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    # Temperature is a sampling parameter: forwarding it together with
    # do_sample=False has no effect on the output and only triggers a warning.
    if do_sample:
        generation_kwargs["temperature"] = (
            CONFIG['temperature'] if temperature is None else temperature
        )

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate translation
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The returned sequence starts with the prompt tokens, so decode only what
    # was generated after them. This avoids splitting on "<target_lang>:"
    # (which breaks when the output itself contains that marker) and avoids
    # deleting the source text from the output (which erased translations that
    # legitimately equal the source, e.g. proper names).
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Run every selected example through the model, keeping the source sentence,
# the reference translation and the model output together for evaluation
print(f"\nTranslating {len(test_data)} examples")

translations = []

for example in tqdm(test_data, desc="Translating"):
    pair = example['translation']
    translations.append({
        "source": pair['en'],
        "reference": pair['de'],
        "translation": translate(
            pair['en'],
            CONFIG['source_lang'],
            CONFIG['target_lang'],
            model,
            tokenizer,
        ),
    })

print("Translation complete.")


# Score the model output against the references with corpus-level BLEU and ChrF

print("Measuring translation performance:")

# sacrebleu takes the hypotheses as a flat list and the references as a list
# of reference streams; there is exactly one reference per segment here
hypotheses = [row["translation"] for row in translations]
references = [[row["reference"] for row in translations]]

# BLEU
bleu = corpus_bleu(hypotheses, references)
print(f"\nBLEU Score: {bleu.score:.2f}")

# ChrF
chrf = corpus_chrf(hypotheses, references)
print(f"ChrF Score: {chrf.score:.2f}")


# Present findings with a table or plot

# One-row summary table of the experiment settings and the two scores
results_df = pd.DataFrame({
    "Model": [CONFIG['model_name']],
    # \u2192 is the right arrow; written as an escape so the label cannot be
    # corrupted again by a file re-encoding (it previously rendered as mojibake)
    "Language Pair": [f"{CONFIG['source_lang']}\u2192{CONFIG['target_lang']}"],
    "Dataset": [CONFIG['dataset_name']],
    "Sample Size": [CONFIG['num_samples']],
    "BLEU": [f"{bleu.score:.2f}"],
    "ChrF": [f"{chrf.score:.2f}"],
})

print("\n" + "="*80)
print("RESULTS SUMMARY")
print("="*80)
print(results_df.to_markdown(index=False))


# Bar chart comparing the two corpus-level scores on a shared 0-100 axis
metrics = ['BLEU', 'ChrF']
scores = [bleu.score, chrf.score]

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(metrics, scores, color=['#4CAF50', '#2196F3'])
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Translation Quality Metrics', fontsize=14, fontweight='bold')
ax.set_ylim(0, 100)

# Print each value just above its bar
for i, score in enumerate(scores):
    ax.text(i, score + 2, f'{score:.2f}', ha='center', fontsize=11)

fig.tight_layout()
plt.show()