In [5]:
"""
Simple phrase translator pipeline (single-file)
Requirements (install once):
pip install transformers datasets sentencepiece evaluate sacrebleu accelerate

Usage:
- Set src_lang and tgt_lang in the CONFIG section (e.g. 'en' -> 'de' for English to German)
- Run: python translator.py
"""

from transformers import (
    MarianMTModel, MarianTokenizer,
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
)
from datasets import Dataset
import torch

# ---------- CONFIG ----------
src_lang = "en"   # source language code (e.g., 'en')
tgt_lang = "hi"   # target language code (e.g., 'hi' for Hindi, 'de' for German)
# model_name convention for Helsinki-NLP: "Helsinki-NLP/opus-mt-{src}-{tgt}"
model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------- LOAD PRE-TRAINED TRANSLATION MODEL (inference-ready) ----------
print("Loading model:", model_name)
try:
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
except Exception as e:
    # Fallback generic Seq2Seq loader if Marian tokenizer/model not found.
    # Report the original failure instead of discarding it, so a bad language
    # pair or a missing dependency is visible if the fallback fails as well.
    print(f"Marian loader failed ({e!r}); retrying with the Auto* classes.")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

def translate_phrases(phrases, max_length=128):
    """
    Translate phrases using the module-level ``tokenizer`` and ``model``.

    Args:
        phrases: a single string, or an iterable of strings.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        A list of translated strings, one per input phrase, in input order.
        An empty input yields an empty list without invoking the model.
    """
    if isinstance(phrases, str):
        phrases = [phrases]
    else:
        # Materialise tuples/generators so the emptiness check and the
        # tokenizer both see a concrete list.
        phrases = list(phrases)
    if not phrases:
        # Nothing to translate; skip tokenization and generation entirely.
        return []
    # prep inputs
    batch = tokenizer(phrases, return_tensors="pt", padding=True, truncation=True).to(device)
    # generate (no gradients are needed for inference)
    with torch.no_grad():
        outputs = model.generate(**batch, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Quick demo (pre-trained)
demo_phrases = [
    "Hello, how are you?",
    "Where is the nearest train station?",
    "Can you recommend a good restaurant?",
]
print("\n--- Pre-trained model translations ---")
# Label both sides from the configured language codes so the output stays
# correct when src_lang is changed in the CONFIG section.
for src, tgt in zip(demo_phrases, translate_phrases(demo_phrases)):
    print(f"{src_lang.upper()} -> {tgt_lang.upper()} : {src}  =>  {tgt}")

# ---------- OPTIONAL: Fine-tune on a small custom phrase-pair dataset ----------
# If you only want to use the pre-trained model, skip everything below.
def fine_tune_on_phrase_pairs(pairs, output_dir="./fine_tuned_translator", epochs=3, batch_size=8):
    """
    Fine-tune the module-level ``model`` on a small list of phrase pairs.

    Args:
        pairs: list of dicts like [{"src": "Hello", "tgt": "Hallo"}, ...].
            The "tgt" text must be in the language the loaded model produces.
        output_dir: directory passed to the trainer and to ``save_model``.
        epochs: number of training epochs.
        batch_size: per-device training batch size.

    Raises:
        ValueError: if ``pairs`` is empty.

    NOTE: For meaningful improvements you need many more pairs and more compute.
    """
    pairs = list(pairs)
    if not pairs:
        raise ValueError("pairs must contain at least one {'src': ..., 'tgt': ...} entry")

    print("\nPreparing to fine-tune on custom data (this is optional and small-scale)...")
    # Build a HuggingFace Dataset
    ds = Dataset.from_list([{"text": p["src"], "translation": p["tgt"]} for p in pairs])

    # Tokenize
    def preprocess(batch):
        # text_target tokenizes the references as target-side text and stores
        # their ids under "labels". No padding here: the data collator pads
        # each training batch and masks label padding so it is not scored.
        return tokenizer(
            batch["text"],
            text_target=batch["translation"],
            truncation=True,
            max_length=128,
        )

    tokenized = ds.map(preprocess, batched=True, remove_columns=ds.column_names)

    # Data collator for seq2seq (dynamic per-batch padding)
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # Training arguments (very small and quick)
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        num_train_epochs=epochs,
        logging_strategy="epoch",
        save_strategy="epoch",
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),
        remove_unused_columns=True,
        push_to_hub=False,
        report_to="none",
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model(output_dir)
    # Training leaves the shared model in train mode; switch back so later
    # translate_phrases calls generate without dropout.
    model.eval()
    print(f"Fine-tuned model saved to {output_dir}")

# Example tiny custom dataset (for demo only).
# The English sources are fixed; the references are looked up by tgt_lang so
# they match the language the loaded model translates into.
_demo_sources = [
    "Good morning",
    "Thank you very much",
    "Excuse me, where is the restroom?",
    "How much does this cost?",
]
_demo_targets = {
    "de": [
        "Guten Morgen",
        "Vielen Dank",
        "Entschuldigung, wo ist die Toilette?",
        "Wie viel kostet das?",
    ],
    "hi": [
        "सुप्रभात",
        "बहुत-बहुत धन्यवाद",
        "माफ़ कीजिए, शौचालय कहाँ है?",
        "इसकी कीमत कितनी है?",
    ],
}
# Empty when no demo references exist for the configured target language;
# supply your own pairs in that case.
custom_pairs = [
    {"src": s, "tgt": t}
    for s, t in zip(_demo_sources, _demo_targets.get(tgt_lang, []))
]

# Uncomment the next line to run a tiny fine-tune (warning: will use CPU or GPU and take time)
# fine_tune_on_phrase_pairs(custom_pairs, epochs=1, batch_size=4)

# If you fine-tuned and saved to output_dir, you can load it like:
def load_fine_tuned_model(path="./fine_tuned_translator"):
    """
    Load a tokenizer and seq2seq model previously saved under ``path``.

    Returns a ``(tokenizer, model)`` tuple; the model is moved to the
    module-level ``device`` before being returned.
    """
    print("Loading fine-tuned model from", path)
    loaded_tokenizer = AutoTokenizer.from_pretrained(path)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(path)
    loaded_model = loaded_model.to(device)
    return loaded_tokenizer, loaded_model

# Example: post-fine-tune translate (if you ran fine_tune_on_phrase_pairs and saved)
# tok, mod = load_fine_tuned_model("./fine_tuned_translator")
# >>> use mod & tok similarly to the translate_phrases function above

# ---------- UTILITY: interactive CLI ----------
if __name__ == "__main__":
    # Read phrases until the user submits an empty line or closes the input.
    print("\nInteractive demo: type a phrase to translate (empty line to quit).")
    while True:
        try:
            src = input(f"({src_lang} -> {tgt_lang}) Enter phrase: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D, Ctrl-C, or exhausted piped stdin: finish the prompt line
            # and exit the loop instead of dying with a traceback.
            print()
            break
        if not src:
            break
        out = translate_phrases([src])[0]
        print("→", out)
    print("Goodbye!")

Loading model: Helsinki-NLP/opus-mt-en-hi


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


--- Pre-trained model translations ---


model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

EN -> HI : Hello, how are you?  =>  हैलो, तुम कैसे हो?
EN -> HI : Where is the nearest train station?  =>  सबसे नज़दीकी स्टेशन कहाँ है?
EN -> HI : Can you recommend a good restaurant?  =>  क्या आप एक अच्छे रेस्तराँ की सिफारिश कर सकते हैं?

Interactive demo: type a phrase to translate (empty line to quit).
(en -> hi) Enter phrase: hello
→ सलाम
(en -> hi) Enter phrase: hi
→ हाय
(en -> hi) Enter phrase: love
→ प्रेम
(en -> hi) Enter phrase: 
Goodbye!
