<a href="https://colab.research.google.com/github/prabhatpathak77/punjabi-to-english-SLM/blob/main/Punjabi_to_English_pnyb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
# Punjabi → English (pa→en) Mini Translator
# - Robust to missing files/keys
# - Small model (Helsinki-NLP/opus-mt-pa-en)
# - Safe defaults to avoid OOM on Colab
# - Early stopping + eval BLEU
# ============================================

# 1) Install deps (quiet & compatible)
!pip -q install "transformers>=4.41" "datasets>=2.19" "accelerate>=0.31" "evaluate>=0.4.2" sacrebleu sentencepiece

import os, json, math, random
from pathlib import Path

import torch
import evaluate
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
)

# Silence tokenizers warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 2) Ensure dataset is present (prompts upload if not)
# The rest of the script reads the dataset from this fixed path.
data_path = Path("/content/data.json")
if not data_path.exists():
    # Guard only the import: outside Colab there is no upload widget, so the
    # user has to place the file manually. Other errors propagate unchanged
    # instead of being re-labelled as a missing file.
    try:
        from google.colab import files  # type: ignore
    except ImportError as e:
        raise FileNotFoundError(
            f"data.json is required at {data_path}. "
            "Upload it or place it there before running the notebook."
        ) from e

    print("❗ data.json not found. Please select your data.json to upload.")
    uploaded = files.upload()
    if "data.json" not in uploaded:
        raise FileNotFoundError("data.json was not uploaded.")

    # files.upload() returns a {filename: bytes} mapping. Write the payload to
    # data_path explicitly so loading works even when the working directory
    # is not /content.
    data_path.parent.mkdir(parents=True, exist_ok=True)
    data_path.write_bytes(uploaded["data.json"])

# 3) Load & validate dataset
def load_parallel_json(filepath: Path):
    """Load de-duplicated (source, target) sentence pairs from a JSON file.

    The file must contain a non-empty JSON list. The source/target keys are
    auto-detected from the first object in the list, using either flat keys
    (e.g. ``sourceText``/``targetText``) or the nested form
    ``{"translation": {"pa": ..., "en": ...}}``.

    Rows that are not objects, or whose source/target values are missing,
    non-string, or blank after stripping, are skipped rather than raising.

    Args:
        filepath: Path of the UTF-8 encoded JSON file.

    Returns:
        A list of ``(source, target)`` string tuples, whitespace-stripped,
        with exact duplicates removed and first-seen order preserved.

    Raises:
        ValueError: If the payload is not a non-empty list of objects, if no
            known key pair can be detected, or if no valid pair remains.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list) or not data:
        raise ValueError("data.json must be a non-empty list of objects.")

    # Detect keys from the first row that is actually an object, so a stray
    # scalar at the top of the list cannot break (or spoof) the detection.
    sample = next((row for row in data if isinstance(row, dict)), None)
    if sample is None:
        raise ValueError("data.json must be a non-empty list of objects.")

    # Try to auto-detect source/target keys
    cand_src = ["sourceText", "src", "source", "pa", "sentence_pa", "input", "from"]
    cand_tgt = ["targetText", "tgt", "target", "en", "sentence_en", "output", "to"]

    src_key = next((k for k in cand_src if k in sample), None)
    tgt_key = next((k for k in cand_tgt if k in sample), None)

    if src_key is None or tgt_key is None:
        # Also support nested {"translation": {"pa": "...", "en": "..."}}
        nested = sample.get("translation")
        if isinstance(nested, dict) and "pa" in nested and "en" in nested:
            src_key, tgt_key = ("translation.pa", "translation.en")
        else:
            raise ValueError(
                "Could not find parallel text keys. "
                "Expected keys like sourceText/targetText or translation.pa/translation.en."
            )

    def get_val(row, key):
        """Fetch ``key`` from ``row``; dotted keys address one nested level."""
        if "." in key:
            top, sub = key.split(".", 1)
            inner = row.get(top)
            # A non-dict (or missing) container simply yields no value.
            return inner.get(sub, "") if isinstance(inner, dict) else ""
        return row.get(key, "")

    pairs = []
    for row in data:
        if not isinstance(row, dict):
            continue  # tolerate malformed rows instead of crashing
        src = get_val(row, src_key)
        tgt = get_val(row, tgt_key)
        if isinstance(src, str) and isinstance(tgt, str):
            src, tgt = src.strip(), tgt.strip()
            if src and tgt:
                pairs.append((src, tgt))

    if not pairs:
        raise ValueError("No valid (source, target) sentence pairs found.")

    # Deduplicate exact duplicates (dict keys keep insertion order)
    return list(dict.fromkeys(pairs))

pairs = load_parallel_json(data_path)
print(f"✅ Loaded {len(pairs)} sentence pairs.")

# 4) Build HF Dataset with train/valid split
n = len(pairs)
if n < 2:
    # With a single pair the validation slice would take everything and leave
    # an empty training set, which only fails later inside the Trainer.
    raise ValueError(
        "Need at least 2 sentence pairs to build a train/validation split."
    )

# Fixed seed so the same rows land in the validation split on every run.
random.seed(42)
random.shuffle(pairs)
valid_size = max(1, min( int(0.1 * n), 200 ))  # up to 10% or 200 lines for quick eval
train_pairs = pairs[valid_size:]
valid_pairs = pairs[:valid_size]

def to_dataset(pairs):
    """Turn a sequence of (source, target) tuples into a two-column Dataset.

    The source texts go into the "pa" column and the target texts into the
    "en" column, in the order given.
    """
    sources, targets = [], []
    for source_text, target_text in pairs:
        sources.append(source_text)
        targets.append(target_text)
    columns = {"pa": sources, "en": targets}
    return Dataset.from_dict(columns)

# Bundle both splits under the names the training code looks up later.
raw = DatasetDict()
raw["train"] = to_dataset(train_pairs)
raw["validation"] = to_dataset(valid_pairs)
print(raw)

# 5) Model & tokenizer (small Marian model)
# Both are loaded from the same checkpoint id so their vocabularies match.
model_id = "Helsinki-NLP/opus-mt-pa-en"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# 6) Tokenization
# Token-length caps applied (with truncation) to source and target text in
# preprocess().
max_source_len = 128
max_target_len = 128

def preprocess(batch):
    """Tokenize a batch of parallel sentences for seq2seq training.

    Args:
        batch: Mapping with a "pa" list (source sentences) and an "en" list
            (target sentences) of equal length.

    Returns:
        The tokenized source encoding with an added "labels" entry holding
        the target token ids. Both sides are truncated to
        ``max_source_len`` / ``max_target_len`` respectively; padding is left
        to the data collator.
    """
    model_inputs = tokenizer(
        batch["pa"], max_length=max_source_len, truncation=True
    )
    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager; it encodes the text with the
    # target-side vocabulary.
    labels = tokenizer(
        text_target=batch["en"], max_length=max_target_len, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches; the raw text columns are dropped so only
# the fields produced by preprocess() remain.
tokenized = raw.map(preprocess, batched=True, remove_columns=["pa", "en"])

# 7) Data collator
# Built with both tokenizer and model; batches are assembled by it at
# train/eval time.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 8) Metrics (BLEU)
# Loaded once at module level and reused by compute_metrics().
bleu = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Normalize decoded text into the shape the BLEU metric is fed.

    Predictions are whitespace-stripped strings; every label is stripped and
    wrapped in a one-element list (one reference per prediction).

    Returns:
        A ``(predictions, references)`` tuple of lists.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_refs = []
    for ref in labels:
        wrapped_refs.append([ref.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Compute corpus BLEU for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id batches as
            handed over by the trainer. ``predictions`` may itself be a
            tuple, in which case its first element is used.

    Returns:
        ``{"bleu": <sacrebleu score>}``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _unmask(batch):
        """Swap the -100 ignore-index for the pad id so ids can be decoded."""
        return [[(tok if tok != -100 else pad_id) for tok in seq] for seq in batch]

    # -100 cannot be decoded. Labels always carry it as padding; predictions
    # are sanitized as well, defensively, since batches of different lengths
    # may be padded with the same sentinel when gathered.
    decoded_preds = tokenizer.batch_decode(_unmask(preds), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_unmask(labels), skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# 9) Training args (safe on T4/CPU)
use_cuda = torch.cuda.is_available()
fp16 = use_cuda  # safe mixed precision on GPU
bf16 = False     # set True on A100 if you want; False keeps it universal

train_bs = 8 if use_cuda else 4
eval_bs  = 8 if use_cuda else 4
grad_accum = 2   # helps avoid OOM

# Evaluate/checkpoint at least once per epoch. A fixed interval of 200 can be
# larger than a whole epoch on small datasets, leaving a single checkpoint for
# best-model selection and never letting early stopping trigger. Large
# datasets keep the original cadence of 200 steps.
# NOTE(review): assumes a single device; with several GPUs the real number of
# updates per epoch is smaller.
updates_per_epoch = max(
    1, math.ceil(len(tokenized["train"]) / (train_bs * grad_accum))
)
eval_every = min(200, updates_per_epoch)

args = Seq2SeqTrainingArguments(
    output_dir="pa_en_small",
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=eval_bs,
    gradient_accumulation_steps=grad_accum,
    learning_rate=2e-5,
    num_train_epochs=3,
    eval_strategy="steps", # Changed evaluation_strategy to eval_strategy
    eval_steps=eval_every,
    # Kept equal to eval_steps: load_best_model_at_end needs a checkpoint at
    # every evaluation point.
    save_steps=eval_every,
    save_total_limit=2,
    logging_steps=50,
    predict_with_generate=True,   # BLEU needs generated text, not logits
    generation_max_length=128,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    fp16=fp16,
    bf16=bf16,
    report_to="none",
)

# Wire model, data, collator and metric together. Early stopping halts
# training after 3 consecutive evaluations without improvement of the
# metric selected in `args` (metric_for_best_model).
# NOTE(review): newer transformers releases appear to rename the `tokenizer`
# argument to `processing_class` — confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# 10) Train (with graceful OOM fallback to smaller batch)
import gc

hit_oom = False
try:
    trainer.train()
except RuntimeError as e:
    # Anything other than an out-of-memory failure is a real error.
    if "out of memory" not in str(e).lower():
        raise
    hit_oom = True

# The retry runs outside the `except` block on purpose: while the handler is
# active, the exception's traceback still references the tensors of the
# failed step, so the cache could not actually be released.
if hit_oom:
    print("⚠️ CUDA OOM detected. Halving per-device batch size and doubling gradient accumulation.")
    del trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Peak memory is driven by the per-device batch, not by the number of
    # accumulation steps: shrink the former and grow the latter so the
    # effective batch size stays roughly the same.
    args.per_device_train_batch_size = max(1, args.per_device_train_batch_size // 2)
    args.per_device_eval_batch_size = max(1, args.per_device_eval_batch_size // 2)
    args.gradient_accumulation_steps *= 2

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()

# 11) Save final model & tokenizer
# Both go into the same folder so it can be reloaded on its own with
# from_pretrained(save_dir).
save_dir = "punjabi_to_english_small"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"✅ Model saved to {save_dir}")

# 12) Simple inference helper
def translate(texts, max_new_tokens=128):
    """Translate one or more Punjabi sentences to English.

    Args:
        texts: A single string or an iterable of strings.
        max_new_tokens: Upper bound on generated tokens per sentence.

    Returns:
        A list of decoded translations, one per input sentence (a list even
        for a single string input). Empty input yields an empty list.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to encode; skip the tokenizer/model entirely.
        return []

    # generate() does not change the module's mode, and the model may still
    # be in training mode after fine-tuning; switch dropout off explicitly.
    model.eval()
    enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        gen = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            length_penalty=1.0
        )
    return tokenizer.batch_decode(gen, skip_special_tokens=True)

# Quick smoke test (will not error even on CPU)
# Use a validation sentence when one exists, otherwise a training sentence.
if len(raw["validation"]):
    sample_src = raw["validation"][0]["pa"]
else:
    sample_src = raw["train"][0]["pa"]
print("SRC:", sample_src)
print("PRED:", translate(sample_src)[0])

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Loaded 1997 sentence pairs.
DatasetDict({
    train: Dataset({
        features: ['pa', 'en'],
        num_rows: 1798
    })
    validation: Dataset({
        features: ['pa', 'en'],
        num_rows: 199
    })
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/817k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Map:   0%|          | 0/1798 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss,Bleu
200,4.2534,4.324522,8.689822


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


✅ Model saved to punjabi_to_english_small
SRC: ਮਦਰ-ਆਫ਼-ਥ੍ਰੀ ਵਿਲਾਬੀ ਅਤੇ ਪਤੀ ਡਾਨ ਬਾਲਡਵਿਨ ,ਜੋਨਜ਼ ਅਤੇ ਉਸਦੀ ਪਤਨੀ ਤਾਰਾ ਕੈਪ ਦੇ ਕਰੀਬ ਦਸ ਸਾਲਾਂ ਲਈ ਰਹੇ ਹਨ
PRED: ਪੁੱਛਿਆ-by-old and my husband, Mayor and husband, are nearly ten years of age, and his wife are on the run.


In [None]:
# Ad-hoc check: translate() returns a list, so take its only element.
print(translate("ਹੈਲੋ ਤੁਸੀ ਕਿਵੇਂ ਹੋ")[0])


How You Are


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Reload the fine-tuned artifacts from disk; this rebinds the module-level
# `tokenizer` and `model` names used by translate() below.
model_dir = "punjabi_to_english_small"  # folder where your trained model is saved

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

def translate(texts, max_new_tokens=128):
    """Translate Punjabi text to English with the reloaded model.

    Args:
        texts: A single string or an iterable of strings.
        max_new_tokens: Upper bound on generated tokens per sentence.

    Returns:
        A list of decoded translations, one per input sentence (a list even
        for a single string input). Empty input yields an empty list.
    """
    if isinstance(texts, str):  # single string
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to encode; skip the tokenizer/model entirely.
        return []

    # Make sure dropout is disabled regardless of what ran before this cell.
    model.eval()
    enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        gen = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            num_beams=5,         # beam search = better translations
            length_penalty=1.0
        )
    return tokenizer.batch_decode(gen, skip_special_tokens=True)


