In [None]:
import os

from google.colab import drive

# Mount Google Drive so the project folders below are reachable from this runtime.
drive.mount('/content/drive', force_remount=False)

# Project layout on Drive (edit ROOT if the project lives somewhere else).
ROOT = "/content/drive/MyDrive/sem5/ML/Project/TalkBridge"
DATA_DIR = f"{ROOT}/data"
CHECKPOINT_DIR = f"{ROOT}/model_training/checkpoints"
FINAL_MODEL_DIR = f"{ROOT}/model_training/final_model"
LOGS_DIR = f"{ROOT}/model_training/logs"

# Create every output folder up front; DATA_DIR is expected to exist already.
for out_dir in (CHECKPOINT_DIR, FINAL_MODEL_DIR, LOGS_DIR):
    os.makedirs(out_dir, exist_ok=True)

# Echo the resolved locations so a wrong ROOT is spotted immediately.
print("DATA_DIR:", DATA_DIR)
print("CHECKPOINT_DIR:", CHECKPOINT_DIR)
print("FINAL_MODEL_DIR:", FINAL_MODEL_DIR)


Mounted at /content/drive
DATA_DIR: /content/drive/MyDrive/sem5/ML/Project/TalkBridge/data
CHECKPOINT_DIR: /content/drive/MyDrive/sem5/ML/Project/TalkBridge/model_training/checkpoints
FINAL_MODEL_DIR: /content/drive/MyDrive/sem5/ML/Project/TalkBridge/model_training/final_model


In [None]:
# Select direction: choose one pair and re-run notebook for other directions
SRC_LANG = "en"   # "en" or "hi" or "te"
TGT_LANG = "hi"   # "hi" or "en" or "te"

# Marian model mapping for directions we support
mapping = {
    ("en","hi"): "Helsinki-NLP/opus-mt-en-hi",
    ("hi","en"): "Helsinki-NLP/opus-mt-hi-en",
    ("en","te"): "Helsinki-NLP/opus-mt-en-te",
    ("te","en"): "Helsinki-NLP/opus-mt-te-en",
}


def resolve_model_name(src_lang, tgt_lang):
    """Return the Marian checkpoint name for the `src_lang` -> `tgt_lang` direction.

    Args:
        src_lang: source language code (a first element of a `mapping` key).
        tgt_lang: target language code (a second element of a `mapping` key).

    Returns:
        The checkpoint identifier stored in `mapping` for that pair.

    Raises:
        ValueError: if the pair is not in `mapping`; the message lists every
            supported direction so the valid choices are visible in the error.
    """
    try:
        return mapping[(src_lang, tgt_lang)]
    except KeyError:
        supported = ", ".join(f"{src} -> {tgt}" for src, tgt in mapping)
        raise ValueError(
            f"No Marian mapping for {src_lang} -> {tgt_lang}. "
            f"Supported directions: {supported}."
        ) from None


model_name = resolve_model_name(SRC_LANG, TGT_LANG)

print("Training direction:", SRC_LANG, "->", TGT_LANG)
print("Model:", model_name)


Training direction: en -> hi
Model: Helsinki-NLP/opus-mt-en-hi


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Fetch the pretrained checkpoint named by `model_name`: tokenizer first, then weights.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Quick sanity check of the tokenizer: vocabulary size and the id used for padding.
for label, value in (
    ("Vocab size:", tokenizer.vocab_size),
    ("Pad token id:", tokenizer.pad_token_id),
):
    print(label, value)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Vocab size: 61950
Pad token id: 61949


In [None]:
from datasets import Dataset
import io

def load_parallel(src_file, tgt_file):
    """Load a line-aligned parallel corpus into a `Dataset` with "src"/"tgt" columns.

    Line i of `src_file` is paired with line i of `tgt_file`. Pairing happens
    BEFORE blank lines are removed: a pair is dropped only when either side is
    empty after stripping, so a blank line in one file can never shift the
    remaining sentences against the other file.

    Args:
        src_file: path to the UTF-8 source-language file, one sentence per line.
        tgt_file: path to the UTF-8 target-language file, one sentence per line.

    Returns:
        A `Dataset` whose "src" and "tgt" columns hold the stripped sentence pairs.
    """

    def _read_stripped(path):
        # Read every line (blank ones included) so line indices stay meaningful.
        with open(path, "r", encoding="utf-8") as f:
            lines = [line.strip() for line in f]
        # Trailing blank lines carry no sentences; ignore them for the length check.
        while lines and not lines[-1]:
            lines.pop()
        return lines

    src_raw = _read_stripped(src_file)
    tgt_raw = _read_stripped(tgt_file)
    if len(src_raw) != len(tgt_raw):
        print("Warning: src and tgt length mismatch:", len(src_raw), len(tgt_raw))

    # zip() stops at the shorter file, i.e. truncates to the common length.
    src_lines, tgt_lines = [], []
    for src_text, tgt_text in zip(src_raw, tgt_raw):
        if src_text and tgt_text:
            src_lines.append(src_text)
            tgt_lines.append(tgt_text)
    return Dataset.from_dict({"src": src_lines, "tgt": tgt_lines})

# Corpus files are named <split>.<language code> inside DATA_DIR.
train_path_src, train_path_tgt = f"{DATA_DIR}/train.{SRC_LANG}", f"{DATA_DIR}/train.{TGT_LANG}"
valid_path_src, valid_path_tgt = f"{DATA_DIR}/valid.{SRC_LANG}", f"{DATA_DIR}/valid.{TGT_LANG}"

# Read the training split (announced first, since it is the large one), then validation.
print("Loading:", train_path_src, "<->", train_path_tgt)
train_ds = load_parallel(train_path_src, train_path_tgt)
valid_ds = load_parallel(valid_path_src, valid_path_tgt)

# Report how many sentence pairs each split ended up with.
print("Train examples:", len(train_ds))
print("Valid examples:", len(valid_ds))


Loading: /content/drive/MyDrive/sem5/ML/Project/TalkBridge/data/train.en <-> /content/drive/MyDrive/sem5/ML/Project/TalkBridge/data/train.hi
Train examples: 1699978
Valid examples: 520


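Tokenization snippet (it needs `preprocess_batch`, which is defined in the next code cell, so run that definition first). It tokenizes both splits and drops the raw text columns so only the tokenized features remain:

```python
train_ds = train_ds.map(preprocess_batch, batched=True, remove_columns=train_ds.column_names)
valid_ds = valid_ds.map(preprocess_batch, batched=True, remove_columns=valid_ds.column_names)

train_ds.set_format(type="torch")
valid_ds.set_format(type="torch")
```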


In [None]:
MAX_LEN = 128  # reduce if OOM

def preprocess_batch(batch):
    """Tokenize one batch of parallel sentences for seq2seq training.

    Args:
        batch: mapping with "src" and "tgt" entries, each a list of strings.

    Returns:
        The tokenizer encoding of the source sentences, plus a "labels" entry
        holding the target token ids with every padding position replaced by
        -100 so those positions are ignored by the loss.
    """
    # `text_target` tokenizes the target sentences in the same call and stores
    # their ids under "labels"; it replaces the deprecated
    # `as_target_tokenizer()` context manager.
    # No tensors are requested here: datasets.map works on plain Python lists.
    inputs = tokenizer(
        batch["src"],
        text_target=batch["tgt"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    # Look the pad id up once instead of once per token.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in seq]
        for seq in inputs["labels"]
    ]
    return inputs

# Tokenize both splits and drop the raw text columns, so the datasets contain
# only the features produced by `preprocess_batch`.
# NOTE: because the raw columns are removed, re-running this cell requires
# re-running the data-loading cell first.
train_ds = train_ds.map(preprocess_batch, batched=True, remove_columns=train_ds.column_names)
valid_ds = valid_ds.map(preprocess_batch, batched=True, remove_columns=valid_ds.column_names)

# Return torch tensors when the datasets are indexed.
train_ds.set_format(type="torch")
valid_ds.set_format(type="torch")

print(train_ds)
print(valid_ds)


Map:   0%|          | 0/1699978 [00:00<?, ? examples/s]



Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1699978
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 520
})


In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are written, and how many of them are kept.
    output_dir=CHECKPOINT_DIR,
    save_total_limit=3,
    # Batching: 4 examples x 8 accumulation steps = 32 examples per optimizer step (per device).
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=8,
    # Optimization schedule.
    learning_rate=3e-5,
    num_train_epochs=3,
    fp16=False,
    # Logging: report the training loss every 200 steps; "none" turns off
    # external experiment-tracker integrations.
    logging_steps=200,
    report_to="none",
)


In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
import os

# The checkpoint folder must exist before training starts writing to it.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Batch collator built from the tokenizer and the model being trained.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire model, hyper-parameters, data and collator together.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
)

trainer.train()  

In [None]:
# Write the trained model first, then the tokenizer, into the same folder so
# FINAL_MODEL_DIR holds everything saved by this run.
for save_fn in (trainer.save_model, tokenizer.save_pretrained):
    save_fn(FINAL_MODEL_DIR)
print("Saved final model to", FINAL_MODEL_DIR)




Saved final model to /content/drive/MyDrive/sem5/ML/Project/TalkBridge/model_training/final_model


In [None]:
!pip install -q evaluate
import evaluate
import numpy as np
from tqdm.auto import tqdm

bleu = evaluate.load("sacrebleu")

def generate_and_compute_bleu(dataset, batch_size=8, max_length=128):
    """Translate `dataset` with the global `model` and score the output with sacreBLEU.

    Args:
        dataset: tokenized dataset whose slices expose "input_ids",
            "attention_mask" and "labels" as tensors (labels use -100 on
            ignored positions).
        batch_size: number of examples translated per `model.generate` call.
        max_length: maximum length passed to `model.generate`.

    Returns:
        The dict returned by `bleu.compute`; its "score" entry is also printed.
    """
    preds = []
    refs = []

    # Generate in eval mode: the model may still be in training mode after
    # `trainer.train()`, in which case dropout would perturb the translations.
    # The previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        for start in tqdm(range(0, len(dataset), batch_size), desc="Generating"):
            batch = dataset[start : start + batch_size]

            inputs = {
                "input_ids": batch["input_ids"].to(model.device),
                "attention_mask": batch["attention_mask"].to(model.device),
            }

            outputs = model.generate(**inputs, max_length=max_length)
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # decode labels: map -100 back to the pad id so they can be decoded
            label_ids = batch["labels"].numpy()
            label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

            preds.extend(p.strip() for p in decoded_preds)
            # sacreBLEU expects a list of references per prediction.
            refs.extend([l.strip()] for l in decoded_labels)
    finally:
        model.train(was_training)

    result = bleu.compute(predictions=preds, references=refs)
    print("BLEU Score =", result["score"])
    return result

# RUN BLEU
generate_and_compute_bleu(valid_ds, batch_size=8)


BLEU Score = 5.8100000796131015


{'score': 5.8100000796131015,
 'counts': [3057, 910, 311, 102],
 'totals': [10179, 9659, 9139, 8619],
 'precisions': [30.0324196875921,
  9.421265141318978,
  3.402998139840245,
  1.183431952662722],
 'bp': 1.0,
 'sys_len': 10179,
 'ref_len': 9531}