Initializes an 8-bit quantized variant of the **MarianMT** model (English -> Tamil), loaded from the multilingual `Helsinki-NLP/opus-mt-en-dra` checkpoint.

In [None]:
!pip install transformers accelerate bitsandbytes
!pip install -U bitsandbytes
!pip install sacrebleu

In [None]:
import os
from dotenv import load_dotenv
from transformers import MarianMTModel, MarianTokenizer, BitsAndBytesConfig
import bitsandbytes as bnb
import torch

# Load environment variables from .env file (if one exists)
load_dotenv()

# Read Hugging Face token; os.getenv returns None when HF_TOKEN is not set
hf_token = os.getenv("HF_TOKEN")  # make sure HF_TOKEN is in your .env

model_name = "Helsinki-NLP/opus-mt-en-dra"

# Load tokenizer.
# `token=` is the current name of the authentication argument; the older
# `use_auth_token=` spelling is deprecated in transformers.
tokenizer = MarianTokenizer.from_pretrained(model_name, token=hf_token)

# Configure 8-bit quantization
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load model with 8-bit quantization; device_map="auto" lets accelerate place the weights
model = MarianMTModel.from_pretrained(
    model_name,
    token=hf_token,
    quantization_config=bnb_config,
    device_map="auto"
)

# Test inference.
# The checkpoint translates into several target languages, so the source sentence
# must start with the target-language token; ">>tam<<" selects Tamil (the same
# prefix this notebook applies to the evaluation data).
text = ">>tam<< Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs)
translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Translation:", translated)

# Save quantized model and its tokenizer side by side so the directory is self-contained
save_path = "./opus-mt-en-dra-quantized"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Quantized model saved to {save_path}")




tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


source.spm:   0%|          | 0.00/818k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
# Show the total on-disk size of the saved quantized model directory
# (`-s` prints one summary line for the directory, `-h` uses human-readable units).
!du -sh ./opus-mt-en-dra-quantized


In [None]:
import os

import kagglehub
import pandas as pd

# Fetch the Kaggle dataset; kagglehub hands back the local directory it was stored in
path = kagglehub.dataset_download("hemanthkumar21/englist-tamil-parallel-sent")
print("Dataset downloaded to:", path)

# Walk the download directory and print the full path of every file found
for folder, _subfolders, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(folder, filename))

# Read the CSV that lives in the "en-ta" subfolder into a DataFrame
file_path = os.path.join(path, "en-ta", "general_en_ta 87k.csv")
df = pd.read_csv(file_path)

# Peek at the last rows to check the load
print(df.tail())


In [None]:
# Give the columns labelled '0' and '1' descriptive names
column_names = {'0': "english", '1': "tamil"}
df = df.rename(columns=column_names)

# Inspect the last rows to confirm the new column names
print(df.tail())

from sklearn.model_selection import train_test_split

# Shuffle every row with a fixed seed so the order is reproducible,
# then renumber the index from 0
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)

In [None]:
# Prefix each English source sentence with the target-language token ">>tam<<".
# Only rows that do not already start with the token are modified, so re-running
# this cell does not stack the prefix a second time. Missing values stay missing.
lang_prefix = ">>tam<<"
needs_prefix = ~df["english"].str.startswith(lang_prefix, na=False)
df.loc[needs_prefix, "english"] = lang_prefix + df.loc[needs_prefix, "english"]
print(df.head())

In [None]:
import torch
from tqdm import tqdm
import sacrebleu

def evaluate_bleu_beam(
    model, 
    tokenizer, 
    test_df, 
    src_col="english", 
    tgt_col="tamil", 
    num_beams=6, 
    max_length=70, 
    length_penalty=1.2, 
    no_repeat_ngram_size=3,
    prepend_lang_token=True, 
    lang_token="", 
    max_samples=None
):
    """
    Evaluate the corpus BLEU score of a translation model using beam search.

    Each source sentence is translated on its own (one `generate` call per row),
    and the collected translations are scored against the references with
    `sacrebleu.corpus_bleu`.

    Args:
        model: Hugging Face translation model.
        tokenizer: Corresponding tokenizer.
        test_df: Pandas DataFrame with source and target columns.
        src_col: Name of source column (default: "english").
        tgt_col: Name of target column (default: "tamil").
        num_beams: Beam width for beam search.
        max_length: Maximum length of generated translation.
        length_penalty: Length penalty for beam search (>1 favors longer sentences).
        no_repeat_ngram_size: Block repeating n-grams of this size.
        prepend_lang_token: Whether to prepend a language token to source.
        lang_token: Language token to prepend if prepend_lang_token=True
            (the default empty string leaves the source text unchanged).
        max_samples: Limit evaluation to the first `max_samples` rows.
            None (default) evaluates every row; 0 evaluates no rows.

    Returns:
        BLEU score (0-100). Returns 0.0 when there are no rows to evaluate
        (empty frame, max_samples=0, or every row has a missing source/target).
    """
    # `is not None` so that max_samples=0 really means "no rows" instead of
    # silently falling through to a full-dataset evaluation.
    if max_samples is not None:
        test_df = test_df.iloc[:max_samples]

    # Rows with a missing source or reference cannot be scored: a NaN source would be
    # translated as the literal text "nan", and a NaN reference makes scoring fail
    # only after all the (expensive) generation has finished.
    test_df = test_df.dropna(subset=[src_col, tgt_col])

    # Nothing to translate: return a neutral score instead of failing inside sacrebleu.
    if len(test_df) == 0:
        return 0.0

    # Run inputs on whatever device the model's weights live on.
    device = next(model.parameters()).device

    references = []
    hypotheses = []

    sentence_pairs = zip(test_df[src_col], test_df[tgt_col])
    for src_text, tgt_text in tqdm(sentence_pairs, total=len(test_df), desc="Evaluating BLEU"):
        src_text = str(src_text)
        if prepend_lang_token:
            src_text = f"{lang_token}{src_text}"

        # Tokenize and move to device; truncation keeps over-long sentences within
        # the tokenizer's configured maximum input length.
        inputs = tokenizer(
            src_text, return_tensors="pt", padding=True, truncation=True
        ).to(device)

        # Generate translation (no gradients needed for evaluation)
        with torch.no_grad():
            output_tokens = model.generate(
                **inputs,
                num_beams=num_beams,
                max_length=max_length,
                length_penalty=length_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                early_stopping=True
            )

        hypotheses.append(tokenizer.decode(output_tokens[0], skip_special_tokens=True))
        references.append(str(tgt_text))

    # Compute corpus BLEU; sacrebleu expects a list of reference streams, and there is
    # exactly one reference per hypothesis here, hence the single-element outer list.
    bleu = sacrebleu.corpus_bleu(hypotheses, [references])
    return bleu.score


# Decoding / sampling settings for a quick sanity-check run on the first 10 rows
eval_settings = dict(
    num_beams=10,
    max_length=70,
    prepend_lang_token=True,
    max_samples=10,
)

# Score the quantized model on the prepared DataFrame and report the result
bleu_score = evaluate_bleu_beam(model=model, tokenizer=tokenizer, test_df=df, **eval_settings)
print(f"BLEU score with quantized_model: {bleu_score:.2f}")