In [1]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    Trainer
)
from peft import LoraConfig, get_peft_model, PeftModel
import random
import sacrebleu
import torch
import unicodedata
import re

In [3]:
def normalize_unicode(text):
    """Apply NFC Unicode normalization (best for Devanagari)."""
    if not isinstance(text, str):
        return text
    return unicodedata.normalize("NFC", text)

def normalize_whitespace(text):
    """Standardize whitespace, remove redundant spaces, keep lexical integrity."""
    if not isinstance(text, str):
        return text
    text = re.sub(r"\s+", " ", text)    # collapse spaces/tabs/newlines
    return text.strip()

def normalize(text):
    """Full normalization pipeline for Marathi or English text."""
    text = normalize_unicode(text)
    text = normalize_whitespace(text)
    return text

In [4]:
SAMPLE_SIZE = 10000
TRAIN_SPLIT = 0.9
VAL_SPLIT = 0.05  # remaining 0.05 is test

print("Loading full dataset...")
ds = load_dataset("anujsahani01/English-Marathi", split="train")

print("Random sampling 100k...")
sampled = ds.shuffle(seed=67).select(range(SAMPLE_SIZE))

# Split indices
train_end = int(TRAIN_SPLIT * SAMPLE_SIZE)
val_end = int((TRAIN_SPLIT + VAL_SPLIT) * SAMPLE_SIZE)

train_ds = sampled.select(range(train_end))
val_ds = sampled.select(range(train_end, val_end))
test_ds = sampled.select(range(val_end, SAMPLE_SIZE))

train_ds, val_ds, test_ds

Loading full dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/206 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/621M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/243M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2637962 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/879321 [00:00<?, ? examples/s]

Random sampling 100k...


(Dataset({
     features: ['english', 'marathi'],
     num_rows: 9000
 }),
 Dataset({
     features: ['english', 'marathi'],
     num_rows: 500
 }),
 Dataset({
     features: ['english', 'marathi'],
     num_rows: 500
 }))

In [5]:
def format_example(ex):
    mar = normalize(ex["marathi"])
    eng = normalize(ex["english"])

    return {
        "prompt": f"Translate from Marathi to English:\nMarathi: {mar}\nEnglish:",
        "label": eng
    }

train_ds = train_ds.map(format_example)
val_ds   = val_ds.map(format_example)
test_ds  = test_ds.map(format_example)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [6]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [7]:
import evaluate
MODEL = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model on GPU
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
)

bleu = evaluate.load("sacrebleu")

def base_translate(text):
    base_model.eval()
    inputs = tokenizer(text, return_tensors="pt").to(base_model.device)
    with torch.no_grad():
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=60,
            temperature=0.0,      # deterministic decoding for evaluation
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    translation = full_output[len(text):].strip()
    return translation

base_preds = []
refs  = []

for i in range(0, len(test_ds)):
    ex = test_ds[i]
    # print("PROMPT:")
    # print(ex["prompt"])
    translation = base_translate(ex['prompt'])
    # print("MODEL OUTPUT:")
    # print(translation)
    base_preds.append(translation)
    refs.append(ex["english"])
    # print('GROUND TRUTH:')
    # print(ex['english'])

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [9]:
results_base = bleu.compute(predictions=base_preds, references=refs)
print(f"BLEU Score:  {results_base['score']:.2f}")

BLEU Score:  0.39


In [11]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [13]:
from bert_score import score
P_base, R_base, F1_base = score(base_preds, refs, lang="en", verbose=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/15 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]

done in 9.98 seconds, 50.11 sentences/sec


In [15]:
print("Precision:", P_base.mean().item())
print("Recall:", R_base.mean().item())
print("F1 Score:", F1_base.mean().item())

Precision: 0.7506673336029053
Recall: 0.8455973863601685
F1 Score: 0.79383784532547
