# Model Fine-Tuning

In [3]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [4]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    Trainer
)
from peft import LoraConfig, get_peft_model, PeftModel
import random
import sacrebleu
import torch
import unicodedata
import re

In [5]:
def normalize_unicode(text):
    """Apply NFC Unicode normalization (best for Devanagari)."""
    if not isinstance(text, str):
        return text
    return unicodedata.normalize("NFC", text)

def normalize_whitespace(text):
    """Standardize whitespace, remove redundant spaces, keep lexical integrity."""
    if not isinstance(text, str):
        return text
    text = re.sub(r"\s+", " ", text)    # collapse spaces/tabs/newlines
    return text.strip()

def normalize(text):
    """Full normalization pipeline for Marathi or English text."""
    text = normalize_unicode(text)
    text = normalize_whitespace(text)
    return text

In [10]:
SAMPLE_SIZE = 10000
TRAIN_SPLIT = 0.9
VAL_SPLIT = 0.05  # remaining 0.05 is test

print("Loading full dataset...")
ds = load_dataset("anujsahani01/English-Marathi", split="train")

print("Random sampling 100k...")
sampled = ds.shuffle(seed=67).select(range(SAMPLE_SIZE))

# Split indices
train_end = int(TRAIN_SPLIT * SAMPLE_SIZE)
val_end = int((TRAIN_SPLIT + VAL_SPLIT) * SAMPLE_SIZE)

train_ds = sampled.select(range(train_end))
val_ds = sampled.select(range(train_end, val_end))
test_ds = sampled.select(range(val_end, SAMPLE_SIZE))

train_ds, val_ds, test_ds



Loading full dataset...


README.md:   0%|          | 0.00/206 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/621M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/243M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2637962 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/879321 [00:00<?, ? examples/s]

Random sampling 100k...


(Dataset({
     features: ['english', 'marathi'],
     num_rows: 9000
 }),
 Dataset({
     features: ['english', 'marathi'],
     num_rows: 500
 }),
 Dataset({
     features: ['english', 'marathi'],
     num_rows: 500
 }))

In [14]:
def format_example(ex):
    mar = normalize(ex["marathi"])
    eng = normalize(ex["english"])

    return {
        "prompt": f"Translate from Marathi to English:\nMarathi: {mar}\nEnglish:",
        "label": eng
    }

train_ds = train_ds.map(format_example)
val_ds   = val_ds.map(format_example)
test_ds  = test_ds.map(format_example)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model
import torch

print("checkpoint 1")

MODEL = "microsoft/phi-2"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model on GPU
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Very lightweight LoRA config
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("checkpoint 2")

MAX_LEN = 256

def preprocess(example):
    full_text = example["prompt"] + " " + example["label"]

    model_input = tokenizer(
        full_text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )

    prompt_len = len(tokenizer(
        example["prompt"],
        truncation=True,
        max_length=MAX_LEN
    )["input_ids"])

    labels = model_input["input_ids"].copy()
    labels[:prompt_len] = [-100] * prompt_len

    model_input["labels"] = labels
    return model_input

print("checkpoint 3")

tokenized_train = train_ds.map(
    preprocess,
    remove_columns=train_ds.column_names
)

tokenized_val = val_ds.map(
    preprocess,
    remove_columns=val_ds.column_names
)

print("checkpoint 4")

training_args = TrainingArguments(
    output_dir="lora-phi-marathi",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # Effective batch size = 16
    learning_rate=2e-4,
    num_train_epochs=3,
    # Removed max_steps - trains for full 3 epochs
    fp16=True,
    logging_steps=10,  # Log every 10 steps
    save_steps=50,  # Save checkpoints every 50 steps
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=25,  # Evaluate every 25 steps
    report_to="none",
    warmup_steps=20,  # 20 warmup steps
    load_best_model_at_end=True,
)

print("checkpoint 5")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

print("checkpoint 6 - starting training")
trainer.train()

print("finished training")
model.save_pretrained("lora-marathi")
tokenizer.save_pretrained("lora-marathi")

checkpoint 1


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 10,485,760 || all params: 2,790,169,600 || trainable%: 0.3758
checkpoint 2
checkpoint 3


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


checkpoint 4
checkpoint 5
checkpoint 6 - starting training


Step,Training Loss,Validation Loss
25,1.2168,0.289741
50,0.24,0.266748
75,0.2356,0.258573
100,0.2462,0.255673
125,0.2555,0.252444
150,0.2354,0.250027
175,0.2543,0.248376
200,0.2477,0.246812
225,0.2431,0.246781
250,0.23,0.245139


finished training


('lora-marathi/tokenizer_config.json',
 'lora-marathi/special_tokens_map.json',
 'lora-marathi/vocab.json',
 'lora-marathi/merges.txt',
 'lora-marathi/added_tokens.json',
 'lora-marathi/tokenizer.json')

In [11]:
from google.colab import files
!zip -r lora-marathi.zip lora-marathi/
files.download('lora-marathi.zip')

  adding: lora-marathi/ (stored 0%)
  adding: lora-marathi/adapter_config.json (deflated 57%)
  adding: lora-marathi/README.md (deflated 65%)
  adding: lora-marathi/tokenizer.json (deflated 82%)
  adding: lora-marathi/added_tokens.json (deflated 84%)
  adding: lora-marathi/special_tokens_map.json (deflated 75%)
  adding: lora-marathi/adapter_model.safetensors (deflated 8%)
  adding: lora-marathi/merges.txt (deflated 53%)
  adding: lora-marathi/tokenizer_config.json (deflated 94%)
  adding: lora-marathi/vocab.json (deflated 59%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
from google.colab import drive
drive.mount('/content/drive')

model.save_pretrained("/content/drive/MyDrive/lora-marathi")
tokenizer.save_pretrained("/content/drive/MyDrive/lora-marathi")

Mounted at /content/drive


('/content/drive/MyDrive/lora-marathi/tokenizer_config.json',
 '/content/drive/MyDrive/lora-marathi/special_tokens_map.json',
 '/content/drive/MyDrive/lora-marathi/vocab.json',
 '/content/drive/MyDrive/lora-marathi/merges.txt',
 '/content/drive/MyDrive/lora-marathi/added_tokens.json',
 '/content/drive/MyDrive/lora-marathi/tokenizer.json')

In [13]:
# ===== TEST INFERENCE =====
print("\n" + "=" * 60)
print("TEST INFERENCE")
print("=" * 60)


# Test with a sample
test_example = test_ds[15]
test_input = test_example["prompt"]
print(f"\nInput: {test_input}")


# Generate
model.eval()
inputs = tokenizer(test_input, return_tensors="pt").to(model.device)
with torch.no_grad():
   outputs = model.generate(
       **inputs,
       max_new_tokens=50,
       temperature=0.7,
       do_sample=True,
       pad_token_id=tokenizer.eos_token_id
   )


generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"\nGenerated: {generated}")
print(f"\nExpected: {test_example['label']}")



TEST INFERENCE

Input: Translate from Marathi to English:
Marathi: बॅटरी डीसचार्ज होत आहे
English:

Generated: Translate from Marathi to English:
Marathi: बॅटरी डीसचार्ज होत आहे
English: Betrayal of trust

Expected: Tablet is discharging


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [1]:
from google.colab import drive

# Mount Drive
drive.mount('/content/drive')

# Load directly from Drive (no extraction needed!)
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = '/content/drive/MyDrive/lora-marathi'  # Update folder name

model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = model.to('cuda')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [8]:
import evaluate
bleu = evaluate.load("sacrebleu")

def translate(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            temperature=0.0,      # deterministic decoding for evaluation
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    translation = full_output[len(text):].strip()
    return translation

Downloading builder script: 0.00B [00:00, ?B/s]

In [30]:
preds = []
refs  = []

for i in range(0, len(test_ds)):
    ex = test_ds[i]
    # print("PROMPT:")
    # print(ex["prompt"])
    translation = translate(ex['prompt'])
    # print("MODEL OUTPUT:")
    # print(translation)
    preds.append(translation)
    refs.append(ex["english"])
    # print('GROUND TRUTH:')
    # print(ex['english'])

In [31]:
results = bleu.compute(predictions=preds, references=refs)
print(f"BLEU Score:  {results['score']:.2f}")

BLEU Score:  2.52


In [32]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [33]:
from bert_score import score
P, R, F1 = score(preds, refs, lang="en", verbose=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/16 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]

done in 7.27 seconds, 68.81 sentences/sec


In [36]:
print("Precision:", P.mean().item())
print("Recall:", R.mean().item())
print("F1 Score:", F1.mean().item())

Precision: 0.878211498260498
Recall: 0.8679447174072266
F1 Score: 0.8728769421577454
