In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset, DatasetDict
import pandas as pd
import evaluate
import numpy as np

# Load dataset
df = pd.read_csv("RAW_Idiom_Data - Updated_Idiom_Data.csv")
df = df[['Actual idiom', 'Human Annotation & With descriptions']].dropna()
df = df.rename(columns={'Actual idiom': 'input_text', 'Human Annotation & With descriptions': 'target_text'})

# Split dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
val_test = dataset['test'].train_test_split(test_size=0.5, seed=42)
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': val_test['train'],
    'test': val_test['test']
})

# Load model + tokenizer
model_id = "hishab/titulm-llama-3.2-3b-v2.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Tokenization: input + target concatenated
def tokenize_function(examples):
    model_inputs = []
    for inp, tgt in zip(examples["input_text"], examples["target_text"]):
        #prompt = f"Translate Hindi idiom to English explanation:\nHindi: {inp}\nEnglish:"
        prompt = f"Explain the idioms to English idiomatic understandings:\nIdiom: {inp}\nEnglish:"
        full = f"{prompt} {tgt} {tokenizer.eos_token}"
        
        enc = tokenizer(full, truncation=True, padding=False, max_length=64)
        labels = enc["input_ids"]
        
        # Mask input prompt tokens
        prompt_len = len(tokenizer(prompt)["input_ids"])
        labels = [
            token if idx >= prompt_len else -100
            for idx, token in enumerate(labels)
        ]
        
        enc["labels"] = labels
        model_inputs.append(enc)
    
    # Convert list of dicts to dict of lists
    batch = {k: [dic[k] for dic in model_inputs] for k in model_inputs[0]}
    return batch

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Use dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        **rouge_result,
        'bleu': bleu_result['score']
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./bengali-idiom-finetune",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="none",
    fp16=True  # Mixed precision to save memory
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Final evaluation
print("Final evaluation on test set:")
test_results = trainer.evaluate(tokenized_datasets["test"])
print(test_results)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/2506 [00:00<?, ? examples/s]

Map:   0%|          | 0/313 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu
1,No log,1.823836,0.483237,0.296622,0.451231,0.451381,22.567405
2,1.681800,1.880378,0.548141,0.363625,0.521463,0.521355,26.036041


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Final evaluation on test set:


{'eval_loss': 1.8885923624038696, 'eval_rouge1': 0.5371619144459978, 'eval_rouge2': 0.35263183355313665, 'eval_rougeL': 0.5135045394391669, 'eval_rougeLsum': 0.5134159672898323, 'eval_bleu': 25.478276185541567, 'eval_runtime': 20.1156, 'eval_samples_per_second': 15.61, 'eval_steps_per_second': 15.61, 'epoch': 2.9928172386272944}


In [2]:
!srun gpustat

dgx01                     Wed Jul  2 19:53:48 2025  535.183.06
[0] NVIDIA A100-SXM4-80GB | 44°C,  ?? % | 33429 / 81920 MB | 66076006(21628M) 66076055(11778M)
[1] NVIDIA A100-SXM4-80GB | 36°C,  ?? % | 31271 / 81920 MB | 66076006(9546M) 66076006(5698M) 66076006(9420M) 66076006(6490M)
[2] NVIDIA A100-SXM4-80GB | 29°C,  ?? % |    87 / 81920 MB |
[3] NVIDIA A100-SXM4-80GB | 30°C,  ?? % |    87 / 81920 MB |
[4] NVIDIA A100-SXM4-80GB | 58°C,  ?? % |  7731 / 81920 MB | kuntpong(7718M)
[5] NVIDIA A100-SXM4-80GB | 36°C,  ?? % |     1 / 81920 MB |
[6] NVIDIA A100-SXM4-80GB | 33°C,  ?? % |     6 / 81920 MB |
[7] NVIDIA A100-SXM4-80GB | 33°C,  ?? % |     1 / 81920 MB |
