In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)
from datasets import Dataset, DatasetDict
import pandas as pd
import evaluate
import numpy as np

# Load dataset
df = pd.read_csv("RAW_Idiom_Data - Updated_Idiom_Data.csv")
df = df[['Actual idiom', 'Human Annotation & With descriptions']].dropna()
df = df.rename(columns={'Actual idiom': 'input_text', 'Human Annotation & With descriptions': 'target_text'})

# Split dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
val_test = dataset['test'].train_test_split(test_size=0.5, seed=42)
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': val_test['train'],
    'test': val_test['test']
})

# Load model + tokenizer
model_id = "LingoIITGN/ganga-1b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Tokenization: input + target concatenated
def tokenize_function(examples):
    model_inputs = []
    for inp, tgt in zip(examples["input_text"], examples["target_text"]):
        prompt = f"Translate Hindi idiom to English explanation:\nHindi: {inp}\nEnglish:"
        full = f"{prompt} {tgt} {tokenizer.eos_token}"
        
        enc = tokenizer(full, truncation=True, padding=False, max_length=64)
        labels = enc["input_ids"]
        
        # Mask input prompt tokens
        prompt_len = len(tokenizer(prompt)["input_ids"])
        labels = [
            token if idx >= prompt_len else -100
            for idx, token in enumerate(labels)
        ]
        
        enc["labels"] = labels
        model_inputs.append(enc)
    
    # Convert list of dicts to dict of lists
    batch = {k: [dic[k] for dic in model_inputs] for k in model_inputs[0]}
    return batch

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Use dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Load evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    # BLEU
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    # BERTScore
    bertscore_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    bertscore_precision = float(np.mean(bertscore_result['precision']))
    bertscore_recall = float(np.mean(bertscore_result['recall']))
    bertscore_f1 = float(np.mean(bertscore_result['f1']))

    return {
        'rouge1': rouge_result['rouge1'],
        'rouge2': rouge_result['rouge2'],
        'rougeL': rouge_result['rougeL'],
        'rougeLsum': rouge_result['rougeLsum'],
        'bleu': bleu_result['score'],
        'bertscore_precision': bertscore_precision,
        'bertscore_recall': bertscore_recall,
        'bertscore_f1': bertscore_f1,
    }

# Custom callback to print losses and metrics per epoch
class EpochMetricsCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        # Print training loss at the end of each epoch
        if state.log_history:
            # Get the most recent log with 'loss'
            for log in reversed(state.log_history):
                if 'loss' in log:
                    print(f"Epoch {int(state.epoch)} - Training loss: {log['loss']:.4f}")
                    break

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # Print validation loss and metrics at the end of each evaluation (epoch)
        if metrics is not None:
            print(f"Epoch {int(state.epoch)} - Validation loss: {metrics.get('eval_loss', float('nan')):.4f}")
            for key in ['eval_rouge1', 'eval_rouge2', 'eval_rougeL', 'eval_rougeLsum', 'eval_bleu', 'eval_bertscore_precision', 'eval_bertscore_recall', 'eval_bertscore_f1']:
                if key in metrics:
                    print(f"{key}: {metrics[key]:.4f}")

# Training arguments
training_args = TrainingArguments(
    output_dir="./ganga-idiom-finetune",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="none",
    fp16=True  # Mixed precision to save memory
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EpochMetricsCallback()]
)

# Train
trainer.train()

# Final evaluation
print("Final evaluation on test set:")
test_results = trainer.evaluate(tokenized_datasets["test"])
print(test_results)


Map:   0%|          | 0/2506 [00:00<?, ? examples/s]

Map:   0%|          | 0/313 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Bertscore Precision,Bertscore Recall,Bertscore F1
1,No log,,0.403552,0.293182,0.391478,0.391383,18.725259,0.780848,0.904157,0.837381
2,0.709600,,0.467782,0.352363,0.46002,0.459717,20.114977,0.78387,0.910055,0.8416


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Validation loss: nan
eval_rouge1: 0.4036
eval_rouge2: 0.2932
eval_rougeL: 0.3915
eval_rougeLsum: 0.3914
eval_bleu: 18.7253
eval_bertscore_precision: 0.7808
eval_bertscore_recall: 0.9042
eval_bertscore_f1: 0.8374




Epoch 2 - Training loss: 0.7096
Epoch 2 - Validation loss: nan
eval_rouge1: 0.4589
eval_rouge2: 0.3367
eval_rougeL: 0.4462
eval_rougeLsum: 0.4459
eval_bleu: 21.5817
eval_bertscore_precision: 0.7760
eval_bertscore_recall: 0.9094
eval_bertscore_f1: 0.8366




Epoch 2 - Training loss: 0.7096
Epoch 2 - Validation loss: nan
eval_rouge1: 0.4678
eval_rouge2: 0.3524
eval_rougeL: 0.4600
eval_rougeLsum: 0.4597
eval_bleu: 20.1150
eval_bertscore_precision: 0.7839
eval_bertscore_recall: 0.9101
eval_bertscore_f1: 0.8416




Final evaluation on test set:


Epoch 2 - Validation loss: nan
eval_rouge1: 0.4687
eval_rouge2: 0.3498
eval_rougeL: 0.4630
eval_rougeLsum: 0.4628
eval_bleu: 20.4938
eval_bertscore_precision: 0.7945
eval_bertscore_recall: 0.9165
eval_bertscore_f1: 0.8505
{'eval_loss': nan, 'eval_rouge1': 0.4687466611918398, 'eval_rouge2': 0.34983466788892903, 'eval_rougeL': 0.4629938589628926, 'eval_rougeLsum': 0.46284835103075944, 'eval_bleu': 20.493785712704124, 'eval_bertscore_precision': 0.7945005529245753, 'eval_bertscore_recall': 0.9164730196545838, 'eval_bertscore_f1': 0.8504800441538453, 'eval_runtime': 9.4989, 'eval_samples_per_second': 33.056, 'eval_steps_per_second': 33.056, 'epoch': 2.9928172386272944}




In [2]:
!srun gpustat

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


dgx01                     Wed Jul  2 19:04:25 2025  535.183.06
[0] NVIDIA A100-SXM4-80GB | 59°C,  ?? % | 52486 / 81920 MB | 66076041(19050M) 66076006(21628M) 66076055(11778M)
[1] NVIDIA A100-SXM4-80GB | 43°C,  ?? % | 31271 / 81920 MB | 66076006(9546M) 66076006(5698M) 66076006(9420M) 66076006(6490M)
[2] NVIDIA A100-SXM4-80GB | 29°C,  ?? % |    87 / 81920 MB |
[3] NVIDIA A100-SXM4-80GB | 30°C,  ?? % |    87 / 81920 MB |
[4] NVIDIA A100-SXM4-80GB | 47°C,  ?? % |     1 / 81920 MB |
[5] NVIDIA A100-SXM4-80GB | 39°C,  ?? % | 45413 / 81920 MB | sarmistha(45400M)
[6] NVIDIA A100-SXM4-80GB | 32°C,  ?? % |     1 / 81920 MB |
[7] NVIDIA A100-SXM4-80GB | 34°C,  ?? % | 50625 / 81920 MB | sarmistha(50612M)


In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)
from datasets import Dataset, DatasetDict
import pandas as pd
import evaluate
import numpy as np

# Load dataset
df = pd.read_csv("RAW_Idiom_Data - Updated_Idiom_Data.csv")
df = df[['Actual idiom', 'Human Annotation & With descriptions']].dropna()
df = df.rename(columns={'Actual idiom': 'input_text', 'Human Annotation & With descriptions': 'target_text'})

# Split dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
val_test = dataset['test'].train_test_split(test_size=0.5, seed=42)
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': val_test['train'],
    'test': val_test['test']
})

# Load model + tokenizer
model_id = "LingoIITGN/ganga-1b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Tokenization: input + target concatenated
def tokenize_function(examples):
    model_inputs = []
    for inp, tgt in zip(examples["input_text"], examples["target_text"]):
        prompt = f"Explain the idioms to English idiomatic understandings:\nIdiom: {inp}\nEnglish:"
        full = f"{prompt} {tgt} {tokenizer.eos_token}"
        
        enc = tokenizer(full, truncation=True, padding=False, max_length=64)
        labels = enc["input_ids"]
        
        # Mask input prompt tokens
        prompt_len = len(tokenizer(prompt)["input_ids"])
        labels = [
            token if idx >= prompt_len else -100
            for idx, token in enumerate(labels)
        ]
        
        enc["labels"] = labels
        model_inputs.append(enc)
    
    # Convert list of dicts to dict of lists
    batch = {k: [dic[k] for dic in model_inputs] for k in model_inputs[0]}
    return batch

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Use dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Load evaluation metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    # BLEU
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    # BERTScore
    bertscore_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    bertscore_precision = float(np.mean(bertscore_result['precision']))
    bertscore_recall = float(np.mean(bertscore_result['recall']))
    bertscore_f1 = float(np.mean(bertscore_result['f1']))

    return {
        'rouge1': rouge_result['rouge1'],
        'rouge2': rouge_result['rouge2'],
        'rougeL': rouge_result['rougeL'],
        'rougeLsum': rouge_result['rougeLsum'],
        'bleu': bleu_result['score'],
        'bertscore_precision': bertscore_precision,
        'bertscore_recall': bertscore_recall,
        'bertscore_f1': bertscore_f1,
    }

# Custom callback to print losses and metrics per epoch
class EpochMetricsCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        # Print training loss at the end of each epoch
        if state.log_history:
            # Get the most recent log with 'loss'
            for log in reversed(state.log_history):
                if 'loss' in log:
                    print(f"Epoch {int(state.epoch)} - Training loss: {log['loss']:.4f}")
                    break

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # Print validation loss and metrics at the end of each evaluation (epoch)
        if metrics is not None:
            print(f"Epoch {int(state.epoch)} - Validation loss: {metrics.get('eval_loss', float('nan')):.4f}")
            for key in ['eval_rouge1', 'eval_rouge2', 'eval_rougeL', 'eval_rougeLsum', 'eval_bleu', 'eval_bertscore_precision', 'eval_bertscore_recall', 'eval_bertscore_f1']:
                if key in metrics:
                    print(f"{key}: {metrics[key]:.4f}")

# Training arguments
training_args = TrainingArguments(
    output_dir="./ganga-idiom-finetune",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="none",
    fp16=True  # Mixed precision to save memory
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EpochMetricsCallback()]
)

# Train
trainer.train()

# Final evaluation
print("Final evaluation on test set:")
test_results = trainer.evaluate(tokenized_datasets["test"])
print(test_results)


Map:   0%|          | 0/2506 [00:00<?, ? examples/s]

Map:   0%|          | 0/313 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Bertscore Precision,Bertscore Recall,Bertscore F1
1,No log,,0.365979,0.271685,0.363301,0.363918,13.112838,0.746853,0.870564,0.803374
2,0.609700,,0.339337,0.247587,0.332086,0.332477,13.735625,0.744914,0.869613,0.801848


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Validation loss: nan
eval_rouge1: 0.3660
eval_rouge2: 0.2717
eval_rougeL: 0.3633
eval_rougeLsum: 0.3639
eval_bleu: 13.1128
eval_bertscore_precision: 0.7469
eval_bertscore_recall: 0.8706
eval_bertscore_f1: 0.8034




Epoch 2 - Training loss: 0.6097
Epoch 2 - Validation loss: nan
eval_rouge1: 0.3395
eval_rouge2: 0.2471
eval_rougeL: 0.3308
eval_rougeLsum: 0.3315
eval_bleu: 11.4006
eval_bertscore_precision: 0.7409
eval_bertscore_recall: 0.8735
eval_bertscore_f1: 0.8010




Epoch 2 - Training loss: 0.6097
Epoch 2 - Validation loss: nan
eval_rouge1: 0.3393
eval_rouge2: 0.2476
eval_rougeL: 0.3321
eval_rougeLsum: 0.3325
eval_bleu: 13.7356
eval_bertscore_precision: 0.7449
eval_bertscore_recall: 0.8696
eval_bertscore_f1: 0.8018




Final evaluation on test set:


Epoch 2 - Validation loss: nan
eval_rouge1: 0.3812
eval_rouge2: 0.2873
eval_rougeL: 0.3780
eval_rougeLsum: 0.3775
eval_bleu: 14.0821
eval_bertscore_precision: 0.7599
eval_bertscore_recall: 0.8785
eval_bertscore_f1: 0.8143
{'eval_loss': nan, 'eval_rouge1': 0.38117335639642586, 'eval_rouge2': 0.2872542385039878, 'eval_rougeL': 0.37796982839550075, 'eval_rougeLsum': 0.3775317865157093, 'eval_bleu': 14.082145843521506, 'eval_bertscore_precision': 0.7598566868502623, 'eval_bertscore_recall': 0.8785201087119473, 'eval_bertscore_f1': 0.814324592708782, 'eval_runtime': 9.2421, 'eval_samples_per_second': 33.975, 'eval_steps_per_second': 33.975, 'epoch': 2.9928172386272944}


