In [1]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [2]:
import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator
)

2025-09-20 13:21:16.477588: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758374476.800534      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758374476.893933      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def load_vatika_data(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    contexts, questions, answers = [], [], []
    for domain in data["domains"]:
        for item in domain["contexts"]:
            context = item["context"]
            for qa in item["qas"]:
                contexts.append(context)
                questions.append(qa["question"])
                answers.append(qa["answer"])
    return {"context": contexts, "question": questions, "answers": answers}

train_path = "/kaggle/input/vatika/train.json"
val_path = "/kaggle/input/vatika/validation.json"
testA_path = "/kaggle/input/vatika/test-A-gold.json"

In [4]:
train_data = load_vatika_data(train_path)
val_data = load_vatika_data(val_path)

In [5]:
def prepare_features(examples, tokenizer):
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=512,
        stride=128,
        return_offsets_mapping=True,
        padding="max_length"
    )
    start_positions, end_positions = [], []
    for i, offsets in enumerate(tokenized["offset_mapping"]):
        context = examples["context"][i]
        answer = examples["answers"][i]
        answer_start = context.find(answer)
        answer_end = answer_start + len(answer)
        start = end = 0
        for idx, (s, e) in enumerate(offsets):
            if s <= answer_start < e:
                start = idx
            if s < answer_end <= e:
                end = idx
                break
        start_positions.append(start)
        end_positions.append(end)
    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
model = AutoModelForQuestionAnswering.from_pretrained("google/muril-base-cased")

dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

tokenized_dataset = dataset.map(lambda x: prepare_features(x, tokenizer), batched=True)
tokenized_val_dataset = val_dataset.map(lambda x: prepare_features(x, tokenizer), batched=True)

tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13092 [00:00<?, ? examples/s]

Map:   0%|          | 0/2798 [00:00<?, ? examples/s]

In [6]:
training_args = TrainingArguments(
    output_dir="./muril-vatika",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="no",
    logging_strategy="no",         
    report_to="none",              
    disable_tqdm=True              
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

trainer.train()

  trainer = Trainer(


{'train_runtime': 1880.1566, 'train_samples_per_second': 13.926, 'train_steps_per_second': 0.871, 'train_loss': 4.869628011961996, 'epoch': 2.0}


TrainOutput(global_step=1638, training_loss=4.869628011961996, metrics={'train_runtime': 1880.1566, 'train_samples_per_second': 13.926, 'train_steps_per_second': 0.871, 'train_loss': 4.869628011961996, 'epoch': 2.0})

In [7]:
def compute_metrics(references, predictions):
    # F1
    f1s = []
    for ref, pred in zip(references, predictions):
        ref_tokens, pred_tokens = ref.split(), pred.split()
        common = set(ref_tokens) & set(pred_tokens)
        if not common:
            f1s.append(0)
        else:
            precision = len(common) / len(pred_tokens) if pred_tokens else 0
            recall = len(common) / len(ref_tokens) if ref_tokens else 0
            if precision + recall == 0:
                f1s.append(0)
            else:
                f1s.append(2 * precision * recall / (precision + recall))
    avg_f1 = sum(f1s) / len(f1s)

    # Exact Match
    em = sum([1 if r.strip() == p.strip() else 0 for r, p in zip(references, predictions)]) / len(references)

    # BLEU
    bleu = sum([sentence_bleu([r.split()], p.split()) for r, p in zip(references, predictions)]) / len(references)

    # ROUGE-L (skip empty cases)
    rouge = Rouge()
    rouge_scores = []
    for r, p in zip(references, predictions):
        if not p.strip() or not r.strip():   
            rouge_scores.append(0)
            continue
        score = rouge.get_scores(p, r)[0]['rouge-l']['f']
        rouge_scores.append(score)
    rouge_l = sum(rouge_scores) / len(rouge_scores)

    return avg_f1, em, bleu, rouge_l

In [8]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

In [9]:
def predict_answers(model, tokenizer, data):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    preds, refs = [], []
    for domain in data["domains"]:
        for ctx in domain["contexts"]:
            context = ctx["context"]
            for qa in ctx["qas"]:
                question = qa["question"]
                refs.append(qa["answer"])
                inputs = tokenizer(
                    question,
                    context,
                    return_tensors="pt",
                    truncation=True,
                    max_length=512,
                    padding="max_length"
                ).to(device)
                with torch.no_grad():
                    outputs = model(**inputs)
                    start = torch.argmax(outputs.start_logits)
                    end = torch.argmax(outputs.end_logits)
                    if start > end:  # fallback fix
                        start, end = end, start
                    tokens = inputs["input_ids"][0][start:end+1]
                    answer = tokenizer.decode(tokens, skip_special_tokens=True).strip()
                    if not answer:  # avoid empty prediction
                        answer = "[UNK]"
                    preds.append(answer)
    return refs, preds

In [10]:

val_data_json = json.load(open(val_path, 'r', encoding='utf-8'))
val_refs, val_preds = predict_answers(model, tokenizer, val_data_json)
val_f1, val_em, val_bleu, val_rouge = compute_metrics(val_refs, val_preds)

print("\nValidation Metrics")
print(f"F1 Score: {val_f1:.4f}")
print(f"Exact Match: {val_em:.4f}")
print(f"BLEU Score: {val_bleu:.4f}")
print(f"ROUGE-L Score: {val_rouge:.4f}")


testA_data_json = json.load(open(testA_path, 'r', encoding='utf-8'))
testA_refs, testA_preds = predict_answers(model, tokenizer, testA_data_json)
testA_f1, testA_em, testA_bleu, testA_rouge = compute_metrics(testA_refs, testA_preds)

print("\n Test-A Metrics")
print(f"F1 Score: {testA_f1:.4f}")
print(f"Exact Match: {testA_em:.4f}")
print(f"BLEU Score: {testA_bleu:.4f}")
print(f"ROUGE-L Score: {testA_rouge:.4f}")


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



Validation Metrics
F1 Score: 0.5088
Exact Match: 0.0000
BLEU Score: 0.3513
ROUGE-L Score: 0.5333

 Test-A Metrics
F1 Score: 0.4962
Exact Match: 0.0000
BLEU Score: 0.3382
ROUGE-L Score: 0.5201
