In [1]:
#!/usr/bin/env python
# preprocess_answerable_muril.py

import json
import os
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer  # We'll use AutoTokenizer for MuRIL

# Adjust your paths here
train_json = "English 2.1/squad2.1_train.json"    # SQuAD 2.0 train with unanswerable
val_json   = "English 2.1/squad2.1_val.json"      # SQuAD 2.0 val
test_json  = "English 2.1/squad2.1_test.json"     # SQuAD 2.0 test

out_dir    = "processed_english_answerable_data_muril"   # where we'll write train.pt, val.pt, test.pt
os.makedirs(out_dir, exist_ok=True)

max_length = 512
model_tokenizer_name = "google/muril-large-cased"  # MuRIL Large

###############################################
# 1) Filter out unanswerable QAs
###############################################
def filter_answerable_squad(input_path):
    """
    Returns a new SQuAD JSON dict containing only QAs where is_impossible=False
    with at least one answer.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_data = {
        "version": data.get("version", "filtered_English"),
        "data": []
    }
    for article in data["data"]:
        new_paragraphs = []
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            new_qas = []
            for qa in paragraph["qas"]:
                if not qa.get("is_impossible", False) and qa.get("answers"):
                    new_qas.append(qa)
            if new_qas:
                new_paragraphs.append({
                    "context": context,
                    "qas": new_qas
                })
        if new_paragraphs:
            new_data["data"].append({
                "title": article.get("title", ""),
                "paragraphs": new_paragraphs
            })
    return new_data

###############################################
# 2) Build offset-based examples
###############################################
def build_answerable_examples(squad_data, tokenizer, max_length=384):
    """
    For each answerable QA:
      - tokenize question+context
      - find start/end token indices
      - store offset_mapping, context, gold_text, etc.
    """
    examples_out = []
    for article in squad_data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                ans = qa["answers"][0]
                ans_start = ans["answer_start"]
                ans_text  = ans["text"]
                ans_end   = ans_start + len(ans_text)

                enc = tokenizer(
                    qa["question"],
                    context,
                    max_length=max_length,
                    truncation="only_second",  # Usually for Q&A we truncate the context
                    return_offsets_mapping=True,
                    return_tensors="pt",
                    padding="max_length"
                )

                input_ids      = enc["input_ids"][0]
                attention_mask = enc["attention_mask"][0]
                offset_mapping = enc["offset_mapping"][0].tolist()

                # find start/end token indices
                start_token = None
                end_token   = None
                for i, (start_char, end_char) in enumerate(offset_mapping):
                    if start_char <= ans_start < end_char:
                        start_token = i
                    if start_char < ans_end <= end_char:
                        end_token = i

                # fallback if mismatch
                if start_token is None or end_token is None or end_token < start_token:
                    start_token = 0
                    end_token   = 0

                ex_item = {
                    "id": qa["id"],
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "start_positions": torch.tensor(start_token, dtype=torch.long),
                    "end_positions":   torch.tensor(end_token,   dtype=torch.long),
                    "offset_mapping":  offset_mapping,
                    "context":         context,
                    "gold_text":       ans_text
                }
                examples_out.append(ex_item)
    return examples_out

def main():
    print(f"Using tokenizer: {model_tokenizer_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_tokenizer_name)  # MuRIL Large tokenizer

    # 1) Filter + build train
    print("Filtering + building train data (English) ...")
    train_data_raw = filter_answerable_squad(train_json)
    train_list = build_answerable_examples(train_data_raw, tokenizer, max_length)
    print(f"Train answerable size: {len(train_list)}")

    # 2) Filter + build val
    print("Filtering + building val data (English) ...")
    val_data_raw = filter_answerable_squad(val_json)
    val_list = build_answerable_examples(val_data_raw, tokenizer, max_length)
    print(f"Val answerable size: {len(val_list)}")

    # 3) Filter + build test
    print("Filtering + building test data (English) ...")
    test_data_raw = filter_answerable_squad(test_json)
    test_list = build_answerable_examples(test_data_raw, tokenizer, max_length)
    print(f"Test answerable size: {len(test_list)}")

    # 4) Save as .pt
    train_out = os.path.join(out_dir, "train.pt")
    val_out   = os.path.join(out_dir, "val.pt")
    test_out  = os.path.join(out_dir, "test.pt")

    torch.save(train_list, train_out)
    torch.save(val_list,   val_out)
    torch.save(test_list,  test_out)

    print(f"\nSaved final PT files to {out_dir}/")
    print("Done! English answerable preprocessing completed with MuRIL Large.")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


Using tokenizer: google/muril-large-cased
Filtering + building train data (English) ...
Train answerable size: 80221
Filtering + building val data (English) ...
Val answerable size: 6600
Filtering + building test data (English) ...
Test answerable size: 5928

Saved final PT files to processed_english_answerable_data_muril/
Done! English answerable preprocessing completed with MuRIL Large.


In [2]:
#!/usr/bin/env python
# preprocess_answerable_muril.py

import json
import os
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer  # We'll use AutoTokenizer for MuRIL

# Adjust your paths here
train_json = "Telugu 2.1/squad2.1_telugu_train.json"    # SQuAD 2.0 train with unanswerable
val_json   = "Telugu 2.1/squad2.1_telugu_val.json"      # SQuAD 2.0 val
test_json  = "Telugu 2.1/squad2.1_telugu_test.json"     # SQuAD 2.0 test

out_dir    = "processed_telugu_answerable_data_muril"   # where we'll write train.pt, val.pt, test.pt
os.makedirs(out_dir, exist_ok=True)

max_length = 512
model_tokenizer_name = "google/muril-large-cased"  # MuRIL Large

###############################################
# 1) Filter out unanswerable QAs
###############################################
def filter_answerable_squad(input_path):
    """
    Returns a new SQuAD JSON dict containing only QAs where is_impossible=False
    with at least one answer.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_data = {
        "version": data.get("version", "filtered_telugu"),
        "data": []
    }
    for article in data["data"]:
        new_paragraphs = []
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            new_qas = []
            for qa in paragraph["qas"]:
                if not qa.get("is_impossible", False) and qa.get("answers"):
                    new_qas.append(qa)
            if new_qas:
                new_paragraphs.append({
                    "context": context,
                    "qas": new_qas
                })
        if new_paragraphs:
            new_data["data"].append({
                "title": article.get("title", ""),
                "paragraphs": new_paragraphs
            })
    return new_data

###############################################
# 2) Build offset-based examples
###############################################
def build_answerable_examples(squad_data, tokenizer, max_length=384):
    """
    For each answerable QA:
      - tokenize question+context
      - find start/end token indices
      - store offset_mapping, context, gold_text, etc.
    """
    examples_out = []
    for article in squad_data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                ans = qa["answers"][0]
                ans_start = ans["answer_start"]
                ans_text  = ans["text"]
                ans_end   = ans_start + len(ans_text)

                enc = tokenizer(
                    qa["question"],
                    context,
                    max_length=max_length,
                    truncation="only_second",  # Usually for Q&A we truncate the context
                    return_offsets_mapping=True,
                    return_tensors="pt",
                    padding="max_length"
                )

                input_ids      = enc["input_ids"][0]
                attention_mask = enc["attention_mask"][0]
                offset_mapping = enc["offset_mapping"][0].tolist()

                # find start/end token indices
                start_token = None
                end_token   = None
                for i, (start_char, end_char) in enumerate(offset_mapping):
                    if start_char <= ans_start < end_char:
                        start_token = i
                    if start_char < ans_end <= end_char:
                        end_token = i

                # fallback if mismatch
                if start_token is None or end_token is None or end_token < start_token:
                    start_token = 0
                    end_token   = 0

                ex_item = {
                    "id": qa["id"],
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "start_positions": torch.tensor(start_token, dtype=torch.long),
                    "end_positions":   torch.tensor(end_token,   dtype=torch.long),
                    "offset_mapping":  offset_mapping,
                    "context":         context,
                    "gold_text":       ans_text
                }
                examples_out.append(ex_item)
    return examples_out

def main():
    print(f"Using tokenizer: {model_tokenizer_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_tokenizer_name)  # MuRIL Large tokenizer

    # 1) Filter + build train
    print("Filtering + building train data (TELUGU) ...")
    train_data_raw = filter_answerable_squad(train_json)
    train_list = build_answerable_examples(train_data_raw, tokenizer, max_length)
    print(f"Train answerable size: {len(train_list)}")

    # 2) Filter + build val
    print("Filtering + building val data (TELUGU) ...")
    val_data_raw = filter_answerable_squad(val_json)
    val_list = build_answerable_examples(val_data_raw, tokenizer, max_length)
    print(f"Val answerable size: {len(val_list)}")

    # 3) Filter + build test
    print("Filtering + building test data (TELUGU) ...")
    test_data_raw = filter_answerable_squad(test_json)
    test_list = build_answerable_examples(test_data_raw, tokenizer, max_length)
    print(f"Test answerable size: {len(test_list)}")

    # 4) Save as .pt
    train_out = os.path.join(out_dir, "train.pt")
    val_out   = os.path.join(out_dir, "val.pt")
    test_out  = os.path.join(out_dir, "test.pt")

    torch.save(train_list, train_out)
    torch.save(val_list,   val_out)
    torch.save(test_list,  test_out)

    print(f"\nSaved final PT files to {out_dir}/")
    print("Done! Telugu answerable preprocessing completed with MuRIL Large.")

if __name__ == "__main__":
    main()

Using tokenizer: google/muril-large-cased
Filtering + building train data (TELUGU) ...
Train answerable size: 72039
Filtering + building val data (TELUGU) ...
Val answerable size: 6600
Filtering + building test data (TELUGU) ...
Test answerable size: 5430

Saved final PT files to processed_telugu_answerable_data_muril/
Done! Telugu answerable preprocessing completed with MuRIL Large.


### English

In [1]:
#!/usr/bin/env python
# fine_tune_answerable_muril.py

import os
import re
import torch
import numpy as np
from functools import partial
from transformers import (
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from datasets import Dataset
from transformers.trainer_utils import EvalPrediction
import wandb

############################
# Postprocess + EM/F1
############################
def normalize_text(s):
    def remove_articles(t):
        return re.sub(r"\b(a|an|the)\b", " ", t)
    def remove_punc(t):
        return re.sub(r"[^\w\s]", "", t)
    def white_space_fix(t):
        return " ".join(t.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 1.0 if pred_tokens == gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def postprocess_qa_predictions(examples, start_logits, end_logits):
    """
    Postprocesses model predictions to extract answer text.
    """
    preds = {}
    num_examples = len(examples)
    num_logits = len(start_logits)

    # Safety check
    if num_logits != num_examples:
        print(f"⚠️ Warning: Logits size {num_logits} doesn't match dataset size {num_examples}. Truncating...")
        num_examples = min(num_examples, num_logits)

    for i in range(num_examples):
        ex = examples[i]
        offsets = ex["offset_mapping"]
        context = ex["context"]

        if i >= len(start_logits) or i >= len(end_logits):
            preds[ex["id"]] = ""
            continue

        if len(start_logits[i]) == 0 or len(end_logits[i]) == 0:
            preds[ex["id"]] = ""
            continue

        # Get best start/end indices
        start_idx = int(np.argmax(start_logits[i]))
        end_idx = int(np.argmax(end_logits[i]))

        if start_idx >= len(offsets) or end_idx >= len(offsets) or start_idx > end_idx:
            preds[ex["id"]] = ""
            continue

        # Extract the predicted answer span
        start_char = offsets[start_idx][0]
        end_char   = offsets[end_idx][1]
        pred_text  = context[start_char:end_char]
        
        preds[ex["id"]] = pred_text

    return preds

def compute_metrics(eval_preds, dataset):
    """
    eval_preds => (start_logits, end_logits)
    dataset => the raw examples with gold_text
    """
    (start_logits, end_logits) = eval_preds
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits = end_logits.cpu().numpy()

    preds_dict = postprocess_qa_predictions(dataset, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    for ex in dataset:
        ex_id = ex["id"]
        pred  = preds_dict.get(ex_id, "")
        gold  = ex["gold_text"]
        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count    += 1

    em_val = total_em / count * 100.0
    f1_val = total_f1 / count * 100.0
    return {
        "em": em_val,
        "f1": f1_val
    }

def main():
    # 1) Load .pt files
    data_dir = "processed_english_answerable_data_muril"  # from previous script
    train_list = torch.load(os.path.join(data_dir, "train.pt"))
    val_list   = torch.load(os.path.join(data_dir, "val.pt"))
    test_list  = torch.load(os.path.join(data_dir, "test.pt"))

    # 2) Convert to huggingface Datasets
    train_dataset = Dataset.from_list(train_list)
    val_dataset   = Dataset.from_list(val_list)
    test_dataset  = Dataset.from_list(test_list)

    # Initialize Weights & Biases (optional)
    wandb.init(project="TeQAS_1.2", name="MuRIL_Eng_1")

    # 3) Initialize model (MuRIL Large)
    model_name = "google/muril-large-cased"
    print(f"Loading model: {model_name}")
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Define Training Arguments
    training_args = TrainingArguments(
        output_dir="checkpoints_muril_eng_answerable_v2",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=2,            # Adjust as needed
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir="logs_muril_answerable",
        logging_steps=100
    )

    def hf_compute_metrics(p: EvalPrediction):
        # p.predictions => (start_logits, end_logits)
        return compute_metrics(p.predictions, val_dataset)

    # Trainer Initialization
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=default_data_collator,
        tokenizer=None,  # Not strictly needed since we don't do on-the-fly tokenization
        compute_metrics=hf_compute_metrics
    )

    # 4) Train
    trainer.train()

    # 5) Save final model
    trainer.save_model("final_muril_eng_answerable_v2")
    print("Done! Model + checkpoint saved.")

    # Evaluate on val => see final
    final_val_metrics = trainer.evaluate()
    print("Final val metrics:", final_val_metrics)

    # Evaluate on test set => final test metrics
    test_preds = trainer.predict(test_dataset)
    final_test_metrics = compute_metrics(test_preds.predictions, test_list)
    print("Test set metrics:", final_test_metrics)

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
2025-01-27 18:08:18.073976: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  train_list = torch.load(os.path.join(data_dir, "train.pt"))
  val_list   = torch.load(os.path.join(data_dir, "val.pt"))
  test_list  = torch.load(os.path.join(data_dir, "test.pt"))
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK bac

Loading model: google/muril-large-cased


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google/muril-large-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[1738002437.166616] [101f5ae8f935:2854 :f]        vfs_fuse.c:281  UCX  ERROR inotify_add_watch(/tmp) failed: No space left on device




Epoch,Training Loss,Validation Loss,Em,F1
1,0.7504,0.777438,71.181818,84.391236
2,0.5898,0.754629,72.090909,85.245737




Done! Model + checkpoint saved.




Final val metrics: {'eval_loss': 0.7546290755271912, 'eval_em': 72.0909090909091, 'eval_f1': 85.24573695226366, 'eval_runtime': 600.5605, 'eval_samples_per_second': 10.99, 'eval_steps_per_second': 0.345, 'epoch': 2.0}
Test set metrics: {'em': 69.77058029689609, 'f1': 83.99227358899536}


### Telugu

In [1]:
#!/usr/bin/env python
# fine_tune_answerable_muril.py

import os
import re
import torch
import numpy as np
from functools import partial
from transformers import (
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from datasets import Dataset
from transformers.trainer_utils import EvalPrediction
import wandb

############################
# Postprocess + EM/F1
############################
def normalize_text(s):
    def remove_articles(t):
        return re.sub(r"\b(a|an|the)\b", " ", t)
    def remove_punc(t):
        return re.sub(r"[^\w\s]", "", t)
    def white_space_fix(t):
        return " ".join(t.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 1.0 if pred_tokens == gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def postprocess_qa_predictions(examples, start_logits, end_logits):
    """
    Postprocesses model predictions to extract answer text.
    """
    preds = {}
    num_examples = len(examples)
    num_logits = len(start_logits)

    # Safety check
    if num_logits != num_examples:
        print(f"⚠️ Warning: Logits size {num_logits} doesn't match dataset size {num_examples}. Truncating...")
        num_examples = min(num_examples, num_logits)

    for i in range(num_examples):
        ex = examples[i]
        offsets = ex["offset_mapping"]
        context = ex["context"]

        if i >= len(start_logits) or i >= len(end_logits):
            preds[ex["id"]] = ""
            continue

        if len(start_logits[i]) == 0 or len(end_logits[i]) == 0:
            preds[ex["id"]] = ""
            continue

        # Get best start/end indices
        start_idx = int(np.argmax(start_logits[i]))
        end_idx = int(np.argmax(end_logits[i]))

        if start_idx >= len(offsets) or end_idx >= len(offsets) or start_idx > end_idx:
            preds[ex["id"]] = ""
            continue

        # Extract the predicted answer span
        start_char = offsets[start_idx][0]
        end_char   = offsets[end_idx][1]
        pred_text  = context[start_char:end_char]
        
        preds[ex["id"]] = pred_text

    return preds

def compute_metrics(eval_preds, dataset):
    """
    eval_preds => (start_logits, end_logits)
    dataset => the raw examples with gold_text
    """
    (start_logits, end_logits) = eval_preds
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits = end_logits.cpu().numpy()

    preds_dict = postprocess_qa_predictions(dataset, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    for ex in dataset:
        ex_id = ex["id"]
        pred  = preds_dict.get(ex_id, "")
        gold  = ex["gold_text"]
        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count    += 1

    em_val = total_em / count * 100.0
    f1_val = total_f1 / count * 100.0
    return {
        "em": em_val,
        "f1": f1_val
    }

def main():
    # 1) Load .pt files
    data_dir = "processed_telugu_answerable_data_muril"  # from previous script
    train_list = torch.load(os.path.join(data_dir, "train.pt"))
    val_list   = torch.load(os.path.join(data_dir, "val.pt"))
    test_list  = torch.load(os.path.join(data_dir, "test.pt"))

    # 2) Convert to huggingface Datasets
    train_dataset = Dataset.from_list(train_list)
    val_dataset   = Dataset.from_list(val_list)
    test_dataset  = Dataset.from_list(test_list)

    # Initialize Weights & Biases (optional)
    wandb.init(project="TeQAS_1.2", name="MuRIL_Telugu_3")

    # 3) Initialize model (MuRIL Large)
    model_name = "./final_muril_eng_answerable_v2"
    print(f"Loading model: {model_name}")
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Define Training Arguments
    training_args = TrainingArguments(
        output_dir="checkpoints_muril_tel_answerable_v2",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,            # Adjust as needed
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir="logs_muril_answerable",
        logging_steps=100
    )

    def hf_compute_metrics(p: EvalPrediction):
        # p.predictions => (start_logits, end_logits)
        return compute_metrics(p.predictions, val_dataset)

    # Trainer Initialization
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=default_data_collator,
        tokenizer=None,  # Not strictly needed since we don't do on-the-fly tokenization
        compute_metrics=hf_compute_metrics
    )

    # 4) Train
    trainer.train()

    # 5) Save final model
    trainer.save_model("final_muril_tel_answerable_v2")
    print("Done! Model + checkpoint saved.")

    # # Evaluate on val => see final
    # final_val_metrics = trainer.evaluate()
    # print("Final val metrics:", final_val_metrics)

    # Evaluate on test set => final test metrics
    test_preds = trainer.predict(test_dataset)
    final_test_metrics = compute_metrics(test_preds.predictions, test_list)
    print("Test set metrics:", final_test_metrics)

    wandb.finish()

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
2025-01-28 08:37:41.048232: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  train_list = torch.load(os.path.join(data_dir, "train.pt"))
  val_list   = torch.load(os.path.join(data_dir, "val.pt"))
  test_list  = torch.load(os.path.join(data_dir, "test.pt"))
[34m[1mwandb[0m: Currently logged in as: [33msanthoshrishi9999[0m ([33msanthosh-rishi[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Loading model: ./final_muril_eng_answerable_v2


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Em,F1
1,1.1845,1.169649,60.19697,77.287197
2,0.9444,1.184296,60.318182,77.901823
3,0.726,1.254607,60.893939,78.19211




Done! Model + checkpoint saved.




Test set metrics: {'em': 51.60220994475138, 'f1': 72.08715583242966}


0,1
eval/em,▁▂█
eval/f1,▁▆█
eval/loss,▁▂█
eval/runtime,▆▁█
eval/samples_per_second,▃█▁
eval/steps_per_second,▃█▁
test/em,▁
test/f1,▁
test/loss,▁
test/runtime,▁

0,1
eval/em,60.89394
eval/f1,78.19211
eval/loss,1.25461
eval/runtime,628.2416
eval/samples_per_second,10.506
eval/steps_per_second,0.329
test/em,0.04545
test/f1,2.4011
test/loss,1.66918
test/runtime,507.0632


In [2]:
#!/usr/bin/env python
# compare_muril_baseline_vs_finetuned.py

import os
import re
import torch
import numpy as np
from functools import partial
from transformers import (
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from datasets import Dataset
from transformers.trainer_utils import EvalPrediction
import wandb

############################
# Postprocess + EM/F1
############################
def normalize_text(s):
    """Normalize text for comparison."""
    def remove_articles(t):
        return re.sub(r"\b(a|an|the)\b", " ", t)
    def remove_punc(t):
        return re.sub(r"[^\w\s]", "", t)
    def white_space_fix(t):
        return " ".join(t.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def exact_match(pred, gold):
    """Exact match score."""
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    """F1-score computation."""
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 1.0 if pred_tokens == gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def postprocess_qa_predictions(examples, start_logits, end_logits):
    """Extract answer spans from model outputs."""
    preds = {}
    for i, ex in enumerate(examples):
        offsets = ex["offset_mapping"]
        context = ex["context"]

        if i >= len(start_logits) or i >= len(end_logits):
            preds[ex["id"]] = ""
            continue

        # Get best start/end indices
        start_idx = int(np.argmax(start_logits[i]))
        end_idx = int(np.argmax(end_logits[i]))

        if start_idx >= len(offsets) or end_idx >= len(offsets) or start_idx > end_idx:
            preds[ex["id"]] = ""
            continue

        # Extract the predicted answer span
        start_char = offsets[start_idx][0]
        end_char   = offsets[end_idx][1]
        pred_text  = context[start_char:end_char]
        
        preds[ex["id"]] = pred_text

    return preds

def compute_metrics(eval_preds, dataset):
    """Compute EM and F1 metrics."""
    (start_logits, end_logits) = eval_preds
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits = end_logits.cpu().numpy()

    preds_dict = postprocess_qa_predictions(dataset, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    for ex in dataset:
        ex_id = ex["id"]
        pred  = preds_dict.get(ex_id, "")
        gold  = ex["gold_text"]
        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count    += 1

    em_val = total_em / count * 100.0
    f1_val = total_f1 / count * 100.0
    return {"em": em_val, "f1": f1_val}

############################
# Load Data
############################
def load_data():
    """Load processed datasets."""
    data_dir = "processed_telugu_answerable_data_muril"
    val_list   = torch.load(os.path.join(data_dir, "val.pt"))
    test_list  = torch.load(os.path.join(data_dir, "test.pt"))
    val_dataset   = Dataset.from_list(val_list)
    test_dataset  = Dataset.from_list(test_list)
    return val_dataset, test_dataset

############################
# Model Evaluation
############################
def evaluate_model(model_name, dataset, model_alias):
    """Load a model and evaluate it on the given dataset."""
    print(f"Loading model: {model_name}")
    model = AutoModelForQuestionAnswering.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

    # Define Training Arguments (for evaluation)
    training_args = TrainingArguments(
        output_dir="checkpoints_muril_comparison",
        per_device_eval_batch_size=16,
        do_train=False,
        do_eval=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=dataset,
        data_collator=default_data_collator,
        compute_metrics=lambda p: compute_metrics(p.predictions, dataset)
    )

    # Evaluate
    results = trainer.evaluate()
    print(f"Results for {model_alias}: {results}")
    return results

############################
# Main Function
############################
def main():
    # Load datasets
    val_dataset, test_dataset = load_data()

    # Initialize W&B for logging
    wandb.init(project="TeQAS_Comparison", name="MuRIL_Baseline_vs_FineTuned")

    # Evaluate Baseline MuRIL
    baseline_results_val  = evaluate_model("google/muril-large-cased", val_dataset, "Baseline MuRIL (Val)")
    baseline_results_test = evaluate_model("google/muril-large-cased", test_dataset, "Baseline MuRIL (Test)")

    # Evaluate Fine-Tuned MuRIL
    finetuned_results_val  = evaluate_model("./final_muril_tel_answerable_v2", val_dataset, "Fine-Tuned MuRIL (Val)")
    finetuned_results_test = evaluate_model("./final_muril_tel_answerable_v2", test_dataset, "Fine-Tuned MuRIL (Test)")

    # Comparison Table
    print("\n### Performance Comparison ###")
    print(f"{'Metric':<20}{'Baseline MuRIL':<20}{'Fine-Tuned MuRIL'}")
    print("-" * 60)
    print(f"{'Validation EM':<20}{baseline_results_val['eval_em']:<20.2f}{finetuned_results_val['eval_em']:.2f}")
    print(f"{'Validation F1':<20}{baseline_results_val['eval_f1']:<20.2f}{finetuned_results_val['eval_f1']:.2f}")
    print(f"{'Test EM':<20}{baseline_results_test['eval_em']:<20.2f}{finetuned_results_test['eval_em']:.2f}")
    print(f"{'Test F1':<20}{baseline_results_test['eval_f1']:<20.2f}{finetuned_results_test['eval_f1']:.2f}")

    # Log results
    wandb.log({
        "Baseline EM (Val)": baseline_results_val["eval_em"],
        "Baseline F1 (Val)": baseline_results_val["eval_f1"],
        "Fine-Tuned EM (Val)": finetuned_results_val["eval_em"],
        "Fine-Tuned F1 (Val)": finetuned_results_val["eval_f1"],
        "Baseline EM (Test)": baseline_results_test["eval_em"],
        "Baseline F1 (Test)": baseline_results_test["eval_f1"],
        "Fine-Tuned EM (Test)": finetuned_results_test["eval_em"],
        "Fine-Tuned F1 (Test)": finetuned_results_test["eval_f1"],
    })

    wandb.finish()
    print("✅ Comparison completed!")

if __name__ == "__main__":
    main()

  val_list   = torch.load(os.path.join(data_dir, "val.pt"))
  test_list  = torch.load(os.path.join(data_dir, "test.pt"))


Loading model: google/muril-large-cased


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google/muril-large-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Results for Baseline MuRIL (Val): {'eval_loss': 6.258428573608398, 'eval_em': 0.0, 'eval_f1': 2.3042203328654196, 'eval_runtime': 643.545, 'eval_samples_per_second': 10.256, 'eval_steps_per_second': 0.322}
Loading model: google/muril-large-cased


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google/muril-large-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Results for Baseline MuRIL (Test): {'eval_loss': 6.2398481369018555, 'eval_em': 0.0, 'eval_f1': 4.231857038797818, 'eval_runtime': 499.9708, 'eval_samples_per_second': 10.861, 'eval_steps_per_second': 0.34}
Loading model: ./final_muril_tel_answerable_v2


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Results for Fine-Tuned MuRIL (Val): {'eval_loss': 1.2546072006225586, 'eval_em': 60.89393939393939, 'eval_f1': 78.19210983489383, 'eval_runtime': 592.3048, 'eval_samples_per_second': 11.143, 'eval_steps_per_second': 0.349}
Loading model: ./final_muril_tel_answerable_v2


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Results for Fine-Tuned MuRIL (Test): {'eval_loss': 1.6691776514053345, 'eval_em': 51.60220994475138, 'eval_f1': 72.08715583242966, 'eval_runtime': 530.972, 'eval_samples_per_second': 10.227, 'eval_steps_per_second': 0.32}

### Performance Comparison ###
Metric              Baseline MuRIL      Fine-Tuned MuRIL
------------------------------------------------------------
Validation EM       0.00                60.89
Validation F1       2.30                78.19
Test EM             0.00                51.60
Test F1             4.23                72.09


0,1
Baseline EM (Test),▁
Baseline EM (Val),▁
Baseline F1 (Test),▁
Baseline F1 (Val),▁
Fine-Tuned EM (Test),▁
Fine-Tuned EM (Val),▁
Fine-Tuned F1 (Test),▁
Fine-Tuned F1 (Val),▁
eval/em,▁▁█▇
eval/f1,▁▁█▇

0,1
Baseline EM (Test),0.0
Baseline EM (Val),0.0
Baseline F1 (Test),4.23186
Baseline F1 (Val),2.30422
Fine-Tuned EM (Test),51.60221
Fine-Tuned EM (Val),60.89394
Fine-Tuned F1 (Test),72.08716
Fine-Tuned F1 (Val),78.19211
eval/em,51.60221
eval/f1,72.08716


✅ Comparison completed!


In [4]:
import pandas as pd
import numpy as np

# Set display options for better formatting
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.precision', 2)
pd.set_option('display.float_format', lambda x: '%.2f' % x if isinstance(x, (int, float)) else str(x))

def print_table_with_header(df, title):
   # Get the maximum width of each column for formatting
   col_widths = [max(len(str(x)) for x in df[col]) for col in df.columns]
   col_widths = [max(len(col), width) for col, width in zip(df.columns, col_widths)]
   
   # Create header separator
   header_sep = "=" * (sum(col_widths) + (len(col_widths) - 1) * 3 + 4)
   
   # Print title
   print(f"\n{title}")
   print(header_sep)
   
   # Format and print header
   header = " | ".join(f"{col:{width}}" for col, width in zip(df.columns, col_widths))
   print(f"| {header} |")
   
   # Print separator after header
   print("|" + "|".join("-" * (width + 2) for width in col_widths) + "|")
   
   # Print each row
   for _, row in df.iterrows():
       row_str = " | ".join(f"{str(val):{width}}" for val, width in zip(row, col_widths))
       print(f"| {row_str} |")
   
   print(header_sep + "\n")

# XLM-R Full Data
xlmr_full_val_test = pd.DataFrame({
   "Metric": ["Exact Match (EM)", "F1 Score", "Is Impossible Accuracy"],
   "Baseline (Validation)": [0.09, 0.12, 39.15],
   "Fine-Tuned (Validation)": [55.52, 69.59, 87.42],
   "Baseline (Test)": [0.20, 0.22, 39.31],
   "Fine-Tuned (Test)": [61.14, 70.65, 82.20]
})

# XLM-R Answerable Only
xlmr_answerable_val_test = pd.DataFrame({
   "Metric": ["Exact Match (EM)", "F1 Score"],
   "Baseline (Validation)": [0.00, 3.13],
   "Fine-Tuned (Validation)": [59.53, 76.88],
   "Baseline (Test)": [0.00, 3.49],
   "Fine-Tuned (Test)": [51.23, 71.66]
})

# MuRIL Full Data
muril_full_val_test = pd.DataFrame({
   "Metric": ["Exact Match (EM)", "F1 Score", "Is Impossible Accuracy"],
   "Baseline (Validation)": [0.13, 0.15, 53.68],
   "Fine-Tuned (Validation)": [57.08, 71.84, 80.80],
   "Baseline (Test)": [0.24, 0.26, 47.34],
   "Fine-Tuned (Test)": [58.94, 69.90, 75.00]
})

# MuRIL Answerable Only
muril_answerable_val_test = pd.DataFrame({
   "Metric": ["Exact Match (EM)", "F1 Score"],
   "Baseline (Validation)": [0.00, 2.30],
   "Fine-Tuned (Validation)": [60.89, 78.19],
   "Baseline (Test)": [0.00, 4.23],
   "Fine-Tuned (Test)": [51.60, 72.09]
})

# Print all tables with proper formatting
print_table_with_header(xlmr_full_val_test, "Table 1: XLM-R Full Data Performance (Validation & Test)")
print_table_with_header(muril_full_val_test, "Table 2: MuRIL Full Data Performance (Validation & Test)")
print_table_with_header(xlmr_answerable_val_test, "Table 3: XLM-R Answerable Only Performance")
print_table_with_header(muril_answerable_val_test, "Table 4: MuRIL Answerable Only Performance")

# Optionally save to CSV
xlmr_full_val_test.to_csv('xlmr_full_performance.csv', index=False)
xlmr_answerable_val_test.to_csv('xlmr_answerable_performance.csv', index=False)
muril_full_val_test.to_csv('muril_full_performance.csv', index=False)
muril_answerable_val_test.to_csv('muril_answerable_performance.csv', index=False)


Table 1: XLM-R Full Data Performance (Validation & Test)
| Metric                 | Baseline (Validation) | Fine-Tuned (Validation) | Baseline (Test) | Fine-Tuned (Test) |
|------------------------|-----------------------|-------------------------|-----------------|-------------------|
| Exact Match (EM)       | 0.09                  | 55.52                   | 0.2             | 61.14             |
| F1 Score               | 0.12                  | 69.59                   | 0.22            | 70.65             |
| Is Impossible Accuracy | 39.15                 | 87.42                   | 39.31           | 82.2              |


Table 2: MuRIL Full Data Performance (Validation & Test)
| Metric                 | Baseline (Validation) | Fine-Tuned (Validation) | Baseline (Test) | Fine-Tuned (Test) |
|------------------------|-----------------------|-------------------------|-----------------|-------------------|
| Exact Match (EM)       | 0.13                  | 57.08                   | 0