# TeQAS 2.0



> English (1 epoch) + Telugu (3 epochs) Fine-Tuning | complete dataset



## 🔶 XLM-R

#### ♦️ FIne-Tuning

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install specific compatible versions
!pip install datasets==2.12.0
!pip install evaluate==0.4.0
!pip install nltk
!pip install rouge-score

Collecting datasets==2.12.0
  Downloading datasets-2.12.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.7,>=0.3.0 (from datasets==2.12.0)
  Downloading dill-0.3.6-py3-none-any.whl.metadata (9.8 kB)
Collecting xxhash (from datasets==2.12.0)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets==2.12.0)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting responses<0.19 (from datasets==2.12.0)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.12.0)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.15-py311-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.14-py310-none-any.whl.metadata (6.6 kB)
D

In [None]:
import torch
import numpy as np
import wandb
import evaluate
import datasets
import transformers
import nltk
from rouge_score import rouge_scorer

# Print library versions
print("Library Versions:")
print(f"torch: {torch.__version__}")
print(f"numpy: {np.__version__}")
print(f"wandb: {wandb.__version__}")
print(f"evaluate: {evaluate.__version__}")
print(f"datasets: {datasets.__version__}")
print(f"transformers: {transformers.__version__}")
print(f"nltk: {nltk.__version__}")
print(f"rouge_score: rouge_score (no __version__ attribute)")

Library Versions:
torch: 2.5.1+cu121
numpy: 1.26.4
wandb: 0.19.1
evaluate: 0.4.0
datasets: 2.12.0
transformers: 4.47.1
nltk: 3.9.1
rouge_score: rouge_score (no __version__ attribute)


In [None]:
  #!/usr/bin/env python
# fine_tune_qa_trainer.py

### Import statements

import os
import torch
import numpy as np
import wandb
import evaluate
from datasets import Dataset
from transformers import (
    XLMRobertaForQuestionAnswering,
    XLMRobertaTokenizerFast,
    Trainer,
    TrainingArguments
)
from transformers.trainer_utils import EvalPrediction
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import re
from rouge_score import rouge_scorer
from typing import List

### QATrainer With Custom Loss

class QATrainerWithCustomLoss(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Custom loss computation with boundary token weighting.
        """
        # Model outputs
        outputs = model(**inputs)

        # Get the original loss components
        start_logits = outputs.start_logits  # Shape: [batch_size, seq_len]
        end_logits = outputs.end_logits      # Shape: [batch_size, seq_len]
        start_positions = inputs["start_positions"]  # Shape: [batch_size]
        end_positions = inputs["end_positions"]      # Shape: [batch_size]

        # Original loss using CrossEntropyLoss
        loss_fct = torch.nn.CrossEntropyLoss(reduction='none')  # Token-wise loss
        start_loss = loss_fct(start_logits, start_positions)  # Shape: [batch_size, seq_len]
        end_loss = loss_fct(end_logits, end_positions)        # Shape: [batch_size, seq_len]

        # Add boundary weighting
        # Higher weights for tokens near the true boundaries
        batch_size, seq_len = start_logits.shape
        weighted_start_loss = []
        weighted_end_loss = []

        for i in range(batch_size):
            # Get true start and end positions for this example
            true_start = start_positions[i]
            true_end = end_positions[i]

            # Create weight distribution centered around true positions
            start_weights = torch.tensor(
                create_boundary_weights(true_start.item(), seq_len),
                device=start_logits.device,
                dtype=start_logits.dtype
            )
            end_weights = torch.tensor(
                create_boundary_weights(true_end.item(), seq_len),
                device=end_logits.device,
                dtype=end_logits.dtype
            )

            # Apply weights to the loss
            weighted_start_loss.append(start_loss[i] * start_weights)
            weighted_end_loss.append(end_loss[i] * end_weights)

        # Stack and compute mean loss across the batch
        weighted_start_loss = torch.stack(weighted_start_loss)  # Shape: [batch_size, seq_len]
        weighted_end_loss = torch.stack(weighted_end_loss)      # Shape: [batch_size, seq_len]

        total_loss = (weighted_start_loss.mean() + weighted_end_loss.mean()) / 2.0

        return (total_loss, outputs) if return_outputs else total_loss

def create_boundary_weights(position, seq_len, window_size=3, peak_weight=2.0):
    """Create weight distribution around boundary position."""
    weights = np.ones(seq_len)

    # Create window around the boundary position
    start_idx = max(0, position - window_size)
    end_idx = min(seq_len, position + window_size + 1)

    # Linear decay of weights from center
    for idx in range(start_idx, end_idx):
        distance = abs(idx - position)
        weight = peak_weight * (1 - distance/window_size)
        if weight > 1:
            weights[idx] = weight

    return weights

### Metrics and Processing

def normalize_text(s):
    """Text normalization."""
    def remove_articles(txt):
        return re.sub(r"\b(a|an|the)\b", " ", txt)
    def remove_punc(txt):
        return re.sub(r"[^\w\s]", "", txt)
    def white_space_fix(txt):
        return " ".join(txt.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def compute_bleu(pred, gold):
    """Compute BLEU scores for different n-grams."""
    if not pred or not gold:
        return {"unigram": 0.0, "bigram": 0.0, "trigram": 0.0, "quadgram": 0.0}

    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()

    weights_dict = {
        "unigram": (1.0, 0, 0, 0),
        "bigram": (0.5, 0.5, 0, 0),
        "trigram": (0.33, 0.33, 0.34, 0),
        "quadgram": (0.25, 0.25, 0.25, 0.25),
    }

    smoothing = SmoothingFunction().method1
    scores = {}

    for name, weights in weights_dict.items():
        try:
            score = sentence_bleu([gold_tokens], pred_tokens, weights=weights, smoothing_function=smoothing)
        except Exception:
            score = 0.0
        scores[name] = score

    return scores


def compute_rouge(pred, gold):
    """Compute ROUGE scores."""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    if not pred or not gold:
        return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    try:
        scores = scorer.score(pred, gold)
        return {k: v.fmeasure for k, v in scores.items()}
    except Exception:
        return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 1.0 if pred_tokens == gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

### Answer Post-processing and Metrics Computation:

def clean_prediction(text):
    """Clean predicted text by removing extra spaces and normalizing."""
    if not text:
        return text
    # Example cleaning: strip whitespace and remove extra spaces
    text = text.strip()  # Remove leading/trailing whitespace
    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces with a single space
    return text

def postprocess_qa_predictions(examples, start_logits, end_logits):
    """Post-process model predictions with improved span extraction."""
    preds = {}
    limit = min(len(examples), len(start_logits))
    max_answer_length = 100
    n_best_size = 20

    for i in range(limit):
        ex = examples[i]
        offsets = ex["offset_mapping"]
        context = ex["context"]
        ex_id = ex["id"]

        start_indices = np.argsort(start_logits[i])[-n_best_size:].tolist()
        end_indices = np.argsort(end_logits[i])[-n_best_size:].tolist()

        best_score = float('-inf')
        best_start = 0
        best_end = 0

        for start_idx in start_indices:
            for end_idx in end_indices:
                if (end_idx < start_idx or
                    end_idx - start_idx + 1 > max_answer_length or
                    start_idx >= len(offsets) or
                    end_idx >= len(offsets) or
                    offsets[start_idx] is None or
                    offsets[end_idx] is None):
                    continue

                score = start_logits[i][start_idx] + end_logits[i][end_idx]
                if score > best_score:
                    best_score = score
                    best_start = start_idx
                    best_end = end_idx

        if best_score != float('-inf'):
            start_char = offsets[best_start][0]
            end_char = offsets[best_end][1]
            pred_text = context[start_char:end_char]
            pred_text = clean_prediction(pred_text)
        else:
            pred_text = ""

        preds[ex_id] = pred_text

    return preds


def compute_metrics(eval_preds: EvalPrediction, raw_dataset):
    """Compute EM, F1, BLEU, and ROUGE metrics."""
    start_logits, end_logits = eval_preds.predictions

    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
        end_logits = end_logits.cpu().numpy()

    preds_dict = postprocess_qa_predictions(raw_dataset, start_logits, end_logits)

    metrics = {
        "em": 0.0,
        "f1": 0.0,
        "bleu_unigram": 0.0,
        "bleu_bigram": 0.0,
        "rouge1": 0.0,
        "rouge2": 0.0,
        "rougeL": 0.0
    }

    total = 0

    for ex in raw_dataset:
        ex_id = ex["id"]
        pred = preds_dict.get(ex_id, "")
        gold = ex["gold_text"]

        # Basic metrics
        metrics["em"] += exact_match(pred, gold)
        metrics["f1"] += f1_score(pred, gold)

        # BLEU scores
        bleu_scores = compute_bleu(pred, gold)
        metrics["bleu_unigram"] += bleu_scores["unigram"]
        metrics["bleu_bigram"] += bleu_scores["bigram"]

        # ROUGE scores
        rouge_scores = compute_rouge(pred, gold)
        metrics["rouge1"] += rouge_scores["rouge1"]
        metrics["rouge2"] += rouge_scores["rouge2"]
        metrics["rougeL"] += rouge_scores["rougeL"]

        total += 1

    # Average all metrics
    for key in metrics:
        metrics[key] = (metrics[key] / total) * 100.0

    return metrics

### Custom Data Collator

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def squad_collate(features):
    """Collate features into batches."""
    for f in features:
        for key in ["input_ids", "attention_mask", "start_positions", "end_positions"]:
            if not isinstance(f[key], torch.Tensor):
                f[key] = torch.tensor(f[key], dtype=torch.long)

    batch = {
        "input_ids": torch.stack([f["input_ids"] for f in features]),
        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        "start_positions": torch.stack([f["start_positions"] for f in features]),
        "end_positions": torch.stack([f["end_positions"] for f in features]),
    }
    return batch

##### English

In [None]:
### Main Training Code

# Base directory setup
BASE_DIR = "/content/drive/MyDrive/TeQAS V5/XLM_R"
MODEL_NAME = "xlm-roberta-large"
DATA_DIR = os.path.join(BASE_DIR, "xlm_r_processed_english_squad_v2")
OUTPUT_DIR = os.path.join(BASE_DIR, "checkpoints_xlmr_answerable_1")
FINAL_MODEL_DIR = os.path.join(BASE_DIR, "final_xlmr_2.0_eng_1")

# Wandb configuration
WANDB_PROJECT = "TeQAS 2.0"
RUN_NAME = "xlmr_eng_run1"

def verify_directory(path, required_files):
    """Check if a directory contains all required files."""
    if not os.path.exists(path):
        raise FileNotFoundError(f"Directory not found: {path}")
    missing_files = [f for f in required_files if not os.path.exists(os.path.join(path, f))]
    if missing_files:
        raise FileNotFoundError(f"Missing files in {path}: {', '.join(missing_files)}")
    print(f"Verified directory: {path}")


def main():
    # Initialize wandb
    wandb.init(project=WANDB_PROJECT, name=RUN_NAME)

    TOKENIZER_FILES = [
        "tokenizer_config.json",
        "special_tokens_map.json",
        "sentencepiece.bpe.model",
        "tokenizer.json"
    ]

    MODEL_FILES = [
        "config.json",
        "model.safetensors"  # Replace with "pytorch_model.bin" if not using safetensors
    ]


    # 1) Load tokenizer and save
    print("Loading tokenizer...")
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained(OUTPUT_DIR)

    # 2) Load data
    print("Loading data...")
    train_list = torch.load(os.path.join(DATA_DIR, "train.pt"))
    val_list = torch.load(os.path.join(DATA_DIR, "val.pt"))
    test_list = []
    if os.path.exists(os.path.join(DATA_DIR, "test.pt")):
        test_list = torch.load(os.path.join(DATA_DIR, "test.pt"))

    # 3) Convert to HF Datasets
    train_dataset = Dataset.from_list(train_list)
    val_dataset = Dataset.from_list(val_list)
    test_dataset = Dataset.from_list(test_list) if test_list else None

    # 4) Load model
    print("Loading model...")
    model = XLMRobertaForQuestionAnswering.from_pretrained(MODEL_NAME)
    model.to(device)  # Ensure the model is on the correct device

    # Set dropout for regularization
    model.config.hidden_dropout_prob = 0.2
    model.config.attention_probs_dropout_prob = 0.2

    # 5) Training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="epoch",      # Evaluate once per epoch
        save_strategy="epoch",            # Save once per epoch
        num_train_epochs=1,               # Increase to 2 for better convergence
        learning_rate=2e-5,               # Slightly higher LR for faster training
        per_device_train_batch_size=32,   # Reduce batch size for better gradient diversity
        per_device_eval_batch_size=32,
        warmup_ratio=0.1,                 # Retain warmup for smoother start
        weight_decay=0.01,                # Regularization for stability
        max_grad_norm=1.0,                # Gradient clipping
        gradient_accumulation_steps=1,    # Disable accumulation to update weights more frequently
        label_smoothing_factor=0.1,       # Keep smoothing for better generalization
        logging_dir="logs_answerable",
        logging_steps=100,
        report_to="wandb"
    )

    # 6) Initialize Trainer
    def compute_metrics_wrapper(eval_pred):
        return compute_metrics(eval_pred, val_list)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=squad_collate,
        compute_metrics=compute_metrics_wrapper,
    )

    # 7) Train
    print("Starting training...")
    trainer.train()

    # 8) Save final model and tokenizer
    print("Saving final model and tokenizer...")
    trainer.save_model(FINAL_MODEL_DIR)
    tokenizer.save_pretrained(FINAL_MODEL_DIR)

    # Reload final model for evaluation
    print("\nReloading final model for evaluation...")
    final_model = XLMRobertaForQuestionAnswering.from_pretrained(FINAL_MODEL_DIR)
    final_model.to(device)

    # # Evaluate on validation set using the final model
    # print("\nEvaluating on validation set with the final model...")
    # val_trainer = Trainer(
    #     model=final_model,
    #     args=training_args,
    #     eval_dataset=val_dataset,
    #     data_collator=squad_collate,
    #     compute_metrics=lambda eval_pred: compute_metrics(eval_pred, val_list),
    # )
    # val_metrics = val_trainer.evaluate()
    # print("Validation Metrics:", val_metrics)

    # Evaluate on test set using the final model
    if test_dataset and len(test_list) > 0:
        print("\nEvaluating on test set with the final model...")
        test_trainer = Trainer(
            model=final_model,
            args=training_args,
            eval_dataset=test_dataset,
            data_collator=squad_collate,
            compute_metrics=lambda eval_pred: compute_metrics(eval_pred, test_list),
        )
        test_metrics = test_trainer.evaluate()
        print("Test Metrics:", test_metrics)

        # Log metrics to WandB
        wandb.log({
           # "final_val_metrics": val_metrics,
            "final_test_metrics": test_metrics
        })

    # Close WandB
    wandb.finish()
    print("\nTraining completed successfully!")

    # Verify saved files
    verify_directory(FINAL_MODEL_DIR, TOKENIZER_FILES + MODEL_FILES)

if __name__ == "__main__":
    main()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Loading data...


  train_list = torch.load(os.path.join(DATA_DIR, "train.pt"))
  val_list = torch.load(os.path.join(DATA_DIR, "val.pt"))
  test_list = torch.load(os.path.join(DATA_DIR, "test.pt"))


Loading model...


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...




Epoch,Training Loss,Validation Loss,Em,F1,Bleu Unigram,Bleu Bigram,Rouge1,Rouge2,Rougel
1,0.9103,0.958987,68.656195,79.67209,56.360128,40.980833,59.525473,37.845258,59.485369


Saving final model and tokenizer...

Reloading final model for evaluation...

Evaluating on test set with the final model...


Test Metrics: {'eval_loss': 0.8145819306373596, 'eval_model_preparation_time': 0.0069, 'eval_em': 73.18266149434982, 'eval_f1': 80.50265218050467, 'eval_bleu_unigram': 35.56329938915903, 'eval_bleu_bigram': 25.910992238915902, 'eval_rouge1': 37.778763421064305, 'eval_rouge2': 23.783572813894303, 'eval_rougeL': 37.763108128402266, 'eval_runtime': 235.8855, 'eval_samples_per_second': 50.27, 'eval_steps_per_second': 1.573}


0,1
eval/bleu_bigram,█▁
eval/bleu_unigram,█▁
eval/em,▁█
eval/f1,▁█
eval/loss,█▁
eval/model_preparation_time,▁
eval/rouge1,█▁
eval/rouge2,█▁
eval/rougeL,█▁
eval/runtime,▁█

0,1
eval/bleu_bigram,25.91099
eval/bleu_unigram,35.5633
eval/em,73.18266
eval/f1,80.50265
eval/loss,0.81458
eval/model_preparation_time,0.0069
eval/rouge1,37.77876
eval/rouge2,23.78357
eval/rougeL,37.76311
eval/runtime,235.8855



Training completed successfully!


FileNotFoundError: Missing files in /content/drive/MyDrive/TeQAS V5/XLM_R/final_xlmr_2.0_eng_1: vocab.json, merges.txt, added_tokens.json, pytorch_model.bin

##### తెలుగు

In [None]:
# Base directory setup
BASE_DIR = "/content/drive/MyDrive/TeQAS V5/XLM_R"
MODEL_NAME = os.path.join(BASE_DIR, "final_xlmr_2.0_eng_1")
DATA_DIR = os.path.join(BASE_DIR, "xlm_r_processed_telugu_squad_v2")
OUTPUT_DIR = os.path.join(BASE_DIR, "checkpoints_xlmr_answerable_tel_3")
FINAL_MODEL_DIR = os.path.join(BASE_DIR, "final_xlmr_2.0_tel_3")

# Wandb configuration
WANDB_PROJECT = "TeQAS 2.0"
RUN_NAME = "xlmr_tel_run3"

def main():
    # Initialize wandb
    wandb.init(project=WANDB_PROJECT, name=RUN_NAME)

    # 1) Load tokenizer and save
    print("Loading tokenizer...")
    tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-large")
    tokenizer.save_pretrained(OUTPUT_DIR)

    # 2) Load data
    print("Loading data...")
    train_list = torch.load(os.path.join(DATA_DIR, "train.pt"))
    val_list = torch.load(os.path.join(DATA_DIR, "val.pt"))
    test_list = []
    if os.path.exists(os.path.join(DATA_DIR, "test.pt")):
        test_list = torch.load(os.path.join(DATA_DIR, "test.pt"))

    # 3) Convert to HF Datasets
    train_dataset = Dataset.from_list(train_list)
    val_dataset = Dataset.from_list(val_list)
    test_dataset = Dataset.from_list(test_list) if test_list else None

    # 4) Load model
    print("Loading model...")
    model = XLMRobertaForQuestionAnswering.from_pretrained(MODEL_NAME)
    model.to(device)  # Ensure the model is on the correct device

    # Set dropout for regularization
    model.config.hidden_dropout_prob = 0.2
    model.config.attention_probs_dropout_prob = 0.2

    # 5) Training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="epoch",      # Evaluate once per epoch
        save_strategy="epoch",            # Save once per epoch
        num_train_epochs=3,               # Increase to 2 for better convergence
        learning_rate=2e-5,               # Slightly higher LR for faster training
        per_device_train_batch_size=32,   # Reduce batch size for better gradient diversity
        per_device_eval_batch_size=32,
        warmup_ratio=0.1,                 # Retain warmup for smoother start
        weight_decay=0.01,                # Regularization for stability
        max_grad_norm=1.0,                # Gradient clipping
        gradient_accumulation_steps=1,    # Disable accumulation to update weights more frequently
        label_smoothing_factor=0.1,       # Keep smoothing for better generalization
        logging_dir="logs_answerable",
        logging_steps=100,
        report_to="wandb"
    )


    # 6) Initialize Trainer
    def compute_metrics_wrapper(eval_pred):
        return compute_metrics(eval_pred, val_list)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=squad_collate,
        compute_metrics=compute_metrics_wrapper,
    )

    # 7) Train
    print("Starting training...")
    trainer.train()

    # 8) Save final model
    print("Saving final model...")
    trainer.save_model(FINAL_MODEL_DIR)

    # Reload final model for evaluation
    print("\nReloading final model for evaluation...")
    final_model = XLMRobertaForQuestionAnswering.from_pretrained(FINAL_MODEL_DIR)
    final_model.to(device)

    # Evaluate on validation set using the final model
    print("\nEvaluating on validation set with the final model...")
    val_trainer = Trainer(
        model=final_model,
        args=training_args,
        eval_dataset=val_dataset,
        data_collator=squad_collate,
        compute_metrics=lambda eval_pred: compute_metrics(eval_pred, val_list),
    )
    val_metrics = val_trainer.evaluate()
    print("Validation Metrics:", val_metrics)

    # Evaluate on test set using the final model
    if test_dataset and len(test_list) > 0:
        print("\nEvaluating on test set with the final model...")
        test_trainer = Trainer(
            model=final_model,
            args=training_args,
            eval_dataset=test_dataset,
            data_collator=squad_collate,
            compute_metrics=lambda eval_pred: compute_metrics(eval_pred, test_list),
        )
        test_metrics = test_trainer.evaluate()
        print("Test Metrics:", test_metrics)

        # Log metrics to WandB
        wandb.log({
            "final_val_metrics": val_metrics,
            "final_test_metrics": test_metrics
        })
    else:
        wandb.log({"final_val_metrics": val_metrics})

    # Close WandB
    wandb.finish()
    print("\nTraining completed successfully!")

if __name__ == "__main__":
    main()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Loading data...


  train_list = torch.load(os.path.join(DATA_DIR, "train.pt"))
  val_list = torch.load(os.path.join(DATA_DIR, "val.pt"))
  test_list = torch.load(os.path.join(DATA_DIR, "test.pt"))


Loading model...




Starting training...




Epoch,Training Loss,Validation Loss,Em,F1,Bleu Unigram,Bleu Bigram,Rouge1,Rouge2,Rougel
1,1.5793,1.596374,54.072609,66.749584,43.490724,32.752506,13.016367,3.340144,13.016367
2,1.2286,1.485224,55.690016,69.682899,46.322471,35.09484,13.497645,3.435246,13.497645
3,1.0923,1.511566,55.899465,69.965079,46.249508,35.040951,13.530699,3.457289,13.530699


Saving final model...

Reloading final model for evaluation...

Evaluating on validation set with the final model...


Validation Metrics: {'eval_loss': 1.5115656852722168, 'eval_model_preparation_time': 0.0078, 'eval_em': 55.89946474284384, 'eval_f1': 69.96507908106994, 'eval_bleu_unigram': 46.24950823241404, 'eval_bleu_bigram': 35.04095097089434, 'eval_rouge1': 13.530699125300002, 'eval_rouge2': 3.457289323300494, 'eval_rougeL': 13.530699125300002, 'eval_runtime': 170.853, 'eval_samples_per_second': 50.301, 'eval_steps_per_second': 1.574}

Evaluating on test set with the final model...


Test Metrics: {'eval_loss': 1.4080579280853271, 'eval_model_preparation_time': 0.0068, 'eval_em': 61.40512631384843, 'eval_f1': 70.91967629635985, 'eval_bleu_unigram': 27.295939117647627, 'eval_bleu_bigram': 21.57858508907183, 'eval_rouge1': 7.426291310672646, 'eval_rouge2': 2.0978767682621635, 'eval_rougeL': 7.426291310672646, 'eval_runtime': 214.1874, 'eval_samples_per_second': 50.638, 'eval_steps_per_second': 1.583}


0,1
eval/bleu_bigram,▇███▁
eval/bleu_unigram,▇███▁
eval/em,▁▃▃▃█
eval/f1,▁▆▆▆█
eval/loss,█▄▅▅▁
eval/model_preparation_time,█▁
eval/rouge1,▇███▁
eval/rouge2,▇███▁
eval/rougeL,▇███▁
eval/runtime,▁▁▁▁█

0,1
eval/bleu_bigram,21.57859
eval/bleu_unigram,27.29594
eval/em,61.40513
eval/f1,70.91968
eval/loss,1.40806
eval/model_preparation_time,0.0068
eval/rouge1,7.42629
eval/rouge2,2.09788
eval/rougeL,7.42629
eval/runtime,214.1874



Training completed successfully!


#### ♦️ Evaluation

In [None]:
#!/usr/bin/env python
# eval_squad_v2_telugu.py
#
# A separate evaluation script that:
#  1) Loads a fine-tuned XLM-R model (SQuAD v2.0 style).
#  2) Runs inference on val/test datasets.
#  3) Computes EM, F1, BLEU, ROUGE, plus is_impossible accuracy.
#
# NOTE: Adapt paths (MODEL_DIR, DATA_DIR, etc.) as needed.

import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import (
    XLMRobertaForQuestionAnswering,
    XLMRobertaTokenizerFast
)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import re

##############################################################################
# 1) Telugu-Specific Normalization
##############################################################################
def normalize_text_telugu(s):
    """
    Minimal Telugu-oriented normalization:
      - Removes extra punctuation
      - Lowercases any English letters
      - Strips extra whitespace
    """
    if not s:
        return ""
    # Remove everything not Telugu or alphanumeric. Adjust to your corpus if needed.
    s = re.sub(r"[^\u0C00-\u0C7Fa-zA-Z0-9\s]", "", s)
    # Lowercase (affects only English letters)
    s = s.lower()
    # Remove multiple spaces
    s = " ".join(s.split())
    return s

##############################################################################
# 2) Basic Cleaning for Predicted Spans
##############################################################################
def clean_prediction(text):
    if not text:
        return ""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    return text

##############################################################################
# 3) No-Answer Post-processing for SQuAD v2.0
##############################################################################
def postprocess_qa_predictions_squad_v2(
    examples,
    start_logits,
    end_logits,
    cls_index=0,
    max_answer_length=100,
    n_best_size=20,
    null_score_diff_threshold=0.0
):
    """
    - examples: list of dicts with "id", "context", "offset_mapping", etc.
    - start_logits[i], end_logits[i]: arrays of length [sequence_length].
    - cls_index: index for [CLS] token logit (XLM-R often has CLS at position 0).
    - null_score_diff_threshold: threshold for "no answer" decision.
    """
    preds = {}

    for i, ex in enumerate(examples):
        context = ex["context"]
        offsets = ex["offset_mapping"]
        ex_id   = ex["id"]

        # 1) Find best span
        best_score = float('-inf')
        best_start, best_end = 0, 0

        start_idxs = np.argsort(start_logits[i])[-n_best_size:].tolist()
        end_idxs   = np.argsort(end_logits[i])[-n_best_size:].tolist()

        for st in start_idxs:
            for en in end_idxs:
                if en < st:
                    continue
                if (en - st + 1) > max_answer_length:
                    continue
                if st >= len(offsets) or en >= len(offsets):
                    continue

                span_score = start_logits[i][st] + end_logits[i][en]
                if span_score > best_score:
                    best_score = span_score
                    best_start = st
                    best_end   = en

        # 2) Calculate no-answer score from [CLS] token
        cls_score = start_logits[i][cls_index] + end_logits[i][cls_index]

        # 3) Decide if no-answer
        score_diff = best_score - cls_score
        if score_diff < null_score_diff_threshold:
            preds[ex_id] = ""
        else:
            start_char = offsets[best_start][0]
            end_char   = offsets[best_end][1]
            text_span  = context[start_char:end_char]
            preds[ex_id] = clean_prediction(text_span)

    return preds

##############################################################################
# 4) Metrics: EM, F1, BLEU, ROUGE, plus is_impossible accuracy
##############################################################################
def exact_match(pred, gold):
    return 1.0 if normalize_text_telugu(pred) == normalize_text_telugu(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text_telugu(pred).split()
    gold_tokens = normalize_text_telugu(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return float(pred_tokens == gold_tokens)
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if (precision + recall) == 0:
        return 0.0
    return (2.0 * precision * recall) / (precision + recall)

def compute_bleu(pred, gold):
    if not pred or not gold:
        return {"unigram": 0.0, "bigram": 0.0}
    pred_tokens = normalize_text_telugu(pred).split()
    gold_tokens = normalize_text_telugu(gold).split()
    smoothing = SmoothingFunction().method1
    unigram = sentence_bleu([gold_tokens], pred_tokens,
                            weights=(1, 0, 0, 0),
                            smoothing_function=smoothing)
    bigram  = sentence_bleu([gold_tokens], pred_tokens,
                            weights=(0.5, 0.5, 0, 0),
                            smoothing_function=smoothing)
    return {"unigram": unigram, "bigram": bigram}

def compute_rouge(pred, gold):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=False)
    if not pred or not gold:
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
    pred_clean = normalize_text_telugu(pred)
    gold_clean = normalize_text_telugu(gold)
    scores = scorer.score(pred_clean, gold_clean)
    return {
        "rouge1": scores["rouge1"].fmeasure,
        "rouge2": scores["rouge2"].fmeasure,
        "rougeL": scores["rougeL"].fmeasure
    }

def compute_is_impossible_accuracy(examples, predictions):
    """
    For each example, if gold_text is empty => gold is_impossible.
    We check if predicted text is also empty => predicted is_impossible.
    Then compute accuracy over *only* the is_impossible subset.
    """
    total_impossible = 0
    correct_impossible = 0
    for ex in examples:
        gold = ex.get("gold_text", "")
        gold_impossible = (gold.strip() == "")
        if gold_impossible:
            total_impossible += 1
            pred_text = predictions.get(ex["id"], "")
            if pred_text.strip() == "":
                correct_impossible += 1

    if total_impossible == 0:
        return 100.0  # If no impossible examples at all, define it as 100% or 0%
    return (correct_impossible / total_impossible) * 100.0

##############################################################################
# 5) Master Evaluation Function
##############################################################################
def evaluate_squad_v2(examples, start_logits, end_logits, null_score_diff_threshold=0.0):
    """
    Returns a dict with EM, F1, BLEU, ROUGE, plus 'is_impossible_acc'.
    """
    predictions = postprocess_qa_predictions_squad_v2(
        examples,
        start_logits,
        end_logits,
        cls_index=0,  # For xlm-roberta, typically 0 is [CLS]
        null_score_diff_threshold=null_score_diff_threshold
    )

    total = len(examples)
    metrics = {
        "em": 0.0,
        "f1": 0.0,
        "bleu_unigram": 0.0,
        "bleu_bigram": 0.0,
        "rouge1": 0.0,
        "rouge2": 0.0,
        "rougeL": 0.0
    }

    for ex in examples:
        gold = ex.get("gold_text", "")
        pred = predictions.get(ex["id"], "")

        metrics["em"] += exact_match(pred, gold)
        metrics["f1"] += f1_score(pred, gold)

        bleu_scores = compute_bleu(pred, gold)
        metrics["bleu_unigram"] += bleu_scores["unigram"]
        metrics["bleu_bigram"]  += bleu_scores["bigram"]

        rouge_scores = compute_rouge(pred, gold)
        metrics["rouge1"] += rouge_scores["rouge1"]
        metrics["rouge2"] += rouge_scores["rouge2"]
        metrics["rougeL"] += rouge_scores["rougeL"]

    # Average the main metrics
    for key in metrics:
        metrics[key] = (metrics[key] / total) * 100.0

    # Calculate is_impossible accuracy
    is_imp_acc = compute_is_impossible_accuracy(examples, predictions)
    metrics["is_impossible_acc"] = is_imp_acc

    return metrics, predictions

##############################################################################
# 6) Minimal Collate for Eval
##############################################################################
def squad_collate_eval(features):
    """
    For evaluation, we only need input_ids & attention_mask for the model.
    Convert lists to Tensors.
    """
    input_ids = []
    attention_masks = []
    for f in features:
        input_ids.append(torch.tensor(f["input_ids"], dtype=torch.long))
        attention_masks.append(torch.tensor(f["attention_mask"], dtype=torch.long))
    return {
        "input_ids": torch.stack(input_ids, dim=0),
        "attention_mask": torch.stack(attention_masks, dim=0)
    }

##############################################################################
# 7) Main Evaluation Code
##############################################################################
def main():

    BASE_DIR = "/content/drive/MyDrive/TeQAS V5/XLM_R"
    DATA_DIR = os.path.join(BASE_DIR, "xlm_r_processed_telugu_squad_v2")

    # Adjust these paths:
    MODEL_DIR = os.path.join(BASE_DIR, "final_xlmr_2.0_tel_3")
    VAL_FILE  = os.path.join(DATA_DIR, "val.pt")
    TEST_FILE = os.path.join(DATA_DIR, "test.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    # Load model
    print("\nLoading final model from:", MODEL_DIR)
    model = XLMRobertaForQuestionAnswering.from_pretrained(MODEL_DIR)
    model.eval()
    model.to(device)

    # Load tokenizer if needed for any reason (optional)
    tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-large")

    # Function to run evaluation on a dataset (val or test)
    def run_eval(data_file, split_name="val"):
        if not os.path.exists(data_file):
            print(f"[{split_name}] file not found: {data_file}")
            return None
        print(f"Loading [{split_name}] data from:", data_file)
        data_list = torch.load(data_file)
        dataset = Dataset.from_list(data_list)
        dataloader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=squad_collate_eval)

        all_start_logits = []
        all_end_logits   = []

        print(f"Running inference on {split_name} set... (num_examples={len(dataset)})")
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                start_logits = outputs.start_logits.detach().cpu().numpy()
                end_logits   = outputs.end_logits.detach().cpu().numpy()

                all_start_logits.append(start_logits)
                all_end_logits.append(end_logits)

        all_start_logits = np.concatenate(all_start_logits, axis=0)
        all_end_logits   = np.concatenate(all_end_logits, axis=0)

        # Evaluate with threshold=0.0 (typical default)
        results, preds = evaluate_squad_v2(
            data_list,
            all_start_logits,
            all_end_logits,
            null_score_diff_threshold=0.0
        )
        print(f"\n[{split_name} Metrics]:")
        for k, v in results.items():
            print(f"  {k}: {v:.2f}")
        print("-"*40)
        return results

    # Evaluate on validation
    val_metrics = run_eval(VAL_FILE, "val")

    # Evaluate on test
    test_metrics = run_eval(TEST_FILE, "test")

    print("\nEvaluation script completed.")

if __name__ == "__main__":
    main()

Device: cuda

Loading final model from: /content/drive/MyDrive/TeQAS V5/XLM_R/final_xlmr_2.0_tel_3
Loading [val] data from: /content/drive/MyDrive/TeQAS V5/XLM_R/xlm_r_processed_telugu_squad_v2/val.pt


  data_list = torch.load(data_file)


Running inference on val set... (num_examples=8594)

[val Metrics]:
  em: 55.52
  f1: 69.59
  bleu_unigram: 45.88
  bleu_bigram: 34.80
  rouge1: 13.45
  rouge2: 1.35
  rougeL: 13.45
  is_impossible_acc: 87.42
----------------------------------------
Loading [test] data from: /content/drive/MyDrive/TeQAS V5/XLM_R/xlm_r_processed_telugu_squad_v2/test.pt
Running inference on test set... (num_examples=10846)

[test Metrics]:
  em: 61.14
  f1: 70.65
  bleu_unigram: 27.03
  bleu_bigram: 21.40
  rouge1: 7.38
  rouge2: 0.84
  rougeL: 7.38
  is_impossible_acc: 82.20
----------------------------------------

Evaluation script completed.


##### Comparision with Baseline Models

In [None]:
#!/usr/bin/env python
# eval_baseline_vs_finetuned.py

import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import (
    XLMRobertaForQuestionAnswering,
    XLMRobertaTokenizerFast,
    BertForQuestionAnswering,
    BertTokenizerFast
)
import re
from collections import defaultdict

##########################################
# 1) Normalize Telugu text for evaluation
##########################################
def normalize_text_telugu(s):
    """Minimal normalization for Telugu text."""
    if not s:
        return ""
    s = re.sub(r"[^\u0C00-\u0C7Fa-zA-Z0-9\s]", "", s)  # Keep Telugu, English, numbers
    s = s.lower()
    s = " ".join(s.split())  # Remove extra spaces
    return s

##########################################
# 2) Cleaning Predictions
##########################################
def clean_prediction(text):
    """Remove unwanted characters and extra spaces."""
    return text.strip().replace("\n", " ")

##########################################
# 3) Postprocessing for SQuAD v2.0
##########################################
def postprocess_qa_predictions_squad_v2(
    examples, start_logits, end_logits, cls_index=0, null_score_diff_threshold=0.0
):
    """Postprocess predictions by selecting the best span or no-answer option."""
    preds = {}
    for i, ex in enumerate(examples):
        context = ex["context"]
        offsets = ex["offset_mapping"]
        ex_id   = ex["id"]

        best_start, best_end, best_score = 0, 0, float("-inf")
        cls_score = start_logits[i][cls_index] + end_logits[i][cls_index]  # No-answer score

        for start_idx in np.argsort(start_logits[i])[-10:]:
            for end_idx in np.argsort(end_logits[i])[-10:]:
                if end_idx < start_idx or (end_idx - start_idx + 1) > 100:
                    continue
                score = start_logits[i][start_idx] + end_logits[i][end_idx]
                if score > best_score:
                    best_score = score
                    best_start = start_idx
                    best_end = end_idx

        if best_score - cls_score < null_score_diff_threshold:
            preds[ex_id] = ""
        else:
            start_char = offsets[best_start][0]
            end_char   = offsets[best_end][1]
            preds[ex_id] = clean_prediction(context[start_char:end_char])

    return preds

##########################################
# 4) Metrics: EM, F1, Is Impossible Accuracy
##########################################
def exact_match(pred, gold):
    return 1.0 if normalize_text_telugu(pred) == normalize_text_telugu(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text_telugu(pred).split()
    gold_tokens = normalize_text_telugu(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return float(pred_tokens == gold_tokens)
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

def compute_is_impossible_accuracy(examples, predictions):
    """Compute accuracy for unanswerable questions."""
    total, correct = 0, 0
    for ex in examples:
        gold_empty = ex["gold_text"].strip() == ""
        pred_empty = predictions.get(ex["id"], "").strip() == ""
        if gold_empty:
            total += 1
            if pred_empty:
                correct += 1
    return (correct / total) * 100.0 if total > 0 else 100.0

##########################################
# 5) Master Evaluation Function
##########################################
def evaluate_squad_v2(examples, start_logits, end_logits, null_score_diff_threshold=0.0):
    predictions = postprocess_qa_predictions_squad_v2(examples, start_logits, end_logits, null_score_diff_threshold=null_score_diff_threshold)

    metrics = {"em": 0.0, "f1": 0.0, "is_impossible_acc": 0.0}
    total = len(examples)

    for ex in examples:
        pred = predictions.get(ex["id"], "")
        gold = ex["gold_text"]
        metrics["em"] += exact_match(pred, gold)
        metrics["f1"] += f1_score(pred, gold)

    metrics["em"] /= total
    metrics["f1"] /= total
    metrics["is_impossible_acc"] = compute_is_impossible_accuracy(examples, predictions)

    return metrics, predictions

##########################################
# 6) Baseline vs Fine-Tuned Evaluation
##########################################
def run_model_evaluation(model, tokenizer, dataset, device):
    """Evaluate the given model on the dataset."""
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=lambda x: {
        "input_ids": torch.stack([torch.tensor(f["input_ids"]) for f in x]).to(device),
        "attention_mask": torch.stack([torch.tensor(f["attention_mask"]) for f in x]).to(device)
    })

    all_start_logits, all_end_logits = [], []
    model.eval()

    with torch.no_grad():
        for batch in dataloader:
            outputs = model(**batch)
            all_start_logits.append(outputs.start_logits.cpu().numpy())
            all_end_logits.append(outputs.end_logits.cpu().numpy())

    return np.concatenate(all_start_logits), np.concatenate(all_end_logits)

def compare_models(base_model, fine_tuned_model, tokenizer, dataset, device, data_ = "validation"):
    """Compare baseline vs fine-tuned model performance."""
    print(f"=> Compare baseline vs fine-tuned model performance with {data_} data")
    print("\nEvaluating Baseline Model...")
    base_start_logits, base_end_logits = run_model_evaluation(base_model, tokenizer, dataset, device)
    base_metrics, _ = evaluate_squad_v2(dataset, base_start_logits, base_end_logits)

    print("\nEvaluating Fine-Tuned Model...")
    fine_tuned_start_logits, fine_tuned_end_logits = run_model_evaluation(fine_tuned_model, tokenizer, dataset, device)
    fine_tuned_metrics, _ = evaluate_squad_v2(dataset, fine_tuned_start_logits, fine_tuned_end_logits)

    print("\nBaseline Model Metrics:")
    print(base_metrics)

    print("\nFine-Tuned Model Metrics:")
    print(fine_tuned_metrics)

##########################################
# 7) Main Execution
##########################################
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    BASE_DIR = "/content/drive/MyDrive/TeQAS V5/XLM_R"
    BASELINE_MODEL = "xlm-roberta-large"  # Pretrained (not fine-tuned)
    FINETUNED_MODEL_PATH = os.path.join(BASE_DIR, "final_xlmr_2.0_tel_3")
    DATASET_VAL = os.path.join(BASE_DIR, "xlm_r_processed_telugu_squad_v2/val.pt")
    DATASET_TEST = os.path.join(BASE_DIR, "xlm_r_processed_telugu_squad_v2/test.pt")

    print("\nLoading Baseline Model:", BASELINE_MODEL)
    base_model = XLMRobertaForQuestionAnswering.from_pretrained(BASELINE_MODEL).to(device)
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(BASELINE_MODEL)

    print("\nLoading Fine-Tuned Model:", FINETUNED_MODEL_PATH)
    fine_tuned_model = XLMRobertaForQuestionAnswering.from_pretrained(FINETUNED_MODEL_PATH).to(device)

    print("\nLoading Datasets:", DATASET_VAL)
    dataset_val = Dataset.from_list(torch.load(DATASET_VAL))
    compare_models(base_model, fine_tuned_model, tokenizer, dataset_val, device, data_ = "validation")

    print("\nLoading Datasets:", DATASET_TEST)
    dataset_test = Dataset.from_list(torch.load(DATASET_TEST))
    compare_models(base_model, fine_tuned_model, tokenizer, dataset_test, device, data_ = "test")

if __name__ == "__main__":
    main()


Loading Baseline Model: xlm-roberta-large


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]


Loading Fine-Tuned Model: /content/drive/MyDrive/TeQAS V5/XLM_R/final_xlmr_2.0_tel_3

Loading Datasets: /content/drive/MyDrive/TeQAS V5/XLM_R/xlm_r_processed_telugu_squad_v2/val.pt


  dataset_val = Dataset.from_list(torch.load(DATASET_VAL))


=> Compare baseline vs fine-tuned model performance with validation data

Evaluating Baseline Model...

Evaluating Fine-Tuned Model...

Baseline Model Metrics:
{'em': 0.0908773562950896, 'f1': 0.11998905071550492, 'is_impossible_acc': 39.147869674185465}

Fine-Tuned Model Metrics:
{'em': 0.5551547591342797, 'f1': 0.6958794826060655, 'is_impossible_acc': 87.41854636591478}

Loading Datasets: /content/drive/MyDrive/TeQAS V5/XLM_R/xlm_r_processed_telugu_squad_v2/test.pt


  dataset_test = Dataset.from_list(torch.load(DATASET_TEST))


=> Compare baseline vs fine-tuned model performance with test data

Evaluating Baseline Model...

Evaluating Fine-Tuned Model...

Baseline Model Metrics:
{'em': 0.19629356444772267, 'f1': 0.21642521164105785, 'is_impossible_acc': 39.309453471196456}

Fine-Tuned Model Metrics:
{'em': 0.6113774663470404, 'f1': 0.7065033575269639, 'is_impossible_acc': 82.20088626292467}



## 🔷 MuRIL

#### ♦️ FIne-Tuning

In [None]:
#!/usr/bin/env python
# fine_tune_stage_muril.py

import os
import torch
import numpy as np
from datasets import Dataset
# Import the correct MuRIL-based BERT classes:
from transformers import (
    BertTokenizerFast,
    BertForQuestionAnswering,
    Trainer,
    TrainingArguments
)
from transformers.trainer_utils import EvalPrediction

import re

# If you have the following definitions from prior code,
# ensure they are in the same file or properly imported:
from functools import partial

##########################################
# 1) Post-processing: Normalization & EM/F1
##########################################
def normalize_text(s):
    def remove_articles(txt):
        return re.sub(r"\b(a|an|the)\b", " ", txt)
    def remove_punc(txt):
        return re.sub(r"[^\w\s]", "", txt)
    def white_space_fix(txt):
        return " ".join(txt.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 1.0 if pred_tokens == gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def postprocess_qa_predictions(examples, start_logits, end_logits):
    """
    Because doc_stride can cause more features than original questions,
    we clamp iteration by min(len(examples), len(start_logits)).
    """
    preds = {}
    limit = min(len(examples), len(start_logits))
    for i in range(limit):
        ex       = examples[i]
        offsets  = ex["offset_mapping"]
        context  = ex["context"]
        ex_id    = ex["id"]

        s_idx = int(np.argmax(start_logits[i]))
        e_idx = int(np.argmax(end_logits[i]))
        if e_idx < s_idx:
            s_idx, e_idx = e_idx, s_idx

        if s_idx >= len(offsets):
            preds[ex_id] = ""
            continue
        if e_idx >= len(offsets):
            e_idx = len(offsets)-1

        start_char = offsets[s_idx][0]
        end_char   = offsets[e_idx][1]
        pred_text  = context[start_char:end_char]
        preds[ex_id] = pred_text
    return preds

def compute_metrics(eval_preds, raw_dataset):
    """
    eval_preds => (start_logits, end_logits)
    raw_dataset => the original list of dict with 'gold_text', 'offset_mapping', etc.
    We'll decode => measure EM/F1.
    """
    (start_logits, end_logits) = eval_preds
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits   = end_logits.cpu().numpy()

    preds_dict = postprocess_qa_predictions(raw_dataset, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    for ex in raw_dataset:
        ex_id = ex["id"]
        pred  = preds_dict.get(ex_id, "")
        gold  = ex["gold_text"]
        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count    += 1

    em_val = (total_em / count)*100.0
    f1_val = (total_f1 / count)*100.0
    return {"em": em_val, "f1": f1_val}

##########################################
# 2) Collate => Provide start/end positions
##########################################
def squad_collate(features):
    """
    Convert any lists => Tensors if needed, then stack.
    Expects each feature to have "input_ids", "attention_mask",
    "start_positions", "end_positions".
    """
    for f in features:
        if not isinstance(f["input_ids"], torch.Tensor):
            f["input_ids"] = torch.tensor(f["input_ids"], dtype=torch.long)
        if not isinstance(f["attention_mask"], torch.Tensor):
            f["attention_mask"] = torch.tensor(f["attention_mask"], dtype=torch.long)
        if not isinstance(f["start_positions"], torch.Tensor):
            f["start_positions"] = torch.tensor(f["start_positions"], dtype=torch.long)
        if not isinstance(f["end_positions"], torch.Tensor):
            f["end_positions"] = torch.tensor(f["end_positions"], dtype=torch.long)

    input_ids      = torch.stack([f["input_ids"] for f in features])
    attention_mask = torch.stack([f["attention_mask"] for f in features])
    start_positions= torch.stack([f["start_positions"] for f in features])
    end_positions  = torch.stack([f["end_positions"] for f in features])

    return {
        "input_ids":       input_ids,
        "attention_mask":  attention_mask,
        "start_positions": start_positions,
        "end_positions":   end_positions,
    }

##### English

In [None]:
##########################################
# 3) 2-Stage Finetuning (English => Telugu)
##########################################

import wandb

# Base directory setup
BASE_DIR = "/content/drive/MyDrive/Te-QAS V07/MuRIL"
DATA_DIR = os.path.join(BASE_DIR, "muril_processed_english_squad_v2")
MODEL_NAME = "google/muril-large-cased"
OUTPUT_DIR = os.path.join(BASE_DIR, "checkpoints_muril_eng_1")
FINAL_MODEL_DIR = os.path.join(BASE_DIR, "final_muril_2.0_eng_1")

# Wandb configuration
WANDB_PROJECT = "Te-QAS 2.0 V-07"
RUN_NAME = "muril_eng_run1"

def main():
    # Stage 1: English Data
    import sys

    # Example:
    train_list = torch.load(os.path.join(DATA_DIR, "train.pt"))
    val_list   = torch.load(os.path.join(DATA_DIR, "val.pt"))
    test_list   = torch.load(os.path.join(DATA_DIR, "test.pt"))

    wandb.init(project=WANDB_PROJECT, name=RUN_NAME)

    train_dataset = Dataset.from_list(train_list)
    val_dataset   = Dataset.from_list(val_list)
    test_dataset   = Dataset.from_list(test_list)

    # 1) Use MuRIL's BERT-based classes

    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

    # 2) Training args => 1 epoch
    training_args = TrainingArguments(
        output_dir= OUTPUT_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir="logs_muril",
        logging_steps=100,
        report_to="wandb"
    )

    from transformers.trainer_utils import EvalPrediction
    def hf_compute_metrics_stage1(p: EvalPrediction):
        return compute_metrics(p.predictions, val_list)

    # Build trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=squad_collate,
        tokenizer=tokenizer,
        compute_metrics=hf_compute_metrics_stage1
    )

    print("\n======== STAGE 1: Fine-tuning on English data for 1 epoch ========")
    trainer.train()

    trainer.save_model(FINAL_MODEL_DIR)
    print(f"Done! Model saved to {FINAL_MODEL_DIR}")

    # # Evaluate on validation set
    # print("\n======== Evaluating on validation set ========")
    # final_val_metrics = trainer.evaluate()
    # print("Stage 1 final val metrics:", final_val_metrics)

    # # Evaluate on test set
    # print("\n======== Evaluating on test set ========")
    # final_test_metrics = trainer.evaluate(eval_dataset=test_dataset)
    # print("Stage 1 final test metrics:", final_test_metrics)
    # print("Stage 1 done.")

    wandb.finish()

if __name__ == "__main__":
    main()


In [None]:
#!/usr/bin/env python
# test_inference_muril.py

import os
import torch
import numpy as np
import re
from datasets import Dataset
from transformers import (
    BertTokenizerFast,
    BertForQuestionAnswering,
    TrainingArguments,
    Trainer
)
import nltk  # for BLEU
from evaluate import load as load_metric  # for ROUGE - Fixed import

##########################################
# 1) Basic text normalization + EM/F1
##########################################
def normalize_text(s):
    def remove_articles(txt):
        return re.sub(r"\b(a|an|the)\b", " ", txt)
    def remove_punc(txt):
        return re.sub(r"[^\w\s]", "", txt)
    def white_space_fix(txt):
        return " ".join(txt.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common      = set(pred_tokens) & set(gold_tokens)
    num_same    = len(common)
    if len(pred_tokens)==0 or len(gold_tokens)==0:
        return 1.0 if pred_tokens==gold_tokens else 0.0
    prec = num_same / len(pred_tokens)
    rec  = num_same / len(gold_tokens)
    if prec+rec==0:
        return 0.0
    return 2*prec*rec/(prec+rec)

##########################################
# 2) Decoding predictions => text answers
##########################################
def postprocess_qa_predictions(examples, start_logits, end_logits):
    preds = {}
    limit = min(len(examples), len(start_logits))
    for i in range(limit):
        ex       = examples[i]
        offsets  = ex["offset_mapping"]
        context  = ex["context"]
        ex_id    = ex["id"]

        s_idx = int(np.argmax(start_logits[i]))
        e_idx = int(np.argmax(end_logits[i]))
        if e_idx < s_idx:
            s_idx, e_idx = e_idx, s_idx

        if s_idx >= len(offsets):
            preds[ex_id] = ""
            continue
        if e_idx >= len(offsets):
            e_idx = len(offsets)-1

        start_char = offsets[s_idx][0]
        end_char   = offsets[e_idx][1]
        pred_text  = context[start_char:end_char]
        preds[ex_id] = pred_text
    return preds

##########################################
# 3) Evaluate => EM, F1, BLEU, ROUGE
##########################################
def compute_all_metrics(examples, start_logits, end_logits):
    # Convert Tensors => numpy
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits   = end_logits.cpu().numpy()

    preds_dict = postprocess_qa_predictions(examples, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    from nltk.translate.bleu_score import corpus_bleu
    corpus_refs = []
    corpus_hyps = []

    pred_texts = []
    gold_texts = []

    for ex in examples:
        ex_id = ex["id"]
        gold  = ex["gold_text"]
        pred  = preds_dict.get(ex_id, "")

        # SQuAD EM/F1
        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count    += 1

        # For BLEU/ROUGE
        pred_texts.append(pred)
        gold_texts.append(gold)

        # corpus BLEU expects tokenized lists
        gold_toks = normalize_text(gold).split()
        pred_toks = normalize_text(pred).split()
        corpus_refs.append([gold_toks])  # list of list
        corpus_hyps.append(pred_toks)

    # Final EM/F1
    em_val = (total_em / count)*100.0
    f1_val = (total_f1 / count)*100.0

    # BLEU (corpus)
    bleu_val = corpus_bleu(corpus_refs, corpus_hyps)*100.0

    # ROUGE - Fixed metric calculation
    rouge_metric = load_metric("rouge")
    results_rouge = rouge_metric.compute(
        predictions=pred_texts,
        references=gold_texts
    )
    r1_f = float(results_rouge['rouge1'])*100.0
    r2_f = float(results_rouge['rouge2'])*100.0
    rl_f = float(results_rouge['rougeL'])*100.0

    return {
        "em": em_val,
        "f1": f1_val,
        "bleu": bleu_val,
        "rouge1_f": r1_f,
        "rouge2_f": r2_f,
        "rougeL_f": rl_f
    }

##########################################
# 4) Custom collator => BERT
##########################################
def squad_collate(features):
    for f in features:
        if not isinstance(f["input_ids"], torch.Tensor):
            f["input_ids"] = torch.tensor(f["input_ids"], dtype=torch.long)
        if not isinstance(f["attention_mask"], torch.Tensor):
            f["attention_mask"] = torch.tensor(f["attention_mask"], dtype=torch.long)
        if not isinstance(f["start_positions"], torch.Tensor):
            f["start_positions"] = torch.tensor(f["start_positions"], dtype=torch.long)
        if not isinstance(f["end_positions"], torch.Tensor):
            f["end_positions"] = torch.tensor(f["end_positions"], dtype=torch.long)

    input_ids      = torch.stack([f["input_ids"] for f in features])
    attention_mask = torch.stack([f["attention_mask"] for f in features])
    start_positions= torch.stack([f["start_positions"] for f in features])
    end_positions  = torch.stack([f["end_positions"] for f in features])

    return {
        "input_ids":       input_ids,
        "attention_mask":  attention_mask,
        "start_positions": start_positions,
        "end_positions":   end_positions,
    }

##########################################
# 5) Main function
##########################################

BASE_DIR = "/content/drive/MyDrive/Te-QAS V07/MuRIL"
DATA_DIR = os.path.join(BASE_DIR, "muril_processed_english_squad_v2")
MODEL_DIR = os.path.join(BASE_DIR, "final_muril_2.0_eng_1")

def evaluate_data(trainer, data_list, data_name=""):
    print(f"\nRunning predictions on {data_name} set (size={len(data_list)})...")
    dataset = Dataset.from_list(data_list)
    preds_out = trainer.predict(dataset)
    start_logits, end_logits = preds_out.predictions

    print(f"Computing all metrics for {data_name} set...")
    scores = compute_all_metrics(data_list, start_logits, end_logits)

    print(f"\n========= {data_name.upper()} RESULTS =========")
    for k,v in scores.items():
        print(f"{k}: {v:.2f}")
    print("=" * (len(data_name) + 20))

    return scores

def main():
    # Default values instead of argparser
    batch_size = 32

    # 1) Load validation and test data
    print("Loading validation data ...")
    val_data_list = torch.load(os.path.join(DATA_DIR, "val.pt"))

    print("Loading test data ...")
    test_data_list = torch.load(os.path.join(DATA_DIR, "test.pt"))

    # 2) Load MuRIL BERT QA model
    model = BertForQuestionAnswering.from_pretrained(MODEL_DIR)

    # 3) Training args for inference
    eval_args = TrainingArguments(
        output_dir="inference_outputs_muril",
        per_device_eval_batch_size=batch_size
    )

    # 4) Build Trainer
    trainer = Trainer(
        model=model,
        args=eval_args,
        data_collator=squad_collate,
        tokenizer=None  # not necessary if no dynamic padding
    )

    # 5) Evaluate on validation set
    val_scores = evaluate_data(trainer, val_data_list, "validation")

    # 6) Evaluate on test set
    test_scores = evaluate_data(trainer, test_data_list, "test")

    # 7) Print combined summary
    print("\n========= COMBINED SUMMARY =========")
    print("Metric      | Validation | Test")
    print("-" * 35)
    for metric in val_scores.keys():
        print(f"{metric:<11} | {val_scores[metric]:>9.2f} | {test_scores[metric]:>6.2f}")
    print("=" * 35)

if __name__ == "__main__":
    main()

Loading validation data ...


  val_data_list = torch.load(os.path.join(DATA_DIR, "val.pt"))


Loading test data ...


  test_data_list = torch.load(os.path.join(DATA_DIR, "test.pt"))
  trainer = Trainer(



Running predictions on validation set (size=8605)...




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Computing all metrics for validation set...


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]


em: 67.88
f1: 79.08
bleu: 32.16
rouge1_f: 60.98
rouge2_f: 38.82
rougeL_f: 60.95

Running predictions on test set (size=11947)...


Computing all metrics for test set...

em: 67.75
f1: 75.28
bleu: 15.27
rouge1_f: 38.45
rouge2_f: 24.25
rougeL_f: 38.39

Metric      | Validation | Test
-----------------------------------
em          |     67.88 |  67.75
f1          |     79.08 |  75.28
bleu        |     32.16 |  15.27
rouge1_f    |     60.98 |  38.45
rouge2_f    |     38.82 |  24.25
rougeL_f    |     60.95 |  38.39


##### తెలుగు

In [None]:
##########################################
# 3) 2-Stage Finetuning (English => Telugu)
##########################################

import wandb

# Base directory setup
BASE_DIR = "/content/drive/MyDrive/Te-QAS V07/MuRIL"
DATA_DIR = os.path.join(BASE_DIR, "muril_processed_telugu_squad_v2")
MODEL_NAME = os.path.join(BASE_DIR, "final_muril_2.0_eng_1")
OUTPUT_DIR = os.path.join(BASE_DIR, "checkpoints_muril_tel_3")
FINAL_MODEL_DIR = os.path.join(BASE_DIR, "final_muril_2.0_tel_3")

# Wandb configuration
WANDB_PROJECT = "Te-QAS 2.0 V-07"
RUN_NAME = "muril_tel_run3"

def main():
    # Stage 2: Telugu Data
    import sys

    # Example:
    train_list = torch.load(os.path.join(DATA_DIR, "train.pt"))
    val_list   = torch.load(os.path.join(DATA_DIR, "val.pt"))
    test_list   = torch.load(os.path.join(DATA_DIR, "test.pt"))

    wandb.init(project=WANDB_PROJECT, name=RUN_NAME)

    train_dataset = Dataset.from_list(train_list)
    val_dataset   = Dataset.from_list(val_list)
    test_dataset   = Dataset.from_list(test_list)

    # 1) Use Pre-Fine-tuned English Model

    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)
    print(f"Loading the Fine-Tuned Model from {MODEL_NAME}")

    # 2) Training args => 3 epochs
    training_args = TrainingArguments(
        output_dir= OUTPUT_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir="logs_muril",
        logging_steps=100,
        report_to="wandb"
    )

    from transformers.trainer_utils import EvalPrediction
    def hf_compute_metrics_stage1(p: EvalPrediction):
        return compute_metrics(p.predictions, val_list)

    # Build trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=squad_collate,
        tokenizer=tokenizer,
        compute_metrics=hf_compute_metrics_stage1
    )

    print("\n======== STAGE 2: Fine-tuning on Telugu data for 3 epochs ========")
    trainer.train()

    trainer.save_model(FINAL_MODEL_DIR)
    print(f"Done! Model saved to {FINAL_MODEL_DIR}")

    # # Evaluate on validation set
    # print("\n======== Evaluating on validation set ========")
    # final_val_metrics = trainer.evaluate()
    # print("Stage 1 final val metrics:", final_val_metrics)

    wandb.finish()

if __name__ == "__main__":
    main()


  train_list = torch.load(os.path.join(DATA_DIR, "train.pt"))
  val_list   = torch.load(os.path.join(DATA_DIR, "val.pt"))
  test_list   = torch.load(os.path.join(DATA_DIR, "test.pt"))


Loading the Fine-Tuned Model from /content/drive/MyDrive/Te-QAS V07/MuRIL/final_muril_2.0_eng_1


  trainer = Trainer(







Epoch,Training Loss,Validation Loss,Em,F1
1,1.7524,1.723241,56.928447,71.051366
2,1.4493,1.679375,56.695753,71.429071


Epoch,Training Loss,Validation Loss,Em,F1
1,1.7524,1.723241,56.928447,71.051366
2,1.4493,1.679375,56.695753,71.429071
3,1.1982,1.779184,56.649215,71.294583


Done! Model saved to /content/drive/MyDrive/Te-QAS V07/MuRIL/final_muril_2.0_tel_3


0,1
eval/em,█▂▁
eval/f1,▁█▆
eval/loss,▄▁█
eval/runtime,▁▁█
eval/samples_per_second,██▁
eval/steps_per_second,█▅▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇█
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█
train/grad_norm,▃▄▂▃▄▅▄▁▂▃▃▄▃▄▃▆▃▂▄▄▁▅▅▄▇▆▁▃▃▂▁█▃▅▃▇▂▁▅▄
train/learning_rate,█████▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▃▃▃▂▂▂▂▁▁▁▁▁

0,1
eval/em,56.64921
eval/f1,71.29458
eval/loss,1.77918
eval/runtime,166.0303
eval/samples_per_second,51.768
eval/steps_per_second,1.62
total_flos,3.055817486897971e+17
train/epoch,3.0
train/global_step,10284.0
train/grad_norm,17.94786


#### ♦️ Evaluation

In [None]:
#!/usr/bin/env python
# eval_squad_v2_muril.py
#
# A separate evaluation script for your MuRIL QA model (SQuAD v2.0 style).
# Adapts the same logic you used previously with XLM-R, but now for MuRIL.

import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import (
    BertForQuestionAnswering,
    BertTokenizerFast
)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import re

##############################################################################
# 1) Telugu-Specific Normalization
##############################################################################
def normalize_text_telugu(s):
    """
    Minimal Telugu-oriented normalization:
      - Removes extra punctuation
      - Lowercases any English letters
      - Strips extra whitespace
    """
    if not s:
        return ""
    # Remove everything not Telugu or alphanumeric. Adjust for your corpus if needed.
    s = re.sub(r"[^\u0C00-\u0C7Fa-zA-Z0-9\s]", "", s)
    # Lowercase (affects only English letters)
    s = s.lower()
    # Remove multiple spaces
    s = " ".join(s.split())
    return s

##############################################################################
# 2) Basic Cleaning for Predicted Spans
##############################################################################
def clean_prediction(text):
    if not text:
        return ""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    return text

##############################################################################
# 3) No-Answer Post-processing for SQuAD v2.0
##############################################################################
def postprocess_qa_predictions_squad_v2(
    examples,
    start_logits,
    end_logits,
    cls_index=0,
    max_answer_length=100,
    n_best_size=20,
    null_score_diff_threshold=0.0
):
    """
    - examples: list of dicts with "id", "context", "offset_mapping", etc.
    - start_logits[i], end_logits[i]: arrays of length [sequence_length].
    - cls_index: index for [CLS] token logit (for MuRIL, typically token_id=101 is [CLS],
      but in practice the position in the input is 0).
    - null_score_diff_threshold: threshold for "no answer" decision.
    """
    preds = {}

    for i, ex in enumerate(examples):
        context = ex["context"]
        offsets = ex["offset_mapping"]
        ex_id   = ex["id"]

        # 1) Find best span
        best_score = float('-inf')
        best_start, best_end = 0, 0

        start_idxs = np.argsort(start_logits[i])[-n_best_size:].tolist()
        end_idxs   = np.argsort(end_logits[i])[-n_best_size:].tolist()

        for st in start_idxs:
            for en in end_idxs:
                if en < st:
                    continue
                if (en - st + 1) > max_answer_length:
                    continue
                if st >= len(offsets) or en >= len(offsets):
                    continue

                span_score = start_logits[i][st] + end_logits[i][en]
                if span_score > best_score:
                    best_score = span_score
                    best_start = st
                    best_end   = en

        # 2) Calculate no-answer score from [CLS] token
        cls_score = start_logits[i][cls_index] + end_logits[i][cls_index]

        # 3) Decide if no-answer
        score_diff = best_score - cls_score
        if score_diff < null_score_diff_threshold:
            preds[ex_id] = ""
        else:
            start_char = offsets[best_start][0]
            end_char   = offsets[best_end][1]
            text_span  = context[start_char:end_char]
            preds[ex_id] = clean_prediction(text_span)

    return preds

##############################################################################
# 4) Metrics: EM, F1, BLEU, ROUGE, plus is_impossible accuracy
##############################################################################
def exact_match(pred, gold):
    return 1.0 if normalize_text_telugu(pred) == normalize_text_telugu(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text_telugu(pred).split()
    gold_tokens = normalize_text_telugu(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return float(pred_tokens == gold_tokens)
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if (precision + recall) == 0:
        return 0.0
    return (2.0 * precision * recall) / (precision + recall)

def compute_bleu(pred, gold):
    if not pred or not gold:
        return {"unigram": 0.0, "bigram": 0.0}
    pred_tokens = normalize_text_telugu(pred).split()
    gold_tokens = normalize_text_telugu(gold).split()
    smoothing = SmoothingFunction().method1
    unigram = sentence_bleu(
        [gold_tokens],
        pred_tokens,
        weights=(1, 0, 0, 0),
        smoothing_function=smoothing
    )
    bigram  = sentence_bleu(
        [gold_tokens],
        pred_tokens,
        weights=(0.5, 0.5, 0, 0),
        smoothing_function=smoothing
    )
    return {"unigram": unigram, "bigram": bigram}

def compute_rouge(pred, gold):
    scorer = rouge_scorer.RougeScorer(["rouge1","rouge2","rougeL"], use_stemmer=False)
    if not pred or not gold:
        return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
    pred_clean = normalize_text_telugu(pred)
    gold_clean = normalize_text_telugu(gold)
    scores = scorer.score(pred_clean, gold_clean)
    return {
        "rouge1": scores["rouge1"].fmeasure,
        "rouge2": scores["rouge2"].fmeasure,
        "rougeL": scores["rougeL"].fmeasure
    }

def compute_is_impossible_accuracy(examples, predictions):
    """
    For each example, if gold_text is empty => gold is_impossible.
    We check if predicted text is also empty => predicted is_impossible.
    Then compute accuracy over *only* the is_impossible subset.
    """
    total_impossible = 0
    correct_impossible = 0
    for ex in examples:
        gold = ex.get("gold_text", "")
        gold_impossible = (gold.strip() == "")
        if gold_impossible:
            total_impossible += 1
            pred_text = predictions.get(ex["id"], "")
            if pred_text.strip() == "":
                correct_impossible += 1

    if total_impossible == 0:
        return 100.0  # or 0.0, depending on your convention
    return (correct_impossible / total_impossible) * 100.0

##############################################################################
# 5) Master Evaluation Function
##############################################################################
def evaluate_squad_v2(examples, start_logits, end_logits, null_score_diff_threshold=0.0):
    """
    Returns a dict with EM, F1, BLEU, ROUGE, plus 'is_impossible_acc'.
    """
    predictions = postprocess_qa_predictions_squad_v2(
        examples,
        start_logits,
        end_logits,
        cls_index=0,  # For MuRIL, position 0 is typically [CLS]
        null_score_diff_threshold=null_score_diff_threshold
    )

    total = len(examples)
    metrics = {
        "em": 0.0,
        "f1": 0.0,
        "bleu_unigram": 0.0,
        "bleu_bigram": 0.0,
        "rouge1": 0.0,
        "rouge2": 0.0,
        "rougeL": 0.0
    }

    for ex in examples:
        gold = ex.get("gold_text", "")
        pred = predictions.get(ex["id"], "")

        metrics["em"] += exact_match(pred, gold)
        metrics["f1"] += f1_score(pred, gold)

        bleu_scores = compute_bleu(pred, gold)
        metrics["bleu_unigram"] += bleu_scores["unigram"]
        metrics["bleu_bigram"]  += bleu_scores["bigram"]

        rouge_scores = compute_rouge(pred, gold)
        metrics["rouge1"] += rouge_scores["rouge1"]
        metrics["rouge2"] += rouge_scores["rouge2"]
        metrics["rougeL"] += rouge_scores["rougeL"]

    # Average the main metrics
    for key in metrics:
        metrics[key] = (metrics[key] / total) * 100.0

    # Calculate is_impossible accuracy
    is_imp_acc = compute_is_impossible_accuracy(examples, predictions)
    metrics["is_impossible_acc"] = is_imp_acc

    return metrics, predictions

##############################################################################
# 6) Minimal Collate for Eval
##############################################################################
def squad_collate_eval(features):
    """
    For evaluation, we only need input_ids & attention_mask for the model.
    Convert lists to Tensors.
    """
    input_ids = []
    attention_masks = []
    for f in features:
        input_ids.append(torch.tensor(f["input_ids"], dtype=torch.long))
        attention_masks.append(torch.tensor(f["attention_mask"], dtype=torch.long))
    return {
        "input_ids": torch.stack(input_ids, dim=0),
        "attention_mask": torch.stack(attention_masks, dim=0)
    }

##############################################################################
# 7) Main Evaluation Code
##############################################################################
def main():
    # Adjust these paths for your environment:
    MODEL_DIR = "/content/drive/MyDrive/Te-QAS V07/MuRIL/final_muril_2.0_tel_3"
    DATA_DIR  = "/content/drive/MyDrive/Te-QAS V07/MuRIL/muril_processed_telugu_squad_v2"
    VAL_FILE  = os.path.join(DATA_DIR, "val.pt")
    TEST_FILE = os.path.join(DATA_DIR, "test.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    # Load model
    print("\nLoading final MuRIL model from:", MODEL_DIR)
    model = BertForQuestionAnswering.from_pretrained(MODEL_DIR)
    model.eval()
    model.to(device)

    # Load tokenizer (optional for debugging or postprocessing, etc.)
    print("Loading tokenizer from:", MODEL_DIR)
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_DIR)

    def run_eval(data_file, split_name="val"):
        if not os.path.exists(data_file):
            print(f"[{split_name}] file not found: {data_file}")
            return None

        print(f"Loading [{split_name}] data from:", data_file)
        data_list = torch.load(data_file)
        dataset = Dataset.from_list(data_list)
        dataloader = DataLoader(
            dataset,
            batch_size=16,
            shuffle=False,
            collate_fn=squad_collate_eval
        )

        all_start_logits = []
        all_end_logits   = []

        print(f"Running inference on {split_name} set... (num_examples={len(dataset)})")
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                start_log = outputs.start_logits.detach().cpu().numpy()
                end_log   = outputs.end_logits.detach().cpu().numpy()

                all_start_logits.append(start_log)
                all_end_logits.append(end_log)

        # Combine
        all_start_logits = np.concatenate(all_start_logits, axis=0)
        all_end_logits   = np.concatenate(all_end_logits, axis=0)

        # Evaluate with threshold=0.0 (typical default)
        results, _ = evaluate_squad_v2(
            data_list,
            all_start_logits,
            all_end_logits,
            null_score_diff_threshold=0.0
        )
        print(f"\n[{split_name.upper()} SET METRICS]:")
        for k, v in results.items():
            print(f"  {k}: {v:.2f}")
        print("-"*50)
        return results

    # Evaluate on validation
    val_metrics = run_eval(VAL_FILE, split_name="val")

    # Evaluate on test
    test_metrics = run_eval(TEST_FILE, split_name="test")
    print("\nEvaluation script completed.")

if __name__ == "__main__":
    main()

Device: cuda

Loading final MuRIL model from: /content/drive/MyDrive/Te-QAS V07/MuRIL/final_muril_2.0_tel_3
Loading tokenizer from: /content/drive/MyDrive/Te-QAS V07/MuRIL/final_muril_2.0_tel_3
Loading [val] data from: /content/drive/MyDrive/Te-QAS V07/MuRIL/muril_processed_telugu_squad_v2/val.pt


  data_list = torch.load(data_file)


Running inference on val set... (num_examples=8595)

[VAL SET METRICS]:
  em: 57.08
  f1: 71.84
  bleu_unigram: 49.57
  bleu_bigram: 37.35
  rouge1: 14.75
  rouge2: 1.69
  rougeL: 14.75
  is_impossible_acc: 80.80
--------------------------------------------------
Loading [test] data from: /content/drive/MyDrive/Te-QAS V07/MuRIL/muril_processed_telugu_squad_v2/test.pt
Running inference on test set... (num_examples=10846)

[TEST SET METRICS]:
  em: 58.94
  f1: 69.90
  bleu_unigram: 29.40
  bleu_bigram: 23.21
  rouge1: 8.00
  rouge2: 0.93
  rougeL: 8.00
  is_impossible_acc: 75.00
--------------------------------------------------

Evaluation script completed.


##### Comparision with Baseline Models

In [None]:
#!/usr/bin/env python
# eval_baseline_vs_finetuned_muril.py

import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import BertForQuestionAnswering, BertTokenizerFast
import re

##########################################
# 1) Normalize Telugu text for evaluation
##########################################
def normalize_text_telugu(s):
    """Minimal normalization for Telugu text."""
    if not s:
        return ""
    s = re.sub(r"[^\u0C00-\u0C7Fa-zA-Z0-9\s]", "", s)  # Keep Telugu, English, numbers
    s = s.lower()
    s = " ".join(s.split())  # Remove extra spaces
    return s

##########################################
# 2) Cleaning Predictions
##########################################
def clean_prediction(text):
    """Remove unwanted characters and extra spaces."""
    return text.strip().replace("\n", " ")

##########################################
# 3) Postprocessing for SQuAD v2.0
##########################################
def postprocess_qa_predictions_squad_v2(examples, start_logits, end_logits, cls_index=0, null_score_diff_threshold=0.0):
    """Postprocess predictions by selecting the best span or no-answer option."""
    preds = {}
    for i, ex in enumerate(examples):
        context = ex["context"]
        offsets = ex["offset_mapping"]
        ex_id   = ex["id"]

        best_start, best_end, best_score = 0, 0, float("-inf")
        cls_score = start_logits[i][cls_index] + end_logits[i][cls_index]  # No-answer score

        for start_idx in np.argsort(start_logits[i])[-10:]:
            for end_idx in np.argsort(end_logits[i])[-10:]:
                if end_idx < start_idx or (end_idx - start_idx + 1) > 100:
                    continue
                score = start_logits[i][start_idx] + end_logits[i][end_idx]
                if score > best_score:
                    best_score = score
                    best_start = start_idx
                    best_end = end_idx

        if best_score - cls_score < null_score_diff_threshold:
            preds[ex_id] = ""
        else:
            start_char = offsets[best_start][0]
            end_char   = offsets[best_end][1]
            preds[ex_id] = clean_prediction(context[start_char:end_char])

    return preds

##########################################
# 4) Metrics: EM, F1, Is Impossible Accuracy
##########################################
def exact_match(pred, gold):
    return 1.0 if normalize_text_telugu(pred) == normalize_text_telugu(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text_telugu(pred).split()
    gold_tokens = normalize_text_telugu(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return float(pred_tokens == gold_tokens)
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

def compute_is_impossible_accuracy(examples, predictions):
    """Compute accuracy for unanswerable questions."""
    total, correct = 0, 0
    for ex in examples:
        gold_empty = ex["gold_text"].strip() == ""
        pred_empty = predictions.get(ex["id"], "").strip() == ""
        if gold_empty:
            total += 1
            if pred_empty:
                correct += 1
    return (correct / total) * 100.0 if total > 0 else 100.0

##########################################
# 5) Master Evaluation Function
##########################################
def evaluate_squad_v2(examples, start_logits, end_logits, null_score_diff_threshold=0.0):
    predictions = postprocess_qa_predictions_squad_v2(examples, start_logits, end_logits, null_score_diff_threshold=null_score_diff_threshold)

    metrics = {"em": 0.0, "f1": 0.0, "is_impossible_acc": 0.0}
    total = len(examples)

    for ex in examples:
        pred = predictions.get(ex["id"], "")
        gold = ex["gold_text"]
        metrics["em"] += exact_match(pred, gold)
        metrics["f1"] += f1_score(pred, gold)

    metrics["em"] /= total
    metrics["f1"] /= total
    metrics["is_impossible_acc"] = compute_is_impossible_accuracy(examples, predictions)

    return metrics, predictions

##########################################
# 6) Baseline vs Fine-Tuned Evaluation
##########################################
def run_model_evaluation(model, tokenizer, dataset, device):
    """Evaluate the given model on the dataset."""
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=lambda x: {
        "input_ids": torch.stack([torch.tensor(f["input_ids"]) for f in x]).to(device),
        "attention_mask": torch.stack([torch.tensor(f["attention_mask"]) for f in x]).to(device)
    })

    all_start_logits, all_end_logits = [], []
    model.eval()

    with torch.no_grad():
        for batch in dataloader:
            outputs = model(**batch)
            all_start_logits.append(outputs.start_logits.cpu().numpy())
            all_end_logits.append(outputs.end_logits.cpu().numpy())

    return np.concatenate(all_start_logits), np.concatenate(all_end_logits)

def compare_models(base_model, fine_tuned_model, tokenizer, dataset, device, data_="validation"):
    """Compare baseline vs fine-tuned model performance."""
    print(f"=> Compare baseline vs fine-tuned model performance with {data_} data")
    print("\nEvaluating Baseline Model...")
    base_start_logits, base_end_logits = run_model_evaluation(base_model, tokenizer, dataset, device)
    base_metrics, _ = evaluate_squad_v2(dataset, base_start_logits, base_end_logits)

    print("\nEvaluating Fine-Tuned Model...")
    fine_tuned_start_logits, fine_tuned_end_logits = run_model_evaluation(fine_tuned_model, tokenizer, dataset, device)
    fine_tuned_metrics, _ = evaluate_squad_v2(dataset, fine_tuned_start_logits, fine_tuned_end_logits)

    print("\nBaseline Model Metrics:")
    print(base_metrics)

    print("\nFine-Tuned Model Metrics:")
    print(fine_tuned_metrics)

##########################################
# 7) Main Execution
##########################################
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    BASE_DIR = "/content/drive/MyDrive/Te-QAS V07/MuRIL"
    BASELINE_MODEL = "google/muril-large-cased"
    FINETUNED_MODEL_PATH = os.path.join(BASE_DIR, "final_muril_2.0_tel_3")
    DATASET_VAL = os.path.join(BASE_DIR, "muril_processed_telugu_squad_v2/val.pt")
    DATASET_TEST = os.path.join(BASE_DIR, "muril_processed_telugu_squad_v2/test.pt")

    base_model = BertForQuestionAnswering.from_pretrained(BASELINE_MODEL).to(device)
    tokenizer = BertTokenizerFast.from_pretrained(BASELINE_MODEL)
    fine_tuned_model = BertForQuestionAnswering.from_pretrained(FINETUNED_MODEL_PATH).to(device)

    dataset_val = Dataset.from_list(torch.load(DATASET_VAL))
    compare_models(base_model, fine_tuned_model, tokenizer, dataset_val, device, data_="validation")

    dataset_test = Dataset.from_list(torch.load(DATASET_TEST))
    compare_models(base_model, fine_tuned_model, tokenizer, dataset_test, device, data_="test")

if __name__ == "__main__":
    main()


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google/muril-large-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  dataset_val = Dataset.from_list(torch.load(DATASET_VAL))


=> Compare baseline vs fine-tuned model performance with validation data

Evaluating Baseline Model...

Evaluating Fine-Tuned Model...

Baseline Model Metrics:
{'em': 0.12530541012216406, 'f1': 0.1459625570250129, 'is_impossible_acc': 53.68421052631579}

Fine-Tuned Model Metrics:
{'em': 0.5707969749854567, 'f1': 0.7183550698776989, 'is_impossible_acc': 80.85213032581454}


  dataset_test = Dataset.from_list(torch.load(DATASET_TEST))


=> Compare baseline vs fine-tuned model performance with test data

Evaluating Baseline Model...

Evaluating Fine-Tuned Model...

Baseline Model Metrics:
{'em': 0.2397197123363452, 'f1': 0.2573378939348231, 'is_impossible_acc': 47.34121122599705}

Fine-Tuned Model Metrics:
{'em': 0.5894338926793288, 'f1': 0.6989928720876525, 'is_impossible_acc': 75.01846381093058}


## ♦ Comparision Tables

In [2]:
import pandas as pd
import numpy as np

# Set display options for better formatting
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.precision', 2)
pd.set_option('display.float_format', lambda x: '%.2f' % x if isinstance(x, (int, float)) else str(x))

def print_table_with_header(df, title):
   # Get the maximum width of each column for formatting
   col_widths = [max(len(str(x)) for x in df[col]) for col in df.columns]
   col_widths = [max(len(col), width) for col, width in zip(df.columns, col_widths)]

   # Create header separator
   header_sep = "=" * (sum(col_widths) + (len(col_widths) - 1) * 3 + 4)

   # Print title
   print(f"\n{title}")
   print(header_sep)

   # Format and print header
   header = " | ".join(f"{col:{width}}" for col, width in zip(df.columns, col_widths))
   print(f"| {header} |")

   # Print separator after header
   print("|" + "|".join("-" * (width + 2) for width in col_widths) + "|")

   # Print each row
   for _, row in df.iterrows():
       row_str = " | ".join(f"{str(val):{width}}" for val, width in zip(row, col_widths))
       print(f"| {row_str} |")

   print(header_sep + "\n")

# XLM-R Full Data
xlmr_full_val_test = pd.DataFrame({
   "Metric": ["Exact Match (EM)", "F1 Score", "Is Impossible Accuracy"],
   "Baseline (Validation)": [0.09, 0.12, 39.15],
   "Fine-Tuned (Validation)": [61.30, 75.31, 91.43],
   "Baseline (Test)": [0.20, 0.22, 39.31],
   "Fine-Tuned (Test)": [61.14, 70.65, 82.20]
})

# XLM-R Answerable Only
xlmr_answerable_val_test = pd.DataFrame({
   "Metric": ["Exact Match (EM)", "F1 Score"],
   "Baseline (Validation)": [0.00, 3.13],
   "Fine-Tuned (Validation)": [59.53, 76.88],
   "Baseline (Test)": [0.00, 3.49],
   "Fine-Tuned (Test)": [51.23, 71.66]
})

# MuRIL Full Data
muril_full_val_test = pd.DataFrame({
   "Metric": ["Exact Match (EM)", "F1 Score", "Is Impossible Accuracy"],
   "Baseline (Validation)": [0.13, 0.15, 53.68],
   "Fine-Tuned (Validation)": [57.08, 71.84, 80.80],
   "Baseline (Test)": [0.24, 0.26, 47.34],
   "Fine-Tuned (Test)": [58.94, 69.90, 75.00]
})

# MuRIL Answerable Only
muril_answerable_val_test = pd.DataFrame({
   "Metric": ["Exact Match (EM)", "F1 Score"],
   "Baseline (Validation)": [0.00, 2.30],
   "Fine-Tuned (Validation)": [60.89, 78.19],
   "Baseline (Test)": [0.00, 4.23],
   "Fine-Tuned (Test)": [51.60, 72.09]
})

# Print all tables with proper formatting
print_table_with_header(xlmr_full_val_test, "Table 1: XLM-R Full Data Performance (Validation & Test)")
print_table_with_header(muril_full_val_test, "Table 2: MuRIL Full Data Performance (Validation & Test)")
print_table_with_header(xlmr_answerable_val_test, "Table 3: XLM-R Answerable Only Performance")
print_table_with_header(muril_answerable_val_test, "Table 4: MuRIL Answerable Only Performance")

# Optionally save to CSV
xlmr_full_val_test.to_csv('xlmr_full_performance.csv', index=False)
xlmr_answerable_val_test.to_csv('xlmr_answerable_performance.csv', index=False)
muril_full_val_test.to_csv('muril_full_performance.csv', index=False)
muril_answerable_val_test.to_csv('muril_answerable_performance.csv', index=False)


Table 1: XLM-R Full Data Performance (Validation & Test)
| Metric                 | Baseline (Validation) | Fine-Tuned (Validation) | Baseline (Test) | Fine-Tuned (Test) |
|------------------------|-----------------------|-------------------------|-----------------|-------------------|
| Exact Match (EM)       | 0.09                  | 61.3                    | 0.2             | 61.14             |
| F1 Score               | 0.12                  | 75.31                   | 0.22            | 70.65             |
| Is Impossible Accuracy | 39.15                 | 91.43                   | 39.31           | 82.2              |


Table 2: MuRIL Full Data Performance (Validation & Test)
| Metric                 | Baseline (Validation) | Fine-Tuned (Validation) | Baseline (Test) | Fine-Tuned (Test) |
|------------------------|-----------------------|-------------------------|-----------------|-------------------|
| Exact Match (EM)       | 0.13                  | 57.08                   | 0

 Above metrics are rounded off to 2 decimals.