In [2]:
import json
import os

def filter_answerable_squad(input_path, output_path):
    """
    Reads a SQuAD 2.0 file, keeps only QAs where is_impossible=False,
    writes a new JSON file with the same structure but only answerable QAs.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_data = {
        "version": data.get("version", "filtered"),
        "data": []
    }

    for article in data["data"]:
        new_paragraphs = []
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            new_qas = []
            for qa in paragraph["qas"]:
                if not qa.get("is_impossible", False):
                    # keep only this QA
                    new_qas.append(qa)
            # If we found answerable QAs in this paragraph, keep them
            if new_qas:
                new_paragraphs.append({
                    "context": context,
                    "qas": new_qas
                })

        if new_paragraphs:
            new_data["data"].append({
                "title": article.get("title", ""),
                "paragraphs": new_paragraphs
            })

    # Save
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(new_data, f, indent=2, ensure_ascii=False)

def main():
    # example usage 
    input_file  = "../TeQAS 1.0/English Data/squad2.0_train.json"
    output_file = "squad2.1_train.json"
    filter_answerable_squad(input_file, output_file)
    print(f"Filtered answerable QAs saved to {output_file}")

    # example usage
    input_file  = "../TeQAS 1.0/English Data/squad2.0_test.json"
    output_file = "squad2.1_test.json"
    filter_answerable_squad(input_file, output_file)
    print(f"Filtered answerable QAs saved to {output_file}")

    # example usage
    input_file  = "../TeQAS 1.0/English Data/squad2.0_val.json"
    output_file = "squad2.1_val.json"
    filter_answerable_squad(input_file, output_file)
    print(f"Filtered answerable QAs saved to {output_file}")

if __name__ == "__main__":
    main()


Filtered answerable QAs saved to squad2.1_train.json
Filtered answerable QAs saved to squad2.1_test.json
Filtered answerable QAs saved to squad2.1_val.json


In [None]:
#!/usr/bin/env python
# preprocess_answerable.py

import json
import os
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import XLMRobertaTokenizerFast

# Adjust your paths here
train_json = "Telugu 2.1/squad2.1_telugu_train.json"      # SQuAD 2.0 train with unanswerable
val_json   = "Telugu 2.1/squad2.1_telugu_val.json"        # SQuAD 2.0 val
test_json  = "Telugu 2.1/squad2.1_telugu_test.json"       # SQuAD 2.0 test

out_dir    = "processed_telugu_answerable_data"     # where we'll write train.pt, val.pt, test.pt
os.makedirs(out_dir, exist_ok=True)

max_length = 512
model_tokenizer_name = "xlm-roberta-large"  # or "xlm-roberta-base" if GPU is limited

###############################################
# 2) Filter out unanswerable QAs
###############################################
def filter_answerable_squad(input_path):
    """
    Returns a new SQuAD JSON dict containing only QAs where is_impossible=False
    with at least one answer.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_data = {
        "version": data.get("version", "filtered_telugu"),
        "data": []
    }
    for article in data["data"]:
        new_paragraphs = []
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            new_qas = []
            for qa in paragraph["qas"]:
                if not qa.get("is_impossible", False) and qa.get("answers"):
                    new_qas.append(qa)
            if new_qas:
                new_paragraphs.append({
                    "context": context,
                    "qas": new_qas
                })
        if new_paragraphs:
            new_data["data"].append({
                "title": article.get("title", ""),
                "paragraphs": new_paragraphs
            })
    return new_data

###############################################
# 3) Build offset-based examples
###############################################
def build_answerable_examples(squad_data, tokenizer, max_length=384):
    """
    For each answerable QA:
      - tokenize question+context
      - find start/end token indices
      - store offset_mapping, context, gold_text, etc.
    """
    examples_out = []
    for article in squad_data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                ans = qa["answers"][0]
                ans_start = ans["answer_start"]
                ans_text  = ans["text"]
                ans_end   = ans_start + len(ans_text)

                enc = tokenizer(
                    qa["question"],
                    context,
                    max_length=max_length,
                    truncation="only_second",
                    return_offsets_mapping=True,
                    return_tensors="pt",
                    padding="max_length"
                )

                input_ids      = enc["input_ids"][0]
                attention_mask = enc["attention_mask"][0]
                offset_mapping = enc["offset_mapping"][0].tolist()

                # find start/end token indices
                start_token = None
                end_token   = None
                for i, (start_char, end_char) in enumerate(offset_mapping):
                    if start_char <= ans_start < end_char:
                        start_token = i
                    if start_char < ans_end <= end_char:
                        end_token = i

                # fallback if mismatch
                if start_token is None or end_token is None or end_token < start_token:
                    start_token = 0
                    end_token   = 0

                ex_item = {
                    "id": qa["id"],
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "start_positions": torch.tensor(start_token, dtype=torch.long),
                    "end_positions":   torch.tensor(end_token,   dtype=torch.long),
                    "offset_mapping":  offset_mapping,
                    "context":         context,
                    "gold_text":       ans_text
                }
                examples_out.append(ex_item)
    return examples_out

def main():
    print(f"Using tokenizer: {model_tokenizer_name}")
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_tokenizer_name)

    # 1) Filter + build train
    print("Filtering + building train data (TELUGU) ...")
    train_data_raw = filter_answerable_squad(train_json)
    train_list = build_answerable_examples(train_data_raw, tokenizer, max_length)
    print(f"Train answerable size: {len(train_list)}")

    # 2) Filter + build val
    print("Filtering + building val data (TELUGU) ...")
    val_data_raw = filter_answerable_squad(val_json)
    val_list = build_answerable_examples(val_data_raw, tokenizer, max_length)
    print(f"Val answerable size: {len(val_list)}")

    # 3) Filter + build test
    print("Filtering + building test data (TELUGU) ...")
    test_data_raw = filter_answerable_squad(test_json)
    test_list = build_answerable_examples(test_data_raw, tokenizer, max_length)
    print(f"Test answerable size: {len(test_list)}")

    # 4) Save as .pt
    train_out = os.path.join(out_dir, "train.pt")
    val_out   = os.path.join(out_dir, "val.pt")
    test_out  = os.path.join(out_dir, "test.pt")

    torch.save(train_list, train_out)
    torch.save(val_list,   val_out)
    torch.save(test_list,  test_out)

    print(f"\nSaved final PT files to {out_dir}/")
    print("Done! Telugu answerable preprocessing completed.")

if __name__ == "__main__":
    main()

Using tokenizer: xlm-roberta-large
Filtering + building train data (TELUGU) ...
Train answerable size: 72039
Filtering + building val data (TELUGU) ...
Val answerable size: 6600
Filtering + building test data (TELUGU) ...
Test answerable size: 5430

Saved final PT files to processed_telugu_answerable_data/
Done! Telugu answerable preprocessing completed.


## English

In [1]:
#!/usr/bin/env python
# fine_tune_answerable_trainer.py

from datasets import Dataset
from transformers import (
    XLMRobertaForQuestionAnswering,
    XLMRobertaConfig,
    Trainer,
    TrainingArguments,
    default_data_collator
)
import torch
import os
import numpy as np
from functools import partial
from tqdm import tqdm
import re
import wandb

# We'll reuse some code for postprocessing
############################
# Postprocess + EM/F1
############################
def normalize_text(s):
    def remove_articles(t):
        return re.sub(r"\b(a|an|the)\b", " ", t)
    def remove_punc(t):
        return re.sub(r"[^\w\s]", "", t)
    def white_space_fix(t):
        return " ".join(t.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens)==0 or len(gold_tokens)==0:
        return 1.0 if pred_tokens==gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if precision+recall==0:
        return 0.0
    return 2*precision*recall/(precision+recall)

def postprocess_qa_predictions(examples, start_logits, end_logits):
    """
    examples: a list of dict, each with offset_mapping, context, gold_text, id, etc.
    start_logits, end_logits: [num_samples, seq_len]

    returns: dict { example_id: predicted_text }
    """
    preds = {}
    for i, ex in enumerate(examples):
        offsets = ex["offset_mapping"]
        context = ex["context"]
        # best start/end
        start_idx = int(np.argmax(start_logits[i]))
        end_idx   = int(np.argmax(end_logits[i]))
        if end_idx < start_idx:
            start_idx, end_idx = end_idx, start_idx

        if start_idx >= len(offsets):
            preds[ex["id"]] = ""
            continue
        if end_idx >= len(offsets):
            end_idx = len(offsets)-1

        start_char = offsets[start_idx][0]
        end_char   = offsets[end_idx][1]
        pred_text  = context[start_char:end_char]
        preds[ex["id"]] = pred_text
    return preds

def compute_metrics(eval_preds, dataset):
    """
    eval_preds => (start_logits, end_logits)
    dataset => the raw examples with gold_text
    We'll decode + compare
    """
    (start_logits, end_logits) = eval_preds
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits   = end_logits.cpu().numpy()

    preds_dict = postprocess_qa_predictions(dataset, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    for ex in dataset:
        ex_id = ex["id"]
        pred  = preds_dict[ex_id]
        gold  = ex["gold_text"]
        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count    += 1

    em_val = total_em / count * 100.0
    f1_val = total_f1 / count * 100.0
    return {
        "em": em_val,
        "f1": f1_val
    }

def main():    # 1) Load .pt files
    data_dir = "processed_english_answerable_data"  # from previous script
    train_list = torch.load(os.path.join(data_dir, "train.pt"))
    val_list   = torch.load(os.path.join(data_dir, "val.pt"))
    test_list  = torch.load(os.path.join(data_dir, "test.pt"))

    # 2) Convert to huggingface Dataset
    from datasets import Dataset
    train_dataset = Dataset.from_list(train_list)
    val_dataset   = Dataset.from_list(val_list)
    test_dataset  = Dataset.from_list(test_list)

    wandb.init(project="TeQAS 1.1", name="XLM-R Eng 1")

    # 3) Initialize model (Large recommended)
    model_name = "xlm-roberta-large"
    model = XLMRobertaForQuestionAnswering.from_pretrained(model_name)

    # 4) Training args
    training_args = TrainingArguments(
        output_dir="checkpoints_xlmr_eng_answerable_v2",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=2,            # Try 2-3
        per_device_train_batch_size=16,# or 32 if GPU memory allows
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir="logs_answerable",
        logging_steps=100
    )

    from transformers.trainer_utils import EvalPrediction

    def hf_compute_metrics(p: EvalPrediction):
        # p.predictions => (start_logits, end_logits)
        # dataset => we have val_dataset
        return compute_metrics(p.predictions, val_dataset)

    # 6) Build Trainer

    from transformers import Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=default_data_collator,  # works if each sample has same keys: input_ids, attn_mask, start/end
        tokenizer=None,   
        compute_metrics=hf_compute_metrics
    )

    # 7) Train
    trainer.train()

    # 10) Save final
    trainer.save_model("final_xlmr_eng_answerable_v2")
    print("Done! Model + checkpoint saved.")
    
    # # 8) Evaluate on val => see final
    # def hf_compute_metrics_test(p: EvalPrediction):
    #     return compute_metrics(p.predictions, test_dataset)

    # test_metrics = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
    # # But note: we can't automatically change the compute_metrics. We can do a manual pass:
    # test_preds = trainer.predict(test_dataset)

    # # compute test set EM/F1
    # final_test = compute_metrics(test_preds.predictions, test_list)
    # print("Test set metrics:", final_test)

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
2025-01-27 18:05:12.404540: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  train_list = torch.load(os.path.join(data_dir, "train.pt"))
  val_list   = torch.load(os.path.join(data_dir, "val.pt"))
  test_list  = torch.load(os.path.join(data_dir, "test.pt"))
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK bac

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Em,F1
1,0.7736,0.79318,71.075758,84.565608
2,0.589,0.748568,72.727273,85.78644




Done! Model + checkpoint saved.


In [2]:
#!/usr/bin/env python
# evaluate_model.py

import os
import re
import torch
import numpy as np
from datasets import Dataset
from transformers import XLMRobertaForQuestionAnswering, XLMRobertaTokenizerFast

# Load preprocessed test dataset
DATA_DIR = "processed_english_answerable_data"
TEST_PATH = os.path.join(DATA_DIR, "test.pt")

print("\nLoading test dataset...")
test_list = torch.load(TEST_PATH)
test_dataset = Dataset.from_list(test_list)

# Load trained model
MODEL_PATH = "./final_xlmr_eng_answerable_v2"
print(f"Loading model from {MODEL_PATH}...")
model = XLMRobertaForQuestionAnswering.from_pretrained(MODEL_PATH, local_files_only=True)
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-large")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Define Postprocessing & Evaluation
def postprocess_qa_predictions(examples, start_logits, end_logits):
    """
    Postprocesses model predictions to extract answer text.
    - examples: List of dataset examples.
    - start_logits, end_logits: Logits output by the model.
    """
    preds = {}
    num_examples = len(examples)
    num_logits = len(start_logits)

    if num_logits != num_examples:
        print(f"⚠️ Warning: Logits size {num_logits} doesn't match dataset size {num_examples}. Truncating...")
        num_examples = min(num_examples, num_logits)

    for i in range(num_examples):
        ex = examples[i]
        offsets = ex["offset_mapping"]
        context = ex["context"]

        if i >= len(start_logits) or i >= len(end_logits):
            preds[ex["id"]] = ""
            continue
        
        if len(start_logits[i]) == 0 or len(end_logits[i]) == 0:
            preds[ex["id"]] = ""
            continue

        start_idx = int(np.argmax(start_logits[i]))
        end_idx = int(np.argmax(end_logits[i]))

        if start_idx >= len(offsets) or end_idx >= len(offsets) or start_idx > end_idx:
            preds[ex["id"]] = ""
            continue

        start_char = offsets[start_idx][0]
        end_char = offsets[end_idx][1]
        pred_text = context[start_char:end_char]
        
        preds[ex["id"]] = pred_text

    return preds

def compute_metrics(eval_preds, dataset):
    """
    Compute evaluation metrics (EM, F1, BLEU, ROUGE).
    """
    start_logits, end_logits = eval_preds
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits = end_logits.cpu().numpy()

    predictions = postprocess_qa_predictions(dataset, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    for ex in dataset:
        ex_id = ex["id"]
        pred = predictions.get(ex_id, "")
        gold = ex["gold_text"]

        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count += 1

    return {
        "exact_match": (total_em / count) * 100.0,
        "f1": (total_f1 / count) * 100.0
    }

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 1.0 if pred_tokens == gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

def normalize_text(s):
    def remove_articles(t):
        return re.sub(r"\b(a|an|the)\b", " ", t)
    def remove_punc(t):
        return re.sub(r"[^\w\s]", "", t)
    def white_space_fix(t):
        return " ".join(t.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

# Run Evaluation
print("\nEvaluating on test set...")
test_preds = []
with torch.no_grad():
    for example in test_list:
        inputs = {
            "input_ids": torch.tensor(example["input_ids"]).unsqueeze(0).to(device),
            "attention_mask": torch.tensor(example["attention_mask"]).unsqueeze(0).to(device),
        }
        outputs = model(**inputs)
        test_preds.append((outputs.start_logits.cpu().numpy(), outputs.end_logits.cpu().numpy()))

# Compute Metrics
test_start_logits = np.concatenate([p[0] for p in test_preds], axis=0)
test_end_logits = np.concatenate([p[1] for p in test_preds], axis=0)
test_metrics = compute_metrics((test_start_logits, test_end_logits), test_list)
print("Final Test Metrics:", test_metrics)


Loading test dataset...


  test_list = torch.load(TEST_PATH)


Loading model from ./final_xlmr_eng_answerable_v2...





Evaluating on test set...


  "input_ids": torch.tensor(example["input_ids"]).unsqueeze(0).to(device),
  "attention_mask": torch.tensor(example["attention_mask"]).unsqueeze(0).to(device),


Final Test Metrics: {'exact_match': 69.12955465587044, 'f1': 83.49781599734129}


In [1]:
#!/usr/bin/env python
# fine_tune_answerable_trainer.py

from datasets import Dataset
from transformers import (
    XLMRobertaForQuestionAnswering,
    XLMRobertaConfig,
    Trainer,
    TrainingArguments,
    default_data_collator
)
import torch
import os
import numpy as np
from functools import partial
from tqdm import tqdm
import re

# We'll reuse some code for postprocessing
############################
# Postprocess + EM/F1
############################
def normalize_text(s):
    def remove_articles(t):
        return re.sub(r"\b(a|an|the)\b", " ", t)
    def remove_punc(t):
        return re.sub(r"[^\w\s]", "", t)
    def white_space_fix(t):
        return " ".join(t.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens)==0 or len(gold_tokens)==0:
        return 1.0 if pred_tokens==gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if precision+recall==0:
        return 0.0
    return 2*precision*recall/(precision+recall)

def postprocess_qa_predictions(examples, start_logits, end_logits):
    """
    Postprocesses model predictions to extract answer text.
    - examples: List of dataset examples.
    - start_logits, end_logits: Logits output by the model.

    Returns:
    - Dictionary mapping example IDs to predicted answers.
    """
    preds = {}
    
    num_examples = len(examples)
    num_logits = len(start_logits)

    # ✅ Ensure logits and dataset sizes match
    if num_logits != num_examples:
        print(f"⚠️ Warning: Logits size {num_logits} doesn't match dataset size {num_examples}. Truncating...")
        num_examples = min(num_examples, num_logits)

    for i in range(num_examples):
        ex = examples[i]
        offsets = ex["offset_mapping"]
        context = ex["context"]

        # ✅ Ensure `i` is within valid range
        if i >= len(start_logits) or i >= len(end_logits):
            preds[ex["id"]] = ""
            continue
        
        # ✅ Ensure logits shape is valid before calling `np.argmax`
        if len(start_logits[i]) == 0 or len(end_logits[i]) == 0:
            preds[ex["id"]] = ""
            continue

        # Get best start/end indices
        start_idx = int(np.argmax(start_logits[i]))
        end_idx = int(np.argmax(end_logits[i]))

        if start_idx >= len(offsets) or end_idx >= len(offsets) or start_idx > end_idx:
            preds[ex["id"]] = ""
            continue

        # Extract the predicted answer
        start_char = offsets[start_idx][0]
        end_char = offsets[end_idx][1]
        pred_text = context[start_char:end_char]
        
        preds[ex["id"]] = pred_text

    return preds

def compute_metrics(eval_preds, dataset):
    """
    eval_preds => (start_logits, end_logits)
    dataset => the raw examples with gold_text
    We'll decode + compare
    """
    (start_logits, end_logits) = eval_preds
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits   = end_logits.cpu().numpy()

    preds_dict = postprocess_qa_predictions(dataset, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    for ex in dataset:
        ex_id = ex["id"]
        pred  = preds_dict[ex_id]
        gold  = ex["gold_text"]
        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count    += 1

    em_val = total_em / count * 100.0
    f1_val = total_f1 / count * 100.0
    return {
        "em": em_val,
        "f1": f1_val
    }

import wandb

def main():
    # 1) Load .pt files
    data_dir = "processed_telugu_answerable_data"  # from previous script
    train_list = torch.load(os.path.join(data_dir, "train.pt"))
    val_list   = torch.load(os.path.join(data_dir, "val.pt"))
    test_list  = torch.load(os.path.join(data_dir, "test.pt"))

    # 2) Convert to huggingface Dataset
    from datasets import Dataset
    train_dataset = Dataset.from_list(train_list)
    val_dataset   = Dataset.from_list(val_list)
    test_dataset  = Dataset.from_list(test_list)

    wandb.init(project="TeQAS 1.1 v2", name="XLM-R Tel 3")

    # 3) Initialize model (Large recommended)
    model_name = "final_xlmr_eng_answerable_v2"

    # Load Model
    print(f"Loading model: {model_name}")
    model = XLMRobertaForQuestionAnswering.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # ✅ Move model explicitly to GPU

    # Define Training Arguments
    training_args = TrainingArguments(
        output_dir="checkpoints_xlmr_tel_answerable_v2",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=3,            # Try 2-3
        per_device_train_batch_size=16,# or 32 if GPU memory allows
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_dir="logs_answerable",
        logging_steps=100
    )

    from transformers.trainer_utils import EvalPrediction

    def hf_compute_metrics(p: EvalPrediction):
        # p.predictions => (start_logits, end_logits)
        # dataset => we have val_dataset
        return compute_metrics(p.predictions, val_dataset)

    # Trainer Initialization
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=default_data_collator,
        tokenizer=None, 
        compute_metrics=hf_compute_metrics
    )

    # Train
    trainer.train()

    # 10) Save final
    trainer.save_model("final_xlmr_tel_answerable_3_v2")
    print("Done! Model + checkpoint saved.")

    wandb.finish()


if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
2025-01-28 08:24:25.542375: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  train_list = torch.load(os.path.join(data_dir, "train.pt"))
  val_list   = torch.load(os.path.join(data_dir, "val.pt"))
  test_list  = torch.load(os.path.join(data_dir, "test.pt"))
[34m[1mwandb[0m: Currently logged in as: [33msanthoshrishi9999[0m ([33msanthosh-rishi[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Loading model: final_xlmr_eng_answerable_v2


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Em,F1
1,1.281,1.246652,57.651515,75.059659
2,1.0436,1.236136,59.075758,76.449596
3,0.8156,1.306881,59.530303,76.880735




Done! Model + checkpoint saved.


0,1
eval/em,▁▆█
eval/f1,▁▆█
eval/loss,▂▁█
eval/runtime,█▁▁
eval/samples_per_second,▁██
eval/steps_per_second,▁██
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▄▃█▄▃▄▂▃▃▂▅▃▂▃▂▄▃▄▂▃▂▂▃▂▃▃▃▄▃▄▅▂▁▅▂▃▃▃▃▂
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
eval/em,59.5303
eval/f1,76.88073
eval/loss,1.30688
eval/runtime,533.6189
eval/samples_per_second,12.368
eval/steps_per_second,0.388
total_flos,2.0070936901772698e+17
train/epoch,3.0
train/global_step,6756.0
train/grad_norm,16.10918


In [2]:
#!/usr/bin/env python
# evaluate_model.py

import os
import re
import torch
import numpy as np
from datasets import Dataset
from transformers import XLMRobertaForQuestionAnswering, XLMRobertaTokenizerFast

# ✅ Load preprocessed test dataset
DATA_DIR = "processed_telugu_answerable_data"
TEST_PATH = os.path.join(DATA_DIR, "test.pt")

print("\nLoading test dataset...")
test_list = torch.load(TEST_PATH)
test_dataset = Dataset.from_list(test_list)

# ✅ Load trained model
MODEL_PATH = "./final_xlmr_tel_answerable_3_v2"
print(f"Loading model from {MODEL_PATH}...")
model = XLMRobertaForQuestionAnswering.from_pretrained(MODEL_PATH, local_files_only=True)
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-large")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ✅ Define Postprocessing & Evaluation
def postprocess_qa_predictions(examples, start_logits, end_logits):
    """
    Postprocesses model predictions to extract answer text.
    - examples: List of dataset examples.
    - start_logits, end_logits: Logits output by the model.
    """
    preds = {}
    num_examples = len(examples)
    num_logits = len(start_logits)

    if num_logits != num_examples:
        print(f"⚠️ Warning: Logits size {num_logits} doesn't match dataset size {num_examples}. Truncating...")
        num_examples = min(num_examples, num_logits)

    for i in range(num_examples):
        ex = examples[i]
        offsets = ex["offset_mapping"]
        context = ex["context"]

        if i >= len(start_logits) or i >= len(end_logits):
            preds[ex["id"]] = ""
            continue
        
        if len(start_logits[i]) == 0 or len(end_logits[i]) == 0:
            preds[ex["id"]] = ""
            continue

        start_idx = int(np.argmax(start_logits[i]))
        end_idx = int(np.argmax(end_logits[i]))

        if start_idx >= len(offsets) or end_idx >= len(offsets) or start_idx > end_idx:
            preds[ex["id"]] = ""
            continue

        start_char = offsets[start_idx][0]
        end_char = offsets[end_idx][1]
        pred_text = context[start_char:end_char]
        
        preds[ex["id"]] = pred_text

    return preds

def compute_metrics(eval_preds, dataset):
    """
    Compute evaluation metrics (EM, F1, BLEU, ROUGE).
    """
    start_logits, end_logits = eval_preds
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits = end_logits.cpu().numpy()

    predictions = postprocess_qa_predictions(dataset, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    for ex in dataset:
        ex_id = ex["id"]
        pred = predictions.get(ex_id, "")
        gold = ex["gold_text"]

        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count += 1

    return {
        "exact_match": (total_em / count) * 100.0,
        "f1": (total_f1 / count) * 100.0
    }

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 1.0 if pred_tokens == gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

def normalize_text(s):
    def remove_articles(t):
        return re.sub(r"\b(a|an|the)\b", " ", t)
    def remove_punc(t):
        return re.sub(r"[^\w\s]", "", t)
    def white_space_fix(t):
        return " ".join(t.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

# ✅ Run Evaluation
print("\nEvaluating on test set...")
test_preds = []
with torch.no_grad():
    for example in test_list:
        inputs = {
            "input_ids": torch.tensor(example["input_ids"]).unsqueeze(0).to(device),
            "attention_mask": torch.tensor(example["attention_mask"]).unsqueeze(0).to(device),
        }
        outputs = model(**inputs)
        test_preds.append((outputs.start_logits.cpu().numpy(), outputs.end_logits.cpu().numpy()))

# ✅ Compute Metrics
test_start_logits = np.concatenate([p[0] for p in test_preds], axis=0)
test_end_logits = np.concatenate([p[1] for p in test_preds], axis=0)

test_metrics = compute_metrics((test_start_logits, test_end_logits), test_list)
print("Final Test Metrics:", test_metrics)


Loading test dataset...


  test_list = torch.load(TEST_PATH)


Loading model from ./final_xlmr_tel_answerable_3_v2...





Evaluating on test set...


  "input_ids": torch.tensor(example["input_ids"]).unsqueeze(0).to(device),
  "attention_mask": torch.tensor(example["attention_mask"]).unsqueeze(0).to(device),


Final Test Metrics: {'exact_match': 51.23388581952118, 'f1': 71.65901983874353}


In [3]:
#!/usr/bin/env python
# compare_xlmr_baseline_vs_finetuned.py

import os
import re
import torch
import numpy as np
from datasets import Dataset
from transformers import XLMRobertaForQuestionAnswering, XLMRobertaTokenizerFast
import wandb

############################
# Postprocess + EM/F1
############################
def normalize_text(s):
    """Normalize text for comparison."""
    def remove_articles(t):
        return re.sub(r"\b(a|an|the)\b", " ", t)
    def remove_punc(t):
        return re.sub(r"[^\w\s]", "", t)
    def white_space_fix(t):
        return " ".join(t.split())
    return white_space_fix(remove_articles(remove_punc(s.lower())))

def exact_match(pred, gold):
    """Exact match score."""
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    """F1-score computation."""
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 1.0 if pred_tokens == gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def postprocess_qa_predictions(examples, start_logits, end_logits):
    """Extract answer spans from model outputs."""
    preds = {}
    for i, ex in enumerate(examples):
        offsets = ex["offset_mapping"]
        context = ex["context"]

        if i >= len(start_logits) or i >= len(end_logits):
            preds[ex["id"]] = ""
            continue

        start_idx = int(np.argmax(start_logits[i]))
        end_idx = int(np.argmax(end_logits[i]))

        if start_idx >= len(offsets) or end_idx >= len(offsets) or start_idx > end_idx:
            preds[ex["id"]] = ""
            continue

        start_char = offsets[start_idx][0]
        end_char   = offsets[end_idx][1]
        pred_text  = context[start_char:end_char]
        
        preds[ex["id"]] = pred_text

    return preds

def compute_metrics(eval_preds, dataset):
    """Compute EM and F1 metrics."""
    (start_logits, end_logits) = eval_preds
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits = end_logits.cpu().numpy()

    preds_dict = postprocess_qa_predictions(dataset, start_logits, end_logits)

    total_em, total_f1, count = 0.0, 0.0, 0
    for ex in dataset:
        ex_id = ex["id"]
        pred  = preds_dict.get(ex_id, "")
        gold  = ex["gold_text"]
        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)
        count    += 1

    return {"em": total_em / count * 100.0, "f1": total_f1 / count * 100.0}

############################
# Load Data
############################
def load_data():
    """Load processed datasets."""
    data_dir = "processed_telugu_answerable_data"
    val_list   = torch.load(os.path.join(data_dir, "val.pt"))
    test_list  = torch.load(os.path.join(data_dir, "test.pt"))
    val_dataset   = Dataset.from_list(val_list)
    test_dataset  = Dataset.from_list(test_list)
    return val_dataset, test_dataset

############################
# Model Evaluation
############################
def evaluate_model(model_name, dataset, model_alias):
    """Load a model and evaluate it on the given dataset."""
    print(f"Loading model: {model_name}")
    model = XLMRobertaForQuestionAnswering.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-large")

    # Run inference
    test_preds = []
    with torch.no_grad():
        for example in dataset:
            inputs = {
                "input_ids": torch.tensor(example["input_ids"]).unsqueeze(0).to("cuda"),
                "attention_mask": torch.tensor(example["attention_mask"]).unsqueeze(0).to("cuda"),
            }
            outputs = model(**inputs)
            test_preds.append((outputs.start_logits.cpu().numpy(), outputs.end_logits.cpu().numpy()))

    # Compute metrics
    start_logits = np.concatenate([p[0] for p in test_preds], axis=0)
    end_logits = np.concatenate([p[1] for p in test_preds], axis=0)
    results = compute_metrics((start_logits, end_logits), dataset)
    
    print(f"Results for {model_alias}: {results}")
    return results

############################
# Main Function
############################
def main():
    # Load datasets
    val_dataset, test_dataset = load_data()

    # Initialize W&B for logging
    wandb.init(project="TeQAS_XLMR_Comparison", name="Baseline_vs_FineTuned_XLMR")

    # Evaluate Baseline XLM-R
    baseline_results_val  = evaluate_model("xlm-roberta-large", val_dataset, "Baseline XLM-R (Val)")
    baseline_results_test = evaluate_model("xlm-roberta-large", test_dataset, "Baseline XLM-R (Test)")

    # Evaluate Fine-Tuned XLM-R
    finetuned_results_val  = evaluate_model("./final_xlmr_tel_answerable_3_v2", val_dataset, "Fine-Tuned XLM-R (Val)")
    finetuned_results_test = evaluate_model("./final_xlmr_tel_answerable_3_v2", test_dataset, "Fine-Tuned XLM-R (Test)")

    # Comparison Table
    print("\n### Performance Comparison ###")
    print(f"{'Metric':<20}{'Baseline XLM-R':<20}{'Fine-Tuned XLM-R'}")
    print("-" * 60)
    print(f"{'Validation EM':<20}{baseline_results_val['em']:<20.2f}{finetuned_results_val['em']:.2f}")
    print(f"{'Validation F1':<20}{baseline_results_val['f1']:<20.2f}{finetuned_results_val['f1']:.2f}")
    print(f"{'Test EM':<20}{baseline_results_test['em']:<20.2f}{finetuned_results_test['em']:.2f}")
    print(f"{'Test F1':<20}{baseline_results_test['f1']:<20.2f}{finetuned_results_test['f1']:.2f}")

    # Log results
    wandb.log({
        "Baseline EM (Val)": baseline_results_val["em"],
        "Baseline F1 (Val)": baseline_results_val["f1"],
        "Fine-Tuned EM (Val)": finetuned_results_val["em"],
        "Fine-Tuned F1 (Val)": finetuned_results_val["f1"],
        "Baseline EM (Test)": baseline_results_test["em"],
        "Baseline F1 (Test)": baseline_results_test["f1"],
        "Fine-Tuned EM (Test)": finetuned_results_test["em"],
        "Fine-Tuned F1 (Test)": finetuned_results_test["f1"],
    })

    wandb.finish()
    print("✅ Comparison completed!")

if __name__ == "__main__":
    main()

  val_list   = torch.load(os.path.join(data_dir, "val.pt"))
  test_list  = torch.load(os.path.join(data_dir, "test.pt"))


Loading model: xlm-roberta-large


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results for Baseline XLM-R (Val): {'em': 0.0, 'f1': 3.1289441386538477}
Loading model: xlm-roberta-large


Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Results for Baseline XLM-R (Test): {'em': 0.0, 'f1': 3.48595565464265}
Loading model: ./final_xlmr_tel_answerable_3_v2
Results for Fine-Tuned XLM-R (Val): {'em': 59.53030303030303, 'f1': 76.88073465726582}
Loading model: ./final_xlmr_tel_answerable_3_v2
Results for Fine-Tuned XLM-R (Test): {'em': 51.23388581952118, 'f1': 71.65901983874353}

### Performance Comparison ###
Metric              Baseline XLM-R      Fine-Tuned XLM-R
------------------------------------------------------------
Validation EM       0.00                59.53
Validation F1       3.13                76.88
Test EM             0.00                51.23
Test F1             3.49                71.66


0,1
Baseline EM (Test),▁
Baseline EM (Val),▁
Baseline F1 (Test),▁
Baseline F1 (Val),▁
Fine-Tuned EM (Test),▁
Fine-Tuned EM (Val),▁
Fine-Tuned F1 (Test),▁
Fine-Tuned F1 (Val),▁

0,1
Baseline EM (Test),0.0
Baseline EM (Val),0.0
Baseline F1 (Test),3.48596
Baseline F1 (Val),3.12894
Fine-Tuned EM (Test),51.23389
Fine-Tuned EM (Val),59.5303
Fine-Tuned F1 (Test),71.65902
Fine-Tuned F1 (Val),76.88073


✅ Comparison completed!
