In [2]:
#!/usr/bin/env python
# preprocess_answerable_xlmr_wiki.py

import json
import os
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import XLMRobertaTokenizerFast

# Input and output paths
input_json = "../TeQAS 1.2/telugu_wiki.json"  # Your single input JSON file
out_dir = "processed_telugu_wiki_xlmr"  # Output directory
os.makedirs(out_dir, exist_ok=True)

max_length = 512
model_tokenizer_name = "xlm-roberta-large"  # or "xlm-roberta-base" if GPU is limited

###############################################
# 1) Filter out unanswerable QAs
###############################################
def filter_answerable_squad(input_path):
    """
    Returns a new SQuAD JSON dict containing only QAs where is_impossible=False
    with at least one answer.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_data = {
        "version": data.get("version", "filtered_telugu_wiki"),
        "data": []
    }
    for article in data["data"]:
        new_paragraphs = []
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            new_qas = []
            for qa in paragraph["qas"]:
                if not qa.get("is_impossible", False) and qa.get("answers"):
                    new_qas.append(qa)
            if new_qas:
                new_paragraphs.append({
                    "context": context,
                    "qas": new_qas
                })
        if new_paragraphs:
            new_data["data"].append({
                "title": article.get("title", ""),
                "paragraphs": new_paragraphs
            })
    return new_data

###############################################
# 2) Build offset-based examples
###############################################
def build_answerable_examples(squad_data, tokenizer, max_length=384):
    """
    For each answerable QA:
      - tokenize question+context
      - find start/end token indices
      - store offset_mapping, context, gold_text, etc.
    """
    examples_out = []
    for article in tqdm(squad_data["data"], desc="Processing articles"):
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                ans = qa["answers"][0]
                ans_start = ans["answer_start"]
                ans_text = ans["text"]
                ans_end = ans_start + len(ans_text)

                enc = tokenizer(
                    qa["question"],
                    context,
                    max_length=max_length,
                    truncation="only_second",
                    return_offsets_mapping=True,
                    return_tensors="pt",
                    padding="max_length"
                )

                input_ids = enc["input_ids"][0]
                attention_mask = enc["attention_mask"][0]
                offset_mapping = enc["offset_mapping"][0].tolist()

                # find start/end token indices
                start_token = None
                end_token = None
                for i, (start_char, end_char) in enumerate(offset_mapping):
                    if start_char <= ans_start < end_char:
                        start_token = i
                    if start_char < ans_end <= end_char:
                        end_token = i

                # fallback if mismatch
                if start_token is None or end_token is None or end_token < start_token:
                    start_token = 0
                    end_token = 0

                ex_item = {
                    "id": qa["id"],
                    "input_ids": input_ids,
                    "attention_mask": attention_mask,
                    "start_positions": torch.tensor(start_token, dtype=torch.long),
                    "end_positions": torch.tensor(end_token, dtype=torch.long),
                    "offset_mapping": offset_mapping,
                    "context": context,
                    "gold_text": ans_text
                }
                examples_out.append(ex_item)
    return examples_out

def main():
    print(f"Using tokenizer: {model_tokenizer_name}")
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_tokenizer_name)

    # Filter and build examples
    print("Filtering + building wiki data (TELUGU) ...")
    wiki_data_raw = filter_answerable_squad(input_json)
    wiki_examples = build_answerable_examples(wiki_data_raw, tokenizer, max_length)
    print(f"Wiki answerable examples size: {len(wiki_examples)}")

    # Save as .pt
    output_file = os.path.join(out_dir, "wiki_examples.pt")
    torch.save(wiki_examples, output_file)

    print(f"\nSaved processed file to {output_file}")
    print("Done! Telugu wiki preprocessing completed with XLM-RoBERTa.")

if __name__ == "__main__":
    main()

Using tokenizer: xlm-roberta-large




Filtering + building wiki data (TELUGU) ...


Processing articles: 100%|██████████| 199/199 [00:04<00:00, 48.49it/s]


Wiki answerable examples size: 947

Saved processed file to processed_telugu_wiki_xlmr/wiki_examples.pt
Done! Telugu wiki preprocessing completed with XLM-RoBERTa.


In [3]:
#!/usr/bin/env python
# evaluate_tydiqa_telugu.py

import os
import re
import torch
import numpy as np
from datasets import Dataset
from transformers import (
    XLMRobertaForQuestionAnswering,
    XLMRobertaTokenizerFast
)

# ------------------
# Adjust paths here
# ------------------
DATA_PATH = "processed_telugu_wiki_xlmr/wiki_examples.pt"

MODEL_PATH = "./final_xlmr_tel_answerable_3_v2"  # Path to your fine-tuned QA model

print("\n[INFO] Loading processed dataset...")
data_list = torch.load(DATA_PATH)
dataset = Dataset.from_list(data_list)

print(f"[INFO] Loading model from {MODEL_PATH}...")
model = XLMRobertaForQuestionAnswering.from_pretrained(MODEL_PATH)
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-large")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def postprocess_qa_predictions(examples, start_logits, end_logits):
    """
    Convert model logits into text predictions by:
      - Finding best start/end
      - Using offset_mapping to slice the original context
    """
    preds = {}
    num_examples = len(examples)

    for i in range(num_examples):
        ex = examples[i]
        offsets = ex["offset_mapping"]
        context = ex["context"]

        # If mismatch in array sizes, skip
        if i >= len(start_logits) or i >= len(end_logits):
            preds[ex["id"]] = ""
            continue

        start_idx = int(np.argmax(start_logits[i]))
        end_idx   = int(np.argmax(end_logits[i]))

        # Check valid indices
        if (
            start_idx >= len(offsets) or
            end_idx   >= len(offsets) or
            start_idx > end_idx
        ):
            preds[ex["id"]] = ""
            continue

        start_char = offsets[start_idx][0]
        end_char   = offsets[end_idx][1]
        pred_text  = context[start_char:end_char]

        preds[ex["id"]] = pred_text

    return preds

def compute_metrics(eval_preds, examples):
    """
    Compute EM and F1 on the predictions vs. gold_text.
    """
    start_logits, end_logits = eval_preds

    # Convert any torch.Tensors to numpy
    if isinstance(start_logits, torch.Tensor):
        start_logits = start_logits.cpu().numpy()
    if isinstance(end_logits, torch.Tensor):
        end_logits = end_logits.cpu().numpy()

    predictions = postprocess_qa_predictions(examples, start_logits, end_logits)

    total_em, total_f1 = 0.0, 0.0
    for ex_idx, ex in enumerate(examples):
        ex_id = ex["id"]
        pred  = predictions.get(ex_id, "")
        gold  = ex["gold_text"]

        total_em += exact_match(pred, gold)
        total_f1 += f1_score(pred, gold)

    count = len(examples)
    return {
        "exact_match": 100.0 * total_em / count,
        "f1":          100.0 * total_f1 / count
    }

def exact_match(pred, gold):
    return 1.0 if normalize_text(pred) == normalize_text(gold) else 0.0

def f1_score(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()

    common   = set(pred_tokens) & set(gold_tokens)
    num_same = len(common)
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 1.0 if pred_tokens == gold_tokens else 0.0
    precision = num_same / len(pred_tokens)
    recall    = num_same / len(gold_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def normalize_text(s):
    """
    Lower text and remove punctuation, articles, extra whitespace.
    """
    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)
    def remove_punc(text):
        return re.sub(r"[^\w\s]", "", text)
    def white_space_fix(text):
        return " ".join(text.split())

    s = s.lower()
    s = remove_articles(s)
    s = remove_punc(s)
    s = white_space_fix(s)
    return s

print("\n[INFO] Running inference on all examples...")
start_logits_list = []
end_logits_list   = []

with torch.no_grad():
    for example in data_list:
        input_ids      = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        start_logits_list.append(outputs.start_logits.cpu().numpy())
        end_logits_list.append(outputs.end_logits.cpu().numpy())

# Concatenate to get final arrays
start_logits = np.concatenate(start_logits_list, axis=0)
end_logits   = np.concatenate(end_logits_list,   axis=0)

print("[INFO] Computing metrics...")
metrics = compute_metrics((start_logits, end_logits), data_list)

print("\n===== Final Evaluation Metrics =====")
for k, v in metrics.items():
    print(f"{k}: {v:.2f}")
print("====================================")


[INFO] Loading processed dataset...


  data_list = torch.load(DATA_PATH)


[INFO] Loading model from ./final_xlmr_tel_answerable_3_v2...





[INFO] Running inference on all examples...


  input_ids      = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
  attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(device)


[INFO] Computing metrics...

===== Final Evaluation Metrics =====
exact_match: 69.06
f1: 84.32
