In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
from pathlib import Path
import torch

# Input directory, resolved relative to the current working directory.
base = Path.cwd().joinpath("data-merged", "data", "air-exercise-2", "Part-3")
# Individual TSV files located inside the input directory.
answers_path = base.joinpath("msmarco-fira-21.qrels.qa-answers.tsv")
tuples_path = base.joinpath("msmarco-fira-21.qrels.qa-tuples.tsv")
retrieval_path = base.joinpath("msmarco-fira-21.qrels.retrieval.tsv")

"""
The files are parsed manually because pandas.read_csv() cannot handle them:
the content needs to be cleaned and rows have an inconsistent number of columns.
"""

def parse_answers(answers_path: Path) -> pd.DataFrame:
    """Parse the tab-separated QA answers file into a DataFrame.

    Every non-blank line is stripped and split on tabs. The first three
    fields become ``queryid``, ``documentid`` and ``relevance-grade``; all
    remaining fields are collected into the ``text-selection`` list, so a row
    may carry zero or more selections.

    Args:
        answers_path: Path of the answers TSV file.

    Returns:
        A DataFrame with the columns ``queryid``, ``documentid``,
        ``relevance-grade`` (strings) and ``text-selection`` (list of
        strings). The columns are present even when the file has no rows.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    columns = ["queryid", "documentid", "relevance-grade", "text-selection"]
    answers_data = []  # One dict per parsed line; turned into a frame once at the end

    # Explicit UTF-8 so parsing does not depend on the platform's locale encoding.
    with open(answers_path, "r", encoding="utf-8") as answers_f:
        # readlines() gives tqdm a length, so the progress bar shows a total.
        for line in tqdm(answers_f.readlines()):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            split_line = line.split("\t")
            answers_data.append({
                "queryid": split_line[0],
                "documentid": split_line[1],
                "relevance-grade": split_line[2],
                "text-selection": split_line[3:],
            })

    # Passing the columns keeps the schema stable for empty input.
    return pd.DataFrame(answers_data, columns=columns)

def parse_tuples(tuples_path: Path) -> pd.DataFrame:
    """Parse the tab-separated QA tuples file into a DataFrame.

    Every non-blank line is stripped and split on tabs. The first five fields
    become ``queryid``, ``documentid``, ``relevance-grade``, ``question`` and
    ``context``. Any remaining fields are re-joined with tabs and stripped to
    form the single ``text-selection`` string (empty when there are none).

    Args:
        tuples_path: Path of the tuples TSV file.

    Returns:
        A DataFrame of strings with the columns ``queryid``, ``documentid``,
        ``relevance-grade``, ``question``, ``context`` and ``text-selection``.
        The columns are present even when the file has no rows.

    Raises:
        IndexError: If a non-blank line has fewer than five fields.
    """
    columns = ["queryid", "documentid", "relevance-grade", "question",
               "context", "text-selection"]
    tuples_data = []  # One dict per parsed line; turned into a frame once at the end

    # Explicit UTF-8 so parsing does not depend on the platform's locale encoding.
    with open(tuples_path, "r", encoding="utf-8") as tuples_f:
        # readlines() gives tqdm a length, so the progress bar shows a total.
        for line in tqdm(tuples_f.readlines()):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            split_line = line.split("\t")
            tuples_data.append({
                "queryid": split_line[0],
                "documentid": split_line[1],
                "relevance-grade": split_line[2],
                "question": split_line[3],
                "context": split_line[4],
                # Extra columns all belong to the selection; keep them as one string.
                "text-selection": "\t".join(split_line[5:]).strip(),
            })

    # Passing the columns keeps the schema stable for empty input.
    tuples_df = pd.DataFrame(tuples_data, columns=columns)
    return tuples_df

# Load the QA tuples file into a DataFrame.
tuples = parse_tuples(tuples_path=tuples_path)


100%|██████████| 52606/52606 [00:00<00:00, 180068.99it/s]


In [None]:
# Checkpoint used for both the tokenizer and the question-answering model.
model_name = 'deepset/roberta-base-squad2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Place the model on the GPU when one is available. Inputs moved to 'cuda'
# cannot be fed to a model whose weights are still on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Inference only: make sure dropout and similar layers are disabled.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [None]:

# Number of rows processed per batch.
batch_size = 1_000
# Accumulator for the per-row result records.
results = list()

def process_batch(batch):
    """Run extractive question answering over every row of ``batch``.

    Each row is tokenized as a (question, context) pair and passed through
    the module-level ``model``. The answer span runs from the highest-scoring
    start position to the highest-scoring end position and is decoded back to
    text with the module-level ``tokenizer``.

    Args:
        batch: DataFrame with the columns ``queryid``, ``documentid``,
            ``question``, ``relevance-grade``, ``context`` and
            ``text-selection``.

    Returns:
        A list with one dict per row, holding the six input columns plus
        ``output``, the decoded answer. ``output`` is an empty string when
        the end position precedes the start position or when the span holds
        only special tokens.
    """
    # Follow the model's device instead of assuming 'cuda', so inputs and
    # weights always live on the same device.
    device = model.device

    batch_results = []
    for _, row in batch.iterrows():
        question = row['question']
        context = row['context']

        # Truncate only the context so over-long passages cannot exceed the
        # model's maximum sequence length. `.to()` keeps the tokenizer's own
        # mapping type instead of rebuilding a plain dict.
        inputs = tokenizer(
            question,
            context,
            return_tensors="pt",
            truncation="only_second",
        ).to(device)

        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            output = model(**inputs)

        # Extract the most likely answer span (one sequence per call, so the
        # flattened argmax is the token position).
        answer_start_idx = int(torch.argmax(output.start_logits))
        answer_end_idx = int(torch.argmax(output.end_logits))
        answer_tokens = inputs["input_ids"][0, answer_start_idx: answer_end_idx + 1]
        # Drop special tokens and surrounding whitespace so the stored answer
        # is plain text.
        answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()

        # Save the result
        batch_results.append({
            "queryid": row['queryid'],
            "documentid": row['documentid'],
            "question": row['question'],
            "relevance-grade": row['relevance-grade'],
            "context": row['context'],
            "text-selection": row['text-selection'],
            "output": answer
        })

    return batch_results

# Walk over the tuples one batch at a time, checkpointing after every batch.
for offset in tqdm(range(0, len(tuples), batch_size)):
    # Row slices clamp at the end of the frame, so the last batch may be shorter.
    batch = tuples[offset:offset + batch_size]
    results.extend(process_batch(batch))

    # Release cached GPU memory between batches when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Checkpoint: rewrite the intermediate file with everything collected so far.
    results_path = "./qa_results_intermediate.csv"
    results_df = pd.DataFrame(results)
    results_df.to_csv(results_path, index=False)

# Write the complete result set once all batches are done.
results_path = "./qa_results_final.csv"
results_df = pd.DataFrame(results)
results_df.to_csv(results_path, index=False)
print(f"Final results saved to {results_path}")


100%|██████████| 53/53 [5:15:01<00:00, 356.63s/it]


Final results saved to ./qa_results_final.csv
