In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm # shows progress
from pathlib import Path
import torch


# Exercise data lives below the current working directory; every input file
# path is derived from this single base directory.
base = Path.cwd().joinpath("data-merged", "data", "air-exercise-2", "Part-3")
answers_path = base.joinpath("msmarco-fira-21.qrels.qa-answers.tsv")
tuples_path = base.joinpath("msmarco-fira-21.qrels.qa-tuples.tsv")
retrieval_path = base.joinpath("msmarco-fira-21.qrels.retrieval.tsv")

"""
Manual parsing is used because pandas.read_csv() fails on these files:
the content needs cleaning and rows have an inconsistent number of columns.
"""

def parse_answers(answers_path: Path) -> pd.DataFrame:
    """Parse the tab-separated answers file into a DataFrame.

    Each line is split on tabs by hand because rows carry a variable number
    of trailing fields. The first three fields are the query id, the document
    id and the relevance grade; every remaining field is collected into the
    ``text-selection`` list.

    Args:
        answers_path: Path of the tab-separated answers file.

    Returns:
        A DataFrame with the columns ``queryid``, ``documentid`` and
        ``relevance-grade`` (kept as the strings read from the file) and
        ``text-selection`` (a list of strings, possibly empty). There is one
        row per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    columns = ["queryid", "documentid", "relevance-grade", "text-selection"]
    records = []
    # Explicit encoding keeps parsing platform-independent.
    # NOTE(review): assumes the file is UTF-8 encoded - confirm.
    with open(answers_path, "r", encoding="utf-8") as answers_f:
        # readlines() (rather than lazy iteration) lets tqdm show a total.
        for line in tqdm(answers_f.readlines()):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            split_line = stripped.split("\t")
            records.append({
                "queryid": split_line[0],
                "documentid": split_line[1],
                "relevance-grade": split_line[2],
                "text-selection": split_line[3:],
            })
    # Build the frame once: DataFrame.append copied the whole frame on every
    # row and no longer exists in pandas >= 2.0.
    return pd.DataFrame(records, columns=columns)

def parse_tuples(tuples_path: Path, limit=10) -> pd.DataFrame:
    """Parse the tab-separated QA tuples file into a DataFrame.

    Each line is split on tabs by hand because rows carry a variable number
    of trailing fields. The first five fields are the query id, document id,
    relevance grade, question and context; any remaining fields are re-joined
    with tabs and stripped to form the ``text-selection`` string.

    Args:
        tuples_path: Path of the tab-separated tuples file.
        limit: Number of leading rows to return, applied with
            ``DataFrame.head``. The default of 10 keeps the behaviour this
            function has always had; pass ``None`` to get every parsed row.

    Returns:
        A DataFrame with the columns ``queryid``, ``documentid``,
        ``relevance-grade``, ``question``, ``context`` and ``text-selection``
        (all kept as the strings read from the file). There is one row per
        non-blank line, in file order, truncated to ``limit`` rows.

    Raises:
        IndexError: If a non-blank line has fewer than five fields.
    """
    columns = ["queryid", "documentid", "relevance-grade", "question",
               "context", "text-selection"]
    records = []
    # Explicit encoding keeps parsing platform-independent.
    # NOTE(review): assumes the file is UTF-8 encoded - confirm.
    with open(tuples_path, "r", encoding="utf-8") as tuples_f:
        # readlines() (rather than lazy iteration) lets tqdm show a total.
        for line in tqdm(tuples_f.readlines()):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            split_line = stripped.split("\t")
            records.append({
                "queryid": split_line[0],
                "documentid": split_line[1],
                "relevance-grade": split_line[2],
                "question": split_line[3],
                "context": split_line[4],
                # The selection may itself contain tabs; restore them.
                "text-selection": "\t".join(split_line[5:]).strip(),
            })
    # Build the frame once: DataFrame.append copied the whole frame on every
    # row and no longer exists in pandas >= 2.0.
    tuples = pd.DataFrame(records, columns=columns)
    return tuples if limit is None else tuples.head(limit)

# Parse the QA tuples file; parse_tuples() returns only the first 10 rows.
tuples = parse_tuples(tuples_path)

# Plain-text dump of the parsed rows as a quick sanity check.
print(tuples)

100%|██████████| 52606/52606 [02:20<00:00, 375.19it/s]

   queryid documentid relevance-grade  \
0   135386     100163               3   
1   290779     101026               3   
2    21741    1021598               3   
3   810210    1029662               3   
4  1097448     103635               3   
5    36133     103776               3   
6    85018    1042657               2   
7   987100      10462               2   
8   709560    1050990               3   
9   285729    1054505               2   

                                            question  \
0                          definition of imagination   
1  how many oscars has clint eastwood won?pdrijgh...   
2         are cold sores and fever blisters the same   
3            what is the cause of blood in the stool   
4              how many calories in slim fast shakes   
5                             average download speed   
6                                causes for shingles   
7              where is magma found within our earth   
8               what is all in basic metabol




In [2]:
# Checkpoint identifier shared by both from_pretrained() calls below, so the
# tokenizer and the model are loaded from the same checkpoint.
model_name = 'deepset/roberta-base-squad2'
# Load the tokenizer and the question-answering model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


Downloading: 100%|██████████| 571/571 [00:00<00:00, 170kB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 1.41MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 1.05MB/s]
Downloading: 100%|██████████| 772/772 [00:00<00:00, 227kB/s]
Downloading: 100%|██████████| 79.0/79.0 [00:00<00:00, 17.2kB/s]
Downloading: 100%|██████████| 496M/496M [00:39<00:00, 12.5MB/s] 


In [7]:
# Encode the first (question, context) pair as PyTorch tensors.
# truncation="only_second" shortens only the context when the pair exceeds
# the tokenizer's maximum length; without it an over-long context is passed
# to the model untruncated.
inputs0 = tokenizer(
    tuples['question'][0],
    tuples['context'][0],
    truncation="only_second",
    return_tensors="pt",
)
# Inference only: disable autograd so no computation graph is built and kept
# alive for the forward pass.
with torch.no_grad():
    output0 = model(**inputs0)

: 

In [6]:
# Display the raw model output computed for the first question/context pair.
output0

'how many oscars has clint eastwood won?pdrijgheposrgijapeoikgjpesoar'