### 1) Dataset Generation: SQuAD + AdversarialQA

In [58]:
from datasets import load_dataset
from typing import List, Dict, Any
import pandas as pd

def process_squad(dataset: Any) -> List[Dict[str, str]]:
    """Flatten the SQuAD ``train`` split into context/question/answer records.

    Args:
        dataset: Mapping-like object whose ``'train'`` entry iterates over
            examples carrying ``'context'``, ``'question'`` and an
            ``'answers'`` mapping with a ``'text'`` list.

    Returns:
        One dict per training example with the keys ``context``,
        ``question`` and ``answer``. ``answer`` is the first entry of the
        example's answer texts, or ``""`` when that list is empty.
    """
    records: List[Dict[str, str]] = []
    for row in dataset['train']:
        passage = row['context']
        query = row['question']
        answer_texts = row['answers']['text']
        # Only the first listed answer is kept; extra answers are dropped.
        first_answer = answer_texts[0] if answer_texts else ""
        records.append({
            "context": passage,
            "question": query,
            "answer": first_answer,
        })
    return records

def process_adversarial_qa(dataset: Any) -> List[Dict[str, str]]:
    """Flatten the AdversarialQA ``train`` split into context/question/answer records.

    Args:
        dataset: Mapping-like object whose ``'train'`` entry iterates over
            examples carrying ``'context'``, ``'question'`` and an
            ``'answers'`` mapping with a ``'text'`` list.

    Returns:
        One dict per training example with the keys ``context``,
        ``question`` and ``answer``. ``answer`` is the first answer text, or
        ``""`` when the example lists none.
    """
    def to_record(sample: Any) -> Dict[str, str]:
        # Build the record field by field, then attach the first answer (if any).
        record = {
            "context": sample['context'],
            "question": sample['question'],
        }
        texts = sample['answers']['text']
        record["answer"] = texts[0] if texts else ""
        return record

    return [to_record(sample) for sample in dataset['train']]

def load_datasets():
    """Fetch the SQuAD and AdversarialQA corpora via ``load_dataset``.

    Prints a progress line before and after loading.

    Returns:
        dict: ``{"SQuAD": ..., "AdversarialQA": ...}`` where each value is
        whatever ``load_dataset`` returned for that corpus.
    """
    print("Loading datasets...")

    corpora = {}
    corpora["SQuAD"] = load_dataset("squad")
    # Second positional argument selects the "adversarialQA" configuration.
    corpora["AdversarialQA"] = load_dataset("adversarial_qa", "adversarialQA")

    print("Datasets loaded successfully.")
    return corpora

def process_datasets(datasets):
    """Merge the SQuAD and AdversarialQA training examples into one DataFrame.

    Args:
        datasets: Mapping with ``"SQuAD"`` and ``"AdversarialQA"`` entries;
            each entry is handed to ``process_squad`` /
            ``process_adversarial_qa`` respectively.

    Returns:
        pandas.DataFrame with the columns ``context``, ``question`` and
        ``answer``. SQuAD rows come first, followed by AdversarialQA rows.
        The columns are present even when both corpora yield no rows.
    """
    print("Processing datasets...")
    merged_data = []

    print("Processing SQuAD...")
    merged_data.extend(process_squad(datasets["SQuAD"]))

    # The progress label must name the corpus that is actually processed here.
    print("Processing AdversarialQA...")
    merged_data.extend(process_adversarial_qa(datasets["AdversarialQA"]))

    # Explicit columns keep the schema stable when there are no records at all.
    return pd.DataFrame(merged_data, columns=["context", "question", "answer"])

In [59]:
# Fetch both corpora once; the preview and processing cells below reuse `loaded_datasets`.
loaded_datasets = load_datasets()

Loading datasets...


Downloading data:   0%|          | 0.00/4.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/457k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Datasets loaded successfully.


In [60]:
def view_squad(dataset):
    """Show the first rows of the SQuAD ``train`` split.

    Args:
        dataset: Mapping-like object whose ``'train'`` entry can be passed
            to ``pd.DataFrame``.
    """
    train_frame = pd.DataFrame(dataset['train'])
    print("SQuAD Sample:")
    preview = train_frame.head()
    display(preview)

def view_adversarial_qa(dataset):
    """Show the first rows of the AdversarialQA ``train`` split.

    Args:
        dataset: Mapping-like object whose ``'train'`` entry can be passed
            to ``pd.DataFrame``.
    """
    train_frame = pd.DataFrame(dataset['train'])
    print("AdversarialQA Sample:")
    preview = train_frame.head()
    display(preview)

In [61]:
# Preview the first rows of the raw SQuAD training split.
view_squad(loaded_datasets['SQuAD'])

SQuAD Sample:


Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [62]:
# Preview the first rows of the raw AdversarialQA training split.
view_adversarial_qa(loaded_datasets['AdversarialQA'])

AdversarialQA Sample:


Unnamed: 0,id,title,context,question,answers,metadata
0,7ba1e8f4261d3170fcf42e84a81dd749116fae95,Brain,Another approach to brain function is to exami...,What sare the benifts of the blood brain barrir?,"{'text': ['isolated from the bloodstream'], 'a...","{'split': 'train', 'model_in_the_loop': 'Combi..."
1,5ec5ef305a259311596e85d811ade30bd68b079d,Brain,Another approach to brain function is to exami...,What is surrounded by cerebrospinal fluid?,"{'text': ['brain'], 'answer_start': [280]}","{'split': 'train', 'model_in_the_loop': 'Combi..."
2,7cb230edfb15ad1fda8d157af1f2b574cbb02b4c,Brain,Another approach to brain function is to exami...,What does the skull protect?,"{'text': ['brain'], 'answer_start': [280]}","{'split': 'train', 'model_in_the_loop': 'Combi..."
3,e1850f2a48b8f7c2231cec41ed63c1b638a8e2c7,Brain,Another approach to brain function is to exami...,What has been injected into rats to produce pr...,"{'text': ['chemicals'], 'answer_start': [723]}","{'split': 'train', 'model_in_the_loop': 'Combi..."
4,7bc0ae1a8a24ea4f3398b5236ab9569bbc3e820b,Brain,Another approach to brain function is to exami...,What can cause issues with how the brain works?,"{'text': ['brain damage'], 'answer_start': [409]}","{'split': 'train', 'model_in_the_loop': 'Combi..."


In [63]:
# Flatten both corpora into a single context/question/answer DataFrame
merged_dataset = process_datasets(loaded_datasets)

Processing datasets...
Processing SQuAD...
Processing NewsQA...


In [65]:
# View the merged dataset (bare last expression, so the notebook renders the DataFrame)
merged_dataset

Unnamed: 0,context,question,answer
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary
...,...,...,...
117594,"Some high-speed black-and-white films, such as...",What is the lowest ISO mentioned?,400
117595,"Some high-speed black-and-white films, such as...",What is the highest ISO mentioned?,3200
117596,"Some high-speed black-and-white films, such as...",What is Kodak'sbrand name of E6 film?,Ektachrome
117597,"Some high-speed black-and-white films, such as...",How do these films differ?,high-speed black-and-white films
