### 1) Dataset Generation: SQuAD + AdversarialQA + DROP (DuoRC is loaded but not yet merged)

In [66]:
from datasets import load_dataset
from typing import List, Dict, Any
import pandas as pd

def process_squad(dataset: Any) -> List[Dict[str, str]]:
    """Flatten the SQuAD train split into context/question/answer records.

    Only the first entry of ``answers['text']`` is kept; an example whose
    answer list is empty gets an empty-string answer.

    Args:
        dataset: Mapping with a ``'train'`` entry that iterates over examples
            carrying ``context``, ``question`` and ``answers`` fields.

    Returns:
        One dict per training example with the keys ``context``,
        ``question`` and ``answer``.
    """
    records: List[Dict[str, str]] = []
    for row in dataset['train']:
        context = row['context']
        question = row['question']
        answer_texts = row['answers']['text']
        # Keep only the first reference answer; fall back to "" when none exist.
        first_answer = answer_texts[0] if answer_texts else ""
        records.append(
            {"context": context, "question": question, "answer": first_answer}
        )
    return records

def process_adversarial_qa(dataset: Any) -> List[Dict[str, str]]:
    """Turn the AdversarialQA train split into context/question/answer records.

    The first string in ``answers['text']`` becomes the answer; when that
    list is empty the answer is the empty string.

    Args:
        dataset: Mapping with a ``'train'`` entry that iterates over examples
            carrying ``context``, ``question`` and ``answers`` fields.

    Returns:
        A list with one ``{"context", "question", "answer"}`` dict per example.
    """
    def to_record(sample: Any) -> Dict[str, str]:
        # Convert a single raw example into the unified record layout.
        record = {"context": sample['context'], "question": sample['question']}
        texts = sample['answers']['text']
        record["answer"] = texts[0] if texts else ""
        return record

    return list(map(to_record, dataset['train']))

def process_drop(dataset: Any) -> List[Dict[str, str]]:
    """Map the DROP train split onto context/question/answer records.

    The ``passage`` field is used as the context and the first element of
    ``answers_spans['spans']`` as the answer (empty string when there are
    no spans).

    Args:
        dataset: Mapping with a ``'train'`` entry that iterates over examples
            carrying ``passage``, ``question`` and ``answers_spans`` fields.

    Returns:
        A list with one ``{"context", "question", "answer"}`` dict per example.
    """
    rows: List[Dict[str, str]] = []
    for item in dataset['train']:
        passage = item['passage']
        question = item['question']
        spans = item['answers_spans']['spans']
        rows.append(
            {
                "context": passage,
                "question": question,
                # Only the first answer span is retained.
                "answer": spans[0] if spans else "",
            }
        )
    return rows

def process_duorc(dataset: Any) -> List[Dict[str, str]]:
    """Convert the DuoRC train split into context/question/answer records.

    The ``plot`` field is used as the context. Only the first answer is
    kept; an example with no answers gets an empty-string answer.

    Args:
        dataset: Mapping with a ``'train'`` entry that iterates over examples
            carrying ``plot``, ``question`` and ``answers`` fields.
            ``answers`` is treated as a sequence of strings
            (NOTE(review): confirm against the DuoRC dataset card).

    Returns:
        A list with one ``{"context", "question", "answer"}`` dict per example.
    """
    return [
        {
            "context": example['plot'],
            "question": example['question'],
            # Test the answers sequence itself for emptiness; indexing it
            # with the string key '0' raises TypeError on a list.
            "answer": example['answers'][0] if example['answers'] else ""
        }
        for example in dataset['train']
    ]
def load_datasets():
    """Load the four source QA datasets and return them keyed by name.

    Returns:
        A dict with the keys ``"SQuAD"``, ``"AdversarialQA"``, ``"DROP"`` and
        ``"DUO_RC"``, each holding whatever ``load_dataset`` returns for the
        corresponding source.
    """
    # (result key, positional arguments forwarded to load_dataset)
    sources = [
        ("SQuAD", ("squad",)),
        ("AdversarialQA", ("adversarial_qa", "adversarialQA")),
        ("DROP", ("ucinlp/drop",)),
        ("DUO_RC", ("ibm/duorc", "ParaphraseRC")),
    ]

    print("Loading datasets...")
    datasets = {key: load_dataset(*args) for key, args in sources}
    print("Datasets loaded successfully.")
    return datasets

def process_datasets(datasets):
    """Merge the processed SQuAD, AdversarialQA and DROP train splits.

    Args:
        datasets: Mapping with the keys ``"SQuAD"``, ``"AdversarialQA"`` and
            ``"DROP"``, e.g. the dict returned by ``load_datasets``.

    Returns:
        A pandas DataFrame built from the concatenated records, in the order
        SQuAD, AdversarialQA, DROP.
    """
    merged_data = []

    print("Processing SQuAD...")
    merged_data.extend(process_squad(datasets["SQuAD"]))
    # The progress label names the dataset that is actually processed here.
    print("Processing AdversarialQA...")
    merged_data.extend(process_adversarial_qa(datasets["AdversarialQA"]))
    print("Processing DROP...")
    merged_data.extend(process_drop(datasets["DROP"]))

    # NOTE(review): datasets["DUO_RC"] is loaded by load_datasets but is not
    # merged here -- confirm whether DuoRC should be part of the output.

    # Convert to DataFrame for easy handling
    df = pd.DataFrame(merged_data)
    return df

In [67]:
# Load every source dataset once; the cells below reuse this dict.
loaded_datasets = load_datasets()

Loading datasets...


Downloading readme:   0%|          | 0.00/6.91k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/77400 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9535 [00:00<?, ? examples/s]

Datasets loaded successfully.


In [68]:
def view_squad(dataset):
    """Print a heading and display the first rows of the SQuAD train split.

    Args:
        dataset: Mapping whose ``'train'`` entry can be turned into a
            pandas DataFrame.
    """
    preview = pd.DataFrame(dataset['train']).head()
    print("SQuAD Sample:")
    display(preview)

def view_adversarial_qa(dataset):
    """Print a heading and display the first rows of the AdversarialQA train split.

    Args:
        dataset: Mapping whose ``'train'`` entry can be turned into a
            pandas DataFrame.
    """
    train_frame = pd.DataFrame(dataset['train'])
    print("AdversarialQA Sample:")
    head_rows = train_frame.head()
    display(head_rows)

def view_drop(dataset):
    """Print a heading and display the first rows of the DROP train split.

    Args:
        dataset: Mapping whose ``'train'`` entry can be turned into a
            pandas DataFrame.
    """
    sample_rows = pd.DataFrame(dataset['train']).head()
    print("DROP Sample:")
    display(sample_rows)

In [69]:
# Preview the first rows of the raw SQuAD train split.
view_squad(loaded_datasets['SQuAD'])

SQuAD Sample:


Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [70]:
# Preview the first rows of the raw AdversarialQA train split.
view_adversarial_qa(loaded_datasets['AdversarialQA'])

AdversarialQA Sample:


Unnamed: 0,id,title,context,question,answers,metadata
0,7ba1e8f4261d3170fcf42e84a81dd749116fae95,Brain,Another approach to brain function is to exami...,What sare the benifts of the blood brain barrir?,"{'text': ['isolated from the bloodstream'], 'a...","{'split': 'train', 'model_in_the_loop': 'Combi..."
1,5ec5ef305a259311596e85d811ade30bd68b079d,Brain,Another approach to brain function is to exami...,What is surrounded by cerebrospinal fluid?,"{'text': ['brain'], 'answer_start': [280]}","{'split': 'train', 'model_in_the_loop': 'Combi..."
2,7cb230edfb15ad1fda8d157af1f2b574cbb02b4c,Brain,Another approach to brain function is to exami...,What does the skull protect?,"{'text': ['brain'], 'answer_start': [280]}","{'split': 'train', 'model_in_the_loop': 'Combi..."
3,e1850f2a48b8f7c2231cec41ed63c1b638a8e2c7,Brain,Another approach to brain function is to exami...,What has been injected into rats to produce pr...,"{'text': ['chemicals'], 'answer_start': [723]}","{'split': 'train', 'model_in_the_loop': 'Combi..."
4,7bc0ae1a8a24ea4f3398b5236ab9569bbc3e820b,Brain,Another approach to brain function is to exami...,What can cause issues with how the brain works?,"{'text': ['brain damage'], 'answer_start': [409]}","{'split': 'train', 'model_in_the_loop': 'Combi..."


In [71]:
# Preview the first rows of the raw DROP train split.
view_drop(loaded_datasets['DROP'])

DROP Sample:


Unnamed: 0,section_id,query_id,passage,question,answers_spans
0,nfl_2201,f16c0ee7-f131-4a8b-a6ac-4d275ea68066,"To start the season, the Lions traveled south ...",How many points did the buccaneers need to tie...,"{'spans': ['3'], 'types': ['number']}"
1,nfl_2201,c9582e03-b01b-42ed-83e0-b90a5334aefa,"To start the season, the Lions traveled south ...",How many field goals did the Lions score?,"{'spans': ['2'], 'types': ['number']}"
2,nfl_2201,f703d43d-73fa-4fda-8913-d81bd5569700,"To start the season, the Lions traveled south ...",How long was the Lion's longest field goal?,"{'spans': ['28-yard'], 'types': ['span']}"
3,nfl_2201,2fd4f473-af2b-44ce-929a-20c82fa6be2c,"To start the season, the Lions traveled south ...",Who caught the touchdown for the fewest yard?,"{'spans': ['Mike Williams'], 'types': ['span']}"
4,nfl_2201,6592e06d-4ad6-484f-a9a5-5cb72c76dfee,"To start the season, the Lions traveled south ...",Who caught the shortest touchdown pass?,"{'spans': ['Calvin Johnson'], 'types': ['span']}"


In [72]:
# Merge the processed SQuAD, AdversarialQA and DROP train splits into one DataFrame
merged_dataset = process_datasets(loaded_datasets)

Processing SQuAD...
Processing NewsQA...
Processing DROP...


In [73]:
# View the merged dataset (columns: context, question, answer)
merged_dataset

Unnamed: 0,context,question,answer
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,the Main Building
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,a Marian place of prayer and reflection
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,a golden statue of the Virgin Mary
...,...,...,...
194994,Ireland tends towards independence in foreign ...,Which countries was the Shannon Airport used b...,Iraq
194995,Ireland tends towards independence in foreign ...,How many areas did Ireland contribute to peace...,4
194996,Ireland tends towards independence in foreign ...,Who did Irish soldiers enlist with in World Wa...,British armed forces
194997,Ireland tends towards independence in foreign ...,Which military used Shannon Airport in 2003?,United States military
