### 1) Dataset Generation: SQuAD + Trivia QA + Hotpot QA + Natural Questions QA

In [8]:
from datasets import load_dataset
from typing import List, Dict, Any
import pandas as pd

def process_squad(dataset: Any) -> List[Dict[str, str]]:
    return [
        {
            "context": example['context'],
            "question": example['question'],
            "answer": example['answers']['text'][0] if example['answers']['text'] else ""
        }
        for example in dataset['train']
    ]

def process_trivia_qa(dataset: Any) -> List[Dict[str, str]]:
    return [
        {
            "context": example['entity_pages']['wiki_context'][0] if example['entity_pages']['wiki_context'] else "",
            "question": example['question'],
            "answer": example['answer']['value']
        }
        for example in dataset['train']
        if example['entity_pages']['wiki_context']
    ]

def process_hotpot_qa(dataset: Any) -> List[Dict[str, str]]:
    processed_data = []
    for example in dataset['train']:
        context = ""
        for title, sentences in example['context']:
            context += f"{title}: {' '.join(sentences)} "
        processed_data.append({
            "context": context.strip(),
            "question": example['question'],
            "answer": example['answer']
        })
    return processed_data

def load_datasets():
    print("Loading datasets...")
    datasets = {
        "SQuAD": load_dataset("squad"),
        "TriviaQA": load_dataset("trivia_qa", "unfiltered"),
        "HotpotQA": load_dataset("hotpot_qa", "distractor", trust_remote_code=True)
    }
    print("Datasets loaded successfully.")
    return datasets

def process_datasets(datasets):
    print("Processing datasets...")
    merged_data = []

    print("Processing SQuAD...")
    merged_data.extend(process_squad(datasets["SQuAD"]))
    print("Processing TriviaQA...")
    merged_data.extend(process_trivia_qa(datasets["TriviaQA"]))
    print("Processing HotpotQA...")
    merged_data.extend(process_hotpot_qa(datasets["HotpotQA"]))

    # Convert to DataFrame for easy handling
    df = pd.DataFrame(merged_data)
    return df

In [9]:
loaded_datasets = load_datasets()

Loading datasets...


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/47 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/36 [00:00<?, ?it/s]


SQuAD Sample:
Example 1:
id

Example 2:
title

Example 3:
context

Example 4:
question

Example 5:
answers


TriviaQA Sample:
Example 1:
question

Example 2:
question_id

Example 3:
question_source

Example 4:
entity_pages

Example 5:
search_results

Example 6:
answer


HotpotQA Sample:
Example 1:
id

Example 2:
question

Example 3:
answer

Example 4:
type

Example 5:
level

Example 6:
supporting_facts

Example 7:
context

Datasets: SQuAD, TriviaQA, HotpotQA loaded successfully!


In [10]:
squad_df = pd.DataFrame(loaded_datasets["SQuAD"]["train"])
print("SQuAD Dataset:")
print(squad_df.head())

Processing datasets...
Processing SQuAD...
Processing SQuAD...
Processing TriviaQA...
Processing TriviaQA...
Processing HotpotQA...
Processing HotpotQA...


ValueError: too many values to unpack (expected 2)

In [None]:
trivia_qa_df = pd.DataFrame(loaded_datasets["TriviaQA"]["train"])
print("TriviaQA Dataset:")
print(trivia_qa_df.head())

In [None]:
hotpot_qa_df = pd.DataFrame(loaded_datasets["HotpotQA"]["train"])
print("HotpotQA Dataset:")
print(hotpot_qa_df.head())

In [None]:
# Process the datasets
merged_dataset = process_datasets(loaded_datasets)

In [None]:
# View the merged dataset
print("Merged Dataset:")
print(merged_dataset.head())