# Preparing Q&A Datasets
This notebook will download and prepare Q&A datasets for training a feed forward network/routing network. The following datasets will be prepared: 
- ARC AI2 Reasoning Challenge: 7,787 grade-school science questions
- OpenBookQA: 5,957 science and reasoning questions
- MathQA: ~37,000 math word problems
- CommonsenseQA: 12,102 general reasoning questions

In [28]:
import pandas
from datasets import load_dataset, concatenate_datasets, Dataset
import os
import json
import re

### ARC AI2 Reasoning Challenge

https://huggingface.co/datasets/allenai/ai2_arc


In [29]:
# Load both ARC configurations; each result is indexed by split name further down
ds_arc1, ds_arc2 = (
    load_dataset("allenai/ai2_arc", config_name)
    for config_name in ("ARC-Challenge", "ARC-Easy")
)

# Add the "source" field to each example
def add_source_field(example, source_name: str):
    """Tag a single dataset example with the name of the dataset it came from.

    Used in this cell as the per-example callback passed to ``Dataset.map``.

    Args:
        example: Mapping holding one row; it is modified in place.
        source_name: Value stored under the ``'source'`` key.

    Returns:
        The same ``example`` object, now carrying the ``'source'`` key.
    """
    example['source'] = source_name
    return example

# Keyword arguments forwarded by Dataset.map to add_source_field for each configuration
arc_challenge_kwargs = {"source_name": "ARC-Challenge"}
arc_easy_kwargs = {"source_name": "ARC-Easy"}

# Tag every split of ARC-Challenge
ds_arc1_train = ds_arc1['train'].map(add_source_field, fn_kwargs=arc_challenge_kwargs)
ds_arc1_test = ds_arc1['test'].map(add_source_field, fn_kwargs=arc_challenge_kwargs)
ds_arc1_validation = ds_arc1['validation'].map(add_source_field, fn_kwargs=arc_challenge_kwargs)

# Tag every split of ARC-Easy
ds_arc2_train = ds_arc2['train'].map(add_source_field, fn_kwargs=arc_easy_kwargs)
ds_arc2_test = ds_arc2['test'].map(add_source_field, fn_kwargs=arc_easy_kwargs)
ds_arc2_validation = ds_arc2['validation'].map(add_source_field, fn_kwargs=arc_easy_kwargs)

# Merge the six tagged splits into one dataset (Challenge splits first, then Easy)
arc_splits = [
    ds_arc1_train, ds_arc1_test, ds_arc1_validation,
    ds_arc2_train, ds_arc2_test, ds_arc2_validation,
]
arc_train_set = concatenate_datasets(arc_splits)

# Write the merged dataset to disk
arc_train_set.save_to_disk("../datasets/ARC_AI2.hf")

Saving the dataset (1/1 shards): 100%|██████████| 7787/7787 [00:00<00:00, 318178.72 examples/s]


### OpenBookQA

https://huggingface.co/datasets/allenai/openbookqa

In [30]:
# Load both OpenBookQA configurations; each result is indexed by split name further down
ds_ob1, ds_ob2 = (
    load_dataset("allenai/openbookqa", config_name)
    for config_name in ("additional", "main")
)

# Add the "source" field to each example
def add_source_field(example, source_name: str):
    """Tag a single dataset example with the name of the dataset it came from.

    Identical to the helper of the same name defined in the ARC section; this
    definition simply rebinds the name.
    TODO(review): define the helper once and reuse it across sections.

    Args:
        example: Mapping holding one row; it is modified in place.
        source_name: Value stored under the ``'source'`` key.

    Returns:
        The same ``example`` object, now carrying the ``'source'`` key.
    """
    example['source'] = source_name
    return example

# Source labels, spelled out once so that every split of a configuration
# receives exactly the same tag (the main test/validation splits were
# previously tagged with the misspelling "OpenBook-Mai").
OB_ADDITIONAL_SOURCE = "OpenBook-Additional"
OB_MAIN_SOURCE = "OpenBook-Main"

# Apply the transformation to each split in both datasets
ds_ob1_train = ds_ob1['train'].map(lambda x: add_source_field(x, OB_ADDITIONAL_SOURCE))
ds_ob1_test = ds_ob1['test'].map(lambda x: add_source_field(x, OB_ADDITIONAL_SOURCE))
ds_ob1_validation = ds_ob1['validation'].map(lambda x: add_source_field(x, OB_ADDITIONAL_SOURCE))

ds_ob2_train = ds_ob2['train'].map(lambda x: add_source_field(x, OB_MAIN_SOURCE))
ds_ob2_test = ds_ob2['test'].map(lambda x: add_source_field(x, OB_MAIN_SOURCE))
ds_ob2_validation = ds_ob2['validation'].map(lambda x: add_source_field(x, OB_MAIN_SOURCE))

# NOTE(review): the saved dataset has 11,914 rows while the notebook intro
# lists OpenBookQA as 5,957 questions. The "additional" and "main"
# configurations appear to contain the same questions (with extra annotation
# columns in "additional"), so each question is probably stored twice under
# two different source labels -- confirm whether both should be kept.

# Concatenate all dataset splits
ob_train_set = concatenate_datasets([ds_ob1_train, ds_ob1_test, ds_ob1_validation,
                                     ds_ob2_train, ds_ob2_test, ds_ob2_validation])

# Save the new dataset to disk
ob_train_set.save_to_disk("../datasets/OpenBook.hf")

Saving the dataset (1/1 shards): 100%|██████████| 11914/11914 [00:00<00:00, 401198.98 examples/s]


### MathQA

https://math-qa.github.io/math-QA/

The JSON files need to be converted to HF format with the same labels as the other Q&A datasets.

In [31]:
# Directory that holds the raw MathQA JSON files
json_dir = "../datasets/MathQA"

# Combined records from every JSON file found in json_dir
all_data = []

# os.listdir() returns names in arbitrary order, so iterate in sorted order
# to keep the example order (and therefore the saved dataset) reproducible
# across machines and runs.
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith(".json"):
        continue  # skip anything that is not a JSON file
    file_path = os.path.join(json_dir, filename)

    # Each file is a list of dictionaries; add all of them to the combined list
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        all_data.extend(data)

print('Number of examples in MathQA:', len(all_data))

Number of examples in MathQA: 37901


In [32]:
# Modify example problems to match labels and formatting for other datasets
# Function to transform options into the correct format
def parse_options(entry):
    """Split a MathQA options string into parallel answer-text and label lists.

    Args:
        entry: Options string of the form
            ``"a ) 38 , b ) 27.675 , c ) 30 , d ) ... , e ) ..."``.

    Returns:
        dict with two lists of equal length:
            ``"text"``  -- the answer texts, in order of appearance.
            ``"label"`` -- the upper-cased option letter that preceded each text.
    """
    # Capture the option letter together with its text so that the labels are
    # derived from what was actually matched. A fixed five-letter label list
    # would go out of step with "text" whenever the string yields fewer (or
    # more) than five options.
    matches = re.findall(r"([a-e]) \) (.*?)(?= , [a-e] \) |$)", entry)
    return {
        "text": [text for _, text in matches],
        "label": [letter.upper() for letter, _ in matches],
    }
       

# Function to update keys and values
def update_entry(entry):
    """Re-key one raw MathQA record to the field names used in this notebook.

    Args:
        entry: Raw MathQA record with the keys ``Problem``, ``Rationale``,
            ``options``, ``correct``, ``annotated_formula``,
            ``linear_formula`` and ``category``.

    Returns:
        A new dict with the renamed fields, the options string parsed by
        ``parse_options``, the correct answer letter upper-cased, and a
        constant ``"source"`` of ``"MathQA"``.
    """
    # Fields are filled in a fixed order so the resulting key order is stable
    converted = {"question": entry["Problem"]}
    converted["rationale"] = entry["Rationale"]
    converted["choices"] = parse_options(entry["options"])
    converted["correct"] = entry["correct"].upper()
    converted["annotatedFormula"] = entry["annotated_formula"]
    converted["linearFormula"] = entry["linear_formula"]
    converted["category"] = entry["category"]
    converted["source"] = "MathQA"
    return converted

# Convert every raw record, wrap the result in a Dataset and write it to disk
updated_data = list(map(update_entry, all_data))
ds_mq = Dataset.from_list(updated_data)
ds_mq.save_to_disk("../datasets/MathQA.hf")


Saving the dataset (1/1 shards): 100%|██████████| 37901/37901 [00:00<00:00, 439431.54 examples/s]


### CommonsenseQA

https://huggingface.co/datasets/tau/commonsense_qa

In [33]:
# Load CommonsenseQA; its 'train', 'validation' and 'test' splits are accessed by name below
ds_cs = load_dataset("tau/commonsense_qa")

# Add the "source" field to each example
def add_source_field(example, source_name: str):
    """Tag a single dataset example with the name of the dataset it came from.

    Identical to the helper of the same name defined in the ARC section; this
    definition simply rebinds the name.
    TODO(review): define the helper once and reuse it across sections.

    Args:
        example: Mapping holding one row; it is modified in place.
        source_name: Value stored under the ``'source'`` key.

    Returns:
        The same ``example`` object, now carrying the ``'source'`` key.
    """
    example['source'] = source_name
    return example

# Keyword arguments forwarded by Dataset.map to add_source_field
cs_source_kwargs = {"source_name": "Commonsense"}

# Tag each split with the CommonsenseQA source label
ds_cs_train = ds_cs['train'].map(add_source_field, fn_kwargs=cs_source_kwargs)
ds_cs_validation = ds_cs['validation'].map(add_source_field, fn_kwargs=cs_source_kwargs)
ds_cs_test = ds_cs['test'].map(add_source_field, fn_kwargs=cs_source_kwargs)

# NOTE(review): the public CommonsenseQA test split is believed to ship
# without answer keys -- confirm before relying on `answerKey` for these rows.

# Merge the tagged splits in train, validation, test order
cs_splits = [ds_cs_train, ds_cs_validation, ds_cs_test]
cs_train_set = concatenate_datasets(cs_splits)

# Write the merged dataset to disk
cs_train_set.save_to_disk("../datasets/Commonsense.hf")

Map: 100%|██████████| 9741/9741 [00:00<00:00, 12150.95 examples/s]
Map: 100%|██████████| 1221/1221 [00:00<00:00, 13205.82 examples/s]
Map: 100%|██████████| 1140/1140 [00:00<00:00, 13420.45 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12102/12102 [00:00<00:00, 570446.46 examples/s]
