# Preparing Q&A Datasets
This notebook will download and prepare Q&A datasets for training a feed forward network/routing network. The following datasets will be prepared: 
- ARC AI2 Reasoning Challenge: 7,787 grade-school science questions
- OpenBookQA: 5,957 science and reasoning questions
- MathQA: ~37,000 math word problems
- CommonsenseQA: 12,247 general reasoning questions

In [45]:
import pandas
from datasets import load_dataset, concatenate_datasets, Dataset, load_from_disk
import os
import json
import re

### ARC AI2 Reasoning Challenge

https://huggingface.co/datasets/allenai/ai2_arc


In [76]:
# Fetch both ARC configurations (Challenge first, then Easy) from the Hugging Face Hub
ds_arc1, ds_arc2 = (
    load_dataset("allenai/ai2_arc", config_name)
    for config_name in ("ARC-Challenge", "ARC-Easy")
)

def add_source_field(example, source_name):
    """Tag a single example with the name of the dataset it came from.

    Args:
        example: Mutable mapping holding one dataset row; it is modified in place.
        source_name: Value stored under the "source" key.

    Returns:
        The same `example` object, now carrying a "source" entry.
    """
    example.update({"source": source_name})
    return example

# Tag every split of both ARC configurations with the configuration it came from.
# Splits are processed in the order train -> test -> validation.
ds_arc1_train, ds_arc1_test, ds_arc1_validation = (
    ds_arc1[split].map(lambda row: add_source_field(row, "ARC-Challenge"))
    for split in ("train", "test", "validation")
)
ds_arc2_train, ds_arc2_test, ds_arc2_validation = (
    ds_arc2[split].map(lambda row: add_source_field(row, "ARC-Easy"))
    for split in ("train", "test", "validation")
)

# Merge all six tagged splits into one dataset (Challenge rows first, then Easy rows)
arc_train_set = concatenate_datasets([
    ds_arc1_train, ds_arc1_test, ds_arc1_validation,
    ds_arc2_train, ds_arc2_test, ds_arc2_validation,
])

# Lookup table that canonicalizes ARC answer keys to letters
answer_key_mapping = dict(zip("1234", "ABCD"))    # numeric keys "1".."4" become "A".."D"
answer_key_mapping.update(zip("ABCDE", "ABCDE"))  # letter keys "A".."E" map to themselves


def normalize_answer_key(example):
    """Rewrite `example["answerKey"]` to its canonical letter.

    Keys found in `answer_key_mapping` are replaced by the mapped letter; any
    other value is replaced by the sentinel string "UNKNOWN".

    Args:
        example: Mutable mapping with an "answerKey" entry; modified in place.

    Returns:
        The same `example` object with the normalized "answerKey".
    """
    example["answerKey"] = answer_key_mapping.get(example["answerKey"], "UNKNOWN")
    return example

# Canonicalize the answer key of every row in the merged ARC dataset
arc_train_set = arc_train_set.map(function=normalize_answer_key)

# Persist the normalized dataset
arc_train_set.save_to_disk("../datasets/ARC_AI2.hf")

Map: 100%|██████████| 7787/7787 [00:01<00:00, 6364.68 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 7787/7787 [00:00<00:00, 324998.46 examples/s]


### OpenBookQA

https://huggingface.co/datasets/allenai/openbookqa

In [52]:
# Fetch both OpenBookQA configurations ("additional" first, then "main")
ds_ob1, ds_ob2 = (
    load_dataset("allenai/openbookqa", config_name)
    for config_name in ("additional", "main")
)

def add_source_field(example, source_name):
    """Tag a single example with the name of the dataset it came from.

    Args:
        example: Mutable mapping holding one dataset row; it is modified in place.
        source_name: Value stored under the "source" key.

    Returns:
        The same `example` object, now carrying a "source" entry.
    """
    example.update({"source": source_name})
    return example

# Tag every split of both OpenBookQA configurations with the configuration it came from.
ds_ob1_train = ds_ob1['train'].map(lambda x: add_source_field(x, "OpenBook-Additional"))
ds_ob1_test = ds_ob1['test'].map(lambda x: add_source_field(x, "OpenBook-Additional"))
ds_ob1_validation = ds_ob1['validation'].map(lambda x: add_source_field(x, "OpenBook-Additional"))

# All three "main" splits must share one label: the test and validation splits were
# previously tagged "OpenBook-Mai", which split a single source into two labels.
ds_ob2_train = ds_ob2['train'].map(lambda x: add_source_field(x, "OpenBook-Main"))
ds_ob2_test = ds_ob2['test'].map(lambda x: add_source_field(x, "OpenBook-Main"))
ds_ob2_validation = ds_ob2['validation'].map(lambda x: add_source_field(x, "OpenBook-Main"))

# Concatenate all dataset splits
# NOTE(review): the merged set has 11,914 rows, exactly twice the 5,957 questions quoted
# in the notebook header. The "additional" config presumably contains the same questions
# as "main" plus extra annotation fields - confirm, and consider de-duplicating by `id`.
ob_train_set = concatenate_datasets([ds_ob1_train, ds_ob1_test, ds_ob1_validation,
                                     ds_ob2_train, ds_ob2_test, ds_ob2_validation])

def update_entry(entry):
    """Project an OpenBookQA row onto the column layout shared with the other datasets.

    Only the five fields below are kept; "question_stem" is renamed to "question".

    Args:
        entry: Mapping with "id", "question_stem", "choices", "answerKey" and "source".

    Returns:
        A new dict with keys "id", "question", "choices", "answerKey", "source"
        (in that order).
    """
    # output column -> input column
    column_sources = {
        "id": "id",
        "question": "question_stem",
        "choices": "choices",
        "answerKey": "answerKey",
        "source": "source",
    }
    return {target: entry[origin] for target, origin in column_sources.items()}

# Reshape every row, then rebuild a Dataset from the resulting list of dicts
updated_data = list(map(update_entry, ob_train_set))
ob_train_set = Dataset.from_list(updated_data)

# Persist the reshaped dataset
ob_train_set.save_to_disk("../datasets/OpenBook.hf")

Saving the dataset (1/1 shards): 100%|██████████| 11914/11914 [00:00<00:00, 535846.89 examples/s]


### MathQA

https://math-qa.github.io/math-QA/

The JSON files need to be converted to HF format with the same labels as the other Q&A datasets.

In [51]:
json_dir = "../datasets/MathQA"

# Accumulates the examples from every JSON file found in json_dir
all_data = []

# os.listdir() returns entries in arbitrary order, so iterate in sorted order:
# the combined dataset then has the same row order on every run and machine.
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith(".json"):  # Skip anything that is not a JSON file
        continue
    file_path = os.path.join(json_dir, filename)

    # Each file is a list of dictionaries; append its items to the combined list
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    all_data.extend(data)

print('Number of examples in MathQA:', len(all_data))

Number of examples in MathQA: 37901


In [49]:
# Modify example problems to match labels and formatting for other datasets
def parse_options(entry):
    """Split a MathQA options string into parallel text/label lists.

    The string is expected to look like "a ) 38 , b ) 27 , c ) 30 , ...", i.e.
    a lowercase letter a-e, " ) ", the option text, with options separated by " , ".

    Labels are taken from the letters actually matched rather than hard-coded to
    A-E, so "text" and "label" always have the same length, even when an options
    string holds fewer than five options.

    Args:
        entry: The raw options string.

    Returns:
        dict with "text" (option texts in order of appearance) and "label"
        (the uppercase letter that introduced each text). Both lists are empty
        when nothing matches.
    """
    # Group 1 captures the option letter, group 2 the option text; the text runs
    # lazily up to the next " , <letter> ) " separator or the end of the string.
    parsed = re.findall(r"([a-e]) \) (.*?)(?= , [a-e] \) |$)", entry)
    return {
        "text": [text for _, text in parsed],
        "label": [letter.upper() for letter, _ in parsed],
    }
       

def update_entry(entry):
    """Convert one raw MathQA record to the column layout used by the other datasets.

    Args:
        entry: Mapping with the raw MathQA keys "Problem", "Rationale", "options",
            "correct", "annotated_formula", "linear_formula" and "category".

    Returns:
        A new dict with renamed keys: the options string is parsed by
        `parse_options`, the correct answer is uppercased, and "source" is
        set to "MathQA".
    """
    record = dict(
        question=entry["Problem"],
        rationale=entry["Rationale"],
        choices=parse_options(entry["options"]),
        answerKey=entry["correct"].upper(),
        annotatedFormula=entry["annotated_formula"],
        linearFormula=entry["linear_formula"],
        category=entry["category"],
        source="MathQA",
    )
    return record

# Convert every raw MathQA record, build a Dataset from the result, and persist it
updated_data = list(map(update_entry, all_data))
math_train_set = Dataset.from_list(updated_data)
math_train_set.save_to_disk("../datasets/MathQA.hf")


Saving the dataset (1/1 shards): 100%|██████████| 37901/37901 [00:00<00:00, 448946.36 examples/s]


### CommonsenseQA

https://huggingface.co/datasets/tau/commonsense_qa

In [81]:
# Fetch CommonsenseQA (all splits) from the Hugging Face Hub
ds_cs = load_dataset(path="tau/commonsense_qa")

def add_source_field(example, source_name):
    """Tag a single example with the name of the dataset it came from.

    Args:
        example: Mutable mapping holding one dataset row; it is modified in place.
        source_name: Value stored under the "source" key.

    Returns:
        The same `example` object, now carrying a "source" entry.
    """
    example.update({"source": source_name})
    return example

# Tag the labelled splits with their source name.
# The "test" split is deliberately left out: no answerKey is provided for it.
ds_cs_train, ds_cs_validation = (
    ds_cs[split].map(lambda row: add_source_field(row, "Commonsense"))
    for split in ("train", "validation")
)

# Merge the tagged splits (train rows first, then validation rows)
cs_train_set = concatenate_datasets([ds_cs_train, ds_cs_validation])

# Persist the merged dataset
cs_train_set.save_to_disk("../datasets/Commonsense.hf")

Saving the dataset (1/1 shards): 100%|██████████| 10962/10962 [00:00<00:00, 166897.63 examples/s]


### ComSciQA

Two versions of this generated dataset: Llama8B and Llama70B

In [84]:
# Load the train split of the Llama70B-generated ComSciQA dataset from local disk
comsci_train_set = load_from_disk(dataset_path="../datasets/ComSciQA_Llama70B.hf/train")

# Lookup table that canonicalizes ComSciQA answer keys to letters
answer_key_mapping = dict(zip("0123", "ABCD"))  # zero-based numeric keys "0".."3" become "A".."D"
answer_key_mapping.update(zip("ABCD", "ABCD"))  # letter keys "A".."D" map to themselves


def normalize_answer_key(example):
    """Rewrite `example["answerKey"]` to its canonical letter.

    Keys found in `answer_key_mapping` are replaced by the mapped letter; any
    other value is replaced by the sentinel string "UNKNOWN".

    Args:
        example: Mutable mapping with an "answerKey" entry; modified in place.

    Returns:
        The same `example` object with the normalized "answerKey".
    """
    example["answerKey"] = answer_key_mapping.get(example["answerKey"], "UNKNOWN")
    return example


# Canonicalize every answer key, then discard rows whose key could not be mapped
comsci_normalized = comsci_train_set.map(normalize_answer_key)
comsci_train_set = comsci_normalized.filter(lambda row: row["answerKey"] != "UNKNOWN")

# Persist the cleaned dataset
comsci_train_set.save_to_disk("../datasets/Clean_ComSciQA_Llama70B.hf")

Map: 100%|██████████| 33728/33728 [00:10<00:00, 3306.31 examples/s] 
Filter: 100%|██████████| 33728/33728 [00:01<00:00, 29841.66 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 33542/33542 [00:00<00:00, 50381.73 examples/s]


### Combine Datasets

In [87]:
# Registry of the prepared datasets, keyed by display name (insertion order is kept)
dataset_dict = dict(
    ComSciQA=comsci_train_set,
    CommonsenseQA=cs_train_set,
    MathQA=math_train_set,
    OpenBookQA=ob_train_set,
    ARC_AI2=arc_train_set,
)

# Report the size of each dataset before merging
for name, dataset in dataset_dict.items():
    print(f"{name}: {dataset.num_rows} rows")

ComSciQA: 33542 rows
CommonsenseQA: 10962 rows
MathQA: 37901 rows
OpenBookQA: 11914 rows
ARC_AI2: 7787 rows


In [115]:
# Columns retained from each dataset before concatenation
keys_to_keep = [
    "question",
    "choices",
    "answerKey",
    "source",
]

def normalize_choices(example):
    """Flatten `choices` to a plain list of texts and turn `answerKey` into an index.

    Rows whose "choices" entry is missing or is not a dict are returned unchanged.
    Otherwise "choices" is replaced by its "text" list (empty if absent) and
    "answerKey" is replaced by its zero-based position (A=0 ... E=4), or by None
    when the key is not one of A-E.

    Args:
        example: Mutable mapping for one dataset row; modified in place.

    Returns:
        The same `example` object.
    """
    # Guard clause: nothing to do unless `choices` is the {"text": ..., "label": ...} dict form
    if "choices" not in example or not isinstance(example["choices"], dict):
        return example

    letter_to_index = {letter: position for position, letter in enumerate("ABCDE")}

    example["choices"] = example["choices"].get("text", [])
    example["answerKey"] = letter_to_index.get(example["answerKey"])
    return example

# Normalize and filter datasets
filtered_datasets = []
for name, dataset in dataset_dict.items():
    # If dataset has multiple splits, take "train" split (adjust as needed)
    if isinstance(dataset, dict):
        dataset = dataset["train"]

    # Flatten `choices` to a list of texts and convert `answerKey` to an index
    dataset = dataset.map(normalize_choices)

    # Keep only the required columns
    filtered_dataset = dataset.select_columns(keys_to_keep)
    filtered_datasets.append(filtered_dataset)

# Concatenate all filtered datasets
combined_train_set = concatenate_datasets(filtered_datasets)

# MMLU only has 4 answer choices, so keep only rows whose answer index is 0-3.
# An allow-list is used instead of `!= 4` so that rows whose answer key could not be
# mapped (normalize_choices stores None for those) are dropped as well, rather than
# reaching the training set without a usable label.
valid_answers = {0, 1, 2, 3}
combined_train_set = combined_train_set.filter(lambda x: x["answerKey"] in valid_answers)
# NOTE(review): rows from five-option sources keep all five entries in `choices` even
# though their answer index is 0-3 - confirm whether they should be truncated to four.

# Rename features to match MMLU
combined_train_set = combined_train_set.rename_columns({"source": "subject", "answerKey": "answer"})

# Print summary
print(combined_train_set)
combined_train_set.save_to_disk("../datasets/CombineQA.hf")

Map: 100%|██████████| 37901/37901 [00:49<00:00, 767.11 examples/s] 
Map: 100%|██████████| 11914/11914 [00:05<00:00, 2240.96 examples/s]
Filter: 100%|██████████| 102106/102106 [00:07<00:00, 14048.02 examples/s]


Dataset({
    features: ['question', 'choices', 'answer', 'subject'],
    num_rows: 93940
})


Saving the dataset (1/1 shards): 100%|██████████| 93940/93940 [00:05<00:00, 17479.49 examples/s]


In [116]:
# Spot-check the last row of the combined dataset (displayed as the cell output)
combined_train_set[-1]

{'question': 'Scientists at a local university have been studying the impact that people have on Earth. One of the areas being studied is how the burning of fossil fuels affects the environment. Which effect of fossil fuel burning have the scientists most likely evaluated?',
 'choices': ['the production of nitrogen-fixing bacteria',
  'the mechanical weathering of roads',
  'the formation of acid rain',
  'the increase in runoff'],
 'answer': 2,
 'subject': 'ARC-Easy'}

In [117]:
# Sanity check: list the distinct values present in the `answer` column
unique_answer_keys = combined_train_set.unique(column="answer")
print(unique_answer_keys)

Flattening the indices: 100%|██████████| 93940/93940 [00:13<00:00, 6895.94 examples/s] 

[0, 1, 3, 2]





In [111]:
# Load one MMLU subject as a reference for the target schema
mmlu = load_dataset(path="cais/mmlu", name="anatomy", split="test")
print(mmlu)

Dataset({
    features: ['question', 'subject', 'choices', 'answer'],
    num_rows: 135
})


{'question': 'A lesion causing compression of the facial nerve at the stylomastoid foramen will cause ipsilateral',
 'subject': 'anatomy',
 'choices': ['paralysis of the facial muscles.',
  'paralysis of the facial muscles and loss of taste.',
  'paralysis of the facial muscles, loss of taste and lacrimation.',
  'paralysis of the facial muscles, loss of taste, lacrimation and decreased salivation.'],
 'answer': 0}