  from .autonotebook import tqdm as notebook_tqdm


# MMLU

In [26]:
from datasets import load_dataset
import os
mmlu = load_dataset("cais/mmlu", "all")
mmlu["test"]

Dataset({
    features: ['question', 'subject', 'choices', 'answer'],
    num_rows: 14042
})

In [28]:
# Define a function to add the "prompt" field
def add_prompt(example):
    # Create a formatted string for the choices
    formatted_choices = '\n'.join([f"({label}) {choice}" for label, choice in zip(choice_labels, example["choices"])])
    # Concatenate the question and the formatted choices into a new field called "prompt"
    example["prompt"] = example["question"] + "\n" + formatted_choices
    return example

# Use the .map() method to apply the function to each row in the "test" split
mmlu["test"] = mmlu["test"].map(add_prompt)

In [29]:
mmlu["test"][0]

{'question': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.',
 'subject': 'abstract_algebra',
 'choices': ['0', '4', '2', '6'],
 'answer': 1,
 'prompt': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.\n(0) 0\n(1) 4\n(2) 2\n(3) 6'}

In [23]:
mmlu["test"].to_json("./LayerSkip/custom_datasets/mmlu_test.jsonl", orient="records", lines=True)

Creating json from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 144.70ba/s]


14150821

In [30]:
# Get the unique subjects from the test split.
# You can do this by extracting the "subject" column and converting it to a set.
unique_subjects = set(mmlu["test"]["subject"])

# Create the output directory if it doesn't exist.
output_folder = "./LayerSkip/custom_datasets/mmlu/"
os.makedirs(output_folder, exist_ok=True)

# Loop over each unique subject and filter the dataset.
for subject in unique_subjects:
    # Filter the dataset to only the rows for the current subject.
    subject_ds = mmlu["test"].filter(lambda example: example["subject"] == subject)
    

    # Create a safe filename from the subject name.
    safe_subject = subject.replace(" ", "_")
    output_file = os.path.join(output_folder, f"{safe_subject}.jsonl")
    
    # Save the filtered dataset to a JSONL file.
    # Here we export records in a line-delimited JSON format.
    subject_ds.to_json(output_file, orient="records", lines=True)
    
    print(f"Saved subject '{subject}' to {output_file}")

Filter: 100%|██████████| 14042/14042 [00:00<00:00, 121319.34 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 371.44ba/s]


Saved subject 'marketing' to ./LayerSkip/custom_datasets/mmlu/marketing.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127316.08 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 546.06ba/s]


Saved subject 'high_school_computer_science' to ./LayerSkip/custom_datasets/mmlu/high_school_computer_science.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127349.39 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 594.43ba/s]


Saved subject 'college_computer_science' to ./LayerSkip/custom_datasets/mmlu/college_computer_science.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128514.83 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 507.60ba/s]


Saved subject 'logical_fallacies' to ./LayerSkip/custom_datasets/mmlu/logical_fallacies.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128080.52 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 637.92ba/s]


Saved subject 'college_mathematics' to ./LayerSkip/custom_datasets/mmlu/college_mathematics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 126837.08 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 167.57ba/s]


Saved subject 'high_school_world_history' to ./LayerSkip/custom_datasets/mmlu/high_school_world_history.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127452.18 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 197.64ba/s]


Saved subject 'high_school_psychology' to ./LayerSkip/custom_datasets/mmlu/high_school_psychology.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127674.87 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 487.77ba/s]


Saved subject 'virology' to ./LayerSkip/custom_datasets/mmlu/virology.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128159.67 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 633.87ba/s]


Saved subject 'college_physics' to ./LayerSkip/custom_datasets/mmlu/college_physics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128186.17 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 571.66ba/s]


Saved subject 'electrical_engineering' to ./LayerSkip/custom_datasets/mmlu/electrical_engineering.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127367.84 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 297.05ba/s]


Saved subject 'prehistory' to ./LayerSkip/custom_datasets/mmlu/prehistory.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 125466.09 examples/s]
Creating json from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 78.36ba/s]


Saved subject 'professional_law' to ./LayerSkip/custom_datasets/mmlu/professional_law.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127305.34 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 270.34ba/s]


Saved subject 'security_studies' to ./LayerSkip/custom_datasets/mmlu/security_studies.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 126896.38 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 160.42ba/s]


Saved subject 'professional_psychology' to ./LayerSkip/custom_datasets/mmlu/professional_psychology.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128103.64 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 291.74ba/s]


Saved subject 'elementary_mathematics' to ./LayerSkip/custom_datasets/mmlu/elementary_mathematics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127073.52 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 556.13ba/s]


Saved subject 'human_sexuality' to ./LayerSkip/custom_datasets/mmlu/human_sexuality.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 126709.64 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 281.31ba/s]


Saved subject 'professional_accounting' to ./LayerSkip/custom_datasets/mmlu/professional_accounting.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127958.64 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 424.78ba/s]


Saved subject 'conceptual_physics' to ./LayerSkip/custom_datasets/mmlu/conceptual_physics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128586.94 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 203.20ba/s]


Saved subject 'high_school_us_history' to ./LayerSkip/custom_datasets/mmlu/high_school_us_history.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128553.82 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 485.34ba/s]


Saved subject 'high_school_physics' to ./LayerSkip/custom_datasets/mmlu/high_school_physics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127099.85 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 383.04ba/s]


Saved subject 'college_medicine' to ./LayerSkip/custom_datasets/mmlu/college_medicine.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127631.15 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 349.06ba/s]


Saved subject 'high_school_mathematics' to ./LayerSkip/custom_datasets/mmlu/high_school_mathematics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127441.15 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 451.73ba/s]


Saved subject 'high_school_geography' to ./LayerSkip/custom_datasets/mmlu/high_school_geography.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 126276.60 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 355.81ba/s]


Saved subject 'high_school_microeconomics' to ./LayerSkip/custom_datasets/mmlu/high_school_microeconomics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127874.46 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 629.78ba/s]


Saved subject 'college_chemistry' to ./LayerSkip/custom_datasets/mmlu/college_chemistry.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 126533.79 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 354.58ba/s]


Saved subject 'high_school_chemistry' to ./LayerSkip/custom_datasets/mmlu/high_school_chemistry.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127302.04 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 485.56ba/s]


Saved subject 'formal_logic' to ./LayerSkip/custom_datasets/mmlu/formal_logic.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127892.78 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 281.35ba/s]


Saved subject 'moral_disputes' to ./LayerSkip/custom_datasets/mmlu/moral_disputes.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127628.94 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 647.17ba/s]


Saved subject 'medical_genetics' to ./LayerSkip/custom_datasets/mmlu/medical_genetics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127238.24 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 515.97ba/s]


Saved subject 'world_religions' to ./LayerSkip/custom_datasets/mmlu/world_religions.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128051.56 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 649.27ba/s]


Saved subject 'jurisprudence' to ./LayerSkip/custom_datasets/mmlu/jurisprudence.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127773.20 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 677.48ba/s]


Saved subject 'business_ethics' to ./LayerSkip/custom_datasets/mmlu/business_ethics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128404.71 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 558.05ba/s]


Saved subject 'college_biology' to ./LayerSkip/custom_datasets/mmlu/college_biology.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127541.88 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 222.71ba/s]


Saved subject 'high_school_european_history' to ./LayerSkip/custom_datasets/mmlu/high_school_european_history.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 126758.73 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 411.05ba/s]


Saved subject 'human_aging' to ./LayerSkip/custom_datasets/mmlu/human_aging.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127552.93 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 686.02ba/s]


Saved subject 'abstract_algebra' to ./LayerSkip/custom_datasets/mmlu/abstract_algebra.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 126836.26 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 646.37ba/s]


Saved subject 'management' to ./LayerSkip/custom_datasets/mmlu/management.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127153.63 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 305.66ba/s]


Saved subject 'philosophy' to ./LayerSkip/custom_datasets/mmlu/philosophy.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127333.42 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 411.29ba/s]


Saved subject 'sociology' to ./LayerSkip/custom_datasets/mmlu/sociology.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127594.93 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 248.57ba/s]


Saved subject 'high_school_macroeconomics' to ./LayerSkip/custom_datasets/mmlu/high_school_macroeconomics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127759.06 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 666.82ba/s]


Saved subject 'computer_security' to ./LayerSkip/custom_datasets/mmlu/computer_security.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128289.48 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 336.97ba/s]


Saved subject 'high_school_statistics' to ./LayerSkip/custom_datasets/mmlu/high_school_statistics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127472.87 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 541.48ba/s]


Saved subject 'anatomy' to ./LayerSkip/custom_datasets/mmlu/anatomy.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127687.60 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 589.67ba/s]


Saved subject 'public_relations' to ./LayerSkip/custom_datasets/mmlu/public_relations.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127288.56 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 231.03ba/s]


Saved subject 'professional_medicine' to ./LayerSkip/custom_datasets/mmlu/professional_medicine.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127518.13 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 366.76ba/s]


Saved subject 'clinical_knowledge' to ./LayerSkip/custom_datasets/mmlu/clinical_knowledge.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128134.30 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 660.94ba/s]


Saved subject 'us_foreign_policy' to ./LayerSkip/custom_datasets/mmlu/us_foreign_policy.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128004.24 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 463.56ba/s]


Saved subject 'astronomy' to ./LayerSkip/custom_datasets/mmlu/astronomy.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127705.60 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 568.03ba/s]


Saved subject 'econometrics' to ./LayerSkip/custom_datasets/mmlu/econometrics.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 126980.92 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 157.69ba/s]


Saved subject 'miscellaneous' to ./LayerSkip/custom_datasets/mmlu/miscellaneous.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127490.80 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 549.93ba/s]


Saved subject 'machine_learning' to ./LayerSkip/custom_datasets/mmlu/machine_learning.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128446.72 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 131.45ba/s]


Saved subject 'moral_scenarios' to ./LayerSkip/custom_datasets/mmlu/moral_scenarios.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 128254.84 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 306.35ba/s]


Saved subject 'high_school_biology' to ./LayerSkip/custom_datasets/mmlu/high_school_biology.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127226.14 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 608.49ba/s]


Saved subject 'global_facts' to ./LayerSkip/custom_datasets/mmlu/global_facts.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127016.25 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 197.51ba/s]


Saved subject 'nutrition' to ./LayerSkip/custom_datasets/mmlu/nutrition.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127215.15 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 513.38ba/s]


Saved subject 'international_law' to ./LayerSkip/custom_datasets/mmlu/international_law.jsonl


Filter: 100%|██████████| 14042/14042 [00:00<00:00, 127761.56 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 375.97ba/s]

Saved subject 'high_school_government_and_politics' to ./LayerSkip/custom_datasets/mmlu/high_school_government_and_politics.jsonl



