  from .autonotebook import tqdm as notebook_tqdm


# MMLU

In [2]:
from datasets import load_dataset
import os
# Load the "all" configuration of the cais/mmlu dataset.
mmlu = load_dataset("cais/mmlu", "all")
# Bare last expression: shows the test split's summary (feature names and row count).
mmlu["test"]

Dataset({
    features: ['question', 'subject', 'choices', 'answer'],
    num_rows: 14042
})

In [28]:
# Define a function to add the "prompt" field
def add_prompt(example):
    """Attach a multiple-choice ``prompt`` string to a single record.

    The prompt is the question text followed by one line per choice, each
    prefixed with its zero-based position in parentheses, e.g.::

        <question>
        (0) <first choice>
        (1) <second choice>

    The labels are derived from the choices themselves instead of from a
    notebook-level ``choice_labels`` variable, so the function works on a
    fresh kernel and labels every choice regardless of how many there are.

    Args:
        example: Mapping with a ``"question"`` string and a ``"choices"``
            sequence of strings.

    Returns:
        The same ``example`` object, with the ``"prompt"`` key added.
    """
    # Label each choice with its index in the "choices" list.
    formatted_choices = "\n".join(
        f"({index}) {choice}" for index, choice in enumerate(example["choices"])
    )
    # Concatenate the question and the formatted choices into a new field called "prompt"
    example["prompt"] = example["question"] + "\n" + formatted_choices
    return example

# Use the .map() method to apply add_prompt to each row in the "test" split,
# then store the mapped dataset back under the same key so later cells see "prompt".
mmlu["test"] = mmlu["test"].map(add_prompt)

In [29]:
# Show the first test record to check that the "prompt" field was added.
mmlu["test"][0]

{'question': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.',
 'subject': 'abstract_algebra',
 'choices': ['0', '4', '2', '6'],
 'answer': 1,
 'prompt': 'Find the degree for the given field extension Q(sqrt(2), sqrt(3), sqrt(18)) over Q.\n(0) 0\n(1) 4\n(2) 2\n(3) 6'}

In [23]:
# Export the whole test split as line-delimited JSON (one record per line).
# NOTE(review): this cell's execution count (23) is lower than that of the cell
# which adds "prompt" (28), so the file on disk may predate that field —
# confirm by re-running the notebook top to bottom.
mmlu["test"].to_json("./LayerSkip/custom_datasets/mmlu_test.jsonl", orient="records", lines=True)

Creating json from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 144.70ba/s]


14150821

In [None]:
# Collect the distinct values of the "subject" column in the test split.
unique_subjects = set(mmlu["test"]["subject"])

# Destination directory for the per-subject files; created on demand.
output_folder = "./LayerSkip/custom_datasets/mmlu/"
os.makedirs(output_folder, exist_ok=True)

# Write one JSONL file per subject.
for subject in unique_subjects:
    # Spaces in the subject name become underscores in the filename.
    safe_subject = "_".join(subject.split(" "))
    output_file = os.path.join(output_folder, f"{safe_subject}.jsonl")

    # Keep only the rows that belong to this subject ...
    subject_ds = mmlu["test"].filter(lambda example: example["subject"] == subject)
    # ... and export them as line-delimited JSON records.
    subject_ds.to_json(output_file, orient="records", lines=True)

    print(f"Saved subject '{subject}' to {output_file}")

# NQ-Open

In [None]:
from datasets import load_dataset
import json

# Fetch the validation split of NQ-Open.
dataset = load_dataset("google-research-datasets/nq_open", split="validation")

# Emit one {"question", "answer"} JSON object per line, keeping only the
# first entry of each item's answer list.
with open("./LayerSkip/custom_datasets/nq_open_val.jsonl", "w", encoding="utf-8") as f:
    for item in dataset:
        question, answers = item["question"], item["answer"]
        if not answers:
            # Items with an empty answer list are left out of the file.
            continue
        answer = answers[0]
        f.write(json.dumps({"question": question, "answer": answer}, ensure_ascii=False) + "\n")


# RACE Dataset

In [None]:
from datasets import load_dataset
import json
import ast

dataset = load_dataset("EleutherAI/race", split="test")
# Labels paired positionally with each problem's options (at most six are used).
option_labels = ['A', 'B', 'C', 'D', 'E', 'F']

with open("./LayerSkip/custom_datasets/race_test.jsonl", "w", encoding="utf-8") as f:
    for row in dataset:
        article = row["article"]
        data_str = row["problems"]

        # Parse "problems" as a Python literal first; if that fails, retry as
        # JSON after swapping single quotes for double quotes. A failure of
        # the fallback propagates to the caller.
        try:
            data = ast.literal_eval(data_str)
        except Exception:
            data = json.loads(data_str.replace("'", '"'))

        for item in data:
            labelled_options = (
                f"{label}. {opt}" for label, opt in zip(option_labels, item['options'])
            )
            options_str = " ".join(labelled_options)
            question = f"Article: {article} Question: {item['question']} Answer Options: {options_str}"
            answer = item["answer"]
            if not answer:
                # Problems without an answer are not written out.
                continue
            f.write(json.dumps({"question": question, "answer": answer}, ensure_ascii=False) + "\n")


# Fix MMLU: cast `answer` to a string

In [3]:
import os
import pandas as pd


In [6]:
import json

input_path  = "./LayerSkip/custom_datasets/mmlu_test.jsonl"
output_path = "./LayerSkip/custom_datasets/mmlu_test_fixed.jsonl"

# Copy the JSONL file record by record, rewriting "answer" as a string.
with open(input_path, "r", encoding="utf-8") as fin:
    with open(output_path, "w", encoding="utf-8") as fout:
        for line in fin:
            # Each input line holds one JSON object.
            record = json.loads(line)

            # str() is applied unconditionally; a missing key yields "".
            # Note that a JSON null would come out as the string "None".
            record["answer"] = str(record.get("answer", ""))

            # One JSON object per output line.
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")


In [7]:
import json
from pathlib import Path

def stringify_answers_in_dir(input_dir, suffix="_fixed"):
    """Write a ``<stem><suffix>.jsonl`` copy of every JSONL file in a directory
    with each record's ``answer`` cast to a string.

    Files whose stem already ends with ``suffix`` are treated as outputs of a
    previous run and skipped, so re-running never produces
    ``*_fixed_fixed.jsonl`` files.

    Args:
        input_dir: Directory (str or Path) containing ``*.jsonl`` files.
        suffix: Text appended to the input stem to form the output filename.

    Returns:
        List of the output paths that were written, in sorted input order.

    Raises:
        json.JSONDecodeError: If a line in an input file is not valid JSON.
    """
    input_dir = Path(input_dir)
    written = []

    # sorted() snapshots the listing before any output file is created and
    # makes the processing order deterministic.
    for input_path in sorted(input_dir.glob("*.jsonl")):
        if input_path.stem.endswith(suffix):
            # Already an output of this step; do not "fix" it again.
            continue

        # build the corresponding output path: e.g. "foo.jsonl" → "foo_fixed.jsonl"
        output_path = input_path.with_name(f"{input_path.stem}{suffix}.jsonl")

        with input_path.open("r", encoding="utf-8") as fin, \
             output_path.open("w", encoding="utf-8") as fout:

            for line in fin:
                record = json.loads(line)
                # ensure `answer` is a string (a missing key becomes "")
                record["answer"] = str(record.get("answer", ""))
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")

        print(f"Processed {input_path.name} → {output_path.name}")
        written.append(output_path)

    return written


input_dir = Path("./LayerSkip/custom_datasets/mmlu")
fixed_paths = stringify_answers_in_dir(input_dir)


Processed college_chemistry.jsonl → college_chemistry_fixed.jsonl
Processed college_mathematics.jsonl → college_mathematics_fixed.jsonl
Processed professional_accounting.jsonl → professional_accounting_fixed.jsonl
Processed philosophy.jsonl → philosophy_fixed.jsonl
Processed high_school_statistics.jsonl → high_school_statistics_fixed.jsonl
Processed business_ethics.jsonl → business_ethics_fixed.jsonl
Processed computer_security.jsonl → computer_security_fixed.jsonl
Processed formal_logic.jsonl → formal_logic_fixed.jsonl
Processed world_religions.jsonl → world_religions_fixed.jsonl
Processed clinical_knowledge.jsonl → clinical_knowledge_fixed.jsonl
Processed anatomy.jsonl → anatomy_fixed.jsonl
Processed high_school_computer_science.jsonl → high_school_computer_science_fixed.jsonl
Processed sociology.jsonl → sociology_fixed.jsonl
Processed high_school_us_history.jsonl → high_school_us_history_fixed.jsonl
Processed moral_scenarios.jsonl → moral_scenarios_fixed.jsonl
Processed marketing.js