In [49]:
# load the mmlu pro dataset from huggingface
from datasets import load_dataset
import pandas as pd

In [50]:

dataset = load_dataset("TIGER-Lab/MMLU-Pro")

print(dataset)

DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'options', 'answer', 'answer_index', 'cot_content', 'category', 'src'],
        num_rows: 12032
    })
    validation: Dataset({
        features: ['question_id', 'question', 'options', 'answer', 'answer_index', 'cot_content', 'category', 'src'],
        num_rows: 70
    })
})


In [51]:
# list of all categories in the dataset
dataset.keys()

dict_keys(['test', 'validation'])

In [52]:
ds = dataset["test"]

In [53]:
df = ds.to_pandas()

In [54]:
df.head()

Unnamed: 0,question_id,question,options,answer,answer_index,cot_content,category,src
0,70,"Typical advertising regulatory bodies suggest,...","[Safe practices, Fear, Jealousy, Trivial, Unsa...",I,8,,business,ori_mmlu-business_ethics
1,71,Managers are entrusted to run the company in t...,"[Shareholders, Diligence, Self-interest, Share...",F,5,,business,ori_mmlu-business_ethics
2,72,There are two main issues associated with ____...,"[Down, Autonomy, Remuneration, Benefit, Down, ...",J,9,,business,ori_mmlu-business_ethics
3,73,_______ locate morality beyond the sphere of r...,"[Ethical egoism, Ethics of duty, Postmodern et...",C,2,,business,ori_mmlu-business_ethics
4,74,Some of key differences between Islamic finan...,"[Interest, Certain, Assured, Both tangible and...",G,6,,business,ori_mmlu-business_ethics


In [55]:
categories = df.groupby("category").size()[0:5]
listed_categories = categories.index.tolist()
print(listed_categories)


['biology', 'business', 'chemistry', 'computer science', 'economics']


In [56]:
# 5 model names
model_names = ["biology_model", "business_model", "chemistry_model", "computer_science_model", "economics_model"]

In [57]:
model_performance_map = {
    "biology": {"biology_model": 0.9, "business_model": 0.1, "chemistry_model": 0.5, "computer_science_model": 0.2, "economics_model": 0.3},
    "business": {"biology_model": 0.1, "business_model": 0.9, "chemistry_model": 0.2, "computer_science_model": 0.3, "economics_model": 0.4},
    "chemistry": {"biology_model": 0.1, "business_model": 0.2, "chemistry_model": 0.9, "computer_science_model": 0.4, "economics_model": 0.3},
    "computer_science": {"biology_model": 0.1, "business_model": 0.3, "chemistry_model": 0.4, "computer_science_model": 0.9, "economics_model": 0.3},
    "economics": {"biology_model": 0.1, "business_model": 0.4, "chemistry_model": 0.3, "computer_science_model": 0.3, "economics_model": 0.9}
}

In [58]:
# Get 100 questions from each of the top 5 most popular categories
questions = df[df["category"].isin(listed_categories)].groupby("category").apply(lambda x: x.sample(100))
# Convert the top 100 questions to a pandas DataFrame
questions_df = questions.reset_index(drop=True)


  questions = df[df["category"].isin(listed_categories)].groupby("category").apply(lambda x: x.sample(100))


In [59]:
# save the questions to a csv file
questions_df.to_csv("questions.csv", index=False)

In [60]:
# save the questions to a jsonl file
questions_df.to_json("data/questions.jsonl", orient="records", lines=True)

In [61]:
def number_to_letter(number: int) -> str:
    return chr(65 + number)

def create_mc_question_prompt(question: str, options: list[str]) -> str:
    prompt = f"""
    Question: {question}
    Choices: 
    {create_options_prompt(options)}
    Please solve this step by step, then output your answer on a new line as 'The answer is: X'
    where X is the letter corresponding to your choice
    """.replace("   ", "").replace("\n", "")
    return prompt

def create_options_prompt(options: list[str]) -> str:
    # turn options into dictionary with capital letter as key and option as value
    choices_string = ""
    for i, option in enumerate(options):
        choices_string += f"{number_to_letter(i)}: {option}. "
    return choices_string

questions_df['prompted_question'] = questions_df.apply(lambda row: create_mc_question_prompt(row["question"], row["options"]), axis=1)

In [62]:
questions_df[['prompted_question', 'answer']].iloc[0]['prompted_question']

" Question: In certain abnormal conditions, the stomach does not secretehydrochloric acid. What effects might this haveon thedigestive process? Choices:  A: The absence of HCl would cause the stomach lining to produce more acid.. B: The absence of HCl would lead to the complete digestion of lipids in the stomach.. C: The absence of HCl would lead to some loss of protein digestion, less denaturation of proteins, less killing of ingested bacteria, and less pancreatic secretion.. D: The absence of HCl would increase the level of gastric enzymes.. E: The absence of HCl would lead to improved digestion of carbohydrates.. F: The absence of HCl would stimulate the liver to release more bile.. G: The absence of HCl would have no effect on the digestive process.. H: The absence of HCl would enhance the activity of gastric lipase.. I: The absence of HCl would result in more efficient absorption of nutrients.. J: The absence of HCl would increase the secretion of pancreatic juice..  Please solve 

In [63]:
questions_df.head()

Unnamed: 0,question_id,question,options,answer,answer_index,cot_content,category,src,prompted_question
0,3098,"In certain abnormal conditions, the stomach do...",[The absence of HCl would cause the stomach li...,C,2,,biology,stemez-Biology,"Question: In certain abnormal conditions, the..."
1,3265,Short sequence by promoter that assists transc...,"[Promoter, Inducer, Repressor, Operator, Inhib...",D,3,,biology,ori_mmlu-high_school_biology,Question: Short sequence by promoter that ass...
2,3024,"Two 30"" plants are crossed, resulting in proge...","[AaBbCcDd×AABBCCDD, AaBbCcEeFf×AaBbCcEeFf, AaB...",E,4,,biology,stemez-Genetics,"Question: Two 30"" plants are crossed, resulti..."
3,2978,Distinguish betweenapoenzymesandcofactors .,[Apoenzymes can perform enzymatic functions wi...,D,3,,biology,stemez-Biology,Question: Distinguish betweenapoenzymesandcof...
4,3256,What hormone is responsible for metamorphosis ...,"[growth hormone, estrogen, thyroxine, insulin,...",C,2,,biology,stemez-Biology,Question: What hormone is responsible for met...


In [64]:
processed_questions = questions_df[['prompted_question', 'answer', 'category']]

In [65]:
processed_questions.to_json("data/processed_questions.jsonl", orient="records", lines=True)

In [66]:
data = processed_questions.copy()

In [67]:
data["model_performance"] = data["category"].map(model_performance_map)

In [68]:
# save data to a jsonl file
data.to_json("data/data.jsonl", orient="records", lines=True)
