## MMLU Categories and Subcategories

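The subject-to-subcategory and subcategory-to-category mappings below are taken from the official MMLU repository: https://github.com/hendrycks/test/blob/master/categories.py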

In [2]:
import os
import pandas as pd

import tiktoken
from PIL import Image, ImageDraw, ImageFont
import random

In [3]:
# MMLU subject name -> list of subcategory labels for that subject.
# Every subject here carries exactly one label; the grouping code reads it
# via index [0].
subcategories = {
    "abstract_algebra": ["math"],
    "anatomy": ["health"],
    "astronomy": ["physics"],
    "business_ethics": ["business"],
    "clinical_knowledge": ["health"],
    "college_biology": ["biology"],
    "college_chemistry": ["chemistry"],
    "college_computer_science": ["computer science"],
    "college_mathematics": ["math"],
    "college_medicine": ["health"],
    "college_physics": ["physics"],
    "computer_security": ["computer science"],
    "conceptual_physics": ["physics"],
    "econometrics": ["economics"],
    "electrical_engineering": ["engineering"],
    "elementary_mathematics": ["math"],
    "formal_logic": ["philosophy"],
    "global_facts": ["other"],
    "high_school_biology": ["biology"],
    "high_school_chemistry": ["chemistry"],
    "high_school_computer_science": ["computer science"],
    "high_school_european_history": ["history"],
    "high_school_geography": ["geography"],
    "high_school_government_and_politics": ["politics"],
    "high_school_macroeconomics": ["economics"],
    "high_school_mathematics": ["math"],
    "high_school_microeconomics": ["economics"],
    "high_school_physics": ["physics"],
    "high_school_psychology": ["psychology"],
    "high_school_statistics": ["math"],
    "high_school_us_history": ["history"],
    "high_school_world_history": ["history"],
    "human_aging": ["health"],
    "human_sexuality": ["culture"],
    "international_law": ["law"],
    "jurisprudence": ["law"],
    "logical_fallacies": ["philosophy"],
    "machine_learning": ["computer science"],
    "management": ["business"],
    "marketing": ["business"],
    "medical_genetics": ["health"],
    "miscellaneous": ["other"],
    "moral_disputes": ["philosophy"],
    "moral_scenarios": ["philosophy"],
    "nutrition": ["health"],
    "philosophy": ["philosophy"],
    "prehistory": ["history"],
    "professional_accounting": ["other"],
    "professional_law": ["law"],
    "professional_medicine": ["health"],
    "professional_psychology": ["psychology"],
    "public_relations": ["politics"],
    "security_studies": ["politics"],
    "sociology": ["culture"],
    "us_foreign_policy": ["politics"],
    "virology": ["health"],
    "world_religions": ["philosophy"],
}

# Top-level MMLU category -> the subcategory labels grouped under it.
# The labels are the same strings used as values in `subcategories`.
categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering"],
    "humanities": ["history", "philosophy", "law"],
    "social sciences": ["politics", "culture", "economics", "geography", "psychology"],
    "other (business, health, misc.)": ["other", "business", "health"],
}

In [4]:
# Invert `subcategories`: for each subcategory label listed in `categories`,
# collect the MMLU subjects whose first label equals it. Keys appear in the
# order the labels are listed in `categories`; a label with no matching
# subject gets no key at all.
subcategory_wise_division = {}
for member_labels in categories.values():
    for label in member_labels:
        for subject_name, subject_labels in subcategories.items():
            if subject_labels[0] != label:
                continue
            subcategory_wise_division.setdefault(label, []).append(subject_name)

print(subcategory_wise_division)

{'physics': ['astronomy', 'college_physics', 'conceptual_physics', 'high_school_physics'], 'chemistry': ['college_chemistry', 'high_school_chemistry'], 'biology': ['college_biology', 'high_school_biology'], 'computer science': ['college_computer_science', 'computer_security', 'high_school_computer_science', 'machine_learning'], 'math': ['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics'], 'engineering': ['electrical_engineering'], 'history': ['high_school_european_history', 'high_school_us_history', 'high_school_world_history', 'prehistory'], 'philosophy': ['formal_logic', 'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy', 'world_religions'], 'law': ['international_law', 'jurisprudence', 'professional_law'], 'politics': ['high_school_government_and_politics', 'public_relations', 'security_studies', 'us_foreign_policy'], 'culture': ['human_sexuality', 'sociology'], 'economics': ['econometrics', 'h

In [5]:
# Select the subcategory to work with and look up the MMLU subjects under it.
major_category = "math"
subcategories_in_major = subcategory_wise_division[major_category]
print(subcategories_in_major)

['abstract_algebra', 'college_mathematics', 'elementary_mathematics', 'high_school_mathematics', 'high_school_statistics']


In [6]:
# Read the test split of every subject in the selected subcategory from the
# Hugging Face hub and preview it.
# NOTE: `df` is rebound on every pass, so once the loop finishes it holds only
# the last subject's frame.
for subcategory_in_major in subcategories_in_major:
    # Relative parquet paths of the three splits published for this subject.
    splits = dict(
        test=f'{subcategory_in_major}/test-00000-of-00001.parquet',
        validation=f'{subcategory_in_major}/validation-00000-of-00001.parquet',
        dev=f'{subcategory_in_major}/dev-00000-of-00001.parquet',
    )
    test_split_url = "hf://datasets/cais/mmlu/" + splits["test"]
    df = pd.read_parquet(test_split_url)

    display(df.head(2))
    print(len(df.index), "questions in", subcategory_in_major)

Unnamed: 0,question,subject,choices,answer
0,Find the degree for the given field extension ...,abstract_algebra,"[0, 4, 2, 6]",1
1,"Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the i...",abstract_algebra,"[8, 2, 24, 120]",2


100 questions in abstract_algebra


Unnamed: 0,question,subject,choices,answer
0,Let k be the number of real solutions of the e...,college_mathematics,"[k = 0 and n = 1, k = 1 and n = 0, k = n = 1, ...",1
1,"Up to isomorphism, how many additive abelian g...",college_mathematics,"[0, 1, 2, 3]",3


100 questions in college_mathematics


Unnamed: 0,question,subject,choices,answer
0,What is the value of p in 24 = 2p?,elementary_mathematics,"[p = 4, p = 8, p = 12, p = 24]",2
1,Ms. Perez drove a total of 40 miles in 5 days....,elementary_mathematics,"[5, 7, 8, 9]",2


378 questions in elementary_mathematics


Unnamed: 0,question,subject,choices,answer
0,"If a pentagon P with vertices at (– 2, – 4), (...",high_school_mathematics,"[(0, – 3), (4, 1), (2, 2), (– 4, –2)]",3
1,The length of a rectangle is twice its width. ...,high_school_mathematics,"[2500, 2, 50, 25]",2


270 questions in high_school_mathematics


Unnamed: 0,question,subject,choices,answer
0,The weight of an aspirin tablet is 300 milligr...,high_school_statistics,"[P(t > 1.54) with df = 6, 2P(t > 1.54) with df...",1
1,The waiting times for a new roller coaster rid...,high_school_statistics,"[0 to 31.7 minutes, 31.7 to 39.3 minutes, 25.3...",2


216 questions in high_school_statistics


In [7]:
# Load the o200k_base encoding and sanity-check it on a short string.
# `enc` is reused by the per-token decode cell that follows.
enc = tiktoken.get_encoding("o200k_base")
print(enc.encode("hello world")) # prints: [24912, 2375]

[24912, 2375]


In [8]:
# Decode every token id on its own to see which piece of text it stands for.
for token_id in enc.encode("hello world"):
    piece = enc.decode([token_id])
    print(piece)

hello
 world


In [9]:
def get_color_palette():
    """Return a fresh list of the five pastel RGB tuples used as token backgrounds."""
    palette = [
        (255, 200, 200),  # light red
        (200, 255, 200),  # light green
        (200, 200, 255),  # light blue
        (255, 255, 200),  # light yellow
        (255, 200, 255),  # light magenta
    ]
    return palette

def visualize_tokens(text, img_width=500, img_height=300, font_path="arial.ttf", output_path="token_highlight.png"):
    """Draw ``text`` as an image with every o200k_base token on its own colored box.

    Tokens are placed left to right in fixed-height rows. A token that would
    cross ``img_width`` is moved to the start of the next row; drawing stops as
    soon as a new row would not fit inside ``img_height``. The finished image
    is written to ``output_path`` and then opened with ``Image.show()``.

    Whitespace is made visible: spaces are drawn as ``•`` and newlines as the
    two characters ``\\n``.

    Args:
        text: String to tokenize and render.
        img_width: Canvas width in pixels.
        img_height: Canvas height in pixels.
        font_path: TrueType font file handed to ``ImageFont.truetype``.
        output_path: File the rendered image is saved to.

    Returns:
        None.

    Raises:
        Any error from ``ImageFont.truetype`` (e.g. the font file cannot be
        loaded) or from ``Image.save`` propagates unchanged.
    """
    enc = tiktoken.get_encoding("o200k_base")
    tokens = enc.encode(text)
    # Decode ids one at a time so each token gets its own box.
    decoded_tokens = [enc.decode([t]) for t in tokens]

    # Choose one color per token *position*, never repeating the previous
    # pick. Keying the colors by token id would let a later occurrence of a
    # token overwrite the color of its earlier occurrences, so two neighbouring
    # boxes could end up with the same background.
    colors = get_color_palette()
    position_colors = []
    last_color = None
    for _ in tokens:
        available_colors = [c for c in colors if c != last_color]
        last_color = random.choice(available_colors)
        position_colors.append(last_color)

    font_size = 40
    font = ImageFont.truetype(font_path, font_size)

    img = Image.new("RGB", (img_width, img_height), (255, 255, 255))  # white, fixed-size canvas
    draw = ImageDraw.Draw(img)

    x_offset, y_offset = 0, 0
    row_height = font_size + 10  # every row has the same height

    for token, fill in zip(decoded_tokens, position_colors):
        # Make whitespace visible. A raw "\n" would be treated by PIL as
        # multi-line text, which does not fit the fixed-height row layout.
        visual_token = token.replace(" ", "•").replace("\n", "\\n")
        bbox = draw.textbbox((0, 0), visual_token, font=font)
        token_width = bbox[2] - bbox[0]

        # Wrap only when something is already on this row: a token wider than
        # the whole canvas cannot fit on any row, so wrapping at x == 0 would
        # just waste a row.
        if x_offset > 0 and x_offset + token_width > img_width:
            x_offset = 0
            y_offset += row_height
            if y_offset + row_height > img_height:
                break  # no vertical room left for another row

        draw.rectangle([x_offset, y_offset, x_offset + token_width, y_offset + row_height], fill=fill)
        draw.text((x_offset, y_offset), visual_token, font=font, fill=(0, 0, 0))
        x_offset += token_width  # boxes touch; no gap between tokens

    img.save(output_path)
    img.show()

# Example usage: render a three-line sample on a 500x300 canvas to check how
# the tokens wrap onto new rows.
visualize_tokens("""hello world this is a test of how well the text wraps and moves to a new line
hello world this is a test of how well the text wraps and moves to a new line
hello hello""", img_width=500, img_height=300, font_path="arial.ttf")

In [10]:
# Expert personas substituted into the prompt, keyed by subcategory label.
list_experts = {
    'math': [
        "Mathematician",
        "PhD Student",
        "Math Professor",
        "Researcher",
        "High School Teacher",
        "Tutor",
        "Statistician",
        "Graduate Student",
        "Postdoctoral Researcher",
        "Lecturer",
        "Data Scientist",
        "Actuary",
        "Mathematics Olympiad Coach",
        "Educational Consultant",
        "Curriculum Developer",
    ],
}

In [11]:
class PromptTemplate:
    """Thin wrapper around a ``str.format``-style template string."""

    def __init__(self, template: str):
        # Kept verbatim; placeholders are only resolved when invoke() is called.
        self.template = template

    def invoke(self, variables: dict):
        """Return the template with its ``{name}`` placeholders filled from ``variables``."""
        rendered = self.template.format(**variables)
        return rendered


# Prompt with two placeholders: the expert persona and the question text.
prompt_template = PromptTemplate(
    "You are a {expert_name}. Answer this question {question} Choose options 0, 1, 2, 3"
)

In [12]:
# `df` is whatever frame the loading loop bound last (the final subject in
# `subcategories_in_major`); display it to inspect the questions used next.
df

Unnamed: 0,question,subject,choices,answer
0,The weight of an aspirin tablet is 300 milligr...,high_school_statistics,"[P(t > 1.54) with df = 6, 2P(t > 1.54) with df...",1
1,The waiting times for a new roller coaster rid...,high_school_statistics,"[0 to 31.7 minutes, 31.7 to 39.3 minutes, 25.3...",2
2,All of the following statements are true for a...,high_school_statistics,"[The possible outcomes must all be numerical.,...",3
3,Which of the following is a true statement abo...,high_school_statistics,[If there is sufficient evidence to reject a n...,3
4,An outlier is an observation that,high_school_statistics,[is seen more frequently than the other observ...,3
...,...,...,...,...
211,Which of the following is the best description...,high_school_statistics,[The probability that the null hypothesis is t...,3
212,A drug company will conduct a randomized contr...,high_school_statistics,"[Patients will spend more money on Heartaid, e...",2
213,Two classes take the same exam. Suppose a cert...,high_school_statistics,[Students in the first class generally scored ...,0
214,The mean thrust of a certain model jet engine ...,high_school_statistics,[99.31% of the engines produced under the new ...,3


In [13]:
# Render one token-highlight image per (question, expert persona) pair for the
# first few questions of the subject currently held in `df`.
MAX_QUESTIONS = 9  # number of leading questions to render

# .iloc[0] is positional, so it works whatever the frame's index labels are.
subject = df['subject'].iloc[0]

subject_dir = os.path.join("visualizations", subject)
os.makedirs(subject_dir, exist_ok=True)

# Iterate only over the questions that will actually be rendered instead of
# walking the whole frame and skipping the rest.
for question_count, question in enumerate(df['question'].head(MAX_QUESTIONS), start=1):
    for expert_name in list_experts['math']:
        output = prompt_template.invoke({"expert_name": expert_name, "question": question})
        # Put the persona in the file name; with the question number alone,
        # every expert's render of a question would overwrite the previous one.
        expert_slug = expert_name.lower().replace(" ", "_")
        output_path = os.path.join(subject_dir, f"{question_count}_{expert_slug}.png")
        # Visualize and save the output
        visualize_tokens(output, output_path=output_path)