### Import & Load Data

In [110]:
import sys
import os
import json
import pandas as pd
import numpy as np
import ast
import itertools
import random
import copy
from datetime import datetime
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append("../")

from src.helpers import io
from src.classes.dataset import Dataset
from src.classes.annotation_set import AnnotationSet
from collections import defaultdict
%load_ext autoreload
%autoreload 2

import sys
import os
import json
import pandas as pd
import numpy as np
import ast
import itertools
import random
import copy
from datetime import datetime
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import seaborn as sns

sys.path.append("../")

from src.helpers import io
from src.classes.dataset import Dataset
from src.classes.annotation_set import AnnotationSet

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [115]:
def get_wildchat_dataset(sample_size=1000):
    """Load the raw WildChat dataset, attach its automatic annotations, and subsample it.

    Args:
        sample_size: Value passed to ``dataset.random_sample`` (default 1000,
            the number that used to be hard-coded). Pass ``None`` to skip
            sampling and keep everything that was loaded.

    Returns:
        The ``Dataset`` with every annotation set from the WildChat annotations
        directory added and, unless ``sample_size`` is ``None``, with
        ``dataset.data`` replaced by the result of ``random_sample``.
    """
    dataset_path = "../data/automatic_annotations_v0/wildchat4k-raw.json"
    annotations_dir = "../data/automatic_annotations_v0/gpto3mini-json-wildchat"

    # Load dataset (w/o annotations)
    # NOTE(review): unlike get_hle_dataset / get_mmlu_dataset, dataset.dataset_id
    # is not overwritten here; the previously unused DATASET_ID = "wildchat_1m"
    # local was dropped. Confirm the id set by Dataset.load is the intended one.
    dataset = Dataset.load(dataset_path)

    # Load annotations into dataset
    for fpath in io.listdir_nohidden(annotations_dir):
        annotation_set = AnnotationSet.load_automatic(path=fpath, source="automatic_v0")
        dataset.add_annotations(annotation_set)

    # Sampling runs after the annotations have been attached.
    # NOTE(review): no seed is set here, so the sample presumably differs
    # between runs — confirm against Dataset.random_sample.
    if sample_size is not None:
        dataset.data = dataset.random_sample(sample_size)
    return dataset

def get_hle_dataset():
    """Load the raw HLE dataset, set its id to ``"hle"``, and attach its automatic annotations.

    Prints the dataset id once, then for each annotation file a "Loading" line
    followed by the file path and the loaded annotation set's ``dataset_id``.

    Returns:
        The ``Dataset`` with every annotation set from the HLE annotations
        directory added.
    """
    raw_path = "../data/automatic_annotations_v0/hle-raw.json"
    annotations_dir = "../data/automatic_annotations_v0/hle"

    # Raw data first (no annotations yet), then tag it with the dataset id
    hle = Dataset.load(raw_path)
    hle.dataset_id = "hle"
    print(hle.dataset_id)

    # One annotation set per non-hidden file in the annotations directory
    for annotation_file in io.listdir_nohidden(annotations_dir):
        print("Loading ", annotation_file)
        loaded = AnnotationSet.load_automatic(path=annotation_file, source="automatic_v0")
        print(annotation_file, loaded.dataset_id)
        hle.add_annotations(loaded)
    return hle

def get_mmlu_dataset():
    """Load the raw MMLU dataset, set its id to ``"mmlu"``, and attach its automatic annotations.

    Prints a start message and the dataset id, then for each annotation file
    the file path and the loaded annotation set's ``dataset_id``.

    Returns:
        The ``Dataset`` with every annotation set from the MMLU annotations
        directory added.
    """
    print("Loading MMLU dataset...")
    raw_path = "../data/automatic_annotations_v0/mmlu-raw.json"
    annotations_dir = "../data/automatic_annotations_v0/mmlu"

    # Raw data first (no annotations yet), then tag it with the dataset id
    mmlu = Dataset.load(raw_path)
    mmlu.dataset_id = "mmlu"
    print(mmlu.dataset_id)

    # One annotation set per non-hidden file in the annotations directory
    for annotation_file in io.listdir_nohidden(annotations_dir):
        loaded = AnnotationSet.load_automatic(path=annotation_file, source="automatic_v0")
        print(annotation_file, loaded.dataset_id)
        mmlu.add_annotations(loaded)
    return mmlu

# Helper to split annotation keys if they are lists in string form
def split_annotation_key(key):
    """Split a stringified Python list into its non-empty, stripped items.

    Args:
        key: An annotation key. When it is the string form of a Python list
            (e.g. ``"['History', 'Geography']"``) it is split into items;
            anything else is treated as a single label.

    Returns:
        A list of strings: each list item converted with ``str`` and stripped,
        with empty results dropped. If ``key`` does not parse as a Python
        literal, or parses to something other than a list, ``[key]`` is
        returned unchanged.
    """
    try:
        # Only the parse can fail, so it is the only statement inside the try
        parsed = ast.literal_eval(key)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval documents for malformed input;
        # a plain label such as "Math & Sciences" lands here.
        return [key]

    if not isinstance(parsed, list):
        return [key]

    stripped_items = (str(item).strip() for item in parsed)
    return [item for item in stripped_items if item]

def aggregate_counts_by_category(annotation_pairs):
    """Sum counts per individual category label.

    Args:
        annotation_pairs: Mapping of annotation key -> count. Each key is
            expanded with ``split_annotation_key``, so a key holding several
            labels contributes its count to every one of them.

    Returns:
        A dict mapping each label to its summed count. Empty labels and labels
        equal to "none" (case-insensitive) are left out.
    """
    totals = {}
    for raw_key, n in annotation_pairs.items():
        for label in split_annotation_key(raw_key):
            is_placeholder = label == '' or label.lower() == 'none'
            if not is_placeholder:
                totals[label] = totals.get(label, 0) + n
    return totals

def group_into_larger_categories(agg_wc, parent_dict=None):
    """Roll fine-grained category counts up into parent categories.

    Args:
        agg_wc: Mapping of child category label -> count.
        parent_dict: Mapping of parent label -> list of child labels. When
            ``None`` (the default) no grouping is applied.

    Returns:
        A ``defaultdict(int)`` of parent label -> summed child counts. A parent
        only appears if at least one of its children is a key of ``agg_wc``;
        children that no parent lists are not counted anywhere. With
        ``parent_dict=None`` a copy of the flat counts is returned instead.
    """
    if parent_dict is None:
        # Previously this crashed on None.items(); with no hierarchy there is
        # nothing to roll up, so return the flat counts in the same container type.
        return defaultdict(int, agg_wc)

    grouped_counts = defaultdict(int)
    for parent, children in parent_dict.items():
        for child in children:
            if child in agg_wc:
                grouped_counts[parent] += agg_wc[child]

    # HACK: callers name the catch-all parent "AOther"; fold it (and any parent
    # literally named "Other") into a single "Other" entry placed last. Only do
    # so when one of them exists, so no spurious zero-valued "Other" is created.
    if "AOther" in grouped_counts or "Other" in grouped_counts:
        grouped_counts["Other"] = grouped_counts.pop("AOther", 0) + grouped_counts.pop("Other", 0)
    return grouped_counts


### Spider Plot Code

In [116]:
import plotly.graph_objects as go

def make_spider_plot(data_to_compare, parent_dict = None, annotation_type = ""):
    """Draw a radar ("spider") chart comparing category counts across datasets.

    Args:
        data_to_compare: Mapping of dataset name -> {category label: count}.
        parent_dict: Optional mapping of parent label -> child labels. When
            given, each dataset's counts are first rolled up with
            ``group_into_larger_categories``.
        annotation_type: Text used (capitalized) in the figure title.

    Returns:
        The plotly ``Figure``. The figure is also displayed via ``fig.show()``,
        and one "Processing dataset" line is printed per dataset.
    """
    if parent_dict is not None:
        data_to_compare = {
            name: group_into_larger_categories(counts, parent_dict=parent_dict)
            for name, counts in data_to_compare.items()
        }

    # Ordered union of the labels of *all* datasets (first-seen order). Taking
    # only the first dataset's keys dropped categories that occur solely in a
    # later dataset, and raised StopIteration on empty input.
    all_categories = list(dict.fromkeys(
        cat for counts in data_to_compare.values() for cat in counts
    ))

    # Remove categories where all datasets have 0 occurrences
    categories = [
        cat for cat in all_categories
        if any(counts.get(cat, 0) > 0 for counts in data_to_compare.values())
    ]

    # Repeat the first point at the end of BOTH r and theta so the outline
    # closes and the two arrays have matching lengths.
    theta = categories + categories[:1]

    # Prepare values for each dataset
    values_to_plot = {}
    for dataset_name, counts in data_to_compare.items():
        print(f"Processing dataset: {dataset_name}")
        values = [counts.get(cat, 0) for cat in categories]
        values_to_plot[dataset_name] = values + values[:1]

    # Make scatterpolar: one filled trace per dataset
    fig = go.Figure(
        data=[
            go.Scatterpolar(
                r=values,
                theta=theta,
                fill='toself',
                name=dataset_name
            ) for dataset_name, values in values_to_plot.items()
        ]
    )
    fig.update_layout(
        polar=dict(
            radialaxis=dict(visible=True)
        ),
        title=f"{annotation_type.capitalize()} Comparison",
        showlegend=True,
        width=1500,
        height=800
    )
    fig.show()

    return fig


### Spider Plot for Topics

In [117]:
# Load the three datasets (raw data plus automatic annotations)
wildchat_dataset = get_wildchat_dataset()
hle_dataset = get_hle_dataset()
mmlu_dataset = get_mmlu_dataset()

# Define the parent-child relationships for the categories
topic_parent_dict = {
    "Math, Science, Engineering, and Technology": [
        "Math & Sciences",
        "Technology, Software & Computing",
        "Non-software Engineering & Infrastructure",
        "Nature & Environment",
    ],
    "Business, Finance, and Economics": [
        "Economics",
        "Business & Finances",
    ],
    "Arts, Culture & Entertainment": [
        "Art & Design",
        "Culture",
        "Entertainment, Hobbies & Leisure",
        "Fantasy / Fiction / Fanfiction",
        "Literature & Writing",
        "Video Games",
        "Fashion & Beauty",
    ],
    "Society & Lifestyle": [
        "Food & Dining",
        "Travel & Tourism",
        "Lifestyle",
        "Sports",
        "Interpersonal Relationships & Communication",
        "Transportation",
    ],
    "Social Sciences & Humanities": [
        "History",
        "Linguistics & Languages",
        "Psychology, Philosophy & Human Behavior",
        "Religion & Spirituality",
        "Geography",
        "Education",
    ],
    "News, Politics, and Current Events": [
        "Politics & Elections",
        "Social Issues & Movements",
        "News & Current Affairs",
    ],
    # Label fixed: it used to end with a stray ":" that showed up on the plot axis
    "High-Stakes Decision Making": [
        "Employment & Hiring",
        "Insurance & Social Scoring",
        "Law, Criminal Justice, Law Enforcement",
        "Housing",
        "Immigration / Migration",
        "Health & Medicine",
    ],
    "Adult & Sensitive Content": [
        "Adult & Illicit Content",
    ],
    # group_into_larger_categories folds "AOther" into the "Other" entry
    "AOther": [
        "Other",
        "None",
    ],
}

# The same query is run against each dataset; only the dataset differs
topic_distribution_kwargs = dict(
    level="message",
    annotation_source="automatic_v0",
    annotation_as_list_type=True,
)
annotation_pairs1 = wildchat_dataset.get_annotation_distribution(
    "turn_topic", **topic_distribution_kwargs
)
annotation_pairs2 = hle_dataset.get_annotation_distribution(
    "turn_topic", **topic_distribution_kwargs
)
annotation_pairs3 = mmlu_dataset.get_annotation_distribution(
    "turn_topic", **topic_distribution_kwargs
)

datasets = {
    "WildChat": annotation_pairs1,
    "Humanity's Last Exam": annotation_pairs2,
    "MMLU": annotation_pairs3
}

# Aggregate counts into categories
topic_agg = {}
for dataset_name, annotation_pairs in datasets.items():
    print(f"Processing dataset: {dataset_name}")
    agg_wc = aggregate_counts_by_category(annotation_pairs)
    topic_agg[dataset_name] = agg_wc

make_spider_plot(data_to_compare = topic_agg, parent_dict=topic_parent_dict, annotation_type = "topic")

prompt-multi_turn_relationship: 0 / 10127 failed due to invalid annotations.
prompt-interaction_features: 0 / 10127 failed due to invalid annotations.
turn-sensitive_use_flags: 0 / 10127 failed due to invalid annotations.
turn-topic: 1 / 10127 failed due to invalid annotations.
response-interaction_features: 0 / 10127 failed due to invalid annotations.
prompt-function_purpose: 6 / 10127 failed due to invalid annotations.
prompt-media_format: 0 / 10127 failed due to invalid annotations.
response-media_format: 0 / 10127 failed due to invalid annotations.
response-answer_form: 0 / 10127 failed due to invalid annotations.
hle
Loading  ../data/automatic_annotations_v0/hle/prompt_interaction_features.jsonl
prompt-interaction_features: 1 / 1000 failed due to invalid annotations.
../data/automatic_annotations_v0/hle/prompt_interaction_features.jsonl hle
Loading  ../data/automatic_annotations_v0/hle/turn_sensitive_use_flags.jsonl
turn-sensitive_use_flags: 0 / 1000 failed due to invalid annotati

### IN PROGRESS: Heatmap

In [39]:
# Hard-coded per-dataset counts, keyed by dataset name and then by fine-grained
# label. The statements further down in this cell roll these labels up with
# `parent_dict` and rebind `agg_wc` to the grouped result.
# NOTE(review): presumably pasted from an earlier aggregation run over the
# function/purpose annotations — confirm the source before reusing the numbers.
agg_wc = {'WildChat': {'Translation (language to language)': 54,
  'Information analysis (Content explanation / interpretation)': 226,
  'Advice, Guidance, & Recommendations (Instructions / How-to)': 175,
  'Content generation (code)': 191,
  'Content generation (general prose, discussion or explanation)': 231,
  'Content generation (creative / fiction writing)': 492,
  'Information retrieval (general info from web)': 224,
  'No clear task': 271,
  'Advice, Guidance, & Recommendations (Social and personal advice)': 12,
  'Content generation (administrative writing)': 25,
  'Content generation (academic / essay writing)': 89,
  'Editorial & formatting (Natural language style or re-formatting)': 26,
  'Editorial & formatting (Content expansion)': 9,
  'Editorial & formatting (Natural language content editing)': 88,
  'Reasoning (Other general problem solving)': 10,
  'Reasoning (Mathematical or numerical problem solving)': 36,
  'Editorial & formatting (Content summarization)': 29,
  'Content generation (brainstorming / ideation)': 30,
  'Content generation (prompts for another AI system)': 105,
  'Role-play / social simulation (platonic companion / friend)': 15,
  'Content generation (other)': 68,
  'Information analysis (Content quality review or assessment)': 8,
  'Information retrieval (general info from prompt content)': 38,
  'Role-play / social simulation (simulation of real person / celebrity)': 8,
  'Role-play / social simulation (romantic companion)': 29,
  'Other': 7,
  'Information analysis (Ranking or Scoring)': 1,
  'Advice, Guidance, & Recommendations (Professional advice)': 10,
  'Editorial & formatting (Information processing & re-formatting)': 14,
  'Reasoning (Verbal problems, logic games, puzzles or riddles)': 3,
  'Content generation (code documentation)': 1,
  'Editorial & formatting (Code style and re-formatting)': 2,
  'Advice, Guidance, & Recommendations (Activity / product recommendations)': 13,
  'Information analysis (Content Classification)': 3,
  'Editorial & formatting (Code content editing)': 5,
  'Role-play / social simulation (therapist / coach)': 1,
  'Information analysis (Other content analysis / description)': 1},
 "Humanity's Last Exam": {'Information retrieval (general info from web)': 113,
  'Reasoning (Mathematical or numerical problem solving)': 575,
  'Content generation (general prose, discussion or explanation)': 14,
  'Information analysis (Content explanation / interpretation)': 98,
  'Reasoning (Other general problem solving)': 113,
  'Reasoning (Verbal problems, logic games, puzzles or riddles)': 82,
  'No clear task': 26,
  'Information analysis (Other content analysis / description)': 46,
  'Information retrieval (general info from prompt content)': 57,
  'Content generation (code)': 16,
  'Content generation (academic / essay writing)': 11,
  'Information analysis (Content Classification)': 18,
  'Content generation (code documentation)': 3,
  'Information analysis (Content quality review or assessment)': 4,
  'Content generation (other)': 3,
  'Other': 21,
  'Editorial & formatting (Code content editing)': 2,
  'Advice, Guidance, & Recommendations (Professional advice)': 4,
  'Content generation (creative / fiction writing)': 3,
  'Advice, Guidance, & Recommendations (Instructions / How-to)': 4,
  'Role-play / social simulation (platonic companion / friend)': 1,
  'Advice, Guidance, & Recommendations (Social and personal advice)': 1},
 'MMLU': {'Reasoning (Mathematical or numerical problem solving)': 133,
  'Reasoning (Verbal problems, logic games, puzzles or riddles)': 494,
  'Information analysis (Content explanation / interpretation)': 167,
  'Information retrieval (general info from prompt content)': 114,
  'Information retrieval (general info from web)': 107,
  'Reasoning (Other general problem solving)': 100,
  'Information analysis (Content quality review or assessment)': 14,
  'Information analysis (Content Classification)': 51,
  'Content generation (academic / essay writing)': 6,
  'Information analysis (Other content analysis / description)': 28,
  'Information analysis (Ranking or Scoring)': 3,
  'Advice, Guidance, & Recommendations (Social and personal advice)': 1,
  'Content generation (creative / fiction writing)': 1,
  'Other': 11,
  'No clear task': 17,
  'Content generation (general prose, discussion or explanation)': 3,
  'Advice, Guidance, & Recommendations (Professional advice)': 2,
  'Content generation (other)': 1,
  'Advice, Guidance, & Recommendations (Instructions / How-to)': 3}}


# Parent category -> fine-grained labels, used to roll the flat `agg_wc` counts
# up before plotting. Each child label is the parent name followed by a
# parenthesised sub-type, except for the final catch-all group.
# A child listed here that never occurs in the counts simply contributes nothing.
parent_dict = {
    "Information Analysis": [
        'Information analysis (Content explanation / interpretation)',
        'Information analysis (Other content analysis / description)',
        'Information analysis (Content Classification)',
        'Information analysis (Ranking or Scoring)',
        'Information analysis (Content quality review or assessment)',
    ],
    "Advice, Guidance, & Recommendations": [
        'Advice, Guidance, & Recommendations (Activity / product recommendations)',
        'Advice, Guidance, & Recommendations (Instructions / How-to)',
        'Advice, Guidance, & Recommendations (Professional advice)',
        'Advice, Guidance, & Recommendations (Social and personal advice)',
        'Advice, Guidance, & Recommendations (Action planning (scheduling, robotics))',
    ],
    "Information Retrieval": [
        'Information retrieval (general info from web)',
        'Information retrieval (general info from prompt content)',
    ],
    "Content Generation": [
        'Content generation (other)',
        'Content generation (general prose, discussion or explanation)',
        'Content generation (creative / fiction writing)',
        'Content generation (code)',
        'Content generation (academic / essay writing)',
        'Content generation (prompts for another AI system)',
        'Content generation (administrative writing)',
        'Content generation (brainstorming / ideation)',
        'Content generation (code documentation)',
    ],
    "Translation": [
        'Translation (language to language)',
    ],
    "Editorial & Formatting": [
        'Editorial & formatting (Content expansion)',
        'Editorial & formatting (Content summarization)',
        'Editorial & formatting (Natural language content editing)',
        'Editorial & formatting (Information processing & re-formatting)',
        'Editorial & formatting (Code style and re-formatting)',
        'Editorial & formatting (Natural language style or re-formatting)',
        'Editorial & formatting (Code content editing)',
    ],
    "Role-play / Social Simulation": [
        'Role-play / social simulation (romantic companion)',
        'Role-play / social simulation (platonic companion / friend)',
        'Role-play / social simulation (simulation of real person / celebrity)',
        'Role-play / social simulation (therapist / coach)',
    ],
    "Reasoning": [
        'Reasoning (Other general problem solving)',
        'Reasoning (Mathematical or numerical problem solving)',
        'Reasoning (Verbal problems, logic games, puzzles or riddles)',
    ],
    # Catch-all group: both the explicit "Other" label and "No clear task"
    "Other / No Clear Task": [
        'No clear task',
        'Other',
    ]
}

# Roll the flat per-dataset counts up into their parent categories.
data_to_compare = agg_wc
if parent_dict is not None:
    grouped_data_to_compare = {
        dataset_label: group_into_larger_categories(flat_counts, parent_dict=parent_dict)
        for dataset_label, flat_counts in data_to_compare.items()
    }
    # Category labels are taken from the first dataset's grouped result
    categories = list(next(iter(grouped_data_to_compare.values())))
    data_to_compare = grouped_data_to_compare
    # From here on `agg_wc` refers to the grouped counts, not the flat ones
    agg_wc = grouped_data_to_compare

In [105]:
import plotly.express as px

# Data: WildChat is the reference every other dataset is compared against
baseline = agg_wc["WildChat"]

# Only the two benchmarks become rows; the baseline itself is not included
# (its "WildChat (Baseline)" entry was commented out).
comparisons = {
    benchmark: agg_wc[benchmark]
    for benchmark in ("Humanity's Last Exam", "MMLU")
}

def make_heatmap(comparisons, baseline, title, width, height=600, zmin=-1000, zmax=1000):
    """Show a heatmap of per-category count differences against a baseline.

    Each row is one entry of ``comparisons``; each column is a category taken
    from the union of all keys (sorted). A cell holds ``comparison - baseline``
    for that category, with missing categories counted as 0.

    Args:
        comparisons: Mapping of row name -> {category: count}. It is only read;
            the caller's dict is no longer overwritten with zero-filled copies.
        baseline: {category: count} that every row is compared against.
        title: Figure title (centered).
        width: Figure width in pixels.
        height: Figure height in pixels (default 600, the previous fixed value).
        zmin: Difference mapped to the low end of the colorscale (default -1000).
        zmax: Difference mapped to the high end of the colorscale (default 1000).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Column order: sorted union of every category seen in any input
    columns = sorted(set(baseline).union(*comparisons.values()))

    # One row of differences per comparison; absent categories count as 0
    names = list(comparisons)
    z = [
        [counts.get(col, 0) - baseline.get(col, 0) for col in columns]
        for counts in comparisons.values()
    ]

    # Diverging colorscale. Positions are fractions of the [zmin, zmax] range,
    # so 0.5 (grey) corresponds to a zero difference when the range is symmetric.
    colorscale = [
        [0.0, "darkblue"],        # zmin
        [0.4, "lightsteelblue"],  # 40% of the way from zmin to zmax
        [0.5, "lightgrey"],       # midpoint
        [0.9, "salmon"],          # 90% of the way from zmin to zmax
        [1.0, "red"]              # zmax
    ]

    fig = px.imshow(
        z,
        text_auto=True,
        x=columns,
        y=names,
        color_continuous_scale=colorscale,
        zmin=zmin,
        zmax=zmax
    )

    # Break long labels after " and " / "&" so they fit horizontally
    ticktext = [
        label.replace(' and ', ' and<br>').replace('&', '&<br>')
        for label in columns
    ]
    fig.update_xaxes(
        tickangle=0,
        ticktext=ticktext,
        tickvals=list(range(len(columns))),
        title_font=dict(size=24)
    )
    fig.update_yaxes(title_font=dict(size=24))
    fig.update_layout(
        coloraxis_colorscale=colorscale,
        width=width,
        height=height,
        title_text=title,
        title_x=0.5,
        xaxis_side="top"
    )

    fig.show()

# Render the heatmap for the grouped counts: one row per entry in `comparisons`,
# each cell showing that dataset's count minus the WildChat baseline count.
make_heatmap(comparisons, baseline, title = "Benchmark Purposes Compared to Natural Use", width =2000)

In [101]:
# Hard-coded interaction-feature counts per dataset (label -> count)
interactions = {
    "WildChat": {
        "Courtesy/Politeness": 191,
        "Reinforcement/Praise/Scolding": 35,
        "Role-assignment": 212,
        "Jailbreak attempt": 35,
        "Companionship": 6,
    },
    "Humanity's Last Exam": {
        "Courtesy/Politeness": 9,
        "Jailbreak attempt": 3,
        "Role-assignment": 4,
    },
    "MMLU": {
        "Jailbreak attempt": 2,
        "Role-assignment": 2,
    },
}

# The two benchmarks are the rows; WildChat is the baseline they are compared to
comparisons = {
    benchmark: interactions[benchmark]
    for benchmark in ("Humanity's Last Exam", "MMLU")
}
make_heatmap(
    comparisons,
    interactions["WildChat"],
    title="Benchmark Interaction Features Compared to Natural Use",
    width=1500,
)


In [100]:
# Hard-coded sensitive-use flag counts per dataset (label -> count).
# make_heatmap builds its columns from the union of these keys, so every
# distinct key string becomes its own column.
# NOTE(review): several keys look like spelling variants of the same flag and
# will therefore be plotted as separate columns — confirm whether they should
# be merged before plotting:
#   - 'Cyberattacks' vs 'Cyberattacks (enabling/enacting malware, ...)'
#   - 'Criminal planning ... not listed' vs '... not listed elsewhere'
#   - 'Privacy concerns (Possible sensitive information)' vs the MMLU key with
#     the extra '(e.g., API keys, passwords, ...)' suffix
sensitive_use = {'WildChat': {'Potential violation of external policy / ethics': 1,
  'Sexually explicit content (Other)': 97,
  'Weapons & drugs': 8,
  'Inciting violence, hateful or other harmful behavior (physical harm)': 23,
  'Cyberattacks (enabling/enacting malware, computer viruses, worms, malicious code, ...)': 6,
  'Cyberattacks': 1,
  'Inciting violence, hateful or other harmful behavior (harassment & bullying)': 48,
  'Sexually explicit content (fictitious person)': 37,
  'Sexually explicit content (Request/discussion of CSAM)': 3,
  'Possible presence of copyrighted, unreferenced material': 2,
  'Discriminatory practices (Misrepresentation, stereotyping, or inappropriate reference to sensitive attributes)': 14,
  'Sexually explicit content (real person)': 7,
  'Privacy concerns (Possible identifiable information)': 3,
  'Generating defamatory content': 4,
  'Criminal planning or other suspected illegal activity not listed elsewhere': 6,
  'Privacy concerns (Possible sensitive information)': 1,
  'Criminal planning or other suspected illegal activity not listed': 1},
 "Humanity's Last Exam": {'Privacy concerns (Possible identifiable information)': 5,
  'Possible presence of copyrighted, unreferenced material': 2,
  'Discriminatory practices (Misrepresentation, stereotyping, or inappropriate reference to sensitive attributes)': 3,
  'CBRN-related outputs': 2,
  'Cyberattacks (enabling/enacting malware, computer viruses, worms, malicious code, ...)': 2,
  'Inciting violence, hateful or other harmful behavior (physical harm)': 2,
  'Output misrepresentation (Automated decision-making without disclosure)': 1,
  'Sexually explicit content (Other)': 1,
  'Privacy concerns (Possible sensitive information)': 1,
  'Inciting violence, hateful or other harmful behavior (harassment & bullying)': 1,
  'Weapons & drugs': 1,
  'Inciting violence, hateful or other harmful behavior (self-harm)': 1},
 'MMLU': {'Cyberattacks (enabling/enacting malware, computer viruses, worms, malicious code, ...)': 3,
  'Sexually explicit content (Other)': 3,
  'Weapons & drugs': 16,
  'CBRN-related outputs': 7,
  'Inciting violence, hateful or other harmful behavior (physical harm)': 27,
  'Criminal planning or other suspected illegal activity not listed elsewhere': 23,
  'Potential violation of external policy / ethics': 3,
  'Possible presence of copyrighted, unreferenced material': 12,
  'Privacy concerns (Possible sensitive information)': 3,
  'Inciting violence, hateful or other harmful behavior (self-harm)': 1,
  'Inciting violence, hateful or other harmful behavior (harassment & bullying)': 8,
  'Generating defamatory content': 2,
  'Sexually explicit content (Request/discussion of CSAM)': 1,
  'Privacy concerns (Possible identifiable information)': 3,
  'Other': 10,
  'Discriminatory practices (Misrepresentation, stereotyping, or inappropriate reference to sensitive attributes)': 5,
  'Sexually explicit content (real person)': 3,
  'Misinformation': 1,
  'Privacy concerns (Possible sensitive information) (e.g., API keys, passwords, other confidential information)': 1}}

# The two benchmarks are the rows; WildChat is the baseline they are compared to.
# No parent grouping is applied here: the raw labels are plotted directly.
comparisons = {
    "Humanity's Last Exam": sensitive_use["Humanity's Last Exam"],
    "MMLU": sensitive_use["MMLU"]
}
make_heatmap(comparisons, sensitive_use["WildChat"], title = "Sensitive Use Prevalence Compared to Natural Use", width = 2000)

In [107]:
# Hard-coded topic counts per dataset (fine-grained topic label -> count).
# These are rolled up with the topic `parent_dict` defined right below.
# NOTE(review): 'Games' and 'Philosophy' (Humanity's Last Exam) and
# 'Philosophy & Human Behavior' (MMLU) are not listed under any parent in that
# `parent_dict`, so group_into_larger_categories leaves their counts out of the
# grouped totals — confirm whether they should be mapped to a parent.
topics = {'WildChat': {'Fantasy / Fiction / Fanfiction': 250,
  'Technology, Software & Computing': 614,
  'Art & Design': 167,
  'Sports': 40,
  'Entertainment, Hobbies & Leisure': 132,
  'Literature & Writing': 138,
  'Video Games': 68,
  'Health & Medicine': 162,
  'Math & Sciences': 99,
  'Adult & Illicit Content': 205,
  'Business & Finances': 182,
  'Education': 123,
  'Linguistics & Languages': 134,
  'Housing': 16,
  'Food & Dining': 27,
  'Interpersonal Relationships & Communication': 106,
  'Psychology, Philosophy & Human Behavior': 52,
  'Other': 15,
  'Nature & Environment': 38,
  'Economics': 34,
  'Social Issues & Movements': 27,
  'Culture': 54,
  'History': 80,
  'Fashion & Beauty': 32,
  'Politics & Elections': 39,
  'Law, Criminal Justice, Law Enforcement': 43,
  'Employment & Hiring': 42,
  'Religion & Spirituality': 35,
  'Travel & Tourism': 26,
  'Non-software Engineering & Infrastructure': 25,
  'Lifestyle': 25,
  'Transportation': 19,
  'News & Current Affairs': 13,
  'Geography': 13,
  'Immigration / Migration': 1,
  'Insurance & Social Scoring': 9},
 "Humanity's Last Exam": {'Culture': 16,
  'History': 35,
  'Math & Sciences': 767,
  'Interpersonal Relationships & Communication': 8,
  'Psychology, Philosophy & Human Behavior': 19,
  'Geography': 24,
  'Linguistics & Languages': 34,
  'Literature & Writing': 43,
  'Non-software Engineering & Infrastructure': 42,
  'Technology, Software & Computing': 73,
  'Nature & Environment': 14,
  'Health & Medicine': 51,
  'Transportation': 4,
  'Fantasy / Fiction / Fanfiction': 19,
  'Entertainment, Hobbies & Leisure': 36,
  'Business & Finances': 13,
  'Law, Criminal Justice, Law Enforcement': 13,
  'Religion & Spirituality': 5,
  'Sports': 3,
  'Art & Design': 12,
  'Politics & Elections': 6,
  'Games': 3,
  'Housing': 6,
  'Education': 6,
  'Travel & Tourism': 1,
  'Economics': 10,
  'Video Games': 7,
  'Food & Dining': 3,
  'Lifestyle': 2,
  'Social Issues & Movements': 1,
  'Other': 1,
  'Philosophy': 1,
  'Employment & Hiring': 2},
 'MMLU': {'Business & Finances': 76,
  'Economics': 109,
  'Math & Sciences': 270,
  'Psychology, Philosophy & Human Behavior': 174,
  'Art & Design': 7,
  'Law, Criminal Justice, Law Enforcement': 187,
  'History': 78,
  'Politics & Elections': 79,
  'Education': 109,
  'Housing': 16,
  'Social Issues & Movements': 67,
  'Health & Medicine': 126,
  'Technology, Software & Computing': 31,
  'Sports': 10,
  'Geography': 43,
  'News & Current Affairs': 16,
  'Interpersonal Relationships & Communication': 81,
  'Religion & Spirituality': 33,
  'Literature & Writing': 14,
  'Linguistics & Languages': 17,
  'Culture': 35,
  'Employment & Hiring': 16,
  'Adult & Illicit Content': 21,
  'Non-software Engineering & Infrastructure': 10,
  'Food & Dining': 10,
  'Travel & Tourism': 2,
  'Philosophy & Human Behavior': 2,
  'Nature & Environment': 20,
  'Fantasy / Fiction / Fanfiction': 3,
  'Transportation': 5,
  'Lifestyle': 8,
  'Immigration / Migration': 7,
  'Entertainment, Hobbies & Leisure': 11,
  'Insurance & Social Scoring': 3,
  'Video Games': 2}}

# Parent topic -> fine-grained topic labels, used to roll up `topics` below.
# This rebinds the name `parent_dict`, which an earlier cell uses for a
# different (purpose) hierarchy, so the result depends on cell execution order.
parent_dict = {
    "Math, Science, Engineering, and Technology": [
        "Math & Sciences",
        "Technology, Software & Computing",
        "Non-software Engineering & Infrastructure",
        "Nature & Environment", 
    ],
    "Business, Finance, and Economics": [
        "Economics",
        "Business & Finances",
    ], 
    "Arts, Culture & Entertainment": [
        "Art & Design",
        "Culture",
        "Entertainment, Hobbies & Leisure",
        "Fantasy / Fiction / Fanfiction",
        "Literature & Writing",
        "Video Games",
        "Fashion & Beauty"
    ],
    "Society & Lifestyle": [
        "Food & Dining",
        "Travel & Tourism",
        "Lifestyle",
        "Sports",
        "Interpersonal Relationships & Communication",
        "Transportation"
    ],
    "Social Sciences & Humanities": [
        "History",
        "Linguistics & Languages",
        "Psychology, Philosophy & Human Behavior",
        "Religion & Spirituality",   
        "Geography", 
        "Education"
    ],
    "News, Politics, and Current Events": [
        "Politics & Elections",
        "Social Issues & Movements",
        "News & Current Affairs",
    ],
    "High-Stakes Decision Making": [
        "Employment & Hiring",
        "Insurance & Social Scoring",
        "Law, Criminal Justice, Law Enforcement",
        "Housing", 
        "Immigration / Migration", 
        "Health & Medicine"
        ],
    "Adult & Sensitive Content": [
        "Adult & Illicit Content",         
    ],
    # group_into_larger_categories folds the "AOther" parent into "Other"
    "AOther": ["Other", 
            "None"]
}

# Roll the flat topic counts up into their parent categories.
data_to_compare = topics
if parent_dict is not None:
    grouped_data_to_compare = {
        dataset_label: group_into_larger_categories(flat_counts, parent_dict=parent_dict)
        for dataset_label, flat_counts in data_to_compare.items()
    }
    # Category labels are taken from the first dataset's grouped result
    categories = list(next(iter(grouped_data_to_compare.values())))
    data_to_compare = grouped_data_to_compare

# WildChat is the baseline; the two benchmarks are the heatmap rows
baseline = data_to_compare["WildChat"]
comparisons = {
    benchmark: data_to_compare[benchmark]
    for benchmark in ("Humanity's Last Exam", "MMLU")
}
make_heatmap(
    comparisons=comparisons,
    baseline=baseline,
    title="Topic Distribution Compared to Natural Use",
    width=2000,
)