In [1]:
import ast
import os
import string

import json
import pandas as pd

In [2]:
# Schema of the router dataset: column name -> pandas dtype.
# score_i holds the per-sample score of the i-th model listed below.
columns = {
    "prompt": "string",
    "task": "string",  # benchmark the prompt comes from
    "score_0": "float",  # model 0: Eurdem__Defne-llama3.1-8B
    "score_1": "float",  # model 1: Locutusque__Hercules-6.1-Llama-3.1-8B
    "score_2": "float",  # model 2: Nekochu__Llama-3.1-8B-German-ORPO
    "score_3": "float",  # model 3: VAGOsolutions__Llama-3.1-SauerkrautLM-8b-Instruct
    "score_4": "float",  # model 4: ValiantLabs__Llama3.1-8B-ShiningValiant2
    "score_5": "float",  # model 5: arcee-ai__Llama-Spark
    "score_6": "float",  # model 6: meta-llama__Llama-3.1-8B-Instruct
    "labels": "object",  # 'object' dtype so that each cell can hold a list
}

# Start from a frame with zero rows in which every column already has its dtype.
typed_empty_columns = dict(
    (name, pd.Series(dtype=kind)) for name, kind in columns.items()
)
df = pd.DataFrame(typed_empty_columns)

# Print the (still empty) frame
print(df)

Empty DataFrame
Columns: [prompt, task, score_0, score_1, score_2, score_3, score_4, score_5, score_6, labels]
Index: []


## The first model (Eurdem__Defne-llama3.1-8B)
### BBH data

In [3]:
# Collect every file in the first model's output folder whose name starts with
# "samples_leaderboard_bbh".
path = "/workspace/code/lm-evaluation-harness/outputs/Eurdem__Defne-llama3.1-8B"  # replace with your own path
# Sorted so that the row order does not depend on the order os.listdir returns.
files = sorted(f for f in os.listdir(path) if f.startswith("samples_leaderboard_bbh"))

# Read each JSON-lines file and gather one record per sample. The records are
# appended to `df` with a single concat afterwards: calling pd.concat inside the
# loop copies the whole frame for every sample, which is quadratic in the number
# of rows.
bbh_rows = []
for file_name in files:
    file_path = os.path.join(path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            doc = json.loads(line)
            bbh_rows.append(
                {
                    "prompt": f'Q: {doc["doc"]["input"]}\n\n  A:',
                    "task": "bbh",
                    "score_0": doc.get("acc_norm"),
                }
            )

# Columns absent from the new rows (score_1..score_6, labels) are filled with NaN.
# The cast keeps score_0 numeric even when a sample has no "acc_norm" entry.
# Note: re-running this cell appends the same rows again.
if bbh_rows:
    df = pd.concat(
        [df, pd.DataFrame(bbh_rows).astype({"score_0": "float64"})],
        ignore_index=True,
    )

# Show the first rows of the frame
print(df.head())

                                              prompt task  score_0  score_1  \
0            Q: not ( True ) and ( True ) is\n\n  A:  bbh      1.0      NaN   
1       Q: not True or ( False and True ) is\n\n  A:  bbh      1.0      NaN   
2  Q: not not False and not not not False is\n\n  A:  bbh      1.0      NaN   
3        Q: ( True and not not not True ) is\n\n  A:  bbh      0.0      NaN   
4        Q: ( True ) and True or not True is\n\n  A:  bbh      0.0      NaN   

   score_2  score_3  score_4  score_5  score_6 labels  
0      NaN      NaN      NaN      NaN      NaN    NaN  
1      NaN      NaN      NaN      NaN      NaN    NaN  
2      NaN      NaN      NaN      NaN      NaN    NaN  
3      NaN      NaN      NaN      NaN      NaN    NaN  
4      NaN      NaN      NaN      NaN      NaN    NaN  


### GPQA data

In [4]:
# Collect every file in the first model's output folder whose name starts with
# "samples_leaderboard_gpqa".
path = "/workspace/code/lm-evaluation-harness/outputs/Eurdem__Defne-llama3.1-8B"  # replace with your own path
# Sorted so that the row order does not depend on the order os.listdir returns.
files = sorted(f for f in os.listdir(path) if f.startswith("samples_leaderboard_gpqa"))

# Read each JSON-lines file and gather one record per sample; the records are
# appended to `df` with a single concat afterwards (pd.concat inside the loop
# copies the whole frame for every sample).
gpqa_rows = []
for file_name in files:
    file_path = os.path.join(path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            doc = json.loads(line)
            item = doc["doc"]
            # NOTE(review): the correct answer is always listed as choice (A)
            # in this prompt text -- confirm that this is intended.
            prompt = (
                f'What is the correct answer to this question:{item["Question"]}\n'
                f'Choices:\n'
                f'(A) {item["Correct Answer"]}\n'
                f'(B) {item["Incorrect Answer 1"]}\n'
                f'(C) {item["Incorrect Answer 2"]}\n'
                f'(D) {item["Incorrect Answer 3"]}\n'
                f'Answer: '
            )
            gpqa_rows.append(
                {"prompt": prompt, "task": "gpqa", "score_0": doc.get("acc_norm")}
            )

# Columns absent from the new rows (score_1..score_6, labels) are filled with NaN.
# The cast keeps score_0 numeric even when a sample has no "acc_norm" entry.
# Note: re-running this cell appends the same rows again.
if gpqa_rows:
    df = pd.concat(
        [df, pd.DataFrame(gpqa_rows).astype({"score_0": "float64"})],
        ignore_index=True,
    )

# Show the last rows of the frame
print(df.tail())

                                                 prompt  task  score_0  \
6948  What is the correct answer to this question:A ...  gpqa      0.0   
6949  What is the correct answer to this question:We...  gpqa      0.0   
6950  What is the correct answer to this question:Th...  gpqa      0.0   
6951  What is the correct answer to this question:Wh...  gpqa      1.0   
6952  What is the correct answer to this question:Wh...  gpqa      0.0   

      score_1  score_2  score_3  score_4  score_5  score_6 labels  
6948      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
6949      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
6950      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
6951      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
6952      NaN      NaN      NaN      NaN      NaN      NaN    NaN  


### IFEval data

In [5]:
# Collect every file in the first model's output folder whose name starts with
# "samples_leaderboard_ifeval".
path = "/workspace/code/lm-evaluation-harness/outputs/Eurdem__Defne-llama3.1-8B"  # replace with your own path
# Sorted so that the row order does not depend on the order os.listdir returns.
files = sorted(f for f in os.listdir(path) if f.startswith("samples_leaderboard_ifeval"))

# Read each JSON-lines file and gather one record per sample; the records are
# appended to `df` with a single concat afterwards (pd.concat inside the loop
# copies the whole frame for every sample).
ifeval_rows = []
for file_name in files:
    file_path = os.path.join(path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            doc = json.loads(line)
            # A truthy "prompt_level_strict_acc" becomes 1.0; a falsy or
            # missing value becomes 0.0.
            passed = doc.get("prompt_level_strict_acc", False)
            ifeval_rows.append(
                {
                    "prompt": doc["doc"]["prompt"],
                    "task": "ifeval",
                    "score_0": 1.0 if passed else 0.0,
                }
            )

# Columns absent from the new rows (score_1..score_6, labels) are filled with NaN.
# Note: re-running this cell appends the same rows again.
if ifeval_rows:
    df = pd.concat(
        [df, pd.DataFrame(ifeval_rows).astype({"score_0": "float64"})],
        ignore_index=True,
    )

# Show the last rows of the frame
print(df.tail())

                                                 prompt    task  score_0  \
7489  Come up with 3 names for a 2B software company...  ifeval      1.0   
7490  Write an itinerary for a 10-day trip to Biratn...  ifeval      1.0   
7491  Given the sentence "The dog barked at the cat,...  ifeval      1.0   
7492  What is the name of the green-eyed monster tha...  ifeval      0.0   
7493  If a + b = 10. And a > b. Is a = 6? Your answe...  ifeval      1.0   

      score_1  score_2  score_3  score_4  score_5  score_6 labels  
7489      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
7490      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
7491      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
7492      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
7493      NaN      NaN      NaN      NaN      NaN      NaN    NaN  


### MATH data

In [6]:
# Collect every file in the first model's output folder whose name starts with
# "samples_leaderboard_math".
path = "/workspace/code/lm-evaluation-harness/outputs/Eurdem__Defne-llama3.1-8B"  # replace with your own path
# Sorted so that the row order does not depend on the order os.listdir returns.
files = sorted(f for f in os.listdir(path) if f.startswith("samples_leaderboard_math"))

# Read each JSON-lines file and gather one record per sample; the records are
# appended to `df` with a single concat afterwards (pd.concat inside the loop
# copies the whole frame for every sample).
math_rows = []
for file_name in files:
    file_path = os.path.join(path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            doc = json.loads(line)
            prompt = "Problem:" + "\n" + doc["doc"]["problem"] + "\n\n" + "Solution:"
            math_rows.append(
                {"prompt": prompt, "task": "math", "score_0": doc.get("exact_match")}
            )

# Columns absent from the new rows (score_1..score_6, labels) are filled with NaN.
# The cast keeps score_0 numeric even when a sample has no "exact_match" entry.
# Note: re-running this cell appends the same rows again.
if math_rows:
    df = pd.concat(
        [df, pd.DataFrame(math_rows).astype({"score_0": "float64"})],
        ignore_index=True,
    )

# Show the last rows of the frame
print(df.tail())

                                                 prompt  task  score_0  \
8813  Problem:\nThree unit circles are drawn so they...  math      0.0   
8814  Problem:\nLet $ x$ be a real number such that ...  math      0.0   
8815  Problem:\nIf $0 < \theta < \frac{\pi}{2}$ and ...  math      0.0   
8816  Problem:\nLet $\mathbf{a},$ $\mathbf{b},$ and ...  math      0.0   
8817  Problem:\nOne line is defined by\n\[\begin{pma...  math      0.0   

      score_1  score_2  score_3  score_4  score_5  score_6 labels  
8813      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
8814      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
8815      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
8816      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
8817      NaN      NaN      NaN      NaN      NaN      NaN    NaN  


### MMLU Pro data

In [7]:
# Collect every file in the first model's output folder whose name starts with
# "samples_leaderboard_mmlu_pro".
path = "/workspace/code/lm-evaluation-harness/outputs/Eurdem__Defne-llama3.1-8B"  # replace with your own path
# Sorted so that the row order does not depend on the order os.listdir returns.
files = sorted(f for f in os.listdir(path) if f.startswith("samples_leaderboard_mmlu_pro"))

# Read each JSON-lines file and gather one record per sample; the records are
# appended to `df` with a single concat afterwards (pd.concat inside the loop
# copies the whole frame for every sample).
mmlu_pro_rows = []
for file_name in files:
    file_path = os.path.join(path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            doc = json.loads(line)
            # One "<letter>. <option>" line per answer option, lettered A, B, C, ...
            option_lines = "".join(
                f"{string.ascii_uppercase[i]}. {option}\n"
                for i, option in enumerate(doc["doc"]["options"])
            )
            prompt = f"{doc['doc']['question']}\n" + option_lines + "Answer:"
            mmlu_pro_rows.append(
                {"prompt": prompt, "task": "mmlu_pro", "score_0": doc.get("acc")}
            )

# Columns absent from the new rows (score_1..score_6, labels) are filled with NaN.
# The cast keeps score_0 numeric even when a sample has no "acc" entry.
# Note: re-running this cell appends the same rows again.
if mmlu_pro_rows:
    df = pd.concat(
        [df, pd.DataFrame(mmlu_pro_rows).astype({"score_0": "float64"})],
        ignore_index=True,
    )

# Show the last rows of the frame
print(df.tail())

                                                  prompt      task  score_0  \
20845  Consider the nonuniform field E= ya_x + xa_y +...  mmlu_pro      0.0   
20846  A 15 hp internal-combustion engine runs at 140...  mmlu_pro      0.0   
20847  Estimate the overall heat transfer coefficient...  mmlu_pro      0.0   
20848  Water at 340°K and a rate of 4 Kg/hr is requir...  mmlu_pro      1.0   
20849  The frequency range of a commercially broadcas...  mmlu_pro      0.0   

       score_1  score_2  score_3  score_4  score_5  score_6 labels  
20845      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
20846      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
20847      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
20848      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
20849      NaN      NaN      NaN      NaN      NaN      NaN    NaN  


### MUSR data

In [8]:
# Collect every file in the first model's output folder whose name starts with
# "samples_leaderboard_musr".
path = "/workspace/code/lm-evaluation-harness/outputs/Eurdem__Defne-llama3.1-8B"  # replace with your own path
# Sorted so that the row order does not depend on the order os.listdir returns.
files = sorted(f for f in os.listdir(path) if f.startswith("samples_leaderboard_musr"))

# Prompt template for MuSR; the cell that fills in the other models' scores
# formats its MuSR prompts with this same name.
DOC_TO_TEXT = "{narrative}\n\n" "{question}\n\n" "{choices}\n" "Answer:"

# Read each JSON-lines file and gather one record per sample; the records are
# appended to `df` with a single concat afterwards (pd.concat inside the loop
# copies the whole frame for every sample).
musr_rows = []
for file_name in files:
    file_path = os.path.join(path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            doc = json.loads(line)
            # "choices" is stored as the text of a Python literal; parse it and
            # number the options starting from 1.
            choices = "".join(
                f"{number} - {choice}\n"
                for number, choice in enumerate(
                    ast.literal_eval(doc["doc"]["choices"]), start=1
                )
            )
            prompt = DOC_TO_TEXT.format(
                narrative=doc["doc"]["narrative"],
                question=doc["doc"]["question"],
                choices=choices,
            )
            musr_rows.append(
                {"prompt": prompt, "task": "musr", "score_0": doc.get("acc_norm")}
            )

# Columns absent from the new rows (score_1..score_6, labels) are filled with NaN.
# The cast keeps score_0 numeric even when a sample has no "acc_norm" entry.
# Note: re-running this cell appends the same rows again.
if musr_rows:
    df = pd.concat(
        [df, pd.DataFrame(musr_rows).astype({"score_0": "float64"})],
        ignore_index=True,
    )

# Show the last rows of the frame
print(df.tail())

                                                  prompt  task  score_0  \
21601  In the modest rehearsal space of the Riverside...  musr      0.0   
21602  In the bustling robotics firm where I served a...  musr      0.0   
21603  The hum of anticipation filled NASA's Space Fl...  musr      1.0   
21604  In the heart of Manhattan's urban jungle, amid...  musr      0.0   
21605  In the pulsating heart of Silicon Valley, a bu...  musr      1.0   

       score_1  score_2  score_3  score_4  score_5  score_6 labels  
21601      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
21602      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
21603      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
21604      NaN      NaN      NaN      NaN      NaN      NaN    NaN  
21605      NaN      NaN      NaN      NaN      NaN      NaN    NaN  


## Other models
Locutusque__Hercules-6.1-Llama-3.1-8B, Nekochu__Llama-3.1-8B-German-ORPO, VAGOsolutions__Llama-3.1-SauerkrautLM-8b-Instruct, ValiantLabs__Llama3.1-8B-ShiningValiant2, arcee-ai__Llama-Spark, meta-llama__Llama-3.1-8B-Instruct

In [9]:
path = "/workspace/code/lm-evaluation-harness/outputs"

# Output folders of the six remaining models
folders = [
    "Locutusque__Hercules-6.1-Llama-3.1-8B",
    "Nekochu__Llama-3.1-8B-German-ORPO",
    "VAGOsolutions__Llama-3.1-SauerkrautLM-8b-Instruct",
    "ValiantLabs__Llama3.1-8B-ShiningValiant2",
    "arcee-ai__Llama-Spark",
    "meta-llama__Llama-3.1-8B-Instruct",
]

# Score column that each folder fills in
folder_score_columns = {
    "Locutusque__Hercules-6.1-Llama-3.1-8B": "score_1",
    "Nekochu__Llama-3.1-8B-German-ORPO": "score_2",
    "VAGOsolutions__Llama-3.1-SauerkrautLM-8b-Instruct": "score_3",
    "ValiantLabs__Llama3.1-8B-ShiningValiant2": "score_4",
    "arcee-ai__Llama-Spark": "score_5",
    "meta-llama__Llama-3.1-8B-Instruct": "score_6",
}


# The prompt builders below must produce exactly the text stored in
# df["prompt"], because rows are matched on the prompt string.


def build_bbh_prompt(item):
    """Return the BBH prompt for one sample's "doc" record."""
    return f'Q: {item["input"]}\n\n  A:'


def build_gpqa_prompt(item):
    """Return the GPQA prompt for one sample's "doc" record."""
    return (
        f'What is the correct answer to this question:{item["Question"]}\n'
        f'Choices:\n'
        f'(A) {item["Correct Answer"]}\n'
        f'(B) {item["Incorrect Answer 1"]}\n'
        f'(C) {item["Incorrect Answer 2"]}\n'
        f'(D) {item["Incorrect Answer 3"]}\n'
        f'Answer: '
    )


def build_ifeval_prompt(item):
    """Return the IFEval prompt for one sample's "doc" record."""
    return item["prompt"]


def build_math_prompt(item):
    """Return the MATH prompt for one sample's "doc" record."""
    return "Problem:" + "\n" + item["problem"] + "\n\n" + "Solution:"


def build_mmlu_pro_prompt(item):
    """Return the MMLU-Pro prompt: question, lettered options, then "Answer:"."""
    option_lines = "".join(
        f"{string.ascii_uppercase[i]}. {option}\n"
        for i, option in enumerate(item["options"])
    )
    return f"{item['question']}\n" + option_lines + "Answer:"


def build_musr_prompt(item):
    """Return the MuSR prompt; relies on DOC_TO_TEXT from the MuSR loading cell."""
    # "choices" is stored as the text of a Python literal; number options from 1.
    choices = "".join(
        f"{number} - {choice}\n"
        for number, choice in enumerate(ast.literal_eval(item["choices"]), start=1)
    )
    return DOC_TO_TEXT.format(
        narrative=item["narrative"], question=item["question"], choices=choices
    )


def read_ifeval_score(doc):
    """Return 1.0 when "prompt_level_strict_acc" is truthy, otherwise 0.0."""
    return 1.0 if doc.get("prompt_level_strict_acc", False) else 0.0


# (file-name prefix, prompt builder, score reader), in the order the tasks are
# processed for every folder.
task_readers = [
    ("samples_leaderboard_bbh", build_bbh_prompt, lambda doc: doc.get("acc_norm")),
    ("samples_leaderboard_gpqa", build_gpqa_prompt, lambda doc: doc.get("acc_norm")),
    ("samples_leaderboard_ifeval", build_ifeval_prompt, read_ifeval_score),
    ("samples_leaderboard_math", build_math_prompt, lambda doc: doc.get("exact_match")),
    ("samples_leaderboard_mmlu_pro", build_mmlu_pro_prompt, lambda doc: doc.get("acc")),
    ("samples_leaderboard_musr", build_musr_prompt, lambda doc: doc.get("acc_norm")),
]

# Walk through every folder, read its sample files and fill in its score column.
for folder in folders:
    folder_path = os.path.join(path, folder)
    if not os.path.isdir(folder_path):
        # Make the skip visible: this model's score column stays NaN.
        print(f"Skipping {folder}: {folder_path} is not a directory")
        continue

    score_column = folder_score_columns[folder]
    # Sorted so that results do not depend on the order os.listdir returns.
    file_names = sorted(os.listdir(folder_path))

    # prompt -> score for this model. When a prompt occurs more than once the
    # last score read wins, which is what assigning row by row would give too.
    prompt_scores = {}
    for prefix, build_prompt, read_score in task_readers:
        for file_name in file_names:
            if not file_name.startswith(prefix):
                continue
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue  # tolerate blank lines in the JSON-lines file
                    doc = json.loads(line)
                    prompt_scores[build_prompt(doc["doc"])] = read_score(doc)

    if not prompt_scores:
        continue

    # One vectorised lookup per folder instead of comparing the whole prompt
    # column against every single sample line. Rows whose prompt was not seen
    # in this folder are left untouched.
    matched = df["prompt"].isin(list(prompt_scores))
    df.loc[matched, score_column] = (
        df.loc[matched, "prompt"].map(prompt_scores).astype("float64")
    )


## Generate labels

In [10]:
def best_model_indices(row):
    """Return the model indices (the i of score_i) that reach the row's maximum score."""
    is_best = (row == row.max()).to_numpy()
    return [int(column.split("_")[1]) for column in row.index[is_best]]


# labels = list of every model that ties for the highest score on that prompt
score_columns = [
    "score_0", "score_1", "score_2", "score_3", "score_4", "score_5", "score_6"
]
df["labels"] = df[score_columns].apply(best_model_indices, axis=1)

In [11]:
# Show the last five rows, which now include the generated `labels` column
df.tail()

Unnamed: 0,prompt,task,score_0,score_1,score_2,score_3,score_4,score_5,score_6,labels
21601,In the modest rehearsal space of the Riverside...,musr,0.0,0.0,1.0,0.0,0.0,0.0,0.0,[2]
21602,In the bustling robotics firm where I served a...,musr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0, 1, 2, 3, 4, 5, 6]"
21603,The hum of anticipation filled NASA's Space Fl...,musr,1.0,1.0,1.0,1.0,1.0,0.0,1.0,"[0, 1, 2, 3, 4, 6]"
21604,"In the heart of Manhattan's urban jungle, amid...",musr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0, 1, 2, 3, 4, 5, 6]"
21605,"In the pulsating heart of Silicon Valley, a bu...",musr,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[0, 1, 2, 3, 4, 5, 6]"


In [12]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
shuffle_seed = 85
df = (
    df.sample(frac=1, random_state=shuffle_seed)
    .reset_index(drop=True)
)

print(df.head())

                                              prompt      task  score_0  \
0  Q: Which of the following is a humorous edit o...       bbh      1.0   
1   Kant's Universal Law formulation does not ref...  mmlu_pro      1.0   
2  Let G_n(s) be the probability generating funct...  mmlu_pro      0.0   
3  What is the correct answer to this question:A ...      gpqa      0.0   
4  Q: This SVG path element <path d="M 25.00,38.0...       bbh      1.0   

   score_1  score_2  score_3  score_4  score_5  score_6                 labels  
0      1.0      1.0      1.0      0.0      1.0      1.0     [0, 1, 2, 3, 5, 6]  
1      1.0      1.0      1.0      1.0      1.0      1.0  [0, 1, 2, 3, 4, 5, 6]  
2      0.0      0.0      0.0      1.0      0.0      0.0                    [4]  
3      1.0      0.0      0.0      0.0      0.0      1.0                 [1, 6]  
4      1.0      1.0      1.0      1.0      1.0      1.0  [0, 1, 2, 3, 4, 5, 6]  


In [13]:
# First 80% of the (already shuffled) rows for training, the remainder for testing.
split_at = int(0.8 * len(df))
train_df = df.iloc[:split_at]
test_df = df.iloc[split_at:]

In [14]:
# Write both splits to CSV without the index column.
for split_frame, csv_target in (
    (train_df, "/workspace/code/adapter_router/train.csv"),
    (test_df, "/workspace/code/adapter_router/test.csv"),
):
    split_frame.to_csv(csv_target, index=False)

## Calculate per-task accuracy of each model

In [15]:
# Task names as stored in the "task" column.
tasks = [
    "bbh",
    "gpqa",
    "ifeval",
    "math",
    "mmlu_pro",
    "musr",
]

# Position i in this list corresponds to column score_i.
models = [
    "Eurdem__Defne-llama3.1-8B",  # score_0
    "Locutusque__Hercules-6.1-Llama-3.1-8B",  # score_1
    "Nekochu__Llama-3.1-8B-German-ORPO",  # score_2
    "VAGOsolutions__Llama-3.1-SauerkrautLM-8b-Instruct",  # score_3
    "ValiantLabs__Llama3.1-8B-ShiningValiant2",  # score_4
    "arcee-ai__Llama-Spark",  # score_5
    "meta-llama__Llama-3.1-8B-Instruct",  # score_6
]

In [16]:
# model name -> list of mean scores on the training split, one entry per task
# and in the order of `tasks`. The model at position idx is read from score_<idx>.
accuracy_map = {
    model: [
        train_df.loc[train_df["task"] == task, f"score_{idx}"].mean()
        for task in tasks
    ]
    for idx, model in enumerate(models)
}

In [17]:
# Display the per-model accuracy lists (entries follow the order of `tasks`)
accuracy_map

{'Eurdem__Defne-llama3.1-8B': [0.5310404499242916,
  0.32421052631578945,
  0.4091954022988506,
  0.1456400742115028,
  0.3882045929018789,
  0.44660194174757284],
 'Locutusque__Hercules-6.1-Llama-3.1-8B': [0.5109236426562838,
  0.3263157894736842,
  0.47126436781609193,
  0.137291280148423,
  0.3764091858037578,
  0.41262135922330095],
 'Nekochu__Llama-3.1-8B-German-ORPO': [0.5009733939000649,
  0.33157894736842103,
  0.41379310344827586,
  0.0,
  0.33799582463465555,
  0.47572815533980584],
 'VAGOsolutions__Llama-3.1-SauerkrautLM-8b-Instruct': [0.518494484101233,
  0.33157894736842103,
  0.45517241379310347,
  0.15584415584415584,
  0.3973903966597077,
  0.4627831715210356],
 'ValiantLabs__Llama3.1-8B-ShiningValiant2': [0.44083928185161153,
  0.32105263157894737,
  0.18160919540229886,
  0.08534322820037106,
  0.3174321503131524,
  0.4255663430420712],
 'arcee-ai__Llama-Spark': [0.5230369889682025,
  0.3178947368421053,
  0.43448275862068964,
  0.13172541743970315,
  0.39018789144050

In [18]:
# Count how many rows of the test split belong to each task
test_df["task"].value_counts()

task
mmlu_pro    2452
bbh         1138
math         246
gpqa         242
musr         138
ifeval       106
Name: count, dtype: int64