In [9]:
import pandas as pd
import glob
from functools import reduce

def get_summary_csv(directory, root="/mnt/md0data/hiennm/opencompass-0.3.9/"):
    """Return the path of the summary CSV belonging to one run directory.

    The file is searched for with the glob pattern
    ``<root>/outputs/<directory>/summary/summary_*.csv``.

    Args:
        directory: Name of the run directory below ``<root>/outputs``.
        root: Directory that contains ``outputs/``. Defaults to the
            location this notebook was originally run against; a trailing
            slash is optional.

    Returns:
        The path of the matching CSV file. When several files match, the
        lexicographically greatest path is returned so the result does not
        depend on file-system listing order.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    pattern = (
        f"{root.rstrip('/')}/"
        + "outputs/"
        # + "default/"
        + f"{directory}/"
        + "summary/summary_*.csv"
    )
    matches = glob.glob(pattern)
    if not matches:
        # Indexing an empty list here would only say "list index out of
        # range"; report the pattern that failed instead.
        raise FileNotFoundError(f"No summary CSV matches {pattern!r}")
    # glob returns matches in arbitrary order, so pick deterministically.
    # NOTE(review): presumably the suffix is a timestamp, which would make
    # this the most recent summary - confirm against the output layout.
    return max(matches)

def read_many_reports(run_dirs):
    """Load the summary CSV of each run and join them into a single table.

    Args:
        run_dirs: Iterable of run directory names; each one is resolved to
            a CSV path with ``get_summary_csv``.

    Returns:
        The DataFrames merged left to right with ``pd.merge`` on the
        columns "dataset", "metric", "version" and "mode". ``pd.merge``
        defaults to an inner join, so only rows whose key appears in every
        run are kept.
    """
    def _join(accumulated, following):
        # Fold step: attach the next run's columns to what was merged so far.
        return pd.merge(accumulated, following, on=[
            "dataset", "metric", "version", "mode"
        ])

    frames = [pd.read_csv(get_summary_csv(run_name)) for run_name in run_dirs]
    return reduce(_join, frames)

In [65]:
def _overall_report(df, groups):
    """Average every numeric column of ``df`` over groups of datasets.

    Args:
        df: Table with a 'dataset' column of names plus numeric score
            columns.
        groups: Mapping from output column label to the substring that is
            searched for (case-insensitively) in 'dataset'.

    Returns:
        DataFrame indexed by the numeric column names of ``df`` with one
        column per label, in the order given by ``groups``.
    """
    dataset_names = df['dataset']
    group_means = [
        # na=False: a row without a dataset name belongs to no group;
        # otherwise the mask contains NaN and boolean indexing raises.
        df[dataset_names.str.contains(pattern, case=False, na=False)]
        .mean(numeric_only=True)
        for pattern in groups.values()
    ]
    return pd.concat(group_means, axis=1, keys=list(groups))


def get_overall_math_report(df):
    """Summarise a merged results table for the math-oriented runs.

    Args:
        df: Table with a 'dataset' column plus numeric score columns.

    Returns:
        DataFrame with columns 'mmlu', 'ifeval', 'arc' and 'gsm8k', each
        holding the mean score over the datasets whose name contains the
        corresponding substring.
    """
    return _overall_report(df, {
        'mmlu': 'mmlu',
        # NOTE(review): 'if' is a very loose substring; it matches any
        # dataset name containing "if", not only IFEval - confirm.
        'ifeval': 'if',
        'arc': 'arc',
        'gsm8k': 'gsm8k',
    })


def get_overall_code_report(df):
    """Summarise a merged results table for the code-oriented runs.

    Args:
        df: Table with a 'dataset' column plus numeric score columns.

    Returns:
        DataFrame with columns 'mmlu', 'humaneval', 'ifeval' and 'arc',
        each holding the mean score over the datasets whose name contains
        the corresponding substring.
    """
    return _overall_report(df, {
        'mmlu': 'mmlu',
        'humaneval': 'humaneval',
        # NOTE(review): 'if' is a very loose substring; it matches any
        # dataset name containing "if", not only IFEval - confirm.
        'ifeval': 'if',
        'arc': 'arc',
    })

In [58]:
# Load the merged summary table of each run directory (math/code x 3b/8b).
math_3b = read_many_reports(run_dirs=["merge-math-3b"])
code_3b = read_many_reports(run_dirs=["merge-code-3b"])
math_8b = read_many_reports(run_dirs=["merge-math-8b"])
code_8b = read_many_reports(run_dirs=["merge-code-8b"])

In [66]:
# Collapse each merged table into per-benchmark-group mean scores.
math_8b_report = get_overall_math_report(df=math_8b)
math_3b_report = get_overall_math_report(df=math_3b)
code_8b_report = get_overall_code_report(df=code_8b)
code_3b_report = get_overall_code_report(df=code_3b)

In [71]:
# 8b code report, ordered by the 'humaneval' column, highest first.
code_8b_report.sort_values("humaneval", ascending=False)

Unnamed: 0,mmlu,humaneval,ifeval,arc
code-ties-0.3-8b,61.907895,54.88,52.2575,73.975
code-ties-0.4-8b,62.039825,54.27,52.07,72.765
code-ties-0.2-8b,61.783333,53.66,52.69,74.17
code-ties-0.5-8b,62.012632,53.66,54.485,70.17
code-ties-0.1-8b,61.832632,53.05,50.45,74.08
IC-8b-u55,61.358421,53.05,51.6875,72.43
IC-8b-u37,61.54193,51.22,52.35,72.665
code-8b,61.774386,51.22,52.1425,74.52
IC-8b-u28,61.290175,50.61,50.6875,69.465
IC-8b-u46,61.064211,50.0,52.025,72.505


In [72]:
# 3b code report, ordered by the 'humaneval' column, highest first.
code_3b_report.sort_values("humaneval", ascending=False)

Unnamed: 0,mmlu,humaneval,ifeval,arc
code-3b,53.429474,40.24,35.325,59.3
code-ties-0.2-3b,53.507193,37.2,35.5125,60.37
IC-3b-u37,53.948246,36.59,35.05,57.545
code-ties-0.1-3b,53.159123,35.98,35.5575,59.615
IC-3b-u19,53.930702,35.98,36.0825,59.15
code-ties-0.3-3b,53.606842,35.37,37.205,60.625
IC-3b-u46,53.865439,35.37,35.285,58.965
code-ties-0.5-3b,53.552982,34.76,36.09,60.89
IC-3b-u28,53.89193,33.54,34.6825,58.525
code-ties-0.4-3b,53.617895,32.93,36.875,60.63


In [75]:
# 8b math report, ordered by the 'gsm8k' column, highest first.
math_8b_report.sort_values("gsm8k", ascending=False)

Unnamed: 0,mmlu,ifeval,arc,gsm8k
math-ties-0.1-8b,59.314035,35.36,72.87,73.31
math-8b,58.16193,33.885,72.245,72.18
IC-8b-u28,59.651404,46.5375,74.2,70.74
IC-8b-u19,59.907544,47.105,71.675,70.43
IC-8b-u46,59.937719,48.61,74.315,69.75
IC-8b-u37,59.968596,48.0825,73.585,68.69
instruct-8b,60.514386,53.4125,74.995,63.46
math-ties-0.5-8b,62.012632,54.3925,71.745,62.55
math-ties-0.4-8b,62.039825,52.8625,73.365,62.24
math-ties-0.3-8b,61.907895,53.7675,72.43,61.49


In [76]:
# 3b math report, ordered by the 'gsm8k' column, highest first.
math_3b_report.sort_values("gsm8k", ascending=False)

Unnamed: 0,mmlu,ifeval,arc,gsm8k
IC-3b-u19,52.392456,33.195,59.595,55.8
math-3b,51.904211,28.355,63.12,55.34
math-ties-0.1-3b,52.154035,28.685,62.015,55.12
math-ties-0.2-3b,52.675439,29.675,61.66,54.74
IC-3b-u28,52.647368,32.41,60.94,54.28
IC-3b-u37,52.648772,32.865,61.47,53.3
IC-3b-u46,53.403509,33.1625,59.745,53.3
math-ties-0.3-3b,52.904211,30.6625,61.85,52.84
math-ties-0.4-3b,53.147719,32.3875,61.895,52.31
IC-3b-u55,52.84807,32.9775,59.175,51.1
