In [1]:
import jsonlines
import os
from tqdm import tqdm 
import pandas as pd

def parse_metric_results(filename, name, output_dir="/home/t-sijoshi/skill-set-mazda"):
    """Aggregate per-question MMMU metric results into an accuracy summary.

    Reads ``filename`` as JSON Lines. Each record must provide a ``"category"``
    key (used as the sub-task name) and a ``"MMMUMetric_result"`` key; a record
    counts as correct only when that value equals the string ``"correct"``.

    Side effect: writes ``<output_dir>/<name>.csv`` with one row per record,
    holding the id ``mmmu__<record index>`` and a boolean correctness flag.

    Args:
        filename: Path to the ``metric_results.jsonl`` file to parse.
        name: Label stored in the summary and used as the CSV file stem.
        output_dir: Directory the per-question CSV is written to. Defaults to
            the location that was previously hard-coded in this function.

    Returns:
        dict with keys ``task``, ``name``, ``metric_results_path``,
        ``accuracy``, ``num_correct``, ``num_total`` and ``sub_tasks``.
        ``sub_tasks`` is a list with one dict per category, in first-seen
        order, holding ``name``, ``frac_of_data``, ``num_correct``,
        ``num_total`` and ``accuracy``. For an empty input file the counts are
        0, ``accuracy`` is 0.0 and ``sub_tasks`` is empty.

    Raises:
        KeyError: if a record lacks ``"category"`` or ``"MMMUMetric_result"``.
    """
    num_correct = {}
    num_total = {}
    # Start from zero so an empty file reports honest counts rather than the
    # former placeholder values (num_total=1, num_correct=-1, accuracy=-1.0).
    total = 0
    total_correct = 0
    correct_per_q = []

    with jsonlines.open(filename, "r") as reader:
        pbar = tqdm(enumerate(reader), desc=f"Parsing {filename}")
        for i, item in pbar:
            subtask = item["category"]
            is_correct = item["MMMUMetric_result"] == "correct"

            num_total[subtask] = num_total.get(subtask, 0) + 1
            num_correct[subtask] = num_correct.get(subtask, 0) + int(is_correct)

            # Running totals replace re-summing both dicts on every record.
            total += 1
            total_correct += int(is_correct)

            correct_per_q.append([f"mmmu__{i}", is_correct])
            pbar.set_postfix_str(f"overall_acc: {total_correct / total}")

    # The input file is closed at this point; only the aggregates are needed.
    pd.DataFrame(correct_per_q).to_csv(os.path.join(output_dir, f"{name}.csv"))

    summary_entry = {
        "task": "mmmu",
        "name": name,
        "metric_results_path": filename,
        # Guard the division so an empty file does not raise ZeroDivisionError.
        "accuracy": total_correct / total if total else 0.0,
        "num_correct": total_correct,
        "num_total": total,
        "sub_tasks": [],
    }

    # Every category present in num_total has at least one record, so the
    # per-category divisions below cannot divide by zero.
    for sub_task, sub_total in num_total.items():
        summary_entry["sub_tasks"].append(
            {
                "name": sub_task,
                "frac_of_data": sub_total / total,
                "num_correct": num_correct[sub_task],
                "num_total": sub_total,
                "accuracy": num_correct[sub_task] / sub_total,
            }
        )
    return summary_entry

In [2]:
# Root directory of the evaluation run whose reports are summarised below.
base_path = "/home/t-sijoshi/LFM-Eval-Understand/logs/MMMU_PIPELINE/08-21-science-improve"
# Sub-directories of base_path to summarise, one per method.
baselines = ["v1_format"]

for method in baselines:
    metric_results_path = os.path.join(
        base_path, method, "eval_report", "metric_results.jsonl"
    )
    summary_entry = parse_metric_results(metric_results_path, method)
    print(summary_entry)

    # Opened in append mode so entries already in the summary file are kept.
    with jsonlines.open(
        "/home/t-sijoshi/multimodal-data-gen/results/summary.jsonl", mode='a'
    ) as writer:
        writer.write(summary_entry)


Parsing /home/t-sijoshi/LFM-Eval-Understand/logs/MMMU_PIPELINE/08-21-science-improve/v1_format/eval_report/metric_results.jsonl: 900it [00:00, 2204.33it/s, overall_acc: 0.25]               


{'task': 'mmmu', 'name': 'v1_format', 'metric_results_path': '/home/t-sijoshi/LFM-Eval-Understand/logs/MMMU_PIPELINE/08-21-science-improve/v1_format/eval_report/metric_results.jsonl', 'accuracy': 0.25, 'num_correct': 225, 'num_total': 900, 'sub_tasks': [{'name': 'Art and Design', 'frac_of_data': 0.13333333333333333, 'num_correct': 35, 'num_total': 120, 'accuracy': 0.2916666666666667}, {'name': 'Business', 'frac_of_data': 0.16666666666666666, 'num_correct': 44, 'num_total': 150, 'accuracy': 0.29333333333333333}, {'name': 'Science', 'frac_of_data': 0.16666666666666666, 'num_correct': 34, 'num_total': 150, 'accuracy': 0.22666666666666666}, {'name': 'Health and Medicine', 'frac_of_data': 0.16666666666666666, 'num_correct': 37, 'num_total': 150, 'accuracy': 0.24666666666666667}, {'name': 'Humanities and Social Science', 'frac_of_data': 0.13333333333333333, 'num_correct': 23, 'num_total': 120, 'accuracy': 0.19166666666666668}, {'name': 'Tech and Engineering', 'frac_of_data': 0.23333333333333