In [9]:
import pandas as pd
import json
from pathlib import Path
from benchmark.utils.metadata import read_preproc, read_python, read_lean

In [28]:
def collate_metadata(base_path: str) -> pd.DataFrame:
    """
    Collates metadata.json files across indices into a single pandas DataFrame.
    
    Parameters:
    base_path (str): Base path containing indexed directories with metadata.json files
    
    Returns:
    pandas.DataFrame: Table with metadata information across all indices
    """
    # Initialize lists to store data
    records = []
    
    # Iterate through all metadata.json files
    base_dir = Path(base_path)
    for idx_dir in sorted(base_dir.glob("*")):
        if not idx_dir.is_dir():
            continue

        preproc = read_preproc(idx_dir)
        python = read_preproc(idx_dir)
        lean = read_lean(idx_dir)

        # Create a flat record
        record = {
            "idx": idx_dir.name, 
            "preproc_loops": preproc["loops"],
            "python_loops": python["loops"], 
            "lean_loops": lean["loops"],
            "preproc_success": preproc["latest_run_success"],
            "python_success": python["latest_run_success"],
            "lean_success": lean["latest_run_success"]
        }
        records.append(record)
    
    # Create DataFrame
    return pd.DataFrame(records).set_index("idx")

In [29]:
up = Path("..")
path = up / up / up / "artefacts" / "apps" / "train"

In [30]:
df = collate_metadata(path)

In [31]:
df

Unnamed: 0_level_0,preproc_loops,python_loops,lean_loops,preproc_success,python_success,lean_success
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,1,True,True,True
1,0,0,0,True,True,True
2,0,0,5,True,True,True
3,0,0,3,True,True,True
4,0,0,1,True,True,True
5,0,0,5,True,True,True
6,0,0,3,True,True,True
7,0,0,1,True,True,True
8,0,0,1,True,True,True
9,0,0,8,True,True,True


In [32]:
df.mean()

preproc_loops      0.294118
python_loops       0.294118
lean_loops         1.784314
preproc_success    0.980392
python_success     0.980392
lean_success       0.941176
dtype: float64

In [33]:
df.sum()

preproc_loops      15
python_loops       15
lean_loops         91
preproc_success    50
python_success     50
lean_success       48
dtype: int64