In [1]:
from judge_utils import load_all_results, main_path, results_path
import json
import os
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load results into a single dataframe.
# NOTE(review): load_all_results and results_path come from judge_utils; the
# returned schema is not visible here — later cells assume columns such as
# run_time, structure_id, use_img, use_json and shot. Confirm against the helper.
df_main = load_all_results(results_path)


In [3]:
# Order the runs by their run_time value and renumber the index from 0
df_main = (
    df_main
    .sort_values(by="run_time")
    .reset_index(drop=True)
)

In [4]:
# Build a composite lookup key per row: (structure_id, use_img, use_json, shot).
# Zipping the columns yields the same tuples as a row-wise apply(axis=1) without
# materialising a Series object for every row.
df_main["merge_key"] = list(
    zip(
        df_main["structure_id"],
        df_main["use_img"],
        df_main["use_json"],
        df_main["shot"],
    )
)

# One judge-analysis JSON file per rating label, all under <main_path>/analysis.
judge_labels = ("BASE", "CLAR_Q", "COMM_SH_REF", "IMPL_REF")
judge_files = {
    label: os.path.join(main_path, "analysis", f"judge_analysis_{label}.json")
    for label in judge_labels
}

for label, filepath in judge_files.items():
    # Step 1: read the file. A missing file is an expected situation (not every
    # judge variant has been run), so it is reported and skipped.
    try:
        with open(filepath, "r") as f:
            judge_data = json.load(f)
    except FileNotFoundError:
        print(f"{filepath} missing")
        continue
    except (OSError, ValueError) as exc:
        # Unreadable file or malformed JSON (json.JSONDecodeError is a ValueError):
        # report the real cause instead of claiming the file is missing.
        print(f"{filepath} could not be read: {exc!r}")
        continue

    # Step 2: index the ratings by the same composite key used in df_main["merge_key"].
    try:
        rating_dict = {
            (
                entry["structure_id"],
                entry["use_img"],
                entry["use_json"],
                entry["shot"],
            ): entry["rating"]
            for entry in judge_data
        }
    except (KeyError, TypeError) as exc:
        print(f"{filepath} has an unexpected entry format: {exc!r}")
        continue

    # Step 3: add the rating column; rows without a matching key get NaN.
    # Errors here (e.g. a missing "merge_key" column) are programming/state errors
    # and are deliberately not swallowed.
    df_main[label] = df_main["merge_key"].map(rating_dict)


/home/giovanni.duca-1/MLLMs-construction-company/analysis/judge_analysis_CLAR_Q.json missing
/home/giovanni.duca-1/MLLMs-construction-company/analysis/judge_analysis_COMM_SH_REF.json missing
/home/giovanni.duca-1/MLLMs-construction-company/analysis/judge_analysis_IMPL_REF.json missing


In [5]:
# Read the metrics stored in "parsed_actions_with_metrics.json"
metrics_path = os.path.join(main_path, "analysis", "parsed_actions_with_metrics.json")
with open(metrics_path, "r") as metrics_file:
    metrics_data = json.load(metrics_file)
    

In [6]:
# Metric fields copied from each metrics entry onto df_main
metric_names = ("accuracy", "precision", "iou", "action_format")

# Index the metrics by the composite key (structure_id, use_img, use_json, shot)
metrics_dict = {
    (
        record["structure_id"],
        record["use_img"],
        record["use_json"],
        record["shot"],
    ): {name: record[name] for name in metric_names}
    for record in metrics_data
}

# Add one column per metric, looked up through the composite key column;
# rows whose key has no metrics entry receive None.
for metric in metric_names:
    df_main[metric] = df_main["merge_key"].map(
        lambda key, metric=metric: metrics_dict[key][metric] if key in metrics_dict else None
    )

# The composite key column was only needed for the lookups above; remove it in place.
del df_main["merge_key"]

In [7]:
# Drop the run-configuration / bookkeeping columns before displaying the frame.
# Each label is listed exactly once (the original list repeated "json_file").
df_main = df_main.drop(
    columns=[
        "json_file",
        "Model",
        "Quantization",
        "Device",
        "Number of models",
        "Max new tokens",
        "Repetition Penalty",
        "Max rounds",
    ]
)
df_main


Unnamed: 0,run_time,structure_id,use_img,use_json,num_rounds,total_time_min,finished_by_architect,shot,BASE,accuracy,precision,iou,action_format
0,2025-02-11-1829-56,C1_bell,True,False,5,1.0,True,one-shot,3,0.00,0.00,0.00,correct
1,2025-02-11-1830-50,C1_bell,True,False,15,90.0,False,zero-shot,1,0.00,0.00,0.00,correct
2,2025-02-11-2008-08,C1_bell,True,True,50,6.0,False,one-shot,1,0.00,0.00,0.00,incorrect
3,2025-02-11-2014-53,C1_bell,True,True,11,1.0,True,zero-shot,1,0.76,0.91,0.71,correct
4,2025-02-11-2016-48,C1_bell,False,True,50,13.0,False,one-shot,1,0.00,0.00,0.00,incorrect
...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2025-02-13-0135-13,C21_spectacles,True,False,6,1.0,True,zero-shot,Undefined,0.00,0.00,0.00,correct
116,2025-02-13-0136-18,C21_spectacles,True,True,50,3.0,False,one-shot,Undefined,0.00,0.00,0.00,correct
117,2025-02-13-0139-36,C21_spectacles,True,True,50,10.0,False,zero-shot,Undefined,0.07,1.00,0.07,correct
118,2025-02-13-0149-55,C21_spectacles,False,True,50,9.0,False,one-shot,1,0.00,0.00,0.00,incorrect


In [8]:
# Rows whose BASE judge rating equals 3
df_main.loc[df_main["BASE"].eq(3)]

Unnamed: 0,run_time,structure_id,use_img,use_json,num_rounds,total_time_min,finished_by_architect,shot,BASE,accuracy,precision,iou,action_format
0,2025-02-11-1829-56,C1_bell,True,False,5,1.0,True,one-shot,3,0.0,0.0,0.0,correct
9,2025-02-11-2041-13,C2_black-hole,True,True,8,1.0,True,zero-shot,3,0.0,0.0,0.0,correct
45,2025-02-12-0818-44,C9_asterisk,True,True,3,1.0,True,zero-shot,3,0.27,0.5,0.21,correct
85,2025-02-12-1702-41,C16_bloody-sword,True,False,5,1.0,True,zero-shot,3,0.0,0.0,0.0,correct


In [9]:
# Rows with a BASE rating of 1 that still have a non-zero accuracy
df_main.loc[df_main["BASE"].eq(1) & df_main["accuracy"].gt(0)]

Unnamed: 0,run_time,structure_id,use_img,use_json,num_rounds,total_time_min,finished_by_architect,shot,BASE,accuracy,precision,iou,action_format
3,2025-02-11-2014-53,C1_bell,True,True,11,1.0,True,zero-shot,1,0.76,0.91,0.71,correct
5,2025-02-11-2030-45,C1_bell,False,True,50,7.0,False,zero-shot,1,0.05,1.0,0.05,correct
15,2025-02-11-2253-23,C4_flower_new,True,True,15,2.0,True,zero-shot,1,0.65,1.0,0.65,correct
41,2025-02-12-0620-06,C8_table2,False,True,5,1.0,True,zero-shot,1,0.83,0.62,0.56,correct
47,2025-02-12-0831-24,C9_asterisk,False,True,50,7.0,False,zero-shot,1,0.06,1.0,0.06,correct
53,2025-02-12-0903-58,C10_concentric_semicircles,False,True,50,7.0,False,zero-shot,1,0.1,1.0,0.1,correct
63,2025-02-12-1122-25,C12_diagonal-Ls,True,True,26,4.0,True,zero-shot,1,0.83,0.26,0.25,approximated
65,2025-02-12-1136-08,C12_diagonal-Ls,False,True,50,8.0,False,zero-shot,1,0.17,1.0,0.17,correct
71,2025-02-12-1351-22,C13_eye,False,True,50,10.0,False,zero-shot,1,0.1,1.0,0.1,correct
77,2025-02-12-1431-19,C14_diagonal-zigzag,False,True,6,1.0,True,zero-shot,1,0.62,1.0,0.62,correct


In [10]:
# Rows with accuracy greater than zero
df_main.loc[df_main["accuracy"].gt(0)]

Unnamed: 0,run_time,structure_id,use_img,use_json,num_rounds,total_time_min,finished_by_architect,shot,BASE,accuracy,precision,iou,action_format
3,2025-02-11-2014-53,C1_bell,True,True,11,1.0,True,zero-shot,1,0.76,0.91,0.71,correct
5,2025-02-11-2030-45,C1_bell,False,True,50,7.0,False,zero-shot,1,0.05,1.0,0.05,correct
11,2025-02-11-2053-17,C2_black-hole,False,True,33,101.0,False,zero-shot,Undefined,0.55,0.69,0.44,correct
15,2025-02-11-2253-23,C4_flower_new,True,True,15,2.0,True,zero-shot,1,0.65,1.0,0.65,correct
17,2025-02-11-2309-30,C4_flower_new,False,True,5,1.0,True,zero-shot,2,0.35,0.67,0.3,correct
27,2025-02-12-0127-34,C6_rectangle-chain,True,True,50,5.0,False,zero-shot,2,0.15,1.0,0.15,approximated
29,2025-02-12-0143-08,C6_rectangle-chain,False,True,50,11.0,False,zero-shot,Undefined,0.1,1.0,0.1,correct
41,2025-02-12-0620-06,C8_table2,False,True,5,1.0,True,zero-shot,1,0.83,0.62,0.56,correct
45,2025-02-12-0818-44,C9_asterisk,True,True,3,1.0,True,zero-shot,3,0.27,0.5,0.21,correct
47,2025-02-12-0831-24,C9_asterisk,False,True,50,7.0,False,zero-shot,1,0.06,1.0,0.06,correct


In [11]:
# Summary statistics of num_rounds for conversations ended by the architect
df_main.loc[df_main["finished_by_architect"].eq(True), "num_rounds"].describe()


count    38.000000
mean      6.289474
std       4.171382
min       3.000000
25%       4.000000
50%       5.000000
75%       6.000000
max      26.000000
Name: num_rounds, dtype: float64

In [12]:
# Rows where both accuracy and precision are greater than zero
df_main.loc[df_main["accuracy"].gt(0) & df_main["precision"].gt(0)]

Unnamed: 0,run_time,structure_id,use_img,use_json,num_rounds,total_time_min,finished_by_architect,shot,BASE,accuracy,precision,iou,action_format
3,2025-02-11-2014-53,C1_bell,True,True,11,1.0,True,zero-shot,1,0.76,0.91,0.71,correct
5,2025-02-11-2030-45,C1_bell,False,True,50,7.0,False,zero-shot,1,0.05,1.0,0.05,correct
11,2025-02-11-2053-17,C2_black-hole,False,True,33,101.0,False,zero-shot,Undefined,0.55,0.69,0.44,correct
15,2025-02-11-2253-23,C4_flower_new,True,True,15,2.0,True,zero-shot,1,0.65,1.0,0.65,correct
17,2025-02-11-2309-30,C4_flower_new,False,True,5,1.0,True,zero-shot,2,0.35,0.67,0.3,correct
27,2025-02-12-0127-34,C6_rectangle-chain,True,True,50,5.0,False,zero-shot,2,0.15,1.0,0.15,approximated
29,2025-02-12-0143-08,C6_rectangle-chain,False,True,50,11.0,False,zero-shot,Undefined,0.1,1.0,0.1,correct
41,2025-02-12-0620-06,C8_table2,False,True,5,1.0,True,zero-shot,1,0.83,0.62,0.56,correct
45,2025-02-12-0818-44,C9_asterisk,True,True,3,1.0,True,zero-shot,3,0.27,0.5,0.21,correct
47,2025-02-12-0831-24,C9_asterisk,False,True,50,7.0,False,zero-shot,1,0.06,1.0,0.06,correct


In [13]:
# Accuracy value counts for the one-shot runs.
# Original observation: of the 6 good accuracy ratings, only 1 came from a one-shot run.
# NOTE(review): confirm this figure against the counts this cell prints.
df_main.loc[df_main["shot"].eq("one-shot"), "accuracy"].value_counts()

accuracy
0.00    57
0.94     1
0.83     1
Name: count, dtype: int64

In [14]:
# Coerce the rating and metric columns to numeric dtype; any value that cannot
# be parsed as a number is turned into NaN by errors="coerce".
for column_name in ("BASE", "accuracy", "precision", "iou"):
    df_main[column_name] = pd.to_numeric(df_main[column_name], errors="coerce")

# Mean rating / accuracy / precision for every (shot, use_img, use_json) combination
group_columns = ["shot", "use_img", "use_json"]
value_columns = ["BASE", "accuracy", "precision"]
grouped_means = df_main.groupby(group_columns)[value_columns].mean()
table = grouped_means.reset_index()

print(table)

        shot use_img use_json      BASE  accuracy  precision
0   one-shot   False     True  1.000000  0.000000   0.000000
1   one-shot    True    False  1.769231  0.000000   0.000000
2   one-shot    True     True  1.153846  0.093158   0.093158
3  zero-shot   False     True  1.312500  0.312000   0.645000
4  zero-shot    True    False  1.500000  0.000000   0.000000
5  zero-shot    True     True  1.666667  0.257368   0.418947
