In [1]:
from json import load as json_load, dumps
from glob import glob
from pandas import DataFrame, merge, crosstab, concat
from numpy import unique
import scipy.stats as stats
import re

In [2]:
dict_trajs_pattern = {
    "gpt-4_orig_prompt_orig_topology": "trajs_gpt-4_orig_prompt_orig_topology_*/*.json", 
    "gpt-4_impr_prompt_orig_topology": "trajs_gpt-4_impr_prompt_orig_topology_*/*.json", 
    "gpt-4_impr_prompt_impr_topology": "trajs_gpt-4_impr_prompt_impr_topology_*/*.json", 
    "gpt-4o_orig_prompt_orig_topology": "trajs_gpt-4o_orig_prompt_orig_topology_*/*.json", 
    "gpt-4o_impr_prompt_orig_topology": "trajs_gpt-4o_impr_prompt_orig_topology_*/*.json",  
    "gpt-4o_impr_prompt_impr_topology": "trajs_gpt-4o_impr_prompt_impr_topology_*/*.json",  
}

dict_trajs_path = {k: glob(dict_trajs_pattern[k]) for k in dict_trajs_pattern.keys()}

dict_df = {}
dict_df_grouped = {}

for k in dict_trajs_path:
    list_trajs = []
    for p in dict_trajs_path[k]:
        match = re.search(r'topology_(\d+)', p)
        number = match.group(1)
        with open(p, "r") as fp:
            content = json_load(fp)
            list_trajs.append(
                {
                    "instance_id": content["instance_id"], 
                    "problem_statement": content["problem_statement"][0], 
                    "correct": content["other_data"]["correct"], 
                    "seed": number
                }
            )
    dict_df[k] = DataFrame(list_trajs)
    dict_df_grouped[k] = dict_df[k].groupby(["seed"]).agg({"correct": ["sum", "count"]}).reset_index(drop=False)
    dict_df_grouped[k].columns = ["seed", "number_corrects", "n_completed_trajs"]
    dict_df_grouped[k]["configuration"] = k
    dict_df_grouped[k] = dict_df_grouped[k].loc[:, ["configuration", "seed", "n_completed_trajs", "number_corrects"]]
    dict_df_grouped[k]["number_corrects_perc"] = dict_df_grouped[k]["number_corrects"] / 200 * 100
    display(dict_df_grouped[k])

Unnamed: 0,configuration,seed,n_completed_trajs,number_corrects,number_corrects_perc
0,gpt-4_orig_prompt_orig_topology,42,200,167,83.5
1,gpt-4_orig_prompt_orig_topology,43,200,170,85.0
2,gpt-4_orig_prompt_orig_topology,44,200,177,88.5
3,gpt-4_orig_prompt_orig_topology,45,197,167,83.5
4,gpt-4_orig_prompt_orig_topology,46,199,169,84.5
5,gpt-4_orig_prompt_orig_topology,47,199,167,83.5


Unnamed: 0,configuration,seed,n_completed_trajs,number_corrects,number_corrects_perc
0,gpt-4_impr_prompt_orig_topology,42,200,181,90.5
1,gpt-4_impr_prompt_orig_topology,43,200,180,90.0
2,gpt-4_impr_prompt_orig_topology,44,200,181,90.5
3,gpt-4_impr_prompt_orig_topology,45,199,176,88.0
4,gpt-4_impr_prompt_orig_topology,46,200,183,91.5
5,gpt-4_impr_prompt_orig_topology,47,200,176,88.0


Unnamed: 0,configuration,seed,n_completed_trajs,number_corrects,number_corrects_perc
0,gpt-4_impr_prompt_impr_topology,42,200,168,84.0
1,gpt-4_impr_prompt_impr_topology,43,200,174,87.0
2,gpt-4_impr_prompt_impr_topology,44,200,170,85.0
3,gpt-4_impr_prompt_impr_topology,45,200,172,86.0
4,gpt-4_impr_prompt_impr_topology,46,199,173,86.5
5,gpt-4_impr_prompt_impr_topology,47,200,169,84.5


Unnamed: 0,configuration,seed,n_completed_trajs,number_corrects,number_corrects_perc
0,gpt-4o_orig_prompt_orig_topology,42,198,172,86.0
1,gpt-4o_orig_prompt_orig_topology,43,197,172,86.0
2,gpt-4o_orig_prompt_orig_topology,44,197,165,82.5
3,gpt-4o_orig_prompt_orig_topology,45,198,170,85.0
4,gpt-4o_orig_prompt_orig_topology,46,195,163,81.5
5,gpt-4o_orig_prompt_orig_topology,47,196,169,84.5


Unnamed: 0,configuration,seed,n_completed_trajs,number_corrects,number_corrects_perc
0,gpt-4o_impr_prompt_orig_topology,42,200,174,87.0
1,gpt-4o_impr_prompt_orig_topology,43,198,182,91.0
2,gpt-4o_impr_prompt_orig_topology,44,198,176,88.0
3,gpt-4o_impr_prompt_orig_topology,45,199,178,89.0
4,gpt-4o_impr_prompt_orig_topology,46,199,179,89.5
5,gpt-4o_impr_prompt_orig_topology,47,198,179,89.5


Unnamed: 0,configuration,seed,n_completed_trajs,number_corrects,number_corrects_perc
0,gpt-4o_impr_prompt_impr_topology,42,194,180,90.0
1,gpt-4o_impr_prompt_impr_topology,43,198,178,89.0
2,gpt-4o_impr_prompt_impr_topology,44,198,175,87.5
3,gpt-4o_impr_prompt_impr_topology,45,199,177,88.5
4,gpt-4o_impr_prompt_impr_topology,46,198,182,91.0
5,gpt-4o_impr_prompt_impr_topology,47,193,174,87.0


In [3]:
df_grouped = concat([dict_df_grouped[k] for k in dict_df_grouped.keys()])

In [4]:
df_grouped

Unnamed: 0,configuration,seed,n_completed_trajs,number_corrects,number_corrects_perc
0,gpt-4_orig_prompt_orig_topology,42,200,167,83.5
1,gpt-4_orig_prompt_orig_topology,43,200,170,85.0
2,gpt-4_orig_prompt_orig_topology,44,200,177,88.5
3,gpt-4_orig_prompt_orig_topology,45,197,167,83.5
4,gpt-4_orig_prompt_orig_topology,46,199,169,84.5
5,gpt-4_orig_prompt_orig_topology,47,199,167,83.5
0,gpt-4_impr_prompt_orig_topology,42,200,181,90.5
1,gpt-4_impr_prompt_orig_topology,43,200,180,90.0
2,gpt-4_impr_prompt_orig_topology,44,200,181,90.5
3,gpt-4_impr_prompt_orig_topology,45,199,176,88.0


In [5]:
df_grouped.loc[:, ["configuration", "n_completed_trajs", "number_corrects", "number_corrects_perc"]].groupby(["configuration"]).agg({"n_completed_trajs": ["mean", "std", "count"], "number_corrects": ["mean", "std", "count"], "number_corrects_perc": ["mean", "std", "count"]})

Unnamed: 0_level_0,n_completed_trajs,n_completed_trajs,n_completed_trajs,number_corrects,number_corrects,number_corrects,number_corrects_perc,number_corrects_perc,number_corrects_perc
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
configuration,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gpt-4_impr_prompt_impr_topology,199.833333,0.408248,6,171.0,2.366432,6,85.5,1.183216,6
gpt-4_impr_prompt_orig_topology,199.833333,0.408248,6,179.5,2.880972,6,89.75,1.440486,6
gpt-4_orig_prompt_orig_topology,199.166667,1.169045,6,169.5,3.885872,6,84.75,1.942936,6
gpt-4o_impr_prompt_impr_topology,196.666667,2.503331,6,177.666667,3.011091,6,88.833333,1.505545,6
gpt-4o_impr_prompt_orig_topology,198.666667,0.816497,6,178.0,2.75681,6,89.0,1.378405,6
gpt-4o_orig_prompt_orig_topology,196.833333,1.169045,6,168.5,3.72827,6,84.25,1.864135,6


In [6]:
dict_df_grouped.keys()

dict_keys(['gpt-4_orig_prompt_orig_topology', 'gpt-4_impr_prompt_orig_topology', 'gpt-4_impr_prompt_impr_topology', 'gpt-4o_orig_prompt_orig_topology', 'gpt-4o_impr_prompt_orig_topology', 'gpt-4o_impr_prompt_impr_topology'])

In [7]:
statistic, p_value = stats.wilcoxon(
    list(dict_df_grouped["gpt-4_orig_prompt_orig_topology"]["number_corrects_perc"]), 
    list(dict_df_grouped["gpt-4_impr_prompt_orig_topology"]["number_corrects_perc"])
)
print(statistic, p_value)

0.0 0.03125


In [8]:
statistic, p_value = stats.wilcoxon(
    list(dict_df_grouped["gpt-4_orig_prompt_orig_topology"]["number_corrects_perc"]), 
    list(dict_df_grouped["gpt-4_impr_prompt_impr_topology"]["number_corrects_perc"])
)
print(statistic, p_value)

6.0 0.40625


In [9]:
statistic, p_value = stats.wilcoxon(
    list(dict_df_grouped["gpt-4o_orig_prompt_orig_topology"]["number_corrects_perc"]), 
    list(dict_df_grouped["gpt-4o_impr_prompt_orig_topology"]["number_corrects_perc"])
)
print(statistic, p_value)

0.0 0.03125


In [10]:
statistic, p_value = stats.wilcoxon(
    list(dict_df_grouped["gpt-4o_orig_prompt_orig_topology"]["number_corrects_perc"]), 
    list(dict_df_grouped["gpt-4o_impr_prompt_impr_topology"]["number_corrects_perc"])
)
print(statistic, p_value)

0.0 0.03125


In [11]:
statistic, p_value = stats.wilcoxon(
    list(dict_df_grouped["gpt-4o_impr_prompt_orig_topology"]["number_corrects_perc"]), 
    list(dict_df_grouped["gpt-4o_impr_prompt_impr_topology"]["number_corrects_perc"])
)
print(statistic, p_value)

9.0 0.8125


In [12]:
list_keys = list(dict_df.keys())

df = dict_df[list_keys[0]].loc[:, ["seed", "instance_id", "correct"]].copy()
df = df.rename({"correct": "correct_"+list_keys[0]}, axis=1)

for j in range(len(list_keys)-1):
    df = merge(
        df, 
        dict_df[list_keys[j+1]].loc[:, ["seed", "instance_id", "correct"]], 
        on=["seed", "instance_id"], how="inner", 
    )
    df = df.rename({"correct": "correct_"+list_keys[j+1]}, axis=1)

In [13]:
df.shape

(1167, 8)

In [14]:
df.sum()

seed                                        4646464646464646464646464646464646464646464646...
instance_id                                 9b19faba-d659-5822-b8ee-10914dddb518b1704622-7...
correct_gpt-4_orig_prompt_orig_topology                                                   996
correct_gpt-4_impr_prompt_orig_topology                                                  1049
correct_gpt-4_impr_prompt_impr_topology                                                  1003
correct_gpt-4o_orig_prompt_orig_topology                                                 1000
correct_gpt-4o_impr_prompt_orig_topology                                                 1048
correct_gpt-4o_impr_prompt_impr_topology                                                 1055
dtype: object

In [15]:
df

Unnamed: 0,seed,instance_id,correct_gpt-4_orig_prompt_orig_topology,correct_gpt-4_impr_prompt_orig_topology,correct_gpt-4_impr_prompt_impr_topology,correct_gpt-4o_orig_prompt_orig_topology,correct_gpt-4o_impr_prompt_orig_topology,correct_gpt-4o_impr_prompt_impr_topology
0,46,9b19faba-d659-5822-b8ee-10914dddb518,True,True,True,True,True,True
1,46,b1704622-73f8-5d2f-99d0-af4c6a80ee93,True,True,True,True,True,True
2,46,9e8ee62d-dd02-5c50-a42e-8eada69a0acc,True,True,True,True,True,True
3,46,03846ead-3648-58cb-864b-d42093699a1f,True,True,True,True,True,True
4,46,9e7ea617-0523-599b-8d24-129114eaf302,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...
1162,43,de2ed0ec-dcc0-55b2-b7ca-4ea127558d5b,True,True,True,True,True,True
1163,43,2a8fa54b-df25-5d6f-a7a0-e40885768b1b,True,True,True,True,True,True
1164,43,d5d5a02e-a48c-5884-b561-a281996f4b2f,True,True,True,True,True,True
1165,43,92b1c12e-e163-55c0-b5d5-d5a7098c7854,True,True,True,True,True,True


In [16]:
df.keys()

Index(['seed', 'instance_id', 'correct_gpt-4_orig_prompt_orig_topology',
       'correct_gpt-4_impr_prompt_orig_topology',
       'correct_gpt-4_impr_prompt_impr_topology',
       'correct_gpt-4o_orig_prompt_orig_topology',
       'correct_gpt-4o_impr_prompt_orig_topology',
       'correct_gpt-4o_impr_prompt_impr_topology'],
      dtype='object')

In [17]:
array_all_wrongs = df.loc[:, [
    'correct_gpt-4_orig_prompt_orig_topology',
    'correct_gpt-4_impr_prompt_orig_topology',
    'correct_gpt-4_impr_prompt_impr_topology',
    'correct_gpt-4o_orig_prompt_orig_topology',
    'correct_gpt-4o_impr_prompt_orig_topology',
    'correct_gpt-4o_impr_prompt_impr_topology' 
    ]
].sum(axis=1) == 0

In [18]:
unique(df.loc[array_all_wrongs, :]["instance_id"], return_counts=True)

(array(['094b6ef4-9501-550d-b9f8-4d5330d07e34',
        '58be8739-d76d-579a-8ceb-ffc2f3064f18',
        '65b61e80-7336-5503-84bd-8c4a500b011e',
        '882b00dd-ba1e-553d-b3c7-2760e2ff169c',
        '9a26e171-0fb9-5138-a73e-2a4b3da3bde5',
        'a5714da3-9107-5a28-a6df-bb97dcf4e29a',
        'e3a6bf34-7337-56bf-900a-4a10b6b061cc',
        'e6b8c5fe-c0ad-50cb-b84d-7dad800dc132',
        'fe9a1fc1-20e9-5e02-bc4a-4d56f3b205ea'], dtype=object),
 array([3, 6, 1, 2, 1, 4, 5, 2, 2]))