In [25]:
from social_agents.agent_builder import SocialAgentBuilder
from social_agents.utils import eval_experiment
import pandas as pd
from glob import glob
import os
# ---------------------------------------------------------------------------
# Evaluate every experiment output that has no evaluation file yet, append the
# metrics to the cumulative results CSV, and regenerate the summary CSV.
# ---------------------------------------------------------------------------
all_results_path = SocialAgentBuilder.ROOT_FOLDER + "experiment_results.csv"
path_ = SocialAgentBuilder.ROOT_FOLDER + "output_*.json"
time_log_path = SocialAgentBuilder.ROOT_FOLDER + "time_log.csv"

# Glob once and split the matches: paths containing "_eval_" are treated as
# existing evaluation files, everything else as a raw experiment output.
matched_files = glob(path_)
out_files = [x for x in matched_files if "_eval_" not in x]
evaluated_files = [x for x in matched_files if "_eval_" in x]
evaluated_lookup = set(evaluated_files)

data_split = "validation"
threshold = 0.6
metric = "similarity"
# NOTE(review): `metric` only takes part in the evaluation file name below; it
# is not passed to `eval_experiment` -- confirm the helper's default matches.

# Suffix of the evaluation file expected for this metric/threshold pair.
eval_suffix = f"_eval_{metric}_{str(threshold).replace('.', '')}.json"

# Load the time log once, *before* any evaluation runs, so that a missing log
# cannot abort the loop after `eval_experiment` finished but before the row
# was written to the results CSV.
try:
    time_log_df = pd.read_csv(time_log_path)
except FileNotFoundError:
    print(f"time log not found at {time_log_path}; timing columns will be empty")
    time_log_df = pd.DataFrame()

all_results = []
for out_ in out_files:
    # NOTE(review): `.replace` substitutes every "json" occurrence in the path,
    # not only the extension -- kept as-is so the name still matches whatever
    # the evaluation script writes; verify if paths may contain "json" twice.
    eval_name = out_.replace("json", eval_suffix)
    if eval_name in evaluated_lookup:
        print("already evaluated")
        continue

    exp_name = os.path.basename(out_).replace("output_", "").replace(".json", "")
    eval_dict = {"experiment_name": exp_name}
    eval_dict = eval_dict | eval_experiment(submission_path=out_, data_split=data_split, threshold=threshold)

    # Timing statistics are optional: they exist only when the time log has a
    # column named after the experiment.
    if exp_name not in time_log_df.columns:
        print(f"time not logged for {exp_name}")
    else:
        eval_dict["time_mean"] = time_log_df[exp_name].mean()
        eval_dict["time_std"] = time_log_df[exp_name].std()

    # Re-read and rewrite the cumulative CSV on every iteration so finished
    # evaluations are persisted even if a later experiment fails.
    # NOTE(review): rows are appended without any duplicate check, so an
    # experiment evaluated twice ends up with two rows in the results file.
    try:
        all_experiments_results_df = pd.read_csv(all_results_path)
        all_experiments_results_df = all_experiments_results_df.drop(
            columns=[col for col in all_experiments_results_df.columns if col.startswith('Unnamed')]
        )
    except FileNotFoundError:
        all_experiments_results_df = pd.DataFrame()

    all_experiments_results_df = pd.concat([all_experiments_results_df, pd.DataFrame([eval_dict])], ignore_index=True)
    all_experiments_results_df.to_csv(all_results_path, index=False)
    all_results.append(eval_dict)

# Rows produced by this run only.
new_results_df = pd.DataFrame(all_results)

# Full history as stored on disk.
all_experiments_results_df = pd.read_csv(all_results_path)

# The metric columns must exist (a KeyError here signals a malformed results
# file); the timing columns are optional because they are only written when a
# time log entry was found, so missing ones are filled with NaN.
summary_metric_columns = ["experiment_name", "Useful_ratio", "3/3_ratio", "overall_punctuation"]
summary_timing_columns = ["time_mean", "time_std"]
summary_df = all_experiments_results_df[summary_metric_columns].join(
    all_experiments_results_df.reindex(columns=summary_timing_columns)
)
summary_df.to_csv(all_results_path.replace(".csv", "_summary.csv"), index=False)

already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
Running command: python eval_scripts/evaluation.py --metric similarity --input_path data_splits/validation.json --submission_path output/elbaff_experiment/output_llama8b_social_n3_Teee_Srrr.json --threshold 0.6
Distribution of the labels: Counter({'Useful': 382, 'not_able_to_evaluate': 102, 'Unhelpful': 49, 'Invalid': 25})
Distribution of the intervention punctuation: Counter({1.0: 73, 0.6666666666666666: 60, 0.3333333333333333: 43, 0: 10})
Overall punctuation 0.6845878136200716

already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
already evaluated
Running command: python eval_scripts/evaluation.py --metric similarity --input_path data_splits/valid

In [28]:
# Preview the first five rows evaluated during this run.
new_results_df.head(5)

Unnamed: 0,experiment_name,Useful,not_able_to_evaluate,Unhelpful,Invalid,Useful_ratio,not_able_to_evaluate_ratio,Unhelpful_ratio,Invalid_ratio,3/3,2/3,1/3,0/3,3/3_ratio,2/3_ratio,1/3_ratio,0/3_ratio,overall_punctuation,time_mean,time_std
0,llama8b_social_n3_Teee_Srrr,382,102,49,25,0.68,0.18,0.09,0.04,73,60,43,10,0.39,0.32,0.23,0.05,0.684588,0.271658,1.440042
1,llama8b_social_n3_Teee_Sdrr,363,109,66,20,0.65,0.2,0.12,0.04,62,69,39,16,0.33,0.37,0.21,0.09,0.650538,0.175149,1.226887
2,llama8b_social_n3_Teeo_Srdd,353,107,77,18,0.64,0.19,0.14,0.03,53,73,48,12,0.28,0.39,0.26,0.06,0.632616,0.043552,0.590147
3,llama8b_social_n3_Teee_Srdd,397,81,58,22,0.71,0.15,0.1,0.04,74,71,33,8,0.4,0.38,0.18,0.04,0.71147,0.749983,2.812153
4,llama8b_social_n3_Teeo_Sdrr,378,103,60,17,0.68,0.18,0.11,0.03,69,66,39,12,0.37,0.35,0.21,0.06,0.677419,0.077598,0.742544


In [None]:
# Order the full results history by the overall punctuation score (ascending).
# The column name was previously truncated to "overa", which raises KeyError.
all_experiments_results_df.sort_values(by="overall_punctuation")

In [29]:
# Ten best experiments by overall punctuation, highest score first.
summary_df.sort_values(by="overall_punctuation", ascending=False).head(10)


Unnamed: 0,experiment_name,Useful_ratio,3/3_ratio,overall_punctuation,time_mean,time_std
32,gpt-4o-mini_social_n1_Te_Sr,0.73,0.43,0.725806,10.956791,2.838868
1,gpt-4o-mini_social_n1_Te_Sr,0.73,0.43,0.725806,10.956791,2.838868
0,gpt-4o-mini_social_n1_Te_Srr,0.73,0.42,0.725806,14.625542,2.615584
19,gpt-4o-mini_social_n1_Te_Srr,0.73,0.42,0.725806,14.625542,2.615584
124,llama8b_social_n3_Teee_Srdd,0.71,0.4,0.71147,0.749983,2.812153
57,llama8b_social_n3_Teee_Srd,0.7,0.42,0.700717,0.078821,0.754161
43,llama8b_social_n3_Teoo_Sd,0.69,0.4,0.693548,0.13377,0.643908
34,llama8b_social_n2_Tee_Sdd,0.7,0.42,0.693548,0.109199,0.665749
54,llama8b_social_n3_Teeo_Sdr,0.69,0.39,0.689964,0.0403,0.542073
51,llama8b_social_n3_Teoo_Srr,0.69,0.39,0.689964,0.286844,1.553684


In [None]:
# List the column labels of the cumulative results frame.
all_experiments_results_df.columns

In [None]:
from glob import glob

# Collect every JSON file stored in the experiment's final_states folder.
final_states_glob = "output/elbaff_experiment/final_states/*.json"
files = glob(final_states_glob)

# Number of final-state files found.
len(files)

In [None]:
from collections import Counter
import json

# Scan the final-state files and flag those whose
# `final_cq -> critical_questions` list does not contain exactly 3 entries.
EXPECTED_QUESTION_COUNT = 3

fail_fs = []    # paths of the files with an unexpected number of questions
numbers = []    # the unexpected question counts, one entry per flagged file
correct_n = []  # question counts of the files that passed the check
for state_path in files:
    # Only hold the file open while parsing it.
    with open(state_path, "r") as state_file:
        state = json.load(state_file)

    n_questions = len(state['final_cq']['critical_questions'])
    if n_questions != EXPECTED_QUESTION_COUNT:
        numbers.append(n_questions)
        fail_fs.append(state_path)
    else:
        correct_n.append(n_questions)

# Show how the unexpected counts are distributed. Previously this Counter was
# built and silently discarded because it was not the cell's last expression.
print("question-count distribution of flagged files:", dict(Counter(numbers)))

# De-duplicate the flagged paths; sorted so the order is deterministic.
fail_fs = sorted(set(fail_fs))

# Number of flagged files.
len(numbers)

In [None]:
import os
# Delete every flagged final-state file, reporting paths that are already gone.
for failed_path in fail_fs:
    if not os.path.exists(failed_path):
        print("Doesnt exist")
        continue
    os.remove(failed_path)

In [None]:
# Keep the flagged paths under a second name; this binds the same list object
# as `fail_fs` (an alias, not a copy).
deleted = fail_fs

In [None]:
# Show the flagged paths in sorted order.
sorted(deleted)