In [65]:
import pandas as pd
import os

# Load the evaluation results; the first CSV column is used as the row index.
results_path = os.path.join("results", 'results.csv')
df = pd.read_csv(results_path, index_col=0)

# Show (rows, columns) of the loaded frame.
df.shape

(81, 9)

In [66]:
# Preview the first five rows of the loaded results.
df.head()

Unnamed: 0,dataset,prompt,total score,accuracy,completeness,fluency,conciseness,n_few_shot,n_bootstrapped_few_shot
0,housing_1.json,You are helping users understand an ML model's...,15.24,3.52,3.92,3.8,4.0,1.0,
1,housing_1.json,You are helping users understand an ML model's...,14.96,3.2,3.92,3.84,4.0,3.0,
0,housing_1.json,You are helping users understand an ML model's...,15.24,3.52,4.0,3.72,4.0,5.0,
1,housing_2.json,You are helping users understand an ML model's...,16.0,4.0,4.0,4.0,4.0,1.0,
2,housing_2.json,You are helping users understand an ML model's...,16.0,4.0,4.0,4.0,4.0,3.0,


In [67]:
# Short display labels for each dataset file name.
dataset_names = {"housing_1.json": "House 1",
                 "housing_2.json": "House 2",
                 "housing_3.json": "House 3",
                 "mushroom_1.json": "Mush 1",
                 "mushroom_2.json": "Mush 2",
                 "student_1.json": "Student 1",
                 "student_2.json": "Student 2",
                 "pdf_1.json": "PDF 1",
                 "pdf_2.json": "PDF 2"}

# Series.map turns every value that is missing from the dict into NaN, so running
# this cell a second time (when the column already holds the short labels), or
# meeting a dataset that is not listed above, would silently blank the column.
# Falling back to the current value keeps such entries intact and makes the cell
# safe to re-run.
df['dataset'] = df['dataset'].map(dataset_names).fillna(df['dataset'])

In [68]:
# Label each distinct prompt "Prompt 1", "Prompt 2", ... in order of first
# appearance (Series.unique preserves that order). Building the mapping from
# however many prompts are present avoids an IndexError when there are fewer than
# three prompts and NaN labels when there are more than three.
prompts_to_ids = {
    prompt: f"Prompt {position}"
    for position, prompt in enumerate(df['prompt'].unique(), start=1)
}

df['prompt'] = df['prompt'].map(prompts_to_ids)

In [69]:
# Missing counts are filled with 0 (presumably meaning that no examples of that
# kind were used -- confirm against the experiment setup) and stored as integers.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a selected column: that chained in-place form emits a
# FutureWarning in pandas 2.x and does not update the frame at all under
# Copy-on-Write, which would make the integer cast below fail on NaN.
for count_column in ("n_few_shot", "n_bootstrapped_few_shot"):
    df[count_column] = df[count_column].fillna(0).astype(int)

df.head()

Unnamed: 0,dataset,prompt,total score,accuracy,completeness,fluency,conciseness,n_few_shot,n_bootstrapped_few_shot
0,House 1,Prompt 1,15.24,3.52,3.92,3.8,4.0,1,0
1,House 1,Prompt 1,14.96,3.2,3.92,3.84,4.0,3,0
0,House 1,Prompt 1,15.24,3.52,4.0,3.72,4.0,5,0
1,House 2,Prompt 1,16.0,4.0,4.0,4.0,4.0,1,0
2,House 2,Prompt 1,16.0,4.0,4.0,4.0,4.0,3,0


In [70]:
# Presentation headers for the raw columns. The insertion order of this dict is
# also the column order of the cleaned table.
column_names = {
    "dataset": "Dataset",
    "prompt": "Prompt",
    "n_few_shot": "$L$",
    "n_bootstrapped_few_shot": "$B$",
    "accuracy": "Accuracy",
    "completeness": "Completeness",
    "fluency": "Fluency",
    "conciseness": "Conciseness",
    "total score": "Total score",
}

# Rename, keep only the renamed columns in the order above, and round every
# numeric value to 3 decimal places.
df = (
    df.rename(columns=column_names)
      .loc[:, list(column_names.values())]
      .round(3)
)

cleaned_path = os.path.join("results", "cleaned_results.csv")
df.to_csv(cleaned_path)

df.head()

Unnamed: 0,Dataset,Prompt,$L$,$B$,Accuracy,Completeness,Fluency,Conciseness,Total score
0,House 1,Prompt 1,1,0,3.52,3.92,3.8,4.0,15.24
1,House 1,Prompt 1,3,0,3.2,3.92,3.84,4.0,14.96
0,House 1,Prompt 1,5,0,3.52,4.0,3.72,4.0,15.24
1,House 2,Prompt 1,1,0,4.0,4.0,4.0,4.0,16.0
2,House 2,Prompt 1,3,0,4.0,4.0,4.0,4.0,16.0


In [71]:
# Mean of every score column for each (prompt, $L$, $B$) combination, rounded to
# 3 decimal places.
score_columns = ["Accuracy", "Completeness", "Fluency", "Conciseness", "Total score"]
df_by_technique = (
    df.groupby(["Prompt", "$L$", "$B$"])
      .agg({score_column: "mean" for score_column in score_columns})
      .reset_index()
      .round(3)
)

# Write the table as CSV (with the row index) and as a LaTeX table (without it).
df_by_technique.to_csv(os.path.join("results", "results_by_technique.csv"))
df_by_technique.to_latex(
    buf=os.path.join("results", "results_by_technique.tex"),
    index=False,
    float_format="%.3f",
    caption="Overall results for each prompt and few-shot setting.",
)

df_by_technique.head()

Unnamed: 0,Prompt,$L$,$B$,Accuracy,Completeness,Fluency,Conciseness,Total score
0,Prompt 1,0,0,3.911,4.0,2.266,4.0,14.177
1,Prompt 1,0,1,3.782,3.944,2.45,4.0,14.177
2,Prompt 1,0,3,3.8,3.936,2.477,4.0,14.212
3,Prompt 1,1,0,3.169,3.384,3.796,4.0,14.349
4,Prompt 1,3,0,3.156,3.428,3.751,4.0,14.334


In [79]:
# BY TECHNIQUE WITH STD DEV

technique_keys = ["Prompt", "$L$", "$B$"]
metric_columns = ["Accuracy", "Completeness", "Fluency", "Conciseness", "Total score"]

# Per-group mean and standard deviation of every score column, as two frames
# that share the same row order.
df_by_technique_mean = (
    df.groupby(technique_keys)
      .agg(dict.fromkeys(metric_columns, "mean"))
      .reset_index()
)
df_by_technique_std = (
    df.groupby(technique_keys)
      .agg(dict.fromkeys(metric_columns, "std"))
      .reset_index()
)

# Replace each numeric mean by the text "mean (std)": mean with 3 decimals,
# standard deviation with 2.
for column in metric_columns:
    mean_text = df_by_technique_mean[column].map(lambda value: f"{value:.3f}")
    std_text = df_by_technique_std[column].map(lambda value: f"{value:.2f}")
    df_by_technique_mean[column] = mean_text + " (" + std_text + ")"

# Save to CSV and LaTeX (same paths as the plain per-technique table).
df_by_technique_mean.to_csv(os.path.join("results", "results_by_technique.csv"), index=False)
df_by_technique_mean.to_latex(
    buf=os.path.join("results", "results_by_technique.tex"),
    index=False,
    # escape=False writes header and cell text verbatim, so the "$L$" / "$B$"
    # headers stay as LaTeX math instead of being escaped.
    escape=False,
    caption="Overall results for each prompt and few-shot setting."
)

In [72]:
# Mean of every score column per dataset, taken over all rows of that dataset
# (i.e. across every prompt and few-shot setting), rounded to 3 decimal places.
df_by_dataset = df.groupby(["Dataset"]).agg({"Accuracy": "mean", "Completeness": "mean", "Fluency": "mean", "Conciseness": "mean", "Total score": "mean"}).reset_index()

df_by_dataset = df_by_dataset.round(3)

df_by_dataset.to_csv(os.path.join("results", "results_by_dataset.csv"))
# The caption names what this table is grouped by (datasets); the previous text
# was the caption of the per-technique table.
df_by_dataset.to_latex(buf=os.path.join("results", "results_by_dataset.tex"), index=False, float_format="%.3f", caption="Overall results for each dataset.")

df_by_dataset.head()

In [80]:
# BY DATASET WITH STD DEV

# Group by dataset and calculate both mean and standard deviation of every score
# column; the two frames share the same row order.
df_by_dataset_mean = df.groupby(["Dataset"]).agg({
    "Accuracy": "mean", 
    "Completeness": "mean", 
    "Fluency": "mean", 
    "Conciseness": "mean", 
    "Total score": "mean"
}).reset_index()

df_by_dataset_std = df.groupby(["Dataset"]).agg({
    "Accuracy": "std", 
    "Completeness": "std", 
    "Fluency": "std", 
    "Conciseness": "std", 
    "Total score": "std"
}).reset_index()

# Combine mean and standard deviation into the desired format: mean (std).
# The value in parentheses must come from df_by_dataset_std; reading it from
# df_by_dataset_mean would print the mean twice.
for column in ["Accuracy", "Completeness", "Fluency", "Conciseness", "Total score"]:
    df_by_dataset_mean[column] = df_by_dataset_mean[column].apply(lambda x: f"{x:.3f}") + " (" + df_by_dataset_std[column].apply(lambda x: f"{x:.2f}") + ")"

# Save to CSV and LaTeX
df_by_dataset_mean.to_csv(os.path.join("results", "results_by_dataset.csv"), index=False)
df_by_dataset_mean.to_latex(
    buf=os.path.join("results", "results_by_dataset.tex"), 
    index=False, 
    escape=False,  # write header and cell text verbatim, without LaTeX escaping
    caption="Overall results for each dataset."
)