In [1]:
import pandas as pd
import os
from functools import reduce

# Directory containing the per-run evaluation summaries
summary_dir = './../data/eval/alfworld/final_eval_in_distribution'

# Collect every 'summary.csv' found anywhere below summary_dir
summaries = [
    os.path.join(root, file)
    for root, _, files in os.walk(summary_dir)
    for file in files
    if file == 'summary.csv'
]

# Fail early with an actionable message: with no summaries the merge further
# down would otherwise stop with an opaque "reduce() of empty iterable" error.
if not summaries:
    raise FileNotFoundError(f"No 'summary.csv' files found under {summary_dir!r}")

# Load each summary; the 'env_idx' column is not used by the analysis below
dfs = [pd.read_csv(summary).drop(columns=['env_idx']) for summary in summaries]

# Each model_id may appear in at most one summary. A repeated id would yield
# clashing 'score_<model_id>' / 'num_actions_<model_id>' columns once the
# per-summary tables are merged on 'gamefile'.
all_model_ids = set()
for df in dfs:
    model_ids = set(df['model_id'])
    overlap = model_ids & all_model_ids
    if overlap:
        raise ValueError(f"Duplicate model_id(s) found across summaries: {overlap}")
    all_model_ids.update(model_ids)

# Flatten and pivot each dataframe
def flatten_dataframe(df):
    """Reshape a long-format summary into one row per gamefile.

    Args:
        df: DataFrame with 'gamefile', 'model_id', 'score' and
            'num_actions' columns.

    Returns:
        DataFrame with a 'gamefile' column followed by all
        'score_<model_id>' columns and then all 'num_actions_<model_id>'
        columns. Combinations missing from ``df`` come out as NaN.

    Raises:
        ValueError: Raised by pandas' ``pivot`` when a
            (gamefile, model_id) pair occurs more than once.
    """
    def _wide(metric):
        # One column per model_id, each labelled '<metric>_<model_id>'.
        table = df.pivot(index='gamefile', columns='model_id', values=metric)
        return table.add_prefix(f'{metric}_')

    combined = _wide('score').join(_wide('num_actions'), how='outer')
    return combined.reset_index()

# One wide table (row per gamefile) for each summary
flattened_dfs = [flatten_dataframe(df) for df in dfs]

# Merge all dataframes on 'gamefile'; the outer join keeps gamefiles that are
# missing from some summaries (their columns are NaN for those rows)
final_df = reduce(lambda left, right: pd.merge(left, right, on='gamefile', how='outer'), flattened_dfs)

# TODO: LEAP runs ran till max 20 actions, but for fair comparisons all failures must be set to max 30 actions to be compatible with other baselines
# Fix by rerunning LEAP till 30
# Until then, any failed episode (score 0) recorded at exactly 20 actions is
# counted as having used 30. NOTE(review): the rule is applied to every model
# column, not only the LEAP ones — confirm no baseline can fail at exactly 20.
LEAP_MAX_ACTIONS = 20
BASELINE_MAX_ACTIONS = 30
for score_col in final_df.columns[final_df.columns.str.startswith('score_')]:
    # Swap only the leading prefix. str.replace would rewrite every 'score_'
    # occurrence, producing a wrong column name (and silently skipping the
    # patch) for a model id that itself contains 'score_'.
    num_actions_col = 'num_actions_' + score_col[len('score_'):]
    if num_actions_col in final_df.columns:
        mask = (final_df[score_col] == 0) & (final_df[num_actions_col] == LEAP_MAX_ACTIONS)
        final_df.loc[mask, num_actions_col] = BASELINE_MAX_ACTIONS

# Task-name prefix -> short category label used in the results table
prefixes = {
    'pick_and_place': 'pick',
    'pick_clean_then_place': 'clean',
    'pick_heat_then_place': 'heat',
    'pick_cool_then_place': 'cool',
    'look_at_obj': 'look',
    'pick_two_obj': 'picktwo'
}


def _category_of(gamefile):
    """Return the category label for a gamefile path, or None if no prefix matches."""
    # The second '/'-separated component is matched against the prefixes
    # (presumably the task directory — confirm against the gamefile layout).
    component = gamefile.split('/')[1]
    for prefix, label in prefixes.items():
        if component.startswith(prefix):
            return label
    return None


final_df['category'] = final_df['gamefile'].apply(_category_of)

# Select the metric columns by their exact prefixes (the same 'score_' /
# 'num_actions_' prefixes that flatten_dataframe attaches)
score_columns = [col for col in final_df.columns if col.startswith('score_')]
num_actions_columns = [col for col in final_df.columns if col.startswith('num_actions_')]

# Mean score and mean num_actions per task category
grouped_means = final_df.groupby('category')[score_columns + num_actions_columns].mean()

# Means over all gamefiles, as a single row labelled 'overall'
overall_means = final_df[score_columns + num_actions_columns].mean().to_frame().T
overall_means.index = ['overall']

# Append overall means to grouped means
grouped_means_with_overall = pd.concat([grouped_means, overall_means])

# Interleave columns so each model's score sits next to its num_actions.
# Only the leading prefix is swapped: replacing every 'score' substring would
# corrupt the name for a model id that contains 'score' and the column
# selection below would then raise KeyError.
ordered_columns = [
    col
    for score_col in score_columns
    for col in (score_col, 'num_actions_' + score_col[len('score_'):])
]

# Scale mean scores by 100 (presumably scores are 0/1 successes, making these
# success percentages — confirm against the summary format)
grouped_means_with_overall[score_columns] *= 100
grouped_means_with_overall_ordered = grouped_means_with_overall[ordered_columns]

# Reindex rows to match desired order; a category with no rows becomes NaN
desired_order = ['pick', 'clean', 'heat', 'cool', 'look', 'picktwo', 'overall']
grouped_means_with_overall_ordered = grouped_means_with_overall_ordered.reindex(desired_order)

# Format values to one decimal place. Column-wise Series.map is used instead
# of DataFrame.applymap, which is deprecated in recent pandas and emits a
# FutureWarning; this form behaves the same on old and new versions.
grouped_means_with_overall_ordered_formatted = grouped_means_with_overall_ordered.apply(
    lambda column: column.map(lambda x: f"{x:.1f}")
)

# Display final table
grouped_means_with_overall_ordered_formatted


  grouped_means_with_overall_ordered_formatted = grouped_means_with_overall_ordered.applymap(lambda x: f"{x:.1f}")


model_id,score_leap-llm/Meta-Llama-3-8B-Instruct-sft-alfworld-iter2,num_actions_leap-llm/Meta-Llama-3-8B-Instruct-sft-alfworld-iter2,score_leap-llm/Meta-Llama-3-8B-Instruct-sft-alfworld-iter3,num_actions_leap-llm/Meta-Llama-3-8B-Instruct-sft-alfworld-iter3,score_gpt-4o,num_actions_gpt-4o,score_gpt-4o-mini,num_actions_gpt-4o-mini,score_meta-llama/Meta-Llama-3-8B-Instruct,num_actions_meta-llama/Meta-Llama-3-8B-Instruct,score_leap-llm/Meta-Llama-3-8B-Instruct-sft-alfworld-iter0,num_actions_leap-llm/Meta-Llama-3-8B-Instruct-sft-alfworld-iter0,score_leap-llm/Meta-Llama-3-8B-Instruct-sft-alfworld-iter1,num_actions_leap-llm/Meta-Llama-3-8B-Instruct-sft-alfworld-iter1
pick,94.3,7.9,100.0,6.9,91.4,12.7,40.0,22.3,8.6,28.4,88.6,11.1,100.0,7.1
clean,92.6,11.7,96.3,10.5,33.3,25.4,11.1,28.9,0.0,30.0,51.9,20.7,100.0,10.0
heat,87.5,12.9,100.0,11.7,31.2,26.2,12.5,29.1,0.0,30.0,56.2,21.2,100.0,11.9
cool,88.0,13.4,87.5,12.3,12.0,28.8,4.0,30.0,0.0,30.0,80.0,15.9,92.0,12.4
look,92.3,10.8,100.0,7.8,84.6,13.4,53.8,18.8,7.7,28.5,76.9,13.1,92.3,9.5
picktwo,83.3,13.5,83.3,13.4,66.7,21.5,20.8,26.8,0.0,30.0,50.0,21.2,91.7,13.6
overall,90.0,11.4,94.2,10.3,54.3,21.1,22.9,26.1,2.9,29.5,68.6,16.9,96.4,10.5
