#### **NOTE: Please make sure to update the file path as needed to match your local or project-specific directory structure.**

In [None]:
# Colab-only step: mounts Google Drive at /content/drive, the location ROOT points into.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import pandas as pd

In [None]:
# Base directory that every input/output path in this notebook is joined onto;
# change it to match your own Drive / project layout.
ROOT = "/content/drive/MyDrive/001_projects"

### Initializing Key Mapping

In [None]:
# Raw check identifier (as looked up under 'checks_by_type' in each eval report)
# -> human-readable column title used in the generated tables.
key_mapping = dict([
    ("node_state", "Node State"),
    ("child_id_assignment", "Child ID Assignment"),
    ("looping_order", "Looping Order"),
    ("child_state", "Child State"),
    ("pruning", "Pruning"),
    ("pseudo_recursive_call", "Pseudo-Recursive Call"),
    ("base_case", "Base Case"),
    ("backtracking_I", "Backtracking I"),
    ("backtracking_II", "Backtracking II"),
    ("backtracking_III", "Backtracking III"),
    ("backtracking_IV", "Backtracking IV"),
    ("valid_search_trace", "Valid Search Trace"),
])
key_mapping

{'node_state': 'Node State',
 'child_id_assignment': 'Child ID Assignment',
 'looping_order': 'Looping Order',
 'child_state': 'Child State',
 'pruning': 'Pruning',
 'pseudo_recursive_call': 'Pseudo-Recursive Call',
 'base_case': 'Base Case',
 'backtracking_I': 'Backtracking I',
 'backtracking_II': 'Backtracking II',
 'backtracking_III': 'Backtracking III',
 'backtracking_IV': 'Backtracking IV',
 'valid_search_trace': 'Valid Search Trace'}

#### Compiling Eval for gpt-4o

In [None]:
# `os` must be imported before this cell runs: the notebook's other `import os`
# sits in a later cell, so a fresh-kernel "Run All" would otherwise stop here
# with NameError.
import os

# Input folder holding the per-game eval JSON reports, and the folder the
# compiled CSVs are written to, for the gpt-4o run.
gpt_4o_folder_path = os.path.join(ROOT, "agentx/gpt-4o-run-2/eval/")
gpt_4o_output_dir = os.path.join(ROOT, "agentx/gpt-4o-run-2")

In [None]:
import os
import json
import pandas as pd

def process_results(folder_path, key_mapping, output_dir, file_prefix="game_24_gpt-4o"):
    """
    Aggregates per-game JSON eval reports into three DataFrames and saves them as CSVs.

    Every file in `folder_path` ending in ".json" is treated as one game and is
    visited in sorted filename order. The 'Game' label is the part of the
    filename before "_eval_report", with underscores replaced by commas.

    Generated tables (one row per game, each with 'Game ID' and 'Game' columns):
    - Failure count per check type: number of entries in the check's 'details'
      whose 'result' is "FAIL". For "valid_search_trace" the report's
      'success_trace' value is stored as-is instead of a count.
    - Total checks per check type: number of 'details' entries
      (always 1 for "valid_search_trace").
    - Final answer status: 1 when the report has a "final_answer" entry whose
      "FAIL" value is missing or falsy, otherwise 0.

    Check types from `key_mapping` that are absent from a report are recorded
    as the string "NA" in both count tables.

    Parameters:
    - folder_path: Path to the folder containing the JSON files
    - key_mapping: Dictionary mapping raw check keys (as found under
      'checks_by_type') to readable column names
    - output_dir: Directory where the result CSVs are saved; created if missing
    - file_prefix: Prefix for the output file names

    Returns:
    - df_failure_pct, df_total, df_final_answer: The three generated DataFrames
      (despite its name, df_failure_pct holds failure counts, not percentages)
    """
    failure_rows = []
    total_rows = []
    final_answer_status = []
    game_names = []  # Game label extracted from each report's filename

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue

        # "<game>_eval_report....json" -> "<game>" with "_" turned into ","
        game_names.append(filename.split("_eval_report")[0].replace("_", ","))

        # JSON interchange is UTF-8; do not depend on the platform default encoding.
        with open(os.path.join(folder_path, filename), 'r', encoding="utf-8") as f:
            json_data = json.load(f)
        checks_by_type = json_data['checks_by_type']

        failure_row = {}
        total_row = {}
        for key, column in key_mapping.items():
            if key not in checks_by_type:
                # Check type not present in this game's report
                failure_row[column] = "NA"
                total_row[column] = "NA"
            elif key == "valid_search_trace":
                # Trace validity is a single flag rather than a list of details
                failure_row[column] = checks_by_type[key]['success_trace']
                total_row[column] = 1
            else:
                details = checks_by_type[key]['details']
                failure_row[column] = sum(1 for detail in details if detail['result'] == "FAIL")
                total_row[column] = len(details)

        failure_rows.append(failure_row)
        total_rows.append(total_row)

        # Final Answer Check: a missing "final_answer" entry counts as incorrect
        if "final_answer" in checks_by_type:
            fail_count = checks_by_type["final_answer"].get("FAIL", None)
            final_answer_status.append(0 if fail_count else 1)
        else:
            final_answer_status.append(0)

    # Convert to DataFrames
    df_failure_pct = pd.DataFrame(failure_rows)
    df_total = pd.DataFrame(total_rows)

    # Add Game ID (position in sorted file order) and Game name
    id_columns = ['Game ID', 'Game']
    game_ids = list(range(len(game_names)))

    df_failure_pct['Game ID'] = game_ids
    df_failure_pct['Game'] = game_names

    df_total['Game ID'] = game_ids
    df_total['Game'] = game_names

    df_final_answer = pd.DataFrame({
        "Game ID": game_ids,
        "Game": game_names,
        "Final Answer": final_answer_status
    })

    # Reorder columns so the identifier columns come first
    df_failure_pct = df_failure_pct[id_columns + [col for col in df_failure_pct.columns if col not in id_columns]]
    df_total = df_total[id_columns + [col for col in df_total.columns if col not in id_columns]]

    # Save the CSVs; to_csv does not create missing directories, so do it here
    os.makedirs(output_dir, exist_ok=True)
    df_failure_pct.to_csv(os.path.join(output_dir, f"{file_prefix}_gamewise_failure_count.csv"), index=False)
    df_total.to_csv(os.path.join(output_dir, f"{file_prefix}_gamewise_total_count.csv"), index=False)
    df_final_answer.to_csv(os.path.join(output_dir, f"{file_prefix}_gamewise_final_answer.csv"), index=False)

    return df_failure_pct, df_total, df_final_answer

In [None]:
# Note: despite the `df_failure_pct_` prefix, the first frame holds per-check FAIL
# counts (and the raw success flag in "Valid Search Trace"), not percentages.
df_failure_pct_gpt_4o, df_total_gpt_4o, df_final_answer_gpt_4o = process_results(gpt_4o_folder_path, key_mapping, gpt_4o_output_dir, "game_24_gpt-4o")

In [None]:
df_failure_pct_gpt_4o

Unnamed: 0,Game ID,Game,Node State,Child ID Assignment,Looping Order,Child State,Pruning,Pseudo-Recursive Call,Base Case,Backtracking I,Backtracking II,Backtracking III,Backtracking IV,Valid Search Trace
0,0,10101212,0,0,0,0,0,0,0.0,0,0.0,,,True
1,1,1278,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
2,2,131011,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
3,3,131012,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
4,4,1356,0,0,0,0,0,0,,0,,1.0,0.0,False
5,5,13710,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
6,6,16712,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
7,7,16912,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
8,8,2101011,0,0,0,0,0,0,,0,,0.0,0.0,True
9,9,2111213,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True


##### Analysis: Checking how many times the final answer given by the model was correct.

In [None]:
df_final_answer_gpt_4o['Final Answer'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Final Answer,Unnamed: 1_level_1
1,0.766667
0,0.233333


##### Analysis: Checking how many times the search trace is valid

In [None]:
    df_failure_pct_gpt_4o['Valid Search Trace'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Valid Search Trace,Unnamed: 1_level_1
True,0.933333
False,0.066667


##### Analysis: Games with a correct final answer but an invalid search trace

In [None]:
# Attach each game's final-answer flag to its per-check failure counts.
gpt_4o_merged = df_failure_pct_gpt_4o.merge(
    df_final_answer_gpt_4o,
    on=['Game ID', 'Game'],
)
gpt_4o_merged

Unnamed: 0,Game ID,Game,Node State,Child ID Assignment,Looping Order,Child State,Pruning,Pseudo-Recursive Call,Base Case,Backtracking I,Backtracking II,Backtracking III,Backtracking IV,Valid Search Trace,Final Answer
0,0,10101212,0,0,0,0,0,0,0.0,0,0.0,,,True,1
1,1,1278,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
2,2,131011,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
3,3,131012,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
4,4,1356,0,0,0,0,0,0,,0,,1.0,0.0,False,0
5,5,13710,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
6,6,16712,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
7,7,16912,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
8,8,2101011,0,0,0,0,0,0,,0,,0.0,0.0,True,0
9,9,2111213,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1


In [None]:
# Rows where the final answer is flagged 1 while the search trace is flagged False.
gpt_4o_merged.loc[
    gpt_4o_merged['Final Answer'].eq(1)
    & gpt_4o_merged['Valid Search Trace'].eq(False)
]

Unnamed: 0,Game ID,Game,Node State,Child ID Assignment,Looping Order,Child State,Pruning,Pseudo-Recursive Call,Base Case,Backtracking I,Backtracking II,Backtracking III,Backtracking IV,Valid Search Trace,Final Answer


####  Interleave Two DataFrames by Rows

In [None]:
# Make sure both DataFrames are sorted by Game ID (just in case)
df_failure_sorted = df_failure_pct_gpt_4o.sort_values("Game ID").reset_index(drop=True)
df_total_sorted = df_total_cnt_gpt_4o.sort_values("Game ID").reset_index(drop=True)

interleaved_rows = []
for i in range(len(df_failure_sorted)):
    interleaved_rows.append(df_failure_sorted.iloc[i])
    interleaved_rows.append(df_total_sorted.iloc[i])

# Combine into one DataFrame
df_interleaved = pd.DataFrame(interleaved_rows).reset_index(drop=True)

# Add the 'Type' column
df_interleaved['Type'] = ['Error Count', 'Total Checks'] * (len(df_interleaved) // 2)

# Move 'Type' to come right after 'Game' column
cols = df_interleaved.columns.tolist()
game_id_idx = cols.index('Game ID')
game_idx = cols.index('Game')

# Place 'Type' after 'Game'
new_order = (
    cols[:game_idx + 1] +       # Up to 'Game'
    ['Type'] +                  # Add 'Type'
    [col for col in cols if col not in ('Type', 'Game ID', 'Game')]  # Rest
)
df_interleaved = df_interleaved[new_order]

In [None]:
# Persist the interleaved error-count / total-checks table next to the other gpt-4o CSVs.
df_interleaved.to_csv(os.path.join(gpt_4o_output_dir, "game_24_gpt-4o_gamewise_interleaved_table.csv"), index = False)

#### Compiling Eval for gpt-4o-mini

In [None]:
# Input folder with the per-game eval JSON reports and output folder for the gpt-4o-mini run.
# NOTE(review): these names spell `gpt_40` (digit zero) where the rest of the notebook
# uses `gpt_4o`; they are referenced by later cells, so a rename must update every use.
gpt_40_mini_folder_path = os.path.join(ROOT, "agentx/gpt-4o-mini-run-1/eval/")
gpt_40_mini_output_dir = os.path.join(ROOT, "agentx/gpt-4o-mini-run-1")

In [None]:
# Note: despite the `df_failure_pct_` prefix, the first frame holds per-check FAIL
# counts (and the raw success flag in "Valid Search Trace"), not percentages.
df_failure_pct_gpt_4o_mini, df_total_gpt_4o_mini, df_final_answer_gpt_4o_mini = process_results(gpt_40_mini_folder_path, key_mapping, gpt_40_mini_output_dir, "game_24_gpt-4o-mini")

##### Analysis: Checking how many times the final answer given by the model was correct.

In [None]:
df_final_answer_gpt_4o_mini['Final Answer'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Final Answer,Unnamed: 1_level_1
1,0.6
0,0.4


##### Analysis: Checking how many times the search trace is valid

In [None]:
df_failure_pct_gpt_4o_mini['Valid Search Trace'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
Valid Search Trace,Unnamed: 1_level_1
True,0.5
False,0.5


##### Analysis: Failure counts and error rates per check

In [None]:
df_failure_pct_gpt_4o_mini

Unnamed: 0,Game ID,Game,Node State,Child ID Assignment,Looping Order,Child State,Pruning,Pseudo-Recursive Call,Base Case,Backtracking I,Backtracking II,Backtracking III,Backtracking IV,Valid Search Trace
0,0,10101212,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
1,1,1278,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
2,2,131011,0,0,0,0,0,0,0.0,0,0.0,,,True
3,3,131012,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
4,4,1356,0,1,1,1,0,0,0.0,0,0.0,,,False
5,5,13710,0,13,12,1,0,0,,0,,1.0,1.0,False
6,6,16712,0,0,0,0,0,0,0.0,0,0.0,,,True
7,7,16912,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
8,8,2101011,0,0,0,0,0,0,,0,,0.0,,True
9,9,2111213,0,2,2,1,0,0,,0,,2.0,0.0,False


In [None]:
df_failure_pct_gpt_4o_mini.iloc[:, 2:]

Unnamed: 0,Node State,Child ID Assignment,Looping Order,Child State,Pruning,Pseudo-Recursive Call,Base Case,Backtracking I,Backtracking II,Backtracking III,Backtracking IV,Valid Search Trace
0,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
1,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
2,0,0,0,0,0,0,0.0,0,0.0,,,True
3,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
4,0,1,1,1,0,0,0.0,0,0.0,,,False
5,0,13,12,1,0,0,,0,,1.0,1.0,False
6,0,0,0,0,0,0,0.0,0,0.0,,,True
7,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
8,0,0,0,0,0,0,,0,,0.0,,True
9,0,2,2,1,0,0,,0,,2.0,0.0,False


In [None]:
# Total FAIL count per check across all games. Columns 0-1 ('Game ID', 'Game') are
# skipped; the "NA" placeholders written by process_results are coerced to NaN.
df_failure_pct_gpt_4o_mini.iloc[:, 2:].apply(pd.to_numeric, errors='coerce').sum(axis=0)

Unnamed: 0,0
Node State,1.0
Child ID Assignment,73.0
Looping Order,72.0
Child State,9.0
Pruning,0.0
Pseudo-Recursive Call,0.0
Base Case,0.0
Backtracking I,19.0
Backtracking II,0.0
Backtracking III,13.0


In [None]:
# Total number of checks performed per check type across all games
# (same column selection and "NA" -> NaN coercion as the failure sums).
df_total_gpt_4o_mini.iloc[:, 2:].apply(pd.to_numeric, errors='coerce').sum(axis=0)

Unnamed: 0,0
Node State,111.0
Child ID Assignment,587.0
Looping Order,586.0
Child State,107.0
Pruning,456.0
Pseudo-Recursive Call,107.0
Base Case,21.0
Backtracking I,132.0
Backtracking II,81.0
Backtracking III,51.0


In [None]:
# Per-check error rate: summed FAIL counts divided by summed check totals.
# NOTE(review): "Valid Search Trace" stores the success flag rather than a failure
# count, so its ratio looks like a share of valid traces, not an error rate - confirm.
error_rate_gpt_4o_mini = (
    df_failure_pct_gpt_4o_mini.iloc[:, 2:]
    .apply(pd.to_numeric, errors='coerce')
    .sum(axis=0)
    .div(
        df_total_gpt_4o_mini.iloc[:, 2:]
        .apply(pd.to_numeric, errors='coerce')
        .sum(axis=0)
    )
)

In [None]:
error_rate_gpt_4o_mini

Unnamed: 0,0
Node State,0.009009
Child ID Assignment,0.124361
Looping Order,0.122867
Child State,0.084112
Pruning,0.0
Pseudo-Recursive Call,0.0
Base Case,0.0
Backtracking I,0.143939
Backtracking II,0.0
Backtracking III,0.254902


##### Interleave Two DataFrames by Rows

In [None]:
# Make sure both DataFrames are sorted by Game ID (just in case)
df_failure_sorted = df_failure_pct_gpt_4o_mini.sort_values("Game ID").reset_index(drop=True)
df_total_sorted = df_total_gpt_4o_mini.sort_values("Game ID").reset_index(drop=True)

interleaved_rows = []
for i in range(len(df_failure_sorted)):
    interleaved_rows.append(df_failure_sorted.iloc[i])
    interleaved_rows.append(df_total_sorted.iloc[i])

# Combine into one DataFrame
df_interleaved = pd.DataFrame(interleaved_rows).reset_index(drop=True)

# Add the 'Type' column
df_interleaved['Type'] = ['Error Count', 'Total Checks'] * (len(df_interleaved) // 2)

# Move 'Type' to come right after 'Game' column
cols = df_interleaved.columns.tolist()
game_id_idx = cols.index('Game ID')
game_idx = cols.index('Game')

# Place 'Type' after 'Game'
new_order = (
    cols[:game_idx + 1] +       # Up to 'Game'
    ['Type'] +                  # Add 'Type'
    [col for col in cols if col not in ('Type', 'Game ID', 'Game')]  # Rest
)
df_interleaved = df_interleaved[new_order]

In [None]:
# Persist the interleaved error-count / total-checks table for gpt-4o-mini.
# NOTE(review): this file name uses "gpt-4o_mini" (underscore) while the CSVs written by
# process_results use the prefix "game_24_gpt-4o-mini" (hyphen) - confirm which is intended.
df_interleaved.to_csv(os.path.join(gpt_40_mini_output_dir, "game_24_gpt-4o_mini_gamewise_interleaved_table.csv"), index = False)

In [None]:
df_interleaved

Unnamed: 0,Game ID,Game,Type,Node State,Child ID Assignment,Looping Order,Child State,Pruning,Pseudo-Recursive Call,Base Case,Backtracking I,Backtracking II,Backtracking III,Backtracking IV,Valid Search Trace
0,0,10101212,Error Count,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
1,0,10101212,Total Checks,4,13,13,4,8,4,1.0,5,4.0,1.0,1.0,1
2,1,1278,Error Count,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
3,1,1278,Total Checks,4,18,18,4,13,4,1.0,5,4.0,1.0,1.0,1
4,2,131011,Error Count,0,0,0,0,0,0,0.0,0,0.0,,,True
5,2,131011,Total Checks,3,5,5,3,1,3,1.0,4,4.0,,,1
6,3,131012,Error Count,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True
7,3,131012,Total Checks,4,17,17,4,12,4,1.0,5,4.0,1.0,1.0,1
8,4,1356,Error Count,0,1,1,1,0,0,0.0,0,0.0,,,False
9,4,1356,Total Checks,3,13,13,3,9,3,1.0,4,4.0,,,1


##### Analysis: Games with a correct final answer but an invalid search trace

In [None]:
# Attach each game's final-answer flag to its per-check failure counts.
gpt_4o_mini_merged = df_failure_pct_gpt_4o_mini.merge(
    df_final_answer_gpt_4o_mini,
    on=['Game ID', 'Game'],
)
gpt_4o_mini_merged

Unnamed: 0,Game ID,Game,Node State,Child ID Assignment,Looping Order,Child State,Pruning,Pseudo-Recursive Call,Base Case,Backtracking I,Backtracking II,Backtracking III,Backtracking IV,Valid Search Trace,Final Answer
0,0,10101212,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
1,1,1278,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
2,2,131011,0,0,0,0,0,0,0.0,0,0.0,,,True,1
3,3,131012,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
4,4,1356,0,1,1,1,0,0,0.0,0,0.0,,,False,0
5,5,13710,0,13,12,1,0,0,,0,,1.0,1.0,False,0
6,6,16712,0,0,0,0,0,0,0.0,0,0.0,,,True,1
7,7,16912,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,True,1
8,8,2101011,0,0,0,0,0,0,,0,,0.0,,True,0
9,9,2111213,0,2,2,1,0,0,,0,,2.0,0.0,False,0


In [None]:
# Rows where the final answer is flagged 1 while the search trace is flagged False.
gpt_4o_mini_merged.loc[
    gpt_4o_mini_merged['Final Answer'].eq(1)
    & gpt_4o_mini_merged['Valid Search Trace'].eq(False)
]

Unnamed: 0,Game ID,Game,Node State,Child ID Assignment,Looping Order,Child State,Pruning,Pseudo-Recursive Call,Base Case,Backtracking I,Backtracking II,Backtracking III,Backtracking IV,Valid Search Trace,Final Answer
13,13,24811,0,3,3,1,0,0,0,3,0,0.0,,False,1
15,15,331212,0,0,0,0,0,0,0,1,0,,,False,1
22,22,4448,0,3,3,0,0,0,0,3,0,0.0,0.0,False,1
23,23,45711,0,4,4,1,0,0,0,3,0,0.0,1.0,False,1
