# Result Analysis

This notebook loads the `models_data.json` result files and analyzes the experiment records they contain.

In [None]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from collections import defaultdict

# Set plot style
# Order matters: the seaborn theme is applied after the matplotlib 'ggplot' style,
# so for any setting both of them touch, the seaborn "whitegrid" value is the one kept.
plt.style.use('ggplot')
sns.set_theme(style="whitegrid")

## Load Models Data

First, we'll find all the models_data.json files and load them into pandas.

In [None]:
# Find all models_data.json files below the baseline histories directory.
# glob returns matches in filesystem-dependent order, so sort them to keep the
# row order of the combined DataFrame reproducible across machines and re-runs.
data_files = sorted(glob.glob('histories/baseline/**/models_data.json', recursive=True))
print(f"Found {len(data_files)} model data files")

# If no files found, maybe we need to run the postprocessing script first
if not data_files:
    print("No models_data.json files found. You may need to run postprocess_experiments.py first.")

In [None]:
# Function to load data from a models_data.json file
def load_models_data(file_path):
    """Load one models_data.json file into a DataFrame with one row per experiment.

    The run is identified by the two directories directly above the file,
    ``.../<config>/<model>/models_data.json``; both names are attached to every row.

    Parameters
    ----------
    file_path : str
        Path to a models_data.json file. Relative and absolute paths, and either
        path separator understood by ``os.path``, are accepted.

    Returns
    -------
    pandas.DataFrame
        One row per experiment record, with added ``model`` and ``config`` columns
        and, for records carrying a ``unique_hypotheses`` list, a
        ``num_unique_hypotheses`` column. Empty if the file cannot be read or
        parsed (the error is printed instead of raised).

    Raises
    ------
    ValueError
        If the file has fewer than two parent directories to derive config/model from.
    """
    # Use the last two directory components rather than stripping a hard-coded
    # prefix and splitting on '/', so absolute paths and OS-specific separators work.
    run_dir = os.path.normpath(os.path.dirname(file_path))
    path_parts = [part for part in run_dir.split(os.sep) if part]
    if len(path_parts) < 2:
        raise ValueError(
            f"Cannot derive config/model from {file_path!r}: "
            "expected .../<config>/<model>/models_data.json"
        )
    mode, model = path_parts[-2:]

    try:
        # JSON is UTF-8 by specification; do not rely on the platform default encoding
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Flatten {group: [experiment, ...]} into one row per experiment
        rows = []
        for experiments in data.values():
            for exp in experiments:
                row = {**exp, 'model': model, 'config': mode}
                # Only a list can be counted; other representations are left untouched
                hypotheses = row.get('unique_hypotheses')
                if isinstance(hypotheses, list):
                    row['num_unique_hypotheses'] = len(hypotheses)
                rows.append(row)

        return pd.DataFrame(rows)
    except Exception as e:
        # Best effort: one unreadable file must not abort loading the others
        print(f"Error loading {file_path}: {e}")
        return pd.DataFrame()

# Load all data files into a single DataFrame.
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no models_data.json file was found.
frames = [load_models_data(file) for file in data_files]
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Display basic information
if not df.empty:
    print(f"Loaded {len(df)} experiment records across {df['model'].nunique()} models")
    print(f"Configurations: {df['config'].unique()}")

# Kept as the last top-level expression so the notebook actually renders the preview
# (an expression nested inside the `if` block is evaluated but never displayed).
df.head()

In [None]:
# Remember the raw configuration names before they are relabeled
original_configs = sorted(df['config'].unique())

# Raw configuration directory name -> label used in tables and figures
config_labels = {
    "default": "Level 1",
    "no_context": "Level 2",
    "no_description": "Level 3",
    "no_description_anonymous": "Level 4",
}

# Raw model directory name -> display name
model_labels = {
    "openai_o4-mini-high": "OpenAI o4 Mini",
    "google_gemini-2.5-flash-preview:thinking": "Gemini 2.5 Flash",
    "claude-3-7-sonnet-20250219": "Claude 3.7 Sonnet",
    "google_gemini-2.5-pro-preview": "Gemini 2.5 Pro",
    "vllm_Qwen3-32B": "Qwen3-32B",
    "ollama_gpt-oss_20b": "gpt-oss-20B",
}

# `replace` keeps names that have no label (`map` would turn them into NaN, which
# silently drops them from every groupby) and makes this cell safe to re-run:
# already-relabeled values are simply left as they are.
df['config'] = df['config'].replace(config_labels)
df['model'] = df['model'].replace(model_labels)

# Problem ids are integers used as categorical labels
df['id'] = df['id'].astype(int).astype('category')
df.info()

In [None]:
# Collect the rows whose id is missing and report them, if there are any
id_is_missing = df['id'].isna()
missing_ids = df.loc[id_is_missing]
if missing_ids.shape[0]:
    print(f"Found {len(missing_ids)} rows with missing ids")
    print(missing_ids)

In [None]:
# Preview the first five rows of the combined DataFrame
df.head(5)

In [None]:
# Keep only the runs flagged as successful
successful_runs = df.loc[df['success'] == True]

# Average (not sum) each effort metric per (model, config) pair.
# 'num_unique_hypotheses' is the numeric count of the unique_hypotheses list.
metric_columns = [
    'samples_used',
    'test_used',
    'iterations',
    'num_unique_hypotheses',
    'hypotheses_count',
]
result = (
    successful_runs
    .groupby(['model', 'config'])
    .agg(**{f'avg_{column}': (column, 'mean') for column in metric_columns})
    .reset_index()
    .round(2)
)
result

In [None]:
# Percentage of successful runs per (model, config) pair, rounded to two decimals
success_rate = (
    df.groupby(['model', 'config'])['success']
    .mean()
    .mul(100)
    .round(2)
    .reset_index(name='Acc_rate')
)
print(success_rate)


In [None]:
# Same success percentage, restricted to records with at least one dummy variable
df_with_dummy = df.loc[df['num_dummy_var'] > 0]
success_rate = (
    df_with_dummy.groupby(['model', 'config'])['success']
    .mean()
    .mul(100)
    .round(2)
    .reset_index(name='Acc_rate')
)
print(success_rate)

# Data analysis

## Solved Set of a Model Under a Given Level

In [None]:
# Narrow the records step by step: one level, then one model, then successful runs only
level = "Level 1"
df_L1 = df.loc[df['config'].eq(level)]
Qwen_L1 = df_L1.loc[df_L1['model'].eq('Qwen3-32B')]
Qwen_L1_success = Qwen_L1.loc[Qwen_L1['success'].eq(True)]
Qwen_L1_success

## Solved problem sets of 3 models under a given level

In [None]:
# Is there any problem that is solved by some models but not others?
level = "Level 1"
df_L1 = df[df['config'] == level]

# Problem x model matrix: True when at least one attempt of that model on that problem
# succeeded. `observed` has to be a real boolean: a string such as 'False' is truthy
# and is treated like True, so the value is spelled out explicitly here. With
# observed=True only the problem ids that occur in df_L1 become rows.
problem_solution_matrix = df_L1.pivot_table(
    index='id',
    columns='model',
    values='success',
    observed=True,
    aggfunc='any'  # Use 'any' to check if any attempt was successful
).fillna(False)  # Fill NaN values with False (unsolved)

claude_name = 'Claude 3.7 Sonnet'
openai_name = 'OpenAI o4 Mini'
gemini_name = 'Gemini 2.5 Flash'

# Rows Claude solved while at least one of the other two models failed.
# NOTE(review): `|` means "OpenAI failed OR Gemini failed"; if "neither of the others
# solved it" is wanted, this should be `&` - confirm the intent.
claude_solved_mask = problem_solution_matrix[claude_name] == True
others_unsolved = (
    (problem_solution_matrix[openai_name] == False)
    | (problem_solution_matrix[gemini_name] == False)
)
claude_only_solved = problem_solution_matrix[claude_solved_mask & others_unsolved]

# print(f"Problems solved by Claude but not by others at {level}")
# print(claude_only_solved)

# Sets of solved problem ids per model: the inputs for the 3-set Venn diagram
claude_solved = set(problem_solution_matrix[problem_solution_matrix[claude_name] == True].index)
openai_solved = set(problem_solution_matrix[problem_solution_matrix[openai_name] == True].index)
gemini_solved = set(problem_solution_matrix[problem_solution_matrix[gemini_name] == True].index)

# Problems solved by exactly one model
claude_only = claude_solved - (openai_solved | gemini_solved)
openai_only = openai_solved - (claude_solved | gemini_solved)
gemini_only = gemini_solved - (claude_solved | openai_solved)

# Problems solved by exactly two models. The parentheses make the grouping explicit
# (`-` binds tighter than `&` in Python; both groupings yield the same set).
claude_openai = (claude_solved & openai_solved) - gemini_solved
claude_gemini = (claude_solved & gemini_solved) - openai_solved
openai_gemini = (openai_solved & gemini_solved) - claude_solved

# Problems solved by all three models
all_solved = claude_solved & openai_solved & gemini_solved

# Print statistics
print(f"\nVenn Diagram Statistics for {level}:")
print(f"Total problems: {len(problem_solution_matrix)}")
print(f"\nClaude solved: {len(claude_solved)}")
print(f"OpenAI solved: {len(openai_solved)}")
print(f"Gemini solved: {len(gemini_solved)}")

print("\nUnique problems solved by each model:")
print(f"\nClaude only: {len(claude_only)} Problems, IDs: {claude_only}")
print(f"OpenAI only: {len(openai_only)} Problems, IDs: {openai_only}")
print(f"Gemini only: {len(gemini_only)} Problems, IDs: {gemini_only}")

print("\nProblems solved by only 2 of them:")
print(f"\nClaude + OpenAI: {len(claude_openai)} Problems, IDs: {claude_openai}")
print(f"Claude + Gemini: {len(claude_gemini)} Problems, IDs: {claude_gemini}")
print(f"OpenAI + Gemini: {len(openai_gemini)} Problems, IDs: {openai_gemini}")

print(f"\n Problems solved by all three models: {len(all_solved)}")


In [None]:
# Problems in the current matrix that Gemini did not solve
all_problems = set(problem_solution_matrix.index)
gemini_unsolved = all_problems - gemini_solved
# np.array(<set>) would build a 0-d object array wrapping the whole set;
# sort into a list first to get a proper 1-d array of ids in a stable order.
np.array(sorted(gemini_unsolved))

### Plot a Venn Diagram

In [None]:
from matplotlib_venn import venn3, venn3_circles

# Region sizes in the order venn3 expects:
# (A only, B only, A&B only, C only, A&C only, B&C only, A&B&C)
# with A = Claude, B = OpenAI, C = Gemini. Defined once and shared by both calls below.
venn_subsets = (
    len(claude_only), len(openai_only), len(claude_openai),
    len(gemini_only), len(claude_gemini), len(openai_gemini),
    len(all_solved),
)

# Create the Venn diagram
plt.figure(figsize=(6, 5), dpi=300)
diagram = venn3(subsets=venn_subsets,
                set_labels=('Claude 3.7 Sonnet', 'OpenAI o4 Mini', 'Gemini 2.5 Flash'))

# Colors of the three single-model regions
region_colors = {
    '100': "#CCAE91",  # Claude only - tan
    '010': "#9AB39C",  # OpenAI only - sage green
    '001': "#C3B8E4",  # Gemini only - lavender
}
for region_id, region_color in region_colors.items():
    # get_patch_by_id returns None for a region that is not drawn (e.g. size zero),
    # so guard each patch individually instead of assuming all three exist.
    patch = diagram.get_patch_by_id(region_id)
    if patch is not None:
        patch.set_color(region_color)

# Add circles for better visibility
venn3_circles(subsets=venn_subsets, linestyle='dashed', linewidth=1, color='gray')

# Totals per model (used by the optional statistics footer below)
total_claude = len(claude_solved)
total_openai = len(openai_solved)
total_gemini = len(gemini_solved)
total_problems = len(problem_solution_matrix)

# Add title and subtitle with statistics
plt.title(f'{level} Solved Problems\n', fontsize=16)
# plt.figtext(0.5, 0.01,
#            f'Total Problems: {total_problems} | ' +
#            f'Claude: {total_claude} ({total_claude/total_problems:.1%}) | ' +
#            f'OpenAI: {total_openai} ({total_openai/total_problems:.1%}) | ' +
#            f'Gemini: {total_gemini} ({total_gemini/total_problems:.1%})',
#            ha='center', fontsize=12)

# The region labels already show the counts passed in `venn_subsets`,
# so no label rewriting is needed.

# Adjust layout and display
plt.tight_layout()
# plt.savefig('ai_model_comparison_venn.png', dpi=300, bbox_inches='tight')
plt.show()

## Solved sets of one model under different levels

In [None]:
Model = "Gemini 2.5 Flash"  # "Claude 3.7 Sonnet" or "OpenAI o4 Mini" or "Gemini 2.5 Flash"

df_model = df[df['model'] == Model]

# Problem x level matrix: True when at least one attempt at that level succeeded.
# `observed` has to be a real boolean: a string such as 'False' is truthy and is
# treated like True, so the value is spelled out explicitly here. With observed=True
# only the problem ids that occur in df_model become rows.
problem_solution_matrix = df_model.pivot_table(
    index='id',
    columns='config',
    values='success',
    observed=True,
    aggfunc='any'  # Use 'any' to check if any attempt was successful
).fillna(False)  # Fill NaN values with False (unsolved)

# Sets of solved problem ids for each of the 4 levels
level1_solved = set(problem_solution_matrix[problem_solution_matrix['Level 1'] == True].index)
level2_solved = set(problem_solution_matrix[problem_solution_matrix['Level 2'] == True].index)
level3_solved = set(problem_solution_matrix[problem_solution_matrix['Level 3'] == True].index)
level4_solved = set(problem_solution_matrix[problem_solution_matrix['Level 4'] == True].index)

# Every region below is exclusive: solved at exactly the listed levels and no others.

# Exactly one level
level1_only = level1_solved - level2_solved - level3_solved - level4_solved
level2_only = level2_solved - level1_solved - level3_solved - level4_solved
level3_only = level3_solved - level1_solved - level2_solved - level4_solved
level4_only = level4_solved - level1_solved - level2_solved - level3_solved

# Exactly two levels
level1_2 = (level1_solved & level2_solved) - level3_solved - level4_solved
level1_3 = (level1_solved & level3_solved) - level2_solved - level4_solved
level1_4 = (level1_solved & level4_solved) - level2_solved - level3_solved
level2_3 = (level2_solved & level3_solved) - level1_solved - level4_solved
level2_4 = (level2_solved & level4_solved) - level1_solved - level3_solved
level3_4 = (level3_solved & level4_solved) - level1_solved - level2_solved

# Exactly three levels
level1_2_3 = (level1_solved & level2_solved & level3_solved) - level4_solved
level1_2_4 = (level1_solved & level2_solved & level4_solved) - level3_solved
level1_3_4 = (level1_solved & level3_solved & level4_solved) - level2_solved
level2_3_4 = (level2_solved & level3_solved & level4_solved) - level1_solved

# All four levels
all_levels = level1_solved & level2_solved & level3_solved & level4_solved

# Print the size of every region
region_report = [
    ("Level 1 only", level1_only),
    ("Level 2 only", level2_only),
    ("Level 3 only", level3_only),
    ("Level 4 only", level4_only),
    ("Level 1 & 2 only", level1_2),
    ("Level 1 & 3 only", level1_3),
    ("Level 1 & 4 only", level1_4),
    ("Level 2 & 3 only", level2_3),
    ("Level 2 & 4 only", level2_4),
    ("Level 3 & 4 only", level3_4),
    ("Level 1, 2 & 3 only", level1_2_3),
    ("Level 1, 2 & 4 only", level1_2_4),
    ("Level 1, 3 & 4 only", level1_3_4),
    ("Level 2, 3 & 4 only", level2_3_4),
    ("All levels", all_levels),
]
for region_label, region_ids in region_report:
    print(f"{region_label}: {len(region_ids)} problems")

In [None]:
# NOTE(review): `all_problems` is not defined in this cell; it is whatever an earlier
# cell assigned, which may describe a different problem universe than the current
# per-level matrix - confirm that this is intended.
level4_unsolved = all_problems - level4_solved
# np.array(<set>) would build a 0-d object array wrapping the whole set;
# sort into a list first to get a proper 1-d array of ids in a stable order.
print(np.array(sorted(level4_unsolved)))
print(len(level4_unsolved))

### Plot the UpSet Diagram (four-set alternative to a Venn diagram)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Define level names
level_names = ['Level 1', 'Level 2', 'Level 3', 'Level 4']

# Size of every exclusive region, keyed by a 4-character membership code:
# character i is '1' when the region belongs to level_names[i]
# (e.g. '1000' = Level 1 only, '1111' = all levels).
combinations_count = {
    '1000': len(level1_only),
    '0100': len(level2_only),
    '1100': len(level1_2),
    '0010': len(level3_only),
    '1010': len(level1_3),
    '0110': len(level2_3),
    '1110': len(level1_2_3),
    '0001': len(level4_only),
    '1001': len(level1_4),
    '0101': len(level2_4),
    '1101': len(level1_2_4),
    '0011': len(level3_4),
    '1011': len(level1_3_4),
    '0111': len(level2_3_4),
    '1111': len(all_levels),
}

# Turn each code into a tuple of booleans (one per level) for the MultiIndex
index_tuples = [
    tuple(bit == '1' for bit in binary_representation)
    for binary_representation in combinations_count
]
counts_for_multiindex_series = list(combinations_count.values())

ordered_set_names = level_names

# Counts indexed by boolean level membership, built directly
# (no placeholder Series is created first)
multi_idx = pd.MultiIndex.from_tuples(index_tuples, names=ordered_set_names)
upset_data_multiindex = pd.Series(counts_for_multiindex_series, index=multi_idx, dtype=int)
print("Successfully created Series with MultiIndex:")
# Last expression of the cell, so the Series announced above is actually displayed
upset_data_multiindex

In [None]:
from upsetplot import plot
# Solved counts per level and the number of problems in the matrix, for the footer text
total_level1, total_level2, total_level3, total_level4 = (
    len(solved_ids)
    for solved_ids in (level1_solved, level2_solved, level3_solved, level4_solved)
)
total_problems = len(problem_solution_matrix)

# Draw the UpSet plot into an explicitly sized figure
fig = plt.figure(figsize=(12, 8), dpi=300)
plot(
    upset_data_multiindex,
    sort_by='degree',
    facecolor="#607D8B",
    show_counts=True,
    fig=fig,
)

plt.suptitle(f'Solved Sets by {Model} Under Different Level', fontsize=16, y=0.98)

# Footer: overall total followed by count and share per level; skipped when there are
# no problems, to avoid dividing by zero
if total_problems > 0:
    footer_parts = [f'Total Problems: {total_problems}']
    per_level_totals = (
        ('L1', total_level1),
        ('L2', total_level2),
        ('L3', total_level3),
        ('L4', total_level4),
    )
    for short_name, solved_total in per_level_totals:
        footer_parts.append(f'{short_name}: {solved_total} ({solved_total/total_problems:.1%})')
    stats_text = ' | '.join(footer_parts)
else:
    stats_text = "Statistics unavailable"
fig.text(0.5, 0.02, stats_text, ha='center', va='bottom', fontsize=10)

# Leave room for the suptitle at the top and the footer at the bottom
plt.tight_layout(rect=[0, 0.05, 1, 0.93])
# plt.savefig('difficulty_level_upset_multiindex.png', dpi=300, bbox_inches='tight')
plt.show()
print("Plotting with MultiIndex Series attempted.")