# Exploit Results - Exploratory Data Analysis

This notebook analyzes the benchmark results from the yudai-swe-agent exploit generation runs.

In [None]:
import json
import glob
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling applied to every figure below: a matplotlib style sheet
# plus seaborn's "husl" color palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython line magic (not Python syntax): render figures inline in cell output.
%matplotlib inline

## 1. Data Loading and Preprocessing

In [None]:
# Collect the main benchmark JSON files. The glob pattern also matches the
# trajectory ('.traj.') and result ('.result.') files, so those are dropped
# by substring before sorting.
files = sorted(
    path
    for path in glob.glob('exploit_results/benchmark_*.json')
    if '.traj.' not in path and '.result.' not in path
)

print(f"Found {len(files)} benchmark result files")

# Parse every file into memory, one dict per benchmark run.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) instead of
# relying on the platform's locale default, which differs across systems.
all_data = []
for file_path in files:
    with open(file_path, 'r', encoding='utf-8') as f:
        all_data.append(json.load(f))

print(f"Loaded {len(all_data)} benchmark runs")

In [None]:
# Flatten the nested run -> cases structure into one row per case, repeating
# the run-level fields on every row of that run.

# Column order of the flat table. Passing it to the DataFrame constructor
# guarantees these columns exist even when no cases were loaded, so the
# datetime conversion below does not raise KeyError on an empty dataset.
CASE_RECORD_COLUMNS = [
    'run_id', 'run_status', 'model', 'cost_limit', 'started_at', 'finished_at',
    'case_name', 'chain', 'fork_block_number', 'status', 'duration_sec',
    'episode_id', 'error_type', 'error_message', 'source_length', 'metrics',
]

records = []

for run in all_data:
    # `or {}` also covers an explicit null value, which .get()'s default
    # argument does not (it only applies when the key is absent).
    settings = run.get('settings') or {}
    run_fields = {
        'run_id': run.get('run_id', ''),
        'run_status': run.get('status', 'unknown'),
        'model': settings.get('model_name', 'unknown'),
        'cost_limit': settings.get('cost_limit'),
        'started_at': run.get('started_at'),
        'finished_at': run.get('finished_at'),
    }

    for case in run.get('cases') or []:
        # A missing or empty error yields None for both error columns.
        error = case.get('error') or {}
        records.append({
            **run_fields,
            'case_name': case.get('case_name', ''),
            'chain': case.get('chain', 'unknown'),
            'fork_block_number': case.get('fork_block_number'),
            'status': case.get('status', 'unknown'),
            'duration_sec': case.get('duration_sec'),
            'episode_id': case.get('episode_id', ''),
            'error_type': error.get('type'),
            'error_message': error.get('message'),
            'source_length': case.get('source_length'),
            'metrics': case.get('metrics', {}),
        })

df = pd.DataFrame(records, columns=CASE_RECORD_COLUMNS)

# Convert datetime columns; 'date' is the calendar day the run started.
df['started_at'] = pd.to_datetime(df['started_at'])
df['finished_at'] = pd.to_datetime(df['finished_at'])
df['date'] = df['started_at'].dt.date

print(f"\nDataFrame shape: {df.shape}")
df.head()

In [None]:
# Headline numbers for the loaded dataset
rule = "=" * 60
print(rule, "BASIC STATISTICS", rule, sep="\n")

n_runs = len(all_data)
n_cases = len(df)
print(f"\nTotal benchmark runs: {n_runs}")
print(f"Total cases executed: {n_cases}")

run_dates = df['date']
print(f"\nDate range: {run_dates.min()} to {run_dates.max()}")

# Distinct values per dimension, computed in one pass
distinct = df[['model', 'chain', 'case_name']].nunique()
print(f"\nUnique models tested: {distinct['model']}")
print(f"Unique chains: {distinct['chain']}")
print(f"Unique cases: {distinct['case_name']}")

## 2. Status Distribution Analysis

In [None]:
# Status distribution: runs (left pie) and cases (right pie)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Run status, counted once per run. df holds one row per case, so counting
# df['run_status'] would weight every run by its number of cases and leave
# out runs that contain no cases; the run list itself is used instead.
run_status_counts = pd.Series(
    [run.get('status', 'unknown') for run in all_data],
    dtype='object',
    name='run_status',
).value_counts()
axes[0].pie(run_status_counts.values, labels=run_status_counts.index, autopct='%1.1f%%', startangle=90)
axes[0].set_title('Run Status Distribution', fontsize=14, fontweight='bold')

# Case status, one observation per row of the flat case table
case_status_counts = df['status'].value_counts()
axes[1].pie(case_status_counts.values, labels=case_status_counts.index, autopct='%1.1f%%', startangle=90)
axes[1].set_title('Case Status Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print("\nRun Status:")
print(run_status_counts)
print("\nCase Status:")
print(case_status_counts)

## 3. Model Performance Analysis

In [None]:
# Model usage and performance (2x2 grid of panels)
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Top-left: rows of df per model. df has one row per case, so these are case
# counts and the axis is labelled as such.
model_counts = df['model'].value_counts()
axes[0, 0].barh(model_counts.index, model_counts.values, color='steelblue')
axes[0, 0].set_xlabel('Number of Cases')
axes[0, 0].set_title('Model Usage Distribution', fontsize=12, fontweight='bold')
axes[0, 0].grid(axis='x', alpha=0.3)

# Top-right: case status counts stacked per model
status_by_model = pd.crosstab(df['model'], df['status'])
status_by_model.plot(kind='bar', stacked=True, ax=axes[0, 1])
axes[0, 1].set_title('Status Distribution by Model', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Model')
axes[0, 1].set_ylabel('Count')
axes[0, 1].legend(title='Status')
axes[0, 1].tick_params(axis='x', rotation=45)

# Bottom-left: mean case duration per model, longest first
avg_duration = df.groupby('model')['duration_sec'].mean().sort_values(ascending=False)
axes[1, 0].barh(avg_duration.index, avg_duration.values, color='coral')
axes[1, 0].set_xlabel('Average Duration (seconds)')
axes[1, 0].set_title('Average Case Duration by Model', fontsize=12, fontweight='bold')
axes[1, 0].grid(axis='x', alpha=0.3)

# Bottom-right: success rate per model over cases whose status is one of
# completed / failed / success (any other status is left out).
# NOTE(review): only 'success' counts as a success here; confirm that
# 'completed' is not also meant to be a successful outcome.
completed_df = df[df['status'].isin(['completed', 'failed', 'success'])]
if len(completed_df) > 0:
    # Mean of the boolean "is success" flag per model, expressed in percent.
    model_success = (
        completed_df['status'].eq('success')
        .groupby(completed_df['model'])
        .mean()
        .mul(100)
        .sort_values(ascending=False)
    )
    axes[1, 1].barh(model_success.index, model_success.values, color='lightgreen')
    axes[1, 1].set_xlabel('Success Rate (%)')
    axes[1, 1].set_title('Success Rate by Model (Non-Interrupted)', fontsize=12, fontweight='bold')
    axes[1, 1].grid(axis='x', alpha=0.3)
else:
    axes[1, 1].text(0.5, 0.5, 'No completed runs', ha='center', va='center', transform=axes[1, 1].transAxes)
    axes[1, 1].set_title('Success Rate by Model', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 4. Chain Analysis

In [None]:
# Per-chain overview: case volume, status mix and mean duration side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
volume_ax, status_ax, duration_ax = axes

# Panel 1: number of cases per chain
chain_counts = df['chain'].value_counts()
volume_ax.bar(
    chain_counts.index,
    chain_counts.values,
    color=['#FF6B6B', '#4ECDC4', '#45B7D1'],
)
volume_ax.set(xlabel='Chain', ylabel='Number of Cases')
volume_ax.set_title('Cases by Blockchain', fontsize=12, fontweight='bold')
volume_ax.grid(axis='y', alpha=0.3)

# Panel 2: case status counts stacked per chain
status_by_chain = pd.crosstab(df['chain'], df['status'])
status_by_chain.plot(kind='bar', stacked=True, ax=status_ax)
status_ax.set_title('Status Distribution by Chain', fontsize=12, fontweight='bold')
status_ax.set(xlabel='Chain', ylabel='Count')
status_ax.legend(title='Status')
status_ax.tick_params(axis='x', rotation=0)

# Panel 3: mean case duration per chain, longest first
avg_duration_chain = (
    df.groupby('chain')['duration_sec']
    .mean()
    .sort_values(ascending=False)
)
duration_ax.bar(
    avg_duration_chain.index,
    avg_duration_chain.values,
    color=['#95E1D3', '#F38181'],
)
duration_ax.set(xlabel='Chain', ylabel='Average Duration (seconds)')
duration_ax.set_title('Average Duration by Chain', fontsize=12, fontweight='bold')
duration_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric companion to the plots
chain_aggregations = {
    'duration_sec': ['count', 'mean', 'std', 'min', 'max'],
    'source_length': ['mean', 'std'],
}
chain_stats = df.groupby('chain').agg(chain_aggregations).round(2)
print("\nChain Statistics:")
print(chain_stats)

## 5. Duration Analysis

In [None]:
# Duration analysis: overall histogram plus box plots by model, chain and status
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Central tendency, computed once and reused for the marker lines and labels
durations = df['duration_sec']
mean_duration = durations.mean()
median_duration = durations.median()

# Histogram of durations (missing values dropped) with mean/median markers
axes[0, 0].hist(durations.dropna(), bins=30, color='skyblue', edgecolor='black')
axes[0, 0].set_xlabel('Duration (seconds)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Distribution of Case Durations', fontsize=12, fontweight='bold')
axes[0, 0].axvline(mean_duration, color='red', linestyle='--', label=f'Mean: {mean_duration:.2f}s')
axes[0, 0].axvline(median_duration, color='green', linestyle='--', label=f'Median: {median_duration:.2f}s')
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# Box plots: (target axes, grouping column, x label, title, rotate tick labels?)
box_panels = [
    (axes[0, 1], 'model', 'Model', 'Duration Distribution by Model', True),
    (axes[1, 0], 'chain', 'Chain', 'Duration Distribution by Chain', False),
    (axes[1, 1], 'status', 'Status', 'Duration Distribution by Status', True),
]
for ax, group_col, xlabel, title, rotate_labels in box_panels:
    df.boxplot(column='duration_sec', by=group_col, ax=ax)
    # boxplot(by=...) sets its own axes title and x label; override both.
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Duration (seconds)')
    if rotate_labels:
        plt.sca(ax)
        plt.xticks(rotation=45, ha='right')

# boxplot(by=...) also writes a figure-level "Boxplot grouped by <column>"
# suptitle on every call. On this shared figure the last call would leave
# "Boxplot grouped by status" above all four panels, so clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

print("\nDuration Statistics:")
print(durations.describe())

## 6. Error Analysis

In [None]:
# Error type analysis


def _preview_message(message, limit=200):
    """Return a printable preview of an error message.

    Args:
        message: The raw ``error_message`` cell. Usually a string, but it is
            None/NaN when an error record carries a type without a message.
        limit: Maximum number of characters kept before appending ``...``.

    Returns:
        A placeholder for a missing message, otherwise the message text
        truncated to ``limit`` characters with a trailing ``...`` when longer.
    """
    if message is None or message != message:  # None, or NaN (NaN != NaN)
        return "(no message recorded)"
    text = message if isinstance(message, str) else str(message)
    return f"{text[:limit]}..." if len(text) > limit else text


# Only cases that recorded an error type
error_df = df[df['error_type'].notna()].copy()

if len(error_df) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Error type distribution
    error_counts = error_df['error_type'].value_counts()
    axes[0].barh(error_counts.index, error_counts.values, color='salmon')
    axes[0].set_xlabel('Count')
    axes[0].set_title('Error Type Distribution', fontsize=12, fontweight='bold')
    axes[0].grid(axis='x', alpha=0.3)

    # Error types by model
    error_by_model = pd.crosstab(error_df['model'], error_df['error_type'])
    error_by_model.plot(kind='bar', stacked=True, ax=axes[1])
    axes[1].set_title('Error Types by Model', fontsize=12, fontweight='bold')
    axes[1].set_xlabel('Model')
    axes[1].set_ylabel('Count')
    # Legend is anchored outside the axes so it does not cover the bars.
    axes[1].legend(title='Error Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    axes[1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print("\nError Type Summary:")
    print(error_counts)

    # First recorded message for each of the three most frequent error types.
    # The preview helper tolerates a missing message, which would otherwise
    # raise TypeError on len(None).
    print("\nSample Error Messages:")
    for error_type in error_counts.head(3).index:
        sample = error_df.loc[error_df['error_type'] == error_type, 'error_message'].iloc[0]
        print(f"\n{error_type}:")
        print(f"  {_preview_message(sample)}")
else:
    print("No errors found in the dataset")

## 7. Time Series Analysis

In [None]:
# Activity over time: total cases per day (top) and the same split by status (bottom)
fig, axes = plt.subplots(2, 1, figsize=(16, 10))
volume_ax, status_ax = axes

# Top panel: one point per start date, with the area under the line shaded
runs_per_day = df.groupby('date').size()
days = runs_per_day.index
daily_totals = runs_per_day.values
volume_ax.plot(days, daily_totals, marker='o', linewidth=2, markersize=8)
volume_ax.fill_between(days, daily_totals, alpha=0.3)
volume_ax.set(xlabel='Date', ylabel='Number of Cases')
volume_ax.set_title('Benchmark Runs Over Time', fontsize=12, fontweight='bold')
volume_ax.grid(True, alpha=0.3)
volume_ax.tick_params(axis='x', rotation=45)

# Bottom panel: date x status count table (absent combinations filled with 0),
# drawn as a stacked area chart
status_over_time = (
    df.groupby(['date', 'status'])
    .size()
    .unstack(fill_value=0)
)
status_over_time.plot(kind='area', stacked=True, ax=status_ax, alpha=0.7)
status_ax.set(xlabel='Date', ylabel='Number of Cases')
status_ax.set_title('Status Distribution Over Time', fontsize=12, fontweight='bold')
status_ax.legend(title='Status')
status_ax.grid(True, alpha=0.3)
status_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 8. Source Code Length Analysis

In [None]:
# Source length: distribution, relation to duration, and per-chain average
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
hist_ax, scatter_ax, chain_ax = axes

source_lengths = df['source_length']
mean_length = source_lengths.mean()

# Histogram of source lengths (missing values dropped) with the mean marked
hist_ax.hist(source_lengths.dropna(), bins=30, color='lightcoral', edgecolor='black')
hist_ax.set(xlabel='Source Length (characters)', ylabel='Frequency')
hist_ax.set_title('Distribution of Source Code Length', fontsize=12, fontweight='bold')
hist_ax.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.0f}')
hist_ax.legend()
hist_ax.grid(alpha=0.3)

# Scatter of length against duration for rows where both are present;
# point color is the integer category code of the case status
has_both = df['source_length'].notna() & df['duration_sec'].notna()
valid_data = df[has_both]
status_codes = valid_data['status'].astype('category').cat.codes
scatter_ax.scatter(
    valid_data['source_length'],
    valid_data['duration_sec'],
    alpha=0.5,
    c=status_codes,
    cmap='viridis',
)
scatter_ax.set(xlabel='Source Length (characters)', ylabel='Duration (seconds)')
scatter_ax.set_title('Source Length vs Duration', fontsize=12, fontweight='bold')
scatter_ax.grid(alpha=0.3)

# Mean source length per chain, largest first
avg_source_length = (
    df.groupby('chain')['source_length']
    .mean()
    .sort_values(ascending=False)
)
chain_ax.bar(avg_source_length.index, avg_source_length.values, color=['#FFA07A', '#20B2AA'])
chain_ax.set(xlabel='Chain', ylabel='Average Source Length')
chain_ax.set_title('Average Source Code Length by Chain', fontsize=12, fontweight='bold')
chain_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print("\nSource Length Statistics:")
print(source_lengths.describe())

## 9. Case-Level Analysis

In [None]:
# Case-level view: the 15 most frequent cases and the 15 slowest on average
TOP_N = 15
case_counts = df['case_name'].value_counts().head(TOP_N)
avg_duration_case = (
    df.groupby('case_name')['duration_sec']
    .mean()
    .sort_values(ascending=False)
    .head(TOP_N)
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
frequency_ax, duration_ax = axes

# Left: how often each case appears in the table
frequency_ax.barh(case_counts.index, case_counts.values, color='mediumseagreen')
frequency_ax.set_xlabel('Number of Runs')
frequency_ax.set_title('Top 15 Most Tested Cases', fontsize=12, fontweight='bold')
frequency_ax.grid(axis='x', alpha=0.3)

# Right: mean duration per case
duration_ax.barh(avg_duration_case.index, avg_duration_case.values, color='mediumpurple')
duration_ax.set_xlabel('Average Duration (seconds)')
duration_ax.set_title('Top 15 Cases by Average Duration', fontsize=12, fontweight='bold')
duration_ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

unique_case_total = df['case_name'].nunique()
print(f"\nTotal unique cases: {unique_case_total}")
print(f"\nMost tested cases:")
print(case_counts)

## 10. Summary Statistics

In [None]:
# Comprehensive text summary of the dataset.
# The section icons and bullets were stored as mis-decoded byte sequences
# (mojibake); they are written here as the intended Unicode characters.


def _print_breakdown(heading, counts, total):
    """Print a heading followed by one bullet per category with its share.

    Args:
        heading: Line printed before the bullets.
        counts: Mapping-like with ``.items()`` yielding (label, count) pairs,
            e.g. the result of ``Series.value_counts()``.
        total: Denominator used for the percentage of each count.
    """
    print(heading)
    for label, count in counts.items():
        percentage = (count / total) * 100
        print(f"  • {label}: {count} ({percentage:.1f}%)")


rule = "=" * 80
total_cases = len(df)

print(rule)
print("COMPREHENSIVE SUMMARY")
print(rule)

print("\n📊 Overall Statistics:")
print(f"  • Total benchmark runs: {len(all_data)}")
print(f"  • Total cases executed: {total_cases}")
print(f"  • Date range: {df['date'].min()} to {df['date'].max()}")
print(f"  • Unique cases: {df['case_name'].nunique()}")
print(f"  • Unique models: {df['model'].nunique()}")
print(f"  • Unique chains: {df['chain'].nunique()}")

# Share of all cases per status, model and chain
_print_breakdown("\n✅ Status Breakdown:", df['status'].value_counts(), total_cases)
_print_breakdown("\n🤖 Model Usage:", df['model'].value_counts(), total_cases)
_print_breakdown("\n⛓️ Chain Distribution:", df['chain'].value_counts(), total_cases)

durations = df['duration_sec']
print("\n⏱️ Duration Statistics:")
print(f"  • Mean: {durations.mean():.2f} seconds")
print(f"  • Median: {durations.median():.2f} seconds")
print(f"  • Min: {durations.min():.2f} seconds")
print(f"  • Max: {durations.max():.2f} seconds")
print(f"  • Total time: {durations.sum() / 3600:.2f} hours")

# Error percentages are relative to all cases, not only the failing ones.
error_counts = df['error_type'].value_counts()
if len(error_counts) > 0:
    _print_breakdown("\n⚠️ Error Statistics:", error_counts, total_cases)
else:
    print("\n⚠️ Error Statistics:")
    print("  • No errors recorded")

source_lengths = df['source_length']
print("\n📝 Source Code Statistics:")
print(f"  • Mean length: {source_lengths.mean():.0f} characters")
print(f"  • Median length: {source_lengths.median():.0f} characters")
print(f"  • Min length: {source_lengths.min():.0f} characters")
print(f"  • Max length: {source_lengths.max():.0f} characters")

print("\n" + rule)

## 11. Detailed DataFrame View

In [None]:
# Show the flat per-case table. The bare `df` on the last line is the cell's
# value, so the notebook renders it with its rich DataFrame display.
print("Full DataFrame:")
df

In [None]:
# Write the flat per-case table to CSV (without the row index) so it can be
# analysed outside this notebook
output_file = 'exploit_results/benchmark_summary.csv'
df.to_csv(path_or_buf=output_file, index=False)
print(f"Summary exported to: {output_file}")