In [None]:
# Import necessary libraries
import json
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Enable inline plotting for Jupyter Notebook
%matplotlib inline

# Set a seaborn style for nicer visuals
sns.set(style="whitegrid")

In [None]:
# Directory where your JSON files reside
data_dir = "data"  # change this to your directory path

# Collect every JSON file in the directory. glob makes no promise about the
# order of its results, so sort the paths to keep the row order of the
# resulting DataFrame reproducible from run to run and machine to machine.
json_files = sorted(glob.glob(os.path.join(data_dir, "*.json")))

# An empty match usually means data_dir is wrong; say so here instead of
# letting a later cell fail on a missing column.
if not json_files:
    print(f"No JSON files found in {data_dir!r}")

# Function to flatten one JSON object into a flat dictionary
def flatten_json(jdata):
    """Flatten one loaded JSON result object into a single-level dictionary.

    The following top-level sections are read; any other key is ignored:

    * ``metadata``      -> ``metadata_<key>``
    * ``best_solution`` -> ``best_solution_<key>``
    * ``stats``         -> ``stats_<metric>_<stat>`` for each nested entry
    * ``raw_metrics``   -> ``raw_<metric>_mean`` / ``_min`` / ``_max``

    A section that is missing, ``null``, or not a JSON object is treated as
    empty, so one incomplete section does not discard the whole record.

    Parameters
    ----------
    jdata : dict
        Parsed JSON object for a single file.

    Returns
    -------
    dict
        Flat mapping of column name to value, suitable as one DataFrame row.
    """
    def _section(name):
        # `.get(name, {})` alone is not enough: an explicit JSON null comes
        # back as None, which has no `.items()`.
        section = jdata.get(name)
        return section if isinstance(section, dict) else {}

    flat = {}

    # Metadata and best-solution entries are copied through with a prefix.
    for k, v in _section('metadata').items():
        flat[f'metadata_{k}'] = v
    for k, v in _section('best_solution').items():
        flat[f'best_solution_{k}'] = v

    # Each metric under `stats` is itself a mapping of statistic -> value.
    for metric, stats in _section('stats').items():
        if isinstance(stats, dict):
            for stat_key, stat_val in stats.items():
                flat[f'stats_{metric}_{stat_key}'] = stat_val
        elif stats is not None:
            # A bare value has no statistic name; keep it under the metric.
            flat[f'stats_{metric}'] = stats

    # Raw metrics are summarised by mean, min and max.
    # An empty (falsy) collection yields NaN for all three.
    for metric, values in _section('raw_metrics').items():
        has_values = bool(values)
        flat[f'raw_{metric}_mean'] = np.mean(values) if has_values else np.nan
        flat[f'raw_{metric}_min'] = np.min(values) if has_values else np.nan
        flat[f'raw_{metric}_max'] = np.max(values) if has_values else np.nan
    return flat

# Load and flatten every JSON file on a best-effort basis: a file that cannot
# be opened, parsed or flattened is reported and skipped, and the loop moves on.
records = []
for f in json_files:
    try:
        # open() sits inside the try so an unreadable file is skipped like any
        # other bad input. JSON is UTF-8, so do not rely on the locale default.
        with open(f, 'r', encoding='utf-8') as infile:
            data = json.load(infile)
        records.append(flatten_json(data))
    except Exception as e:
        print(f"Error reading {f}: {e}")

# Create a pandas DataFrame from the flattened data (one row per loaded file)
df = pd.DataFrame(records)
df.head()


In [None]:
# Parse the timestamp strings into datetime64 values. The pattern uses
# underscores between the time fields and ends with a literal "Z".
timestamp_format = '%Y-%m-%dT%H_%M_%S.%fZ'
df['metadata_datetime'] = pd.to_datetime(
    df['metadata_datetime'],
    format=timestamp_format,
)

# Print the column names, non-null counts and dtypes after the conversion
df.info()


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments describe() reports on the numeric columns; newer pandas versions
# may also include the datetime column -- TODO confirm for the installed version.
df.describe()


In [None]:
# Scatter of each run's best cost against its average cost,
# coloured by problem and with one marker style per machine.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='stats_cost_avg',
    y='best_solution_cost',
    hue='metadata_problem',
    style='metadata_machine',
    s=100,
    ax=ax,
)
ax.set_xlabel("Average Cost (stats)")
ax.set_ylabel("Best Solution Cost")
ax.set_title("Comparison of Best vs. Average Cost")
plt.show()


In [None]:
# Histogram (20 bins) of the best solution cost with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x='best_solution_cost', bins=20, kde=True, ax=ax)
ax.set_xlabel("Best Solution Cost")
ax.set_title("Distribution of Best Solution Cost")
plt.show()


In [None]:
# Per-problem summary of the best and average costs. Named aggregation builds
# the flat "<column>_<statistic>" column names directly.
problem_summary = (
    df.groupby('metadata_problem')
    .agg(
        best_solution_cost_mean=('best_solution_cost', 'mean'),
        best_solution_cost_min=('best_solution_cost', 'min'),
        best_solution_cost_max=('best_solution_cost', 'max'),
        stats_cost_avg_mean=('stats_cost_avg', 'mean'),
        stats_cost_avg_min=('stats_cost_avg', 'min'),
        stats_cost_avg_max=('stats_cost_avg', 'max'),
        stats_cost_stdev_mean=('stats_cost_stdev', 'mean'),
    )
    .reset_index()
)
problem_summary


In [None]:
# One boxplot per raw metric, grouped by problem.
# Each entry is (column to plot, figure title, y-axis label).
boxplot_specs = [
    ('raw_cycles_mean', "Raw Mean Cycles by Problem", "Mean Cycles"),
    ('raw_iterations_mean', "Raw Mean Iterations by Problem", "Mean Iterations"),
]

for metric_column, plot_title, y_label in boxplot_specs:
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df, x='metadata_problem', y=metric_column)
    plt.title(plot_title)
    plt.xlabel("Problem")
    plt.ylabel(y_label)
    plt.show()


In [None]:
# Pairwise correlation of the numeric columns only. df also holds
# non-numeric columns (problem and machine labels), and since pandas 2.0
# DataFrame.corr() raises on those unless numeric_only=True is passed.
correlation_matrix = df.corr(numeric_only=True)

# Annotated heatmap of the correlation coefficients
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Daily mean of the best solution cost. The datetime index is set on a derived
# Series rather than on df in place, so df is left unchanged (same index, same
# column order) even if this cell fails partway or is run more than once.
daily_best_cost = (
    df.set_index('metadata_datetime')['best_solution_cost']
    .resample('D')
    .mean()
)

daily_best_cost.plot(figsize=(10, 6), marker='o')
plt.title("Daily Mean Best Solution Cost")
plt.ylabel("Best Solution Cost")
plt.xlabel("Date")
plt.show()


In [None]:
# Mean best solution cost for every (problem, machine) combination.
# The aggregation is named by string: passing the NumPy callable np.mean
# triggers a FutureWarning on recent pandas versions.
pivot = pd.pivot_table(df,
                       values='best_solution_cost',
                       index='metadata_problem',
                       columns='metadata_machine',
                       aggfunc='mean')
pivot
