In [1]:
import json
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Enable inline plotting for Jupyter Notebook
%matplotlib inline

# Seaborn style for nicer visuals
sns.set(style="whitegrid")

In [2]:
# Function to flatten one JSON object into a flat dictionary
def flatten_json(jdata):
    flat = {}
    # Process metadata
    for k, v in jdata.get('metadata', {}).items():
        flat[f'metadata_{k}'] = v
    # Process best_solution data
    for k, v in jdata.get('best_solution', {}).items():
        flat[f'best_solution_{k}'] = v
    # Process stats (we flatten each metric)
    for metric, stats in jdata.get('stats', {}).items():
        for stat_key, stat_val in stats.items():
            flat[f'stats_{metric}_{stat_key}'] = stat_val
    # Optionally, you could also store raw_metrics as-is or compute summaries
    for metric, values in jdata.get('raw_metrics', {}).items():
        # Here we store the mean of the raw metrics.
        flat[f'raw_{metric}_mean'] = np.mean(values) if values else np.nan
        flat[f'raw_{metric}_min'] = np.min(values) if values else np.nan
        flat[f'raw_{metric}_max'] = np.max(values) if values else np.nan
    return flat

def load_df(data_dir):
    # Get a list of all JSON files in the directory
    json_files = glob.glob(os.path.join(data_dir, "*.json"))
    
    # Loop through each file, load and flatten the JSON data
    records = []
    for f in json_files:
        with open(f, 'r') as infile:
            try:
                data = json.load(infile)
                flat_data = flatten_json(data)
                records.append(flat_data)
            except Exception as e:
                print(f"Error reading {f}: {e}")
    
    # Create a pandas DataFrame from the flattened data
    return pd.DataFrame(records)

In [5]:
dfs = {
    'gpu_effort': load_df("data/gpu-11_27pm"),
    'gpu': load_df("data/gpu-11_50am"),
    'cpuseq': load_df("data/cpu_seq"),
    'cpupar': load_df("data/cpu_par")
}

In [6]:
dfs['gpu_effort'].head()

Unnamed: 0,metadata_datetime,metadata_problem,metadata_solver,metadata_n,metadata_executions,metadata_cutoff,metadata_hit_percent,metadata_machine,best_solution_route,best_solution_cost,...,raw_costs_max,raw_seconds_mean,raw_seconds_min,raw_seconds_max,raw_cycles_mean,raw_cycles_min,raw_cycles_max,raw_iterations_mean,raw_iterations_min,raw_iterations_max
0,2025-05-13T12_32_03.224Z,berlin52,GPU Parallel Genetic Solver,52,30,0.0,0.0,WallE,"[41, 8, 9, 10, 43, 33, 51, 11, 52, 14, 13, 47,...",7544.365,...,8223.637,8.818763,8.735415,8.867067,32648720000.0,32340148584,32827549746,10000.0,10000,10000
1,2025-05-13T13_23_33.844Z,tsp225,GPU Parallel Genetic Solver,225,30,0.0,0.0,WallE,"[11, 12, 13, 14, 15, 16, 17, 18, 19, 203, 20, ...",3959.134,...,4133.802,52.779924,52.39063,53.22363,197077900000.0,195144768330,199038471077,10000.0,10000,10000
2,2025-05-13T14_28_52.738Z,pcb442,GPU Parallel Genetic Solver,442,30,0.0,0.0,WallE,"[357, 356, 434, 355, 354, 353, 316, 317, 318, ...",53226.53,...,54907.22,117.604997,116.7899,118.2106,443458700000.0,440385083554,445742267408,10000.0,10000,10000
3,2025-05-13T12_54_32.252Z,lin105,GPU Parallel Genetic Solver,105,30,0.0,0.0,WallE,"[78, 71, 68, 67, 64, 72, 77, 79, 86, 80, 76, 7...",14382.99,...,15221.34,21.299222,21.20149,21.41791,78853730000.0,78491916656,79293115392,10000.0,10000,10000
4,2025-05-13T12_42_54.585Z,kroB100,GPU Parallel Genetic Solver,100,30,0.0,0.0,WallE,"[14, 42, 2, 16, 50, 43, 89, 87, 66, 74, 60, 57...",22139.07,...,22991.71,19.951957,19.89343,20.04796,73865900000.0,73649221244,74221333634,10000.0,10000,10000


In [7]:
dfs['gpu'].head()

Unnamed: 0,metadata_datetime,metadata_problem,metadata_solver,metadata_n,metadata_executions,metadata_cutoff,metadata_hit_percent,metadata_machine,best_solution_route,best_solution_cost,...,raw_costs_max,raw_seconds_mean,raw_seconds_min,raw_seconds_max,raw_cycles_mean,raw_cycles_min,raw_cycles_max,raw_iterations_mean,raw_iterations_min,raw_iterations_max
0,2025-05-13T11_38_30.309Z,pcb1173,GPU Parallel Genetic Solver,1173,50,62581.2,0.98,WallE,"[244, 243, 245, 246, 248, 247, 219, 236, 235, ...",62503.39,...,62661.41,1711.146842,906.5451,9409.311,6446528000000.0,3418352280742,35480129878854,45342.7,23869,250000
1,2025-05-11T22_20_49.825Z,kroB100,GPU Parallel Genetic Solver,100,50,23248.05,1.0,WallE,"[40, 39, 70, 53, 73, 85, 93, 11, 3, 28, 91, 97...",23054.69,...,23247.96,1.322515,0.77711,11.88688,4896196000.0,2877004070,44007482028,645.28,372,5993
2,2025-05-11T22_24_17.022Z,lin105,GPU Parallel Genetic Solver,105,50,15097.95,1.0,WallE,"[88, 94, 95, 100, 99, 98, 93, 101, 102, 97, 96...",14736.31,...,15096.6,3.850169,0.794318,75.66312,14254050000.0,2940713287,280119098816,1799.42,356,35785
3,2025-05-12T09_15_28.389Z,rat783,GPU Parallel Genetic Solver,783,50,9598.54,1.0,WallE,"[461, 479, 481, 471, 478, 468, 465, 475, 488, ...",9588.249,...,9598.471,393.28361,287.4106,753.408,1482973000000.0,1083752675732,2840910895947,17066.4,12381,32509
4,2025-05-11T22_19_34.883Z,berlin52,GPU Parallel Genetic Solver,52,50,7843.68,0.82,WallE,"[27, 28, 12, 25, 4, 6, 15, 5, 24, 48, 46, 37, ...",7621.701,...,8223.637,47.097298,0.105894,220.8815,174363100000.0,392037013,817745037922,53850.6,106,250000


In [8]:
dfs['cpuseq'].head()

Unnamed: 0,metadata_datetime,metadata_problem,metadata_solver,metadata_n,metadata_executions,metadata_cutoff,metadata_hit_percent,metadata_machine,best_solution_route,best_solution_cost,...,raw_costs_max,raw_seconds_mean,raw_seconds_min,raw_seconds_max,raw_cycles_mean,raw_cycles_min,raw_cycles_max,raw_iterations_mean,raw_iterations_min,raw_iterations_max
0,2025-05-13T21_56_04.900Z,tsp225,CPU Sequential Genetic Solver,225,30,4193.33,0.9,fedora,"[128, 127, 126, 165, 166, 167, 151, 152, 153, ...",4171.67,...,4252.752,53.85269,2.575899,141.1994,118252200000.0,5656275944,310052318532,30867.9,1443,80000
1,2025-05-13T21_15_07.764Z,kroB100,CPU Sequential Genetic Solver,100,30,23248.05,0.9,fedora,"[44, 19, 92, 96, 18, 24, 55, 22, 23, 88, 54, 5...",22895.84,...,23624.6,13.230001,0.2499,70.00507,29051100000.0,548740720,153720741174,15116.733333,260,80000
2,2025-05-13T21_29_09.319Z,lin105,CPU Sequential Genetic Solver,105,30,15097.95,0.7,fedora,"[62, 105, 59, 56, 55, 50, 49, 40, 104, 45, 48,...",14904.2,...,15447.23,28.051818,0.326104,72.7983,61597460000.0,716074920,159853676334,30812.566667,329,80000
3,2025-05-13T21_08_30.863Z,berlin52,CPU Sequential Genetic Solver,52,30,7843.68,0.5,fedora,"[17, 3, 45, 19, 41, 8, 9, 10, 43, 33, 51, 11, ...",7746.858,...,8242.741,24.946672,0.074409,47.48952,54779140000.0,163387136,104279892134,43324.566667,107,80000


In [9]:
dfs['cpupar'].head()

Unnamed: 0,metadata_datetime,metadata_problem,metadata_solver,metadata_n,metadata_executions,metadata_cutoff,metadata_hit_percent,metadata_machine,best_solution_route,best_solution_cost,...,raw_costs_max,raw_seconds_mean,raw_seconds_min,raw_seconds_max,raw_cycles_mean,raw_cycles_min,raw_cycles_max,raw_iterations_mean,raw_iterations_min,raw_iterations_max
0,2025-05-14T07_52_27.052Z,berlin52,CPU Parallel Genetic Solver,52,30,7843.68,0.6,fedora,"[49, 32, 45, 19, 41, 8, 9, 10, 43, 33, 51, 11,...",7756.733,...,8067.793,28.101039,0.602783,77.77118,61705680000.0,1323624170,170773920504,2448.333333,49,6000
1,2025-05-14T07_57_49.591Z,kroB100,CPU Parallel Genetic Solver,100,30,23248.05,0.966667,fedora,"[38, 30, 75, 69, 49, 86, 68, 10, 21, 90, 46, 2...",22962.99,...,23691.2,10.751293,2.975679,122.6322,23608200000.0,6534138468,269281817200,504.4,128,6000
2,2025-05-14T08_16_05.867Z,lin105,CPU Parallel Genetic Solver,105,30,15097.95,0.766667,fedora,"[71, 78, 82, 83, 84, 85, 91, 92, 96, 97, 101, ...",14970.28,...,15381.78,36.542504,3.429284,128.6193,80241690000.0,7530170076,282428133372,1695.933333,143,6000
3,2025-05-14T08_30_23.134Z,tsp225,CPU Parallel Genetic Solver,225,30,4193.33,1.0,fedora,"[185, 184, 120, 175, 121, 122, 123, 124, 125, ...",4172.652,...,4193.276,28.575532,20.57119,141.8751,62747530000.0,45171119070,311536133590,604.066667,432,3158


In [None]:
# Convert datetime string to datetime object
df['metadata_datetime'] = pd.to_datetime(df['metadata_datetime'], format='%Y-%m-%dT%H_%M_%S.%fZ')

# Check the dataframe info and data types
df.info()


In [None]:
# Summary statistics for main numerical columns
df.describe()


In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='stats_cost_avg', 
    y='best_solution_cost', 
    hue='metadata_problem',
    style='metadata_machine',
    data=df,
    s=100
)
plt.xlabel("Average Cost (stats)")
plt.ylabel("Best Solution Cost")
plt.title("Comparison of Best vs. Average Cost")
plt.show()


In [None]:
plt.figure(figsize=(8, 6))
sns.histplot(df['best_solution_cost'], bins=20, kde=True)
plt.xlabel("Best Solution Cost")
plt.title("Distribution of Best Solution Cost")
plt.show()


In [None]:
# Group by problem and summarize best and average costs
problem_summary = df.groupby('metadata_problem').agg({
    'best_solution_cost': ['mean', 'min', 'max'],
    'stats_cost_avg': ['mean', 'min', 'max'],
    'stats_cost_stdev': 'mean'
})
problem_summary.columns = ['_'.join(col).strip() for col in problem_summary.columns.values]
problem_summary.reset_index(inplace=True)
problem_summary


In [None]:
plt.figure(figsize=(8, 6))
sns.boxplot(x='metadata_problem', y='raw_cycles_mean', data=df)
plt.title("Raw Mean Cycles by Problem")
plt.xlabel("Problem")
plt.ylabel("Mean Cycles")
plt.show()

plt.figure(figsize=(8, 6))
sns.boxplot(x='metadata_problem', y='raw_iterations_mean', data=df)
plt.title("Raw Mean Iterations by Problem")
plt.xlabel("Problem")
plt.ylabel("Mean Iterations")
plt.show()


In [None]:
correlation_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()


In [None]:
df.set_index('metadata_datetime', inplace=True)
df['best_solution_cost'].resample('D').mean().plot(figsize=(10,6), marker='o')
plt.title("Daily Mean Best Solution Cost")
plt.ylabel("Best Solution Cost")
plt.xlabel("Date")
plt.show()

# Reset index if needed:
df.reset_index(inplace=True)


In [None]:
pivot = pd.pivot_table(df, 
                       values='best_solution_cost', 
                       index='metadata_problem', 
                       columns='metadata_machine', 
                       aggfunc=np.mean)
pivot
