In [1]:
from pathlib import Path
import pandas as pd

In [2]:
from pathlib import Path
import pandas as pd

def parse_folder_name(folder_name: str) -> dict:
    """
    Decode a run-folder name into a ``{parameter: value}`` mapping.

    The name is cut into segments at every '__'. Each segment that contains a '-'
    is split at its FIRST '-' into a key (left) and a value (right); segments
    without a '-' are dropped. Values are kept as strings.

    Example:
      "N_structures-100__N_steps-20__coordinates_snr-0.2"
      -> {"N_structures": "100", "N_steps": "20", "coordinates_snr": "0.2"}
    """
    # partition() yields an empty separator when no '-' is present, which is
    # what filters out segments that are not key-value pairs.
    pieces = (segment.partition("-") for segment in folder_name.split("__"))
    return {name: raw_value for name, dash, raw_value in pieces if dash}

def retrieve_dataframes(root_path: str|Path) -> pd.DataFrame | None:
    """
    Collect the result tables of all runs below ``root_path`` into one dataframe.

    Every immediate subdirectory of ``root_path`` is treated as one run: its name
    is decoded with ``parse_folder_name`` and every CSV file inside it matching
    "*df_results.csv" is loaded. The decoded parameters are added to each loaded
    table as constant columns (a CSV column with the same name is overwritten).

    Parameters
    ----------
    root_path : str or Path
        Directory whose subdirectories hold the result CSV files.

    Returns
    -------
    pd.DataFrame or None
        All loaded tables concatenated with a fresh integer index. Columns whose
        values can all be parsed as numbers are converted to a numeric dtype;
        other columns are left unchanged. ``None`` is returned (after printing a
        notice) when no matching CSV file was found.
    """
    root_dir = Path(root_path)
    dataframes = []

    # Directory listing order is arbitrary; sort so the row order of the result
    # is the same on every run.
    for subfolder in sorted(root_dir.iterdir()):
        if not subfolder.is_dir():
            continue
        # The parameters are encoded in the folder name, so decode them once
        # per subfolder rather than once per file.
        params = parse_folder_name(subfolder.name)
        for csv_file in sorted(subfolder.glob("*df_results.csv")):
            df = pd.read_csv(csv_file)
            for key, value in params.items():
                df[key] = value
            dataframes.append(df)

    if not dataframes:
        print(f'No data available in {root_path}')
        return None

    final_df = pd.concat(dataframes, ignore_index=True)
    # Folder-name parameters arrive as strings; convert whatever parses as a
    # number. `errors='ignore'` is deprecated in pandas, so catch the failure
    # explicitly and keep non-numeric columns as they are.
    for col in final_df.columns:
        try:
            final_df[col] = pd.to_numeric(final_df[col])
        except (ValueError, TypeError):
            pass
    return final_df

In [3]:
# Locations of the three experiment families that are compared below.
baseline_dir = Path("/data/scratch/shared/reents_t/baseline/parameter-checks")
baseline_w_noise_dir = Path("/data/scratch/shared/reents_t/baseline-w-noise")
repaint_v1_dir = Path("/data/scratch/shared/reents_t/parameter-checks")

# One merged dataframe per family (one row per CSV record, plus the
# parameters decoded from the run-folder names).
df_baseline = retrieve_dataframes(baseline_dir)
df_baseline_w_noise = retrieve_dataframes(baseline_w_noise_dir)
df_repaint_v1 = retrieve_dataframes(repaint_v1_dir)


  final_df[col] = pd.to_numeric(final_df[col], errors='ignore')
  final_df[col] = pd.to_numeric(final_df[col], errors='ignore')
  final_df[col] = pd.to_numeric(final_df[col], errors='ignore')


In [4]:
# Aggregate every experiment family per hyper-parameter combination: mean of the
# two match columns, scaled by 100 (presumably to percent), best
# 'Matches after relaxation' first.
_score_cols = ['Matches', 'Matches after relaxation']
_baseline_keys = ['N_structures', 'N_steps', 'coordinates_snr', 'n_corrector_steps']
# The repaint runs additionally vary the number of resample steps.
_repaint_keys = ['N_structures', 'N_steps', 'coordinates_snr', 'n_resample_steps', 'n_corrector_steps']


def _rank_by_match_rate(results, group_keys):
    """Return the mean of the match columns (x100) per `group_keys` combination, sorted descending."""
    rates = results.groupby(group_keys)[_score_cols].mean().mul(100)
    return rates.sort_values('Matches after relaxation', ascending=False)


df_baseline_perf = _rank_by_match_rate(df_baseline, _baseline_keys)
df_repaint_v1_perf = _rank_by_match_rate(df_repaint_v1, _repaint_keys)
df_baseline_w_noise_perf = _rank_by_match_rate(df_baseline_w_noise, _baseline_keys)

In [5]:
# Baseline vs. baseline-with-noise, aligned on the shared parameter index
# (inner join) and ranked by the noisy variant first, the plain baseline second.
(
    df_baseline_perf
    .merge(
        df_baseline_w_noise_perf,
        left_index=True,
        right_index=True,
        suffixes=('_baseline', '_baseline_w_noise'),
    )
    .sort_values(
        by=[
            'Matches after relaxation_baseline_w_noise',
            'Matches after relaxation_baseline',
        ],
        ascending=False,
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Matches_baseline,Matches after relaxation_baseline,Matches_baseline_w_noise,Matches after relaxation_baseline_w_noise
N_structures,N_steps,coordinates_snr,n_corrector_steps,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100,50,0.6,5,67.0,72.0,64.0,73.0
100,50,0.4,5,66.0,70.0,64.0,73.0
100,50,0.6,10,67.0,70.0,67.0,72.0
100,50,0.6,2,63.0,67.0,61.0,68.0
100,50,0.2,10,66.0,71.0,59.0,67.0
100,20,0.4,5,60.0,68.0,53.0,67.0
100,50,0.2,5,62.0,69.0,58.0,66.0
100,50,0.4,2,60.0,67.0,54.0,66.0
100,20,0.6,10,68.0,73.0,60.0,65.0
100,50,0.4,10,72.0,76.0,60.0,64.0


In [11]:
# Baseline vs. repaint v1, aligned on the index levels the two tables share
# (inner join) and ranked by the repaint result first, the baseline second.
(
    df_baseline_perf
    .merge(
        df_repaint_v1_perf,
        left_index=True,
        right_index=True,
        suffixes=('_baseline', '_repaint_v1'),
    )
    .sort_values(
        by=[
            'Matches after relaxation_repaint_v1',
            'Matches after relaxation_baseline',
        ],
        ascending=False,
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Matches_baseline,Matches after relaxation_baseline,Matches_repaint_v1,Matches after relaxation_repaint_v1
N_structures,N_steps,coordinates_snr,n_corrector_steps,n_resample_steps,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100,200,0.4,5,5,75.0,76.0,74.0,77.0
100,100,0.6,10,5,74.0,76.0,75.0,77.0
100,500,0.4,2,5,74.0,75.0,75.0,77.0
100,1000,0.6,5,1,72.0,75.0,72.0,77.0
100,200,0.2,5,5,64.0,69.0,71.0,77.0
100,...,...,...,...,...,...,...,...
100,20,0.4,2,1,47.0,57.0,36.0,49.0
100,20,0.6,5,1,63.0,66.0,43.0,48.0
100,20,0.2,5,1,54.0,65.0,32.0,48.0
100,50,0.2,1,1,47.0,61.0,31.0,44.0


In [12]:
# Display the aggregated repaint-v1 table on its own; it is already sorted by
# 'Matches after relaxation' in descending order.
df_repaint_v1_perf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Matches,Matches after relaxation
N_structures,N_steps,coordinates_snr,n_resample_steps,n_corrector_steps,Unnamed: 5_level_1,Unnamed: 6_level_1
100,200,0.2,5,5,71.0,77.0
100,1000,0.6,1,5,72.0,77.0
100,200,0.4,5,5,74.0,77.0
100,500,0.4,5,2,75.0,77.0
100,100,0.6,5,10,75.0,77.0
100,...,...,...,...,...,...
100,20,0.2,1,10,39.0,49.0
100,20,0.6,1,5,43.0,48.0
100,20,0.2,1,5,32.0,48.0
100,50,0.2,1,1,31.0,44.0


In [13]:
# Display the aggregated baseline table on its own; it is already sorted by
# 'Matches after relaxation' in descending order.
df_baseline_perf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Matches,Matches after relaxation
N_structures,N_steps,coordinates_snr,n_corrector_steps,Unnamed: 4_level_1,Unnamed: 5_level_1
100,200,0.6,5,75.0,79.0
100,1000,0.4,5,77.0,79.0
100,1000,0.6,2,74.0,78.0
100,500,0.6,5,76.0,78.0
100,200,0.6,1,73.0,77.0
100,...,...,...,...,...
100,100,0.2,2,57.0,61.0
100,20,0.6,2,48.0,59.0
100,20,0.2,1,42.0,59.0
100,20,0.4,2,47.0,57.0
