In [None]:
import os
import pandas as pd

# 1. Base path to the "32" results folder:
base_dir = "../results/32"

# 2. One DataFrame is collected per results file found under base_dir:
all_best_dfs = []

# 3. Walk through every subdirectory of base_dir:
for root, dirs, files in os.walk(base_dir):
    # Sorting the directory list in place makes os.walk descend in a fixed
    # order, so the concatenated rows come out identically on every run.
    dirs.sort()

    # Only folders that contain grid_search_bst_results.csv are of interest.
    if "grid_search_bst_results.csv" not in files:
        continue

    csv_path = os.path.join(root, "grid_search_bst_results.csv")
    # Label the rows with the parent of the folder that holds the CSV,
    # e.g. root "../results/32/cna_from_multi/train" -> "cna_from_multi".
    dataset_pair = os.path.basename(os.path.dirname(root))

    df = pd.read_csv(csv_path)
    df["dataset_pair"] = dataset_pair
    all_best_dfs.append(df)

# 4. Concatenate everything into one DataFrame (fail loudly if nothing was found).
#    The message reports the directory that was actually searched.
if not all_best_dfs:
    raise FileNotFoundError(f"No grid_search_bst_results.csv found under {base_dir}")
global_best_df = pd.concat(all_best_dfs, ignore_index=True)

# 5. Show a quick preview:
display(global_best_df)


In [None]:
# Keep only the rows whose "which_best" column equals "mse", then show them
# ordered by ascending "best_loss".
is_mse_row = global_best_df["which_best"] == "mse"
mse_df = global_best_df[is_mse_row]
display(mse_df.sort_values("best_loss"))


In [None]:
import json
import matplotlib.pyplot as plt

# === Example: Load the losses of the best MSE model under cna_from_multi ===
dataset_name = "cna_from_multi"
metrics_folder = f"../results/32/{dataset_name}/train"

# JSON file that was saved when we found a new best MSE:
json_path = f"{metrics_folder}/best_by_mse_losses.json"

with open(json_path, "r") as f:
    losses_dict = json.load(f)
    # losses_dict has keys like:
    #   "train_loss"       : [ epoch0_loss, epoch1_loss, … ]
    #   "val_mse"          : [ val_mse_at_epoch, … ]
    #   "val_r2"           : [ … ]
    #   "val_cosine"       : [ … ]
    #   "val_mse_timestep" : [ … ]
    #   "val_r2_timestep"  : [ … ]
    #   "val_cosine_timestep": [ … ]

# x-axis in epochs, one tick per training-loss entry.
n_epochs = len(losses_dict["train_loss"])
epochs = list(range(1, n_epochs + 1))

plt.figure(figsize=(6, 4))
for series_name in ("train_loss", "val_mse", "val_r2", "val_cosine"):
    series = losses_dict[series_name]
    n_points = len(series)
    if n_points == 0:
        # Nothing recorded for this series; skip it instead of failing.
        continue
    if n_epochs in (0, n_points):
        # Same length as the training curve (or no training curve at all):
        # plain 1-based index.
        xs = list(range(1, n_points + 1))
    else:
        # A series with fewer (or more) points than epochs cannot share the
        # epoch index directly — matplotlib raises on mismatched x/y lengths.
        # Spread its points evenly over the epoch range instead.
        # NOTE(review): assumes validation ran at a fixed epoch interval and
        # that the last point falls on the last epoch — TODO confirm.
        stride = n_epochs / n_points
        xs = [stride * (k + 1) for k in range(n_points)]
    plt.plot(xs, series, label=series_name)

plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title(f"{dataset_name} — Best MSE Model Loss Curves")
plt.legend()
plt.show()


In [None]:
import json
import pandas as pd

history_path = "../results/32/rnaseq_from_multi/train/grid_search_history.json"

# The history file holds a list with one record per experiment.
with open(history_path, "r") as history_file:
    history_list = json.load(history_file)

# Flatten every record into a single dict: hyperparameters first, then the
# best validation losses, then the experiment index.
rows = []
for record in history_list:
    entry = dict(record["params"])
    entry.update(record["best_val_losses"])
    entry["experiment_index"] = record["experiment_index"]
    rows.append(entry)

# One row per experiment.
history_df = pd.DataFrame(rows)
display(history_df)


In [None]:
# Rank all experiments once by ascending "best_val_mse"; steps 1 and 3 reuse it.
ranked_df = history_df.sort_values("best_val_mse")

# 1. The ten runs with the lowest best_val_mse
top10 = ranked_df.head(10)
display(top10)

# 2. Correlation between batch_size and best_val_mse
batch_vs_mse = history_df[["batch_size", "best_val_mse"]]
display(batch_vs_mse.corr())

# 3. Locate the very best run (first row of the ranking) and fetch its full
#    record from history_list via the shared experiment_index.
best_row = ranked_df.iloc[0]
best_idx = best_row["experiment_index"]
best_record = next(
    record for record in history_list
    if record["experiment_index"] == best_idx
)


In [None]:
import os
import json
import matplotlib.pyplot as plt

# Base directory containing the “32” folder
BASE_DIR = "../results/32"

# reference metric -> (JSON key of the curve that selects the best checkpoint,
#                      human-readable name used in the legend)
REF_METRIC_INFO = {
    "mse":      ("val_mse",          "Validation MSE"),
    "cosine":   ("val_cosine",       "Validation Cosine"),
    "timestep": ("val_mse_timestep", "Validation Timestep‐MSE"),
}

# Curves drawn on every plot, in legend order.
CURVE_KEYS = ("val_mse", "val_cosine", "val_mse_timestep")


def load_losses(metric_name, directory=None):
    """
    Load the losses JSON saved for the model "best_by_<metric_name>".

    Args:
        metric_name: suffix used in the file name, e.g. "mse", "cosine", "timestep".
        directory:   folder that holds the JSON. When omitted, the module-level
                     `train_dir` is used (this keeps the one-argument call working).

    Returns:
        The parsed JSON content. The plotting code below reads the keys
        "val_mse", "val_cosine" and "val_mse_timestep" from it.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
    """
    if directory is None:
        directory = train_dir
    path = os.path.join(directory, f"best_by_{metric_name}_losses.json")
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Could not find {path}")
    with open(path, "r") as f:
        return json.load(f)


# List all dataset‐pair subfolders under BASE_DIR (e.g. “cna_from_multi”, “rnaseq_from_wsi”, etc.)
dataset_pairs = [
    name for name in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, name))
]

# For each dataset_pair, we expect a “train” subfolder with the JSONs
for ds in dataset_pairs:
    train_dir = os.path.join(BASE_DIR, ds, "train")
    if not os.path.isdir(train_dir):
        # Skip if there is no train folder
        continue

    # One plot per reference metric
    for ref_metric, (ref_key, vname) in REF_METRIC_INFO.items():
        # 1) Load all loss curves of the model selected by this metric.
        #    Missing keys fall back to an empty curve.
        losses = load_losses(ref_metric, train_dir)
        curves = {key: losses.get(key, []) for key in CURVE_KEYS}

        # 2) Draw each curve against its own 1‐based checkpoint index, so a
        #    curve that is shorter than the others (or empty) cannot trigger a
        #    length-mismatch error.
        plt.figure(figsize=(8, 5))
        for key, values in curves.items():
            plt.plot(list(range(1, len(values) + 1)), values, label=key)

        # 3) Mark the checkpoint with the lowest value of the reference curve.
        #    NOTE(review): "lowest is best" is applied to every metric,
        #    including val_cosine — confirm it is a loss/distance and not a
        #    similarity.
        ref_curve = curves[ref_key]
        if ref_curve:
            best_idx = min(range(len(ref_curve)), key=ref_curve.__getitem__)
            # + 1 because the x‐axis is 1‐based
            plt.axvline(x=best_idx + 1, color="red", linestyle="--",
                        label=f"Best {vname} @ step {best_idx + 1}")
        else:
            # min() over an empty curve would raise; report and leave the
            # marker out instead.
            print(f"WARNING: {ds} best_by_{ref_metric} has no '{ref_key}' values; "
                  f"best step not marked")

        # 4) Labels, title, legend
        plt.xlabel("Validation Checkpoint Index")
        plt.ylabel("Loss / Metric Value")
        plt.title(f"{ds} — losses for model “best_by_{ref_metric}”")
        plt.legend()

        # 5) Save next to the JSON files and release the figure
        out_fname = os.path.join(
            train_dir,
            f"{ds}_best_by_{ref_metric}_losses_plot.png"
        )
        plt.tight_layout()
        plt.savefig(out_fname, dpi=150)
        plt.close()

        print(f"Saved plot for {ds} best_by_{ref_metric} → {out_fname}")

print("All plots generated.")


In [None]:
import os
import json
import matplotlib.pyplot as plt

# Base directory containing all dataset‐pair folders (e.g. “cna_from_multi”, etc.)
BASE_DIR = "../results/32"

# Output folder for combined images
IMAGES_DIR = "../results/images/32"
os.makedirs(IMAGES_DIR, exist_ok=True)

# One panel per reference metric:
#   (reference metric, JSON key of the curve that selects the best step, panel title)
PANELS = (
    ("mse",      "val_mse",          "Best by MSE"),
    ("cosine",   "val_cosine",       "Best by Cosine"),
    ("timestep", "val_mse_timestep", "Best by Timestep‐MSE"),
)

# Curves drawn in every panel, in legend order.
CURVE_KEYS = ("val_mse", "val_cosine", "val_mse_timestep")


def load_losses(metric_name, directory=None):
    """
    Load the losses JSON saved for the model "best_by_<metric_name>".

    Args:
        metric_name: suffix used in the file name, e.g. "mse", "cosine", "timestep".
        directory:   folder that holds the JSON. When omitted, the module-level
                     `train_dir` is used (this keeps the one-argument call working).

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: if the JSON file does not exist.
    """
    if directory is None:
        directory = train_dir
    path = os.path.join(directory, f"best_by_{metric_name}_losses.json")
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Could not find {path}")
    with open(path, "r") as f:
        return json.load(f)


# List all dataset‐pair subfolders under BASE_DIR
dataset_pairs = [
    name for name in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, name))
]

for ds in dataset_pairs:
    train_dir = os.path.join(BASE_DIR, ds, "train")
    if not os.path.isdir(train_dir):
        # Skip if there is no "train" folder inside this dataset-pair
        continue

    # Load all three JSONs before opening a figure, so a missing file cannot
    # leave a half-built figure open.
    panel_losses = [load_losses(ref_metric, train_dir) for ref_metric, _, _ in PANELS]

    # One figure with 3 panels side by side, sharing the y‐axis
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5), sharey=True)

    for idx, ((ref_metric, ref_key, panel_title), losses) in enumerate(zip(PANELS, panel_losses)):
        # Missing keys fall back to an empty curve.
        curves = {key: losses.get(key, []) for key in CURVE_KEYS}

        ax = axes[idx]
        # Each curve gets its own 1‐based checkpoint index, so a shorter or
        # empty curve cannot trigger a length-mismatch error.
        for key, values in curves.items():
            ax.plot(list(range(1, len(values) + 1)), values, label=key)

        # Mark the step with the lowest value of the reference curve.
        # NOTE(review): "lowest is best" is applied to every metric, including
        # val_cosine — confirm it is a loss/distance and not a similarity.
        ref_curve = curves[ref_key]
        if ref_curve:
            best_idx = min(range(len(ref_curve)), key=ref_curve.__getitem__)
            # + 1 because steps are 1‐based
            ax.axvline(x=best_idx + 1, color="red", linestyle="--",
                       label=f"Best @ step {best_idx + 1}")
        else:
            # min() over an empty curve would raise; report and leave the
            # marker out instead.
            print(f"WARNING: {ds} best_by_{ref_metric} has no '{ref_key}' values; "
                  f"best step not marked")

        ax.set_title(panel_title)
        ax.set_xlabel("Validation Checkpoint")
        if idx == 0:
            # The y‐axis is shared, so only the left-most panel is labelled.
            ax.set_ylabel("Loss / Metric Value")

        ax.legend(fontsize="small")

    # Overall figure title; the rect leaves room for it above the panels
    fig.suptitle(f"{ds} — Validation Metrics for best_by_<metric> Models", fontsize=16)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

    # Save to <IMAGES_DIR>/<dataset pair>.png
    out_path = os.path.join(IMAGES_DIR, f"{ds}.png")
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

    print(f"Saved combined figure for {ds} → {out_path}")

print("All combined figures generated.")


In [None]:
import os
import json
import matplotlib.pyplot as plt

# Base directory containing all dataset‐pair folders
BASE_DIR = "../results/32"

# Output directory for images
IMAGES_DIR = "../results/images/32"
os.makedirs(IMAGES_DIR, exist_ok=True)

# One column per reference metric:
#   (reference metric, JSON key of the curve that selects the best step, panel title)
PANELS = (
    ("mse",      "val_mse",          "Best by MSE"),
    ("cosine",   "val_cosine",       "Best by Cosine"),
    ("timestep", "val_mse_timestep", "Best by Timestep‐MSE"),
)

# Curves drawn in every panel, in legend order.
CURVE_KEYS = ("val_mse", "val_cosine", "val_mse_timestep")

# All dataset‐pair names that have a 'train' subfolder. Sorted, because
# os.listdir returns entries in arbitrary order and the row order of the
# figure should be reproducible.
dataset_pairs = sorted(
    name for name in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, name, "train"))
)

# Number of dataset-pairs (= number of rows in the figure)
num_datasets = len(dataset_pairs)

if num_datasets == 0:
    # plt.subplots cannot build a grid with zero rows, so report and stop here.
    print(f"No dataset-pair folders with a 'train' subfolder found under {BASE_DIR}; nothing to plot.")
else:
    # Load every JSON before opening the figure, so a missing file cannot
    # leave a half-built figure open.
    all_losses = {}
    for ds in dataset_pairs:
        train_dir = os.path.join(BASE_DIR, ds, "train")
        for ref_metric, _, _ in PANELS:
            json_path = os.path.join(train_dir, f"best_by_{ref_metric}_losses.json")
            with open(json_path, "r") as f:
                all_losses[ds, ref_metric] = json.load(f)

    # One row per dataset, three columns (best_by_mse, best_by_cosine, best_by_timestep).
    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(
        nrows=num_datasets, ncols=3,
        figsize=(18, 5 * num_datasets),
        sharey=True, squeeze=False,
    )

    for row_idx, ds in enumerate(dataset_pairs):
        for col_idx, (ref_metric, ref_key, panel_title) in enumerate(PANELS):
            losses = all_losses[ds, ref_metric]
            # Missing keys fall back to an empty curve.
            curves = {key: losses.get(key, []) for key in CURVE_KEYS}

            ax = axes[row_idx, col_idx]
            # Each curve gets its own 1-based checkpoint index, so a shorter
            # or empty curve cannot trigger a length-mismatch error.
            for key, values in curves.items():
                ax.plot(list(range(1, len(values) + 1)), values, label=key)

            # Mark the step with the lowest value of the reference curve.
            # NOTE(review): "lowest is best" is applied to every metric,
            # including val_cosine — confirm it is a loss/distance and not a
            # similarity.
            ref_curve = curves[ref_key]
            if ref_curve:
                best_idx = min(range(len(ref_curve)), key=ref_curve.__getitem__)
                # + 1 because steps are 1‐based
                ax.axvline(x=best_idx + 1, color="red", linestyle="--",
                           label=f"Best Step {best_idx + 1}")
            else:
                # min() over an empty curve would raise; report and leave the
                # marker out instead.
                print(f"WARNING: {ds} best_by_{ref_metric} has no '{ref_key}' values; "
                      f"best step not marked")

            # Fix y-axis range between 0 and 2
            ax.set_ylim(0, 2)

            if col_idx == 0:
                # The dataset name doubles as the row label.
                ax.set_ylabel(ds)
            ax.set_title(panel_title)
            if row_idx == num_datasets - 1:
                # Only the bottom row carries the x-axis label.
                ax.set_xlabel("Checkpoint Index")

            ax.legend(fontsize="small")

    fig.suptitle("All Experiments: Validation Metrics for Best Models", fontsize=18)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

    # Save combined figure
    out_path = os.path.join(IMAGES_DIR, "all_experiments.png")
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

    print(f"Saved combined figure for all experiments → {out_path}")


In [None]:
import pathlib

import torch


# Results folder of the experiment whose checkpoint is inspected below.
path = pathlib.Path('../results/32/cna_from_wsi')

# read checkpoint dictionary
ckpt_path = path / 'train/best_by_mse.pth'
# NOTE(review): torch.load deserializes the file via pickle, so only open
# checkpoints that come from a trusted source.
# map_location='cpu' loads the stored tensors onto the CPU.
ckpt = torch.load(ckpt_path, map_location='cpu')

In [None]:
# Show the loaded checkpoint object as this cell's output.
ckpt

In [None]:
# load from checkpoint
from types import SimpleNamespace


# The "config" entry is unpacked as keyword arguments, so every field becomes
# readable as an attribute (config.<field>).
raw_cfg = ckpt["config"]
config = SimpleNamespace(**raw_cfg)
# Entry stored under "best_model_mse" — presumably the weights of the model
# with the best MSE; verify against the training script.
state_dict = ckpt["best_model_mse"]

In [None]:
# Show the value stored under the checkpoint's "best_loss" key.
ckpt["best_loss"]

In [None]:
# Show the object taken from the checkpoint's "best_model_mse" entry.
state_dict

In [None]:
import os
import json
import pandas as pd

# Base directory containing all dataset‐pair folders
BASE_DIR = "../results/32"

# This cell only builds a table; it creates no plots or image folders.

# List all dataset‐pair names that have a 'train' subfolder (sorted, because
# os.listdir returns entries in arbitrary order).
try:
    dataset_pairs = sorted(
        name for name in os.listdir(BASE_DIR)
        if os.path.isdir(os.path.join(BASE_DIR, name, "train"))
    )
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is meant for the interactive
    # shell and, inside a notebook, stops the kernel rather than reporting
    # the problem in this cell.
    raise FileNotFoundError(
        f"The base directory '{BASE_DIR}' was not found."
    ) from err

# One dict per experiment that has usable validation MSE data
epoch_results = []

print(f"Searching for best epoch in {len(dataset_pairs)} experiments based on MSE...")

# Iterate through each experiment to find the best epoch by MSE
for experiment_name in dataset_pairs:
    train_dir = os.path.join(BASE_DIR, experiment_name, "train")

    # We only care about the model selected using the MSE metric
    json_path = os.path.join(train_dir, "best_by_mse_losses.json")

    if not os.path.exists(json_path):
        print(f"  - WARNING: Skipping '{experiment_name}', file not found: {json_path}")
        continue

    with open(json_path, "r") as f:
        losses = json.load(f)

    # Validation MSE values, one per validation checkpoint
    val_mse_list = losses.get("val_mse", [])

    if not val_mse_list:
        print(f"  - WARNING: Skipping '{experiment_name}', no 'val_mse' data in JSON.")
        continue

    # Index of the minimum validation MSE
    best_idx = min(range(len(val_mse_list)), key=val_mse_list.__getitem__)

    epoch_results.append({
        "Experiment": experiment_name,
        # The "epoch" is the checkpoint index, reported 1-based
        "Best Epoch (by MSE)": best_idx + 1,
        # Kept numeric so the column can still be sorted or aggregated;
        # rounding to 6 decimals happens only when the table is printed.
        "Min Validation MSE": float(val_mse_list[best_idx]),
    })

# --- Display the results in a table ---
if epoch_results:
    df_results = pd.DataFrame(epoch_results)
    print("\n--- Best Epoch Results (based on minimum validation MSE) ---")
    print(df_results.to_string(index=False, float_format="{:.6f}".format))
else:
    print("\nNo valid experiment results were found.")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to every figure below
sns.set_theme(style="whitegrid")

# Columns treated as categorical / numerical in this analysis
categorical_params = ['architecture', 'batch_size', 'learning_rate']
numerical_params = ['n_layers', 'initial_size', 'bottleneck_size', 'time_embedding_dimension', 'cond_embedding_dim']

# --- 1. Categorical hyperparameters: how often does each value occur in mse_df? ---
print("--- Analysis of Categorical Hyperparameters ---")

for param in categorical_params:
    # Frequency of each value (value_counts orders them most frequent first)
    counts = mse_df[param].value_counts()
    print(f"\nValue counts for '{param}':")
    print(counts)

    # Horizontal count plot, bars in the same order as the printed counts
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=mse_df, y=param, order=counts.index, palette="viridis", ax=ax)
    ax.set_title(f'Frequency of Best Hyperparameter: {param}')
    ax.set_xlabel('Number of Best Models (by MSE)')
    ax.set_ylabel(param)
    plt.show()

# --- 2. Numerical hyperparameters: summary statistics and distributions ---
print("\n--- Analysis of Numerical Hyperparameters ---")

print("\nDescriptive Statistics for Numerical Parameters:")
numeric_view = mse_df[numerical_params]
display(numeric_view.describe())

# One histogram (10 bins, with KDE overlay) per numerical parameter
for param in numerical_params:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(mse_df[param], kde=True, bins=10, ax=ax)
    ax.set_title(f'Distribution of Best Hyperparameter: {param}')
    ax.set_xlabel(param)
    ax.set_ylabel('Frequency')
    plt.show()

# --- 3. Hyperparameters versus best_loss ---
print("\n--- Relationship between Hyperparameters and Performance ---")

# Categorical parameters: box plot of best_loss per value
for param in categorical_params:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=mse_df, x=param, y='best_loss', palette="coolwarm", ax=ax)
    ax.set_title(f'Loss Distribution by {param}')
    ax.set_xlabel(param)
    ax.set_ylabel('Best MSE Loss')
    plt.show()

# Numerical parameters: scatter plot of best_loss against the value
for param in numerical_params:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(data=mse_df, x=param, y='best_loss', alpha=0.7, ax=ax)
    ax.set_title(f'Loss vs. {param}')
    ax.set_xlabel(param)
    ax.set_ylabel('Best MSE Loss')
    plt.show()


# --- 4. Pairwise correlation between the numerical hyperparameters ---
print("\n--- Correlation Analysis of Numerical Hyperparameters ---")

corr_matrix = mse_df[numerical_params].corr()

# Annotated heatmap of the correlation matrix (values shown with 2 decimals)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Best Numerical Hyperparameters')
plt.show()