In [None]:
"""Workbook to format regularization tests data."""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from __future__ import annotations

import itertools
from pathlib import Path

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Root of the paper outputs under the user's home directory; the data and figure
# folders are sub-directories of it.
base_dir = Path.home().joinpath("Projects/epiclass/output/paper")
base_data_dir = base_dir.joinpath("data")
base_fig_dir = base_dir.joinpath("figures")
# The paper directory is the same location as the output root.
paper_dir = base_dir

In [5]:
# Folder holding one sub-directory per regularization test run.
reg_data_dir = base_data_dir.joinpath(
    "training_results/dfreeze_v2/hg38_100kb_all_none/harmonized_donor_sex_1l_3000n/regularization_tests"
)

# Stop right away when the results are not available locally, instead of
# failing later in the middle of the analysis.
if not reg_data_dir.exists():
    raise FileNotFoundError(f"Directory {reg_data_dir} does not exist.")

In [6]:
# Table of run metadata; it is joined to the weight data on "experimentKey" later.
comet_run_metadata = pd.read_csv(
    base_data_dir.joinpath(
        "training_results",
        "all_results_cometml_filtered_oversampling-fixed.csv",
    )
)

## Acquire/Merge regularization runs weight data

In [7]:
# Collect one single-row dataframe of weight statistics per regularization run.
data_frames = []

# Sort the run directories: `iterdir()` yields entries in arbitrary order, which
# would make the row order of the resulting dataframe (and of the CSV files
# written from it) differ between machines and runs.
for folder in sorted(reg_data_dir.iterdir()):
    if not folder.is_dir():
        continue

    split_folder = folder / "split0"
    weights_data_path = next(split_folder.glob("*weights_description.csv"), None)
    if weights_data_path is None:
        # A bare `next()` would raise an uninformative StopIteration here.
        raise FileNotFoundError(
            f"No '*weights_description.csv' file found in {split_folder}."
        )

    # Skip the first row of the file (presumably its own header - TODO confirm)
    # and name the two columns explicitly.
    weights_df = pd.read_csv(weights_data_path, skiprows=1, names=["metric", "value"])

    # The experiment key is the name of an entry inside the "EpiLaP" folder.
    exp_folder = split_folder / "EpiLaP"
    exp_entry = next(exp_folder.glob("*"), None)
    if exp_entry is None:
        raise FileNotFoundError(f"No experiment entry found in {exp_folder}.")
    exp_key = exp_entry.name

    # Transpose so that each metric becomes a column (one row per run), then
    # attach the identifiers used to join with the run metadata.
    weights_df = weights_df.set_index("metric").T
    weights_df["experimentKey"] = exp_key
    weights_df["folder_name"] = folder.name

    data_frames.append(weights_df)

if not data_frames:
    # `pd.concat` on an empty list fails with an unrelated-looking ValueError.
    raise FileNotFoundError(f"No run directories found in {reg_data_dir}.")

# Combine all runs into one dataframe
reg_data_df = pd.concat(data_frames, ignore_index=True)
reg_data_df.columns.name = None

In [8]:
# Left join: every run found on disk is kept, with its run metadata attached
# through the shared "experimentKey" column.
reg_runs_df = reg_data_df.merge(comet_run_metadata, how="left", on="experimentKey")

# Dropout rate is the complement of the keep probability.
reg_runs_df["hparams/dropout"] = 1 - reg_runs_df["hparams/keep_prob"]

In [9]:
# Columns placed first in the summary: run identifiers, regularization
# hyperparameters, then validation performance.
leading_columns = [
    "experimentKey",
    "folder_name",
    "hparams/dropout",
    "hparams/l1_scale",
    "hparams/l2_scale",
    "val_Accuracy",
    "val_F1Score",
]

# Weight metrics are every column of reg_data_df that is not already selected.
# Filtering by name (instead of slicing off the last column) avoids selecting
# "experimentKey" twice, which produced a duplicated column in the summary, and
# does not depend on the position of the identifier columns.
weight_metric_columns = [
    col for col in reg_data_df.columns if col not in leading_columns
]

summary_df = reg_runs_df[leading_columns + weight_metric_columns]

# Full table (all metadata columns) and the condensed summary, saved side by side.
reg_runs_df.to_csv(reg_data_dir / "weights_detail.csv", index=False)
summary_df.to_csv(reg_data_dir / "weights_detail_summary.csv", index=False)

## Weight distribution figure

In [92]:
# Per-run (dropout, l1_scale, accuracy) tuples and their image paths, kept in
# parallel lists; runs with L2 regularization are collected separately.
hyperparams = []
images = []
l2_image = []

# Read in this order for every run: accuracy, dropout, L1 scale, L2 scale.
value_columns = (
    "val_Accuracy",
    "hparams/dropout",
    "hparams/l1_scale",
    "hparams/l2_scale",
)

for folder in reg_data_dir.iterdir():
    if folder.is_dir():
        # Rows of the merged table that belong to this run folder
        run_rows = reg_runs_df[reg_runs_df["folder_name"] == folder.name]
        acc, dropout, l1_scale, l2_scale = (
            run_rows[column].values[0] for column in value_columns
        )

        # First PNG found in the split folder
        png_path = next((folder / "split0").glob("*.png"))

        if l2_scale > 0:
            # L2 runs do not belong to the dropout x L1 grid
            l2_image.append([png_path, (dropout, l2_scale, acc)])
        else:
            hyperparams.append((dropout, l1_scale, acc))
            images.append(png_path)

# Tabular view of the grid runs
hyperparams_df = pd.DataFrame(
    hyperparams, columns=["dropout", "l1_scale", "val_Accuracy"]
)

# Grid axes: dropout values as 2-decimal strings, L1 scales as sorted numbers
sorted_dropouts = sorted(hyperparams_df["dropout"].unique())
unique_dropouts = [format(val, ".2f") for val in sorted_dropouts]
unique_l1_scales = sorted(hyperparams_df["l1_scale"].unique())

In [12]:
def simple_crop(image, left=0, right=0, top=0, bottom=0):
    """
    Trim a fixed number of pixels off each edge of an image.

    Parameters:
    image: numpy array of the image; the first axis is rows, the second columns
    left: number of pixel columns dropped on the left edge
    right: number of pixel columns dropped on the right edge
    top: number of pixel rows dropped on the top edge
    bottom: number of pixel rows dropped on the bottom edge

    Returns:
    The cropped array, obtained by slicing the first two axes of `image`.
    """
    n_rows, n_cols = image.shape[:2]
    # Computing the end index from the size keeps a margin of 0 meaning
    # "up to the edge".
    row_span = slice(top, n_rows - bottom)
    col_span = slice(left, n_cols - right)
    return image[row_span, col_span]

In [None]:
# Pixel margins removed from every weight-distribution image before display
# (same values for the grid images and the L2 image).
crop_margins = {"left": 200, "right": 200, "top": 100, "bottom": 77}

# One subplot per (dropout, L1 scale) combination: rows = dropout, columns = L1.
N_x = len(unique_l1_scales)
N_y = len(unique_dropouts)
# squeeze=False keeps `axes` two-dimensional even when the grid has a single row
# or column, so `axes[row, col]` indexing always works.
fig, axes = plt.subplots(N_y, N_x, figsize=(15, 5), squeeze=False)

# Plot each image in the corresponding subplot
for (dropout, l1_scale, acc), image_path in zip(hyperparams, images):
    cropped_img = simple_crop(mpimg.imread(image_path), **crop_margins)

    # Rows are looked up with the same 2-decimal string used to build
    # `unique_dropouts`, which sidesteps float round-off in `1 - keep_prob`.
    row = unique_dropouts.index(f"{dropout:.2f}")
    col = unique_l1_scales.index(l1_scale)

    ax = axes[row, col]
    ax.imshow(cropped_img, aspect="equal")

    ax.text(0, -5, f"Acc={acc:.3f}", fontsize=8, color="black")

# The first L2 run, when there is one, is plotted in the last (bottom-right) subplot.
# NOTE(review): this assumes no L1 run occupies that grid position - confirm.
if l2_image:
    l2_path, (dropout, l2, acc) = l2_image[0]
    cropped_img = simple_crop(mpimg.imread(l2_path), **crop_margins)

    ax = axes[N_y - 1, N_x - 1]
    ax.imshow(cropped_img, aspect="equal")

    # Dropout comes from `1 - keep_prob`; rounding avoids printing binary
    # round-off artifacts such as 0.19999999999999996 in the label.
    dropout_label = f"{round(float(dropout), 2):g}"
    ax.text(
        0,
        -5,
        f"Acc={acc:.3f}\nL1=0\nD={dropout_label}\nL2={l2}",
        fontsize=8,
        color="black",
    )

# Create labels for the subplots
for i, j in itertools.product(range(N_y), range(N_x)):
    # Column titles (L1 scale) on the top row only
    if i == 0:
        l1_scale = unique_l1_scales[j]
        if float(l1_scale).is_integer():
            l1_scale = int(l1_scale)
        axes[i, j].set_title(f"L1: {l1_scale}", y=1.08)
    # Row labels (dropout, as a percentage) on the leftmost column only
    if j == 0:
        axes[i, j].set_ylabel(f"Dropout: {float(unique_dropouts[i]):.0%}")

    # Remove ticks
    axes[i, j].set_xticks([])
    axes[i, j].set_yticks([])

    for spine in axes[i, j].spines.values():
        spine.set_visible(False)

    # Set consistent size for all subplots
    axes[i, j].set_aspect("equal")

# Adjust layout, save as PNG and SVG next to the run data, and show the figure
plt.tight_layout()
plt.savefig(reg_data_dir / "regularization_tests.png", dpi=400)
plt.savefig(reg_data_dir / "regularization_tests.svg", dpi=400)
plt.show()