In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotnine as pn

from ..utils.const import FIGURE_PATH, OUTPUT_PATH

# Make sure the folders for this analysis' figures and tables exist
# (missing parent folders are created as well; existing folders are left alone).
figure_dir = Path(FIGURE_PATH, "bench_meta")
figure_dir.mkdir(parents=True, exist_ok=True)

output_dir = Path(OUTPUT_PATH, "bench_meta")
output_dir.mkdir(parents=True, exist_ok=True)

# Benchmark result tables to combine: dataset label -> benchmark folder.
# NOTE(review): the results are read from a literal "output" folder, while this
# notebook's own output directory is built from OUTPUT_PATH — confirm the two
# are meant to differ.
bench_dirs = {
    "ms": "ms_bench",
    "ms_xenium": "ms_xenium_bench",
    "lupus": "lupus_bench",
}

# One frame per benchmark, each tagged with the dataset it was read for.
df_0, df_1, df_2 = (
    pd.read_csv(Path("output") / bench_dir / "results.csv").assign(dataset=dataset)
    for dataset, bench_dir in bench_dirs.items()
)

# ignore_index=True gives the stacked frame unique row labels; without it every
# source file contributes its own 0..n-1 labels, and duplicated labels break
# label-based alignment and lookups later on.
df = pd.concat([df_0, df_1, df_2], ignore_index=True)

# "setting" identifies one benchmark configuration as "<dataset>__<celltype>".
df["setting"] = df["dataset"] + "__" + df["celltype"]

# Z-scale explained variance and runtime within each setting:
# subtract the setting's mean and divide by the setting's standard deviation.
metrics = ["varexpl", "time"]
per_setting = df.groupby("setting")[metrics]

# Per-setting statistics, one row per setting, columns "<metric>_mean" / "<metric>_std".
mean_df = per_setting.mean().add_suffix("_mean")
std_df = per_setting.std().add_suffix("_std")

# Broadcast the per-setting statistics back onto every individual run.
df = df.join(mean_df, on="setting").join(std_df, on="setting")

for metric in metrics:
    centered = df[metric] - df[f"{metric}_mean"]
    df[f"{metric}_z_scaled"] = centered / df[f"{metric}_std"]

z_cols = ["varexpl_z_scaled", "time_z_scaled"]

# Step 1: average the z-scores of every (init_alg, optim_alg) pair within each setting.
df_summary = (
    df.groupby(["setting", "init_alg", "optim_alg"])[z_cols]
    .mean()
    .reset_index()
)

# Step 2: mean and standard deviation of those per-setting averages across settings.
df_summary_agg = (
    df_summary.groupby(["init_alg", "optim_alg"])[z_cols]
    .agg(["mean", "std"])
    .reset_index()
)

# Flatten the two-level column labels into "<column>_<statistic>"; the group keys
# have an empty second level, so the dangling underscore is stripped.
df_summary_agg.columns = ["_".join(levels).strip("_") for levels in df_summary_agg.columns]

# Band of one standard deviation around the mean for each metric.
for z_col in z_cols:
    center = df_summary_agg[f"{z_col}_mean"]
    spread = df_summary_agg[f"{z_col}_std"]
    df_summary_agg[f"{z_col}_low"] = center - spread
    df_summary_agg[f"{z_col}_high"] = center + spread

# Hex colours keyed by algorithm identifier.
# NOTE(review): presumably these keys match values of "optim_alg" — verify against the plotting code.
color_map = {
    "frank_wolfe": "#DAA520",
    "projected_gradients": "#006400",
}

# Unique combinations of the columns that describe a benchmark setting.
df_settings = (
    df[["celltype", "dataset", "setting", "n_samples", "n_dimensions", "n_archetypes"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
df_settings["n_samples_log10"] = np.log10(df_settings["n_samples"])

# Human-readable dataset labels; a dataset key missing from this mapping yields NaN.
# Fixed label typo: "Spinal Coord" -> "Spinal Cord".
df_settings["dataset_long"] = df_settings["dataset"].map({
    "ms": "10X Chromium White Matter\nMultiple Sclerosis",
    "ms_xenium": "10X Xenium Spinal Cord\nMultiple Sclerosis",
    "lupus": "10X Chromium PBMC\nSystemic Lupus Erythematosus",
})

In [2]:
# Show the settings overview table (last expression of the cell is rendered as its output).
df_settings

Unnamed: 0,celltype,dataset,setting,n_samples,n_dimensions,n_archetypes,n_samples_log10,dataset_long
0,MG,ms,ms__MG,9239,10,6,3.965625,10X Chromium White Matter\nMultiple Sclerosis
1,AS,ms,ms__AS,13987,10,11,4.145725,10X Chromium White Matter\nMultiple Sclerosis
2,OL,ms,ms__OL,64834,10,5,4.811803,10X Chromium White Matter\nMultiple Sclerosis
3,OPC,ms,ms__OPC,3945,10,5,3.596047,10X Chromium White Matter\nMultiple Sclerosis
4,NEU,ms,ms__NEU,8171,10,9,3.912275,10X Chromium White Matter\nMultiple Sclerosis
5,EC,ms,ms__EC,2230,10,4,3.348305,10X Chromium White Matter\nMultiple Sclerosis
6,Oligo,ms_xenium,ms_xenium__Oligo,153038,10,4,5.184799,10X Xenium Spinal Coord\nMultiple Sclerosis
7,Astrocyte,ms_xenium,ms_xenium__Astrocyte,150751,10,4,5.17826,10X Xenium Spinal Coord\nMultiple Sclerosis
8,Myeloid,ms_xenium,ms_xenium__Myeloid,147478,10,5,5.168727,10X Xenium Spinal Coord\nMultiple Sclerosis
9,Vascular,ms_xenium,ms_xenium__Vascular,112456,10,4,5.050983,10X Xenium Spinal Coord\nMultiple Sclerosis
