In [None]:
"""Notebook to analyze the values in an HDF5 file."""

# %pip list | grep "ka"
# pylint: disable=redefined-outer-name, expression-not-assigned, import-error, not-callable, pointless-statement, no-value-for-parameter, undefined-variable, unused-argument, use-dict-literal, too-many-lines, too-many-branches

In [None]:
%load_ext autoreload
%autoreload 2

## SETUP

In [None]:
from __future__ import annotations

import copy
import json
import tarfile
from pathlib import Path
from typing import IO, Dict, Iterable, List, Tuple

import h5py
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

from epiclass.core.data_source import EpiDataSource  # pylint: disable=unused-import
from epiclass.core.epiatlas_treatment import (  # pylint: disable=unused-import
    ACCEPTED_TRACKS,
)
from epiclass.core.hdf5_loader import Hdf5Loader
from epiclass.core.metadata import Metadata
from epiclass.utils.bed_utils import bed_to_bins, bins_to_bed_ranges

ASSAY = "assay_epiclass"
TRACK_TYPE = "track_type"
CELL_TYPE = "harmonized_sample_ontology_intermediate"

In [None]:
%matplotlib inline

In [None]:
# base = Path("/lustre06/project/6007017/rabyj/epilap/input/")
base = Path.home() / "Projects/epiclass"
input_base = base / "input"
output_base = base / "output"

chromsize_path = input_base / "chromsizes" / "hg38.noy.chrom.sizes"
metadata_path = (
    input_base
    / "metadata/dfreeze-v2/hg38_2023-epiatlas-dfreeze_v2.1_w_encode_noncore_2.json"
)

base_logdir = output_base / "logs"
logdir = base_logdir / "epiatlas-dfreeze-v2.1/hdf5_stats"

# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
logdir.mkdir(parents=True, exist_ok=True)

In [None]:
paper_dir = output_base / "paper"
table_dir = paper_dir / "tables"

In [None]:
# (chromosome name, size) pairs loaded from the chrom sizes file.
chromsizes: List[Tuple[str, int]] = EpiDataSource.load_external_chrom_file(chromsize_path)

# Chromosome names only, sorted lexicographically.
chroms: List[str] = sorted(name for name, _size in chromsizes)

In [None]:
metadata = Metadata(metadata_path)
metadata_df = metadata.to_df()

## Global bin metrics analysis

e.g. mean/stddev or median/IQR of the raw hdf5 values, or other data like ChromScore or CNV.

In [None]:
# hdf5_list_path = input_base / "hdf5_list" / "100kb_all_none_10samples.list"
# hdf5_list_path = (
#     input_base
#     / "hdf5_list"
#     / "hg38_2023-01-epiatlas-freeze"
#     / "100kb_all_none_0blklst.list"
# )

# datasource = EpiDataSource(hdf5_list_path, chromsize_path, metadata_path)
# my_meta = Metadata(datasource.metadata_file)
# my_meta.display_labels("track_type")

# my_meta.select_category_subsets("track_type", ACCEPTED_TRACKS)
# my_meta.display_labels("track_type")

# paths = Hdf5Loader.read_list(hdf5_list_path)

In [None]:
def read_hdf5_sizes(hdf5_path: Path | str, chroms: List[str]) -> Dict[str, int]:
    """Return the length of each chromosome dataset stored in an HDF5 file.

    The datasets are looked up under the first top-level key of the file.

    Args:
        hdf5_path: Path of the HDF5 file to inspect.
        chroms: Chromosome names to look up under the first top-level key.

    Returns:
        Mapping of chromosome name to the length of its dataset.
    """
    with h5py.File(hdf5_path, "r") as file:
        header = list(file.keys())[0]
        hdf5_data = file[header]
        # len() on the dataset only reads its shape; no need to load the
        # values into memory just to count them.
        chrom_lengths = {chrom: len(hdf5_data[chrom]) for chrom in chroms}  # type: ignore
    return chrom_lengths

In [None]:
def read_important_features(global_task_features_path) -> Dict[str, List[int]]:
    """Load the important features JSON file and return its parsed content."""
    with open(global_task_features_path, "r", encoding="utf8") as handle:
        return json.load(handle)

In [None]:
def read_cancer_important_bins(dir_path: Path | str | None = None) -> Dict[str, List[int]]:
    """Read the cancer important bins from the two "bed-details" TSV files.

    Args:
        dir_path: Directory holding the TSV files. Defaults to
            ``~/scratch/epiclass/join_important_features/global_info/cancer``.

    Returns:
        Mapping of feature set name to the content of the "bin" column (4th
        column) of the matching file.
    """
    if dir_path is None:
        dir_path = (
            Path.home() / "scratch/epiclass/join_important_features/global_info/cancer"
        )
    dir_path = Path(dir_path)

    # Result key -> file name. The keys say "sampling" while the files say
    # "samplings"; both spellings are kept as they were.
    bed_details_files = {
        "cancer_intersection_merge_sampling_blood_subset": "cancer_intersection_merge_samplings_bed-details_blood_subset.tsv",
        "cancer_intersection_merge_sampling": "cancer_intersection_merge_samplings_bed-details_2.tsv",
    }

    index_dict: Dict[str, List[int]] = {}
    for set_name, filename in bed_details_files.items():
        df = pd.read_csv(
            dir_path / filename,
            names=["chr", "start", "end", "bin", "details"],
            sep="\t",
        )
        index_dict[set_name] = list(df["bin"])

    return index_dict

In [None]:
def plot_important_features_metrics(
    important_features: Dict[str, List[int]],
    npz_file_path: Path,
    logdir: Path | None = None,
    include_categories: Iterable[str] | None = None,
) -> Dict[str, Tuple[int, float, float, float]]:
    """Using the important features positions, plot (violin) the mean values according to the given npz file.

    For each category, one figure is created with three violins: the selected
    features, a random feature set of the same size, and the global distribution.

    A two-sample KS test compares the selected features to the random set and to
    the global distribution; both p-values are annotated on the plot. A warning
    is printed when the random set itself differs from the global distribution
    (p < 0.05).

    Args:
    - important_features: A dictionary with category names as keys, and lists of feature positions as values.
    - npz_file_path: The path to the npz file containing the bin metrics (only the "mean" array is used).
    - logdir: The directory where to save the plots (html, png, svg). Nothing is saved if None.
    - include_categories: The categories to include in the plot. None or empty means all categories.

    Returns:
    - A dictionary with category names as keys, and tuples
      (N, p-value selected vs random, p-value selected vs global, p-value random vs global) as values.
    """
    with np.load(npz_file_path) as data:
        means = np.array(data["mean"], dtype=np.float64)

    # Materialize the filter once: a one-shot iterator would otherwise be
    # exhausted by the first membership test and hide all later categories.
    if include_categories is None:
        wanted_categories = set()
    elif isinstance(include_categories, str):
        wanted_categories = {include_categories}
    else:
        wanted_categories = set(include_categories)

    def _add_violin(fig, values, label: str) -> None:
        """Add one violin trace, styled like all the others, to the figure."""
        fig.add_trace(
            go.Violin(
                y=values,
                name=label,
                box_visible=True,
                meanline_visible=True,
                points="all",
            )
        )

    def _pval_text(pval: float) -> str:
        """Format a p-value for the plot annotations."""
        # The label only switches to "<<" once the value is 10x under 0.001.
        if pval < 0.0001:
            return " << 0.001"
        return f" = {pval:.3f}"

    pvals = {}
    for category_name, features_pos in important_features.items():
        if wanted_categories and category_name not in wanted_categories:
            continue

        fig = go.Figure()
        N = len(features_pos)

        selected_features = np.asarray(means[list(features_pos)], dtype=np.float64)
        _add_violin(fig, selected_features, f"{category_name} features (N={len(features_pos)})")

        # Random features comparison (seed reset so every category is reproducible)
        np.random.seed(42)
        random_features = np.random.choice(means, size=N, replace=False)
        _add_violin(fig, random_features, f"Random features (N={N})")
        _, pval_random = stats.ks_2samp(selected_features, random_features)
        annot_random = _pval_text(pval_random)

        # Global distribution comparison
        _add_violin(fig, means, f"All features N={len(means)}")
        _, pval_global = stats.ks_2samp(selected_features, means)
        annot_global = _pval_text(pval_global)

        # Annotations for p-values, placed just above the top of each compared violin
        fig.add_annotation(
            x=1,
            y=random_features.max() + 0.02,
            text=f"p-val {annot_random} (Selected vs. Random)",
            showarrow=False,
            xref="x",
            yref="y",
        )
        fig.add_annotation(
            x=2,
            y=means.max() + 0.02,
            text=f"p-val {annot_global} (Selected vs. Global)",
            showarrow=False,
            xref="x",
            yref="y",
        )

        # Small points
        fig.update_traces(marker=dict(size=2))

        fig.update_layout(
            title=f"Mean values for {category_name} features",
            xaxis_title="Feature set",
            yaxis_title="Mean values",
            violinmode="group",
            width=1200,
            height=800,
        )

        # sanity check, pval random vs global
        _, pval_random_vs_global = stats.ks_2samp(random_features, means)
        if pval_random_vs_global < 0.05:
            print(f"WARNING: pval_random_vs_global: {pval_random_vs_global}")

        pvals[category_name] = (N, pval_random, pval_global, pval_random_vs_global)

        if logdir:
            fig.write_html(logdir / f"{npz_file_path.stem}_{category_name}_violin.html")
            fig.write_image(logdir / f"{npz_file_path.stem}_{category_name}_violin.png")
            fig.write_image(logdir / f"{npz_file_path.stem}_{category_name}_violin.svg")

        fig.show()

    return pvals

## SHAP values: important regions vs genes

In [None]:
shap_regions_general_dir = table_dir / "dfreeze_v2/100kb_all_none/SHAP-MLP"
if not shap_regions_general_dir.exists():
    raise FileNotFoundError(f"Directory {shap_regions_general_dir} does not exist.")

In [None]:
# Column names given to the headerless "*_intersect_gff.tsv" files read in this
# notebook: the 100kb region (first three columns), what look like the nine
# standard GFF fields, then the overlap size in bp.
gff_intersect_cols = [
    "chromosome",
    "start_100kb",
    "end_100kb",
    "seqname",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attribute",
    "overlap (bp)",
]

In [None]:
BED_COLS = ["chromosome", "start_100kb", "end_100kb"]

### BIOSPECIMENS

#### Read important bins values, and find possible classes for each unique bin.

In [None]:
features_dir = shap_regions_general_dir / "cell_type"
features_file = features_dir / "select_beds_top303.tar.gz"

# cell type -> bin indices read from its "merge_samplings" bed file in the archive
ct_important_bins: Dict[str, List[int]] = {}
with tarfile.open(features_file, "r:gz") as tar:
    for member in tar.getmembers():
        filename = member.name
        if not ("merge_samplings" in filename and filename.endswith("bed")):
            continue

        # The cell type is the second path component, stripped of its
        # prefix/suffix and lowercased.
        cell_type = filename.split("/")[1]
        for token in ("merge_samplings_", "_features.bed"):
            cell_type = cell_type.replace(token, "")
        cell_type = cell_type.lower()

        file_obj: IO[bytes] = tar.extractfile(member)  # type: ignore
        ct_important_bins[cell_type] = bed_to_bins(
            file_obj, chroms=chromsizes, resolution=100 * 1000
        )

In [None]:
# Sorted union of the important bins of every cell type.
all_bins = sorted(set().union(*ct_important_bins.values()))

In [None]:
# Find relevant cell types for each bin, optimized for pandas future vectorization
relevant_pairs_list = []
for cell_type, bins_list in ct_important_bins.items():
    for bin_idx in bins_list:
        relevant_pairs_list.append({"bin_index": bin_idx, CELL_TYPE: cell_type})

bin_to_relevant_ct_df = pd.DataFrame(relevant_pairs_list)
bin_to_relevant_ct_df["bin_index"] = bin_to_relevant_ct_df["bin_index"].astype(int)

assert bin_to_relevant_ct_df.shape[0] > len(ct_important_bins)

In [None]:
print(bin_to_relevant_ct_df.shape)
print(bin_to_relevant_ct_df["bin_index"].nunique())
print(bin_to_relevant_ct_df[CELL_TYPE].value_counts(dropna=False))

In [None]:
classifier_cell_types = set(bin_to_relevant_ct_df[CELL_TYPE].unique())
assert len(classifier_cell_types) == 16

##### Creating a new table with associated genes + cell types for each region.

The final desired format is  
chr, start, end, cell_types, genes

Reformat the bin / cell type associations so that each bin has its full bed coordinates and its aggregated cell types.

In [None]:
# Map each unique bin index to its bed range.
# NOTE(review): the zip below assumes bins_to_bed_ranges returns one range per
# bin, in the same order as all_bins — confirm.
bed_ranges = bins_to_bed_ranges(all_bins, chromsizes, resolution=100 * 1000)
bin_to_bed_dict = dict(zip(all_bins, bed_ranges))
# Every unique bin of the association table must have exactly one bed range.
assert (
    len(bin_to_bed_dict) == len(all_bins) == bin_to_relevant_ct_df["bin_index"].nunique()
)

In [None]:
assert len(bin_to_bed_dict) == bin_to_relevant_ct_df["bin_index"].nunique()

In [None]:
# Bed range of each row's bin.
bed_range = bin_to_relevant_ct_df["bin_index"].map(bin_to_bed_dict)  # type: ignore

# Expand the three-part ranges into columns in one step: a row-wise
# `.apply(pd.Series)` builds one Series per row and is much slower. No
# temporary "bed_range" column is added, so nothing has to be dropped in place.
bed_coords = pd.DataFrame(
    bed_range.tolist(),
    columns=["chr", "start", "end"],
    index=bin_to_relevant_ct_df.index,
)
bin_to_relevant_ct_df[["chr", "start", "end"]] = bed_coords

# Human-readable "chr:start-end" label.
bin_to_relevant_ct_df["region"] = (
    bin_to_relevant_ct_df["chr"].astype(str)
    + ":"
    + bin_to_relevant_ct_df["start"].astype(str)
    + "-"
    + bin_to_relevant_ct_df["end"].astype(str)
)

In [None]:
# All these columns identify the same region: redundant, but kept in the output.
groupby_cols = ["bin_index", "chr", "start", "end", "region"]


def _join_unique_sorted(values) -> str:
    """Join the distinct values, sorted, with ';'."""
    return ";".join(str(value) for value in sorted(set(values)))


# One row per region, with the cell types associated to it.
grouped_ct_df = (
    bin_to_relevant_ct_df.groupby(groupby_cols)[CELL_TYPE]
    .agg(_join_unique_sorted)
    .reset_index()
    .rename(
        columns={
            CELL_TYPE: "SHAP-MLP_associated_biospecimens",
            "bin_index": "bin_index_100kb",
        }
    )
)

In [None]:
assert grouped_ct_df.shape[0] == grouped_ct_df["bin_index_100kb"].nunique()

Reformat gff intersection file

We will make a version with minimal info, aggregated gene_id + feature_type,  
and another version that keeps details

In [None]:
# Load the headerless gff intersection TSV, naming its columns with gff_intersect_cols.
gene_intersect_filepath = features_dir / "global_union_features_intersect_gff.tsv"
gene_intersect_df = pd.read_csv(
    gene_intersect_filepath,
    sep="\t",
    header=None,
    names=gff_intersect_cols,
    index_col=False,
)

# Redundant (seqname) or empty (score,frame)
gene_intersect_df = gene_intersect_df.drop(columns=["seqname", "score", "frame"])

In [None]:
print(f"Nb of genes: {gene_intersect_df.shape[0]}")
print(f"Nb of 100kb regions: {grouped_ct_df['bin_index_100kb'].nunique()}")

Full details

In [None]:
# Right merge: every region of grouped_ct_df is kept, whether or not it has a
# matching row in the gff intersection table.
# The "start"/"end" columns exist on both sides, hence the suffixes.
merged_gff = pd.merge(
    gene_intersect_df,
    grouped_ct_df,
    how="right",
    left_on=["chromosome", "start_100kb", "end_100kb"],
    right_on=["chr", "start", "end"],
    suffixes=("_gff_feature", "_ct_df"),
)

# Replace missing coordinates with grouped ct coordinates
# (rows without a gff match have no value in the left-hand region columns).
merged_gff[["chromosome", "start_100kb", "end_100kb"]] = merged_gff[
    ["chr", "start_ct_df", "end_ct_df"]
]
merged_gff.drop(columns=["chr", "start_ct_df", "end_ct_df"], inplace=True)

In [None]:
# Length of each gff feature.
# NOTE(review): the +1 presumably accounts for inclusive end coordinates — confirm.
merged_gff["feature_length"] = (
    merged_gff["end_gff_feature"] - merged_gff["start_gff_feature"] + 1
)
# Overlap size expressed as a fraction of the feature length.
merged_gff["overlap (fraction)"] = (
    merged_gff["overlap (bp)"] / merged_gff["feature_length"]
)

In [None]:
# Rename some columns
merged_gff.rename(
    columns={"source": "gene_DB_source", "feature": "gene_type"}, inplace=True
)

In [None]:
col_order = [
    "bin_index_100kb",
    "chromosome",
    "start_100kb",
    "end_100kb",
    "region",
    "gene_DB_source",
    "gene_type",
    "start_gff_feature",
    "end_gff_feature",
    "feature_length",
    "overlap (bp)",
    "overlap (fraction)",
    "SHAP-MLP_associated_biospecimens",
    "attribute",
]
merged_gff = merged_gff[col_order]

# Int32 is nullable, not int32
merged_gff = merged_gff.astype(
    {
        "start_100kb": "Int32",
        "end_100kb": "Int32",
        "bin_index_100kb": "Int32",
        "start_gff_feature": "Int32",
        "end_gff_feature": "Int32",
        "feature_length": "Int32",
        "overlap (bp)": "Int32",
    },
)

In [None]:
merged_gff.to_csv(
    features_dir / "global_union_features_intersect_gff_with_ct.tsv",
    sep="\t",
    index=False,
)

Minimal details, aggregated gene info.

Start by reformatting the gff intersection file to have one line per region.

In [None]:
gene_intersect_df.rename(columns={"feature": "gene_type"}, inplace=True)

In [None]:
# # Keep only relevant columns
gene_intersect_df = gene_intersect_df[
    ["chromosome", "start_100kb", "end_100kb", "gene_type", "attribute"]
].copy()

In [None]:
gene_intersect_df["attribute"] = gene_intersect_df["attribute"].str.split(
    ";", expand=True
)[0]

In [None]:
# The gene ID is what follows "ID=gene:" in the attribute text.
# NOTE(review): rows whose attribute lacks "ID=gene:" presumably end up with a
# missing gene ID — confirm this is acceptable.
gene_intersect_df["gene_IDs"] = gene_intersect_df["attribute"].str.split(
    "ID=gene:", expand=True
)[1]
gene_intersect_df.drop(columns=["attribute"], inplace=True)

In [None]:
# One row per 100kb region, with its gene IDs and gene types joined by ';'
# (row order within a region is preserved, duplicates included).
dup_cols = ["chromosome", "start_100kb", "end_100kb"]
grouped_gene_df = (
    gene_intersect_df.groupby(dup_cols)[["gene_IDs", "gene_type"]]
    .agg(lambda column: ";".join(str(value) for value in column))
    .reset_index()
)

In [None]:
grouped_gene_df = grouped_gene_df.merge(
    grouped_ct_df,
    how="right",
    left_on=["chromosome", "start_100kb", "end_100kb"],
    right_on=["chr", "start", "end"],
    suffixes=("_gene_df", "_ct_df"),
)

In [None]:
grouped_gene_df.drop(columns=["chromosome", "start_100kb", "end_100kb"], inplace=True)
grouped_gene_df.rename(columns={"start": "start_100kb", "end": "end_100kb"}, inplace=True)

In [None]:
grouped_gene_df

In [None]:
col_order = [
    "bin_index_100kb",
    "chr",
    "start_100kb",
    "end_100kb",
    "region",
    "SHAP-MLP_associated_biospecimens",
    "gene_IDs",
    "gene_type",
]
grouped_gene_df = grouped_gene_df[col_order]
grouped_gene_df.to_csv(
    features_dir / "global_union_features_intersect_gff_aggregated_with_ct.tsv",
    sep="\t",
    index=False,
)

### SEX/CANCER TOP SHAP intersect

In [None]:
def gff_intersect_aggregate(
    gff_intersect_df: pd.DataFrame, regions_df: pd.DataFrame, verbose: bool = False
) -> pd.DataFrame:
    """Collapse a gff intersection table so that each region appears only once.

    Gene IDs (text after "ID=gene:" in the first ';'-separated attribute entry)
    and gene types are joined with ';' per region. Regions are taken from
    `regions_df` (left merge), so a region without any intersecting feature is
    kept and gets the literal string "nan" in both joined columns.

    Args:
        gff_intersect_df: Intersection table, with the region columns and the gff columns.
        regions_df: Regions to report, with "bin_index_100kb" and the region columns.
        verbose: Print the number of rows of both tables.

    Returns:
        One row per region with columns bin_index_100kb, chromosome, start_100kb,
        end_100kb, region, gene_IDs, gene_type. The inputs are not modified.
    """
    region_cols = ["chromosome", "start_100kb", "end_100kb"]

    first_attribute = gff_intersect_df["attribute"].str.split(";", expand=True)[0]
    gene_ids = first_attribute.str.split("ID=gene:", expand=True)[1]

    intersect_df = (
        gff_intersect_df.drop(
            columns=[
                "seqname",
                "score",
                "frame",
                "overlap (bp)",
                "strand",
                "source",
                "attribute",
            ]
        )
        .assign(gene_IDs=gene_ids)
        .rename(
            columns={
                "start": "feature_start",
                "end": "feature_end",
                "feature": "gene_type",
            }
        )
    )

    if verbose:
        print(f"Regions: {regions_df.shape[0]}")
        print(f"Intersect: {intersect_df.shape[0]}")

    def _join_values(values) -> str:
        """Join all values with ';', keeping order and duplicates."""
        return ";".join(str(value) for value in values)

    aggregated_df = (
        regions_df.merge(intersect_df, how="left", on=region_cols)
        .groupby(["bin_index_100kb", *region_cols])[["gene_IDs", "gene_type"]]
        .agg(_join_values)
        .reset_index()
    )

    aggregated_df["region"] = (
        aggregated_df["chromosome"].astype(str)
        + ":"
        + aggregated_df["start_100kb"].astype(str)
        + "-"
        + aggregated_df["end_100kb"].astype(str)
    )

    # Final column order
    ordered_cols = ["bin_index_100kb", *region_cols, "region", "gene_IDs", "gene_type"]
    return aggregated_df[ordered_cols]


# miaw

In [None]:
sex_dir = shap_regions_general_dir / "sex"
cancer_dir = shap_regions_general_dir / "cancer"

# For each task: path of the gff intersection table, path of the bed file of
# regions, and the bin indices computed from that bed file.
sex_intersect_path = sex_dir / "sex_intersection_merge_samplings_intersect_gff.tsv"
sex_regions_path = sex_dir / "sex_intersection_merge_samplings.bed"
sex_idx = bed_to_bins(sex_regions_path, chroms=chromsizes, resolution=100 * 1000)

cancer_intersect_path = (
    cancer_dir / "cancer_intersection_merge_samplings_intersect_gff.tsv"
)
cancer_regions_path = cancer_dir / "cancer_intersection_merge_samplings.bed"
cancer_idx = bed_to_bins(cancer_regions_path, chroms=chromsizes, resolution=100 * 1000)

# Both file kinds are headerless; column names are supplied here.
sex_intersect_df = pd.read_csv(
    sex_intersect_path, sep="\t", header=None, index_col=False, names=gff_intersect_cols
)
sex_regions_df = pd.read_csv(
    sex_regions_path, sep="\t", header=None, index_col=False, names=BED_COLS
)
# NOTE(review): assumes bed_to_bins returns one index per bed line, in file order — confirm.
sex_regions_df["bin_index_100kb"] = sex_idx

cancer_intersect_df = pd.read_csv(
    cancer_intersect_path,
    sep="\t",
    header=None,
    index_col=False,
    names=gff_intersect_cols,
)
cancer_regions_df = pd.read_csv(
    cancer_regions_path, sep="\t", header=None, index_col=False, names=BED_COLS
)
# NOTE(review): same ordering assumption as for the sex regions above.
cancer_regions_df["bin_index_100kb"] = cancer_idx

In [None]:
# Aggregate the gff intersections of each task and save one TSV per task.
intersect_inputs = [
    ("sex", sex_intersect_df, sex_regions_df),
    ("cancer", cancer_intersect_df, cancer_regions_df),
]
for name, df_intersect, df_regions in intersect_inputs:
    merged_df = gff_intersect_aggregate(df_intersect, df_regions)

    output_path = shap_regions_general_dir / name / f"{name}_intersect_gff_aggregated.tsv"
    merged_df.to_csv(output_path, sep="\t", index=False)

## ChromScore hdf5 values

For each important cell type feature (bin), find the average ChromScore value across the files of its associated cell types.  
If a bin is important for multiple classes, use the files of all those classes.

### Read ChromScore values, and map files to their cell type.

Using hdf5 output from `bigwig_metrics.py`.

In [None]:
chromscore_dir = paper_dir / "data" / "ChromScore"
chromscore_file = chromscore_dir / "max_metrics_clean.h5"

# Prefer the cleaned file; otherwise fall back on the raw metrics file and
# flag it so that its values get cleaned afterwards.
to_clean = False
if not chromscore_file.exists():
    print(f"{chromscore_file} does not exist")
    chromscore_file = chromscore_dir / "max_metrics.h5"
    to_clean = True

# Neither file is available: nothing to load.
if not chromscore_file.exists():
    raise FileNotFoundError(f"{chromscore_file} does not exist")

print(f"Loading {chromscore_file}")
chromscores_df: pd.DataFrame = pd.read_hdf(chromscore_file)  # type: ignore
print("Chromscores shape", chromscores_df.shape)
display(chromscores_df.head(n=2))

In [None]:
def transform_chromscore_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean up the values read from the .h5 file and return the cleaned frame.

    - Print the number of missing values of each column that has some
    - If values in .h5 were stored as lists (with one value), get that value
    - fillna as 0

    The given dataframe is never modified; always use the returned one.
    """
    # Report missing values before they get replaced.
    nan_counts = df.isna().sum()
    for col, nb_nan in nan_counts[nan_counts > 0].items():
        print(col, nb_nan)

    # Only the first cell is inspected to decide whether values are wrapped in lists.
    if not df.empty and isinstance(df.iloc[0, 0], list):
        # DataFrame.map only exists in pandas >= 2.1 (applymap before that).
        elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
        # Cells that are not lists (e.g. missing values) are passed through as is.
        df = elementwise(df, lambda x: x[0] if isinstance(x, list) else x)

    return df.fillna(0)

In [None]:
# Only runs when the raw file was loaded: clean the values, then save them as
# the "clean" file so that a later run can load it directly.
if to_clean:
    print("Chromscores shape", chromscores_df.shape)
    chromscores_df = transform_chromscore_df(chromscores_df)
    print("Chromscores shape", chromscores_df.shape)

    # mode="w" overwrites any existing file at that path.
    chromscore_file = chromscore_dir / "max_metrics_clean.h5"
    chromscores_df.to_hdf(
        chromscore_file,
        key="df",
        mode="w",
        format="fixed",
        complevel=9,
    )

In [None]:
chromscores_df["epirr"] = chromscores_df.index.str.split(".").str[0]

In [None]:
# Create a mapping from epirr_id_without_version to cell_type
epirr_to_cell_type = dict(
    metadata_df.loc[:, ["epirr_id_without_version", CELL_TYPE]].values
)

In [None]:
chromscores_df[CELL_TYPE] = (
    chromscores_df["epirr"].map(epirr_to_cell_type).str.replace(" ", "_").str.lower()
)

In [None]:
# Only keep files from classifier 16ct
chromscores_df = chromscores_df[chromscores_df[CELL_TYPE].isin(classifier_cell_types)]
print("Chromscores shape", chromscores_df.shape)
display(chromscores_df.head(n=2))

### Find mean chromScore for each bin (for their relevant files)

In [None]:
# Melting global chromscores for future operation, now all values are in one column
# Assuming all columns starting with chr are bins, and all 100kb bins are present
region_cols_mapper = {
    col: idx
    for idx, col in enumerate(chromscores_df.columns)
    if isinstance(col, str) and col.startswith("chr")
}
assert len(region_cols_mapper) == 30321

In [None]:
chromscores_df.rename(columns=region_cols_mapper, inplace=True)  # type: ignore

In [None]:
# Long format: one row per (file, bin) pair, with the chromscore value of that bin.
# NOTE(review): "filename" is not listed in id_vars, so melt does not keep it.
melted_chromscores = chromscores_df.reset_index().rename(columns={"index": "filename"})
melted_chromscores = melted_chromscores.melt(
    id_vars=["epirr", CELL_TYPE],
    value_vars=list(region_cols_mapper.values()),
    var_name="bin_index",
    value_name="chromscore_value",
)
melted_chromscores["bin_index"] = melted_chromscores["bin_index"].astype(int)
# Quick overview: shape, number of distinct bins and of distinct epirr ids.
print(melted_chromscores.shape)
print(melted_chromscores["bin_index"].nunique())
print(melted_chromscores["epirr"].nunique())
display(melted_chromscores.head(n=2))

In [None]:
# Merge melted chromscores with bin_to_relevant_ct_df, efficient for pandas
# Filters out all irrelevant bins
merged_df = pd.merge(
    melted_chromscores, bin_to_relevant_ct_df, on=["bin_index", CELL_TYPE], how="inner"
)
print(merged_df.shape)
print(merged_df["bin_index"].nunique())
print(merged_df["epirr"].nunique())
display(merged_df.head(n=2))

In [None]:
# Keep only columns of interest for plotting
merged_df = merged_df[["epirr", CELL_TYPE, "bin_index", "chromscore_value"]]

### Plot

In [None]:
# From here on, `logdir` points to the chromscore figures directory.
logdir = paper_dir / "figures" / "chromscore"
# exist_ok avoids the check-then-create race and makes the cell safely re-runnable.
logdir.mkdir(parents=True, exist_ok=True)

# plot_important_features_chromscore_global(
#     avg_selected=avg,
#     all_avg=all_avg,
#     logdir=logdir,
# )

In [None]:
def test_distribution(
    x: List[float], y: List[float], verbose: bool = True
) -> Tuple[float, float]:
    """Test for distribution difference. x is used as reference for the number of samples.

    Welch's t-test and Brunner-Munzel test are computed.

    Returns:
        Tuple[float, float]: p-value for each test
    """
    if verbose:
        print(f"Number of samples in x: {len(x)}")
        print(f"Number of samples in y: {len(y)}")

    Welch_pval = stats.ttest_ind(
        a=x,
        b=y,
        equal_var=False,
        alternative="two-sided",
        nan_policy="raise",
    ).pvalue  # type: ignore

    # def statistic(x, y, axis):
    #     return np.mean(x, axis=axis) - np.mean(y, axis=axis)

    # perm_pval = stats.permutation_test(
    #     data=(x, y),
    #     statistic=statistic,
    #     permutation_type="independent",
    #     alternative="two-sided",
    #     n_resamples=9999,  # type: ignore
    #     random_state=42,
    # ).pvalue

    BM_pval = stats.brunnermunzel(
        x,
        y,
        alternative="two-sided",
        nan_policy="raise",
        distribution="t",
    ).pvalue

    return Welch_pval, BM_pval


def define_pval_label(pval: float) -> str:
    """Return the significance label of a p-value.

    Labels: "<0.001***", "<0.01**", "<0.05*", or ">0.05 NS" from 0.05 up.
    An empty string is returned when no comparison holds (e.g. NaN).
    """
    thresholds = ((0.001, "<0.001***"), (0.01, "<0.01**"), (0.05, "<0.05*"))
    for threshold, label in thresholds:
        if pval < threshold:
            return label

    return ">0.05 NS" if pval >= 0.05 else ""

In [None]:
def prepare_chromscore_per_biospecimen_data(
    selected_bins_df: pd.DataFrame,
    all_chromscores_df: pd.DataFrame,
) -> Dict[str, Dict[str, List[float] | int]]:
    """
    Prepare plot data for chromscore per biospecimen plot.

    Each biospecimen need values for:
    - important features
    - random features
    - global distribution

    This is done with independent file subsets: only the rows of a biospecimen
    are used to compute its values.

    Args:
        selected_bins_df: Rows (epirr, biospecimen, bin_index, chromscore_value)
            restricted to the important bins of each biospecimen.
        all_chromscores_df: Same columns, for all bins.

    Returns:
        One entry per biospecimen, holding
        - "avg_per_bin": mean value of each important bin (Series indexed by bin)
        - "random_features_means": mean value of each randomly drawn bin
        - "all_means_file_subset": mean value of every bin, for the biospecimen rows
        - "nb_files": number of distinct epirr in the selected rows
        plus an "all_files" entry: mean value of every bin over all rows (Series).

    Raises:
        ValueError: If a biospecimen has missing chromscore values.
    """
    # Random bins are drawn among the bin indices that actually exist, rather
    # than from range(number of bins), which is only right for contiguous
    # zero-based indices. Sorting keeps the draw identical in that usual case.
    all_bin_indices = np.sort(np.asarray(all_chromscores_df["bin_index"].unique()))

    grouped_means = {}

    for biospecimen, df in selected_bins_df.groupby(by=CELL_TYPE):
        print(f"Processing {biospecimen}")

        all_chromscores_group = all_chromscores_df.loc[
            all_chromscores_df[CELL_TYPE] == biospecimen, :
        ]
        if all_chromscores_group["chromscore_value"].isna().any():
            raise ValueError(f"Missing values in chromscore_value for {biospecimen}")
        nb_files = df["epirr"].nunique()

        # Important features
        avg_per_bin = df.groupby("bin_index")["chromscore_value"].mean()
        nb_features = len(avg_per_bin)

        # Random features, as many as important ones (seed reset for each
        # biospecimen so that each draw is reproducible on its own)
        np.random.seed(42)
        random_features_idx = np.random.choice(
            all_bin_indices, size=nb_features, replace=False
        )

        random_df = all_chromscores_group.loc[
            all_chromscores_group["bin_index"].isin(random_features_idx)
        ]
        random_features_means = random_df.groupby("bin_index")["chromscore_value"].mean()

        # Global distribution
        all_means_file_subset = all_chromscores_group.groupby("bin_index")[
            "chromscore_value"
        ].mean()

        grouped_means[biospecimen] = {
            "avg_per_bin": avg_per_bin,
            "random_features_means": random_features_means,
            "all_means_file_subset": all_means_file_subset,
            "nb_files": nb_files,
        }

    grouped_means["all_files"] = all_chromscores_df.groupby("bin_index")[
        "chromscore_value"
    ].mean()

    return grouped_means

In [None]:
# Sanity check, missing values
groupby = merged_df.groupby(CELL_TYPE)["chromscore_value"].apply(lambda x: x.isna().sum())
if not groupby.sum() == 0:
    display(groupby)
    raise ValueError("Missing values in chromscore_value")

This next cell can take more than 1 minute.

In [None]:
graph_data = prepare_chromscore_per_biospecimen_data(
    selected_bins_df=merged_df, all_chromscores_df=melted_chromscores
)

del merged_df, melted_chromscores

In [None]:
def plot_important_features_chromscore_global(
    avg_selected: pd.DataFrame,
    all_avg: pd.DataFrame,
    logdir: Path | None = None,
) -> None:
    """
    Plot violins for important features, random features and all features.

    Args:
        avg_selected: Values of the important features.
        all_avg: Values of all features. The random features are drawn from it,
            as many as there are important features (seed 42).
            NOTE(review): np.random.choice needs 1-dimensional data here,
            despite the DataFrame annotation — confirm what callers pass.
        logdir: Where to save the figure (svg, png, html). Not saved if None.
    """
    fig = go.Figure()

    # Styling shared by the three violins.
    violin_style = dict(
        box_visible=True,
        meanline_visible=True,
        points=False,
        spanmode="hard",
    )

    # Important features
    # This trace used to be a go.Box, which does not accept the violin-only
    # properties above (box_visible, meanline_visible, points, spanmode).
    print("Tracing important features")
    N = len(avg_selected)
    fig.add_trace(
        go.Violin(
            y=avg_selected,
            name=f"Important features per biospecimen (N={N})",
            **violin_style,
        )
    )

    # Random features comparison
    print("Computing random features")
    np.random.seed(42)
    random_features_means = np.random.choice(all_avg, size=N, replace=False)

    print("Tracing random features")
    N = len(random_features_means)
    fig.add_trace(
        go.Violin(
            y=random_features_means,
            name=f"Random features, value on all files (N={N})",
            **violin_style,
        )
    )

    # Global distribution comparison
    print("Tracing all features")
    fig.add_trace(
        go.Violin(
            y=all_avg,
            name=f"All features, all files (N={len(all_avg)})",
            **violin_style,
        )
    )

    fig.update_yaxes(range=[0, 1])

    fig.update_layout(
        title="Important cell type features chromScore",
        xaxis_title="Feature set",
        yaxis_title="Average max value in selected regions of 100kb",
        violinmode="group",
        width=800,
        height=600,
    )

    fig.show()

    if logdir is not None:
        print("Saving figure.")
        name = "important_features_16ct_max_chromscore_100kb"
        fig.write_image(logdir / f"{name}.svg")
        fig.write_image(logdir / f"{name}.png")
        fig.write_html(logdir / f"{name}.html")
In [None]:
def plot_chromscore_global_violin(
    graph_data: Dict[str, Dict[str, List[float] | int]],
    cell_types: List[str] | None = None,
    logdir: Path | None = None,
    filename: str = "chromscore_global_violin",
) -> None:
    """
    Plot a split violin: important features (pooled over cell types) vs all features.

    The left half pools the "avg_per_bin" values of every selected cell type,
    the right half shows the values stored under graph_data["all_files"].
    The p-value of a two-sided Welch's t-test between both sets is printed.

    Args:
        graph_data: Dict[str, Dict[str, List[float] | int]]. From prepare_chromscore_per_biospecimen_data.
            Must contain an "all_files" entry, which is used directly as the
            values of the global distribution.
        cell_types: List[str]|None. List of cell types to pool. Defaults to
            every key of graph_data except "all_files".
        logdir: Path|None. If given, the figure is saved there as svg, png and html.
        filename: str. Basename (without extension) of the saved files.

    Raises:
        KeyError: If a requested cell type is missing from graph_data.
    """
    if not cell_types:
        # Only the keys are needed, so the (potentially large) data is not copied.
        cell_types = list(graph_data.keys())
        cell_types.remove("all_files")

    colors = px.colors.qualitative.Dark24[0:2]

    fig = go.Figure()

    # Filter
    try:
        selected = {biospecimen: graph_data[biospecimen] for biospecimen in cell_types}
    except KeyError as err:
        raise KeyError(
            f"A cell type is missing from the graph_data.\ncell types: {graph_data.keys()}.\nDesired: {cell_types}."
        ) from err

    # Pool the important-feature values of all selected cell types.
    important_features_vals: List[float] = []
    for subset in selected.values():
        avg_per_bin: List[float] = subset["avg_per_bin"]  # type: ignore
        important_features_vals.extend(avg_per_bin)
    N_subsets = len(important_features_vals)

    all_files_vals = graph_data["all_files"]
    big_N = len(all_files_vals)

    # (side, values, color, legendgroup). The legendgroup must match the group
    # of the dummy legend entries added below, otherwise clicking a legend
    # item does not toggle the corresponding violin half.
    halves = (
        ("negative", important_features_vals, colors[0], "Important SHAP features"),
        ("positive", all_files_vals, colors[1], "All features"),
    )
    for side, values, color, legend_group in halves:
        fig.add_trace(
            go.Violin(
                name="trace",  # same name on both halves -> drawn as one split violin
                legendgroup=legend_group,
                side=side,
                y=values,
                fillcolor=color,
                line=dict(color="black", width=1.5),
                showlegend=False,
                meanline_visible=True,
                points=False,
                spanmode="hard",
                box=dict(
                    visible=True,
                    fillcolor=color,
                    width=0.4,
                    line_width=1,
                ),
            ),
        )

    # Welch's t-test (unequal variances); NaN in either sample raises.
    pval = stats.ttest_ind(
        a=important_features_vals,
        b=all_files_vals,
        equal_var=False,
        alternative="two-sided",
        nan_policy="raise",
    ).pvalue
    print(f"pval Welch's T-test:{pval:.4f}")

    # Legend with dummy points
    for i, name in enumerate(
        [f"Important SHAP features ({N_subsets})", f"All features ({big_N})"]
    ):
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                name=name,
                # Group is the label without the "(N)" suffix.
                legendgroup=name.split("(")[0].strip(),
                showlegend=True,
                marker=dict(color=colors[i], symbol="square"),
            ),
        )

    fig.update_xaxes(showticklabels=False)

    fig.update_yaxes(range=[0, 1])

    fig.update_layout(
        yaxis_title="Average of max value in selected regions of 100kb (over files)",
        width=700,
        height=700,
    )

    # fig.update_layout(violingap=0, violinmode='overlay')

    fig.show()

    if logdir is not None:
        print("Saving figure.")
        fig.write_image(logdir / f"{filename}.svg")
        fig.write_image(logdir / f"{filename}.png", scale=1.5)
        fig.write_html(logdir / f"{filename}.html")

In [None]:
def plot_chromscore_per_biospecimen_box(
    graph_data: Dict[str, Dict[str, List[float] | int]],
    logdir: Path | None = None,
) -> None:
    """
    Plot boxplots for important features, random features and all features,
    using regions and files per biospecimen independently.

    One panel of a 4x4 grid is drawn per biospecimen, with three boxes each.
    Entries lacking any of the "avg_per_bin", "random_features_means" or
    "all_means_file_subset" keys are skipped, as is the "all_files" entry.
    The panel title carries a significance label computed from the largest
    p-value returned by `test_distribution` (defined elsewhere in this notebook).

    Args:
        graph_data: Per-biospecimen values, keyed by biospecimen name.
        logdir: If given, the figure is saved there as svg, png and html.
    """
    colors = px.colors.qualitative.Dark24_r[0:3]
    n_cols = 4

    fig = make_subplots(
        rows=4,
        cols=n_cols,
        shared_yaxes=True,
        vertical_spacing=0.1,
        y_title="Average max value in selected regions of 100kb",
    )

    # Counts drawn panels only, so skipped entries neither leave holes in the
    # grid nor push later panels outside of it.
    panel_idx = 0
    for biospecimen, data in graph_data.items():
        # The sibling violin plots read "all_files" as the global distribution,
        # not as a per-biospecimen record, so it is skipped by name here too.
        if biospecimen == "all_files":
            continue
        try:
            avg_per_bin: List[float] = data["avg_per_bin"]  # type: ignore
            random_features_means: List[float] = data["random_features_means"]  # type: ignore
            all_means_file_subset: List[float] = data["all_means_file_subset"]  # type: ignore
        except KeyError:
            continue

        nb_files = data["nb_files"]
        nb_features = len(avg_per_bin)

        position = dict(row=panel_idx // n_cols + 1, col=panel_idx % n_cols + 1)
        panel_idx += 1

        # Important features, random features comparison, global distribution
        # comparison (no outlier points for the latter).
        boxes = (
            (avg_per_bin, colors[0], {}),
            (random_features_means, colors[1], {}),
            (all_means_file_subset, colors[2], dict(boxpoints=False)),
        )
        for x_pos, (values, color, extra) in enumerate(boxes):
            fig.add_trace(
                go.Box(
                    y=values,
                    x=[x_pos] * len(values),
                    line_color=color,
                    showlegend=False,
                    **extra,
                ),
                **position,
            )

        pvals = test_distribution(
            x=avg_per_bin,
            y=all_means_file_subset,
        )
        print(f"{biospecimen}, {nb_features} features, {nb_files} files")
        # NOTE(review): the violin variant labels the same output "[Welch, BM]";
        # confirm which tests test_distribution actually returns.
        print(f"pvals [Welch, Permutation, BM]: {pvals}\n")

        # Use the largest p-value across tests for the label.
        pval = float(np.max(pvals))
        pval_symbol = define_pval_label(pval)

        group_name = f"{biospecimen}<br>({nb_features} features, {nb_files} files)<br>pval{pval_symbol}"

        fig.update_xaxes(
            showticklabels=False,
            title=group_name,
            **position,
        )

    # Legend with dummy points
    for i, name in zip(
        range(3),
        ["Important SHAP features", "Random features", "All features (whiskers=max/min)"],
    ):
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                name=name,
                legendgroup=name,
                showlegend=True,
                marker=dict(color=colors[i], symbol="square"),
            ),
        )

    fig.update_yaxes(range=[0, 1])

    fig.update_layout(
        title="ChromScore per biospecimen file subset",
        width=1200,
        height=1200,
        legend=dict(
            itemsizing="constant",
        ),
    )

    fig.show()

    if logdir is not None:
        print("Saving figure.")
        name = "important_features_16ct_max_chromscore_100kb_per_biospecimen_boxplot"
        fig.write_image(logdir / f"{name}.svg")
        fig.write_image(logdir / f"{name}.png")
        fig.write_html(logdir / f"{name}.html")

In [None]:
# plot_chromscore_per_biospecimen_box(
#     graph_data=graph_data,
#     logdir=logdir,
# )

In [None]:
def plot_chromscore_per_biospecimen_violin(
    graph_data: Dict[str, Dict[str, List[float] | int]],
    cell_types: List[str] | None = None,
    logdir: Path | None = None,
    do_subplots: bool = True,
    filename: str = "important_features_16ct_max_chromscore_100kb_per_biospecimen_2violin",
) -> pd.DataFrame:
    """
    Plot split violins of important features vs all features,
    using regions and files per biospecimen independently.

    For each biospecimen, the left half shows its "avg_per_bin" values and the
    right half its "all_means_file_subset" values. Both sets are compared with
    `test_distribution` (defined elsewhere in this notebook) and the largest
    p-value determines the significance label shown with the biospecimen.

    Args:
        graph_data: Dict[str, Dict[str, List[float] | int]]. From prepare_chromscore_per_biospecimen_data.
        cell_types: List[str]|None. List of cell types to plot. Defaults to
            every key of graph_data except "all_files".
        logdir: Path|None. If given, the figure is saved there as svg, png and html.
        do_subplots: bool. If True, one panel per biospecimen in a 4x4 grid.
            Otherwise all violins share a single set of axes.
        filename: str. Basename (without extension) of the saved files.

    Returns:
        pd.DataFrame. Dataframe with pvals.

    Raises:
        KeyError: If a requested cell type is missing from graph_data.
    """
    if not cell_types:
        # Only the keys are needed, so the (potentially large) data is not copied.
        cell_types = [name for name in graph_data if name != "all_files"]

    colors = px.colors.qualitative.Dark24[0:2]

    fig = go.Figure()
    if do_subplots:
        fig = make_subplots(
            rows=4,
            cols=4,
            shared_yaxes=True,
            vertical_spacing=0.1,
            y_title="Average of max value in selected regions of 100kb (over files)",
        )

    # Filter
    try:
        selected = {biospecimen: graph_data[biospecimen] for biospecimen in cell_types}
    except KeyError as err:
        raise KeyError(
            f"A cell type is missing from the graph_data.\ncell types: {graph_data.keys()}.\nDesired: {cell_types}."
        ) from err

    all_pvals = []
    trace_names = []
    for idx, (biospecimen, subset) in enumerate(selected.items()):
        avg_per_bin: List[float] = subset["avg_per_bin"]  # type: ignore
        all_means_file_subset: List[float] = subset["all_means_file_subset"]  # type: ignore

        nb_files = subset["nb_files"]
        nb_features = len(avg_per_bin)

        if do_subplots:
            placement_dict = {
                "row": idx // 4 + 1,
                "col": idx % 4 + 1,
            }
        else:
            placement_dict = {}

        # (side, values, color, legendgroup). The legendgroup must match the
        # name of the dummy legend entries added below, otherwise clicking a
        # legend item does not toggle the corresponding violin halves.
        halves = (
            ("negative", avg_per_bin, colors[0], "Important SHAP features"),
            ("positive", all_means_file_subset, colors[1], "All features"),
        )
        for side, values, color, legend_group in halves:
            fig.add_trace(
                go.Violin(
                    side=side,
                    # Same name on both halves -> drawn as one split violin.
                    name=f"trace{idx}",
                    y=values,
                    fillcolor=color,
                    line=dict(color="black", width=1.5 if do_subplots else 0),
                    showlegend=False,
                    meanline_visible=True,
                    points=False,
                    spanmode="hard",
                    legendgroup=legend_group,
                    box=dict(
                        visible=True,
                        fillcolor=color if do_subplots else "black",
                        width=0.4,
                        line_width=0.5 if do_subplots else 0,
                    ),
                    scalemode="width",  # occupy all possible space for subplots
                    scalegroup=f"trace{idx}",
                ),
                **placement_dict,  # type: ignore
            )

        pvals = test_distribution(
            x=avg_per_bin,
            y=all_means_file_subset,
            verbose=False,
        )
        print(f"{biospecimen}, {nb_features} features, {nb_files} files")
        print(f"pvals [Welch, BM]: {pvals}\n\n")

        all_pvals.append(
            [biospecimen, nb_files, nb_features, len(all_means_file_subset), *pvals]
        )

        # Use the largest p-value across tests for the label.
        pval = float(np.max(pvals))
        pval_symbol = define_pval_label(pval)

        if do_subplots:
            group_name = f"{biospecimen}<br>({nb_files} files, {nb_features} features)<br>p{pval_symbol}"
            fig.update_xaxes(
                showticklabels=False,
                title=group_name,
                **placement_dict,
            )
        else:
            group_name = f"{biospecimen} ({nb_files} files, {nb_features} features), p{pval_symbol}"
            trace_names.append(group_name)

    # Manually set names for traces (done before the legend dummies are added,
    # so every existing trace has a "trace{idx}" name).
    if not do_subplots:
        newnames = {f"trace{idx}": name for idx, name in enumerate(trace_names)}
        fig.for_each_trace(lambda t: t.update(name=newnames[t.name]))

    # Legend with dummy points
    for i, name in enumerate(["Important SHAP features", "All features"]):
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                name=name,
                legendgroup=name,
                showlegend=True,
                marker=dict(color=colors[i], symbol="square"),
            ),
        )

    fig.update_yaxes(range=[0, 1])

    fig.update_layout(
        title="ChromScore per biospecimen file subset",
        width=1200,
        height=1200,
        legend=dict(
            itemsizing="constant",
        ),
    )

    # fig.update_layout(violingap=0, violinmode='overlay')

    if not do_subplots:
        fig.update_layout(
            yaxis_title="Average of max value in selected regions of 100kb (over files)",
            xaxis_title="Biospecimen",
            width=700,
            height=700,
        )

    fig.show()

    if logdir is not None:
        print("Saving figure.")
        fig.write_image(logdir / f"{filename}.svg")
        fig.write_image(logdir / f"{filename}.png", scale=1.5)
        fig.write_html(logdir / f"{filename}.html")

    return pd.DataFrame(
        all_pvals,
        columns=[
            "biospecimen",
            "nb_files",
            "Nb features (N_1)",
            "Nb features global (N_2)",
            "pval_Welch",
            "pval_BM",
        ],
    )

In [None]:
# cell_types = ["t_cell", "neutrophil", "lymphocyte_of_b_lineage", "brain", "hepatocyte"]

# plot_chromscore_per_biospecimen_violin(
#     graph_data=graph_data,
#     do_subplots=False,
#     cell_types=cell_types,
#     # filename="v2_important_features_16ct_max_chromscore_100kb_per_biospecimen_2violin",
#     # logdir=logdir,
# )


# Draw one split violin per biospecimen (subplot grid), save the figure under
# `logdir`, and keep the returned per-biospecimen p-values.
pvals_df = plot_chromscore_per_biospecimen_violin(
    graph_data=graph_data,
    logdir=logdir,
    filename="v4_important_features_16ct_max_chromscore_100kb_per_biospecimen_2violin",
    do_subplots=True,
    # cell_types=cell_types,
)

In [None]:
# Index the p-values by biospecimen. Guarded and non-inplace so that re-running
# this cell is a no-op instead of a KeyError on the already-consumed column.
if "biospecimen" in pvals_df.columns:
    pvals_df = pvals_df.set_index("biospecimen")

In [None]:
# Bonferroni-style correction: multiply each p-value by the number of rows
# (one test per biospecimen) and cap the result at 1.
# Vectorized; unlike `min(1, x)`, a NaN p-value stays NaN instead of silently
# becoming 1.
pvals_df["corrected_pval_Welch"] = (pvals_df["pval_Welch"] * len(pvals_df)).clip(
    upper=1
)
pvals_df["corrected_pval_BM"] = (pvals_df["pval_BM"] * len(pvals_df)).clip(upper=1)

In [None]:
# Write the p-values table (index included) as CSV into `logdir`.
filepath = logdir.joinpath(
    "v4_important_features_16ct_max_chromscore_100kb_per_biospecimen_2violin_pvals.csv"
)
pvals_df.to_csv(path_or_buf=filepath, index=True)

In [None]:
# Single split violin: important features pooled over all cell types vs all features.
plot_chromscore_global_violin(
    graph_data=graph_data,
    filename="chromscore_16ct_global_2violin_v3",
    logdir=logdir,
)

## Other

In [None]:
# shap_md5s_path = input_base / "hdf5_list" / "md5_shap_assay_explain.list"
# with open(shap_md5s_path, "r", encoding="utf8") as f:
#     shap_md5s = set(f.read().splitlines())


def analyze_feature_vals(
    regions_dict: Dict[int, Tuple],
    md5s: List[str],
    hdf5_list: Path,
    logdir: Path,
    name: str,
    shap_md5s: List[str],
):
    """
    Generate and save a violin plot of provided feature values for the provided md5s, with some md5s highlighted.

    This function takes as input a list of md5s and a dictionary of regions, and generates a violin plot
    of the feature values for these md5s. It also highlights specific md5s by adding lines+markers for them.
    The function saves the plot as an HTML file and a PNG file in the provided log directory.

    Signals are loaded with normalization enabled, using the notebook-level `chromsize_path`.

    Args:
        regions_dict (Dict[int, Tuple]): A dictionary mapping region indices to their respective genomic coordinates.
            The first three items of each tuple are used to build the "chrom:start-end" label.
        md5s (List[str]): A list of md5s to analyze.
        hdf5_list (Path): Path to the list of hdf5 files to be used.
        logdir (Path): Directory where the resulting plot should be saved.
        name (str): Name used to save the resulting plot (will be part of the filename).
        shap_md5s (List[str]): Candidate md5s for highlighting. Up to three md5s present in both
            `md5s` and `shap_md5s` are highlighted, taken in sorted order so the choice is
            the same on every run.
    """
    hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)
    hdf5_loader.load_hdf5s(hdf5_list, md5s, strict=True)
    signals = hdf5_loader.signals
    N = len(signals)

    # Sorted before slicing: set iteration order is arbitrary, so slicing the
    # raw set would highlight different samples from one run to the next.
    nb_highlight = 3
    highlight_md5s = sorted(set(md5s) & set(shap_md5s))[:nb_highlight]

    traces = []
    highlight_values = {highlight_md5: [] for highlight_md5 in highlight_md5s}
    for region, region_bed in regions_dict.items():
        values = [signal[region] for signal in signals.values()]
        region_str = f"{region_bed[0]}:{region_bed[1]}-{region_bed[2]}"

        traces.append(
            go.Violin(
                y=values,
                name=region_str,
                points="all",
                box_visible=True,
                meanline_visible=True,
            )
        )

        for highlight_md5 in highlight_md5s:
            highlight_value = signals[highlight_md5][region]
            highlight_values[highlight_md5].append((region_str, highlight_value))

    # One (symbol, color) pair per highlighted sample.
    marker_formats = [("cross", "black"), ("circle", "blue"), ("diamond", "red")]
    for (highlight_md5, points), (symbol, color) in zip(
        highlight_values.items(), marker_formats
    ):
        if not points:
            # No regions were given: nothing to draw for this sample.
            continue
        x, y = zip(*points)
        traces.append(
            go.Scatter(
                x=x,
                y=y,
                mode="lines+markers",
                name=f"{highlight_md5}",
                marker={"size": 6, "symbol": symbol, "color": color},
            )
        )

    # Create the layout
    layout = go.Layout(
        title=f"Feature values distributions for {N} {name} samples (0blklst)",
        yaxis={"title": "z-score"},
        xaxis={"title": "Region"},
        showlegend=False,
    )

    # Create the figure with the data and layout
    fig = go.Figure(data=traces, layout=layout)
    fig.write_html(logdir / f"feature_values_{name}.html")

    # 4:3 image, with an integer pixel height.
    width = 1200
    fig.write_image(
        logdir / f"feature_values_{name}.png", width=width, height=width * 3 // 4
    )
    # fig.show()

In [None]:
def plot_single_file(hdf5_list_path, md5, zscore: bool = True):
    """Draw a violin plot of all feature values of one sample.

    The signal of `md5` is loaded from `hdf5_list_path` (normalized when
    `zscore` is True), plotted, written to "<md5>-<mode>.html" relative to the
    current working directory, and displayed.

    Args:
        hdf5_list_path: Path to the list of hdf5 files to load from.
        md5: md5 of the sample to plot.
        zscore: Whether the loader normalizes the signal; also selects the
            "z-scores" / "raw values" wording of the title and filename.
    """
    mode = "z-scores" if zscore else "raw values"

    signals = (
        Hdf5Loader(chrom_file=chromsize_path, normalization=zscore)
        .load_hdf5s(hdf5_list_path, [md5], strict=True)
        .signals
    )
    # Only one md5 was requested, so the first signal is the sample's.
    sample_values = list(signals.values())[0]

    fig = px.violin(
        data_frame=sample_values,
        box=True,
        points="all",
        title=f"Violin plot for {md5} {mode}",
    )
    fig.write_html(f"{md5}-{mode}.html")
    fig.show()

In [None]:
def evaluate_casting_error(filepath: Path | str, dataset_name: str):
    """Print the largest absolute change caused by reading a dataset as float32.

    The dataset is read once as stored and once cast to float32. The maximum
    absolute difference is always printed; when it exceeds 1e-4, the maximum
    stored value, the file path and the dataset name are printed as well.

    Args:
        filepath: Path of the HDF5 file, opened read-only.
        dataset_name: Name of the dataset inside the file.
    """
    tolerance = 1e-4
    with h5py.File(filepath, "r") as hdf5_file:
        stored: h5py.Dataset = hdf5_file[dataset_name]  # type: ignore
        original: np.ndarray = stored[:]  # type: ignore

        # Cast to float32 and compare max diff
        as_float32 = stored.astype(np.float32)[:]
        max_diff = np.max(np.abs(as_float32 - original))
        print(f"Max diff when casting: {max_diff}")

        if max_diff > tolerance:
            report = (
                "Induced casting error",
                f"Max value: {np.max(original)}",
                f"Filepath: {filepath}",
                f"Dataset name: {dataset_name}",
            )
            for line in report:
                print(line)


# traces = []
# for filepath in paths:
#     with h5py.File(filepath, "r+") as f:
#         for _, group in f.items():
#             for dataset_name, dataset in list(group.items()):
#                 # Extract the values from the dataset
#                 values = dataset[:]

#                 # Create a violin trace
#                 trace = go.Violin(y=values, name=dataset_name)

#                 # Add the trace to the data list
#                 traces.append(trace)

#                 evaluate_casting_error(filepath, dataset_name)

#     # Create the layout
#     layout = go.Layout(title="Violin Plots", yaxis={"title": "Values"})

#     # Create the figure with the data and layout
#     fig = go.Figure(data=traces, layout=layout)

#     # Show the violin plot
#     fig.show()
#     traces = []

In [None]:
def evaluate_descriptive_stats(
    df: pd.DataFrame, metadata_df: pd.DataFrame, metadata: Metadata, logdir: Path
):
    """Compute per-row descriptive statistics and plot each one per assay.

    `describe` is applied to every row of `df` with a dense set of percentiles,
    and the result is joined with `metadata_df`. For every statistic except
    count, mean and std, one violin plot (one violin per assay) is written to
    `logdir` as png and html.

    Args:
        df: Values to describe, one row per sample.
            NOTE(review): the index is exposed as "md5sum" in the hover data,
            so rows are presumably indexed by md5 -- confirm.
        metadata_df: Metadata joined onto the statistics; must provide the
            assay column used for grouping.
        metadata: Source of the assay labels, used to order the violins.
        logdir: Directory where the plots are written.

    Returns:
        The statistics joined with `metadata_df`.
    """
    percentiles = [0.01, *np.arange(0.05, 1, 0.05), 0.99, 0.999]
    described = df.apply(pd.DataFrame.describe, percentiles=percentiles, axis=1)  # type: ignore

    # Statistic names must be collected before the join adds metadata columns.
    plotted_metrics = set(described.columns.values) - {"count", "mean", "std"}
    stats_df = described.join(metadata_df)  # type: ignore

    # Create violin plots, one plot for each metric, and a violin for each assay (per plot)
    assay_labels = metadata.label_counter(ASSAY, verbose=False).keys()
    category_orders = {ASSAY: sorted(assay_labels)}
    for column in stats_df:
        if column in plotted_metrics:
            fig = px.violin(
                data_frame=stats_df,
                x=column,
                y=ASSAY,
                box=True,
                points="all",
                title=f"Violin plot for {column}",
                color=ASSAY,
                category_orders=category_orders,
                height=800,
                hover_data={"md5sum": (df.index)},
            )
            fig.write_image(logdir / f"100kb_all_none_hdf5_{column}.png")
            fig.write_html(logdir / f"100kb_all_none_hdf5_{column}.html")
    return stats_df

In [None]:
# # Assuming you have a list of arrays
# hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)
# signals = hdf5_loader.load_hdf5s(hdf5_list_path, md5s, strict=True).signals
# df = pd.DataFrame.from_dict(signals, orient="index")
# # df.head()