In [None]:
"""Notebook to analyze the values in an HDF5 file."""
# %pip list | grep "ka"
# pylint: disable=redefined-outer-name, expression-not-assigned, import-error, not-callable, pointless-statement, no-value-for-parameter, undefined-variable, unused-argument, use-dict-literal, too-many-lines

## SETUP

In [None]:
from __future__ import annotations

import gc
import json
import tarfile
from collections import Counter
from pathlib import Path
from typing import IO, Dict, Iterable, List, Sequence, Tuple

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px  # type: ignore
import plotly.graph_objects as go  # type: ignore
from plotly.subplots import make_subplots  # type: ignore
from scipy import stats

from epi_ml.core.data_source import EpiDataSource  # pylint: disable=unused-import
from epi_ml.core.epiatlas_treatment import (  # pylint: disable=unused-import
    ACCEPTED_TRACKS,
)
from epi_ml.core.hdf5_loader import Hdf5Loader
from epi_ml.core.metadata import Metadata
from epi_ml.utils.bed_utils import bed_to_bins, bins_to_bed_ranges

# Metadata category names. ASSAY and TRACK_TYPE are not used in the visible part of
# the notebook (presumably used further down — verify).
ASSAY = "assay_epiclass"
TRACK_TYPE = "track_type"
# Biospecimen label column: used below to index metadata_df and as a merge/groupby key.
CELL_TYPE = "harmonized_sample_ontology_intermediate"

In [None]:
%matplotlib inline

In [None]:
# base = Path("/lustre06/project/6007017/rabyj/epilap/input/")
# Root of the local project directory; the input/output paths below derive from it.
base = Path.home() / "Projects/epiclass"
input_base = base / "input"
output_base = base / "output"

chromsize_path = input_base / "chromsizes" / "hg38.noy.chrom.sizes"
metadata_path = (
    input_base
    / "metadata/dfreeze-v2/hg38_2023-epiatlas-dfreeze_v2.1_w_encode_noncore_2.json"
)

base_logdir = output_base / "logs"
# NOTE: `logdir` is rebound later in the notebook (ChromScore section).
logdir = base_logdir / "epiatlas-dfreeze-v2.1/hdf5_stats"

In [None]:
# Locations of paper material; table_dir is the root of the SHAP feature tables used below.
paper_dir = output_base / "paper"
table_dir = paper_dir / "tables"

In [None]:
# (chromosome name, size) pairs read from the chromosome sizes file.
chromsizes: List[Tuple[str, int]] = EpiDataSource.load_external_chrom_file(chromsize_path)

# Chromosome names in plain string order (so "chr10" sorts before "chr2").
chroms: List[str] = sorted([chrom for chrom, _ in chromsizes])

In [None]:
# Load the metadata file and keep a tabular view of it for column-wise lookups.
metadata = Metadata(metadata_path)
metadata_df = metadata.to_df()

## Global bin metrics analysis

E.g. mean/stddev or median/IQR of the raw HDF5 values, or other data like ChromScore or CNV.

In [None]:
# List of HDF5 files to inspect (alternative list kept below, commented out).
hdf5_list_path = input_base / "hdf5_list" / "100kb_all_none_10samples.list"
# hdf5_list_path = (
#     input_base
#     / "hdf5_list"
#     / "hg38_2023-01-epiatlas-freeze"
#     / "100kb_all_none_0blklst.list"
# )

# datasource = EpiDataSource(hdf5_list_path, chromsize_path, metadata_path)
# my_meta = Metadata(datasource.metadata_file)
# my_meta.display_labels("track_type")

# my_meta.select_category_subsets("track_type", ACCEPTED_TRACKS)
# my_meta.display_labels("track_type")

# `paths` is used as a mapping below: its values are passed as HDF5 file paths.
paths = Hdf5Loader.read_list(hdf5_list_path)

In [None]:
def read_hdf5_sizes(hdf5_path: Path | str, chroms: List[str]) -> Dict[str, int]:
    """Return the number of bins stored for each chromosome of an HDF5 file.

    The chromosome datasets are looked up under the first top-level key of the file.
    Only the dataset shapes are inspected, so no signal values are loaded in memory.

    Args:
        hdf5_path: Path of the HDF5 file to inspect.
        chroms: Names of the chromosome datasets to look up.

    Returns:
        A dict mapping each chromosome name to the length of its dataset (first axis),
        in the order given by `chroms`.

    Raises:
        IndexError: If the file has no top-level key.
        KeyError: If a chromosome dataset is missing.
    """
    with h5py.File(hdf5_path, "r") as file:
        header = list(file.keys())[0]
        hdf5_data = file[header]
        # `.shape` comes from the dataset metadata; `dataset[...]` would read all values.
        chrom_lengths = {chrom: int(hdf5_data[chrom].shape[0]) for chrom in chroms}  # type: ignore
    return chrom_lengths

In [None]:
# Only the per-chromosome bin counts are read from this one file; they are assumed
# to be identical for every file of the same resolution — TODO confirm.
a_file = list(paths.values())[0]
chr_bin_sizes = read_hdf5_sizes(a_file, chroms)

In [None]:
# Sanity check: number of chromosomes for which a bin count was read.
len(chr_bin_sizes)

In [None]:
def plot_feature_positions(
    feature_dict: Dict[str, Sequence[int]],
    chr_bin_sizes: Dict[str, int],
    output_name: str,
    logdir: Path,
):
    """Draw every feature set as a row of markers along the concatenated genome.

    Feature sets are stacked from the smallest (bottom) to the largest (top). Chromosome
    ends are marked with dash-dotted vertical lines (except the last one) and labelled
    with the chromosome name. The figure is written as html, png and svg, then shown.

    Args:
        feature_dict: Feature set name -> positions (global bin indexes) of its features.
        chr_bin_sizes: Chromosome name -> number of bins, in genome order. Must be at
            the same resolution as the positions of `feature_dict`.
        output_name: File name (without extension) of the saved figures.
        logdir: Directory in which the figures are saved.
    """
    fig = go.Figure(layout=go.Layout(autosize=False, width=1500, height=500))

    # One horizontal track per feature set, ordered by increasing set size.
    ordered_sets = sorted(feature_dict.items(), key=lambda pair: len(pair[1]))
    for track_idx, (set_name, positions) in enumerate(ordered_sets):
        nb_positions = len(positions)
        fig.add_trace(
            go.Scatter(
                x=positions,
                y=np.full(nb_positions, float(track_idx)),
                mode="markers",
                marker={"color": "red", "size": 2},
                name=f"{set_name} ({nb_positions})",
            )
        )

    # Chromosome boundaries: running sum of the bin counts.
    chrom_names = list(chr_bin_sizes)
    last_chrom_idx = len(chrom_names) - 1
    chrom_end = 0
    for chrom_idx, chrom_name in enumerate(chrom_names):
        chrom_end += chr_bin_sizes[chrom_name]

        # No separator after the last chromosome.
        if chrom_idx < last_chrom_idx:
            fig.add_shape(
                type="line",
                xref="x",  # x in data coordinates
                yref="paper",  # y relative to the figure height
                x0=chrom_end,
                x1=chrom_end,
                y0=0,
                y1=1,
                line={"color": "black", "width": 1, "dash": "dashdot"},
            )

        # Chromosome label, slightly left of the boundary. Labels alternate between
        # two heights (odd chromosomes are raised) to limit overlaps.
        label_height = 1 + 0.05 * (chrom_idx % 2)
        fig.add_annotation(
            x=chrom_end - 800,
            y=label_height,
            text=chrom_name,
            showarrow=False,
            xref="x",
            yref="paper",
            xanchor="left",
            yanchor="bottom",
            font={"family": "Arial", "size": 10, "color": "RoyalBlue"},
        )

    total_bins = sum(chr_bin_sizes.values())
    tick_labels = [f"{name} (n={len(features)})" for name, features in ordered_sets]
    fig.update_layout(
        title="Important feature positions",
        showlegend=False,
        xaxis_title="Genomic position",
        xaxis={"range": [-10, total_bins + 10]},
        yaxis_title="Set of features",
        yaxis={
            "tickmode": "array",
            "tickvals": list(range(len(feature_dict))),
            "ticktext": tick_labels,
        },
    )

    fig.write_html(logdir / f"{output_name}.html")
    for image_ext in ("png", "svg"):
        fig.write_image(logdir / f"{output_name}.{image_ext}")
    fig.show()

In [None]:
# Derived from `output_base` (instead of repeating the home-based project path) so that
# switching the project root in the setup cell also moves this directory.
global_important_features_dir = (
    output_base / "models/SHAP/global_task_features/global_info"
)
global_important_features_path = (
    global_important_features_dir / "global_task_features.json"
)
# JSON content: feature set name -> list of feature positions (see plot_feature_positions).
global_important_features = json.loads(
    global_important_features_path.read_text(encoding="utf8")
)

In [None]:
# Figures are saved next to the feature file they are computed from.
plot_feature_positions(
    global_important_features,
    chr_bin_sizes,
    "important_features_on_genome",
    logdir=global_important_features_dir,
)

In [None]:
def compute_enrichment_per_chrom(
    chr_bin_sizes: Dict[str, int], bed_dir: Path
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute the enrichment per chromosome of each feature set.

    For every `*.bed` file of `bed_dir`, the fraction of its regions found on each
    chromosome is divided by the fraction of the genome (in bins) covered by that
    chromosome. A value of 1 therefore means "as many regions as expected by size".

    Blank lines and `#` / `track` / `browser` header lines are ignored. Files are
    processed in sorted name order so that the row order of the outputs is stable.
    The feature set name is the file stem without its last `_`-separated part.

    Args:
        chr_bin_sizes (Dict[str, int]): The chromosome sizes (number of bins).
        bed_dir (Path): The directory containing the bed files of the same resolution.

    Returns:
        pd.DataFrame: A DataFrame containing the relative enrichment values for each feature set across chromosomes.
        pd.DataFrame: A DataFrame containing the chromosome bin count for each feature set.
        In both, rows are feature sets and columns follow the order of `chr_bin_sizes`;
        chromosomes without any region in a set are left as NaN.

    Raises:
        KeyError: If a bed file contains a chromosome absent from `chr_bin_sizes`.
    """
    # Fraction of the genome covered by each chromosome (total computed only once).
    total_bins = sum(chr_bin_sizes.values())
    relative_chromsize = {
        chrom: size / total_bins for chrom, size in chr_bin_sizes.items()
    }

    chr_count = {}
    relative_enrichment = {}
    for bed_file in sorted(bed_dir.glob("*.bed")):
        with open(bed_file, "r", encoding="utf8") as f:
            region_chroms = [
                line.split("\t", 1)[0].strip()
                for line in f
                if line.strip() and not line.startswith(("#", "track", "browser"))
            ]
        nb_regions = len(region_chroms)
        set_chr_count = Counter(region_chroms)

        unknown_chroms = sorted(set(set_chr_count) - set(relative_chromsize))
        if unknown_chroms:
            raise KeyError(
                f"{bed_file.name}: chromosome(s) {unknown_chroms} absent from chr_bin_sizes"
            )

        set_name = bed_file.stem.rsplit("_", 1)[0]
        chr_count[set_name] = dict(set_chr_count)
        # (fraction of the set on the chromosome) / (fraction of the genome it covers)
        relative_enrichment[set_name] = {
            chrom: (count / nb_regions) / relative_chromsize[chrom]
            for chrom, count in set_chr_count.items()
        }

    chrom_order = list(chr_bin_sizes.keys())
    enrichment_df = pd.DataFrame(data=relative_enrichment, index=chrom_order).transpose()
    count_df = pd.DataFrame(data=chr_count, index=chrom_order).transpose()
    return enrichment_df, count_df  # type: ignore

In [None]:
# Bed files of the feature sets, expected at the same resolution as chr_bin_sizes.
bed_dir = global_important_features_dir / "global_task_features_beds"
enrichment, chr_count = compute_enrichment_per_chrom(chr_bin_sizes, bed_dir)

In [None]:
# Save both tables (rows: feature sets, columns: chromosomes) next to the feature file.
enrichment.to_csv(global_important_features_dir / "chromosome_feature_enrichment.csv")
chr_count.to_csv(global_important_features_dir / "chromosome_feature_count.csv")

In [None]:
def plot_bin_metrics(
    npz_file: Path,
    output_name: str,
    chr_bin_sizes: Dict[str, int],
    output_dir: Path | None = None,
):
    """Plot per-bin mean/std and median/IQR read from an npz file, saved as HTML.

    The figure has two stacked subplots sharing their x axis (bin position):
    means with standard deviation error bars on top, medians with IQR/2 error bars
    below. Chromosome ends are marked and labelled in both subplots.

    Args:
        npz_file: npz file holding the arrays "mean", "std", "median" and "iqr".
        output_name: File name (without extension) of the saved HTML figure.
        chr_bin_sizes: Chromosome name -> number of bins, in genome order.
        output_dir: Directory in which the figure is written. When None, the
            module-level `logdir` is used (previous behaviour); prefer passing it
            explicitly since `logdir` is rebound later in the notebook.
    """
    with np.load(npz_file) as data:
        means = data["mean"]
        std_devs = data["std"]
        medians = data["median"]
        iqrs = data["iqr"]

    # x coordinate of each point is its bin index
    indices = np.arange(len(means))

    fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

    # (values, error bar length, legend name) for subplot rows 1 and 2.
    panels = [
        (means, std_devs, "Mean with Std Dev"),
        (medians, iqrs / 2, "Median with IQR"),  # IQR / 2: symmetric approximation
    ]
    for row, (values, errors, trace_name) in enumerate(panels, start=1):
        fig.add_trace(
            go.Scatter(
                x=indices,
                y=values,
                mode="markers",
                name=trace_name,
                marker=dict(size=1),  # small markers, there is one per bin
                error_y=dict(
                    type="data",
                    array=errors,
                    visible=True,
                    thickness=1,  # thin error bars
                    width=2,  # narrow end caps
                ),
            ),
            row=row,
            col=1,
        )

    fig.update_layout(
        title="Separate Metrics with Error Bars",
        yaxis_title="Mean Values",
        legend_title="Metric Type",
    )
    fig.update_yaxes(title_text="Median Values", row=2, col=1)
    # With shared x axes only the bottom axis shows its ticks, so the title goes there.
    fig.update_xaxes(title_text="Bin position", row=2, col=1)

    # Mark and label each chromosome end, in both subplots.
    for row in [1, 2]:
        line_position = 0
        for chrom in chr_bin_sizes:
            line_position += chr_bin_sizes[chrom]
            # NOTE(review): row/col are passed together with explicit refs; plotly is
            # expected to retarget them to the axes of the given subplot — confirm.
            fig.add_shape(
                type="line",
                x0=line_position,
                x1=line_position,
                y0=0,  # bottom of the subplot
                y1=1,  # top of the subplot
                line=dict(
                    color="black",
                    width=1,
                    dash="dashdot",
                ),
                xref="x",
                yref="y2 domain",
                row=row,
                col=1,
            )

            fig.add_annotation(
                x=line_position - 1000,  # label slightly left of the boundary
                y=0.95,  # near the top of the subplot (axis domain coordinates)
                text=chrom,
                showarrow=False,
                xref="x",
                yref="y2 domain",
                xanchor="left",
                yanchor="bottom",
                font=dict(family="Arial", size=10, color="RoyalBlue"),
                row=row,
                col=1,
            )

    if output_dir is None:
        output_dir = logdir  # backward-compatible fallback on the notebook global
    fig.write_html(output_dir / f"{output_name}.html")

In [None]:
def read_important_features(global_task_features_path) -> Dict[str, List[int]]:
    """Load the global task features JSON file and return its decoded content."""
    with open(global_task_features_path, mode="r", encoding="utf8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

In [None]:
def read_cancer_important_bins() -> Dict[str, List[int]]:
    """Return the values of the "bin" column of the two cancer feature tables.

    Both tables are headerless, tab-separated files read from a fixed directory
    under the home folder. Keys of the result are the feature set names.
    """
    cancer_dir = Path.home() / "scratch/epiclass/join_important_features/global_info/cancer"
    column_names = ["chr", "start", "end", "bin", "details"]

    # (feature set name, file name), in the order they are added to the result.
    sources = (
        (
            "cancer_intersection_merge_sampling_blood_subset",
            "cancer_intersection_merge_samplings_bed-details_blood_subset.tsv",
        ),
        (
            "cancer_intersection_merge_sampling",
            "cancer_intersection_merge_samplings_bed-details_2.tsv",
        ),
    )

    bins_per_set = {}
    for set_name, tsv_name in sources:
        table = pd.read_csv(cancer_dir / tsv_name, names=column_names, sep="\t")
        bins_per_set[set_name] = list(table["bin"])
    return bins_per_set

In [None]:
def _format_pval_annotation(pval: float) -> str:
    """Return the p-value text shown on the plots (very small values are collapsed)."""
    if pval < 0.0001:
        return " << 0.001"
    return f" = {pval:.3f}"


def plot_important_features_metrics(
    important_features: Dict[str, List[int]],
    npz_file_path: Path,
    logdir: Path | None = None,
    include_categories: Iterable[str] | None = None,
) -> Dict[str, Tuple[int, float, float, float]]:
    """Using the important features positions, plot (violin) the mean values according to the given npz file.

    For each category, three violins are drawn: the selected features, a random feature
    set of the same size (fixed seed), and the global distribution of all bins.
    Two-sample KS tests compare the selected features to the random set and to the
    global distribution; both p-values are annotated on the plot. A third KS test
    (random vs. global) is a sanity check and prints a warning when below 0.05.

    The random draw uses a local generator seeded with 42, which yields the same
    values as seeding the global NumPy generator but leaves the global state untouched.
    Categories without any feature are skipped with a warning.

    Args:
    - important_features: A dictionary with category names as keys, and lists of feature positions as values.
    - npz_file_path: The path to the npz file containing the bin metrics (array "mean").
    - logdir: The directory where to save the plots (html, png, svg). Nothing is saved if None.
    - include_categories: The categories to include in the plot. All of them if None or empty.

    Returns:
    - A dictionary with category names as keys, and tuples
      (sample size, p-value selected vs. random, p-value selected vs. global,
      p-value random vs. global) as values.
    """
    with np.load(npz_file_path) as data:
        means = np.array(data["mean"], dtype=np.float64)

    # Materialized once: a one-shot iterable would be consumed by the first lookup.
    wanted_categories = set(include_categories) if include_categories else set()

    pvals = {}
    for category_name, features_pos in important_features.items():
        if wanted_categories and category_name not in wanted_categories:
            continue

        N = len(features_pos)
        if N == 0:
            # Neither the KS test nor the annotations are defined on an empty set.
            print(f"WARNING: no features for {category_name}, skipping.")
            continue

        selected_features = means[np.asarray(features_pos)]

        # Random features comparison: same draw as np.random.seed(42) + np.random.choice
        random_features = np.random.RandomState(42).choice(means, size=N, replace=False)

        fig = go.Figure()
        violins = (
            (selected_features, f"{category_name} features (N={N})"),
            (random_features, f"Random features (N={N})"),
            (means, f"All features N={len(means)}"),
        )
        for values, trace_name in violins:
            fig.add_trace(
                go.Violin(
                    y=values,
                    name=trace_name,
                    box_visible=True,
                    meanline_visible=True,
                    points="all",
                )
            )

        _, pval_random = stats.ks_2samp(selected_features, random_features)
        _, pval_global = stats.ks_2samp(selected_features, means)
        annot_random = _format_pval_annotation(pval_random)
        annot_global = _format_pval_annotation(pval_global)

        # Annotations for p-values, just above the random (x=1) and global (x=2) violins
        fig.add_annotation(
            x=1,
            y=random_features.max() + 0.02,
            text=f"p-val {annot_random} (Selected vs. Random)",
            showarrow=False,
            xref="x",
            yref="y",
        )
        fig.add_annotation(
            x=2,
            y=means.max() + 0.02,
            text=f"p-val {annot_global} (Selected vs. Global)",
            showarrow=False,
            xref="x",
            yref="y",
        )

        # Small points
        fig.update_traces(marker=dict(size=2))

        fig.update_layout(
            title=f"Mean values for {category_name} features",
            xaxis_title="Feature set",
            yaxis_title="Mean values",
            violinmode="group",
            width=1200,
            height=800,
        )

        # sanity check, pval random vs global
        _, pval_random_vs_global = stats.ks_2samp(random_features, means)
        if pval_random_vs_global < 0.05:
            print(f"WARNING: pval_random_vs_global: {pval_random_vs_global}")

        pvals[category_name] = (N, pval_random, pval_global, pval_random_vs_global)

        if logdir:
            fig.write_html(logdir / f"{npz_file_path.stem}_{category_name}_violin.html")
            fig.write_image(logdir / f"{npz_file_path.stem}_{category_name}_violin.png")
            fig.write_image(logdir / f"{npz_file_path.stem}_{category_name}_violin.svg")

        fig.show()

    return pvals

## SHAP values: important biospecimen regions

### Read important bins values, and find possible classes for each unique bin.

In [None]:
features_dir = table_dir / "dfreeze_v2/100kb_all_none/SHAP-MLP/cell_type"
features_file = features_dir / "select_beds_top303.tar.gz"

# Cell type (lower-cased, taken from the archived bed file name) -> important bins.
ct_important_bins: Dict[str, List[int]] = {}
with tarfile.open(features_file, "r:gz") as tar:
    for member in tar.getmembers():
        filename = member.name
        is_merged_bed = filename.endswith("bed") and "merge_samplings" in filename
        if not is_merged_bed:
            continue

        file_obj: IO[bytes] = tar.extractfile(member)  # type: ignore

        # Second path component of the member, without its fixed prefix and suffix.
        bed_basename = filename.split("/")[1]
        for fixed_part in ("merge_samplings_", "_features.bed"):
            bed_basename = bed_basename.replace(fixed_part, "")
        cell_type = bed_basename.lower()

        ct_important_bins[cell_type] = bed_to_bins(
            file_obj, chroms=chromsizes, resolution=100 * 1000
        )

In [None]:
# Union of the important bins of every cell type.
all_bins = set()
for bins in ct_important_bins.values():
    all_bins |= set(bins)

In [None]:
# Find relevant cell types for each bin, optimized for pandas future vectorization
relevant_pairs_list = []
for cell_type, bins_list in ct_important_bins.items():
    for bin_idx in bins_list:
        relevant_pairs_list.append({"bin_index": bin_idx, CELL_TYPE: cell_type})

bin_to_relevant_ct_df = pd.DataFrame(relevant_pairs_list)
bin_to_relevant_ct_df["bin_index"] = bin_to_relevant_ct_df["bin_index"].astype(int)

assert bin_to_relevant_ct_df.shape[0] > len(ct_important_bins)

In [None]:
# Number of (bin, cell type) pairs, then number of distinct bins.
print(bin_to_relevant_ct_df.shape)
print(bin_to_relevant_ct_df["bin_index"].nunique())

In [None]:
# Cell type labels having important bins; the classifier is expected to have 16 classes.
classifier_cell_types = set(bin_to_relevant_ct_df[CELL_TYPE].unique())
assert len(classifier_cell_types) == 16

#### Creating a new table with associated genes + cell types for each region.

The final desired format is  
chr, start, end, cell_types, genes

Reformat bin / cell type association to have full bed value and aggregated cell types

In [None]:
# `all_bins` is a set, so its iteration order is arbitrary. Bins are sorted once and the
# same list is used for the conversion and for the pairing: the mapping is then correct
# whether bins_to_bed_ranges keeps the input order or returns ranges in sorted order.
sorted_bins = sorted(all_bins)
bed_ranges = bins_to_bed_ranges(sorted_bins, chromsizes, resolution=100 * 1000)
bin_to_bed_dict = dict(zip(sorted_bins, bed_ranges))
assert (
    len(bin_to_bed_dict) == len(all_bins) == bin_to_relevant_ct_df["bin_index"].nunique()
)

In [None]:
# Attach the bed range of each bin, then split it into three coordinate columns.
bin_to_relevant_ct_df["bed_range"] = bin_to_relevant_ct_df["bin_index"].map(bin_to_bed_dict)  # type: ignore

# NOTE(review): apply(pd.Series) builds one Series per row; assumes every bed range is a
# (chr, start, end) triplet — confirm against bins_to_bed_ranges.
bin_to_relevant_ct_df[["chr", "start", "end"]] = bin_to_relevant_ct_df["bed_range"].apply(
    pd.Series
)
bin_to_relevant_ct_df.drop(columns=["bed_range"], inplace=True)

In [None]:
# Number of (bin, cell type) rows; unchanged by the added coordinate columns.
len(bin_to_relevant_ct_df)

In [None]:
# One row per region: its cell types are deduplicated, sorted and joined with ";".
region_keys = ["bin_index", "chr", "start", "end"]
grouped_ct_df = bin_to_relevant_ct_df.groupby(region_keys)[CELL_TYPE].agg(
    lambda labels: ";".join(str(label) for label in sorted(set(labels)))
)
grouped_ct_df = grouped_ct_df.reset_index()
grouped_ct_df = grouped_ct_df.rename(
    columns={
        CELL_TYPE: "SHAP-MLP_associated_biospecimens",
        "bin_index": "bin_index_100kb",
    }
)

In [None]:
# Each bin must appear on exactly one row after aggregation.
assert grouped_ct_df.shape[0] == grouped_ct_df["bin_index_100kb"].nunique()

Reformat gff intersection file

We will make a version with minimal info, aggregated gene_id + feature_type,  
and another version that keeps details

In [None]:
# Headerless TSV: three columns for the 100kb region, nine for the intersected
# GFF record, and the overlap length in base pairs.
gene_intersect_filepath = features_dir / "global_union_features_intersect_gff.tsv"
columns = [
    "chromosome",
    "start_100kb",
    "end_100kb",
    "seqname",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attribute",
    "overlap (bp)",
]
gene_intersect_df = pd.read_csv(
    gene_intersect_filepath, sep="\t", header=None, names=columns, index_col=False
)

# Redundant (seqname) or empty (score,frame)
gene_intersect_df = gene_intersect_df.drop(columns=["seqname", "score", "frame"])

In [None]:
# NOTE(review): shape[0] is the number of rows of the intersection table, which may
# count a gene several times if it overlaps several regions — confirm the label.
print(f"Nb of genes: {gene_intersect_df.shape[0]}")
print(f"Nb of 100kb regions: {grouped_ct_df['bin_index_100kb'].nunique()}")

Full details

In [None]:
# Right join: every region of grouped_ct_df is kept, with or without a GFF match.
# "start"/"end" exist on both sides and therefore receive the suffixes.
merged_gff = gene_intersect_df.merge(
    grouped_ct_df,
    how="right",
    left_on=["chromosome", "start_100kb", "end_100kb"],
    right_on=["chr", "start", "end"],
    suffixes=("_gff_feature", "_ct_df"),
)
# NOTE(review): the right-side keys are dropped; this presumes the left-side key columns
# are filled for regions without any GFF match — verify on such a row.
merged_gff = merged_gff.drop(columns=["chr", "start_ct_df", "end_ct_df"])

In [None]:
# Feature length with both ends included (hence the +1).
merged_gff["feature_length"] = (
    merged_gff["end_gff_feature"] - merged_gff["start_gff_feature"] + 1
)
# Fraction of the feature that lies inside the 100kb region.
merged_gff["overlap (fraction)"] = (
    merged_gff["overlap (bp)"] / merged_gff["feature_length"]
)

In [None]:
# Rename some columns
merged_gff.rename(
    columns={"source": "gene_DB_source", "feature": "gene_type"}, inplace=True
)

In [None]:
# Final column order of the detailed table; columns not listed here are dropped.
col_order = [
    "chromosome",
    "start_100kb",
    "end_100kb",
    "bin_index_100kb",
    "gene_DB_source",
    "gene_type",
    "start_gff_feature",
    "end_gff_feature",
    "feature_length",
    "overlap (bp)",
    "overlap (fraction)",
    "SHAP-MLP_associated_biospecimens",
    "attribute",
]
merged_gff = merged_gff[col_order]

# Int32 is nullable, not int32
# (needed because regions without any GFF match have missing feature values)
merged_gff = merged_gff.astype(
    {
        "start_100kb": "Int32",
        "end_100kb": "Int32",
        "bin_index_100kb": "Int32",
        "start_gff_feature": "Int32",
        "end_gff_feature": "Int32",
        "feature_length": "Int32",
        "overlap (bp)": "Int32",
    },
)

In [None]:
# Detailed table: one row per (region, GFF feature) pair, with the associated cell types.
merged_gff.to_csv(
    features_dir / "global_union_features_intersect_gff_with_ct.tsv",
    sep="\t",
    index=False,
)

Minimal details, aggregated gene info.

Start by reformatting the GFF intersection file to have one line per region.

In [None]:
# In-place rename: from here on gene_intersect_df has "gene_type" instead of "feature".
gene_intersect_df.rename(columns={"feature": "gene_type"}, inplace=True)

In [None]:
# # Keep only relevant columns
gene_intersect_df = gene_intersect_df[
    ["chromosome", "start_100kb", "end_100kb", "gene_type", "attribute"]
].copy()

In [None]:
# Keep only the first ";"-separated field of the GFF attribute column.
gene_intersect_df["attribute"] = gene_intersect_df["attribute"].str.split(
    ";", expand=True
)[0]

In [None]:
# Gene ID: what follows the "ID=gene:" prefix of the first attribute field.
# NOTE(review): column 1 of the split only exists if at least one row contains the prefix.
gene_intersect_df["gene_IDs"] = gene_intersect_df["attribute"].str.split(
    "ID=gene:", expand=True
)[1]
# The source column is dropped, so this cell cannot be re-run on its own.
gene_intersect_df.drop(columns=["attribute"], inplace=True)

In [None]:
# One row per region: gene IDs and gene types are each deduplicated, sorted and
# joined with ";" (independently, so the two lists are not aligned with each other).
dup_cols = ["chromosome", "start_100kb", "end_100kb"]
grouped_gene_df = (
    gene_intersect_df.groupby(dup_cols)[["gene_IDs", "gene_type"]]
    .agg(lambda x: ";".join(map(str, sorted(set(x)))))
    .reset_index()
)

In [None]:
# Right join: every region of grouped_ct_df is kept, with or without gene information.
grouped_gene_df = grouped_gene_df.merge(
    grouped_ct_df,
    how="right",
    left_on=["chromosome", "start_100kb", "end_100kb"],
    right_on=["chr", "start", "end"],
    suffixes=("_gene_df", "_ct_df"),
)

In [None]:
# Keep the coordinates coming from grouped_ct_df (the right side of the join) and give
# them the 100kb column names.
grouped_gene_df.drop(columns=["chromosome", "start_100kb", "end_100kb"], inplace=True)
grouped_gene_df.rename(columns={"start": "start_100kb", "end": "end_100kb"}, inplace=True)

In [None]:
# Column order of the aggregated table (one row per region). Rebinds `col_order`.
col_order = [
    "chr",
    "start_100kb",
    "end_100kb",
    "bin_index_100kb",
    "SHAP-MLP_associated_biospecimens",
    "gene_IDs",
    "gene_type",
]
grouped_gene_df = grouped_gene_df[col_order]
grouped_gene_df.to_csv(
    features_dir / "global_union_features_intersect_gff_aggregated_with_ct.tsv",
    sep="\t",
    index=False,
)

## ChromScore hdf5 values

For each important cell type feature, find the average ChromScore value across the files of its associated cell types.  
If a bin is present in multiple classes, use the files of all those classes.

### Read ChromScore values, and map files to their cell type.

Using hdf5 output from `bigwig_metrics.py`.

In [None]:
chromscore_dir = paper_dir / "data" / "ChromScore"
chromscore_file = chromscore_dir / "max_metrics.h5"
# Only reports the missing file; the read below is still attempted.
if not chromscore_file.exists():
    print(f"{chromscore_file} does not exist")

# The index is treated as file names below; region columns are expected to start with "chr".
chromscores_df = pd.read_hdf(chromscore_file)
print("Chromscores shape", chromscores_df.shape)
display(chromscores_df.head(n=2))

In [None]:
# Report every column containing missing values, with its number of missing values.
for col in chromscores_df.columns:
    if chromscores_df[col].isna().sum():
        print(col, chromscores_df[col].isna().sum())

In [None]:
# Missing values are replaced by 0, in place.
chromscores_df.fillna(0, inplace=True)

In [None]:
# The part of the index label before the first "." is used as the EpiRR identifier.
chromscores_df["epirr"] = chromscores_df.index.str.split(".").str[0]

In [None]:
# Create a mapping from epirr_id_without_version to cell_type
epirr_to_cell_type = dict(
    metadata_df.loc[:, ["epirr_id_without_version", CELL_TYPE]].values
)

In [None]:
# Cell type of each file, normalized (spaces -> "_", lower case), presumably to match
# the labels of bin_to_relevant_ct_df — confirm. EpiRRs absent from the metadata get NaN.
chromscores_df[CELL_TYPE] = (
    chromscores_df["epirr"].map(epirr_to_cell_type).str.replace(" ", "_").str.lower()
)

In [None]:
# Only keep files from classifier 16ct
chromscores_df = chromscores_df[chromscores_df[CELL_TYPE].isin(classifier_cell_types)]

In [None]:
# First EpiRR, selected by position: the index holds file names, so `[0]` would rely on
# the deprecated positional fallback of Series indexing.
print(chromscores_df["epirr"].iloc[0])

### Find mean chromScore for each bin (for their relevant files)

In [None]:
# Melting global chromscores for future operation, now all values are in one column
# Assuming all columns starting with chr are bins, and all 100kb bins are present
region_cols_mapper = {
    col: idx
    for idx, col in enumerate(chromscores_df.columns)
    if isinstance(col, str) and col.startswith("chr")
}
assert len(region_cols_mapper) == 30321

In [None]:
# Region columns are renamed to their integer bin index, in place. After this cell no
# column starts with "chr", so the mapper cell above cannot be re-run.
chromscores_df.rename(columns=region_cols_mapper, inplace=True)  # type: ignore

In [None]:
# Long format: one row per (file, bin) with its value. The "filename" column is not
# among id_vars/value_vars, so it is not kept by melt.
melted_chromscores = chromscores_df.reset_index().rename(columns={"index": "filename"})
melted_chromscores = melted_chromscores.melt(
    id_vars=["epirr", CELL_TYPE],
    value_vars=list(region_cols_mapper.values()),
    var_name="bin_index",
    value_name="chromscore_value",
)
melted_chromscores["bin_index"] = melted_chromscores["bin_index"].astype(int)
# Sanity checks: table shape, number of bins, number of EpiRRs.
print(melted_chromscores.shape)
print(melted_chromscores["bin_index"].nunique())
print(melted_chromscores["epirr"].nunique())

In [None]:
# Merge melted chromscores with bin_to_relevant_ct_df, efficient for pandas
# Filters out all irrelevant bins
merged_df = pd.merge(
    melted_chromscores, bin_to_relevant_ct_df, on=["bin_index", CELL_TYPE], how="inner"
)
print(merged_df.shape)
print(merged_df["bin_index"].nunique())
print(merged_df["epirr"].nunique())

In [None]:
# Keep only columns of interest for plotting
merged_df = merged_df[["epirr", CELL_TYPE, "bin_index", "chromscore_value"]]

### Plot

In [None]:
def plot_important_features_chromscore_global(
    avg_selected: pd.Series,
    all_avg: pd.Series,
    logdir: Path | None = None,
) -> None:
    """
    Plot violins of the average ChromScore of important, random and all features.

    Three violins are drawn: the given important feature values, a random sample of
    the same size taken from `all_avg` (fixed seed), and `all_avg` itself. The y axis
    is restricted to [0, 1]. The figure is shown, then saved as svg, png and html
    if `logdir` is given.

    The random draw uses a local generator seeded with 42, which yields the same
    values as seeding the global NumPy generator but leaves the global state untouched.

    Args:
        avg_selected: Average value of each important feature (1-D).
        all_avg: Average value of every feature (1-D); the random sample is drawn from it.
        logdir: Directory where to save the figure. Nothing is saved if None.
    """

    def hard_span_violin(values, name: str):
        """Violin with inner box and mean line, no points, span limited to the data."""
        return go.Violin(
            y=values,
            name=name,
            box_visible=True,
            meanline_visible=True,
            points=False,
            spanmode="hard",
        )

    fig = go.Figure()

    # Important features.
    # A violin trace is required here: box_visible, meanline_visible, points and
    # spanmode are violin properties, a box trace rejects them.
    print("Tracing important features")
    N = len(avg_selected)
    fig.add_trace(
        hard_span_violin(avg_selected, f"Important features per biospecimen (N={N})")
    )

    # Random features comparison
    print("Computing random features")
    random_features_means = np.random.RandomState(42).choice(
        all_avg, size=N, replace=False
    )

    print("Tracing random features")
    fig.add_trace(
        hard_span_violin(
            random_features_means,
            f"Random features, value on all files (N={len(random_features_means)})",
        )
    )

    # Global distribution comparison
    print("Tracing all features")
    fig.add_trace(
        hard_span_violin(all_avg, f"All features, all files (N={len(all_avg)})")
    )

    fig.update_yaxes(range=[0, 1])

    fig.update_layout(
        title="Important cell type features chromScore",
        xaxis_title="Feature set",
        yaxis_title="Average max value in selected regions of 100kb",
        violinmode="group",
        width=800,
        height=600,
    )

    fig.show()

    if logdir is not None:
        print("Saving figure.")
        name = "important_features_16ct_max_chromscore_100kb"
        fig.write_image(logdir / f"{name}.svg")
        fig.write_image(logdir / f"{name}.png")
        fig.write_html(logdir / f"{name}.html")

In [None]:
# avg = merged_df.groupby("bin_index")["chromscore_value"].mean()
# all_avg = melted_chromscores.groupby("bin_index")["chromscore_value"].mean()

In [None]:
# NOTE: rebinds the notebook-wide `logdir` (previously the hdf5_stats log directory);
# anything reading `logdir` from here on writes to the ChromScore figure directory.
logdir = paper_dir / "figures" / "chromscore"
logdir.mkdir(parents=True, exist_ok=True)

# plot_important_features_chromscore_global(
#     avg_selected=avg,
#     all_avg=all_avg,
#     logdir=logdir,
# )

In [None]:
def prepare_chromscore_per_biospecimen_data(
    selected_bins_df: pd.DataFrame,
    all_chromscores_df: pd.DataFrame,
) -> Dict[str, Dict[str, List[float] | int]]:
    """
    Prepare plot data for the chromScore per biospecimen plots.

    Each biospecimen needs per-bin mean chromScore values for:
    - important features: the bins listed for that biospecimen in `selected_bins_df`
    - random features: as many bins as important ones, drawn without replacement
      (seed 42) among the bin indices present in `all_chromscores_df`
    - global distribution: every bin, using only the rows of that biospecimen

    This is done with independent file subsets (one subset per biospecimen).

    Args:
        selected_bins_df: Table with columns CELL_TYPE, "epirr", "bin_index" and
            "chromscore_value"; its bins are treated as the important features.
        all_chromscores_df: Table with columns CELL_TYPE, "bin_index" and
            "chromscore_value" covering all bins.

    Returns:
        Mapping biospecimen -> dict with keys "avg_per_bin",
        "random_features_means", "all_means_file_subset" (lists of floats)
        and "nb_files" (number of unique "epirr" values).

    Raises:
        AssertionError: if the rows of a biospecimen in `all_chromscores_df`
            contain missing chromScore values.
    """
    # Draw random bins among the indices that actually exist. Sampling from
    # range(n_bins) silently assumed the indices were exactly 0..n_bins-1 and
    # returned fewer (or no) random features otherwise. Sorting keeps the draw
    # identical to the previous one whenever that assumption does hold.
    candidate_bins = np.sort(all_chromscores_df["bin_index"].dropna().unique())

    grouped_means = {}

    for biospecimen, df in selected_bins_df.groupby(by=CELL_TYPE):
        print(f"Processing {biospecimen}")

        all_chromscores_group = all_chromscores_df.loc[
            all_chromscores_df[CELL_TYPE] == biospecimen, :
        ]
        assert all_chromscores_group["chromscore_value"].isna().sum() == 0
        nb_files = df["epirr"].nunique()

        # Important features
        avg_per_bin = df.groupby("bin_index")["chromscore_value"].mean()
        nb_features = len(avg_per_bin)

        # Random features: re-seeding at every iteration makes the draw depend
        # only on nb_features, not on the order in which groups are processed.
        np.random.seed(42)
        random_features_idx = np.random.choice(
            candidate_bins, size=nb_features, replace=False
        )

        random_df = all_chromscores_group.loc[
            all_chromscores_group["bin_index"].isin(random_features_idx)
        ]
        random_features_means = random_df.groupby("bin_index")["chromscore_value"].mean()

        # Global distribution
        all_means_file_subset = all_chromscores_group.groupby("bin_index")[
            "chromscore_value"
        ].mean()

        grouped_means[biospecimen] = {
            "avg_per_bin": avg_per_bin.to_list(),
            "random_features_means": random_features_means.to_list(),
            "all_means_file_subset": all_means_file_subset.to_list(),
            "nb_files": nb_files,
        }

    # Release the per-group intermediate frames before returning.
    gc.collect()

    return grouped_means

In [None]:
# Per-biospecimen distributions (important / random / all bins) used by the plots.
graph_data = prepare_chromscore_per_biospecimen_data(
    selected_bins_df=merged_df,
    all_chromscores_df=melted_chromscores,
)

In [None]:
def plot_chromscore_per_biospecimen(
    graph_data: Dict[str, Dict[str, List[float] | int]],
    logdir: Path | None = None,
) -> None:
    """
    Plot violins for important, random and all features on a single figure,
    using regions and files per biospecimen independently.

    Args:
        graph_data: Mapping biospecimen -> dict with keys "avg_per_bin",
            "random_features_means", "all_means_file_subset" (lists of values)
            and "nb_files".
        logdir: If given, the figure is also written there as svg, png and html.
    """
    fig = go.Figure()

    colors = px.colors.qualitative.Dark24_r[0:3]

    # (key in graph_data, legend group) for the three violins of a biospecimen,
    # in drawing order; colors[i] is used for the i-th entry.
    series = [
        ("avg_per_bin", "SHAP features"),
        ("random_features_means", "Random features"),
        ("all_means_file_subset", "All features"),
    ]

    for biospecimen, data in graph_data.items():
        nb_files = data["nb_files"]
        nb_features = len(data["avg_per_bin"])

        # The three violins share a name so that they are grouped on the x axis.
        group_name = f"{biospecimen} ({nb_features} features, {nb_files} files)"

        for color, (key, legend_group) in zip(colors, series):
            fig.add_trace(
                go.Violin(
                    y=data[key],
                    name=group_name,
                    legendgroup=legend_group,
                    legendgrouptitle_text=legend_group,
                    box_visible=True,
                    meanline_visible=True,
                    points=False,
                    spanmode="hard",
                    line_color=color,
                )
            )

    fig.update_yaxes(range=[0, 1])

    fig.update_layout(
        title="ChromScore per biospecimen file subset",
        xaxis_title="Biospecimen",
        yaxis_title="Average max value in selected regions of 100kb",
        violinmode="group",
        width=1200,
        height=1200,
        legend_groupclick="toggleitem",
    )

    fig.show()

    if logdir is not None:
        print("Saving figure.")
        # Distinct stem: the previous one was shared with the global plot and
        # overwrote its files when both were saved in the same directory.
        name = "important_features_16ct_max_chromscore_100kb_per_biospecimen_violin"
        fig.write_image(logdir / f"{name}.svg")
        fig.write_image(logdir / f"{name}.png")
        fig.write_html(logdir / f"{name}.html")

In [None]:
# plot_chromscore_per_biospecimen(
#     graph_data=graph_data,
#     # logdir=logdir,
# )

In [None]:
def plot_chromscore_per_biospecimen_box(
    graph_data: Dict[str, Dict[str, List[float] | int]],
    logdir: Path | None = None,
) -> None:
    """
    Plot boxplots for important, random and all features, one subplot per
    biospecimen, using regions and files per biospecimen independently.

    Each subplot title reports the p-value of a one-sided Welch t-test
    (important features > all features).

    Args:
        graph_data: Mapping biospecimen -> dict with keys "avg_per_bin",
            "random_features_means", "all_means_file_subset" (lists of values)
            and "nb_files".
        logdir: If given, the figure is also written there as svg, png and html.

    Raises:
        ValueError: from `stats.ttest_ind` (nan_policy="raise") if the compared
            values contain NaN.
    """
    colors = px.colors.qualitative.Dark24_r[0:3]

    n_cols = 4
    # Keep the usual 4x4 grid, but add rows rather than fail on an invalid
    # subplot row when there are more than 16 biospecimens.
    n_rows = max(4, -(-len(graph_data) // n_cols))

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        shared_yaxes=True,
        vertical_spacing=0.1,
        y_title="Average max value in selected regions of 100kb",
    )

    for idx, (biospecimen, data) in enumerate(graph_data.items()):
        row = idx // n_cols + 1
        col = idx % n_cols + 1

        avg_per_bin = data["avg_per_bin"]
        random_features_means = data["random_features_means"]
        all_means_file_subset = data["all_means_file_subset"]

        nb_files = data["nb_files"]
        nb_features = len(avg_per_bin)

        # One box per feature set, at x = 0, 1, 2. Outlier points are hidden
        # only for the (much larger) "all features" set.
        box_series = [
            (avg_per_bin, {}),
            (random_features_means, {}),
            (all_means_file_subset, {"boxpoints": False}),
        ]
        for x_pos, (values, extra_kwargs) in enumerate(box_series):
            fig.add_trace(
                go.Box(
                    y=values,
                    x=[x_pos] * len(values),
                    line_color=colors[x_pos],
                    showlegend=False,
                    **extra_kwargs,
                ),
                row=row,
                col=col,
            )

        pval = stats.ttest_ind(
            a=avg_per_bin,
            b=all_means_file_subset,
            equal_var=False,
            alternative="greater",
            nan_policy="raise",
        ).pvalue

        if pval < 0.001:
            pval_symbol = "<0.001***"
        elif pval < 0.01:
            pval_symbol = "<0.01**"
        elif pval < 0.05:
            pval_symbol = "<0.05*"
        elif pval >= 0.05:
            pval_symbol = ">0.05 NS"
            print(f"Warning: pval={pval:.3f} for {biospecimen}")
        else:
            # Every comparison is False for NaN; say so instead of emitting a
            # bare "pval" label.
            pval_symbol = "=NaN"
            print(f"Warning: pval is NaN for {biospecimen}")

        group_name = f"{biospecimen}<br>({nb_features} features, {nb_files} files)<br>pval{pval_symbol}"

        fig.update_xaxes(
            showticklabels=False,
            row=row,
            col=col,
            title=group_name,
        )

    # Legend with dummy points (the boxes themselves have showlegend=False)
    legend_names = [
        "Important SHAP features",
        "Random features",
        "All features (whiskers=max/min)",
    ]
    for i, name in enumerate(legend_names):
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                name=name,
                legendgroup=name,
                showlegend=True,
                marker=dict(color=colors[i], symbol="square"),
            ),
        )

    fig.update_yaxes(range=[0, 1])

    fig.update_layout(
        title="ChromScore per biospecimen file subset",
        width=1200,
        height=1200,
        legend=dict(
            itemsizing="constant",
        ),
    )

    fig.show()

    if logdir is not None:
        print("Saving figure.")
        name = "important_features_16ct_max_chromscore_100kb_per_biospecimen_boxplot"
        fig.write_image(logdir / f"{name}.svg")
        fig.write_image(logdir / f"{name}.png")
        fig.write_html(logdir / f"{name}.html")

In [None]:
# Box plot grid (one subplot per biospecimen), saved under `logdir`.
plot_chromscore_per_biospecimen_box(graph_data=graph_data, logdir=logdir)

In [None]:
def plot_chromscore_per_biospecimen_violin(
    graph_data: Dict[str, Dict[str, List[float] | int]],
    logdir: Path | None = None,
) -> None:
    """
    Plot half violins for important features (negative side) and all features
    (positive side), one subplot per biospecimen, using regions and files per
    biospecimen independently.

    Each subplot title reports the p-value of a one-sided Welch t-test
    (important features > all features).

    Args:
        graph_data: Mapping biospecimen -> dict with keys "avg_per_bin",
            "all_means_file_subset" (lists of values) and "nb_files".
        logdir: If given, the figure is also written there as svg, png and html.

    Raises:
        ValueError: from `stats.ttest_ind` (nan_policy="raise") if the compared
            values contain NaN.
    """
    colors = px.colors.qualitative.Dark24[0:2]

    n_cols = 4
    # Keep the usual 4x4 grid, but add rows rather than fail on an invalid
    # subplot row when there are more than 16 biospecimens.
    n_rows = max(4, -(-len(graph_data) // n_cols))

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        shared_yaxes=True,
        vertical_spacing=0.1,
        y_title="Average max value in selected regions of 100kb",
    )

    for idx, (biospecimen, data) in enumerate(graph_data.items()):
        row = idx // n_cols + 1
        col = idx % n_cols + 1

        avg_per_bin = data["avg_per_bin"]
        all_means_file_subset = data["all_means_file_subset"]

        nb_files = data["nb_files"]
        nb_features = len(avg_per_bin)

        # Important features on the negative side, global distribution on the
        # positive side.
        # NOTE(review): "miaw" looks like a placeholder; both halves presumably
        # share one name so they sit at the same x position - confirm before
        # renaming.
        half_violins = [
            (avg_per_bin, "negative"),
            (all_means_file_subset, "positive"),
        ]
        for color, (values, side) in zip(colors, half_violins):
            fig.add_trace(
                go.Violin(
                    name="miaw",
                    y=values,
                    line_color=color,
                    showlegend=False,
                    meanline_visible=True,
                    points=False,
                    spanmode="hard",
                    side=side,
                ),
                row=row,
                col=col,
            )

        pval = stats.ttest_ind(
            a=avg_per_bin,
            b=all_means_file_subset,
            equal_var=False,
            alternative="greater",
            nan_policy="raise",
        ).pvalue

        if pval < 0.001:
            pval_symbol = "<0.001***"
        elif pval < 0.01:
            pval_symbol = "<0.01**"
        elif pval < 0.05:
            pval_symbol = "<0.05*"
        elif pval >= 0.05:
            pval_symbol = ">0.05 NS"
            print(f"Warning: pval={pval:.3f} for {biospecimen}")
        else:
            # Every comparison is False for NaN; say so instead of emitting a
            # bare "pval" label.
            pval_symbol = "=NaN"
            print(f"Warning: pval is NaN for {biospecimen}")

        group_name = f"{biospecimen}<br>({nb_features} features, {nb_files} files)<br>pval{pval_symbol}"

        fig.update_xaxes(
            showticklabels=False,
            row=row,
            col=col,
            title=group_name,
        )

    # Legend with dummy points (the violins themselves have showlegend=False)
    for i, name in enumerate(["Important SHAP features", "All features"]):
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                name=name,
                legendgroup=name,
                showlegend=True,
                marker=dict(color=colors[i], symbol="square"),
            ),
        )

    fig.update_yaxes(range=[0, 1])

    fig.update_layout(
        title="ChromScore per biospecimen file subset",
        width=1200,
        height=1200,
        legend=dict(
            itemsizing="constant",
        ),
    )

    fig.show()

    if logdir is not None:
        print("Saving figure.")
        name = "important_features_16ct_max_chromscore_100kb_per_biospecimen_2violin"
        fig.write_image(logdir / f"{name}.svg")
        fig.write_image(logdir / f"{name}.png")
        fig.write_html(logdir / f"{name}.html")

In [None]:
# Half-violin grid (one subplot per biospecimen), saved under `logdir`.
plot_chromscore_per_biospecimen_violin(graph_data=graph_data, logdir=logdir)

In [None]:
def plot_important_features_chromscore_box_matplotlib(
    selected_features: pd.DataFrame,
    all_features: pd.DataFrame,
    logdir: Path | None = None,
) -> None:
    """
    Draw a Matplotlib boxplot comparing chromScore values of three feature sets:
    the important features, an equally sized random set of bins, and all features.

    Args:
        selected_features: Table with "bin_index" and "chromscore_value" columns
            for the important features.
        all_features: Table with the same columns for every feature.
        logdir: If given, the directory is created when needed and the figure is
            saved there as a png. Saving errors are printed, not raised.
    """
    value_col = "chromscore_value"
    bin_col = "bin_index"

    print("Preparing data")

    # Boxplot input must not contain NaNs.
    important_values = selected_features[value_col].dropna()

    # Random comparison set: same number of unique bins as the important set,
    # capped by the number of unique bins available.
    np.random.seed(42)
    n_important_bins = selected_features[bin_col].nunique()
    n_available_bins = all_features[bin_col].nunique()
    candidate_bins = all_features[bin_col].unique()
    sampled_bins = np.random.choice(
        candidate_bins,
        size=min(n_important_bins, n_available_bins),
        replace=False,
    )
    is_sampled = all_features[bin_col].isin(sampled_bins)
    random_values = all_features.loc[is_sampled, value_col].dropna()

    # Global comparison set: every value.
    global_values = all_features[value_col].dropna()

    # Line breaks keep the tick labels readable.
    tick_labels = [
        f"Important features\nper biospecimen (N={len(important_values)})",
        f"Random features\n(N={len(random_values)})",
        f"All features\nall files (N={len(global_values)})",
    ]

    print("Plotting")
    fig, ax = plt.subplots(figsize=(10, 7))

    # showfliers=False: outliers are not drawn.
    _ = ax.boxplot(
        [important_values, random_values, global_values],
        labels=tick_labels,
        showfliers=False,
    )

    ax.set_ylim(0, 1)
    ax.set_title("Important cell type features chromScore", fontsize=16)
    ax.set_xlabel("Feature set", fontsize=14)
    ax.set_ylabel("Average max value in selected regions of 100kb", fontsize=14)

    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    fig.tight_layout()

    print("Saving Matplotlib figure")
    if logdir is None:
        return

    logdir.mkdir(parents=True, exist_ok=True)
    name = "important_features_16ct_max_chromscore_100kb_matplotlib"
    print(f"Saving Matplotlib figure to {logdir}")
    try:
        fig.savefig(logdir / f"{name}.png", dpi=300, bbox_inches="tight")
    except Exception as e:  # pylint: disable=broad-except
        # Best effort: report the failure and keep the notebook running.
        print(f"Error saving figure: {e}")

In [None]:
# Static (Matplotlib) version of the global comparison, saved under `logdir`.
plot_important_features_chromscore_box_matplotlib(
    selected_features=merged_df,
    all_features=melted_chromscores,
    logdir=logdir,
)

## Other

In [None]:
# shap_md5s_path = input_base / "hdf5_list" / "md5_shap_assay_explain.list"
# with open(shap_md5s_path, "r", encoding="utf8") as f:
#     shap_md5s = set(f.read().splitlines())


def analyze_feature_vals(
    regions_dict: Dict[int, Tuple],
    md5s: List[str],
    hdf5_list: Path,
    logdir: Path,
    name: str,
    shap_md5s: List[str],
):
    """
    Generate and save a violin plot of provided feature values for the provided md5s, with some md5s highlighted.

    One violin is drawn per region, from the values of all loaded signals at that
    region. Up to three md5s present in both `md5s` and `shap_md5s` are highlighted
    with a lines+markers trace across regions. The plot is saved as an HTML file
    and a PNG file in the provided log directory.

    Signals are loaded with the module-level `chromsize_path` and normalization on.

    Args:
        regions_dict (Dict[int, Tuple]): A dictionary mapping region indices to their respective genomic coordinates
            (the first three tuple items are used as chrom, start, end in the violin name).
        md5s (List[str]): A list of md5s to analyze.
        hdf5_list (Path): Path to the list of hdf5 files to be used.
        logdir (Path): Directory where the resulting plot should be saved.
        name (str): Name used to save the resulting plot (will be part of the filename).
        shap_md5s (List[str]): md5s eligible for highlighting; only those also in `md5s` are used.
    """
    hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)
    hdf5_loader.load_hdf5s(hdf5_list, md5s, strict=True)
    signals = hdf5_loader.signals
    N = len(signals)

    nb_highlight = 3
    # Sort before slicing: set iteration order of strings varies between runs,
    # which made the highlighted samples non-reproducible.
    highlight_md5s = sorted(set(md5s) & set(shap_md5s))[:nb_highlight]

    traces = []
    highlight_values = {highlight_md5: [] for highlight_md5 in highlight_md5s}
    for region, region_bed in regions_dict.items():
        values = [signal[region] for signal in signals.values()]
        region_str = f"{region_bed[0]}:{region_bed[1]}-{region_bed[2]}"

        traces.append(
            go.Violin(
                y=values,
                name=region_str,
                points="all",
                box_visible=True,
                meanline_visible=True,
            )
        )

        for highlight_md5 in highlight_md5s:
            highlight_value = signals[highlight_md5][region]
            highlight_values[highlight_md5].append((region_str, highlight_value))

    marker_formats = [["cross", "black"], ["circle", "blue"], ["diamond", "red"]]
    for (highlight_md5, highlight_value), (symbol, color) in zip(
        highlight_values.items(), marker_formats
    ):
        if not highlight_value:
            # No region was given: nothing to draw (and zip(*[]) cannot be unpacked).
            continue
        x, y = zip(*highlight_value)
        traces.append(
            go.Scatter(
                x=x,
                y=y,
                mode="lines+markers",
                name=f"{highlight_md5}",
                marker={"size": 6, "symbol": symbol, "color": color},
            )
        )

    # Create the layout
    layout = go.Layout(
        title=f"Feature values distributions for {N} {name} samples (0blklst)",
        yaxis={"title": "z-score"},
        xaxis={"title": "Region"},
        showlegend=False,
    )

    # Create the figure with the data and layout
    fig = go.Figure(data=traces, layout=layout)
    fig.write_html(logdir / f"feature_values_{name}.html")

    width = 1200
    fig.write_image(
        logdir / f"feature_values_{name}.png", width=width, height=width * 3 / 4
    )
    # fig.show()
    # fig.show()

In [None]:
def plot_single_file(md5, zscore: bool = True):
    """
    Produce a violin plot of all feature values for a single sample.

    The signal is loaded from the module-level `hdf5_list_path`, normalized when
    `zscore` is True. The plot is written to "<md5>-<mode>.html" (relative path)
    and then displayed.
    """
    mode = "z-scores" if zscore else "raw values"

    loader = Hdf5Loader(chrom_file=chromsize_path, normalization=zscore)
    loaded = loader.load_hdf5s(hdf5_list_path, [md5], strict=True)
    # Only one md5 was requested: plot the first signal returned.
    sample_signal = list(loaded.signals.values())[0]

    fig = px.violin(
        data_frame=sample_signal,
        box=True,
        points="all",
        title=f"Violin plot for {md5} {mode}",
    )
    fig.write_html(f"{md5}-{mode}.html")
    fig.show()

In [None]:
def evaluate_casting_error(filepath: Path | str, dataset_name: str):
    """
    Report the largest absolute difference introduced by reading a dataset of an
    HDF5 file as float32 instead of its stored values.

    The maximum difference is always printed; when it exceeds 1e-4, the maximum
    stored value, the file path and the dataset name are printed as well.
    The file is opened read-only.
    """
    with h5py.File(filepath, "r") as h5_file:
        dataset: h5py.Dataset = h5_file[dataset_name]  # type: ignore
        stored_values: np.ndarray = dataset[:]  # type: ignore

        # Same data read through a float32 cast, compared element-wise.
        float32_values = dataset.astype(np.float32)[:]
        max_diff = np.max(np.abs(float32_values - stored_values))

        print(f"Max diff when casting: {max_diff}")
        if max_diff > 1e-4:
            for message in (
                "Induced casting error",
                f"Max value: {np.max(stored_values)}",
                f"Filepath: {filepath}",
                f"Dataset name: {dataset_name}",
            ):
                print(message)


# traces = []
# for filepath in paths:
#     with h5py.File(filepath, "r+") as f:
#         for _, group in f.items():
#             for dataset_name, dataset in list(group.items()):
#                 # Extract the values from the dataset
#                 values = dataset[:]

#                 # Create a violin trace
#                 trace = go.Violin(y=values, name=dataset_name)

#                 # Add the trace to the data list
#                 traces.append(trace)

#                 evaluate_casting_error(filepath, dataset_name)

#     # Create the layout
#     layout = go.Layout(title="Violin Plots", yaxis={"title": "Values"})

#     # Create the figure with the data and layout
#     fig = go.Figure(data=traces, layout=layout)

#     # Show the violin plot
#     fig.show()
#     traces = []

In [None]:
def evaluate_descriptive_stats(
    df: pd.DataFrame, metadata_df: pd.DataFrame, metadata: Metadata, logdir: Path
):
    """
    Compute per-row descriptive statistics of `df` and plot them per assay.

    `describe` is applied to every row of `df` with a fine percentile grid, and
    the result is joined with `metadata_df`. For each statistic other than
    count/mean/std, a violin plot (one violin per ASSAY value) is written to
    `logdir` as png and html.

    Returns:
        The statistics table joined with `metadata_df`.
    """
    percentiles = [0.01, *np.arange(0.05, 1, 0.05), 0.99, 0.999]
    row_stats = df.apply(pd.DataFrame.describe, percentiles=percentiles, axis=1)  # type: ignore

    # Statistic names are recorded before the join adds the metadata columns.
    plotted_metrics = set(row_stats.columns.values) - {"count", "mean", "std"}
    stats_df = row_stats.join(metadata_df)  # type: ignore

    # One plot per statistic, one violin per assay, assays in alphabetical order.
    assay_labels = metadata.label_counter(ASSAY, verbose=False).keys()
    category_orders = {ASSAY: sorted(assay_labels)}

    for column in (col for col in stats_df if col in plotted_metrics):
        fig = px.violin(
            data_frame=stats_df,
            x=column,
            y=ASSAY,
            box=True,
            points="all",
            title=f"Violin plot for {column}",
            color=ASSAY,
            category_orders=category_orders,
            height=800,
            hover_data={"md5sum": (df.index)},
        )
        file_stem = f"100kb_all_none_hdf5_{column}"
        fig.write_image(logdir / f"{file_stem}.png")
        fig.write_html(logdir / f"{file_stem}.html")

    return stats_df

In [None]:
# # Assuming you have a list of arrays
# hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)
# signals = hdf5_loader.load_hdf5s(hdf5_list_path, md5s, strict=True).signals
# df = pd.DataFrame.from_dict(signals, orient="index")
# # df.head()