In [None]:
"""Notebook to analyze the values in an HDF5 file."""
# %pip list | grep "ka"
# pylint: disable=redefined-outer-name, expression-not-assigned, import-error, not-callable, pointless-statement, no-value-for-parameter, undefined-variable, unused-argument, use-dict-literal

In [None]:
from __future__ import annotations

import json
from collections import Counter
from pathlib import Path
from typing import Dict, Iterable, List, Sequence, Tuple

import h5py
import numpy as np
import pandas as pd
import plotly.express as px  # type: ignore
import plotly.graph_objects as go  # type: ignore
from plotly.subplots import make_subplots  # type: ignore
from scipy import stats

from epi_ml.core.data_source import EpiDataSource  # pylint: disable=unused-import
from epi_ml.core.epiatlas_treatment import (  # pylint: disable=unused-import
    ACCEPTED_TRACKS,
)
from epi_ml.core.hdf5_loader import Hdf5Loader
from epi_ml.core.metadata import Metadata

# Category names shared by the cells below.
# ASSAY is used as a metadata label and as a plotted column in
# evaluate_descriptive_stats; CELL_TYPE selects a feature category in the
# ChromScore section. TRACK_TYPE is not referenced by the visible code.
ASSAY = "assay_epiclass"
TRACK_TYPE = "track_type"
CELL_TYPE = "harmonized_sample_ontology_intermediate"

In [None]:
%matplotlib inline

In [None]:
# base = Path("/lustre06/project/6007017/rabyj/epilap/input/")
# Root of the local project tree; the input/output paths below derive from it.
base = Path.home() / "Projects/epiclass"
input_base = base / "input"
output_base = base / "output"

# chromsize_path is handed to Hdf5Loader in later cells.
# metadata_path is only referenced by commented-out code in the visible notebook.
chromsize_path = input_base / "chromsizes" / "hg38.noy.chrom.sizes"
metadata_path = (
    input_base
    / "metadata/dfreeze-v2/hg38_2023-epiatlas-dfreeze_v2.1_w_encode_noncore_2.json"
)

# NOTE: `logdir` is a notebook-global that is rebound in the CNV and ChromScore
# sections; functions reading it pick up whichever value is current.
base_logdir = output_base / "logs"
logdir = base_logdir / "epiatlas-dfreeze-v2.1/hdf5_stats"

### Global bin metrics analysis

e.g. mean/stddev, median/IQR in raw hdf5 values, or other data like ChromScore or CNV.

In [None]:
# List file naming the HDF5 files to inspect (a small 10-sample list by default;
# the commented alternative below points to another list).
hdf5_list_path = input_base / "hdf5_list" / "100kb_all_none_10samples.list"
# hdf5_list_path = (
#     input_base
#     / "hdf5_list"
#     / "hg38_2023-01-epiatlas-freeze"
#     / "100kb_all_none_0blklst.list"
# )

# datasource = EpiDataSource(hdf5_list_path, chromsize_path, metadata_path)
# my_meta = Metadata(datasource.metadata_file)
# my_meta.display_labels("track_type")

# my_meta.select_category_subsets("track_type", ACCEPTED_TRACKS)
# my_meta.display_labels("track_type")

# `paths` is used below as a mapping (its .values() are file paths).
paths = Hdf5Loader.read_list(hdf5_list_path)

In [None]:
# Chromosome names; read_hdf5_sizes uses them as dataset keys inside each HDF5 file.
chroms = Hdf5Loader.load_chroms(chromsize_path)

In [None]:
# chroms

In [None]:
def read_hdf5_sizes(hdf5_path: Path | str, chroms: List[str]) -> Dict[str, int]:
    """Return the number of values stored for each chromosome of an HDF5 file.

    The datasets are looked up under the first top-level key of the file.

    Args:
        hdf5_path: Path to the HDF5 file to inspect.
        chroms: Chromosome names; each must be a dataset name under the first
            top-level key.

    Returns:
        A dict mapping each chromosome name to the length of its dataset.

    Raises:
        IndexError: If the file has no top-level key.
        KeyError: If a chromosome is missing from the file.
    """
    with h5py.File(hdf5_path, "r") as file:
        header = list(file.keys())[0]
        hdf5_data = file[header]
        # len() on an h5py dataset only consults its shape, so the values
        # themselves are never loaded into memory.
        chrom_lengths = {chrom: len(hdf5_data[chrom]) for chrom in chroms}  # type: ignore
    return chrom_lengths

In [None]:
# Bin counts per chromosome are read from the first listed file only.
# NOTE(review): presumably every file in the list shares the same resolution — confirm.
a_file = list(paths.values())[0]
chr_bin_sizes = read_hdf5_sizes(a_file, chroms)

In [None]:
# Displays how many chromosomes were read.
len(chr_bin_sizes)

In [None]:
def plot_feature_positions(
    feature_dict: Dict[str, Sequence[int]],
    chr_bin_sizes: Dict[str, int],
    output_name: str,
    logdir: Path,
):
    """Draw every feature set as one row of markers along the genome.

    Feature sets are stacked from smallest (bottom) to largest (top). Chromosome
    ends are marked with dash-dotted vertical lines (except after the last
    chromosome) and each chromosome is labelled just above the plot area.

    Args:
        feature_dict: Feature set name -> positions, used directly as x values.
        chr_bin_sizes: Chromosome name -> size, in the same unit/resolution as
            the feature positions. Iteration order defines chromosome order.
        output_name: Basename (no extension) of the written files.
        logdir: Directory receiving the .html, .png and .svg outputs.
    """
    fig = go.Figure(layout=go.Layout(autosize=False, width=1500, height=500))

    # Smallest feature set first, so it ends up on the lowest row.
    sets_by_size = sorted(feature_dict.items(), key=lambda pair: len(pair[1]))
    for row_idx, (set_name, positions) in enumerate(sets_by_size):
        nb_positions = len(positions)
        fig.add_trace(
            go.Scatter(
                x=positions,
                y=row_idx * np.ones(nb_positions),
                mode="markers",
                marker={"color": "red", "size": 2},
                name=f"{set_name} ({nb_positions})",
            )
        )

    # Walk through the chromosomes, tracking the cumulative end position.
    chrom_names = list(chr_bin_sizes)
    last_idx = len(chrom_names) - 1
    chrom_end = 0
    for chrom_idx, chrom_name in enumerate(chrom_names):
        chrom_end += chr_bin_sizes[chrom_name]

        # Boundary line, skipped for the final chromosome.
        if chrom_idx < last_idx:
            fig.add_shape(
                type="line",
                xref="x",
                yref="paper",  # y spans the full plot height
                x0=chrom_end,
                x1=chrom_end,
                y0=0,
                y1=1,
                line={"color": "black", "width": 1, "dash": "dashdot"},
            )

        # Labels alternate between two heights so neighbours do not overlap.
        label_height = 1 + 0.05 * (chrom_idx % 2)
        fig.add_annotation(
            x=chrom_end - 800,
            y=label_height,
            text=chrom_name,
            showarrow=False,
            xref="x",
            yref="paper",
            xanchor="left",
            yanchor="bottom",
            font={"family": "Arial", "size": 10, "color": "RoyalBlue"},
        )

    total_size = sum(chr_bin_sizes.values())
    row_labels = [f"{set_name} (n={len(positions)})" for set_name, positions in sets_by_size]
    fig.update_layout(
        title="Important feature positions",
        showlegend=False,
        xaxis_title="Genomic position",
        xaxis={"range": [-10, total_size + 10]},
        yaxis_title="Set of features",
        yaxis={
            "tickmode": "array",
            "tickvals": list(range(len(sets_by_size))),
            "ticktext": row_labels,
        },
    )

    fig.write_html(logdir / f"{output_name}.html")
    for extension in ("png", "svg"):
        fig.write_image(logdir / f"{output_name}.{extension}")
    fig.show()

In [None]:
# Directory holding the global important-feature files (JSON here, bed files later).
# NOTE(review): this resolves to the same location as
# output_base / "models/SHAP/global_task_features/global_info" with the current
# path configuration — consider deriving it from output_base.
global_important_features_dir = (
    Path.home() / "Projects/epiclass/output/models/SHAP/global_task_features/global_info"
)
global_important_features_path = (
    global_important_features_dir / "global_task_features.json"
)
# The parsed JSON is passed to plot_feature_positions as its feature_dict.
with open(global_important_features_path, "r", encoding="utf8") as f:
    global_important_features = json.load(f)

In [None]:
# Writes important_features_on_genome.{html,png,svg} next to the input JSON.
plot_feature_positions(
    global_important_features,
    chr_bin_sizes,
    "important_features_on_genome",
    logdir=global_important_features_dir,
)

In [None]:
def compute_enrichment_per_chrom(
    chr_bin_sizes: Dict[str, int], bed_dir: Path
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute the enrichment per chromosome of each feature set.

    Each `*.bed` file in `bed_dir` is one feature set, named after the file stem
    with its last `_`-separated token removed. For a given set and chromosome,
    the enrichment is

        (fraction of the set's records on the chromosome)
        / (fraction of all bins belonging to the chromosome)

    so 1.0 means "as many features as expected from chromosome size alone".

    Blank lines and `#` / `track` / `browser` header lines are ignored. Files are
    processed in sorted order so the row order of the results is reproducible.

    Args:
        chr_bin_sizes (Dict[str, int]): The chromosome sizes (number of bins).
        bed_dir (Path): The directory containing the bed files of the same resolution.

    Returns:
        pd.DataFrame: Relative enrichment, one row per feature set and one column
            per chromosome of `chr_bin_sizes`. NaN where a set has no record.
        pd.DataFrame: Record count per chromosome, same layout. NaN where a set
            has no record.

    Raises:
        KeyError: If a bed file names a chromosome absent from `chr_bin_sizes`.
    """
    header_prefixes = ("#", "track", "browser")

    chr_count = {}
    relative_chr_count = {}
    for bed_file in sorted(bed_dir.glob("*.bed")):
        with open(bed_file, "r", encoding="utf8") as f:
            # Only genuine records may contribute to the counts and to the
            # denominator; a trailing blank line used to be counted as one.
            record_chroms = [
                line.split("\t", 1)[0].strip()
                for line in f
                if line.strip() and not line.startswith(header_prefixes)
            ]

        nb_records = len(record_chroms)
        set_chr_count = Counter(record_chroms)
        set_relative_chr_count = {
            chrom: count / nb_records for chrom, count in set_chr_count.items()
        }

        set_name = bed_file.stem.rsplit("_", 1)[0]
        chr_count[set_name] = set_chr_count
        relative_chr_count[set_name] = set_relative_chr_count

    # Chrom relative sizes (total hoisted out of the comprehension)
    total_bins = sum(chr_bin_sizes.values())
    relative_chromsize = {
        chrom: size / total_bins for chrom, size in chr_bin_sizes.items()
    }

    # Compute the enrichment
    relative_enrichment = {}
    for set_name, set_counter in relative_chr_count.items():
        unknown_chroms = sorted(set(set_counter) - set(relative_chromsize))
        if unknown_chroms:
            raise KeyError(
                f"Feature set '{set_name}' uses chromosomes missing from "
                f"chr_bin_sizes: {unknown_chroms}"
            )
        relative_enrichment[set_name] = {
            chrom: count / relative_chromsize[chrom]
            for chrom, count in set_counter.items()
        }

    # Inner dicts are aligned on the chromosome index, then sets become rows.
    chrom_index = list(chr_bin_sizes.keys())
    relative_enrichment = pd.DataFrame(
        data=relative_enrichment, index=chrom_index
    ).transpose()
    chr_count = pd.DataFrame(data=chr_count, index=chrom_index).transpose()
    return relative_enrichment, chr_count  # type: ignore

In [None]:
# One bed file per feature set is expected in this directory.
bed_dir = global_important_features_dir / "global_task_features_beds"
enrichment, chr_count = compute_enrichment_per_chrom(chr_bin_sizes, bed_dir)

In [None]:
# Persist both tables next to the important-feature files.
enrichment.to_csv(global_important_features_dir / "chromosome_feature_enrichment.csv")
chr_count.to_csv(global_important_features_dir / "chromosome_feature_count.csv")

In [None]:
def plot_bin_metrics(
    npz_file: Path,
    output_name: str,
    chr_bin_sizes: Dict[str, int],
    output_dir: Path | None = None,
):
    """Plot per-bin metrics stored in a numpy .npz file and save them as HTML.

    Two stacked subplots sharing the x-axis are produced: the mean of each bin
    with its standard deviation as error bars (top), and the median of each bin
    with half its IQR as error bars (bottom). Chromosome ends are marked and
    labelled on both subplots.

    Args:
        npz_file: .npz archive containing the arrays "mean", "std", "median"
            and "iqr".
        output_name: Basename (no extension) of the HTML file to write.
        chr_bin_sizes: Chromosome name -> number of bins; iteration order gives
            the chromosome order along the x-axis.
        output_dir: Directory receiving the HTML file. When omitted, the
            notebook-global `logdir` is used (historical behaviour); pass it
            explicitly to avoid depending on which section last rebound
            `logdir`.
    """
    with np.load(npz_file) as data:
        bin_metrics = {k: data[k] for k in data.files}

    means = bin_metrics["mean"]
    std_devs = bin_metrics["std"]
    medians = bin_metrics["median"]
    iqrs = bin_metrics["iqr"]

    # Indices for x-axis, assuming each point's x-coordinate is its index
    indices = np.arange(len(means))

    fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

    # Add scatter plot for means with standard deviation as error bars to the first subplot
    fig.add_trace(
        go.Scatter(
            x=indices,
            y=means,
            mode="markers",
            name="Mean with Std Dev",
            marker=dict(size=1),  # Smaller marker size
            error_y=dict(
                type="data",
                array=std_devs,
                visible=True,
                thickness=1,  # Thinner error bars
                width=2,  # Narrower end caps on error bars
            ),
        ),
        row=1,
        col=1,  # Position of the trace in the subplot grid
    )

    # Add scatter plot for medians with IQR as error bars to the second subplot
    fig.add_trace(
        go.Scatter(
            x=indices,
            y=medians,
            mode="markers",
            name="Median with IQR",
            marker=dict(size=1),  # Smaller marker size
            error_y=dict(
                type="data",
                array=iqrs / 2,  # Approximation: symmetric bars of half the IQR
                visible=True,
                thickness=1,  # Thinner error bars
                width=2,  # Narrower end caps on error bars
            ),
        ),
        row=2,
        col=1,  # Position of the trace in the subplot grid
    )

    # Update layout for clarity
    fig.update_layout(
        title="Separate Metrics with Error Bars",
        xaxis_title="Bin position",
        yaxis_title="Mean Values",
        legend_title="Metric Type",
    )

    # Specific labels for the second subplot
    fig.update_yaxes(title_text="Median Values", row=2, col=1)

    # Add a vertical line and a label at each chromosome end, on both subplots
    for row in [1, 2]:
        line_position = 0
        for chrom in chr_bin_sizes:
            line_position += chr_bin_sizes[chrom]
            # Vertical line on the subplot selected by row/col
            fig.add_shape(
                type="line",
                x0=line_position,
                x1=line_position,
                y0=0,  # Start from bottom of the subplot area
                y1=1,  # Extend to the top of the subplot area
                line=dict(
                    color="black",
                    width=1,
                    dash="dashdot",
                ),
                xref="x",
                yref="y2 domain",  # y given as a fraction of the axis domain
                row=row,
                col=1,
            )

            fig.add_annotation(
                x=line_position - 1000,  # Slightly left of the chromosome boundary
                y=0.95,  # Near the top of the subplot (fraction of the axis domain)
                text=chrom,  # Chromosome label
                showarrow=False,  # Do not show an arrow pointing to the annotation
                xref="x",  # Use the x-axis for positioning
                yref="y2 domain",
                xanchor="left",  # Anchor the text to the left of the x position
                yanchor="bottom",  # Anchor the text to the bottom of the y position
                font=dict(family="Arial", size=10, color="RoyalBlue"),
                row=row,
                col=1,
            )

    # Save the figure; fall back on the notebook-global `logdir` when no
    # explicit destination was given.
    destination = output_dir if output_dir is not None else logdir
    fig.write_html(destination / f"{output_name}.html")

In [None]:
def read_important_features(global_task_features_path) -> Dict[str, List[int]]:
    """Parse the global task features JSON file and return its content.

    Args:
        global_task_features_path: Location of the UTF-8 encoded JSON file.

    Returns:
        The decoded JSON object, annotated as category name -> feature positions.
    """
    with open(global_task_features_path, mode="r", encoding="utf8") as json_handle:
        return json.load(json_handle)

In [None]:
def read_cancer_important_bins() -> Dict[str, List[int]]:
    """Load the "bin" column of the two cancer bed-details TSV files.

    Both files are read from a fixed directory under the user's home and parsed
    as header-less, tab-separated tables with the columns
    chr / start / end / bin / details.

    Returns:
        Feature set name -> list of the values of the "bin" column, with the
        blood subset first.
    """
    cancer_dir = (
        Path.home() / "scratch/epiclass/join_important_features/global_info/cancer"
    )
    column_names = ["chr", "start", "end", "bin", "details"]
    tsv_per_set = {
        "cancer_intersection_merge_sampling_blood_subset": (
            "cancer_intersection_merge_samplings_bed-details_blood_subset.tsv"
        ),
        "cancer_intersection_merge_sampling": (
            "cancer_intersection_merge_samplings_bed-details_2.tsv"
        ),
    }
    return {
        set_name: list(
            pd.read_csv(cancer_dir / tsv_name, names=column_names, sep="\t")["bin"]
        )
        for set_name, tsv_name in tsv_per_set.items()
    }

In [None]:
def plot_important_features_metrics(
    important_features: Dict[str, List[int]],
    npz_file_path: Path,
    logdir: Path | None = None,
    include_categories: Iterable[str] | None = None,
    md5s: Iterable[str] | None = None,
    seed: int | None = None,
) -> Dict[str, Tuple[int, float, float]]:
    """Using the important features positions, plot (violin) the mean values according to the given npz file.

    For each category, one figure with three violins is produced: the selected
    features, a random feature set of the same size (drawn without replacement),
    and the global distribution of all bins.

    A two-sample KS test compares the selected features with the random set and
    with the global distribution; both p-values are annotated on the plot.

    Args:
    - important_features: A dictionary with category names as keys, and lists of feature positions as values.
    - npz_file_path: The path to the npz file containing the bin metrics (only the "mean" array is used).
    - logdir: The directory where to save the plots (html + png). Nothing is written when omitted.
    - include_categories: The categories to include in the plot. None or empty means all categories.
    - md5s: Currently unused; kept for interface compatibility.
    - seed: Optional seed making the random feature sets reproducible. When omitted,
      numpy's global random state is used, as before.

    Returns:
    - A dictionary with category names as keys, and (sample size, p-value vs random,
      p-value vs global) tuples as values. Categories without any feature are skipped.
    """
    with np.load(npz_file_path) as data:
        means = data["mean"]

    # Materialise the filter once: a one-shot iterable (e.g. a generator) would
    # otherwise be exhausted by the first membership test. A lone string is
    # treated as a single category name rather than as a set of characters.
    if include_categories is None:
        allowed_categories = set()
    elif isinstance(include_categories, str):
        allowed_categories = {include_categories}
    else:
        allowed_categories = set(include_categories)

    rng = np.random.default_rng(seed) if seed is not None else None

    pvals = {}
    for category_name, features_pos in important_features.items():
        if allowed_categories and category_name not in allowed_categories:
            continue

        N = len(features_pos)
        if N == 0:
            # The KS test rejects empty samples; skip instead of aborting the
            # remaining categories.
            print(f"Skipping '{category_name}': no feature positions.")
            continue

        fig = go.Figure()

        selected_features = means[np.asarray(features_pos, dtype=int)]
        fig.add_trace(
            go.Violin(
                y=selected_features,
                name=f"{category_name} features (N={N})",
                box_visible=True,
                meanline_visible=True,
                points="all",
            )
        )

        # Random features comparison
        if rng is None:
            random_features = np.random.choice(means, size=N, replace=False)
        else:
            random_features = rng.choice(means, size=N, replace=False)
        fig.add_trace(
            go.Violin(
                y=random_features,
                name=f"Random features (N={N})",
                box_visible=True,
                meanline_visible=True,
                points="all",
            )
        )
        _, pval_random = stats.ks_2samp(selected_features, random_features)

        # Global distribution comparison
        fig.add_trace(
            go.Violin(
                y=means,
                name=f"All features N={len(means)}",
                box_visible=True,
                meanline_visible=True,
                points="all",
            )
        )
        _, pval_global = stats.ks_2samp(selected_features, means)

        # Annotations for p-values, placed at the top of the relevant violin
        fig.add_annotation(
            x=1,
            y=max(selected_features),
            text=f"p-value (Selected vs. Random): {pval_random:.3e}",
            showarrow=False,
            xref="x",
            yref="y",
        )
        fig.add_annotation(
            x=2,
            y=max(means),
            text=f"p-value (Selected vs. Global): {pval_global:.3e}",
            showarrow=False,
            xref="x",
            yref="y",
        )

        # Small points
        fig.update_traces(marker=dict(size=2))

        fig.update_layout(
            title=f"Mean values for {category_name} features",
            xaxis_title="Feature set",
            yaxis_title="Mean values",
            violinmode="group",
        )

        pvals[category_name] = (N, pval_random, pval_global)

        if logdir:
            fig.write_html(logdir / f"{npz_file_path.stem}_{category_name}_violin.html")
            fig.write_image(logdir / f"{npz_file_path.stem}_{category_name}_violin.png")

        fig.show()

    return pvals

### CNV hdf5 values

In [None]:
# test_file = "/home/local/USHERBROOKE/rabj2301/Projects/epiclass/input/hdf5_list/CNV/test_6samples_metrics.npz"
# metrics_arrays = np.load(test_file)
# for array in metrics_arrays.values():
#     print(array)

In [None]:
# CNV section: rebinds the notebook-global `logdir` to the CNV stats directory.
logdir = base_logdir / "epiatlas-dfreeze-v2.1/hdf5_stats" / "CNV"
npz_file = logdir / "CNV_EpiAtlas_cancer_onlyLeukemia_100kb_all_none_metrics.npz"
if not npz_file.exists():
    print(f"{npz_file} does not exist")

# with np.load(npz_file) as data:
#     for k in data.files:
#         print(k, data[k].shape, data[k].dtype, data[k])

In [None]:
# plot_bin_metrics(npz_file, output_name=npz_file.stem, chr_bin_sizes=chr_bin_sizes)

In [None]:
# cancer_subset_features = read_cancer_important_bins()
# assert len(list(cancer_subset_features.values())[0]) == 233

In [None]:
# selected_features = read_cancer_important_bins()

# CNV_pvals = plot_important_features_metrics(selected_features, npz_file, logdir)

In [None]:
# df = pd.DataFrame.from_dict(
#     CNV_pvals, orient="index", columns=["Feature set size", "pval_random", "pval_global"]
# )
# df.to_csv(logdir / f"{npz_file.stem}_pvals_ks.csv")

### ChromScore hdf5 values

In [None]:
# NOTE(review): hard-coded, user-specific absolute path. It is not referenced
# anywhere else in the visible notebook (the next cell builds `features_path`
# from Path.home() instead) — confirm whether it is still needed.
important_features_path = "/home/local/USHERBROOKE/rabj2301/scratch/epiclass/join_important_features/global_info/global_task_features.json"

In [None]:
# ChromScore section: rebinds the notebook-globals `logdir` and `npz_file`.
logdir = base_logdir / "epiatlas-dfreeze-v2.1/hdf5_stats" / "ChromScore"
npz_file = logdir / "ChromScore_metrics_raw.npz"
# Only warns; later cells still receive the missing path.
if not npz_file.exists():
    print(f"{npz_file} does not exist")

# Important features as category name -> feature positions.
features_dir = (
    Path.home()
    / "scratch/epiclass/join_important_features/hg38_100kb_all_none/global_info"
)
features_path = features_dir / "global_task_features.json"
selected_features = read_important_features(features_path)

In [None]:
# Only the CELL_TYPE category is plotted. No logdir is passed, so the figure is
# shown but not written to disk.
chromScore_pvals = plot_important_features_metrics(
    important_features=selected_features,
    npz_file_path=npz_file,
    include_categories=[CELL_TYPE],
)

### Other

In [None]:
# shap_md5s_path = input_base / "hdf5_list" / "md5_shap_assay_explain.list"
# with open(shap_md5s_path, "r", encoding="utf8") as f:
#     shap_md5s = set(f.read().splitlines())


def analyze_feature_vals(
    regions_dict: Dict[int, Tuple],
    md5s: List[str],
    hdf5_list: Path,
    logdir: Path,
    name: str,
    shap_md5s: List[str],
):
    """
    Generate and save a violin plot of provided feature values for the provided md5s, with some md5s highlighted.

    One violin is drawn per region, using the normalized signal of every loaded
    sample at that region index. Up to three md5s present in both `md5s` and
    `shap_md5s` are highlighted with a lines+markers trace across regions; they
    are picked in the order they appear in `md5s`, so the choice is reproducible.
    The plot is saved as an HTML file and a PNG file in the provided log directory.

    Signals are loaded with the notebook-global `chromsize_path`.

    Args:
        regions_dict (Dict[int, Tuple]): A dictionary mapping region indices to their respective genomic coordinates
            (chrom, start, end are read from the first three tuple items).
        md5s (List[str]): A list of md5s to analyze.
        hdf5_list (Path): Path to the list of hdf5 files to be used.
        logdir (Path): Directory where the resulting plot should be saved.
        name (str): Name used to save the resulting plot (will be part of the filename).
        shap_md5s (List[str]): md5s eligible for highlighting.
    """
    hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)
    hdf5_loader.load_hdf5s(hdf5_list, md5s, strict=True)
    N = len(hdf5_loader.signals)

    # One marker style per highlighted sample.
    marker_formats = [["cross", "black"], ["circle", "blue"], ["diamond", "red"]]
    nb_highlight = len(marker_formats)

    # Keep the caller's order (deduplicated) instead of slicing a set, whose
    # iteration order is arbitrary and changes between interpreter runs.
    shap_md5s_set = set(shap_md5s)
    highlight_md5s = [md5 for md5 in dict.fromkeys(md5s) if md5 in shap_md5s_set][
        :nb_highlight
    ]

    traces = []
    highlight_values = {highlight_md5: [] for highlight_md5 in highlight_md5s}
    for region, region_bed in regions_dict.items():
        values = [signal[region] for signal in hdf5_loader.signals.values()]
        region_str = f"{region_bed[0]}:{region_bed[1]}-{region_bed[2]}"

        trace = go.Violin(
            y=values,
            name=region_str,
            points="all",
            box_visible=True,
            meanline_visible=True,
        )
        traces.append(trace)

        for highlight_md5 in highlight_md5s:
            highlight_value = hdf5_loader.signals[highlight_md5][region]
            highlight_values[highlight_md5].append((region_str, highlight_value))

    for (highlight_md5, highlight_value), marker_format in zip(
        highlight_values.items(), marker_formats
    ):
        if not highlight_value:
            # No region was provided: nothing to draw for this sample.
            continue
        x, y = zip(*highlight_value)
        symbol, color = marker_format
        highlight_trace = go.Scatter(
            x=x,
            y=y,
            mode="lines+markers",
            name=f"{highlight_md5}",
            marker={"size": 6, "symbol": symbol, "color": color},
        )
        traces.append(highlight_trace)

    # Create the layout
    layout = go.Layout(
        title=f"Feature values distributions for {N} {name} samples (0blklst)",
        yaxis={"title": "z-score"},
        xaxis={"title": "Region"},
        showlegend=False,
    )

    # Create the figure with the data and layout
    fig = go.Figure(data=traces, layout=layout)
    fig.write_html(logdir / f"feature_values_{name}.html")

    # PNG export with a 4:3 aspect ratio
    width = 1200
    fig.write_image(
        logdir / f"feature_values_{name}.png", width=width, height=width * 3 / 4
    )
    # fig.show()

In [None]:
def plot_single_file(md5, zscore: bool = True):
    """Show a violin plot of all feature values of one sample, and save it as HTML.

    The sample is looked up in the notebook-global `hdf5_list_path` and loaded
    with the notebook-global `chromsize_path`. The HTML file is written to the
    current working directory as "<md5>-<mode>.html".

    Args:
        md5: md5sum identifying the sample to load.
        zscore: Whether the loader normalizes the signal; also selects the
            "z-scores" / "raw values" label used in the title and filename.
    """
    mode = "z-scores" if zscore else "raw values"

    loader = Hdf5Loader(chrom_file=chromsize_path, normalization=zscore)
    loaded = loader.load_hdf5s(hdf5_list_path, [md5], strict=True)
    sample_values = list(loaded.signals.values())[0]

    fig = px.violin(
        data_frame=sample_values,
        box=True,
        points="all",
        title=f"Violin plot for {md5} {mode}",
    )
    fig.write_html(f"{md5}-{mode}.html")
    fig.show()

In [None]:
def evaluate_casting_error(filepath: Path | str, dataset_name: str):
    """Report the largest absolute difference caused by reading a dataset as float32.

    The dataset is read once with its stored dtype and once cast to float32; the
    maximum absolute difference is always printed, and extra details are printed
    when it exceeds 1e-4.

    Args:
        filepath: HDF5 file to open (read-only).
        dataset_name: Name/path of the dataset inside the file.
    """
    with h5py.File(filepath, "r") as hdf5_file:
        dataset = hdf5_file[dataset_name]
        stored_values = dataset[:]

        # Read the same dataset again, converted to float32 on the way out.
        float32_values = dataset.astype(np.float32)[:]
        max_diff = np.max(np.abs(float32_values - stored_values))
        print(f"Max diff when casting: {max_diff}")

        if max_diff > 1e-4:
            details = (
                "Induced casting error",
                f"Max value: {np.max(stored_values)}",
                f"Filepath: {filepath}",
                f"Dataset name: {dataset_name}",
            )
            for message in details:
                print(message)


# traces = []
# for filepath in paths:
#     with h5py.File(filepath, "r+") as f:
#         for _, group in f.items():
#             for dataset_name, dataset in list(group.items()):
#                 # Extract the values from the dataset
#                 values = dataset[:]

#                 # Create a violin trace
#                 trace = go.Violin(y=values, name=dataset_name)

#                 # Add the trace to the data list
#                 traces.append(trace)

#                 evaluate_casting_error(filepath, dataset_name)

#     # Create the layout
#     layout = go.Layout(title="Violin Plots", yaxis={"title": "Values"})

#     # Create the figure with the data and layout
#     fig = go.Figure(data=traces, layout=layout)

#     # Show the violin plot
#     fig.show()
#     traces = []

In [None]:
def evaluate_descriptive_stats(
    df: pd.DataFrame, metadata_df: pd.DataFrame, metadata: Metadata, logdir: Path
):
    """Compute row-wise descriptive statistics and plot them per assay.

    Each row of `df` is summarized with `describe` (percentiles 1%, 5%..95% in
    steps of 5%, 99% and 99.9%), the result is joined with `metadata_df`, and one
    violin plot per statistic (all except count, mean and std) is written to
    `logdir` as PNG and HTML, with one violin per assay.

    Args:
        df: Values to describe; its index is shown as "md5sum" on hover.
        metadata_df: Metadata joined on the index of the statistics; must
            provide the assay column.
        metadata: Metadata object providing the assay labels (plot ordering).
        logdir: Directory receiving the figures.

    Returns:
        The statistics joined with the metadata.
    """
    percentiles = [0.01, *np.arange(0.05, 1, 0.05), 0.99, 0.999]
    described = df.apply(pd.DataFrame.describe, percentiles=percentiles, axis=1)  # type: ignore
    metric_names = set(described.columns.values)
    stats_df = described.join(metadata_df)  # type: ignore

    # One figure per remaining statistic, one violin per assay in each figure.
    plotted_metrics = metric_names - {"count", "mean", "std"}
    assay_labels = sorted(metadata.label_counter(ASSAY, verbose=False).keys())
    category_orders = {ASSAY: assay_labels}

    for column in (col for col in stats_df if col in plotted_metrics):
        fig = px.violin(
            data_frame=stats_df,
            x=column,
            y=ASSAY,
            box=True,
            points="all",
            title=f"Violin plot for {column}",
            color=ASSAY,
            category_orders=category_orders,
            height=800,
            hover_data={"md5sum": (df.index)},
        )
        figure_stem = f"100kb_all_none_hdf5_{column}"
        fig.write_image(logdir / f"{figure_stem}.png")
        fig.write_html(logdir / f"{figure_stem}.html")

    return stats_df

In [None]:
# # Assuming you have a list of arrays
# hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)
# signals = hdf5_loader.load_hdf5s(hdf5_list_path, md5s, strict=True).signals
# df = pd.DataFrame.from_dict(signals, orient="index")
# # df.head()