In [3]:
"""Notebook to analyze the values in an HDF5 file."""
# %pip list | grep "ka"
# pylint: disable=redefined-outer-name, expression-not-assigned, import-error, not-callable, pointless-statement, no-value-for-parameter, undefined-variable, unused-argument

'Notebook to analyze the values in an HDF5 file.'

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

from epi_ml.core.data_source import EpiDataSource
from epi_ml.core.hdf5_loader import Hdf5Loader
from epi_ml.core.metadata import Metadata

# Metadata category used to group and color the samples in the violin plots below.
ASSAY = "assay_epiclass"

In [5]:
%matplotlib inline

In [6]:
# Root of the input files, under the user's home directory.
# The cluster location is kept here for reference:
# base = Path("/lustre06/project/6007017/rabyj/epilap/input/")
base = Path.home().joinpath("Projects/epilap/input")

# Inputs: list of HDF5 files, chromosome sizes file, and sample metadata (JSON).
hdf5_list_path = base.joinpath("hdf5_list", "100kb_all_none.list")
chromsize_path = base.joinpath("chromsizes", "hg38.noy.chrom.sizes")
metadata_path = base.joinpath(
    "metadata/hg38_2023_epiatlas_dfreeze_plus_encode_noncore_formatted_JR.json"
)

# Output: directory where the figures of this notebook are written.
base_logdir = Path.home().joinpath("Projects/epilap/output/logs")
logdir = base_logdir.joinpath("hg38_2022-epiatlas/hdf5_stats")

In [7]:
# Upper bound on the number of md5s taken from the HDF5 list for this analysis.
N_SAMPLES = 100

In [8]:
# Bundle the three input paths, then read the metadata file exposed by the datasource.
datasource = EpiDataSource(hdf5_list_path, chromsize_path, metadata_path)
my_meta = Metadata(datasource.metadata_file)
# NOTE(review): `normalization=True` presumably normalizes each signal at load time — confirm in Hdf5Loader.
hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)

In [9]:
# NOTE(review): the return value is discarded, so this presumably filters `my_meta`
# in place, dropping entries without a "harmonized_donor_sex" label — confirm.
# NOTE(review): the plots below group by ASSAY, not donor sex — confirm this filter is intended.
my_meta.remove_missing_labels("harmonized_donor_sex")

In [10]:
# Keys of the HDF5 list, truncated to the first N_SAMPLES entries.
listed_md5s = list(Hdf5Loader.read_list(hdf5_list_path).keys())
md5s = listed_md5s[:N_SAMPLES]

# One metadata row per selected md5 that is still present in `my_meta`,
# indexed by its "md5sum" field.
md5_metadata = pd.DataFrame(
    [my_meta[key] for key in md5s if key in my_meta]
).set_index("md5sum")
# md5_metadata.head()

In [11]:
# Load the selected md5s from the list file; only the `.signals` attribute of the result is kept.
# NOTE(review): `strict=True` presumably makes loading fail when a requested md5 is missing — confirm.
signals = hdf5_loader.load_hdf5s(hdf5_list_path, md5s, strict=True).signals

In [12]:
# `signals` is consumed as a mapping: with orient="index" each key becomes a row label
# and each value becomes that row's data.
df = pd.DataFrame.from_dict(signals, orient="index")
# df.head()

In [13]:
# Row-wise descriptive statistics: describe() is evaluated on every row of `df`
# with an extended percentile list (1%, 5% to 95% in steps of 5, 99% and 99.9%).
percentiles = [0.01, *np.arange(0.05, 1, 0.05), 0.99, 0.999]
stats_df = df.apply(lambda row: row.describe(percentiles=percentiles), axis=1)
# Names of the statistics that were computed (the columns of the describe() output).
metrics = set(stats_df.columns)

In [None]:
# print(sorted(metrics))

In [22]:
# Attach the per-md5 metadata to the statistics, aligned on the index.
# Only the statistic columns are kept before joining so that this cell can be re-run:
# otherwise a second execution would join metadata columns that are already present
# in `stats_df`, and `join` raises on overlapping column names.
stat_columns = [col for col in stats_df.columns if col in metrics]
stats_df = stats_df[stat_columns].join(md5_metadata)

In [23]:
# Create violin plots, one plot for each metric, and a violin for each assay (per plot)
# "count", "mean" and "std" are excluded from plotting.
allowed_metrics = metrics - {"count", "mean", "std"}
category_orders = {ASSAY: sorted(my_meta.label_counter(ASSAY, verbose=False).keys())}

# Writing a figure fails if the output directory is missing, so create it up front.
logdir.mkdir(parents=True, exist_ok=True)
# Output files are named after the HDF5 list that was analyzed.
file_prefix = f"{hdf5_list_path.stem}_hdf5"

for column in stats_df:
    # `stats_df` also holds the joined metadata columns; only plot the statistics.
    if column not in allowed_metrics:
        continue
    fig = px.violin(
        data_frame=stats_df,
        x=column,
        y=ASSAY,
        box=True,
        points="all",
        title=f"Violin plot for {column}",
        color=ASSAY,
        category_orders=category_orders,
        height=800,
        # Use the plotted frame's own index so hover labels always match its rows.
        hover_data={"md5sum": stats_df.index},
    )
    fig.write_image(logdir / f"{file_prefix}_{column}.png")
    fig.write_html(logdir / f"{file_prefix}_{column}.html")
    # fig.show()