In [10]:
"""Notebook to analyze the values in an HDF5 file."""
# %pip list | grep "ka"
# pylint: disable=redefined-outer-name, expression-not-assigned, import-error, not-callable, pointless-statement, no-value-for-parameter, undefined-variable, unused-argument

Package                  Version                Location
kaleido                  0.2.1+computecanada
packaging                23.1+computecanada
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

from epi_ml.core.data_source import EpiDataSource
from epi_ml.core.hdf5_loader import Hdf5Loader
from epi_ml.core.metadata import Metadata

In [3]:
# Ask IPython to render matplotlib figures inline in the cell output.
# NOTE(review): no matplotlib import or plotting call is visible in this
# notebook (the figures below are built with plotly) — confirm this is still needed.
%matplotlib inline

In [4]:
# Root of the input tree; every input path below is expressed relative to it.
base = Path("/lustre06/project/6007017/rabyj/epilap/input/")

hdf5_list_path = base / "hdf5_list/hg38_2023-01-epiatlas-freeze/100kb_all_none.list"
# Joined as a relative segment, like its siblings. An absolute right-hand
# operand makes pathlib discard `base` entirely, so the path would silently
# stop following `base` if the root were ever changed.
chromsize_path = base / "chromsizes/hg38.noy.chrom.sizes"
metadata_path = (
    base / "metadata/hg38_2023_epiatlas_dfreeze_plus_encode_noncore_formatted_JR.json"
)

# Directory that receives the figures written at the end of the notebook.
logdir = Path(
    "/lustre07/scratch/rabyj/epilap-logs/2023-01-epiatlas-freeze/hg38_100kb_all_none"
)

In [5]:
# Bundle the three input paths (HDF5 list, chromosome sizes, metadata) into one
# data-source object.
datasource = EpiDataSource(hdf5_list_path, chromsize_path, metadata_path)
# Build the metadata object from the metadata file exposed by the data source.
my_meta = Metadata(datasource.metadata_file)
# Loader set up with the chromosome-size file and normalization switched on.
# NOTE(review): what `normalization=True` does to the signal values is defined
# in epi_ml's Hdf5Loader, not here — confirm before interpreting value ranges.
hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)

In [6]:
# The return value is discarded, so this cell relies on `my_meta` being
# modified in place; re-running the cells above resets it.
# NOTE(review): presumably drops entries that have no "harmonized_donor_sex"
# label — confirm against epi_ml's Metadata implementation.
my_meta.remove_missing_labels("harmonized_donor_sex")

In [20]:
# Keep at most the first 1000 md5 identifiers found in the metadata.
md5s = list(my_meta.md5s)[:1000]

# Load the selected files in strict mode, then keep only the loaded signals.
loaded_hdf5s = hdf5_loader.load_hdf5s(hdf5_list_path, md5s, strict=True)
signals = loaded_hdf5s.signals

In [21]:
# Build a frame from the loaded signal arrays and transpose it so that each
# array becomes one column (presumably one column per loaded file — the exact
# layout of `signals` is defined by the loader).
df = pd.DataFrame(signals.values()).T

# Percentile grid: 1%, then 5%..95% in 5% steps, then 99% and 99.9%.
# The 5% steps are generated as integers and scaled afterwards, because a
# float-step `np.arange(0.05, 1, 0.05)` accumulates rounding error (it yields
# 0.15000000000000002 instead of 0.15, for instance).
percentiles = [0.01] + list(np.arange(5, 100, 5) / 100) + [0.99, 0.999]

# Per-column descriptive statistics, including the percentile grid above.
stats_df = df.describe(percentiles=percentiles)

In [22]:
# Map every row label of `stats_df` (one per statistic) to a Series holding
# that statistic's value for each column of `df`.
stats_dict = dict(stats_df.transpose().items())

In [23]:
# Make sure the output directory exists; otherwise the first write below
# fails on a fresh scratch space.
logdir.mkdir(parents=True, exist_ok=True)

# One violin plot per statistic, drawn from that statistic's values across all
# columns of the stats table.
for metric, values in stats_dict.items():
    fig = px.violin(x=values, box=True, points="all")
    fig.update_layout(title_text=f"Violin plot for {metric}")

    # Save each figure twice under the same stem: a static PNG and an HTML copy.
    output_stem = f"100kb_all_none_hdf5_{metric}"
    fig.write_image(logdir / f"{output_stem}.png")
    fig.write_html(logdir / f"{output_stem}.html")