In [None]:
"""Notebook to analyze the values in an HDF5 file."""
# %pip list | grep "ka"
# pylint: disable=redefined-outer-name, expression-not-assigned, import-error, not-callable, pointless-statement, no-value-for-parameter, undefined-variable, unused-argument

In [None]:
from pathlib import Path

import h5py
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from epi_ml.core.data_source import EpiDataSource
from epi_ml.core.epiatlas_treatment import ACCEPTED_TRACKS
from epi_ml.core.hdf5_loader import Hdf5Loader
from epi_ml.core.metadata import Metadata

# Metadata category names. ASSAY is used further down both as a metadata
# category (label counting) and as the column that groups/colours the violins.
ASSAY = "assay_epiclass"
TRACK_TYPE = "track_type"

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
# NOTE(review): the figures visible in this notebook are built with plotly,
# so this may be unnecessary — confirm before removing.
%matplotlib inline

In [None]:
# Input locations. Alternative (cluster) root kept for reference:
# base = Path("/lustre06/project/6007017/rabyj/epilap/input/")
base = Path.home().joinpath("Projects/epilap/input")

# Chromosome sizes file and metadata JSON, both under the input root.
chromsize_path = base.joinpath("chromsizes", "hg38.noy.chrom.sizes")
metadata_path = base.joinpath("metadata/hg38_2023_epiatlas_dfreeze_formatted_JR.json")

# Output location for the figures written by this notebook.
base_logdir = Path.home().joinpath("Projects/epilap/output/logs")
logdir = base_logdir.joinpath("hg38_2022-epiatlas/hdf5_stats")

In [None]:
# Text file listing the HDF5 files to analyze (one path per line).
# Previous choice, kept for reference:
# hdf5_list_path = base / "hdf5_list" / "100kb_all_none.list"
hdf5_list_path = base.joinpath(
    "hdf5_list", "hg38_2023-01-epiatlas-freeze", "compare_winsorize.list"
)

In [None]:
# Intended cap on the number of files to analyze. In the cells visible here it
# is only referenced by commented-out code, so it currently has no effect.
N_SAMPLES = 100

In [None]:
# Bundle the three input files, then load the metadata file the datasource exposes.
datasource = EpiDataSource(hdf5_list_path, chromsize_path, metadata_path)
my_meta = Metadata(datasource.metadata_file)

# Show the track-type labels before filtering.
my_meta.display_labels(TRACK_TYPE)

# Restrict the metadata to ACCEPTED_TRACKS (presumably drops every sample whose
# track type is not in that list — confirm against Metadata), then show the
# labels again so the effect of the filter is visible.
my_meta.select_category_subsets(TRACK_TYPE, ACCEPTED_TRACKS)
my_meta.display_labels(TRACK_TYPE)

In [None]:
# Presumably drops samples that have no "harmonized_donor_sex" label —
# confirm against Metadata.remove_missing_labels.
my_meta.remove_missing_labels("harmonized_donor_sex")

In [None]:
# # md5s = set(list(Hdf5Loader.read_list(hdf5_list_path).keys())[0:N_SAMPLES])
# md5s = set(list(Hdf5Loader.read_list(hdf5_list_path).keys()))
# # for md5 in md5s:
# #     if md5 not in my_meta:
# #         raise IndexError(f"Missing metadata for {md5}")

# df_md5_metadata = pd.DataFrame([my_meta[md5] for md5 in md5s if md5 in my_meta])
# df_md5_metadata.set_index("md5sum", inplace=True)

In [None]:
# print(f"{df_md5_metadata.shape[0]} files to analyze.")
# print(df_md5_metadata[TRACK_TYPE].value_counts())
# print(df_md5_metadata[ASSAY].value_counts())

In [None]:
# df_md5_metadata.head(10)

In [None]:
# Each non-empty line of the list file is the path of one HDF5 file
# (every entry is later handed to h5py.File).
with open(hdf5_list_path, "r", encoding="utf8") as f:
    # Skip blank lines so that no empty path is passed on to h5py.
    paths = [line.strip() for line in f if line.strip()]

In [None]:
# One figure per HDF5 file: every dataset found inside the file's top-level
# groups becomes one violin in that file's figure.
for filepath in paths:
    # Start from an empty list for each file so a figure never mixes files.
    traces = []

    # Open read-only: this cell only inspects values. "r+" would require write
    # access to the inputs and risks modifying them.
    with h5py.File(filepath, "r") as hdf5_file:
        for group in hdf5_file.values():
            for dataset_name, dataset in group.items():
                # Read the full dataset into memory.
                values = dataset[:]

                # One violin per dataset, labelled with the dataset name.
                traces.append(go.Violin(y=values, name=dataset_name))

                # # Cast to float32 and compare max diff
                # casted_dataset = dataset.astype(np.float32)[:]
                # diff = np.abs(casted_dataset - values)
                # max_diff = np.max(diff)
                # # print(f"Max diff when casting: {max_diff}")
                # if max_diff > 1e-4:
                #     print("Induced casting error")
                #     print(f"Max value: {np.max(values)}")
                #     print(f"Filepath: {filepath}")
                #     print(f"Dataset name: {dataset_name}")

    # Build and show the figure once the file is closed; the values were
    # already copied into memory above.
    layout = go.Layout(title="Violin Plots", yaxis={"title": "Values"})
    fig = go.Figure(data=traces, layout=layout)
    fig.show()

# Release the references to the last file's traces.
traces = []

In [None]:
# Loader used below to read signals from the HDF5 files, with normalization
# enabled (presumably what the "z-scores" plot label refers to — TODO confirm).
hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)
# signals = hdf5_loader.load_hdf5s(hdf5_list_path, md5s, strict=True).signals

In [None]:
# Load the signal of a single file and plot its value distribution.
md5 = "402a78740e46888266209a5b7c3ece4c"
mode = "z-scores"  # label only: appears in the title and the output filename

loaded = hdf5_loader.load_hdf5s(hdf5_list_path, [md5], strict=True)
signals = loaded.signals

# Only one md5 was requested, so plot the first (only) loaded signal.
first_signal = list(signals.values())[0]
plot_title = f"Violin plot for {md5} {mode}"

fig = px.violin(data_frame=first_signal, box=True, points="all", title=plot_title)

# Save an interactive copy in the current working directory, then display it.
fig.write_html(f"{md5}-{mode}.html")
fig.show()

In [None]:
# `signals` is a mapping (it is read through `.values()` when plotting).
# orient="index" makes each key a row label and each value that row's data,
# so the frame has one row per loaded signal.
df = pd.DataFrame.from_dict(signals, orient="index")
# df.head()

In [None]:
# Descriptive statistics, computed independently for every row of `df`.
# Requested quantiles: 1%, then 5%..95% in 5% steps, then 99% and 99.9%.
percentiles = [0.01, *np.arange(0.05, 1, 0.05), 0.99, 0.999]

# Each row is summarized with describe(); the result has one row per input row
# and one column per statistic.
stats_df = df.apply(lambda row: row.describe(percentiles=percentiles), axis=1)  # type: ignore

# Names of all computed statistics (the describe() output labels).
metrics = set(stats_df.columns)

In [None]:
# print(sorted(metrics))

In [None]:
# Build the metadata table for the files that were actually loaded. It is built
# here so that this cell works on a fresh kernel (Restart & Run All).
metadata_records = [my_meta[md5sum] for md5sum in stats_df.index if md5sum in my_meta]
if not metadata_records:
    raise ValueError("No metadata found for any of the loaded md5sums.")

# Index by md5sum so the rows align with stats_df's index in the join.
df_md5_metadata = pd.DataFrame(metadata_records).set_index("md5sum")

# Left join: keeps every row of stats_df and appends the metadata columns.
stats_df = stats_df.join(df_md5_metadata)

In [None]:
# stats_df.loc["7067b0916069bb8a54078a784ae60a65"].values
# stats_df[~stats_df["track_type"].isin(["fc", "pval"])]["track_type"].unique()

In [None]:
# Create violin plots, one plot for each metric, and a violin for each assay (per plot)
# "count", "mean" and "std" are excluded; every other describe() output is plotted.
allowed_metrics = metrics - {"count", "mean", "std"}

# Fixed, alphabetical assay order so all figures share the same layout.
category_orders = {ASSAY: sorted(my_meta.label_counter(ASSAY, verbose=False).keys())}

# The output directory may not exist yet; create it before writing any figure.
logdir.mkdir(parents=True, exist_ok=True)

for column in stats_df:
    # stats_df also carries the joined metadata columns; only plot the metrics.
    if column not in allowed_metrics:
        continue
    fig = px.violin(
        data_frame=stats_df,
        x=column,
        y=ASSAY,
        box=True,
        points="all",
        title=f"Violin plot for {column}",
        color=ASSAY,
        category_orders=category_orders,
        height=800,
        # Take the hover labels from the plotted frame itself so their length
        # always matches the data.
        hover_data={"md5sum": stats_df.index},
    )
    # Figures are only written to disk (static PNG + interactive HTML), not shown.
    fig.write_image(logdir / f"100kb_all_none_hdf5_{column}.png")
    fig.write_html(logdir / f"100kb_all_none_hdf5_{column}.html")

In [None]:
# Preparation for repeating the plots on "raw" files only: build a boolean mask
# that is True for rows whose track type is "fc" or "pval".
df_filter = stats_df["track_type"].isin(["fc", "pval"])

# Preview both sides of the split: first the fc/pval rows, then all other rows.
display(stats_df.loc[df_filter].head(10))
display(stats_df.loc[~df_filter].head(10))