In [33]:
"""Notebook to analyze the values in an HDF5 file."""
# %pip list | grep "ka"
# pylint: disable=redefined-outer-name, expression-not-assigned, import-error, not-callable, pointless-statement, no-value-for-parameter, undefined-variable, unused-argument

'Notebook to analyze the values in an HDF5 file.'

In [34]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

from epi_ml.core.data_source import EpiDataSource
from epi_ml.core.epiatlas_treatment import ACCEPTED_TRACKS
from epi_ml.core.hdf5_loader import Hdf5Loader
from epi_ml.core.metadata import Metadata

# Metadata column holding the assay label; used for the per-assay counts and as the
# grouping axis / colour of the violin plots.
ASSAY = "assay_epiclass"
# Metadata column holding the track type; used for the per-track-type counts.
TRACK_TYPE = "track_type"

In [35]:
# Show the accepted track types; the metadata is restricted to these values further down.
ACCEPTED_TRACKS

['raw',
 'ctl_raw',
 'Unique_plusRaw',
 'gembs_pos',
 'pval',
 'fc',
 'Unique_minusRaw',
 'gembs_neg']

In [36]:
# Render matplotlib figures inline in the notebook.
# NOTE(review): the visible cells only plot with plotly — confirm this magic is still needed.
%matplotlib inline

In [37]:
# Input files live under the user's home directory.
# Alternative (cluster) location, kept for reference:
# base = Path("/lustre06/project/6007017/rabyj/epilap/input/")
base = Path.home().joinpath("Projects/epilap/input")

# Inputs: list of hdf5 files, chromosome sizes, and the metadata JSON.
hdf5_list_path = base.joinpath("hdf5_list", "100kb_all_none_10samples.list")
chromsize_path = base.joinpath("chromsizes", "hg38.noy.chrom.sizes")
metadata_path = base.joinpath("metadata/hg38_2023_epiatlas_dfreeze_formatted_JR.json")

# Output folder where the figures produced by this notebook are written.
base_logdir = Path.home().joinpath("Projects/epilap/output/logs")
logdir = base_logdir.joinpath("hg38_2022-epiatlas/hdf5_stats")

In [38]:
# Upper bound on the number of md5s taken from the start of the hdf5 list (used as a
# slice end further down). If the list holds fewer entries, all of them are used.
N_SAMPLES = 100

In [39]:
# Build the data source from the three input files and load its metadata.
datasource = EpiDataSource(hdf5_list_path, chromsize_path, metadata_path)
my_meta = Metadata(datasource.metadata_file)

# Print the track-type label counts before and after restricting the metadata to
# ACCEPTED_TRACKS, so the effect of the filter is visible in the cell output.
# TRACK_TYPE is used instead of repeating the "track_type" literal.
my_meta.display_labels(TRACK_TYPE)
my_meta.select_category_subsets(TRACK_TYPE, ACCEPTED_TRACKS)
my_meta.display_labels(TRACK_TYPE)


Label breakdown for track_type
0 labels missing and ignored from count
raw: 5485
fc: 5471
pval: 5471
Unique_minusRaw: 1463
Unique_plusRaw: 1463
ctl_raw: 965
gembs_neg: 644
gembs_pos: 644
Unique_raw: 159
For a total of 21765 examples


Label breakdown for track_type
0 labels missing and ignored from count
raw: 5485
fc: 5471
pval: 5471
Unique_minusRaw: 1463
Unique_plusRaw: 1463
ctl_raw: 965
gembs_neg: 644
gembs_pos: 644
For a total of 21606 examples



In [40]:
# Presumably drops metadata entries that have no "harmonized_donor_sex" label — confirm
# against Metadata.remove_missing_labels.
# NOTE(review): a later cell raises IndexError for any selected md5 absent from my_meta,
# so entries removed here can trigger that error.
my_meta.remove_missing_labels("harmonized_donor_sex")

In [41]:
# Loader for the hdf5 signal files, configured with the chromosome sizes file.
# NOTE(review): with normalization=True the per-file statistics displayed at the end of
# the notebook show mean ~0 and std ~1, consistent with per-file standardization —
# confirm in Hdf5Loader.
hdf5_loader = Hdf5Loader(chrom_file=chromsize_path, normalization=True)

In [42]:
# Take (at most) the first N_SAMPLES md5s of the hdf5 list, keeping the order of the list
# so that everything derived from it is reproducible from one kernel to the next
# (iterating a set of strings does not guarantee a stable order).
listed_md5s = list(Hdf5Loader.read_list(hdf5_list_path).keys())[0:N_SAMPLES]

# Set view of the same md5s, kept under the original name for the cells that use it.
md5s = set(listed_md5s)

# Fail early if any selected file has no metadata entry.
for md5 in listed_md5s:
    if md5 not in my_meta:
        raise IndexError(f"Missing metadata for {md5}")

# One metadata row per selected file, indexed by md5sum. Every md5 is known to be in
# my_meta at this point, so no additional filtering is needed.
df_md5_metadata = pd.DataFrame([my_meta[md5] for md5 in listed_md5s]).set_index("md5sum")

In [44]:
# Sanity check of the selection: number of files, then label counts per category.
n_files = len(df_md5_metadata.index)
print(f"{n_files} files to analyze.")
for category in (TRACK_TYPE, ASSAY):
    print(df_md5_metadata[category].value_counts())

10 files to analyze.
fc                 5
pval               3
raw                1
Unique_minusRaw    1
Name: track_type, dtype: int64
h3k4me1     3
h3k27ac     2
h3k4me3     2
h3k27me3    1
rna_seq     1
h3k36me3    1
Name: assay_epiclass, dtype: int64


In [27]:
# Preview the first rows of the per-file metadata table (indexed by md5sum).
df_md5_metadata.head(10)

Unnamed: 0_level_0,epirr_id,data_generating_centre,assay_type,experiment_type,antibody,uuid,inputs,inputs_ctl,original_read_length,original_read_length_ctl,...,harmonized_donor_health_status,harmonized_donor_health_status_ontology_curie,automated_harmonized_donor_health_status_ontology_curie_ncit,automated_harmonized_donor_health_status_ontology_term_intermediate_order_unique,automated_harmonized_donor_health_status_ontology_term_high_order_unique,automated_harmonized_donor_health_status_ontology_term_intermediate_order,automated_harmonized_donor_health_status_ontology_term_high_order,harmonized_donor_life_status,track_type,assay_epiclass
md5sum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0010b731db8833f3816a75f454a9ce3e,IHECRE00004656.3,ENCODE,ChIP-Seq,H3K4me1,h3k4me1,d1e112ae-26f5-4d0c-967f-355a759029c5,ENCFF792VOC;ENCFF066RTT;ENCFF971HUG;ENCFF240YP...,ENCFF636HAV;ENCFF953XTR;ENCFF839TPC;ENCFF947ZS...,76;76;76;76;76,76;76;76;76;76;101;101;101;101,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,fc,h3k4me1
00187b8d3a864fc9795cd462648e6bff,IHECRE00000974.7,NIH_Roadmap_Epigenomics,ChIP-Seq,H3K27ac,h3k27ac,9dac3370-6335-4df0-bea0-9e684edfa350,SRR304956;SRR401265,SRR304972;SRR304973;SRR401268,36;36,36;36;36,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,fc,h3k27ac
001b8596f08afd6f67093fc64f6dca72,IHECRE00000269.2,BLUEPRINT,ChIP-Seq,H3K4me3,h3k4me3,3d64f386-99ad-42e7-862c-96fa74dbf9bb,EGAR00001288451,EGAR00001288455,42,42,...,,,,,,,,,pval,h3k4me3
00178d50cb4851132794ff3cb5f038bc,IHECRE00002497.1,BLUEPRINT,ChIP-Seq,H3K4me1,h3k4me1,c5b3fd4f-06d0-4ac9-b842-bb852c778b6b,EGAR00001422186,EGAR00001185144,100,43,...,,,,,,,,,pval,h3k4me1
0003d5ce4d695c324d7dfec4d5944bf4,IHECRE00004711.3,ENCODE,ChIP-Seq,H3K27ac,h3k27ac,69c60c6b-4729-4b3a-b7a4-01d7ab1c211f,ENCFF964NOS;ENCFF628GCG,ENCFF332CUC;ENCFF436CCE;ENCFF818YLR;ENCFF360ZX...,76;76,76;76;76;76;101;101,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,fc,h3k27ac
001794578dd271ca1327beb9cfd21697,IHECRE00004698.3,ENCODE,ChIP-Seq,H3K27me3,h3k27me3,a2b0231a-8592-4d70-bd02-8c4d628756de,ENCFF001ESL;ENCFF001ESM,ENCFF001HAM,36;36,36,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,raw,h3k27me3
0010a96642572d0891cc66c7b14c6a79,IHECRE00002284.1,BLUEPRINT,RNA-Seq,total-RNA-Seq,rna_seq,a1dfa1e8-b3ae-480e-9735-38321b563bd7,EGAR00001155886,,101,,...,,,,,,,,,Unique_minusRaw,rna_seq
000ba076cfff3e0ba0d837fde9495bc9,IHECRE00000963.7,NIH_Roadmap_Epigenomics,ChIP-Seq,H3K36me3,h3k36me3,97b33b52-0e1f-49a1-be88-1cf0ab7c9da1,ENCFF438GUQ;ENCFF734RWY;ENCFF550PAJ;ENCFF146TA...,ENCFF410NDJ;ENCFF525FPV;ENCFF782XZV;ENCFF955FM...,36;36;36;36;36,36;36;36;36;36,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,pval,h3k36me3
000c357f865a29f8ab02cbb3436721ce,IHECRE00001242.1,BLUEPRINT,ChIP-Seq,H3K4me3,h3k4me3,fdcc068d-14c1-4783-9b77-ac213a614a3d,EGAR00001355801,EGAR00001355843,42,42,...,Follicular Lymphoma,NCIM:C0024301,NCIT:C3209,Follicular Lymphoma,Lymphoproliferative Disorder,Lymphoma,Lymphoproliferative Disorder,,fc,h3k4me3
001963cc3a7e0b9caf89b9f49669c2d1,IHECRE00000929.7,NIH_Roadmap_Epigenomics,ChIP-Seq,H3K4me1,h3k4me1,38dff74d-0e0f-4664-92f9-addecaec8eb5,ENCFF285FNC,ENCFF627LYX,36,36,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,fc,h3k4me1


In [10]:
# Load the signals of the selected md5s from the files named in the hdf5 list.
# NOTE(review): strict=True presumably makes the loader fail when a requested md5 is
# missing from the list — confirm in Hdf5Loader.load_hdf5s.
signals = hdf5_loader.load_hdf5s(hdf5_list_path, md5s, strict=True).signals

In [11]:
# `signals` is a mapping (it is handed to from_dict): with orient="index" each key becomes
# a row label and each value fills that row.
# Presumably keys are md5sums and values are the per-bin signal arrays — verify against Hdf5Loader.
df = pd.DataFrame.from_dict(signals, orient="index")

In [22]:
# Per-file descriptive statistics: one row per file, one column per statistic.
# Percentiles requested: 1%, every 5% from 5% up to (but excluding) 100%, then 99% and 99.9%.
percentiles = [0.01, *np.arange(0.05, 1, 0.05), 0.99, 0.999]
stats_df = df.apply(lambda row: row.describe(percentiles=percentiles), axis=1)

# Names of all computed statistics, used later to pick which columns get plotted.
metrics = set(stats_df.columns)

In [13]:
# print(sorted(metrics))

In [23]:
# Attach the per-file metadata to the statistics; the join is aligned on the index.
# Metadata columns left over from a previous run of this cell are dropped first, so
# re-running the cell no longer fails on overlapping column names.
stats_df = stats_df.drop(columns=df_md5_metadata.columns, errors="ignore").join(
    df_md5_metadata
)

In [17]:
# stats_df.loc["7067b0916069bb8a54078a784ae60a65"].values
# stats_df[~stats_df["track_type"].isin(["fc", "pval"])]["track_type"].unique()

In [18]:
# Create violin plots, one plot for each metric, and a violin for each assay (per plot).
# "count", "mean" and "std" are excluded from the plotted metrics.
allowed_metrics = metrics - {"count", "mean", "std"}

# Fixed, alphabetical assay order so that every figure lists the assays the same way.
category_orders = {ASSAY: sorted(my_meta.label_counter(ASSAY, verbose=False).keys())}

# Make sure the output folder exists before any figure is written.
logdir.mkdir(parents=True, exist_ok=True)

for column in stats_df:
    # stats_df also carries the joined metadata columns; only plot the statistics.
    if column not in allowed_metrics:
        continue
    fig = px.violin(
        data_frame=stats_df,
        x=column,
        y=ASSAY,
        box=True,
        points="all",
        title=f"Violin plot for {column}",
        color=ASSAY,
        category_orders=category_orders,
        height=800,
        # Take the hover labels from the plotted frame itself so they always line up with its rows.
        hover_data={"md5sum": stats_df.index},
    )
    # Save each figure both as a static image and as an interactive page.
    fig.write_image(logdir / f"100kb_all_none_hdf5_{column}.png")
    fig.write_html(logdir / f"100kb_all_none_hdf5_{column}.html")

In [25]:
# Boolean mask selecting the "fc" and "pval" tracks; its negation selects every other
# track type.
# NOTE(review): the original note announced redoing the plots while keeping only "raw"
# files; this cell only builds the mask and previews both subsets — presumably the
# plotting itself happens in a later cell.
df_filter = stats_df[TRACK_TYPE].isin(["fc", "pval"])

# Preview the fc/pval rows, then the remaining rows.
display(stats_df[df_filter].head(10))
display(stats_df[~df_filter].head(10))

Unnamed: 0,count,mean,std,min,1%,5%,10%,15%,20%,25%,...,harmonized_donor_health_status,harmonized_donor_health_status_ontology_curie,automated_harmonized_donor_health_status_ontology_curie_ncit,automated_harmonized_donor_health_status_ontology_term_intermediate_order_unique,automated_harmonized_donor_health_status_ontology_term_high_order_unique,automated_harmonized_donor_health_status_ontology_term_intermediate_order,automated_harmonized_donor_health_status_ontology_term_high_order,harmonized_donor_life_status,track_type,assay_epiclass
0003d5ce4d695c324d7dfec4d5944bf4,30321.0,-1.610373e-08,1.000016,-0.881595,-0.881595,-0.880223,-0.6569,-0.630276,-0.614092,-0.598078,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,fc,h3k27ac
000ba076cfff3e0ba0d837fde9495bc9,30321.0,-4.428526e-08,1.000016,-0.450714,-0.450687,-0.450686,-0.445983,-0.445082,-0.44457,-0.444131,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,pval,h3k36me3
000c357f865a29f8ab02cbb3436721ce,30321.0,7.649273e-08,1.000016,-1.841085,-1.841085,-1.838582,-0.78296,-0.465282,-0.362765,-0.312002,...,Follicular Lymphoma,NCIM:C0024301,NCIT:C3209,Follicular Lymphoma,Lymphoproliferative Disorder,Lymphoma,Lymphoproliferative Disorder,,fc,h3k4me3
0010b731db8833f3816a75f454a9ce3e,30321.0,-2.818153e-08,1.000016,-2.18112,-2.18112,-2.172207,-1.191546,-0.605125,-0.48993,-0.416407,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,fc,h3k4me1
00178d50cb4851132794ff3cb5f038bc,30321.0,6.0389e-09,1.000016,-0.880709,-0.845083,-0.844347,-0.489502,-0.428204,-0.403222,-0.384804,...,,,,,,,,,pval,h3k4me1
00187b8d3a864fc9795cd462648e6bff,30321.0,2.41556e-08,1.000016,-1.544762,-1.544762,-1.544762,-0.974007,-0.691034,-0.593552,-0.50642,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,fc,h3k27ac
001963cc3a7e0b9caf89b9f49669c2d1,30321.0,-1.288299e-07,1.000016,-1.512872,-1.512872,-1.512872,-0.968683,-0.684698,-0.615498,-0.565632,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,fc,h3k4me1
001b8596f08afd6f67093fc64f6dca72,30321.0,2.41556e-08,1.000016,-0.572316,-0.496302,-0.491939,-0.48932,-0.487243,-0.48536,-0.483432,...,,,,,,,,,pval,h3k4me3


Unnamed: 0,count,mean,std,min,1%,5%,10%,15%,20%,25%,...,harmonized_donor_health_status,harmonized_donor_health_status_ontology_curie,automated_harmonized_donor_health_status_ontology_curie_ncit,automated_harmonized_donor_health_status_ontology_term_intermediate_order_unique,automated_harmonized_donor_health_status_ontology_term_high_order_unique,automated_harmonized_donor_health_status_ontology_term_intermediate_order,automated_harmonized_donor_health_status_ontology_term_high_order,harmonized_donor_life_status,track_type,assay_epiclass
0010a96642572d0891cc66c7b14c6a79,30321.0,-2.918801e-08,1.000016,-0.260594,-0.260594,-0.260594,-0.260594,-0.260594,-0.260594,-0.260188,...,,,,,,,,,Unique_minusRaw,rna_seq
001794578dd271ca1327beb9cfd21697,30321.0,-3.220746e-08,1.000016,-1.555108,-1.555108,-1.555108,-1.077413,-0.832617,-0.679643,-0.568795,...,,NCIM:C0549184,NCIT:C41132,General Qualifier,General Qualifier,,,,raw,h3k27me3
