In [1]:
"""Workstation for clean_hdf5.py"""
# pylint: disable=redefined-outer-name, expression-not-assigned, import-error, not-callable, pointless-statement, no-value-for-parameter, undefined-variable, unused-argument, missing-docstring, line-too-long

'Workstation for clean_hdf5.py'

In [2]:
from __future__ import annotations

import shutil
from pathlib import Path

import h5py

In [3]:
from epi_ml.utils.clean_hdf5 import (
    is_position_in_blacklist,
    load_bed,
    preprocess_bed,
    print_h5py_structure,
)
from tests.utils.clean_hdf5_test import test_is_position_in_blacklist

In [4]:
# Print the group/dataset layout of one sample file under the home directory.
print_h5py_structure(
    str(
        Path.home().joinpath(
            "Projects/epilap/input/hdf5/hg38_2022-epiatlas/100kb_all_none/8a4eb67b1e30d37eede02a28f691a917_100kb_all_none_value.hdf5"
        )
    )
)

8a4eb67b1e30d37eede02a28f691a917 <class 'h5py._hl.group.Group'>
8a4eb67b1e30d37eede02a28f691a917/chr1 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr10 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr11 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr12 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr13 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr14 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr15 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr16 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr17 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr18 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr19 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr2 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr20 <class 'h5p

In [5]:
# Base directory of the local projects tree; the input locations are derived from it.
home = Path("/home/local/USHERBROOKE/rabj2301/Projects")
input_dir = home.joinpath("epilap/input")
hdf5_list_path = input_dir.joinpath("hdf5_list", "100kb_all_none_10samples.list")

In [6]:
# Run the imported test for is_position_in_blacklist before using that function
# on real data; it prints one "Testing <chrom>:<start>-<end>..." line per case.
test_is_position_in_blacklist()

Testing chr1:5-12...
Testing chr1:9-20...
Testing chr1:15-18...
Testing chr1:20-22...
Testing chr2:5-8...
Testing chr2:15-18...
Testing chr2:14-18...
Testing chr2:30-35...
Testing chr3:10-15...
Testing chr3:12-14...
Testing chr3:5-8...
Testing chr3:5-8...
Testing chr4:10-20...


In [7]:
# Zero out every bin that overlaps a blacklisted region, tried on a single file.
# The work is done on a copy ("<stem>_0blklst.hdf5", written next to the
# original), so the source file is never modified and re-running this cell
# starts again from a fresh copy.
og_hdf5_path = Path(
    "/home/local/USHERBROOKE/rabj2301/Projects/epilap/input/hdf5/hg38_2022-epiatlas/100kb_all_none/67959517fd1cd857151bbabca3abc74f_100kb_all_none_value.hdf5"
)
hdf5_path = og_hdf5_path.parent / (og_hdf5_path.stem + "_0blklst.hdf5")
shutil.copy(og_hdf5_path, hdf5_path)

# Bin width in base pairs; presumably the "100kb" of the file name -- confirm
# before reusing this cell on files with another resolution.
bin_resolution = 100 * 1000
blacklist_path = "/home/local/USHERBROOKE/rabj2301/Projects/epilap/input/filter/hg38_unified_blacklist.bed"
blacklist_bed = load_bed(blacklist_path)
blacklist_chromosome_intervals = preprocess_bed(blacklist_bed)


with h5py.File(hdf5_path, "r+") as file:
    # Use the first top-level key as the group holding the per-chromosome datasets.
    header = list(file.keys())[0]
    hdf5_data = file[header]
    for chromosome, dataset in hdf5_data.items():  # type: ignore
        # Only the bin index is needed: iterate over indices rather than over the
        # dataset itself, which would read every element from disk just to
        # discard it.
        for i in range(dataset.shape[0]):
            position = i * bin_resolution
            if is_position_in_blacklist(
                position,
                position + bin_resolution,
                blacklist_chromosome_intervals,
                chromosome,
                verbose=True,  # prints each overlapping bin/interval pair
            ):
                dataset[i] = 0
    # No explicit file.close() here: leaving the `with` block closes the file.

chr1:600000-700000 is overlapping 628903-635103
chr1:5800000-5900000 is overlapping 5850087-5850570
chr1:8900000-9000000 is overlapping 8909610-8910013
chr1:9500000-9600000 is overlapping 9574580-9574996
chr1:32000000-32100000 is overlapping 32043823-32044202
chr1:33800000-33900000 is overlapping 33818964-33819343
chr1:38600000-38700000 is overlapping 38674335-38674714
chr1:50000000-50100000 is overlapping 50017081-50017545
chr1:52900000-53000000 is overlapping 52996949-52997328
chr1:55300000-55400000 is overlapping 55372488-55372868
chr1:67900000-68000000 is overlapping 67971776-67972155
chr1:73200000-73300000 is overlapping 73258720-73259099
chr1:76900000-77000000 is overlapping 76971068-76971594
chr1:93900000-94000000 is overlapping 93936365-93936746
chr1:102100000-102200000 is overlapping 102160407-102160786
chr1:103600000-103700000 is overlapping 103620975-103621377
chr1:106800000-106900000 is overlapping 106803432-106803815
chr1:121600000-121700000 is overlapping 121609948-125063