In [None]:
"""Workstation for clean_hdf5.py"""
# pylint: disable=redefined-outer-name, expression-not-assigned, import-error, not-callable, pointless-statement, no-value-for-parameter, undefined-variable, unused-argument, missing-docstring, line-too-long

In [56]:
from __future__ import annotations

import bisect
import shutil
from pathlib import Path
from typing import List, Tuple

import h5py

In [118]:
def print_h5py_structure(file):
    def print_structure(name, obj):
        print(name, type(obj))
        if isinstance(obj, h5py.Group):
            for key in obj.keys():
                print_structure(name + "/" + key, obj[key])
        elif isinstance(obj, h5py.Dataset):
            for attr_name, attr_value in obj.attrs.items():
                print(f"{name}: {attr_name} = {attr_value}")

    with h5py.File(file, "r") as f:
        for attr_name, attr_value in f.attrs.items():
            print(f"{attr_name} = {attr_value}")
        f.visititems(print_structure)

In [119]:
print_h5py_structure(
    Path.home()
    / "Projects/epilap/input/hdf5/hg38_2022-epiatlas/100kb_all_none/8a4eb67b1e30d37eede02a28f691a917_100kb_all_none_value.hdf5"
)

bin = [100000]
chrom_sizes_filename = b'hg38.noy.chrom'
signal_filename = b'8a4eb67b1e30d37eede02a28f691a917'
8a4eb67b1e30d37eede02a28f691a917 <class 'h5py._hl.group.Group'>
8a4eb67b1e30d37eede02a28f691a917/chr1 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr1: sumX = [18.77845635]
8a4eb67b1e30d37eede02a28f691a917/chr1: sumXX = [3.78193254]
8a4eb67b1e30d37eede02a28f691a917/chr10 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr10: sumX = [6.3425276]
8a4eb67b1e30d37eede02a28f691a917/chr10: sumXX = [0.66830409]
8a4eb67b1e30d37eede02a28f691a917/chr11 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr11: sumX = [10.67108039]
8a4eb67b1e30d37eede02a28f691a917/chr11: sumXX = [2.28167097]
8a4eb67b1e30d37eede02a28f691a917/chr12 <class 'h5py._hl.dataset.Dataset'>
8a4eb67b1e30d37eede02a28f691a917/chr12: sumX = [10.95107281]
8a4eb67b1e30d37eede02a28f691a917/chr12: sumXX = [2.12982816]
8a4eb67b1e30d37eede02a28f691a917/chr13 <class '

In [59]:
home = Path("/home/local/USHERBROOKE/rabj2301/Projects")
input_dir = home / "epilap/input"
hdf5_list_path = input_dir / "hdf5_list" / "100kb_all_none_10samples.list"

In [101]:
def load_bed(path: Path | str) -> List[Tuple[str, int, int]]:
    data = []
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split("\t")
            chromosome = parts[0]
            start = int(parts[1])
            end = int(parts[2])
            data.append((chromosome, start, end))
    return data


def preprocess_bed(bed: List[Tuple[str, int, int]]):
    # Create a dictionary to map chromosomes to intervals
    chromosome_intervals = {}

    for interval in bed:
        chromosome = interval[0]

        if chromosome not in chromosome_intervals:
            chromosome_intervals[chromosome] = []

        chromosome_intervals[chromosome].append(interval)

    # Sort the intervals within each chromosome
    for intervals in chromosome_intervals.values():
        intervals.sort(key=lambda interval: interval[1])

    return chromosome_intervals


def is_position_in_blacklist(start, end, chromosome_intervals, chromosome, verbose=False):
    if chromosome not in chromosome_intervals:
        return False

    # Extract the intervals for the given chromosome
    intervals = chromosome_intervals[chromosome]

    # Extract the positions from the chromosome intervals
    start_positions = [interval[1] for interval in intervals]
    end_positions = [interval[2] - 1 for interval in intervals]  # 1 to 0 based

    # Perform binary search using bisect_right
    start_index = bisect.bisect_right(start_positions, start)

    # Check the interval before the start_index
    if start_index > 0:
        end_index = start_index - 1
        interval_start = start_positions[end_index]
        interval_end = end_positions[end_index]
        if interval_start <= end and start <= interval_end:
            if verbose:
                print(
                    f"{chromosome}:{start}-{end} is overlapping {interval_start}-{interval_end}"
                )
            return True

    # Check the interval at start_index if it exists
    if start_index < len(start_positions):
        interval_start = start_positions[start_index]
        interval_end = end_positions[start_index]
        if interval_start <= end and start <= interval_end:
            if verbose:
                print(
                    f"{chromosome}:{start}-{end} is overlapping {interval_start}-{interval_end}"
                )
            return True

    return False

In [102]:
def test_is_position_in_blacklist():
    # Sample blacklist bed intervals (0-based start, 1-based end coordinates)
    blacklist_bed = [
        ("chr1", 10, 20),
        ("chr2", 5, 15),
        ("chr2", 25, 30),
        ("chr3", 10, 15),
    ]

    # Preprocess the blacklist bed
    chromosome_intervals = preprocess_bed(blacklist_bed)

    # Test cases with region overlap
    test_cases = [
        ("chr1", 5, 12, True),  # Region overlaps with chr1 interval
        ("chr1", 9, 20, True),  # Region overlaps with chr1 interval
        ("chr1", 15, 18, True),  # Region overlaps with chr1 interval
        ("chr1", 20, 22, False),  # Region does not overlap with chr1 interval
        ("chr2", 5, 8, True),  # Region overlaps with chr2 interval
        ("chr2", 15, 18, False),  # Region does not overlap with chr2 interval
        ("chr2", 14, 18, True),  # Region overlap with chr2 interval
        ("chr2", 30, 35, False),  # Region does not overlap with chr2 interval
        ("chr3", 10, 15, True),  # Region overlaps with chr3 interval
        ("chr3", 12, 14, True),  # Region overlaps with chr3 interval
        ("chr3", 5, 8, False),  # Region does not overlap with chr3 interval
        ("chr3", 5, 8, False),  # Region does not overlap with chr3 interval
        ("chr4", 10, 20, False),  # No intervals for chr4
    ]

    for chromosome, start, end, expected in test_cases:
        print(f"Testing {chromosome}:{start}-{end}...")
        result = is_position_in_blacklist(start, end, chromosome_intervals, chromosome)
        assert result == expected, print(f"Expected {expected}, got {result}")

    print("Unit test passed!")


# Run the unit test
test_is_position_in_blacklist()

Testing chr1:5-12...
Testing chr1:9-20...
Testing chr1:15-18...
Testing chr1:20-22...
Testing chr2:5-8...
Testing chr2:15-18...
Testing chr2:14-18...
Testing chr2:30-35...
Testing chr3:10-15...
Testing chr3:12-14...
Testing chr3:5-8...
Testing chr3:5-8...
Testing chr4:10-20...
Unit test passed!


In [103]:
og_hdf5_path = Path(
    "/home/local/USHERBROOKE/rabj2301/Projects/epilap/input/hdf5/hg38_2022-epiatlas/100kb_all_none/67959517fd1cd857151bbabca3abc74f_100kb_all_none_value.hdf5"
)
hdf5_path = og_hdf5_path.parent / (og_hdf5_path.stem + "_0blklst.hdf5")
shutil.copy(og_hdf5_path, hdf5_path)

bin_resolution = 100 * 1000  # Replace with the appropriate bin resolution value
blacklist_path = "/home/local/USHERBROOKE/rabj2301/Projects/epilap/input/filter/hg38_unified_blacklist.bed"
blacklist_bed = load_bed(blacklist_path)
blacklist_chromosome_intervals = preprocess_bed(blacklist_bed)


with h5py.File(hdf5_path, "r+") as file:
    header = list(file.keys())[0]
    hdf5_data = file[header]
    for chromosome, dataset in hdf5_data.items():  # type: ignore
        for i, value in enumerate(dataset):
            position = i * bin_resolution
            if is_position_in_blacklist(
                position,
                position + bin_resolution,
                blacklist_chromosome_intervals,
                chromosome,
                verbose=True,
            ):
                # print(i)
                dataset[i] = 0

    file.close()

chr1:600000-700000 is overlapping 628903-635103
6
chr1:5800000-5900000 is overlapping 5850087-5850570
58
chr1:8900000-9000000 is overlapping 8909610-8910013
89
chr1:9500000-9600000 is overlapping 9574580-9574996
95
chr1:32000000-32100000 is overlapping 32043823-32044202
320
chr1:33800000-33900000 is overlapping 33818964-33819343
338
chr1:38600000-38700000 is overlapping 38674335-38674714
386
chr1:50000000-50100000 is overlapping 50017081-50017545
500
chr1:52900000-53000000 is overlapping 52996949-52997328
529
chr1:55300000-55400000 is overlapping 55372488-55372868
553
chr1:67900000-68000000 is overlapping 67971776-67972155
679
chr1:73200000-73300000 is overlapping 73258720-73259099
732
chr1:76900000-77000000 is overlapping 76971068-76971594
769
chr1:93900000-94000000 is overlapping 93936365-93936746
939
chr1:102100000-102200000 is overlapping 102160407-102160786
1021
chr1:103600000-103700000 is overlapping 103620975-103621377
1036
chr1:106800000-106900000 is overlapping 106803432-10680