In [1]:
import pyBigWig as pbw
import os
import re
import collections
import pandas as pd
import tensorflow_lattice as tfl
from Bio import SeqIO
import gzip
import tensorflow as tf
import time
import numpy as np
import multiprocessing as mp

In [2]:
# Directory holding the training bigWig files (names of the form C<cell_line>M<assay>.bigwig).
train_data_path = "../data/train"
# Gzipped FASTA of the GRCh38 reference; no cell visible in this section reads it.
fasta_path = "../data/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz"
# Output directory for the per-chromosome .npy exports written by the conversion cells.
numpy_export_path = "../data/np_train/"


In [5]:
# Resolve the training directory to an absolute path (expanding environment
# variables and "~"), then collect the names of the bigWig files directly inside it.
expanded_path = os.path.abspath(
    os.path.expanduser(
        os.path.expandvars(train_data_path)
    )
)
wig_files = list(
    filter(lambda entry: entry.endswith(".bigwig"), os.listdir(expanded_path))
)

def parse_name(filename):
    """Extract the cell-line and assay numbers from a bigWig file name.

    Args:
        filename: Base name of the form ``C<digits>M<digits>.bigwig``.

    Returns:
        Tuple ``(cell_line, assay)`` of ints.

    Raises:
        ValueError: If ``filename`` does not follow the expected pattern.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    # fullmatch also rejects trailing junk after ".bigwig".
    match = re.fullmatch(r"C(\d+)M(\d+)\.bigwig", filename)
    if match is None:
        raise ValueError(f"Unexpected bigWig file name: {filename!r}")
    cell_line, assay = match.groups()
    return int(cell_line, base=10), int(assay, base=10)

def process_wig(wig_filename):
    """Export the intervals of one bigWig file as per-chromosome ``.npy`` files.

    For chr1..chr22 and chrX, the intervals are converted to a structured
    array with (int, int, float) fields and saved as
    ``<cell_line>_<assay>_<chrom>.npy`` under ``numpy_export_path``.

    Args:
        wig_filename: Base name of a bigWig file located in ``expanded_path``.
    """
    dt = np.dtype("int, int, float")
    cell_line, assay = parse_name(wig_filename)
    export_dir = os.path.abspath(numpy_export_path)
    # np.save does not create missing directories; exist_ok makes this safe
    # when several workers reach this line at the same time.
    os.makedirs(export_dir, exist_ok=True)
    pbwf = pbw.open(os.path.join(expanded_path, wig_filename), "r")
    try:
        for chrom in [f"chr{x}" for x in range(1, 23)] + ["chrX"]:
            # NOTE(review): pyBigWig appears to return None rather than an empty
            # tuple when a chromosome has no intervals; fall back to an empty export.
            intervals = pbwf.intervals(chrom) or ()
            arr = np.array(list(intervals), dtype=dt)
            np.save(os.path.join(export_dir, f"{cell_line}_{assay}_{chrom}.npy"), arr)
    finally:
        # Always release the file handle, even if a chromosome fails to export.
        pbwf.close()
        
# Fan the conversion out over 32 worker processes; each task handles one bigWig file.
with mp.Pool(processes=32) as workers:
    workers.map(func=process_wig, iterable=wig_files)

In [3]:
class SparseWigTensor:
    """Collection of open bigWig files indexed by cell line and assay.

    Every ``C<cell_line>M<assay>.bigwig`` file found in a directory is opened
    with pyBigWig and stored in ``self.wigs`` as ``{cell_line: {assay: file}}``.
    Iterating over an instance yields ``(cell_line, assay, file)`` triples.
    Call :meth:`close`, or use the instance as a context manager, to release
    the file handles.
    """

    # Chromosomes of interest: chr1..chr22 plus chrX.
    target_chroms = [f"chr{x}" for x in range(1, 23)] + ["chrX"]
    # Not referenced inside this class; presumably a sentinel for absent data - TODO confirm.
    missing = -1

    @staticmethod
    def parse_name(filename):
        """Return ``(cell_line, assay)`` as ints parsed from a bigWig file name.

        Raises:
            ValueError: If ``filename`` is not of the form ``C<digits>M<digits>.bigwig``.
        """
        # Raw string: "\d" inside a plain literal is an invalid escape sequence.
        match = re.fullmatch(r"C(\d+)M(\d+)\.bigwig", filename)
        if match is None:
            raise ValueError(f"Unexpected bigWig file name: {filename!r}")
        cell_line, assay = match.groups()
        return int(cell_line, base=10), int(assay, base=10)

    def __init__(self, path):
        """Open every ``.bigwig`` file in ``path`` and build the chromosome index.

        Args:
            path: Directory containing the bigWig files; environment variables
                and ``~`` are expanded.
        """
        # {cell_line : {assay: wig}}
        self.wigs = collections.defaultdict(dict)
        self._load_wigs(path)
        self._initialize_chr_mapping()

    def _load_wigs(self, path):
        """Open each ``.bigwig`` file in ``path`` and register it in ``self.wigs``."""
        expanded_path = os.path.abspath(os.path.expanduser(os.path.expandvars(path)))
        wig_files = [name for name in os.listdir(expanded_path) if name.endswith(".bigwig")]
        for wig_filename in wig_files:
            cell_line, assay = SparseWigTensor.parse_name(wig_filename)
            self.wigs[cell_line][assay] = pbw.open(os.path.join(expanded_path, wig_filename), "r")

    def _initialize_chr_mapping(self):
        """Map every chromosome name seen in any file to its index in sorted order."""
        names = set()
        for _, _, pbw_file in self:
            names.update(pbw_file.chroms().keys())
        self.chr_mapping = {name: i for i, name in enumerate(sorted(names))}

    def close(self):
        """Close all open files; calling this more than once is a no-op."""
        # getattr guards the __del__ path on an instance whose __init__ never ran.
        wigs = getattr(self, "wigs", None)
        if not wigs:
            return
        # Snapshot the triples first because the mapping is cleared afterwards.
        for _, _, pbw_file in list(self):
            pbw_file.close()
        # Forget the closed handles so a later close()/__del__ does not close them again.
        wigs.clear()

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close all files when leaving the ``with`` block."""
        self.close()

    def __del__(self):
        """Release any file handles still open when the instance is collected."""
        self.close()

    def __iter__(self):
        """Yield ``(cell_line, assay, file)`` for every registered file."""
        for cell_line, assays in self.wigs.items():
            for assay_id, pbw_file in assays.items():
                yield (cell_line, assay_id, pbw_file)

In [4]:
# Open every training bigWig file; the handles stay open until swt.close() is
# called or swt is garbage-collected (its __del__ calls close()).
swt = SparseWigTensor(train_data_path)

In [47]:
# Serial baseline: export every (cell line, assay, chromosome) interval set to
# its own .npy file and report the elapsed time.
export_dir = os.path.abspath(numpy_export_path)
# np.save does not create missing directories.
os.makedirs(export_dir, exist_ok=True)
# perf_counter is monotonic, so the measurement is not skewed by clock adjustments.
start = time.perf_counter()
dt = np.dtype("int, int, float")
for line, assay, pbwf in swt:
    for chrom in swt.target_chroms:
        # NOTE(review): pyBigWig appears to return None rather than an empty
        # tuple when a chromosome has no intervals; fall back to an empty export.
        intervals = pbwf.intervals(chrom) or ()
        arr = np.array(list(intervals), dtype=dt)
        np.save(os.path.join(export_dir, f"{line}_{assay}_{chrom}.npy"), arr)
print("Total:", time.perf_counter() - start)

Total: 13.289730787277222


In [50]:
def convert_to_numpy(line, assay, pbwf):
    """Export the intervals of one open bigWig file as per-chromosome ``.npy`` files.

    For each chromosome in ``swt.target_chroms``, the intervals are converted
    to a structured array with (int, int, float) fields and saved as
    ``<line>_<assay>_<chrom>.npy`` under ``numpy_export_path``.

    Args:
        line: Cell-line identifier used in the output file name.
        assay: Assay identifier used in the output file name.
        pbwf: Open bigWig file object providing ``intervals(chrom)``.
    """
    dt = np.dtype("int, int, float")
    export_dir = os.path.abspath(numpy_export_path)
    # np.save does not create missing directories; exist_ok makes this safe
    # when several workers reach this line at the same time.
    os.makedirs(export_dir, exist_ok=True)
    for chrom in swt.target_chroms:
        # NOTE(review): pyBigWig appears to return None rather than an empty
        # tuple when a chromosome has no intervals; fall back to an empty export.
        intervals = pbwf.intervals(chrom) or ()
        arr = np.array(list(intervals), dtype=dt)
        np.save(os.path.join(export_dir, f"{line}_{assay}_{chrom}.npy"), arr)

In [None]:
# ThreadPool was never imported by the import cell, so this cell raised
# NameError on a fresh kernel; import it here so the cell is self-contained.
from multiprocessing.pool import ThreadPool

# Run the export on 32 threads; swt yields (cell_line, assay, file) triples,
# which starmap unpacks into convert_to_numpy's three arguments.
with ThreadPool(32) as pool:
    pool.starmap(convert_to_numpy, swt)