In [7]:
import os
import cooler
import pathlib
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, save_npz, vstack, csr_matrix, triu
from scipy.stats import f, zscore, ranksums
from schicluster.cool.utilities import get_chrom_offsets
from multiprocessing import Pool
from concurrent.futures import ProcessPoolExecutor, as_completed
import pathlib
import joblib

In [9]:
all_groups = joblib.load('all_groups_list')

FileNotFoundError: [Errno 2] No such file or directory: 'all_groups_list'

In [3]:
ct = 'DG_Glut'
ages = ['8wk','9mo','18mo']
regions = []
leg = [f"{ct}.{age}" for age in ages]
leg

['DG_Glut.8wk', 'DG_Glut.9mo', 'DG_Glut.18mo']

In [4]:
ctgroup = [[_leg] for _leg in leg]
ctgroup

[['DG_Glut.8wk'], ['DG_Glut.9mo'], ['DG_Glut.18mo']]

In [5]:
indir = '/data/female-amb/region-loop'
outdir = 'loop_ct_region_age/diff/'
res = 10000
group = ct

In [5]:
p = pathlib.Path(outdir)
p.mkdir(exist_ok = True)

In [6]:
chrom_size_path = f'/ref/m3C/mm10.main.nochrM.nochrY.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
bins_df = cooler.binnify(chrom_sizes, res)
chrom_offset = get_chrom_offsets(bins_df)

In [7]:
bkl = pd.read_csv('/data/ref/m3C/mm10_bismark_loop_blacklist.bed', sep='\t', header=None, index_col=None)

In [8]:
def compute_anova(c, matrix):
    # c, matrix = args
    ngene = int(chrom_sizes.loc[c] // res) + 1
    bkl_tmp = bkl.loc[(bkl[0]==c), [1,2]].values // res
    cov = np.zeros(ngene)
    for xx,yy in bkl_tmp:
        cov[xx-7:yy+7] = 1
    tot, last = 0, 0
    Esum, E2sum, Elast, E2last, ss_intra = [csr_matrix((ngene, ngene)) for i in range(5)]
    for ctlist in ctgroup:
        for ct_age in ctlist:
            cool_e = cooler.Cooler(f'{indir}/{ct_age}/{ct_age}/{ct_age}.{matrix}.cool')
            E = triu(cool_e.matrix(balance=False, sparse=True).fetch(c))
            cool_e2 = cooler.Cooler(f'{indir}/{ct_age}/{ct_age}/{ct_age}.{matrix}2.cool')
            E2 = triu(cool_e2.matrix(balance=False, sparse=True).fetch(c))
            n = cool_e.info['group_n_cells']
            Esum += E * n
            E2sum += E2 * n
            tot += n
            # print(c, ct)
        Egroup = Esum - Elast
        E2group = E2sum - E2last
        Egroup.data = Egroup.data ** 2 / (tot - last)
        ss_intra += (E2group - Egroup)
        Elast = Esum.copy()
        E2last = E2sum.copy()
        last = tot
    Esum.data = Esum.data ** 2 / tot
    ss_total = E2sum - Esum
    ss_intra.data = 1 / ss_intra.data
    ss_total = ss_total.multiply(ss_intra)
    # print(c, ss_total.data.min(), ss_intra.data.min())

    ss_total.data = (ss_total.data - 1) * (tot - len(ctgroup)) / (len(ctgroup) - 1)
    ss_total = ss_total.tocoo()
    bklfilter = np.logical_and(cov[ss_total.row]==0, cov[ss_total.col]==0)
    distfilter = np.logical_and((ss_total.col-ss_total.row)>5, (ss_total.col-ss_total.row)<500)
    idxfilter = np.logical_and(bklfilter, distfilter)
    # print(idxfilter.sum(), len(idxfilter))
    ss_total = csr_matrix((ss_total.data[idxfilter], (ss_total.row[idxfilter], ss_total.col[idxfilter])), (ngene, ngene))
    
    p = pathlib.Path(f"{outdir}/{ct}")
    p.mkdir(exist_ok = True)
    save_npz(f'{outdir}/{ct}/{matrix}pv_{c}.npz', ss_total)

    return [c, matrix, tot]

In [9]:
cpu = 40
with ProcessPoolExecutor(cpu) as executor:
    futures = []
    for x in chrom_sizes.index:
        for y in ['Q', 'E', 'T']:
            future = executor.submit(
                compute_anova,
                c=x,
                matrix=y,
            )
            futures.append(future)

    # result = []
    for future in as_completed(futures):
        # result.append(future.result())
        # c1, c2 = result[-1][0], result[-1][1]
        tmp = future.result()
        print(f'{tmp[0]} {tmp[1]} finished')
        

chr9 Q finished
chr9 E finished
chr10 E finished
chr12 Q finished
chr11 Q finished
chr12 E finished
chr6 Q finished
chr6 T finished
chr13 Q finished
chr10 Q finished
chr13 E finished
chr9 T finished
chr8 Q finished
chr7 T finished
chr14 Q finished
chr3 E finished
chr10 T finished
chr11 E finished
chr7 E finished
chr12 T finished
chr8 E finished
chr5 T finished
chr11 T finished
chr4 Q finished
chr13 T finished
chr6 E finished
chr5 Q finished
chr7 Q finished
chr8 T finished
chr3 T finished
chr3 Q finished
chr2 E finished
chr2 Q finished
chr19 E finished
chr5 E finished
chr16 Q finished
chr4 E finished
chr19 Q finished
chr2 T finished
chr15 E finished
chr16 T finished
chr19 T finished
chr15 Q finished
chr15 T finished
chr16 E finished
chr4 T finished
chr14 E finished
chr18 Q finished
chr1 T finished
chr18 E finished
chr17 Q finished
chr18 T finished
chr17 T finished
chr14 T finished
chr17 E finished
chr1 Q finished
chr1 E finished
chrX Q finished
chrX E finished
chrX T finished


In [10]:
def chrom_iterator(input_dir, chrom_order, chrom_offset):
    for chrom in chrom_order:
        output_path = f'{input_dir}_{chrom}.npz'
        if not pathlib.Path(output_path).exists():
            continue
        chunk_size = 5000000
        data = load_npz(output_path).tocoo()
        df = pd.DataFrame({'bin1_id': data.row, 'bin2_id': data.col, 'count': data.data})
        df = df[df['bin1_id'] <= df['bin2_id']]
        for i, chunk_start in enumerate(range(0, df.shape[0], chunk_size)):
            chunk = df.iloc[chunk_start:chunk_start + chunk_size]
            chunk.iloc[:, :2] += chrom_offset[chrom]
            yield chunk

In [12]:
for matrix in ['Q', 'E', 'T']:
    output_path = f'{outdir}/{ct}/{matrix}pv'
    cooler.create_cooler(cool_uri=f'{output_path}.cool',
                         bins=bins_df,
                         pixels=chrom_iterator(input_dir=output_path,
                                               chrom_order=chrom_sizes.index,
                                               chrom_offset=chrom_offset
                                              ),
                         ordered=True,
                         dtypes={'count': np.float32})

In [13]:
import subprocess
subprocess.run(f'rm {outdir}/{ct}/*pv_c*.npz', shell = True)

CompletedProcess(args='rm loop_ct_age/diff//Oligo_NN/*pv_c*.npz', returncode=0)