In [1]:
import os
import cooler
import numpy as np
import pandas as pd
from scipy.stats import norm, zscore

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib import cm as cm
import seaborn as sns
from matplotlib.colors import LogNorm

from itertools import cycle, islice
import xarray as xr
from glob import glob
from scipy import ndimage as nd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Plot configuration: start from matplotlib's 'default' style and set the PDF and
# PostScript font type to 42 for every figure saved from this notebook.
mpl.style.use('default')
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})
#mpl.rcParams['font.family'] = 'sans-serif'
#mpl.rcParams['font.sans-serif'] = 'Helvetica'

In [2]:
# Cell type analysed in this notebook, and its group labels: one per age
# suffix, in the order given here.
ct = 'MEA-BST_Gaba'
leg = list(map(lambda age: f'{ct}.{age}', ('8wk', '9mo', '18mo')))
leg

['MEA-BST_Gaba.8wk', 'MEA-BST_Gaba.9mo', 'MEA-BST_Gaba.18mo']

In [3]:
# Read the reference chromosome sizes and drop the last entry of the table.
# NOTE(review): the dropped entry is presumably chrX (the file name says chrM/chrY
# are already absent) -- confirm against the file's ordering.
chrom_size_path = '/ref/m3C/mm10.main.nochrM.nochrY.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True).iloc[:-1]

In [4]:
# Root directory of the per-cell-type inputs read below (note the trailing slash).
indir = (
    '/home/qzeng_salk_edu/project/240205-domain/'
    'CellType.Age.Diff.Domain/'
)

In [5]:
## Load boundary probability
# Boundary counts per bin, rows restricted to (and ordered by) the groups in `leg`.
bound_count_ct = (
    pd.read_hdf(f'{indir}/{ct}/{ct}_boundcount.hdf', key='data')
    .loc[leg]
)
# The 'count' column of the cell-count table, for the same groups in the same order.
cell_count_ct = (
    pd.read_csv(f'{indir}/{ct}/{ct}_cellcount.csv.gz', index_col=0, header=0)
    .loc[leg, 'count']
)
# Divide every group's row by that group's count; the counts are reshaped into a
# column vector so the division broadcasts across all bin columns.
bound_prob_ct = bound_count_ct / cell_count_ct.to_numpy().reshape(-1, 1)
bound_prob_ct

Unnamed: 0,chr1_0,chr1_1,chr1_2,chr1_3,chr1_4,chr1_5,chr1_6,chr1_7,chr1_8,chr1_9,...,chr19_2448,chr19_2449,chr19_2450,chr19_2451,chr19_2452,chr19_2453,chr19_2454,chr19_2455,chr19_2456,chr19_2457
MEA-BST_Gaba.8wk,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011194,0.085821,0.014925,0.0,0.003731,0.007463,0.0,0.0,0.0,1.0
MEA-BST_Gaba.9mo,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.022388,0.048507,0.011194,0.007463,0.0,0.007463,0.0,0.0,0.0,1.0
MEA-BST_Gaba.18mo,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.007463,0.100746,0.011194,0.003731,0.007463,0.007463,0.0,0.0,0.0,1.0


In [6]:
## Load insulation score
ins_count = xr.open_dataset(
    f'/home/qzeng_salk_edu/project/240205-domain/hicluster_bulk_domain/{ct}.insulation.nc'
)
# Keep only the bins whose `bin_chrom` label is not chrX.
ins_count = ins_count.sel(bin=(ins_count['bin_chrom'] != 'chrX'))
# Ratio of the 'inter' entry to the 'intra' entry along the `type` dimension,
# stored back on the dataset as the variable 'ratio'.
ins_count['ratio'] = (
    ins_count['__xarray_dataarray_variable__'].sel(type='inter')
    / ins_count['__xarray_dataarray_variable__'].sel(type='intra')
)
# Convert to pandas and keep the rows for the groups in `leg`.
ins = ins_count['ratio'].to_pandas().loc[leg]

In [7]:
# Gene annotation table (tab-separated, no header row), indexed by its column 3.
# Where an index value occurs more than once, only its first row is kept.
genemeta = (
    pd.read_csv(
        '/ref/m3C/gencode.vM22.annotation.gene.sorted.bed.gz',
        sep='\t',
        header=None,
        index_col=3,
    )
    .loc[lambda table: ~table.index.duplicated(keep='first')]
)

In [8]:
## select all age diff domain
# Per-bin statistics table for this cell type. It is read through `indir`, like the
# other per-cell-type inputs, so the cell no longer depends on the kernel's working
# directory.
binall = pd.read_hdf(f'{indir}/{ct}/{ct}_bin_stats.hdf', key='data')

# A bin is selected when all three flag columns are set and its 'probdiff'
# value exceeds 0.05.
selb = (
    binall['chi2filter']
    & binall['ins_lm']
    & (binall['probdiff'] > 0.05)
    & binall['diff_sc']
)
selected_bins = selb[selb].index
selected_bins

Index(['chr1_2350', 'chr1_6631', 'chr1_7784', 'chr2_180', 'chr2_1781',
       'chr2_3212', 'chr2_6620', 'chr2_6860', 'chr3_265', 'chr4_4257',
       'chr5_691', 'chr5_2696', 'chr5_2972', 'chr5_3637', 'chr5_4085',
       'chr5_5161', 'chr5_5190', 'chr5_5415', 'chr5_5470', 'chr5_5940',
       'chr6_2048', 'chr6_5007', 'chr6_5729', 'chr7_3254', 'chr7_4412',
       'chr8_2887', 'chr8_3345', 'chr8_4833', 'chr10_233', 'chr11_220',
       'chr11_2181', 'chr11_2801', 'chr12_2973', 'chr13_487', 'chr13_1742',
       'chr13_4369', 'chr15_1618', 'chr15_2666', 'chr15_3478', 'chr16_3610',
       'chr16_3801', 'chr17_2533'],
      dtype='object')

In [9]:
# Number of selected bins.
selected_bins.size

42

In [11]:
# Focus bin to visualise; its row in `binall` supplies the chromosome and the
# start/end coordinates used below.
_bin = 'chr11_2181'
info_df = binall.loc[_bin]
chrom, diff_domain_l, diff_domain_r = info_df['chrom'], info_df['start'], info_df['end']

# Plot window: pad outward by `lslop` bp to the left of the start and `rslop` bp to
# the right of the end (previously the two edges were crossed, which shrank the
# window by the interval's width on each side).
# The left edge is floored at 0 so a bin near the chromosome start cannot yield a
# negative bin index, which would slice the contact matrix from its far end.
lslop, rslop = 2000000, 2000000
ll, rr = max(diff_domain_l - lslop, 0), (diff_domain_r + rslop)
print(f"{chrom}:{diff_domain_l}-{diff_domain_r}")

chr11:54525000-54550000


In [12]:
# Resolution (bp per bin) used to turn the window edges `ll`/`rr` into bin indices.
resl = 10000
loopl = ll // resl
loopr = rr // resl

In [None]:
# For every group: open its cooler file, take the [loopl, loopr) square of the
# `chrom` matrix and rotate it by 45 degrees. Results are collected in `dstall`
# in the same order as `leg`.
dstall = []
for group in leg:
    # Alternative inputs that were tried, kept for reference:
    #cool = cooler.Cooler(f'dataset/hicluster_bulk/{group}/{group}.Q.cool')
    #cool = cooler.Cooler(f'/data/female-amb/impute/merged-25k/{group}.Q.cool')
    #cool = cooler.Cooler(f'/data/female-amb/CellType.Age.Merged.mcool/{group}.raw.mcool::resolutions/25000')
    cool = cooler.Cooler(f'/data/female-amb/loop/{group}/{group}/{group}.Q.cool')
    # Whole-chromosome matrix fetched with balance=False, converted to CSR so it
    # can be sliced by bin index.
    chrom_matrix = cool.matrix(balance=False, sparse=True).fetch(chrom).tocsr()
    window = chrom_matrix[loopl:loopr, loopl:loopr].toarray()
    # order=0 with prefilter=False: no spline interpolation; cells outside the
    # rotated square are filled with 0.
    dstall.append(
        nd.rotate(window, 45, order=0, reshape=True, prefilter=False, cval=0)
    )

In [None]:
# Multi-track figure for the window [ll, rr) on `chrom`: a chi-square track on top,
# then for each group in `leg` a rotated contact-map panel, a boundary-probability
# track and a compartment track. All panels share one x axis.
required_repetitions = len(leg)
height_ratios = [1] + np.tile([3.5, 1, 1], required_repetitions).tolist()

# NOTE(review): `bincomp` and `comp` are not created by any visible cell. Fail
# before any drawing happens, with an actionable message, instead of part-way
# through the per-group loop.
missing = [name for name in ('bincomp', 'comp') if name not in globals()]
if missing:
    raise NameError(
        f"{', '.join(missing)} must be loaded before plotting the compartment tracks"
    )

# Bin offsets are multiplied by sqrt(2) to line up with the 45-degree-rotated
# contact maps.
sqrt2 = np.sqrt(2)
n_bins = loopr - loopl
x_max = (n_bins - 1) * sqrt2

fig, axes = plt.subplots(len(leg) * 3 + 1, 1, figsize=(10, np.sum(height_ratios) / 2),
                         gridspec_kw={'height_ratios': height_ratios}, dpi=300, sharex='all')
legname = leg

# Bins of `binall` that lie on the plotted chromosome and inside the window.
on_chrom = binall['chrom'] == chrom
sel = on_chrom & (binall['start'] >= ll) & (binall['start'] < rr)
xpos = (binall.loc[sel, 'start'] // resl - loopl) * sqrt2

# Offsets of the selected bins used as red markers. They are restricted to `chrom`
# so that selected bins from other chromosomes whose coordinates happen to fall in
# the same range are not drawn here.
tmpd = binall.loc[selb & on_chrom, 'start'] // resl - loopl

# Top track: chi-square statistic along the window.
ax = axes[0]
ax.plot(xpos, binall.loc[sel, 'chi2_sc'].values, c='C0', alpha=0.7)
ax.set_title('Chi-Square', fontsize=10)
ax.set_xlim([0, x_max])

for i, group in enumerate(leg):
    # Panel 1: rotated contact map, with all four spines hidden.
    ax = axes[i * 3 + 1]
    ax.set_title(legname[i], fontsize=10)
    for side in ('right', 'top', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    img = ax.imshow(dstall[i], cmap='afmhot_r', vmin=0, vmax=0.01)
    h = len(dstall[i])
    # Show only the rows between 0.4*h and 0.5*h of the rotated image.
    ax.set_ylim([0.5 * h, 0.4 * h])
    ax.set_xlim([0, h])
    ax.set_yticks([])
    ax.set_yticklabels([])

    # Panel 2: boundary probability for this group, with a red marker at y=0.05
    # for every selected bin.
    ax = axes[i * 3 + 2]
    ax.plot(xpos, bound_prob_ct.loc[group, sel].values, c='C0', alpha=0.7)
    ax.scatter(tmpd * sqrt2, np.zeros(len(tmpd)) + 0.05, color='r', s=4)
    ax.set_ylim([0, 0.3])

    # Panel 3: compartment track, filled in C3 where y >= 0 and in C0 where y <= 0.
    ax = axes[i * 3 + 3]
    sns.despine(bottom=True, ax=ax)
    ## plot compartment, use 10k coordinate
    selc = (bincomp['chr'] == chrom) & (bincomp['start'] >= ll) & (bincomp['start'] < rr)
    x = (bincomp.loc[selc, 'start'] // resl - loopl) * sqrt2
    y = comp.loc[selc, group].values
    ax.fill_between(x, y, 0, where=y >= 0, facecolor='C3', interpolate=True)
    ax.fill_between(x, y, 0, where=y <= 0, facecolor='C0', interpolate=True)

# Shared x axis: one tick per megabase plus two ticks ('L', 'R') at the start and
# end coordinates of the focus bin. The tick spacing is derived from `resl`, so the
# labels stay in Mb if the resolution changes.
ax = axes[-1]
bins_per_mb = max(1, 1_000_000 // resl)
mb_ticks = np.arange(0, n_bins + 1, bins_per_mb)
domain_ticks = [diff_domain_l // resl - loopl, diff_domain_r // resl - loopl]
ax.set_xlim([0, x_max])
ax.set_xticks(sqrt2 * np.array(mb_ticks.tolist() + domain_ticks))
ax.set_xticklabels([f'{(xx + loopl) * resl / 1e6}M' for xx in mb_ticks] + ['L', 'R'])
fig.suptitle(f"{chrom}:{diff_domain_l}-{diff_domain_r}", fontsize=16);

In [20]:
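# Disabled draft: batch export of one figure per selected bin into a multi-page PDF.
# NOTE(review): this draft differs from the live single-bin cells above (cooler input
# path, 1 Mb left slop, two panels per group, vmax=0.012), computes `quantile_95`
# without using it, and never closes the figures it creates; reconcile before re-enabling.
# with PdfPages(f'{ct}.DiffDomain.pdf') as pdf: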
#     for _bin in selected_bins:
#         info_df = binall.loc[_bin] 
#         chrom, tss, tes = info_df['chrom'],info_df['start'], info_df['end']
#         lslop, rslop = 1000000, 2000000
#         ll, rr = (tss - lslop), (tss + rslop)
#         resl = 10000
#         loopl, loopr = (ll//resl), (rr//resl)
    
#         dstall = []
#         for group in leg:
#             cool = cooler.Cooler(f'dataset/hicluster_bulk/{group}/{group}.Q.cool')
#             #cool = cooler.Cooler(f'/data/female-amb/loop/{group}/{group}/{group}.Q.cool')
#             #cool = cooler.Cooler(f'/data/female-amb/impute/merged-25k/{group}.Q.cool')
#             #cool = cooler.Cooler(f'/data/female-amb/CellType.Age.Merged.mcool/{group}.raw.mcool::resolutions/25000')
#             Q = cool.matrix(balance=False, sparse=True).fetch(chrom).tocsr()
#             tmp = Q[loopl:loopr, loopl:loopr].toarray()
#             dst = nd.rotate(tmp, 45, order=0, reshape=True, prefilter=False, cval=0)
#             dstall.append(dst)
        
#         combined_array = np.concatenate([arr.flatten() for arr in dstall])
#         quantile_95 = np.quantile(combined_array, 0.98)
        
#         fig, axes = plt.subplots(len(leg)*2+1, 1, figsize=(10, np.sum([1] + np.tile([3.5,1],len(leg)).tolist())/2), 
#                              gridspec_kw={'height_ratios':[1] + np.tile([3.5,1],len(leg)).tolist()}, dpi=300, sharex='all')
#         legname = leg
#         tmpd = binall.loc[selb, 'start'] // resl - loopl
        
#         ax = axes[0]
#         sel = (binall['chrom']==chrom) & (binall['start']>=ll) & (binall['start']<rr)
#         xpos = (binall.loc[sel, 'start'] // resl - loopl) * np.sqrt(2)
#         ax.plot(xpos, binall.loc[sel, 'chi2_sc'].values, 
#                 c='C0', alpha=0.7)
        
#         ax.set_title('Chi-Square', fontsize=10)
#         ax.set_xlim([0, (loopr-loopl-1)*np.sqrt(2)])
        
#         for i in range(len(leg)):
#             ax = axes[i*2+1]
#             ax.set_title(legname[i], fontsize=10)
#             ax.spines['right'].set_visible(False)
#             ax.spines['top'].set_visible(False)
#             ax.spines['bottom'].set_visible(False)
#             ax.spines['left'].set_visible(False)
#             img = ax.imshow(dstall[i], cmap='afmhot_r',vmin=0, vmax=0.012) #
#             h = len(dstall[i])
#             ax.set_ylim([0.5*h, 0.4*h])
#             ax.set_xlim([0, h])
#             ax.set_yticks([])
#             ax.set_yticklabels([])
        
#             ax = axes[i*2+2]
#             ax.plot(xpos, bound_prob_ct.loc[leg[i], sel].values, c='C0', alpha=0.7)
#             ax.scatter(tmpd * np.sqrt(2), np.zeros(len(tmpd))+0.05, color='r', s=4)
#             ax.set_ylim([0, 0.3])
#             # ax2 = ax.twinx()
#             # ax2.plot(xpos, ins.loc[leg[i], sel].values, c='C1', alpha=0.7)
#             # ax2.set_ylim([0.05, 0.4])
            
#         ax.set_xlim([0, (loopr-loopl-1)*np.sqrt(2)])
#         ax.set_xticks(np.sqrt(2)*np.array(np.arange(0, loopr-loopl+1, 100).tolist() + [tss//resl-loopl, tes//resl-loopl]))
#         ax.set_xticklabels([])
#         ax.set_xticklabels([f'{(xx+loopl)/100}M' for xx in np.arange(0, loopr-loopl+1, 100)]+['TSS','TES'])
#         fig.suptitle(f"{chrom}:{tss}-{tes}", fontsize=16)

#         pdf.savefig(fig)
#         print(_bin)
#         #plt.show(close = True)
#         # plt.savefig(f'/gale/netapp/entex/HBA/snm3C/plot/majortype_diff_FOXP2.pdf', transparent=True, dpi=300)