In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import subprocess
import pathlib
import glob
from tqdm import tqdm

In [2]:
gene_meta = pd.read_csv('/data/metadata/gencode.vM22.basic.annotation.gene.flat.tsv.gz', index_col=8,sep='\t')
gene_meta.head()

Unnamed: 0_level_0,chrom,source,feature,start,end,score,strand,phase,transcript_id,gene_type,...,gene_name,transcript_type,transcript_status,transcript_name,exon_number,exon_id,level,mgi_id,havana_gene,tag
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000102693.1,chr1,HAVANA,gene,3073253,3074322,.,+,.,,TEC,...,4933401J01Rik,,,,,,2,MGI:1918292,OTTMUSG00000049935.1,
ENSMUSG00000064842.1,chr1,ENSEMBL,gene,3102016,3102125,.,+,.,,snRNA,...,Gm26206,,,,,,3,MGI:5455983,,
ENSMUSG00000051951.5,chr1,HAVANA,gene,3205901,3671498,.,-,.,,protein_coding,...,Xkr4,,,,,,2,MGI:3528744,OTTMUSG00000026353.2,
ENSMUSG00000102851.1,chr1,HAVANA,gene,3252757,3253236,.,+,.,,processed_pseudogene,...,Gm18956,,,,,,1,MGI:5011141,OTTMUSG00000049958.1,pseudo_consens
ENSMUSG00000103377.1,chr1,HAVANA,gene,3365731,3368549,.,-,.,,TEC,...,Gm37180,,,,,,2,MGI:5610408,OTTMUSG00000049960.1,


In [3]:
deg_path = '/home/qzeng_salk_edu/project/240228_RNA/DEG.stats'
cdir =f'/home/qzeng_salk_edu/project/240204-redo-compartment/Call.DiffComp.CellType.Age'

In [4]:
all_rna_cts = [pathlib.Path(path).name for path in glob.glob(f'{deg_path}/*')]
all_comp_cts = [pathlib.Path(path).name for path in glob.glob(f'{cdir}/*')]

In [5]:
shared_cts = list(set(all_rna_cts) & set(all_comp_cts))
len(shared_cts)

27

In [107]:
ct = 'L6_CT_CTX_Glut'
expr = pd.read_hdf(f'{deg_path}/{ct}/expr.hdf').T
deg_stats = np.load(f'{deg_path}/{ct}/{ct}.2mo-{ct}.18mo.npz')
deg_stats = pd.DataFrame({'fdr(2mo/18mo)':deg_stats['fdr'],'fc':deg_stats['fc']}, index = expr.index)
deg_stats.sort_values('fdr(2mo/18mo)').head(10)

Unnamed: 0_level_0,fdr(2mo/18mo),fc
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
Dclk1,6.095350000000001e-244,3.415961
Rian,1.4912069999999998e-216,2.701319
Gabrb2,3.717545e-213,3.839484
Ptk2,4.3671580000000007e-209,2.312741
Prkce,3.8514199999999995e-203,2.867263
Grik3,1.5196069999999999e-200,3.542829
Ank3,4.041933e-199,1.925813
Nav2,5.339143e-191,2.156993
Dtna,1.837287e-178,3.523846
Grin2b,6.886461e-178,1.890386


## get all clltype results

In [6]:
for ct in tqdm(shared_cts):
    expr = pd.read_hdf(f'{deg_path}/{ct}/expr.hdf').T
    deg_stats = np.load(f'{deg_path}/{ct}/{ct}.2mo-{ct}.18mo.npz')
    deg_stats = pd.DataFrame({'fdr(2mo/18mo)':deg_stats['fdr'],'fc':deg_stats['fc']}, index = expr.index)
    deg_stats.sort_values('fdr(2mo/18mo)').head(10)

    comp = pd.read_csv(f'{cdir}/{ct}/DifferentialResult/fdr_result/differential.intra_sample_combined.pcQnm.bedGraph' , 
                   sep='\t', header=0, index_col=None)
    
    deg_stats = deg_stats[deg_stats['fdr(2mo/18mo)'] < 0.01]
    up_genes = deg_stats[deg_stats['fc'] <1]
    up_genes = up_genes.sort_values('fc').index[:30]

    tmp = gene_meta[gene_meta['gene_name'].isin(up_genes)]
    tmp = tmp[['chrom','start','end','gene_name']]
    tmp.to_csv('tmp.upgene.bed', sep = '\t', header = None, index = None)

    tmp_ct = ct.replace('-','_')
    tmpcomp= comp[['chr','start','end',f'{tmp_ct}_8wk_100Kb',f'{tmp_ct}_9mo_100Kb',f'{tmp_ct}_18mo_100Kb','padj']]
    tmpcomp.columns = ['chr','start','end','8wk','9mo','18mo','padj']
    tmpcomp['18mo-8wk'] = tmpcomp['18mo'] - tmpcomp['8wk']
    tmpcomp.to_csv('tmp.comp.bed', sep = '\t', header = None, index = None)
    subprocess.run(f'bedtools intersect -a tmp.upgene.bed -b tmp.comp.bed -wa -wb  > Upgene.Comp.Result/{ct}.upgene.bed', shell = True)

100%|██████████| 27/27 [00:04<00:00,  5.51it/s]


## check gene and comp

In [104]:
resc = 100000
slop = 12
for ct in shared_cts:
    result = pd.read_csv(f'Upgene.Comp.Result/{ct}.upgene.bed', sep = '\t', header = None)
    result.index = result[4] + '_' + (result[5] // resc).astype(str)
    comp = pd.read_csv(f'{cdir}/{ct}/DifferentialResult/fdr_result/differential.intra_sample_combined.pcQnm.bedGraph', sep='\t', header=0, index_col=None)
    comp.index = comp['chr'] + '_' + (comp['start'] // resc).astype(str)
    
    for gene , tmpdf in result.groupby(3):
        gene_chunk = tmpdf.index
        if len(gene_chunk) > 1:
            _chr, start = gene_chunk[0].split('_')
            start = int(start)
            up_chunks = [f"{_chr}_{start-i}" for i in range(0, slop)]
        
            _chr, end = gene_chunk[-1].split('_')
            end = int(start)
            down_chunks = [f"{_chr}_{end+i}" for i in range(0, slop)]
            tmp_ct = ct.replace('-','_')
            gene_mean = comp.loc[comp.index.isin(gene_chunk)][f'{tmp_ct}_8wk'].mean()
            up_mean = comp.loc[comp.index.isin(up_chunks)][f'{tmp_ct}_8wk'].mean()
            down_mean = comp.loc[comp.index.isin(down_chunks)][f'{tmp_ct}_8wk'].mean()
            if gene_mean > 0.2 and up_mean < -0 and down_mean < -0:
                print(f'{ct} {gene}')
    
    

L23_IT_CTX_Glut 4933424G05Rik
Oligo_NN 9330111N05Rik
OPC_NN Myo5a
OPC_NN Scel
Astro-NT_NN Hivep2
L23_IT_ENT_Glut Myo5a
STR-PAL_Chst9_Gaba Phc3
L6_CT_CTX_Glut Skil
Microglia_NN Dock2
Microglia_NN Galnt7
VLMC_NN Flrt2


In [96]:
ct = 'Microglia_NN'
gene = 'Abcd2'
result = pd.read_csv(f'Upgene.Comp.Result/{ct}.upgene.bed', sep = '\t', header = None)
result.index = result[4] + '_' + (result[5] // resc).astype(str)
comp = pd.read_csv(f'{cdir}/{ct}/DifferentialResult/fdr_result/differential.intra_sample_combined.pcQnm.bedGraph', sep='\t', header=0, index_col=None)
comp.index = comp['chr'] + '_' + (comp['start'] // resc).astype(str)


In [97]:
tmpdf= result[result[3] == gene]
gene_chunk = tmpdf.index
    
_chr, start = gene_chunk[0].split('_')
start = int(start)
up_chunks = [f"{_chr}_{start-i}" for i in range(0, slop)]

_chr, end = gene_chunk[-1].split('_')
end = int(start)
down_chunks = [f"{_chr}_{end+i}" for i in range(0, slop)]
tmp_ct = ct.replace('-','_')
gene_mean = comp.loc[comp.index.isin(gene_chunk)][f'{tmp_ct}_8wk'].mean()
up_mean = comp.loc[comp.index.isin(up_chunks)][f'{tmp_ct}_8wk'].mean()
down_mean = comp.loc[comp.index.isin(down_chunks)][f'{tmp_ct}_8wk'].mean()

In [99]:
comp.loc[comp.index.isin(gene_chunk)]

Unnamed: 0,chr,start,end,Microglia_NN_8wk_100Kb,Microglia_NN_9mo_100Kb,Microglia_NN_18mo_100Kb,Microglia_NN_8wk,Microglia_NN_9mo,Microglia_NN_18mo,sample_maha,pval,padj,dist_clust
chr15_911,chr15,91100000,91200000,0.58749,0.45939,0.38815,0.58749,0.45939,0.38815,0.108076,0.947396,1.0,1


In [100]:
comp.loc[comp.index.isin(up_chunks)]

Unnamed: 0,chr,start,end,Microglia_NN_8wk_100Kb,Microglia_NN_9mo_100Kb,Microglia_NN_18mo_100Kb,Microglia_NN_8wk,Microglia_NN_9mo,Microglia_NN_18mo,sample_maha,pval,padj,dist_clust
chr15_900,chr15,90000000,90100000,-0.63995,-0.56975,-0.5324,-0.63995,-0.56975,-0.5324,0.004787578,0.997609,1.0,1
chr15_901,chr15,90100000,90200000,0.00065,-0.05113,-0.24671,0.00065,-0.05113,-0.24671,0.4447711,0.800607,1.0,1
chr15_902,chr15,90200000,90300000,-0.54436,-0.62283,-0.64685,-0.54436,-0.62283,-0.64685,0.004981857,0.997512,1.0,1
chr15_903,chr15,90300000,90400000,-0.98112,-0.93775,-0.83478,-0.98112,-0.93775,-0.83478,0.03995635,0.98022,1.0,1
chr15_904,chr15,90400000,90500000,-1.07055,-0.97782,-0.92805,-1.07055,-0.97782,-0.92805,0.02461881,0.987766,1.0,1
chr15_905,chr15,90500000,90600000,-1.02788,-1.01621,-1.0173,-1.02788,-1.01621,-1.0173,7.323e-07,1.0,1.0,1
chr15_906,chr15,90600000,90700000,-1.02949,-0.98621,-0.94718,-1.02949,-0.98621,-0.94718,0.001660119,0.99917,1.0,1
chr15_907,chr15,90700000,90800000,-1.05965,-1.07504,-1.01231,-1.05965,-1.07504,-1.01231,0.0009306073,0.999535,1.0,1
chr15_908,chr15,90800000,90900000,-1.00932,-1.06681,-0.99826,-1.00932,-1.06681,-0.99826,0.002003043,0.998999,1.0,1
chr15_909,chr15,90900000,91000000,-0.8795,-0.9722,-0.87379,-0.8795,-0.9722,-0.87379,0.01362546,0.99321,1.0,1
