In [1]:
import glob
import pandas as pd
import os
import subprocess
from pathlib import Path
import pathlib
from collections import defaultdict

In [2]:
# Directory (under the notebook's working directory) holding the merged ALLC files.
allc_paths_dir = os.getcwd() + '/Total.Balanced.Merged.Allc'

In [3]:
# Load the genome windows (chr / start / end / id, tab-separated, no header)
# and drop the windows on chrY and chrM.
excluded_chroms = ['chrY','chrM']
mm10_wins = (
    pd.read_csv(
        '../mm10.main.chrom.blanced_windows.bed',
        sep='\t',
        names=['chr','start','end','id'],
    )
    .loc[lambda wins: ~wins['chr'].isin(excluded_chroms)]
)
mm10_wins.head()

Unnamed: 0,chr,start,end,id
0,chr1,0,97735985,chr1-p1
1,chr1,97735985,195471971,chr1-p2
2,chr2,0,91056612,chr2-p1
3,chr2,91056612,182113224,chr2-p2
4,chr3,0,80019840,chr3-p1


In [4]:
# Sanity check: (number of windows kept after dropping chrY/chrM, number of columns).
mm10_wins.shape

(35, 4)

## Check which cell types have merged ALLC files for all three ages

In [5]:
# File names (without directory) of every merged CGN ALLC file found
# in allc_paths_dir.
ready_samples = [
    path.rsplit('/', 1)[-1]
    for path in glob.glob(f'{allc_paths_dir}/*.CGN-Merge.allc.tsv.gz')
]
len(ready_samples)

162

In [6]:
# Map each cell type to the list of ages for which a merged file exists.
# A file name must split into exactly seven dot-separated fields
# (<celltype>.<age> followed by five suffix fields); anything else raises
# ValueError on unpacking.
celltype_ages = defaultdict(list)
for sample in ready_samples:
    name_fields = sample.split('.')
    celltype, age, _f3, _f4, _f5, _f6, _f7 = name_fields
    celltype_ages[celltype].append(age)

In [7]:
# Keep the cell types that have exactly three ages available; print the
# name of every cell type that does not, so incomplete ones are visible.
ready_celltypes = []
for celltype, age_list in celltype_ages.items():
    if len(age_list) != 3:
        print(celltype)
        continue
    ready_celltypes.append(celltype)
len(ready_celltypes)

54

In [9]:
# Cell types whose DMR calling is (re)generated by this notebook; the
# sample table is restricted to these names further down.
remerge_cts = [
    'MEA-COA_Glut',
    'Astro-TE_NN',
    'STR_D1_Sema5a_Gaba',
    'Vip_Gaba',
    'PAG-PPN_Pax5_Sox21_Gaba',
    'Astro-NT_NN',
    'PB_Evx2_Glut',
    'LDT-PCG-CS_Gata3_Lhx1_Gaba',
]

## Generate the command scripts for the DML test / DMR calling pipeline

In [18]:
# Run label: used as the root output directory name, and its dot-separated
# fields after the first ('CellType', 'Age') name the metadata columns.
# NOTE(review): the saved df.head() output further down shows
# 'BothGender.CellType.Age', i.e. it was rendered with an earlier value of
# `kind` - re-run the notebook top to bottom to refresh it.
kind = '2-BothGender.CellType.Age'

In [11]:
# Build the sample sheet: one row per merged ALLC file with its resolved
# path, the run label (`kind`), the sample name, a replicate label, and
# one column per dot-separated field of the sample name (named after the
# fields of `kind` following the first one).
allc_files = glob.glob(f'{allc_paths_dir}/*.CGN-Merge.allc.tsv.gz')
if not allc_files:
    # Fail early with a clear message instead of an opaque KeyError later on.
    raise FileNotFoundError(
        f'no *.CGN-Merge.allc.tsv.gz files found in {allc_paths_dir}'
    )

df = pd.DataFrame()
df['allc_path'] = [str(Path(p).resolve()) for p in allc_files]
df['kind'] = kind
# Sample name = file name with the optional '<kind>.' text and the fixed
# '.Both-Gender.CGN-Merge.allc.tsv.gz' suffix removed (literal, not regex).
df['sample'] = (
    df['allc_path']
    .str.rsplit('/', n=1).str[-1]
    .str.replace(kind + '.', '', regex=False)
    .str.replace('.Both-Gender.CGN-Merge.allc.tsv.gz', '', regex=False)
)
df['Rep'] = 'rep1'

# Column names come straight from `kind`; zip() pairs them positionally
# with the split sample-name fields (extra fields on either side are ignored).
sample_fields = df['sample'].str.split('.', expand=True).values.T
for col,vals in zip(kind.split('.')[1:], sample_fields):
    df[col] = vals

# Second copy of every row labelled 'rep2'; both copies reference the same
# ALLC file.
df2 = df.copy()
df2['Rep'] = 'rep2'

In [12]:
# Stack the rep1 and rep2 rows and make the sample names unique by
# appending the replicate label ('<sample>.<Rep>').
# Caveat: `df` is rebuilt from itself, so running this cell twice stacks the
# rows and appends the suffix a second time.
df = (
    pd.concat([df, df2])
    .assign(sample=lambda both: both['sample'] + '.' + both['Rep'])
)
df.head()

Unnamed: 0,allc_path,kind,sample,Rep,CellType,Age
0,/home/qzeng/project/aging/230907-recall-dmr/Me...,BothGender.CellType.Age,Endo_NN.9mo.rep1,rep1,Endo_NN,9mo
1,/home/qzeng/project/aging/230907-recall-dmr/Me...,BothGender.CellType.Age,Microglia_NN.8wk.rep1,rep1,Microglia_NN,8wk
2,/home/qzeng/project/aging/230907-recall-dmr/Me...,BothGender.CellType.Age,NP_SUB_Glut.8wk.rep1,rep1,NP_SUB_Glut,8wk
3,/home/qzeng/project/aging/230907-recall-dmr/Me...,BothGender.CellType.Age,CHOR_NN.18mo.rep1,rep1,CHOR_NN,18mo
4,/home/qzeng/project/aging/230907-recall-dmr/Me...,BothGender.CellType.Age,OT_D3_Folh1_Gaba.9mo.rep1,rep1,OT_D3_Folh1_Gaba,9mo


In [14]:
# Restrict the sample sheet to the cell types listed in remerge_cts.
in_remerge = df['CellType'].isin(remerge_cts)
df = df.loc[in_remerge]

In [16]:
# Sanity check: (rows, columns) of the sample sheet after restricting it to remerge_cts.
df.shape

(48, 6)

In [17]:
# Accumulators for the shell commands of each pipeline stage; the loop
# below appends to them and later cells write each list to qsub/*.cmd.
# Five independent empty lists (one per stage), not one shared list.
(
    prepcmds,
    dml_test_cmds,
    dml_finalize_cmds,
    dmr_cmds,
    dmrfix_cmds,
) = ([] for _ in range(5))

In [19]:
# Build the shell commands for every stage of the per-cell-type pipeline.
# Each cell type is processed window by window (rows of mm10_wins):
#   1. prepcmds          - cut each sample's ALLC file down to one window
#   2. dml_test_cmds     - run dss-DMLtest.r on that window's sample sheet
#   3. dml_finalize_cmds - stitch the per-window DMLtest tables into one file
#   4. dmr_cmds          - run dss-CallDMR.r on the stitched table
#   5. dmrfix_cmds       - rewrite the DMR table as 3 columns with start - 1
# Side effects: creates the input/, DMLtest/ and DMR/ directories under
# `kind` and writes one sample-sheet CSV per (cell type, window).

# Loop-invariant paths, resolved once instead of once per cell type/window.
dmltest_script = Path('dss-DMLtest.r').resolve()
calldmr_script = Path('dss-CallDMR.r').resolve()
finaldmr_dir = Path(f'{kind}/DMR/').resolve()
finaldmr_dir.mkdir(parents=True,exist_ok=True)

for ct,tmpdf in df.groupby('CellType'):
    ct_input_dir = Path(f'{kind}/input/{ct}').resolve()
    ct_input_dir.mkdir(parents=True,exist_ok=True)

    ct_output_dir = Path(f'{kind}/DMLtest/{ct}').resolve()
    ct_output_dir.mkdir(parents=True,exist_ok=True)

    dmlparts = []
    for chrom,start,end,wid in mm10_wins.itertuples(index=False, name=None):
        # Per-window sample sheet: same rows, but allc_path points at the
        # window-restricted ALLC file that the prep command will create.
        subdf = tmpdf.copy()
        subdf['allc_path'] = [
            ct_input_dir/f'{sample_name}-{wid}.allc.tsv.gz'
            for sample_name in subdf['sample']
        ]
        submeta_path = ct_input_dir/f'{ct}-{wid}.csv'

        subdf.to_csv(submeta_path)

        # The window file is BED-style (starts at 0, each end equals the next
        # start), while tabix regions are 1-based and inclusive. Shift the
        # start by one so the boundary position is not extracted into two
        # adjacent windows.
        # NOTE(review): assumes ALLC positions are 1-based - confirm.
        region = f'{chrom}:{start + 1}-{end}'
        for inallc,outallc in zip(tmpdf['allc_path'],subdf['allc_path']):
            prepcmds.append(f'tabix {inallc} {region} | bgzip > {outallc}\n')

        cmd = f"{dmltest_script} {submeta_path} {ct_output_dir}/{ct}-{wid}\n"
        dml_test_cmds.append(cmd)

        dmlparts.append(f'{ct_output_dir}/{ct}-{wid}.DMLtest.bed')

    dmltottmpfn = f'{ct_output_dir}/{ct}.DMLtest.tmp.bed'
    dmltotfn = f'{ct_output_dir}/{ct}.DMLtest.bed'

    # Concatenate the data rows of every part (header line skipped) ...
    dml_finalize_cmds.append(f'tail -n +2 {dmlparts[0]} > {dmltottmpfn}\n')
    for part in dmlparts[1:]:
        dml_finalize_cmds.append(f'tail -n +2 {part} >> {dmltottmpfn}\n')

    # ... then put a single header in front. The header is taken from the
    # first part explicitly rather than from a variable leaked out of the
    # loop above, which is undefined when there is only one part.
    dml_finalize_cmds.append(f'head -n 1 {dmlparts[0]} > {dmltotfn}\n')
    dml_finalize_cmds.append(f'cat {dmltottmpfn}  >> {dmltotfn}\n')

    dmrfn = f'{ct_output_dir}/{ct}.DMR.bed'
    cmd = f"{calldmr_script} {dmltotfn} {dmrfn}\n"
    dmr_cmds.append(cmd)

    # Final table: first three columns only, with the start shifted by -1.
    dmr_fixedfn = f'{finaldmr_dir}/{ct}.DMR.bed'
    cmd = f"awk '{{OFS=\"\\t\";printf \"%s\\t%d\\t%d\\n\", $1,$2-1,$3}}' {dmrfn} > {dmr_fixedfn}\n"
    dmrfix_cmds.append(cmd)

In [21]:
# Make sure the directory that receives the generated command files exists.
qsub_dir = Path("qsub")
qsub_dir.mkdir(parents=True, exist_ok=True)

In [22]:
# Stage 0: write the tabix/bgzip extraction commands, one per line
# (each command already ends with a newline).
with open('qsub/00.prep.cmd','w') as prep_file:
    prep_file.write(''.join(prepcmds))

In [23]:
# Stage 1: write the DMLtest commands as background jobs ('&'), with a
# shell `wait` after every full batch and a final `wait` at the end.
# NOTE(review): 35 equals the number of windows shown by mm10_wins.shape,
# so a batch is presumably one cell type - confirm before changing it.
jobs_per_batch = 35
script_lines = []
for batch_start in range(0, len(dml_test_cmds), jobs_per_batch):
    batch = dml_test_cmds[batch_start:batch_start + jobs_per_batch]
    script_lines.extend(f"{job.strip()} &\n" for job in batch)
    if len(batch) == jobs_per_batch:
        script_lines.append("wait\n")
script_lines.append("wait\n")

with open('qsub/01.dmltest.cmd', 'w') as f:
    f.writelines(script_lines)

In [24]:
# Stage 2: write the head/tail/cat commands that merge the per-window
# DMLtest tables (each command already ends with a newline).
with open('qsub/02.dml-finalize.cmd','w') as finalize_file:
    for line in dml_finalize_cmds:
        finalize_file.write(line)

In [25]:
# Stage 3: write the dss-CallDMR.r commands, one per cell type.
with open('qsub/03.dmr.cmd','w') as dmr_file:
    dmr_file.write(''.join(dmr_cmds))

In [26]:
# Stage 4: write the awk commands that produce the final 3-column DMR files.
with open('qsub/04.fix-dss-dmr.cmd','w') as fix_file:
    for line in dmrfix_cmds:
        fix_file.write(line)

In [19]:
# Show the working directory; all relative paths in this notebook are resolved against it.
! pwd

/home/qzeng/project/aging/230907-recall-dmr/Merge_DMR
