In [1]:
import glob
import pathlib
import pandas as pd
import subprocess
import joblib
import numpy as np

In [2]:
# Input locations used throughout the notebook:
#   ref_dir   - directory holding the chrom.sizes reference file
#   data_dir  - root directory searched for per-cell *.tsv.gz files
#   meta_file - CSV with the per-cell metadata table
ref_dir = '/ref/m3C'
data_dir = '/data/female-amb/female-m3c-rmbkl'
meta_file = '/data/metadata/240104_m3C_META.csv'

## get AgingMajorType.Rep use_cells

In [3]:
# Cell types to keep. Names are normalised (spaces -> underscores, slashes
# removed) so they can be compared with the normalised metadata labels.
# NOTE(review): joblib.load unpickles the file -- only load trusted dumps.
use_cts = [
    name.replace(' ', '_').replace('/', '')
    for name in joblib.load('/data/metadata/m3c_use_cts')
]

In [4]:
# Replaces the list loaded in the previous cell: every later step is limited
# to this single cell type while this cell is executed.
use_cts = [
    'Oligo_NN',
]

In [5]:
# load metadata (one row per cell, indexed by the first CSV column)
meta = pd.read_csv(meta_file, index_col=0)

# Normalise the cell-type labels (spaces -> underscores, slashes removed) so
# they match the entries of use_cts. The vectorised .str accessor keeps
# missing labels as NaN instead of raising inside a per-row lambda.
meta['AgingMajorType'] = (
    meta['AgingMajorType']
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '', regex=False)
)

# Keep only the requested cell types. The explicit .copy() detaches the
# filtered frame from the full table so the column assignments below operate
# on an independent object and do not raise SettingWithCopyWarning.
meta = meta[meta['AgingMajorType'].isin(use_cts)].copy()

# Composite labels: cell type + age + replicate, and age + replicate.
meta['AgingMajorType.Age.Rep'] = meta['AgingMajorType'] + '.' + meta['Age'] + '.' + meta['Replicate']
meta['Age.Rep'] = meta['Age'] + '.' + meta['Replicate']

In [6]:
# Subsample cells per (cell type, Age.Rep) group.
#   all_use_cell_index - ids of every sampled cell, across all cell types
#   rep_use_cts        - cell types with 6 Age.Rep groups whose target exceeds 100
all_use_cell_index = []
rep_use_cts = []
for cell_type, type_meta in meta.groupby('AgingMajorType'):
    # Number of available cells in each Age.Rep group of this cell type.
    cell_counts = type_meta['Age.Rep'].value_counts()

    # Target per group: min(count, 20 * sqrt(count)) - 1, truncated to int.
    sqrt_cap = np.sqrt(cell_counts) * 20
    capped = pd.DataFrame([cell_counts, sqrt_cap]).T.min(axis=1)
    target_number = (capped - 1).astype(int)

    # Record the cell type when exactly 6 groups have a target above 100.
    if (target_number > 100).sum() == 6:
        rep_use_cts.append(cell_type)

    # Draw the target number of cells from each group with a fixed seed.
    for age_rep, group_cells in type_meta.groupby('Age.Rep'):
        sampled = group_cells.sample(target_number[age_rep], random_state=3)
        all_use_cell_index.extend(sampled.index.tolist())

In [7]:
# target_number is a variable assigned inside the sampling loop, so this shows
# the per-Age.Rep sample sizes of the last cell type that loop processed.
target_number

Age.Rep
9mo.rep2     881
18mo.rep2    872
18mo.rep1    852
9mo.rep1     840
8wk.rep1     769
8wk.rep2     583
dtype: int64

## get all_rmbkl paths

In [8]:
# Every *.tsv.gz file located exactly one sub-directory below data_dir.
all_paths = glob.glob(data_dir + '/*/*.tsv.gz')

In [9]:
# Cell id = file name truncated at its first '.'.
cell_ids = [
    pathlib.Path(file_path).name.partition('.')[0]
    for file_path in all_paths
]

In [10]:
# Two-column lookup table: cell id and the path of its file.
cell_table = pd.DataFrame(dict(cell_id=cell_ids, paths=all_paths))

In [11]:
# Index the table by cell id while keeping 'cell_id' as a regular column too.
cell_table = cell_table.set_index('cell_id', drop=False)

In [12]:
# Keep only the sampled cells, in the order in which they were sampled.
cell_table = cell_table.loc[all_use_cell_index, :]

## separate by group

In [13]:
# Attach each cell's group label from the metadata (aligned on the cell-id index).
cell_table = cell_table.assign(
    **{'AgingMajorType.Age.Rep': meta['AgingMajorType.Age.Rep']}
)

In [13]:
# Write one headerless, tab-separated table (cell_id, path) per
# AgingMajorType.Age.Rep group into cell_table/<group>.tsv.
# to_csv does not create missing directories, so make sure the target exists
# first; otherwise this cell fails on a fresh checkout.
pathlib.Path('cell_table').mkdir(parents=True, exist_ok=True)
for group, tmpdf in cell_table.groupby('AgingMajorType.Age.Rep'):
    tmpdf = tmpdf[['cell_id', 'paths']].copy()
    tmpdf.to_csv(f'cell_table/{group}.tsv', sep='\t', header=False, index=False)

## merge

In [16]:
# Parameters for `hicluster merge-cell-raw`, plus a single test command.
group = 'test'
chrom_size_path = f'{ref_dir}/mm10.main.nochrM.nochrY.chrom.sizes'
# NOTE: from here on `cell_table` is a path string, no longer the DataFrame
# built in the earlier cells.
cell_table = 'test.tsv'
resolution = 5000
# Values forwarded to --chr1 / --pos1 / --chr2 / --pos2.
CHROM1 = 1
POS1 = 2
CHROM2 = 5
POS2 = 6
MIN_POS_DIST = 2500
output_dir = 'CellType.Age.Rep.Raw.5kb'

# Assemble the command line as a single space-separated string.
command = ' '.join([
    'hicluster merge-cell-raw',
    f'--cell_table {cell_table}',
    f'--chrom_size_path {chrom_size_path}',
    f'--output_file {output_dir}/{group}.cool',
    f'--resolution {resolution}',
    f'--chr1 {CHROM1}',
    f'--pos1 {POS1}',
    f'--chr2 {CHROM2}',
    f'--pos2 {POS2}',
    f'--min_pos_dist {MIN_POS_DIST}',
])

In [28]:
#subprocess.run(command, shell = True)

In [17]:
# Create the directory that receives the merged .cool files (no-op if present).
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

In [18]:
# Group names = file names of the tables in cell_table/, cut at '.tsv'.
all_groups = [
    pathlib.Path(table_path).name.partition('.tsv')[0]
    for table_path in glob.glob('cell_table/*.tsv')
]
all_groups[:3]

['L23_IT_ENT_Glut.18mo.rep2', 'PB_Evx2_Glut.8wk.rep1', 'Microglia_NN.9mo.rep1']

In [23]:
# Generate merge_5kb_raw.sh: one backgrounded merge command per group, with a
# `wait` line after every 40 commands and a final `wait` at the end.
with open('merge_5kb_raw.sh', 'w') as file:
    for index, group in enumerate(all_groups):
        cell_table = f'cell_table/{group}.tsv'
        command = ' '.join((
            'hicluster merge-cell-raw',
            f'--cell_table {cell_table}',
            f'--chrom_size_path {chrom_size_path}',
            f'--output_file {output_dir}/{group}.cool',
            f'--resolution {resolution}',
            f'--chr1 {CHROM1}',
            f'--pos1 {POS1}',
            f'--chr2 {CHROM2}',
            f'--pos2 {POS2}',
            f'--min_pos_dist {MIN_POS_DIST}',
        ))
        file.write(f'{command} &\n')

        # index is 0-based, so positions 39, 79, ... close a batch of 40.
        if index % 40 == 39:
            file.write('wait\n')

    file.write('wait\n')


In [24]:
# Display `command` as left by the script-writing loop (its last iteration).
command

'hicluster merge-cell-raw --cell_table cell_table/STR_Gaba.18mo.rep1.tsv --chrom_size_path /ref/m3C/mm10.main.nochrM.nochrY.chrom.sizes --output_file CellType.Age.Rep.Raw.5kb/STR_Gaba.18mo.rep1.cool --resolution 5000 --chr1 1 --pos1 2 --chr2 5 --pos2 6 --min_pos_dist 2500'