In [1]:
import os
import pandas as pd
import subprocess
import glob
import pysam
from ALLCools.count_matrix.dataset import _CountQuantifier
import csv
from tqdm import tqdm

In [2]:
# Group to process, written as "<cell type>.<age>".
group_name = 'VipSncg Gaba.8wk'
# The name must contain exactly one '.', otherwise the two-way unpacking fails.
group_fields = group_name.split('.')
cell_type, age = group_fields

In [3]:
# Root folder that the ALLC glob patterns and file paths below are built from.
input_dir = "/ceph/gale-1/qzeng/amb_datasets"
# "balanced_allc" under the kernel's current working directory.
# NOTE(review): no cell visible in this notebook reads output_dir; results are
# written under save_dir instead.
output_dir = os.getcwd() + "/balanced_allc"

In [4]:
# Every merged CGN ALLC file found one directory level below each sex folder.
male_merged_allcs = glob.glob(f'{input_dir}/M.CellType.Age/*/*.CGN-Merge.allc.tsv.gz')
female_merged_allcs = glob.glob(f'{input_dir}/F.CellType.Age/*/*.CGN-Merge.allc.tsv.gz')

# The cell-type name is the parent directory name up to its first '.';
# duplicates are collapsed and the result is sorted.
male_ct_name = sorted({
    allc_path.split('/')[-2].split('.')[0] for allc_path in male_merged_allcs
})
female_ct_name = sorted({
    allc_path.split('/')[-2].split('.')[0] for allc_path in female_merged_allcs
})
len(male_ct_name), len(female_ct_name)

(58, 47)

In [5]:
# Turn both '_' and '-' into a space so the names match the spelling used in group_name.
separator_to_space = str.maketrans({'_': ' ', '-': ' '})
modified_male_ct_name = [name.translate(separator_to_space) for name in male_ct_name]
modified_female_ct_name = [name.translate(separator_to_space) for name in female_ct_name]

In [6]:
# Lookup from the space-separated spelling back to the on-disk spelling.
# If two on-disk names normalise to the same key, the later one wins.
male_dict = {
    spaced: on_disk for spaced, on_disk in zip(modified_male_ct_name, male_ct_name)
}
female_dict = {
    spaced: on_disk for spaced, on_disk in zip(modified_female_ct_name, female_ct_name)
}

In [7]:
# "<on-disk cell type>.<age>" names both the directory and the file prefix.
# A cell type missing from either lookup raises KeyError here.
male_stem = f"{male_dict[cell_type]}.{age}"
female_stem = f"{female_dict[cell_type]}.{age}"

male_path = f"{input_dir}/M.CellType.Age/{male_stem}/{male_stem}.CGN-Merge.allc.tsv.gz"
female_path = f"{input_dir}/F.CellType.Age/{female_stem}/{female_stem}.CGN-Merge.allc.tsv.gz"

In [8]:
# Both files are headerless and tab-separated, so the columns get integer labels.
allc_read_options = {'sep': '\t', 'header': None}
male_df = pd.read_csv(male_path, **allc_read_options)
female_df = pd.read_csv(female_path, **allc_read_options)

In [9]:
# Tag every row with a "<column 0>-<column 1>" key (added in place to both frames).
for allc_df in (male_df, female_df):
    allc_df['cytosine_id'] = allc_df[0] + '-' + allc_df[1].astype(str)

# Keys present in both sexes, in sorted (lexicographic string) order.
shared_cytosine_id = sorted(
    set(male_df['cytosine_id']).intersection(female_df['cytosine_id'])
)
len(shared_cytosine_id)

IOStream.flush timed out


20339783

In [10]:
def banlance_line(cytosine_id):
    """Merge the male and female records of one cytosine with equal weight per sex.

    The male counts are rescaled to the female coverage, so each sex contributes
    ``female_cov`` reads to the merged record:

        merged_cov = 2 * female_cov
        merged_mc  = female_mc + floor(male_mc * female_cov / male_cov)

    The records are read from the module-level ``male_tabix`` and
    ``female_tabix`` handles, which must be opened before this is called.

    Parameters
    ----------
    cytosine_id : str
        ``"<chromosome>-<position>"``; the position is the integer after the
        last ``'-'``.

    Returns
    -------
    list
        ``[chrom, pos, field3, field4, merged_mc, merged_cov, flag]``. Every
        field other than the two counts is copied from the female record.
        (Presumably the ALLC columns chrom / pos / strand / context / mc / cov /
        methylated -- confirm against the input files.)

    Raises
    ------
    LookupError
        If either file has no record at the requested position.
    ZeroDivisionError
        If either record reports zero coverage, so no rescaling is possible.
    ValueError
        If the id or a record does not have the expected layout.
    """
    # Split on the last '-' only, so a contig name containing '-' still parses.
    chromosome, start_position = cytosine_id.rsplit('-', 1)
    start_position = int(start_position)

    def _last_record(handle, sex):
        # Fields of the last row the handle returns for this position.
        fields = None
        for row in handle.fetch(chromosome, start_position - 1, start_position):
            fields = row.split('\t')
        if fields is None:
            raise LookupError(f"no {sex} record found for cytosine {cytosine_id!r}")
        return fields

    _, _, _, _, male_mc, male_cov, _ = _last_record(male_tabix, 'male')
    _chr, pos, line, cg_format, female_mc, female_cov, flag = _last_record(
        female_tabix, 'female'
    )

    male_mc, male_cov = int(male_mc), int(male_cov)
    female_mc, female_cov = int(female_mc), int(female_cov)
    if male_cov == 0 or female_cov == 0:
        raise ZeroDivisionError(
            f"cannot balance cytosine {cytosine_id!r}: "
            f"male_cov={male_cov}, female_cov={female_cov}"
        )

    # Exact integer arithmetic: dividing by the float ratio male_cov / female_cov
    # and truncating can land one below the intended integer.
    scaled_male_mc = male_mc * female_cov // male_cov
    merged_cov = 2 * female_cov
    merged_mc = female_mc + scaled_male_mc

    return [_chr, int(pos), line, cg_format, merged_mc, merged_cov, int(flag)]

In [11]:
# Handles that banlance_line reads through module-level names.
male_tabix = pysam.TabixFile(male_path)
female_tabix = pysam.TabixFile(female_path)

# Only the first 10000 shared sites are balanced here, not the full list.
all_lines = [
    banlance_line(cytosine_id) for cytosine_id in tqdm(shared_cytosine_id[:10000])
]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2153.79it/s]


In [12]:
# Directory the balanced TSV is written into. The notebook never creates it, so
# it must already exist when the file is opened for writing.
save_dir = '/home/qzeng/project/aging/230907-recall-dmr/Merge_DMR/Balanced.Merged.Allc'

In [13]:
# Spaces in the group name become underscores for the file name.
use_name = '_'.join(group_name.split(' '))
# The "Blanced" spelling is part of the existing output file name and is kept as is.
tsv_file = f"{save_dir}/{use_name}.Blanced.Merged.AllC.tsv"

# One tab-separated row per balanced record, with no header row.
with open(tsv_file, mode='w', newline='') as tsv_handle:
    csv.writer(tsv_handle, delimiter='\t').writerows(all_lines)

In [14]:
# Disabled draft: a ray-based parallel version of the per-site balancing.
# Nothing in this cell runs; it is kept for reference only.
# NOTE(review): the merged_mc line below uses female_frac in both terms, whereas
# banlance_line scales the *male* count. Fix that before re-enabling this cell.

# import ray
# ray.init(ignore_reinit_error=True)

# male_tabix = pysam.TabixFile(male_path)
# female_tabix = pysam.TabixFile(female_path)

# @ray.remote(num_cpus=20)
# def balance_line(cytosine_id):
#     chromosome, start_position = cytosine_id.split('-')
#     start_position = int(start_position)

#     merged_line = []

#     for row in male_tabix.fetch(chromosome, start_position-1, start_position):
#         _chr, pos, line, cg_format, male_frac, male_cov, _  = row.split('\t')  

#     for row in female_tabix.fetch(chromosome, start_position-1, start_position):
#         _chr, pos, line, cg_format, female_frac, female_cov, _  = row.split('\t')  

#     div_by= int(male_cov) / int(female_cov)
#     merged_cov = int(female_cov) + int(int(male_cov)/div_by)
#     merged_mc = int(female_frac) + int(int(female_frac)/div_by)

#     merged_line = [_chr, int(pos), line, cg_format, merged_mc, merged_cov, int(_)]
#     return merged_line

# all_lines = ray.get([balance_line.remote(cytosine_id) for cytosine_id in shared_cytosine_id[:1000]])