In [1]:
# Parameters
cpu = 8
group_name = "Astro-TE_NN"
mem_gb = 10


In [2]:
import pandas as pd
import subprocess
import numpy as np
import pyBigWig
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
#group_name = 'DG_Glut'

In [4]:
home_dir = '/home/qzeng_salk_edu/project/241018_pre_ml'
ct = group_name
dar_dir = '/data/combined_DARs_redo'
abc_dir = '/data/celltype_age_RPM_files'
_type = 'up_peaks'

In [5]:
dar_ct_dict = {'L23_IT_CTX_Glut':'L2-3_IT_CTX_Glut'}
bw_ct_dict = {'L23_IT_CTX_Glut':'L2_3_IT_CTX_Glut'}
if ct in bw_ct_dict.keys():
    dar_ct = dar_ct_dict[ct]
    bw_ct = bw_ct_dict[ct]
else:
    dar_ct = ct
    bw_ct = ct

In [6]:
atac_df  = pd.read_csv(f'{dar_dir}/diff_peaks_{dar_ct}_2vs18.csv', sep=',', index_col=0)

In [7]:
# add a chrom, start, end based on the index 
atac_df['chrom'] = atac_df.index.str.split(':').str[0]
atac_df['start'] = atac_df.index.str.split(':').str[1].str.split('-').str[0].astype(int)
atac_df['end'] = atac_df.index.str.split(':').str[1].str.split('-').str[1].astype(int)
atac_df

Unnamed: 0_level_0,log2(fold_change),p-value,adjusted p-value,chrom,start,end
feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
chr5:109558426-109558927,1.701923,0.000000e+00,0.000000e+00,chr5,109558426,109558927
chr7:26783186-26783687,3.733156,0.000000e+00,0.000000e+00,chr7,26783186,26783687
chr5:109557248-109557749,1.024222,1.253381e-251,3.117159e-248,chr5,109557248,109557749
chr7:47651736-47652237,4.687733,1.614651e-204,3.011729e-201,chr7,47651736,47652237
chr5:109556731-109557232,1.251654,3.179016e-200,4.743727e-197,chr5,109556731,109557232
...,...,...,...,...,...,...
chr17:6961171-6961672,0.252943,7.581730e-01,7.585797e-01,chr17,6961171,6961672
chr10:79716949-79717450,0.265331,7.663166e-01,7.666249e-01,chr10,79716949,79717450
chr13:52047390-52047891,0.269548,7.710420e-01,7.712487e-01,chr13,52047390,52047891
chr5:135393837-135394338,0.256367,8.954036e-01,8.955236e-01,chr5,135393837,135394338


In [8]:
up_atac_df = pd.read_csv(f'{dar_dir}/up_peaks_{dar_ct}.bed', sep='\t', header = None)
#add index like in the atac_df
up_atac_df.index = up_atac_df[0] + ':' + up_atac_df[1].astype(str) + '-' + up_atac_df[2].astype(str)
# add the log(fold_change) baes on the info in atac_df
up_atac_df['log2FoldChange'] = atac_df['log2(fold_change)']
up_atac_df.head()

Unnamed: 0,0,1,2,log2FoldChange
chr5:109558426-109558927,chr5,109558426,109558927,1.701923
chr7:26783186-26783687,chr7,26783186,26783687,3.733156
chr5:109557248-109557749,chr5,109557248,109557749,1.024222
chr7:47651736-47652237,chr7,47651736,47652237,4.687733
chr5:109556731-109557232,chr5,109556731,109557232,1.251654


In [9]:
# write the same for down_peaks
down_atac_df = pd.read_csv(f'{dar_dir}/down_peaks_{dar_ct}.bed', sep='\t', header = None)
down_atac_df.index = down_atac_df[0] + ':' + down_atac_df[1].astype(str) + '-' + down_atac_df[2].astype(str)
down_atac_df['log2FoldChange'] = atac_df['log2(fold_change)']
down_atac_df.head()

Unnamed: 0,0,1,2,log2FoldChange
chr13:23571010-23571511,chr13,23571010,23571511,-1.714639
chr13:23562214-23562715,chr13,23562214,23562715,-1.78749
chr13:23574072-23574573,chr13,23574072,23574573,-1.909879
chr7:44590526-44591027,chr7,44590526,44591027,-1.240907
chr13:23555969-23556470,chr13,23555969,23556470,-1.999283


In [10]:
# combina the down and up
atac_df = pd.concat([up_atac_df, down_atac_df])
# save atac_df to bed
atac_df.to_csv(f'{ct}.aDAR.bed', sep='\t', header=False, index=False)

In [11]:
# use bedtools intersect to find the genes that overlap with the DMRs
command = f"bedtools intersect -a {ct}.aDAR.bed -b {home_dir}/mm10_gene_2kb.bed -wa -wb > {ct}.aDAR_gene.bed"
subprocess.run(command, shell=True)

CompletedProcess(args='bedtools intersect -a Astro-TE_NN.aDAR.bed -b /home/qzeng_salk_edu/project/241018_pre_ml/mm10_gene_2kb.bed -wa -wb > Astro-TE_NN.aDAR_gene.bed', returncode=0)

In [12]:
subprocess.run(f"rm {ct}.aDAR.bed", shell=True)

CompletedProcess(args='rm Astro-TE_NN.aDAR.bed', returncode=0)

In [13]:
dar_gene_df = pd.read_csv(f'{ct}.aDAR_gene.bed', sep='\t', header=None)
dar_gene_df.columns = ['chr','start','end','log2(old/young)','gene_chr','gene_start','gene_end','gene_id','gene_strand','gene_name','gene_type']
dar_gene_df.head()

Unnamed: 0,chr,start,end,log2(old/young),gene_chr,gene_start,gene_end,gene_id,gene_strand,gene_name,gene_type
0,chr5,109558426,109558927,1.701923,chr5,109552711,109560993,ENSMUSG00000033467.8,-,Crlf2,protein_coding
1,chr5,109557248,109557749,1.024222,chr5,109552711,109560993,ENSMUSG00000033467.8,-,Crlf2,protein_coding
2,chr5,109556731,109557232,1.251654,chr5,109552711,109560993,ENSMUSG00000033467.8,-,Crlf2,protein_coding
3,chr5,109557903,109558404,1.598321,chr5,109552711,109560993,ENSMUSG00000033467.8,-,Crlf2,protein_coding
4,chr5,109167108,109167609,5.082705,chr5,109154068,109194107,ENSMUSG00000091635.1,-,Vmn2r13,protein_coding


In [14]:
atac_rmp = pd.read_csv(f"{abc_dir}/{ct}_RPM.txt", sep = '\t')
# add three coolumns chr, start and end based on ethe index 
atac_rmp['chr'] = atac_rmp.index.str.split(':').str[0]
atac_rmp['start'] = atac_rmp.index.str.split(':').str[1].str.split('-').str[0].astype(int)
atac_rmp['end'] = atac_rmp.index.str.split(':').str[1].str.split('-').str[1].astype(int)
atac_rmp

Unnamed: 0,Astro-TE NN:2mo,Astro-TE NN:9mo,Astro-TE NN:18mo,chr,start,end
chr1:3399698-3400199,1.935104,1.425978,1.328092,chr1,3399698,3400199
chr1:3400380-3400881,0.408325,0.206821,0.426887,chr1,3400380,3400881
chr1:3671537-3672038,6.811330,7.347597,8.632596,chr1,3671537,3672038
chr1:3670856-3671357,2.272416,2.481855,2.739189,chr1,3670856,3671357
chr1:3672169-3672670,0.822567,1.055877,1.102790,chr1,3672169,3672670
...,...,...,...,...,...,...
chrY:90804895-90805396,2.799096,5.029022,3.960559,chrY,90804895,90805396
chrY:90807492-90807993,7.610226,9.840337,7.861829,chrY,90807492,90807993
chrY:90806986-90807487,1.751654,2.547167,1.529677,chrY,90806986,90807487
chrY:90808570-90809071,6.260979,9.862108,8.846040,chrY,90808570,90809071


In [15]:
dar_gene_df.index = dar_gene_df['chr'] + ':' + dar_gene_df['start'].astype(str) + '-' + dar_gene_df['end'].astype(str)

In [16]:
shared_index = dar_gene_df.index.intersection(atac_rmp.index)
len(shared_index)

4229

In [17]:
dar_gene_df['2mo'] = atac_rmp.iloc[:,0].to_dict()
dar_gene_df['9mo'] = atac_rmp.iloc[:,1].to_dict()
dar_gene_df['18mo'] = atac_rmp.iloc[:,2].to_dict()

In [18]:
non_na_dar_gene_df = dar_gene_df[~dar_gene_df.isna().any(axis=1)]
with_na_dar_gene_df =  dar_gene_df[dar_gene_df.isna().any(axis=1)]

In [19]:
# use PyBigwig to get the atac information for the bw files
bw_dir = "/data/from_luisa/female_atac"
age_list = ['2mo', '9mo', '18mo']
total_counts_df = {}
for age in age_list:
    bw = pyBigWig.open(f"{bw_dir}/Female_{bw_ct}_{age}_merge.bw")
    total_counts = []
    for dmr_id in tqdm(with_na_dar_gene_df.index):
        _chr, start, end = dmr_id.split(':')[0], dmr_id.split(':')[1].split('-')[0], dmr_id.split(':')[1].split('-')[1]
        start, end = int(start), int(end)
        _count = bw.stats(_chr, start, end, type="mean")
        total_counts.append(_count[0])
        
    total_counts_df[age] = total_counts
total_counts_df = pd.DataFrame(total_counts_df)
    

  0%|          | 0/337 [00:00<?, ?it/s]

  0%|          | 1/337 [00:00<00:38,  8.62it/s]

  1%|          | 2/337 [00:00<00:36,  9.12it/s]

  1%|          | 3/337 [00:00<00:39,  8.56it/s]

  1%|          | 4/337 [00:00<00:39,  8.34it/s]

  2%|▏         | 6/337 [00:00<00:37,  8.72it/s]

  2%|▏         | 8/337 [00:00<00:32, 10.20it/s]

  3%|▎         | 10/337 [00:01<00:31, 10.52it/s]

  4%|▎         | 12/337 [00:01<00:33,  9.70it/s]

  4%|▍         | 13/337 [00:01<00:34,  9.31it/s]

  4%|▍         | 14/337 [00:01<00:46,  6.92it/s]

  5%|▍         | 16/337 [00:01<00:34,  9.19it/s]

  5%|▌         | 18/337 [00:01<00:30, 10.51it/s]

  6%|▌         | 20/337 [00:02<00:31,  9.93it/s]

  7%|▋         | 22/337 [00:02<00:29, 10.72it/s]

  7%|▋         | 24/337 [00:02<00:25, 12.11it/s]

  8%|▊         | 27/337 [00:02<00:23, 13.16it/s]

  9%|▊         | 29/337 [00:02<00:24, 12.74it/s]

  9%|▉         | 31/337 [00:02<00:21, 13.98it/s]

 10%|█         | 34/337 [00:03<00:18, 16.51it/s]

 11%|█         | 37/337 [00:03<00:16, 18.14it/s]

 12%|█▏        | 39/337 [00:03<00:16, 18.17it/s]

 12%|█▏        | 41/337 [00:03<00:20, 14.27it/s]

 13%|█▎        | 44/337 [00:03<00:17, 17.06it/s]

 14%|█▍        | 48/337 [00:03<00:14, 19.51it/s]

 15%|█▌        | 51/337 [00:03<00:16, 17.27it/s]

 16%|█▌        | 53/337 [00:04<00:19, 14.80it/s]

 17%|█▋        | 57/337 [00:04<00:14, 19.07it/s]

 18%|█▊        | 60/337 [00:04<00:13, 20.73it/s]

 19%|█▊        | 63/337 [00:04<00:12, 21.16it/s]

 20%|█▉        | 66/337 [00:04<00:13, 19.66it/s]

 20%|██        | 69/337 [00:04<00:13, 20.11it/s]

 21%|██▏       | 72/337 [00:04<00:12, 21.95it/s]

 23%|██▎       | 79/337 [00:05<00:07, 33.39it/s]

 25%|██▍       | 83/337 [00:05<00:09, 26.97it/s]

 26%|██▌       | 87/337 [00:05<00:08, 29.23it/s]

 27%|██▋       | 91/337 [00:05<00:09, 26.83it/s]

 29%|██▉       | 98/337 [00:05<00:07, 30.81it/s]

 31%|███▏      | 106/337 [00:05<00:05, 40.10it/s]

 33%|███▎      | 111/337 [00:06<00:06, 37.30it/s]

 41%|████      | 137/337 [00:06<00:02, 82.34it/s]

 44%|████▎     | 147/337 [00:06<00:03, 55.85it/s]

 46%|████▌     | 155/337 [00:06<00:03, 46.14it/s]

 48%|████▊     | 161/337 [00:07<00:05, 34.44it/s]

 49%|████▉     | 166/337 [00:07<00:04, 34.58it/s]

 51%|█████     | 171/337 [00:07<00:04, 34.37it/s]

 52%|█████▏    | 176/337 [00:07<00:04, 36.03it/s]

 54%|█████▎    | 181/337 [00:07<00:04, 34.70it/s]

 55%|█████▌    | 186/337 [00:07<00:04, 36.14it/s]

 56%|█████▋    | 190/337 [00:07<00:04, 31.60it/s]

 58%|█████▊    | 194/337 [00:08<00:04, 30.62it/s]

 60%|█████▉    | 202/337 [00:08<00:03, 40.91it/s]

 61%|██████▏   | 207/337 [00:08<00:03, 39.38it/s]

 63%|██████▎   | 212/337 [00:08<00:03, 32.55it/s]

 65%|██████▍   | 218/337 [00:08<00:03, 35.51it/s]

 66%|██████▌   | 223/337 [00:08<00:03, 36.49it/s]

 67%|██████▋   | 227/337 [00:08<00:03, 34.68it/s]

 71%|███████   | 238/337 [00:09<00:01, 51.66it/s]

 72%|███████▏  | 244/337 [00:09<00:01, 51.64it/s]

 75%|███████▌  | 254/337 [00:09<00:01, 58.87it/s]

 77%|███████▋  | 261/337 [00:09<00:01, 46.52it/s]

 79%|███████▉  | 267/337 [00:09<00:01, 47.78it/s]

 82%|████████▏ | 277/337 [00:09<00:01, 56.18it/s]

 84%|████████▍ | 284/337 [00:09<00:00, 58.42it/s]

 86%|████████▋ | 291/337 [00:10<00:00, 61.18it/s]

 88%|████████▊ | 298/337 [00:10<00:00, 61.52it/s]

 91%|█████████ | 305/337 [00:10<00:00, 55.31it/s]

 92%|█████████▏| 311/337 [00:10<00:00, 43.46it/s]

 94%|█████████▍| 316/337 [00:10<00:00, 37.61it/s]

 95%|█████████▌| 321/337 [00:10<00:00, 34.41it/s]

 99%|█████████▊| 332/337 [00:10<00:00, 49.51it/s]

100%|██████████| 337/337 [00:11<00:00, 30.36it/s]




  0%|          | 0/337 [00:00<?, ?it/s]

  0%|          | 1/337 [00:00<00:42,  7.83it/s]

  1%|          | 2/337 [00:00<00:45,  7.29it/s]

  1%|          | 3/337 [00:00<00:48,  6.87it/s]

  1%|          | 4/337 [00:00<00:57,  5.76it/s]

  2%|▏         | 6/337 [00:00<00:43,  7.62it/s]

  2%|▏         | 8/337 [00:00<00:33,  9.85it/s]

  3%|▎         | 10/337 [00:01<00:33,  9.86it/s]

  4%|▎         | 12/337 [00:01<00:32,  9.86it/s]

  4%|▍         | 14/337 [00:01<00:37,  8.62it/s]

  5%|▍         | 16/337 [00:01<00:31, 10.12it/s]

  5%|▌         | 18/337 [00:02<00:32,  9.71it/s]

  6%|▌         | 20/337 [00:02<00:29, 10.74it/s]

  7%|▋         | 22/337 [00:02<00:27, 11.36it/s]

  7%|▋         | 24/337 [00:02<00:24, 12.86it/s]

  8%|▊         | 27/337 [00:02<00:20, 15.33it/s]

  9%|▊         | 29/337 [00:02<00:22, 13.83it/s]

 10%|█         | 34/337 [00:02<00:14, 20.69it/s]

 11%|█         | 37/337 [00:02<00:14, 20.28it/s]

 12%|█▏        | 41/337 [00:03<00:12, 23.54it/s]

 14%|█▎        | 46/337 [00:03<00:10, 27.14it/s]

 15%|█▍        | 49/337 [00:03<00:12, 22.78it/s]

 15%|█▌        | 52/337 [00:03<00:15, 18.48it/s]

 16%|█▋        | 55/337 [00:03<00:14, 19.34it/s]

 17%|█▋        | 58/337 [00:03<00:13, 21.32it/s]

 18%|█▊        | 62/337 [00:04<00:12, 21.48it/s]

 20%|█▉        | 66/337 [00:04<00:12, 21.38it/s]

 20%|██        | 69/337 [00:04<00:13, 20.19it/s]

 22%|██▏       | 73/337 [00:04<00:11, 23.01it/s]

 24%|██▎       | 80/337 [00:04<00:08, 31.28it/s]

 25%|██▍       | 84/337 [00:04<00:08, 29.33it/s]

 26%|██▋       | 89/337 [00:05<00:07, 31.55it/s]

 28%|██▊       | 93/337 [00:05<00:07, 31.46it/s]

 29%|██▉       | 98/337 [00:05<00:06, 34.98it/s]

 31%|███       | 103/337 [00:05<00:06, 35.10it/s]

 32%|███▏      | 109/337 [00:05<00:06, 35.53it/s]

 41%|████      | 137/337 [00:05<00:02, 85.28it/s]

 43%|████▎     | 146/337 [00:06<00:03, 55.45it/s]

 45%|████▌     | 153/337 [00:06<00:04, 40.57it/s]

 47%|████▋     | 159/337 [00:06<00:05, 32.25it/s]

 49%|████▊     | 164/337 [00:07<00:06, 25.51it/s]

 50%|████▉     | 168/337 [00:07<00:06, 25.66it/s]

 52%|█████▏    | 174/337 [00:07<00:05, 27.23it/s]

 53%|█████▎    | 178/337 [00:07<00:05, 28.66it/s]

 54%|█████▍    | 182/337 [00:07<00:05, 29.05it/s]

 56%|█████▌    | 188/337 [00:07<00:04, 33.17it/s]

 57%|█████▋    | 192/337 [00:08<00:05, 26.39it/s]

 60%|█████▉    | 202/337 [00:08<00:03, 39.59it/s]

 61%|██████▏   | 207/337 [00:08<00:03, 34.73it/s]

 63%|██████▎   | 213/337 [00:08<00:03, 37.66it/s]

 65%|██████▍   | 218/337 [00:08<00:03, 36.75it/s]

 67%|██████▋   | 225/337 [00:08<00:02, 42.03it/s]

 68%|██████▊   | 230/337 [00:08<00:02, 40.75it/s]

 72%|███████▏  | 242/337 [00:09<00:01, 52.50it/s]

 75%|███████▌  | 253/337 [00:09<00:01, 62.09it/s]

 77%|███████▋  | 260/337 [00:09<00:01, 49.12it/s]

 79%|███████▉  | 266/337 [00:09<00:01, 47.60it/s]

 81%|████████  | 273/337 [00:09<00:01, 50.60it/s]

 83%|████████▎ | 279/337 [00:09<00:01, 47.37it/s]

 85%|████████▍ | 285/337 [00:09<00:01, 45.36it/s]

 86%|████████▋ | 291/337 [00:10<00:00, 48.36it/s]

 88%|████████▊ | 298/337 [00:10<00:00, 50.79it/s]

 90%|█████████ | 304/337 [00:10<00:00, 52.69it/s]

 92%|█████████▏| 310/337 [00:10<00:00, 35.31it/s]

 93%|█████████▎| 315/337 [00:10<00:00, 32.91it/s]

 95%|█████████▍| 319/337 [00:10<00:00, 33.81it/s]

 98%|█████████▊| 329/337 [00:10<00:00, 45.15it/s]

100%|██████████| 337/337 [00:11<00:00, 50.00it/s]

100%|██████████| 337/337 [00:11<00:00, 30.30it/s]




  0%|          | 0/337 [00:00<?, ?it/s]

  0%|          | 1/337 [00:00<00:53,  6.26it/s]

  1%|          | 2/337 [00:00<00:43,  7.74it/s]

  1%|          | 3/337 [00:00<00:46,  7.12it/s]

  1%|          | 4/337 [00:00<00:52,  6.31it/s]

  2%|▏         | 6/337 [00:00<00:45,  7.20it/s]

  2%|▏         | 8/337 [00:01<00:37,  8.70it/s]

  3%|▎         | 9/337 [00:01<00:40,  8.09it/s]

  3%|▎         | 11/337 [00:01<00:34,  9.53it/s]

  4%|▍         | 13/337 [00:01<00:32, 10.01it/s]

  4%|▍         | 15/337 [00:01<00:27, 11.78it/s]

  5%|▌         | 17/337 [00:01<00:32,  9.82it/s]

  6%|▌         | 20/337 [00:02<00:25, 12.62it/s]

  7%|▋         | 22/337 [00:02<00:25, 12.34it/s]

  7%|▋         | 24/337 [00:02<00:23, 13.38it/s]

  8%|▊         | 27/337 [00:02<00:21, 14.40it/s]

  9%|▊         | 29/337 [00:02<00:21, 14.14it/s]

 10%|█         | 34/337 [00:02<00:15, 19.20it/s]

 11%|█         | 36/337 [00:02<00:17, 17.03it/s]

 12%|█▏        | 40/337 [00:03<00:13, 21.76it/s]

 13%|█▎        | 44/337 [00:03<00:14, 20.49it/s]

 14%|█▍        | 48/337 [00:03<00:12, 23.85it/s]

 15%|█▌        | 51/337 [00:03<00:15, 18.58it/s]

 16%|█▌        | 54/337 [00:03<00:14, 19.76it/s]

 17%|█▋        | 58/337 [00:03<00:12, 22.76it/s]

 19%|█▉        | 64/337 [00:04<00:09, 29.82it/s]

 20%|██        | 68/337 [00:04<00:10, 26.62it/s]

 21%|██        | 71/337 [00:04<00:12, 22.04it/s]

 23%|██▎       | 78/337 [00:04<00:08, 29.49it/s]

 25%|██▍       | 84/337 [00:04<00:07, 33.64it/s]

 27%|██▋       | 90/337 [00:04<00:06, 38.23it/s]

 28%|██▊       | 96/337 [00:04<00:05, 42.43it/s]

 30%|██▉       | 101/337 [00:05<00:06, 37.40it/s]

 32%|███▏      | 109/337 [00:05<00:05, 42.52it/s]

 41%|████      | 137/337 [00:05<00:02, 93.96it/s]

 44%|████▍     | 148/337 [00:05<00:02, 65.83it/s]

 47%|████▋     | 157/337 [00:05<00:03, 50.72it/s]

 49%|████▊     | 164/337 [00:06<00:03, 43.72it/s]

 50%|█████     | 170/337 [00:06<00:03, 42.31it/s]

 52%|█████▏    | 176/337 [00:06<00:03, 42.78it/s]

 54%|█████▎    | 181/337 [00:06<00:03, 41.60it/s]

 55%|█████▌    | 186/337 [00:06<00:03, 43.01it/s]

 57%|█████▋    | 191/337 [00:06<00:04, 36.31it/s]

 60%|█████▉    | 202/337 [00:07<00:02, 49.33it/s]

 62%|██████▏   | 208/337 [00:07<00:02, 45.52it/s]

 64%|██████▎   | 214/337 [00:07<00:02, 45.03it/s]

 65%|██████▍   | 219/337 [00:07<00:02, 43.51it/s]

 67%|██████▋   | 226/337 [00:07<00:02, 49.50it/s]

 69%|██████▉   | 232/337 [00:07<00:02, 51.54it/s]

 72%|███████▏  | 242/337 [00:07<00:01, 62.75it/s]

 75%|███████▌  | 254/337 [00:07<00:01, 73.91it/s]

 78%|███████▊  | 262/337 [00:08<00:01, 54.35it/s]

 80%|███████▉  | 269/337 [00:08<00:01, 53.65it/s]

 82%|████████▏ | 277/337 [00:08<00:01, 56.77it/s]

 84%|████████▍ | 284/337 [00:08<00:00, 59.42it/s]

 87%|████████▋ | 293/337 [00:08<00:00, 63.08it/s]

 89%|████████▉ | 300/337 [00:08<00:00, 63.86it/s]

 91%|█████████ | 307/337 [00:08<00:00, 50.26it/s]

 93%|█████████▎| 313/337 [00:09<00:00, 45.33it/s]

 94%|█████████▍| 318/337 [00:09<00:00, 43.22it/s]

 96%|█████████▌| 323/337 [00:09<00:00, 35.56it/s]

 99%|█████████▉| 334/337 [00:09<00:00, 48.70it/s]

100%|██████████| 337/337 [00:09<00:00, 34.79it/s]




In [20]:
for age in age_list:
    with_na_dar_gene_df[age] = total_counts_df[age].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_na_dar_gene_df[age] = total_counts_df[age].values


In [21]:
with_na_dar_gene_df['log2(18mo/2mo)'] = np.log2(with_na_dar_gene_df['18mo']/with_na_dar_gene_df['2mo'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_na_dar_gene_df['log2(18mo/2mo)'] = np.log2(with_na_dar_gene_df['18mo']/with_na_dar_gene_df['2mo'])


In [22]:
# map rows in dar_gene_df that is in with_na_dar_gene_df.index, give 2mo column values based on with_na_dar_gene_df['2mo'].to_dict()
dar_gene_df.loc[with_na_dar_gene_df.index, '2mo'] = with_na_dar_gene_df['2mo'].to_dict()
dar_gene_df.loc[with_na_dar_gene_df.index, '9mo'] = with_na_dar_gene_df['9mo'].to_dict()
dar_gene_df.loc[with_na_dar_gene_df.index, '18mo'] = with_na_dar_gene_df['18mo'].to_dict()

In [23]:
dar_gene_df.to_csv(f'{home_dir}/ml_input/{ct}/{ct}.aDAR_gene.csv',  index=False)

In [24]:
subprocess.run(f"rm {ct}.aDAR_gene.bed", shell=True)

CompletedProcess(args='rm Astro-TE_NN.aDAR_gene.bed', returncode=0)