In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc = {'figure.figsize':(12, 6)})
sns.set_style("ticks")
sns.set_context("paper", font_scale=1.5)
sns.set_palette("crest")

In [3]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


#### read ASE dataframes

In [4]:
## Read the AI data for all samples and timepoints
df_usable = pd.read_parquet("ASE_dataframes/usable_data.parquet")
df_ai = pd.read_parquet("ASE_dataframes/ai_data.parquet")

In [5]:
df_usable.head(2)

Unnamed: 0,tissue,timepoint,loci,refCount,altCount,totalCount,refBias,binomTest,fdr,REF,...,ai_type,ai_cat,chrom,pos,SNV,genomic_retained,gene_id,TPM,region,sample
12,HIP,ZT0,NC_044976.1:13800,3,4,7,0.428571,1.0,1.0,A,...,no_ai,non_AI,1,13800,1:13800,retained,ANKRD65,2.5,UTR,ZT0_HIP
23,HIP,ZT0,NC_044976.1:21050,3,3,6,0.5,1.0,1.0,G,...,no_ai,non_AI,1,21050,1:21050,retained,ANKRD65,2.5,UTR,ZT0_HIP


#### read bed files and only keep usable sites

In [10]:
usable_sites = set(df_usable['loci'].values)

In [20]:
animals = [f"ZT{i}" for i in range(0,23,2)]
for animal in animals:
    with open(f"baboon_to_human_liftover/bed_files_usable_sites/{animal}_PE_phase_added.bed", 'w') as outfile:
        with open(f"baboon_to_human_liftover/bed_files/{animal}_PE_phase_added.bed", 'r') as bedfile:
            lines_wrote = 0
            for lines in bedfile:
                dat = lines.strip().split("\t")
                if (f"{dat[0]}:{dat[2]}" in usable_sites):
                    outfile.write(lines)
                    lines_wrote += 1
    print(animal, lines_wrote)

ZT4 56426
ZT6 56101
ZT8 51391
ZT10 57819
ZT12 58122
ZT14 52106
ZT16 58962
ZT18 58756
ZT20 60712
ZT22 56132


In [28]:
from glob import glob
col_names = ['chr', 'start', 'end', 'loci', 'score', 'ref', 'alt', 'pass', 'info', 'format', 'sample']
bed_files = glob("baboon_to_human_liftover/bed_files_usable_sites/*.bed")
for ln, bed_file in enumerate(bed_files):
    if ln == 0:
        df = pd.read_csv(bed_file, sep="\t", header=None)
        df.columns = col_names
    else:
        dft = pd.read_csv(bed_file, sep="\t", header=None)
        dft.columns = col_names
        df = pd.concat([df, dft])
    print(ln, df.shape)
        

0 (56426, 11)
1 (106924, 11)
2 (164743, 11)
3 (220844, 11)
4 (281331, 11)
5 (333437, 11)
6 (384828, 11)
7 (442950, 11)
8 (501912, 11)
9 (558044, 11)
10 (616800, 11)
11 (677512, 11)


In [35]:
df.head(2)

Unnamed: 0,chr,start,end,loci,score,ref,alt,pass,info,format,sample
0,NC_044976.1,10271,10272,.,6866.02,G,C,PASS,AC=13;AF=0.542;AN=24;BaseQRankSum=-0.997;DP=49...,GT:AD:DP:GQ:PS,"0|1:17,22:39:99:194"
1,NC_044976.1,13799,13800,.,4685.08,A,G,PASS,AC=10;AF=0.417;AN=24;BaseQRankSum=-2.193;DP=43...,GT:AD:DP:GQ:PS,"1|0:18,22:40:99:194"


In [39]:
dft = df[['chr', 'start', 'end', "ref", "alt"]]
dft['loci'] = dft['chr'] + ":" + dft['start'].astype(str) + ":" + dft['ref'] + ":" + dft['alt']
 
dft[['chr', 'start', 'end', 'loci']].drop_duplicates().to_csv("baboon_to_human_liftover/bed_files_usable_sites/loci.bed", sep="\t", header=None, index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dft['loci'] = dft['chr'] + ":" + dft['start'].astype(str) + ":" + dft['ref'] + ":" + dft['alt']


In [37]:
dft.shape

(677512, 6)

In [38]:
dft.head(2)

Unnamed: 0,chr,start,end,ref,alt,loci
0,NC_044976.1,10271,10272,G,C,NC_044976.1:10271:G:C
1,NC_044976.1,13799,13800,A,G,NC_044976.1:13799:A:G
