### Add annotation data to the concatenated data
Note: only the annotation data is added here; phase data is not added.

In [1]:
import numpy as np
import pandas as pd
from pandarallel import pandarallel

In [2]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# pandas and other libraries - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Initialise pandarallel with its progress bar turned off.
# NOTE(review): no pandarallel call appears in the cells shown here - confirm
# it is still needed.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Load the concatenated GATK counts for all samples.
concatenated_counts_path = "./data_files/concatenated_data/gatk_all_samples_concatenated.parquet"
df = pd.read_parquet(concatenated_counts_path)

In [4]:
# Preview the first three rows of the concatenated counts.
df.head(3)

Unnamed: 0,tissue,timepoint,loci,refCount,altCount,totalCount,refBias,binomTest,fdr
0,HIP,ZT0,NC_044976.1:204,1,17,18,0.055556,0.000145,0.012577
1,HIP,ZT0,NC_044976.1:4126,1,0,1,1.0,1.0,1.0
2,HIP,ZT0,NC_044976.1:4522,0,1,1,0.0,1.0,1.0


In [5]:
## preview the snpEff annotation table of a single timepoint (ZT0)
zt0_annotation_path = '../../wasp_rerun_june22/experiment_code/vcfs_annotated_with_snpeff_all/ZT0_snpeff_extracted.txt'
annt = pd.read_csv(zt0_annotation_path, sep="\t")
annt.head(2)

Unnamed: 0,CHROM,POS,REF,ALT,FILTER,ANN[*].ALLELE,ANN[*].EFFECT,ANN[*].IMPACT,ANN[*].GENE,ANN[*].GENEID,ANN[*].FEATURE,ANN[*].FEATUREID,ANN[*].BIOTYPE
0,NC_044976.1,204,T,A,PASS,.,.,.,.,.,.,.,.
1,NC_044976.1,441,G,C,PASS,.,.,.,.,.,.,.,.


In [6]:
## read the per-timepoint annotation data and add it to the concatenated GATK counts
# Annotation columns carried into the merged table; 'loci' is the join key.
annt_cols = ['loci', 'REF', 'ALT', "ANN[*].ALLELE", "ANN[*].EFFECT", "ANN[*].IMPACT", "ANN[*].GENE", "ANN[*].FEATURE", "ANN[*].BIOTYPE"]

# Collect one annotated frame per timepoint and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied all accumulated rows on every iteration.
annotated_parts = []
for timepoint in df['timepoint'].unique():
    print(f"Running {timepoint}")
    snv = df[df['timepoint']==timepoint]
    annt = pd.read_csv(f'../../wasp_rerun_june22/experiment_code/vcfs_annotated_with_snpeff_all/{timepoint}_snpeff_extracted.txt', sep="\t")
    # create loci column in the same "CHROM:POS" form used by df['loci']
    annt['loci'] = annt['CHROM'].astype(str) + ':' + annt['POS'].astype(str)
    annt1 = annt[annt_cols]

    # merge with snp data; how='left' keeps every count row, annotated or not
    # NOTE(review): if an annotation file holds more than one row per locus,
    # this merge repeats the matching count row - confirm the extracted files
    # are one line per variant.
    snv = snv.merge(annt1, on='loci', how='left')
    annotated_parts.append(snv)

# Same row order and index values as the former repeated append.
df_fi = pd.concat(annotated_parts)

Running ZT0
Running ZT2
Running ZT4
Running ZT6
Running ZT8
Running ZT10
Running ZT12
Running ZT14
Running ZT16
Running ZT18
Running ZT20
Running ZT22


In [7]:
# Preview the first two rows of the annotated table.
df_fi.head(2)

Unnamed: 0,tissue,timepoint,loci,refCount,altCount,totalCount,refBias,binomTest,fdr,REF,ALT,ANN[*].ALLELE,ANN[*].EFFECT,ANN[*].IMPACT,ANN[*].GENE,ANN[*].FEATURE,ANN[*].BIOTYPE
0,HIP,ZT0,NC_044976.1:204,1,17,18,0.055556,0.000145,0.012577,T,A,.,.,.,.,.,.
1,HIP,ZT0,NC_044976.1:4126,1,0,1,1.0,1.0,1.0,G,A,.,.,.,.,.,.


In [8]:
# Write the annotated table as tab-separated text without the row index
# (the output file keeps its .csv extension).
annotated_output_path = "./data_files/concatenated_data/gatk_all_samples_concatenated_annt_added_singleline.csv"
df_fi.to_csv(annotated_output_path, sep="\t", index=False)