In [1]:
import numpy as np
import pandas as pd
from glob import glob

In [2]:
from tqdm.notebook import tqdm
tqdm.pandas()
# pandarallel.initialize(progress_bar=True, nb_workers=8)

In [17]:
import matplotlib.pyplot as plt
import seaborn as sns

### Read concatenated TPM table

In [3]:
df = pd.read_csv("../../rsem/gene_tpm_counts.tsv", sep="\t")

In [4]:
df.head(3)

Unnamed: 0,gene_id,transcript_id(s),length,effective_length,expected_count,TPM,FPKM,sample
0,A1BG,"XM_003916257.5,XM_009195529.4",2862.61,2655.86,52.0,1.92,1.0,HIP_ZT12
1,A1CF,"XM_003903944.5,XM_009214926.4,XM_031652209.1,X...",4138.25,3931.5,0.0,0.0,0.0,HIP_ZT12
2,A2ML1,XM_031650266.1,5656.0,5449.25,18.53,0.33,0.17,HIP_ZT12


In [5]:
df['timepoint'] = df['sample'].apply(lambda x: x.split('_')[1])
df['tissue'] = df['sample'].apply(lambda x: x.split('_')[0])

In [6]:
df.head(3)

Unnamed: 0,gene_id,transcript_id(s),length,effective_length,expected_count,TPM,FPKM,sample,timepoint,tissue
0,A1BG,"XM_003916257.5,XM_009195529.4",2862.61,2655.86,52.0,1.92,1.0,HIP_ZT12,ZT12,HIP
1,A1CF,"XM_003903944.5,XM_009214926.4,XM_031652209.1,X...",4138.25,3931.5,0.0,0.0,0.0,HIP_ZT12,ZT12,HIP
2,A2ML1,XM_031650266.1,5656.0,5449.25,18.53,0.33,0.17,HIP_ZT12,ZT12,HIP


#### Read RNA AI table

In [7]:
rna = pd.read_parquet("./data_files/concatenated_data/gatk_all_samples_concatenated_annt_added_multiline_dupes_dropped_phase_added.parquet")

In [8]:
rna.head(2)

Unnamed: 0,tissue,timepoint,loci,refCount,altCount,totalCount,refBias,binomTest,fdr,REF,...,ANN[*].GENE,ANN[*].FEATURE,ANN[*].BIOTYPE,uq_id,phase_info,phase_dat,is_phased,phase_id,homologBias,ai_type
0,HIP,ZT0,NC_044976.1:204,1,17,18,0.055556,0.000145,0.012577,T,...,.,.,.,HIP:ZT0:NC_044976.1:204,GT:AD:DP:GQ,"0/1:350,158:509:99",not_phased,not_phased,,MA
1,HIP,ZT0,NC_044976.1:4126,1,0,1,1.0,1.0,1.0,G,...,.,.,.,HIP:ZT0:NC_044976.1:4126,GT:AD:DP:GQ:PS,"1|0:19,17:36:99:204",phased,204,0.0,no_ai


In [9]:
def categorize_SNV(x):
    if x['totalCount'] <=5:
        return "LT5_reads"
    elif x['binomTest'] >= 0.05:
        return "binom_failed"
    elif x['fdr'] >= 0.05:
        return "fdr_failed"
    else:
        return x['ai_type']

In [10]:
rna['ai_cat'] = rna.progress_apply(categorize_SNV, axis=1)

  0%|          | 0/47889413 [00:00<?, ?it/s]

In [11]:
rna = rna.merge(df[['timepoint', 'tissue', 'gene_id', 'TPM', 'FPKM']],
                left_on=['timepoint', 'tissue', 'ANN[*].GENE'],
                right_on=['timepoint', 'tissue', 'gene_id'], how='left')

In [13]:
rna.head(20)

Unnamed: 0,tissue,timepoint,loci,refCount,altCount,totalCount,refBias,binomTest,fdr,REF,...,phase_info,phase_dat,is_phased,phase_id,homologBias,ai_type,ai_cat,gene_id,TPM,FPKM
0,HIP,ZT0,NC_044976.1:204,1,17,18,0.055556,0.000145,0.012577,T,...,GT:AD:DP:GQ,"0/1:350,158:509:99",not_phased,not_phased,,MA,MA,,,
1,HIP,ZT0,NC_044976.1:4126,1,0,1,1.0,1.0,1.0,G,...,GT:AD:DP:GQ:PS,"1|0:19,17:36:99:204",phased,204,0.0,no_ai,LT5_reads,,,
2,HIP,ZT0,NC_044976.1:4522,0,1,1,0.0,1.0,1.0,G,...,GT:AD:DP:GQ:PS,"1|0:27,13:40:99:204",phased,204,1.0,no_ai,LT5_reads,,,
3,HIP,ZT0,NC_044976.1:6729,39,27,66,0.590909,0.175286,1.0,C,...,GT:AD:DP:GQ:PS,"1|0:18,18:36:99:204",phased,204,0.409091,no_ai,binom_failed,,,
4,HIP,ZT0,NC_044976.1:6871,55,53,108,0.509259,0.923401,1.0,G,...,GT:AD:DP:GQ:PS,"1|0:22,21:44:99:204",phased,204,0.490741,no_ai,binom_failed,,,
5,HIP,ZT0,NC_044976.1:7352,23,19,42,0.547619,0.643969,1.0,A,...,GT:AD:DP:GQ:PS,"0|1:14,20:35:99:204",phased,204,0.547619,no_ai,binom_failed,,,
6,HIP,ZT0,NC_044976.1:7697,36,31,67,0.537313,0.625407,1.0,A,...,GT:AD:DP:GQ:PS,"0|1:15,18:33:99:204",phased,204,0.537313,no_ai,binom_failed,,,
7,HIP,ZT0,NC_044976.1:7897,28,32,60,0.466667,0.698883,1.0,T,...,GT:AD:DP:GQ:PS,"0|1:15,22:37:99:204",phased,204,0.466667,no_ai,binom_failed,,,
8,HIP,ZT0,NC_044976.1:10272,0,1,1,0.0,1.0,1.0,G,...,GT:AD:DP:GQ:PS,"0|1:19,27:47:99:204",phased,204,0.0,no_ai,LT5_reads,TMEM88B,3.55,1.95
9,HIP,ZT0,NC_044976.1:11794,1,4,5,0.2,0.375,1.0,G,...,GT:AD:DP:GQ:PS,"1|0:22,13:36:99:204",phased,204,0.8,no_ai,LT5_reads,,,


In [14]:
rna.to_parquet("./data_files/concatenated_data/gatk_all_samples_concatenated_annt_added_multiline_dupes_dropped_phase_added_with_TPM.parquet")

### categorize TPMs and FPKMs

In [3]:
### read the written file, since it's written in the previous cell
rna = pd.read_parquet("./data_files/concatenated_data/gatk_all_samples_concatenated_annt_added_multiline_dupes_dropped_phase_added_with_TPM.parquet")

In [4]:
rna.head(2)

Unnamed: 0,tissue,timepoint,loci,refCount,altCount,totalCount,refBias,binomTest,fdr,REF,...,phase_info,phase_dat,is_phased,phase_id,homologBias,ai_type,ai_cat,gene_id,TPM,FPKM
0,HIP,ZT0,NC_044976.1:204,1,17,18,0.055556,0.000145,0.012577,T,...,GT:AD:DP:GQ,"0/1:350,158:509:99",not_phased,not_phased,,MA,MA,,,
1,HIP,ZT0,NC_044976.1:4126,1,0,1,1.0,1.0,1.0,G,...,GT:AD:DP:GQ:PS,"1|0:19,17:36:99:204",phased,204,0.0,no_ai,LT5_reads,,,


In [7]:
def categorize_PMs(x):
    if x <= 0.5:
        return "LT0.5"
    elif x <= 10:
        return "low"
    elif x <= 1000:
        return "medium"
    else:
        return "high"

In [14]:
rna['TPM_cat'] = rna['TPM'].progress_apply(categorize_PMs)

  0%|          | 0/47889413 [00:00<?, ?it/s]

In [15]:
rna['FPKM_cat'] = rna['FPKM'].progress_apply(categorize_PMs)

  0%|          | 0/47889413 [00:00<?, ?it/s]

In [16]:
rna['TPM_cat'].value_counts(), rna['FPKM_cat'].value_counts()

(medium    20708083
 low       14170626
 high       9292553
 LT0.5      3718151
 Name: TPM_cat, dtype: int64,
 low       17431080
 medium    16875389
 high       9231193
 LT0.5      4351751
 Name: FPKM_cat, dtype: int64)

In [20]:
rna.to_parquet("./data_files/concatenated_data/gatk_all_samples_concatenated_annt_added_multiline_dupes_dropped_phase_added_with_TPM.parquet")