In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.multicomp as mc
from scipy import stats
import pickle
from scipy.stats import mannwhitneyu as mannwu
from matplotlib.ticker import MultipleLocator
import os
from Bio import Seq, SeqIO

import sys
sys.path.append("./scripts/modules")

from benchmarking_definitions import *
from plotting import *
from regions import *

In [2]:
# Tag lists for the two simulation sets; each file is read into a single column named "tag".
tags_Rv_table = pd.read_csv("/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FINAL_tag_list.csv", names=["tag"])
tags_L1234_table = pd.read_csv("/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FINAL_tag_list_L1234.csv", names=["tag"])

tagsRv = tags_Rv_table["tag"].values
tagsL1234 = tags_L1234_table["tag"].values

# number of tags in each set
(len(tagsRv), len(tagsL1234))

(2500, 1000)

In [3]:
# Per-INDEL exclusion flag columns. The INDEL summary loops below keep a row at the
# post_2 stage only when every one of these columns equals 0.
INDEL_excl_cols = ["EXCL_DP", "EXCL_MQ", "EXCL_SAO", "EXCL_NR", "EXCL_M_DP", "EXCL_TR_DP"]

In [4]:
# H37Rv reference data: the GenBank record plus the Mycobrowser gene and region tables.
h37Rv_path = "/n/data1/hms/dbmi/farhat/Sanjana/H37Rv"
h37Rv_seq = SeqIO.read(os.path.join(h37Rv_path, "GCF_000195955.2_ASM19595v2_genomic.gbff"), "genbank")
h37Rv_genes = pd.read_csv(os.path.join(h37Rv_path, "mycobrowser_h37rv_genes_v4.csv"))
h37Rv_regions = pd.read_csv(os.path.join(h37Rv_path, "mycobrowser_h37rv_v4.csv"))

# Positions covered by the rRNAs (rrs, rrl, and rrf), which are highly conserved and
# are removed from the variant tables later on.
rRNA_regions = h37Rv_regions.query("Functional_Category=='stable RNAs' & Feature=='rRNA'")
rRNA_pos = []

for _, region in rRNA_regions.iterrows():
    # Stop is treated as inclusive, hence the + 1
    rRNA_pos.extend(np.arange(region['Start'], region['Stop'] + 1))

# Positions inside insertion sequences / phages, padded by 100 bp on either side,
# so that any indel within 100 bp of such a region can be excluded as well.
mobile_regions = h37Rv_regions.query("Functional_Category=='insertion seqs and phages'")
padded_mobile_pos = []

for _, region in mobile_regions.iterrows():
    padded_mobile_pos.extend(np.arange(region['Start'] - 100, region['Stop'] + 100 + 1))

# overlapping padded windows produce duplicates; keep each position once
insertion_seqs_phages_pos = np.unique(padded_mobile_pos)

# [done] AF ≥ 5%: SNP summary table [L1234 only]

pre = raw FreeBayes output

post_1 = after model exclusion

post_2 = after model exclusion and hard filtering on SAO, MQ and DP

post_3 = after model exclusion, hard filtering on SAO, MQ and DP, and region masking

In [5]:
# Parent directory of the L1234 simulation outputs; the SNPs/ and INDELs/ subdirectories
# under it are read by the summary loops in this notebook.
L1234_parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234"

In [6]:
L1234_SNP_source_dir = f"{L1234_parent_dir}/SNPs"

# Column layout of the summary: sample identifiers followed by the TP and FP SNP counts
# at each filtering stage (pre, post_1, post_2, post_3).
summary_cols = ["tag", "depth", "freq",
                "num_TP_pre", "num_TP_post_1", "num_TP_post_2", "num_TP_post_3",
                "num_FP_pre", "num_FP_post_1", "num_FP_post_2", "num_FP_post_3"]

# Rows are collected in a plain list and converted to a DataFrame once after the loop.
# Enlarging a DataFrame with .loc[new_label] = ... copies the existing data on every
# insert, which makes the loop quadratic in the number of samples.
summary_rows = []

for count, tag in enumerate(tagsL1234):

    # progress indicator: print the index of every 100th tag
    if count % 100 == 0:
        print(count, end=" ")

    # the last two "_"-separated fields of the tag are parsed as depth and frequency
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])

    # this table only covers samples whose tag frequency is at least 5%
    if freq >= 0.05:

        ### SNPs ###
        SNP_df = pd.read_csv(f"{L1234_SNP_source_dir}/{tag}.SNPs.csv", low_memory=False)

        # pre: calls with FB_AF of at least 5%
        pre_SNP_df = SNP_df[SNP_df.FB_AF >= 0.05]

        # post_1: keep only calls with pred_class == 1
        post_1_SNP_df = pre_SNP_df[pre_SNP_df.pred_class == 1]

        # post_2: additionally require the SAO, DP and MQ exclusion flags to be 0
        post_2_SNP_df = post_1_SNP_df[(post_1_SNP_df.EXCL_SAO == 0) & (post_1_SNP_df.EXCL_DP == 0) & (post_1_SNP_df.EXCL_MQ == 0)]

        # post_3: additionally drop calls with REGION "LM" and calls at rRNA positions
        post_3_SNP_df = post_2_SNP_df[(post_2_SNP_df.REGION != "LM") & ~(post_2_SNP_df.POS.isin(rRNA_pos))]

        stage_dfs = [pre_SNP_df, post_1_SNP_df, post_2_SNP_df, post_3_SNP_df]

        # TPs (GT == 1): no upper bound on AF, because the simulated AFs deviate slightly from 50%
        num_TP_SNP = [int((stage_df.GT == 1).sum()) for stage_df in stage_dfs]

        # FPs (GT == 0): restricted to AF <= 50%
        num_FP_SNP = [int(((stage_df.GT == 0) & (stage_df.FB_AF <= 0.5)).sum()) for stage_df in stage_dfs]

        summary_rows.append([tag, depth, freq, *num_TP_SNP, *num_FP_SNP])

L1234_summary_df = pd.DataFrame(summary_rows, columns=summary_cols)

    

0 100 200 300 400 500 600 700 800 900 

In [7]:
# Quick sanity check of the SNP summary: table dimensions, then the first two rows.
print(L1234_summary_df.shape)
L1234_summary_df.head(2)

(600, 11)


Unnamed: 0,tag,depth,freq,num_TP_pre,num_TP_post_1,num_TP_post_2,num_TP_post_3,num_FP_pre,num_FP_post_1,num_FP_post_2,num_FP_post_3
0,sim1_L1_mutant_50_0.05,50,0.05,8,8,2,2,514,239,152,15
1,sim1_L2_mutant_50_0.05,50,0.05,12,12,3,3,416,133,58,2


In [8]:
# Save the AF >= 5% SNP summary for the L1234 simulations; index=False leaves the row index out of the CSV.
# NOTE(review): the name L1234_summary_df is reassigned by the L1234 INDEL section further down,
# so this cell has to run before that section overwrites it.
L1234_summary_df.to_csv("./data/filter_FP/ISS_L1234_SNP_filtering_summary.csv", index=False)

# [done] AF ≥ 5%: INDEL summary table

pre = raw FreeBayes output

post_1 = after AF adjustment

post_2 = after AF adjustment and hard filtering. Hard filtering excludes INDELs flagged on DP, MQ, or SAO (for anything but deletions > 10 bp), INDELs with no reads overlapping the INDEL site, INDELs where the bam-derived coverage at the site is lower than a third of the median bam-derived coverage across the sample, and INDELs where the total reads at the site computed by the compute_adjusted_AF_indels function is < 5

post_3 = after AF adjustment, hard filtering, and region masking. Hard filtering excludes INDELs flagged on DP, MQ, or SAO (for anything but deletions > 10 bp), INDELs with no reads overlapping the INDEL site, INDELs where the bam-derived coverage at the site is lower than a third of the median bam-derived coverage across the sample, and INDELs where the total reads at the site computed by the compute_adjusted_AF_indels function is < 5

## Find out how many strains have FP with FB_AF ≤ 50% and AF_Adj > 50%
```
(pre_INDEL_df.FB_AF <= 0.5) & (pre_INDEL_df.AF_Adj > 0.5)
```

In [26]:
# Rv_parent_dir is otherwise only assigned in the H37Rv section further down, so on a
# fresh kernel (Restart & Run All) this cell would fail with a NameError. Define it here
# with the same value so the cell runs in notebook order.
Rv_parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_H37Rv"
Rv_INDEL_source_dir = f"{Rv_parent_dir}/INDELs"

# Rows are collected in a list and converted to a DataFrame once after the loop;
# enlarging a DataFrame with .loc on every iteration copies the existing data each time.
switch_rows = []

for count, tag in enumerate(tagsRv):

    # progress indicator: print the index of every 100th tag
    if count % 100 == 0:
        print(count, end=" ")

    # the last two "_"-separated fields of the tag are parsed as depth and frequency
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])

    INDEL_df = pd.read_csv(f"{Rv_INDEL_source_dir}/{tag}.INDELs.csv", low_memory=False)

    # "switch" INDELs: FPs (GT == 0) whose raw FB_AF is <= 50% but whose adjusted AF is > 50%
    switch_INDEL_df = INDEL_df[(INDEL_df.GT == 0) & (INDEL_df.FB_AF <= 0.5) & (INDEL_df.AF_Adj > 0.5)]

    switch_rows.append([tag, depth, freq, switch_INDEL_df.shape[0]])

Rv_switch_indel_df = pd.DataFrame(switch_rows, columns=["tag", "depth", "freq", "num_switch_INDEL"])


0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

In [28]:
# Total number of switch INDELs summed over all H37Rv samples.
Rv_switch_indel_df.num_switch_INDEL.sum()

np.int64(27)

In [30]:
L1234_INDEL_source_dir = f"{L1234_parent_dir}/INDELs"

# Rows are collected in a list and converted to a DataFrame once after the loop;
# enlarging a DataFrame with .loc on every iteration copies the existing data each time.
switch_rows = []

for count, tag in enumerate(tagsL1234):

    # progress indicator: print the index of every 100th tag
    if count % 100 == 0:
        print(count, end=" ")

    # the last two "_"-separated fields of the tag are parsed as depth and frequency
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])

    INDEL_df = pd.read_csv(f"{L1234_INDEL_source_dir}/{tag}.INDELs.csv", low_memory=False)

    # "switch" INDELs: FPs (GT == 0) whose raw FB_AF is <= 50% but whose adjusted AF is > 50%
    switch_INDEL_df = INDEL_df[(INDEL_df.GT == 0) & (INDEL_df.FB_AF <= 0.5) & (INDEL_df.AF_Adj > 0.5)]

    switch_rows.append([tag, depth, freq, switch_INDEL_df.shape[0]])

L1234_switch_indel_df = pd.DataFrame(switch_rows, columns=["tag", "depth", "freq", "num_switch_INDEL"])


0 100 200 300 400 500 600 700 800 900 

In [31]:
# Total number of switch INDELs summed over all L1234 samples.
L1234_switch_indel_df.num_switch_INDEL.sum()

np.int64(7127)

In [32]:
# Number of L1234 samples that have at least one switch INDEL.
L1234_switch_indel_df[L1234_switch_indel_df.num_switch_INDEL > 0].shape[0]

998

In [35]:
# Minimum, median and maximum of the per-sample switch-INDEL counts.
switch_counts = L1234_switch_indel_df.num_switch_INDEL
(switch_counts.min(), switch_counts.median(), switch_counts.max())

(0, 6.0, 20)

In [40]:
# Per-INDEL exclusion flag columns (same list as assigned near the top of the notebook).
# The INDEL summary loops keep a row at the post_2 stage only when all of these equal 0.
INDEL_excl_cols = ["EXCL_DP", "EXCL_MQ", "EXCL_SAO", "EXCL_NR", "EXCL_M_DP", "EXCL_TR_DP"]

## H37Rv

Probably won't use the FP data.

In [21]:
# Parent directory of the H37Rv simulation outputs; the per-sample INDEL tables are read
# from its INDELs/ subdirectory.
Rv_parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_H37Rv"

In [22]:
Rv_INDEL_source_dir = f"{Rv_parent_dir}/INDELs"

# Column layout of the summary: sample identifiers followed by the TP and FP INDEL counts
# at each filtering stage (pre, post_1, post_2, post_3).
summary_cols = ["tag", "depth", "freq",
                "num_TP_pre", "num_TP_post_1", "num_TP_post_2", "num_TP_post_3",
                "num_FP_pre", "num_FP_post_1", "num_FP_post_2", "num_FP_post_3"]

# Rows are collected in a plain list and converted to a DataFrame once after the loop.
# Enlarging a DataFrame with .loc[new_label] = ... copies the existing data on every
# insert, which makes the loop quadratic in the number of samples.
summary_rows = []

for count, tag in enumerate(tagsRv):

    # progress indicator: print the index of every 100th tag
    if count % 100 == 0:
        print(count, end=" ")

    # the last two "_"-separated fields of the tag are parsed as depth and frequency
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])

    # this table only covers samples whose tag frequency is at least 5%
    if freq >= 0.05:

        ## INDELs ##
        INDEL_df = pd.read_csv(f"{Rv_INDEL_source_dir}/{tag}.INDELs.csv", low_memory=False)

        # pre: calls with a raw FB_AF of at least 5%
        pre_INDEL_df = INDEL_df[INDEL_df.FB_AF >= 0.05]

        # post_1: only consider INDELs with an adjusted AF of at least 5%
        post_1_INDEL_df = pre_INDEL_df[pre_INDEL_df.AF_Adj >= 0.05]

        # post_2: additionally require every exclusion flag in INDEL_excl_cols to be 0
        post_2_INDEL_df = post_1_INDEL_df[post_1_INDEL_df[INDEL_excl_cols].eq(0).all(axis=1)]

        # post_3: additionally drop calls with REGION "LM", at rRNA positions, or at
        # (padded) insertion sequence / phage positions
        post_3_INDEL_df = post_2_INDEL_df[(post_2_INDEL_df.REGION != "LM") & ~(post_2_INDEL_df.POS.isin(rRNA_pos)) & ~(post_2_INDEL_df.POS.isin(insertion_seqs_phages_pos))]

        stage_dfs = [pre_INDEL_df, post_1_INDEL_df, post_2_INDEL_df, post_3_INDEL_df]

        # TPs (GT == 1): no upper bound on AF, because the simulated AFs deviate slightly from 50%
        num_TP_INDEL = [int((stage_df.GT == 1).sum()) for stage_df in stage_dfs]

        # FPs (GT == 0): restricted to raw AF <= 50%
        num_FP_INDEL = [int(((stage_df.GT == 0) & (stage_df.FB_AF <= 0.5)).sum()) for stage_df in stage_dfs]

        summary_rows.append([tag, depth, freq, *num_TP_INDEL, *num_FP_INDEL])

Rv_summary_df = pd.DataFrame(summary_rows, columns=summary_cols)

    

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

In [23]:
# Quick sanity check of the H37Rv INDEL summary: table dimensions, then the first two rows.
print(Rv_summary_df.shape)
Rv_summary_df.head(2)

(1500, 11)


Unnamed: 0,tag,depth,freq,num_TP_pre,num_TP_post_1,num_TP_post_2,num_TP_post_3,num_FP_pre,num_FP_post_1,num_FP_post_2,num_FP_post_3
0,sim1_h37rv_mutant_1_50_0.05,50,0.05,7,6,0,0,0,0,0,0
1,sim1_h37rv_mutant_2_50_0.05,50,0.05,4,4,2,2,0,0,0,0


In [24]:
# Save the AF >= 5% INDEL summary for the H37Rv simulations; index=False leaves the row index out of the CSV.
Rv_summary_df.to_csv("./data/filter_FP/ISS_H37Rv_INDEL_filtering_summary.csv", index=False)

## L1234

No TP info.

In [14]:
# Parent directory of the L1234 simulation outputs (same value as assigned earlier in the
# notebook; presumably repeated so this section can be run on its own).
L1234_parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234"

In [15]:
L1234_INDEL_source_dir = f"{L1234_parent_dir}/INDELs"

# Column layout of the summary: sample identifiers followed by the TP and FP INDEL counts
# at each filtering stage (pre, post_1, post_2, post_3). The TP columns stay empty here.
summary_cols = ["tag", "depth", "freq",
                "num_TP_pre", "num_TP_post_1", "num_TP_post_2", "num_TP_post_3",
                "num_FP_pre", "num_FP_post_1", "num_FP_post_2", "num_FP_post_3"]

# Rows are collected in a plain list and converted to a DataFrame once after the loop.
# Enlarging a DataFrame with .loc[new_label] = ... copies the existing data on every
# insert, which makes the loop quadratic in the number of samples.
summary_rows = []

for count, tag in enumerate(tagsL1234):

    # progress indicator: print the index of every 100th tag
    if count % 100 == 0:
        print(count, end=" ")

    # the last two "_"-separated fields of the tag are parsed as depth and frequency
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])

    # this table only covers samples whose tag frequency is at least 5%
    if freq >= 0.05:

        ## INDELs ##
        INDEL_df = pd.read_csv(f"{L1234_INDEL_source_dir}/{tag}.INDELs.csv", low_memory=False)

        # pre: calls with a raw FB_AF of at least 5%
        pre_INDEL_df = INDEL_df[INDEL_df.FB_AF >= 0.05]

        # post_1: only consider INDELs with an adjusted AF of at least 5%
        post_1_INDEL_df = pre_INDEL_df[pre_INDEL_df.AF_Adj >= 0.05]

        # post_2: additionally require every exclusion flag in INDEL_excl_cols to be 0
        post_2_INDEL_df = post_1_INDEL_df[post_1_INDEL_df[INDEL_excl_cols].eq(0).all(axis=1)]

        # post_3: additionally drop calls with REGION "LM", at rRNA positions, or at
        # (padded) insertion sequence / phage positions
        post_3_INDEL_df = post_2_INDEL_df[(post_2_INDEL_df.REGION != "LM") & ~(post_2_INDEL_df.POS.isin(rRNA_pos)) & ~(post_2_INDEL_df.POS.isin(insertion_seqs_phages_pos))]

        stage_dfs = [pre_INDEL_df, post_1_INDEL_df, post_2_INDEL_df, post_3_INDEL_df]

        # FPs (GT == 0): restricted to raw AF <= 50%
        num_FP_INDEL = [int(((stage_df.GT == 0) & (stage_df.FB_AF <= 0.5)).sum()) for stage_df in stage_dfs]

        # no TP information for this data set, so the four TP columns are filled with NaN
        summary_rows.append([tag, depth, freq, np.nan, np.nan, np.nan, np.nan, *num_FP_INDEL])

L1234_summary_df = pd.DataFrame(summary_rows, columns=summary_cols)

    

0 100 200 300 400 500 600 700 800 900 

In [16]:
# Quick sanity check of the L1234 INDEL summary: table dimensions, then the table itself
# (the TP columns are NaN by construction).
print(L1234_summary_df.shape)
L1234_summary_df

(600, 11)


Unnamed: 0,tag,depth,freq,num_TP_pre,num_TP_post_1,num_TP_post_2,num_TP_post_3,num_FP_pre,num_FP_post_1,num_FP_post_2,num_FP_post_3
0,sim1_L1_mutant_50_0.05,50,0.05,,,,,19,10,7,0
1,sim1_L2_mutant_50_0.05,50,0.05,,,,,26,15,10,0
2,sim1_L3_mutant_50_0.05,50,0.05,,,,,15,6,3,0
3,sim2_L1_mutant_50_0.05,50,0.05,,,,,22,9,5,0
4,sim2_L2_mutant_50_0.05,50,0.05,,,,,27,13,9,0
...,...,...,...,...,...,...,...,...,...,...,...
595,sim1_L4_mutant_700_0.5,700,0.50,,,,,23,14,12,1
596,sim2_L4_mutant_700_0.5,700,0.50,,,,,26,15,12,1
597,sim3_L4_mutant_700_0.5,700,0.50,,,,,24,14,11,1
598,sim4_L4_mutant_700_0.5,700,0.50,,,,,26,15,11,1


In [17]:
# Save the AF >= 5% INDEL summary for the L1234 simulations; index=False leaves the row index out of the CSV.
# NOTE(review): L1234_summary_df is also the name used for the SNP summary earlier in the notebook,
# so this cell must run after the INDEL loop directly above it.
L1234_summary_df.to_csv("./data/filter_FP/ISS_L1234_INDEL_filtering_summary.csv", index=False)

### No switch INDELs

In [18]:
L1234_INDEL_source_dir = f"{L1234_parent_dir}/INDELs"

# Column layout of the summary: sample identifiers followed by the TP and FP INDEL counts
# at each filtering stage (pre, post_1, post_2, post_3). The TP columns stay empty here.
summary_cols = ["tag", "depth", "freq",
                "num_TP_pre", "num_TP_post_1", "num_TP_post_2", "num_TP_post_3",
                "num_FP_pre", "num_FP_post_1", "num_FP_post_2", "num_FP_post_3"]

# Rows are collected in a plain list and converted to a DataFrame once after the loop.
# Enlarging a DataFrame with .loc[new_label] = ... copies the existing data on every
# insert, which makes the loop quadratic in the number of samples.
summary_rows = []

for count, tag in enumerate(tagsL1234):

    # progress indicator: print the index of every 100th tag
    if count % 100 == 0:
        print(count, end=" ")

    # the last two "_"-separated fields of the tag are parsed as depth and frequency
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])

    # this table only covers samples whose tag frequency is at least 5%
    if freq >= 0.05:

        ## INDELs ##
        INDEL_df = pd.read_csv(f"{L1234_INDEL_source_dir}/{tag}.INDELs.csv", low_memory=False)

        # pre: calls with a raw FB_AF of at least 5%
        pre_INDEL_df = INDEL_df[INDEL_df.FB_AF >= 0.05]

        # post_1: only consider INDELs with an adjusted AF of at least 5%
        post_1_INDEL_df = pre_INDEL_df[pre_INDEL_df.AF_Adj >= 0.05]

        # post_2: additionally require every exclusion flag in INDEL_excl_cols to be 0
        post_2_INDEL_df = post_1_INDEL_df[post_1_INDEL_df[INDEL_excl_cols].eq(0).all(axis=1)]

        # post_3: additionally drop calls with REGION "LM", at rRNA positions, or at
        # (padded) insertion sequence / phage positions
        post_3_INDEL_df = post_2_INDEL_df[(post_2_INDEL_df.REGION != "LM") & ~(post_2_INDEL_df.POS.isin(rRNA_pos)) & ~(post_2_INDEL_df.POS.isin(insertion_seqs_phages_pos))]

        stage_dfs = [pre_INDEL_df, post_1_INDEL_df, post_2_INDEL_df, post_3_INDEL_df]

        # FPs (GT == 0): restricted to AF <= 50% for both the raw AF and the adjusted AF,
        # which leaves out the "switch" INDELs (raw AF <= 50% but adjusted AF > 50%)
        num_FP_INDEL = [int(((stage_df.GT == 0) & (stage_df.FB_AF <= 0.5) & (stage_df.AF_Adj <= 0.5)).sum()) for stage_df in stage_dfs]

        # no TP information for this data set, so the four TP columns are filled with NaN
        summary_rows.append([tag, depth, freq, np.nan, np.nan, np.nan, np.nan, *num_FP_INDEL])

L1234_noSwitch_summary_df = pd.DataFrame(summary_rows, columns=summary_cols)

    

0 100 200 300 400 500 600 700 800 900 

In [19]:
# Display the no-switch INDEL summary (the TP columns are NaN by construction).
L1234_noSwitch_summary_df

Unnamed: 0,tag,depth,freq,num_TP_pre,num_TP_post_1,num_TP_post_2,num_TP_post_3,num_FP_pre,num_FP_post_1,num_FP_post_2,num_FP_post_3
0,sim1_L1_mutant_50_0.05,50,0.05,,,,,14,5,4,0
1,sim1_L2_mutant_50_0.05,50,0.05,,,,,24,13,8,0
2,sim1_L3_mutant_50_0.05,50,0.05,,,,,13,4,1,0
3,sim2_L1_mutant_50_0.05,50,0.05,,,,,17,4,3,0
4,sim2_L2_mutant_50_0.05,50,0.05,,,,,23,9,6,0
...,...,...,...,...,...,...,...,...,...,...,...
595,sim1_L4_mutant_700_0.5,700,0.50,,,,,15,6,5,1
596,sim2_L4_mutant_700_0.5,700,0.50,,,,,17,6,4,1
597,sim3_L4_mutant_700_0.5,700,0.50,,,,,16,6,4,1
598,sim4_L4_mutant_700_0.5,700,0.50,,,,,18,7,4,1


In [20]:
# Save the no-switch INDEL summary for the L1234 simulations; index=False leaves the row index out of the CSV.
L1234_noSwitch_summary_df.to_csv("./data/filter_FP/ISS_L1234_INDEL_noSwitch_filtering_summary.csv", index=False)

# [done] AF < 5%: SNP summary table [L1234 only]

In [9]:
# Parent directory of the L1234 simulation outputs (same value as assigned earlier in the
# notebook; presumably repeated so this section can be run on its own).
L1234_parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234"

In [10]:
L1234_SNP_source_dir = f"{L1234_parent_dir}/SNPs"

# Column layout of the summary: sample identifiers followed by the TP and FP SNP counts
# at each filtering stage (pre, post_1, post_2, post_3).
summary_cols = ["tag", "depth", "freq",
                "num_TP_pre", "num_TP_post_1", "num_TP_post_2", "num_TP_post_3",
                "num_FP_pre", "num_FP_post_1", "num_FP_post_2", "num_FP_post_3"]

# Rows are collected in a plain list and converted to a DataFrame once after the loop.
# Enlarging a DataFrame with .loc[new_label] = ... copies the existing data on every
# insert, which makes the loop quadratic in the number of samples.
summary_rows = []

for count, tag in enumerate(tagsL1234):

    # progress indicator: print the index of every 100th tag
    if count % 100 == 0:
        print(count, end=" ")

    # the last two "_"-separated fields of the tag are parsed as depth and frequency
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])

    # this table only covers samples whose tag frequency is below 5%
    if freq < 0.05:

        ### SNPs ###
        SNP_df = pd.read_csv(f"{L1234_SNP_source_dir}/{tag}.SNPs.csv", low_memory=False)

        # pre: calls with FB_AF below 5%
        pre_SNP_df = SNP_df[SNP_df.FB_AF < 0.05]

        # post_1: keep only calls with pred_class == 1
        post_1_SNP_df = pre_SNP_df[pre_SNP_df.pred_class == 1]

        # post_2: additionally require the SAO, DP and MQ exclusion flags to be 0
        post_2_SNP_df = post_1_SNP_df[(post_1_SNP_df.EXCL_SAO == 0) & (post_1_SNP_df.EXCL_DP == 0) & (post_1_SNP_df.EXCL_MQ == 0)]

        # post_3: additionally drop calls with REGION "LM" and calls at rRNA positions
        post_3_SNP_df = post_2_SNP_df[(post_2_SNP_df.REGION != "LM") & ~(post_2_SNP_df.POS.isin(rRNA_pos))]

        stage_dfs = [pre_SNP_df, post_1_SNP_df, post_2_SNP_df, post_3_SNP_df]

        # TPs (GT == 1)
        num_TP_SNP = [int((stage_df.GT == 1).sum()) for stage_df in stage_dfs]

        # FPs (GT == 0): no explicit AF <= 50% restriction needed, every call here has FB_AF < 5%
        num_FP_SNP = [int((stage_df.GT == 0).sum()) for stage_df in stage_dfs]

        summary_rows.append([tag, depth, freq, *num_TP_SNP, *num_FP_SNP])

L1234_AFunder5_summary_df = pd.DataFrame(summary_rows, columns=summary_cols)

    

0 100 200 300 400 500 600 700 800 900 

In [11]:
# Quick sanity check of the AF < 5% SNP summary: table dimensions, then the first two rows.
print(L1234_AFunder5_summary_df.shape)
L1234_AFunder5_summary_df.head(2)

(400, 11)


Unnamed: 0,tag,depth,freq,num_TP_pre,num_TP_post_1,num_TP_post_2,num_TP_post_3,num_FP_pre,num_FP_post_1,num_FP_post_2,num_FP_post_3
0,sim1_L1_mutant_50_0.01,50,0.01,0,0,0,0,286,217,1,0
1,sim1_L2_mutant_50_0.01,50,0.01,2,2,0,0,224,159,0,0


In [13]:
# Save the AF < 5% SNP summary for the L1234 simulations; index=False leaves the row index out of the CSV.
# NOTE(review): the name L1234_AFunder5_summary_df is reassigned by the AF < 5% INDEL section
# further down, so this cell has to run before that section overwrites it.
L1234_AFunder5_summary_df.to_csv("./data/filter_FP/ISS_L1234_SNP_AFunder5_filtering_summary.csv", index=False)

# AF < 5%: INDEL summary table

## H37Rv

In [25]:
# Parent directory of the H37Rv simulation outputs (same value as assigned earlier in the
# notebook; presumably repeated so this section can be run on its own).
Rv_parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_H37Rv"

In [26]:
Rv_INDEL_source_dir = f"{Rv_parent_dir}/INDELs"

# Column layout of the summary: sample identifiers followed by the TP and FP INDEL counts
# at each filtering stage (pre, post_1, post_2, post_3).
summary_cols = ["tag", "depth", "freq",
                "num_TP_pre", "num_TP_post_1", "num_TP_post_2", "num_TP_post_3",
                "num_FP_pre", "num_FP_post_1", "num_FP_post_2", "num_FP_post_3"]

# Rows are collected in a plain list and converted to a DataFrame once after the loop.
# Enlarging a DataFrame with .loc[new_label] = ... copies the existing data on every
# insert, which makes the loop quadratic in the number of samples.
summary_rows = []

for count, tag in enumerate(tagsRv):

    # progress indicator: print the index of every 100th tag
    if count % 100 == 0:
        print(count, end=" ")

    # the last two "_"-separated fields of the tag are parsed as depth and frequency
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])

    # this table only covers samples whose tag frequency is below 5%
    if freq < 0.05:

        ## INDELs ##
        INDEL_df = pd.read_csv(f"{Rv_INDEL_source_dir}/{tag}.INDELs.csv", low_memory=False)

        # pre: calls with a raw FB_AF below 5%
        pre_INDEL_df = INDEL_df[INDEL_df.FB_AF < 0.05]

        # post_1: only consider INDELs with an adjusted AF above 0%
        post_1_INDEL_df = pre_INDEL_df[pre_INDEL_df.AF_Adj > 0]

        # post_2: additionally require every exclusion flag in INDEL_excl_cols to be 0
        post_2_INDEL_df = post_1_INDEL_df[post_1_INDEL_df[INDEL_excl_cols].eq(0).all(axis=1)]

        # post_3: additionally drop calls with REGION "LM", at rRNA positions, or at
        # (padded) insertion sequence / phage positions
        post_3_INDEL_df = post_2_INDEL_df[(post_2_INDEL_df.REGION != "LM") & ~(post_2_INDEL_df.POS.isin(rRNA_pos)) & ~(post_2_INDEL_df.POS.isin(insertion_seqs_phages_pos))]

        stage_dfs = [pre_INDEL_df, post_1_INDEL_df, post_2_INDEL_df, post_3_INDEL_df]

        # TPs (GT == 1)
        num_TP_INDEL = [int((stage_df.GT == 1).sum()) for stage_df in stage_dfs]

        # FPs (GT == 0): no explicit raw AF <= 50% restriction needed, every call here has FB_AF < 5%
        num_FP_INDEL = [int((stage_df.GT == 0).sum()) for stage_df in stage_dfs]

        summary_rows.append([tag, depth, freq, *num_TP_INDEL, *num_FP_INDEL])

Rv_AFunder5_summary_df = pd.DataFrame(summary_rows, columns=summary_cols)

    

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

In [27]:
# Quick sanity check of the AF < 5% H37Rv INDEL summary: table dimensions, then the first two rows.
print(Rv_AFunder5_summary_df.shape)
Rv_AFunder5_summary_df.head(2)

(1000, 11)


Unnamed: 0,tag,depth,freq,num_TP_pre,num_TP_post_1,num_TP_post_2,num_TP_post_3,num_FP_pre,num_FP_post_1,num_FP_post_2,num_FP_post_3
0,sim1_h37rv_mutant_1_50_0.01,50,0.01,0,0,0,0,5,5,0,0
1,sim1_h37rv_mutant_2_50_0.01,50,0.01,1,1,0,0,7,7,0,0


In [28]:
# Save the AF < 5% INDEL summary for the H37Rv simulations; index=False leaves the row index out of the CSV.
Rv_AFunder5_summary_df.to_csv("./data/filter_FP/ISS_H37Rv_INDEL_AFunder5_filtering_summary.csv", index=False)

## L1234

In [29]:
# Parent directory of the L1234 simulation outputs (same value as assigned earlier in the
# notebook; presumably repeated so this section can be run on its own).
L1234_parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234"

In [30]:
L1234_INDEL_source_dir = f"{L1234_parent_dir}/INDELs"

# Column layout of the summary: sample identifiers followed by the TP and FP INDEL counts
# at each filtering stage (pre, post_1, post_2, post_3). The TP columns stay empty here.
summary_cols = ["tag", "depth", "freq",
                "num_TP_pre", "num_TP_post_1", "num_TP_post_2", "num_TP_post_3",
                "num_FP_pre", "num_FP_post_1", "num_FP_post_2", "num_FP_post_3"]

# Rows are collected in a plain list and converted to a DataFrame once after the loop.
# Enlarging a DataFrame with .loc[new_label] = ... copies the existing data on every
# insert, which makes the loop quadratic in the number of samples.
summary_rows = []

for count, tag in enumerate(tagsL1234):

    # progress indicator: print the index of every 100th tag
    if count % 100 == 0:
        print(count, end=" ")

    # the last two "_"-separated fields of the tag are parsed as depth and frequency
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])

    # this table only covers samples whose tag frequency is below 5%
    if freq < 0.05:

        ## INDELs ##
        INDEL_df = pd.read_csv(f"{L1234_INDEL_source_dir}/{tag}.INDELs.csv", low_memory=False)

        # pre: calls with a raw FB_AF below 5%
        pre_INDEL_df = INDEL_df[INDEL_df.FB_AF < 0.05]

        # post_1: only consider INDELs with an adjusted AF above 0%
        post_1_INDEL_df = pre_INDEL_df[pre_INDEL_df.AF_Adj > 0]

        # post_2: additionally require every exclusion flag in INDEL_excl_cols to be 0
        post_2_INDEL_df = post_1_INDEL_df[post_1_INDEL_df[INDEL_excl_cols].eq(0).all(axis=1)]

        # post_3: additionally drop calls with REGION "LM", at rRNA positions, or at
        # (padded) insertion sequence / phage positions
        post_3_INDEL_df = post_2_INDEL_df[(post_2_INDEL_df.REGION != "LM") & ~(post_2_INDEL_df.POS.isin(rRNA_pos)) & ~(post_2_INDEL_df.POS.isin(insertion_seqs_phages_pos))]

        stage_dfs = [pre_INDEL_df, post_1_INDEL_df, post_2_INDEL_df, post_3_INDEL_df]

        # FPs (GT == 0): no explicit raw AF <= 50% restriction needed, every call here has FB_AF < 5%
        num_FP_INDEL = [int((stage_df.GT == 0).sum()) for stage_df in stage_dfs]

        # no TP information for this data set, so the four TP columns are filled with NaN
        summary_rows.append([tag, depth, freq, np.nan, np.nan, np.nan, np.nan, *num_FP_INDEL])

L1234_AFunder5_summary_df = pd.DataFrame(summary_rows, columns=summary_cols)

    

0 100 200 300 400 500 600 700 800 900 

In [32]:
# Quick sanity check of the AF < 5% L1234 INDEL summary: table dimensions, then the table itself
# (the TP columns are NaN by construction).
print(L1234_AFunder5_summary_df.shape)
L1234_AFunder5_summary_df

(400, 11)


Unnamed: 0,tag,depth,freq,num_TP_pre,num_TP_post_1,num_TP_post_2,num_TP_post_3,num_FP_pre,num_FP_post_1,num_FP_post_2,num_FP_post_3
0,sim1_L1_mutant_50_0.01,50,0.01,,,,,13,9,0,0
1,sim1_L2_mutant_50_0.01,50,0.01,,,,,7,5,1,0
2,sim1_L3_mutant_50_0.01,50,0.01,,,,,6,2,0,0
3,sim2_L1_mutant_50_0.01,50,0.01,,,,,17,8,0,0
4,sim2_L2_mutant_50_0.01,50,0.01,,,,,8,7,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,sim1_L4_mutant_700_0.04,700,0.04,,,,,3,1,0,0
396,sim2_L4_mutant_700_0.04,700,0.04,,,,,4,2,1,0
397,sim3_L4_mutant_700_0.04,700,0.04,,,,,5,3,0,0
398,sim4_L4_mutant_700_0.04,700,0.04,,,,,6,2,1,0


In [33]:
# Save the AF < 5% INDEL summary for the L1234 simulations; index=False leaves the row index out of the CSV.
# NOTE(review): L1234_AFunder5_summary_df is also the name used for the AF < 5% SNP summary earlier
# in the notebook, so this cell must run after the INDEL loop directly above it.
L1234_AFunder5_summary_df.to_csv("./data/filter_FP/ISS_L1234_INDEL_AFunder5_filtering_summary.csv", index=False)