In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.multicomp as mc
from scipy import stats
import pickle
from scipy.stats import mannwhitneyu as mannwu
from matplotlib.ticker import MultipleLocator
import os
from Bio import Seq, SeqIO

import sys
sys.path.append("./scripts/modules")

from benchmarking_definitions import *
from plotting import *
from regions import *

In [2]:
# Both tag lists are parsed as headerless, single-column CSVs; the column is named "tag" on read.
tagsRv = (
    pd.read_csv("/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FINAL_tag_list.csv", names=["tag"])
    .tag
    .values
)
tagsL1234 = (
    pd.read_csv("/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FINAL_tag_list_L1234.csv", names=["tag"])
    .tag
    .values
)

len(tagsRv), len(tagsL1234)

(2500, 1000)

In [3]:
# Allele-frequency bounds; same numeric values as the thresholds inside `classify`.
AF_min = 0.05
AF_max = 0.95
# Same value as the FB_MQ cut-off applied when EXCL_MQ is computed in the "Add filters" cells.
MQ_thresh = 40
# Same value as the per-strand FB_AO_F / FB_AO_R cut-off applied when EXCL_SAO is computed.
num_support_each_direction = 2

In [4]:
h37Rv_path = "/n/data1/hms/dbmi/farhat/Sanjana/H37Rv"

# reference sequence (GenBank record) plus the two Mycobrowser annotation tables
genbank_file = os.path.join(h37Rv_path, "GCF_000195955.2_ASM19595v2_genomic.gbff")
h37Rv_seq = SeqIO.read(genbank_file, "genbank")
h37Rv_genes = pd.read_csv(os.path.join(h37Rv_path, "mycobrowser_h37rv_genes_v4.csv"))
h37Rv_regions = pd.read_csv(os.path.join(h37Rv_path, "mycobrowser_h37rv_v4.csv"))

# remove rRNAs, which are highly conserved. rrs, rrl, and rrf
rRNA_rows = h37Rv_regions.query("Functional_Category=='stable RNAs' & Feature=='rRNA'")

rRNA_pos = []
for region_start, region_stop in zip(rRNA_rows['Start'], rRNA_rows['Stop']):
    # Stop is treated as inclusive, hence the + 1
    rRNA_pos.extend(np.arange(region_start, region_stop + 1))


# exclude any indel within 100 bp of an insertion seq / phage and also within those regions
mobile_rows = h37Rv_regions.query("Functional_Category=='insertion seqs and phages'")

insertion_seqs_phages_pos = []
for region_start, region_stop in zip(mobile_rows['Start'], mobile_rows['Stop']):
    # pad each region by 100 bp on both sides
    insertion_seqs_phages_pos.extend(np.arange(region_start - 100, region_stop + 100 + 1))

# padded regions can overlap, so collapse to unique (sorted) positions
insertion_seqs_phages_pos = np.unique(insertion_seqs_phages_pos)

In [5]:
# Names of the 0/1 exclusion-flag columns that the "Add filters" cells write into each <tag>.INDELs.csv.
INDEL_excl_cols = ["EXCL_DP", "EXCL_MQ", "EXCL_SAO", "EXCL_NR", "EXCL_M_DP", "EXCL_TR_DP"]

In [6]:
def classify(AF, lowerAF_thresh=0.05, fixed_thresh=0.95):
    """Bin a single allele frequency into a category label.

    Parameters
    ----------
    AF : float
        Allele frequency to classify.
    lowerAF_thresh : float, default 0.05
        Lower bound (inclusive) of the "MV" bin.
    fixed_thresh : float, default 0.95
        Upper bound (exclusive) of the "MV" bin; values at or above it are "fixed".

    Returns
    -------
    str
        "NA"    if AF is missing (NaN / None),
        "0"     if AF == 0,
        "lowMV" if 0 < AF < lowerAF_thresh,
        "MV"    if lowerAF_thresh <= AF < fixed_thresh,
        "fixed" otherwise.

    Notes
    -----
    An AF exactly equal to `fixed_thresh` is labelled "fixed". Negative values
    also fall through to "fixed", as they always have.
    """
    # Every comparison with NaN is False, so without this guard a missing AF
    # would fall through to the final branch and be mislabelled "fixed".
    if pd.isna(AF):
        return "NA"

    if AF == 0:
        return "0"
    if 0 < AF < lowerAF_thresh:
        return "lowMV"
    if lowerAF_thresh <= AF < fixed_thresh:
        return "MV"
    return "fixed"

In [7]:
def compute_fixed_category(AF_arr, adj_AF_arr):
    """Label each (AF, adjusted AF) pair as "<class of AF>-<class of adjusted AF>".

    The two inputs are iterated in parallel and each value is binned with
    `classify`. Returns a list of strings, one per pair.
    """
    return [
        f"{classify(raw_af)}-{classify(adjusted_af)}"
        for raw_af, adjusted_af in zip(AF_arr, adj_AF_arr)
    ]
    

In [8]:
# Root of the WGS benchmarking outputs; the BAM directories used for the tag maps are built from it.
WGS_source_dir = "/n/data1/hms/dbmi/farhat/shandu/projects/mixed_calls/benchmarking/comprehensive"

# Test

In [5]:
# Two tags used to exercise the filtering and tag-map cells of this "Test" section.
test_tags = ["sim1_L1_mutant_200_0.5", "sim1_L1_mutant_50_0.01"]

In [6]:
# parent_dir: where this section's per-tag INDEL tables are written (under INDELs/).
# source_dir: where the per-tag <tag>.variant_summary.csv inputs are read from.
parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234"
source_dir = "/n/scratch/users/s/sm624/benchmarking/variant_summaries/ISS_L1234"

## Filter variant summary file

In [8]:
output_dir = f"{parent_dir}/INDELs"

# columns carried over from the variant summary into each candidate INDEL table
candidate_cols = ["POS", "REF", "ALT", "GT", "REGION", "FB_AF", "FB_AO", "FB_AO_F", "FB_AO_R", "FB_DP", "FB_MQ", "FB_QUAL",
                  "FB_ORIGIN", "FB_COMPLEX_TYPE", "FB_MULTIPLE_HAPS", "FB_MULTIALLELIC", "FB_REF_GL", "FB_ALT_GL"]

for n_done, tag in enumerate(test_tags):

    # progress marker, printed once every 100 tags
    if n_done % 100 == 0:
        print(n_done, end=" ")

    summary = pd.read_csv(f"{source_dir}/{tag}.variant_summary.csv", low_memory=False)

    # L1234-specific filter
    summary = summary.loc[summary.L_GT == 0]

    # keep only the variants found by FreeBayes, restricted to the necessary columns
    fb_calls = summary.loc[summary.FB_found == 1, candidate_cols]

    # an INDEL is any record whose REF and ALT differ in length
    ref_len = fb_calls.REF.str.len()
    alt_len = fb_calls.ALT.str.len()
    INDEL_df = fb_calls.loc[ref_len != alt_len].copy(deep=True)

    # rename columns for consistency with variables used by filtering model
    INDEL_df = INDEL_df.rename(columns={"FB_AO": "AO"})

    # already split MNPs

    INDEL_df.to_csv(f"{output_dir}/{tag}.candidate_INDELs.csv", index=False)
        

0 

## Make tag map

In [13]:
bam_source_dir = f"{WGS_source_dir}/ISSL123_wgs_processing"
candidate_indel_dir = f"{parent_dir}/INDELs"

# Collect one (tag, BAM, candidate INDEL CSV) record per tag and build the frame once,
# instead of growing the DataFrame row by row with .loc.
tag_records = []

for tag in test_tags:

    # tags end in "..._<depth>_<freq>"; the depth selects the output<depth>x directory
    depth = tag.split("_")[-2]

    bam_file = f"{bam_source_dir}/output{depth}x/{tag}/bam/{tag}.sorted.duprem.bam"
    candidate_indel_file = f"{candidate_indel_dir}/{tag}.candidate_INDELs.csv"

    # fail early, and say which path is missing
    assert os.path.isfile(bam_file), f"Missing BAM file: {bam_file}"
    assert os.path.isfile(candidate_indel_file), f"Missing candidate INDEL file: {candidate_indel_file}"

    tag_records.append((tag, bam_file, candidate_indel_file))

tag_df = pd.DataFrame(tag_records, columns=["TAG", "BAM_FILE", "CANDIDATE_INDEL_FILE"])

In [14]:
# Print the tag map's (rows, columns) and display its first two rows.
print(tag_df.shape)
tag_df.head(2)

(2, 3)


Unnamed: 0,TAG,BAM_FILE,CANDIDATE_INDEL_FILE
0,sim1_L1_mutant_200_0.5,/n/data1/hms/dbmi/farhat/shandu/projects/mixed...,/n/scratch/users/s/sm624/FP_characteristics/IS...
1,sim1_L1_mutant_50_0.01,/n/data1/hms/dbmi/farhat/shandu/projects/mixed...,/n/scratch/users/s/sm624/FP_characteristics/IS...


In [15]:
# Written without index or header row; column order is TAG, BAM_FILE, CANDIDATE_INDEL_FILE.
tag_df.to_csv("/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FP_characteristics/tag_maps/ISS_L1234_indel_bam_var_tag_map_test.csv", index=False, header=False)

## Check files

In [20]:
# NOTE(review): <tag>.INDELs.csv is not written by any cell above (only <tag>.candidate_INDELs.csv is);
# presumably it is produced by an external step that consumes the tag map — confirm.
df = pd.read_csv("/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234/INDELs/sim1_L1_mutant_200_0.5.INDELs.csv")
df

Unnamed: 0,POS,REF,ALT,GT,REGION,FB_AF,AO,FB_AO_F,FB_AO_R,FB_DP,...,FB_QUAL,FB_ORIGIN,FB_COMPLEX_TYPE,FB_MULTIPLE_HAPS,FB_MULTIALLELIC,FB_REF_GL,FB_ALT_GL,Indel_Support,Total_Reads,AF_Adj
0,55549,G,TCCGCCGCCG,0,LM,0.797980,79.0,36.0,43.0,99.0,...,2.500000e+03,55549.0,1.0,0.0,0.0,,,144.0,150.0,0.960000
1,208322,T,CG,0,other,0.993976,165.0,83.0,82.0,166.0,...,5.400000e+03,,0.0,0.0,0.0,-541.235,0.000,174.0,178.0,0.977528
2,333674,AC,C,0,LM,0.268041,78.0,53.0,25.0,291.0,...,0.000000e+00,333674.0,1.0,0.0,0.0,,,13.0,13.0,1.000000
3,336559,G,GGCTA,0,LM,0.337278,114.0,45.0,69.0,338.0,...,1.200000e-14,336558.0,1.0,0.0,0.0,,,57.0,58.0,0.982759
4,337884,G,GT,0,LM,0.014151,3.0,2.0,1.0,212.0,...,1.300000e-14,,0.0,0.0,0.0,0.000,-663.075,0.0,190.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,4253576,ACGACAAG,A,0,LM,1.000000,52.0,19.0,33.0,52.0,...,1.600000e+03,4253576.0,1.0,0.0,1.0,,,0.0,157.0,0.000000
62,4254346,TT,A,0,LM,1.000000,104.0,62.0,42.0,104.0,...,3.300000e+03,,0.0,0.0,0.0,-340.194,0.000,0.0,120.0,0.000000
63,4318424,TC,G,0,LM,0.992248,128.0,41.0,87.0,129.0,...,3.900000e+03,,0.0,0.0,0.0,-399.478,0.000,183.0,183.0,1.000000
64,4341407,G,GC,0,other,0.041860,9.0,5.0,4.0,215.0,...,0.000000e+00,,0.0,0.0,0.0,0.000,-640.751,0.0,153.0,0.000000


In [28]:
# rows whose adjusted AF lies in [0.05, 0.95], both ends inclusive
df.loc[df["AF_Adj"].between(0.05, 0.95)].shape

(9, 21)

In [24]:
# rows with a non-zero adjusted AF
df.loc[df["AF_Adj"].gt(0)].shape

(42, 21)

In [26]:
# number of rows with a missing adjusted AF
df["AF_Adj"].isna().sum()

np.int64(0)

In [27]:
# rows with adjusted AF below 0.05 (zero included)
df.loc[df["AF_Adj"].lt(0.05)].shape

(37, 21)

In [33]:
# rows with adjusted AF strictly between 0 and 0.05
df.query("AF_Adj > 0 and AF_Adj < 0.05").shape

(13, 21)

In [35]:
# rows whose adjusted AF is exactly 0
df.loc[df["AF_Adj"].eq(0)].shape

(24, 21)

In [25]:
# rows with adjusted AF above 0.95
df.loc[df["AF_Adj"].gt(0.95)].shape

(20, 21)

In [29]:
# Sum of the three AF_Adj bin counts shown in this section: < 0.05 (37), 0.05–0.95 (9), > 0.95 (20).
# Presumably meant to equal the number of rows in df — TODO confirm against len(df).
37+9+20

66

In [30]:
# rows whose FreeBayes AF lies in [0.05, 0.95], both ends inclusive
df.loc[df["FB_AF"].between(0.05, 0.95)].shape

(24, 21)

In [31]:
# rows with FreeBayes AF below 0.05 (zero included)
df.loc[df["FB_AF"].lt(0.05)].shape

(25, 21)

In [34]:
# rows with FreeBayes AF strictly between 0 and 0.05
df.query("FB_AF > 0 and FB_AF < 0.05").shape

(25, 21)

In [36]:
# rows whose FreeBayes AF is exactly 0
df.loc[df["FB_AF"].eq(0)].shape

(0, 21)

In [32]:
# rows with FreeBayes AF above 0.95
df.loc[df["FB_AF"].gt(0.95)].shape

(17, 21)

# ISS L1234

In [9]:
# parent_dir: where this section's per-tag outputs live (INDELs/ and bam_depths/ are read or written below).
# source_dir: where the per-tag <tag>.variant_summary.csv inputs are read from.
parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234"
source_dir = "/n/scratch/users/s/sm624/benchmarking/variant_summaries/ISS_L1234"

## Filter variant summary file

In [10]:
output_dir = f"{parent_dir}/INDELs"

# columns carried over from the variant summary into each candidate INDEL table
candidate_cols = ["POS", "REF", "ALT", "GT", "REGION", "FB_AF", "FB_AO", "FB_AO_F", "FB_AO_R", "FB_DP", "FB_MQ", "FB_QUAL",
                  "FB_ORIGIN", "FB_COMPLEX_TYPE", "FB_MULTIPLE_HAPS", "FB_MULTIALLELIC", "FB_REF_GL", "FB_ALT_GL"]

for n_done, tag in enumerate(tagsL1234):

    # progress marker, printed once every 100 tags
    if n_done % 100 == 0:
        print(n_done, end=" ")

    summary = pd.read_csv(f"{source_dir}/{tag}.variant_summary.csv", low_memory=False)

    # L1234-specific filter
    summary = summary.loc[summary.L_GT == 0]

    # keep only the variants found by FreeBayes, restricted to the necessary columns
    fb_calls = summary.loc[summary.FB_found == 1, candidate_cols]

    # an INDEL is any record whose REF and ALT differ in length
    ref_len = fb_calls.REF.str.len()
    alt_len = fb_calls.ALT.str.len()
    INDEL_df = fb_calls.loc[ref_len != alt_len].copy(deep=True)

    # rename columns for consistency with variables used by filtering model
    INDEL_df = INDEL_df.rename(columns={"FB_AO": "AO"})

    # already split MNPs

    INDEL_df.to_csv(f"{output_dir}/{tag}.candidate_INDELs.csv", index=False)
        

0 100 200 300 400 500 600 700 800 900 

## Make tag map

In [11]:
bam_source_dir = f"{WGS_source_dir}/ISSL123_wgs_processing"
candidate_indel_dir = f"{parent_dir}/INDELs"

# Collect one (tag, BAM, candidate INDEL CSV) record per tag and build the frame once,
# instead of growing the DataFrame row by row with .loc.
tag_records = []

for tag in tagsL1234:

    # tags end in "..._<depth>_<freq>"; the depth selects the output<depth>x directory
    depth = tag.split("_")[-2]

    bam_file = f"{bam_source_dir}/output{depth}x/{tag}/bam/{tag}.sorted.duprem.bam"
    candidate_indel_file = f"{candidate_indel_dir}/{tag}.candidate_INDELs.csv"

    # fail early, and say which path is missing
    assert os.path.isfile(bam_file), f"Missing BAM file: {bam_file}"
    assert os.path.isfile(candidate_indel_file), f"Missing candidate INDEL file: {candidate_indel_file}"

    tag_records.append((tag, bam_file, candidate_indel_file))

tag_df = pd.DataFrame(tag_records, columns=["TAG", "BAM_FILE", "CANDIDATE_INDEL_FILE"])

In [12]:
# Print the tag map's (rows, columns) and display its first two rows.
print(tag_df.shape)
tag_df.head(2)

(1000, 3)


Unnamed: 0,TAG,BAM_FILE,CANDIDATE_INDEL_FILE
0,sim1_L1_mutant_50_0.01,/n/data1/hms/dbmi/farhat/shandu/projects/mixed...,/n/scratch/users/s/sm624/FP_characteristics/IS...
1,sim1_L2_mutant_50_0.01,/n/data1/hms/dbmi/farhat/shandu/projects/mixed...,/n/scratch/users/s/sm624/FP_characteristics/IS...


In [13]:
# Written without index or header row; column order is TAG, BAM_FILE, CANDIDATE_INDEL_FILE.
tag_df.to_csv("/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FP_characteristics/tag_maps/ISS_L1234_indel_bam_var_tag_map.csv", index=False, header=False)

## Check files

In [14]:
# Reload the L1234 tag map; the file has no header row, so the column names are supplied here.
tag_df = pd.read_csv("/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FP_characteristics/tag_maps/ISS_L1234_indel_bam_var_tag_map.csv",
             names=["TAG", "BAM_FILE", "CANDIDATE_INDEL_FILE"])

In [15]:
# tags whose final <tag>.INDELs.csv is absent from the INDEL directory
rerun_tags = [
    tag
    for tag in tagsL1234
    if not os.path.isfile(f"{candidate_indel_dir}/{tag}.INDELs.csv")
]

len(rerun_tags)

0

## Add filters

In [16]:
indel_dir = f"{parent_dir}/INDELs"
depth_dir = f"{parent_dir}/bam_depths"

for n_done, tag in enumerate(tagsL1234):

    # progress marker, printed once every 100 tags
    if n_done % 100 == 0:
        print(n_done, end=" ")

    # per-position coverage; a site is "low depth" when its coverage is below a third of the median
    df_depth = pd.read_csv(f"{depth_dir}/{tag}.depth.tsv.gz", sep='\t', compression='gzip', names=['CHROM', 'POS', 'COV'])
    min_depth = df_depth.COV.median() / 3
    low_depth_sites = df_depth.query("COV < @min_depth").POS.values

    INDEL_df = pd.read_csv(f"{indel_dir}/{tag}.INDELs.csv", low_memory=False)

    # "<class of FB_AF>-<class of AF_Adj>" label per row
    INDEL_df["CATEGORY"] = compute_fixed_category(INDEL_df.FB_AF, INDEL_df.AF_Adj)

    # label with exclusion filters (1 = excluded by that filter)
    INDEL_df["EXCL_DP"] = (INDEL_df.FB_DP < 5).astype(int)
    # use the notebook-level thresholds (MQ_thresh, num_support_each_direction) rather than repeating the literals
    INDEL_df["EXCL_MQ"] = (INDEL_df.FB_MQ < MQ_thresh).astype(int)

    # strand-support filter: applied unless REF is more than 10 bp longer than ALT
    is_long_deletion = (INDEL_df.REF.str.len() - INDEL_df.ALT.str.len()) > 10
    lacks_strand_support = (INDEL_df.FB_AO_F < num_support_each_direction) | (INDEL_df.FB_AO_R < num_support_each_direction)
    INDEL_df["EXCL_SAO"] = (~is_long_deletion & lacks_strand_support).astype(int)

    INDEL_df["EXCL_NR"] = INDEL_df.Indel_Support.isna().astype(int)
    # NOTE(review): the investigation of site 549732 in this notebook shows COV=1 in the depth file at a
    # position where FreeBayes reports a deletion, so this flag may hit real deletions — confirm how the
    # depth files were generated.
    INDEL_df["EXCL_M_DP"] = INDEL_df.POS.isin(low_depth_sites).astype(int)
    INDEL_df["EXCL_TR_DP"] = (INDEL_df.Total_Reads < 5).astype(int)

    # overwrite the input table in place, now with CATEGORY and the EXCL_* columns
    INDEL_df.to_csv(f"{indel_dir}/{tag}.INDELs.csv", index=False)

0 100 200 300 400 500 600 700 800 900 

### Why is 549732 in low_depth_sites?

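depth.tsv shows COV=1 at this position even though FreeBayes reports a fixed deletion there (FB_DP=49). A likely explanation, still to be confirmed: if the depth files were produced with `samtools depth` without the `-J` option, reads that carry a deletion at a position are not counted towards its depth, so sites inside real deletions look low-coverage and get flagged by `EXCL_M_DP`.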

In [30]:
# Reload one tag's filtered INDEL table to inspect its EXCL_M_DP flags.
INDEL_df = pd.read_csv(f"{indel_dir}/sim1_L1_mutant_50_0.01.INDELs.csv", low_memory=False)

In [31]:
# rows flagged by the low-depth-site filter
INDEL_df.query("EXCL_M_DP == 1")

Unnamed: 0,POS,REF,ALT,GT,REGION,FB_AF,AO,FB_AO_F,FB_AO_R,FB_DP,...,Indel_Support,Total_Reads,AF_Adj,CATEGORY,EXCL_DP,EXCL_MQ,EXCL_SAO,EXCL_NR,EXCL_M_DP,EXCL_TR_DP
4,549732,TACC,A,0,other,1.0,49.0,17.0,32.0,49.0,...,53.0,53.0,1.0,fixed-fixed,0,0,0,0,1,0
12,1468915,C,TCGGGGC,0,LM,0.944444,17.0,14.0,3.0,18.0,...,0.0,41.0,0.0,MV-0,0,0,0,0,1,0
13,1468915,CTTGTCGT,C,0,LM,0.944444,17.0,14.0,3.0,18.0,...,0.0,41.0,0.0,MV-0,0,0,0,0,1,0
17,2047426,CGCTAACTTCCCGTCTC,A,0,other,1.0,32.0,18.0,14.0,32.0,...,34.0,34.0,1.0,fixed-fixed,0,0,0,0,1,0
45,4253576,A,GCCCCGA,0,LM,1.0,18.0,9.0,9.0,18.0,...,0.0,31.0,0.0,fixed-0,0,0,0,0,1,0
46,4253576,ACGACAAG,A,0,LM,1.0,18.0,9.0,9.0,18.0,...,0.0,31.0,0.0,fixed-0,0,0,0,0,1,0
47,4254346,TT,A,0,LM,1.0,25.0,17.0,8.0,25.0,...,0.0,32.0,0.0,fixed-0,0,0,0,0,1,0


In [32]:
# Recompute the low-depth site list for this one tag, exactly as the "Add filters" loop does:
# the depth file is parsed as a headerless three-column TSV, and a site is low depth when COV < median / 3.
df_depth = pd.read_csv(f"{depth_dir}/sim1_L1_mutant_50_0.01.depth.tsv.gz", sep='\t', compression='gzip', names=['CHROM', 'POS', 'COV'])
min_depth = df_depth.COV.median() / 3
low_depth_sites = df_depth.query("COV < @min_depth").POS.values

In [33]:
# coverage recorded in the depth file at position 549732
df_depth.query("POS == 549732")

Unnamed: 0,CHROM,POS,COV
549731,NC_000962.3,549732,1


In [34]:
# Is position 549732 in the low-depth site array computed above?
549732 in low_depth_sites

True

# ISS H37Rv

In [9]:
# parent_dir: where this section's per-tag outputs live (INDELs/ and bam_depths/ are read or written below).
# source_dir: where the per-tag <tag>.variant_summary.csv inputs are read from.
parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_H37Rv"
source_dir = "/n/scratch/users/s/sm624/benchmarking/variant_summaries/ISS_H37Rv"

## Filter variant summary file

In [10]:
output_dir = f"{parent_dir}/INDELs"

# columns carried over from the variant summary into each candidate INDEL table
candidate_cols = ["POS", "REF", "ALT", "GT", "REGION", "FB_AF", "FB_AO", "FB_AO_F", "FB_AO_R", "FB_DP", "FB_MQ", "FB_QUAL",
                  "FB_ORIGIN", "FB_COMPLEX_TYPE", "FB_MULTIPLE_HAPS", "FB_MULTIALLELIC", "FB_REF_GL", "FB_ALT_GL"]

for n_done, tag in enumerate(tagsRv):

    # progress marker, printed once every 100 tags
    if n_done % 100 == 0:
        print(n_done, end=" ")

    summary = pd.read_csv(f"{source_dir}/{tag}.variant_summary.csv", low_memory=False)

    # keep only the variants found by FreeBayes, restricted to the necessary columns
    fb_calls = summary.loc[summary.FB_found == 1, candidate_cols]

    # an INDEL is any record whose REF and ALT differ in length
    ref_len = fb_calls.REF.str.len()
    alt_len = fb_calls.ALT.str.len()
    INDEL_df = fb_calls.loc[ref_len != alt_len].copy(deep=True)

    # rename columns for consistency with variables used by filtering model
    INDEL_df = INDEL_df.rename(columns={"FB_AO": "AO"})

    # already split MNPs

    INDEL_df.to_csv(f"{output_dir}/{tag}.candidate_INDELs.csv", index=False)
        

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

## Make tag map

In [11]:
bam_source_dir = f"{WGS_source_dir}/ISS_wgs_processing"
candidate_indel_dir = f"{parent_dir}/INDELs"

# Collect one (tag, BAM, candidate INDEL CSV) record per tag and build the frame once,
# instead of growing the DataFrame row by row with .loc.
tag_records = []

for tag in tagsRv:

    # tags end in "..._<depth>_<freq>"; the depth selects the output<depth>x directory
    depth = tag.split("_")[-2]

    bam_file = f"{bam_source_dir}/output{depth}x/{tag}/bam/{tag}.sorted.duprem.bam"
    candidate_indel_file = f"{candidate_indel_dir}/{tag}.candidate_INDELs.csv"

    # fail early, and say which path is missing
    assert os.path.isfile(bam_file), f"Missing BAM file: {bam_file}"
    assert os.path.isfile(candidate_indel_file), f"Missing candidate INDEL file: {candidate_indel_file}"

    tag_records.append((tag, bam_file, candidate_indel_file))

tag_df = pd.DataFrame(tag_records, columns=["TAG", "BAM_FILE", "CANDIDATE_INDEL_FILE"])

In [12]:
# Print the tag map's (rows, columns) and display its first two rows.
print(tag_df.shape)
tag_df.head(2)

(2500, 3)


Unnamed: 0,TAG,BAM_FILE,CANDIDATE_INDEL_FILE
0,sim1_h37rv_mutant_1_50_0.01,/n/data1/hms/dbmi/farhat/shandu/projects/mixed...,/n/scratch/users/s/sm624/FP_characteristics/IS...
1,sim1_h37rv_mutant_2_50_0.01,/n/data1/hms/dbmi/farhat/shandu/projects/mixed...,/n/scratch/users/s/sm624/FP_characteristics/IS...


In [13]:
# Written without index or header row; column order is TAG, BAM_FILE, CANDIDATE_INDEL_FILE.
tag_df.to_csv("/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FP_characteristics/tag_maps/ISS_H37Rv_indel_bam_var_tag_map.csv", index=False, header=False)

## Check files

In [14]:
# Directory checked below for the final <tag>.INDELs.csv files of this section.
candidate_indel_dir = f"{parent_dir}/INDELs"

In [15]:
# Reload the H37Rv tag map; the file has no header row, so the column names are supplied here.
tag_df = pd.read_csv("/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FP_characteristics/tag_maps/ISS_H37Rv_indel_bam_var_tag_map.csv",
             names=["TAG", "BAM_FILE", "CANDIDATE_INDEL_FILE"])

In [16]:
# tags whose final <tag>.INDELs.csv is absent from the INDEL directory
rerun_tags = [
    tag
    for tag in tagsRv
    if not os.path.isfile(f"{candidate_indel_dir}/{tag}.INDELs.csv")
]

len(rerun_tags)

0

## Add filters

In [17]:
indel_dir = f"{parent_dir}/INDELs"
depth_dir = f"{parent_dir}/bam_depths"

for n_done, tag in enumerate(tagsRv):

    # progress marker, printed once every 100 tags
    if n_done % 100 == 0:
        print(n_done, end=" ")

    # per-position coverage; a site is "low depth" when its coverage is below a third of the median
    df_depth = pd.read_csv(f"{depth_dir}/{tag}.depth.tsv.gz", sep='\t', compression='gzip', names=['CHROM', 'POS', 'COV'])
    min_depth = df_depth.COV.median() / 3
    low_depth_sites = df_depth.query("COV < @min_depth").POS.values

    INDEL_df = pd.read_csv(f"{indel_dir}/{tag}.INDELs.csv", low_memory=False)

    # "<class of FB_AF>-<class of AF_Adj>" label per row
    INDEL_df["CATEGORY"] = compute_fixed_category(INDEL_df.FB_AF, INDEL_df.AF_Adj)

    # label with exclusion filters (1 = excluded by that filter)
    INDEL_df["EXCL_DP"] = (INDEL_df.FB_DP < 5).astype(int)
    # use the notebook-level thresholds (MQ_thresh, num_support_each_direction) rather than repeating the literals
    INDEL_df["EXCL_MQ"] = (INDEL_df.FB_MQ < MQ_thresh).astype(int)

    # strand-support filter: applied unless REF is more than 10 bp longer than ALT
    is_long_deletion = (INDEL_df.REF.str.len() - INDEL_df.ALT.str.len()) > 10
    lacks_strand_support = (INDEL_df.FB_AO_F < num_support_each_direction) | (INDEL_df.FB_AO_R < num_support_each_direction)
    INDEL_df["EXCL_SAO"] = (~is_long_deletion & lacks_strand_support).astype(int)

    INDEL_df["EXCL_NR"] = INDEL_df.Indel_Support.isna().astype(int)
    # NOTE(review): the L1234 investigation of site 549732 in this notebook shows COV=1 in the depth file at a
    # position where FreeBayes reports a deletion, so this flag may hit real deletions — confirm how the
    # depth files were generated.
    INDEL_df["EXCL_M_DP"] = INDEL_df.POS.isin(low_depth_sites).astype(int)
    INDEL_df["EXCL_TR_DP"] = (INDEL_df.Total_Reads < 5).astype(int)

    # overwrite the input table in place, now with CATEGORY and the EXCL_* columns
    INDEL_df.to_csv(f"{indel_dir}/{tag}.INDELs.csv", index=False)

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

# ?Distribution of AF values before and after adjustment

In [None]:
candidate_indel_dir = f"{parent_dir}/INDELs"

# Per-frequency lists of (AF_Adj - FB_AF) for true-positive INDELs, before and after the exclusion filters.
# NOTE(review): `freqs` is not defined in this notebook; presumably it comes from one of the
# star imports in the first cell — confirm.
TP_AF_diff_post_adj = {f:[] for f in freqs}
TP_AF_diff_post_adj_and_filter = {f:[] for f in freqs}

for n_done, tag in enumerate(tagsRv):

    # progress marker, printed once every 100 tags
    if n_done % 100 == 0:
        print(n_done, end=" ")

    # the simulated frequency is the last "_"-separated field of the tag
    freq = float(tag.split("_")[-1])

    INDEL_df = pd.read_csv(f"{candidate_indel_dir}/{tag}.INDELs.csv", low_memory=False)

    # true positives (GT == 1) that still have a non-zero AF after adjustment
    TP_df = INDEL_df[INDEL_df.GT == 1]
    TP_after_AF_Adj = TP_df[TP_df.AF_Adj > 0]
    diff_AF_adj = TP_after_AF_Adj.AF_Adj - TP_after_AF_Adj.FB_AF
    TP_AF_diff_post_adj[freq] += list(diff_AF_adj)

    # ...and, of those, the rows that pass every exclusion filter (all EXCL_* flags are 0).
    # INDEL_excl_cols is the list defined in this notebook and matches the EXCL_* columns written
    # by the "Add filters" cells; the previous name `excl_cols` is not defined here.
    passes_all_filters = TP_after_AF_Adj[INDEL_excl_cols].eq(0).all(axis=1)
    TP_after_AF_Adj_and_filtering = TP_after_AF_Adj[passes_all_filters]
    diff_AF_adj_and_filter = TP_after_AF_Adj_and_filtering.AF_Adj - TP_after_AF_Adj_and_filtering.FB_AF
    TP_AF_diff_post_adj_and_filter[freq] += list(diff_AF_adj_and_filter)



In [None]:
# (destination path, object) pairs; each dictionary is pickled to its own file
pickle_jobs = [
    ("./data/filter_FP/ISS_H37Rv_INDEL_TP_AF_diff_post_adj.pkl", TP_AF_diff_post_adj),
    ("./data/filter_FP/ISS_H37Rv_INDEL_TP_AF_diff_post_adj_and_filter.pkl", TP_AF_diff_post_adj_and_filter),
]

for pkl_path, af_diffs in pickle_jobs:
    with open(pkl_path, "wb") as out_f:
        pickle.dump(af_diffs, out_f)