# FILTER FP

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.multicomp as mc
from scipy import stats
import pickle
from scipy.stats import mannwhitneyu as mannwu
from matplotlib.ticker import MultipleLocator
import os

import sys
sys.path.append("./scripts/modules")

from benchmarking_definitions import *
from plotting import *
from regions import *

In [2]:
# Tag lists: single-column CSVs read with an explicit "tag" column name,
# then reduced to plain arrays of tag strings.
tagsRv = pd.read_csv(
    "/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FINAL_tag_list.csv",
    names=["tag"],
)["tag"].values
tagsL1234 = pd.read_csv(
    "/home/sm624/projects/mixed_calls/benchmarking/comprehensive/FINAL_tag_list_L1234.csv",
    names=["tag"],
)["tag"].values

# Show how many tags each list contains.
len(tagsRv), len(tagsL1234)

(2500, 1000)

In [3]:
# Threshold constants defined once for the notebook.
AF_min = 0.05  # presumably the lower allele-frequency bound — TODO confirm usage
AF_max = 0.95  # presumably the upper allele-frequency bound — TODO confirm usage
MQ_thresh = 40  # presumably a mapping-quality cutoff — TODO confirm usage
num_support_each_direction = 2  # presumably min. supporting reads per strand — TODO confirm usage

In [4]:
# Load the Mycobrowser v4 annotation table for H37Rv.
h37Rv_path = "/n/data1/hms/dbmi/farhat/Sanjana/H37Rv"
h37Rv_regions = pd.read_csv(os.path.join(h37Rv_path, "mycobrowser_h37rv_v4.csv"))

# remove rRNAs, which are highly conserved. rrs, rrl, and rrf
rRNA_features = h37Rv_regions.query("Functional_Category=='stable RNAs' & Feature=='rRNA'")

# Flat list of every coordinate covered by an rRNA feature (Start..Stop inclusive).
rRNA_pos = []
for start, stop in zip(rRNA_features["Start"], rRNA_features["Stop"]):
    rRNA_pos.extend(np.arange(start, stop + 1))

In [5]:
# Display the annotation rows selected by the rRNA filter (same query as used to build rRNA_pos).
h37Rv_regions.query("Functional_Category=='stable RNAs' & Feature=='rRNA'")

Unnamed: 0,Refseq_ID,Mycobrowser_Version,Feature,Start,Stop,Score,Strand,Frame,Locus,Name,...,SWISS-MODEL,Orthologues M. leprae,Orthologues M. marinum,Orthologues M. smegmatis,Orthologues M. bovis,Orthologues M. lepromatosis,Orthologues M. tuberculosis,Orthologues M. abscessus,Orthologues M. haemophilum,Orthologues M. orygis
1417,NC_000962.3,Mycobrowser_v4,rRNA,1471846,1473382,.,+,0.0,MTB000019,rrs,...,,,,,,,,,,
1419,NC_000962.3,Mycobrowser_v4,rRNA,1473658,1476795,.,+,0.0,MTB000020,rrl,...,,,,,,,,,,
1420,NC_000962.3,Mycobrowser_v4,rRNA,1476899,1477013,.,+,0.0,MTB000021,rrf,...,,,,,,,,,,


In [6]:
# Names of the exclusion-flag columns (these match the columns added to the per-tag SNP tables below).
excl_cols = ["EXCL_DP", "EXCL_MQ", "EXCL_SAO"]

# ISS L1234

In [8]:
# Root directory for this section; the cells below read position_metrics/ and write SNPs/ under it.
parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234"

## Filter variant summary file

In [9]:
source_dir = f"{parent_dir}/position_metrics"
output_dir = f"{parent_dir}/SNPs"

# Make sure the destination exists so the per-tag writes below cannot fail on a
# missing directory.
os.makedirs(output_dir, exist_ok=True)

# Columns carried forward into the candidate-SNP tables (defined once, outside the loop).
candidate_cols = ["POS", "REF", "ALT", "GT", "REGION", "FB_AF", "COV_RATIO", "CLIPPED_BASES_RATIO", "DISCORDANT_READS_RATIO",
                  "MEAN_ALT_BQ", "FB_STRAND_BIAS", "FB_AO", "FB_AO_F", "FB_AO_R", "FB_DP", "FB_MQ", "FB_QUAL",
                  "FB_ORIGIN", "FB_COMPLEX_TYPE", "FB_MULTIPLE_HAPS", "FB_MULTIALLELIC", "FB_REF_GL", "FB_ALT_GL"]

# Writes one `<tag>.candidate_SNPs.csv` per tag from `<tag>.position_metrics.csv`.
for tag_idx, tag in enumerate(tagsL1234):

    # progress indicator: print every 100th tag index
    if tag_idx % 100 == 0:
        print(tag_idx, end=" ")

    position_metric_df = pd.read_csv(f"{source_dir}/{tag}.position_metrics.csv", low_memory=False)

    # L1234-specific: keep only rows whose L_GT flag is 0, and of those only
    # the variants found by FreeBayes
    keep = (position_metric_df.L_GT == 0) & (position_metric_df.FB_found == 1)

    # extract necessary columns
    found_df = position_metric_df.loc[keep, candidate_cols]

    # filter SNPs: single-base REF and single-base ALT
    is_snp = (found_df.REF.str.len() == 1) & (found_df.ALT.str.len() == 1)

    # rename columns for consistency with variables used by filtering model
    # (rename returns a new frame, so no explicit copy is needed)
    SNP_df = found_df[is_snp].rename(columns={"MEAN_ALT_BQ": "Mean_BQ_ALT_allele",
                                              "FB_STRAND_BIAS": "SAF_prop_deviation_from_half"})

    # already split MNPs

    SNP_df.to_csv(f"{output_dir}/{tag}.candidate_SNPs.csv", index=False)
        

0 100 200 300 400 500 600 700 800 900 

## Make tag ID file

In [10]:
# Build a single-column table with one row per candidate SNP, holding the tag of
# the file that SNP came from, in the same order as the tags are iterated.
# Labels are accumulated in a plain list and the DataFrame is built once at the
# end; concatenating inside the loop re-copies the growing frame on every tag.
tag_labels = []

for tag_idx, tag in enumerate(tagsL1234):

    # progress indicator: print every 100th tag index
    if tag_idx % 100 == 0:
        print(tag_idx, end=" ")

    lowAF_SNP_file = f"{output_dir}/{tag}.candidate_SNPs.csv"

    lowAF_SNP_df = pd.read_csv(lowAF_SNP_file)

    # repeat the tag once per row of this tag's candidate-SNP table
    tag_labels.extend([tag] * lowAF_SNP_df.shape[0])

# object dtype keeps the column type stable even if no rows were collected
tag_id_df = pd.DataFrame({"tag": tag_labels}, dtype=object)

0 100 200 300 400 500 600 700 800 900 

In [13]:
# Sanity check: the table has one row per candidate SNP across all tags; show shape and first rows.
print(tag_id_df.shape)
tag_id_df.head(2)

(1057642, 1)


Unnamed: 0,tag
0,sim1_L1_mutant_50_0.01
1,sim1_L1_mutant_50_0.01


In [14]:
# Persist the tag column (index not written).
tag_id_df.to_csv("/n/scratch/users/s/sm624/L1234_benchmarking_tag_ids.csv", index=False)

## Create final file of SNPs with predictions and add exclusions

1. Label low-frequency SNPs (AF ≤ 95%) and fixed SNPs (AF > 95%).
2. Label fixed exclusion filters.

In [16]:
# Root directory for this section; the cells below read variant_SNP_predictions.csv from it and write SNPs/ under it.
parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234"

In [17]:
# Load the combined per-SNP table for all tags; it is read from disk here and is
# not produced by any cell in this notebook (presumably output of the filtering model — confirm).
predicted_MV_SNPs_file = f"{parent_dir}/variant_SNP_predictions.csv"
predicted_MV_SNPs_df = pd.read_csv(predicted_MV_SNPs_file, low_memory=False)
# Sanity check: shape and first two rows.
print(predicted_MV_SNPs_df.shape)
predicted_MV_SNPs_df.head(2)

(1057642, 26)


Unnamed: 0,tag,POS,REF,ALT,GT,REGION,FB_AF,COV_RATIO,CLIPPED_BASES_RATIO,DISCORDANT_READS_RATIO,...,FB_MQ,FB_QUAL,FB_ORIGIN,FB_COMPLEX_TYPE,FB_MULTIPLE_HAPS,FB_MULTIALLELIC,FB_REF_GL,FB_ALT_GL,predicted,pred_class
0,sim1_L1_mutant_50_0.01,21455,C,T,0,other,0.038462,0.956149,0.0,0.0,...,60.0,3.88768e-15,,0,0,0,0.0,-157.967,0.977903,1.0
1,sim1_L1_mutant_50_0.01,24721,G,C,0,LM,0.0625,0.706861,0.264706,0.0,...,60.0,0.0,24721.0,1,0,0,,,0.005015,0.0


In [18]:
source_dir = f"{parent_dir}/position_metrics"
output_dir = f"{parent_dir}/SNPs"

# Depth cutoff for EXCL_DP. The mapping-quality and per-strand support cutoffs
# come from the notebook-level threshold constants (MQ_thresh,
# num_support_each_direction) instead of being repeated here as literals.
DP_thresh = 5

# Writes one `<tag>.SNPs.csv` per tag: that tag's rows of the predictions table
# (without the `tag` column) plus three 0/1 exclusion-flag columns.
for tag_idx, tag in enumerate(tagsL1234):

    # progress indicator: print every 100th tag index
    if tag_idx % 100 == 0:
        print(tag_idx, end=" ")

    tag_rows = predicted_MV_SNPs_df[predicted_MV_SNPs_df.tag == tag].drop("tag", axis=1)

    # label with exclusion filters; assign() returns a new frame, so nothing is
    # set on a filtered selection of the predictions table
    SNP_df = tag_rows.assign(
        EXCL_DP=(tag_rows.FB_DP < DP_thresh).astype(int),
        EXCL_MQ=(tag_rows.FB_MQ < MQ_thresh).astype(int),
        EXCL_SAO=(
            (tag_rows.FB_AO_F < num_support_each_direction)
            | (tag_rows.FB_AO_R < num_support_each_direction)
        ).astype(int),
    )

    SNP_df.to_csv(f"{output_dir}/{tag}.SNPs.csv", index=False)

0 100 200 300 400 500 600 700 800 900 

# ISS H37Rv

In [8]:
# Root directory for this section; the cells below read position_metrics/ and write SNPs/ under it.
parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_H37Rv"

## Filter variant summary file

In [9]:
source_dir = f"{parent_dir}/position_metrics"
output_dir = f"{parent_dir}/SNPs"

# Make sure the destination exists so the per-tag writes below cannot fail on a
# missing directory.
os.makedirs(output_dir, exist_ok=True)

# Columns carried forward into the candidate-SNP tables (defined once, outside the loop).
candidate_cols = ["POS", "REF", "ALT", "GT", "REGION", "FB_AF", "COV_RATIO", "CLIPPED_BASES_RATIO", "DISCORDANT_READS_RATIO",
                  "MEAN_ALT_BQ", "FB_STRAND_BIAS", "FB_AO", "FB_AO_F", "FB_AO_R", "FB_DP", "FB_MQ", "FB_QUAL",
                  "FB_ORIGIN", "FB_COMPLEX_TYPE", "FB_MULTIPLE_HAPS", "FB_MULTIALLELIC", "FB_REF_GL", "FB_ALT_GL"]

# Writes one `<tag>.candidate_SNPs.csv` per tag from `<tag>.position_metrics.csv`.
for tag_idx, tag in enumerate(tagsRv):

    # progress indicator: print every 100th tag index
    if tag_idx % 100 == 0:
        print(tag_idx, end=" ")

    position_metric_df = pd.read_csv(f"{source_dir}/{tag}.position_metrics.csv", low_memory=False)

    # filter for only the variants found by FreeBayes, and extract necessary columns
    found_df = position_metric_df.loc[position_metric_df.FB_found == 1, candidate_cols]

    # filter SNPs: single-base REF and single-base ALT
    is_snp = (found_df.REF.str.len() == 1) & (found_df.ALT.str.len() == 1)

    # rename columns for consistency with variables used by filtering model
    # (rename returns a new frame, so no explicit copy is needed)
    SNP_df = found_df[is_snp].rename(columns={"MEAN_ALT_BQ": "Mean_BQ_ALT_allele",
                                              "FB_STRAND_BIAS": "SAF_prop_deviation_from_half"})

    # already split MNPs

    SNP_df.to_csv(f"{output_dir}/{tag}.candidate_SNPs.csv", index=False)
        

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

## Make tag ID file

In [10]:
# Build a single-column table with one row per candidate SNP, holding the tag of
# the file that SNP came from, in the same order as the tags are iterated.
# Labels are accumulated in a plain list and the DataFrame is built once at the
# end; concatenating inside the loop re-copies the growing frame on every tag.
tag_labels = []

for tag_idx, tag in enumerate(tagsRv):

    # progress indicator: print every 100th tag index
    if tag_idx % 100 == 0:
        print(tag_idx, end=" ")

    lowAF_SNP_file = f"{output_dir}/{tag}.candidate_SNPs.csv"

    lowAF_SNP_df = pd.read_csv(lowAF_SNP_file)

    # repeat the tag once per row of this tag's candidate-SNP table
    tag_labels.extend([tag] * lowAF_SNP_df.shape[0])

# object dtype keeps the column type stable even if no rows were collected
tag_id_df = pd.DataFrame({"tag": tag_labels}, dtype=object)

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

In [11]:
# Sanity check: the table has one row per candidate SNP across all tags; show shape and first rows.
print(tag_id_df.shape)
tag_id_df.head(2)

(496843, 1)


Unnamed: 0,tag
0,sim1_h37rv_mutant_1_50_0.01
1,sim1_h37rv_mutant_1_50_0.01


In [12]:
# Persist the tag column (index not written).
tag_id_df.to_csv("/n/scratch/users/s/sm624/H37Rv_benchmarking_tag_ids.csv", index=False)

## Create final file of SNPs with predictions and add exclusions

1. Label low-frequency SNPs (AF ≤ 95%) and fixed SNPs (AF > 95%).
2. Label fixed exclusion filters.

In [13]:
# Root directory for this section; the cells below read variant_SNP_predictions.csv from it and write SNPs/ under it.
parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_H37Rv"

In [14]:
# Load the combined per-SNP table for all tags; it is read from disk here and is
# not produced by any cell in this notebook (presumably output of the filtering model — confirm).
predicted_MV_SNPs_file = f"{parent_dir}/variant_SNP_predictions.csv"
predicted_MV_SNPs_df = pd.read_csv(predicted_MV_SNPs_file, low_memory=False)
# Sanity check: shape and first two rows.
print(predicted_MV_SNPs_df.shape)
predicted_MV_SNPs_df.head(2)

(496843, 26)


Unnamed: 0,tag,POS,REF,ALT,GT,REGION,FB_AF,COV_RATIO,CLIPPED_BASES_RATIO,DISCORDANT_READS_RATIO,...,FB_MQ,FB_QUAL,FB_ORIGIN,FB_COMPLEX_TYPE,FB_MULTIPLE_HAPS,FB_MULTIALLELIC,FB_REF_GL,FB_ALT_GL,predicted,pred_class
0,sim1_h37rv_mutant_1_50_0.01,10518,T,A,0,other,0.043478,0.883481,0.0,0.0,...,60.0,3.43565e-15,,0,0,0,0.0,-137.979,0.946788,1
1,sim1_h37rv_mutant_1_50_0.01,40246,C,T,0,other,0.033898,0.993,0.0,0.0,...,60.0,1.51453e-15,,0,0,0,0.0,-181.604,0.974799,1


In [15]:
source_dir = f"{parent_dir}/position_metrics"
output_dir = f"{parent_dir}/SNPs"

# Depth cutoff for EXCL_DP. The mapping-quality and per-strand support cutoffs
# come from the notebook-level threshold constants (MQ_thresh,
# num_support_each_direction) instead of being repeated here as literals.
DP_thresh = 5

# Writes one `<tag>.SNPs.csv` per tag: that tag's rows of the predictions table
# (without the `tag` column) plus three 0/1 exclusion-flag columns.
for tag_idx, tag in enumerate(tagsRv):

    # progress indicator: print every 100th tag index
    if tag_idx % 100 == 0:
        print(tag_idx, end=" ")

    tag_rows = predicted_MV_SNPs_df[predicted_MV_SNPs_df.tag == tag].drop("tag", axis=1)

    # label with exclusion filters; assign() returns a new frame, so nothing is
    # set on a filtered selection of the predictions table
    SNP_df = tag_rows.assign(
        EXCL_DP=(tag_rows.FB_DP < DP_thresh).astype(int),
        EXCL_MQ=(tag_rows.FB_MQ < MQ_thresh).astype(int),
        EXCL_SAO=(
            (tag_rows.FB_AO_F < num_support_each_direction)
            | (tag_rows.FB_AO_R < num_support_each_direction)
        ).astype(int),
    )

    SNP_df.to_csv(f"{output_dir}/{tag}.SNPs.csv", index=False)

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

# BinoSNP data

In [7]:
# Tag list for this section: single-column CSV read with an explicit "tag" column name.
BStags = pd.read_csv("./data/source/binosnp_data_tags.csv", names=["tag"]).tag.values
# Show how many tags were loaded.
len(BStags)

6

In [8]:
# Root directory for this section; the cells below read position_metrics/ and write SNPs/ under it.
parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/BSdata"

## Filter variant summary file

In [10]:
source_dir = f"{parent_dir}/position_metrics"
output_dir = f"{parent_dir}/SNPs"

# Make sure the destination exists so the per-tag writes below cannot fail on a
# missing directory.
os.makedirs(output_dir, exist_ok=True)

# Columns carried forward into the candidate-SNP tables (defined once, outside the loop).
candidate_cols = ["POS", "REF", "ALT", "GT", "REGION", "FB_AF", "COV_RATIO", "CLIPPED_BASES_RATIO", "DISCORDANT_READS_RATIO",
                  "MEAN_ALT_BQ", "FB_STRAND_BIAS", "FB_AO", "FB_AO_F", "FB_AO_R", "FB_DP", "FB_MQ", "FB_QUAL",
                  "FB_ORIGIN", "FB_COMPLEX_TYPE", "FB_MULTIPLE_HAPS", "FB_MULTIALLELIC", "FB_REF_GL", "FB_ALT_GL"]

# Writes one `<tag>.candidate_SNPs.csv` per tag from `<tag>.position_metrics.csv`.
for tag in BStags:

    position_metric_df = pd.read_csv(f"{source_dir}/{tag}.position_metrics.csv", low_memory=False)

    # filter for only the variants found by FreeBayes, and extract necessary columns
    found_df = position_metric_df.loc[position_metric_df.FB_found == 1, candidate_cols]

    # filter SNPs: single-base REF and single-base ALT
    is_snp = (found_df.REF.str.len() == 1) & (found_df.ALT.str.len() == 1)

    # rename columns for consistency with variables used by filtering model
    # (rename returns a new frame, so no explicit copy is needed)
    SNP_df = found_df[is_snp].rename(columns={"MEAN_ALT_BQ": "Mean_BQ_ALT_allele",
                                              "FB_STRAND_BIAS": "SAF_prop_deviation_from_half"})

    # already split MNPs

    SNP_df.to_csv(f"{output_dir}/{tag}.candidate_SNPs.csv", index=False)
        

## Make tag ID file

In [12]:
# Build a single-column table with one row per candidate SNP, holding the tag of
# the file that SNP came from, in the same order as the tags are iterated.
# Labels are accumulated in a plain list and the DataFrame is built once at the
# end; concatenating inside the loop re-copies the growing frame on every tag.
tag_labels = []

for tag in BStags:

    lowAF_SNP_file = f"{output_dir}/{tag}.candidate_SNPs.csv"

    lowAF_SNP_df = pd.read_csv(lowAF_SNP_file)

    # repeat the tag once per row of this tag's candidate-SNP table
    tag_labels.extend([tag] * lowAF_SNP_df.shape[0])

# object dtype keeps the column type stable even if no rows were collected
tag_id_df = pd.DataFrame({"tag": tag_labels}, dtype=object)

In [13]:
# Sanity check: the table has one row per candidate SNP across all tags; show shape and first rows.
print(tag_id_df.shape)
tag_id_df.head(2)

(15492, 1)


Unnamed: 0,tag
0,Mix-SR1a-rpoB531-1
1,Mix-SR1a-rpoB531-1


In [14]:
# Persist the tag column (index not written).
tag_id_df.to_csv("/n/scratch/users/s/sm624/BSdata_benchmarking_tag_ids.csv", index=False)