# [SOURCE] PROBLEMATIC LINEAGE SNPS

In [1]:
import pandas as pd
import numpy as np
import pickle

import sys
sys.path.append("./scripts/modules")

from benchmarking_definitions import *
from regions import *

In [2]:
region_lengths = {"DR": DR_region_length, "HT": HT_region_length, "LM": sim_LM_region_length}

In [3]:
tagsRv = pd.read_csv("./data/source/FINAL_tag_list.csv", names=["tag"])["tag"].values
tagsL1234 = pd.read_csv("./data/source/FINAL_tag_list_L1234.csv", names=["tag"]).tag.values

len(tagsRv), len(tagsL1234)

(2500, 1000)

In [4]:
ISS_Rv_mutant_summary_source = "/n/scratch/users/s/sm624/benchmarking/variant_summaries/ISS_H37Rv"
ISS_L1234_mutant_summary_source = "/n/scratch/users/s/sm624/benchmarking/variant_summaries/ISS_L1234"

# [done] FN counts for increasing min AF

In [12]:
for freq_i in range(len(freqs)):

    min_freq = freqs[freq_i]

    print(f"min_freq: {min_freq} >>", end=" ")

    min_freq_df = pd.DataFrame(columns=["tag", "genome", "tool", "region", "min_freq", "num_FN"])
    df_i = 0

    for sim_i in range(1, 6):

        for depth in depths:

            for freq_j in range(len(freqs) - freq_i):

                freq = freqs[freq_i+freq_j]
    
                for mutant_num in range(1, 11):

                    tag = f"sim{sim_i}_h37rv_mutant_{mutant_num}_{depth}_{freq}"
    
                    mutant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
                
                    for tool in tools:
                        
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        for region in regions:
                
                            region_FN = tool_FN[tool_FN.REGION_sim == region].shape[0]

                            min_freq_df.loc[df_i] = [tag, "H37Rv", tool, region, min_freq, region_FN]
                            df_i += 1

                for lineage in range(1, 5):

                    tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"
    
                    mutant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
                    mutant_summary_df = mutant_summary_df[mutant_summary_df.L_GT == 0]
                
                    for tool in tools:
                        
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        region_FN = tool_FN[tool_FN.REGION_sim == "DR"].shape[0]

                        min_freq_df.loc[df_i] = [tag, f"L{lineage}", tool, "DR", min_freq, region_FN]
                        df_i += 1
                    
    
    
    min_freq_df.to_csv(f"./data/problematic_lineage_SNPs/ISS_num_FN_by_AF_minFreq{min_freq}.csv", index=False)
    print("written")


min_freq: 0.01 >> written
min_freq: 0.02 >> written
min_freq: 0.03 >> written
min_freq: 0.04 >> written
min_freq: 0.05 >> written
min_freq: 0.1 >> written
min_freq: 0.2 >> written
min_freq: 0.3 >> written
min_freq: 0.4 >> written
min_freq: 0.5 >> written


## H37Rv

In [5]:
for region in regions:

    region_df = pd.DataFrame(columns=["min_freq", "num_FN", "tool"])
    region_df.loc[0] = [0, 0, "dummy"]

    for min_freq in freqs:

        min_freq_df = pd.read_csv(f"./data/problematic_lineage_SNPs/ISS_num_FN_by_AF_minFreq{min_freq}.csv")
        min_freq_df = min_freq_df[min_freq_df.genome == "H37Rv"]

        region_min_freq_df = min_freq_df[min_freq_df.region == region]

        FN_lst = region_min_freq_df.num_FN

        add_df = pd.DataFrame(columns=region_df.columns)
        add_df["num_FN"] = FN_lst
        add_df["min_freq"] = region_min_freq_df.min_freq.values
        add_df["tool"] = region_min_freq_df.tool.values

        region_df = pd.concat([region_df, add_df])

    region_df.drop(0, inplace=True)

    region_df.to_csv(f"./data/problematic_lineage_SNPs/ISS_H37Rv_num_FN_by_AF_{region}.csv", index=False)
        

## L1234

In [6]:
L1234_df = pd.DataFrame(columns=["min_freq", "num_FN", "lineage", "tool"])
L1234_df.loc[0] = [0, 0, 0, "dummy"]

for min_freq in freqs:

    min_freq_df = pd.read_csv(f"./data/problematic_lineage_SNPs/ISS_num_FN_by_AF_minFreq{min_freq}.csv")
    min_freq_df = min_freq_df[min_freq_df.genome != "H37Rv"]

    FN_lst = min_freq_df.num_FN

    add_df = pd.DataFrame(columns=region_df.columns)
    add_df["num_FN"] = FN_lst
    add_df["min_freq"] = min_freq_df.min_freq.values
    add_df["lineage"] = min_freq_df.genome.values
    add_df["tool"] = min_freq_df.tool.values

    L1234_df = pd.concat([L1234_df, add_df])

L1234_df.drop(0, inplace=True)

L1234_df.to_csv(f"./data/problematic_lineage_SNPs/ISS_L1234_num_FN_by_AF_by_lineage.csv", index=False)
        

## FN counts per tag

In [8]:
genome = "H37Rv"

Rv_source = "/n/scratch/users/s/sm624/ISS_H37Rv_variant_call_counts_sim_LM/output"

Rv_metrics_dataframes = {}
for depth in depths:
    for freq in freqs:
        metrics_df = pd.read_csv("{}/variant_call_counts_{}x_{}.csv".format(Rv_source, depth, freq))
        Rv_metrics_dataframes[(depth, freq)] = metrics_df

for freq in freqs:

    Rv_tag_wise_num_FN_per_freq_df = pd.DataFrame(columns=["tag", "genome", "region", "tool", "min_freq", "num_FN"])
    df_i = 0

    for tool in tools:

        for region in regions:

            for depth in depths:
            
                depth_freq_metrics_df = Rv_metrics_dataframes[(depth, freq)]

                region_depth_freq_df = depth_freq_metrics_df[depth_freq_metrics_df.region == region]
                        
                tool_region_depth_freq_df = region_depth_freq_df[region_depth_freq_df.tool == tool]

                add_df = pd.DataFrame(columns=["tag", "genome", "region", "tool", "min_freq", "num_FN"])
                depth_tags = list(tool_region_depth_freq_df["tag"].values)
                add_df["tag"] = depth_tags
                add_df["genome"] = genome
                add_df["region"] = [region]*len(depth_tags)
                add_df["tool"] = [tool]*len(depth_tags)
                add_df["min_freq"] = [freq*100]*len(depth_tags)
                add_df["num_FN"] = list(tool_region_depth_freq_df["num_FN"].values)

                Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])

    Rv_tag_wise_num_FN_per_freq_df.to_csv("./data/problematic_lineage_SNPs/ISS_H37Rv_num_FN_min_AF_{}.csv".format(freq), index=False)
            

  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])
  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])
  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])
  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])
  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])
  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])
  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])
  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])
  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])
  Rv_tag_wise_num_FN_per_freq_df = pd.concat([Rv_tag_wise_num_FN_per_freq_df, add_df])


In [7]:
L1234_source = "/n/scratch/users/s/sm624/ISS_L1234_variant_call_counts/output"

L1234_metrics_dataframes = {}
for depth in depths:
    for freq in freqs:
        metrics_df = pd.read_csv("{}/variant_call_counts_{}x_{}.csv".format(L1234_source, depth, freq))
        L1234_metrics_dataframes[(depth, freq)] = metrics_df

for freq in freqs:

    L1234_tag_wise_num_FN_per_freq_df = pd.DataFrame(columns=["tag", "genome", "region", "tool", "min_freq", "num_FN"])
    df_i = 0

    for tool in tools:

        for region in ["DR"]:

            for depth in depths:
            
                depth_freq_metrics_df = L1234_metrics_dataframes[(depth, freq)]

                region_depth_freq_df = depth_freq_metrics_df[depth_freq_metrics_df.region == region]
                        
                tool_region_depth_freq_df = region_depth_freq_df[region_depth_freq_df.tool == tool]

                add_df = pd.DataFrame(columns=["tag", "genome", "region", "tool", "min_freq", "num_FN"])
                depth_tags = list(tool_region_depth_freq_df["tag"].values)
                add_df["tag"] = depth_tags
                add_df["genome"] = [x.split("_")[1] for x in depth_tags]
                add_df["region"] = [region]*len(depth_tags)
                add_df["tool"] = [tool]*len(depth_tags)
                add_df["min_freq"] = [freq*100]*len(depth_tags)
                add_df["num_FN"] = list(tool_region_depth_freq_df["num_FN"].values)

                L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])

    L1234_tag_wise_num_FN_per_freq_df.to_csv("./data/problematic_lineage_SNPs/ISS_L1234_num_FN_min_AF_{}.csv".format(freq), index=False)
            

  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])
  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])
  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])
  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])
  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])
  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])
  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])
  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])
  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])
  L1234_tag_wise_num_FN_per_freq_df = pd.concat([L1234_tag_wise_num_FN_per_freq_df, add_df])


## Specific FN

In [84]:
specific_FN_df = pd.DataFrame(columns=["tag", "simulation", "genome", "depth", "freq", "pos-ref-alt", "tool"])
df_1_i = 0

overall_num_FN_df = pd.DataFrame(columns=["tag", "simulation", "genome", "depth", "freq", "num_FN", "tool"])
df_2_i = 0

count = 0
for tag in tagsL1234:
    
    if count % 100 == 0:
        print(count, end=" ")
    count += 1
    
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])
    lineage = tag.split("_")[1]
    sim = int(tag.split("_")[0].strip("sim"))
    
    mutant_summary_df = pd.read_csv("{0}/{1}.final.csv".format(L1234_mutant_summary_source, tag))
    mutant_summary_nonbaseline_df = mutant_summary_df[mutant_summary_df.L_GT == 0]
        
    for tool in tools:
        
        tool_FN = mutant_summary_nonbaseline_df[(mutant_summary_nonbaseline_df.GT == 1) & (mutant_summary_nonbaseline_df["{}_found".format(tool)] == 0)]
        
        for i in tool_FN.index:
            pos, ref, alt = tool_FN.loc[i, "POS"], tool_FN.loc[i, "REF"], tool_FN.loc[i, "ALT"]
                    
            specific_FN_df.loc[df_1_i] = [tag, sim, lineage, depth, freq, "{}-{}-{}".format(pos, ref, alt), tool]
            df_1_i += 1

        overall_num_FN_df.loc[df_2_i] = [tag, sim, lineage, depth, freq, tool_FN.shape[0], tool]
        df_2_i += 1
            
for tag in tagsRv:
    
    if count % 100 == 0:
        print(count, end=" ")
    count += 1
    
    depth = int(tag.split("_")[-2])
    freq = float(tag.split("_")[-1])
    mutant_num = int(tag.split("_")[3])
    sim = int(tag.split("_")[0].strip("sim"))
    
    mutant_summary_df = pd.read_csv("{0}/{1}.csv".format(Rv_mutant_summary_source, tag))
    DR_df = mutant_summary_df[mutant_summary_df.POS.isin(DR_pos)]
        
    for tool in tools:
        
        tool_DR_FN = DR_df[(DR_df.GT == 1) & (DR_df["{}_found".format(tool)] == 0)]
        
        for i in tool_DR_FN.index:
            pos, ref, alt = tool_DR_FN.loc[i, "POS"], tool_DR_FN.loc[i, "REF"], tool_DR_FN.loc[i, "ALT"]
                    
            specific_FN_df.loc[df_1_i] = [tag, sim, "h37rv-"+str(mutant_num), depth, freq, "{}-{}-{}".format(pos, ref, alt), tool]
            df_1_i += 1

        overall_num_FN_df.loc[df_2_i] = [tag, sim, "h37rv-"+str(mutant_num), depth, freq, tool_DR_FN.shape[0], tool]
        df_2_i += 1

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 

In [85]:
specific_FN_df.head()

Unnamed: 0,tag,simulation,genome,depth,freq,pos-ref-alt,tool
0,sim1_L1_mutant_50_0.01,1,L1,50,0.01,7570-C-T,FB
1,sim1_L1_mutant_50_0.01,1,L1,50,0.01,7582-A-C,FB
2,sim1_L1_mutant_50_0.01,1,L1,50,0.01,761137-C-T,FB
3,sim1_L1_mutant_50_0.01,1,L1,50,0.01,761139-C-G,FB
4,sim1_L1_mutant_50_0.01,1,L1,50,0.01,1473246-A-G,FB


In [86]:
specific_FN_df.to_csv("./data/problematic_lineage_SNPs/ISS_specific_FN.csv", index=False)

In [87]:
overall_num_FN_df.head()

Unnamed: 0,tag,simulation,genome,depth,freq,num_FN,tool
0,sim1_L1_mutant_50_0.01,1,L1,50,0.01,20,FB
1,sim1_L1_mutant_50_0.01,1,L1,50,0.01,20,LF
2,sim1_L1_mutant_50_0.01,1,L1,50,0.01,20,MT
3,sim1_L1_mutant_50_0.01,1,L1,50,0.01,20,PL
4,sim1_L1_mutant_50_0.01,1,L1,50,0.01,20,VD


In [88]:
overall_num_FN_df.to_csv("./data/problematic_lineage_SNPs/ISS_overall_num_FN.csv", index=False)