# [SOURCE] ACCURACY BY AF AND DEPTH

In [1]:
import pandas as pd
import numpy as np
import pickle

import sys
sys.path.append("./scripts/modules")

from benchmarking_definitions import *
from regions import *

In [2]:
# Lookup from region code to its length. The three length constants are not
# defined in this notebook; presumably they come from the star-imports above
# (TODO confirm which module provides them).
region_lengths = dict(
    DR=DR_region_length,
    HT=HT_region_length,
    LM=sim_LM_region_length,
)

In [3]:
# Read both tag lists the same way: each CSV is parsed with a single column
# named "tag" and only that column's values are kept.
tagsRv, tagsL1234 = (
    pd.read_csv(tag_csv, names=["tag"])["tag"].values
    for tag_csv in (
        "./data/source/FINAL_tag_list.csv",
        "./data/source/FINAL_tag_list_L1234.csv",
    )
)

# Cell output: number of tags in each list.
len(tagsRv), len(tagsL1234)

(2500, 1000)

In [4]:
# Directories holding the per-sample "<tag>.variant_summary.csv" files that the
# cells below read. NOTE(review): these are absolute, user-specific paths, so
# the notebook only runs where this filesystem is mounted.
ISS_Rv_mutant_summary_source = (
    "/n/scratch/users/s/sm624/benchmarking"
    "/variant_summaries/ISS_H37Rv"
)
ISS_L1234_mutant_summary_source = (
    "/n/scratch/users/s/sm624/benchmarking"
    "/variant_summaries/ISS_L1234"
)

# [done] H37Rv: Tool performance by depth for increasing AF

In [20]:
# Upper bound (inclusive) on a tool's reported AF for a false-positive call to
# be counted in the loop below. NOTE: `max_freq` is reassigned in later
# sections of this notebook, so re-run this cell before re-running this section.
max_freq = 0.50

In [21]:
# For each tool, compute per-sample precision and recall on the H37Rv
# simulations for every depth and every minimum-AF threshold, and write one
# CSV per tool.
#
# For a threshold `min_freq`, only samples tagged with an entry of `freqs` at
# or after the threshold's position are included. A false positive is counted
# only when the tool's reported AF lies in [min_freq, max_freq]; TP and FN
# counts are not AF-filtered.
depth_columns = ["tag", "depth", "min_freq", "precision", "recall"]

for tool in tools:

    print(f"{tool} >>", end=" ")

    # Rows are collected in a plain list and converted to a DataFrame once at
    # the end: enlarging a DataFrame with `.loc[i] = row` copies the frame on
    # every insertion, which becomes quadratic over thousands of rows.
    depth_records = []

    for depth in depths:

        print(depth, end=" ")

        for freq_i, min_freq in enumerate(freqs):

            for freq in freqs[freq_i:]:

                for sim_i in range(1, 6):

                    for mutant_num in range(1, 11):

                        tag = f"sim{sim_i}_h37rv_mutant_{mutant_num}_{depth}_{freq}"

                        mutant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)

                        tool_TP = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 1)].shape[0]
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)].shape[0]

                        # The AF window is applied only after restricting to
                        # false-positive calls, so the AF column is compared
                        # only on rows the tool actually reported.
                        tool_FP_all = mutant_summary_df[(mutant_summary_df.GT == 0) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FP = tool_FP_all[(tool_FP_all[f"{tool}_AF"] >= min_freq) & (tool_FP_all[f"{tool}_AF"] <= max_freq)].shape[0]

                        precision = calc_precision(tool_TP, tool_FP)
                        recall = calc_recall(tool_TP, tool_FN)

                        depth_records.append([tag, depth, min_freq, precision, recall])

    depth_df = pd.DataFrame(depth_records, columns=depth_columns)

    depth_df.to_csv(f"./data/accuracy_by_AF_and_depth/ISS_H37Rv_{tool}_metrics_by_depth_AF.csv", index=False)
    print("written")


FB >> 50 100 200 400 700 written
LF >> 50 100 200 400 700 written
MT >> 50 100 200 400 700 written
PL >> 50 100 200 400 700 written
VD >> 50 100 200 400 700 written
VS >> 50 100 200 400 700 written


# [done] H37Rv + L1-4: F1 per region for all AFs

In [6]:
# Upper bound (inclusive) on a tool's reported AF for a false-positive call to
# be counted in the tally loops of this section. NOTE: `max_freq` is also
# assigned in other sections of this notebook; re-run this cell before
# re-running this section.
max_freq = 0.50

## All depths

In [7]:
# Count TP / FN / FP calls per sample, tool and region for every minimum-AF
# threshold in `freqs` (all depths) and write one CSV per threshold.
#
# H37Rv mutant summaries contribute one row per region in `regions`; L1-4
# mutant summaries are first restricted to rows with L_GT == 0 and contribute
# a "DR" row only. False positives are counted only when the tool's reported
# AF lies in [min_freq, max_freq]; TP and FN counts are not AF-filtered.
count_columns = ["tag", "depth", "tool", "region", "min_freq", "num_TP", "num_FN", "num_FP"]

for freq_i, min_freq in enumerate(freqs):

    print(f"min_freq: {min_freq} >>", end=" ")

    # Rows are collected in a plain list and converted to a DataFrame once at
    # the end: enlarging a DataFrame with `.loc[i] = row` copies the frame on
    # every insertion, which becomes quadratic over tens of thousands of rows.
    count_records = []

    for sim_i in range(1, 6):

        for depth in depths:

            # Only samples tagged with an entry of `freqs` from position
            # freq_i onward are included for this threshold.
            for freq in freqs[freq_i:]:

                for mutant_num in range(1, 11):

                    tag = f"sim{sim_i}_h37rv_mutant_{mutant_num}_{depth}_{freq}"

                    mutant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)

                    for tool in tools:

                        tool_TP = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        tool_FP_all = mutant_summary_df[(mutant_summary_df.GT == 0) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FP = tool_FP_all[(tool_FP_all[f"{tool}_AF"] >= min_freq) & (tool_FP_all[f"{tool}_AF"] <= max_freq)]

                        for region in regions:

                            region_TP = tool_TP[tool_TP.REGION_sim == region].shape[0]
                            region_FN = tool_FN[tool_FN.REGION_sim == region].shape[0]
                            region_FP = tool_FP[tool_FP.REGION_sim == region].shape[0]

                            count_records.append([tag, depth, tool, region, min_freq, region_TP, region_FN, region_FP])

                for lineage in range(1, 5):

                    tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                    mutant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
                    # Keep only rows with L_GT == 0 (presumably positions that
                    # are not lineage-defining variants; TODO confirm).
                    mutant_summary_df = mutant_summary_df[mutant_summary_df.L_GT == 0]

                    for tool in tools:

                        tool_TP = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        tool_FP_all = mutant_summary_df[(mutant_summary_df.GT == 0) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FP = tool_FP_all[(tool_FP_all[f"{tool}_AF"] >= min_freq) & (tool_FP_all[f"{tool}_AF"] <= max_freq)]

                        region_TP = tool_TP[tool_TP.REGION_sim == "DR"].shape[0]
                        region_FN = tool_FN[tool_FN.REGION_sim == "DR"].shape[0]
                        region_FP = tool_FP[tool_FP.REGION_sim == "DR"].shape[0]

                        count_records.append([tag, depth, tool, "DR", min_freq, region_TP, region_FN, region_FP])

    min_freq_df = pd.DataFrame(count_records, columns=count_columns)

    min_freq_df.to_csv(f"./data/accuracy_by_AF_and_depth/ISS_allAF_metrics_by_AF_minFreq{min_freq}.csv", index=False)
    print("written")


min_freq: 0.01 >> written
min_freq: 0.02 >> written
min_freq: 0.03 >> written
min_freq: 0.04 >> written
min_freq: 0.05 >> written
min_freq: 0.1 >> written
min_freq: 0.2 >> written
min_freq: 0.3 >> written
min_freq: 0.4 >> written
min_freq: 0.5 >> written


In [8]:
# Turn the per-threshold TP/FN/FP tallies (all depths, all AFs) into
# per-sample precision, recall and F1, pooled over every min_freq threshold,
# and write one CSV per region.
region_columns = ["min_freq", "precision", "recall", "F1", "tool"]

for region in regions:

    # One frame per threshold, concatenated once with a fresh index. Seeding
    # the table with a dummy row at label 0 and calling drop(0) afterwards
    # would also delete the first real row of every threshold, because each
    # appended frame carries its own 0-based index.
    region_frames = []

    for min_freq in freqs:

        min_freq_df = pd.read_csv(f"./data/accuracy_by_AF_and_depth/ISS_allAF_metrics_by_AF_minFreq{min_freq}.csv")

        region_min_freq_df = min_freq_df[min_freq_df.region == region]

        TP_lst = region_min_freq_df.num_TP
        FN_lst = region_min_freq_df.num_FN
        FP_lst = region_min_freq_df.num_FP

        precision_vals = [calc_precision(TP, FP) for TP, FP in zip(TP_lst, FP_lst)]
        # Recall goes through calc_recall, the same helper the per-depth cell
        # uses, rather than calc_precision with FN passed in place of FP.
        recall_vals = [calc_recall(TP, FN) for TP, FN in zip(TP_lst, FN_lst)]

        F1_vals = [calc_F1(P, R) for P, R in zip(precision_vals, recall_vals)]

        add_df = pd.DataFrame(
            {
                "min_freq": region_min_freq_df.min_freq.values,
                "precision": precision_vals,
                "recall": recall_vals,
                "F1": F1_vals,
                "tool": region_min_freq_df.tool.values,
            },
            columns=region_columns,
        )
        region_frames.append(add_df)

    # pd.concat raises on an empty list, so fall back to an empty table.
    if region_frames:
        region_df = pd.concat(region_frames, ignore_index=True)
    else:
        region_df = pd.DataFrame(columns=region_columns)

    region_df.to_csv(f"./data/accuracy_by_AF_and_depth/ISS_allAF_metrics_by_AF_{region}.csv", index=False)
        

## Depths 50-200x

In [14]:
# Depth values looped over by the "Depths 50-200x" tally cell below, in place
# of the full `depths` list.
low_depths = [50, 100, 200]

In [17]:
# Count TP / FN / FP calls per sample, tool and region for every minimum-AF
# threshold in `freqs`, restricted to the depths in `low_depths`, and write
# one CSV per threshold.
#
# H37Rv mutant summaries contribute one row per region in `regions`; L1-4
# mutant summaries are first restricted to rows with L_GT == 0 and contribute
# a "DR" row only. False positives are counted only when the tool's reported
# AF lies in [min_freq, max_freq]; TP and FN counts are not AF-filtered.
count_columns = ["tag", "depth", "tool", "region", "min_freq", "num_TP", "num_FN", "num_FP"]

for freq_i, min_freq in enumerate(freqs):

    print(f"min_freq: {min_freq} >>", end=" ")

    # Rows are collected in a plain list and converted to a DataFrame once at
    # the end: enlarging a DataFrame with `.loc[i] = row` copies the frame on
    # every insertion, which becomes quadratic over tens of thousands of rows.
    count_records = []

    for sim_i in range(1, 6):

        for depth in low_depths:

            # Only samples tagged with an entry of `freqs` from position
            # freq_i onward are included for this threshold.
            for freq in freqs[freq_i:]:

                for mutant_num in range(1, 11):

                    tag = f"sim{sim_i}_h37rv_mutant_{mutant_num}_{depth}_{freq}"

                    mutant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)

                    for tool in tools:

                        tool_TP = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        tool_FP_all = mutant_summary_df[(mutant_summary_df.GT == 0) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FP = tool_FP_all[(tool_FP_all[f"{tool}_AF"] >= min_freq) & (tool_FP_all[f"{tool}_AF"] <= max_freq)]

                        for region in regions:

                            region_TP = tool_TP[tool_TP.REGION_sim == region].shape[0]
                            region_FN = tool_FN[tool_FN.REGION_sim == region].shape[0]
                            region_FP = tool_FP[tool_FP.REGION_sim == region].shape[0]

                            count_records.append([tag, depth, tool, region, min_freq, region_TP, region_FN, region_FP])

                for lineage in range(1, 5):

                    tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                    mutant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
                    # Keep only rows with L_GT == 0 (presumably positions that
                    # are not lineage-defining variants; TODO confirm).
                    mutant_summary_df = mutant_summary_df[mutant_summary_df.L_GT == 0]

                    for tool in tools:

                        tool_TP = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        tool_FP_all = mutant_summary_df[(mutant_summary_df.GT == 0) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FP = tool_FP_all[(tool_FP_all[f"{tool}_AF"] >= min_freq) & (tool_FP_all[f"{tool}_AF"] <= max_freq)]

                        region_TP = tool_TP[tool_TP.REGION_sim == "DR"].shape[0]
                        region_FN = tool_FN[tool_FN.REGION_sim == "DR"].shape[0]
                        region_FP = tool_FP[tool_FP.REGION_sim == "DR"].shape[0]

                        count_records.append([tag, depth, tool, "DR", min_freq, region_TP, region_FN, region_FP])

    min_freq_df = pd.DataFrame(count_records, columns=count_columns)

    min_freq_df.to_csv(f"./data/accuracy_by_AF_and_depth/ISS_allAF_metrics_lowDepth_by_AF_minFreq{min_freq}.csv", index=False)
    print("written")


min_freq: 0.01 >> written
min_freq: 0.02 >> written
min_freq: 0.03 >> written
min_freq: 0.04 >> written
min_freq: 0.05 >> written
min_freq: 0.1 >> written
min_freq: 0.2 >> written
min_freq: 0.3 >> written
min_freq: 0.4 >> written
min_freq: 0.5 >> written


In [25]:
# Turn the per-threshold low-depth TP/FN/FP tallies (all AFs) into per-sample
# precision, recall and F1, pooled over every min_freq threshold, and write
# one CSV per region.
region_columns = ["min_freq", "precision", "recall", "F1", "tool"]

for region in regions:

    # One frame per threshold, concatenated once with a fresh index. Seeding
    # the table with a dummy row at label 0 and calling drop(0) afterwards
    # would also delete the first real row of every threshold, because each
    # appended frame carries its own 0-based index.
    region_frames = []

    for min_freq in freqs:

        min_freq_df = pd.read_csv(f"./data/accuracy_by_AF_and_depth/ISS_allAF_metrics_lowDepth_by_AF_minFreq{min_freq}.csv")

        region_min_freq_df = min_freq_df[min_freq_df.region == region]

        TP_lst = region_min_freq_df.num_TP
        FN_lst = region_min_freq_df.num_FN
        FP_lst = region_min_freq_df.num_FP

        precision_vals = [calc_precision(TP, FP) for TP, FP in zip(TP_lst, FP_lst)]
        # Recall goes through calc_recall, the same helper the per-depth cell
        # uses, rather than calc_precision with FN passed in place of FP.
        recall_vals = [calc_recall(TP, FN) for TP, FN in zip(TP_lst, FN_lst)]

        F1_vals = [calc_F1(P, R) for P, R in zip(precision_vals, recall_vals)]

        add_df = pd.DataFrame(
            {
                "min_freq": region_min_freq_df.min_freq.values,
                "precision": precision_vals,
                "recall": recall_vals,
                "F1": F1_vals,
                "tool": region_min_freq_df.tool.values,
            },
            columns=region_columns,
        )
        region_frames.append(add_df)

    # pd.concat raises on an empty list, so fall back to an empty table.
    if region_frames:
        region_df = pd.concat(region_frames, ignore_index=True)
    else:
        region_df = pd.DataFrame(columns=region_columns)

    region_df.to_csv(f"./data/accuracy_by_AF_and_depth/ISS_allAF_metrics_lowDepth_by_AF_{region}.csv", index=False)
        

# [done] H37Rv + L1-4: Precision and recall per region for AF 1-5%

In [9]:
# Frequency values (0.01-0.05) used by the "AF 1-5%" cells below, both as the
# min_freq thresholds and as the frequency component of the sample tags.
low_freqs = [0.01, 0.02, 0.03, 0.04, 0.05]

In [10]:
# Upper bound (inclusive) on a tool's reported AF for a false-positive call to
# be counted in the "AF 1-5%" tally loops below. NOTE: this overrides the 0.50
# value assigned in earlier sections of the notebook; re-run the matching
# `max_freq` cell before re-running any other section.
max_freq = 0.05

## All depths

In [13]:
# Count TP / FN / FP calls per sample, tool and region for every minimum-AF
# threshold in `low_freqs` (all depths) and write one CSV per threshold.
#
# H37Rv mutant summaries contribute one row per region in `regions`; L1-4
# mutant summaries are first restricted to rows with L_GT == 0 and contribute
# a "DR" row only. False positives are counted only when the tool's reported
# AF lies in [min_freq, max_freq]; TP and FN counts are not AF-filtered.
count_columns = ["tag", "depth", "tool", "region", "min_freq", "num_TP", "num_FN", "num_FP"]

for freq_i, min_freq in enumerate(low_freqs):

    print(f"min_freq: {min_freq} >>", end=" ")

    # Rows are collected in a plain list and converted to a DataFrame once at
    # the end: enlarging a DataFrame with `.loc[i] = row` copies the frame on
    # every insertion, which becomes quadratic over tens of thousands of rows.
    count_records = []

    for sim_i in range(1, 6):

        for depth in depths:

            # Only samples tagged with an entry of `low_freqs` from position
            # freq_i onward are included for this threshold.
            for freq in low_freqs[freq_i:]:

                for mutant_num in range(1, 11):

                    tag = f"sim{sim_i}_h37rv_mutant_{mutant_num}_{depth}_{freq}"

                    mutant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)

                    for tool in tools:

                        tool_TP = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        tool_FP_all = mutant_summary_df[(mutant_summary_df.GT == 0) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FP = tool_FP_all[(tool_FP_all[f"{tool}_AF"] >= min_freq) & (tool_FP_all[f"{tool}_AF"] <= max_freq)]

                        for region in regions:

                            region_TP = tool_TP[tool_TP.REGION_sim == region].shape[0]
                            region_FN = tool_FN[tool_FN.REGION_sim == region].shape[0]
                            region_FP = tool_FP[tool_FP.REGION_sim == region].shape[0]

                            count_records.append([tag, depth, tool, region, min_freq, region_TP, region_FN, region_FP])

                for lineage in range(1, 5):

                    tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                    mutant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
                    # Keep only rows with L_GT == 0 (presumably positions that
                    # are not lineage-defining variants; TODO confirm).
                    mutant_summary_df = mutant_summary_df[mutant_summary_df.L_GT == 0]

                    for tool in tools:

                        tool_TP = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        tool_FP_all = mutant_summary_df[(mutant_summary_df.GT == 0) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FP = tool_FP_all[(tool_FP_all[f"{tool}_AF"] >= min_freq) & (tool_FP_all[f"{tool}_AF"] <= max_freq)]

                        region_TP = tool_TP[tool_TP.REGION_sim == "DR"].shape[0]
                        region_FN = tool_FN[tool_FN.REGION_sim == "DR"].shape[0]
                        region_FP = tool_FP[tool_FP.REGION_sim == "DR"].shape[0]

                        count_records.append([tag, depth, tool, "DR", min_freq, region_TP, region_FN, region_FP])

    min_freq_df = pd.DataFrame(count_records, columns=count_columns)

    min_freq_df.to_csv(f"./data/accuracy_by_AF_and_depth/ISS_AF1to5_metrics_by_AF_minFreq{min_freq}.csv", index=False)
    print("written")


min_freq: 0.01 >> written
min_freq: 0.02 >> written
min_freq: 0.03 >> written
min_freq: 0.04 >> written
min_freq: 0.05 >> written


In [45]:
# Turn the per-threshold AF 1-5% TP/FN/FP tallies (all depths) into
# per-sample precision and recall, pooled over every min_freq threshold in
# `low_freqs`, and write one CSV per region.
region_columns = ["min_freq", "precision", "recall", "tool"]

for region in regions:

    # One frame per threshold, concatenated once with a fresh index. Seeding
    # the table with a dummy row at label 0 and calling drop(0) afterwards
    # would also delete the first real row of every threshold, because each
    # appended frame carries its own 0-based index.
    region_frames = []

    for min_freq in low_freqs:

        min_freq_df = pd.read_csv(f"./data/accuracy_by_AF_and_depth/ISS_AF1to5_metrics_by_AF_minFreq{min_freq}.csv")

        region_min_freq_df = min_freq_df[min_freq_df.region == region]

        TP_lst = region_min_freq_df.num_TP
        FN_lst = region_min_freq_df.num_FN
        FP_lst = region_min_freq_df.num_FP

        precision_vals = [calc_precision(TP, FP) for TP, FP in zip(TP_lst, FP_lst)]
        # Recall goes through calc_recall, the same helper the per-depth cell
        # uses, rather than calc_precision with FN passed in place of FP.
        recall_vals = [calc_recall(TP, FN) for TP, FN in zip(TP_lst, FN_lst)]

        add_df = pd.DataFrame(
            {
                "min_freq": region_min_freq_df.min_freq.values,
                "precision": precision_vals,
                "recall": recall_vals,
                "tool": region_min_freq_df.tool.values,
            },
            columns=region_columns,
        )
        region_frames.append(add_df)

    # pd.concat raises on an empty list, so fall back to an empty table.
    if region_frames:
        region_df = pd.concat(region_frames, ignore_index=True)
    else:
        region_df = pd.DataFrame(columns=region_columns)

    region_df.to_csv(f"./data/accuracy_by_AF_and_depth/ISS_AF1to5_metrics_by_AF_{region}.csv", index=False)
        

## Depths 50-200x

In [11]:
# Depth values looped over by the "Depths 50-200x" tally cell below, in place
# of the full `depths` list. Same values as the `low_depths` assigned in the
# all-AF section; repeated here so this section can be run on its own.
low_depths = [50, 100, 200]

In [12]:
# Count TP / FN / FP calls per sample, tool and region for every minimum-AF
# threshold in `low_freqs`, restricted to the depths in `low_depths`, and
# write one CSV per threshold.
#
# H37Rv mutant summaries contribute one row per region in `regions`; L1-4
# mutant summaries are first restricted to rows with L_GT == 0 and contribute
# a "DR" row only. False positives are counted only when the tool's reported
# AF lies in [min_freq, max_freq]; TP and FN counts are not AF-filtered.
count_columns = ["tag", "depth", "tool", "region", "min_freq", "num_TP", "num_FN", "num_FP"]

for freq_i, min_freq in enumerate(low_freqs):

    print(f"min_freq: {min_freq} >>", end=" ")

    # Rows are collected in a plain list and converted to a DataFrame once at
    # the end: enlarging a DataFrame with `.loc[i] = row` copies the frame on
    # every insertion, which becomes quadratic over tens of thousands of rows.
    count_records = []

    for sim_i in range(1, 6):

        for depth in low_depths:

            # Only samples tagged with an entry of `low_freqs` from position
            # freq_i onward are included for this threshold.
            for freq in low_freqs[freq_i:]:

                for mutant_num in range(1, 11):

                    tag = f"sim{sim_i}_h37rv_mutant_{mutant_num}_{depth}_{freq}"

                    mutant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)

                    for tool in tools:

                        tool_TP = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        tool_FP_all = mutant_summary_df[(mutant_summary_df.GT == 0) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FP = tool_FP_all[(tool_FP_all[f"{tool}_AF"] >= min_freq) & (tool_FP_all[f"{tool}_AF"] <= max_freq)]

                        for region in regions:

                            region_TP = tool_TP[tool_TP.REGION_sim == region].shape[0]
                            region_FN = tool_FN[tool_FN.REGION_sim == region].shape[0]
                            region_FP = tool_FP[tool_FP.REGION_sim == region].shape[0]

                            count_records.append([tag, depth, tool, region, min_freq, region_TP, region_FN, region_FP])

                for lineage in range(1, 5):

                    tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                    mutant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
                    # Keep only rows with L_GT == 0 (presumably positions that
                    # are not lineage-defining variants; TODO confirm).
                    mutant_summary_df = mutant_summary_df[mutant_summary_df.L_GT == 0]

                    for tool in tools:

                        tool_TP = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FN = mutant_summary_df[(mutant_summary_df.GT == 1) & (mutant_summary_df[f"{tool}_found"] == 0)]

                        tool_FP_all = mutant_summary_df[(mutant_summary_df.GT == 0) & (mutant_summary_df[f"{tool}_found"] == 1)]
                        tool_FP = tool_FP_all[(tool_FP_all[f"{tool}_AF"] >= min_freq) & (tool_FP_all[f"{tool}_AF"] <= max_freq)]

                        region_TP = tool_TP[tool_TP.REGION_sim == "DR"].shape[0]
                        region_FN = tool_FN[tool_FN.REGION_sim == "DR"].shape[0]
                        region_FP = tool_FP[tool_FP.REGION_sim == "DR"].shape[0]

                        count_records.append([tag, depth, tool, "DR", min_freq, region_TP, region_FN, region_FP])

    min_freq_df = pd.DataFrame(count_records, columns=count_columns)

    min_freq_df.to_csv(f"./data/accuracy_by_AF_and_depth/ISS_AF1to5_metrics_lowDepth_by_AF_minFreq{min_freq}.csv", index=False)
    print("written")


min_freq: 0.01 >> written
min_freq: 0.02 >> written
min_freq: 0.03 >> written
min_freq: 0.04 >> written
min_freq: 0.05 >> written


In [13]:
# Turn the per-threshold AF 1-5% low-depth TP/FN/FP tallies into per-sample
# precision and recall, pooled over every min_freq threshold in `low_freqs`,
# and write one CSV per region.
region_columns = ["min_freq", "precision", "recall", "tool"]

for region in regions:

    # One frame per threshold, concatenated once with a fresh index. Seeding
    # the table with a dummy row at label 0 and calling drop(0) afterwards
    # would also delete the first real row of every threshold, because each
    # appended frame carries its own 0-based index.
    region_frames = []

    for min_freq in low_freqs:

        min_freq_df = pd.read_csv(f"./data/accuracy_by_AF_and_depth/ISS_AF1to5_metrics_lowDepth_by_AF_minFreq{min_freq}.csv")

        region_min_freq_df = min_freq_df[min_freq_df.region == region]

        TP_lst = region_min_freq_df.num_TP
        FN_lst = region_min_freq_df.num_FN
        FP_lst = region_min_freq_df.num_FP

        precision_vals = [calc_precision(TP, FP) for TP, FP in zip(TP_lst, FP_lst)]
        # Recall goes through calc_recall, the same helper the per-depth cell
        # uses, rather than calc_precision with FN passed in place of FP.
        recall_vals = [calc_recall(TP, FN) for TP, FN in zip(TP_lst, FN_lst)]

        add_df = pd.DataFrame(
            {
                "min_freq": region_min_freq_df.min_freq.values,
                "precision": precision_vals,
                "recall": recall_vals,
                "tool": region_min_freq_df.tool.values,
            },
            columns=region_columns,
        )
        region_frames.append(add_df)

    # pd.concat raises on an empty list, so fall back to an empty table.
    if region_frames:
        region_df = pd.concat(region_frames, ignore_index=True)
    else:
        region_df = pd.DataFrame(columns=region_columns)

    region_df.to_csv(f"./data/accuracy_by_AF_and_depth/ISS_AF1to5_metrics_lowDepth_by_AF_{region}.csv", index=False)
        