# [SOURCE] FP CHARACTERISTICS

In [4]:
import pandas as pd
import numpy as np
import pickle
from pandas.errors import EmptyDataError

import sys
sys.path.append("./scripts/modules")

from benchmarking_definitions import *
from regions import *

In [5]:
# Region label -> region length, taken from the constants exported by the `regions` /
# `benchmarking_definitions` star-imports.
# NOTE(review): "LM" uses sim_LM_region_length here, whereas the later
# all_region_lengths tables use LM_region_length — confirm which one is intended.
region_lengths = {
    "DR": DR_region_length,
    "HT": HT_region_length,
    "LM": sim_LM_region_length,
}

In [6]:
# Both tag lists are single-column CSVs read without a header row; the column is
# named "tag" on load and returned as a plain array.
tag_list_paths = ("./data/source/FINAL_tag_list.csv", "./data/source/FINAL_tag_list_L1234.csv")
tagsRv, tagsL1234 = (pd.read_csv(path, names=["tag"])["tag"].values for path in tag_list_paths)

# Displayed as the cell output: number of tags in each list.
(len(tagsRv), len(tagsL1234))

(2500, 1000)

In [7]:
# Directories holding one `<tag>.variant_summary.csv` per tag; every analysis loop
# below reads its input from one of these two locations.
# NOTE(review): absolute, user-specific scratch paths — the notebook only runs where they exist.
ISS_Rv_mutant_summary_source = "/n/scratch/users/s/sm624/benchmarking/variant_summaries/ISS_H37Rv"
ISS_L1234_mutant_summary_source = "/n/scratch/users/s/sm624/benchmarking/variant_summaries/ISS_L1234"

In [8]:
# Position-metrics directories for the two datasets.
# NOTE(review): neither variable is referenced in the visible part of this notebook —
# confirm they are still needed. They are also absolute, user-specific scratch paths.
ISS_Rv_position_metrics_source = "/n/scratch/users/s/sm624/FP_characteristics/ISS_H37Rv/position_metrics"
ISS_L1234_position_metrics_source = "/n/scratch/users/s/sm624/FP_characteristics/ISS_L1234/position_metrics"

# [done] L1234: FP at AF > 50%

In [6]:
# Count, for every L1234 tag and every tool, the false-positive calls reported at AF > 50%.
# A row is a false positive for a tool when GT == 0 and `{tool}_found` == 1.
#
# Rows are gathered in a list and the DataFrame is built once at the end; growing a
# frame with `.loc[new_label] = row` re-allocates it on every insertion.
num_FP_AFover50_rows = []

for tag_i, tag in enumerate(tagsL1234):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    # Only rows with L_GT == 0 are analysed
    # (presumably this removes lineage-specific true variants — TODO confirm).
    variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

    for tool in tools:

        FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        FP_AFover50 = FP_df[FP_df[f"{tool}_AF"] > 0.50].shape[0]

        num_FP_AFover50_rows.append([tag, tool, FP_AFover50])

num_FP_AFover50_df = pd.DataFrame(num_FP_AFover50_rows, columns=["tag", "tool", "num_FP"])


0 100 200 300 400 500 600 700 800 900 

In [7]:
print(num_FP_AFover50_df.shape)
num_FP_AFover50_df.head(2)

(6000, 3)


Unnamed: 0,tag,tool,num_FP
0,sim1_L1_mutant_50_0.01,FB,99
1,sim1_L1_mutant_50_0.01,LF,81


In [8]:
num_FP_AFover50_df.to_csv("./data/FP_characteristics/ISS_L1234_num_FP_AFover50.csv", index=False)

# [done] Percentage of FP calls that are SNPs out of all FPs (per tool)

## H37Rv

In [11]:
# For every H37Rv tag and tool: the percentage of the tool's false-positive calls that
# are SNPs (REF and ALT both one character long) and the tool's total FP count.
# The table is indexed by tag with one `<tool>_percent_SNP` and one `<tool>_total`
# column per tool; cells are filled in as each tag is processed.
percent_SNP_cols = [f"{tool}_percent_SNP" for tool in tools]
total_cols = [f"{tool}_total" for tool in tools]
Rv_FP_type_df = pd.DataFrame(index=pd.Index(tagsRv, name="tag"), columns=percent_SNP_cols + total_cols)

for tag_i, tag in enumerate(tagsRv):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    summary = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    is_GT0 = summary.GT == 0

    for tool in tools:

        # False positives of this tool: GT == 0 rows that the tool reported.
        tool_FPs = summary[is_GT0 & (summary[f"{tool}_found"] == 1)]
        n_FP = tool_FPs.shape[0]

        is_SNP = (tool_FPs.REF.str.len() == 1) & (tool_FPs.ALT.str.len() == 1)
        n_SNP_FP = tool_FPs[is_SNP].shape[0]

        # With no FP calls at all the percentage is undefined and stored as NaN.
        share_SNP = (n_SNP_FP/n_FP)*100 if n_FP > 0 else np.nan

        Rv_FP_type_df.loc[tag, f"{tool}_percent_SNP"] = share_SNP
        Rv_FP_type_df.loc[tag, f"{tool}_total"] = n_FP


0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

In [12]:
Rv_FP_type_df.head()

Unnamed: 0_level_0,FB_percent_SNP,LF_percent_SNP,MT_percent_SNP,PL_percent_SNP,VD_percent_SNP,VS_percent_SNP,FB_total,LF_total,MT_total,PL_total,VD_total,VS_total
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
sim1_h37rv_mutant_1_50_0.01,96.551724,,100.0,100.0,95.483871,,145,0,7,72,310,0
sim1_h37rv_mutant_2_50_0.01,94.354839,,,100.0,95.289855,,124,0,0,72,276,0
sim1_h37rv_mutant_3_50_0.01,96.240602,,100.0,100.0,95.932203,,133,0,11,63,295,0
sim1_h37rv_mutant_4_50_0.01,96.875,,100.0,100.0,95.238095,,128,0,1,68,294,0
sim1_h37rv_mutant_5_50_0.01,97.014925,,100.0,100.0,95.394737,,134,0,1,69,304,0


In [13]:
Rv_FP_type_df.to_csv("./data/FP_characteristics/ISS_H37Rv_percent_SNP.csv", index=True)

## L1234

In [6]:
# For every L1234 tag and tool: the percentage of the tool's false-positive calls
# (restricted to AF <= 50%) that are SNPs, plus the total count of those FP calls.
# The table is indexed by tag with one `<tool>_percent_SNP` and one `<tool>_total`
# column per tool.
L1234_FP_type_df = pd.DataFrame(columns=["tag"] + [f"{tool}_percent_SNP" for tool in tools] + [f"{tool}_total" for tool in tools])
L1234_FP_type_df["tag"] = tagsL1234
L1234_FP_type_df = L1234_FP_type_df.set_index("tag")

count = 0
for tag in tagsL1234:

    # Progress marker every 100 tags.
    if count % 100 == 0:
        print(count, end=" ")
    count += 1
    
    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    # Only rows with L_GT == 0 are analysed
    # (presumably this removes lineage-specific true variants — TODO confirm).
    variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

    for tool in tools:

        # False positives of this tool: GT == 0 rows that the tool reported, at AF <= 50%.
        FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        FP_df = FP_df[FP_df[f"{tool}_AF"] <= 0.50]

        tool_total = FP_df.shape[0]

        # SNP = single-character REF and single-character ALT.
        SNP_FP_df = FP_df[(FP_df.REF.str.len() == 1) & (FP_df.ALT.str.len() == 1)]
        tool_SNP_total = SNP_FP_df.shape[0]

        # A tool with no FP calls has an undefined percentage: store NaN instead of
        # raising ZeroDivisionError (same convention as the H37Rv table).
        if tool_total > 0:
            percent_SNP = (tool_SNP_total/tool_total)*100
        else:
            percent_SNP = np.nan

        L1234_FP_type_df.loc[tag, f"{tool}_percent_SNP"] = percent_SNP
        L1234_FP_type_df.loc[tag, f"{tool}_total"] = tool_total


0 100 200 300 400 500 600 700 800 900 

In [7]:
L1234_FP_type_df.head()

Unnamed: 0_level_0,FB_percent_SNP,LF_percent_SNP,MT_percent_SNP,PL_percent_SNP,VD_percent_SNP,VS_percent_SNP,FB_total,LF_total,MT_total,PL_total,VD_total,VS_total
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
sim1_L1_mutant_50_0.01,96.412037,94.134078,78.441558,97.791165,85.988201,95.918367,864,358,385,498,678,294
sim1_L2_mutant_50_0.01,95.535714,91.156463,74.587459,96.983759,86.933798,91.836735,672,294,303,431,574,245
sim1_L3_mutant_50_0.01,96.446701,93.714286,70.0,97.555556,88.651316,94.42623,788,350,370,450,608,305
sim2_L1_mutant_50_0.01,96.40914,95.856354,78.39196,97.864078,86.248132,96.527778,919,362,398,515,669,288
sim2_L2_mutant_50_0.01,95.135908,91.317365,70.819672,96.86747,87.438825,90.657439,699,334,305,415,613,289


In [8]:
L1234_FP_type_df.to_csv("./data/FP_characteristics/ISS_L1234_percent_SNP.csv", index=True)

# [done] Genome-wide FPR in H37Rv vs L1234

In [9]:
# Genome-wide false-positive rate per tag and tool, for both datasets:
# FPR = (number of FP calls) / genome_length.
# A row is a false positive for a tool when GT == 0 and `{tool}_found` == 1.
#
# Rows are gathered in a list and the DataFrame is built once at the end; growing a
# frame with `.loc[new_label] = row` re-allocates it on every insertion.
genome_FPR_rows = []

# --- H37Rv: every FP call counts.
for tag_i, tag in enumerate(tagsRv):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]

        tool_total_FP = tool_FP_df.shape[0]
        tool_FPR = tool_total_FP/genome_length

        genome_FPR_rows.append([tag, "H37Rv", tool, tool_total_FP, tool_FPR])

# --- L1-4: rows with L_GT != 0 are dropped and only FP calls at AF <= 50% count.
# The progress counter continues from the H37Rv loop.
for tag_i, tag in enumerate(tagsL1234, start=len(tagsRv)):

    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        tool_FP_df = tool_FP_df[tool_FP_df[f"{tool}_AF"] <= 0.50]

        tool_total_FP = tool_FP_df.shape[0]
        tool_FPR = tool_total_FP/genome_length

        genome_FPR_rows.append([tag, "L1-4", tool, tool_total_FP, tool_FPR])

genome_FPR_df = pd.DataFrame(genome_FPR_rows, columns=["tag", "genome", "tool", "num_FP", "FPR"])


0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 

In [10]:
print(genome_FPR_df.shape)
genome_FPR_df.head()

(21000, 5)


Unnamed: 0,tag,genome,tool,num_FP,FPR
0,sim1_h37rv_mutant_1_50_0.01,H37Rv,FB,145,3.3e-05
1,sim1_h37rv_mutant_1_50_0.01,H37Rv,LF,0,0.0
2,sim1_h37rv_mutant_1_50_0.01,H37Rv,MT,7,2e-06
3,sim1_h37rv_mutant_1_50_0.01,H37Rv,PL,72,1.6e-05
4,sim1_h37rv_mutant_1_50_0.01,H37Rv,VD,310,7e-05


In [11]:
genome_FPR_df.to_csv("./data/FP_characteristics/ISS_genome_wide_FPR.csv", index=False)

# [done] Percentage of total FP that are LM

In [12]:
# Percentage of each tool's false-positive calls whose REGION is "LM", per tag, for both
# datasets. (tag, tool) pairs with zero FP calls produce no row, because the percentage
# is undefined for them.
#
# Rows are gathered in a list and the DataFrame is built once at the end; growing a
# frame with `.loc[new_label] = row` re-allocates it on every insertion.
percent_LM_FP_rows = []

# --- H37Rv: every FP call counts.
for tag_i, tag in enumerate(tagsRv):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        tool_total_FP = tool_FP_df.shape[0]

        LM_tool_total_FP = tool_FP_df[tool_FP_df.REGION == "LM"].shape[0]

        if tool_total_FP > 0:
            percent_LM_FP = (LM_tool_total_FP/tool_total_FP)*100
            percent_LM_FP_rows.append([tag, "H37Rv", tool, tool_total_FP, LM_tool_total_FP, percent_LM_FP])

# --- L1-4: rows with L_GT != 0 are dropped and only FP calls at AF <= 50% count.
# The progress counter continues from the H37Rv loop.
for tag_i, tag in enumerate(tagsL1234, start=len(tagsRv)):

    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        tool_FP_df = tool_FP_df[tool_FP_df[f"{tool}_AF"] <= 0.50]
        tool_total_FP = tool_FP_df.shape[0]

        LM_tool_total_FP = tool_FP_df[tool_FP_df.REGION == "LM"].shape[0]

        if tool_total_FP > 0:
            percent_LM_FP = (LM_tool_total_FP/tool_total_FP)*100
            percent_LM_FP_rows.append([tag, "L1-4", tool, tool_total_FP, LM_tool_total_FP, percent_LM_FP])

percent_LM_FP_df = pd.DataFrame(percent_LM_FP_rows,
                                columns=["tag", "genome", "tool", "num_FP", "num_LM_FP", "percent_LM_FP"])


0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 

In [13]:
print(percent_LM_FP_df.shape)
percent_LM_FP_df.head(2)

(13921, 6)


Unnamed: 0,tag,genome,tool,num_FP,num_LM_FP,percent_LM_FP
0,sim1_h37rv_mutant_1_50_0.01,H37Rv,FB,145,12,8.275862
1,sim1_h37rv_mutant_1_50_0.01,H37Rv,MT,7,1,14.285714


In [14]:
percent_LM_FP_df.to_csv("./data/FP_characteristics/ISS_percent_LM_FP.csv", index=False)

# [done] Percentage of total FP at AF = 1%

In [15]:
# Percentage of each tool's false-positive calls reported at exactly AF = 0.01, per tag,
# for both datasets. (tag, tool) pairs with zero FP calls produce no row, because the
# percentage is undefined for them.
#
# NOTE(review): the AF test is an exact float comparison (`== 0.01`); it only matches
# values that parse to exactly that double. Confirm the tools' AF columns are stored
# with that precision, otherwise a tolerance (e.g. np.isclose) may be intended.
#
# Rows are gathered in a list and the DataFrame is built once at the end; growing a
# frame with `.loc[new_label] = row` re-allocates it on every insertion.
percent_AF1_FP_rows = []

# --- H37Rv: every FP call counts.
for tag_i, tag in enumerate(tagsRv):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        tool_total_FP = tool_FP_df.shape[0]

        AF1_tool_total_FP = tool_FP_df[tool_FP_df[f"{tool}_AF"] == 0.01].shape[0]

        if tool_total_FP > 0:
            percent_AF1_FP = (AF1_tool_total_FP/tool_total_FP)*100
            percent_AF1_FP_rows.append([tag, "H37Rv", tool, tool_total_FP, AF1_tool_total_FP, percent_AF1_FP])

# --- L1-4: rows with L_GT != 0 are dropped and only FP calls at AF <= 50% count.
# The progress counter continues from the H37Rv loop.
for tag_i, tag in enumerate(tagsL1234, start=len(tagsRv)):

    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        tool_FP_df = tool_FP_df[tool_FP_df[f"{tool}_AF"] <= 0.50]
        tool_total_FP = tool_FP_df.shape[0]

        AF1_tool_total_FP = tool_FP_df[tool_FP_df[f"{tool}_AF"] == 0.01].shape[0]

        if tool_total_FP > 0:
            percent_AF1_FP = (AF1_tool_total_FP/tool_total_FP)*100
            percent_AF1_FP_rows.append([tag, "L1-4", tool, tool_total_FP, AF1_tool_total_FP, percent_AF1_FP])

percent_AF1_FP_df = pd.DataFrame(percent_AF1_FP_rows,
                                 columns=["tag", "genome", "tool", "num_FP", "num_AF1_FP", "percent_AF1_FP"])


0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 

In [16]:
print(percent_AF1_FP_df.shape)
percent_AF1_FP_df.head(2)

(13921, 6)


Unnamed: 0,tag,genome,tool,num_FP,num_AF1_FP,percent_AF1_FP
0,sim1_h37rv_mutant_1_50_0.01,H37Rv,FB,145,0,0.0
1,sim1_h37rv_mutant_1_50_0.01,H37Rv,MT,7,0,0.0


In [17]:
percent_AF1_FP_df.to_csv("./data/FP_characteristics/ISS_percent_AF1_FP.csv", index=False)

# [done] FP AF distribution per LM region

## H37Rv

In [26]:
# Pool, over all H37Rv tags, each tool's false-positive allele frequencies (converted
# to percentages) into two lists: calls whose REGION is "LM" and every other call.
region_AF_distr_dict = {tool: {"LM": [], "non-LM": []} for tool in tools}

for tag_i, tag in enumerate(tagsRv):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    summary = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    is_GT0 = summary.GT == 0

    for tool in tools:

        # False positives of this tool: GT == 0 rows that the tool reported.
        tool_FPs = summary[is_GT0 & (summary[f"{tool}_found"] == 1)]
        tool_AF = tool_FPs[f"{tool}_AF"]
        in_LM = tool_FPs.REGION == "LM"

        region_AF_distr_dict[tool]["LM"].extend(tool_AF[in_LM].values*100)
        region_AF_distr_dict[tool]["non-LM"].extend(tool_AF[~in_LM].values*100)



0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

In [27]:
with open(f"./data/FP_characteristics/ISS_H37Rv_region_AF_distr.pkl", "wb") as out_f:
    pickle.dump(region_AF_distr_dict, out_f)

## L1234

In [28]:
# Pool, over all L1234 tags, each tool's false-positive allele frequencies (converted
# to percentages) into two lists: calls whose REGION is "LM" and every other call.
# Rows with L_GT != 0 are dropped first, and only FP calls at AF <= 50% are kept.
region_AF_distr_dict = {tool: {"LM": [], "non-LM": []} for tool in tools}

for tag_i, tag in enumerate(tagsL1234):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    summary = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    summary = summary[summary.L_GT == 0]
    is_GT0 = summary.GT == 0

    for tool in tools:

        AF_col = f"{tool}_AF"
        tool_FPs = summary[is_GT0 & (summary[f"{tool}_found"] == 1)]
        tool_FPs = tool_FPs[tool_FPs[AF_col] <= 0.50]

        tool_AF = tool_FPs[AF_col]
        in_LM = tool_FPs.REGION == "LM"

        region_AF_distr_dict[tool]["LM"].extend(tool_AF[in_LM].values*100)
        region_AF_distr_dict[tool]["non-LM"].extend(tool_AF[~in_LM].values*100)



0 100 200 300 400 500 600 700 800 900 

In [29]:
with open(f"./data/FP_characteristics/ISS_L1234_region_AF_distr.pkl", "wb") as out_f:
    pickle.dump(region_AF_distr_dict, out_f)

# [done] FP AF distribution per DR region

## H37Rv

In [11]:
# Pool, over all H37Rv tags, each tool's false-positive allele frequencies (converted
# to percentages) into two lists: calls whose REGION is "DR" and every other call.
DR_region_AF_distr_dict = {tool: {"DR": [], "non-DR": []} for tool in tools}

for tag_i, tag in enumerate(tagsRv):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    summary = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    is_GT0 = summary.GT == 0

    for tool in tools:

        # False positives of this tool: GT == 0 rows that the tool reported.
        tool_FPs = summary[is_GT0 & (summary[f"{tool}_found"] == 1)]
        tool_AF = tool_FPs[f"{tool}_AF"]
        in_DR = tool_FPs.REGION == "DR"

        DR_region_AF_distr_dict[tool]["DR"].extend(tool_AF[in_DR].values*100)
        DR_region_AF_distr_dict[tool]["non-DR"].extend(tool_AF[~in_DR].values*100)



0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

In [12]:
with open(f"./data/FP_characteristics/ISS_H37Rv_DR_region_AF_distr.pkl", "wb") as out_f:
    pickle.dump(DR_region_AF_distr_dict, out_f)

## L1234

In [9]:
# Pool, over all L1234 tags, each tool's false-positive allele frequencies (converted
# to percentages) into two lists: calls whose REGION is "DR" and every other call.
# Rows with L_GT != 0 are dropped first, and only FP calls at AF <= 50% are kept.
DR_region_AF_distr_dict = {tool: {"DR": [], "non-DR": []} for tool in tools}

for tag_i, tag in enumerate(tagsL1234):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    summary = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    summary = summary[summary.L_GT == 0]
    is_GT0 = summary.GT == 0

    for tool in tools:

        AF_col = f"{tool}_AF"
        tool_FPs = summary[is_GT0 & (summary[f"{tool}_found"] == 1)]
        tool_FPs = tool_FPs[tool_FPs[AF_col] <= 0.50]

        tool_AF = tool_FPs[AF_col]
        in_DR = tool_FPs.REGION == "DR"

        DR_region_AF_distr_dict[tool]["DR"].extend(tool_AF[in_DR].values*100)
        DR_region_AF_distr_dict[tool]["non-DR"].extend(tool_AF[~in_DR].values*100)



0 100 200 300 400 500 600 700 800 900 

In [10]:
with open(f"./data/FP_characteristics/ISS_L1234_DR_region_AF_distr.pkl", "wb") as out_f:
    pickle.dump(DR_region_AF_distr_dict, out_f)

# [done] FP AF distribution in DR vs LM vs OTHER regions

In [10]:
# Toy inputs for sanity-checking the DR / LM / other position masks used further down:
# two small position lists and a one-column frame of query positions.
test_LM_pos = [1, 3, 5, 9, 11, 12]
test_DR_pos = [1, 3, 7]
test_df = pd.DataFrame({"POS": [1, 3, 5, 7, 10, 12, 23]})
test_df

Unnamed: 0,POS
0,1
1,3
2,5
3,7
4,10
5,12
6,23


In [12]:
# DR_nonLM
test_df[test_df.POS.isin(test_DR_pos) & ~test_df.POS.isin(test_LM_pos)]

Unnamed: 0,POS
3,7


In [13]:
# other_nonLM
test_df[~test_df.POS.isin(test_DR_pos) & ~test_df.POS.isin(test_LM_pos)]

Unnamed: 0,POS
4,10
6,23


In [14]:
# LM
test_df[test_df.POS.isin(test_LM_pos)]

Unnamed: 0,POS
0,1
1,3
2,5
5,12


## H37Rv

In [7]:
# Pool, over all H37Rv tags, each tool's false-positive allele frequencies (converted
# to percentages) into three position-based classes:
#   "DR_nonLM"    - POS in DR_pos and not in LM_pos
#   "other_nonLM" - POS in neither DR_pos nor LM_pos
#   "LM"          - POS in LM_pos
# The DR / LM membership masks are computed once per file instead of once per class.
all_region_AF_distr_dict = {tool: {"DR_nonLM": [], "other_nonLM": [], "LM": []} for tool in tools}

for tag_i, tag in enumerate(tagsRv):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    FP_df = variant_summary_df[variant_summary_df.GT == 0]

    in_DR = FP_df.POS.isin(DR_pos)
    in_LM = FP_df.POS.isin(LM_pos)

    region_FP_dfs = {"DR_nonLM": FP_df[in_DR & ~in_LM],
                     "other_nonLM": FP_df[~in_DR & ~in_LM],
                     "LM": FP_df[in_LM]}

    for tool in tools:

        for region, region_FP_df in region_FP_dfs.items():

            # Keep only the rows this tool actually reported.
            tool_region_FP_df = region_FP_df[region_FP_df[f"{tool}_found"] == 1]
            all_region_AF_distr_dict[tool][region].extend(tool_region_FP_df[f"{tool}_AF"].values*100)



0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

In [8]:
with open(f"./data/FP_characteristics/ISS_H37Rv_FP_AF_distr_DR_LM_other.pkl", "wb") as out_f:
    pickle.dump(all_region_AF_distr_dict, out_f)

## L1234

In [9]:
# Pool, over all L1234 tags, each tool's false-positive allele frequencies (converted
# to percentages) into three position-based classes:
#   "DR_nonLM"    - POS in DR_pos and not in LM_pos
#   "other_nonLM" - POS in neither DR_pos nor LM_pos
#   "LM"          - POS in LM_pos
# Only rows with L_GT == 0 and GT == 0 are considered, and a tool's call counts only
# when it was reported at AF <= 50%.
# The DR / LM membership masks are computed once per file instead of once per class.
all_region_AF_distr_dict = {tool: {"DR_nonLM": [], "other_nonLM": [], "LM": []} for tool in tools}

for tag_i, tag in enumerate(tagsL1234):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    FP_df = variant_summary_df[(variant_summary_df.L_GT == 0) & (variant_summary_df.GT == 0)]

    in_DR = FP_df.POS.isin(DR_pos)
    in_LM = FP_df.POS.isin(LM_pos)

    region_FP_dfs = {"DR_nonLM": FP_df[in_DR & ~in_LM],
                     "other_nonLM": FP_df[~in_DR & ~in_LM],
                     "LM": FP_df[in_LM]}

    for tool in tools:

        for region, region_FP_df in region_FP_dfs.items():

            tool_region_FP_df = region_FP_df[(region_FP_df[f"{tool}_found"] == 1) & (region_FP_df[f"{tool}_AF"] <= 0.50)]
            all_region_AF_distr_dict[tool][region].extend(tool_region_FP_df[f"{tool}_AF"].values*100)



0 100 200 300 400 500 600 700 800 900 

In [10]:
with open(f"./data/FP_characteristics/ISS_L1234_FP_AF_distr_DR_LM_other.pkl", "wb") as out_f:
    pickle.dump(all_region_AF_distr_dict, out_f)

# [done] FP AF distribution per depth

In [67]:
# Pool each tool's false-positive allele frequencies (converted to percentages) by
# sequencing depth, across both datasets. For every (depth, simulation, frequency)
# combination the ten H37Rv mutants are processed first, then lineages 1-4.
# L1234 files additionally drop rows with L_GT != 0 and keep only calls at AF <= 50%.
files_seen = 0

depth_AF_distr_dict = {tool: {depth: [] for depth in depths} for tool in tools}

for depth in depths:

    for sim_i in range(1, 6):

        for freq in freqs:

            # (source directory, tag, is this an L1234 sample?) in processing order.
            batch = [(ISS_Rv_mutant_summary_source, f"sim{sim_i}_h37rv_mutant_{mutant_num}_{depth}_{freq}", False)
                     for mutant_num in range(1, 11)]
            batch += [(ISS_L1234_mutant_summary_source, f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}", True)
                      for lineage in range(1, 5)]

            for source_dir, tag, is_L1234 in batch:

                # Progress marker every 100 files.
                if files_seen % 100 == 0:
                    print(files_seen, end=" ")
                files_seen += 1

                summary = pd.read_csv(f"{source_dir}/{tag}.variant_summary.csv", low_memory=False)
                if is_L1234:
                    summary = summary[summary.L_GT == 0]

                for tool in tools:

                    AF_col = f"{tool}_AF"
                    keep = (summary.GT == 0) & (summary[f"{tool}_found"] == 1)
                    if is_L1234:
                        keep = keep & (summary[AF_col] <= 0.50)

                    depth_AF_distr_dict[tool][depth].extend(summary.loc[keep, AF_col].values*100)


0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 

In [68]:
with open(f"./data/FP_characteristics/ISS_depth_AF_distr_dict.pkl", "wb") as out_f:
    pickle.dump(depth_AF_distr_dict, out_f)

# [done] FPR per depth

In [8]:
# Genome-wide false-positive rate per tag and tool, annotated with the sequencing depth
# parsed from the tag (its second-to-last "_"-separated field, e.g.
# "sim1_h37rv_mutant_1_50_0.01" -> 50). FPR = (number of FP calls) / genome_length.
#
# Rows are gathered in a list and the DataFrame is built once at the end; growing a
# frame with `.loc[new_label] = row` re-allocates it on every insertion.
depth_FPR_rows = []

# --- H37Rv: every FP call counts.
for tag_i, tag in enumerate(tagsRv):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    depth = int(tag.split("_")[-2])

    variant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]

        tool_total_FP = tool_FP_df.shape[0]
        tool_FPR = tool_total_FP/genome_length

        depth_FPR_rows.append([tag, depth, "H37Rv", tool, tool_total_FP, tool_FPR])

# --- L1-4: rows with L_GT != 0 are dropped and only FP calls at AF <= 50% count.
# The progress counter continues from the H37Rv loop.
for tag_i, tag in enumerate(tagsL1234, start=len(tagsRv)):

    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    depth = int(tag.split("_")[-2])

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        tool_FP_df = tool_FP_df[tool_FP_df[f"{tool}_AF"] <= 0.50]

        tool_total_FP = tool_FP_df.shape[0]
        tool_FPR = tool_total_FP/genome_length

        depth_FPR_rows.append([tag, depth, "L1-4", tool, tool_total_FP, tool_FPR])

depth_FPR_df = pd.DataFrame(depth_FPR_rows, columns=["tag", "depth", "genome", "tool", "num_FP", "FPR"])


0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 

In [9]:
print(depth_FPR_df.shape)
depth_FPR_df.head(2)

(21000, 6)


Unnamed: 0,tag,depth,genome,tool,num_FP,FPR
0,sim1_h37rv_mutant_1_50_0.01,50,H37Rv,FB,145,3.3e-05
1,sim1_h37rv_mutant_1_50_0.01,50,H37Rv,LF,0,0.0


In [10]:
depth_FPR_df.to_csv("./data/FP_characteristics/ISS_FPR_by_depth.csv", index=False)

# [done] L1234: FPR per region

In [13]:
# Region label -> length, used as the FPR denominator in the next cell.
# "other" is what is left of the genome after subtracting the DR, HT and LM lengths.
all_region_lengths = dict(DR=DR_region_length, HT=HT_region_length, LM=LM_region_length)
all_region_lengths["other"] = (genome_length
                               - all_region_lengths["DR"]
                               - all_region_lengths["HT"]
                               - all_region_lengths["LM"])

In [15]:
# Per-region false-positive rate for every L1234 tag and tool:
# FPR = (FP calls whose REGION equals the region) / all_region_lengths[region].
# Rows with L_GT != 0 are dropped and only FP calls at AF <= 50% count.
#
# Rows are gathered in a list and the DataFrame is built once at the end; growing a
# frame with `.loc[new_label] = row` re-allocates it on every insertion.
L1234_region_FPR_rows = []

for tag_i, tag in enumerate(tagsL1234):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        tool_FP_df = tool_FP_df[tool_FP_df[f"{tool}_AF"] <= 0.50]

        for region in regions_other:

            region_tool_num_FP = tool_FP_df[tool_FP_df.REGION == region].shape[0]

            region_length = all_region_lengths[region]

            region_tool_FPR = region_tool_num_FP/region_length

            L1234_region_FPR_rows.append([tag, region, tool, region_tool_num_FP, region_tool_FPR])

L1234_region_FPR_df = pd.DataFrame(L1234_region_FPR_rows, columns=["tag", "region", "tool", "num_FP", "FPR"])


0 100 200 300 400 500 600 700 800 900 

In [16]:
print(L1234_region_FPR_df.shape)
L1234_region_FPR_df.head(2)

(24000, 5)


Unnamed: 0,tag,region,tool,num_FP,FPR
0,sim1_L1_mutant_50_0.01,DR,FB,0,0.0
1,sim1_L1_mutant_50_0.01,HT,FB,0,0.0


In [17]:
L1234_region_FPR_df.to_csv("./data/FP_characteristics/ISS_L1234_region_FPR.csv", index=False)

# [done] L1234: FPR in DR vs LM vs OTHER regions

In [31]:
# Position arrays: DR positions come from the keys of DR_pos, LM positions from LM_pos.
DR_pos_arr = np.array([*DR_pos.keys()])
LM_pos_arr = np.array(LM_pos)

# All integer positions 1..genome_length, and the sorted union of DR and LM positions.
genome_pos_arr = np.arange(1, genome_length+1)
DR_or_LM_pos_arr = np.union1d(DR_pos_arr, LM_pos_arr)

# DR positions that are not LM, and genome positions that are neither DR nor LM.
DR_nonLM_pos = DR_pos_arr[np.isin(DR_pos_arr, LM_pos_arr, invert=True)]
other_nonLM_pos = genome_pos_arr[np.isin(genome_pos_arr, DR_or_LM_pos_arr, invert=True)]

In [32]:
# Number of positions in each position-based class; used as the FPR denominator below.
all_region_lengths = {
    region: len(positions)
    for region, positions in (("DR_nonLM", DR_nonLM_pos),
                              ("other_nonLM", other_nonLM_pos),
                              ("LM", LM_pos))
}

In [37]:
# False-positive rate for every L1234 tag and tool in three position-based classes:
#   "DR_nonLM"    - POS in DR_pos and not in LM_pos
#   "other_nonLM" - POS in neither DR_pos nor LM_pos
#   "LM"          - POS in LM_pos
# FPR = (FP calls in the class) / all_region_lengths[class]. Only rows with L_GT == 0
# and GT == 0 are considered, and a tool's call counts only at AF <= 50%.
#
# The DR / LM membership masks are computed once per file, and rows are gathered in a
# list so the DataFrame is built once instead of being enlarged row by row.
L1234_region_FPR_rows = []

for tag_i, tag in enumerate(tagsL1234):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    FP_df = variant_summary_df[(variant_summary_df.L_GT == 0) & (variant_summary_df.GT == 0)]

    in_DR = FP_df.POS.isin(DR_pos)
    in_LM = FP_df.POS.isin(LM_pos)

    region_FP_dfs = {"DR_nonLM": FP_df[in_DR & ~in_LM],
                     "other_nonLM": FP_df[~in_DR & ~in_LM],
                     "LM": FP_df[in_LM]}

    for region, region_FP_df in region_FP_dfs.items():

        region_length = all_region_lengths[region]

        for tool in tools:

            tool_region_FP_df = region_FP_df[(region_FP_df[f"{tool}_found"] == 1) & (region_FP_df[f"{tool}_AF"] <= 0.50)]

            tool_region_num_FP = tool_region_FP_df.shape[0]

            tool_region_FPR = tool_region_num_FP/region_length

            L1234_region_FPR_rows.append([tag, region, tool, tool_region_num_FP, tool_region_FPR])

L1234_region_FPR_df = pd.DataFrame(L1234_region_FPR_rows, columns=["tag", "region", "tool", "num_FP", "FPR"])



0 100 200 300 400 500 600 700 800 900 

In [38]:
print(L1234_region_FPR_df.shape)
L1234_region_FPR_df.head(2)

(18000, 5)


Unnamed: 0,tag,region,tool,num_FP,FPR
0,sim1_L1_mutant_50_0.01,DR_nonLM,FB,0,0.0
1,sim1_L1_mutant_50_0.01,DR_nonLM,LF,0,0.0


In [39]:
L1234_region_FPR_df.to_csv("./data/FP_characteristics/ISS_L1234_FPR_DR_LM_other.csv", index=False)

# [done] H37Rv: FPR in DR vs LM vs OTHER regions

In [9]:
# Position arrays: DR positions come from the keys of DR_pos, LM positions from LM_pos.
DR_pos_arr = np.array([*DR_pos.keys()])
LM_pos_arr = np.array(LM_pos)

# All integer positions 1..genome_length, and the sorted union of DR and LM positions.
genome_pos_arr = np.arange(1, genome_length+1)
DR_or_LM_pos_arr = np.union1d(DR_pos_arr, LM_pos_arr)

# Complement masks: True where a position is NOT in the second array.
DR_outside_LM = np.isin(DR_pos_arr, LM_pos_arr, invert=True)
genome_outside_DR_LM = np.isin(genome_pos_arr, DR_or_LM_pos_arr, invert=True)

DR_nonLM_pos = DR_pos_arr[DR_outside_LM]
other_nonLM_pos = genome_pos_arr[genome_outside_DR_LM]

In [10]:
# Number of positions in each position-based class; used as the FPR denominator below.
all_region_lengths = dict(
    DR_nonLM=len(DR_nonLM_pos),
    other_nonLM=len(other_nonLM_pos),
    LM=len(LM_pos),
)

In [12]:
# False-positive rate for every H37Rv tag and tool in three position-based classes:
#   "DR_nonLM"    - POS in DR_pos and not in LM_pos
#   "other_nonLM" - POS in neither DR_pos nor LM_pos
#   "LM"          - POS in LM_pos
# FPR = (FP calls in the class) / all_region_lengths[class]; a row is a false positive
# for a tool when GT == 0 and `{tool}_found` == 1.
#
# The DR / LM membership masks are computed once per file, and rows are gathered in a
# list so the DataFrame is built once instead of being enlarged row by row.
Rv_region_FPR_rows = []

for tag_i, tag in enumerate(tagsRv):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_Rv_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    FP_df = variant_summary_df[variant_summary_df.GT == 0]

    in_DR = FP_df.POS.isin(DR_pos)
    in_LM = FP_df.POS.isin(LM_pos)

    region_FP_dfs = {"DR_nonLM": FP_df[in_DR & ~in_LM],
                     "other_nonLM": FP_df[~in_DR & ~in_LM],
                     "LM": FP_df[in_LM]}

    for region, region_FP_df in region_FP_dfs.items():

        region_length = all_region_lengths[region]

        for tool in tools:

            tool_region_FP_df = region_FP_df[region_FP_df[f"{tool}_found"] == 1]

            tool_region_num_FP = tool_region_FP_df.shape[0]

            tool_region_FPR = tool_region_num_FP/region_length

            Rv_region_FPR_rows.append([tag, region, tool, tool_region_num_FP, tool_region_FPR])

Rv_region_FPR_df = pd.DataFrame(Rv_region_FPR_rows, columns=["tag", "region", "tool", "num_FP", "FPR"])



0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 

In [13]:
print(Rv_region_FPR_df.shape)
Rv_region_FPR_df.head(2)

(45000, 5)


Unnamed: 0,tag,region,tool,num_FP,FPR
0,sim1_h37rv_mutant_1_50_0.01,DR_nonLM,FB,1,2.7e-05
1,sim1_h37rv_mutant_1_50_0.01,DR_nonLM,LF,0,0.0


In [14]:
Rv_region_FPR_df.to_csv("./data/FP_characteristics/ISS_H37Rv_FPR_DR_LM_other.csv", index=False)

# [done] L1234: FPR per region at AF ≥ 5%

In [6]:
# Region label -> length, used as the FPR denominator in the next cell.
# "other" is what is left of the genome after subtracting the DR, HT and LM lengths.
other_region_length = genome_length-DR_region_length-HT_region_length-LM_region_length
all_region_lengths = {
    "DR": DR_region_length,
    "HT": HT_region_length,
    "LM": LM_region_length,
    "other": other_region_length,
}

In [7]:
# Per-region false-positive rate for every L1234 tag and tool, restricted to FP calls
# with 5% <= AF <= 50%:
# FPR = (FP calls whose REGION equals the region) / all_region_lengths[region].
# Rows with L_GT != 0 are dropped before counting.
#
# Rows are gathered in a list and the DataFrame is built once at the end; growing a
# frame with `.loc[new_label] = row` re-allocates it on every insertion.
L1234_region_FPR_AF5andOver_rows = []

for tag_i, tag in enumerate(tagsL1234):

    # Progress marker every 100 tags.
    if tag_i % 100 == 0:
        print(tag_i, end=" ")

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        tool_FP_df = tool_FP_df[(tool_FP_df[f"{tool}_AF"] >= 0.05) & (tool_FP_df[f"{tool}_AF"] <= 0.50)]

        for region in regions_other:

            region_tool_num_FP = tool_FP_df[tool_FP_df.REGION == region].shape[0]

            region_length = all_region_lengths[region]

            region_tool_FPR = region_tool_num_FP/region_length

            L1234_region_FPR_AF5andOver_rows.append([tag, region, tool, region_tool_num_FP, region_tool_FPR])

L1234_region_FPR_AF5andOver_df = pd.DataFrame(L1234_region_FPR_AF5andOver_rows,
                                              columns=["tag", "region", "tool", "num_FP", "FPR"])


0 100 200 300 400 500 600 700 800 900 

In [8]:
# Sanity check: dimensions and first two rows of the L1-4 per-region FPR table.
print(L1234_region_FPR_AF5andOver_df.shape)
L1234_region_FPR_AF5andOver_df.head(2)

(24000, 5)


Unnamed: 0,tag,region,tool,num_FP,FPR
0,sim1_L1_mutant_50_0.01,DR,FB,0,0.0
1,sim1_L1_mutant_50_0.01,HT,FB,0,0.0


In [9]:
# Save the L1-4 per-region FPR table as CSV (row index not written).
L1234_region_FPR_AF5andOver_df.to_csv("./data/FP_characteristics/ISS_L1234_region_AF5andOver_FPR.csv", index=False)

# [done] Percentage of FP with AF < 5% and AF ≥ 5% per LM region

In [30]:
# Per tag and tool, count the false-positive calls (GT == 0 and {tool}_found == 1) inside
# and outside the "LM" region, each split at an allele frequency (AF) of 5%.
#
# The H37Rv and L1-4 simulations go through the same loop. The only difference is that,
# for L1-4, rows are first restricted to L_GT == 0 and FP calls to {tool}_AF <= 0.50.
# Rows are collected in a list and the DataFrame is built once at the end instead of
# being enlarged row by row.

def _LM_nonLM_AF5_counts(tool_FP_df, tool):
    """Return [n_FP, n_FP with AF < 0.05, n_FP with AF >= 0.05] for the rows with
    REGION == "LM", followed by the same three counts for the rows with REGION != "LM"."""
    counts = []
    for region_FP_df in (tool_FP_df[tool_FP_df.REGION == "LM"], tool_FP_df[tool_FP_df.REGION != "LM"]):
        region_AF = region_FP_df[f"{tool}_AF"]
        counts += [region_FP_df.shape[0],
                   region_FP_df[region_AF < 0.05].shape[0],
                   region_FP_df[region_AF >= 0.05].shape[0]]
    return counts


# (genome label, tags, variant-summary directory, apply the L1-4-specific filters?)
AF5_genome_sources = [("H37Rv", tagsRv, ISS_Rv_mutant_summary_source, False),
                      ("L1-4", tagsL1234, ISS_L1234_mutant_summary_source, True)]

AF5_tool_FP_stats_records = []

# the progress counter deliberately runs on across both genomes
count = 0

for genome, genome_tags, summary_source, apply_L1234_filters in AF5_genome_sources:

    for tag in genome_tags:

        if count % 100 == 0:
            print(count, end=" ")
        count += 1

        variant_summary_df = pd.read_csv(f"{summary_source}/{tag}.variant_summary.csv", low_memory=False)
        if apply_L1234_filters:
            variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

        for tool in tools:

            tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
            if apply_L1234_filters:
                tool_FP_df = tool_FP_df[tool_FP_df[f"{tool}_AF"] <= 0.50]

            AF5_tool_FP_stats_records.append([tag, genome, tool] + _LM_nonLM_AF5_counts(tool_FP_df, tool))

AF5_tool_FP_stats_df = pd.DataFrame(AF5_tool_FP_stats_records,
                                    columns=["tag", "genome", "tool", "LM_tool_FP", "LM_tool_FP_underAF5", "LM_tool_FP_AF5andOver",
                                             "nonLM_tool_FP", "nonLM_tool_FP_underAF5", "nonLM_tool_FP_AF5andOver"])


0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 

In [31]:
# Sanity check: dimensions and first rows of the LM / non-LM FP count table.
print(AF5_tool_FP_stats_df.shape)
AF5_tool_FP_stats_df.head()

(21000, 9)


Unnamed: 0,tag,genome,tool,LM_tool_FP,LM_tool_FP_underAF5,LM_tool_FP_AF5andOver,nonLM_tool_FP,nonLM_tool_FP_underAF5,nonLM_tool_FP_AF5andOver
0,sim1_h37rv_mutant_1_50_0.01,H37Rv,FB,12,12,0,133,132,1
1,sim1_h37rv_mutant_1_50_0.01,H37Rv,LF,0,0,0,0,0,0
2,sim1_h37rv_mutant_1_50_0.01,H37Rv,MT,1,0,1,6,1,5
3,sim1_h37rv_mutant_1_50_0.01,H37Rv,PL,6,5,1,66,49,17
4,sim1_h37rv_mutant_1_50_0.01,H37Rv,VD,22,22,0,288,287,1


In [32]:
# Save the LM / non-LM FP count table as CSV (row index not written).
AF5_tool_FP_stats_df.to_csv("./data/FP_characteristics/ISS_AF5_tool_FP_stats.csv", index=False)

# [done] FPR by lineage background L1-4

In [34]:
# Genome-wide false-positive rate per L1-4 tag and tool, labelled with the lineage
# parsed from the tag. FP = GT == 0 and {tool}_found == 1, restricted to
# {tool}_AF <= 0.50 on rows with L_GT == 0; FPR = number of FP / genome_length.
#
# Rows are collected in a list and the DataFrame is built once at the end instead of
# being enlarged row by row.
L1234_lineage_FPR_records = []

for count, tag in enumerate(tagsL1234):

    # progress indicator: one number every 100 tags
    if count % 100 == 0:
        print(count, end=" ")

    # tags look like "sim1_L1_mutant_50_0.01": the second field carries the lineage number
    lineage = "Lineage " + tag.split("_")[1].strip("L")

    variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
    variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

    for tool in tools:

        tool_FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]
        tool_FP_df = tool_FP_df[tool_FP_df[f"{tool}_AF"] <= 0.50]

        tool_num_FP = tool_FP_df.shape[0]
        tool_FPR = tool_num_FP/genome_length

        L1234_lineage_FPR_records.append([tag, lineage, tool, tool_num_FP, tool_FPR])

L1234_lineage_FPR_df = pd.DataFrame(L1234_lineage_FPR_records,
                                    columns=["tag", "lineage", "tool", "num_FP", "FPR"])


0 100 200 300 400 500 600 700 800 900 

In [35]:
# Sanity check: dimensions and first two rows of the per-lineage FPR table.
print(L1234_lineage_FPR_df.shape)
L1234_lineage_FPR_df.head(2)

(6000, 5)


Unnamed: 0,tag,lineage,tool,num_FP,FPR
0,sim1_L1_mutant_50_0.01,Lineage 1,FB,864,0.000196
1,sim1_L1_mutant_50_0.01,Lineage 1,LF,358,8.1e-05


In [36]:
# Save the per-lineage FPR table as CSV (row index not written).
L1234_lineage_FPR_df.to_csv("./data/FP_characteristics/ISS_L1234_lineage_FPR.csv", index=False)

# [done] L1234: SNV error model characteristics

## Coverage distribution (by simulated coverage)

In [37]:
# Pool the COV_RATIO values of the calls each tool made, over all L1-4 simulations:
#   "TP": GT == 1 and {tool}_found == 1
#   "FP": GT == 0 and {tool}_found == 1 and {tool}_AF <= 0.50
# (rows with L_GT != 0 are dropped first), then pickle the result.
#
# Each position-metrics file is read once and shared by all tools, rather than being
# re-read once per tool. Tags are visited in the same order as before, so the pooled
# lists are unchanged; only the progress printout differs (one counter, not one per tool).
COV_distr = {t:{"TP":[], "FP":[]} for t in tools}

count = 0
for sim_i in range(1, 6):
    for lineage in range(1, 5):
        for depth in depths:
            for freq in freqs:

                tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                if count % 100 == 0:
                    print(count, end=" ")
                count += 1

                position_metrics_df = pd.read_csv(f"{ISS_L1234_position_metrics_source}/{tag}.position_metrics.csv", low_memory=False)
                position_metrics_df = position_metrics_df[position_metrics_df.L_GT == 0]

                for tool in tools:

                    TP_df = position_metrics_df[(position_metrics_df.GT == 1) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = position_metrics_df[(position_metrics_df.GT == 0) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = FP_df[FP_df[f"{tool}_AF"] <= 0.50]

                    COV_distr[tool]["TP"].extend(TP_df["COV_RATIO"].values)
                    COV_distr[tool]["FP"].extend(FP_df["COV_RATIO"].values)

print()

with open("./data/FP_characteristics/ISS_L1234_COV_distribution.pkl", "wb") as out_f:
    pickle.dump(COV_distr, out_f)

FreeBayes
0 100 200 300 400 500 600 700 800 900 
LoFreq
0 100 200 300 400 500 600 700 800 900 
Mutect2
0 100 200 300 400 500 600 700 800 900 
Pilon
0 100 200 300 400 500 600 700 800 900 
VarDict
0 100 200 300 400 500 600 700 800 900 
VarScan2
0 100 200 300 400 500 600 700 800 900 



## Strand bias

In [38]:
# Tools included in the strand-bias distribution below
# (NOTE(review): presumably the tools that have a {tool}_STRAND_BIAS column -- confirm).
SB_tools = ["FB", "MT", "VD", "VS"]

In [39]:
# Pool the {tool}_STRAND_BIAS values of the calls made by each tool in SB_tools, over all
# L1-4 simulations:
#   "TP": GT == 1 and {tool}_found == 1
#   "FP": GT == 0 and {tool}_found == 1 and {tool}_AF <= 0.50
# (rows with L_GT != 0 are dropped first), then pickle the result.
# The dict has a key for every tool in `tools`; tools outside SB_tools keep empty lists.
#
# Each position-metrics file is read once and shared by all tools, rather than being
# re-read once per tool. Tags are visited in the same order as before, so the pooled
# lists are unchanged; only the progress printout differs (one counter, not one per tool).
SB_distr = {t:{"TP":[], "FP":[]} for t in tools}

count = 0
for sim_i in range(1, 6):
    for lineage in range(1, 5):
        for depth in depths:
            for freq in freqs:

                tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                if count % 100 == 0:
                    print(count, end=" ")
                count += 1

                position_metrics_df = pd.read_csv(f"{ISS_L1234_position_metrics_source}/{tag}.position_metrics.csv", low_memory=False)
                position_metrics_df = position_metrics_df[position_metrics_df.L_GT == 0]

                for tool in SB_tools:

                    TP_df = position_metrics_df[(position_metrics_df.GT == 1) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = position_metrics_df[(position_metrics_df.GT == 0) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = FP_df[FP_df[f"{tool}_AF"] <= 0.50]

                    SB_distr[tool]["TP"].extend(TP_df[f"{tool}_STRAND_BIAS"].values)
                    SB_distr[tool]["FP"].extend(FP_df[f"{tool}_STRAND_BIAS"].values)

print()

with open("./data/FP_characteristics/ISS_L1234_SB_distribution.pkl", "wb") as out_f:
    pickle.dump(SB_distr, out_f)

FreeBayes
0 100 200 300 400 500 600 700 800 900 
Mutect2
0 100 200 300 400 500 600 700 800 900 
VarDict
0 100 200 300 400 500 600 700 800 900 
VarScan2
0 100 200 300 400 500 600 700 800 900 



## Variant base quality

In [40]:
# Pool the MEAN_ALT_BQ values of the calls each tool made, over all L1-4 simulations:
#   "TP": GT == 1 and {tool}_found == 1
#   "FP": GT == 0 and {tool}_found == 1 and {tool}_AF <= 0.50
# (rows with L_GT != 0 are dropped first), then pickle the result.
#
# Each position-metrics file is read once and shared by all tools, rather than being
# re-read once per tool. Tags are visited in the same order as before, so the pooled
# lists are unchanged; only the progress printout differs (one counter, not one per tool).
vBQ_distr = {t:{"TP":[], "FP":[]} for t in tools}

count = 0
for sim_i in range(1, 6):
    for lineage in range(1, 5):
        for depth in depths:
            for freq in freqs:

                tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                if count % 100 == 0:
                    print(count, end=" ")
                count += 1

                position_metrics_df = pd.read_csv(f"{ISS_L1234_position_metrics_source}/{tag}.position_metrics.csv", low_memory=False)
                position_metrics_df = position_metrics_df[position_metrics_df.L_GT == 0]

                for tool in tools:

                    TP_df = position_metrics_df[(position_metrics_df.GT == 1) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = position_metrics_df[(position_metrics_df.GT == 0) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = FP_df[FP_df[f"{tool}_AF"] <= 0.50]

                    vBQ_distr[tool]["TP"].extend(TP_df["MEAN_ALT_BQ"].values)
                    vBQ_distr[tool]["FP"].extend(FP_df["MEAN_ALT_BQ"].values)

print()

with open("./data/FP_characteristics/ISS_L1234_vBQ_distribution.pkl", "wb") as out_f:
    pickle.dump(vBQ_distr, out_f)

FreeBayes
0 100 200 300 400 500 600 700 800 900 
LoFreq
0 100 200 300 400 500 600 700 800 900 
Mutect2
0 100 200 300 400 500 600 700 800 900 
Pilon
0 100 200 300 400 500 600 700 800 900 
VarDict
0 100 200 300 400 500 600 700 800 900 
VarScan2
0 100 200 300 400 500 600 700 800 900 



## Soft-clipped bases

In [41]:
# Pool the CLIPPED_BASES_RATIO values of the calls each tool made, over all L1-4 simulations:
#   "TP": GT == 1 and {tool}_found == 1
#   "FP": GT == 0 and {tool}_found == 1 and {tool}_AF <= 0.50
# (rows with L_GT != 0 are dropped first), then pickle the result.
#
# Each position-metrics file is read once and shared by all tools, rather than being
# re-read once per tool. Tags are visited in the same order as before, so the pooled
# lists are unchanged; only the progress printout differs (one counter, not one per tool).
SC_distr = {t:{"TP":[], "FP":[]} for t in tools}

count = 0
for sim_i in range(1, 6):
    for lineage in range(1, 5):
        for depth in depths:
            for freq in freqs:

                tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                if count % 100 == 0:
                    print(count, end=" ")
                count += 1

                position_metrics_df = pd.read_csv(f"{ISS_L1234_position_metrics_source}/{tag}.position_metrics.csv", low_memory=False)
                position_metrics_df = position_metrics_df[position_metrics_df.L_GT == 0]

                for tool in tools:

                    TP_df = position_metrics_df[(position_metrics_df.GT == 1) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = position_metrics_df[(position_metrics_df.GT == 0) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = FP_df[FP_df[f"{tool}_AF"] <= 0.50]

                    SC_distr[tool]["TP"].extend(TP_df["CLIPPED_BASES_RATIO"].values)
                    SC_distr[tool]["FP"].extend(FP_df["CLIPPED_BASES_RATIO"].values)

print()

with open("./data/FP_characteristics/ISS_L1234_SC_distribution.pkl", "wb") as out_f:
    pickle.dump(SC_distr, out_f)

FreeBayes
0 100 200 300 400 500 600 700 800 900 
LoFreq
0 100 200 300 400 500 600 700 800 900 
Mutect2
0 100 200 300 400 500 600 700 800 900 
Pilon
0 100 200 300 400 500 600 700 800 900 
VarDict
0 100 200 300 400 500 600 700 800 900 
VarScan2
0 100 200 300 400 500 600 700 800 900 



## Discordant reads

In [42]:
# Pool the DISCORDANT_READS_RATIO values of the calls each tool made, over all L1-4 simulations:
#   "TP": GT == 1 and {tool}_found == 1
#   "FP": GT == 0 and {tool}_found == 1 and {tool}_AF <= 0.50
# (rows with L_GT != 0 are dropped first), then pickle the result.
# ("DR" in this name refers to discordant reads, not to the "DR" region key used elsewhere.)
#
# Each position-metrics file is read once and shared by all tools, rather than being
# re-read once per tool. Tags are visited in the same order as before, so the pooled
# lists are unchanged; only the progress printout differs (one counter, not one per tool).
DR_distr = {t:{"TP":[], "FP":[]} for t in tools}

count = 0
for sim_i in range(1, 6):
    for lineage in range(1, 5):
        for depth in depths:
            for freq in freqs:

                tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                if count % 100 == 0:
                    print(count, end=" ")
                count += 1

                position_metrics_df = pd.read_csv(f"{ISS_L1234_position_metrics_source}/{tag}.position_metrics.csv", low_memory=False)
                position_metrics_df = position_metrics_df[position_metrics_df.L_GT == 0]

                for tool in tools:

                    TP_df = position_metrics_df[(position_metrics_df.GT == 1) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = position_metrics_df[(position_metrics_df.GT == 0) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = FP_df[FP_df[f"{tool}_AF"] <= 0.50]

                    DR_distr[tool]["TP"].extend(TP_df["DISCORDANT_READS_RATIO"].values)
                    DR_distr[tool]["FP"].extend(FP_df["DISCORDANT_READS_RATIO"].values)

print()

with open("./data/FP_characteristics/ISS_L1234_DR_distribution.pkl", "wb") as out_f:
    pickle.dump(DR_distr, out_f)

FreeBayes
0 100 200 300 400 500 600 700 800 900 
LoFreq
0 100 200 300 400 500 600 700 800 900 
Mutect2
0 100 200 300 400 500 600 700 800 900 
Pilon
0 100 200 300 400 500 600 700 800 900 
VarDict
0 100 200 300 400 500 600 700 800 900 
VarScan2
0 100 200 300 400 500 600 700 800 900 



## Mapping quality

In [43]:
# Tools included in the mapping-quality distribution below
# (NOTE(review): presumably the tools that have a {tool}_MQ column -- confirm).
MQ_tools = ["FB", "MT", "PL", "VD"]

In [44]:
# Pool the {tool}_MQ values of the calls made by each tool in MQ_tools, over all L1-4 simulations:
#   "TP": GT == 1 and {tool}_found == 1
#   "FP": GT == 0 and {tool}_found == 1 and {tool}_AF <= 0.50
# (rows with L_GT != 0 are dropped first), then pickle the result.
# The dict has a key for every tool in `tools`; tools outside MQ_tools keep empty lists.
#
# Each position-metrics file is read once and shared by all tools, rather than being
# re-read once per tool. Tags are visited in the same order as before, so the pooled
# lists are unchanged; only the progress printout differs (one counter, not one per tool).
MQ_distr = {t:{"TP":[], "FP":[]} for t in tools}

count = 0
for sim_i in range(1, 6):
    for lineage in range(1, 5):
        for depth in depths:
            for freq in freqs:

                tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                if count % 100 == 0:
                    print(count, end=" ")
                count += 1

                position_metrics_df = pd.read_csv(f"{ISS_L1234_position_metrics_source}/{tag}.position_metrics.csv", low_memory=False)
                position_metrics_df = position_metrics_df[position_metrics_df.L_GT == 0]

                for tool in MQ_tools:

                    TP_df = position_metrics_df[(position_metrics_df.GT == 1) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = position_metrics_df[(position_metrics_df.GT == 0) & (position_metrics_df[f"{tool}_found"] == 1)]
                    FP_df = FP_df[FP_df[f"{tool}_AF"] <= 0.50]

                    MQ_distr[tool]["TP"].extend(TP_df[f"{tool}_MQ"].values)
                    MQ_distr[tool]["FP"].extend(FP_df[f"{tool}_MQ"].values)

print()

with open("./data/FP_characteristics/ISS_L1234_MQ_distribution.pkl", "wb") as out_f:
    pickle.dump(MQ_distr, out_f)

FreeBayes
0 100 200 300 400 500 600 700 800 900 
Mutect2
0 100 200 300 400 500 600 700 800 900 
Pilon
0 100 200 300 400 500 600 700 800 900 
VarDict
0 100 200 300 400 500 600 700 800 900 



## (dropped) AO distribution (by simulated coverage)

In [None]:
# For each simulated depth, pool the {tool}_AO values of the calls each tool made and
# write one pickle per depth:
#   "TP": GT == 1 and {tool}_found == 1
#   "FP": GT == 0 and {tool}_found == 1   (no AF cap is applied in this cell)
# Rows with L_GT != 0 are dropped first.
#
# Each variant-summary file is read once and shared by all tools, rather than being
# re-read once per tool. The pooled lists are unchanged; only the progress printout differs.
for depth in depths:

    print(f"{depth}x")

    depth_AO_distr = {t:{"TP":[], "FP":[]} for t in tools}

    count = 0
    for sim_i in range(1, 6):
        for lineage in range(1, 5):
            for freq in freqs:

                tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                if count % 100 == 0:
                    print(count, end=" ")
                count += 1

                variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
                variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

                for tool in tools:

                    TP_df = variant_summary_df[(variant_summary_df.GT == 1) & (variant_summary_df[f"{tool}_found"] == 1)]
                    FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]

                    depth_AO_distr[tool]["TP"].extend(TP_df[f"{tool}_AO"].values)
                    depth_AO_distr[tool]["FP"].extend(FP_df[f"{tool}_AO"].values)

    print()

    with open(f"./data/FP_characteristics/ISS_L1234_AO_distribution_{depth}x.pkl", "wb") as out_f:
        pickle.dump(depth_AO_distr, out_f)

## (dropped) FP AF distribution (by simulated coverage and region)

In [None]:
# For each simulated depth, pool the {tool}_AF values of the false-positive calls
# (GT == 0 and {tool}_found == 1, on rows with L_GT == 0) per tool and per region,
# and write one pickle per depth. No AF cap is applied in this cell.
#
# Each variant-summary file is read once and shared by all tools, rather than being
# re-read once per tool. The pooled lists are unchanged; only the progress printout differs.
for depth in depths:

    print(f"{depth}x")

    depth_AF_distr = {t:{r:[] for r in regions_other} for t in tools}

    count = 0
    for sim_i in range(1, 6):
        for lineage in range(1, 5):
            for freq in freqs:

                tag = f"sim{sim_i}_L{lineage}_mutant_{depth}_{freq}"

                if count % 100 == 0:
                    print(count, end=" ")
                count += 1

                variant_summary_df = pd.read_csv(f"{ISS_L1234_mutant_summary_source}/{tag}.variant_summary.csv", low_memory=False)
                variant_summary_df = variant_summary_df[variant_summary_df.L_GT == 0]

                for tool in tools:

                    FP_df = variant_summary_df[(variant_summary_df.GT == 0) & (variant_summary_df[f"{tool}_found"] == 1)]

                    for region in regions_other:
                        region_df = FP_df[FP_df.REGION == region]
                        depth_AF_distr[tool][region].extend(region_df[f"{tool}_AF"].values)

    print()

    with open(f"./data/FP_characteristics/ISS_L1234_FP_AF_distribution_by_region_{depth}x.pkl", "wb") as out_f:
        pickle.dump(depth_AF_distr, out_f)

# INDELS (FB only)

## H37Rv

In [None]:
# Location of the per-tag H37Rv INDEL tables ({tag}.INDELs.csv) read by the cells below.
# NOTE(review): absolute, user-specific scratch path -- must be adjusted on another system.
Rv_parent_dir = "/n/scratch/users/s/sm624/FP_characteristics/ISS_H37Rv"
Rv_INDEL_source_dir = f"{Rv_parent_dir}/INDELs"

### Depth

In [None]:
# Pool FB_DP divided by the simulated depth (parsed from the tag) for the INDEL rows with
# GT == 1 ("TP") and GT == 0 ("FP"), over all H37Rv tags.
# The unused `df_i` and `freq` assignments of the earlier version were removed.
Rv_FB_DP_distr = {"TP":[], "FP":[]}

for count, tag in enumerate(tagsRv):

    # progress indicator: one number every 100 tags
    if count % 100 == 0:
        print(count, end=" ")

    # tags look like "sim1_h37rv_mutant_1_50_0.01": the second-to-last field is the depth
    depth = int(tag.split("_")[-2])

    # these are the INDELs that have been pre-filtered for non-L_GT and FB_found==1
    INDEL_df = pd.read_csv(f"{Rv_INDEL_source_dir}/{tag}.INDELs.csv", low_memory=False)

    TP_df = INDEL_df[INDEL_df.GT == 1]
    FP_df = INDEL_df[INDEL_df.GT == 0]

    Rv_FB_DP_distr["TP"].extend(TP_df["FB_DP"].values/depth)
    Rv_FB_DP_distr["FP"].extend(FP_df["FB_DP"].values/depth)

    

In [None]:
# Save the pooled TP/FP depth-ratio lists for the H37Rv FreeBayes INDEL calls.
with open(f"./data/FP_characteristics/ISS_H37Rv_FB_INDEL_DP_distribution.pkl", "wb") as out_f:
    pickle.dump(Rv_FB_DP_distr, out_f)

### Mapping quality

In [None]:
# Pool the FB_MQ values of the INDEL rows with GT == 1 ("TP") and GT == 0 ("FP"),
# over all H37Rv tags.
# The unused `df_i`, `depth` and `freq` assignments of the earlier version were removed.
Rv_FB_MQ_distr = {"TP":[], "FP":[]}

for count, tag in enumerate(tagsRv):

    # progress indicator: one number every 100 tags
    if count % 100 == 0:
        print(count, end=" ")

    # these are the INDELs that have been pre-filtered for non-L_GT and FB_found==1
    INDEL_df = pd.read_csv(f"{Rv_INDEL_source_dir}/{tag}.INDELs.csv", low_memory=False)

    TP_df = INDEL_df[INDEL_df.GT == 1]
    FP_df = INDEL_df[INDEL_df.GT == 0]

    Rv_FB_MQ_distr["TP"].extend(TP_df["FB_MQ"].values)
    Rv_FB_MQ_distr["FP"].extend(FP_df["FB_MQ"].values)

    

In [None]:
# Save the pooled TP/FP mapping-quality lists for the H37Rv FreeBayes INDEL calls.
with open(f"./data/FP_characteristics/ISS_H37Rv_FB_INDEL_MQ_distribution.pkl", "wb") as out_f:
    pickle.dump(Rv_FB_MQ_distr, out_f)