# Preliminary Data Analysis
### 1. Variant Counts

This notebook contains the code that generated Table 1 in the original manuscript.

This code groups the variant lists (VCF files) submitted by the students by pipeline configuration and counts the number of variants in each list. It also adds the variant count of the high-confidence variant list to every pipeline, saves the resulting table to an Excel file, and displays it on the screen.


In [6]:
%load_ext autoreload
%autoreload 2

from utils import *
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Base directory for all inputs: the pipeline sub-directories and the
# high-confidence VCF are resolved relative to the current working directory,
# so the notebook must be launched from the folder that contains them.
DIRECTORY = os.getcwd()

In [8]:
# Pipeline naming scheme: <caller>_<recalibration>_<aligner>
#   caller:        "mutect", "strelka", or "ss" (somaticsniper) - the variant caller used
#   recalibration: "yb" = base recalibration step present, "nb" = absent
#   aligner:       "bwa" or "bowtie" - the aligner used
pipelines = [
    "mutect_yb_bwa",
    "mutect_yb_bowtie",
    "mutect_nb_bwa",
    "mutect_nb_bowtie",
    "strelka_yb_bwa",
    "strelka_yb_bowtie",
    "strelka_nb_bwa",
    "strelka_nb_bowtie",
    "ss_nb_bowtie",
    "ss_nb_bwa",
    "ss_yb_bowtie",
    "ss_yb_bwa",
]

# Keyed by pipeline name; starts empty and is populated by a later cell.
all_vcfs = {}

# "hc" stands for the high-confidence variant list used as the reference.
hc_vcf = os.path.join(DIRECTORY, "hc_bed_filtered.recode.vcf")

In [9]:
# Look up the submitted VCFs for every pipeline; each pipeline has its own
# sub-directory under DIRECTORY named after the pipeline key.
# NOTE(review): read_all_vcfs comes from utils; downstream code treats each
# returned entry as a file name inside that sub-directory - confirm.
for pipeline in pipelines:
    all_vcfs[pipeline] = read_all_vcfs(os.path.join(DIRECTORY, pipeline))

In [10]:
def get_group(filename):
    """Return the group label encoded at the start of a VCF file name.

    The label is the text before the first underscore, with its first
    character upper-cased and the rest left untouched
    (e.g. "g10_mutect.vcf" -> "G10").

    Parameters
    ----------
    filename : str
        File name whose first underscore-separated token is the group.

    Returns
    -------
    str
        The group label, or an empty string when the name is empty or
        starts with an underscore.
    """
    group = filename.split("_")[0]
    # Slice instead of indexing so an empty prefix yields "" rather than
    # raising IndexError.
    return group[:1].upper() + group[1:]


def get_pipeline(pipeline_name):
    """Turn an internal pipeline key into the label shown in the table.

    Underscores become hyphens and each known component is swapped for its
    display spelling, e.g. "mutect_yb_bwa" -> "Mutect-YB-BWA".
    """
    # Plain substring substitutions, applied one after another in this order.
    substitutions = (
        ("_", "-"),
        ("bwa", "BWA"),
        ("bowtie", "Bowtie"),
        ("ss", "SS"),
        ("mutect", "Mutect"),
        ("strelka", "Strelka"),
        ("yb", "YB"),
        ("nb", "NB"),
    )

    label = pipeline_name
    for old, new in substitutions:
        label = label.replace(old, new)
    return label



In [11]:
# Create a table that contains the variant counts for each pipeline and group

# The high-confidence list is the same for every pipeline, so parse it once
# instead of once per submitted file. hc_vcf was already built from DIRECTORY,
# so it is used as-is rather than being joined with DIRECTORY a second time.
true_vcf = parse_vcf(hc_vcf)
hc_variant_count = len(true_vcf)

records = []
for pipeline in pipelines:
    pipeline_label = get_pipeline(pipeline)
    vcf_names = all_vcfs[pipeline]

    # One row per submitted VCF: the number of parsed entries in that file.
    for vcf_name in vcf_names:
        vcf_file = parse_vcf(os.path.join(DIRECTORY, pipeline, vcf_name))
        records.append({
            'Pipeline': pipeline_label,
            'Group': get_group(vcf_name),
            'Variant Count': len(vcf_file),
        })

    # A single reference row per pipeline (previously one duplicate per file,
    # collapsed by aggfunc='first'). Pipelines without any submission are
    # skipped so the pivot keeps exactly the rows it had before.
    if vcf_names:
        records.append({
            'Pipeline': pipeline_label,
            'Group': 'High Confidence',
            'Variant Count': hc_variant_count,
        })

variant_counts = pd.DataFrame(records, columns=['Pipeline', 'Group', 'Variant Count'])

# Pipelines as rows, groups as columns. aggfunc='first' keeps only the first
# count if a group ever has more than one file for the same pipeline.
pivot_df = variant_counts.pivot_table(index='Pipeline', columns='Group', values='Variant Count', aggfunc='first')
pivot_df.to_excel("variant_counts.xlsx")
pivot_df


Group,G1,G10,G11,G2,G3,G4,G5,G6,G7,G8,G9,High Confidence
Pipeline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Mutect-NB-BWA,926,435,235,715,518,4103,771,745,1078,745,275,1161
Mutect-NB-Bowtie,880,1819,381,892,668,3566,385,485,952,485,233,1161
Mutect-YB-BWA,1052,489,144,1044,851,2735,838,663,1050,663,322,1161
Mutect-YB-Bowtie,637,1830,246,588,734,2836,939,477,1004,477,324,1161
SS-NB-BWA,2889,8480,8480,2889,9397,2889,2889,2889,2889,2889,2889,1161
SS-NB-Bowtie,2406,6137,6858,2406,6858,2406,2406,2406,2406,2406,2406,1161
SS-YB-BWA,2313,7492,7492,2313,8290,1890,2313,2313,2313,2312,2312,1161
SS-YB-Bowtie,1752,4934,5521,1752,5521,1752,1752,1752,1752,1752,1752,1161
Strelka-NB-BWA,2204,2832,2898,2204,3169,108864,2204,2204,2203,2201,2201,1161
Strelka-NB-Bowtie,1313,107672,1839,1313,1786,90416,1313,1313,1313,1313,1313,1161
