In [10]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs so edits to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:
import re, json, os, logging, io, pprint, subprocess

import pandas as pd

from cromwell_tools.cromwell_api import CromwellAPI as cwt
from cromwell_tools import cromwell_auth
from google.cloud import storage

In [13]:
# Path to the service-account key file, exposed through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
# setdefault (rather than plain assignment) keeps a value that is already
# exported in the environment, so the notebook can run on another machine or
# under another account without editing this cell; the original path remains
# the fallback when nothing is set.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', "/home/rcarter/.google/bioskryb-81ce35d92471.json")

Here we acquire and parse VCFs from Sentieon and DRAGEN for comparison to GATK.

### Copy Sentieon's DNAscope and DRAGEN's VCFs over to scratch for analysis

In [13]:
# List every object under the Sentieon DNAScope output prefix whose name contains
# ".vcf.gz"; the trailing "*" also matches names with a further suffix
# (presumably index files). The output lines of gsutil are captured as a list.
dnascoped_vcfs = !gsutil ls gs://bioskryb-work-d8f6s9/ComparingGatkSentieonDRAGEN/data/01-Sentieon_output/DNAScope/**.vcf.gz*

In [None]:
for _dnascope_vcf in dnascoped_vcfs:
    sample_name = _dnascope_vcf.split("/")[-3]
    output_location = "scratch/{}.{}".format(sample_name, os.path.basename(_dnascope_vcf))
    !gsutil cp $_dnascope_vcf $output_location

In [17]:
# List every object under the DRAGEN output prefix whose name contains ".vcf.gz";
# the trailing "*" also matches names with a further suffix (presumably index
# files). The output lines of gsutil are captured as a list.
dragen_vcfs = !gsutil ls gs://bioskryb-work-d8f6s9/ComparingGatkSentieonDRAGEN/data/02-DRAGEN_illumina/**.vcf.gz*

In [None]:
for _dragen_vcf in dragen_vcfs:
    sample_name = _dragen_vcf.split("/")[-3]
    output_location = "scratch/{}.{}".format(sample_name, os.path.basename(_dragen_vcf))
    !gsutil cp $_dragen_vcf $output_location

### Split VCFs into SNPs and indels and convert DRAGEN results from GRCh37 to GRCh38

In [36]:
# Collect the base names (directory part stripped by `basename`) of the DNAscope
# VCFs present in scratch/, then display the list as the cell output.
local_sentieon_dnascope_vcfs = !ls scratch/*dnascope.vcf.gz | xargs -i basename {}
local_sentieon_dnascope_vcfs

['JW-11_merged_n450x10e6.dnascope.vcf.gz',
 'JW-23_merged_n450x10e6.dnascope.vcf.gz',
 'JW-31_merged_n450x10e6.dnascope.vcf.gz']

In [None]:
for _local_dnascope_vcf in local_sentieon_dnascope_vcfs:
    snp_output = re.sub(".vcf.gz", ".snp.vcf.gz", _local_dnascope_vcf)
    indel_output = re.sub(".vcf.gz", ".indel.vcf.gz", _local_dnascope_vcf)
    gatk_command = '"cd /home && gatk SplitVcfs --STRICT false -I {} --INDEL_OUTPUT {} --SNP_OUTPUT {}"'.format(_local_dnascope_vcf, indel_output, snp_output)
    !docker run -v /home/rcarter/ComparingGatkSentieonDRAGEN/scratch/:/home -ti gcr.io/bioskryb/gatk:4.1.3.0 /bin/bash -c $gatk_command

Download the chain file for CrossMap and run it using the py2 environment.

In [38]:
# Download the UCSC hg19 -> hg38 liftOver chain file into scratch/.
# NOTE(review): the LiftoverVcf cell later in this notebook passes
# "-C b37ToHg38.over.chain", not this file - confirm which chain file each
# conversion step is meant to use and where the b37 chain comes from.
!wget -O scratch/hg19ToHg38.over.chain.gz http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz

--2020-03-22 18:18:57--  http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163
Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 227698 (222K) [application/x-gzip]
Saving to: ‘scratch/hg19ToHg38.over.chain.gz’


2020-03-22 18:18:58 (27.4 MB/s) - ‘scratch/hg19ToHg38.over.chain.gz’ saved [227698/227698]



The following was run on the command line because it required a different environment:

In [40]:
# Collect the base names of the DRAGEN VCFs in scratch/, dropping any name that
# contains "GRCh38" so files produced by the liftover step are not picked up as
# inputs again. The second line displays the list as the cell output.
local_dragen_vcfs = !ls scratch/*dragen*.vcf.gz | xargs -i basename {} | grep -v GRCh38
local_dragen_vcfs

['JW-11_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW_11_merged.INV137904-EId2.hard-filtered.vcf.gz',
 'JW-11_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW_11_merged.INV137904-EId2.vcf.gz',
 'JW-23_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW_23_merged.INV137904-EId4.hard-filtered.vcf.gz',
 'JW-23_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW_23_merged.INV137904-EId4.vcf.gz',
 'JW-31_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW-31_merged.INV137904-EId6.hard-filtered.vcf.gz',
 'JW-31_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW-31_merged.INV137904-EId6.vcf.gz']

In [None]:
for _local_dragen_vcf in local_dragen_vcfs:
    grch38_output = re.sub(".vcf.gz", ".GRCh38.vcf.gz", _local_dragen_vcf)
    grch38_reject_output = re.sub(".vcf.gz", ".GRCh38.reject.vcf.gz", _local_dragen_vcf)
    liftover_command = '"cd /home && gatk LiftoverVcf -C b37ToHg38.over.chain -I {} -O {} -R /home2/Homo_sapiens_assembly38.fasta --REJECT {}"'.format(_local_dragen_vcf, grch38_output, grch38_reject_output)
    !docker run -ti -v /home/rcarter/ComparingGatkSentieonDRAGEN/scratch:/home -v /home/rcarter:/home2 gcr.io/bioskryb/gatk:4.1.3.0 /bin/bash -c $liftover_command

Compress, index, and split the converted DRAGEN results (GRCh38) into SNPs and indels

In [83]:
# Collect the base names of the lifted-over VCFs in scratch/ (names ending in
# ".GRCh38.vcf.gz"), then display the list as the cell output.
local_dragen_grch38_vcfs = !ls scratch/*.GRCh38.vcf.gz | xargs -i basename {}
local_dragen_grch38_vcfs

['JW-11_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW_11_merged.INV137904-EId2.GRCh38.vcf.gz',
 'JW-11_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW_11_merged.INV137904-EId2.hard-filtered.GRCh38.vcf.gz',
 'JW-23_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW_23_merged.INV137904-EId4.GRCh38.vcf.gz',
 'JW-23_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW_23_merged.INV137904-EId4.hard-filtered.GRCh38.vcf.gz',
 'JW-31_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW-31_merged.INV137904-EId6.GRCh38.vcf.gz',
 'JW-31_merged.5006-dragen_map_align_sort_dedup_C_bed_VC_save_bam-DB_BioSkryb_JW-31_merged.INV137904-EId6.hard-filtered.GRCh38.vcf.gz']

In [None]:
for _local_dragen_grch38_vcf in local_dragen_grch38_vcfs:
    snp_output = re.sub(".vcf.gz", ".snp.vcf.gz", _local_dragen_grch38_vcf)
    indel_output = re.sub(".vcf.gz", ".indel.vcf.gz", _local_dragen_grch38_vcf)
    gatk_command = '"cd /home && gatk SplitVcfs --VALIDATION_STRINGENCY SILENT --STRICT false -I {invcf} --INDEL_OUTPUT {indel} --SNP_OUTPUT {snp}"'.format(invcf = _local_dragen_grch38_vcf, indel = indel_output, snp = snp_output)
    !docker run -v /home/rcarter/ComparingGatkSentieonDRAGEN/scratch/:/home -ti gcr.io/bioskryb/gatk:4.1.3.0 /bin/bash -c $gatk_command

In [93]:
# Build the three SNP call sets (base names of files in scratch/):
#   - Sentieon DNAscope SNP VCFs
#   - DRAGEN SNP VCFs whose name contains "hard-filtered"
#   - DRAGEN SNP VCFs whose name does not contain "hard-filtered"
sentieon_snp_vcfs = !ls scratch/*dnascope.snp.vcf.gz | xargs -i basename {}
dragen_hf_snp_vcfs = !ls scratch/*dragen*.snp.vcf.gz | grep hard-filtered | xargs -i basename {}
dragen_snp_vcfs = !ls scratch/*dragen*.snp.vcf.gz | grep -v hard-filtered | xargs -i basename {}

In [97]:
for _snp_vcf in sentieon_snp_vcfs:
    sample_name = re.sub("_merged.+", "", os.path.basename(_snp_vcf))
    folder_name = "NA12878_bulk2_vs_{}_sentieon_no_ploidy".format(sample_name)
    !cd scratch/ && ~/rtg-tools/bin/rtg-tools-3.10.1-4d58eadb/rtg RTG_MEM=4G  vcfeval --squash-ploidy  -b /home/rcarter/R_bioskryb_na12878_subsampled_with_bulk_analysis/vumc_subsampled_with_bulk.merged.snps_and_indels.vqsr.pass.snp.NA12878_Bulk2.vcf.gz -c $_snp_vcf -o $folder_name -t /home/rcarter/R_bioskryb_na12878_analysis/NA12878_comparison/SDF/
for _snp_vcf in dragen_hf_snp_vcfs:
    sample_name = re.sub("_merged.+", "", os.path.basename(_snp_vcf))
    folder_name = "NA12878_bulk2_vs_{}_dragen_hf_no_ploidy".format(sample_name)
    !cd scratch/ && ~/rtg-tools/bin/rtg-tools-3.10.1-4d58eadb/rtg RTG_MEM=4G  vcfeval --squash-ploidy  -b /home/rcarter/R_bioskryb_na12878_subsampled_with_bulk_analysis/vumc_subsampled_with_bulk.merged.snps_and_indels.vqsr.pass.snp.NA12878_Bulk2.vcf.gz -c $_snp_vcf -o $folder_name -t /home/rcarter/R_bioskryb_na12878_analysis/NA12878_comparison/SDF/
for _snp_vcf in dragen_snp_vcfs:
    sample_name = re.sub("_merged.+", "", os.path.basename(_snp_vcf))
    folder_name = "NA12878_bulk2_vs_{}_dragen_no_ploidy".format(sample_name)
    !cd scratch/ && ~/rtg-tools/bin/rtg-tools-3.10.1-4d58eadb/rtg RTG_MEM=4G  vcfeval --squash-ploidy  -b /home/rcarter/R_bioskryb_na12878_subsampled_with_bulk_analysis/vumc_subsampled_with_bulk.merged.snps_and_indels.vqsr.pass.snp.NA12878_Bulk2.vcf.gz -c $_snp_vcf -o $folder_name -t /home/rcarter/R_bioskryb_na12878_analysis/NA12878_comparison/SDF/

Reference sequence chrM is used in calls but not in baseline.
Threshold  True-pos-baseline  True-pos-call  False-pos  False-neg  Precision  Sensitivity  F-measure
----------------------------------------------------------------------------------------------------
    0.000            2882870        2882870     361881     426390     0.8885       0.8712     0.8797
     None            2882870        2882870     361881     426390     0.8885       0.8712     0.8797

Reference sequence chrM is used in calls but not in baseline.
Threshold  True-pos-baseline  True-pos-call  False-pos  False-neg  Precision  Sensitivity  F-measure
----------------------------------------------------------------------------------------------------
    0.000            2677786        2677786     276628     631474     0.9064       0.8092     0.8550
     None            2677786        2677786     276628     631474     0.9064       0.8092     0.8550

Reference sequence chrM is used in calls but not in baseline.
Thre