Skip to content

Commit

Permalink
Merge pull request #156 from Irallia/TEST/benchmarks/tool_comparison_…
Browse files Browse the repository at this point in the history
…macro_benchmarks

[BENCHMARK] tool comparison macro benchmarks
  • Loading branch information
Irallia committed Sep 17, 2021
2 parents 92cc315 + 7c72242 commit 547ce22
Show file tree
Hide file tree
Showing 19 changed files with 383 additions and 18 deletions.
Binary file added doc/CallerComparisonPlot.pdf
Binary file not shown.
18 changes: 18 additions & 0 deletions test/benchmark/caller_comparison/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
configfile: "Repos/iGenVar/test/benchmark/caller_comparison/config/config.yaml"

include: "workflow/rules/callers.smk"
include: "workflow/rules/eval.smk"
include: "workflow/rules/plots.smk"

##### Target rules #####

rule all:
input:
# SV calling
expand("results/caller_comparison/SVIM/variants.vcf"),
# SV lengths
# expand("results/caller_comparison/SV-plots/SV-length_SVIM_{minscore}.png", minscore=[3, 5, 7]),
# Evaluation
expand("results/caller_comparison/eval/results.all.png"),
# expand("pipeline/eval/results.tools.{vcf}.png", vcf=VCFS),
# expand("pipeline/eval/results.coverages.{vcf}.png", vcf=VCFS),
17 changes: 17 additions & 0 deletions test/benchmark/caller_comparison/config/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
long_bam: data/long_reads/HG002.Sequel.10kb.pbmm2.hs37d5.whatshap.haplotag.RTG.10x.trio_sorted.bam
long_md_bam: data/long_reads/HG002.Sequel.10kb.pbmm2.hs37d5.whatshap.haplotag.RTG.10x.trio_sorted.md.bam
long_bai: data/long_reads/HG002.Sequel.10kb.pbmm2.hs37d5.whatshap.haplotag.RTG.10x.trio_sorted.bam.bai

reference_fa_gz: data/reference/hs37d5.fa.gz
reference_fa: data/reference/hs37d5.fa

parameters:
sample: HG002
min_var_length: 40
max_var_length: 1000000

quality_ranges:
igenvar: {from: 1, to: 80, step: 2}
svim: {from: 1, to: 80, step: 2}
sniffles: {from: 1, to: 80, step: 2}
pbsv: {from: 1, to: 80, step: 2}
161 changes: 161 additions & 0 deletions test/benchmark/caller_comparison/workflow/rules/callers.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
wildcard_constraints:
sample = config["parameters"]["sample"],
min_var_length = config["parameters"]["min_var_length"],
max_var_length = config["parameters"]["max_var_length"]

rule run_igenvar:
input:
bam = config["long_bam"]
output:
vcf = "results/caller_comparison/iGenVar/variants.vcf"
params:
min_qual = config["parameters"]["min_qual"]
shell:
"""
./build/iGenVar/bin/iGenVar -t 1 -j {input.bam} -o {output.vcf} \
--vcf_sample_name {sample} \
--method cigar_string \
--method split_read \
--min_var_length {min_var_length} \
--max_var_length {max_var_length} \
--min_qual 2
"""
# Defaults:
# --clustering_methods hierarchical_clustering --refinement_methods no_refinement
# --max_tol_inserted_length 5 --max_overlap 10 --hierarchical_clustering_cutoff 10

# SVIM
rule run_svim:
input:
bam = config["long_bam"],
bai = config["long_bai"],
genome = config["reference_fa_gz"]
output:
"results/caller_comparison/SVIM/variants.vcf"
resources:
mem_mb = 20000,
time_min = 600,
io_gb = 100
params:
working_dir = "results/caller_comparison/SVIM/",
threads: 1
conda:
"../../../envs/svim.yaml"
shell:
"""
svim alignment --sample {sample} \
--partition_max_distance 1000 \
--cluster_max_distance 0.5 \
--min_sv_size {min_var_length} \
--segment_gap_tolerance 20 \
--segment_overlap_tolerance 20 \
--interspersed_duplications_as_insertions \
--tandem_duplications_as_insertions \
--read_names \
--max_sv_size {max_var_length} \
--verbose \
{params.working_dir} {input.bam} {input.genome}
"""
# Defaults:
# --position_distance_normalizer 900 --edit_distance_normalizer 1.0

# SNIFFLES (we have to loop over min_support, because sniffles does not write a quality score into the vcf)
rule run_sniffles:
input:
bam = config["long_md_bam"],
output:
expand("results/caller_comparison/Sniffles/raw_variants.{minsupport}.vcf",
minsupport=list(range(config["quality_ranges"]["sniffles"]["from"],
config["quality_ranges"]["sniffles"]["to"]+1,
config["quality_ranges"]["sniffles"]["step"])))
resources:
mem_mb = 400000,
time_min = 1200,
io_gb = 100
params:
qual_from = config["quality_ranges"]["sniffles"]["from"],
qual_to = config["quality_ranges"]["sniffles"]["to"]+1,
qual_step = config["quality_ranges"]["sniffles"]["step"]
threads: 10
conda:
"../../../envs/sniffles.yaml"
shell:
"""
for i in $(seq {params.qual_from} {params.qual_step} {params.qual_to})
do
sniffles --mapped_reads {input.bam} --vcf results/caller_comparison/Sniffles/raw_variants.$i.vcf \
--min_support $i --min_length {min_var_length} --threads {threads} --genotype
done
"""

#see https://github.com/spiralgenetics/truvari/issues/43
rule fix_sniffles:
input:
"results/caller_comparison/Sniffles/raw_variants.{support}.vcf"
output:
"results/caller_comparison/Sniffles/variants.unsorted.min_qual_{support,[0-9]+}.vcf"
shell:
"sed 's/##INFO=<ID=SUPTYPE,Number=A/##INFO=<ID=SUPTYPE,Number=./' {input} > {output}"

# Split to SV classes
# Since iGenVar can only find INS and DEL so far, we filter these out for better comparability.
rule fix_sniffles_2_and_filter_insertions_and_deletions:
input:
"results/caller_comparison/Sniffles/variants.unsorted.min_qual_{support,[0-9]+}.vcf"
output:
"results/caller_comparison/Sniffles/variants.min_qual_{support,[0-9]+}.vcf"
shell:
"bcftools view -i 'SVTYPE=\"DEL\" | SVTYPE=\"INS\"' {input} | bcftools sort > {output}"

#PBSV
rule run_pbsv_dicsover:
input:
bam = config["long_bam"]
output:
svsig_gz = dynamic("results/caller_comparison/pbsv/signatures.{region}.svsig.gz")
resources:
mem_mb = 400000,
time_min = 2000,
io_gb = 100
threads: 1
conda:
"../../../envs/pbsv.yaml"
shell:
# "pbsv discover {input.bam} {output.svsig_gz}"
"""
for i in $(samtools view -H {input.bam} | grep '^@SQ' | cut -f2 | cut -d':' -f2); do
pbsv discover --region $i {input.bam} results/caller_comparison/pbsv/signatures.$i.svsig.gz
done
"""

rule run_pbsv_call:
input:
genome = config["reference_fa"],
svsig_gz = dynamic("results/caller_comparison/pbsv/signatures.{region}.svsig.gz")
output:
vcf = expand("results/caller_comparison/pbsv/variants.min_qual_{minsupport}.vcf",
minsupport=list(range(config["quality_ranges"]["pbsv"]["from"],
config["quality_ranges"]["pbsv"]["to"]+1,
config["quality_ranges"]["pbsv"]["step"])))
resources:
mem_mb = 400000,
time_min = 2000,
io_gb = 100
params:
qual_from = config["quality_ranges"]["pbsv"]["from"],
qual_to = config["quality_ranges"]["pbsv"]["to"]+1,
qual_step = config["quality_ranges"]["pbsv"]["step"]
threads: 1
conda:
"../../../envs/pbsv.yaml"
shell:
# pbsv call --types DEL,INS,DUP --min-sv-length {params.min_sv_length} --max-ins-length 100K \
"""
for i in $(seq {params.qual_from} {params.qual_step} {params.qual_to})
do
pbsv call --types DEL,INS --min-sv-length {min_var_length} --max-ins-length 100K \
--call-min-reads-all-samples $i --call-min-reads-one-sample $i \
--call-min-reads-per-strand-all-samples 0 --call-min-bnd-reads-all-samples 0 --call-min-read-perc-one-sample 0 \
--num-threads {threads} {input.genome} {input.svsig_gz} results/caller_comparison/pbsv/variants.min_qual_$i.vcf
done
"""
83 changes: 83 additions & 0 deletions test/benchmark/caller_comparison/workflow/rules/eval.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
rule filter_vcf:
input:
"results/caller_comparison/{caller,iGenVar|SVIM}/variants.vcf"
output:
"results/caller_comparison/{caller,iGenVar|SVIM}/variants.min_qual_{min_qual}.vcf"
shell:
"bcftools view -i 'QUAL>={wildcards.min_qual}' {input} > {output}"

rule bgzip:
input:
"{name}.vcf"
output:
"{name}.vcf.gz"
shell:
"bgzip -c {input} > {output}"

rule tabix:
input:
"{name}.vcf.gz"
output:
"{name}.vcf.gz.tbi"
shell:
"tabix -p vcf {input}"

rule truvari:
input:
vcf = "results/caller_comparison/{caller}/variants.min_qual_{min_qual}.vcf.gz",
index = "results/caller_comparison/{caller}/variants.min_qual_{min_qual}.vcf.gz.tbi"
params:
output_dir = "results/caller_comparison/eval/{caller}/min_qual_{min_qual}"
output:
summary = "results/caller_comparison/eval/{caller}/min_qual_{min_qual}/summary.txt"
conda:
"../../../envs/truvari.yaml"
shell:
"""
rm -rf {params.output_dir} && truvari bench -b data/truth_set/HG002_SVs_Tier1_v0.6.vcf.gz \
-c {input.vcf} -o {params.output_dir} --passonly --includebed data/truth_set/HG002_SVs_Tier1_v0.6.bed -p 0
"""

rule reformat_truvari_results:
input:
"results/caller_comparison/eval/{caller}/min_qual_{min_qual}/summary.txt"
output:
"results/caller_comparison/eval/{caller}/min_qual_{min_qual}/pr_rec.txt"
threads: 1
shell:
"""
cat {input} | grep '\<precision\>\|\<recall\>' | tr -d ',' |sed 's/^[ \t]*//' | tr -d '\"' | tr -d ' ' \
| tr ':' '\t' | awk 'OFS=\"\\t\" {{ print \"{wildcards.caller}\", \"{wildcards.min_qual}\", $1, $2 }}' > {output}
"""

rule cat_truvari_results_all:
input:
igenvar = expand("results/caller_comparison/eval/iGenVar/min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["igenvar"]["from"],
config["quality_ranges"]["igenvar"]["to"]+1,
config["quality_ranges"]["igenvar"]["step"]))),
svim = expand("results/caller_comparison/eval/SVIM/min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["svim"]["from"],
config["quality_ranges"]["svim"]["to"]+1,
config["quality_ranges"]["svim"]["step"]))),
sniffles = expand("results/caller_comparison/eval/Sniffles/min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["sniffles"]["from"],
config["quality_ranges"]["sniffles"]["to"]+1,
config["quality_ranges"]["sniffles"]["step"]))),
pbsv = expand("results/caller_comparison/eval/pbsv/min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["pbsv"]["from"],
config["quality_ranges"]["pbsv"]["to"]+1,
config["quality_ranges"]["pbsv"]["step"])))
output:
igenvar = temp("results/caller_comparison/eval/igenvar.all_results.txt"),
svim = temp("results/caller_comparison/eval/svim.all_results.txt"),
sniffles = temp("results/caller_comparison/eval/sniffles.all_results.txt"),
pbsv = temp("results/caller_comparison/eval/pbsv.all_results.txt"),
all = "results/caller_comparison/eval/all_results.txt"
threads: 1
run:
shell("cat {input.igenvar} > {output.igenvar}")
shell("cat {input.svim} > {output.svim}")
shell("cat {input.sniffles} > {output.sniffles}")
shell("cat {input.pbsv} > {output.pbsv}")
shell("cat {output.igenvar} {output.svim} {output.sniffles} {output.pbsv} > {output.all}")
12 changes: 12 additions & 0 deletions test/benchmark/caller_comparison/workflow/rules/plots.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
rule plot_pr_all_results:
input:
"results/caller_comparison/eval/all_results.txt"
output:
"results/caller_comparison/eval/results.all.png"
log:
"logs/rplot.all.log"
shell:
"""
Rscript --vanilla Repos/iGenVar/test/benchmark/caller_comparison/workflow/scripts/plot_all_results.R \
{input} {output} > {log}
"""
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
library(tidyverse)
library(scales)

args = commandArgs(trailingOnly=TRUE)

res <- read_tsv(args[1], col_names = c("caller", "min_qual", "metric", "value"))
res$caller = factor(res$caller,
levels=c('iGenVar', 'SVIM', 'Sniffles', 'pbsv'),
labels=c('iGenVar', 'SVIM', 'Sniffles', 'pbsv'))
res %>%
filter(metric %in% c("recall", "precision")) %>%
pivot_wider(names_from=metric, values_from=value) %>%
filter(recall!=0 | precision!=0) %>%
mutate(precision = 100*precision, recall = 100*recall) %>%
ggplot(aes(recall, precision, color=caller, pch=caller)) +
geom_point(size=0.5) +
# scale_shape_manual(values=c(15,16,17)) +
# scale_color_manual(values=c("deepskyblue3", "goldenrod2", "firebrick2")) +
geom_path() +
labs(y = "Precision", x = "Recall", color = "Tool", pch = "Tool") +
lims(x=c(0,100), y=c(0,100)) +
theme_bw() +
theme(panel.spacing = unit(0.75, "lines")) +
theme(text = element_text(size=14), axis.text.x = element_text(size=9), axis.text.y = element_text(size=9))

ggsave(args[2], width=20, height=12)
8 changes: 8 additions & 0 deletions test/benchmark/envs/cyvcf2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
channels:
- bioconda
dependencies:
- python>=3.6
- pip
- pip:
- cyvcf2
- matplotlib
File renamed without changes.
4 changes: 4 additions & 0 deletions test/benchmark/envs/pbsv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
channels:
- bioconda
dependencies:
- pbsv=2.6.2
5 changes: 5 additions & 0 deletions test/benchmark/envs/samtools.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- bioconda
#Sorting BAM files from NGMLR fails in samtools version>=1.10, therefore fix to version 1.9
dependencies:
- samtools=1.9
4 changes: 4 additions & 0 deletions test/benchmark/envs/sniffles.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
channels:
- bioconda
dependencies:
- sniffles=1.0.11
4 changes: 4 additions & 0 deletions test/benchmark/envs/svim.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
channels:
- bioconda
dependencies:
- svim=1.4.2
7 changes: 7 additions & 0 deletions test/benchmark/envs/truvari.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
channels:
- bioconda
dependencies:
- python>=3.6
- pip
- pip:
- Truvari
8 changes: 0 additions & 8 deletions test/benchmark/envs/truvari_environment.yaml

This file was deleted.

0 comments on commit 547ce22

Please sign in to comment.