Skip to content
This repository has been archived by the owner on Oct 5, 2023. It is now read-only.

Commit

Permalink
port more processes from WDL
Browse files Browse the repository at this point in the history
  • Loading branch information
abhi18av committed Oct 23, 2020
1 parent 275f07e commit 437c19b
Show file tree
Hide file tree
Showing 9 changed files with 255 additions and 0 deletions.
50 changes: 50 additions & 0 deletions modules/apply_recalibration/apply_recalibration.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
nextflow.enable.dsl = 2

// Parameters this module expects. No defaults are assigned here, so the values
// have to be supplied from outside this file (configuration or command line).
params.recalibrated_vcf_filename
params.indel_filter_level
params.snp_filter_level
params.use_allele_specific_annotations

process GATK_APPLY_RECALIBRATION {
    // Applies VQSR filtering in two passes with GATK ApplyVQSR: first the INDEL
    // recalibration is applied to the input VCF, then the SNP recalibration is
    // applied to the intermediate result to produce the final recalibrated VCF.
    //
    // Inputs:  the VCF to filter (plus index), and the recalibration file, its
    //          index and the tranches file for both INDEL and SNP modes.
    // Outputs: the recalibrated VCF named by params.recalibrated_vcf_filename
    //          and its ".tbi" index.
    // NOTE(review): a ".tbi" index is presumably only produced when the output
    // name ends in ".vcf.gz" - confirm the configured filename.
    container "us.gcr.io/broad-gatk/gatk:4.1.1.0"

    input:
    path(input_vcf)
    path(input_vcf_index)
    path(indels_recalibration)
    path(indels_recalibration_index)
    path(indels_tranches)
    path(snps_recalibration)
    path(snps_recalibration_index)
    path(snps_tranches)

    output:
    path("${params.recalibrated_vcf_filename}")
    path("${params.recalibrated_vcf_filename}.tbi")

    script:
    // The filter levels and the output filename are pipeline parameters, not
    // process inputs, so they must be read through `params`.
    """
    set -euo pipefail

    gatk --java-options -Xms5g ApplyVQSR \
        -O tmp.indel.recalibrated.vcf \
        -V ${input_vcf} \
        --recal-file ${indels_recalibration} \
        ${params.use_allele_specific_annotations ? '--use-allele-specific-annotations' : ''} \
        --tranches-file ${indels_tranches} \
        --truth-sensitivity-filter-level ${params.indel_filter_level} \
        --create-output-variant-index true \
        -mode INDEL

    gatk --java-options -Xms5g ApplyVQSR \
        -O ${params.recalibrated_vcf_filename} \
        -V tmp.indel.recalibrated.vcf \
        --recal-file ${snps_recalibration} \
        ${params.use_allele_specific_annotations ? '--use-allele-specific-annotations' : ''} \
        --tranches-file ${snps_tranches} \
        --truth-sensitivity-filter-level ${params.snp_filter_level} \
        --create-output-variant-index true \
        -mode SNP
    """
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
nextflow.enable.dsl = 2

process GATK_COLLECT_VARIANT_CALLING_METRICS {
    // Runs CollectVariantCallingMetrics on a VCF against a dbSNP VCF, restricted
    // to the given interval list, and emits the detail and summary metrics files.
    //
    // The tool is given params.metrics_filename_prefix as its --OUTPUT prefix, so
    // the declared outputs must be derived from the very same parameter.
    //
    // NOTE(review): unlike the other GATK processes in this commit, no container
    // is declared here, and --THREAD_COUNT 8 is not matched by a `cpus`
    // directive - confirm both are intentional.

    input:
    path(input_vcf)
    path(input_vcf_index)
    path(dbsnp_vcf)
    path(dbsnp_vcf_index)
    path(interval_list)
    path(ref_dict)

    output:
    path("${params.metrics_filename_prefix}.variant_calling_detail_metrics")
    path("${params.metrics_filename_prefix}.variant_calling_summary_metrics")

    script:
    """
    set -euo pipefail

    gatk --java-options -Xms6g CollectVariantCallingMetrics \
        --INPUT ${input_vcf} \
        --DBSNP ${dbsnp_vcf} \
        --SEQUENCE_DICTIONARY ${ref_dict} \
        --OUTPUT ${params.metrics_filename_prefix} \
        --THREAD_COUNT 8 \
        --TARGET_INTERVALS ${interval_list}
    """
}
52 changes: 52 additions & 0 deletions modules/gather_tranches/gather_tranches.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
nextflow.enable.dsl = 2

// Name of the gathered tranches file; no default is assigned in this file.
params.output_filename

process GATK_GATHER_TRANCHES {
    // Collects a set of tranches files into a local "tranches/" directory with
    // gsutil (retrying up to RETRY_LIMIT times) and merges them with GATK
    // GatherTranches into params.output_filename.
    //
    // Input:  one or more tranches files.
    // Output: the merged tranches file named by params.output_filename.
    container "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152"

    input:
    path(tranches)

    output:
    path("${params.output_filename}")

    shell:
    // NOTE: This assumes that gsutil is installed
    //
    // The file-of-filenames is written by the task script itself: `!{tranches}`
    // expands to the staged input name(s) separated by blanks, and printf puts
    // each one on its own line. (WDL's write_lines() does not exist in Nextflow.)
    // NOTE(review): with `path` inputs the files are presumably already staged
    // into the work directory, which would make the gsutil copy below a local
    // copy only - confirm whether the localization step is still wanted.
    '''
    set -euo pipefail

    tranches_fofn=tranches.fofn
    printf '%s\\n' !{tranches} > $tranches_fofn

    # Jose says:
    # Cromwell will fall over if we have it try to localize tens of thousands of files,
    # so we manually localize files using gsutil.
    # Using gsutil also lets us parallelize the localization, which (as far as we can tell)
    # PAPI doesn't do.

    # This is here to deal with the JES bug where commands may be run twice
    rm -rf tranches
    mkdir tranches

    RETRY_LIMIT=5
    count=0
    until cat $tranches_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do
        sleep 1
        ((count++)) && ((count >= $RETRY_LIMIT)) && break
    done
    if [ "$count" -ge "$RETRY_LIMIT" ]; then
        echo 'Could not copy all the tranches from the cloud' && exit 1
    fi

    # Rewrite every listed path to its basename under tranches/
    cat $tranches_fofn | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list

    /usr/gitc/gatk --java-options -Xms6g GatherTranches \
        --input inputs.list \
        --output !{params.output_filename}
    '''
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
nextflow.enable.dsl = 2

process GATK_GATHER_VARIANT_CALLING_METRICS {
// TODO: placeholder - no inputs, outputs or script are defined for this process yet.

}
29 changes: 29 additions & 0 deletions modules/gather_vcfs/gather_vcfs.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
nextflow.enable.dsl = 2

process GATK_GATHER_VCFS {
    // Concatenates a set of VCF shards into a single VCF with GATK
    // GatherVcfsCloud and indexes the result with tabix.
    //
    // Input:  the VCF shards to gather, passed to the tool in the order received.
    // Output: the gathered VCF named by params.output_vcf_name and its ".tbi" index.
    container "us.gcr.io/broad-gatk/gatk:4.1.1.0"

    input:
    path(input_vcfs)

    output:
    path("${params.output_vcf_name}")
    path("${params.output_vcf_name}.tbi")

    script:
    // A single staged file arrives as a bare path rather than a list; wrap it so
    // that join() operates on files and not on the components of one path.
    vcf_list = input_vcfs instanceof java.nio.file.Path ? [input_vcfs] : input_vcfs
    // The first "--input" is written in the command below, so only the separators
    // between consecutive files are generated here.
    input_vcfs_str = vcf_list.join(" --input ")

    """
    set -euo pipefail

    # --ignore-safety-checks makes a big performance difference so we include it in our invocation.
    # This argument disables expensive checks that the file headers contain the same set of
    # genotyped samples and that files are in order by position of first record.
    gatk --java-options -Xms6g GatherVcfsCloud \
        --ignore-safety-checks \
        --gather-type BLOCK \
        --input ${input_vcfs_str} \
        --output ${params.output_vcf_name}

    tabix ${params.output_vcf_name}
    """
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ params.use_allele_specific_annotations


process GATK_INDELS_VARIANT_RECALIBRATOR {
container "us.gcr.io/broad-gatk/gatk:4.1.1.0"

input:
path(sites_only_variant_filtered_vcf)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
nextflow.enable.dsl = 2

// Basename (without ".vcf.gz") of the selected-variants output; no default is assigned in this file.
params.base_output_name

process GATK_SELECT_FINGERPRINT_SITE_VARIANTS {
// Restricts a VCF to the sites listed in a haplotype database: the database is
// converted to an interval list with awk, and GATK SelectVariants is then run
// with that list as --intervals.
// NOTE(review): unlike the other GATK processes in this commit, no container is
// declared here - confirm.

input:
// VCF to subset, and the haplotype database that defines the sites to keep.
path(input_vcf)
path(haplotype_database)

output:
// The script only names the .vcf.gz explicitly; the .tbi is presumably written
// by GATK next to it - TODO confirm.
path("${params.base_output_name}.vcf.gz")
path("${params.base_output_name}.vcf.gz.tbi")

shell:
// hdb_to_interval_list: lines matching "^@" are printed unchanged, the line
// matching "#CHROM" is dropped, and every other row is emitted as
// <col1> <col2> <col2> + interval-<record number>, joined by OFS.
// NOTE(review): awk's input field separator variable is FS; "IFS" here is just
// an ordinary awk variable, so input splitting uses awk's default - confirm intent.
// The local shell variable `input` is assigned but not used afterwards.

'''
set -euo pipefail
function hdb_to_interval_list() {
input=$1
awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1
}
hdb_to_interval_list !{haplotype_database} > hdb.interval_list
gatk --java-options -Xms6g SelectVariants \
--variant !{input_vcf} \
--intervals hdb.interval_list \
--output !{params.base_output_name}.vcf.gz
'''
}
55 changes: 55 additions & 0 deletions modules/snps_variant_recalibrator/snps_variant_recalibrator.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
nextflow.enable.dsl = 2

// Parameters this module expects. No defaults are assigned here, so the values
// have to be supplied from outside this file (configuration or command line).
params.model_report_filename
params.recalibration_filename
params.tranches_filename
params.use_allele_specific_annotations

process GATK_SNPS_VARIANT_RECALIBRATOR {
    // Builds the SNP recalibration model with GATK VariantRecalibrator from a
    // sites-only VCF, using HapMap, Omni, 1000G and dbSNP as resources.
    //
    // Inputs:  the sites-only filtered VCF and the four resource VCFs, each with
    //          its index.
    // Outputs: the recalibration file (params.recalibration_filename), its
    //          ".idx" index, and the tranches file (params.tranches_filename).
    //
    // Every value that is not a staged input is read from `params`:
    //   recalibration_filename, tranches_filename, use_allele_specific_annotations,
    //   recalibration_tranche_values (list), recalibration_annotation_values (list),
    //   max_gaussians, java_mem (GB), model_report (optional path).
    container "us.gcr.io/broad-gatk/gatk:4.1.1.0"

    input:
    path(sites_only_variant_filtered_vcf)
    path(sites_only_variant_filtered_vcf_index)
    path(hapmap_resource_vcf)
    path(omni_resource_vcf)
    path(one_thousand_genomes_resource_vcf)
    path(dbsnp_resource_vcf)
    path(hapmap_resource_vcf_index)
    path(omni_resource_vcf_index)
    path(one_thousand_genomes_resource_vcf_index)
    path(dbsnp_resource_vcf_index)

    output:
    path("${params.recalibration_filename}")
    path("${params.recalibration_filename}.idx")
    path("${params.tranches_filename}")

    shell:
    // One "-tranche <v>" / "-an <v>" pair per configured value; an empty or
    // missing list contributes nothing instead of a dangling flag.
    // Both parameters are expected to be lists.
    tranche_args = (params.recalibration_tranche_values ?: []).collect { "-tranche ${it}" }.join(' ')
    an_args = (params.recalibration_annotation_values ?: []).collect { "-an ${it}" }.join(' ')
    as_annotations_flag = params.use_allele_specific_annotations ? '--use-allele-specific-annotations' : ''

    // NOTE(review): the fallback values below are placeholders chosen to keep
    // the command well-formed - TODO confirm sizing against the intended workload.
    java_mem = params.java_mem ?: 6
    max_gaussians = params.max_gaussians ?: 6

    // Optional pre-built model. $MODEL_REPORT is expanded by bash inside the
    // task, not by Nextflow. The argument mirrors the upstream WDL task - TODO
    // confirm. The model file is not a staged input here, so the path has to be
    // reachable from inside the task.
    model_report = params.model_report ?: ''
    model_report_arg = model_report ? '--input-model $MODEL_REPORT --output-tranches-for-scatter' : ''

    '''
    set -euo pipefail

    MODEL_REPORT=!{model_report}

    gatk --java-options -Xms!{java_mem}g VariantRecalibrator \
        -V !{sites_only_variant_filtered_vcf} \
        -O !{params.recalibration_filename} \
        --tranches-file !{params.tranches_filename} \
        --trust-all-polymorphic \
        !{tranche_args} \
        !{an_args} \
        !{as_annotations_flag} \
        -mode SNP \
        !{model_report_arg} \
        --max-gaussians !{max_gaussians} \
        -resource:hapmap,known=false,training=true,truth=true,prior=15 !{hapmap_resource_vcf} \
        -resource:omni,known=false,training=true,truth=true,prior=12 !{omni_resource_vcf} \
        -resource:1000G,known=false,training=true,truth=false,prior=10 !{one_thousand_genomes_resource_vcf} \
        -resource:dbsnp,known=true,training=false,truth=false,prior=7 !{dbsnp_resource_vcf}
    '''
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ params.tranches_filename
params.use_allele_specific_annotations

process GATK_SNPS_VARIANT_RECALIBRATOR_CREATE_MODEL {
container "us.gcr.io/broad-gatk/gatk:4.1.1.0"

input:
path(sites_only_variant_filtered_vcf)
Expand Down

0 comments on commit 437c19b

Please sign in to comment.