diff --git a/.gitignore b/.gitignore index c86f1b54b..4c62f7d42 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,9 @@ # Blacklist common bioinformatics formats used in these workflows. **/*.fastq.gz **/*.fa -**/*.gtf \ No newline at end of file +**/*.gtf +**/*.bam +**/*.log + +# Blacklist Cromwell files +**/cromwell* diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 000000000..e7b719757 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) 2019 St. Jude Children's Research Hospital + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index 6b5631a8f..2dfe7d97e 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,10 @@ This repository is intended to contain all source code related to bioinformatics workflows in St. Jude Cloud. This includes but is not limited to Docker container definition, WDL/CWL tools and workflows, and relevant documentation. The repo is -currently under construction and should be considered in an experimental state. \ No newline at end of file +currently under construction and should be considered in an experimental state. + +## RNA-seq workflow V2 +The RNA-seq workflow used by St. Jude Cloud to generate BAM files is implemented +in WDL. It uses the bioinformatics-base Docker image found in this repository. +Configuration files for the Cromwell runner have been included to facilitate running +on LSF in addition to local execution. diff --git a/conf/aws.conf b/conf/aws.conf new file mode 100755 index 000000000..87a5fe44e --- /dev/null +++ b/conf/aws.conf @@ -0,0 +1,51 @@ +include required(classpath("application")) + +aws { + + application-name = "cromwell" + auths = [ + { + name = "default" + scheme = "default" + } + ] + region = "" +} + +engine { + filesystems { + s3.auth = "default" + } +} + +backend { + default = "AWSBatch" + providers { + AWSBatch { + actor-factory = "cromwell.backend.impl.aws.AwsBatchBackendLifecycleActorFactory" + config { + + numSubmitAttempts = 6 + numCreateDefinitionAttempts = 6 + + // Base bucket for workflow executions + root = "s3:///cromwell-execution" + + // A reference to an auth defined in the `aws` stanza at the top. This auth is used to create + // Jobs and manipulate auth JSONs. + auth = "default" + + default-runtime-attributes { + queueArn: "" + } + + filesystems { + s3 { + // A reference to a potentially different auth for manipulating files via engine functions. + auth = "default" + } + } + } + } + } +} diff --git a/conf/lsf.conf b/conf/lsf.conf new file mode 100755 index 000000000..bc06365a1 --- /dev/null +++ b/conf/lsf.conf @@ -0,0 +1,59 @@ +include required(classpath("application")) + +call-caching { + enabled = true +} + +backend { + default = LSF + providers { + LSF { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + runtime-attributes = """ + Int cpu = 1 + Int hosts = 1 + Float? memory_mb = 3800 + String lsf_queue = "standard" + String? lsf_job_group + String? docker + """ + + #submit = """bsub -J ${job_name} -cwd ${cwd} -o ${out} -e ${err} -R rusage[mem=${memory_mb + "MB"}] -R "span[hosts=1]" -n ${cpu} ${"-q " + lsf_queue} /usr/bin/env bash ${script}""" + + submit = """ + bsub \ + -q ${lsf_queue} \ + -n ${cpu} \ + ${"-g " + lsf_job_group} \ + -R "rusage[mem=${memory_mb}] span[hosts=${hosts}]" \ + -J ${job_name} \ + -cwd ${cwd} \ + -o ${out} \ + -e ${err} \ + /usr/bin/env bash ${script} + """ + + submit-docker = """ + bsub \ + -q ${lsf_queue} \ + -n ${cpu} \ + ${"-g " + lsf_job_group} \ + -R "select[singularity] rusage[mem=${memory_mb}] span[hosts=${hosts}]" \ + -J ${job_name} \ + -cwd ${cwd} \ + -o ${cwd}/execution/stdout \ + -e ${cwd}/execution/stderr \ + "singularity exec --bind ${cwd}:${docker_cwd} docker://${docker} ${job_shell} ${script}" + """ + + + kill = "bkill ${job_id}" + check-alive = "bjobs ${job_id}" + job-id-regex = "Job <(\\d+)>.*" + + exit-code-timeout-seconds = 120 + } + } + } +} diff --git a/docker/bio-base.Dockerfile b/docker/bio-base.Dockerfile deleted file mode 100644 index 6dec8db80..000000000 --- a/docker/bio-base.Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM ubuntu:18.04 - -ENV MINICONDA_URL "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" -ENV PATH /opt/conda/bin:$PATH - -RUN apt-get update -RUN apt-get upgrade -y -RUN apt-get install wget -y - -RUN wget ${MINICONDA_URL} -O miniconda.sh -RUN bash miniconda.sh -b -p /opt/conda/ -RUN rm -r miniconda.sh - -RUN conda update -n base -c defaults conda -y -RUN conda install \ - -c conda-forge \ - -c bioconda \ - coreutils==8.31 \ - picard==2.20.2 \ - samtools==1.9 \ - bwa==0.7.17 \ - star==2.7.1a \ - fastqc==0.11.8 \ - qualimap==2.2.2c \ - multiqc==1.7 \ - rseqc==3.0.0 \ - -y \ No newline at end of file diff --git a/docker/bioinformatics-base/Dockerfile b/docker/bioinformatics-base/Dockerfile new file mode 100755 index 000000000..752c524d1 --- /dev/null +++ b/docker/bioinformatics-base/Dockerfile @@ -0,0 +1,39 @@ +FROM rust:1.36.0 as fqlib-builder +RUN cargo install \ + --git https://github.com/stjude/fqlib.git \ + --tag v0.3.1 \ + --root /opt/fqlib/ + +FROM ubuntu:18.04 as builder + +ENV PATH /opt/conda/bin:$PATH + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install wget -y && \ + apt-get install gcc -y && \ + rm -r /var/lib/apt/lists/* + +RUN wget "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O miniconda.sh && \ + bash miniconda.sh -b -p /opt/conda/ && \ + rm miniconda.sh + +RUN conda update -n base -c defaults conda -y && \ + conda install \ + -c conda-forge \ + -c bioconda \ + coreutils==8.31 \ + picard==2.20.2 \ + samtools==1.9 \ + bwa==0.7.17 \ + star==2.7.1a \ + fastqc==0.11.8 \ + qualimap==2.2.2c \ + multiqc==1.7 \ + rseqc==3.0.0 \ + htseq==0.11.2 \ + deeptools==3.3.1 \ + -y && \ + conda clean --all -y + +COPY --from=fqlib-builder /opt/fqlib/bin/fq /usr/local/bin/ diff --git a/docker/star/Dockerfile b/docker/star/Dockerfile new file mode 100644 index 000000000..a8941d612 --- /dev/null +++ b/docker/star/Dockerfile @@ -0,0 +1 @@ +FROM stjudecloud/bioinformatics-base:bleeding-edge diff --git a/tools/deeptools.wdl b/tools/deeptools.wdl new file mode 100755 index 000000000..c89ad2b56 --- /dev/null +++ b/tools/deeptools.wdl @@ -0,0 +1,27 @@ +## Description: +## +## This WDL tool wraps the DeepTools tool (https://deeptools.readthedocs.io/en/develop/index.html). +## DeepTools is a suite of Python tools for analysis of high throughput sequencing analysis. + +task bamCoverage { + File bam + File bai + String prefix = basename(bam, ".bam") + + command { + if [ ! -e ${bam}.bai ] + then + ln -s ${bai} ${bam}.bai + fi + + bamCoverage --bam ${bam} --outFileName ${prefix}.bw --outFileFormat bigwig --numberOfProcessors "max" + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + File bigwig = "${prefix}.bw" + } +} diff --git a/tools/fastqc.wdl b/tools/fastqc.wdl new file mode 100755 index 000000000..2ef07bcdd --- /dev/null +++ b/tools/fastqc.wdl @@ -0,0 +1,26 @@ +## Description: +## +## This WDL tool wraps the FastQC tool (https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). +## FastQC generates quality control metrics for sequencing pipelines. + +task fastqc { + File bam + Int ncpu + String prefix = basename(bam, ".bam") + + command { + mkdir ${prefix}_fastqc_results + fastqc -f bam \ + -o ${prefix}_fastqc_results \ + -t ${ncpu} \ + ${bam} + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + Array[File] out_files = glob("${prefix}_fastqc_results/*") + } +} diff --git a/tools/fq.wdl b/tools/fq.wdl new file mode 100755 index 000000000..c9715d315 --- /dev/null +++ b/tools/fq.wdl @@ -0,0 +1,33 @@ +## Description: +## +## This WDL tool wraps the fq tool (https://github.com/stjude/fqlib). +## The fq library provides methods for manipulating Illumina generated +## FastQ files. + +task print_version { + command { + fq --version + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string(stdout()) + } + +} + +task fqlint { + File read1 + File read2 + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + command { + fq lint ${read1} ${read2} + } +} diff --git a/tools/htseq.wdl b/tools/htseq.wdl new file mode 100755 index 000000000..2c179a4db --- /dev/null +++ b/tools/htseq.wdl @@ -0,0 +1,24 @@ +## Description: +## +## This WDL tool wraps the htseq tool (https://github.com/simon-anders/htseq). +## HTSeq is a Python library for analyzing sequencing data. + +task count { + File bam + File gtf + String strand = "reverse" + String outfile = basename(bam, ".bam") + ".counts.txt" + + command { + htseq-count -f bam -r pos -s ${strand} -m union -i gene_name --secondary-alignments ignore --supplementary-alignments ignore ${bam} ${gtf} > ${outfile} + } + + runtime { + memory: "8G" + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + File out = "${outfile}" + } +} diff --git a/tools/md5sum.wdl b/tools/md5sum.wdl new file mode 100755 index 000000000..1ae86eb6e --- /dev/null +++ b/tools/md5sum.wdl @@ -0,0 +1,50 @@ +## Description: +## +## This WDL tool wraps the md5sum tool from the GNU core +## utilities (https://github.com/coreutils/coreutils). +## md5sum is a utility for generating and verifying MD5 +## hashes. + +task print_version { + command { + md5sum --version + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string(stdout()) + } +} +task compute_checksum { + File infile + + command { + md5sum ${infile} > stdout.txt + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string("stdout.txt") + } +} +task check_checksum { + File infile + + command { + md5sum -c ${infile} > stdout.txt + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string("stdout.txt") + } +} diff --git a/tools/multiqc.wdl b/tools/multiqc.wdl new file mode 100755 index 000000000..1edd70a5a --- /dev/null +++ b/tools/multiqc.wdl @@ -0,0 +1,41 @@ +## Description: +## +## This WDL tool wraps the MultiQC tool (https://multiqc.info/). +## MultiQC aggregates quality control results for bioinformatics. + +task multiqc { + File star + File dups + String validate_sam_string + Array[File] qualimap_bamqc + Array[File] qualimap_rnaseq + Array[File] fastqc_files + File flagstat_file + + command { + echo ${star} > file_list.txt + echo ${dups} >> file_list.txt + echo ${validate_sam_string} > validate_sam.txt + echo validate_sam.txt >> file_list.txt + for file in ${sep=' ' qualimap_bamqc} ; do + echo $file >> file_list.txt + done + for file in ${sep=' ' qualimap_rnaseq} ; do + echo $file >> file_list.txt + done + for file in ${sep=' ' fastqc_files} ; do + echo $file >> file_list.txt + done + echo ${flagstat_file} >> file_list.txt + + multiqc --file-list file_list.txt -o multiqc_results + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + File out = "multiqc_results" + } +} diff --git a/tools/picard.wdl b/tools/picard.wdl new file mode 100755 index 000000000..57e844b33 --- /dev/null +++ b/tools/picard.wdl @@ -0,0 +1,67 @@ +## Description: +## +## This WDL tool wraps the PicardTools library (https://broadinstitute.github.io/picard/). +## PicardTools is a set of Java tools for manipulating sequencing data. + +task mark_duplicates { + File bam + String prefix = basename(bam, ".bam") + + command { + picard MarkDuplicates I=${bam} \ + O=${prefix}.duplicates.bam \ + VALIDATION_STRINGENCY=SILENT \ + CREATE_INDEX=false \ + CREATE_MD5_FILE=false \ + COMPRESSION_LEVEL=5 \ + METRICS_FILE=${prefix}.metrics.txt + } + + runtime { + memory: "50G" + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + File out = "${prefix}.duplicates.bam" + } +} + +task validate_bam { + File bam + + command { + picard ValidateSamFile I=${bam} \ + IGNORE=INVALID_PLATFORM_VALUE > stdout.txt + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string("stdout.txt") + } +} + +task bam_to_fastq { + File bam + String prefix = basename(bam, ".bam") + + command { + picard SamToFastq INPUT=${bam} \ + FASTQ=${prefix}_R1.fastq \ + SECOND_END_FASTQ=${prefix}_R2.fastq \ + RE_REVERSE=true + } + + runtime{ + memory: "25G" + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + File read1 = "${prefix}_R1.fastq" + File read2 = "${prefix}_R2.fastq" + } +} diff --git a/tools/qc.wdl b/tools/qc.wdl new file mode 100755 index 000000000..f71351074 --- /dev/null +++ b/tools/qc.wdl @@ -0,0 +1,70 @@ +## Description: +## +## This WDL tool includes custom scripts to parse and validate QC output. + +task parse_validate_bam { + String in + Boolean strict = true + + command { + if [ "${strict}" == "true" ] + then + if [ $(echo "${in}" | grep -c "ERROR") -gt 0 ] + then + echo "Errors detected by Picard ValidateSamFile" > /dev/stderr + exit -1 + fi + elif [ $(echo "${in}" | grep -Ec "ERROR|WARNING") -gt 0 ] + then + echo "Errors detected by Picard ValidateSamFile" > /dev/stderr + exit -1 + fi + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + +} + +task parse_infer_experiment { + String in + + command { + if [ $(echo "${in}" | grep -c 'PairEnd') -gt 0 ] + then + if (( $(echo "$(echo "${in}" | tail -n 1 | sed 's/^.*: //') > 0.5" | bc -l ) )) + then + echo 'strand-specific-reverse' > stdout.txt + elif (( $(echo "$(echo "${in}" | tail -n 2 | head -n 1 | sed 's/^.*: //') > 0.5" | bc -l ) )) + then + echo 'strand-specific-forward' > stdout.txt + else + echo 'non-strand-specific' > stdout.txt + fi + elif [ $(echo "${in}" | grep -c 'SingleEnd') -gt 0 ] + then + if (( $(echo "$(echo "${in}" | tail -n 1 | sed 's/^.*: //') > 0.5" | bc -l ) )) + then + echo 'strand-specific-reverse' > stdout.txt + elif (( $(echo "$(echo "${in}" | tail -n 2 | head -n 1 | sed 's/^.*: //') > 0.5" | bc -l ) )) + then + echo 'strand-specific-forward' > stdout.txt + else + echo 'non-strand-specific' > stdout.txt + fi + else + echo "infer_experiment failed to determine type" > /dev/stderr + exit -1 + fi + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string("stdout.txt") + } + +} diff --git a/tools/qualimap.wdl b/tools/qualimap.wdl new file mode 100755 index 000000000..5cf30f899 --- /dev/null +++ b/tools/qualimap.wdl @@ -0,0 +1,45 @@ +## Description: +## +## This WDL tool wraps the QualiMap tool (http://qualimap.bioinfo.cipf.es/). +## QualiMap computes metrics to facilitate evaluation of sequencing data. + +task bamqc { + File bam + Int ncpu + String prefix = basename(bam, ".bam") + + command { + qualimap bamqc -bam ${bam} \ + -outdir ${prefix}_qualimap_results \ + -nt ${ncpu} \ + --java-mem-size=6g + } + + runtime { + memory: "8G" + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + Array[File] out_files = glob("${prefix}_qualimap_results/*") + } +} +task rnaseq { + File bam + File gencode_gtf + String outdir = "qualimap_rnaseq" + String strand = "strand-specific-reverse" + + command { + qualimap rnaseq -bam ${bam} -gtf ${gencode_gtf} -outdir ${outdir} -oc qualimap_counts.txt -p ${strand} -pe --java-mem-size=6G + } + + runtime { + memory: "8G" + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + Array[File] out_files = glob("${outdir}/*") + } +} diff --git a/tools/rseqc.wdl b/tools/rseqc.wdl new file mode 100755 index 000000000..ab22108d3 --- /dev/null +++ b/tools/rseqc.wdl @@ -0,0 +1,23 @@ +## Description: +## +## This WDL tool wraps the RSeQC tool (http://rseqc.sourceforge.net). +## RSeQC is a package for quality control of RNA-seq data. + +task infer_experiment { + File bam + Int? sample_size + Int? map_qual + File refgene_bed + + command { + infer_experiment.py -i ${bam} -r ${refgene_bed} ${"-s" + sample_size} ${"-q" + map_qual} > stdout.txt + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string("stdout.txt") + } +} diff --git a/tools/samtools.wdl b/tools/samtools.wdl new file mode 100755 index 000000000..3e5ab3a0e --- /dev/null +++ b/tools/samtools.wdl @@ -0,0 +1,92 @@ +## Description: +## +## This WDL tool wraps the SAMtools package (http://samtools.sourceforge.net/). +## SAMtools provides utlities for manipulating SAM format sequence alignments. + +task print_version { + command { + samtools --version + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string(stdout()) + } + +} + +task quickcheck { + File bam + + command { + samtools quickcheck ${bam} + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } +} + +task split { + File bam + Boolean? reject_unaccounted + String prefix = basename(bam, ".bam") + + command { + samtools split -u ${prefix}.unaccounted_reads.bam -f '%*_%!.%.' ${bam} + samtools view ${prefix}.unaccounted_reads.bam > unaccounted_reads.bam + if [ ${default='true' reject_unaccounted} -a -s unaccounted_reads.bam ] + then exit 1; + else rm ${prefix}.unaccounted_reads.bam + fi + rm unaccounted_reads.bam + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + Array[File] split_bams = glob("*.bam") + } +} + +task flagstat { + File bam + + String outfile = basename(bam, ".bam")+".flagstat.txt" + + command { + samtools flagstat ${bam} > ${outfile} + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + File flagstat = outfile + } +} + +task index { + File bam + + String name = basename(bam) + String outfile = basename(bam)+".bai" + + command { + samtools index ${bam} ${outfile} + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + File bai = outfile + } +} diff --git a/tools/star.wdl b/tools/star.wdl new file mode 100755 index 000000000..7812cc2c8 --- /dev/null +++ b/tools/star.wdl @@ -0,0 +1,84 @@ +## Description: +## +## This WDL tool wraps the STAR aligner (https://github.com/alexdobin/STAR). +## STAR is an RNA-seq aligner. + +task print_version { + command { + STAR --version + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string(stdout()) + } +} + +task build_db { + Int ncpu + File reference_fasta + File gencode_gtf + String stardb_dir_name + + command { + mkdir ${stardb_dir_name}; + STAR --runMode genomeGenerate \ + --genomeDir ${stardb_dir_name} \ + --runThreadN ${ncpu} \ + --limitGenomeGenerateRAM=45000000000 \ + --genomeFastaFiles ${reference_fasta} \ + --sjdbGTFfile ${gencode_gtf} \ + --sjdbOverhang 125 + } + + runtime { + memory: "50G" + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + File dir = stardb_dir_name + } +} + +task alignment { + Array[File] read_one_fastqs + Array[File] read_two_fastqs + File stardb_dir + String output_prefix + String? read_groups + + command { + STAR --readFilesIn ${sep=',' read_one_fastqs} ${sep=',' read_two_fastqs} \ + --genomeDir ${stardb_dir} \ + --outSAMunmapped Within \ + --outSAMstrandField intronMotif \ + --outSAMtype BAM SortedByCoordinate \ + --outSAMattributes NH HI AS nM NM MD XS \ + --outFilterMultimapScoreRange 1 \ + --outFilterMultimapNmax 20 \ + --outFilterMismatchNmax 10 \ + --alignIntronMax 500000 \ + --alignMatesGapMax 1000000 \ + --sjdbScore 2 \ + --alignSJDBoverhangMin 1 \ + --outFilterMatchNminOverLread 0.66 \ + --outFilterScoreMinOverLread 0.66 \ + --outFileNamePrefix ${output_prefix} \ + --twopassMode Basic \ + --limitBAMsortRAM 58000000000 \ + ${"--outSAMattrRGline " + read_groups} + } + + runtime { + memory: "75G" + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + File star_bam = output_prefix + "Aligned.sortedByCoord.out.bam" + } +} diff --git a/tools/util.wdl b/tools/util.wdl new file mode 100755 index 000000000..77858e49f --- /dev/null +++ b/tools/util.wdl @@ -0,0 +1,37 @@ +## Description: +## +## This WDL tool includes custom scripts to parse and reformat +## task output as part of a workflow. + + +task get_read_groups { + File bam + + command { + samtools view -H ${bam} | grep "@RG" > stdout.txt + } + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string("stdout.txt") + } +} + +task prepare_read_groups_for_star { + String read_groups + + command <<< + echo "${read_groups}" | cut -f 2- | sed -e 's/\t/ /g' | awk '{print}' ORS=' , '| sed 's/ , $//' > stdout.txt + >>> + + runtime { + docker: 'stjudecloud/bioinformatics-base:bleeding-edge' + } + + output { + String out = read_string("stdout.txt") + } +} diff --git a/workflows/bam-to-fastqs.wdl b/workflows/bam-to-fastqs.wdl new file mode 100644 index 000000000..8bfd7aa71 --- /dev/null +++ b/workflows/bam-to-fastqs.wdl @@ -0,0 +1,45 @@ +## Description: +## +## This WDL workflow converts an input BAM file to a set of fastq files for read 1 and read 2. It performs QC checks along the way to validate the input and output. +## +## Inputs: +## +## bam - the input bam to convert to fastqs +## +## Output: +## +## read1s - an array of files with the first read in the pair +## read2s - an array of files with the second read in the pair +## +## LICENSING : +## MIT License +## +## Copyright 2019 St. Jude Children's Research Hospital +## +## Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +## +##The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +## +##THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import "../tools/samtools.wdl" +import "../tools/picard.wdl" +import "../tools/fq.wdl" + +workflow bam_to_fastqs { + File bam + + call samtools.quickcheck { input: bam=bam } + call samtools.split { input: bam=bam } + scatter (split_bam in split.split_bams) { + call picard.bam_to_fastq { input: bam=split_bam } + } + scatter (reads in zip(bam_to_fastq.read1, bam_to_fastq.read2)) { + call fq.fqlint { input: read1=reads.left, read2=reads.right} + } + + output { + Array[File] read1s = bam_to_fastq.read1 + Array[File] read2s = bam_to_fastq.read2 + } +} diff --git a/workflows/star-alignment.wdl b/workflows/star-alignment.wdl new file mode 100755 index 000000000..0cc2f9dbe --- /dev/null +++ b/workflows/star-alignment.wdl @@ -0,0 +1,49 @@ +## Description: +## +## This WDL workflow runs the STAR aligner on a set of input fastq files. +## +## Inputs: +## +## read1s - an array of files with the first read in the pair +## read2s - an array of files with the second read in the pair +## stardb_dir - a STAR reference directory that was generated using STAR genomeGenerate +## output_prefix - the prefix name that should be used to name the STAR outputs +## read_groups (optional) - a list of the read group entries to use in the output BAM from STAR +## +## Output: +## +## star_bam - the aligned reads in BAM format from STAR +## +## LICENSING : +## MIT License +## +## Copyright 2019 St. Jude Children's Research Hospital +## +## Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +## +##The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +## +##THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import "../tools/star.wdl" + +workflow star_alignment { + Array[File] read_one_fastqs + Array[File] read_two_fastqs + File stardb_dir + String output_prefix + String? read_groups + + call star.alignment { + input: + read_one_fastqs=read_one_fastqs, + read_two_fastqs=read_two_fastqs, + stardb_dir=stardb_dir, + output_prefix=output_prefix, + read_groups=read_groups + } + + output { + File star_bam = alignment.star_bam + } +} diff --git a/workflows/star-db-build.wdl b/workflows/star-db-build.wdl new file mode 100644 index 000000000..0f5a413ed --- /dev/null +++ b/workflows/star-db-build.wdl @@ -0,0 +1,44 @@ +## Description: +## +## This WDL workflow generates a set of genome reference files usable by the STAR aligner from an input reference file in FASTA format. +## +## Inputs: +## +## reference_fasta - the genome for which to generate STAR reference files in FASTA format +## gencode_gtf - the gene model file for the reference genome to use when generating STAR reference files +## stardb_dir_name - a name to use for the output STAR reference directory +## +## Output: +## +## out_dir - a directory containing the STAR reference files corresponding to the input genome +## +## LICENSING : +## MIT License +## +## Copyright 2019 St. Jude Children's Research Hospital +## +## Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +## +##The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +## +##THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import "../tools/star.wdl" + +workflow build_db { + File reference_fasta + File gencode_gtf + String stardb_dir_name + + call star.build_db { + input: + reference_fasta=reference_fasta, + gencode_gtf=gencode_gtf, + stardb_dir_name=stardb_dir_name, + ncpu=4 + } + + output { + File out_dir = build_db.dir + } +} diff --git a/workflows/star-qc.wdl b/workflows/star-qc.wdl new file mode 100644 index 000000000..d41543497 --- /dev/null +++ b/workflows/star-qc.wdl @@ -0,0 +1,35 @@ +## Description: +## +## This WDL workflow runs a set of QC steps on an output STAR BAM file. +## +## Inputs: +## +## bam - the BAM generated by STAR +## +## LICENSING : +## MIT License +## +## Copyright 2019 St. Jude Children's Research Hospital +## +## Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +## +##The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +## +##THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import "../tools/qc.wdl" +import "../tools/samtools.wdl" +import "../tools/picard.wdl" +import "../tools/fastqc.wdl" +import "../tools/qualimap.wdl" + +workflow star_qc { + File bam + Int ncpu + + call samtools.quickcheck { input: bam=bam } + call picard.mark_duplicates { input: bam=bam } + call picard.validate_bam { input: bam=bam } + call fastqc.fastqc { input: bam=bam, ncpu=ncpu } + call qualimap.bamqc { input: bam=bam, ncpu=ncpu } +} diff --git a/workflows/start-to-finish.wdl b/workflows/start-to-finish.wdl new file mode 100755 index 000000000..16e088cda --- /dev/null +++ b/workflows/start-to-finish.wdl @@ -0,0 +1,90 @@ +## Description: +## +## This WDL workflow runs the STAR RNA-seq alignment workflow for St. Jude Cloud. The workflow takes an input BAM file and splits it into fastq files for each read in the pair. +## The read pairs are then passed through STAR alignment to generate a BAM file. The BAM is run through Picard's MarkDuplicates and several QC steps including FastQC and Qualimap. +## Quantification is done using htseq-count. A final QC report is produced by MultiQC to generate a combined overview of the QC results for the sample. +## +## Inputs: +## +## reference_fasta - the genome for which to generate STAR reference files in FASTA format +## gencode_gtf - the gene model file for the reference genome to use when generating STAR reference files +## refgene_bed - refgene variants in BED format +## bam - input BAM file to realign +## ncpu (optional) - the number of CPUs to use when running steps that support multithreading +## +## LICENSING : +## MIT License +## +## Copyright 2019 St. Jude Children's Research Hospital +## +## Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +## +##The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +## +##THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +import "./bam-to-fastqs.wdl" as b2fq +import "./star-db-build.wdl" as stardb_build +import "./star-alignment.wdl" as align +import "../tools/picard.wdl" +import "../tools/fastqc.wdl" +import "../tools/rseqc.wdl" +import "../tools/qualimap.wdl" +import "../tools/htseq.wdl" +import "../tools/samtools.wdl" +import "../tools/md5sum.wdl" +import "../tools/multiqc.wdl" +import "../tools/qc.wdl" +import "../tools/util.wdl" +import "../tools/deeptools.wdl" + +workflow start_to_finish { + File reference_fasta + File gencode_gtf + File refgene_bed + File bam + Int ncpu = 1 + + call stardb_build.build_db { + input: + reference_fasta=reference_fasta, + gencode_gtf=gencode_gtf, + stardb_dir_name="stardb" + } + call util.get_read_groups { input: bam=bam } + call util.prepare_read_groups_for_star { input: read_groups=get_read_groups.out } + call b2fq.bam_to_fastqs { input: bam=bam } + call align.star_alignment { + input: + read_one_fastqs=bam_to_fastqs.read1s, + read_two_fastqs=bam_to_fastqs.read2s, + stardb_dir=build_db.out_dir, + output_prefix="out", + read_groups=prepare_read_groups_for_star.out + } + call picard.mark_duplicates { input: bam=star_alignment.star_bam } + call picard.validate_bam { input: bam=mark_duplicates.out } + call qc.parse_validate_bam { input: in=validate_bam.out } + call fastqc.fastqc { input: bam=mark_duplicates.out, ncpu=ncpu } + call rseqc.infer_experiment { input: bam=mark_duplicates.out, refgene_bed=refgene_bed} + call qc.parse_infer_experiment { input: in=infer_experiment.out } + call qualimap.bamqc { input: bam=mark_duplicates.out, ncpu=ncpu } + call qualimap.rnaseq { input: bam=mark_duplicates.out, gencode_gtf=gencode_gtf } + call htseq.count { input: bam=mark_duplicates.out, gtf=gencode_gtf } + call samtools.flagstat { input: bam=mark_duplicates.out } + call samtools.index { input: bam=mark_duplicates.out } + call md5sum.compute_checksum { input: infile=mark_duplicates.out } + call deeptools.bamCoverage { input: bam=mark_duplicates.out, bai=index.bai } + call multiqc.multiqc { + input: + star=star_alignment.star_bam, + dups=mark_duplicates.out, + validate_sam_string=validate_bam.out, + qualimap_bamqc=bamqc.out_files, + qualimap_rnaseq=rnaseq.out_files, + fastqc_files=fastqc.out_files, + flagstat_file=flagstat.flagstat + } + +}