Merge pull request #28 from replikation/report_file

rki output, fastq_raw input, and some smaller fixes, results structuree optimized
replikation · Jan 22, 2021 · ed123b9 · ed123b9
2 parents 29e0d4a + db4f47d
commit ed123b9
Show file tree

Hide file tree

Showing 20 changed files with 335 additions and 37 deletions.
diff --git a/bin/rki_report_parser.sh b/bin/rki_report_parser.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Info: Creates rki_report.csv for RKI from all lineage*.csv files (pangolin output) in the current working dir.
+# Usage: rki_report_parser.sh <SENDING_LAB_ID> -- one report row is written per data row of every input file.
+SENDING_LAB_ID=$1   # value for the SENDING_LAB column (given as input/default when using [--rki])
+shopt -s nullglob   # without any lineage*.csv the loop is skipped instead of running on the literal pattern
+
+echo "IMS_ID,SENDING_LAB,DATE_DRAW,SEQ_TYPE,SEQ_REASON,SAMPLE_TYPE,OWN_FASTA_ID" > rki_report.csv
+
+# Constant columns: IMS_ID and DATE_DRAW are placeholders, "X" is the "unknown" entry for reason and sample type.
+IMS_ID="IMS-00000-CVDP-00000"; DATE_DRAW="YYYYMMDD"; SEQ_TYPE="OXFORD_NANOPORE"; SEQ_REASON="X"; SAMPLE_TYPE="X"
+
+for FILENAME in lineage*.csv; do
+    # Skip the header line, then drop the last five comma-separated columns; what remains is the sample name.
+    tail -n +2 "$FILENAME" | rev | cut -f 6-10 -d "," | rev | while IFS= read -r OWN_FASTA_ID; do
+        [ -n "$OWN_FASTA_ID" ] || continue   # ignore blank lines
+        printf '%s,%s,%s,%s,%s,%s,%s\n' "$IMS_ID" "$SENDING_LAB_ID" "$DATE_DRAW" "$SEQ_TYPE" "$SEQ_REASON" "$SAMPLE_TYPE" "$OWN_FASTA_ID"
+    done
+done >> rki_report.csv
+
diff --git a/configs/nodes.config b/configs/nodes.config
@@ -9,7 +9,7 @@ process {
     withLabel:  pangolin    { cpus = 4; memory = '4 GB' }
     withLabel:  president   { cpus = 1; memory = '2 GB' }
     withLabel:  snippy      { cpus = 4; memory = '4 GB' }
-    withLabel:  ubuntu      { cpus = 1; memory = '2 GB' }
+    withLabel:  ubuntu      { cpus = 2; memory = '2 GB' }
     withLabel:  guppy_cpu   { cpus = 60; memory = '60 GB' }
     withLabel:  guppy_gpu   { cpus = 10; memory = '16 GB' }
 }
diff --git a/data/rki_report/Readme.md b/data/rki_report/Readme.md
@@ -0,0 +1,110 @@
+# RKI Report-file Elements
+
+[toc]
+
+## Source - Please check first
+
+* https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/DESH/DESH.html
+* https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/DESH/Cryptshare-Anleitung.pdf?__blob=publicationFile [v1.1 (2021-01-21)]
+
+## Data Overview
+**The report must contain the following seven elements in this order:**
+
+1. IMS_ID
+2. SENDING_LAB
+3. DATE_DRAW
+4. SEQ_TYPE
+5. SEQ_REASON
+6. SAMPLE_TYPE
+7. OWN_FASTA_ID
+
+In the following a short description of each element is given.
+
+
+### 1. IMS_ID
+
+Sequencing-based pseudonym as unique identifier for the aggregation in DEMIS ("Deutschen Elektronischen Melde- und Informationssystem für den Infektionsschutz"). 
+
+**Format: IMS-12345-CVDP-00001**
+* IMS: permanent prefix 
+* 12345: 5-digit identifier of the sequencing laboratory ("Untersuchungslabor"), analogous to the already existing DEMIS-system (DEMIS-10001 to currently DEMIS-10563). The list is managed by the DEMIS-Geschäftsstelle. If you are not registered yet, please contact demis@rki.de
+* CVDP: 4-digit DEMIS-abbreviation, which is directly dedicated to the "Meldetatbestand". Abbreviations for additional pathogens, aside from SARS-CoV-2, will be applied later.
+* 00001: Consecutive number, which in phase 0 is continued autonomously by the laboratory (later, in phase 1, the system will generate it automatically).
+
+
+### 2. SENDING_LAB
+
+12345: 5-digit identifier of the sending laboratory, analog to the already existing DEMIS-system.
+
+**ATTENTION:**
+This only applies to laboratories which don't sequence on their own, but instead send their samples to other laboratories for sequencing. If the sending lab is also the sequencing lab, the laboratory digits in the IMS_ID and the SENDING_LAB id can be identical.
+
+
+### 3. DATE_DRAW
+
+Date of the sample isolation in ISO 8601 format (YYYYMMDD)
+
+
+### 4. SEQ_TYPE
+
+Used sequencing-platform. "OXFORD_NANOPORE" is provided automatically as entry.
+
+
+### 5. SEQ_REASON
+
+Cause for the sequencing. Choose one entry from the following list:
+
+|Entry|Description|
+|-|-|
+|X|Unknown to the sequencing laboratory|
+|N|No (e. g. random selection of a PCR-positive sample for sequencing)|
+|Y|Yes, but the kind of mutation or variant is unknown (to the sequencing laboratory)|
+|A|Yes, there is evidence for the mutation/variant from previous diagnostics [specified in the text field after the entry letter]|
+
+**Note for "A":**
+* Text field, max. length 64 characters
+* Mutation to be specified in "[ ]"; multiple mutations are separated by "/"
+* Example entry: A[B.1.1.7/B.1.351]
+
+
+### 6. SAMPLE_TYPE
+
+Type of sample. Choose one entry from the following list:
+
+|Entry|Description|
+|-|-|
+|s001|Upper respiratory swab sample (specimen)|
+|s002|Nasopharyngeal swab (specimen)|
+|s003|Swab from nasal sinus (specimen)|
+|s004|Anterior nares swab (specimen)|
+|s005|Oropharyngeal aspirate (specimen)|
+|s006|Nasopharyngeal aspirate (specimen)|
+|s007|Lower respiratory sample (specimen)|
+|s008|Bronchoalveolar lavage fluid sample (specimen)|
+|s009|Sputum specimen (specimen)|
+|s010|Specimen from trachea obtained by aspiration (specimen)|
+|s011|Pleural fluid specimen (specimen)|
+|s012|Specimen from lung obtained by biopsy (specimen)|
+|s013|Blood specimen (specimen)|
+|s014|Plasma specimen or serum specimen or whole blood specimen (specimen)|
+|s015|Whole blood sample (specimen)|
+|s016|Stool specimen (specimen)|
+|s017|Urine specimen (specimen)|
+|s018|Lower respiratory fluid sample (specimen)|
+|s019|Nasopharyngeal washings (specimen)|
+|s020|Plasma specimen (specimen)|
+|s021|Saliva specimen (specimen)|
+|s022|Serum specimen (specimen)|
+|s023|Specimen unsatisfactory for evaluation (finding)|
+|s024|Swab of internal nose (specimen)|
+|s025|Throat swab (specimen)|
+|X|Unknown (to the sequencing laboratory)|
+
+The value set is geared to SNOMED CT and SNOMED CT COVID-19 Related Content (https://simplifier.net/covid-19labormeldung/materialsarscov2)
+
+### 7. OWN_FASTA_ID
+
+Laboratory-internal identifier, which enables the distinct assignment of the FASTA-file to the sequence (given in the FASTA-header). Autoprovided by PoreCov-workflow, according to your sample-names.
+
+**NOTE:**
+Used in phase 0 for the assignment of the metadata from the .csv-file to the sequence-data in the FASTA-file.
diff --git a/data/rki_report/Readme.pdf b/data/rki_report/Readme.pdf
diff --git a/nextflow.config b/nextflow.config
@@ -15,6 +15,7 @@ params {
     fasta = ''
     multifasta = ''
     fastq = ''
+    fastq_raw = ''
     list = false
 
     references = ''
@@ -34,6 +35,7 @@ params {
     one_end = false
 
     // parameters
+    rki=''
     primerV = 'V3'
     minLength = '400'
     maxLength = '700'
@@ -46,17 +48,23 @@ params {
     output = 'results'
     cachedir = "singularity_images"
     databases = "references_nCov19"
+    readsdir = "0.Lineages"
+    readqcdir = "1.Read_quality"
+    genomedir = "2.Genomes"
+    lineagedir = "3.Lineages"
+    rkidir = "4.RKI-summary"
+    runinfodir = "X.Pipeline-runinfo"
 }
 
 // runinfo
 timeline {
   enabled = true
-  file = "${params.output}/runinfo/execution_timeline.html"
+  file = "${params.output}/${params.runinfodir}/execution_timeline.html"
 }
 
 report {
   enabled = true
-  file = "${params.output}/runinfo/execution_report.html"
+  file = "${params.output}/${params.runinfodir}/execution_report.html"
 }
 
 

diff --git a/poreCov.nf b/poreCov.nf
@@ -65,7 +65,7 @@ if ( params.dir || workflow.profile.contains('test_fast5') ) { basecalling() }
 
 // params help
 if (!workflow.profile.contains('test_fastq') && !workflow.profile.contains('test_fast5') && !workflow.profile.contains('test_fasta')) {
-    if (!params.fasta &&  !params.dir &&  !params.fastq ) {
+    if (!params.fasta &&  !params.dir &&  !params.fastq &&  !params.fastq_raw ) {
         exit 1, "input missing, use [--fasta] [--fastq] or [--dir]"}
     if ((params.fasta && ( params.fastq || params.dir )) || ( params.fastq && params.dir )) {
         exit 1, "To much inputs: please us either: [--fasta], [--fastq] or [--dir]"} 
@@ -103,17 +103,24 @@ if (!workflow.profile.contains('test_fastq') && !workflow.profile.contains('test
 
 // fastq input or via csv file
     if (params.fastq && params.list && !workflow.profile.contains('test_fastq')) { 
-        fastq_input_ch = Channel
+        fastq_file_ch = Channel
         .fromPath( params.fastq, checkIfExists: true )
         .splitCsv()
         .map { row -> ["${row[0]}", file("${row[1]}", checkIfExists: true)] }
     }
     else if (params.fastq && !workflow.profile.contains('test_fastq')) { 
-        fastq_input_ch = Channel
+        fastq_file_ch = Channel
         .fromPath( params.fastq, checkIfExists: true)
         .map { file -> tuple(file.simpleName, file) }
     }
 
+// fastq raw input direct from basecalling
+    else if (params.fastq_raw && !workflow.profile.contains('test_fastq')) { 
+        fastq_dir_ch = Channel
+        .fromPath( params.fastq_raw, checkIfExists: true, type: 'dir')
+        .map { file -> tuple(file.simpleName, file) }
+    }
+
 // dir input
     if (params.dir && !workflow.profile.contains('test_fast5')) { dir_input_ch = Channel
         .fromPath( params.dir, checkIfExists: true, type: 'dir')
@@ -135,10 +142,12 @@ include { get_fast5 } from './modules/get_fast5_test_data.nf'
 include { artic_ncov_wf } from './workflows/artic_nanopore_nCov19.nf'
 include { basecalling_wf } from './workflows/basecalling.nf'
 include { build_database_wf } from './workflows/databases.nf'
+include { collect_fastq_wf } from './workflows/collect_fastq.nf'
 include { create_tree_wf } from './workflows/create_tree.nf'
 include { determine_lineage_wf } from './workflows/determine_lineage.nf'
 include { genome_quality_wf } from './workflows/genome_quality.nf'
 include { read_qc_wf } from './workflows/read_qc.nf'
+include { rki_report_wf } from './workflows/provide_rki.nf'
 include { toytree_wf } from './workflows/toytree.nf'
 
 /************************** 
@@ -152,17 +161,31 @@ workflow {
         if ( workflow.profile.contains('test_fast5')) { dir_input_ch =  get_fast5().map {it -> ['SARSCoV2', it] } }
 
     // 1. Reconstruct genomes
+        // fast5
         if (params.dir || workflow.profile.contains('test_fast5')) { 
             fasta_input_ch = artic_ncov_wf(basecalling_wf(dir_input_ch))
         }
-        if (params.fastq || workflow.profile.contains('test_fastq')) { 
+        // fastq input via dir and or files
+        if ( (params.fastq || params.fastq_raw) || workflow.profile.contains('test_fastq')) { 
+            if (params.fastq_raw && !params.fastq) { fastq_input_ch = collect_fastq_wf(fastq_dir_ch) }
+            if (!params.fastq_raw && params.fastq) { fastq_input_ch = fastq_file_ch }
+            if (params.fastq_raw && params.fastq) { fastq_input_ch = collect_fastq_wf(fastq_dir_ch).mix(fastq_file_ch) }
+
             read_qc_wf(fastq_input_ch)
             fasta_input_ch = artic_ncov_wf(fastq_input_ch)
         }
 
     // 2. Genome quality and lineages
         determine_lineage_wf(fasta_input_ch)
         genome_quality_wf(fasta_input_ch, reference_for_qc_input_ch)
+        if (params.rki) { 
+            // prepare metadata table
+            rki_report_wf(determine_lineage_wf.out)
+            // collect a multifasta file
+            fasta_input_ch
+                .map { it -> it[1] } 
+                .collectFile(name: 'all_genomes.fasta', storeDir: params.output + "/" + params.rkidir +"/")
+        }
 
 
     // 3. (optional) analyse genomes to references and build tree
@@ -214,10 +237,17 @@ def helpMSG() {
     --fastq         one fastq or fastq.gz file per sample or
                     multiple file-samples: --fastq 'sample_*.fasta.gz'
                     ${c_dim}[nCov genome reconstruction]${c_reset}
+    --fastq_raw     raw directory from guppy with basecalled .fastq files
+                    --fastq_raw 'basecalls/'
+                    add --single flag if you dont have barcodes (single sample)
+                    ${c_dim}[nCov genome reconstruction]${c_reset}
 
     --fasta         direct input of genomes, one file per genome
                     ${c_dim}[Lineage determination, Quality control]${c_reset}
 
+    ${c_yellow}Workflow control ${c_reset}
+    --rki           5-digit DEMIS identifier of sending laboratory for RKI style summary
+
     ${c_yellow}Parameters - Basecalling${c_reset}
     --localguppy    use a native guppy installation instead of a gpu-guppy-docker 
                     native guppy installation is used by default for singularity or conda
@@ -259,7 +289,8 @@ def helpMSG() {
                     [default: $params.cachedir] 
 
     ${c_yellow}Execution/Engine profiles:${c_reset}
-    poreCov supports profiles to run via different ${c_green}Executers${c_reset} and ${c_blue}Engines${c_reset} e.g.:
+    poreCov supports profiles to run via different ${c_green}Executers${c_reset} and ${c_blue}Engines${c_reset} 
+    examples:
      -profile ${c_green}local${c_reset},${c_blue}docker${c_reset}
      -profile ${c_yellow}test_fastq${c_reset},${c_green}slurm${c_reset},${c_blue}singularity${c_reset}
 
@@ -305,9 +336,9 @@ def defaultMSG(){
 
 def v1200_MSG() {
     log.info """
-    1200 bp options are used as primer scheme (V1200)
-      --minLength set to 250bp
-      --maxLength set to 1500bp
+    1200 bp amplicon scheme is used [--primerV V1200]
+    \033[2m  --minLength set to 250bp
+      --maxLength set to 1500bp\u001B[0m
     \u001B[1;30m______________________________________\033[0m
     """.stripIndent()
 }
@@ -321,3 +352,4 @@ def basecalling() {
     \u001B[1;30m______________________________________\033[0m
     """.stripIndent()
 }
+
diff --git a/workflows/collect_fastq.nf b/workflows/collect_fastq.nf
@@ -0,0 +1,17 @@
+include { collect_fastq } from './process/collect_fastq'
+
+workflow collect_fastq_wf {   // sub-workflow: runs collect_fastq on the input and emits a channel of fastq entries
+    take: 
+        fastq_dir    // channel handed straight to the collect_fastq process; poreCov.nf passes (name, directory) tuples
+    main:
+        collect_fastq(fastq_dir)
+
+        if (params.single) { fastq_channel = collect_fastq.out }   // params.single set: process output is forwarded unchanged
+        else { fastq_channel = collect_fastq.out   // otherwise every file is re-keyed by its own name
+                            .map { it -> it[1] }   // keep only the second tuple element (presumably the fastq file(s) - confirm against the process)
+                            .flatten()   // emit each element as a separate channel item
+                            .map { it -> [ it.simpleName, it ] }   // pair each item with its simpleName
+        }
+
+    emit: fastq_channel
+} 
diff --git a/workflows/determine_lineage.nf b/workflows/determine_lineage.nf
@@ -8,13 +8,13 @@ workflow determine_lineage_wf {
         pangolin(fasta)
 
         // collect lineage also to a summary     
-        channel_tmp = pangolin.out[1]
+        channel_tmp = pangolin.out.map {it -> it[1]}
                 .splitCsv(header: true, sep: ',')
-                .collectFile(seed: 'taxon,lineage,probability,pangoLEARN_version,status,note\n', 
-                            storeDir: params.output + "/summary/") {
+                .collectFile(seed: 'sequence_name,lineage,probability,pangoLEARN_version,status,note\n', 
+                            storeDir: params.output + "/" + params.lineagedir + "/") {
                             row -> [ "metadata.tsv", row.taxon + ',' + row.lineage + ',' + row.probability + ',' + 
                             row.'pangoLEARN_version' + ',' + row.status + ',' + row.note + '\n']
                             }
     emit:
-        pangolin.out[0]
-} 
+        pangolin.out
+} 
diff --git a/workflows/process/artic.nf b/workflows/process/artic.nf
@@ -1,6 +1,6 @@
 process artic {
         label 'artic'
-        publishDir "${params.output}/fasta/${name}/", mode: 'copy'
+        publishDir "${params.output}/${params.genomedir}/${name}/", mode: 'copy'
     input:
         tuple val(name), path(reads)
     output:
@@ -11,12 +11,14 @@ process artic {
         artic minion --medaka --normalise 200 --threads ${task.cpus} --scheme-directory /artic-ncov2019/primer_schemes \
             --read-file ${reads} nCoV-2019/${params.primerV} ${name}
         zcat ${name}.pass.vcf.gz > SNP_${name}.pass.vcf
+
+        sed -i "1s/.*/>${name}/" *.consensus.fasta
         """
 }
 
 process artic_V1200 {
         label 'artic'
-        publishDir "${params.output}/fasta/${name}/", mode: 'copy'
+        publishDir "${params.output}/${params.genomedir}/${name}/", mode: 'copy'
     input:
         tuple val(name), path(reads), path(external_scheme)
     output:
@@ -27,6 +29,7 @@ process artic_V1200 {
         artic minion --medaka --normalise 200 --threads ${task.cpus} --scheme-directory ${external_scheme} \
             --read-file ${reads} nCoV-2019/${params.primerV} ${name}
         zcat ${name}.pass.vcf.gz > SNP_${name}.pass.vcf
-        """
 
+        sed -i "1s/.*/>${name}/" *.consensus.fasta
+        """
 }