From d149532e5d3c4a379bbd3991da922efc3426c4ed Mon Sep 17 00:00:00 2001 From: Damon-Lee Pointon <51855558+DLBPointon@users.noreply.github.com> Date: Tue, 15 Jul 2025 16:37:08 +0100 Subject: [PATCH 01/37] dp24-Busco-6-hotfix A quick patch to update busco 5.8 to busco 6 --- modules/nf-core/busco/busco/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/busco/busco/main.nf b/modules/nf-core/busco/busco/main.nf index 6435dd7f..c296a8cb 100644 --- a/modules/nf-core/busco/busco/main.nf +++ b/modules/nf-core/busco/busco/main.nf @@ -3,8 +3,8 @@ process BUSCO_BUSCO { conda "${moduleDir}/environment.yml" container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container - ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c6/c607f319867d96a38c8502f751458aa78bbd18fe4c7c4fa6b9d8350e6ba11ebe/data' - : 'community.wave.seqera.io/library/busco_sepp:f2dbc18a2f7a5b64'}" + ? 'https://depot.galaxyproject.org/singularity/busco:6.0.0--pyhdfd78af_0' + : 'biocontainers/busco:6.0.0--pyhdfd78af_0'}" input: tuple val(meta), path(fasta, stageAs:'tmp_input/*') From e65ac0600b32cc13eac98e8d9ffadf62d5f95915 Mon Sep 17 00:00:00 2001 From: Damon-Lee Pointon <51855558+DLBPointon@users.noreply.github.com> Date: Wed, 16 Jul 2025 13:35:42 +0100 Subject: [PATCH 02/37] Update main.nf Update to wave containers --- modules/nf-core/busco/busco/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/busco/busco/main.nf b/modules/nf-core/busco/busco/main.nf index c296a8cb..6da2fc06 100644 --- a/modules/nf-core/busco/busco/main.nf +++ b/modules/nf-core/busco/busco/main.nf @@ -3,8 +3,8 @@ process BUSCO_BUSCO { conda "${moduleDir}/environment.yml" container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container - ? 'https://depot.galaxyproject.org/singularity/busco:6.0.0--pyhdfd78af_0' - : 'biocontainers/busco:6.0.0--pyhdfd78af_0'}" + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5a/5addbdeb1831564b96f31cd130ee7d89f41f56252795176052eef9de4d3bae3a/data' + : 'community.wave.seqera.io/library/augustus_busco:b6a688c8989c7a72'}" input: tuple val(meta), path(fasta, stageAs:'tmp_input/*') From 009de8e3d6e61fc7bc0dffc97c7690c196c34330 Mon Sep 17 00:00:00 2001 From: Damon-Lee Pointon <51855558+DLBPointon@users.noreply.github.com> Date: Wed, 16 Jul 2025 13:56:24 +0100 Subject: [PATCH 03/37] Update main.nf New new containers --- modules/nf-core/busco/busco/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/nf-core/busco/busco/main.nf b/modules/nf-core/busco/busco/main.nf index 6da2fc06..859bff9d 100644 --- a/modules/nf-core/busco/busco/main.nf +++ b/modules/nf-core/busco/busco/main.nf @@ -3,8 +3,8 @@ process BUSCO_BUSCO { conda "${moduleDir}/environment.yml" container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container - ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5a/5addbdeb1831564b96f31cd130ee7d89f41f56252795176052eef9de4d3bae3a/data' - : 'community.wave.seqera.io/library/augustus_busco:b6a688c8989c7a72'}" + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/41/4137d65ab5b90d2ae4fa9d3e0e8294ddccc287e53ca653bb3c63b8fdb03e882f/data' + : 'community.wave.seqera.io/library/busco:6.0.0--a9a1426105f81165'}" input: tuple val(meta), path(fasta, stageAs:'tmp_input/*') From c2aaac380f288b252f75be20f0444901a401289a Mon Sep 17 00:00:00 2001 From: Damon-Lee Pointon <51855558+DLBPointon@users.noreply.github.com> Date: Wed, 16 Jul 2025 13:56:59 +0100 Subject: [PATCH 04/37] Update environment.yml Env update --- modules/nf-core/busco/busco/environment.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/modules/nf-core/busco/busco/environment.yml b/modules/nf-core/busco/busco/environment.yml index ba8a40c0..5cbaae1b 100644 --- a/modules/nf-core/busco/busco/environment.yml +++ b/modules/nf-core/busco/busco/environment.yml @@ -1,9 +1,7 @@ --- # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - - conda-forge - - bioconda - +- conda-forge +- bioconda dependencies: - - bioconda::busco=5.8.3 - - bioconda::sepp=4.5.5 +- bioconda::busco=6.0.0 From c14c295bdf64e7465721f5ca1bd4a901db57e10d Mon Sep 17 00:00:00 2001 From: Damon-Lee Pointon <51855558+DLBPointon@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:01:18 +0100 Subject: [PATCH 05/37] Update base.config Addition of resources for BLASTN_TAXON I have a clutch of assemblies which are all failing at this step due to memory. I've added a multiplier too so they don't end up in a constant cycle of life and death at the same mem requirement. 
--- conf/base.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/conf/base.config b/conf/base.config index a56b8807..23e897d3 100644 --- a/conf/base.config +++ b/conf/base.config @@ -110,6 +110,12 @@ process { time = 12.h } + withName: "BLAST_TAXON" { + cpus = 4 + memory = { 2.GB * task.attempt } + time = 12.h + } + withName: "NOHIT_LIST" { cpus = { task.attempt } memory = { 1.GB * Math.pow(4, task.attempt) } From 54aa4136ca7d9d3a957d86692049ffcafd483af8 Mon Sep 17 00:00:00 2001 From: Damon-Lee Pointon <51855558+DLBPointon@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:05:22 +0100 Subject: [PATCH 06/37] Update base.config Damon should use his eyes more --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 23e897d3..34d5e9d0 100644 --- a/conf/base.config +++ b/conf/base.config @@ -110,7 +110,7 @@ process { time = 12.h } - withName: "BLAST_TAXON" { + withName: "BLASTN_TAXON" { cpus = 4 memory = { 2.GB * task.attempt } time = 12.h From 118e8ca5cc35ab53c36e635d2c39b042cf7e21e3 Mon Sep 17 00:00:00 2001 From: Damon-Lee Pointon <51855558+DLBPointon@users.noreply.github.com> Date: Wed, 16 Jul 2025 19:27:46 +0100 Subject: [PATCH 07/37] Update base.config Update based on comments from @muffato --- conf/base.config | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/conf/base.config b/conf/base.config index 34d5e9d0..a429ef5f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -104,13 +104,7 @@ process { time = { 3.h * Math.ceil(meta.genome_size/1000000000) * task.attempt } } - withName: "BLAST_BLASTN" { - cpus = 4 - memory = 2.GB - time = 12.h - } - - withName: "BLASTN_TAXON" { + withName: "BLAST_BLASTN|BLASTN_TAXON" { cpus = 4 memory = { 2.GB * task.attempt } time = 12.h From 90e88a58e7c2e1c2833297081d968cb7522febd9 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 16 Jul 2025 21:57:47 +0100 Subject: [PATCH 08/37] Test data have moved off 
/lustre/scratch123 --- assets/test/samplesheet.csv | 8 ++++---- assets/test/samplesheet_raw.csv | 6 +++--- assets/test_full/full_samplesheet.csv | 4 ++-- conf/test_full.config | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/assets/test/samplesheet.csv b/assets/test/samplesheet.csv index 2431b0e0..01ba17e8 100644 --- a/assets/test/samplesheet.csv +++ b/assets/test/samplesheet.csv @@ -1,5 +1,5 @@ sample,datatype,datafile,library_layout -mMelMel3_hic,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram,PAIRED -mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram,PAIRED -mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram,PAIRED -mMelMel3_ont,ont,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram,SINGLE +mMelMel3_hic,hic,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram,PAIRED +mMelMel1,illumina,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram,PAIRED +mMelMel2,illumina,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram,PAIRED 
+mMelMel3_ont,ont,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram,SINGLE diff --git a/assets/test/samplesheet_raw.csv b/assets/test/samplesheet_raw.csv index 53a5a42e..881dd51f 100644 --- a/assets/test/samplesheet_raw.csv +++ b/assets/test/samplesheet_raw.csv @@ -1,4 +1,4 @@ sample,datatype,datafile,library_layout -mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram,PAIRED -mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram,PAIRED -mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram,PAIRED +mMelMel1,illumina,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram,PAIRED +mMelMel2,illumina,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram,PAIRED +mMelMel3,hic,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram,PAIRED diff --git a/assets/test_full/full_samplesheet.csv b/assets/test_full/full_samplesheet.csv index fb673840..b3aa6744 100644 --- a/assets/test_full/full_samplesheet.csv +++ b/assets/test_full/full_samplesheet.csv @@ -1,3 +1,3 @@ sample,datatype,datafile,library_layout -gfLaeSulp1_hic,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram,PAIRED -gfLaeSulp1_pacbio,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram,SINGLE 
+gfLaeSulp1_hic,hic,/nfs/treeoflife-01/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram,PAIRED +gfLaeSulp1_pacbio,pacbio,/nfs/treeoflife-01/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram,SINGLE diff --git a/conf/test_full.config b/conf/test_full.config index a86e0050..c62ca184 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -26,7 +26,7 @@ params { // Databases taxdump = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" - busco = "/lustre/scratch123/tol/resources/busco/latest" + busco = "/data/tol/resources/busco/latest" blastp = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/resources/gfLaeSulp1.1.buscogenes.dmnd.tar.gz" blastx = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/resources/gfLaeSulp1.1.buscoregions.dmnd.tar.gz" blastn = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/resources/nt_gfLaeSulp1.1.tar.gz" From b8778147274e94e525bc67d452be63763c6b77fb Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 19 Aug 2025 15:42:34 +0100 Subject: [PATCH 09/37] Updates post nf-core merge --- .nf-core.yml | 2 +- CHANGELOG.md | 15 ++ modules.json | 2 +- modules/nf-core/busco/busco/busco-busco.diff | 2 +- modules/nf-core/busco/busco/environment.yml | 14 +- modules/nf-core/busco/busco/main.nf | 50 +++--- modules/nf-core/busco/busco/meta.yml | 117 +++++++------ .../nf-core/busco/busco/tests/main.nf.test | 78 +-------- .../busco/busco/tests/main.nf.test.snap | 156 +++++++----------- nextflow.config | 2 +- 10 files changed, 188 insertions(+), 250 deletions(-) diff --git a/.nf-core.yml b/.nf-core.yml index 902933b1..a6039fde 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -43,4 +43,4 @@ template: outdir: . 
skip_features: - igenomes - version: 0.8.0 + version: 0.8.1 diff --git a/CHANGELOG.md b/CHANGELOG.md index 3de31c99..d63065c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[0.8.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.8.1)] – Sprigatito (H1) – [2025-08-19] + +### Enhancements & fixes + +- Upgrade Busco (#190) +- Update resource requirements for BLASTN modules (#191) + +### Software dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported. 
+ +| Dependency | Old version | New version | +| ---------- | ----------- | ----------- | +| busco | 5.8.3 | 6.0.0 | + ## [[0.8.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.8.0)] – Sprigatito – [2025-05-19] ### Enhancements & fixes diff --git a/modules.json b/modules.json index 7278c96d..17b2e3b4 100644 --- a/modules.json +++ b/modules.json @@ -13,7 +13,7 @@ }, "busco/busco": { "branch": "master", - "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", + "git_sha": "de5b67889ef5f53d6996fb4dac300ed399a51212", "installed_by": ["modules"], "patch": "modules/nf-core/busco/busco/busco-busco.diff" }, diff --git a/modules/nf-core/busco/busco/busco-busco.diff b/modules/nf-core/busco/busco/busco-busco.diff index 92a42df3..b0a3cbb4 100644 --- a/modules/nf-core/busco/busco/busco-busco.diff +++ b/modules/nf-core/busco/busco/busco-busco.diff @@ -11,7 +11,7 @@ Changes in 'busco/busco/main.nf': conda "${moduleDir}/environment.yml" container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container -@@ -45,7 +44,7 @@ +@@ -51,7 +50,7 @@ def busco_lineage = lineage in ['auto', 'auto_prok', 'auto_euk'] ? 
lineage.replaceFirst('auto', '--auto-lineage').replaceAll('_', '-') : "--lineage_dataset ${lineage}" diff --git a/modules/nf-core/busco/busco/environment.yml b/modules/nf-core/busco/busco/environment.yml index 5cbaae1b..42959e73 100644 --- a/modules/nf-core/busco/busco/environment.yml +++ b/modules/nf-core/busco/busco/environment.yml @@ -1,7 +1,7 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: -- conda-forge -- bioconda -dependencies: -- bioconda::busco=6.0.0 +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::busco=6.0.0 diff --git a/modules/nf-core/busco/busco/main.nf b/modules/nf-core/busco/busco/main.nf index 859bff9d..133883ea 100644 --- a/modules/nf-core/busco/busco/main.nf +++ b/modules/nf-core/busco/busco/main.nf @@ -5,31 +5,37 @@ process BUSCO_BUSCO { container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/41/4137d65ab5b90d2ae4fa9d3e0e8294ddccc287e53ca653bb3c63b8fdb03e882f/data' : 'community.wave.seqera.io/library/busco:6.0.0--a9a1426105f81165'}" + // Note: one test had to be disabled when switching to Busco 6.0.0, cf https://github.com/nf-core/modules/pull/8781/files + // Try to restore it when upgrading Busco to a later version input: - tuple val(meta), path(fasta, stageAs:'tmp_input/*') - val mode // Required: One of genome, proteins, or transcriptome - val lineage // Required: lineage for checking against, or "auto/auto_prok/auto_euk" for enabling auto-lineage - path busco_lineages_path // Recommended: BUSCO lineages file - downloads if not set - path config_file // Optional: BUSCO configuration file - val clean_intermediates // Optional: Remove intermediate files + tuple val(meta), path(fasta, stageAs: 'tmp_input/*') + // Required: One of genome, proteins, or transcriptome + val mode + // Required: lineage for checking against, or "auto/auto_prok/auto_euk" for enabling auto-lineage + val lineage + // Recommended: BUSCO lineages file - downloads if not set + path busco_lineages_path + // Optional: BUSCO configuration file + path config_file + val clean_intermediates output: - tuple val(meta), path("*-busco.batch_summary.txt") , emit: batch_summary - tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt , optional: true - tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true - tuple val(meta), path("*-busco.log") , emit: log , optional: true - tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table , optional: true - tuple val(meta), path("*-busco/*/run_*/missing_busco_list.tsv") , emit: missing_busco_list , optional: true - tuple val(meta), path("*-busco/*/run_*/single_copy_proteins.faa") , emit: single_copy_proteins, optional: true - tuple val(meta), path("*-busco/*/run_*/busco_sequences") , emit: seq_dir , optional: 
true - tuple val(meta), path("*-busco/*/translated_proteins") , emit: translated_dir , optional: true - tuple val(meta), path("*-busco") , emit: busco_dir - tuple val(meta), path("busco_downloads/lineages/*") , emit: downloaded_lineages , optional: true - tuple val(meta), path("*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.faa"), emit: single_copy_faa , optional: true - tuple val(meta), path("*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.fna"), emit: single_copy_fna , optional: true + tuple val(meta), path("*-busco.batch_summary.txt"), emit: batch_summary + tuple val(meta), path("short_summary.*.txt"), emit: short_summaries_txt, optional: true + tuple val(meta), path("short_summary.*.json"), emit: short_summaries_json, optional: true + tuple val(meta), path("*-busco.log"), emit: log, optional: true + tuple val(meta), path("*-busco/*/run_*/full_table.tsv"), emit: full_table, optional: true + tuple val(meta), path("*-busco/*/run_*/missing_busco_list.tsv"), emit: missing_busco_list, optional: true + tuple val(meta), path("*-busco/*/run_*/single_copy_proteins.faa"), emit: single_copy_proteins, optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences"), emit: seq_dir, optional: true + tuple val(meta), path("*-busco/*/translated_proteins"), emit: translated_dir, optional: true + tuple val(meta), path("*-busco"), emit: busco_dir + tuple val(meta), path("busco_downloads/lineages/*"), emit: downloaded_lineages, optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.faa"), emit: single_copy_faa, optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.fna"), emit: single_copy_fna, optional: true - path "versions.yml" , emit: versions + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -105,7 +111,7 @@ process BUSCO_BUSCO { cat <<-END_VERSIONS > versions.yml "${task.process}": - busco: \$( busco --version 
2>&1 | sed 's/^BUSCO //' ) + busco: \$( busco --version 2> /dev/null | sed 's/BUSCO //g' ) END_VERSIONS """ @@ -118,7 +124,7 @@ process BUSCO_BUSCO { cat <<-END_VERSIONS > versions.yml "${task.process}": - busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + busco: \$( busco --version 2> /dev/null | sed 's/BUSCO //g' ) END_VERSIONS """ } diff --git a/modules/nf-core/busco/busco/meta.yml b/modules/nf-core/busco/busco/meta.yml index 0222e490..281e3db0 100644 --- a/modules/nf-core/busco/busco/meta.yml +++ b/modules/nf-core/busco/busco/meta.yml @@ -26,26 +26,28 @@ input: type: file description: Nucleic or amino acid sequence file in FASTA format. pattern: "*.{fasta,fna,fa,fasta.gz,fna.gz,fa.gz}" - - - mode: - type: string - description: The mode to run Busco in. One of genome, proteins, or transcriptome - pattern: "{genome,proteins,transcriptome}" - - - lineage: - type: string - description: The BUSCO lineage to use, or "auto", "auto_prok" or "auto_euk" - to automatically select lineage - - - busco_lineages_path: - type: directory - description: Path to local BUSCO lineages directory. - - - config_file: - type: file - description: Path to BUSCO config file. - - - clean_intermediates: - type: boolean - description: Flag to remove intermediate files. + ontologies: [] + - mode: + type: string + description: The mode to run Busco in. One of genome, proteins, or transcriptome + pattern: "{genome,proteins,transcriptome}" + - lineage: + type: string + description: The BUSCO lineage to use, or "auto", "auto_prok" or "auto_euk" to + automatically select lineage + - busco_lineages_path: + type: directory + description: Path to local BUSCO lineages directory. + - config_file: + type: file + description: Path to BUSCO config file. + ontologies: [] + - clean_intermediates: + type: boolean + description: Flag to remove intermediate files. 
output: - - batch_summary: - - meta: + batch_summary: + - - meta: type: map description: | Groovy Map containing sample information @@ -54,8 +56,9 @@ output: type: file description: Summary of all sequence files analyzed pattern: "*-busco.batch_summary.txt" - - short_summaries_txt: - - meta: + ontologies: [] + short_summaries_txt: + - - meta: type: map description: | Groovy Map containing sample information @@ -64,8 +67,9 @@ output: type: file description: Short Busco summary in plain text format pattern: "short_summary.*.txt" - - short_summaries_json: - - meta: + ontologies: [] + short_summaries_json: + - - meta: type: map description: | Groovy Map containing sample information @@ -74,8 +78,10 @@ output: type: file description: Short Busco summary in JSON format pattern: "short_summary.*.json" - - log: - - meta: + ontologies: + - edam: http://edamontology.org/format_3464 # JSON + log: + - - meta: type: map description: | Groovy Map containing sample information @@ -84,8 +90,9 @@ output: type: file description: BUSCO main log pattern: "*-busco.log" - - full_table: - - meta: + ontologies: [] + full_table: + - - meta: type: map description: | Groovy Map containing sample information @@ -94,8 +101,10 @@ output: type: file description: Full BUSCO results table pattern: "full_table.tsv" - - missing_busco_list: - - meta: + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + missing_busco_list: + - - meta: type: map description: | Groovy Map containing sample information @@ -104,8 +113,10 @@ output: type: file description: List of missing BUSCOs pattern: "missing_busco_list.tsv" - - single_copy_proteins: - - meta: + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + single_copy_proteins: + - - meta: type: map description: | Groovy Map containing sample information @@ -114,8 +125,9 @@ output: type: file description: Fasta file of single copy proteins (transcriptome mode) pattern: "single_copy_proteins.faa" - - seq_dir: - - meta: + ontologies: 
[] + seq_dir: + - - meta: type: map description: | Groovy Map containing sample information @@ -124,8 +136,8 @@ output: type: directory description: BUSCO sequence directory pattern: "busco_sequences" - - translated_dir: - - meta: + translated_dir: + - - meta: type: map description: | Groovy Map containing sample information @@ -135,8 +147,8 @@ output: description: Six frame translations of each transcript made by the transcriptome mode pattern: "translated_dir" - - busco_dir: - - meta: + busco_dir: + - - meta: type: map description: | Groovy Map containing sample information @@ -145,18 +157,19 @@ output: type: directory description: BUSCO lineage specific output pattern: "*-busco" - - downloaded_lineages: - - meta: + downloaded_lineages: + - - meta: type: map description: | Groovy Map containing sample information e.g. [ id:'test' ] - - "busco_downloads/lineages/*": + - busco_downloads/lineages/*: type: directory - description: Lineages downloaded by BUSCO when running the analysis, for example bacteria_odb12 + description: Lineages downloaded by BUSCO when running the analysis, for example + bacteria_odb12 pattern: "busco_downloads/lineages/*" - - single_copy_faa: - - meta: + single_copy_faa: + - - meta: type: map description: | Groovy Map containing sample information @@ -165,8 +178,9 @@ output: type: file description: Single copy .faa sequence files pattern: "*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.faa" - - single_copy_fna: - - meta: + ontologies: [] + single_copy_fna: + - - meta: type: map description: | Groovy Map containing sample information @@ -175,11 +189,14 @@ output: type: file description: Single copy .fna sequence files pattern: "*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.fna" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + 
pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML authors: - "@priyanka-surana" - "@charles-plessy" diff --git a/modules/nf-core/busco/busco/tests/main.nf.test b/modules/nf-core/busco/busco/tests/main.nf.test index 411ceb86..370e542d 100644 --- a/modules/nf-core/busco/busco/tests/main.nf.test +++ b/modules/nf-core/busco/busco/tests/main.nf.test @@ -24,7 +24,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) ] input[1] = 'genome' - input[2] = 'bacteria_odb12' // Launch with 'auto' to use --auto-lineage, and specified lineages // 'auto' removed from test due to memory issues + input[2] = 'bacteria_odb10' // Launch with 'auto' to use --auto-lineage, and specified lineages // 'auto' removed from test due to memory issues input[3] = [] // Download busco lineage input[4] = [] // No config input[5] = false // Clean intermediates @@ -92,7 +92,7 @@ nextflow_process { ] ] input[1] = 'genome' - input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = false @@ -163,72 +163,6 @@ nextflow_process { } - test("test_busco_eukaryote_metaeuk") { - - config './nextflow.config' - - when { - params { - busco_args = '--tar --metaeuk' - } - process { - """ - input[0] = [ - [ id:'test' ], // meta map - file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) - ] - input[1] = 'genome' - input[2] = 'eukaryota_odb10' - input[3] = [] - input[4] = [] - input[5] = false - """ - } - } - - then { - assert process.success - - with(path(process.out.short_summaries_txt[0][1]).text) { - assert contains('BUSCO version') - assert contains('The lineage dataset is') - assert contains('BUSCO was run in mode') - assert contains('Complete BUSCOs') - assert contains('Missing BUSCOs') - assert contains('Dependencies and versions') - } - - 
with(path(process.out.short_summaries_json[0][1]).text) { - assert contains('one_line_summary') - assert contains('mode') - assert contains('dataset') - } - - assert snapshot( - process.out.batch_summary[0][1], - process.out.full_table[0][1], - process.out.missing_busco_list[0][1], - process.out.versions[0] - ).match() - - with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { - assert contains('single_copy_busco_sequences.tar.gz') - assert contains('multi_copy_busco_sequences.tar.gz') - assert contains('fragmented_busco_sequences.tar.gz') - } - - with(path(process.out.log[0][1]).text) { - assert contains('DEBUG:busco.run_BUSCO') - assert contains('Results from dataset') - assert contains('how to cite BUSCO') - - } - - assert process.out.single_copy_proteins == [] - assert process.out.translated_dir == [] - } - - } test("test_busco_eukaryote_augustus") { @@ -292,7 +226,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true) ] input[1] = 'proteins' - input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = false @@ -358,7 +292,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fasta/test1.contigs.fa.gz', checkIfExists: true) ] input[1] = 'transcriptome' - input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = false @@ -423,7 +357,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) ] input[1] = 'genome' - input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = true @@ -467,7 +401,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) ] input[1] = 'genome' - 
input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = false diff --git a/modules/nf-core/busco/busco/tests/main.nf.test.snap b/modules/nf-core/busco/busco/tests/main.nf.test.snap index 1026524b..5de40123 100644 --- a/modules/nf-core/busco/busco/tests/main.nf.test.snap +++ b/modules/nf-core/busco/busco/tests/main.nf.test.snap @@ -6,157 +6,123 @@ { "id": "test" }, - "test-bacteria_odb12-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], [ - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ] ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:50:57.218573431" + "timestamp": "2025-07-21T16:11:16.371060201" }, "test_busco_eukaryote_augustus": { "content": [ "test-eukaryota_odb10-busco.batch_summary.txt:md5,3ea3bdc423a461dae514d816bdc61c89", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:44:25.359421247" + "timestamp": "2025-07-21T16:09:47.906365972" }, "test_busco_genome_single_fasta": { "content": [ - "test-bacteria_odb12-busco.batch_summary.txt:md5,e3e503e1540b633d95c273c465945740", - "full_table.tsv:md5,086f2ecdc90d47745c828c9b25357039", - "missing_busco_list.tsv:md5,9919aee2da9d30a3985aede354850a46", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "test-bacteria_odb10-busco.batch_summary.txt:md5,12e911830d66bab6dbf3523ac4392597", + "full_table.tsv:md5,660e2f556ca6efa97f0c2a8cebd94786", + "missing_busco_list.tsv:md5,0e08587f4dc65d9226a31433c1f9ba25", + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": 
"2025-03-12T10:41:46.251404188" + "timestamp": "2025-07-21T16:08:41.497678114" }, "test_busco_genome_multi_fasta": { "content": [ [ - "full_table.tsv:md5,5a6bf59055e2040e74797a1e36c8e374", - "full_table.tsv:md5,086f2ecdc90d47745c828c9b25357039" + "full_table.tsv:md5,26b1d35d975593834acb4d4a91e225a1", + "full_table.tsv:md5,660e2f556ca6efa97f0c2a8cebd94786" ], [ - "missing_busco_list.tsv:md5,a55eee6869fad9176d812e59886232fb", - "missing_busco_list.tsv:md5,9919aee2da9d30a3985aede354850a46" + "missing_busco_list.tsv:md5,5dcdc7707035904a7d467ca1026b399a", + "missing_busco_list.tsv:md5,0e08587f4dc65d9226a31433c1f9ba25" ], - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:42:28.126899794" - }, - "test_busco_eukaryote_metaeuk": { - "content": [ - "test-eukaryota_odb10-busco.batch_summary.txt:md5,ff6d8277e452a83ce9456bbee666feb6", - "full_table.tsv:md5,cfb55ab2ce590d2def51926324691aa8", - "missing_busco_list.tsv:md5,77e3d4503b2c13db0d611723fc83ab7e", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-03-12T10:43:59.997031348" + "timestamp": "2025-07-21T16:09:25.578789984" }, "test_busco_cleanup": { "content": [ - "test-bacteria_odb12-busco.batch_summary.txt:md5,e3e503e1540b633d95c273c465945740", - "full_table.tsv:md5,086f2ecdc90d47745c828c9b25357039", - "missing_busco_list.tsv:md5,9919aee2da9d30a3985aede354850a46", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "test-bacteria_odb10-busco.batch_summary.txt:md5,12e911830d66bab6dbf3523ac4392597", + "full_table.tsv:md5,660e2f556ca6efa97f0c2a8cebd94786", + "missing_busco_list.tsv:md5,0e08587f4dc65d9226a31433c1f9ba25", + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - 
"timestamp": "2025-03-12T10:50:48.928173488" + "timestamp": "2025-07-21T16:11:08.495786376" }, "test_busco_transcriptome": { "content": [ - "test-bacteria_odb12-busco.batch_summary.txt:md5,6cd69d8a66b5f8b7fd4a9de758e7a739", - "full_table.tsv:md5,4efc19f8d2cc7ea9e73425f09cb3ed97", - "missing_busco_list.tsv:md5,55f0322d494e5c165508712be63062bf", + "test-bacteria_odb10-busco.batch_summary.txt:md5,8734b3f379c4c0928e5dd4ea1873dc64", + "full_table.tsv:md5,645b65b725fd8b30ff6808e0ac671a73", + "missing_busco_list.tsv:md5,b1cc1c22d484439ac128af2290d7d9dd", [ - "9767721at2.faa:md5,1731738ca153959391f8302fd5a3679f", - "9778364at2.faa:md5,7a19a6b6696ae53efce30457b4dd1ab2", - "9782003at2.faa:md5,65d2a613c903852681981f8e8427dc70", - "9790352at2.faa:md5,5e18cfb68122dff7a61c5517246223fc", - "9791908at2.faa:md5,707ef4501f93a6e0dc217e037f26da54", - "9793681at2.faa:md5,e361d654145e70f06c386e75ad90f943", - "9800696at2.faa:md5,9e2f431e4aada7bdc2c317747105b874", - "9801107at2.faa:md5,83933b1426fc9abfe8891c49838cd02f", - "9801213at2.faa:md5,ec340354a86728189c3d1a294c0ccbad", - "9801753at2.faa:md5,39c09bd8a831c90aab44ded14c56d0e6", - "9802065at2.faa:md5,8361fa013dc1cd29af938c9d5ffebfe4", - "9802219at2.faa:md5,9e23aed07790f460da634f7f6132e73d", - "9802304at2.faa:md5,86b259197441716075f3d3d18f8743ba", - "9802309at2.faa:md5,b4b4613e9b69baa9274140c1b26cc27b", - "9802672at2.faa:md5,6c6d592c2fbb0d7a4e5e1f47a15644f0", - "9803420at2.faa:md5,eec6f7189ce9a596ed6ead06f2229c8a", - "9803541at2.faa:md5,132954cc7bfcb1c1fe9da105867c4b78", - "9803667at2.faa:md5,ec31d499f6b523cb081af6a3284a5a5c", - "9803773at2.faa:md5,efbe4c35075dd8c871827d4e5ac72922", - "9804006at2.faa:md5,fca5b560714ba37be0be3e2597f74c5a", - "9804243at2.faa:md5,3280570e4357fb4daedaea8a066dbf0b", - "9804478at2.faa:md5,98c2cfd8f089812a41a1e66fea630b2d", - "9804933at2.faa:md5,de648025c49061c614c77e7c9ce7ab62", - "9805026at2.faa:md5,eea9da88f3cd718514493d6890bf7660", - "9806637at2.faa:md5,c8a9e0c37a8aeb1fd44db64fd93aa3e1", - 
"9806651at2.faa:md5,f5abacf8930d78c81fdeb0c91c8681a7", - "9807064at2.faa:md5,1167d5c4c044b4eb82fac5d1955e7130", - "9807233at2.faa:md5,7c8adb6556a7f9a0244e7c7e5f75f20d", - "9807240at2.faa:md5,2eff2de1ab83b22f3234a529a44e22bb", - "9807458at2.faa:md5,bee695d260b2b7f8980a636fed6aa0c0", - "9808036at2.faa:md5,797ca476d2c7820151fec98d2815d6cb", - "9808348at2.faa:md5,4e8573a5d287e01aa4f5de8b48feaa42", - "9808936at2.faa:md5,30333f3f62f8e3d0ea6f6544d49572c6", - "9809052at2.faa:md5,0590efbf94fce0ad212513dcb2e8176f", - "9809084at2.faa:md5,37e6214b4204dc31858e2ef2bad5db4a", - "9809356at2.faa:md5,e18c1d5a4931a25baf7dbd1a40c417dc", - "9809796at2.faa:md5,857aac8a22c00472bfc9add7fde94c5c", - "9810191at2.faa:md5,72b63933bb045b680e0635eb03915cc0", - "9811804at2.faa:md5,da341c24e763a949d16432bb052af321", - "9812272at2.faa:md5,7a54f872dd8243c6814852d40cf1bfc0", - "9812943at2.faa:md5,149da17f067cdce328a73f6364a95b26", - "9813375at2.faa:md5,49835b9f3188434c771a840b628b07f6", - "9814755at2.faa:md5,9b4c4648d250c2e6d04acb78f9cf6df0" + "1024388at2.faa:md5,797d603d262a6595a112e25b73e878b0", + "1054741at2.faa:md5,cd4b928cba6b19b4437746ba507e7195", + "1093223at2.faa:md5,df9549708e5ffcfaee6a74dd70a0e5dc", + "1151822at2.faa:md5,12726afc1cdc40c13392e1596e93df3a", + "143460at2.faa:md5,d887431fd988a5556a523440f02d9594", + "1491686at2.faa:md5,d03362d19979b27306c192f1c74a84e5", + "1504821at2.faa:md5,4f5f6e5c57bac0092c1d85ded73d7e67", + "1574817at2.faa:md5,1153e55998c2929eacad2aed7d08d248", + "1592033at2.faa:md5,bb7a59e5f3a57ba12d10dabf4c77ab57", + "1623045at2.faa:md5,8fe38155feb1802beb97ef7714837bf5", + "1661836at2.faa:md5,6c6d592c2fbb0d7a4e5e1f47a15644f0", + "1674344at2.faa:md5,bb41b44e53565a54cadf0b780532fe08", + "1698718at2.faa:md5,f233860000028eb00329aa85236c71e5", + "1990650at2.faa:md5,34a2d29c5f8b6253159ddb7a43fa1829", + "223233at2.faa:md5,dec6705c7846c989296e73942f953cbc", + "402899at2.faa:md5,acc0f271f9a586d2ce1ee41669b22999", + "505485at2.faa:md5,aa0391f8fa5d9bd19b30d844d5a99845", + 
"665824at2.faa:md5,47f8ad43b6a6078206feb48c2e552793", + "776861at2.faa:md5,f8b90c13f7c6be828dea3bb920195e3d", + "874197at2.faa:md5,8d22a35a768debe6f376fc695d233a69", + "932854at2.faa:md5,2eff2de1ab83b22f3234a529a44e22bb", + "95696at2.faa:md5,247bfd1aef432f7b5456307768e9149c" ], - "single_copy_proteins.faa:md5,14124def13668c6d9b0d589207754b31", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "single_copy_proteins.faa:md5,73e2c5d6a9b0f01f2deea3cc5f21b764", + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:45:08.029718703" + "timestamp": "2025-07-21T16:10:28.783205973" }, "test_busco_protein": { "content": [ - "test-bacteria_odb12-busco.batch_summary.txt:md5,44d4cdebd61a3c8e8981ddf1829f83b3", - "full_table.tsv:md5,350f9b1b6c37cfcf41be84e93ef41931", - "missing_busco_list.tsv:md5,a55eee6869fad9176d812e59886232fb", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "test-bacteria_odb10-busco.batch_summary.txt:md5,942dbb2d8ff26240860a794213db14a8", + "full_table.tsv:md5,4db33686f2755a09fdc9521ca89411bc", + "missing_busco_list.tsv:md5,5dcdc7707035904a7d467ca1026b399a", + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:44:44.094048564" + "timestamp": "2025-07-21T16:10:05.674445797" } } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 2db7458f..16acff93 100644 --- a/nextflow.config +++ b/nextflow.config @@ -331,7 +331,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '!>=24.04.2' - version = '0.8.0' + version = '0.8.1' doi = '10.5281/zenodo.7949058' } From 4b8034b4e968341a100313296c69183b9fa9c9a2 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Tue, 19 Aug 2025 16:16:50 +0100 Subject: [PATCH 10/37] Ignore environment.yml from prettier linting --- .prettierignore | 1 + 1 
file changed, 1 insertion(+) diff --git a/.prettierignore b/.prettierignore index edd29f01..2fcde7f9 100644 --- a/.prettierignore +++ b/.prettierignore @@ -11,3 +11,4 @@ testing* *.pyc bin/ ro-crate-metadata.json +modules/nf-core/busco/busco/environment.yml From 06ec96c10903d28bfc7d3862082d7f618d79bca8 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Wed, 20 Aug 2025 13:58:27 +0100 Subject: [PATCH 11/37] Update busco, thanks to muffato's prettier update to busco in nf-core --- .prettierignore | 1 - modules/nf-core/busco/busco/environment.yml | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.prettierignore b/.prettierignore index 2fcde7f9..edd29f01 100644 --- a/.prettierignore +++ b/.prettierignore @@ -11,4 +11,3 @@ testing* *.pyc bin/ ro-crate-metadata.json -modules/nf-core/busco/busco/environment.yml diff --git a/modules/nf-core/busco/busco/environment.yml b/modules/nf-core/busco/busco/environment.yml index 42959e73..861982d0 100644 --- a/modules/nf-core/busco/busco/environment.yml +++ b/modules/nf-core/busco/busco/environment.yml @@ -1,7 +1,7 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: - - conda-forge - - bioconda -dependencies: - - bioconda::busco=6.0.0 +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::busco=6.0.0 From a2d9443a89a45983265db11239301b284e9e8091 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 20 Aug 2025 18:54:02 +0100 Subject: [PATCH 12/37] URLs should be protected with <> --- CITATIONS.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 1e18be72..0f3ce9c7 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,66 +1,66 @@ # sanger-tol/blobtoolkit: Citations -> Butt, Zaynab, et al.
"sanger-tol/blobtoolkit" Zenodo, 2023, https://zenodo.org/doi/10.5281/zenodo.7949058. +> Butt, Zaynab, et al. "sanger-tol/blobtoolkit" Zenodo, 2023, <https://zenodo.org/doi/10.5281/zenodo.7949058> ## [nf-core](https://nf-co.re) -> Ewels, Philip A., et al. “The Nf-Core Framework for Community-Curated Bioinformatics Pipelines.” Nature Biotechnology, vol. 38, no. 3, Feb. 2020, pp. 276–78, https://doi.org/10.1038/s41587-020-0439-x. +> Ewels, Philip A., et al. “The Nf-Core Framework for Community-Curated Bioinformatics Pipelines.” Nature Biotechnology, vol. 38, no. 3, Feb. 2020, pp. 276–78, <https://doi.org/10.1038/s41587-020-0439-x> ## [Nextflow](https://www.nextflow.io) -> Di Tommaso, Paolo, et al. “Nextflow Enables Reproducible Computational Workflows.” Nature Biotechnology, vol. 35, no. 4, Apr. 2017, pp. 316–19, https://doi.org/10.1038/nbt.3820. +> Di Tommaso, Paolo, et al. “Nextflow Enables Reproducible Computational Workflows.” Nature Biotechnology, vol. 35, no. 4, Apr. 2017, pp. 316–19, <https://doi.org/10.1038/nbt.3820> ## Pipeline tools - [BLAST+](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html) - > Camacho, Chritiam, et al. “BLAST+: architecture and applications.” BMC Bioinformatics, vol. 10, no. 412, Dec. 2009, https://doi.org/10.1186/1471-2105-10-421 + > Camacho, Chritiam, et al. “BLAST+: architecture and applications.” BMC Bioinformatics, vol. 10, no. 412, Dec. 2009, <https://doi.org/10.1186/1471-2105-10-421> - [BlobToolKit](https://github.com/blobtoolkit/blobtoolkit) - > Challis, Richard, et al. “BlobToolKit – Interactive Quality Assessment of Genome Assemblies.” G3 Genes|Genomes|Genetics, vol. 10, no. 4, Apr. 2020, pp. 1361–74, https://doi.org/10.1534/g3.119.400908. + > Challis, Richard, et al. “BlobToolKit – Interactive Quality Assessment of Genome Assemblies.” G3 Genes|Genomes|Genetics, vol. 10, no. 4, Apr. 2020, pp. 1361–74, <https://doi.org/10.1534/g3.119.400908> - [BUSCO](https://gitlab.com/ezlab/busco) - > Manni, Mosè, et al.
“BUSCO: Assessing Genomic Data Quality and Beyond.” Current Protocols, vol. 1, no. 12, Dec. 2021, https://doi.org/10.1002/cpz1.323. + > Manni, Mosè, et al. “BUSCO: Assessing Genomic Data Quality and Beyond.” Current Protocols, vol. 1, no. 12, Dec. 2021, <https://doi.org/10.1002/cpz1.323> - [Diamond](https://github.com/bbuchfink/diamond) - > Buchfink, Benjamin, et al. “Sensitive Protein Alignments at Tree-of-Life Scale Using DIAMOND.” Nature Methods, vol. 18, no. 4, Apr. 2021, pp. 366–68, https://doi.org/10.1038/s41592-021-01101-x. + > Buchfink, Benjamin, et al. “Sensitive Protein Alignments at Tree-of-Life Scale Using DIAMOND.” Nature Methods, vol. 18, no. 4, Apr. 2021, pp. 366–68, <https://doi.org/10.1038/s41592-021-01101-x> - [Fasta_windows](https://github.com/tolkit/fasta_windows) - > Brown, Max, et al. "Fasta_windows v0.2.3". GitHub, 2021. https://github.com/tolkit/fasta_windows + > Brown, Max, et al. "Fasta_windows v0.2.3". GitHub, 2021. <https://github.com/tolkit/fasta_windows> - [Minimap2](https://github.com/lh3/minimap2) - > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, https://doi.org/10.1093/bioinformatics/bty191. + > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, <https://doi.org/10.1093/bioinformatics/bty191> - [MultiQC](https://multiqc.info) - > Ewels, Philip, et al. “MultiQC: Summarize Analysis Results for Multiple Tools and Samples in a Single Report.” Bioinformatics, vol. 32, no. 19, 2016, pp. 3047–3048., https://doi.org/10.1093/bioinformatics/btw354. + > Ewels, Philip, et al. “MultiQC: Summarize Analysis Results for Multiple Tools and Samples in a Single Report.” Bioinformatics, vol. 32, no. 19, 2016, pp. 3047–3048., <https://doi.org/10.1093/bioinformatics/btw354> - [Samtools](https://www.htslib.org) - > Danecek, Petr, et al. “Twelve Years of SAMtools and BCFtools.” GigaScience, vol. 10, no. 2, Jan. 2021, https://doi.org/10.1093/gigascience/giab008. + > Danecek, Petr, et al. “Twelve Years of SAMtools and BCFtools.” GigaScience, vol. 10, no. 2, Jan. 2021, <https://doi.org/10.1093/gigascience/giab008> - [SeqTK](https://github.com/lh3/seqtk) - > Li, Heng.
"SeqTK v1.4" GitHub, 2023, https://github.com/lh3/seqtk + > Li, Heng. "SeqTK v1.4" GitHub, 2023, <https://github.com/lh3/seqtk> ## Software packaging/containerisation tools - [Conda](https://conda.org/) - > conda contributors. conda: A system-level, binary package and environment manager running on all major operating systems and platforms. Computer software. https://github.com/conda/conda + > conda contributors. conda: A system-level, binary package and environment manager running on all major operating systems and platforms. Computer software. <https://github.com/conda/conda> - [Bioconda](https://bioconda.github.io) - > Grüning, Björn, et al. “Bioconda: sustainable and comprehensive software distribution for the life sciences.", Nature Methods, vol. 15, Jul. 2018, pp. 475-6, https://doi.org/10.1038/s41592-018-0046-7. + > Grüning, Björn, et al. “Bioconda: sustainable and comprehensive software distribution for the life sciences.", Nature Methods, vol. 15, Jul. 2018, pp. 475-6, <https://doi.org/10.1038/s41592-018-0046-7> - [BioContainers](https://biocontainers.pro) - > da Veiga, Felipe, et al. “BioContainers: an open-source and community-driven framework for software standardization.", Bioinformatics, vol. 33, no. 16, Aug. 2017, pp. 2580-2, https://doi.org/10.1093/bioinformatics/btx192. + > da Veiga, Felipe, et al. “BioContainers: an open-source and community-driven framework for software standardization.", Bioinformatics, vol. 33, no. 16, Aug. 2017, pp. 2580-2, <https://doi.org/10.1093/bioinformatics/btx192> - [Docker](https://www.docker.com) @@ -68,4 +68,4 @@ - [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) - > Kurtzer, Gregory M., et al. “Singularity: Scientific containers for mobility of compute.", PLOS ONE, vol. 12, no. 5, May 2017, pp. e0177459, https://doi.org/10.1371/journal.pone.0177459. + > Kurtzer, Gregory M., et al. “Singularity: Scientific containers for mobility of compute.", PLOS ONE, vol. 12, no. 5, May 2017, pp.
e0177459, <https://doi.org/10.1371/journal.pone.0177459> From 4361fa39cecde8305d97525798c500ebb955b2ef Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 20 Aug 2025 18:54:21 +0100 Subject: [PATCH 13/37] Added missing citations --- CITATIONS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index 0f3ce9c7..5435dcd4 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -16,6 +16,8 @@ > Camacho, Chritiam, et al. “BLAST+: architecture and applications.” BMC Bioinformatics, vol. 10, no. 412, Dec. 2009, <https://doi.org/10.1186/1471-2105-10-421> +- [BlobTk](https://github.com/genomehubs/blobtk) + - [BlobToolKit](https://github.com/blobtoolkit/blobtoolkit) > Challis, Richard, et al. “BlobToolKit – Interactive Quality Assessment of Genome Assemblies.” G3 Genes|Genomes|Genetics, vol. 10, no. 4, Apr. 2020, pp. 1361–74, <https://doi.org/10.1534/g3.119.400908> @@ -48,6 +50,10 @@ > Li, Heng. "SeqTK v1.4" GitHub, 2023, <https://github.com/lh3/seqtk> +- [WindowMasker](https://pubmed.ncbi.nlm.nih.gov/16287941/) + + > Morgulis, A., et al. 2006. WindowMasker: window-based masker for sequenced genomes. Bioinformatics. 22(2). pp.134–141. doi: 10.1093/bioinformatics/bti774.
+ ## Software packaging/containerisation tools - [Conda](https://conda.org/) From 75b125d23eb0b03a1c33d2650830bf485a8d46b1 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 20 Aug 2025 19:11:13 +0100 Subject: [PATCH 14/37] Docs update --- README.md | 32 ++++++++++++++++-------- docs/output.md | 14 +++++------ docs/usage.md | 67 +++++++++++++++++++++----------------------------- 3 files changed, 57 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index f635c40c..cab29ab0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png) [![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058) +[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml) +[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058) @@ -23,10 +24,10 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond)) 7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond)) -8. 
Run BLASTn against sequences still with not hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) +8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) 9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit)) -11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit)) +11. Import analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit)) 12. Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk)) ## Usage @@ -40,13 +41,14 @@ First, prepare a samplesheet with your input data that looks as follows: ```csv sample,datatype,datafile,library_layout -mMelMel3,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED +mMelMel3_hic,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED -mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE +mMelMel3_ont,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE ``` -Each row represents an aligned file. -Rows with the same sample identifier are considered technical replicates. +Each row represents a read set (aligned or not). +The first column (sample name) must be unique. +If you have multiple read sets from the same actual sample, make sure you edit the sample names to make them unique. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`). The library layout indicates whether the reads are paired or single. The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. 
@@ -78,9 +80,19 @@ For more details about the output files and reports, please refer to the [output ## Credits -sanger-tol/blobtoolkit was written in Nextflow by [Alexander Ramos Diaz](https://github.com/alxndrdiaz), [Zaynab Butt](https://github.com/zb32), [Matthieu Muffato](https://github.com/muffato), and [Priyanka Surana](https://github.com/priyanka-surana). The orignal design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar). +sanger-tol/blobtoolkit was written in Nextflow by: -We thank the following people for their assistance in the development of this pipeline: +- [Alexander Ramos Diaz](https://github.com/alxndrdiaz) +- [Zaynab Butt](https://github.com/zb32) +- [Priyanka Surana](https://github.com/priyanka-surana) +- [Matthieu Muffato](https://github.com/muffato) +- [Tyler Chafin](https://github.com/tkchafin) +- [Yumi Sims](https://github.com/yumisims) +- [Damon-Lee Bernard Pointon](https://github.com/DLBPointon) + +The original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar). 
+ +We thank the following people for their extensive assistance in the development of this pipeline: - [Guoying Qi](https://github.com/gq1) - [Bethan Yates](https://github.com/BethYates) @@ -91,7 +103,7 @@ If you would like to contribute to this pipeline, please see the [contributing g ## Citations -If you use sanger-tol/blobtoolkit for your analysis, please cite it using the following doi: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058) +If you use sanger-tol/blobtoolkit for your analysis, please cite it using the following DOI: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/docs/output.md b/docs/output.md index c2ba0af4..cd417832 100644 --- a/docs/output.md +++ b/docs/output.md @@ -52,7 +52,7 @@ Images generated from the above blobdir using the [blobtk](https://github.com/bl ### BUSCO -BUSCO results generated by the pipeline (all BUSCO lineages that match the claassification of the species). +BUSCO results generated by the pipeline (all BUSCO lineages that match the classification of the species).
Output files @@ -71,14 +71,14 @@ BUSCO results generated by the pipeline (all BUSCO lineages that match the claas ### Repeat masking -Reults from the repeat-masker step -- only if the pipeline is run with `--mask`. +Results from the repeat-masker step -- only if the pipeline is run with `--mask`.
Output files - `repeats/` - `windowmasker/` - - `.fasta`: masked assembly in Fasta format. + - `.fasta`: masked assembly in FASTA format. - `.obinary`: frequency counts of repeats, in windowmasker's own binary format.
@@ -106,7 +106,7 @@ Those files are the raw data used to build the BlobDir. - `read_mapping/` - `/` - - `.coverage.1k.bed.gz`: Bedgraph file with the coverage of the alignments of that sample per 1 kbp windows. + - `.coverage.1k.bed.gz`: BedGraph file with the coverage of the alignments of that sample per 1 kbp windows.
@@ -119,8 +119,8 @@ Those files are the raw data used to build the BlobDir. Output files - `base_content/` - - `_*nuc_windows.tsv.gz`: Tab-separated files with the counts of every _k_-mer for k ≤ 4 in 1 kbp windows. The first three columns correspond to the coordinates (sequence name, start, end), followed by each _k_-mer. - - `_freq_windows.tsv.gz`: Tab-separated files with frequencies derived from the _k_-mer counts. + - `_*nuc_windows.tsv.gz`: tab-separated files with the counts of every _k_-mer for k ≤ 4 in 1 kbp windows. The first three columns correspond to the coordinates (sequence name, start, end), followed by each _k_-mer. + - `_freq_windows.tsv.gz`: tab-separated files with frequencies derived from the _k_-mer counts. @@ -157,7 +157,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - `pipeline_info/blobtoolkit/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. + - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - Parameters used by the pipeline run: `params.json`. 
diff --git a/docs/usage.md b/docs/usage.md index 0bd4a94a..825394b7 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -14,17 +14,6 @@ You will need to create a samplesheet with information about the samples you wou --input '[path to samplesheet file]' ``` -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```console -sample,datatype,datafile,library_layout -sample1,hic,hic.cram,PAIRED -sample2,illumina,illumina.cram,PAIRED -sample2,illumina,illumina.cram,PAIRED -``` - ### Full samplesheet The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 4 columns to match those defined in the table below. @@ -38,12 +27,12 @@ sample2,illumina,illumina.cram,PAIRED sample3,ont,ont.cram,SINGLE ``` -| Column | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | -| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`. | -| `datafile` | Full path to read data file. | -| `library_layout` | Layout of the library. Must be one of `SINGLE`, `PAIRED`. | +| Column | Description | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. 
It doesn't have to be an actual _sample_ name. It is used to name the read set on the BlobToolKit viewer and therefore needs to be **unique** across the samplesheet. | +| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`. | +| `datafile` | Full path to read data file. | +| `library_layout` | Layout of the library. Must be one of `SINGLE`, `PAIRED`. | An [example samplesheet](../assets/test/samplesheet.csv) has been provided with the pipeline. @@ -92,7 +81,7 @@ The pipeline minimally requires outputs for the 'basal' lineages (archaea, eukar Configure access to your local databases with the `--busco`, `--blastp`, `--blastx`, `--blastn`, and `--taxdump` parameters. -Note that `--busco` refers to the download path of _all_ lineages. +Note that `--busco` refers to the download path which _contains_ the `lineages/` sub-directory. Then, when explicitly selecting the lineages to run the pipeline on, provide the names of these lineages _with_ their `_odb10` suffix as a comma-separated string. For instance: @@ -143,7 +132,7 @@ mkdir -p $NT cd $NT ``` -Retrieve the NCBI blast nt database (version 5) files and tar gunzip them. +Retrieve the NCBI blast nt database (version 5) files and extract them. `wget` and the use of the FTP protocol are necessary to resolve the wildcard `nt.???.tar.gz`. 
We are using the `&&` syntax to ensure that each command completes without error before the next one is run: @@ -262,7 +251,7 @@ Nextflow ```bash # Public Assemblies -nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB +nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB # Draft Assemblies nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB @@ -274,7 +263,7 @@ see for some examples. ### Subworkflows -Here is a full list of snakemake subworkflows and their Nextflow couterparts: +Here is a full list of snakemake subworkflows and their Nextflow counterparts: - **`minimap.smk`** - Implemented as [`minimap_alignment.nf`](../subworkflows/local/minimap_alignment.nf). @@ -307,33 +296,33 @@ Here is a full list of snakemake subworkflows and their Nextflow couterparts: List of tools for any given dataset can be fetched from the API, for example https://blobtoolkit.genomehubs.org/api/v1/dataset/id/CAJEUD01.1/settings/software_versions. 
-| Dependency | Snakemake | Nextflow | -| ----------------- | --------- | -------- | -| blobtoolkit | 4.3.2 | 4.4.4 | -| blast | 2.12.0 | 2.14.1 | -| blobtk | 0.5.0 | 0.5.1 | -| busco | 5.3.2 | 5.5.0 | -| diamond | 2.0.15 | 2.1.8 | -| fasta_windows | | 0.2.4 | -| minimap2 | 2.24 | 2.24 | -| ncbi-datasets-cli | 14.1.0 | | -| nextflow | | 23.10.0 | -| python | 3.9.13 | 3.12.0 | -| samtools | 1.15.1 | 1.19.2 | -| seqtk | 1.3 | 1.4 | -| snakemake | 7.19.1 | | -| windowmasker | 2.12.0 | 2.14.0 | +| Dependency | Snakemake | Nextflow | +| ----------------- | --------- | ------------- | +| blobtoolkit | 4.3.2 | 4.4.6 | +| blast | 2.12.0 | 2.15.0 | +| blobtk | 0.5.0 | 0.5.1 | +| busco | 5.3.2 | 5.8.3 | +| diamond | 2.0.15 | 2.1.8 | +| fasta_windows | | 0.2.4 | +| minimap2 | 2.24 | 2.24-r1122 | +| ncbi-datasets-cli | 14.1.0 | | +| nextflow | | 24.04.2 | +| python | 3.9.13 | 3.12.0 | +| samtools | 1.15.1 | 1.20 and 1.21 | +| seqtk | 1.3 | 1.4 | +| snakemake | 7.19.1 | | +| windowmasker | 2.12.0 | 2.14.0 | > **NB:** Dependency has been **added** if only the Nextflow version information is present. > **NB:** Dependency has been **removed** if only the Snakemake version information is present. -> **NB:** Dependency has been **updated** if bothe the Snakemake and Nextflow version information is present. +> **NB:** Dependency has been **updated** if both the Snakemake and Nextflow version information is present. 
## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run sanger-tol/blobtoolkit --input samplesheet.csv --outdir --fasta genome.fasta -profile docker –-accession GCA_accession --taxon "species name" --taxdump /path/to/taxdump --blastp /path/to/buscogenes.dmnd --blastn /path/to/blastn.nt --blastx /path/to/buscoregions.dmnd +nextflow run sanger-tol/blobtoolkit --input samplesheet.csv --outdir --fasta genome.fasta -profile docker --accession GCA_accession --taxon "species name" --taxdump /path/to/taxdump --blastp /path/to/buscogenes.dmnd --blastn /path/to/blastn.nt --blastx /path/to/buscoregions.dmnd ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. From 84c9dfdcf55c27bf6743c1b3ed815e6e63f26026 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 20 Aug 2025 19:12:24 +0100 Subject: [PATCH 15/37] Added Damon to the list of contributors --- CITATION.cff | 10 ++++++++-- nextflow.config | 8 ++++++++ ro-crate-metadata.json | 25 ++++++++++++++++++------- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 5a892ee5..3a0ae93e 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -27,6 +27,12 @@ authors: given-names: Matthieu orcid: https://orcid.org/0000-0002-7860-3560 website: https://github.com/muffato + - affiliation: Wellcome Sanger Institute + email: dp24@sanger.ac.uk + family-names: Pointon + given-names: Damon-Lee Bernard + orcid: https://orcid.org/0000-0003-2949-6719 + website: https://github.com/DLBPointon - affiliation: Wellcome Sanger Institute email: 729395+gq1@users.noreply.github.com family-names: Qi @@ -57,13 +63,13 @@ authors: orcid: https://orcid.org/0000-0003-1658-1762 website: https://github.com/BethYates cff-version: 1.2.0 -date-released: "2025-04-25" +date-released: "2025-08-20" doi: 10.5281/zenodo.7949058 license: MIT message: If you use this software, please cite it using the metadata from 
this file and all references from CITATIONS.md . repository-code: https://github.com/sanger-tol/blobtoolkit -title: sanger-tol/blobtoolkit v0.8.0 - +title: sanger-tol/blobtoolkit v0.8.0 - Sprigatito type: software url: https://pipelines.tol.sanger.ac.uk/blobtoolkit version: 0.8.0 diff --git a/nextflow.config b/nextflow.config index 2db7458f..22603552 100644 --- a/nextflow.config +++ b/nextflow.config @@ -291,6 +291,14 @@ manifest { contribution: ['author', 'maintainer'], orcid: 'https://orcid.org/0000-0002-7860-3560' ], + [ + name: 'Pointon, Damon-Lee Bernard', + affiliation: 'Wellcome Sanger Institute', + email: 'dp24@sanger.ac.uk', + github: 'https://github.com/DLBPointon', + contribution: ['contributor'], + orcid: 'https://orcid.org/0000-0003-2949-6719' + ], [ name: 'Qi, Guoying', affiliation: 'Wellcome Sanger Institute', diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 6c09a32e..bac64400 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -22,8 +22,8 @@ "@id": "./", "@type": "Dataset", "creativeWorkStatus": "Stable", - "datePublished": "2025-05-05T22:38:28+00:00", - "description": "# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)\n\n\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)\n[![run with 
docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/blobtoolkit)\n\n## Introduction\n\n**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes.\nIt takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome statistics, coverage and completeness information, combines them in a TSV file by window size to create a BlobDir dataset and static plots.\n\n1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))\n2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))\n3. Determine the appropriate BUSCO lineages from the taxonomy.\n4. Run BUSCO ([`busco`](https://busco.ezlab.org/))\n5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))\n7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))\n8. Run BLASTn against sequences still with not hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))\n9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))\n11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))\n12. 
Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,datatype,datafile,library_layout\nmMelMel3,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED\nmMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED\nmMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE\n```\n\nEach row represents an aligned file.\nRows with the same sample identifier are considered technical replicates.\nThe datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).\nThe library layout indicates whether the reads are paired or single.\nThe aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/blobtoolkit \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --fasta genome.fasta \\\n --accession GCA_XXXXXXXXX.X \\\n --taxon XXXX \\\n --taxdump /path/to/taxdump/database \\\n --blastp /path/to/diamond/database \\\n --blastn /path/to/blastn/database \\\n --blastx /path/to/blastx/database\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters).\n\n## Pipeline output\n\nFor more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output).\n\n## Credits\n\nsanger-tol/blobtoolkit was written in Nextflow by [Alexander Ramos Diaz](https://github.com/alxndrdiaz), [Zaynab Butt](https://github.com/zb32), [Matthieu Muffato](https://github.com/muffato), and [Priyanka Surana](https://github.com/priyanka-surana). The orignal design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar).\n\nWe thank the following people for their assistance in the development of this pipeline:\n\n- [Guoying Qi](https://github.com/gq1)\n- [Bethan Yates](https://github.com/BethYates)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/blobtoolkit for your analysis, please cite it using the following doi: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core 
framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "datePublished": "2025-08-20T18:11:56+00:00", + "description": "# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)\n[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)\n\n\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/blobtoolkit)\n\n## Introduction\n\n**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes.\nIt takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome statistics, coverage and completeness 
information, combines them in a TSV file by window size to create a BlobDir dataset and static plots.\n\n1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))\n2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))\n3. Determine the appropriate BUSCO lineages from the taxonomy.\n4. Run BUSCO ([`busco`](https://busco.ezlab.org/))\n5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))\n7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))\n8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))\n9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))\n11. Import analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))\n12. Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,datatype,datafile,library_layout\nmMelMel3_hic,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED\nmMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED\nmMelMel3_ont,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE\n```\n\nEach row represents a read set (aligned or not).\nThe first column (sample name) must be unique.\nIf you have multiple read sets from the same actual sample, make sure you edit the sample names to make them unique.\nThe datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).\nThe library layout indicates whether the reads are paired or single.\nThe aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/blobtoolkit \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --fasta genome.fasta \\\n --accession GCA_XXXXXXXXX.X \\\n --taxon XXXX \\\n --taxdump /path/to/taxdump/database \\\n --blastp /path/to/diamond/database \\\n --blastn /path/to/blastn/database \\\n --blastx /path/to/blastx/database\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters).\n\n## Pipeline output\n\nFor more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output).\n\n## Credits\n\nsanger-tol/blobtoolkit was written in Nextflow by:\n\n- [Alexander Ramos Diaz](https://github.com/alxndrdiaz)\n- [Zaynab Butt](https://github.com/zb32)\n- [Priyanka Surana](https://github.com/priyanka-surana)\n- [Matthieu Muffato](https://github.com/muffato)\n- [Tyler Chafin](https://github.com/tkchafin)\n- [Yumi Sims](https://github.com/yumisims)\n- [Damon-Lee Bernard Pointon](https://github.com/DLBPointon)\n\nThe original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Guoying Qi](https://github.com/gq1)\n- [Bethan Yates](https://github.com/BethYates)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/blobtoolkit for your analysis, please cite it using the following DOI: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed 
and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -105,7 +105,7 @@ }, "mentions": [ { - "@id": "#979b8e87-8931-4977-9815-b403f45772f9" + "@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7" } ], "name": "sanger-tol/blobtoolkit" @@ -156,6 +156,9 @@ { "@id": "https://orcid.org/0000-0001-8687-5905" }, + { + "@id": "https://orcid.org/0000-0003-2949-6719" + }, { "@id": "https://orcid.org/0000-0003-1262-8973" }, @@ -167,7 +170,7 @@ } ], "dateCreated": "", - "dateModified": "2025-05-05T23:38:28Z", + "dateModified": "2025-08-20T19:11:56Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", "keywords": [ "nextflow", @@ -215,11 +218,11 @@ "version": "!>=24.04.2" }, { - "@id": "#979b8e87-8931-4977-9815-b403f45772f9", + "@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7", "@type": "TestSuite", "instance": [ { - "@id": "#25bef01b-9ab2-4b38-9e93-733fb8af9c82" + "@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0" } ], "mainEntity": { @@ -228,7 +231,7 @@ "name": "Test suite for sanger-tol/blobtoolkit" }, { - "@id": "#25bef01b-9ab2-4b38-9e93-733fb8af9c82", + "@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0", "@type": "TestInstance", "name": "GitHub Actions workflow for testing sanger-tol/blobtoolkit", "resource": "repos/sanger-tol/blobtoolkit/actions/workflows/ci.yml", @@ -398,6 +401,14 @@ "name": "Muffato, Matthieu", "url": "https://github.com/muffato" }, + { + "@id": "https://orcid.org/0000-0003-2949-6719", + "@type": "Person", + "affiliation": "Wellcome Sanger Institute", + 
"email": "dp24@sanger.ac.uk", + "name": "Pointon, Damon-Lee Bernard", + "url": "https://github.com/DLBPointon" + }, { "@id": "https://orcid.org/0000-0003-1262-8973", "@type": "Person", From eb244955e12b006007dac573e2ef362ecc325106 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 21 Aug 2025 18:36:24 +0100 Subject: [PATCH 16/37] Corrected the initial version of Busco --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d63065c6..951ee356 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i | Dependency | Old version | New version | | ---------- | ----------- | ----------- | -| busco | 5.7.1 | 6.0.0 | +| busco | 5.8.3 | 6.0.0 | ## [[0.8.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.8.0)] – Sprigatito – [2025-05-19] From 1bba35ce803772f1590f8542859193fe382ff539 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 22 Aug 2025 14:59:10 +0100 Subject: [PATCH 17/37] Busco module update (stop ignoring errors) --- modules.json | 2 +- modules/nf-core/busco/busco/busco-busco.diff | 6 +++--- modules/nf-core/busco/busco/main.nf | 6 ++++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/modules.json b/modules.json index 17b2e3b4..a2111293 100644 --- a/modules.json +++ b/modules.json @@ -13,7 +13,7 @@ }, "busco/busco": { "branch": "master", - "git_sha": "de5b67889ef5f53d6996fb4dac300ed399a51212", + "git_sha": "36c6c8445284e021d95ce30cdf743baef66b21aa", "installed_by": ["modules"], "patch": "modules/nf-core/busco/busco/busco-busco.diff" }, diff --git a/modules/nf-core/busco/busco/busco-busco.diff b/modules/nf-core/busco/busco/busco-busco.diff index b0a3cbb4..92a8374b 100644 --- a/modules/nf-core/busco/busco/busco-busco.diff +++ b/modules/nf-core/busco/busco/busco-busco.diff @@ -1,4 +1,5 @@ Changes in component 'nf-core/busco/busco' +'modules/nf-core/busco/busco/environment.yml' is 
unchanged 'modules/nf-core/busco/busco/meta.yml' is unchanged Changes in 'busco/busco/main.nf': --- modules/nf-core/busco/busco/main.nf @@ -21,8 +22,7 @@ Changes in 'busco/busco/main.nf': './*-busco/*/auto_lineage', './*-busco/*/**/{miniprot,hmmer,.bbtools}_output', -'modules/nf-core/busco/busco/environment.yml' is unchanged -'modules/nf-core/busco/busco/tests/main.nf.test' is unchanged -'modules/nf-core/busco/busco/tests/nextflow.config' is unchanged 'modules/nf-core/busco/busco/tests/main.nf.test.snap' is unchanged +'modules/nf-core/busco/busco/tests/nextflow.config' is unchanged +'modules/nf-core/busco/busco/tests/main.nf.test' is unchanged ************************************************************ diff --git a/modules/nf-core/busco/busco/main.nf b/modules/nf-core/busco/busco/main.nf index 133883ea..a5ecd9cd 100644 --- a/modules/nf-core/busco/busco/main.nf +++ b/modules/nf-core/busco/busco/main.nf @@ -109,6 +109,12 @@ process BUSCO_BUSCO { mv ${prefix}-busco/*/short_summary.*.{json,txt} . || echo "Short summaries were not available: No genes were found." 
mv ${prefix}-busco/logs/busco.log ${prefix}-busco.log + if grep 'Run failed; check logs' ${prefix}-busco.batch_summary.txt > /dev/null + then + echo "Busco run failed" + exit 1 + fi + cat <<-END_VERSIONS > versions.yml "${task.process}": busco: \$( busco --version 2> /dev/null | sed 's/BUSCO //g' ) From 238d1986800c970b79658ba9edd6bad73e117e67 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 9 Sep 2025 16:09:35 +0100 Subject: [PATCH 18/37] Edited for consistency --- assets/schema_input.json | 4 ++-- nextflow_schema.json | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index db9d05c9..b97c5d0e 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -9,7 +9,7 @@ "properties": { "sample": { "type": "string", - "description": "Sample Name", + "description": "Sample identifier", "pattern": "^\\S+$", "errorMessage": "Sample name must be provided, be unique, and cannot contain spaces", "meta": ["id"] @@ -18,7 +18,7 @@ "type": "string", "pattern": "^\\S+$", "enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"], - "errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'", + "errorMessage": "Data type must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio' or 'pacbio_clr'", "meta": ["datatype"] }, "datafile": { diff --git a/nextflow_schema.json b/nextflow_schema.json index e83c29b1..26724683 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -303,7 +303,8 @@ "multiqc_methods_description": { "type": "string", "description": "Custom MultiQC yaml file containing HTML including a methods description.", - "fa_icon": "fas fa-cog" + "fa_icon": "fas fa-cog", + "hidden": true }, "validate_params": { "type": "boolean", @@ -325,9 +326,6 @@ { "$ref": "#/$defs/input_output_options" }, - { - "$ref": "#/$defs/institutional_config_options" - }, { "$ref": "#/$defs/reference_genome_options" }, From e805a38b69cceb26c117fb6f95cf4ced89ec3fd6 Mon Sep 17 00:00:00 
2001 From: Matthieu Muffato Date: Tue, 9 Sep 2025 16:11:10 +0100 Subject: [PATCH 19/37] More time to allow for larger genomes --- conf/base.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index a429ef5f..e0874410 100644 --- a/conf/base.config +++ b/conf/base.config @@ -84,7 +84,8 @@ process { cpus = 1 // 3 GB per 1 Gbp memory = { 3.GB * task.attempt * Math.ceil(meta.genome_size / 1000000000) } - time = { 4.h * task.attempt } + // 1 hour per 100 Mbp + time = { 1.h * Math.ceil(meta.genome_size / 100000000) * task.attempt } } withName: 'FASTAWINDOWS' { From ecad033f8470a2532edae76f5c9f0a5549ebe8cb Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 9 Sep 2025 16:15:54 +0100 Subject: [PATCH 20/37] Version bump --- .nf-core.yml | 2 +- CHANGELOG.md | 5 +++-- CITATION.cff | 6 +++--- nextflow.config | 2 +- ro-crate-metadata.json | 16 ++++++++-------- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.nf-core.yml b/.nf-core.yml index a6039fde..dc755be7 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -43,4 +43,4 @@ template: outdir: . skip_features: - igenomes - version: 0.8.1 + version: 0.9.0 diff --git a/CHANGELOG.md b/CHANGELOG.md index 951ee356..c3343656 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [[0.8.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.8.1)] – Sprigatito (H1) – [2025-08-19] +## [[0.9.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.9.0)] – – [2025-09-11] ### Enhancements & fixes - Upgrade Busco (#190) -- Update resource requirements for BLASTN modules (#191) +- Update resource requirements for BLASTN modules (#191) and BLOBTOOLKIT_WINDOWSTATS +- Fixed the `test_full` profile (Sanger only) ### Software dependencies diff --git a/CITATION.cff b/CITATION.cff index 3a0ae93e..b87ed530 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -63,13 +63,13 @@ authors: orcid: https://orcid.org/0000-0003-1658-1762 website: https://github.com/BethYates cff-version: 1.2.0 -date-released: "2025-08-20" +date-released: "2025-09-10" doi: 10.5281/zenodo.7949058 license: MIT message: If you use this software, please cite it using the metadata from this file and all references from CITATIONS.md . repository-code: https://github.com/sanger-tol/blobtoolkit -title: sanger-tol/blobtoolkit v0.8.0 - Sprigatito +title: sanger-tol/blobtoolkit v0.9.0 type: software url: https://pipelines.tol.sanger.ac.uk/blobtoolkit -version: 0.8.0 +version: 0.9.0 diff --git a/nextflow.config b/nextflow.config index 3ff3a417..c65219f9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -339,7 +339,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '!>=24.04.2' - version = '0.8.1' + version = '0.9.0' doi = '10.5281/zenodo.7949058' } diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index bac64400..97a85b19 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -22,7 +22,7 @@ "@id": "./", "@type": "Dataset", "creativeWorkStatus": "Stable", - "datePublished": "2025-08-20T18:11:56+00:00", + "datePublished": "2025-09-10T14:37:46+00:00", "description": "# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png)\n\n[![GitHub Actions CI 
Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)\n[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)\n\n\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/blobtoolkit)\n\n## Introduction\n\n**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes.\nIt takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome statistics, coverage and completeness information, combines them in a TSV file by window size to create a BlobDir dataset and static plots.\n\n1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))\n2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))\n3. Determine the appropriate BUSCO lineages from the taxonomy.\n4. Run BUSCO ([`busco`](https://busco.ezlab.org/))\n5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n6. 
Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))\n7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))\n8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))\n9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))\n11. Import analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))\n12. Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,datatype,datafile,library_layout\nmMelMel3_hic,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED\nmMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED\nmMelMel3_ont,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE\n```\n\nEach row represents a read set (aligned or not).\nThe first column (sample name) must be unique.\nIf you have multiple read sets from the same actual sample, make sure you edit the sample names to make them unique.\nThe datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).\nThe library layout indicates whether the reads are paired or single.\nThe aligned read files can be generated using the 
[sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/blobtoolkit \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --fasta genome.fasta \\\n --accession GCA_XXXXXXXXX.X \\\n --taxon XXXX \\\n --taxdump /path/to/taxdump/database \\\n --blastp /path/to/diamond/database \\\n --blastn /path/to/blastn/database \\\n --blastx /path/to/blastx/database\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters).\n\n## Pipeline output\n\nFor more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output).\n\n## Credits\n\nsanger-tol/blobtoolkit was written in Nextflow by:\n\n- [Alexander Ramos Diaz](https://github.com/alxndrdiaz)\n- [Zaynab Butt](https://github.com/zb32)\n- [Priyanka Surana](https://github.com/priyanka-surana)\n- [Matthieu Muffato](https://github.com/muffato)\n- [Tyler Chafin](https://github.com/tkchafin)\n- [Yumi Sims](https://github.com/yumisims)\n- [Damon-Lee Bernard Pointon](https://github.com/DLBPointon)\n\nThe original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Guoying 
Qi](https://github.com/gq1)\n- [Bethan Yates](https://github.com/BethYates)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/blobtoolkit for your analysis, please cite it using the following DOI: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { @@ -105,7 +105,7 @@ }, "mentions": [ { - "@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7" + "@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513" } ], "name": "sanger-tol/blobtoolkit" @@ -170,7 +170,7 @@ } ], "dateCreated": "", - "dateModified": "2025-08-20T19:11:56Z", + "dateModified": "2025-09-10T15:37:46Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", "keywords": [ "nextflow", @@ -199,10 +199,10 @@ }, "url": [ "https://github.com/sanger-tol/blobtoolkit", - "https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.8.0/" + "https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.9.0/" ], "version": [ - "0.8.0" + "0.9.0" ] }, { @@ -218,11 +218,11 @@ "version": "!>=24.04.2" }, { - "@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7", + "@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513", "@type": "TestSuite", "instance": [ { - "@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0" + "@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd" } ], "mainEntity": { @@ -231,7 +231,7 @@ "name": "Test suite for sanger-tol/blobtoolkit" }, { - "@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0", + "@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd", "@type": "TestInstance", "name": "GitHub Actions workflow for testing sanger-tol/blobtoolkit", "resource": "repos/sanger-tol/blobtoolkit/actions/workflows/ci.yml", From b001c94005156064f9ea2bf755059c2dded824b8 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Sep 2025 09:18:47 +0100 Subject: [PATCH 21/37] Rolled out the session-retry mechanism to all API calls --- bin/generate_config.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/bin/generate_config.py b/bin/generate_config.py index 2135e475..1ac6c8f5 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -31,7 +31,18 @@ BUSCO_BASAL_LINEAGES = ["eukaryota_odb10", "bacteria_odb10", "archaea_odb10"] +# Wrapper around requests.get 
to use a "session", which can recover from network errors +def get_http_request_json(url): + retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504]) + adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) + session = requests.Session() + session.mount("http://", adapter) + session.mount("https://", adapter) + response = session.get(url) + return response.json() + +# Argument parsing def parse_args(args=None): Description = "Produce the various configuration files needed within the pipeline" @@ -99,12 +110,12 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo: record_id = "taxon-%d" % taxon_name else: # Resolve the taxon_id of the species - response = requests.get(GOAT_LOOKUP_API % taxon_name).json() + response = get_http_request_json(GOAT_LOOKUP_API % taxon_name) taxon_id = int(response["results"][0]["result"]["taxon_id"]) record_id = response["results"][0]["id"] # Using API, get the taxon_ids of the species and all parents - response = requests.get(GOAT_RECORD_API % record_id).json() + response = get_http_request_json(GOAT_RECORD_API % record_id) body = response["records"][0]["record"] return make_taxon_info_from_goat(body) @@ -113,12 +124,7 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo: def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]: # "/" has to be double encoded, e.g. "Gymnodinium sp. 
CCAP1117/9" -> "Gymnodinium%20sp.%20CCAP1117%252F9" url_safe_taxon_name = urllib.parse.quote(str(taxon_name).replace("/", "%2F")) - retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504]) - adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) - session = requests.Session() - session.mount("http://", adapter) - session.mount("https://", adapter) - response = session.get(NCBI_TAXONOMY_API % url_safe_taxon_name).json() + response = get_http_request_json(NCBI_TAXONOMY_API % url_safe_taxon_name) if "taxonomy" in response["taxonomy_nodes"][0]: body = response["taxonomy_nodes"][0]["taxonomy"] if with_lineage: @@ -186,7 +192,7 @@ def get_odb( def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]]: - response = requests.get(NCBI_DATASETS_API % accession).json() + response = get_http_request_json(NCBI_DATASETS_API % accession) if response["total_count"] != 1: print(f"Assembly not found: {accession}", file=sys.stderr) sys.exit(1) @@ -212,7 +218,7 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int] def get_sequence_report(accession: str): - response = requests.get(NCBI_SEQUENCE_API % accession).json() + response = get_http_request_json(NCBI_SEQUENCE_API % accession) if not response["reports"]: print(f"Assembly not found: {accession}", file=sys.stderr) sys.exit(1) From 10840b19a775ac06b96b850c46b2d073cf631d01 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Sep 2025 15:42:55 +0100 Subject: [PATCH 22/37] Changelog update --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3343656..553ef0da 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements & fixes - Upgrade Busco (#190) +- The pipeline now stops on Busco failures - Update resource requirements for BLASTN modules (#191) and 
BLOBTOOLKIT_WINDOWSTATS - Fixed the `test_full` profile (Sanger only) From dce9497649421074f491420f92b1333f61fab8d8 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Wed, 10 Sep 2025 16:53:00 +0100 Subject: [PATCH 23/37] Release name ! --- CHANGELOG.md | 2 +- CITATION.cff | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 553ef0da..0926d1e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[0.9.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.9.0)] – – [2025-09-11] +## [[0.9.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.9.0)] – Scyther – [2025-09-11] ### Enhancements & fixes diff --git a/CITATION.cff b/CITATION.cff index b87ed530..199f464e 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -69,7 +69,7 @@ license: MIT message: If you use this software, please cite it using the metadata from this file and all references from CITATIONS.md . 
repository-code: https://github.com/sanger-tol/blobtoolkit -title: sanger-tol/blobtoolkit v0.9.0 +title: sanger-tol/blobtoolkit v0.9.0 - Scyther type: software url: https://pipelines.tol.sanger.ac.uk/blobtoolkit version: 0.9.0 From 49fe8a149a4a22278d286e1a9dbf83d82ade81db Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 13:53:57 +0100 Subject: [PATCH 24/37] Update to include tmpdir --- .../diamond/blastp/diamond-blastp.diff | 21 +++++++++++++++++-- modules/nf-core/diamond/blastp/main.nf | 3 +++ .../diamond/blastx/diamond-blastx.diff | 21 +++++++++++++++++-- modules/nf-core/diamond/blastx/main.nf | 3 +++ 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/modules/nf-core/diamond/blastp/diamond-blastp.diff b/modules/nf-core/diamond/blastp/diamond-blastp.diff index 12608ea0..0ebe59eb 100644 --- a/modules/nf-core/diamond/blastp/diamond-blastp.diff +++ b/modules/nf-core/diamond/blastp/diamond-blastp.diff @@ -1,4 +1,7 @@ -Changes in module 'nf-core/diamond/blastp' +Changes in component 'nf-core/diamond/blastp' +'modules/nf-core/diamond/blastp/meta.yml' is unchanged +'modules/nf-core/diamond/blastp/environment.yml' is unchanged +Changes in 'diamond/blastp/main.nf': --- modules/nf-core/diamond/blastp/main.nf +++ modules/nf-core/diamond/blastp/main.nf @@ -12,6 +12,7 @@ @@ -17,13 +20,27 @@ Changes in module 'nf-core/diamond/blastp' switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break -@@ -59,6 +61,7 @@ +@@ -51,6 +53,8 @@ + gzip -c -d ${fasta} > ${fasta_name} + fi + ++ mkdir ./tmpdir/ ++ + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` + + diamond \\ +@@ -59,7 +63,9 @@ --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ + ${exclude_taxon} \\ ${args} \\ ++ --tmpdir ./tmpdir/ \\ --out ${prefix}.${out_ext} + cat <<-END_VERSIONS > versions.yml +'modules/nf-core/diamond/blastp/tests/main.nf.test' is unchanged +'modules/nf-core/diamond/blastp/tests/main.nf.test.snap' is unchanged 
+'modules/nf-core/diamond/blastp/tests/tags.yml' is unchanged ************************************************************ diff --git a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf index ae5a1248..d2040260 100644 --- a/modules/nf-core/diamond/blastp/main.nf +++ b/modules/nf-core/diamond/blastp/main.nf @@ -53,6 +53,8 @@ process DIAMOND_BLASTP { gzip -c -d ${fasta} > ${fasta_name} fi + mkdir ./tmpdir/ + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` diamond \\ @@ -63,6 +65,7 @@ process DIAMOND_BLASTP { --outfmt ${outfmt} ${columns} \\ ${exclude_taxon} \\ ${args} \\ + --tmpdir ./tmpdir/ \\ --out ${prefix}.${out_ext} cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/diamond/blastx/diamond-blastx.diff b/modules/nf-core/diamond/blastx/diamond-blastx.diff index eff4326a..28444535 100644 --- a/modules/nf-core/diamond/blastx/diamond-blastx.diff +++ b/modules/nf-core/diamond/blastx/diamond-blastx.diff @@ -1,4 +1,7 @@ -Changes in module 'nf-core/diamond/blastx' +Changes in component 'nf-core/diamond/blastx' +'modules/nf-core/diamond/blastx/meta.yml' is unchanged +'modules/nf-core/diamond/blastx/environment.yml' is unchanged +Changes in 'diamond/blastx/main.nf': --- modules/nf-core/diamond/blastx/main.nf +++ modules/nf-core/diamond/blastx/main.nf @@ -12,6 +12,7 @@ @@ -17,13 +20,27 @@ Changes in module 'nf-core/diamond/blastx' switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break -@@ -60,6 +62,7 @@ +@@ -52,6 +54,8 @@ + gzip -c -d ${fasta} > ${fasta_name} + fi + ++ mkdir ./tmpdir/ ++ + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` + + diamond \\ +@@ -60,7 +64,9 @@ --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ + ${exclude_taxon} \\ ${args} \\ ++ --tmpdir ./tmpdir/ \\ --out ${prefix}.${out_ext} \\ --log + +'modules/nf-core/diamond/blastx/tests/main.nf.test' is unchanged +'modules/nf-core/diamond/blastx/tests/main.nf.test.snap' is unchanged 
+'modules/nf-core/diamond/blastx/tests/tags.yml' is unchanged ************************************************************ diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf index dfa82e24..a3fbb1c4 100644 --- a/modules/nf-core/diamond/blastx/main.nf +++ b/modules/nf-core/diamond/blastx/main.nf @@ -54,6 +54,8 @@ process DIAMOND_BLASTX { gzip -c -d ${fasta} > ${fasta_name} fi + mkdir ./tmpdir/ + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` diamond \\ @@ -64,6 +66,7 @@ process DIAMOND_BLASTX { --outfmt ${outfmt} ${columns} \\ ${exclude_taxon} \\ ${args} \\ + --tmpdir ./tmpdir/ \\ --out ${prefix}.${out_ext} \\ --log From 29d3bc73f617a92a665fdc7de9fe538ea668cfc4 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 13:55:18 +0100 Subject: [PATCH 25/37] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0926d1e4..f5791e16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - The pipeline now stops on Busco failures - Update resource requirements for BLASTN modules (#191) and BLOBTOOLKIT_WINDOWSTATS - Fixed the `test_full` profile (Sanger only) +- Addition of `--tmpdir` to Diamond blast modules (#200) ### Software dependencies From 7e9ca920118e38455dc4a4ab2d403cea80af7780 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 13:57:38 +0100 Subject: [PATCH 26/37] Comment out some lines which may be causing a channel race condition --- subworkflows/local/input_check.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 32507e01..3bdae4ae 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -162,12 +162,12 @@ workflow INPUT_CHECK { } // Remove any invalid lineages from precomputed_busco - 
ch_busco_lineages_list = ch_busco_lineages.flatten() - ch_parsed_busco_filtered = ch_parsed_busco - .filter { meta, path -> - ch_busco_lineages.contains(meta.lineage) - } - ch_parsed_busco_filtered = ch_parsed_busco_filtered.ifEmpty { Channel.value([]) } + // ch_busco_lineages_list = ch_busco_lineages.flatten() + // ch_parsed_busco_filtered = ch_parsed_busco + // .filter { meta, path -> + // ch_busco_lineages.contains(meta.lineage) + // } + // ch_parsed_busco_filtered = ch_parsed_busco_filtered.ifEmpty { Channel.value([]) } // // Get the BUSCO path if set From cba18cc3641da56e8d543c9e994ed8fd614c13c7 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 15:51:03 +0100 Subject: [PATCH 27/37] Correction based on comments from @muffato --- conf/modules.config | 5 +- modules.json | 82 ++++++++++++++----- .../diamond/blastp/diamond-blastp.diff | 20 +++-- modules/nf-core/diamond/blastp/main.nf | 7 +- .../diamond/blastx/diamond-blastx.diff | 20 +++-- modules/nf-core/diamond/blastx/main.nf | 7 +- 6 files changed, 99 insertions(+), 42 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 87f686aa..af65237f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -105,11 +105,11 @@ process { } withName: "DIAMOND_BLASTP" { - ext.args = "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ext.args = { "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ( params.use_work_dir_as_temp ? " --tmpdir ./blastp_tmp/" : "" ) } } withName: "DIAMOND_BLASTX" { - ext.args = "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ext.args = { "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ( params.use_work_dir_as_temp ? 
" --tmpdir ./blastp_tmp/" : "" ) } } withName: "BLOBTK_DEPTH" { @@ -178,4 +178,3 @@ process { } } - diff --git a/modules.json b/modules.json index a2111293..c9b77b70 100644 --- a/modules.json +++ b/modules.json @@ -8,95 +8,129 @@ "blast/blastn": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" }, "busco/busco": { "branch": "master", "git_sha": "36c6c8445284e021d95ce30cdf743baef66b21aa", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/busco/busco/busco-busco.diff" }, "cat/cat": { "branch": "master", "git_sha": "08108058ea36a63f141c25c4e75f9f872a5b2296", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "diamond/blastp": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/diamond/blastp/diamond-blastp.diff" }, "diamond/blastx": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/diamond/blastx/diamond-blastx.diff" }, "fastawindows": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/fastawindows/fastawindows.diff" }, "gunzip": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/align": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/minimap2/align/minimap2-align.diff" }, "multiqc": { "branch": "master", "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", - "installed_by": 
["modules"] + "installed_by": [ + "modules" + ] }, "pigz/compress": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/flagstat": { "branch": "master", "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/index": { "branch": "master", "git_sha": "b13f07be4c508d6ff6312d354d09f2493243e208", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, "seqtk/subseq": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/seqtk/subseq/seqtk-subseq.diff" }, "untar": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "windowmasker/mkcounts": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "windowmasker/ustat": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -105,20 +139,26 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfschema_plugin": { "branch": "master", "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", - "installed_by": 
["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/diamond/blastp/diamond-blastp.diff b/modules/nf-core/diamond/blastp/diamond-blastp.diff index 0ebe59eb..81a8b4a4 100644 --- a/modules/nf-core/diamond/blastp/diamond-blastp.diff +++ b/modules/nf-core/diamond/blastp/diamond-blastp.diff @@ -20,25 +20,31 @@ Changes in 'diamond/blastp/main.nf': switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break -@@ -51,6 +53,8 @@ +@@ -46,10 +48,16 @@ + log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); + break + } ++ ++ def tmpdir_arg = args =~ /--tmpdir\s+(\S+)/ ++ def tmpdir = tmpdir_arg ? tmpdir_arg[0][1] : '' ++ + """ + if [ "${is_compressed}" == "true" ]; then gzip -c -d ${fasta} > ${fasta_name} fi - -+ mkdir ./tmpdir/ + ++ ${tmpdir ? "mkdir -p ${tmpdir}" : ''} + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` - diamond \\ -@@ -59,7 +63,9 @@ +@@ -59,6 +67,7 @@ --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ + ${exclude_taxon} \\ ${args} \\ -+ --tmpdir ./tmpdir/ \\ --out ${prefix}.${out_ext} - cat <<-END_VERSIONS > versions.yml 'modules/nf-core/diamond/blastp/tests/main.nf.test' is unchanged 'modules/nf-core/diamond/blastp/tests/main.nf.test.snap' is unchanged diff --git a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf index d2040260..382d149d 100644 --- a/modules/nf-core/diamond/blastp/main.nf +++ b/modules/nf-core/diamond/blastp/main.nf @@ -48,12 +48,16 @@ process DIAMOND_BLASTP { log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); break } + + def tmpdir_arg = args =~ /--tmpdir\s+(\S+)/ + def tmpdir = tmpdir_arg ? tmpdir_arg[0][1] : '' + """ if [ "${is_compressed}" == "true" ]; then gzip -c -d ${fasta} > ${fasta_name} fi - mkdir ./tmpdir/ + ${tmpdir ? 
"mkdir -p ${tmpdir}" : ''} DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` @@ -65,7 +69,6 @@ process DIAMOND_BLASTP { --outfmt ${outfmt} ${columns} \\ ${exclude_taxon} \\ ${args} \\ - --tmpdir ./tmpdir/ \\ --out ${prefix}.${out_ext} cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/diamond/blastx/diamond-blastx.diff b/modules/nf-core/diamond/blastx/diamond-blastx.diff index 28444535..04b7d3c2 100644 --- a/modules/nf-core/diamond/blastx/diamond-blastx.diff +++ b/modules/nf-core/diamond/blastx/diamond-blastx.diff @@ -20,25 +20,31 @@ Changes in 'diamond/blastx/main.nf': switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break -@@ -52,6 +54,8 @@ +@@ -47,10 +49,16 @@ + log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); + break + } ++ ++ def tmpdir_arg = args =~ /--tmpdir\s+(\S+)/ ++ def tmpdir = tmpdir_arg ? tmpdir_arg[0][1] : '' ++ + """ + if [ "${is_compressed}" == "true" ]; then gzip -c -d ${fasta} > ${fasta_name} fi - -+ mkdir ./tmpdir/ + ++ ${tmpdir ? "mkdir -p ${tmpdir}" : ''} + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` - diamond \\ -@@ -60,7 +64,9 @@ +@@ -60,6 +68,7 @@ --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ + ${exclude_taxon} \\ ${args} \\ -+ --tmpdir ./tmpdir/ \\ --out ${prefix}.${out_ext} \\ --log - 'modules/nf-core/diamond/blastx/tests/main.nf.test' is unchanged 'modules/nf-core/diamond/blastx/tests/main.nf.test.snap' is unchanged diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf index a3fbb1c4..9eddc5a4 100644 --- a/modules/nf-core/diamond/blastx/main.nf +++ b/modules/nf-core/diamond/blastx/main.nf @@ -49,12 +49,16 @@ process DIAMOND_BLASTX { log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); break } + + def tmpdir_arg = args =~ /--tmpdir\s+(\S+)/ + def tmpdir = tmpdir_arg ? 
tmpdir_arg[0][1] : '' + """ if [ "${is_compressed}" == "true" ]; then gzip -c -d ${fasta} > ${fasta_name} fi - mkdir ./tmpdir/ + ${tmpdir ? "mkdir -p ${tmpdir}" : ''} DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` @@ -66,7 +70,6 @@ process DIAMOND_BLASTX { --outfmt ${outfmt} ${columns} \\ ${exclude_taxon} \\ ${args} \\ - --tmpdir ./tmpdir/ \\ --out ${prefix}.${out_ext} \\ --log From c2dd92e60a9d91df723ac8b9e7e7be593502a302 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Fri, 19 Sep 2025 15:51:50 +0100 Subject: [PATCH 28/37] linter! --- modules.json | 82 ++++++++++++++-------------------------------------- 1 file changed, 21 insertions(+), 61 deletions(-) diff --git a/modules.json b/modules.json index c9b77b70..a2111293 100644 --- a/modules.json +++ b/modules.json @@ -8,129 +8,95 @@ "blast/blastn": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" }, "busco/busco": { "branch": "master", "git_sha": "36c6c8445284e021d95ce30cdf743baef66b21aa", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/busco/busco/busco-busco.diff" }, "cat/cat": { "branch": "master", "git_sha": "08108058ea36a63f141c25c4e75f9f872a5b2296", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "diamond/blastp": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/diamond/blastp/diamond-blastp.diff" }, "diamond/blastx": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/diamond/blastx/diamond-blastx.diff" }, "fastawindows": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + 
"installed_by": ["modules"], "patch": "modules/nf-core/fastawindows/fastawindows.diff" }, "gunzip": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "minimap2/align": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/minimap2/align/minimap2-align.diff" }, "multiqc": { "branch": "master", "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "pigz/compress": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/flagstat": { "branch": "master", "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/index": { "branch": "master", "git_sha": "b13f07be4c508d6ff6312d354d09f2493243e208", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/view": { "branch": "master", "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, "seqtk/subseq": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/seqtk/subseq/seqtk-subseq.diff" }, "untar": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "windowmasker/mkcounts": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "windowmasker/ustat": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - 
"installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -139,26 +105,20 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { "branch": "master", "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} From 427a6bd5b4113f21bf4e053cc7822fdfdffdc8ca Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Thu, 11 Sep 2025 08:50:30 +0100 Subject: [PATCH 29/37] Changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0926d1e4..188097ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,9 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Enhancements & fixes - Upgrade Busco (#190) -- The pipeline now stops on Busco failures +- The pipeline now stops on Busco failures (#194) - Update resource requirements for BLASTN modules (#191) and BLOBTOOLKIT_WINDOWSTATS - Fixed the `test_full` profile (Sanger only) +- Fixed some documentation (#193 and #197) +- Made GENERATE_CONFIG more resilient to network errors (#197) ### Software dependencies From 4bd86b872cab0059619d8de164f23d99d7e4c0d5 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 22 Sep 2025 09:33:23 +0100 Subject: [PATCH 30/37] In this pipeline we are happy with Prodigal failing --- modules/nf-core/busco/busco/busco-busco.diff | 22 +++++++++++++++++--- modules/nf-core/busco/busco/main.nf | 9 ++++++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/modules/nf-core/busco/busco/busco-busco.diff 
b/modules/nf-core/busco/busco/busco-busco.diff index 92a8374b..77864722 100644 --- a/modules/nf-core/busco/busco/busco-busco.diff +++ b/modules/nf-core/busco/busco/busco-busco.diff @@ -1,5 +1,4 @@ Changes in component 'nf-core/busco/busco' -'modules/nf-core/busco/busco/environment.yml' is unchanged 'modules/nf-core/busco/busco/meta.yml' is unchanged Changes in 'busco/busco/main.nf': --- modules/nf-core/busco/busco/main.nf @@ -21,8 +20,25 @@ Changes in 'busco/busco/main.nf': def intermediate_files = [ './*-busco/*/auto_lineage', './*-busco/*/**/{miniprot,hmmer,.bbtools}_output', +@@ -112,8 +111,13 @@ + + if grep 'Run failed; check logs' ${prefix}-busco.batch_summary.txt > /dev/null + then +- echo "Busco run failed" +- exit 1 ++ if grep -Fx 'Sequence too long (max 32000000 permitted).' ${prefix}-busco.log > /dev/null ++ then ++ echo "Prodigal can't run on this genome. Skipping it" ++ else ++ echo "Busco run failed" ++ exit 1 ++ fi + fi + + cat <<-END_VERSIONS > versions.yml -'modules/nf-core/busco/busco/tests/main.nf.test.snap' is unchanged -'modules/nf-core/busco/busco/tests/nextflow.config' is unchanged +'modules/nf-core/busco/busco/environment.yml' is unchanged 'modules/nf-core/busco/busco/tests/main.nf.test' is unchanged +'modules/nf-core/busco/busco/tests/nextflow.config' is unchanged +'modules/nf-core/busco/busco/tests/main.nf.test.snap' is unchanged ************************************************************ diff --git a/modules/nf-core/busco/busco/main.nf b/modules/nf-core/busco/busco/main.nf index a5ecd9cd..53476230 100644 --- a/modules/nf-core/busco/busco/main.nf +++ b/modules/nf-core/busco/busco/main.nf @@ -111,8 +111,13 @@ process BUSCO_BUSCO { if grep 'Run failed; check logs' ${prefix}-busco.batch_summary.txt > /dev/null then - echo "Busco run failed" - exit 1 + if grep -Fx 'Sequence too long (max 32000000 permitted).' ${prefix}-busco.log > /dev/null + then + echo "Prodigal can't run on this genome. 
Skipping it" + else + echo "Busco run failed" + exit 1 + fi fi cat <<-END_VERSIONS > versions.yml From ab502c71eb8b3d333c053ce9bded65dd3892a4ec Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Mon, 22 Sep 2025 09:34:51 +0100 Subject: [PATCH 31/37] Increase quadratically to waste less time on retries --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index e0874410..20c72346 100644 --- a/conf/base.config +++ b/conf/base.config @@ -107,7 +107,7 @@ process { withName: "BLAST_BLASTN|BLASTN_TAXON" { cpus = 4 - memory = { 2.GB * task.attempt } + memory = { 2.GB * task.attempt * task.attempt } time = 12.h } From 6a31a21d0fd4f20b8e565fc99d52f3945d6b2a74 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Tue, 16 Sep 2025 10:55:38 +0100 Subject: [PATCH 32/37] Alternative implementation that doesn't rely on accessing a variable outside the closure --- subworkflows/local/busco_diamond_blastp.nf | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index a0138289..bea4d394 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -25,16 +25,12 @@ workflow BUSCO_DIAMOND { // Prepare the BUSCO lineages // // 0. Initialise sone variables - basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] - def lineage_position = 0 + def basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] // 1. Start from the taxon's lineages busco_lin - // 2. Add the (missing) basal lineages - | map { lineages -> (lineages + basal_lineages).unique() } - | flatten () - // 3. Add a (0-based) index to record the original order (i.e. by age) - | map { lineage_name -> [lineage_name, lineage_position++] } - // 4. Move the lineage information to `meta` to be able to distinguish the BUSCO jobs and group their outputs later + // 2. 
Add the (missing) basal lineages and a (0-based) index to record the original order (i.e. by age) + | flatMap { lineages -> (lineages + basal_lineages).unique().withIndex() } + // 3. Move the lineage information to `meta` to be able to distinguish the BUSCO jobs and group their outputs later | combine ( fasta ) | map { lineage_name, lineage_index, meta, genome -> [meta + [lineage_name: lineage_name, lineage_index: lineage_index], genome] } | set { ch_fasta_with_lineage } From 82b98906e9a10b1732592801ba6157e1443d43c9 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 22 Sep 2025 11:09:53 +0100 Subject: [PATCH 33/37] Updates based on comments --- CHANGELOG.md | 1 + modules.json | 82 ++++++++++++++----- .../diamond/blastp/diamond-blastp.diff | 18 ++-- modules/nf-core/diamond/blastp/main.nf | 6 +- .../diamond/blastx/diamond-blastx.diff | 18 ++-- modules/nf-core/diamond/blastx/main.nf | 6 +- nextflow.config | 2 +- nextflow_schema.json | 4 +- 8 files changed, 77 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5791e16..83d084ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Update resource requirements for BLASTN modules (#191) and BLOBTOOLKIT_WINDOWSTATS - Fixed the `test_full` profile (Sanger only) - Addition of `--tmpdir` to Diamond blast modules (#200) +- `--use_work_dir_as_temp` is no longer a hidden param. 
### Software dependencies diff --git a/modules.json b/modules.json index a2111293..c9b77b70 100644 --- a/modules.json +++ b/modules.json @@ -8,95 +8,129 @@ "blast/blastn": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" }, "busco/busco": { "branch": "master", "git_sha": "36c6c8445284e021d95ce30cdf743baef66b21aa", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/busco/busco/busco-busco.diff" }, "cat/cat": { "branch": "master", "git_sha": "08108058ea36a63f141c25c4e75f9f872a5b2296", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "diamond/blastp": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/diamond/blastp/diamond-blastp.diff" }, "diamond/blastx": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/diamond/blastx/diamond-blastx.diff" }, "fastawindows": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/fastawindows/fastawindows.diff" }, "gunzip": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/align": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/minimap2/align/minimap2-align.diff" }, "multiqc": { "branch": "master", "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "pigz/compress": { 
"branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/flagstat": { "branch": "master", "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/index": { "branch": "master", "git_sha": "b13f07be4c508d6ff6312d354d09f2493243e208", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/samtools/view/samtools-view.diff" }, "seqtk/subseq": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/seqtk/subseq/seqtk-subseq.diff" }, "untar": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "windowmasker/mkcounts": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "windowmasker/ustat": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -105,20 +139,26 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfschema_plugin": { "branch": "master", "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} 
\ No newline at end of file diff --git a/modules/nf-core/diamond/blastp/diamond-blastp.diff b/modules/nf-core/diamond/blastp/diamond-blastp.diff index 81a8b4a4..cf18a7dd 100644 --- a/modules/nf-core/diamond/blastp/diamond-blastp.diff +++ b/modules/nf-core/diamond/blastp/diamond-blastp.diff @@ -20,24 +20,16 @@ Changes in 'diamond/blastp/main.nf': switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break -@@ -46,10 +48,16 @@ - log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); - break - } -+ -+ def tmpdir_arg = args =~ /--tmpdir\s+(\S+)/ -+ def tmpdir = tmpdir_arg ? tmpdir_arg[0][1] : '' -+ - """ - if [ "${is_compressed}" == "true" ]; then +@@ -51,6 +53,8 @@ gzip -c -d ${fasta} > ${fasta_name} fi -+ -+ ${tmpdir ? "mkdir -p ${tmpdir}" : ''} ++ mkdir -p ./blastp_temp ++ DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` -@@ -59,6 +67,7 @@ + diamond \\ +@@ -59,6 +63,7 @@ --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ diff --git a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf index 382d149d..64550215 100644 --- a/modules/nf-core/diamond/blastp/main.nf +++ b/modules/nf-core/diamond/blastp/main.nf @@ -48,16 +48,12 @@ process DIAMOND_BLASTP { log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); break } - - def tmpdir_arg = args =~ /--tmpdir\s+(\S+)/ - def tmpdir = tmpdir_arg ? tmpdir_arg[0][1] : '' - """ if [ "${is_compressed}" == "true" ]; then gzip -c -d ${fasta} > ${fasta_name} fi - ${tmpdir ? 
"mkdir -p ${tmpdir}" : ''} + mkdir -p ./blastp_temp DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` diff --git a/modules/nf-core/diamond/blastx/diamond-blastx.diff b/modules/nf-core/diamond/blastx/diamond-blastx.diff index 04b7d3c2..650be74a 100644 --- a/modules/nf-core/diamond/blastx/diamond-blastx.diff +++ b/modules/nf-core/diamond/blastx/diamond-blastx.diff @@ -20,24 +20,16 @@ Changes in 'diamond/blastx/main.nf': switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break -@@ -47,10 +49,16 @@ - log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); - break - } -+ -+ def tmpdir_arg = args =~ /--tmpdir\s+(\S+)/ -+ def tmpdir = tmpdir_arg ? tmpdir_arg[0][1] : '' -+ - """ - if [ "${is_compressed}" == "true" ]; then +@@ -52,6 +54,8 @@ gzip -c -d ${fasta} > ${fasta_name} fi -+ -+ ${tmpdir ? "mkdir -p ${tmpdir}" : ''} ++ mkdir -p ./blastx_temp ++ DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` -@@ -60,6 +68,7 @@ + diamond \\ +@@ -60,6 +64,7 @@ --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf index 9eddc5a4..3093c673 100644 --- a/modules/nf-core/diamond/blastx/main.nf +++ b/modules/nf-core/diamond/blastx/main.nf @@ -49,16 +49,12 @@ process DIAMOND_BLASTX { log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); break } - - def tmpdir_arg = args =~ /--tmpdir\s+(\S+)/ - def tmpdir = tmpdir_arg ? tmpdir_arg[0][1] : '' - """ if [ "${is_compressed}" == "true" ]; then gzip -c -d ${fasta} > ${fasta_name} fi - ${tmpdir ? 
"mkdir -p ${tmpdir}" : ''} + mkdir -p ./blastx_temp DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` diff --git a/nextflow.config b/nextflow.config index c65219f9..bacd63d9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -38,7 +38,7 @@ params { skip_taxon_filtering = false // Execution options - use_work_dir_as_temp = false + use_work_dir_as_temp = false // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 26724683..a6fbbcda 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -175,9 +175,9 @@ "properties": { "use_work_dir_as_temp": { "type": "boolean", - "description": "Set to true to make tools (e.g. sort, FastK, MerquryFK) use the work directory for their temporary files, rather than the system default.", + "description": "Set to true to make tools (e.g. sort, FastK, MerquryFK, BLASTP, BLASTX) use the work directory for their temporary files, rather than the system default.", "fa_icon": "fas fa-arrow-circle-down", - "hidden": true + "hidden": false } }, "fa_icon": "fas fa-running" From f134a969bf73688ebe255b4cb059a88f66a3611f Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 22 Sep 2025 11:10:18 +0100 Subject: [PATCH 34/37] Updates based on comments --- modules.json | 82 ++++++++++++++-------------------------------------- 1 file changed, 21 insertions(+), 61 deletions(-) diff --git a/modules.json b/modules.json index c9b77b70..a2111293 100644 --- a/modules.json +++ b/modules.json @@ -8,129 +8,95 @@ "blast/blastn": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/blast/blastn/blast-blastn.diff" }, "busco/busco": { "branch": "master", "git_sha": "36c6c8445284e021d95ce30cdf743baef66b21aa", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/busco/busco/busco-busco.diff" }, "cat/cat": { "branch": "master", "git_sha": 
"08108058ea36a63f141c25c4e75f9f872a5b2296", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "diamond/blastp": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/diamond/blastp/diamond-blastp.diff" }, "diamond/blastx": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/diamond/blastx/diamond-blastx.diff" }, "fastawindows": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/fastawindows/fastawindows.diff" }, "gunzip": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "minimap2/align": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/minimap2/align/minimap2-align.diff" }, "multiqc": { "branch": "master", "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "pigz/compress": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/flagstat": { "branch": "master", "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/index": { "branch": "master", "git_sha": "b13f07be4c508d6ff6312d354d09f2493243e208", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "samtools/view": { "branch": "master", "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": 
"modules/nf-core/samtools/view/samtools-view.diff" }, "seqtk/subseq": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/seqtk/subseq/seqtk-subseq.diff" }, "untar": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "windowmasker/mkcounts": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "windowmasker/ustat": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -139,26 +105,20 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { "branch": "master", "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} From cc150bd9bccc370b7275c1caf1784be985ca4944 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 22 Sep 2025 12:22:33 +0100 Subject: [PATCH 35/37] Darn spelling mistakes! 
--- modules/nf-core/diamond/blastp/diamond-blastp.diff | 2 +- modules/nf-core/diamond/blastp/main.nf | 2 +- modules/nf-core/diamond/blastx/diamond-blastx.diff | 2 +- modules/nf-core/diamond/blastx/main.nf | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/nf-core/diamond/blastp/diamond-blastp.diff b/modules/nf-core/diamond/blastp/diamond-blastp.diff index cf18a7dd..7d0916da 100644 --- a/modules/nf-core/diamond/blastp/diamond-blastp.diff +++ b/modules/nf-core/diamond/blastp/diamond-blastp.diff @@ -24,7 +24,7 @@ Changes in 'diamond/blastp/main.nf': gzip -c -d ${fasta} > ${fasta_name} fi -+ mkdir -p ./blastp_temp ++ mkdir -p ./blastp_tmp + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` diff --git a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf index 64550215..4d453763 100644 --- a/modules/nf-core/diamond/blastp/main.nf +++ b/modules/nf-core/diamond/blastp/main.nf @@ -53,7 +53,7 @@ process DIAMOND_BLASTP { gzip -c -d ${fasta} > ${fasta_name} fi - mkdir -p ./blastp_temp + mkdir -p ./blastp_tmp DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` diff --git a/modules/nf-core/diamond/blastx/diamond-blastx.diff b/modules/nf-core/diamond/blastx/diamond-blastx.diff index 650be74a..5219a26f 100644 --- a/modules/nf-core/diamond/blastx/diamond-blastx.diff +++ b/modules/nf-core/diamond/blastx/diamond-blastx.diff @@ -24,7 +24,7 @@ Changes in 'diamond/blastx/main.nf': gzip -c -d ${fasta} > ${fasta_name} fi -+ mkdir -p ./blastx_temp ++ mkdir -p ./blastx_tmp + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf index 3093c673..d45c5d5d 100644 --- a/modules/nf-core/diamond/blastx/main.nf +++ b/modules/nf-core/diamond/blastx/main.nf @@ -54,7 +54,7 @@ process DIAMOND_BLASTX { gzip -c -d ${fasta} > ${fasta_name} fi - mkdir -p ./blastx_temp + mkdir -p ./blastx_tmp DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` From 
a468982f627e97cac1c4708163a2cd6850040e97 Mon Sep 17 00:00:00 2001 From: DLBPointon Date: Mon, 22 Sep 2025 12:43:40 +0100 Subject: [PATCH 36/37] Darn spelling mistakes! --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index af65237f..a28287c3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -109,7 +109,7 @@ process { } withName: "DIAMOND_BLASTX" { - ext.args = { "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ( params.use_work_dir_as_temp ? " --tmpdir ./blastp_tmp/" : "" ) } + ext.args = { "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ( params.use_work_dir_as_temp ? " --tmpdir ./blastx_tmp/" : "" ) } } withName: "BLOBTK_DEPTH" { From c74dcea9c94c6cffc70eaaeb1697e07242824ca3 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 3 Oct 2025 11:19:24 +0100 Subject: [PATCH 37/37] New release date --- CHANGELOG.md | 2 +- CITATION.cff | 2 +- ro-crate-metadata.json | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 87031f64..35f48151 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [[0.9.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.9.0)] – Scyther – [2025-09-11] +## [[0.9.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.9.0)] – Scyther – [2025-10-03] ### Enhancements & fixes diff --git a/CITATION.cff b/CITATION.cff index 199f464e..5008c495 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -63,7 +63,7 @@ authors: orcid: https://orcid.org/0000-0003-1658-1762 website: https://github.com/BethYates cff-version: 1.2.0 -date-released: "2025-09-10" +date-released: "2025-10-03" doi: 10.5281/zenodo.7949058 license: MIT message: If you use this software, please cite it using the metadata from this file diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 97a85b19..9839d6d4 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -22,7 +22,7 @@ "@id": "./", "@type": "Dataset", "creativeWorkStatus": "Stable", - "datePublished": "2025-09-10T14:37:46+00:00", + "datePublished": "2025-10-03T10:18:25+00:00", "description": "# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)\n[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)\n\n\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with 
singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/blobtoolkit)\n\n## Introduction\n\n**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes.\nIt takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome statistics, coverage and completeness information, combines them in a TSV file by window size to create a BlobDir dataset and static plots.\n\n1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))\n2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))\n3. Determine the appropriate BUSCO lineages from the taxonomy.\n4. Run BUSCO ([`busco`](https://busco.ezlab.org/))\n5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))\n7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))\n8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))\n9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))\n11. Import analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))\n12. 
Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,datatype,datafile,library_layout\nmMelMel3_hic,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED\nmMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED\nmMelMel3_ont,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE\n```\n\nEach row represents a read set (aligned or not).\nThe first column (sample name) must be unique.\nIf you have multiple read sets from the same actual sample, make sure you edit the sample names to make them unique.\nThe datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).\nThe library layout indicates whether the reads are paired or single.\nThe aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/blobtoolkit \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --fasta genome.fasta \\\n --accession GCA_XXXXXXXXX.X \\\n --taxon XXXX \\\n --taxdump /path/to/taxdump/database \\\n --blastp /path/to/diamond/database \\\n --blastn /path/to/blastn/database \\\n --blastx /path/to/blastx/database\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters).\n\n## Pipeline output\n\nFor more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output).\n\n## Credits\n\nsanger-tol/blobtoolkit was written in Nextflow by:\n\n- [Alexander Ramos Diaz](https://github.com/alxndrdiaz)\n- [Zaynab Butt](https://github.com/zb32)\n- [Priyanka Surana](https://github.com/priyanka-surana)\n- [Matthieu Muffato](https://github.com/muffato)\n- [Tyler Chafin](https://github.com/tkchafin)\n- [Yumi Sims](https://github.com/yumisims)\n- [Damon-Lee Bernard Pointon](https://github.com/DLBPointon)\n\nThe original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Guoying Qi](https://github.com/gq1)\n- [Bethan Yates](https://github.com/BethYates)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/blobtoolkit for your analysis, please cite it using the following DOI: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed 
and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { @@ -105,7 +105,7 @@ }, "mentions": [ { - "@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513" + "@id": "#6708f1e3-5ef2-4308-b443-63390e3bac9c" } ], "name": "sanger-tol/blobtoolkit" @@ -170,7 +170,7 @@ } ], "dateCreated": "", - "dateModified": "2025-09-10T15:37:46Z", + "dateModified": "2025-10-03T11:18:25Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", "keywords": [ "nextflow", @@ -218,11 +218,11 @@ "version": "!>=24.04.2" }, { - "@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513", + "@id": "#6708f1e3-5ef2-4308-b443-63390e3bac9c", "@type": "TestSuite", "instance": [ { - "@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd" + "@id": "#d0d252ee-26c2-4e03-bb49-98d3ad100a90" } ], "mainEntity": { @@ -231,7 +231,7 @@ "name": "Test suite for sanger-tol/blobtoolkit" }, { - "@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd", + "@id": "#d0d252ee-26c2-4e03-bb49-98d3ad100a90", "@type": "TestInstance", "name": "GitHub Actions workflow for testing sanger-tol/blobtoolkit", "resource": "repos/sanger-tol/blobtoolkit/actions/workflows/ci.yml",