Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .nf-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@ template:
outdir: .
skip_features:
- igenomes
version: 0.8.1
version: 0.9.0
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[0.8.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.8.1)] – Sprigatito (H1) – [2025-08-19]
## [[0.9.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.9.0)] – Scyther – [2025-09-11]

### Enhancements & fixes

- Upgrade Busco (#190)
- Update resource requirements for BLASTN modules (#191)
- The pipeline now stops on Busco failures
- Update resource requirements for BLASTN modules (#191) and BLOBTOOLKIT_WINDOWSTATS
- Fixed the `test_full` profile (Sanger only)

### Software dependencies

Expand Down
6 changes: 3 additions & 3 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ authors:
orcid: https://orcid.org/0000-0003-1658-1762
website: https://github.com/BethYates
cff-version: 1.2.0
date-released: "2025-08-20"
date-released: "2025-09-10"
doi: 10.5281/zenodo.7949058
license: MIT
message: If you use this software, please cite it using the metadata from this file
and all references from CITATIONS.md .
repository-code: https://github.com/sanger-tol/blobtoolkit
title: sanger-tol/blobtoolkit v0.8.0 - Sprigatito
title: sanger-tol/blobtoolkit v0.9.0 - Scyther
type: software
url: https://pipelines.tol.sanger.ac.uk/blobtoolkit
version: 0.8.0
version: 0.9.0
4 changes: 2 additions & 2 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"properties": {
"sample": {
"type": "string",
"description": "Sample Name",
"description": "Sample identifier",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided, be unique, and cannot contain spaces",
"meta": ["id"]
Expand All @@ -18,7 +18,7 @@
"type": "string",
"pattern": "^\\S+$",
"enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"],
"errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'",
"errorMessage": "Data type must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'",
"meta": ["datatype"]
},
"datafile": {
Expand Down
26 changes: 16 additions & 10 deletions bin/generate_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,18 @@

BUSCO_BASAL_LINEAGES = ["eukaryota_odb10", "bacteria_odb10", "archaea_odb10"]

# Wrapper around requests.get to use a "session", which can recover from network errors
def get_http_request_json(url):
    """Fetch *url* over HTTP(S) and return the response body decoded as JSON.

    Wraps ``requests.get`` in a Session mounted with a retrying adapter, so
    transient network errors and retryable HTTP statuses are retried
    automatically instead of failing the whole pipeline run.
    """
    # Up to 5 retries with exponential backoff on the usual transient
    # statuses (rate-limit and server-side errors).
    retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    # Mount the retrying adapter for both plain and TLS endpoints.
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    response = session.get(url)
    # NOTE(review): no raise_for_status() here — if all retries end in a
    # non-2xx response, the failure surfaces as a JSON decode error rather
    # than an HTTP error; confirm this is the intended behaviour.
    return response.json()


# Argument parsing
def parse_args(args=None):
Description = "Produce the various configuration files needed within the pipeline"

Expand Down Expand Up @@ -99,12 +110,12 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo:
record_id = "taxon-%d" % taxon_name
else:
# Resolve the taxon_id of the species
response = requests.get(GOAT_LOOKUP_API % taxon_name).json()
response = get_http_request_json(GOAT_LOOKUP_API % taxon_name)
taxon_id = int(response["results"][0]["result"]["taxon_id"])
record_id = response["results"][0]["id"]

# Using API, get the taxon_ids of the species and all parents
response = requests.get(GOAT_RECORD_API % record_id).json()
response = get_http_request_json(GOAT_RECORD_API % record_id)
body = response["records"][0]["record"]
return make_taxon_info_from_goat(body)

Expand All @@ -113,12 +124,7 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo:
def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]:
# "/" has to be double encoded, e.g. "Gymnodinium sp. CCAP1117/9" -> "Gymnodinium%20sp.%20CCAP1117%252F9"
url_safe_taxon_name = urllib.parse.quote(str(taxon_name).replace("/", "%2F"))
retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504])
adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
session = requests.Session()
session.mount("http://", adapter)
session.mount("https://", adapter)
response = session.get(NCBI_TAXONOMY_API % url_safe_taxon_name).json()
response = get_http_request_json(NCBI_TAXONOMY_API % url_safe_taxon_name)
if "taxonomy" in response["taxonomy_nodes"][0]:
body = response["taxonomy_nodes"][0]["taxonomy"]
if with_lineage:
Expand Down Expand Up @@ -186,7 +192,7 @@ def get_odb(


def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]]:
response = requests.get(NCBI_DATASETS_API % accession).json()
response = get_http_request_json(NCBI_DATASETS_API % accession)
if response["total_count"] != 1:
print(f"Assembly not found: {accession}", file=sys.stderr)
sys.exit(1)
Expand All @@ -212,7 +218,7 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]


def get_sequence_report(accession: str):
response = requests.get(NCBI_SEQUENCE_API % accession).json()
response = get_http_request_json(NCBI_SEQUENCE_API % accession)
if not response["reports"]:
print(f"Assembly not found: {accession}", file=sys.stderr)
sys.exit(1)
Expand Down
3 changes: 2 additions & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ process {
cpus = 1
// 3 GB per 1 Gbp
memory = { 3.GB * task.attempt * Math.ceil(meta.genome_size / 1000000000) }
time = { 4.h * task.attempt }
// 1 hour per 100 Mbp
time = { 1.h * Math.ceil(meta.genome_size / 100000000) * task.attempt }
}

withName: 'FASTAWINDOWS' {
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ manifest {
mainScript = 'main.nf'
defaultBranch = 'main'
nextflowVersion = '!>=24.04.2'
version = '0.8.1'
version = '0.9.0'
doi = '10.5281/zenodo.7949058'
}

Expand Down
6 changes: 2 additions & 4 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,8 @@
"multiqc_methods_description": {
"type": "string",
"description": "Custom MultiQC yaml file containing HTML including a methods description.",
"fa_icon": "fas fa-cog"
"fa_icon": "fas fa-cog",
"hidden": true
},
"validate_params": {
"type": "boolean",
Expand All @@ -325,9 +326,6 @@
{
"$ref": "#/$defs/input_output_options"
},
{
"$ref": "#/$defs/institutional_config_options"
},
{
"$ref": "#/$defs/reference_genome_options"
},
Expand Down
16 changes: 8 additions & 8 deletions ro-crate-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"@id": "./",
"@type": "Dataset",
"creativeWorkStatus": "Stable",
"datePublished": "2025-08-20T18:11:56+00:00",
"datePublished": "2025-09-10T14:37:46+00:00",
"description": "# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)\n[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)\n\n<!--[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)-->\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/blobtoolkit)\n\n## Introduction\n\n**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes.\nIt takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome statistics, coverage and completeness information, combines them in a TSV file by window size to create a BlobDir dataset and static plots.\n\n1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))\n2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))\n3. 
Determine the appropriate BUSCO lineages from the taxonomy.\n4. Run BUSCO ([`busco`](https://busco.ezlab.org/))\n5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))\n7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))\n8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))\n9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))\n11. Import analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))\n12. Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,datatype,datafile,library_layout\nmMelMel3_hic,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED\nmMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED\nmMelMel3_ont,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE\n```\n\nEach row represents a read set (aligned or not).\nThe first column (sample name) must be unique.\nIf you have multiple read sets from the same actual sample, make sure you edit the sample names to make them unique.\nThe datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).\nThe library layout indicates whether the reads are paired or single.\nThe aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/blobtoolkit \\\n -profile <docker/singularity/.../institute> \\\n --input samplesheet.csv \\\n --outdir <OUTDIR> \\\n --fasta genome.fasta \\\n --accession GCA_XXXXXXXXX.X \\\n --taxon XXXX \\\n --taxdump /path/to/taxdump/database \\\n --blastp /path/to/diamond/database \\\n --blastn /path/to/blastn/database \\\n --blastx /path/to/blastx/database\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters).\n\n## Pipeline output\n\nFor more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output).\n\n## Credits\n\nsanger-tol/blobtoolkit was written in Nextflow by:\n\n- [Alexander Ramos Diaz](https://github.com/alxndrdiaz)\n- [Zaynab Butt](https://github.com/zb32)\n- [Priyanka Surana](https://github.com/priyanka-surana)\n- [Matthieu Muffato](https://github.com/muffato)\n- [Tyler Chafin](https://github.com/tkchafin)\n- [Yumi Sims](https://github.com/yumisims)\n- [Damon-Lee Bernard Pointon](https://github.com/DLBPointon)\n\nThe original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Guoying Qi](https://github.com/gq1)\n- [Bethan Yates](https://github.com/BethYates)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/blobtoolkit for your analysis, please cite it using the following DOI: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed 
and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n",
"hasPart": [
{
Expand Down Expand Up @@ -105,7 +105,7 @@
},
"mentions": [
{
"@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7"
"@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513"
}
],
"name": "sanger-tol/blobtoolkit"
Expand Down Expand Up @@ -170,7 +170,7 @@
}
],
"dateCreated": "",
"dateModified": "2025-08-20T19:11:56Z",
"dateModified": "2025-09-10T15:37:46Z",
"dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/",
"keywords": [
"nextflow",
Expand Down Expand Up @@ -199,10 +199,10 @@
},
"url": [
"https://github.com/sanger-tol/blobtoolkit",
"https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.8.0/"
"https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.9.0/"
],
"version": [
"0.8.0"
"0.9.0"
]
},
{
Expand All @@ -218,11 +218,11 @@
"version": "!>=24.04.2"
},
{
"@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7",
"@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513",
"@type": "TestSuite",
"instance": [
{
"@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0"
"@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd"
}
],
"mainEntity": {
Expand All @@ -231,7 +231,7 @@
"name": "Test suite for sanger-tol/blobtoolkit"
},
{
"@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0",
"@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd",
"@type": "TestInstance",
"name": "GitHub Actions workflow for testing sanger-tol/blobtoolkit",
"resource": "repos/sanger-tol/blobtoolkit/actions/workflows/ci.yml",
Expand Down