diff --git a/.nf-core.yml b/.nf-core.yml index a6039fde..dc755be7 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -43,4 +43,4 @@ template: outdir: . skip_features: - igenomes - version: 0.8.1 + version: 0.9.0 diff --git a/CHANGELOG.md b/CHANGELOG.md index 951ee356..0926d1e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [[0.8.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.8.1)] – Sprigatito (H1) – [2025-08-19] +## [[0.9.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.9.0)] – Scyther – [2025-09-11] ### Enhancements & fixes - Upgrade Busco (#190) -- Update resource requirements for BLASTN modules (#191) +- The pipeline now stops on Busco failures +- Update resource requirements for BLASTN modules (#191) and BLOBTOOLKIT_WINDOWSTATS +- Fixed the `test_full` profile (Sanger only) ### Software dependencies diff --git a/CITATION.cff b/CITATION.cff index 3a0ae93e..199f464e 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -63,13 +63,13 @@ authors: orcid: https://orcid.org/0000-0003-1658-1762 website: https://github.com/BethYates cff-version: 1.2.0 -date-released: "2025-08-20" +date-released: "2025-09-10" doi: 10.5281/zenodo.7949058 license: MIT message: If you use this software, please cite it using the metadata from this file and all references from CITATIONS.md . 
repository-code: https://github.com/sanger-tol/blobtoolkit -title: sanger-tol/blobtoolkit v0.8.0 - Sprigatito +title: sanger-tol/blobtoolkit v0.9.0 - Scyther type: software url: https://pipelines.tol.sanger.ac.uk/blobtoolkit -version: 0.8.0 +version: 0.9.0 diff --git a/assets/schema_input.json b/assets/schema_input.json index db9d05c9..b97c5d0e 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -9,7 +9,7 @@ "properties": { "sample": { "type": "string", - "description": "Sample Name", + "description": "Sample identifier", "pattern": "^\\S+$", "errorMessage": "Sample name must be provided, be unique, and cannot contain spaces", "meta": ["id"] @@ -18,7 +18,7 @@ "type": "string", "pattern": "^\\S+$", "enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"], - "errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'", + "errorMessage": "Data type must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio' or 'pacbio_clr'", "meta": ["datatype"] }, "datafile": { diff --git a/bin/generate_config.py b/bin/generate_config.py index 2135e475..1ac6c8f5 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -31,7 +31,18 @@ BUSCO_BASAL_LINEAGES = ["eukaryota_odb10", "bacteria_odb10", "archaea_odb10"] +# Wrapper around requests.get to use a "session", which can recover from network errors +def get_http_request_json(url): + retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504]) + adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) + session = requests.Session() + session.mount("http://", adapter) + session.mount("https://", adapter) + response = session.get(url) + return response.json() + +# Argument parsing def parse_args(args=None): Description = "Produce the various configuration files needed within the pipeline" @@ -99,12 +110,12 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo: record_id = "taxon-%d" % taxon_name else: # 
Resolve the taxon_id of the species - response = requests.get(GOAT_LOOKUP_API % taxon_name).json() + response = get_http_request_json(GOAT_LOOKUP_API % taxon_name) taxon_id = int(response["results"][0]["result"]["taxon_id"]) record_id = response["results"][0]["id"] # Using API, get the taxon_ids of the species and all parents - response = requests.get(GOAT_RECORD_API % record_id).json() + response = get_http_request_json(GOAT_RECORD_API % record_id) body = response["records"][0]["record"] return make_taxon_info_from_goat(body) @@ -113,12 +124,7 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo: def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]: # "/" has to be double encoded, e.g. "Gymnodinium sp. CCAP1117/9" -> "Gymnodinium%20sp.%20CCAP1117%252F9" url_safe_taxon_name = urllib.parse.quote(str(taxon_name).replace("/", "%2F")) - retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504]) - adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) - session = requests.Session() - session.mount("http://", adapter) - session.mount("https://", adapter) - response = session.get(NCBI_TAXONOMY_API % url_safe_taxon_name).json() + response = get_http_request_json(NCBI_TAXONOMY_API % url_safe_taxon_name) if "taxonomy" in response["taxonomy_nodes"][0]: body = response["taxonomy_nodes"][0]["taxonomy"] if with_lineage: @@ -186,7 +192,7 @@ def get_odb( def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]]: - response = requests.get(NCBI_DATASETS_API % accession).json() + response = get_http_request_json(NCBI_DATASETS_API % accession) if response["total_count"] != 1: print(f"Assembly not found: {accession}", file=sys.stderr) sys.exit(1) @@ -212,7 +218,7 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int] def get_sequence_report(accession: str): - response = 
requests.get(NCBI_SEQUENCE_API % accession).json() + response = get_http_request_json(NCBI_SEQUENCE_API % accession) if not response["reports"]: print(f"Assembly not found: {accession}", file=sys.stderr) sys.exit(1) diff --git a/conf/base.config b/conf/base.config index a429ef5f..e0874410 100644 --- a/conf/base.config +++ b/conf/base.config @@ -84,7 +84,8 @@ process { cpus = 1 // 3 GB per 1 Gbp memory = { 3.GB * task.attempt * Math.ceil(meta.genome_size / 1000000000) } - time = { 4.h * task.attempt } + // 1 hour per 100 Mbp + time = { 1.h * Math.ceil(meta.genome_size / 100000000) * task.attempt } } withName: 'FASTAWINDOWS' { diff --git a/nextflow.config b/nextflow.config index 3ff3a417..c65219f9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -339,7 +339,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '!>=24.04.2' - version = '0.8.1' + version = '0.9.0' doi = '10.5281/zenodo.7949058' } diff --git a/nextflow_schema.json b/nextflow_schema.json index e83c29b1..26724683 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -303,7 +303,8 @@ "multiqc_methods_description": { "type": "string", "description": "Custom MultiQC yaml file containing HTML including a methods description.", - "fa_icon": "fas fa-cog" + "fa_icon": "fas fa-cog", + "hidden": true }, "validate_params": { "type": "boolean", @@ -325,9 +326,6 @@ { "$ref": "#/$defs/input_output_options" }, - { - "$ref": "#/$defs/institutional_config_options" - }, { "$ref": "#/$defs/reference_genome_options" }, diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index bac64400..97a85b19 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -22,7 +22,7 @@ "@id": "./", "@type": "Dataset", "creativeWorkStatus": "Stable", - "datePublished": "2025-08-20T18:11:56+00:00", + "datePublished": "2025-09-10T14:37:46+00:00", "description": "# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png)\n\n[![GitHub Actions CI 
Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)\n[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)\n\n\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/blobtoolkit)\n\n## Introduction\n\n**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes.\nIt takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome statistics, coverage and completeness information, combines them in a TSV file by window size to create a BlobDir dataset and static plots.\n\n1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))\n2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))\n3. Determine the appropriate BUSCO lineages from the taxonomy.\n4. Run BUSCO ([`busco`](https://busco.ezlab.org/))\n5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n6. 
Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))\n7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))\n8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))\n9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))\n11. Import analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))\n12. Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,datatype,datafile,library_layout\nmMelMel3_hic,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED\nmMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED\nmMelMel3_ont,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE\n```\n\nEach row represents a read set (aligned or not).\nThe first column (sample name) must be unique.\nIf you have multiple read sets from the same actual sample, make sure you edit the sample names to make them unique.\nThe datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).\nThe library layout indicates whether the reads are paired or single.\nThe aligned read files can be generated using the 
[sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/blobtoolkit \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --fasta genome.fasta \\\n --accession GCA_XXXXXXXXX.X \\\n --taxon XXXX \\\n --taxdump /path/to/taxdump/database \\\n --blastp /path/to/diamond/database \\\n --blastn /path/to/blastn/database \\\n --blastx /path/to/blastx/database\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters).\n\n## Pipeline output\n\nFor more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output).\n\n## Credits\n\nsanger-tol/blobtoolkit was written in Nextflow by:\n\n- [Alexander Ramos Diaz](https://github.com/alxndrdiaz)\n- [Zaynab Butt](https://github.com/zb32)\n- [Priyanka Surana](https://github.com/priyanka-surana)\n- [Matthieu Muffato](https://github.com/muffato)\n- [Tyler Chafin](https://github.com/tkchafin)\n- [Yumi Sims](https://github.com/yumisims)\n- [Damon-Lee Bernard Pointon](https://github.com/DLBPointon)\n\nThe original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Guoying 
Qi](https://github.com/gq1)\n- [Bethan Yates](https://github.com/BethYates)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/blobtoolkit for your analysis, please cite it using the following DOI: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { @@ -105,7 +105,7 @@ }, "mentions": [ { - "@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7" + "@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513" } ], "name": "sanger-tol/blobtoolkit" @@ -170,7 +170,7 @@ } ], "dateCreated": "", - "dateModified": "2025-08-20T19:11:56Z", + "dateModified": "2025-09-10T15:37:46Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", "keywords": [ "nextflow", @@ -199,10 +199,10 @@ }, "url": [ "https://github.com/sanger-tol/blobtoolkit", - "https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.8.0/" + "https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.9.0/" ], "version": [ - "0.8.0" + "0.9.0" ] }, { @@ -218,11 +218,11 @@ "version": "!>=24.04.2" }, { - "@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7", + "@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513", "@type": "TestSuite", "instance": [ { - "@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0" + "@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd" } ], "mainEntity": { @@ -231,7 +231,7 @@ "name": "Test suite for sanger-tol/blobtoolkit" }, { - "@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0", + "@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd", "@type": "TestInstance", "name": "GitHub Actions workflow for testing sanger-tol/blobtoolkit", "resource": "repos/sanger-tol/blobtoolkit/actions/workflows/ci.yml",