Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .nf-core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@ template:
outdir: .
skip_features:
- igenomes
version: 0.8.1
version: 0.9.0
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[0.8.1](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.8.1)] – Sprigatito (H1) – [2025-08-19]
## [[0.9.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.9.0)] – Scyther – [2025-09-11]

### Enhancements & fixes

- Upgrade Busco (#190)
- Update resource requirements for BLASTN modules (#191)
- The pipeline now stops on Busco failures
- Update resource requirements for BLASTN modules (#191) and BLOBTOOLKIT_WINDOWSTATS
- Fixed the `test_full` profile (Sanger only)

### Software dependencies

Expand Down
6 changes: 3 additions & 3 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ authors:
orcid: https://orcid.org/0000-0003-1658-1762
website: https://github.com/BethYates
cff-version: 1.2.0
date-released: "2025-08-20"
date-released: "2025-09-10"
doi: 10.5281/zenodo.7949058
license: MIT
message: If you use this software, please cite it using the metadata from this file
and all references from CITATIONS.md .
repository-code: https://github.com/sanger-tol/blobtoolkit
title: sanger-tol/blobtoolkit v0.8.0 - Sprigatito
title: sanger-tol/blobtoolkit v0.9.0 - Scyther
type: software
url: https://pipelines.tol.sanger.ac.uk/blobtoolkit
version: 0.8.0
version: 0.9.0
4 changes: 2 additions & 2 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"properties": {
"sample": {
"type": "string",
"description": "Sample Name",
"description": "Sample identifier",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided, be unique, and cannot contain spaces",
"meta": ["id"]
Expand All @@ -18,7 +18,7 @@
"type": "string",
"pattern": "^\\S+$",
"enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"],
"errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'",
"errorMessage": "Data type must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'",
"meta": ["datatype"]
},
"datafile": {
Expand Down
26 changes: 16 additions & 10 deletions bin/generate_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,18 @@

BUSCO_BASAL_LINEAGES = ["eukaryota_odb10", "bacteria_odb10", "archaea_odb10"]

# Wrapper around requests.get to use a "session", which can recover from network errors
def get_http_request_json(url):
    """Fetch *url* over HTTP(S) and return the response body decoded as JSON.

    Wraps ``requests.get`` in a Session mounted with a retrying adapter, so
    transient network errors and retryable HTTP statuses are retried
    automatically instead of failing the whole pipeline run.
    """
    # Up to 5 retries with exponential backoff on the usual transient
    # statuses (rate-limit and server-side errors).
    retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504])
    adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    # Mount the retrying adapter for both plain and TLS endpoints.
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    response = session.get(url)
    # NOTE(review): no raise_for_status() here — if all retries end in a
    # non-2xx response, the failure surfaces as a JSON decode error rather
    # than an HTTP error; confirm this is the intended behaviour.
    return response.json()


# Argument parsing
def parse_args(args=None):
Description = "Produce the various configuration files needed within the pipeline"

Expand Down Expand Up @@ -99,12 +110,12 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo:
record_id = "taxon-%d" % taxon_name
else:
# Resolve the taxon_id of the species
response = requests.get(GOAT_LOOKUP_API % taxon_name).json()
response = get_http_request_json(GOAT_LOOKUP_API % taxon_name)
taxon_id = int(response["results"][0]["result"]["taxon_id"])
record_id = response["results"][0]["id"]

# Using API, get the taxon_ids of the species and all parents
response = requests.get(GOAT_RECORD_API % record_id).json()
response = get_http_request_json(GOAT_RECORD_API % record_id)
body = response["records"][0]["record"]
return make_taxon_info_from_goat(body)

Expand All @@ -113,12 +124,7 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo:
def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]:
# "/" has to be double encoded, e.g. "Gymnodinium sp. CCAP1117/9" -> "Gymnodinium%20sp.%20CCAP1117%252F9"
url_safe_taxon_name = urllib.parse.quote(str(taxon_name).replace("/", "%2F"))
retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504])
adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
session = requests.Session()
session.mount("http://", adapter)
session.mount("https://", adapter)
response = session.get(NCBI_TAXONOMY_API % url_safe_taxon_name).json()
response = get_http_request_json(NCBI_TAXONOMY_API % url_safe_taxon_name)
if "taxonomy" in response["taxonomy_nodes"][0]:
body = response["taxonomy_nodes"][0]["taxonomy"]
if with_lineage:
Expand Down Expand Up @@ -186,7 +192,7 @@ def get_odb(


def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]]:
response = requests.get(NCBI_DATASETS_API % accession).json()
response = get_http_request_json(NCBI_DATASETS_API % accession)
if response["total_count"] != 1:
print(f"Assembly not found: {accession}", file=sys.stderr)
sys.exit(1)
Expand All @@ -212,7 +218,7 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]


def get_sequence_report(accession: str):
response = requests.get(NCBI_SEQUENCE_API % accession).json()
response = get_http_request_json(NCBI_SEQUENCE_API % accession)
if not response["reports"]:
print(f"Assembly not found: {accession}", file=sys.stderr)
sys.exit(1)
Expand Down
3 changes: 2 additions & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ process {
cpus = 1
// 3 GB per 1 Gbp
memory = { 3.GB * task.attempt * Math.ceil(meta.genome_size / 1000000000) }
time = { 4.h * task.attempt }
// 1 hour per 100 Mbp
time = { 1.h * Math.ceil(meta.genome_size / 100000000) * task.attempt }
}

withName: 'FASTAWINDOWS' {
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ manifest {
mainScript = 'main.nf'
defaultBranch = 'main'
nextflowVersion = '!>=24.04.2'
version = '0.8.1'
version = '0.9.0'
doi = '10.5281/zenodo.7949058'
}

Expand Down
6 changes: 2 additions & 4 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,8 @@
"multiqc_methods_description": {
"type": "string",
"description": "Custom MultiQC yaml file containing HTML including a methods description.",
"fa_icon": "fas fa-cog"
"fa_icon": "fas fa-cog",
"hidden": true
},
"validate_params": {
"type": "boolean",
Expand All @@ -325,9 +326,6 @@
{
"$ref": "#/$defs/input_output_options"
},
{
"$ref": "#/$defs/institutional_config_options"
},
{
"$ref": "#/$defs/reference_genome_options"
},
Expand Down
16 changes: 8 additions & 8 deletions ro-crate-metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"@id": "./",
"@type": "Dataset",
"creativeWorkStatus": "Stable",
"datePublished": "2025-08-20T18:11:56+00:00",
"datePublished": "2025-09-10T14:37:46+00:00",
"description": "# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)\n[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)\n\n<!--[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)-->\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/blobtoolkit)\n\n## Introduction\n\n**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes.\nIt takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome statistics, coverage and completeness information, combines them in a TSV file by window size to create a BlobDir dataset and static plots.\n\n1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))\n2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))\n3. 
Determine the appropriate BUSCO lineages from the taxonomy.\n4. Run BUSCO ([`busco`](https://busco.ezlab.org/))\n5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))\n7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))\n8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))\n9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))\n11. Import analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))\n12. Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,datatype,datafile,library_layout\nmMelMel3_hic,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED\nmMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED\nmMelMel3_ont,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE\n```\n\nEach row represents a read set (aligned or not).\nThe first column (sample name) must be unique.\nIf you have multiple read sets from the same actual sample, make sure you edit the sample names to make them unique.\nThe datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).\nThe library layout indicates whether the reads are paired or single.\nThe aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/blobtoolkit \\\n -profile <docker/singularity/.../institute> \\\n --input samplesheet.csv \\\n --outdir <OUTDIR> \\\n --fasta genome.fasta \\\n --accession GCA_XXXXXXXXX.X \\\n --taxon XXXX \\\n --taxdump /path/to/taxdump/database \\\n --blastp /path/to/diamond/database \\\n --blastn /path/to/blastn/database \\\n --blastx /path/to/blastx/database\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters).\n\n## Pipeline output\n\nFor more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output).\n\n## Credits\n\nsanger-tol/blobtoolkit was written in Nextflow by:\n\n- [Alexander Ramos Diaz](https://github.com/alxndrdiaz)\n- [Zaynab Butt](https://github.com/zb32)\n- [Priyanka Surana](https://github.com/priyanka-surana)\n- [Matthieu Muffato](https://github.com/muffato)\n- [Tyler Chafin](https://github.com/tkchafin)\n- [Yumi Sims](https://github.com/yumisims)\n- [Damon-Lee Bernard Pointon](https://github.com/DLBPointon)\n\nThe original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Guoying Qi](https://github.com/gq1)\n- [Bethan Yates](https://github.com/BethYates)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/blobtoolkit for your analysis, please cite it using the following DOI: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed 
and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n",
"hasPart": [
{
Expand Down Expand Up @@ -105,7 +105,7 @@
},
"mentions": [
{
"@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7"
"@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513"
}
],
"name": "sanger-tol/blobtoolkit"
Expand Down Expand Up @@ -170,7 +170,7 @@
}
],
"dateCreated": "",
"dateModified": "2025-08-20T19:11:56Z",
"dateModified": "2025-09-10T15:37:46Z",
"dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/",
"keywords": [
"nextflow",
Expand Down Expand Up @@ -199,10 +199,10 @@
},
"url": [
"https://github.com/sanger-tol/blobtoolkit",
"https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.8.0/"
"https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.9.0/"
],
"version": [
"0.8.0"
"0.9.0"
]
},
{
Expand All @@ -218,11 +218,11 @@
"version": "!>=24.04.2"
},
{
"@id": "#72c4cd04-d41e-45e2-84e1-4d78cbedbfc7",
"@id": "#d7c7c1b0-2d46-43ce-a079-de7dd68db513",
"@type": "TestSuite",
"instance": [
{
"@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0"
"@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd"
}
],
"mainEntity": {
Expand All @@ -231,7 +231,7 @@
"name": "Test suite for sanger-tol/blobtoolkit"
},
{
"@id": "#3624c355-5dd2-4fe3-a151-c60947b758c0",
"@id": "#0d91ffa5-8069-46e2-a503-b87ba047f2fd",
"@type": "TestInstance",
"name": "GitHub Actions workflow for testing sanger-tol/blobtoolkit",
"resource": "repos/sanger-tol/blobtoolkit/actions/workflows/ci.yml",
Expand Down