diff --git a/.nf-core.yml b/.nf-core.yml index 902933b1..dc755be7 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -43,4 +43,4 @@ template: outdir: . skip_features: - igenomes - version: 0.8.0 + version: 0.9.0 diff --git a/CHANGELOG.md b/CHANGELOG.md index 3de31c99..35f48151 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,27 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[0.9.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.9.0)] – Scyther – [2025-10-03] + +### Enhancements & fixes + +- Upgrade Busco (#190) +- The pipeline now stops on Busco failures (#194) +- Update resource requirements for BLASTN modules (#191) and BLOBTOOLKIT_WINDOWSTATS +- Fixed the `test_full` profile (Sanger only) +- Addition of `--tmpdir` to Diamond blast modules (#200) +- `--use_work_dir_as_temp` is no longer a hidden param. +- Fixed some documentation (#193 and #197) +- Made GENERATE_CONFIG more resilient to network errors (#197) + +### Software dependencies + +Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference. Only `Docker` or `Singularity` containers are supported, `conda` is not supported. 
+ +| Dependency | Old version | New version | + | ---------- | ----------- | ----------- | + | busco | 5.8.3 | 6.0.0 | + ## [[0.8.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.8.0)] – Sprigatito – [2025-05-19] ### Enhancements & fixes diff --git a/CITATION.cff b/CITATION.cff index 5a892ee5..5008c495 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -27,6 +27,12 @@ authors: given-names: Matthieu orcid: https://orcid.org/0000-0002-7860-3560 website: https://github.com/muffato + - affiliation: Wellcome Sanger Institute + email: dp24@sanger.ac.uk + family-names: Pointon + given-names: Damon-Lee Bernard + orcid: https://orcid.org/0000-0003-2949-6719 + website: https://github.com/DLBPointon - affiliation: Wellcome Sanger Institute email: 729395+gq1@users.noreply.github.com family-names: Qi @@ -57,13 +63,13 @@ authors: orcid: https://orcid.org/0000-0003-1658-1762 website: https://github.com/BethYates cff-version: 1.2.0 -date-released: "2025-04-25" +date-released: "2025-10-03" doi: 10.5281/zenodo.7949058 license: MIT message: If you use this software, please cite it using the metadata from this file and all references from CITATIONS.md . repository-code: https://github.com/sanger-tol/blobtoolkit -title: sanger-tol/blobtoolkit v0.8.0 - +title: sanger-tol/blobtoolkit v0.9.0 - Scyther type: software url: https://pipelines.tol.sanger.ac.uk/blobtoolkit -version: 0.8.0 +version: 0.9.0 diff --git a/CITATIONS.md b/CITATIONS.md index 1e18be72..5435dcd4 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,66 +1,72 @@ # sanger-tol/blobtoolkit: Citations -> Butt, Zaynab, et al. "sanger-tol/blobtoolkit" Zenodo, 2023, https://zenodo.org/doi/10.5281/zenodo.7949058. +> Butt, Zaynab, et al. "sanger-tol/blobtoolkit" Zenodo, 2023, <https://zenodo.org/doi/10.5281/zenodo.7949058>. ## [nf-core](https://nf-co.re) -> Ewels, Philip A., et al. “The Nf-Core Framework for Community-Curated Bioinformatics Pipelines.” Nature Biotechnology, vol. 38, no. 3, Feb. 2020, pp. 276–78, https://doi.org/10.1038/s41587-020-0439-x. 
+> Ewels, Philip A., et al. “The Nf-Core Framework for Community-Curated Bioinformatics Pipelines.” Nature Biotechnology, vol. 38, no. 3, Feb. 2020, pp. 276–78, <https://doi.org/10.1038/s41587-020-0439-x>. ## [Nextflow](https://www.nextflow.io) -> Di Tommaso, Paolo, et al. “Nextflow Enables Reproducible Computational Workflows.” Nature Biotechnology, vol. 35, no. 4, Apr. 2017, pp. 316–19, https://doi.org/10.1038/nbt.3820. +> Di Tommaso, Paolo, et al. “Nextflow Enables Reproducible Computational Workflows.” Nature Biotechnology, vol. 35, no. 4, Apr. 2017, pp. 316–19, <https://doi.org/10.1038/nbt.3820>. ## Pipeline tools - [BLAST+](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html) - > Camacho, Chritiam, et al. “BLAST+: architecture and applications.” BMC Bioinformatics, vol. 10, no. 412, Dec. 2009, https://doi.org/10.1186/1471-2105-10-421 + > Camacho, Christiam, et al. “BLAST+: architecture and applications.” BMC Bioinformatics, vol. 10, no. 412, Dec. 2009, <https://doi.org/10.1186/1471-2105-10-421> + +- [BlobTk](https://github.com/genomehubs/blobtk) - [BlobToolKit](https://github.com/blobtoolkit/blobtoolkit) - > Challis, Richard, et al. “BlobToolKit – Interactive Quality Assessment of Genome Assemblies.” G3 Genes|Genomes|Genetics, vol. 10, no. 4, Apr. 2020, pp. 1361–74, https://doi.org/10.1534/g3.119.400908. + > Challis, Richard, et al. “BlobToolKit – Interactive Quality Assessment of Genome Assemblies.” G3 Genes|Genomes|Genetics, vol. 10, no. 4, Apr. 2020, pp. 1361–74, <https://doi.org/10.1534/g3.119.400908>. - [BUSCO](https://gitlab.com/ezlab/busco) - > Manni, Mosè, et al. “BUSCO: Assessing Genomic Data Quality and Beyond.” Current Protocols, vol. 1, no. 12, Dec. 2021, https://doi.org/10.1002/cpz1.323. + > Manni, Mosè, et al. “BUSCO: Assessing Genomic Data Quality and Beyond.” Current Protocols, vol. 1, no. 12, Dec. 2021, <https://doi.org/10.1002/cpz1.323>. - [Diamond](https://github.com/bbuchfink/diamond) - > Buchfink, Benjamin, et al. “Sensitive Protein Alignments at Tree-of-Life Scale Using DIAMOND.” Nature Methods, vol. 18, no. 4, Apr. 2021, pp. 366–68, https://doi.org/10.1038/s41592-021-01101-x. + > Buchfink, Benjamin, et al. 
“Sensitive Protein Alignments at Tree-of-Life Scale Using DIAMOND.” Nature Methods, vol. 18, no. 4, Apr. 2021, pp. 366–68, <https://doi.org/10.1038/s41592-021-01101-x>. - [Fasta_windows](https://github.com/tolkit/fasta_windows) - > Brown, Max, et al. "Fasta_windows v0.2.3". GitHub, 2021. https://github.com/tolkit/fasta_windows + > Brown, Max, et al. "Fasta_windows v0.2.3". GitHub, 2021. <https://github.com/tolkit/fasta_windows> - [Minimap2](https://github.com/lh3/minimap2) - > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, https://doi.org/10.1093/bioinformatics/bty191. + > Li, Heng. "Minimap2: pairwise alignment for nucleotide sequences." Bioinformatics, vol. 34, no. 18, Sep. 2018, pp. 3094-100, <https://doi.org/10.1093/bioinformatics/bty191>. - [MultiQC](https://multiqc.info) - > Ewels, Philip, et al. “MultiQC: Summarize Analysis Results for Multiple Tools and Samples in a Single Report.” Bioinformatics, vol. 32, no. 19, 2016, pp. 3047–3048., https://doi.org/10.1093/bioinformatics/btw354. + > Ewels, Philip, et al. “MultiQC: Summarize Analysis Results for Multiple Tools and Samples in a Single Report.” Bioinformatics, vol. 32, no. 19, 2016, pp. 3047–3048, <https://doi.org/10.1093/bioinformatics/btw354>. - [Samtools](https://www.htslib.org) - > Danecek, Petr, et al. “Twelve Years of SAMtools and BCFtools.” GigaScience, vol. 10, no. 2, Jan. 2021, https://doi.org/10.1093/gigascience/giab008. + > Danecek, Petr, et al. “Twelve Years of SAMtools and BCFtools.” GigaScience, vol. 10, no. 2, Jan. 2021, <https://doi.org/10.1093/gigascience/giab008>. - [SeqTK](https://github.com/lh3/seqtk) - > Li, Heng. "SeqTK v1.4" GitHub, 2023, https://github.com/lh3/seqtk + > Li, Heng. "SeqTK v1.4" GitHub, 2023, <https://github.com/lh3/seqtk> + +- [WindowMasker](https://pubmed.ncbi.nlm.nih.gov/16287941/) + + > Morgulis, A., et al. 2006. WindowMasker: window-based masker for sequenced genomes. Bioinformatics. 22(2). pp.134–141. doi: 10.1093/bioinformatics/bti774. ## Software packaging/containerisation tools - [Conda](https://conda.org/) - > conda contributors. 
conda: A system-level, binary package and environment manager running on all major operating systems and platforms. Computer software. https://github.com/conda/conda + > conda contributors. conda: A system-level, binary package and environment manager running on all major operating systems and platforms. Computer software. <https://github.com/conda/conda> - [Bioconda](https://bioconda.github.io) - > Grüning, Björn, et al. “Bioconda: sustainable and comprehensive software distribution for the life sciences.", Nature Methods, vol. 15, Jul. 2018, pp. 475-6, https://doi.org/10.1038/s41592-018-0046-7. + > Grüning, Björn, et al. “Bioconda: sustainable and comprehensive software distribution for the life sciences.", Nature Methods, vol. 15, Jul. 2018, pp. 475-6, <https://doi.org/10.1038/s41592-018-0046-7>. - [BioContainers](https://biocontainers.pro) - > da Veiga, Felipe, et al. “BioContainers: an open-source and community-driven framework for software standardization.", Bioinformatics, vol. 33, no. 16, Aug. 2017, pp. 2580-2, https://doi.org/10.1093/bioinformatics/btx192. + > da Veiga, Felipe, et al. “BioContainers: an open-source and community-driven framework for software standardization.", Bioinformatics, vol. 33, no. 16, Aug. 2017, pp. 2580-2, <https://doi.org/10.1093/bioinformatics/btx192>. - [Docker](https://www.docker.com) @@ -68,4 +74,4 @@ - [Singularity](https://docs.sylabs.io/guides/latest/user-guide/) - > Kurtzer, Gregory M., et al. “Singularity: Scientific containers for mobility of compute.", PLOS ONE, vol. 12, no. 5, May 2017, pp. e0177459, https://doi.org/10.1371/journal.pone.0177459. + > Kurtzer, Gregory M., et al. “Singularity: Scientific containers for mobility of compute.", PLOS ONE, vol. 12, no. 5, May 2017, pp. 
e0177459, <https://doi.org/10.1371/journal.pone.0177459>. diff --git a/README.md b/README.md index f635c40c..cab29ab0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ # ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png) [![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058) +[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml) +[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058) @@ -23,10 +24,10 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond)) 7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond)) -8. Run BLASTn against sequences still with not hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) +8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/)) 9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit)) 10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit)) -11. 
Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit)) +11. Import analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit)) 12. Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk)) ## Usage @@ -40,13 +41,14 @@ First, prepare a samplesheet with your input data that looks as follows: ```csv sample,datatype,datafile,library_layout -mMelMel3,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED +mMelMel3_hic,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED mMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED -mMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE +mMelMel3_ont,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE ``` -Each row represents an aligned file. -Rows with the same sample identifier are considered technical replicates. +Each row represents a read set (aligned or not). +The first column (sample name) must be unique. +If you have multiple read sets from the same actual sample, make sure you edit the sample names to make them unique. The datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`). The library layout indicates whether the reads are paired or single. The aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline. @@ -78,9 +80,19 @@ For more details about the output files and reports, please refer to the [output ## Credits -sanger-tol/blobtoolkit was written in Nextflow by [Alexander Ramos Diaz](https://github.com/alxndrdiaz), [Zaynab Butt](https://github.com/zb32), [Matthieu Muffato](https://github.com/muffato), and [Priyanka Surana](https://github.com/priyanka-surana). 
The orignal design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar). +sanger-tol/blobtoolkit was written in Nextflow by: -We thank the following people for their assistance in the development of this pipeline: +- [Alexander Ramos Diaz](https://github.com/alxndrdiaz) +- [Zaynab Butt](https://github.com/zb32) +- [Priyanka Surana](https://github.com/priyanka-surana) +- [Matthieu Muffato](https://github.com/muffato) +- [Tyler Chafin](https://github.com/tkchafin) +- [Yumi Sims](https://github.com/yumisims) +- [Damon-Lee Bernard Pointon](https://github.com/DLBPointon) + +The original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar). + +We thank the following people for their extensive assistance in the development of this pipeline: - [Guoying Qi](https://github.com/gq1) - [Bethan Yates](https://github.com/BethYates) @@ -91,7 +103,7 @@ If you would like to contribute to this pipeline, please see the [contributing g ## Citations -If you use sanger-tol/blobtoolkit for your analysis, please cite it using the following doi: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058) +If you use sanger-tol/blobtoolkit for your analysis, please cite it using the following DOI: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. 
diff --git a/assets/schema_input.json b/assets/schema_input.json index db9d05c9..b97c5d0e 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -9,7 +9,7 @@ "properties": { "sample": { "type": "string", - "description": "Sample Name", + "description": "Sample identifier", "pattern": "^\\S+$", "errorMessage": "Sample name must be provided, be unique, and cannot contain spaces", "meta": ["id"] @@ -18,7 +18,7 @@ "type": "string", "pattern": "^\\S+$", "enum": ["hic", "illumina", "ont", "pacbio", "pacbio_clr"], - "errorMessage": "Data type, and must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'", + "errorMessage": "Data type must be one of: 'hic' or 'illumina' or 'ont' or 'pacbio'", "meta": ["datatype"] }, "datafile": { diff --git a/assets/test/samplesheet.csv b/assets/test/samplesheet.csv index 2431b0e0..01ba17e8 100644 --- a/assets/test/samplesheet.csv +++ b/assets/test/samplesheet.csv @@ -1,5 +1,5 @@ sample,datatype,datafile,library_layout -mMelMel3_hic,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram,PAIRED -mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram,PAIRED -mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram,PAIRED -mMelMel3_ont,ont,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram,SINGLE +mMelMel3_hic,hic,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/hic/GCA_922984935.2.subset.unmasked.hic.mMelMel3.cram,PAIRED 
+mMelMel1,illumina,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel1.cram,PAIRED +mMelMel2,illumina,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/illumina/GCA_922984935.2.subset.unmasked.illumina.mMelMel2.cram,PAIRED +mMelMel3_ont,ont,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/analysis/mMelMel3.2_paternal_haplotype/read_mapping/ont/GCA_922984935.2.subset.unmasked.ont.mMelMel3.cram,SINGLE diff --git a/assets/test/samplesheet_raw.csv b/assets/test/samplesheet_raw.csv index 53a5a42e..881dd51f 100644 --- a/assets/test/samplesheet_raw.csv +++ b/assets/test/samplesheet_raw.csv @@ -1,4 +1,4 @@ sample,datatype,datafile,library_layout -mMelMel1,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram,PAIRED -mMelMel2,illumina,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram,PAIRED -mMelMel3,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram,PAIRED +mMelMel1,illumina,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel1/illumina/31231_3#1_subset.cram,PAIRED +mMelMel2,illumina,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel2/illumina/31231_4#1_subset.cram,PAIRED +mMelMel3,hic,/nfs/treeoflife-01/resources/nextflow/test-data/Meles_meles/genomic_data/mMelMel3/hic-arima2/35528_2#1_subset.cram,PAIRED diff --git a/assets/test_full/full_samplesheet.csv b/assets/test_full/full_samplesheet.csv index fb673840..b3aa6744 100644 --- a/assets/test_full/full_samplesheet.csv +++ b/assets/test_full/full_samplesheet.csv @@ -1,3 +1,3 @@ sample,datatype,datafile,library_layout 
-gfLaeSulp1_hic,hic,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram,PAIRED -gfLaeSulp1_pacbio,pacbio,/lustre/scratch123/tol/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram,SINGLE +gfLaeSulp1_hic,hic,/nfs/treeoflife-01/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/hic/GCA_927399515.1.unmasked.hic.gfLaeSulp1.cram,PAIRED +gfLaeSulp1_pacbio,pacbio,/nfs/treeoflife-01/resources/nextflow/test-data/Laetiporus_sulphureus/analysis/gfLaeSulp1.1/read_mapping/pacbio/GCA_927399515.1.unmasked.pacbio.gfLaeSulp1.cram,SINGLE diff --git a/bin/generate_config.py b/bin/generate_config.py index 2135e475..1ac6c8f5 100755 --- a/bin/generate_config.py +++ b/bin/generate_config.py @@ -31,7 +31,18 @@ BUSCO_BASAL_LINEAGES = ["eukaryota_odb10", "bacteria_odb10", "archaea_odb10"] +# Wrapper around requests.get to use a "session", which can recover from network errors +def get_http_request_json(url): + retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504]) + adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) + session = requests.Session() + session.mount("http://", adapter) + session.mount("https://", adapter) + response = session.get(url) + return response.json() + +# Argument parsing def parse_args(args=None): Description = "Produce the various configuration files needed within the pipeline" @@ -99,12 +110,12 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo: record_id = "taxon-%d" % taxon_name else: # Resolve the taxon_id of the species - response = requests.get(GOAT_LOOKUP_API % taxon_name).json() + response = get_http_request_json(GOAT_LOOKUP_API % taxon_name) taxon_id = int(response["results"][0]["result"]["taxon_id"]) record_id = 
response["results"][0]["id"] # Using API, get the taxon_ids of the species and all parents - response = requests.get(GOAT_RECORD_API % record_id).json() + response = get_http_request_json(GOAT_RECORD_API % record_id) body = response["records"][0]["record"] return make_taxon_info_from_goat(body) @@ -113,12 +124,7 @@ def fetch_taxon_info_from_goat(taxon_name: typing.Union[str, int]) -> TaxonInfo: def fetch_taxon_info_from_ncbi(taxon_name: typing.Union[str, int], with_lineage=True) -> typing.Optional[TaxonInfo]: # "/" has to be double encoded, e.g. "Gymnodinium sp. CCAP1117/9" -> "Gymnodinium%20sp.%20CCAP1117%252F9" url_safe_taxon_name = urllib.parse.quote(str(taxon_name).replace("/", "%2F")) - retry_strategy = urllib3.util.Retry(total=5, backoff_factor=0.1, status_forcelist=[429, 500, 502, 503, 504]) - adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) - session = requests.Session() - session.mount("http://", adapter) - session.mount("https://", adapter) - response = session.get(NCBI_TAXONOMY_API % url_safe_taxon_name).json() + response = get_http_request_json(NCBI_TAXONOMY_API % url_safe_taxon_name) if "taxonomy" in response["taxonomy_nodes"][0]: body = response["taxonomy_nodes"][0]["taxonomy"] if with_lineage: @@ -186,7 +192,7 @@ def get_odb( def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int]]: - response = requests.get(NCBI_DATASETS_API % accession).json() + response = get_http_request_json(NCBI_DATASETS_API % accession) if response["total_count"] != 1: print(f"Assembly not found: {accession}", file=sys.stderr) sys.exit(1) @@ -212,7 +218,7 @@ def get_assembly_info(accession: str) -> typing.Dict[str, typing.Union[str, int] def get_sequence_report(accession: str): - response = requests.get(NCBI_SEQUENCE_API % accession).json() + response = get_http_request_json(NCBI_SEQUENCE_API % accession) if not response["reports"]: print(f"Assembly not found: {accession}", file=sys.stderr) sys.exit(1) diff --git a/conf/base.config 
b/conf/base.config index a56b8807..20c72346 100644 --- a/conf/base.config +++ b/conf/base.config @@ -84,7 +84,8 @@ process { cpus = 1 // 3 GB per 1 Gbp memory = { 3.GB * task.attempt * Math.ceil(meta.genome_size / 1000000000) } - time = { 4.h * task.attempt } + // 1 hour per 100 Mbp + time = { 1.h * Math.ceil(meta.genome_size / 100000000) * task.attempt } } withName: 'FASTAWINDOWS' { @@ -104,9 +105,9 @@ process { time = { 3.h * Math.ceil(meta.genome_size/1000000000) * task.attempt } } - withName: "BLAST_BLASTN" { + withName: "BLAST_BLASTN|BLASTN_TAXON" { cpus = 4 - memory = 2.GB + memory = { 2.GB * task.attempt * task.attempt } time = 12.h } diff --git a/conf/modules.config b/conf/modules.config index 87f686aa..a28287c3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -105,11 +105,11 @@ process { } withName: "DIAMOND_BLASTP" { - ext.args = "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ext.args = { "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ( params.use_work_dir_as_temp ? " --tmpdir ./blastp_tmp/" : "" ) } } withName: "DIAMOND_BLASTX" { - ext.args = "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ext.args = { "--evalue 1.0e-25 --max-target-seqs 10 --max-hsps 1" + ( params.use_work_dir_as_temp ? 
" --tmpdir ./blastx_tmp/" : "" ) } } withName: "BLOBTK_DEPTH" { @@ -178,4 +178,3 @@ process { } } - diff --git a/conf/test_full.config b/conf/test_full.config index a86e0050..c62ca184 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -26,7 +26,7 @@ params { // Databases taxdump = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz" - busco = "/lustre/scratch123/tol/resources/busco/latest" + busco = "/data/tol/resources/busco/latest" blastp = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/resources/gfLaeSulp1.1.buscogenes.dmnd.tar.gz" blastx = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/resources/gfLaeSulp1.1.buscoregions.dmnd.tar.gz" blastn = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/resources/nt_gfLaeSulp1.1.tar.gz" diff --git a/docs/output.md b/docs/output.md index c2ba0af4..cd417832 100644 --- a/docs/output.md +++ b/docs/output.md @@ -52,7 +52,7 @@ Images generated from the above blobdir using the [blobtk](https://github.com/bl ### BUSCO -BUSCO results generated by the pipeline (all BUSCO lineages that match the claassification of the species). +BUSCO results generated by the pipeline (all BUSCO lineages that match the classification of the species).
Output files @@ -71,14 +71,14 @@ BUSCO results generated by the pipeline (all BUSCO lineages that match the claas ### Repeat masking -Reults from the repeat-masker step -- only if the pipeline is run with `--mask`. +Results from the repeat-masker step -- only if the pipeline is run with `--mask`.
Output files - `repeats/` - `windowmasker/` - - `.fasta`: masked assembly in Fasta format. + - `.fasta`: masked assembly in FASTA format. - `.obinary`: frequency counts of repeats, in windowmasker's own binary format.
@@ -106,7 +106,7 @@ Those files are the raw data used to build the BlobDir. - `read_mapping/` - `/` - - `.coverage.1k.bed.gz`: Bedgraph file with the coverage of the alignments of that sample per 1 kbp windows. + - `.coverage.1k.bed.gz`: BedGraph file with the coverage of the alignments of that sample per 1 kbp windows.
@@ -119,8 +119,8 @@ Those files are the raw data used to build the BlobDir. Output files - `base_content/` - - `_*nuc_windows.tsv.gz`: Tab-separated files with the counts of every _k_-mer for k ≤ 4 in 1 kbp windows. The first three columns correspond to the coordinates (sequence name, start, end), followed by each _k_-mer. - - `_freq_windows.tsv.gz`: Tab-separated files with frequencies derived from the _k_-mer counts. + - `_*nuc_windows.tsv.gz`: tab-separated files with the counts of every _k_-mer for k ≤ 4 in 1 kbp windows. The first three columns correspond to the coordinates (sequence name, start, end), followed by each _k_-mer. + - `_freq_windows.tsv.gz`: tab-separated files with frequencies derived from the _k_-mer counts. @@ -157,7 +157,7 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - `pipeline_info/blobtoolkit/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. + - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - Parameters used by the pipeline run: `params.json`. 
diff --git a/docs/usage.md b/docs/usage.md index 0bd4a94a..825394b7 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -14,17 +14,6 @@ You will need to create a samplesheet with information about the samples you wou --input '[path to samplesheet file]' ``` -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```console -sample,datatype,datafile,library_layout -sample1,hic,hic.cram,PAIRED -sample2,illumina,illumina.cram,PAIRED -sample2,illumina,illumina.cram,PAIRED -``` - ### Full samplesheet The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 4 columns to match those defined in the table below. @@ -38,12 +27,12 @@ sample2,illumina,illumina.cram,PAIRED sample3,ont,ont.cram,SINGLE ``` -| Column | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (\_). | -| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`. | -| `datafile` | Full path to read data file. | -| `library_layout` | Layout of the library. Must be one of `SINGLE`, `PAIRED`. | +| Column | Description | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. 
It doesn't have to be an actual _sample_ name. It is used to name the read set on the BlobToolKit viewer and therefore needs to be **unique** across the samplesheet. | +| `datatype` | Type of sequencing data. Must be one of `hic`, `illumina`, `pacbio`, `pacbio_clr` or `ont`. | +| `datafile` | Full path to read data file. | +| `library_layout` | Layout of the library. Must be one of `SINGLE`, `PAIRED`. | An [example samplesheet](../assets/test/samplesheet.csv) has been provided with the pipeline. @@ -92,7 +81,7 @@ The pipeline minimally requires outputs for the 'basal' lineages (archaea, eukar Configure access to your local databases with the `--busco`, `--blastp`, `--blastx`, `--blastn`, and `--taxdump` parameters. -Note that `--busco` refers to the download path of _all_ lineages. +Note that `--busco` refers to the download path which _contains_ the `lineages/` sub-directory. Then, when explicitly selecting the lineages to run the pipeline on, provide the names of these lineages _with_ their `_odb10` suffix as a comma-separated string. For instance: @@ -143,7 +132,7 @@ mkdir -p $NT cd $NT ``` -Retrieve the NCBI blast nt database (version 5) files and tar gunzip them. +Retrieve the NCBI blast nt database (version 5) files and extract them. `wget` and the use of the FTP protocol are necessary to resolve the wildcard `nt.???.tar.gz`. 
We are using the `&&` syntax to ensure that each command completes without error before the next one is run: @@ -262,7 +251,7 @@ Nextflow ```bash # Public Assemblies -nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME –-accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB +nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --accession GCA_ACCESSION --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB # Draft Assemblies nextflow run sanger-tol/blobtoolkit --input SAMPLESHEET --fasta GENOME --taxon TAXON_ID --taxdump TAXDUMP_DB --blastp DMND_db --blastn BLASTN_DB --blastx BLASTX_DB @@ -274,7 +263,7 @@ see for some examples. ### Subworkflows -Here is a full list of snakemake subworkflows and their Nextflow couterparts: +Here is a full list of snakemake subworkflows and their Nextflow counterparts: - **`minimap.smk`** - Implemented as [`minimap_alignment.nf`](../subworkflows/local/minimap_alignment.nf). @@ -307,33 +296,33 @@ Here is a full list of snakemake subworkflows and their Nextflow couterparts: List of tools for any given dataset can be fetched from the API, for example https://blobtoolkit.genomehubs.org/api/v1/dataset/id/CAJEUD01.1/settings/software_versions. 
-| Dependency | Snakemake | Nextflow | -| ----------------- | --------- | -------- | -| blobtoolkit | 4.3.2 | 4.4.4 | -| blast | 2.12.0 | 2.14.1 | -| blobtk | 0.5.0 | 0.5.1 | -| busco | 5.3.2 | 5.5.0 | -| diamond | 2.0.15 | 2.1.8 | -| fasta_windows | | 0.2.4 | -| minimap2 | 2.24 | 2.24 | -| ncbi-datasets-cli | 14.1.0 | | -| nextflow | | 23.10.0 | -| python | 3.9.13 | 3.12.0 | -| samtools | 1.15.1 | 1.19.2 | -| seqtk | 1.3 | 1.4 | -| snakemake | 7.19.1 | | -| windowmasker | 2.12.0 | 2.14.0 | +| Dependency | Snakemake | Nextflow | +| ----------------- | --------- | ------------- | +| blobtoolkit | 4.3.2 | 4.4.6 | +| blast | 2.12.0 | 2.15.0 | +| blobtk | 0.5.0 | 0.5.1 | +| busco | 5.3.2 | 5.8.3 | +| diamond | 2.0.15 | 2.1.8 | +| fasta_windows | | 0.2.4 | +| minimap2 | 2.24 | 2.24-r1122 | +| ncbi-datasets-cli | 14.1.0 | | +| nextflow | | 24.04.2 | +| python | 3.9.13 | 3.12.0 | +| samtools | 1.15.1 | 1.20 and 1.21 | +| seqtk | 1.3 | 1.4 | +| snakemake | 7.19.1 | | +| windowmasker | 2.12.0 | 2.14.0 | > **NB:** Dependency has been **added** if only the Nextflow version information is present. > **NB:** Dependency has been **removed** if only the Snakemake version information is present. -> **NB:** Dependency has been **updated** if bothe the Snakemake and Nextflow version information is present. +> **NB:** Dependency has been **updated** if both the Snakemake and Nextflow version information is present. 
## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run sanger-tol/blobtoolkit --input samplesheet.csv --outdir --fasta genome.fasta -profile docker –-accession GCA_accession --taxon "species name" --taxdump /path/to/taxdump --blastp /path/to/buscogenes.dmnd --blastn /path/to/blastn.nt --blastx /path/to/buscoregions.dmnd +nextflow run sanger-tol/blobtoolkit --input samplesheet.csv --outdir --fasta genome.fasta -profile docker --accession GCA_accession --taxon "species name" --taxdump /path/to/taxdump --blastp /path/to/buscogenes.dmnd --blastn /path/to/blastn.nt --blastx /path/to/buscoregions.dmnd ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. diff --git a/modules.json b/modules.json index 7278c96d..a2111293 100644 --- a/modules.json +++ b/modules.json @@ -13,7 +13,7 @@ }, "busco/busco": { "branch": "master", - "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", + "git_sha": "36c6c8445284e021d95ce30cdf743baef66b21aa", "installed_by": ["modules"], "patch": "modules/nf-core/busco/busco/busco-busco.diff" }, diff --git a/modules/nf-core/busco/busco/busco-busco.diff b/modules/nf-core/busco/busco/busco-busco.diff index 92a42df3..77864722 100644 --- a/modules/nf-core/busco/busco/busco-busco.diff +++ b/modules/nf-core/busco/busco/busco-busco.diff @@ -11,7 +11,7 @@ Changes in 'busco/busco/main.nf': conda "${moduleDir}/environment.yml" container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container -@@ -45,7 +44,7 @@ +@@ -51,7 +50,7 @@ def busco_lineage = lineage in ['auto', 'auto_prok', 'auto_euk'] ? 
lineage.replaceFirst('auto', '--auto-lineage').replaceAll('_', '-') : "--lineage_dataset ${lineage}" @@ -20,6 +20,22 @@ Changes in 'busco/busco/main.nf': def intermediate_files = [ './*-busco/*/auto_lineage', './*-busco/*/**/{miniprot,hmmer,.bbtools}_output', +@@ -112,8 +111,13 @@ + + if grep 'Run failed; check logs' ${prefix}-busco.batch_summary.txt > /dev/null + then +- echo "Busco run failed" +- exit 1 ++ if grep -Fx 'Sequence too long (max 32000000 permitted).' ${prefix}-busco.log > /dev/null ++ then ++ echo "Prodigal can't run on this genome. Skipping it" ++ else ++ echo "Busco run failed" ++ exit 1 ++ fi + fi + + cat <<-END_VERSIONS > versions.yml 'modules/nf-core/busco/busco/environment.yml' is unchanged 'modules/nf-core/busco/busco/tests/main.nf.test' is unchanged diff --git a/modules/nf-core/busco/busco/environment.yml b/modules/nf-core/busco/busco/environment.yml index ba8a40c0..861982d0 100644 --- a/modules/nf-core/busco/busco/environment.yml +++ b/modules/nf-core/busco/busco/environment.yml @@ -3,7 +3,5 @@ channels: - conda-forge - bioconda - dependencies: - - bioconda::busco=5.8.3 - - bioconda::sepp=4.5.5 + - bioconda::busco=6.0.0 diff --git a/modules/nf-core/busco/busco/main.nf b/modules/nf-core/busco/busco/main.nf index 6435dd7f..53476230 100644 --- a/modules/nf-core/busco/busco/main.nf +++ b/modules/nf-core/busco/busco/main.nf @@ -3,33 +3,39 @@ process BUSCO_BUSCO { conda "${moduleDir}/environment.yml" container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container - ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c6/c607f319867d96a38c8502f751458aa78bbd18fe4c7c4fa6b9d8350e6ba11ebe/data' - : 'community.wave.seqera.io/library/busco_sepp:f2dbc18a2f7a5b64'}" + ? 
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/41/4137d65ab5b90d2ae4fa9d3e0e8294ddccc287e53ca653bb3c63b8fdb03e882f/data' + : 'community.wave.seqera.io/library/busco:6.0.0--a9a1426105f81165'}" + // Note: one test had to be disabled when switching to Busco 6.0.0, cf https://github.com/nf-core/modules/pull/8781/files + // Try to restore it when upgrading Busco to a later version input: - tuple val(meta), path(fasta, stageAs:'tmp_input/*') - val mode // Required: One of genome, proteins, or transcriptome - val lineage // Required: lineage for checking against, or "auto/auto_prok/auto_euk" for enabling auto-lineage - path busco_lineages_path // Recommended: BUSCO lineages file - downloads if not set - path config_file // Optional: BUSCO configuration file - val clean_intermediates // Optional: Remove intermediate files + tuple val(meta), path(fasta, stageAs: 'tmp_input/*') + // Required: One of genome, proteins, or transcriptome + val mode + // Required: lineage for checking against, or "auto/auto_prok/auto_euk" for enabling auto-lineage + val lineage + // Recommended: BUSCO lineages file - downloads if not set + path busco_lineages_path + // Optional: BUSCO configuration file + path config_file + val clean_intermediates output: - tuple val(meta), path("*-busco.batch_summary.txt") , emit: batch_summary - tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt , optional: true - tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true - tuple val(meta), path("*-busco.log") , emit: log , optional: true - tuple val(meta), path("*-busco/*/run_*/full_table.tsv") , emit: full_table , optional: true - tuple val(meta), path("*-busco/*/run_*/missing_busco_list.tsv") , emit: missing_busco_list , optional: true - tuple val(meta), path("*-busco/*/run_*/single_copy_proteins.faa") , emit: single_copy_proteins, optional: true - tuple val(meta), path("*-busco/*/run_*/busco_sequences") , emit: seq_dir , optional: 
true - tuple val(meta), path("*-busco/*/translated_proteins") , emit: translated_dir , optional: true - tuple val(meta), path("*-busco") , emit: busco_dir - tuple val(meta), path("busco_downloads/lineages/*") , emit: downloaded_lineages , optional: true - tuple val(meta), path("*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.faa"), emit: single_copy_faa , optional: true - tuple val(meta), path("*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.fna"), emit: single_copy_fna , optional: true + tuple val(meta), path("*-busco.batch_summary.txt"), emit: batch_summary + tuple val(meta), path("short_summary.*.txt"), emit: short_summaries_txt, optional: true + tuple val(meta), path("short_summary.*.json"), emit: short_summaries_json, optional: true + tuple val(meta), path("*-busco.log"), emit: log, optional: true + tuple val(meta), path("*-busco/*/run_*/full_table.tsv"), emit: full_table, optional: true + tuple val(meta), path("*-busco/*/run_*/missing_busco_list.tsv"), emit: missing_busco_list, optional: true + tuple val(meta), path("*-busco/*/run_*/single_copy_proteins.faa"), emit: single_copy_proteins, optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences"), emit: seq_dir, optional: true + tuple val(meta), path("*-busco/*/translated_proteins"), emit: translated_dir, optional: true + tuple val(meta), path("*-busco"), emit: busco_dir + tuple val(meta), path("busco_downloads/lineages/*"), emit: downloaded_lineages, optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.faa"), emit: single_copy_faa, optional: true + tuple val(meta), path("*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.fna"), emit: single_copy_fna, optional: true - path "versions.yml" , emit: versions + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -103,9 +109,20 @@ process BUSCO_BUSCO { mv ${prefix}-busco/*/short_summary.*.{json,txt} . 
|| echo "Short summaries were not available: No genes were found." mv ${prefix}-busco/logs/busco.log ${prefix}-busco.log + if grep 'Run failed; check logs' ${prefix}-busco.batch_summary.txt > /dev/null + then + if grep -Fx 'Sequence too long (max 32000000 permitted).' ${prefix}-busco.log > /dev/null + then + echo "Prodigal can't run on this genome. Skipping it" + else + echo "Busco run failed" + exit 1 + fi + fi + cat <<-END_VERSIONS > versions.yml "${task.process}": - busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + busco: \$( busco --version 2> /dev/null | sed 's/BUSCO //g' ) END_VERSIONS """ @@ -118,7 +135,7 @@ process BUSCO_BUSCO { cat <<-END_VERSIONS > versions.yml "${task.process}": - busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' ) + busco: \$( busco --version 2> /dev/null | sed 's/BUSCO //g' ) END_VERSIONS """ } diff --git a/modules/nf-core/busco/busco/meta.yml b/modules/nf-core/busco/busco/meta.yml index 0222e490..281e3db0 100644 --- a/modules/nf-core/busco/busco/meta.yml +++ b/modules/nf-core/busco/busco/meta.yml @@ -26,26 +26,28 @@ input: type: file description: Nucleic or amino acid sequence file in FASTA format. pattern: "*.{fasta,fna,fa,fasta.gz,fna.gz,fa.gz}" - - - mode: - type: string - description: The mode to run Busco in. One of genome, proteins, or transcriptome - pattern: "{genome,proteins,transcriptome}" - - - lineage: - type: string - description: The BUSCO lineage to use, or "auto", "auto_prok" or "auto_euk" - to automatically select lineage - - - busco_lineages_path: - type: directory - description: Path to local BUSCO lineages directory. - - - config_file: - type: file - description: Path to BUSCO config file. - - - clean_intermediates: - type: boolean - description: Flag to remove intermediate files. + ontologies: [] + - mode: + type: string + description: The mode to run Busco in. 
One of genome, proteins, or transcriptome + pattern: "{genome,proteins,transcriptome}" + - lineage: + type: string + description: The BUSCO lineage to use, or "auto", "auto_prok" or "auto_euk" to + automatically select lineage + - busco_lineages_path: + type: directory + description: Path to local BUSCO lineages directory. + - config_file: + type: file + description: Path to BUSCO config file. + ontologies: [] + - clean_intermediates: + type: boolean + description: Flag to remove intermediate files. output: - - batch_summary: - - meta: + batch_summary: + - - meta: type: map description: | Groovy Map containing sample information @@ -54,8 +56,9 @@ output: type: file description: Summary of all sequence files analyzed pattern: "*-busco.batch_summary.txt" - - short_summaries_txt: - - meta: + ontologies: [] + short_summaries_txt: + - - meta: type: map description: | Groovy Map containing sample information @@ -64,8 +67,9 @@ output: type: file description: Short Busco summary in plain text format pattern: "short_summary.*.txt" - - short_summaries_json: - - meta: + ontologies: [] + short_summaries_json: + - - meta: type: map description: | Groovy Map containing sample information @@ -74,8 +78,10 @@ output: type: file description: Short Busco summary in JSON format pattern: "short_summary.*.json" - - log: - - meta: + ontologies: + - edam: http://edamontology.org/format_3464 # JSON + log: + - - meta: type: map description: | Groovy Map containing sample information @@ -84,8 +90,9 @@ output: type: file description: BUSCO main log pattern: "*-busco.log" - - full_table: - - meta: + ontologies: [] + full_table: + - - meta: type: map description: | Groovy Map containing sample information @@ -94,8 +101,10 @@ output: type: file description: Full BUSCO results table pattern: "full_table.tsv" - - missing_busco_list: - - meta: + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + missing_busco_list: + - - meta: type: map description: | Groovy Map containing sample 
information @@ -104,8 +113,10 @@ output: type: file description: List of missing BUSCOs pattern: "missing_busco_list.tsv" - - single_copy_proteins: - - meta: + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + single_copy_proteins: + - - meta: type: map description: | Groovy Map containing sample information @@ -114,8 +125,9 @@ output: type: file description: Fasta file of single copy proteins (transcriptome mode) pattern: "single_copy_proteins.faa" - - seq_dir: - - meta: + ontologies: [] + seq_dir: + - - meta: type: map description: | Groovy Map containing sample information @@ -124,8 +136,8 @@ output: type: directory description: BUSCO sequence directory pattern: "busco_sequences" - - translated_dir: - - meta: + translated_dir: + - - meta: type: map description: | Groovy Map containing sample information @@ -135,8 +147,8 @@ output: description: Six frame translations of each transcript made by the transcriptome mode pattern: "translated_dir" - - busco_dir: - - meta: + busco_dir: + - - meta: type: map description: | Groovy Map containing sample information @@ -145,18 +157,19 @@ output: type: directory description: BUSCO lineage specific output pattern: "*-busco" - - downloaded_lineages: - - meta: + downloaded_lineages: + - - meta: type: map description: | Groovy Map containing sample information e.g. 
[ id:'test' ] - - "busco_downloads/lineages/*": + - busco_downloads/lineages/*: type: directory - description: Lineages downloaded by BUSCO when running the analysis, for example bacteria_odb12 + description: Lineages downloaded by BUSCO when running the analysis, for example + bacteria_odb12 pattern: "busco_downloads/lineages/*" - - single_copy_faa: - - meta: + single_copy_faa: + - - meta: type: map description: | Groovy Map containing sample information @@ -165,8 +178,9 @@ output: type: file description: Single copy .faa sequence files pattern: "*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.faa" - - single_copy_fna: - - meta: + ontologies: [] + single_copy_fna: + - - meta: type: map description: | Groovy Map containing sample information @@ -175,11 +189,14 @@ output: type: file description: Single copy .fna sequence files pattern: "*-busco/*/run_*/busco_sequences/single_copy_busco_sequences/*.fna" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML authors: - "@priyanka-surana" - "@charles-plessy" diff --git a/modules/nf-core/busco/busco/tests/main.nf.test b/modules/nf-core/busco/busco/tests/main.nf.test index 411ceb86..370e542d 100644 --- a/modules/nf-core/busco/busco/tests/main.nf.test +++ b/modules/nf-core/busco/busco/tests/main.nf.test @@ -24,7 +24,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) ] input[1] = 'genome' - input[2] = 'bacteria_odb12' // Launch with 'auto' to use --auto-lineage, and specified lineages // 'auto' removed from test due to memory issues + input[2] = 'bacteria_odb10' // Launch with 'auto' to use --auto-lineage, and specified lineages // 'auto' removed 
from test due to memory issues input[3] = [] // Download busco lineage input[4] = [] // No config input[5] = false // Clean intermediates @@ -92,7 +92,7 @@ nextflow_process { ] ] input[1] = 'genome' - input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = false @@ -163,72 +163,6 @@ nextflow_process { } - test("test_busco_eukaryote_metaeuk") { - - config './nextflow.config' - - when { - params { - busco_args = '--tar --metaeuk' - } - process { - """ - input[0] = [ - [ id:'test' ], // meta map - file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) - ] - input[1] = 'genome' - input[2] = 'eukaryota_odb10' - input[3] = [] - input[4] = [] - input[5] = false - """ - } - } - - then { - assert process.success - - with(path(process.out.short_summaries_txt[0][1]).text) { - assert contains('BUSCO version') - assert contains('The lineage dataset is') - assert contains('BUSCO was run in mode') - assert contains('Complete BUSCOs') - assert contains('Missing BUSCOs') - assert contains('Dependencies and versions') - } - - with(path(process.out.short_summaries_json[0][1]).text) { - assert contains('one_line_summary') - assert contains('mode') - assert contains('dataset') - } - - assert snapshot( - process.out.batch_summary[0][1], - process.out.full_table[0][1], - process.out.missing_busco_list[0][1], - process.out.versions[0] - ).match() - - with(file(process.out.seq_dir[0][1]).listFiles().collect { it.name }) { - assert contains('single_copy_busco_sequences.tar.gz') - assert contains('multi_copy_busco_sequences.tar.gz') - assert contains('fragmented_busco_sequences.tar.gz') - } - - with(path(process.out.log[0][1]).text) { - assert contains('DEBUG:busco.run_BUSCO') - assert contains('Results from dataset') - assert contains('how to cite BUSCO') - - } - - assert process.out.single_copy_proteins == [] - assert process.out.translated_dir == [] - } - - } test("test_busco_eukaryote_augustus") 
{ @@ -292,7 +226,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/genome/proteome.fasta', checkIfExists: true) ] input[1] = 'proteins' - input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = false @@ -358,7 +292,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fasta/test1.contigs.fa.gz', checkIfExists: true) ] input[1] = 'transcriptome' - input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = false @@ -423,7 +357,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) ] input[1] = 'genome' - input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = true @@ -467,7 +401,7 @@ nextflow_process { file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/genome/genome.fna.gz', checkIfExists: true) ] input[1] = 'genome' - input[2] = 'bacteria_odb12' + input[2] = 'bacteria_odb10' input[3] = [] input[4] = [] input[5] = false diff --git a/modules/nf-core/busco/busco/tests/main.nf.test.snap b/modules/nf-core/busco/busco/tests/main.nf.test.snap index 1026524b..5de40123 100644 --- a/modules/nf-core/busco/busco/tests/main.nf.test.snap +++ b/modules/nf-core/busco/busco/tests/main.nf.test.snap @@ -6,157 +6,123 @@ { "id": "test" }, - "test-bacteria_odb12-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + "test-bacteria_odb10-busco.batch_summary.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], [ - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ] ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:50:57.218573431" + "timestamp": "2025-07-21T16:11:16.371060201" }, 
"test_busco_eukaryote_augustus": { "content": [ "test-eukaryota_odb10-busco.batch_summary.txt:md5,3ea3bdc423a461dae514d816bdc61c89", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:44:25.359421247" + "timestamp": "2025-07-21T16:09:47.906365972" }, "test_busco_genome_single_fasta": { "content": [ - "test-bacteria_odb12-busco.batch_summary.txt:md5,e3e503e1540b633d95c273c465945740", - "full_table.tsv:md5,086f2ecdc90d47745c828c9b25357039", - "missing_busco_list.tsv:md5,9919aee2da9d30a3985aede354850a46", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "test-bacteria_odb10-busco.batch_summary.txt:md5,12e911830d66bab6dbf3523ac4392597", + "full_table.tsv:md5,660e2f556ca6efa97f0c2a8cebd94786", + "missing_busco_list.tsv:md5,0e08587f4dc65d9226a31433c1f9ba25", + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:41:46.251404188" + "timestamp": "2025-07-21T16:08:41.497678114" }, "test_busco_genome_multi_fasta": { "content": [ [ - "full_table.tsv:md5,5a6bf59055e2040e74797a1e36c8e374", - "full_table.tsv:md5,086f2ecdc90d47745c828c9b25357039" + "full_table.tsv:md5,26b1d35d975593834acb4d4a91e225a1", + "full_table.tsv:md5,660e2f556ca6efa97f0c2a8cebd94786" ], [ - "missing_busco_list.tsv:md5,a55eee6869fad9176d812e59886232fb", - "missing_busco_list.tsv:md5,9919aee2da9d30a3985aede354850a46" + "missing_busco_list.tsv:md5,5dcdc7707035904a7d467ca1026b399a", + "missing_busco_list.tsv:md5,0e08587f4dc65d9226a31433c1f9ba25" ], - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:42:28.126899794" - }, - "test_busco_eukaryote_metaeuk": { - 
"content": [ - "test-eukaryota_odb10-busco.batch_summary.txt:md5,ff6d8277e452a83ce9456bbee666feb6", - "full_table.tsv:md5,cfb55ab2ce590d2def51926324691aa8", - "missing_busco_list.tsv:md5,77e3d4503b2c13db0d611723fc83ab7e", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.3" - }, - "timestamp": "2025-03-12T10:43:59.997031348" + "timestamp": "2025-07-21T16:09:25.578789984" }, "test_busco_cleanup": { "content": [ - "test-bacteria_odb12-busco.batch_summary.txt:md5,e3e503e1540b633d95c273c465945740", - "full_table.tsv:md5,086f2ecdc90d47745c828c9b25357039", - "missing_busco_list.tsv:md5,9919aee2da9d30a3985aede354850a46", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "test-bacteria_odb10-busco.batch_summary.txt:md5,12e911830d66bab6dbf3523ac4392597", + "full_table.tsv:md5,660e2f556ca6efa97f0c2a8cebd94786", + "missing_busco_list.tsv:md5,0e08587f4dc65d9226a31433c1f9ba25", + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:50:48.928173488" + "timestamp": "2025-07-21T16:11:08.495786376" }, "test_busco_transcriptome": { "content": [ - "test-bacteria_odb12-busco.batch_summary.txt:md5,6cd69d8a66b5f8b7fd4a9de758e7a739", - "full_table.tsv:md5,4efc19f8d2cc7ea9e73425f09cb3ed97", - "missing_busco_list.tsv:md5,55f0322d494e5c165508712be63062bf", + "test-bacteria_odb10-busco.batch_summary.txt:md5,8734b3f379c4c0928e5dd4ea1873dc64", + "full_table.tsv:md5,645b65b725fd8b30ff6808e0ac671a73", + "missing_busco_list.tsv:md5,b1cc1c22d484439ac128af2290d7d9dd", [ - "9767721at2.faa:md5,1731738ca153959391f8302fd5a3679f", - "9778364at2.faa:md5,7a19a6b6696ae53efce30457b4dd1ab2", - "9782003at2.faa:md5,65d2a613c903852681981f8e8427dc70", - "9790352at2.faa:md5,5e18cfb68122dff7a61c5517246223fc", - "9791908at2.faa:md5,707ef4501f93a6e0dc217e037f26da54", - "9793681at2.faa:md5,e361d654145e70f06c386e75ad90f943", - 
"9800696at2.faa:md5,9e2f431e4aada7bdc2c317747105b874", - "9801107at2.faa:md5,83933b1426fc9abfe8891c49838cd02f", - "9801213at2.faa:md5,ec340354a86728189c3d1a294c0ccbad", - "9801753at2.faa:md5,39c09bd8a831c90aab44ded14c56d0e6", - "9802065at2.faa:md5,8361fa013dc1cd29af938c9d5ffebfe4", - "9802219at2.faa:md5,9e23aed07790f460da634f7f6132e73d", - "9802304at2.faa:md5,86b259197441716075f3d3d18f8743ba", - "9802309at2.faa:md5,b4b4613e9b69baa9274140c1b26cc27b", - "9802672at2.faa:md5,6c6d592c2fbb0d7a4e5e1f47a15644f0", - "9803420at2.faa:md5,eec6f7189ce9a596ed6ead06f2229c8a", - "9803541at2.faa:md5,132954cc7bfcb1c1fe9da105867c4b78", - "9803667at2.faa:md5,ec31d499f6b523cb081af6a3284a5a5c", - "9803773at2.faa:md5,efbe4c35075dd8c871827d4e5ac72922", - "9804006at2.faa:md5,fca5b560714ba37be0be3e2597f74c5a", - "9804243at2.faa:md5,3280570e4357fb4daedaea8a066dbf0b", - "9804478at2.faa:md5,98c2cfd8f089812a41a1e66fea630b2d", - "9804933at2.faa:md5,de648025c49061c614c77e7c9ce7ab62", - "9805026at2.faa:md5,eea9da88f3cd718514493d6890bf7660", - "9806637at2.faa:md5,c8a9e0c37a8aeb1fd44db64fd93aa3e1", - "9806651at2.faa:md5,f5abacf8930d78c81fdeb0c91c8681a7", - "9807064at2.faa:md5,1167d5c4c044b4eb82fac5d1955e7130", - "9807233at2.faa:md5,7c8adb6556a7f9a0244e7c7e5f75f20d", - "9807240at2.faa:md5,2eff2de1ab83b22f3234a529a44e22bb", - "9807458at2.faa:md5,bee695d260b2b7f8980a636fed6aa0c0", - "9808036at2.faa:md5,797ca476d2c7820151fec98d2815d6cb", - "9808348at2.faa:md5,4e8573a5d287e01aa4f5de8b48feaa42", - "9808936at2.faa:md5,30333f3f62f8e3d0ea6f6544d49572c6", - "9809052at2.faa:md5,0590efbf94fce0ad212513dcb2e8176f", - "9809084at2.faa:md5,37e6214b4204dc31858e2ef2bad5db4a", - "9809356at2.faa:md5,e18c1d5a4931a25baf7dbd1a40c417dc", - "9809796at2.faa:md5,857aac8a22c00472bfc9add7fde94c5c", - "9810191at2.faa:md5,72b63933bb045b680e0635eb03915cc0", - "9811804at2.faa:md5,da341c24e763a949d16432bb052af321", - "9812272at2.faa:md5,7a54f872dd8243c6814852d40cf1bfc0", - "9812943at2.faa:md5,149da17f067cdce328a73f6364a95b26", - 
"9813375at2.faa:md5,49835b9f3188434c771a840b628b07f6", - "9814755at2.faa:md5,9b4c4648d250c2e6d04acb78f9cf6df0" + "1024388at2.faa:md5,797d603d262a6595a112e25b73e878b0", + "1054741at2.faa:md5,cd4b928cba6b19b4437746ba507e7195", + "1093223at2.faa:md5,df9549708e5ffcfaee6a74dd70a0e5dc", + "1151822at2.faa:md5,12726afc1cdc40c13392e1596e93df3a", + "143460at2.faa:md5,d887431fd988a5556a523440f02d9594", + "1491686at2.faa:md5,d03362d19979b27306c192f1c74a84e5", + "1504821at2.faa:md5,4f5f6e5c57bac0092c1d85ded73d7e67", + "1574817at2.faa:md5,1153e55998c2929eacad2aed7d08d248", + "1592033at2.faa:md5,bb7a59e5f3a57ba12d10dabf4c77ab57", + "1623045at2.faa:md5,8fe38155feb1802beb97ef7714837bf5", + "1661836at2.faa:md5,6c6d592c2fbb0d7a4e5e1f47a15644f0", + "1674344at2.faa:md5,bb41b44e53565a54cadf0b780532fe08", + "1698718at2.faa:md5,f233860000028eb00329aa85236c71e5", + "1990650at2.faa:md5,34a2d29c5f8b6253159ddb7a43fa1829", + "223233at2.faa:md5,dec6705c7846c989296e73942f953cbc", + "402899at2.faa:md5,acc0f271f9a586d2ce1ee41669b22999", + "505485at2.faa:md5,aa0391f8fa5d9bd19b30d844d5a99845", + "665824at2.faa:md5,47f8ad43b6a6078206feb48c2e552793", + "776861at2.faa:md5,f8b90c13f7c6be828dea3bb920195e3d", + "874197at2.faa:md5,8d22a35a768debe6f376fc695d233a69", + "932854at2.faa:md5,2eff2de1ab83b22f3234a529a44e22bb", + "95696at2.faa:md5,247bfd1aef432f7b5456307768e9149c" ], - "single_copy_proteins.faa:md5,14124def13668c6d9b0d589207754b31", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "single_copy_proteins.faa:md5,73e2c5d6a9b0f01f2deea3cc5f21b764", + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:45:08.029718703" + "timestamp": "2025-07-21T16:10:28.783205973" }, "test_busco_protein": { "content": [ - "test-bacteria_odb12-busco.batch_summary.txt:md5,44d4cdebd61a3c8e8981ddf1829f83b3", - "full_table.tsv:md5,350f9b1b6c37cfcf41be84e93ef41931", - 
"missing_busco_list.tsv:md5,a55eee6869fad9176d812e59886232fb", - "versions.yml:md5,0046a4b8575cbc3635f2a9ee616fd840" + "test-bacteria_odb10-busco.batch_summary.txt:md5,942dbb2d8ff26240860a794213db14a8", + "full_table.tsv:md5,4db33686f2755a09fdc9521ca89411bc", + "missing_busco_list.tsv:md5,5dcdc7707035904a7d467ca1026b399a", + "versions.yml:md5,d3cecb346ce389a471bd041a53617d05" ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.10.3" + "nextflow": "25.04.6" }, - "timestamp": "2025-03-12T10:44:44.094048564" + "timestamp": "2025-07-21T16:10:05.674445797" } } \ No newline at end of file diff --git a/modules/nf-core/diamond/blastp/diamond-blastp.diff b/modules/nf-core/diamond/blastp/diamond-blastp.diff index 12608ea0..7d0916da 100644 --- a/modules/nf-core/diamond/blastp/diamond-blastp.diff +++ b/modules/nf-core/diamond/blastp/diamond-blastp.diff @@ -1,4 +1,7 @@ -Changes in module 'nf-core/diamond/blastp' +Changes in component 'nf-core/diamond/blastp' +'modules/nf-core/diamond/blastp/meta.yml' is unchanged +'modules/nf-core/diamond/blastp/environment.yml' is unchanged +Changes in 'diamond/blastp/main.nf': --- modules/nf-core/diamond/blastp/main.nf +++ modules/nf-core/diamond/blastp/main.nf @@ -12,6 +12,7 @@ @@ -17,7 +20,16 @@ Changes in module 'nf-core/diamond/blastp' switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break -@@ -59,6 +61,7 @@ +@@ -51,6 +53,8 @@ + gzip -c -d ${fasta} > ${fasta_name} + fi + ++ mkdir -p ./blastp_tmp ++ + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` + + diamond \\ +@@ -59,6 +63,7 @@ --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ @@ -26,4 +38,7 @@ Changes in module 'nf-core/diamond/blastp' --out ${prefix}.${out_ext} +'modules/nf-core/diamond/blastp/tests/main.nf.test' is unchanged +'modules/nf-core/diamond/blastp/tests/main.nf.test.snap' is unchanged +'modules/nf-core/diamond/blastp/tests/tags.yml' is unchanged ************************************************************ diff --git 
a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf index ae5a1248..4d453763 100644 --- a/modules/nf-core/diamond/blastp/main.nf +++ b/modules/nf-core/diamond/blastp/main.nf @@ -53,6 +53,8 @@ process DIAMOND_BLASTP { gzip -c -d ${fasta} > ${fasta_name} fi + mkdir -p ./blastp_tmp + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` diamond \\ diff --git a/modules/nf-core/diamond/blastx/diamond-blastx.diff b/modules/nf-core/diamond/blastx/diamond-blastx.diff index eff4326a..5219a26f 100644 --- a/modules/nf-core/diamond/blastx/diamond-blastx.diff +++ b/modules/nf-core/diamond/blastx/diamond-blastx.diff @@ -1,4 +1,7 @@ -Changes in module 'nf-core/diamond/blastx' +Changes in component 'nf-core/diamond/blastx' +'modules/nf-core/diamond/blastx/meta.yml' is unchanged +'modules/nf-core/diamond/blastx/environment.yml' is unchanged +Changes in 'diamond/blastx/main.nf': --- modules/nf-core/diamond/blastx/main.nf +++ modules/nf-core/diamond/blastx/main.nf @@ -12,6 +12,7 @@ @@ -17,7 +20,16 @@ Changes in module 'nf-core/diamond/blastx' switch ( out_ext ) { case "blast": outfmt = 0; break case "xml": outfmt = 5; break -@@ -60,6 +62,7 @@ +@@ -52,6 +54,8 @@ + gzip -c -d ${fasta} > ${fasta_name} + fi + ++ mkdir -p ./blastx_tmp ++ + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` + + diamond \\ +@@ -60,6 +64,7 @@ --db \$DB \\ --query ${fasta_name} \\ --outfmt ${outfmt} ${columns} \\ @@ -26,4 +38,7 @@ Changes in module 'nf-core/diamond/blastx' --out ${prefix}.${out_ext} \\ --log +'modules/nf-core/diamond/blastx/tests/main.nf.test' is unchanged +'modules/nf-core/diamond/blastx/tests/main.nf.test.snap' is unchanged +'modules/nf-core/diamond/blastx/tests/tags.yml' is unchanged ************************************************************ diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf index dfa82e24..d45c5d5d 100644 --- a/modules/nf-core/diamond/blastx/main.nf +++ b/modules/nf-core/diamond/blastx/main.nf @@ 
-54,6 +54,8 @@ process DIAMOND_BLASTX { gzip -c -d ${fasta} > ${fasta_name} fi + mkdir -p ./blastx_tmp + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` diamond \\ diff --git a/nextflow.config b/nextflow.config index 2db7458f..bacd63d9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -38,7 +38,7 @@ params { skip_taxon_filtering = false // Execution options - use_work_dir_as_temp = false + use_work_dir_as_temp = false // MultiQC options multiqc_config = null @@ -291,6 +291,14 @@ manifest { contribution: ['author', 'maintainer'], orcid: 'https://orcid.org/0000-0002-7860-3560' ], + [ + name: 'Pointon, Damon-Lee Bernard', + affiliation: 'Wellcome Sanger Institute', + email: 'dp24@sanger.ac.uk', + github: 'https://github.com/DLBPointon', + contribution: ['contributor'], + orcid: 'https://orcid.org/0000-0003-2949-6719' + ], [ name: 'Qi, Guoying', affiliation: 'Wellcome Sanger Institute', @@ -331,7 +339,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '!>=24.04.2' - version = '0.8.0' + version = '0.9.0' doi = '10.5281/zenodo.7949058' } diff --git a/nextflow_schema.json b/nextflow_schema.json index e83c29b1..a6fbbcda 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -175,9 +175,9 @@ "properties": { "use_work_dir_as_temp": { "type": "boolean", - "description": "Set to true to make tools (e.g. sort, FastK, MerquryFK) use the work directory for their temporary files, rather than the system default.", + "description": "Set to true to make tools (e.g. 
sort, FastK, MerquryFK, BLASTP, BLASTX) use the work directory for their temporary files, rather than the system default.", "fa_icon": "fas fa-arrow-circle-down", - "hidden": true + "hidden": false } }, "fa_icon": "fas fa-running" @@ -303,7 +303,8 @@ "multiqc_methods_description": { "type": "string", "description": "Custom MultiQC yaml file containing HTML including a methods description.", - "fa_icon": "fas fa-cog" + "fa_icon": "fas fa-cog", + "hidden": true }, "validate_params": { "type": "boolean", @@ -325,9 +326,6 @@ { "$ref": "#/$defs/input_output_options" }, - { - "$ref": "#/$defs/institutional_config_options" - }, { "$ref": "#/$defs/reference_genome_options" }, diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 6c09a32e..9839d6d4 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -22,8 +22,8 @@ "@id": "./", "@type": "Dataset", "creativeWorkStatus": "Stable", - "datePublished": "2025-05-05T22:38:28+00:00", - "description": "# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)\n\n\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with 
singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/blobtoolkit)\n\n## Introduction\n\n**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes.\nIt takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome statistics, coverage and completeness information, combines them in a TSV file by window size to create a BlobDir dataset and static plots.\n\n1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))\n2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))\n3. Determine the appropriate BUSCO lineages from the taxonomy.\n4. Run BUSCO ([`busco`](https://busco.ezlab.org/))\n5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))\n7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))\n8. Run BLASTn against sequences still with not hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))\n9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))\n11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))\n12. 
Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,datatype,datafile,library_layout\nmMelMel3,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED\nmMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED\nmMelMel3,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE\n```\n\nEach row represents an aligned file.\nRows with the same sample identifier are considered technical replicates.\nThe datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).\nThe library layout indicates whether the reads are paired or single.\nThe aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/blobtoolkit \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --fasta genome.fasta \\\n --accession GCA_XXXXXXXXX.X \\\n --taxon XXXX \\\n --taxdump /path/to/taxdump/database \\\n --blastp /path/to/diamond/database \\\n --blastn /path/to/blastn/database \\\n --blastx /path/to/blastx/database\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters).\n\n## Pipeline output\n\nFor more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output).\n\n## Credits\n\nsanger-tol/blobtoolkit was written in Nextflow by [Alexander Ramos Diaz](https://github.com/alxndrdiaz), [Zaynab Butt](https://github.com/zb32), [Matthieu Muffato](https://github.com/muffato), and [Priyanka Surana](https://github.com/priyanka-surana). The orignal design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar).\n\nWe thank the following people for their assistance in the development of this pipeline:\n\n- [Guoying Qi](https://github.com/gq1)\n- [Bethan Yates](https://github.com/BethYates)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/blobtoolkit for your analysis, please cite it using the following doi: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core 
framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "datePublished": "2025-10-03T10:18:25+00:00", + "description": "# ![sanger-tol/blobtoolkit](docs/images/sanger-tol-blobtoolkit_logo.png)\n\n[![GitHub Actions CI Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml/badge.svg)](https://github.com/sanger-tol/blobtoolkit/actions/workflows/linting.yml)\n[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7949058-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7949058)\n\n\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=conda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/sanger-tol/blobtoolkit)\n\n## Introduction\n\n**sanger-tol/blobtoolkit** is a bioinformatics pipeline that can be used to identify and analyse non-target DNA for eukaryotic genomes.\nIt takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome statistics, coverage and completeness 
information, combines them in a TSV file by window size to create a BlobDir dataset and static plots.\n\n1. Calculate genome statistics in windows ([`fastawindows`](https://github.com/tolkit/fasta_windows))\n2. Calculate Coverage ([`blobtk/depth`](https://github.com/blobtoolkit/blobtk))\n3. Determine the appropriate BUSCO lineages from the taxonomy.\n4. Run BUSCO ([`busco`](https://busco.ezlab.org/))\n5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))\n7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))\n8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))\n9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))\n10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))\n11. Import analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))\n12. Create static plot images ([`blobtk/images`](https://github.com/blobtoolkit/blobtk))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nsample,datatype,datafile,library_layout\nmMelMel3_hic,hic,GCA_922984935.2.hic.mMelMel3.cram,PAIRED\nmMelMel1,illumina,GCA_922984935.2.illumina.mMelMel1.cram,PAIRED\nmMelMel3_ont,ont,GCA_922984935.2.ont.mMelMel3.cram,SINGLE\n```\n\nEach row represents a read set (aligned or not).\nThe first column (sample name) must be unique.\nIf you have multiple read sets from the same actual sample, make sure you edit the sample names to make them unique.\nThe datatype refers to the sequencing technology used to generate the underlying raw data and follows a controlled vocabulary (`ont`, `hic`, `pacbio`, `pacbio_clr`, `illumina`).\nThe library layout indicates whether the reads are paired or single.\nThe aligned read files can be generated using the [sanger-tol/readmapping](https://github.com/sanger-tol/readmapping) pipeline.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run sanger-tol/blobtoolkit \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \\\n --fasta genome.fasta \\\n --accession GCA_XXXXXXXXX.X \\\n --taxon XXXX \\\n --taxdump /path/to/taxdump/database \\\n --blastp /path/to/diamond/database \\\n --blastn /path/to/blastn/database \\\n --blastx /path/to/blastx/database\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details, please refer to the [usage documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/usage) and the [parameter documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/parameters).\n\n## Pipeline output\n\nFor more details about the output files and reports, please refer to the [output documentation](https://pipelines.tol.sanger.ac.uk/blobtoolkit/output).\n\n## Credits\n\nsanger-tol/blobtoolkit was written in Nextflow by:\n\n- [Alexander Ramos Diaz](https://github.com/alxndrdiaz)\n- [Zaynab Butt](https://github.com/zb32)\n- [Priyanka Surana](https://github.com/priyanka-surana)\n- [Matthieu Muffato](https://github.com/muffato)\n- [Tyler Chafin](https://github.com/tkchafin)\n- [Yumi Sims](https://github.com/yumisims)\n- [Damon-Lee Bernard Pointon](https://github.com/DLBPointon)\n\nThe original design and coding for [BlobToolKit software and Snakemake pipeline](https://github.com/blobtoolkit/blobtoolkit) was done by [Richard Challis](https://github.com/rjchallis) and [Sujai Kumar](https://github.com/sujaikumar).\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Guoying Qi](https://github.com/gq1)\n- [Bethan Yates](https://github.com/BethYates)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\n## Citations\n\nIf you use sanger-tol/blobtoolkit for your analysis, please cite it using the following DOI: [10.5281/zenodo.7949058](https://doi.org/10.5281/zenodo.7949058)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nThis pipeline uses code and infrastructure developed 
and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -105,7 +105,7 @@ }, "mentions": [ { - "@id": "#979b8e87-8931-4977-9815-b403f45772f9" + "@id": "#6708f1e3-5ef2-4308-b443-63390e3bac9c" } ], "name": "sanger-tol/blobtoolkit" @@ -156,6 +156,9 @@ { "@id": "https://orcid.org/0000-0001-8687-5905" }, + { + "@id": "https://orcid.org/0000-0003-2949-6719" + }, { "@id": "https://orcid.org/0000-0003-1262-8973" }, @@ -167,7 +170,7 @@ } ], "dateCreated": "", - "dateModified": "2025-05-05T23:38:28Z", + "dateModified": "2025-10-03T11:18:25Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", "keywords": [ "nextflow", @@ -196,10 +199,10 @@ }, "url": [ "https://github.com/sanger-tol/blobtoolkit", - "https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.8.0/" + "https://pipelines.tol.sanger.ac.uk//blobtoolkit/0.9.0/" ], "version": [ - "0.8.0" + "0.9.0" ] }, { @@ -215,11 +218,11 @@ "version": "!>=24.04.2" }, { - "@id": "#979b8e87-8931-4977-9815-b403f45772f9", + "@id": "#6708f1e3-5ef2-4308-b443-63390e3bac9c", "@type": "TestSuite", "instance": [ { - "@id": "#25bef01b-9ab2-4b38-9e93-733fb8af9c82" + "@id": "#d0d252ee-26c2-4e03-bb49-98d3ad100a90" } ], "mainEntity": { @@ -228,7 +231,7 @@ "name": "Test suite for sanger-tol/blobtoolkit" }, { - "@id": "#25bef01b-9ab2-4b38-9e93-733fb8af9c82", + "@id": "#d0d252ee-26c2-4e03-bb49-98d3ad100a90", "@type": "TestInstance", "name": "GitHub Actions workflow for testing sanger-tol/blobtoolkit", "resource": 
"repos/sanger-tol/blobtoolkit/actions/workflows/ci.yml", @@ -398,6 +401,14 @@ "name": "Muffato, Matthieu", "url": "https://github.com/muffato" }, + { + "@id": "https://orcid.org/0000-0003-2949-6719", + "@type": "Person", + "affiliation": "Wellcome Sanger Institute", + "email": "dp24@sanger.ac.uk", + "name": "Pointon, Damon-Lee Bernard", + "url": "https://github.com/DLBPointon" + }, { "@id": "https://orcid.org/0000-0003-1262-8973", "@type": "Person", diff --git a/subworkflows/local/busco_diamond_blastp.nf b/subworkflows/local/busco_diamond_blastp.nf index a0138289..bea4d394 100644 --- a/subworkflows/local/busco_diamond_blastp.nf +++ b/subworkflows/local/busco_diamond_blastp.nf @@ -25,16 +25,12 @@ workflow BUSCO_DIAMOND { // Prepare the BUSCO lineages // // 0. Initialise sone variables - basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] - def lineage_position = 0 + def basal_lineages = [ "eukaryota_odb10", "bacteria_odb10", "archaea_odb10" ] // 1. Start from the taxon's lineages busco_lin - // 2. Add the (missing) basal lineages - | map { lineages -> (lineages + basal_lineages).unique() } - | flatten () - // 3. Add a (0-based) index to record the original order (i.e. by age) - | map { lineage_name -> [lineage_name, lineage_position++] } - // 4. Move the lineage information to `meta` to be able to distinguish the BUSCO jobs and group their outputs later + // 2. Add the (missing) basal lineages and a (0-based) index to record the original order (i.e. by age) + | flatMap { lineages -> (lineages + basal_lineages).unique().withIndex() } + // 3. 
Move the lineage information to `meta` to be able to distinguish the BUSCO jobs and group their outputs later | combine ( fasta ) | map { lineage_name, lineage_index, meta, genome -> [meta + [lineage_name: lineage_name, lineage_index: lineage_index], genome] } | set { ch_fasta_with_lineage } diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 32507e01..3bdae4ae 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -162,12 +162,12 @@ workflow INPUT_CHECK { } // Remove any invalid lineages from precomputed_busco - ch_busco_lineages_list = ch_busco_lineages.flatten() - ch_parsed_busco_filtered = ch_parsed_busco - .filter { meta, path -> - ch_busco_lineages.contains(meta.lineage) - } - ch_parsed_busco_filtered = ch_parsed_busco_filtered.ifEmpty { Channel.value([]) } + // ch_busco_lineages_list = ch_busco_lineages.flatten() + // ch_parsed_busco_filtered = ch_parsed_busco + // .filter { meta, path -> + // ch_busco_lineages.contains(meta.lineage) + // } + // ch_parsed_busco_filtered = ch_parsed_busco_filtered.ifEmpty { Channel.value([]) } // // Get the BUSCO path if set