sanger-tol · yumisims · Sep 22, 2025 · Sep 22, 2025 · Sep 22, 2025 · Sep 23, 2025
diff --git a/conf/test.config b/conf/test.config
@@ -28,7 +28,7 @@ params {
     input     = "${projectDir}/assets/test/samplesheet_s3.csv"
 
     // Fasta references
-    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.phiXspike.fasta.gz"
+    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Aythya_fuligula/assembly/bAytFul3.fa.gz"
     accession = "GCA_922984935.2"
     taxon     = "Meles meles"
 

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -20,7 +20,7 @@ params {
     input     = "${projectDir}/assets/test_full/full_samplesheet.csv"
 
     // Fasta references
-    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Laetiporus_sulphureus/assembly/release/gfLaeSulp1.1/insdc/GCA_927399515.1.fasta.gz"
+    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Aythya_fuligula/assembly/bAytFul3.fa.gz"
     accession = "GCA_927399515.1"
     taxon     = "Laetiporus sulphureus"
 

diff --git a/conf/test_raw.config b/conf/test_raw.config
@@ -29,7 +29,7 @@ params {
     align     = true
 
     // Fasta references
-    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Meles_meles/assembly/release/mMelMel3.1_paternal_haplotype/GCA_922984935.2.subset.fasta.gz"
+    fasta     = "https://tolit.cog.sanger.ac.uk/test-data/Aythya_fuligula/assembly/bAytFul3.fa.gz"
     accession = "GCA_922984935.2"
     taxon     = "Meles meles"
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -101,6 +101,43 @@ For instance:
 --busco path-to-databases/busco/ --busco_lineages vertebrata_odb10,bacteria_odb10,fungi_odb10
 ```
 
+### BUSCO database path format
+
+**Important**: The `--busco` parameter must point to the directory that contains the `lineages/` subdirectory, **NOT** to the `lineages/` directory itself. BUSCO databases are always directories, never individual files.
+
+```bash
+# ✅ Correct - points to the parent directory
+--busco /path/to/busco_downloads/
+
+# ❌ Common mistake - includes /lineages at the end
+--busco /path/to/busco_downloads/lineages/
+
+# ❌ Another common mistake - points to a specific lineage
+--busco /path/to/busco_downloads/lineages/eukaryota_odb10/
+```
+
+The pipeline will automatically detect and correct paths ending with `/lineages` or pointing to specific lineage directories (e.g., `eukaryota_odb10`) to prevent common errors where BUSCO tries to access incorrect paths.
+
+### BLAST database path formats
+
+The `--blastn` parameter accepts two formats:
+
+1. **Directory path** (for backwards compatibility):
+
+   ```bash
+   --blastn /path/to/databases/nt_2024_10/
+   ```
+
+   When a directory is given, you must also pass `--ntdb_prefix` with the database basename (e.g. `--ntdb_prefix nt` selects `nt.nal` inside the directory).
+
+2. **Direct file path** (recommended for clarity):
+   ```bash
+   --blastn /path/to/databases/nt_2024_10/nt.nal
+   ```
+   This selects the database unambiguously when your database directory contains multiple BLAST databases. Note: When you specify a direct `.nal` file path, the pipeline symlinks the files of that database (plus the shared taxonomy files) into an isolated directory so that only the selected database is visible to BLAST.
+
+If a directory is given without `--ntdb_prefix`, or the prefix does not match a `.nal` file in that directory, the pipeline will fail with an error message explaining which file it expected and how to select the database.
+
 ### Getting databases ready for the pipeline
 
 The BlobToolKit pipeline can be run in many different ways. The default way requires access to several databases:
@@ -152,17 +189,37 @@ wget "ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/nt.???.tar.gz" -P $NT/ &&
 for file in $NT/*.tar.gz; do
     tar xf $file -C $NT && rm $file;
 done
 
 wget "https://ftp.ncbi.nlm.nih.gov/blast/db/v5/taxdb.tar.gz" &&
 tar xf taxdb.tar.gz -C $NT &&
 rm taxdb.tar.gz
 
 # Compress and cleanup
 cd ..
 tar -cvzf $NT_TAR $NT
 rm -r $NT
+```
+
+##### Important: Handling directories with multiple BLAST databases
+
+If your database directory contains multiple BLAST databases (e.g., both `nt` and `nr` databases), select the one to use either by giving the exact path to its `.nal` file or by passing the directory together with `--ntdb_prefix`:
+
+```bash
+# ❌ This will fail: a directory on its own does not say which database to use (add --ntdb_prefix)
+--blastn /path/to/databases/
+
+# ✅ Specify the exact database file
+--blastn /path/to/databases/nt.nal
 ```
 
+The pipeline supports two formats for the `--blastn` parameter:
+
+- **Directory path**: `/path/to/databases/nt_2024_10/` (must be combined with `--ntdb_prefix`, e.g. `--ntdb_prefix nt`, to select the database inside the directory)
+- **Direct file path**: `/path/to/databases/nt_2024_10/nt.nal` (recommended for directories with multiple databases). Note: When you specify a direct `.nal` file path, the pipeline symlinks the files of that database into an isolated directory so that only the selected database is visible to BLAST.
+
 #### 3. UniProt reference proteomes database
 
 You need [diamond blast](https://github.com/bbuchfink/diamond) installed for this step.
@@ -177,7 +234,7 @@ UNIPROT=/path/to/databases/uniprot_${DATE}
 UNIPROT_TAR=/path/to/databases/uniprot_${DATE}.tar.gz
 mkdir -p $UNIPROT
 cd $UNIPROT
-```
+````
 
 The UniProt `Refseq_Proteomes_YYYY_MM.tar.gz` file is very large (close to 200 GB) and will take a long time to download.
 The command below looks complex because it needs to get around the problem of using wildcards with wget and curl.

diff --git a/nextflow.config b/nextflow.config
@@ -18,6 +18,7 @@ params {
     busco_lineages             = null
     precomputed_busco          = null
     busco_gene_predictor       = null
+    ntdb_prefix                = null
 
     // Reference options
     fasta                      = null

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -108,8 +108,8 @@
                     "format": "file-path",
                     "exists": true,
                     "mimetype": "text/plain",
-                    "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$",
-                    "description": "Path to FASTA genome file.",
+                    "pattern": "^\\S+\\.fa(\\.gz)?$",
+                    "description": "Path to FASTA genome file (must have .fa extension).",
                     "fa_icon": "far fa-file-code"
                 }
             }
@@ -124,7 +124,8 @@
                 "busco": {
                     "type": "string",
                     "format": "path",
-                    "description": "Local directory where clade-specific BUSCO lineage datasets are stored",
+                    "description": "Local directory where clade-specific BUSCO lineage datasets are stored. Must be a directory containing the 'lineages/' subdirectory.",
+                    "help_text": "BUSCO databases are always directories, never individual files. Do NOT include '/lineages' at the end of the path or point to specific lineage directories. The pipeline will automatically correct paths ending with '/lineages' or pointing to specific lineages. Example: '/path/to/busco_downloads/' not '/path/to/busco_downloads/lineages/' or '/path/to/busco_downloads/lineages/eukaryota_odb10/'",
                     "fa_icon": "fas fa-folder-open"
                 },
                 "lineage_tax_ids": {
@@ -155,9 +156,16 @@
                     "type": "string",
                     "format": "path",
                     "exists": true,
-                    "description": "Path to the nucleotide BLAST database",
+                    "description": "Path to the nucleotide BLAST database. Can be either a directory (used together with --ntdb_prefix) or a direct path to a .nal file.",
+                    "help_text": "When a directory is given, --ntdb_prefix must name the database to use inside it. When a .nal file is given, the files of that database are symlinked into an isolated directory so that only the selected database is visible to BLAST. Example: '/path/to/databases/nt.nal', or '/path/to/databases/' together with --ntdb_prefix nt",
                     "fa_icon": "fas fa-file-archive"
                 },
+                "ntdb_prefix": {
+                    "type": "string",
+                    "description": "Database basename (prefix) to select when --blastn is a directory. Provide the name without extension, e.g. 'nt' for 'nt.nal'.",
+                    "help_text": "When supplying a directory to --blastn, set --ntdb_prefix to identify the database basename (without the .nal extension). The pipeline will look for a file named <prefix>.nal inside that directory and use it. Example: --blastn /path/to/databases/ --ntdb_prefix nt.",
+                    "fa_icon": "fas fa-tag"
+                },
                 "taxdump": {
                     "type": "string",
                     "format": "path",

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
@@ -3,8 +3,6 @@
 //
 
 include { samplesheetToList         } from 'plugin/nf-schema'
-
-
 include { UNTAR                     } from '../../modules/nf-core/untar/main'
 include { CAT_CAT                   } from '../../modules/nf-core/cat/cat/main'
 include { SAMTOOLS_FLAGSTAT         } from '../../modules/nf-core/samtools/flagstat/main'
@@ -45,6 +43,14 @@ workflow INPUT_CHECK {
         .map { db_meta, db_path ->
             if (db_meta.type in ["blastp", "blastx"] && db_path.isDirectory()) {
                 [db_meta, file(db_path.toString() + "/${db_path.name}", checkIfExists: true)]
+            } else if (db_meta.type == "blastn") {
+                // Special handling for BLAST nucleotide databases
+                def (resolved_path, db_name) = validateBlastnDatabase(db_path)
+                [db_meta, resolved_path]
+            } else if (db_meta.type == "busco") {
+                // Special handling for BUSCO databases
+                def resolved_path = validateBuscoDatabase(db_path)
+                [db_meta, resolved_path]
             } else {
                 [db_meta, db_path]
             }
@@ -272,3 +278,180 @@ def get_read_counts ( stats ) {
 
     return read_count_meta
 }
+
+/*
+ * Validate and resolve a BUSCO database path.
+ *
+ * The expected layout is <busco_root>/lineages/<lineage>_odb<N>/ and the
+ * function returns <busco_root>. Two common user mistakes are corrected:
+ *   - the path ends with 'lineages'              -> its parent is returned
+ *   - the path is a lineage directory whose
+ *     parent is named 'lineages'                 -> the grandparent is returned
+ *
+ * Any other directory is returned unchanged. `error` is called when the path
+ * is not a directory, or when it looks like a lineage directory that does not
+ * sit inside a 'lineages' directory.
+ */
+def validateBuscoDatabase(db_path) {
+    def path_file = file(db_path)
+    if (path_file.isDirectory()) {
+        // Check if path ends with /lineages and has a parent directory
+        if (path_file.name == 'lineages' && path_file.parent != null) {
+            def parent_dir = file(path_file.parent)
+            log.info "BUSCO path correction: Detected '/lineages' suffix in path"
+            log.info "  Original path: ${path_file}"
+            log.info "  Corrected path: ${parent_dir}"
+            log.info "This prevents the common error where BUSCO tries to use '${path_file}/lineages/lineage_name' instead of '${parent_dir}/lineages/lineage_name'"
+            return parent_dir
+        }
+        // Check if path points to a specific lineage directory (e.g., eukaryota_odb10).
+        // Any OrthoDB release suffix (_odb10, _odb12, ...) is recognised, not only _odb10.
+        else if (path_file.name ==~ /.*_odb\d+/ && path_file.parent != null) {
+            def parent_dir = file(path_file.parent)
+            // Check if parent is 'lineages' - if so, we need to go up two levels
+            if (parent_dir.name == 'lineages' && parent_dir.parent != null) {
+                def busco_root = file(parent_dir.parent)
+                log.info "BUSCO path correction: Detected specific lineage directory in path"
+                log.info "  Original path: ${path_file} (specific lineage: ${path_file.name})"
+                log.info "  Corrected path: ${busco_root}"
+                log.info "This prevents the error where BUSCO tries to use a specific lineage directory instead of the root BUSCO database directory"
+                log.warn "Use `--busco_lineages ${path_file.name}` to control the lineage"
+                return busco_root
+            } else {
+                error """
+                ERROR: Invalid BUSCO lineage directory structure: ${path_file}
+                It appears you're pointing to a specific BUSCO lineage directory (${path_file.name}),
+                but the expected directory structure is:
+                /path/to/busco_downloads/lineages/${path_file.name}/
+                Please provide the path to the root BUSCO database directory.
+                Example: --busco /path/to/busco_downloads/
+                """
+            }
+        } else {
+            // Path looks correct, return as-is
+            log.info "Using BUSCO database path: ${path_file}"
+            return path_file
+        }
+    } else {
+        error """
+        ERROR: Invalid BUSCO database path: ${path_file}
+        BUSCO databases must be directories containing the 'lineages/' subdirectory.
+        Please ensure the path points to a valid BUSCO database directory.
+        Common issues:
+        - Path should point to the directory containing 'lineages/' subdirectory
+        - Do NOT include '/lineages' at the end of the path
+        - Do NOT point to a specific lineage directory (e.g., eukaryota_odb10)
+        - BUSCO databases cannot be individual files
+        Example: --busco /path/to/busco_downloads/
+        NOT: --busco /path/to/busco_downloads/lineages/
+        NOT: --busco /path/to/busco_downloads/lineages/eukaryota_odb10/
+        """
+    }
+}
+
+/*
+ * Validate and resolve a BLAST nucleotide database path.
+ *
+ * Accepts either:
+ *   - a direct path to a `.nal` alias file, or
+ *   - a directory, in which case `params.ntdb_prefix` must name the database
+ *     (basename, with or without the `.nal` extension) to select inside it.
+ *
+ * In both cases the files of the selected database are symlinked into a new
+ * isolated directory so that only that database is visible to BLAST.
+ *
+ * Returns a two-element list: [isolated_directory, database_name].
+ * Calls `error` when the path is neither a `.nal` file nor a directory, when a
+ * directory is given without a prefix, or when `<prefix>.nal` is not found.
+ */
+def validateBlastnDatabase(db_path) {
+    def path_file = file(db_path)
+
+    if (path_file.isFile()) {
+        // isFile() already implies the file exists, so only the extension needs checking
+        if (!path_file.name.endsWith('.nal')) {
+            error """
+            ERROR: Invalid BLAST database file: ${path_file}
+            The file must have a .nal extension.
+            Please provide either:
+                - A directory containing the BLAST database, together with --ntdb_prefix
+                - The direct path to a .nal file
+            Example: --blastn /path/to/databases/nt.nal
+            """
+        }
+
+        log.info "Direct BLAST database file specified: ${path_file}"
+        def (isolated_dir, db_name) = isolateBlastnDatabase(path_file)
+        log.info "Database name: ${db_name}"
+        log.info "This ensures only the specified database is available to BLAST"
+        return [isolated_dir, db_name]
+    } else if (path_file.isDirectory()) {
+        // Directory provided - require the user to specify the database prefix
+        log.info "BLAST database directory provided: ${path_file}"
+        def prefix = (this.binding.hasVariable('params') && params.containsKey('ntdb_prefix')) ? params.ntdb_prefix : null
+        if (!prefix) {
+            error """
+            ERROR: A BLAST database directory was provided (${path_file}) but no database prefix was supplied.
+            The pipeline requires you to select which database inside the directory to use.
+            Please provide the database prefix (basename without extension) using --ntdb_prefix.
+            Example: --blastn ${path_file} --ntdb_prefix nt  (will select ${path_file}/nt.nal)
+            """
+        }
+
+        // Tolerate a prefix given with its extension ('nt.nal' is treated as 'nt')
+        def db_prefix = prefix.toString().replaceAll('\\.nal$', '')
+
+        // Look for the requested .nal file inside the directory
+        def expected_name = "${db_prefix}.nal"
+        def expected_file = path_file.listFiles().find { it.name == expected_name }
+        if (!expected_file) {
+            error """
+            ERROR: Requested BLAST database prefix '${db_prefix}' not found in ${path_file}
+            Expected file: ${path_file}/${expected_name}
+            Please ensure the prefix passed with --ntdb_prefix matches a .nal file in the directory.
+            """
+        }
+
+        log.info "Using BLAST database '${db_prefix}' from directory: ${path_file}"
+        return isolateBlastnDatabase(expected_file)
+    } else {
+        error """
+        ERROR: Invalid database path: ${path_file}
+        The path must point to either:
+            - A directory containing the BLAST database, together with --ntdb_prefix
+            - A direct path to a .nal file
+        Example: --blastn /path/to/databases/nt.nal
+        """
+    }
+}
+
+/*
+ * Symlink the files of a single BLAST database into a new isolated directory.
+ *
+ * `nal_file` is the `.nal` alias file of the database. Every sibling file whose
+ * name starts with '<db_name>.' is linked, together with the shared taxonomy
+ * files (taxdb.btd, taxdb.bti, taxonomy4blast.sqlite3) when present.
+ *
+ * Returns a two-element list: [isolated_directory, database_name].
+ */
+def isolateBlastnDatabase(nal_file) {
+    def parent_dir = file(nal_file.parent)
+    def db_name = nal_file.name.replaceAll('\\.nal$', '')
+
+    // The directory is created under the JVM's current directory ('user.dir'),
+    // not inside the database directory, so the database location is never written to.
+    // NOTE(review): the random UUID gives a different path on every run, which
+    // presumably changes the task input seen by `-resume` - confirm whether a
+    // deterministic name would be preferable.
+    def uuid = java.util.UUID.randomUUID().toString()
+    def temp_dir = file("${System.getProperty('user.dir')}/.btk_isolated_${db_name}_${uuid}")
+    if (!temp_dir.exists() && !temp_dir.mkdirs()) {
+        error "ERROR: Could not create isolated BLAST database directory: ${temp_dir}"
+    }
+
+    // Find all files belonging to this specific database
+    def db_files = parent_dir.listFiles().findAll {
+        it.name.startsWith("${db_name}.") ||
+        it.name in ['taxdb.btd', 'taxdb.bti', 'taxonomy4blast.sqlite3']
+    }
+
+    // Create symbolic links in the isolated directory
+    db_files.each { source_file ->
+        def link_file = file("${temp_dir}/${source_file.name}")
+        if (!link_file.exists()) {
+            java.nio.file.Files.createSymbolicLink(link_file, source_file)
+        }
+    }
+
+    log.info "Created isolated directory: ${temp_dir}"
+    return [temp_dir, db_name]
+}