fix tmpdir being set from base command #200

Merged
merged 5 commits on May 7, 2024
6 changes: 1 addition & 5 deletions aviary/aviary.py
@@ -35,7 +35,6 @@
import logging
import os
from datetime import datetime
import tempfile

# Debug
debug={1:logging.CRITICAL,
@@ -198,7 +197,7 @@ def main():
help='Path to the location that will be treated used for temporary files. If none is specified, the TMPDIR \n'
'environment variable will be used. Can be configured within the `configure` subcommand',
dest='tmpdir',
default=tempfile.gettempdir(),
default=None,
)

base_group.add_argument(
@@ -1290,9 +1289,6 @@ def manage_env_vars(args):
if args.conda_prefix is None:
args.conda_prefix = Config.get_software_db_path('CONDA_ENV_PATH', '--conda-prefix')

if args.tmpdir is None:
args.tmpdir = Config.get_software_db_path('TMPDIR', '--tmpdir')

try:
if args.gtdb_path is None:
args.gtdb_path = Config.get_software_db_path('GTDBTK_DATA_PATH', '--gtdb-path')
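Net effect of this hunk: `--tmpdir` no longer defaults to `tempfile.gettempdir()`, and the `TMPDIR` config-database lookup is dropped from `manage_env_vars`, so resolution is deferred to the individual rules and scripts. A minimal sketch of the implied fallback order, using a hypothetical `resolve_tmpdir` helper that is not part of aviary:

```python
import os
from typing import Optional

def resolve_tmpdir(cli_tmpdir: Optional[str]) -> Optional[str]:
    """Hypothetical helper mirroring the fallback behaviour after this PR."""
    if cli_tmpdir:                     # an explicit --tmpdir always wins
        return cli_tmpdir
    return os.environ.get("TMPDIR")    # may be None; downstream tools then use their own defaults

print(resolve_tmpdir("/scratch/tmp"))  # -> /scratch/tmp
print(resolve_tmpdir(None))            # -> value of $TMPDIR, or None if unset
```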
2 changes: 1 addition & 1 deletion aviary/modules/annotation/annotation.smk
@@ -171,7 +171,7 @@ rule eggnog:
params:
mag_extension = config['mag_extension'],
eggnog_db = config['eggnog_folder'],
tmpdir = config["tmpdir"]
tmpdir = config["tmpdir"] if config["tmpdir"] else "$TMPDIR",
output:
done = 'data/eggnog/done'
threads:
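The eggnog rule now pushes the fallback into the params value itself: when no tmpdir is configured, the literal string `$TMPDIR` reaches the shell command and is expanded at run time. A small illustration, assuming `config` is a plain dict with placeholder values:

```python
config = {"tmpdir": None}  # placeholder; Snakemake loads this from the workflow config

tmpdir = config["tmpdir"] if config["tmpdir"] else "$TMPDIR"
print(tmpdir)  # -> $TMPDIR (expanded by the shell when the rule runs)

config["tmpdir"] = "/scratch/tmp"
tmpdir = config["tmpdir"] if config["tmpdir"] else "$TMPDIR"
print(tmpdir)  # -> /scratch/tmp
```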
31 changes: 4 additions & 27 deletions aviary/modules/assembly/assembly.smk
@@ -367,31 +367,8 @@ rule spades_assembly:
"envs/spades.yaml"
benchmark:
"benchmarks/spades_assembly.benchmark.txt"
shell:
"""
rm -rf data/spades_assembly/tmp;
minimumsize=500000;
actualsize=$(stat -c%s data/short_reads.filt.fastq.gz);
if [ -d "data/spades_assembly/" ]
then
spades.py --restart-from last --memory {params.max_memory} -t {threads} -o data/spades_assembly -k {params.kmer_sizes} --tmp-dir {params.tmpdir} > {log} 2>&1 && \
cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta
elif [ $actualsize -ge $minimumsize ]
then
if [ {params.long_read_type} = "ont" ] || [ {params.long_read_type} = "ont_hq" ]
then
spades.py --checkpoints all --memory {params.max_memory} --meta --nanopore {input.long_reads} --12 {input.fastq} \
-o data/spades_assembly -t {threads} -k {params.kmer_sizes} --tmp-dir {params.tmpdir} >> {log} 2>&1 && \
cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta
else
spades.py --checkpoints all --memory {params.max_memory} --meta --pacbio {input.long_reads} --12 {input.fastq} \
-o data/spades_assembly -t {threads} -k {params.kmer_sizes} --tmp-dir {params.tmpdir} >> {log} 2>&1 && \
cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta
fi
else
mkdir -p {output.spades_folder} && touch {output.fasta}
fi
"""
script:
"scripts/spades_assembly.py"


# Perform short read assembly only with no other steps
@@ -458,14 +435,14 @@ rule spades_assembly_coverage:
log:
"logs/spades_assembly_coverage.log"
params:
tmpdir = config["tmpdir"]
tmpdir = f"TMPDIR={config['tmpdir']}" if config["tmpdir"] else ""
conda:
"../../envs/coverm.yaml"
benchmark:
"benchmarks/spades_assembly_coverage.benchmark.txt"
shell:
"""
TMPDIR={params.tmpdir} coverm contig -m metabat -t {threads} -r {input.fasta} --interleaved {input.fastq} --bam-file-cache-directory data/cached_bams/ > {output.assembly_cov} 2> {log};
{params.tmpdir} coverm contig -m metabat -t {threads} -r {input.fasta} --interleaved {input.fastq} --bam-file-cache-directory data/cached_bams/ > {output.assembly_cov} 2> {log};
mv data/cached_bams/*.bam {output.bam} && samtools index -@ {threads} {output.bam} 2>> {log}
"""

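For the coverm call the `TMPDIR=` prefix itself becomes optional: it is only prepended when a tmpdir is configured, otherwise coverm simply inherits whatever `TMPDIR` (if any) is already in the environment. A hedged sketch with placeholder values:

```python
def coverm_prefix(tmpdir):
    # Mirrors the params expression above: an env-var assignment or nothing at all.
    return f"TMPDIR={tmpdir}" if tmpdir else ""

print(f"{coverm_prefix('/scratch/tmp')} coverm contig -m metabat ...".strip())
# -> TMPDIR=/scratch/tmp coverm contig -m metabat ...
print(f"{coverm_prefix(None)} coverm contig -m metabat ...".strip())
# -> coverm contig -m metabat ...
```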
13 changes: 11 additions & 2 deletions aviary/modules/assembly/scripts/assemble_short_reads.py
@@ -29,6 +29,15 @@ def assemble_short_reads(
:return:
'''

if not tmp_dir:
try:
tmp_dir = os.environ["TMPDIR"]
except KeyError:
tmp_dir_arg = ""

if tmp_dir:
tmp_dir_arg = f"--tmp-dir {tmp_dir}"

# deal with read sets i.e. are we coassembling? Which assembler are we using?
# Non co-assembled reads are handled the same for each assembler
if read_set1 != 'none':
@@ -86,7 +95,7 @@
# Run chosen assembler
if use_megahit:
max_memory_in_bytes = max_memory * 1024*1024*1024
command = f"megahit {read_string} -t {threads} -m {max_memory_in_bytes} -o data/megahit_assembly --tmp-dir {tmp_dir}"
command = f"megahit {read_string} -t {threads} -m {max_memory_in_bytes} -o data/megahit_assembly {tmp_dir_arg}"

with open(log, 'a') as logf:
logf.write(f"Queueing command {command}\n")
@@ -97,7 +106,7 @@
else:
kmers = " ".join(kmer_sizes)
command = f"spades.py --memory {max_memory} --meta -t {threads} " \
f"-o data/short_read_assembly {read_string} -k {kmers} --tmp-dir {tmp_dir}"
f"-o data/short_read_assembly {read_string} -k {kmers} {tmp_dir_arg}"
with open(log, 'a') as logf:
logf.write(f"Queueing command {command}\n")
subprocess.run(command.split(), stdout=logf, stderr=subprocess.STDOUT)
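The new `tmp_dir_arg` logic yields one of three command fragments: an explicit `--tmp-dir` path, the value of `$TMPDIR`, or nothing at all (letting megahit/SPAdes pick their own defaults). A condensed sketch of that behaviour, using a hypothetical `build_tmp_dir_arg` helper:

```python
import os

def build_tmp_dir_arg(tmp_dir):
    # Equivalent to the try/except above, written with os.environ.get for brevity.
    tmp_dir_arg = ""
    if not tmp_dir:
        tmp_dir = os.environ.get("TMPDIR", "")
    if tmp_dir:
        tmp_dir_arg = f"--tmp-dir {tmp_dir}"
    return tmp_dir_arg

print(build_tmp_dir_arg("/scratch/tmp"))  # -> --tmp-dir /scratch/tmp
print(build_tmp_dir_arg(None))            # -> --tmp-dir <value of TMPDIR>, or "" if unset
```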
89 changes: 89 additions & 0 deletions aviary/modules/assembly/scripts/spades_assembly.py
@@ -0,0 +1,89 @@
import subprocess
import os
import sys
from typing import List

def spades_asssembly(
input_fastq: str,
input_long_reads: str,
output_fasta: str,
output_spades_folder: str,
max_memory: int,
threads: int,
kmer_sizes: List[int],
tmp_dir: str,
long_read_type: str,
log: str):

'''
Assemble short reads and long reads (if any) using spades
:param input_fastq: short reads fastq file
:param input_long_reads: long reads fastq file
:param output_fasta: output fasta file
:param output_spades_folder: output spades folder
:param max_memory: maximum memory to use
:param threads: number of threads
:param tmpdir: temporary directory
:param kmer_sizes: list of kmer sizes
:param long_read_type: type of long reads
:param log: log file
:return:
'''
if tmp_dir:
tmp_dir_arg = f"--tmp-dir {tmp_dir}"
else:
try:
tmp_dir = os.environ["TMPDIR"]
tmp_dir_arg = f"--tmp-dir {tmp_dir}"
except KeyError:
tmp_dir_arg = ""

if os.path.exists("data/spades_assembly/tmp"):
with open(log, 'a') as logf:
subprocess.run("rm -rf data/spades_assembly/tmp".split(), stdout=logf, stderr=subprocess.STDOUT)
# remove existing temporary directory
minimumsize=500000
actualsize = int(subprocess.check_output('stat -c%s data/short_reads.filt.fastq.gz', shell=True))
# check if directory exists
if os.path.exists("data/spades_assembly"):
# resume previous assembly
command = f"spades.py --restart-from last --memory {max_memory} -t {threads} " \
f"-o data/spades_assembly -k {kmer_sizes} {tmp_dir_arg}"
# run cmd
with open(log, 'a') as logf:
logf.write(f"Queueing command {command}\n")
subprocess.run(command.split(), stdout=logf, stderr=subprocess.STDOUT)
subprocess.run("cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta".split(), stdout=logf, stderr=subprocess.STDOUT)
elif actualsize >= minimumsize:
if long_read_type in ["ont","ont_hq"]:
command = f"spades.py --checkpoints all --memory {max_memory} --meta --nanopore {input_long_reads} --12 {input_fastq} "\
f"-o data/spades_assembly -t {threads} -k {kmer_sizes} {tmp_dir_arg} "
else:
command = f"spades.py --checkpoints all --memory {max_memory} --meta --pacbio {input_long_reads} --12 {input_fastq} "\
f"-o data/spades_assembly -t {threads} -k {kmer_sizes} {tmp_dir_arg} "
# run cmd
with open(log, 'a') as logf:
logf.write(f"Queueing command {command}\n")
subprocess.run(command.split(), stdout=logf, stderr=subprocess.STDOUT)
subprocess.run("cp data/spades_assembly/scaffolds.fasta data/spades_assembly.fasta".split(), stdout=logf, stderr=subprocess.STDOUT)
else:
with open(log, 'a') as logf:
subprocess.run(f"mkdir -p {output.spades_folder} && touch {output.fasta}".split(), stdout=logf, stderr=subprocess.STDOUT)


if __name__ == '__main__':
log = snakemake.log[0]
with open(log, 'w') as logf: pass

spades_asssembly(
snakemake.input.fastq,
snakemake.input.long_reads,
snakemake.output.fasta,
snakemake.output.spades_folder,
snakemake.params.max_memory,
snakemake.threads,
snakemake.params.kmer_sizes,
snakemake.params.tmpdir,
snakemake.params.long_read_type,
log
)
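The new script keeps the size gate from the old shell block: if the filtered short reads are under 500 kB, assembly is skipped and empty outputs are touched instead. A sketch of that check using `os.path.getsize` rather than shelling out to `stat` (equivalent behaviour, illustrative only):

```python
import os

MINIMUM_SIZE = 500_000  # bytes, matching minimumsize above

def short_reads_worth_assembling(path: str = "data/short_reads.filt.fastq.gz") -> bool:
    # True when the gzipped, filtered short reads are large enough to assemble.
    return os.path.getsize(path) >= MINIMUM_SIZE
```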
3 changes: 2 additions & 1 deletion aviary/modules/binning/scripts/get_coverage.py
@@ -12,7 +12,8 @@ def get_coverage(
threads: int,
log: str,
):
os.environ["TMPDIR"] = tmpdir
if tmpdir: os.environ["TMPDIR"] = tmpdir

if long_reads != "none" and not os.path.exists("data/long_cov.tsv"):
if long_read_type in ["ont", "ont_hq"]:
coverm_cmd = f"coverm contig -t {threads} -r {input_fasta} --single {' '.join(long_reads)} -p minimap2-ont -m length trimmed_mean variance --bam-file-cache-directory data/binning_bams/ --discard-unmapped --min-read-percent-identity 0.85 --output-file data/long_cov.tsv".split()
5 changes: 3 additions & 2 deletions aviary/modules/processor.py
@@ -92,7 +92,7 @@ def __init__(self,


self.conda_prefix = args.conda_prefix
self.tmpdir = os.path.abspath(args.tmpdir)
self.tmpdir = os.path.abspath(args.tmpdir) if args.tmpdir else None
self.resources = args.resources
self.output = os.path.abspath(args.output)
self.threads = args.max_threads
@@ -444,7 +444,8 @@ def run_workflow(self, cores=16, profile=None, cluster_retries=None,
self._validate_config()

cores = max(int(self.threads), cores)
os.environ["TMPDIR"] = self.tmpdir
if self.tmpdir is not None:
os.environ["TMPDIR"] = self.tmpdir
for workflow in self.workflows:
cmd = (
"snakemake --snakefile {snakefile} --directory {working_dir} "
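Exporting `TMPDIR` only when a value was actually supplied leaves an unset variable unset for the whole run. A short sketch, with a placeholder path, of why mutating `os.environ` in the parent process is enough for the snakemake subprocesses it launches:

```python
import os
import subprocess

tmpdir = "/scratch/tmp"  # stand-in for self.tmpdir
if tmpdir is not None:
    os.environ["TMPDIR"] = tmpdir

# Children started after this point inherit the updated environment:
subprocess.run(["sh", "-c", "echo TMPDIR is ${TMPDIR:-unset}"])
```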
1 change: 0 additions & 1 deletion aviary/modules/strain_analysis/strain_analysis.smk
@@ -65,7 +65,6 @@ rule lorikeet:
params:
mag_extension = config['mag_extension'],
parallel_genomes = 8,
tmpdir = config['tmpdir']
resources:
mem_mb=int(config["max_memory"])*512
threads:
2 changes: 1 addition & 1 deletion config.yaml
@@ -50,4 +50,4 @@ ani: none
precluster_ani: none
precluster_method: none
pggb_params: none
tmpdir: /tmp
tmpdir: none