Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions qp_pacbio/data/resources.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ Remove SynDNA plasmid, insert, & GCF_000184185 reads (minimap2):
syndna:
node_count: 1
nprocs: 16
wall_time_limit: 10:00:00
mem_in_gb: 60
wall_time_limit: 4:00:00
mem_in_gb: 20
max_tasks: 16
finish:
node_count: 1
nprocs: 16
wall_time_limit: 1-00:00:00
mem_in_gb: 120
nprocs: 1
wall_time_limit: 4:00:00
mem_in_gb: 4
52 changes: 39 additions & 13 deletions qp_pacbio/data/templates/syndna.sbatch
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general this file is fine, but I think there's considerable room for optimization via piping. There are several SAM and BAM files that appear to be intermediates and do not really need to be written to disk unless they are needed for a downstream process.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you are right; however, trying to simulate the current available commands and processes. Now, if you see some easy, low hanging fruits, please let me know so I can add them.

Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
#SBATCH -n {{nprocs}}
#SBATCH --time {{wall_time_limit}}
#SBATCH --mem {{mem_in_gb}}G
#SBATCH -o {{output}}/minimap2/logs/%x-%A_%a.out
#SBATCH -e {{output}}/minimap2/logs/%x-%A_%a.err
#SBATCH -o {{output}}/syndna/logs/%x-%A_%a.out
#SBATCH -e {{output}}/syndna/logs/%x-%A_%a.err
#SBATCH --array {{array_params}}

source ~/.bashrc
set -e
{{conda_environment}}
out_folder={{output}}/syndna
mkdir -p
mkdir -p ${out_folder}
cd ${out_folder}
db_folder=/scratch/qp-pacbio/minimap2/syndna/

Expand All @@ -28,23 +28,49 @@ mkdir -p ${out_folder}/filtered/
sn_folder=${out_folder}/bioms/${sample_name}
mkdir -p ${sn_folder}

txt=${sn_folder}/${sample_name}.txt
tsv=${txt/.txt/.tsv}
coverm contig --single $filename --reference ${db_folder}/All_synDNA_inserts.fasta --mapper minimap2-hifi \
--min-read-percent-identity 0.95 --min-read-aligned-percent 0.0 -m mean count --threads {{nprocs}} \
--output-file ${sn_folder}/${sample_name}.txt
cat ${sn_folder}/${sample_name}_insert_counts.txt | sed 's/Contig/\#OTU ID/' | \
sed 's/ Read Count//' > ${sn_folder}/${sample_name}.tsv
biom convert -i ${sn_folder}/${sample_name}.txt -o ${sn_folder}/${sample_name}.biom --to-hdf5
--output-file ${txt}

awk 'BEGIN {FS=OFS="\t"}; {print $1,$3}' ${txt} | \
sed 's/Contig/\#OTU ID/' | sed 's/All_synDNA_inserts.fasta\///' | \
sed 's/ Read Count//' | sed "s/${fn}/${sample_name}/" > ${tsv}

# if counts is zero mark it as missing and stop
counts=`tail -n +2 ${tsv} | awk '{sum += $NF} END {print sum}'`
if [[ "$counts" == "0" ]]; then
echo ${sample_name} > {{output}}/failed_${SLURM_ARRAY_TASK_ID}.log
exit 0
fi

biom convert -i ${tsv} -o ${sn_folder}/syndna.biom --to-hdf5

# removing AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta not coverm
# ---- original commands ----
# minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_plasmid.sam ${db_folder}/AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta $filename
# samtools view -F 4 -@ {{nprocs}} ${out_folder}/${sample_name}_plasmid.sam | awk '{print $1}' | sort -u > ${out_folder}/${sample_name}_plasmid_mapped.txt
# seqkit grep -v -f ${out_folder}/${sample_name}_plasmid_mapped.txt $filename > ${out_folder}/${sample_name}_no_plasmid.fastq
# ---- original commands ----
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_plasmid.sam ${db_folder}/AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta $filename
samtools view -F 4 -@ {{nprocs}} ${out_folder}/${sample_name}_plasmid.sam | awk '{print $1}' | sort -u > ${out_folder}/${sample_name}_plasmid_mapped.txt
seqkit grep -v -f ${out_folder}/${sample_name}_plasmid_mapped.txt $filename > ${out_folder}/${sample_name}_no_plasmid.fastq

# removing GCF_000184185.1_ASM18418v1_genomic_chroso.fna use coverm
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_GCF_000184185.sam ${db_folder}/GCF_000184185.1_ASM18418v1_genomic_chroso.fna ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq
samtools view -bS -@ {{ nprocs/2 | int }} ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq | samtools sort -@ {{ nprocs/2 | int }} -O bam -o ${out_folder}/${sample_name}_GCF_000184185_sorted.sam
coverm filter --bam-files ${out_folder}/${sample_name}_GCF_000184185_sorted.sam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads {{nprocs}} -o ${out_folder}/${sample_name}_GCF_000184185.bam
samtools view -O SAM -o ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam ${out_folder}/${sample_name}_no_inserts.bam
# ---- original commands ----
# minimap2 -x map-hifi -t 8 -a --MD --eqx -o reads.sam ecoli_genome.fna reads.fastq
# samtools view -bS -@ 8 reads.fastq | samtools sort -@ 24 -O bam -o reads.sorted.bam
# coverm filter --bam-files reads.sorted.bam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads 8 -o reads_filtered.sorted.bam
# samtools view -O SAM -o reads_filtered.sam ./reads_filtered.sorted.bam
# awk '{print $1}' reads_filtered.sam > reads_filtered.txt
# seqkit grep -v -f reads_filtered.txt reads.fastq > reads_no_ecoli.fastq
# ---- original commands ----
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_GCF_000184185.sam ${db_folder}/GCF_000184185.1_ASM18418v1_genomic_chroso.fna ${out_folder}/${sample_name}_no_plasmid.fastq
samtools view -bS -@ {{ nprocs/2 | int }} ${out_folder}/${sample_name}_no_plasmid.fastq | samtools sort -@ {{ nprocs/2 | int }} -O bam -o ${out_folder}/${sample_name}_GCF_000184185_sorted.bam
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems wrong. You're running samtools view on a FASTQ file. Shouldn't it be samtools view -bS -@ {{ nprocs/2 | int }} ${out_folder}/${sample_name}_GCF_000184185.sam?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jianshu93, can you comment? I'm not actually sure.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be the SAM file. I think I have the SAM as input in the example commands?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is what @jianshu93 sent:

minimap2 -x map-hifi -t 8 -a --MD --eqx -o reads.sam ecoli_genome.fna reads.fastq
samtools view -bS -@ 8 reads.fastq | samtools sort -@ 24 -O bam -o reads.sorted.bam
coverm filter --bam-files reads.sorted.bam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads 8 -o reads_filtered.sorted.bam
samtools view -O SAM -o reads_filtered.sam ./reads_filtered.sorted.bam
awk '{print $1}' reads_filtered.sam > reads_filtered.txt
seqkit grep -v -f reads_filtered.txt reads.fastq > reads_no_ecoli.fastq

@lucaspatel, could you also check?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think is OK based on the commands sent by @jianshu93. Additionally, I have added as comments the original commands to make sure they are the same and remove checking them in the tests to avoid confusion or extra lines.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting, just tested it on barnacle2 and this really does seem to work:
head -n 100 15814.PSQ.0044_no_plasmid.fastq | samtools view -bS -@ 4 - | samtools sort -@ 4 -O sam

Good with me then

coverm filter --bam-files ${out_folder}/${sample_name}_GCF_000184185_sorted.bam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads {{nprocs}} -o ${out_folder}/${sample_name}_GCF_000184185.bam
samtools view -O SAM -o ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam ${out_folder}/${sample_name}_GCF_000184185.bam
awk '{print $1}' ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam > ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt
seqkit grep -v -f ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt ${out_folder}/${sample_name}_GCF_000184185.fastq | gz > ${out_folder}/filtered/${fn}
awk 'BEGIN {FS=OFS="\t"}; {print $1,$3}'
seqkit grep -v -f ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt ${out_folder}/${sample_name}_no_plasmid.fastq | gzip > ${out_folder}/filtered/${fn}

touch {{output}}/completed_${SLURM_ARRAY_TASK_ID}.log
6 changes: 3 additions & 3 deletions qp_pacbio/data/templates/syndna_finish.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
#SBATCH -n {{nprocs}}
#SBATCH --time {{wall_time_limit}}
#SBATCH --mem {{mem_in_gb}}G
#SBATCH -o {{output}}/merge/logs/%x-%A_%a.out
#SBATCH -e {{output}}/merge/logs/%x-%A_%a.err
#SBATCH -o {{output}}/finish/logs/%x-%A_%a.out
#SBATCH -e {{output}}/finish/logs/%x-%A_%a.err

source ~/.bashrc
set -e
{{conda_environment}}
cd {{output}}/

biom_merge_pacbio --base {{output}} --type syndna
biom_merge_pacbio --base {{output}}/syndna --merge-type syndna

# find {{output}}/coverages/ -iname "*.cov" > {{output}}/cov_files.txt
# micov consolidate --paths {{output}}/cov_files.txt --lengths ${len_map} --output {{output}}/coverages.tgz
Expand Down
2 changes: 1 addition & 1 deletion qp_pacbio/data/templates/woltka_minimap2_merge.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ for f in `ls bioms/*/per-gene.biom`; do
done | parallel --halt now,fail=1 -j {{nprocs}}
wait

biom_merge_pacbio --base {{output}} --type woltka
biom_merge_pacbio --base {{output}} --merge-type woltka

find {{output}}/coverages/ -iname "*.cov" > {{output}}/cov_files.txt
micov consolidate --paths {{output}}/cov_files.txt --lengths ${len_map} --output {{output}}/coverages.tgz
Expand Down
32 changes: 25 additions & 7 deletions qp_pacbio/qp_pacbio.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,9 @@ def generate_sample_list(qclient, artifact_id, out_dir):
with open(out_fp, "w", encoding="utf-8") as f:
f.write("\n".join(lines))

preparation_information = join(out_dir, "prep_info.tsv")
prep.set_index("sample_name").to_csv(preparation_information, sep="\t")

return len(lines)


Expand Down Expand Up @@ -454,11 +457,26 @@ def syndna_processing(qclient, job_id, parameters, out_dir):

errors = []
ainfo = []
fp_biom = f"{out_dir}/syndna.biom"

failures = glob(f"{out_dir}/failed_*.log")
if failures:
errors.append("Samples failed: ")
for f in failures:
with open(f, "r") as fp:
errors.append(fp.read())
return False, ainfo, "\n".join(errors)

completed = len(glob(f"{out_dir}/completed_*.log"))
with open(f"{out_dir}/sample_list.txt") as fp:
samples = len(fp.readlines())

if completed != samples:
errors.append(f"There are {samples - completed} missing samples.")

fp_biom = f"{out_dir}/syndna/syndna.biom"
# do we need to stor alignments?
# fp_alng = f'{out_dir}/sams/final/alignment.tar'

if exists(fp_biom): # and exists(fp_alng):
if not errors and exists(fp_biom): # and exists(fp_alng):
# if we got to this point a preparation file should exist in
# the output folder
prep = pd.read_csv(f"{out_dir}/prep_info.tsv", index_col=None, sep="\t")
Expand Down Expand Up @@ -492,7 +510,7 @@ def syndna_processing(qclient, job_id, parameters, out_dir):
"contact qiita.help@gmail.com for more information"
)

fp_seqs = f"{out_dir}/filtered"
fp_seqs = f"{out_dir}/syndna/filtered"
reads = []
for f in glob(f"{fp_seqs}/*.fastq.gz"):
reads.append((f, "raw_forward_seqs"))
Expand Down Expand Up @@ -558,7 +576,7 @@ def generate_syndna_processing(qclient, job_id, out_dir, parameters, url):
"mem_in_gb": step_resources["mem_in_gb"],
"array_params": f"1-{njobs}%{step_resources['max_tasks']}",
}
minimap2_script = _write_slurm(f"{out_dir}/minimap2", m2t, **params)
minimap2_script = _write_slurm(f"{out_dir}/syndna", m2t, **params)

m2mt = JGT("syndna_finish.sbatch")
step_resources = resources["finish"]
Expand All @@ -570,6 +588,6 @@ def generate_syndna_processing(qclient, job_id, out_dir, parameters, url):
"mem_in_gb": step_resources["mem_in_gb"],
"url": url,
}
minimap2_merge_script = _write_slurm(f"{out_dir}/merge", m2mt, **params)
minimap2_finish_script = _write_slurm(f"{out_dir}/finish", m2mt, **params)

return minimap2_script, minimap2_merge_script
return minimap2_script, minimap2_finish_script
41 changes: 22 additions & 19 deletions qp_pacbio/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
import enum
from glob import glob
from os import makedirs
from os.path import join
Expand All @@ -21,6 +20,7 @@
PACBIO_PROCESSING_STEPS,
generate_minimap2_processing,
generate_sample_list,
generate_syndna_processing,
pacbio_generate_templates,
)
from qp_pacbio.util import client_connect
Expand Down Expand Up @@ -57,18 +57,23 @@ def execute(url, job_id, output_dir):
command = job_info["command"]
artifact_id = parameters["artifact"]

if command == "Woltka v0.1.7, minimap2":
main_fp, merge_fp = generate_minimap2_processing(
regular_commands = {
"Woltka v0.1.7, minimap2": generate_minimap2_processing,
"Remove SynDNA plasmid, insert, & GCF_000184185 reads (minimap2)": generate_syndna_processing,
}

if command in regular_commands.keys():
first_fp, second_fp = regular_commands[command](
qclient, job_id, output_dir, parameters, url
)

# Submitting jobs and returning id
main_job = run(["sbatch", main_fp], stdout=PIPE)
main_job_id = main_job.stdout.decode("utf8").split()[-1]
cmd = ["sbatch", "-d", f"afterok:{main_job_id}", merge_fp]
merge_job = run(cmd, stdout=PIPE)
merge_job_id = merge_job.stdout.decode("utf8").split()[-1]
print(f"{main_job_id}, {merge_job_id}")
first_job = run(["sbatch", first_fp], stdout=PIPE)
first_job_id = first_job.stdout.decode("utf8").split()[-1]
cmd = ["sbatch", "-d", f"afterok:{first_job_id}", second_fp]
second_job = run(cmd, stdout=PIPE)
second_job_id = second_job.stdout.decode("utf8").split()[-1]
print(f"{first_job_id}, {second_job_id}")
qclient.update_job_step(job_id, "Step 2 of 4: Aligning sequences")
elif command == "PacBio processing":
frp = join(output_dir, "results")
Expand Down Expand Up @@ -151,22 +156,20 @@ def _biom_merge(tables):
return full


class BIOMMergeOptions(enum.Enum):
SYNDNA = enum.auto()
WOLTKA = enum.auto()


@click.command()
@click.option("--base", type=click.Path(exists=True), required=True)
@click.option("--type", type=click.Choice(BIOMMergeOptions, case_sensitive=False))
def biom_merge(base, type: BIOMMergeOptions):
@click.option(
"--merge-type", type=click.Choice(["syndna", "woltka"], case_sensitive=False)
)
def biom_merge(base, merge_type):
"""Merges all PacBio biom tables"""
if type == BIOMMergeOptions.SYNDNA:
merge_type = merge_type.lower()
if merge_type == "syndna":
ranks = ["syndna"]
elif type == BIOMMergeOptions.WOLTKA:
elif merge_type == "woltka":
ranks = ["none", "per-gene", "ko", "ec", "pathway"]
else:
raise ValueError(f"Type '{type}' not supported")
raise ValueError(f"Type '{merge_type}' not supported")

for rank in ranks:
rank = rank + ".biom"
Expand Down
Loading
Loading