From 09dc316822592105d5f595c6a45e1d0856ac9b4e Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 16:43:27 -0700 Subject: [PATCH 01/47] initial add --- sequence_processing_pipeline/TRConvertJob.py | 212 +++++++++++++ .../templates/cloudspades-isolate.sbatch | 110 +++++++ .../templates/cloudspades.sbatch | 115 ++++++++ .../templates/integrate.sbatch | 120 ++++++++ .../templates/telllink-isolate.sbatch | 60 ++++ .../templates/telllink.sbatch | 61 ++++ .../templates/tellread-cleanup.sbatch | 19 ++ .../templates/tellread.sbatch | 105 +++++++ .../templates/tellread.sh | 279 ++++++++++++++++++ 9 files changed, 1081 insertions(+) create mode 100644 sequence_processing_pipeline/TRConvertJob.py create mode 100644 sequence_processing_pipeline/templates/cloudspades-isolate.sbatch create mode 100644 sequence_processing_pipeline/templates/cloudspades.sbatch create mode 100644 sequence_processing_pipeline/templates/integrate.sbatch create mode 100644 sequence_processing_pipeline/templates/telllink-isolate.sbatch create mode 100644 sequence_processing_pipeline/templates/telllink.sbatch create mode 100644 sequence_processing_pipeline/templates/tellread-cleanup.sbatch create mode 100644 sequence_processing_pipeline/templates/tellread.sbatch create mode 100755 sequence_processing_pipeline/templates/tellread.sh diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py new file mode 100644 index 00000000..5d277609 --- /dev/null +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -0,0 +1,212 @@ +from os.path import join, exists +from sequence_processing_pipeline.Job import Job +from sequence_processing_pipeline.PipelineError import (PipelineError, + JobFailedError) +import logging +import re + + +class TRConvertJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, + modules_to_load, qiita_job_id): + """ + TRConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param nprocs: The maximum number of parallel processes to use. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param bcl_tool_path: The path to either bcl2fastq or bcl-convert. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + """ + super().__init__(run_dir, + output_path, + 'TRConvertJob', + [bcl_tool_path], + 1000, + modules_to_load=modules_to_load) + + # for metagenomics pipelines, sample_sheet_path will reflect a real + # sample_sheet file. For amplicon pipelines, sample_sheet_path will + # reference a dummy sample_sheet file. 
+ self.sample_sheet_path = sample_sheet_path + self.queue_name = queue_name + self.node_count = node_count + self.nprocs = nprocs + self.wall_time_limit = wall_time_limit + self.pmem = pmem + self.bcl_tool = bcl_tool_path + self.qiita_job_id = qiita_job_id + self.job_script_path = join(self.output_path, f"{self.job_name}.sh") + self.suffix = 'fastq.gz' + + tmp = False + for executable_name in ['bcl2fastq', 'bcl-convert']: + if executable_name in self.bcl_tool: + tmp = True + break + + if not tmp: + raise PipelineError(f'{self.bcl_tool} is not the path to a known' + 'executable') + + self._file_check(self.sample_sheet_path) + + # As the sample-sheet is validated by the Pipeline object before + # being passed to TRConvertJob, additional validation isn't needed. + + self._generate_job_script() + + def _generate_job_script(self): + """ + Generate a Torque job script for processing supplied root_directory. + :return: The path to the newly-created job-script. + """ + lines = [] + + lines.append("#!/bin/bash") + lines.append(f"#SBATCH --job-name {self.qiita_job_id}_{self.job_name}") + lines.append(f"#SBATCH -p {self.queue_name}") + lines.append(f'#SBATCH -N {self.node_count}') + lines.append(f'#SBATCH -n {self.nprocs}') + lines.append("#SBATCH --time %d" % self.wall_time_limit) + + # send an email to the list of users defined below when a job starts, + # terminates, or aborts. This is used to confirm that the package's + # own reporting mechanism is reporting correctly. + lines.append("#SBATCH --mail-type=ALL") + + # list of users to be contacted independently of this package's + # notification system, when a job starts, terminates, or gets aborted. + lines.append("#SBATCH --mail-user qiita.help@gmail.com") + + lines.append(f"#SBATCH --mem-per-cpu {self.pmem}") + + lines.append("set -x") + lines.append('date') + lines.append('hostname') + lines.append(f'cd {self.root_dir}') + + if self.modules_to_load: + lines.append("module load " + ' '.join(self.modules_to_load)) + + # Assume that the bcl-convert tool is named 'bcl-convert' and choose + # accordingly. + if 'bcl-convert' in self.bcl_tool: + lines.append(('%s ' + '--sample-sheet "%s" ' + '--output-directory %s ' + '--bcl-input-directory . ' + '--bcl-num-decompression-threads 16 ' + '--bcl-num-conversion-threads 16 ' + '--bcl-num-compression-threads 16 ' + '--bcl-num-parallel-tiles 16 ' + '--bcl-sampleproject-subdirectories true ' + '--force') % (self.bcl_tool, + self.sample_sheet_path, + self.output_path)) + + # equivalent cp for bcl-conversion (see below) needed. + else: + lines.append(('%s ' + '--sample-sheet "%s" ' + '--minimum-trimmed-read-length 1 ' + '--mask-short-adapter-reads 1 ' + '-R . ' + '-o %s ' + '--loading-threads 16 ' + '--processing-threads 16 ' + '--writing-threads 16 ' + '--create-fastq-for-index-reads ' + '--ignore-missing-positions ') % + (self.bcl_tool, + self.sample_sheet_path, + self.output_path)) + + with open(self.job_script_path, 'w') as f: + for line in lines: + # remove long spaces in some lines. + line = re.sub(r'\s+', ' ', line) + f.write(f"{line}\n") + + def run(self, callback=None): + """ + Run BCL2Fastq/BCLConvert conversion + :param callback: optional function taking two parameters (id, status) + that is called when a running process's status is + changed. 
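+ A minimal sketch of such a callback (illustrative only, not
+ part of this module): callback=lambda jid, status: logging.info(f'{jid}: {status}')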
+ :return: + """ + try: + job_info = self.submit_job(self.job_script_path, + exec_from=self.log_path, + callback=callback) + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.info(f'Successful job: {job_info}') + + def parse_logs(self): + log_path = join(self.output_path, 'Logs') + errors = join(log_path, 'Errors.log') + + msgs = [] + + if not exists(errors): + # we do not raise an Error in this case because it's expected that + # parse_logs() will be called in response to an exceptional + # condition. + msgs.append(f"'{errors} does not exist") + + with open(errors, 'r') as f: + lines = f.readlines() + for line in [x.strip() for x in lines]: + msgs.append(line) + + return msgs + + @staticmethod + def parse_job_script(job_script_path): + # Returns run-directory and sample-sheet path from a job-script. + + if not exists(job_script_path): + raise ValueError(f"'{job_script_path}' is not a valid path") + + with open(job_script_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + + # As this code creates this file, we can expect it to be of a certain + # format. + if lines[0] != '#!/bin/bash': + raise ValueError(f"'{job_script_path}' is not a valid path") + + result = {} + + m = re.match('^cd (.*)$', lines[12]) + + if m: + result['run_directory'] = m.group(1) + else: + raise ValueError("could not detect run_directory in " + f"'{job_script_path}'") + + m = re.match('^bcl-convert --sample-sheet "(.*?)" ', lines[14]) + + if m: + result['sample_sheet_path'] = m.group(1) + else: + raise ValueError("could not detect sample-sheet path in " + f"'{job_script_path}'") + + return result diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch new file mode 100644 index 00000000..5d0e5015 --- /dev/null +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -0,0 +1,110 @@ +#!/bin/bash -l +#SBATCH -J cs-assemble +#SBATCH --time 24:00:00 +#SBATCH --mem 64gb +#SBATCH -N 1 +#SBATCH -c 12 +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH -p qiita + +### --gres=gpu:1 +source activate qiime2-2023.5 +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e + +echo $TMPDIR + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=${OUTPUT} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +mamba activate activate qiime2-2023.5 +module load gcc_9.3.0 + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) + +# assumes 1-based array index, eg --array 1-N +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +cs=${base}/cloudspades-isolate/${sample} + +if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${cs} ]]; then + rm -fr ${cs} + fi +fi + +#acs=${base}/cloudspades/${sample}-ariadne +#acscs=${acs}/assembled +mkdir -p ${cs} +#mkdir -p ${acs} +#mkdir -p ${acscs} + +pushd ~/spades-cloudspades-paper/assembler/ +./spades.py \ + -o ${cs} \ + --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 +module unload gcc_9.3.0 +popd + +mamba activate quast +quast \ + -o ${cs}/quast-scaffolds \ + -t ${SLURM_JOB_CPUS_PER_NODE} \ + ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 + +# remove intermediates that currently dont have a downstream use +if [[ -d ${cs}/K21 ]]; then + rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +fi + +#pushd $HOME/2023.08.29-ariadne/ariadne +#mamba activate ariadne-gcc8.5.0 +#OLD_LDD_LIBRARY_PATH=${LDD_LIBRARY_PATH} +#export LDD_LIBRARY_PATH=${HOME}/miniconda3/envs/ariadne-gcc8.5.0/include/:${LDD_LIBRARY_PATH} +## parameters from Lauren Mak, 9.25.23 +#./spades.py \ +# -o ${acs} \ +# --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ +# --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ +# --assembly-graph ${cs}/assembly_graph_with_scaffolds.gfa \ +# --meta \ +# --only-assembler \ +# --search-distance 5000 --size-cutoff 6 -k 55 \ +# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acs}/stdoutstderr.log 2>&1 +#export LDD_LIBRARY_PATH=${OLD_LDD_LIBRARY_PATH} +#popd +# +#pushd ~/spades-cloudspades-paper/assembler/ +#module load gcc_9.3.0 +#./spades.py \ +# -o ${acscs} \ +# --gemcode1-1 ${acs}/K55/5000.R1.fastq \ +# --gemcode1-2 ${acs}/K55/5000.R2.fastq \ +# --meta \ +# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acscs}/stdoutstderr.log 2>&1 +#module unload gcc_9.3.0 +#popd +# +#mamba activate quast +#quast \ +# -o ${acscs}/quast-scaffolds \ +# -t ${SLURM_JOB_CPUS_PER_NODE} \ +# ${acscs}/scaffolds.fasta > ${acscs}/quast-stdoutstderr.log 2>&1 diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch new file mode 100644 index 00000000..fbc30ae2 --- /dev/null +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -0,0 +1,115 @@ +#!/bin/bash -l +#SBATCH -J cs-assemble +#SBATCH --time 24:00:00 +#SBATCH --mem 128gb +#SBATCH -N 1 +#SBATCH -c 12 +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH -p qiita + +### --gres=gpu:1 +source activate qiime2-2023.5 +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e + +echo $TMPDIR + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=${OUTPUT} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +mamba activate activate qiime2-2023.5 +module load gcc_9.3.0 + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) + +# assumes 1-based array index, eg --array 1-N +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +cs=${base}/cloudspades/${sample} + +if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${cs} ]]; then + rm -fr ${cs} + fi +fi + +#acs=${base}/cloudspades/${sample}-ariadne +#acscs=${acs}/assembled +mkdir -p ${cs} +#mkdir -p ${acs} +#mkdir -p ${acscs} + +#pushd ~/spades-cloudspades-paper/assembler/ +#pushd /home/mcdonadt/cloudspades-0.1/spades-cloudspades-0.1/assembler/bin +#pushd /home/qiita/CHARLIE/TELLREAD/spades-cloudspades-0.1/assembler/bin +pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin + +./spades.py \ + -o ${cs} \ + --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + --meta \ + -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 +module unload gcc_9.3.0 +popd + +mamba activate quast +quast \ + -o ${cs}/quast-scaffolds \ + -t ${SLURM_JOB_CPUS_PER_NODE} \ + ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 + +# remove intermediates that currently dont have a downstream use +if [[ -d ${cs}/K21 ]]; then + rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +fi + +#pushd $HOME/2023.08.29-ariadne/ariadne +#mamba activate ariadne-gcc8.5.0 +#OLD_LDD_LIBRARY_PATH=${LDD_LIBRARY_PATH} +#export LDD_LIBRARY_PATH=${HOME}/miniconda3/envs/ariadne-gcc8.5.0/include/:${LDD_LIBRARY_PATH} +## parameters from Lauren Mak, 9.25.23 +#./spades.py \ +# -o ${acs} \ +# --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ +# --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ +# --assembly-graph ${cs}/assembly_graph_with_scaffolds.gfa \ +# --meta \ +# --only-assembler \ +# --search-distance 5000 --size-cutoff 6 -k 55 \ +# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acs}/stdoutstderr.log 2>&1 +#export LDD_LIBRARY_PATH=${OLD_LDD_LIBRARY_PATH} +#popd +# +#pushd ~/spades-cloudspades-paper/assembler/ +#module load gcc_9.3.0 +#./spades.py \ +# -o ${acscs} \ +# --gemcode1-1 ${acs}/K55/5000.R1.fastq \ +# --gemcode1-2 ${acs}/K55/5000.R2.fastq \ +# --meta \ +# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acscs}/stdoutstderr.log 2>&1 +#module unload gcc_9.3.0 +#popd +# +#mamba activate quast +#quast \ +# -o ${acscs}/quast-scaffolds \ +# -t ${SLURM_JOB_CPUS_PER_NODE} \ +# ${acscs}/scaffolds.fasta > ${acscs}/quast-stdoutstderr.log 2>&1 diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch new file mode 100644 index 00000000..4d7af5aa --- /dev/null +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -0,0 +1,120 @@ +#!/bin/bash -l +#SBATCH -J integrate +#SBATCH --time 24:00:00 +#SBATCH --mem 8gb +#SBATCH -N 1 +#SBATCH -c 1 +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH -p qiita + +source activate rust +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +# https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html +cores=${SLURM_CPUS_PER_TASK} + +if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then + echo "Not operating in an array" + exit 1 +fi + +if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then + echo "Line extraction assumes 1-based index" + exit 1 +fi + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +if [[ -z ${BASE} ]]; then + echo "BASE not specified" + exit 1 +fi + +tellread=${OUTPUT} +if [[ ! 
-d ${tellread} ]]; then + echo "${tellread} not found" + exit 1 +fi + +set -x +set -e +set -o pipefail + +samples=($(cat ${tellread}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR=$(mktemp -d) +function cleanup { + echo "Removing $TMPDIR" + rm -r $TMPDIR + unset TMPDIR +} +trap cleanup EXIT + +files=${TMPDIR}/integration.files +/bin/ls -1 ${tellread}/Full/*corrected.err_barcode_removed.fastq > ${files} +mkdir -p ${tellread}/integrated + +if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R1" + exit 1 +fi + +if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R2" + exit 1 +fi + +if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} I1" + exit 1 +fi + +r1=$(grep -m 1 "_R1_${sample}" ${files}) +r2=$(grep -m 1 "_R2_${sample}" ${files}) +i1=$(grep -m 1 "_I1_${sample}" ${files}) +r1out=${tellread}/integrated/${sample}.R1.fastq.gz +r2out=${tellread}/integrated/${sample}.R2.fastq.gz +i1out=${tellread}/integrated/${sample}.I1.fastq.gz + +if [[ ! -s ${r1} ]]; then + echo "${r1} is empty, cannot integrate" + if [[ -s ${r2} ]]; then + echo "R1 and R2 are inconsistent" + exit 1 + fi + if [[ -s ${i1} ]]; then + echo "R1 and I1 are inconsistent" + exit 1 + fi + + # reflect the empties so Qiita can know of them + touch ${r1out} + touch ${r2out} + touch ${i1out} + exit 0 +fi + +# this can probably be backgrounded but then you have to get creative to +# not mask a nonzero exit status (e.g., the python process raising) +cat ${i1} | gzip > ${i1out} + +mamba activate tellread-integrate +#python ${BASE}/integrate-indices-np.py integrate \ +python ${BASE}/integrate-indices-np.py integrate \ + --no-sort \ + --r1-in ${r1} \ + --r2-in ${r2} \ + --i1-in ${i1} \ + --r1-out ${r1out} \ + --r2-out ${r2out} \ + --threads ${cores} diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch new file mode 100644 index 00000000..9f778757 --- /dev/null +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -0,0 +1,60 @@ +#!/bin/bash -l +#SBATCH --mem 160G +#SBATCH -N 1 +#SBATCH -c 16 +#SBATCH -t 96:00:00 +#SBATCH -J tellink-isolate +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH -p qiita + +set -x +set -e + +module load singularity_3.6.4 + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +#base=/panfs/dtmcdonald/${LABELTAG} +base=/panfs/qiita/TELLREAD/${LABELTAG} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +k=79 +lc=35 +cores=${SLURM_CPUS_PER_TASK} + +tl=${base}/tell-link-isolate/${sample} +if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${tl} ]]; then + rm -fr ${tl} + fi +fi + +mkdir -p ${tl} + +HOME_PATH=/projects/long_read_collab/code/tellseq/release_v1.11/ +${HOME_PATH}/tellink-release/run_tellink_sing.sh \ + -r1 ${base}/integrated/${sample}.R1.fastq.gz \ + -r2 ${base}/integrated/${sample}.R2.fastq.gz \ + -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -k ${k} \ + -lc ${lc} \ + -p ${sample} \ + -j ${cores} + +# remove temporary data +if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +fi diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch new file mode 100644 index 00000000..64a69072 --- /dev/null +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -0,0 +1,61 @@ +#!/bin/bash -l +#SBATCH --mem 160G +#SBATCH -N 1 +#SBATCH -c 16 +#SBATCH -t 96:00:00 +#SBATCH -J tellink +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH -p qiita + +set -x +set -e + +module load singularity_3.6.4 + +if [[ -z "${LABELTAG}" ]]; then + echo "LABEL is not specified" + exit 1 +fi + +base=/panfs/${USER}/${LABELTAG} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +k=79 +lc=35 +cores=${SLURM_CPUS_PER_TASK} + +tl=${base}/tell-link/${sample} +if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${tl} ]]; then + rm -fr ${tl} + fi +fi + +mkdir -p ${tl} + +HOME_PATH=/projects/long_read_collab/code/tellseq/release_v1.11/ +${HOME_PATH}/tellink-release/run_tellink_sing.sh \ + -r1 ${base}/integrated/${sample}.R1.fastq.gz \ + -r2 ${base}/integrated/${sample}.R2.fastq.gz \ + -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -d metagenomics \ + -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -k ${k} \ + -lc ${lc} \ + -p ${sample} \ + -j ${cores} + +# remove temporary data +if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +fi + diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch new file mode 100644 index 00000000..a8808822 --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -0,0 +1,19 @@ +#!/bin/bash -l +#SBATCH -J cleanup +#SBATCH --time 24:00:00 +#SBATCH --mem 8gb +#SBATCH -N 1 +#SBATCH -c 1 +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH --output %x-%A.out +#SBATCH --error %x-%A.err +#SBATCH -p qiita + +if [[ -z "${OUTPUT}" ]]; then + echo "OUTPUT is not specified" + exit 1 +fi + +# remove unused large outputs +rm -fr ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch new file mode 100644 index 00000000..be5ef9e7 --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -0,0 +1,105 @@ +#!/bin/bash -l + +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH --mem 16G +#SBATCH --partition=short +#SBATCH -t 96:00:00 +#SBATCH -J tellread +#SBATCH --output 
%x-%A.out +#SBATCH --error %x-%A.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=BEGIN,FAIL +#SBATCH -p qiita + +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x + +if [[ -z "${N_SAMPLES}" ]]; then + echo "N_SAMPLES is not specified" + exit 1 +fi + +if [[ -z "${SEQRUNPATH}" ]]; then + echo "SEQRUNPATH is not specified" + exit 1 +fi + +if [[ -z "${LANE}" ]]; then + echo "LANE is not specified" + exit 1 +fi + +if [[ -z "${SAMPLES}" ]]; then + echo "SAMPLES is not specified" + exit 1 +fi + +if [[ -z "${REFS}" ]]; then + echo "REFS is not specified" + exit 1 +fi + +if [[ -z "${OUTPUT}" ]]; then + echo "OUTPUT is not specified" + exit 1 +fi + +export TMPDIR=/panfs/${USER}/tmp +mkdir -p ${TMPDIR} +export TMPDIR=$(mktemp -d) +seqrun_path=${SEQRUNPATH} + +if [[ ${LANE} == "L001" ]]; then + lane=s_1 +elif [[ ${LANE} == "L002" ]]; then + lane=s_2 +elif [[ ${LANE} == "L003" ]]; then + lane=s_3 +elif [[ ${LANE} == "L004" ]]; then + lane=s_4 +elif [[ ${LANE} == "L005" ]]; then + lane=s_5 +elif [[ ${LANE} == "L006" ]]; then + lane=s_6 +elif [[ ${LANE} == "L007" ]]; then + lane=s_7 +elif [[ ${LANE} == "L008" ]]; then + lane=s_8 +else + echo "Unrecognized lane: ${LANE}" + exit 1 +fi + +# yes, hard coded, not great but progress. +extra="" +if [[ ! -z ${REFBASE} ]]; then + extra="-f ${REFBASE}" +fi + +mkdir -p ${OUTPUT} + +module load singularity_3.6.4 +$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ + -i ${seqrun_path} \ + -o ${OUTPUT} \ + -s $(echo ${SAMPLES} | tr -d '"') \ + -g $(echo ${REFS} | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} \ + ${extra} \ + -l ${lane} + + +if [[ -d ${OUTPUT}/Full ]]; then + echo "Run appears successful" +elif [[ -d ${OUTPUT}/1_demult/Full ]]; then + echo "Run appears unsuccessful but has output" + exit 1 +else + echo "Run appears unsuccessful" + exit 1 +fi diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh new file mode 100755 index 00000000..78b8862a --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +script_name=${0##*/} + +function help () { + echo "Submit for TELL-read" + echo "" + echo "Usage: ${script_name} -s -l [-r reference_map] [-b reference_base]" + echo "" + echo -e "\t-s\tPath to the sequencing run." + echo -e "\t-i\tThe sample sheet." + echo -e "\t-l\tThe lane to process." + echo -e "\t-r\tA file specifying reference genomes to use [OPTIONAL]" + echo -e "\t-b\tReference genome base directory [OPTIONAL]" + echo -e "\t-m\tMode, isolate or metagenomic [OPTIONAL]" + echo "" +} + +# references right now are only used for techdev + +# derived from https://www.redhat.com/sysadmin/arguments-options-bash-scripts +while getopts "hs:i:l:r:b:m:" option; do + case ${option} in + h) + help + exit;; + s) seqrunpath=${OPTARG};; + l) lane=${OPTARG};; + r) reference_map=${OPTARG};; + b) reference_base=${OPTARG};; + m) mode=${OPTARG};; + \?) + echo "Error: Invalid option" + exit;; + *) + echo "Error: Invalid option" + exit;; + esac +done + +# nifty +# https://unix.stackexchange.com/a/621007 +: ${seqrunpath:?Missing -s} +: ${lane:?Missing -i} + +if [[ ! -z ${reference_map} || ! -z ${reference_base} ]]; then + if [[ -z ${reference_map} ]]; then + echo "-b used without -r" + exit 1 + fi + if [[ -z ${reference_base} ]]; then + echo "-r used without -b" + exit 1 + fi + if [[ ! 
-d ${reference_base} ]]; then + echo "reference base not found" + exit 1 + fi + + tag=reference-based +else + tag=reference-free +fi + +samplesheet="/home/qiita_test/qiita-spots/tellread_mapping.csv" + +# trim trailing slash +# https://stackoverflow.com/a/32845647/19741 +safepath=$(echo ${seqrunpath} | sed 's:/*$::') +label=$(basename ${safepath}) +labeltag=${label}-${tag} +output=/panfs/${USER}/${labeltag} + +if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then + echo "Cannot access the lane" + exit 1 +fi + +if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then + echo "FOO" + sbatch_cores=2 + sbatch_mem=8G + norm=TRUE + wall=24:00:00 + mode=NA +elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then + echo "BAR" + sbatch_cores=2 + sbatch_mem=8G + norm=TRUE + wall=24:00:00 + mode=NA +else + echo "BAZ" + sbatch_cores=16 + sbatch_mem=160G + norm=FALSE + assemble=TRUE + wall=48:00:00 +fi + +if [[ ${mode} == "isolate" ]]; then + ISOLATE_MODE=TRUE +elif [[ ${mode} == "metagenomic" ]]; then + ISOLATE_MODE=FALSE +elif [[ ${mode} == "NA" ]]; then + ISOLATE_MODE=FALSE +else + echo "unknown mode: ${mode}" + exit 1 +fi + +set -e +set -o pipefail + +declare -a s +declare -a g +# below extended regex might be broken because C5\d\d happens in column 0, not column 1 +# of the hacked sample-sheet. +#for sample in $(egrep -o ",C5[0-9][0-9]," ${samplesheet} | tr -d "," | sort) +for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) +do + echo "SAMPLE: ${sample}" + # get references if they exist + if [[ -f ${reference_map} ]]; then + if $(grep -Fq ${sample} ${reference_map}); then + ref=$(grep -m 1 ${sample} ${reference_map} | cut -f 2 -d"," | tr -d "\n") + if [[ ${ref} != "NONE" ]]; then + if [[ ! -d "${reference_base}/${ref}" ]]; then + echo "${reference_base}/${ref}" + echo "${ref} not found" + exit 1 + fi + g[${#g[@]}]=${ref} + s[${#s[@]}]=${sample} + fi + fi + else + g[${#g[@]}]=NONE + s[${#s[@]}]=${sample} + fi +done +n_samples=${#s[@]} + +echo "Submitting:" +echo "S: ${s[@]}" +echo "G: ${g[@]}" + +# https://stackoverflow.com/a/17841619/19741 +function join_by { local IFS="$1"; shift; echo "$*"; } +s=$(join_by , "${s[@]}") +g=$(join_by , "${g[@]}") + +base=$(dirname ${0}) +submit_script=$(dirname ${0})/tellread.sbatch +integrate_script=$(dirname ${0})/integrate.sbatch +norm_script=$(dirname ${0})/compute_sequence_counts_for_normalization.sbatch +asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch +clean_script=$(dirname ${0})/tellread-cleanup.sbatch + +if [[ ${ISOLATE_MODE} == "TRUE" ]]; then + asm_tellink_script=$(dirname ${0})/telllink-isolate.sbatch + asm_cloudspades_script=$(dirname ${0})/cloudspades-isolate.sbatch +else + asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch + asm_tellink_script=$(dirname ${0})/telllink.sbatch +fi + +if [[ ! -f ${submit_script} ]]; then + echo "Cannot access submit script" + exit 1 +fi +if [[ ! -f ${asm_cloudspades_script} ]]; then + echo "Cannot access cloudspades assembly script" + exit 1 +fi +if [[ ! -f ${asm_tellink_script} ]]; then + echo "Cannot access tell-link assembly script" + exit 1 +fi +if [[ ! -f ${integrate_script} ]]; then + echo "Cannot access integrate script" + exit 1 +fi +if [[ ! 
-f ${clean_script} ]]; then + echo "Cannot access clean script" + exit 1 +fi + +datetag=$(date "+%Y.%m.%d") +scriptcopy=$(pwd)/tellread_script-${datetag}.sh +submitcopy=$(pwd)/tellread_submission-${datetag}.sbatch +asmcscopy=$(pwd)/assembly_submission_cloudspades-${datetag}.sbatch +asmtlcopy=$(pwd)/assembly_submission_tell-link-${datetag}.sbatch +normcopy=$(pwd)/norm_submission-${datetag}.sbatch +intcopy=$(pwd)/integrate_submission-${datetag}.sbatch +cleancopy=$(pwd)/tellread-cleanup-${datetag}.sbatch +arguments=$(pwd)/provided_script_arguments.txt +if [[ -f ${scriptcopy} ]]; then + echo "Existing script copy ${scriptcopy} found, not overwriting, delete to resubmit" + exit 1 +fi +if [[ -f ${submitcopy} ]]; then + echo "Existing submission ${submitcopy} found, not overwriting, delete to resubmit" + exit 1 +fi + +# CHARLIE +echo $@ > ${arguments} +cp ${0} ${scriptcopy} +cp ${submit_script} ${submitcopy} +cp ${asm_cloudspades_script} ${asmcscopy} +cp ${asm_tellink_script} ${asmtlcopy} +cp ${integrate_script} ${intcopy} +cp ${clean_script} ${cleancopy} +chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cleancopy} + +set -x + +echo "C" + +trjob=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag} \ + -c ${sbatch_cores} \ + --mem ${sbatch_mem} \ + --time ${wall} \ + --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ + ${submit_script}) + +echo "D" + +if [[ ${norm} == "TRUE" ]]; then + cp ${norm_script} ${normcopy} + chmod gou-w ${normcopy} + norm_counts_job=$(sbatch \ + --parsable \ + --dependency=afterok:${trjob} \ + -J ${labeltag}-${datetag}-norm-counts \ + --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ + ${norm_script}) +fi + +echo "E" +integrate_job=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag}-integrate \ + --dependency=afterok:${trjob} \ + --array 1-${n_samples} \ + --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ + ${integrate_script}) + +if [[ ${assemble} == "TRUE" ]]; then + csj=$(sbatch \ + --parsable \ + --dependency=aftercorr:${integrate_job} \ + -J ${labeltag}-${datetag}-cloudspades \ + --array 1-${n_samples} \ + --export LABELTAG=${labeltag},OUTPUT=${output} \ + ${asm_cloudspades_script}) + tlj=$(sbatch \ + --parsable \ + --dependency=aftercorr:${integrate_job} \ + -J ${labeltag}-${datetag}-tell-link \ + --array 1-${n_samples} \ + --export LABELTAG=${labeltag},OUTPUT=${output} \ + ${asm_tellink_script}) + cleanupdep=${csj}:${tlj} +else + cleanupdep=${integrate_job} + echo "Not assembling" +fi + +cleanup=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag}-cleanup \ + --dependency=afterok:${cleanupdep} \ + --export OUTPUT=${output} \ + ${clean_script}) From 3406cbfa1de9f946c774c981efca085239015fb8 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 16:47:54 -0700 Subject: [PATCH 02/47] initial cleanup --- .../templates/cloudspades-isolate.sbatch | 39 ----------------- .../templates/cloudspades.sbatch | 43 ------------------- .../templates/integrate.sbatch | 1 - .../templates/telllink-isolate.sbatch | 1 - .../templates/tellread.sh | 5 +-- 5 files changed, 1 insertion(+), 88 deletions(-) diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 5d0e5015..cf18a094 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ 
b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -10,7 +10,6 @@ #SBATCH --mail-type=FAIL #SBATCH -p qiita -### --gres=gpu:1 source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -49,11 +48,7 @@ if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then fi fi -#acs=${base}/cloudspades/${sample}-ariadne -#acscs=${acs}/assembled mkdir -p ${cs} -#mkdir -p ${acs} -#mkdir -p ${acscs} pushd ~/spades-cloudspades-paper/assembler/ ./spades.py \ @@ -74,37 +69,3 @@ quast \ if [[ -d ${cs}/K21 ]]; then rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp fi - -#pushd $HOME/2023.08.29-ariadne/ariadne -#mamba activate ariadne-gcc8.5.0 -#OLD_LDD_LIBRARY_PATH=${LDD_LIBRARY_PATH} -#export LDD_LIBRARY_PATH=${HOME}/miniconda3/envs/ariadne-gcc8.5.0/include/:${LDD_LIBRARY_PATH} -## parameters from Lauren Mak, 9.25.23 -#./spades.py \ -# -o ${acs} \ -# --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ -# --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ -# --assembly-graph ${cs}/assembly_graph_with_scaffolds.gfa \ -# --meta \ -# --only-assembler \ -# --search-distance 5000 --size-cutoff 6 -k 55 \ -# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acs}/stdoutstderr.log 2>&1 -#export LDD_LIBRARY_PATH=${OLD_LDD_LIBRARY_PATH} -#popd -# -#pushd ~/spades-cloudspades-paper/assembler/ -#module load gcc_9.3.0 -#./spades.py \ -# -o ${acscs} \ -# --gemcode1-1 ${acs}/K55/5000.R1.fastq \ -# --gemcode1-2 ${acs}/K55/5000.R2.fastq \ -# --meta \ -# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acscs}/stdoutstderr.log 2>&1 -#module unload gcc_9.3.0 -#popd -# -#mamba activate quast -#quast \ -# -o ${acscs}/quast-scaffolds \ -# -t ${SLURM_JOB_CPUS_PER_NODE} \ -# ${acscs}/scaffolds.fasta > ${acscs}/quast-stdoutstderr.log 2>&1 diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index fbc30ae2..f80f6626 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -10,7 +10,6 @@ #SBATCH --mail-type=FAIL #SBATCH -p qiita -### --gres=gpu:1 source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -49,15 +48,7 @@ if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then fi fi -#acs=${base}/cloudspades/${sample}-ariadne -#acscs=${acs}/assembled mkdir -p ${cs} -#mkdir -p ${acs} -#mkdir -p ${acscs} - -#pushd ~/spades-cloudspades-paper/assembler/ -#pushd /home/mcdonadt/cloudspades-0.1/spades-cloudspades-0.1/assembler/bin -#pushd /home/qiita/CHARLIE/TELLREAD/spades-cloudspades-0.1/assembler/bin pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin ./spades.py \ @@ -79,37 +70,3 @@ quast \ if [[ -d ${cs}/K21 ]]; then rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp fi - -#pushd $HOME/2023.08.29-ariadne/ariadne -#mamba activate ariadne-gcc8.5.0 -#OLD_LDD_LIBRARY_PATH=${LDD_LIBRARY_PATH} -#export LDD_LIBRARY_PATH=${HOME}/miniconda3/envs/ariadne-gcc8.5.0/include/:${LDD_LIBRARY_PATH} -## parameters from Lauren Mak, 9.25.23 -#./spades.py \ -# -o ${acs} \ -# --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ -# --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ -# --assembly-graph ${cs}/assembly_graph_with_scaffolds.gfa \ -# --meta \ -# --only-assembler \ -# --search-distance 5000 --size-cutoff 6 -k 55 \ -# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acs}/stdoutstderr.log 2>&1 -#export LDD_LIBRARY_PATH=${OLD_LDD_LIBRARY_PATH} -#popd -# -#pushd ~/spades-cloudspades-paper/assembler/ -#module load gcc_9.3.0 -#./spades.py \ -# -o ${acscs} \ -# --gemcode1-1 ${acs}/K55/5000.R1.fastq \ -# --gemcode1-2 ${acs}/K55/5000.R2.fastq \ -# --meta \ -# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acscs}/stdoutstderr.log 2>&1 -#module unload gcc_9.3.0 -#popd -# -#mamba activate quast -#quast \ -# -o ${acscs}/quast-scaffolds \ -# -t ${SLURM_JOB_CPUS_PER_NODE} \ -# ${acscs}/scaffolds.fasta > ${acscs}/quast-stdoutstderr.log 2>&1 diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 4d7af5aa..acdf1224 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -109,7 +109,6 @@ fi cat ${i1} | gzip > ${i1out} mamba activate tellread-integrate -#python ${BASE}/integrate-indices-np.py integrate \ python ${BASE}/integrate-indices-np.py integrate \ --no-sort \ --r1-in ${r1} \ diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index 9f778757..85d061c2 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -20,7 +20,6 @@ if [[ -z "${LABELTAG}" ]]; then exit 1 fi -#base=/panfs/dtmcdonald/${LABELTAG} base=/panfs/qiita/TELLREAD/${LABELTAG} if [[ ! -d ${base} ]]; then echo "${base} not found" diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index 78b8862a..628f1e99 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -38,7 +38,6 @@ while getopts "hs:i:l:r:b:m:" option; do esac done -# nifty # https://unix.stackexchange.com/a/621007 : ${seqrunpath:?Missing -s} : ${lane:?Missing -i} @@ -116,8 +115,7 @@ set -o pipefail declare -a s declare -a g # below extended regex might be broken because C5\d\d happens in column 0, not column 1 -# of the hacked sample-sheet. -#for sample in $(egrep -o ",C5[0-9][0-9]," ${samplesheet} | tr -d "," | sort) +# of the hacked sample-sheet. 
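+# Hypothetical illustration of the pattern below (sample-sheet row format assumed):
+# a row such as 'C501,NONE' is reduced by egrep -o "^C5.*," | tr -d "," to 'C501',
+# while a row whose C5xx identifier is not at the start of the line is skipped.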
for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) do echo "SAMPLE: ${sample}" @@ -205,7 +203,6 @@ if [[ -f ${submitcopy} ]]; then exit 1 fi -# CHARLIE echo $@ > ${arguments} cp ${0} ${scriptcopy} cp ${submit_script} ${submitcopy} From c5540f78ced5b44f90c97def11e7ffdf61ce0166 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 20:33:23 -0700 Subject: [PATCH 03/47] first pass at converting TELLREAD scripts --- sequence_processing_pipeline/TRConvertJob.py | 68 +++++++++++++++++++ .../templates/cloudspades-isolate.sbatch | 36 ++++++---- .../templates/cloudspades.sbatch | 21 +++--- .../templates/integrate.sbatch | 21 +++--- .../templates/telllink-isolate.sbatch | 25 ++++--- .../templates/telllink.sbatch | 25 ++++--- .../templates/tellread-cleanup.sbatch | 20 +++--- .../templates/tellread.sbatch | 25 ++++--- .../templates/tellread.sh | 64 ++++------------- 9 files changed, 184 insertions(+), 121 deletions(-) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 5d277609..81d6bda8 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -6,6 +6,74 @@ import re +tellread.sh +# {{CHARLIE_TELLREAD_MAP}} = samplesheet to telread.sh (-i option) must equal "/home/qiita_test/qiita-spots/tellread_mapping.csv" + +tellread.sbatch +#SBATCH -J {{job_name}} # tellread +#SBATCH -p {{queue_name}} # qiita +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 4 +#SBATCH --mem {{mem_in_gb}}G # 16G +#SBATCH --time {{wall_time_limit}} # 96:00:00 +{{CHARLIE_TMPDIR}} = /panfs/${USER}/tmp - replace with something in the work directory +{{CHARLIE_TELLREAD_SING_SCRIPT_PATH}} = $HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh +{{modules_to_load}} # singularity_3.6.4 + +tellink-isolate.sbatch +#SBATCH -J {{job_name}} # tellink-isolate +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 16 +#SBATCH --mem {{mem_in_gb}}G # 160G +#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -p {{queue_name}} # qiita + +{{TELLLINK_SING_PATH}}=/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh +{{modules_to_load}} # singularity_3.6.4 + +telllink.sbatch +#SBATCH -J {{job_name}} # tellink +#SBATCH --mem {{mem_in_gb}}G # 160G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 16 +#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -p {{queue_name}} # qiita +{{modules_to_load}} # singularity_3.6.4 +{{TELLLINK_SING_PATH}}=/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh + +integrate.sbatch (should this be renamed?) 
+#SBATCH -J {{job_name}} # integrate +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +cloudspades-isolate.sbatch: +#SBATCH -J {{job_name}} # cs-assemble +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 64G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 12 +#SBATCH -p {{queue_name}} # qiita + +module load {{modules_to_load}} # gcc_9.3.0 + +{{CHARLIE_SPADES_PATH}} = ~/spades-cloudspades-paper/assembler/spades.py + + +tellread-cleanup.sbatch +#SBATCH -J {{job_name}} # cleanup +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + + + + + class TRConvertJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index cf18a094..390a7f90 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -1,15 +1,20 @@ #!/bin/bash -l -#SBATCH -J cs-assemble -#SBATCH --time 24:00:00 -#SBATCH --mem 64gb -#SBATCH -N 1 -#SBATCH -c 12 +#SBATCH -J {{job_name}} # cs-assemble +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 64G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 12 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL -#SBATCH -p qiita +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -19,6 +24,8 @@ function logger () { set -x set -e +# this gets set in the environment from another script. For now let's +# run with that. echo $TMPDIR if [[ -z "${LABELTAG}" ]]; then @@ -32,8 +39,11 @@ if [[ ! -d ${base} ]]; then exit 1 fi +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. mamba activate activate qiime2-2023.5 -module load gcc_9.3.0 + +module load {{modules_to_load}} # gcc_9.3.0 samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) @@ -50,8 +60,7 @@ fi mkdir -p ${cs} -pushd ~/spades-cloudspades-paper/assembler/ -./spades.py \ +pushd {{CHARLIE_SPADES_PATH}} \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ @@ -59,7 +68,10 @@ pushd ~/spades-cloudspades-paper/assembler/ module unload gcc_9.3.0 popd -mamba activate quast +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. 
+mamba activate quast + quast \ -o ${cs}/quast-scaffolds \ -t ${SLURM_JOB_CPUS_PER_NODE} \ diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index f80f6626..a9f1ec45 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -1,15 +1,20 @@ #!/bin/bash -l -#SBATCH -J cs-assemble -#SBATCH --time 24:00:00 -#SBATCH --mem 128gb -#SBATCH -N 1 -#SBATCH -c 12 +#SBATCH -J {{job_name}} # cs-assemble +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 128G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 12 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL -#SBATCH -p qiita +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index acdf1224..787da4b2 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -1,15 +1,20 @@ #!/bin/bash -l -#SBATCH -J integrate -#SBATCH --time 24:00:00 -#SBATCH --mem 8gb -#SBATCH -N 1 -#SBATCH -c 1 -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL +#SBATCH -J {{job_name}} # integrate +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH -p qiita +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. source activate rust function logger () { echo "$(date) :: ${@}"; diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index 85d061c2..0f08c0a3 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -1,19 +1,23 @@ #!/bin/bash -l -#SBATCH --mem 160G -#SBATCH -N 1 -#SBATCH -c 16 -#SBATCH -t 96:00:00 -#SBATCH -J tellink-isolate +#SBATCH -J {{job_name}} # tellink-isolate +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 16 +#SBATCH --mem {{mem_in_gb}}G # 160G +#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL -#SBATCH -p qiita + +# for now comment these out as qiita is responsible for notifying users. 
+###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL set -x set -e -module load singularity_3.6.4 +module load {{modules_to_load}} # singularity_3.6.4 if [[ -z "${LABELTAG}" ]]; then echo "LABELTAG is not specified" @@ -42,8 +46,7 @@ fi mkdir -p ${tl} -HOME_PATH=/projects/long_read_collab/code/tellseq/release_v1.11/ -${HOME_PATH}/tellink-release/run_tellink_sing.sh \ +{{TELLLINK_SING_PATH}} \ -r1 ${base}/integrated/${sample}.R1.fastq.gz \ -r2 ${base}/integrated/${sample}.R2.fastq.gz \ -i1 ${base}/integrated/${sample}.I1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index 64a69072..591ac69d 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -1,19 +1,23 @@ #!/bin/bash -l -#SBATCH --mem 160G -#SBATCH -N 1 -#SBATCH -c 16 -#SBATCH -t 96:00:00 -#SBATCH -J tellink +#SBATCH -J {{job_name}} # tellink +#SBATCH --mem {{mem_in_gb}}G # 160G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 16 +#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL -#SBATCH -p qiita + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL set -x set -e -module load singularity_3.6.4 +module load {{modules_to_load}} # singularity_3.6.4 if [[ -z "${LABELTAG}" ]]; then echo "LABEL is not specified" @@ -42,8 +46,7 @@ fi mkdir -p ${tl} -HOME_PATH=/projects/long_read_collab/code/tellseq/release_v1.11/ -${HOME_PATH}/tellink-release/run_tellink_sing.sh \ +{{TELLLINK_SING_PATH}} \ -r1 ${base}/integrated/${sample}.R1.fastq.gz \ -r2 ${base}/integrated/${sample}.R2.fastq.gz \ -i1 ${base}/integrated/${sample}.I1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index a8808822..f3388ef7 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -1,14 +1,18 @@ #!/bin/bash -l -#SBATCH -J cleanup -#SBATCH --time 24:00:00 -#SBATCH --mem 8gb -#SBATCH -N 1 -#SBATCH -c 1 -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL +#SBATCH -J {{job_name}} # cleanup +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=BEGIN,FAIL + +# for now these can be left hard-coded. 
#SBATCH --output %x-%A.out #SBATCH --error %x-%A.err -#SBATCH -p qiita if [[ -z "${OUTPUT}" ]]; then echo "OUTPUT is not specified" diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index be5ef9e7..800503f0 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -1,16 +1,19 @@ #!/bin/bash -l +#SBATCH -J {{job_name}} # tellread +#SBATCH -p {{queue_name}} # qiita +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 4 +#SBATCH --mem {{mem_in_gb}}G # 16G +#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -N 1 -#SBATCH -c 4 -#SBATCH --mem 16G +# for now these can be left hard-coded. #SBATCH --partition=short -#SBATCH -t 96:00:00 -#SBATCH -J tellread #SBATCH --output %x-%A.out #SBATCH --error %x-%A.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=BEGIN,FAIL -#SBATCH -p qiita + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=BEGIN,FAIL function logger () { echo "$(date) :: ${@}"; @@ -49,7 +52,7 @@ if [[ -z "${OUTPUT}" ]]; then exit 1 fi -export TMPDIR=/panfs/${USER}/tmp +export TMPDIR={{CHARLIE_TMPDIR}} mkdir -p ${TMPDIR} export TMPDIR=$(mktemp -d) seqrun_path=${SEQRUNPATH} @@ -83,8 +86,8 @@ fi mkdir -p ${OUTPUT} -module load singularity_3.6.4 -$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ +module load {{modules_to_load}} # singularity_3.6.4 +{{CHARLIE_TELLREAD_SING_SCRIPT_PATH}} \ -i ${seqrun_path} \ -o ${OUTPUT} \ -s $(echo ${SAMPLES} | tr -d '"') \ diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index 628f1e99..ffaf726e 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -1,42 +1,13 @@ #!/bin/bash +samplesheet={{CHARLIE_TELLREAD_MAP}} # previously -i option +seqrunpath={{CHARLIE_SEQRUNPATH}} # previously -s option +lane={{CHARLIE_LANE}} # previously -l option +reference_map={{CHARLIE_REFERENCE_MAP}} # previously -r option +reference_base={{CHARLIE_REFERENCE_BASE}} # previously -b option +mode={{CHARLIE_MODE}} $ # previously -m option -script_name=${0##*/} - -function help () { - echo "Submit for TELL-read" - echo "" - echo "Usage: ${script_name} -s -l [-r reference_map] [-b reference_base]" - echo "" - echo -e "\t-s\tPath to the sequencing run." - echo -e "\t-i\tThe sample sheet." - echo -e "\t-l\tThe lane to process." - echo -e "\t-r\tA file specifying reference genomes to use [OPTIONAL]" - echo -e "\t-b\tReference genome base directory [OPTIONAL]" - echo -e "\t-m\tMode, isolate or metagenomic [OPTIONAL]" - echo "" -} - -# references right now are only used for techdev - -# derived from https://www.redhat.com/sysadmin/arguments-options-bash-scripts -while getopts "hs:i:l:r:b:m:" option; do - case ${option} in - h) - help - exit;; - s) seqrunpath=${OPTARG};; - l) lane=${OPTARG};; - r) reference_map=${OPTARG};; - b) reference_base=${OPTARG};; - m) mode=${OPTARG};; - \?) - echo "Error: Invalid option" - exit;; - *) - echo "Error: Invalid option" - exit;; - esac -done +# preserve error-checking of parameters to preserve as much of the original +# script as possible, even though this could be done in python. 
# https://unix.stackexchange.com/a/621007 : ${seqrunpath:?Missing -s} @@ -61,8 +32,6 @@ else tag=reference-free fi -samplesheet="/home/qiita_test/qiita-spots/tellread_mapping.csv" - # trim trailing slash # https://stackoverflow.com/a/32845647/19741 safepath=$(echo ${seqrunpath} | sed 's:/*$::') @@ -75,22 +44,22 @@ if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then exit 1 fi +# for now this can stay here to keep greater compatibility with the original script. +# however these fields should eventually be parameters that can be configured in the config file. + if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then - echo "FOO" sbatch_cores=2 sbatch_mem=8G norm=TRUE wall=24:00:00 mode=NA elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then - echo "BAR" sbatch_cores=2 sbatch_mem=8G norm=TRUE wall=24:00:00 mode=NA else - echo "BAZ" sbatch_cores=16 sbatch_mem=160G norm=FALSE @@ -118,7 +87,7 @@ declare -a g # of the hacked sample-sheet. for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) do - echo "SAMPLE: ${sample}" + echo "sample found: ${sample}" # get references if they exist if [[ -f ${reference_map} ]]; then if $(grep -Fq ${sample} ${reference_map}); then @@ -140,10 +109,6 @@ do done n_samples=${#s[@]} -echo "Submitting:" -echo "S: ${s[@]}" -echo "G: ${g[@]}" - # https://stackoverflow.com/a/17841619/19741 function join_by { local IFS="$1"; shift; echo "$*"; } s=$(join_by , "${s[@]}") @@ -214,8 +179,6 @@ chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cle set -x -echo "C" - trjob=$(sbatch \ --parsable \ -J ${labeltag}-${datetag} \ @@ -225,8 +188,6 @@ trjob=$(sbatch \ --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ ${submit_script}) -echo "D" - if [[ ${norm} == "TRUE" ]]; then cp ${norm_script} ${normcopy} chmod gou-w ${normcopy} @@ -238,7 +199,6 @@ if [[ ${norm} == "TRUE" ]]; then ${norm_script}) fi -echo "E" integrate_job=$(sbatch \ --parsable \ -J ${labeltag}-${datetag}-integrate \ From 5bac0ffd2793140136fe04c22c5d2f9e92c46310 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 22:14:28 -0700 Subject: [PATCH 04/47] Second pass at integrating tellread scripts --- sequence_processing_pipeline/TRConvertJob.py | 283 ++++++++++++++---- .../templates/cloudspades-isolate.sbatch | 2 +- .../templates/cloudspades.sbatch | 6 +- .../templates/integrate.sbatch | 1 + .../templates/telllink-isolate.sbatch | 2 +- .../templates/telllink.sbatch | 3 +- .../templates/tellread-cleanup.sbatch | 2 +- .../templates/tellread.sbatch | 4 +- .../templates/tellread.sh | 12 +- 9 files changed, 248 insertions(+), 67 deletions(-) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 81d6bda8..f5250139 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,74 +1,56 @@ -from os.path import join, exists +from jinja2 import BaseLoader, TemplateNotFound +from metapool import load_sample_sheet +from os import stat, makedirs, rename +from os.path import join, basename, dirname, exists, abspath, getmtime from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) +from sequence_processing_pipeline.Pipeline import Pipeline +from shutil import move import logging +from sequence_processing_pipeline.Commands import split_similar_size_bins 
+from sequence_processing_pipeline.util import iter_paired_files +from jinja2 import Environment +import glob import re +from sys import executable +import pathlib + + +# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader +class KISSLoader(BaseLoader): + def __init__(self, path): + # pin the path for loader to the location sequence_processing_pipeline + # (the location of this file), along w/the relative path to the + # templates directory. + self.path = join(pathlib.Path(__file__).parent.resolve(), path) + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) -tellread.sh -# {{CHARLIE_TELLREAD_MAP}} = samplesheet to telread.sh (-i option) must equal "/home/qiita_test/qiita-spots/tellread_mapping.csv" -tellread.sbatch -#SBATCH -J {{job_name}} # tellread -#SBATCH -p {{queue_name}} # qiita -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 4 -#SBATCH --mem {{mem_in_gb}}G # 16G -#SBATCH --time {{wall_time_limit}} # 96:00:00 -{{CHARLIE_TMPDIR}} = /panfs/${USER}/tmp - replace with something in the work directory -{{CHARLIE_TELLREAD_SING_SCRIPT_PATH}} = $HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh -{{modules_to_load}} # singularity_3.6.4 -tellink-isolate.sbatch -#SBATCH -J {{job_name}} # tellink-isolate -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 16 -#SBATCH --mem {{mem_in_gb}}G # 160G -#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -p {{queue_name}} # qiita -{{TELLLINK_SING_PATH}}=/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh -{{modules_to_load}} # singularity_3.6.4 +from os.path import join, exists +from sequence_processing_pipeline.Job import Job +from sequence_processing_pipeline.PipelineError import (PipelineError, + JobFailedError) +import logging +import re + -telllink.sbatch -#SBATCH -J {{job_name}} # tellink -#SBATCH --mem {{mem_in_gb}}G # 160G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 16 -#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -p {{queue_name}} # qiita -{{modules_to_load}} # singularity_3.6.4 -{{TELLLINK_SING_PATH}}=/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh -integrate.sbatch (should this be renamed?) 
-#SBATCH -J {{job_name}} # integrate -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 8G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 1 -#SBATCH -p {{queue_name}} # qiita -cloudspades-isolate.sbatch: -#SBATCH -J {{job_name}} # cs-assemble -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 64G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 12 -#SBATCH -p {{queue_name}} # qiita -module load {{modules_to_load}} # gcc_9.3.0 -{{CHARLIE_SPADES_PATH}} = ~/spades-cloudspades-paper/assembler/spades.py -tellread-cleanup.sbatch -#SBATCH -J {{job_name}} # cleanup -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 8G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 1 -#SBATCH -p {{queue_name}} # qiita @@ -113,6 +95,11 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.job_script_path = join(self.output_path, f"{self.job_name}.sh") self.suffix = 'fastq.gz' + # for projects that use sequence_processing_pipeline as a dependency, + # jinja_env must be set to sequence_processing_pipeline's root path, + # rather than the project's root path. + self.jinja_env = Environment(loader=KISSLoader('templates')) + tmp = False for executable_name in ['bcl2fastq', 'bcl-convert']: if executable_name in self.bcl_tool: @@ -130,6 +117,194 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self._generate_job_script() + def _generate_script_one(self): + template = self.jinja_env.get_template("tellread.sh") + + tellread_map = "/home/qiita_test/qiita-spots/tellread_mapping.csv" + seqrun_path = "/sequencing/igm_runs/240216_LH00444_0058_A22357VLT4" + lane = 'L008' + reference_map = "" + reference_base = "" + mode = "metagenomic" + + return template.render(tellread_map=tellread_map, + seqrun_path=seqrun_path, + lane=lane, + reference_map=reference_map, + reference_base=reference_base, + mode=mode) + + def _generate_script_two(self): + template = self.jinja_env.get_template("tellread-cleanup.sbatch") + + job_name = "cleanup" + wall_time_limit = "24:00:00" + mem_in_gb = "8" + node_count = "1" + cores_per_task = "1" + queue_name = "qiita" + + return template.render(job_name=job_name, + wall_time_limit=wall_time_limit, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + queue_name=queue_name) + + def _generate_script_three(self): + template = self.jinja_env.get_template("tellread.sbatch") + job_name = "tellread" + wall_time_limit = "96:00:00" + mem_in_gb = "16" + node_count = "1" + cores_per_task = "4" + queue_name = "qiita" + tellread_sbatch_tmp_dir = "/panfs/${USER}/tmp" + tr_sing_script_path = "$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh" + modules_to_load = ["singularity_3.6.4"] + + return template.render(job_name=job_name, + wall_time_limit=wall_time_limit, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + queue_name=queue_name, + tmp_dir=tellread_sbatch_tmp_dir, + sing_script_path=tr_sing_script_path, + modules_to_load=' '.join(modules_to_load)) + + def _generate_script_four(self): + template = self.jinja_env.get_template("telllink-isolate.sbatch") + + job_name = "tellink-isolate" + wall_time_limit = "96:00:00" + node_count = "1" + cores_per_task = "16" + mem_in_gb = "160" + queue_name = "qiita" + modules_to_load = ["singularity_3.6.4"] + sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" + + return 
template.render(job_name=job_name, + wall_time_limit=wall_time_limit, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + queue_name=queue_name, + modules_to_load=' '.join(modules_to_load), + sing_path=sing_path) + + def _generate_script_five(self): + template = self.jinja_env.get_template("telllink.sbatch") + + job_name = "tellink" + mem_in_gb = "160" + node_count = "1" + cores_per_task = "16" + wall_time_limit = "96:00:00" + queue_name = "qiita" + modules_to_load = ["singularity_3.6.4"] + sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" + + return template.render(job_name=job_name, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + wall_time_limit=wall_time_limit, + queue_name=queue_name, + modules_to_load=' '.join(modules_to_load), + sing_path=sing_path) + + def _generate_script_six(self): + template = self.jinja_env.get_template("integrate.sbatch") + + job_name = "integrate" + mem_in_gb = "8" + node_count = "1" + cores_per_task = "1" + wall_time_limit = "24:00:00" + queue_name = "qiita" + + return template.render(job_name=job_name, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + wall_time_limit=wall_time_limit, + queue_name=queue_name) + + def _generate_script_seven(self): + template = self.jinja_env.get_template("cloudspades-isolate.sbatch") + + job_name = "cs-assemble" + mem_in_gb = "64" + node_count = "1" + cores_per_task = "12" + wall_time_limit = "24:00:00" + queue_name = "qiita" + modules_to_load = ["gcc_9.3.0"] + spades_path = "~/spades-cloudspades-paper/assembler/spades.py" + + return template.render(job_name=job_name, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + wall_time_limit=wall_time_limit, + queue_name=queue_name, + modules_to_load=' '.join(modules_to_load), + spades_path=spades_path) + + def _generate_script_eight(self): + template = self.jinja_env.get_template("cloudspades.sbatch") + + job_name = "cs-assemble" + wall_time_limit = "24:00:00" + mem_in_gb = "128" + node_count = "1" + cores_per_task = "12" + queue_name = "qiita" + modules_to_load = ["gcc_9.3.0"] + spades_path = "TBD" # for now pass but don't use this spades_path var. + + return template.render(job_name=job_name, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + wall_time_limit=wall_time_limit, + queue_name=queue_name, + modules_to_load=' '.join(modules_to_load), + spades_path=spades_path) + + def _generate_job_scripts(self): + scripts = [ + { + "template": self.jinja_env.get_template("cloudspades.sbatch"), + "params": { + "job_name": "cs-assemble", + "wall_time_limit": "24:00:00", + "mem_in_gb": "128", + "node_count": "1", + "cores_per_task": "12", + "queue_name": "qiita", + "modules_to_load": ' '.join(["gcc_9.3.0"]), + "spades_path": "TBD" + } + + }, + {}, + {} + + ] + + for script in scripts: + template = self.jinja_env.get_template(script["template"]) + params = script["params"] + result = template.render(**params) + + + + + + def _generate_job_script(self): """ Generate a Torque job script for processing supplied root_directory. 
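[Reviewer note, not part of the patch] The KISSLoader/Environment pattern introduced above can be exercised outside of TRConvertJob with a few lines of Python. The sketch below mirrors the render loop in _generate_job_scripts(); it substitutes Jinja2's stock FileSystemLoader for KISSLoader (which differs only in pinning the search path relative to this package), and the template name, parameter values, and output path are illustrative only.

    # Minimal standalone sketch; assumes it is run from the repository root.
    # FileSystemLoader stands in here for the KISSLoader defined in this diff.
    from os.path import join
    from jinja2 import Environment, FileSystemLoader

    jinja_env = Environment(
        loader=FileSystemLoader('sequence_processing_pipeline/templates'))
    template = jinja_env.get_template('tellread-cleanup.sbatch')

    rendered = template.render(job_name='cleanup',
                               wall_time_limit='24:00:00',
                               mem_in_gb='8',
                               node_count='1',
                               cores_per_task='1',
                               queue_name='qiita')

    # write the rendered sbatch script somewhere inspectable
    with open(join('/tmp', 'tellread-cleanup.sbatch'), 'w') as f:
        f.write(rendered)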
diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 390a7f90..f8a2b000 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -60,7 +60,7 @@ fi mkdir -p ${cs} -pushd {{CHARLIE_SPADES_PATH}} \ +pushd {{spades_path}} \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index a9f1ec45..96673309 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -37,8 +37,11 @@ if [[ ! -d ${base} ]]; then exit 1 fi +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. mamba activate activate qiime2-2023.5 -module load gcc_9.3.0 + +module load {{modules_to_load}} # gcc_9.3.0 samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) @@ -56,6 +59,7 @@ fi mkdir -p ${cs} pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin +# for now don't use {{spades.py}} ./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 787da4b2..f4161466 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -21,6 +21,7 @@ function logger () { echo "$(date) :: ${@}" 1>&2; } + # https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html cores=${SLURM_CPUS_PER_TASK} diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index 0f08c0a3..f842cddf 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -46,7 +46,7 @@ fi mkdir -p ${tl} -{{TELLLINK_SING_PATH}} \ +{{sing_path}} \ -r1 ${base}/integrated/${sample}.R1.fastq.gz \ -r2 ${base}/integrated/${sample}.R2.fastq.gz \ -i1 ${base}/integrated/${sample}.I1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index 591ac69d..39daa383 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -33,6 +33,7 @@ fi samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} +# leave these hardcoded for now k=79 lc=35 cores=${SLURM_CPUS_PER_TASK} @@ -46,7 +47,7 @@ fi mkdir -p ${tl} -{{TELLLINK_SING_PATH}} \ +{{sing_path}} \ -r1 ${base}/integrated/${sample}.R1.fastq.gz \ -r2 ${base}/integrated/${sample}.R2.fastq.gz \ -i1 ${base}/integrated/${sample}.I1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index f3388ef7..d5edf855 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -20,4 +20,4 @@ if [[ -z "${OUTPUT}" ]]; then fi # remove unused large 
outputs -rm -fr ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full +rm -rf ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 800503f0..89633da9 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -52,7 +52,7 @@ if [[ -z "${OUTPUT}" ]]; then exit 1 fi -export TMPDIR={{CHARLIE_TMPDIR}} +export TMPDIR={{tmp_dir}} mkdir -p ${TMPDIR} export TMPDIR=$(mktemp -d) seqrun_path=${SEQRUNPATH} @@ -87,7 +87,7 @@ fi mkdir -p ${OUTPUT} module load {{modules_to_load}} # singularity_3.6.4 -{{CHARLIE_TELLREAD_SING_SCRIPT_PATH}} \ +{{sing_script_path}} \ -i ${seqrun_path} \ -o ${OUTPUT} \ -s $(echo ${SAMPLES} | tr -d '"') \ diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index ffaf726e..5584b6c0 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -1,10 +1,10 @@ #!/bin/bash -samplesheet={{CHARLIE_TELLREAD_MAP}} # previously -i option -seqrunpath={{CHARLIE_SEQRUNPATH}} # previously -s option -lane={{CHARLIE_LANE}} # previously -l option -reference_map={{CHARLIE_REFERENCE_MAP}} # previously -r option -reference_base={{CHARLIE_REFERENCE_BASE}} # previously -b option -mode={{CHARLIE_MODE}} $ # previously -m option +samplesheet={{tellread_map}} # previously -i option +seqrunpath={{seqrun_path}} # previously -s option +lane={{lane}} # previously -l option +reference_map={{reference_map}} # previously -r option +reference_base={{reference_base}} # previously -b option +mode={{mode}} $ # previously -m option # preserve error-checking of parameters to preserve as much of the original # script as possible, even though this could be done in python. 
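[Reviewer note, not part of the patch] With the CHARLIE_* placeholders above renamed to plain Jinja2 variables, a quick way to check the renames is to list the variables each template expects the caller to supply and compare them against the params passed by TRConvertJob. This is a minimal sketch using jinja2.meta; the templates directory path is assumed, and tellread.sh is deliberately skipped because shell constructs such as ${#s[@]} collide with Jinja2's default '{#' comment delimiter.

    # Minimal sketch; assumes it is run from the repository root.
    from glob import glob
    from jinja2 import Environment, meta

    env = Environment()

    for path in sorted(glob('sequence_processing_pipeline/templates/*.sbatch')):
        with open(path) as f:
            ast = env.parse(f.read())
        # find_undeclared_variables() reports the names the template uses but
        # does not define, i.e. the values the render() call must provide.
        print(path, sorted(meta.find_undeclared_variables(ast)))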
From 16ec4170f4667dae6a23cb414fbd7a9202f38fa5 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 23:11:11 -0700 Subject: [PATCH 05/47] third pass adding tellread --- sequence_processing_pipeline/TRConvertJob.py | 377 +++++-------------- 1 file changed, 102 insertions(+), 275 deletions(-) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index f5250139..47b72c58 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,20 +1,11 @@ -from jinja2 import BaseLoader, TemplateNotFound -from metapool import load_sample_sheet -from os import stat, makedirs, rename -from os.path import join, basename, dirname, exists, abspath, getmtime +from jinja2 import BaseLoader, TemplateNotFound, Environment +from os.path import join, exists, getmtime from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) -from sequence_processing_pipeline.Pipeline import Pipeline -from shutil import move import logging -from sequence_processing_pipeline.Commands import split_similar_size_bins -from sequence_processing_pipeline.util import iter_paired_files -from jinja2 import Environment -import glob -import re -from sys import executable import pathlib +import re # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -35,27 +26,6 @@ def get_source(self, environment, template): return source, path, lambda: mtime == getmtime(path) - - -from os.path import join, exists -from sequence_processing_pipeline.Job import Job -from sequence_processing_pipeline.PipelineError import (PipelineError, - JobFailedError) -import logging -import re - - - - - - - - - - - - - class TRConvertJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, @@ -92,7 +62,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.pmem = pmem self.bcl_tool = bcl_tool_path self.qiita_job_id = qiita_job_id - self.job_script_path = join(self.output_path, f"{self.job_name}.sh") self.suffix = 'fastq.gz' # for projects that use sequence_processing_pipeline as a dependency, @@ -115,169 +84,12 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, # As the sample-sheet is validated by the Pipeline object before # being passed to TRConvertJob, additional validation isn't needed. 
- self._generate_job_script() - - def _generate_script_one(self): - template = self.jinja_env.get_template("tellread.sh") - - tellread_map = "/home/qiita_test/qiita-spots/tellread_mapping.csv" - seqrun_path = "/sequencing/igm_runs/240216_LH00444_0058_A22357VLT4" - lane = 'L008' - reference_map = "" - reference_base = "" - mode = "metagenomic" - - return template.render(tellread_map=tellread_map, - seqrun_path=seqrun_path, - lane=lane, - reference_map=reference_map, - reference_base=reference_base, - mode=mode) - - def _generate_script_two(self): - template = self.jinja_env.get_template("tellread-cleanup.sbatch") - - job_name = "cleanup" - wall_time_limit = "24:00:00" - mem_in_gb = "8" - node_count = "1" - cores_per_task = "1" - queue_name = "qiita" - - return template.render(job_name=job_name, - wall_time_limit=wall_time_limit, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - queue_name=queue_name) - - def _generate_script_three(self): - template = self.jinja_env.get_template("tellread.sbatch") - job_name = "tellread" - wall_time_limit = "96:00:00" - mem_in_gb = "16" - node_count = "1" - cores_per_task = "4" - queue_name = "qiita" - tellread_sbatch_tmp_dir = "/panfs/${USER}/tmp" - tr_sing_script_path = "$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh" - modules_to_load = ["singularity_3.6.4"] - - return template.render(job_name=job_name, - wall_time_limit=wall_time_limit, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - queue_name=queue_name, - tmp_dir=tellread_sbatch_tmp_dir, - sing_script_path=tr_sing_script_path, - modules_to_load=' '.join(modules_to_load)) - - def _generate_script_four(self): - template = self.jinja_env.get_template("telllink-isolate.sbatch") - - job_name = "tellink-isolate" - wall_time_limit = "96:00:00" - node_count = "1" - cores_per_task = "16" - mem_in_gb = "160" - queue_name = "qiita" - modules_to_load = ["singularity_3.6.4"] - sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" - - return template.render(job_name=job_name, - wall_time_limit=wall_time_limit, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - queue_name=queue_name, - modules_to_load=' '.join(modules_to_load), - sing_path=sing_path) - - def _generate_script_five(self): - template = self.jinja_env.get_template("telllink.sbatch") - - job_name = "tellink" - mem_in_gb = "160" - node_count = "1" - cores_per_task = "16" - wall_time_limit = "96:00:00" - queue_name = "qiita" - modules_to_load = ["singularity_3.6.4"] - sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" - - return template.render(job_name=job_name, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - wall_time_limit=wall_time_limit, - queue_name=queue_name, - modules_to_load=' '.join(modules_to_load), - sing_path=sing_path) - - def _generate_script_six(self): - template = self.jinja_env.get_template("integrate.sbatch") - - job_name = "integrate" - mem_in_gb = "8" - node_count = "1" - cores_per_task = "1" - wall_time_limit = "24:00:00" - queue_name = "qiita" - - return template.render(job_name=job_name, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - wall_time_limit=wall_time_limit, - queue_name=queue_name) - - def _generate_script_seven(self): - template = self.jinja_env.get_template("cloudspades-isolate.sbatch") - - job_name = "cs-assemble" - mem_in_gb = "64" 
- node_count = "1" - cores_per_task = "12" - wall_time_limit = "24:00:00" - queue_name = "qiita" - modules_to_load = ["gcc_9.3.0"] - spades_path = "~/spades-cloudspades-paper/assembler/spades.py" - - return template.render(job_name=job_name, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - wall_time_limit=wall_time_limit, - queue_name=queue_name, - modules_to_load=' '.join(modules_to_load), - spades_path=spades_path) - - def _generate_script_eight(self): - template = self.jinja_env.get_template("cloudspades.sbatch") - - job_name = "cs-assemble" - wall_time_limit = "24:00:00" - mem_in_gb = "128" - node_count = "1" - cores_per_task = "12" - queue_name = "qiita" - modules_to_load = ["gcc_9.3.0"] - spades_path = "TBD" # for now pass but don't use this spades_path var. - - return template.render(job_name=job_name, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - wall_time_limit=wall_time_limit, - queue_name=queue_name, - modules_to_load=' '.join(modules_to_load), - spades_path=spades_path) + self._generate_job_scripts() def _generate_job_scripts(self): scripts = [ { - "template": self.jinja_env.get_template("cloudspades.sbatch"), + "template": "cloudspades.sbatch", "params": { "job_name": "cs-assemble", "wall_time_limit": "24:00:00", @@ -288,94 +100,109 @@ def _generate_job_scripts(self): "modules_to_load": ' '.join(["gcc_9.3.0"]), "spades_path": "TBD" } - }, - {}, - {} - + { + "template": "cloudspades-isolate.sbatch", + "params": { + "job_name": "cs-assemble", + "wall_time_limit": "24:00:00", + "mem_in_gb": "64", + "node_count": "1", + "cores_per_task": "12", + "queue_name": "qiita", + "modules_to_load": ' '.join(["gcc_9.3.0"]), + "spades_path": "~/spades-cloudspades-paper/assembler/" + "spades.py" + } + }, + { + "template": "integrate.sbatch", + "params": { + "job_name": "integrate", + "wall_time_limit": "24:00:00", + "mem_in_gb": "8", + "node_count": "1", + "cores_per_task": "1", + "queue_name": "qiita" + } + }, + { + "template": "telllink.sbatch", + "params": { + "job_name": "telllink", + "wall_time_limit": "96:00:00", + "mem_in_gb": "160", + "node_count": "1", + "cores_per_task": "16", + "queue_name": "qiita", + "modules_to_load": ' '.join(["singularity_3.6.4"]), + "sing_path": "/projects/long_read_collab/code/tellseq/" + "release_v1.11/tellink-release/" + "run_tellink_sing.sh" + } + }, + { + "template": "telllink-isolate.sbatch", + "params": { + "job_name": "tellink-isolate", + "wall_time_limit": "96:00:00", + "node_count": "1", + "cores_per_task": "16", + "mem_in_gb": "160", + "queue_name": "qiita", + "modules_to_load": ' '.join(["singularity_3.6.4"]), + "sing_path": "/projects/long_read_collab/code/tellseq/" + "release_v1.11/tellink-release/" + "run_tellink_sing.sh" + } + }, + { + "template": "tellread.sbatch", + "params": { + "job_name": "tellread", + "wall_time_limit": "96:00:00", + "mem_in_gb": "16", + "node_count": "1", + "cores_per_task": "4", + "queue_name": "qiita", + "tellread_sbatch_tmp_dir": "/panfs/${USER}/tmp", + "tr_sing_script_path": "$HOME/qiita-spots/tellread-release" + "-novaseqX/run_tellread_sing.sh", + "modules_to_load": ' '.join(["singularity_3.6.4"]) + } + }, + { + "template": "tellread-cleanup.sbatch", + "params": { + "job_name": "cleanup", + "wall_time_limit": "24:00:00", + "mem_in_gb": "8", + "node_count": "1", + "cores_per_task": "1", + "queue_name": "qiita" + } + }, + { + "template": "", + "params": { + "tellread_map": "/home/qiita_test/qiita-spots/" + "tellread_mapping.csv", + "seqrun_path": 
"/sequencing/igm_runs/" + "240216_LH00444_0058_A22357VLT4", + "lane": 'L008', + "reference_map": "", + "reference_base": "", + "mode": "metagenomic" + } + } ] for script in scripts: template = self.jinja_env.get_template(script["template"]) params = script["params"] - result = template.render(**params) - - - - - - - def _generate_job_script(self): - """ - Generate a Torque job script for processing supplied root_directory. - :return: The path to the newly-created job-script. - """ - lines = [] - - lines.append("#!/bin/bash") - lines.append(f"#SBATCH --job-name {self.qiita_job_id}_{self.job_name}") - lines.append(f"#SBATCH -p {self.queue_name}") - lines.append(f'#SBATCH -N {self.node_count}') - lines.append(f'#SBATCH -n {self.nprocs}') - lines.append("#SBATCH --time %d" % self.wall_time_limit) - - # send an email to the list of users defined below when a job starts, - # terminates, or aborts. This is used to confirm that the package's - # own reporting mechanism is reporting correctly. - lines.append("#SBATCH --mail-type=ALL") - - # list of users to be contacted independently of this package's - # notification system, when a job starts, terminates, or gets aborted. - lines.append("#SBATCH --mail-user qiita.help@gmail.com") - - lines.append(f"#SBATCH --mem-per-cpu {self.pmem}") - - lines.append("set -x") - lines.append('date') - lines.append('hostname') - lines.append(f'cd {self.root_dir}') - - if self.modules_to_load: - lines.append("module load " + ' '.join(self.modules_to_load)) - - # Assume that the bcl-convert tool is named 'bcl-convert' and choose - # accordingly. - if 'bcl-convert' in self.bcl_tool: - lines.append(('%s ' - '--sample-sheet "%s" ' - '--output-directory %s ' - '--bcl-input-directory . ' - '--bcl-num-decompression-threads 16 ' - '--bcl-num-conversion-threads 16 ' - '--bcl-num-compression-threads 16 ' - '--bcl-num-parallel-tiles 16 ' - '--bcl-sampleproject-subdirectories true ' - '--force') % (self.bcl_tool, - self.sample_sheet_path, - self.output_path)) - - # equivalent cp for bcl-conversion (see below) needed. - else: - lines.append(('%s ' - '--sample-sheet "%s" ' - '--minimum-trimmed-read-length 1 ' - '--mask-short-adapter-reads 1 ' - '-R . ' - '-o %s ' - '--loading-threads 16 ' - '--processing-threads 16 ' - '--writing-threads 16 ' - '--create-fastq-for-index-reads ' - '--ignore-missing-positions ') % - (self.bcl_tool, - self.sample_sheet_path, - self.output_path)) - - with open(self.job_script_path, 'w') as f: - for line in lines: - # remove long spaces in some lines. 
- line = re.sub(r'\s+', ' ', line) - f.write(f"{line}\n") + job_script_path = join(self.output_path, script["template"]) + with open(job_script_path, 'w') as f: + f.write(template.render(**params)) def run(self, callback=None): """ From 74cab5d6ca468ea6281c62bb17c5aad5d03540be Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 13 Aug 2024 19:47:09 -0700 Subject: [PATCH 06/47] fourth pass --- sequence_processing_pipeline/TRConvertJob.py | 105 +++----- .../templates/cloudspades-isolate.sbatch | 3 +- .../templates/cloudspades.sbatch | 2 +- .../templates/tellread.sh | 14 +- .../cloudspades-isolate.sbatch | 84 +++++++ .../data/tellread_output/cloudspades.sbatch | 81 ++++++ .../data/tellread_output/integrate.sbatch | 125 ++++++++++ .../tellread_output/telllink-isolate.sbatch | 62 +++++ .../data/tellread_output/telllink.sbatch | 64 +++++ .../tellread_output/tellread-cleanup.sbatch | 23 ++ .../data/tellread_output/tellread.sbatch | 108 ++++++++ .../tests/data/tellread_output/tellread.sh | 236 ++++++++++++++++++ 12 files changed, 829 insertions(+), 78 deletions(-) create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread.sh diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 47b72c58..8cc1a14a 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,11 +1,8 @@ from jinja2 import BaseLoader, TemplateNotFound, Environment from os.path import join, exists, getmtime from sequence_processing_pipeline.Job import Job -from sequence_processing_pipeline.PipelineError import (PipelineError, - JobFailedError) -import logging +from sequence_processing_pipeline.PipelineError import PipelineError import pathlib -import re # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -63,11 +60,17 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.bcl_tool = bcl_tool_path self.qiita_job_id = qiita_job_id self.suffix = 'fastq.gz' + self.job_script_path = None # for projects that use sequence_processing_pipeline as a dependency, # jinja_env must be set to sequence_processing_pipeline's root path, # rather than the project's root path. - self.jinja_env = Environment(loader=KISSLoader('templates')) + self.jinja_env = Environment(loader=KISSLoader('templates'), + # set Jinja2 comment strings to be + # anything other than '{#' and '#}', + # which can be used in shell scripts. 
+ comment_start_string='%%%%%%%%%%', + comment_end_string='%%%%%%%%%%') tmp = False for executable_name in ['bcl2fastq', 'bcl-convert']: @@ -112,7 +115,6 @@ def _generate_job_scripts(self): "queue_name": "qiita", "modules_to_load": ' '.join(["gcc_9.3.0"]), "spades_path": "~/spades-cloudspades-paper/assembler/" - "spades.py" } }, { @@ -129,7 +131,7 @@ def _generate_job_scripts(self): { "template": "telllink.sbatch", "params": { - "job_name": "telllink", + "job_name": "tellink", "wall_time_limit": "96:00:00", "mem_in_gb": "160", "node_count": "1", @@ -163,11 +165,12 @@ def _generate_job_scripts(self): "wall_time_limit": "96:00:00", "mem_in_gb": "16", "node_count": "1", + "tmp_dir": "/panfs/${USER}/tmp", "cores_per_task": "4", "queue_name": "qiita", "tellread_sbatch_tmp_dir": "/panfs/${USER}/tmp", - "tr_sing_script_path": "$HOME/qiita-spots/tellread-release" - "-novaseqX/run_tellread_sing.sh", + "sing_script_path": "$HOME/qiita-spots/tellread-release" + "-novaseqX/run_tellread_sing.sh", "modules_to_load": ' '.join(["singularity_3.6.4"]) } }, @@ -183,7 +186,7 @@ def _generate_job_scripts(self): } }, { - "template": "", + "template": "tellread.sh", "params": { "tellread_map": "/home/qiita_test/qiita-spots/" "tellread_mapping.csv", @@ -201,9 +204,13 @@ def _generate_job_scripts(self): template = self.jinja_env.get_template(script["template"]) params = script["params"] job_script_path = join(self.output_path, script["template"]) + with open(job_script_path, 'w') as f: f.write(template.render(**params)) + if script['template'] == "tellread.sh": + self.job_script_path = job_script_path + def run(self, callback=None): """ Run BCL2Fastq/BCLConvert conversion @@ -212,71 +219,31 @@ def run(self, callback=None): changed. :return: """ - try: - job_info = self.submit_job(self.job_script_path, - exec_from=self.log_path, - callback=callback) - except JobFailedError as e: - # When a job has failed, parse the logs generated by this specific - # job to return a more descriptive message to the user. - info = self.parse_logs() - # prepend just the message component of the Error. - info.insert(0, str(e)) - raise JobFailedError('\n'.join(info)) - logging.info(f'Successful job: {job_info}') + # Unlike other Jobs that submit a Slurm script and wait for the job + # to complete, this Job will execute the tellread.sh shell script. + # It is this script that does all of the Slurm job creation. This Job + # will need another means to tell when a job has completed + # successfully. - def parse_logs(self): - log_path = join(self.output_path, 'Logs') - errors = join(log_path, 'Errors.log') + command = ("./tellread.sh -s /sequencing/igm_runs/240216_LH00444" + "_0058_A22357VLT4 -i ./samplesheet.csv -l L008 -m " + "metagenomic") - msgs = [] + if self.job_script_path: + res = self._system_call(command) + else: + raise PipelineError("tellread.sh script could not be found.") - if not exists(errors): - # we do not raise an Error in this case because it's expected that - # parse_logs() will be called in response to an exceptional - # condition. 
- msgs.append(f"'{errors} does not exist") + if res['return_code'] != 0: + raise PipelineError("tellread.sh script did not execute correctly") - with open(errors, 'r') as f: - lines = f.readlines() - for line in [x.strip() for x in lines]: - msgs.append(line) + # res['stdout'] + # res['stderr'] - return msgs + def parse_logs(self): + raise PipelineError("parsing logs not implemented.") @staticmethod def parse_job_script(job_script_path): - # Returns run-directory and sample-sheet path from a job-script. - - if not exists(job_script_path): - raise ValueError(f"'{job_script_path}' is not a valid path") - - with open(job_script_path, 'r') as f: - lines = f.readlines() - lines = [x.strip() for x in lines] - - # As this code creates this file, we can expect it to be of a certain - # format. - if lines[0] != '#!/bin/bash': - raise ValueError(f"'{job_script_path}' is not a valid path") - - result = {} - - m = re.match('^cd (.*)$', lines[12]) - - if m: - result['run_directory'] = m.group(1) - else: - raise ValueError("could not detect run_directory in " - f"'{job_script_path}'") - - m = re.match('^bcl-convert --sample-sheet "(.*?)" ', lines[14]) - - if m: - result['sample_sheet_path'] = m.group(1) - else: - raise ValueError("could not detect sample-sheet path in " - f"'{job_script_path}'") - - return result + raise PipelineError("parsing job script not implemented.") diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index f8a2b000..4296abfb 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -60,7 +60,8 @@ fi mkdir -p ${cs} -pushd {{spades_path}} \ +pushd {{spades_path}} +./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index 96673309..e1c2bb40 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -59,7 +59,7 @@ fi mkdir -p ${cs} pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin -# for now don't use {{spades.py}} +# for now don't use spades.py jinja2 variable ./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index 5584b6c0..ac7c6d31 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -1,10 +1,10 @@ #!/bin/bash -samplesheet={{tellread_map}} # previously -i option -seqrunpath={{seqrun_path}} # previously -s option -lane={{lane}} # previously -l option -reference_map={{reference_map}} # previously -r option -reference_base={{reference_base}} # previously -b option -mode={{mode}} $ # previously -m option +samplesheet="{{tellread_map}}" # previously -i option +seqrunpath="{{seqrun_path}}" # previously -s option +lane="{{lane}}" # previously -l option +reference_map="{{reference_map}}" # previously -r option +reference_base="{{reference_base}}" # previously -b option +mode="{{mode}}" $ # previously -m option # preserve error-checking of parameters to preserve as much of the original # script as possible, even though this could be done in python. 
@@ -34,7 +34,7 @@ fi # trim trailing slash # https://stackoverflow.com/a/32845647/19741 -safepath=$(echo ${seqrunpath} | sed 's:/*$::') +safepath=$(echo ${seqrunpath} | sed 's:/*$::') label=$(basename ${safepath}) labeltag=${label}-${tag} output=/panfs/${USER}/${labeltag} diff --git a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch new file mode 100644 index 00000000..7ec58058 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch @@ -0,0 +1,84 @@ +#!/bin/bash -l +#SBATCH -J cs-assemble # cs-assemble +#SBATCH --time 24:00:00 # 24:00:00 +#SBATCH --mem 64G # 64G +#SBATCH -N 1 # 1 +#SBATCH -c 12 # 12 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. +source activate qiime2-2023.5 +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e + +# this gets set in the environment from another script. For now let's +# run with that. +echo $TMPDIR + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=${OUTPUT} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. +mamba activate activate qiime2-2023.5 + +module load gcc_9.3.0 # gcc_9.3.0 + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) + +# assumes 1-based array index, eg --array 1-N +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +cs=${base}/cloudspades-isolate/${sample} + +if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${cs} ]]; then + rm -fr ${cs} + fi +fi + +mkdir -p ${cs} + +pushd ~/spades-cloudspades-paper/assembler/ +./spades.py \ + -o ${cs} \ + --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 +module unload gcc_9.3.0 +popd + +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. +mamba activate quast + +quast \ + -o ${cs}/quast-scaffolds \ + -t ${SLURM_JOB_CPUS_PER_NODE} \ + ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 + +# remove intermediates that currently dont have a downstream use +if [[ -d ${cs}/K21 ]]; then + rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch new file mode 100644 index 00000000..d16dc2b0 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch @@ -0,0 +1,81 @@ +#!/bin/bash -l +#SBATCH -J cs-assemble # cs-assemble +#SBATCH --time 24:00:00 # 24:00:00 +#SBATCH --mem 128G # 128G +#SBATCH -N 1 # 1 +#SBATCH -c 12 # 12 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. 
+#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. +source activate qiime2-2023.5 +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e + +echo $TMPDIR + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=${OUTPUT} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. +mamba activate activate qiime2-2023.5 + +module load gcc_9.3.0 # gcc_9.3.0 + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) + +# assumes 1-based array index, eg --array 1-N +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +cs=${base}/cloudspades/${sample} + +if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${cs} ]]; then + rm -fr ${cs} + fi +fi + +mkdir -p ${cs} +pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin + +# for now don't use spades.py jinja2 variable +./spades.py \ + -o ${cs} \ + --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + --meta \ + -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 +module unload gcc_9.3.0 +popd + +mamba activate quast +quast \ + -o ${cs}/quast-scaffolds \ + -t ${SLURM_JOB_CPUS_PER_NODE} \ + ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 + +# remove intermediates that currently dont have a downstream use +if [[ -d ${cs}/K21 ]]; then + rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch new file mode 100644 index 00000000..6947c226 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch @@ -0,0 +1,125 @@ +#!/bin/bash -l +#SBATCH -J integrate # integrate +#SBATCH --time 24:00:00 # 24:00:00 +#SBATCH --mem 8G # 8G +#SBATCH -N 1 # 1 +#SBATCH -c 1 # 1 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. +source activate rust +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + + +# https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html +cores=${SLURM_CPUS_PER_TASK} + +if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then + echo "Not operating in an array" + exit 1 +fi + +if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then + echo "Line extraction assumes 1-based index" + exit 1 +fi + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +if [[ -z ${BASE} ]]; then + echo "BASE not specified" + exit 1 +fi + +tellread=${OUTPUT} +if [[ ! 
-d ${tellread} ]]; then + echo "${tellread} not found" + exit 1 +fi + +set -x +set -e +set -o pipefail + +samples=($(cat ${tellread}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR=$(mktemp -d) +function cleanup { + echo "Removing $TMPDIR" + rm -r $TMPDIR + unset TMPDIR +} +trap cleanup EXIT + +files=${TMPDIR}/integration.files +/bin/ls -1 ${tellread}/Full/*corrected.err_barcode_removed.fastq > ${files} +mkdir -p ${tellread}/integrated + +if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R1" + exit 1 +fi + +if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R2" + exit 1 +fi + +if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} I1" + exit 1 +fi + +r1=$(grep -m 1 "_R1_${sample}" ${files}) +r2=$(grep -m 1 "_R2_${sample}" ${files}) +i1=$(grep -m 1 "_I1_${sample}" ${files}) +r1out=${tellread}/integrated/${sample}.R1.fastq.gz +r2out=${tellread}/integrated/${sample}.R2.fastq.gz +i1out=${tellread}/integrated/${sample}.I1.fastq.gz + +if [[ ! -s ${r1} ]]; then + echo "${r1} is empty, cannot integrate" + if [[ -s ${r2} ]]; then + echo "R1 and R2 are inconsistent" + exit 1 + fi + if [[ -s ${i1} ]]; then + echo "R1 and I1 are inconsistent" + exit 1 + fi + + # reflect the empties so Qiita can know of them + touch ${r1out} + touch ${r2out} + touch ${i1out} + exit 0 +fi + +# this can probably be backgrounded but then you have to get creative to +# not mask a nonzero exit status (e.g., the python process raising) +cat ${i1} | gzip > ${i1out} + +mamba activate tellread-integrate +python ${BASE}/integrate-indices-np.py integrate \ + --no-sort \ + --r1-in ${r1} \ + --r2-in ${r2} \ + --i1-in ${i1} \ + --r1-out ${r1out} \ + --r2-out ${r2out} \ + --threads ${cores} \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch new file mode 100644 index 00000000..6a23331e --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch @@ -0,0 +1,62 @@ +#!/bin/bash -l +#SBATCH -J tellink-isolate # tellink-isolate +#SBATCH -N 1 # 1 +#SBATCH -c 16 # 16 +#SBATCH --mem 160G # 160G +#SBATCH --time 96:00:00 # 96:00:00 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +set -x +set -e + +module load singularity_3.6.4 # singularity_3.6.4 + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=/panfs/qiita/TELLREAD/${LABELTAG} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +k=79 +lc=35 +cores=${SLURM_CPUS_PER_TASK} + +tl=${base}/tell-link-isolate/${sample} +if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${tl} ]]; then + rm -fr ${tl} + fi +fi + +mkdir -p ${tl} + +/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh \ + -r1 ${base}/integrated/${sample}.R1.fastq.gz \ + -r2 ${base}/integrated/${sample}.R2.fastq.gz \ + -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -k ${k} \ + -lc ${lc} \ + -p ${sample} \ + -j ${cores} + +# remove temporary data +if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch new file mode 100644 index 00000000..b6033b24 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch @@ -0,0 +1,64 @@ +#!/bin/bash -l +#SBATCH -J tellink # tellink +#SBATCH --mem 160G # 160G +#SBATCH -N 1 # 1 +#SBATCH -c 16 # 16 +#SBATCH --time 96:00:00 # 96:00:00 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +set -x +set -e + +module load singularity_3.6.4 # singularity_3.6.4 + +if [[ -z "${LABELTAG}" ]]; then + echo "LABEL is not specified" + exit 1 +fi + +base=/panfs/${USER}/${LABELTAG} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +# leave these hardcoded for now +k=79 +lc=35 +cores=${SLURM_CPUS_PER_TASK} + +tl=${base}/tell-link/${sample} +if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${tl} ]]; then + rm -fr ${tl} + fi +fi + +mkdir -p ${tl} + +/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh \ + -r1 ${base}/integrated/${sample}.R1.fastq.gz \ + -r2 ${base}/integrated/${sample}.R2.fastq.gz \ + -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -d metagenomics \ + -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -k ${k} \ + -lc ${lc} \ + -p ${sample} \ + -j ${cores} + +# remove temporary data +if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +fi diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch new file mode 100644 index 00000000..56bc3360 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch @@ -0,0 +1,23 @@ +#!/bin/bash -l +#SBATCH -J cleanup # cleanup +#SBATCH --time 24:00:00 # 24:00:00 +#SBATCH --mem 8G # 8G +#SBATCH -N 1 # 1 +#SBATCH -c 1 # 1 +#SBATCH -p qiita # qiita + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=BEGIN,FAIL + +# for now these can be left hard-coded. 
+#SBATCH --output %x-%A.out +#SBATCH --error %x-%A.err + +if [[ -z "${OUTPUT}" ]]; then + echo "OUTPUT is not specified" + exit 1 +fi + +# remove unused large outputs +rm -rf ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch new file mode 100644 index 00000000..ab0647f8 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch @@ -0,0 +1,108 @@ +#!/bin/bash -l +#SBATCH -J tellread # tellread +#SBATCH -p qiita # qiita +#SBATCH -N 1 # 1 +#SBATCH -c 4 # 4 +#SBATCH --mem 16G # 16G +#SBATCH --time 96:00:00 # 96:00:00 + +# for now these can be left hard-coded. +#SBATCH --partition=short +#SBATCH --output %x-%A.out +#SBATCH --error %x-%A.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=BEGIN,FAIL + +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x + +if [[ -z "${N_SAMPLES}" ]]; then + echo "N_SAMPLES is not specified" + exit 1 +fi + +if [[ -z "${SEQRUNPATH}" ]]; then + echo "SEQRUNPATH is not specified" + exit 1 +fi + +if [[ -z "${LANE}" ]]; then + echo "LANE is not specified" + exit 1 +fi + +if [[ -z "${SAMPLES}" ]]; then + echo "SAMPLES is not specified" + exit 1 +fi + +if [[ -z "${REFS}" ]]; then + echo "REFS is not specified" + exit 1 +fi + +if [[ -z "${OUTPUT}" ]]; then + echo "OUTPUT is not specified" + exit 1 +fi + +export TMPDIR="/panfs/${USER}/tmp" +mkdir -p ${TMPDIR} +export TMPDIR=$(mktemp -d) +seqrun_path=${SEQRUNPATH} + +if [[ ${LANE} == "L001" ]]; then + lane=s_1 +elif [[ ${LANE} == "L002" ]]; then + lane=s_2 +elif [[ ${LANE} == "L003" ]]; then + lane=s_3 +elif [[ ${LANE} == "L004" ]]; then + lane=s_4 +elif [[ ${LANE} == "L005" ]]; then + lane=s_5 +elif [[ ${LANE} == "L006" ]]; then + lane=s_6 +elif [[ ${LANE} == "L007" ]]; then + lane=s_7 +elif [[ ${LANE} == "L008" ]]; then + lane=s_8 +else + echo "Unrecognized lane: ${LANE}" + exit 1 +fi + +# yes, hard coded, not great but progress. +extra="" +if [[ ! 
-z ${REFBASE} ]]; then + extra="-f ${REFBASE}" +fi + +mkdir -p ${OUTPUT} + +module load singularity_3.6.4 # singularity_3.6.4 +$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ + -i ${seqrun_path} \ + -o ${OUTPUT} \ + -s $(echo ${SAMPLES} | tr -d '"') \ + -g $(echo ${REFS} | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} \ + ${extra} \ + -l ${lane} + + +if [[ -d ${OUTPUT}/Full ]]; then + echo "Run appears successful" +elif [[ -d ${OUTPUT}/1_demult/Full ]]; then + echo "Run appears unsuccessful but has output" + exit 1 +else + echo "Run appears unsuccessful" + exit 1 +fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh new file mode 100644 index 00000000..90b4e1ce --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh @@ -0,0 +1,236 @@ +#!/bin/bash +samplesheet="/home/qiita_test/qiita-spots/tellread_mapping.csv" # previously -i option +seqrunpath="/sequencing/igm_runs/240216_LH00444_0058_A22357VLT4" # previously -s option +lane="L008" # previously -l option +reference_map="" # previously -r option +reference_base="" # previously -b option +mode="metagenomic" $ # previously -m option + +# preserve error-checking of parameters to preserve as much of the original +# script as possible, even though this could be done in python. + +# https://unix.stackexchange.com/a/621007 +: ${seqrunpath:?Missing -s} +: ${lane:?Missing -i} + +if [[ ! -z ${reference_map} || ! -z ${reference_base} ]]; then + if [[ -z ${reference_map} ]]; then + echo "-b used without -r" + exit 1 + fi + if [[ -z ${reference_base} ]]; then + echo "-r used without -b" + exit 1 + fi + if [[ ! -d ${reference_base} ]]; then + echo "reference base not found" + exit 1 + fi + + tag=reference-based +else + tag=reference-free +fi + +# trim trailing slash +# https://stackoverflow.com/a/32845647/19741 +safepath=$(echo ${seqrunpath} | sed 's:/*$::') +label=$(basename ${safepath}) +labeltag=${label}-${tag} +output=/panfs/${USER}/${labeltag} + +if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then + echo "Cannot access the lane" + exit 1 +fi + +# for now this can stay here to keep greater compatibility with the original script. +# however these fields should eventually be parameters that can be configured in the config file. + +if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then + sbatch_cores=2 + sbatch_mem=8G + norm=TRUE + wall=24:00:00 + mode=NA +elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then + sbatch_cores=2 + sbatch_mem=8G + norm=TRUE + wall=24:00:00 + mode=NA +else + sbatch_cores=16 + sbatch_mem=160G + norm=FALSE + assemble=TRUE + wall=48:00:00 +fi + +if [[ ${mode} == "isolate" ]]; then + ISOLATE_MODE=TRUE +elif [[ ${mode} == "metagenomic" ]]; then + ISOLATE_MODE=FALSE +elif [[ ${mode} == "NA" ]]; then + ISOLATE_MODE=FALSE +else + echo "unknown mode: ${mode}" + exit 1 +fi + +set -e +set -o pipefail + +declare -a s +declare -a g +# below extended regex might be broken because C5\d\d happens in column 0, not column 1 +# of the hacked sample-sheet. +for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) +do + echo "sample found: ${sample}" + # get references if they exist + if [[ -f ${reference_map} ]]; then + if $(grep -Fq ${sample} ${reference_map}); then + ref=$(grep -m 1 ${sample} ${reference_map} | cut -f 2 -d"," | tr -d "\n") + if [[ ${ref} != "NONE" ]]; then + if [[ ! 
-d "${reference_base}/${ref}" ]]; then + echo "${reference_base}/${ref}" + echo "${ref} not found" + exit 1 + fi + g[${#g[@]}]=${ref} + s[${#s[@]}]=${sample} + fi + fi + else + g[${#g[@]}]=NONE + s[${#s[@]}]=${sample} + fi +done +n_samples=${#s[@]} + +# https://stackoverflow.com/a/17841619/19741 +function join_by { local IFS="$1"; shift; echo "$*"; } +s=$(join_by , "${s[@]}") +g=$(join_by , "${g[@]}") + +base=$(dirname ${0}) +submit_script=$(dirname ${0})/tellread.sbatch +integrate_script=$(dirname ${0})/integrate.sbatch +norm_script=$(dirname ${0})/compute_sequence_counts_for_normalization.sbatch +asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch +clean_script=$(dirname ${0})/tellread-cleanup.sbatch + +if [[ ${ISOLATE_MODE} == "TRUE" ]]; then + asm_tellink_script=$(dirname ${0})/telllink-isolate.sbatch + asm_cloudspades_script=$(dirname ${0})/cloudspades-isolate.sbatch +else + asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch + asm_tellink_script=$(dirname ${0})/telllink.sbatch +fi + +if [[ ! -f ${submit_script} ]]; then + echo "Cannot access submit script" + exit 1 +fi +if [[ ! -f ${asm_cloudspades_script} ]]; then + echo "Cannot access cloudspades assembly script" + exit 1 +fi +if [[ ! -f ${asm_tellink_script} ]]; then + echo "Cannot access tell-link assembly script" + exit 1 +fi +if [[ ! -f ${integrate_script} ]]; then + echo "Cannot access integrate script" + exit 1 +fi +if [[ ! -f ${clean_script} ]]; then + echo "Cannot access clean script" + exit 1 +fi + +datetag=$(date "+%Y.%m.%d") +scriptcopy=$(pwd)/tellread_script-${datetag}.sh +submitcopy=$(pwd)/tellread_submission-${datetag}.sbatch +asmcscopy=$(pwd)/assembly_submission_cloudspades-${datetag}.sbatch +asmtlcopy=$(pwd)/assembly_submission_tell-link-${datetag}.sbatch +normcopy=$(pwd)/norm_submission-${datetag}.sbatch +intcopy=$(pwd)/integrate_submission-${datetag}.sbatch +cleancopy=$(pwd)/tellread-cleanup-${datetag}.sbatch +arguments=$(pwd)/provided_script_arguments.txt +if [[ -f ${scriptcopy} ]]; then + echo "Existing script copy ${scriptcopy} found, not overwriting, delete to resubmit" + exit 1 +fi +if [[ -f ${submitcopy} ]]; then + echo "Existing submission ${submitcopy} found, not overwriting, delete to resubmit" + exit 1 +fi + +echo $@ > ${arguments} +cp ${0} ${scriptcopy} +cp ${submit_script} ${submitcopy} +cp ${asm_cloudspades_script} ${asmcscopy} +cp ${asm_tellink_script} ${asmtlcopy} +cp ${integrate_script} ${intcopy} +cp ${clean_script} ${cleancopy} +chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cleancopy} + +set -x + +trjob=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag} \ + -c ${sbatch_cores} \ + --mem ${sbatch_mem} \ + --time ${wall} \ + --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ + ${submit_script}) + +if [[ ${norm} == "TRUE" ]]; then + cp ${norm_script} ${normcopy} + chmod gou-w ${normcopy} + norm_counts_job=$(sbatch \ + --parsable \ + --dependency=afterok:${trjob} \ + -J ${labeltag}-${datetag}-norm-counts \ + --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ + ${norm_script}) +fi + +integrate_job=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag}-integrate \ + --dependency=afterok:${trjob} \ + --array 1-${n_samples} \ + --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ + ${integrate_script}) + +if [[ ${assemble} == "TRUE" ]]; then + csj=$(sbatch \ + --parsable \ + 
--dependency=aftercorr:${integrate_job} \ + -J ${labeltag}-${datetag}-cloudspades \ + --array 1-${n_samples} \ + --export LABELTAG=${labeltag},OUTPUT=${output} \ + ${asm_cloudspades_script}) + tlj=$(sbatch \ + --parsable \ + --dependency=aftercorr:${integrate_job} \ + -J ${labeltag}-${datetag}-tell-link \ + --array 1-${n_samples} \ + --export LABELTAG=${labeltag},OUTPUT=${output} \ + ${asm_tellink_script}) + cleanupdep=${csj}:${tlj} +else + cleanupdep=${integrate_job} + echo "Not assembling" +fi + +cleanup=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag}-cleanup \ + --dependency=afterok:${cleanupdep} \ + --export OUTPUT=${output} \ + ${clean_script}) \ No newline at end of file From a6bde1e8232f1442a357f604e5918773e27f647a Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 4 Sep 2024 20:36:42 -0700 Subject: [PATCH 07/47] Fifth pass, tested on qiita-rc and then refactored. --- README.rst | 6 + sequence_processing_pipeline/Job.py | 199 ++++++--- sequence_processing_pipeline/TRConvertJob.py | 398 ++++++++++++++---- .../contrib/create_picklist.py | 65 +++ .../contrib/integrate-indices-np.py | 330 +++++++++++++++ .../contrib/plot_counts.py | 27 ++ .../templates/cloudspades-isolate.sbatch | 55 +-- .../templates/cloudspades.sbatch | 38 +- ...e_sequence_counts_for_normalization.sbatch | 57 +++ .../templates/integrate.sbatch | 18 +- .../templates/telllink-isolate.sbatch | 15 +- .../templates/telllink.sbatch | 18 +- .../templates/tellread-cleanup.sbatch | 9 +- .../templates/tellread.sbatch | 25 +- .../templates/tellread.sh | 34 +- .../20230906_FS10001773_68_BTR67708-1611.csv | 41 ++ 16 files changed, 1072 insertions(+), 263 deletions(-) create mode 100644 sequence_processing_pipeline/contrib/create_picklist.py create mode 100644 sequence_processing_pipeline/contrib/integrate-indices-np.py create mode 100644 sequence_processing_pipeline/contrib/plot_counts.py create mode 100644 sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch create mode 100644 sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv diff --git a/README.rst b/README.rst index c51fcd0c..190ebba4 100644 --- a/README.rst +++ b/README.rst @@ -62,3 +62,9 @@ Please note that the setting 'minimap2_databases' is expected to be a list of pa For NuQCJob, minimap2_databases is expected to be the path to a directory containing two subdirectories: 'metagenomic' and 'metatranscriptomic'. Each directory should contain or symlink to the appropriate .mmi files needed for that Assay type. + +Additional TellSeq-related notes: +'spades-cloudspades-0.1', 'tellread-release-novaseqX' or similar directories must be placed in a location available to SPP. +Their paths should be made known to SPP in the configuration files. (See examples for details). +Additional scripts found in sequence_processing_pipeline/contrib were contributed by Daniel and Omar and can be similarly located and configured. 
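+For example, assuming the directories were staged under one shared resources
+location, the layout might look like the following (illustrative paths only;
+the actual locations are whatever is recorded in your configuration files)::
+
+    /shared/spp-resources/spades-cloudspades-0.1
+    /shared/spp-resources/tellread-release-novaseqX
+    /shared/spp-resources/contrib/integrate-indices-np.py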
+ diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index af04ef9c..035d8ba0 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -9,9 +9,22 @@ import logging from inspect import stack import re +from time import time class Job: + slurm_status_terminated = ['BOOT_FAIL', 'CANCELLED', 'DEADLINE', 'FAILED', + 'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', + 'REVOKED', 'TIMEOUT'] + + slurm_status_successful = ['COMPLETED'] + + slurm_status_running = ['COMPLETING', 'CONFIGURING', 'PENDING', 'REQUEUED', + 'REQUEUE_FED', 'REQUEUE_HOLD', 'RESIZING', + 'RESV_DEL_HOLD', 'RUNNING', 'SIGNALING', + 'SPECIAL_EXIT', 'STAGE_OUT', 'STOPPED', + 'SUSPENDED'] + def __init__(self, root_dir, output_path, job_name, executable_paths, max_array_length, modules_to_load=None): """ @@ -191,53 +204,13 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} - def submit_job(self, script_path, job_parameters=None, - script_parameters=None, wait=True, - exec_from=None, callback=None): - """ - Submit a Torque job script and optionally wait for it to finish. - :param script_path: The path to a Torque job (bash) script. - :param job_parameters: Optional parameters for scheduler submission. - :param script_parameters: Optional parameters for your job script. - :param wait: Set to False to submit job and not wait. - :param exec_from: Set working directory to execute command from. - :param callback: Set callback function that receives status updates. - :return: Dictionary containing the job's id, name, status, and - elapsed time. Raises PipelineError if job could not be submitted or - if job was unsuccessful. - """ - if job_parameters: - cmd = 'sbatch %s %s' % (job_parameters, script_path) - else: - cmd = 'sbatch %s' % (script_path) - - if script_parameters: - cmd += ' %s' % script_parameters - - if exec_from: - cmd = f'cd {exec_from};' + cmd - - logging.debug("job scheduler call: %s" % cmd) - - if self.force_job_fail: - raise JobFailedError("This job died.") - - # if system_call does not raise a PipelineError(), then the scheduler - # successfully submitted the job. In this case, it should return - # the id of the job in stdout. 
- results = self._system_call(cmd) - stdout = results['stdout'] - - job_id = stdout.strip().split()[-1] - + def _wait_on_job(self, job_id, callback=None): job_info = {'job_id': None, 'job_name': None, 'job_state': None, 'elapsed_time': None} - # Just to give some time for everything to be set up properly - sleep(10) exit_count = 0 - while wait: + while True: result = self._system_call(f"sacct -P -n --job {job_id} --format " "JobID,JobName,State,Elapsed,ExitCode") @@ -287,28 +260,52 @@ def submit_job(self, script_path, job_parameters=None, sleep(10) - if job_info['job_id'] is not None: - # job was once in the queue - if callback is not None: - callback(jid=job_id, status=job_info['job_state']) - - if set(states) == {'COMPLETED'}: - if 'exit_status' in job_info: - if set(estatuses) == {'0:0'}: - # job completed successfully - return job_info - else: - exit_status = job_info['exit_status'] - raise JobFailedError(f"job {job_id} exited with exit_" - f"status {exit_status}") - else: - # with no other info, assume job completed successfully - return job_info - else: - # job exited unsuccessfully - raise JobFailedError(f"job {job_id} exited with status " - f"{job_info['job_state']}") + return job_info, states, estatuses + + def submit_job(self, script_path, job_parameters=None, + script_parameters=None, exec_from=None, callback=None): + """ + Submit a Torque job script and optionally wait for it to finish. + :param script_path: The path to a Torque job (bash) script. + :param job_parameters: Optional parameters for scheduler submission. + :param script_parameters: Optional parameters for your job script. + :param exec_from: Set working directory to execute command from. + :param callback: Set callback function that receives status updates. + :return: Dictionary containing the job's id, name, status, and + elapsed time. Raises PipelineError if job could not be submitted or + if job was unsuccessful. + """ + if job_parameters: + cmd = 'sbatch %s %s' % (job_parameters, script_path) else: + cmd = 'sbatch %s' % (script_path) + + if script_parameters: + cmd += ' %s' % script_parameters + + if exec_from: + cmd = f'cd {exec_from};' + cmd + + logging.debug("job scheduler call: %s" % cmd) + + if self.force_job_fail: + raise JobFailedError("This job died.") + + # if system_call does not raise a PipelineError(), then the scheduler + # successfully submitted the job. In this case, it should return + # the id of the job in stdout. + results = self._system_call(cmd) + stdout = results['stdout'] + + job_id = stdout.strip().split()[-1] + + # Just to give some time for everything to be set up properly + sleep(10) + + job_info, states, estatuses = self._wait_on_job(job_id, + callback=callback) + + if job_info['job_id'] is None: # job was never in the queue - return an error. 
if callback is not None: callback(jid=job_id, status='ERROR') @@ -316,6 +313,82 @@ def submit_job(self, script_path, job_parameters=None, raise JobFailedError(f"job {job_id} never appeared in the " "queue.") + # job was once in the queue + if callback is not None: + callback(jid=job_id, status=job_info['job_state']) + + if set(states) == {'COMPLETED'}: + if 'exit_status' in job_info: + if set(estatuses) == {'0:0'}: + # job completed successfully + return job_info + else: + exit_status = job_info['exit_status'] + raise JobFailedError(f"job {job_id} exited with exit_" + f"status {exit_status}") + else: + # with no other info, assume job completed successfully + return job_info + else: + # job exited unsuccessfully + raise JobFailedError(f"job {job_id} exited with status " + f"{job_info['job_state']}") + + def _wait_on_job_ids(self, job_ids, timeout_in_seconds=None): + """ + Wait on a list of known Slurm job-ids. + :param job_ids: A list of Slurm job-ids + :param timeout_in_seconds: Abort and raise an Error after n seconds. + :return: A list of strings, representing the state of each job. + """ + + # this method is useful for wrapping scripts that spawn child jobs and + # the user wishes to wait until they are all completed before + # continuing. + if not isinstance(job_ids, list): + raise ValueError("job_ids must be a list of valid slurm job ids") + + if set([isinstance(x, int) for x in job_ids]) != {True}: + raise ValueError("job_ids must contain integers") + + if timeout_in_seconds: + if not isinstance(timeout_in_seconds, int): + raise ValueError("timeout_in_seconds must be an integer") + + if timeout_in_seconds < 1: + raise ValueError("timeout_in_seconds must be greater than 0") + + start_time = time() + while True: + if timeout_in_seconds: + if time() - start_time > timeout_in_seconds: + raise PipelineError("timeout reached while waiting for " + "jobs") + + job_states = [] + for job_id in job_ids: + # NB: sacct can support querying on multiple job-ids at once. + # However, this would require extensive rewriting and testing + # of the existing code. Deferring for now. + _, states, _ = self._wait_on_job(job_id) + job_states.append(set(states)) + + # assuming that a Slurm job will never contain states from both + # terminated and successful, this will generate a list containing + # the current state for each job. + result = [set(x) & set(Job.slurm_status_terminated + + Job.slurm_status_successful) for x in job_states] + + if set([bool(x) for x in result]) == {True}: + # all jobs are no longer in a running state. + break + + sleep(10) + + # return the current state of each job. Assume that each set contains + # only one value. + return [''.join(x) for x in result] + def _group_commands(self, cmds): # break list of commands into chunks of max_array_length (Typically # 1000 for Torque job arrays). 
To ensure job arrays are never more diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 8cc1a14a..98d9c18d 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,8 +1,11 @@ from jinja2 import BaseLoader, TemplateNotFound, Environment -from os.path import join, exists, getmtime +from os.path import split, join, exists, getmtime from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import PipelineError import pathlib +from os import rename, walk, chmod, listdir, makedirs +from shutil import move, rmtree +from re import match # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -43,7 +46,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, """ super().__init__(run_dir, output_path, - 'TRConvertJob', + 'ConvertJob', [bcl_tool_path], 1000, modules_to_load=modules_to_load) @@ -60,7 +63,13 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.bcl_tool = bcl_tool_path self.qiita_job_id = qiita_job_id self.suffix = 'fastq.gz' - self.job_script_path = None + + self.tellread_output_path = join(self.output_path, 'output') + makedirs(self.tellread_output_path) + + self.tmp1_path = join(self.tellread_output_path, 'tmp1') + + makedirs(self.tmp1_path) # for projects that use sequence_processing_pipeline as a dependency, # jinja_env must be set to sequence_processing_pipeline's root path, @@ -89,113 +98,186 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self._generate_job_scripts() + # TODO: generate a sample-mapping to map C#s to fake sample-names and + # fake projects. Process sample-sheet later. + self.mapping = self._generate_sample_mapping() + + # TODO: hardcode lane at 'L001' + self.lane = 'L001' + + self.clean_wall_time_limit = "24:00:00" + self.clean_mem_in_gb = "8" + self.clean_node_count = "1" + self.clean_cores_per_task = "1" + self.cloudspades_cores_per_task = "12" + self.cloudspades_mem_in_gb = "128" + self.cloudspades_modules = ["gcc_9.3.0"] + self.cloudspades_node_count = "1" + self.cloudspades_path = "/home/qiita_test/qiita-spots/spades-cloudspades-0.1" + self.cloudspades_wall_time_limit = "24:00:00" + self.counts_cores_per_task = "1" + self.counts_create_picklist_path = "/home/qiita_test/qiita-spots/create_picklist.py", + self.counts_mem_in_gb = "8" + self.counts_node_count = "1" + self.counts_other_file = '20230906_FS10001773_68_BTR67708-1611.read_counts.tsv' + self.counts_plot_counts_path = "/home/qiita_test/qiita-spots/plot_counts.py" + self.counts_sample_sheet = "/home/qiita_test/qiita-spots/20230906_FS10001773_68_BTR67708-1611.csv" + self.counts_wall_time_limit = "24:00:00" + self.cs_isolate_mem_in_gb = "64" + self.integrate_indicies_script_path = "/home/qiita_test/qiita-spots/integrate-indices-np.py" + self.integrate_mem_in_gb = "8" + self.integrate_node_count = "1" + self.integrate_wall_time_limit = "24:00:00" + self.integrate_cores_per_task = "1" + self.queue_name = "qiita" + self.tellink_cores_per_task = "16" + self.tellink_mem_in_gb = "160" + self.tellink_modules = ["singularity_3.6.4"] + self.tellink_node_count = "1" + self.tellink_sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" + self.tellink_wall_time_limit = "96:00:00" + self.tellread_cores_per_task = "4" + self.tellread_mem_in_gb = "16" + self.tellread_modules = ["singularity_3.6.4"] + 
self.tellread_node_count = "1"
+        self.tellread_sing_script_path = "$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh"
+        self.tellread_wall_time_limit = "96:00:00"
+        self.tl_cores_per_task = "16"
+        self.tl_isolate_node_count = "1"
+        self.tl_isolate_wall_time_limit = "96:00:00"
+        self.tl_mem_in_gb = "160"
+        self.main_map = "/home/qiita_test/qiita-spots/20230906_FS10001773_68_BTR67708-1611.csv"
+        self.main_mode = "metagenomic"
+        self.main_seqrun_path = "/sequencing/seqmount/KL_iSeq_Runs/20230906_FS10001773_68_BTR67708-1611"
+
+        # TODO: Address reference_map and reference_base
+        self.main_reference_base = ""
+        self.main_reference_map = ""
+
     def _generate_job_scripts(self):
         scripts = [
             {
                 "template": "cloudspades.sbatch",
                 "params": {
                     "job_name": "cs-assemble",
-                    "wall_time_limit": "24:00:00",
-                    "mem_in_gb": "128",
-                    "node_count": "1",
-                    "cores_per_task": "12",
-                    "queue_name": "qiita",
-                    "modules_to_load": ' '.join(["gcc_9.3.0"]),
-                    "spades_path": "TBD"
+                    "wall_time_limit": self.wall_time_limit,
+                    "mem_in_gb": self.cloudspades_mem_in_gb,
+                    "node_count": self.cloudspades_node_count,
+                    "cores_per_task": self.cloudspades_cores_per_task,
+                    "queue_name": self.queue_name,
+                    "modules_to_load": ' '.join(self.cloudspades_modules),
+                    "cloudspades_path": self.cloudspades_path
                 }
             },
             {
                 "template": "cloudspades-isolate.sbatch",
                 "params": {
                     "job_name": "cs-assemble",
-                    "wall_time_limit": "24:00:00",
-                    "mem_in_gb": "64",
-                    "node_count": "1",
-                    "cores_per_task": "12",
-                    "queue_name": "qiita",
-                    "modules_to_load": ' '.join(["gcc_9.3.0"]),
-                    "spades_path": "~/spades-cloudspades-paper/assembler/"
+                    "wall_time_limit": self.cloudspades_wall_time_limit,
+                    "mem_in_gb": self.cs_isolate_mem_in_gb,
+                    "node_count": self.cloudspades_node_count,
+                    "cores_per_task": self.cloudspades_cores_per_task,
+                    "queue_name": self.queue_name,
+                    "modules_to_load": ' '.join(self.cloudspades_modules),
+                    "cloudspades_path": self.cloudspades_path
                 }
             },
             {
                 "template": "integrate.sbatch",
                 "params": {
                     "job_name": "integrate",
-                    "wall_time_limit": "24:00:00",
-                    "mem_in_gb": "8",
-                    "node_count": "1",
-                    "cores_per_task": "1",
-                    "queue_name": "qiita"
+                    "wall_time_limit": self.integrate_wall_time_limit,
+                    "mem_in_gb": self.integrate_mem_in_gb,
+                    "node_count": self.integrate_node_count,
+                    "cores_per_task": self.integrate_cores_per_task,
+                    "iinp_script_path": self.integrate_indicies_script_path,
+                    "queue_name": self.queue_name
+                }
+            },
+            {
+                "template": "compute_sequence_counts_for_normalization.sbatch",
+                "params": {
+                    "job_name": "norm",
+                    "wall_time_limit": self.counts_wall_time_limit,
+                    "mem_in_gb": self.counts_mem_in_gb,
+                    "node_count": self.counts_node_count,
+                    "cores_per_task": self.counts_cores_per_task,
+                    "sample_sheet": self.counts_sample_sheet,
+                    "plot_counts_path": self.counts_plot_counts_path,
+                    "output_path": self.tellread_output_path,
+                    "create_picklist_path": self.counts_create_picklist_path,
+                    "read_counts_path": join(self.tellread_output_path, self.counts_other_file),
+                    "queue_name": self.queue_name
                 }
             },
             {
                 "template": "telllink.sbatch",
                 "params": {
                     "job_name": "tellink",
-                    "wall_time_limit": "96:00:00",
-                    "mem_in_gb": "160",
-                    "node_count": "1",
-                    "cores_per_task": "16",
-                    "queue_name": "qiita",
-                    "modules_to_load": ' '.join(["singularity_3.6.4"]),
-                    "sing_path": "/projects/long_read_collab/code/tellseq/"
-                                 "release_v1.11/tellink-release/"
-                                 "run_tellink_sing.sh"
+                    "wall_time_limit": self.tellink_wall_time_limit,
+                    "mem_in_gb": self.tellink_mem_in_gb,
+                    "node_count": self.tellink_node_count,
+                    "cores_per_task": 
self.tellink_cores_per_task, + "queue_name": self.queue_name, + "modules_to_load": ' '.join(self.tellink_modules), + "output_path": self.tellread_output_path, + "sing_path": self.tellink_sing_path } }, { "template": "telllink-isolate.sbatch", "params": { "job_name": "tellink-isolate", - "wall_time_limit": "96:00:00", - "node_count": "1", - "cores_per_task": "16", - "mem_in_gb": "160", - "queue_name": "qiita", - "modules_to_load": ' '.join(["singularity_3.6.4"]), - "sing_path": "/projects/long_read_collab/code/tellseq/" - "release_v1.11/tellink-release/" - "run_tellink_sing.sh" + "wall_time_limit": self.tellink_wall_time_limit, + "node_count": self.tl_isolate_node_count, + "cores_per_task": self.tl_cores_per_task, + "mem_in_gb": self.tl_mem_in_gb, + "queue_name": self.queue_name, + "modules_to_load": ' '.join(self.tellink_modules), + "output_path": self.tellread_output_path, + "sing_path": self.tellink_sing_path } }, { "template": "tellread.sbatch", "params": { "job_name": "tellread", - "wall_time_limit": "96:00:00", - "mem_in_gb": "16", - "node_count": "1", - "tmp_dir": "/panfs/${USER}/tmp", - "cores_per_task": "4", - "queue_name": "qiita", - "tellread_sbatch_tmp_dir": "/panfs/${USER}/tmp", - "sing_script_path": "$HOME/qiita-spots/tellread-release" - "-novaseqX/run_tellread_sing.sh", - "modules_to_load": ' '.join(["singularity_3.6.4"]) + "wall_time_limit": self.tellread_wall_time_limit, + "mem_in_gb": self.tellread_mem_in_gb, + "node_count": self.tellread_node_count, + "tmp_dir": self.tmp1_path, + "cores_per_task": self.tellread_cores_per_task, + "queue_name": self.queue_name, + "sing_script_path": self.tellread_sing_script_path, + "modules_to_load": ' '.join(self.tellread_modules) } }, { "template": "tellread-cleanup.sbatch", "params": { "job_name": "cleanup", - "wall_time_limit": "24:00:00", - "mem_in_gb": "8", - "node_count": "1", - "cores_per_task": "1", - "queue_name": "qiita" + "wall_time_limit": self.clean_wall_time_limit, + "mem_in_gb": self.clean_mem_in_gb, + "node_count": self.clean_node_count, + "cores_per_task": self.clean_cores_per_task, + "queue_name": self.queue_name } }, + # these hardcoded paths for tellread.sh need to be replaced with + # the lane number and run-directory path, and the lane and the + # mode from the user input. Note that we also need to process the + # upcoming sample-sheet in order to generate the mapping we need + # as well. { "template": "tellread.sh", "params": { - "tellread_map": "/home/qiita_test/qiita-spots/" - "tellread_mapping.csv", - "seqrun_path": "/sequencing/igm_runs/" - "240216_LH00444_0058_A22357VLT4", - "lane": 'L008', - "reference_map": "", - "reference_base": "", - "mode": "metagenomic" + "tellread_map": self.main_map, + "seqrun_path": self.main_seqrun_path, + "output_path": self.tellread_output_path, + "lane": self.lane, + "reference_map": self.main_reference_map, + "reference_base": self.main_reference_base, + "mode": self.main_mode } } ] @@ -207,9 +289,8 @@ def _generate_job_scripts(self): with open(job_script_path, 'w') as f: f.write(template.render(**params)) - - if script['template'] == "tellread.sh": - self.job_script_path = job_script_path + # TODO: Change from 777 to something more appropriate. + chmod(job_script_path, 0o777) def run(self, callback=None): """ @@ -221,25 +302,126 @@ def run(self, callback=None): """ # Unlike other Jobs that submit a Slurm script and wait for the job - # to complete, this Job will execute the tellread.sh shell script. - # It is this script that does all of the Slurm job creation. 
This Job
-        # will need another means to tell when a job has completed
-        # successfully.
-
-        command = ("./tellread.sh -s /sequencing/igm_runs/240216_LH00444"
-                   "_0058_A22357VLT4 -i ./samplesheet.csv -l L008 -m "
-                   "metagenomic")
-
-        if self.job_script_path:
-            res = self._system_call(command)
-        else:
+        # to complete, this Job() will execute an existing shell script that
+        # spawns all the jobs that perform the actual work.
+
+        # tellread.sh performs some work that requires it to run on a compute
+        # node. Since Job()s run on the interactive node, an interactive
+        # shell on a compute node must be requested for this script to run on.
+
+        # define 'sjob' here for clarity. This should be more than adequate
+        # resources to run the tellread.sh script and exit as it does not wait
+        # on its children to complete.
+
+        # as with the original scripts, the scripts generated by Jinja2 will
+        # live in the current working directory. Hence, the script will always
+        # exist at ./tellread.sh provided it was created successfully.
+        sjob = "srun -N 1 -n 1 -p qiita --mem 4g --time 1:00:00 --pty bash -l"
+        command = (f"{sjob}; pushd .;cd {self.output_path}; ./tellread.sh; "
+                   "popd; exit")
+
+        if not exists(join(self.output_path, 'tellread.sh')):
             raise PipelineError("tellread.sh script could not be found.")

+        res = self._system_call(command)
+
         if res['return_code'] != 0:
             raise PipelineError("tellread.sh script did not execute correctly")

-        # res['stdout']
-        # res['stderr']
+        # once _system_call() returns and tellread.sh executed correctly, then
+        # a pids file should exist in the output subdirectory.
+        pids_fp = join(self.output_path, 'output', 'pids')
+        if not exists(pids_fp):
+            raise PipelineError("TRConvertJob could not locate a pids file")
+
+        with open(pids_fp, 'r') as f:
+            lines = f.readlines()
+            lines = [x.strip().split(': ') for x in lines]
+            results = {k: v for (k, v) in lines}
+
+        child_processes = [('main tellread', 'TRJOB_RETURN_CODE',
+                            'TRJOB_PID', True),
+                           ('counts', 'NORM_COUNTS_JOB_RETURN_CODE',
+                            'NORM_COUNTS_JOB_PID', False),
+                           ('integrate', 'INTEGRATE_JOB_RETURN_CODE',
+                            'INTEGRATE_JOB_PID', True),
+                           ('csj', 'CSJ_JOB_RETURN_CODE',
+                            'CSJ_JOB_PID', False),
+                           ('tlj', 'TLJ_JOB_RETURN_CODE',
+                            'TLJ_JOB_PID', False),
+                           ('cleanup', 'CLEANUP_JOB_RETURN_CODE',
+                            'CLEANUP_JOB_PID', True)]
+
+        # Iterate through all the TellRead script's known child processes.
+        # Some children will be optional depending on the parameters given,
+        # while others are required. The Job() should immediately raise an
+        # error if any child (optional or not) exits unsuccessfully, however.
+        for name, code, _, is_required in child_processes:
+            if code in results:
+                if results[code] != '0':
+                    raise PipelineError(f"An error ({results[code]}) occurred "
+                                        f"running {name} subprocess")
+            else:
+                if is_required:
+                    raise PipelineError(f"The {name} subprocess did not "
+                                        "execute correctly")
+
+        # Get a list of Slurm job ids that we need to wait on and text
+        # descriptions of what they are.
+        jids = [(results[x[2]], x[0]) for x in child_processes if
+                x[2] in results]
+
+        # ensure the jids are cast to integers before passing them.
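+        # NB: each entry in 'jids' is a (slurm-job-id, description) tuple,
+        # e.g. ('1234567', 'main tellread'); the comprehension below extracts
+        # just the numeric ids for _wait_on_job_ids().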
+ statuses = self._wait_on_job_ids([int(x[0]) for x in jids]) + + for (jid, description), status in zip(jids, statuses): + if status not in Job.slurm_status_successful: + raise PipelineError(f"process '{description}' ({jid}) " + f"failed ({status}") + + # post-process working directory to make it appear like results + # generated by ConvertJob + + integrated_files_path = join(self.output_path, 'output', "integrated") + + if not exists(integrated_files_path): + raise ValueError(f"{integrated_files_path} does not exist") + + # move integrated directory to TRConvertJob directory, co-level with + # output directory. This makes it easier to delete the rest of the + # output that we don't need. + + # move err and out logs into logs subdirectory. + for root, dirs, files in walk(self.output_path): + for _file in files: + _path = join(root, _file) + if _path.endswith('.err'): + move(_path, join(self.output_path, 'logs')) + elif _path.endswith('.out'): + move(_path, join(self.output_path, 'logs')) + # don't go below one level. + break + + # save two logs and move them into standard Job logs directory. + move(join(self.output_path, 'output', 'log'), + join(self.output_path, 'logs')) + move(join(self.output_path, 'output', 'output.log'), + join(self.output_path, 'logs')) + + # rename the files and move them into project directories. + for root, dirs, files in walk(integrated_files_path): + for _file in files: + fastq_file = join(root, _file) + self._post_process_file(fastq_file, self.lane, self.mapping) + + # move project folders from integrated directory to working_dir. + contents = listdir(integrated_files_path) + for name in contents: + move(join(integrated_files_path, name), + self.output_path) + + # delete the original output directory. + rmtree(join(self.output_path, 'output')) def parse_logs(self): raise PipelineError("parsing logs not implemented.") @@ -247,3 +429,59 @@ def parse_logs(self): @staticmethod def parse_job_script(job_script_path): raise PipelineError("parsing job script not implemented.") + + def _post_process_file(self, fastq_file, lane, mapping): + # generate names of the form generated by bcl-convert/bcl2fastq: + # _S#_L00#__001.fastq.gz + # see: + # https://help.basespace.illumina.com/files-used-by-basespace/ + # fastq-files + _dir, _file = split(fastq_file) + + # ex: integrated/C544.R2.fastq.gz + m = match(r"(C5\d\d)\.([R,I]\d)\.fastq.gz", _file) + + if m is None: + raise ValueError(f"The filename '{_file}' is not of a " + "recognizable form") + + adapter_id = m[1] + read_type = m[2] + + if adapter_id not in mapping: + raise ValueError(f"{adapter_id} is not present in mapping") + + sample_name, sample_index, project_name = mapping[adapter_id] + + # generate the new filename for the fastq file, and reorganize the + # files by project. + new_name = "%s_S%d_L%s_%s_001.fastq.gz" % (sample_name, + sample_index, + str(lane).zfill(3), + read_type) + + # ensure that the project directory exists before we rename and move + # the file to that location. + makedirs(join(_dir, project_name), exist_ok=True) + + # if there's an error renaming and moving the file, let it pass up to + # the user. + final_path = join(_dir, project_name, new_name) + rename(fastq_file, final_path) + return final_path + + def _generate_sample_mapping(self): + # this generates a sample mapping for the C501-C596 adapters used by + # the vendor to a sample-name and project. In production use this + # mapping would need to be created from the future sample-sheet. 
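+        # e.g. under this placeholder scheme, sample_mapping['C501'] maps to
+        # ('MySample1', 1, 'Project2'): a (sample-name, sample-index,
+        # project-name) tuple keyed on the vendor adapter id.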
+ project_names = ['Project1', 'Project2', 'Project3'] + sample_mapping = {} + + for sample_index in range(1, 97): + adapter_id = "C%s" % str(sample_index + 500) + sample_name = "MySample%d" % sample_index + project_name = project_names[sample_index % 3] + sample_mapping[adapter_id] = (sample_name, sample_index, + project_name) + + return sample_mapping diff --git a/sequence_processing_pipeline/contrib/create_picklist.py b/sequence_processing_pipeline/contrib/create_picklist.py new file mode 100644 index 00000000..44906872 --- /dev/null +++ b/sequence_processing_pipeline/contrib/create_picklist.py @@ -0,0 +1,65 @@ +import os +from scipy.stats import mannwhitneyu, zscore +from sklearn.linear_model import LogisticRegression +from contextlib import suppress +import pandas as pd +from metapool.metapool import * +from metapool import (make_sample_sheet, requires_dilution, dilute_gDNA, + find_threshold, autopool, extract_stats_metadata) +from sys import argv + +input_sheet_filename = argv[1] +#input_sheet_filename = input_sheet_filename.rsplit('.', 1)[0] + '.read_counts.tsv' +#instead construct the needed path and pass it. + +plate_df_w_reads = pd.read_csv(input_sheet_filename, + sep='\t') +plate_df_w_reads['Blank'] = [True if 'blank' in s.lower() else False + for s in plate_df_w_reads['Sample_Name']] +reads_column = 'read_counts' + +well_col = 'Sample_Well' +assert reads_column in plate_df_w_reads.columns + +f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8)) +# evenness plot +rmax = int(round(plate_df_w_reads[reads_column].max(),-2)) +survival_df = pd.concat([read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] == True, + reads_column], label='Blanks',rmax=rmax), + read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] == False, + reads_column], label='Samples',rmax=rmax)]) + +ax3.set_xlabel(reads_column) +ax3.set_ylabel('Samples') +survival_df.plot(color = ['coral','steelblue'],ax=ax1) +ax1.set_xlabel(reads_column) +ax1.set_ylabel('Samples') + +##Histogram +sns.histplot(plate_df_w_reads[reads_column],ax=ax3) + +#Boxplot +sns.boxplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax = ax4); +sns.stripplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax = ax4, + size=3,color='black',alpha=0.5) + + +plt.tight_layout() +plt.savefig(input_sheet_filename + '.comboplot.pdf') + +#plate_df_w_reads = plate_df_w_reads[plate_df_w_reads[reads_column] > 0] +plate_df_normalized = calculate_iseqnorm_pooling_volumes(plate_df_w_reads,dynamic_range=20, + normalization_column=reads_column) +plt.savefig(input_sheet_filename + '.normalizedplot.pdf') + +vols = make_2D_array(plate_df_normalized, data_col='iSeq normpool volume', well_col=well_col).astype(float) + +# Write the picklist as .csv +picklist_fp = input_sheet_filename + '.picklist.csv' + +if os.path.isfile(picklist_fp): + print("Warning! This file exists already.") + +picklist = format_pooling_echo_pick_list(vols, max_vol_per_well=30000) +with open(picklist_fp,'w') as f: + f.write(picklist) diff --git a/sequence_processing_pipeline/contrib/integrate-indices-np.py b/sequence_processing_pipeline/contrib/integrate-indices-np.py new file mode 100644 index 00000000..9500cff9 --- /dev/null +++ b/sequence_processing_pipeline/contrib/integrate-indices-np.py @@ -0,0 +1,330 @@ +# Why +# 1) cloudspades requires the index reads be inline in the record header +# 2) Ariadne requires the data are sorted by the barcodes +# +# Inlining is easy. 
Sorting is complex as the amount of data is large, and +# the ordering stems is determined external to the data being sorted. To +# determine order, all barcodes must be read in to gather the complete +# barcode <-> record association; if only partial data is read then +# associations to barcodes may be missed, and we cannot perform an insertion sort +# efficiently as we're writing to disk. Once we know an order for the records, +# we (currently) read in the entirety of the subsequent data (R1 then R2), +# reorder, and write. Performing this in blocks to minimize memory may be +# possible, but we have to assume access is random as a grouping barcode +# may be with any record along the file. +# +# A variety of approaches were considered, including: +# - indexing portions in a hashtable, reading inputs multiple times, and +# writing in blocks. This was tested in both rust and python. The amount of +# memory was large, and keeping it under control would be many many many +# passes over data on disk or in memory +# - using pandas to do the grouping, which possibly avoids the memory burden +# of a hashmap. it didn't +# - using mmap files. No go, these are large and we have to walk over them +# a lot. +# +# Parsing this stuff adds a lot of overhead in Python. It will add some, if not +# a lot, in rust as well -- our test data had 65M sequences. So the current +# approach operates in the raw file data itself, using regex's to parse +# individual records. We use numpy for sorting and getting record orders. +# This is memory expensive but so far much less than the other approaches tried +# and it does not require multiple passes over files. We bottleneck on write +# IO, so to mitigate that, we are using a parallel gzip (pgzip), which still +# bottlenecks but gets better throughput. +# +# There probably are smarter ways to do this to reduce the memory burden. +# Right now, it's O(N) where N is the number of records. We load R1 and R2 +# separately though so we at least halve the memory use. As for doing it +# faster, at the moment we appear to saturate time on gzip. Easiest solution +# would be to increase the number of threads, but then again, this process +# is expected to run in an array, and filesystem can only take so much. +# +# In addition to the inline tests, md5 checks to verify all record IDs are +# present in both R1 / R2, and relative to original input. Spot checks on +# an arbitrary set of records were performed on R1 / R2 to verify no apparent +# unusual modification. And spot checks were performed to verify that correct +# barcodes are incorporating as expected in output. +# +# author: Daniel McDonald (d3mcdonald@eng.ucsd.edu) +import numpy as np +import click +import re +import io +import pgzip +import gzip + + +RECORD = re.compile(rb'@\S+\n[ATGCN]+\n\+\n\S+\n') +BARCODE = re.compile(rb'@\S+\n([ATGCN]+)\n\+\n\S+\n') + + +def gather_order(i1_in_fp): + """Determine record order + + This is a fancy way of saying: get all the barcodes, and sort them. + + We return the order of the sorted records, the unique barcodes, + and the bounds for what barcode associated with what record + """ + # determine barcode length + _ = i1_in_fp.readline() + b = i1_in_fp.readline() + rec_len = len(b.strip()) + i1_in_fp.seek(0) + + # we need larger data in memory later anyway... + i1 = i1_in_fp.read() + start = 0 + end = len(i1) + + # get the number of records. 
we completely assume non-multiline fastq here + newlines = i1.count(b'\n') + assert newlines % 4 == 0 + barcodes = np.empty(newlines // 4, dtype='|S%d' % rec_len) + + # walk all index records + # grab each barcode + idx = 0 + while start < end: + barcode_result = BARCODE.search(i1, pos=start) + barcode = barcode_result.groups()[0] + assert len(barcode) == rec_len # get angry if the barcode is weird + + barcodes[idx] = barcode + idx += 1 + start = barcode_result.end() + + # we no longer need the raw data so let's toss it + del i1 + + # determine the record order of a lexicographic sort + # gather the unique barcodes so we can use them later, and the bounding + # points in the sorted set + record_order = barcodes.argsort() + barcodes = barcodes[record_order] + unique_barcodes, barcode_bounds = np.unique(barcodes, return_index=True) + + return record_order, unique_barcodes, barcode_bounds + + +def test_gather_order(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + exp_order = np.array([0, 2, 1, 4, 3, 5, 6]) + exp_unique = np.array([b'ATGC', b'TTGG', b'TTTT']) + exp_bounds = np.array([0, 2, 4]) + + assert (order == exp_order).all() + assert (unique == exp_unique).all() + assert (bounds == exp_bounds).all() + + +def troll_and_write(order, unique, bounds, in_, out_): + """Walk over the raw data, spit out barcode amended records in order + + - read all data + - get index boundaries for each record + - pull out each record in order according to the barcode data + - associate the barcode + - write + """ + + data = in_.read() + boundaries = np.empty([order.size, 2], dtype=np.uint64) + + stop = 0 + for idx in range(order.size): + rec = RECORD.search(data, pos=stop) + start, stop = rec.span() + boundaries[idx] = np.array([start, stop], dtype=np.uint64) + + current_barcode_idx = 0 + current_barcode = unique[current_barcode_idx] + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + for order_idx, record_idx in enumerate(order): + if order_idx >= current_barcode_bound_end: + current_barcode_idx += 1 + + if current_barcode_idx >= bounds.size: + raise ValueError("should not happen?") + current_barcode = unique[current_barcode_idx] + + if current_barcode_idx + 1 >= bounds.size: + # run to the end + current_barcode_bound_end = order.size + else: + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + start, stop = boundaries[record_idx] + record = data[start:stop] + + # in a one-off, these might pass by chance. It would be real weird + # for them to always pass for all records in a large file. 
+ # n.b., b'foo'[0] is int, because yay, so we use a slice to maintain + # a human readable character to test against as most mortals haven't + # memorized the ascii table + assert record[:1] == b'@' + assert record[-1:] == b'\n' + + with_barcode = insert_barcode(record, current_barcode) + out_.write(with_barcode) + + +def test_troll_and_write(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + # we assume records are in the same order, as that has previously been + # observed w/ tellread and is the normal expectation + r1data = [b'@foo', b'AATGC', b'+', b'!!!!!', + b'@bar', b'ATTGG', b'+', b'!!!!!', + b'@baz', b'AATGC', b'+', b'!!!!!', + b'@oof', b'ATTTT', b'+', b'!!!!!', + b'@rab', b'ATTGG', b'+', b'!!!!!', + b'@zab', b'ATTTT', b'+', b'!!!!!', + b'@ofo', b'ATTTT', b'+', b'!!!!!', b''] + r1 = io.BytesIO(b'\n'.join(r1data)) + r1out = io.BytesIO() + troll_and_write(order, unique, bounds, r1, r1out) + r1out.seek(0) + + r1exp = [b'@foo BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@baz BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@bar BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@rab BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@oof BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@zab BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@ofo BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b''] + r1exp = b'\n'.join(r1exp) + assert r1exp == r1out.read() + + +def create_tag(t): + return b'BX:Z:%s-1' % t + + +def create_tag_no_suffix(t): + return b'BX:Z:%s' % t + + +def insert_barcode(record, barcode): + """Get the current ID, smash the needed tag in""" + # @foo\nATGC\n+\n!!!!\n + id_, remainder = record.split(b'\n', 1) + tag = create_tag(barcode) + return b'%s %s\n%s' % (id_, tag, remainder) + + +def readfq(fp): + if fp.mode == 'rb': + strip = bytes.strip + else: + strip = str.strip + + id_ = iter(fp) + seq = iter(fp) + dumb = iter(fp) + qual = iter(fp) + for rec in zip(id_, seq, dumb, qual): + yield list(map(strip, rec)) + + +def writefq(rec, out): + for item in rec: + out.write(item) + out.write(b'\n') + + +@click.group() +def cli(): + pass + + +@cli.command() +def tests(): + test_gather_order() + test_troll_and_write() + + +@cli.command() +@click.option('--r1-in', type=click.Path(exists=True), required=True) +@click.option('--r2-in', type=click.Path(exists=True), required=True) +@click.option('--i1-in', type=click.Path(exists=True), required=True) +@click.option('--r1-out', type=click.Path(exists=False), required=True) +@click.option('--r2-out', type=click.Path(exists=False), required=True) +@click.option('--threads', type=int, required=False, default=1) +@click.option('--no-sort', is_flag=True, default=False) +def integrate(r1_in, r2_in, i1_in, r1_out, r2_out, threads, no_sort): + r1_in_fp = open(r1_in, 'rb') + r2_in_fp = open(r2_in, 'rb') + i1_in_fp = open(i1_in, 'rb') + + if no_sort: + r1_out_fp = gzip.open(r1_out, mode='wb') + r2_out_fp = gzip.open(r2_out, mode='wb') + + r1_sniff = r1_in_fp.readline().strip() + r2_sniff = r2_in_fp.readline().strip() + r1_in_fp.seek(0) + r2_in_fp.seek(0) + + # outputs from tellread don't seem to have orientation information + # some downstream programs hate this, so let's add if needed. 
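+        # e.g. a record id already ending in b'/1' is left as-is, while ids
+        # without a suffix get b'/1' (R1) or b'/2' (R2) appended below.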
+ if r1_sniff.endswith(b'/1'): + if not r2_sniff.endswith(b'/2'): + raise ValueError(f'unexpected endings: {r1_sniff.decode("utf-8")} {r2_sniff.decode("utf-8")}') + orient_r1 = '' + orient_r2 = '' + else: + assert b'/1' not in r1_sniff + + orient_r1 = b'/1' + orient_r2 = b'/2' + + for (r1, r2, i1) in zip(*map(readfq, [r1_in_fp, r2_in_fp, i1_in_fp])): + assert r1[0] == r2[0] + assert r1[0] == i1[0] + + tag = create_tag_no_suffix(i1[1]) + r1[0] = b"%s%s %s" % (r1[0], orient_r1, tag) + r2[0] = b"%s%s %s" % (r2[0], orient_r2, tag) + writefq(r1, r1_out_fp) + writefq(r2, r2_out_fp) + r1_out_fp.close() + r2_out_fp.close() + else: + # 200MB is what they use in their readme... + r1_out_fp = pgzip.open(r1_out, mode='wb', thread=threads, + blocksize=2*10**8) + r2_out_fp = pgzip.open(r2_out, mode='wb', thread=threads, + blocksize=2*10**8) + + order, unique, bounds = gather_order(i1_in_fp) + + for in_, out_ in zip([r1_in_fp, r2_in_fp], [r1_out_fp, r2_out_fp]): + troll_and_write(order, unique, bounds, in_, out_) + in_.close() + out_.close() + + +if __name__ == '__main__': + cli() diff --git a/sequence_processing_pipeline/contrib/plot_counts.py b/sequence_processing_pipeline/contrib/plot_counts.py new file mode 100644 index 00000000..76c822ee --- /dev/null +++ b/sequence_processing_pipeline/contrib/plot_counts.py @@ -0,0 +1,27 @@ +import matplotlib.pyplot as plt +import re +import sys +import os +import pandas as pd + +ex = re.compile(r'_I1_(C5\d\d).fastq.gz.corrected.err_barcode_removed.fastq') + +# remove total line from wc +data = [l.strip().split(' ') for l in open(sys.argv[1])][:-1] +plotdata = [(ex.search(i).groups()[0], int(v) / 4) for v, i in data] +sheetdata = dict(plotdata) + +ordered = sorted(plotdata, key=lambda x: x[1]) +f = plt.figure(figsize=(16, 8)) +plt.bar([i for i, _ in ordered], [v for _, v in ordered]) +plt.ylabel('I1 reads') +plt.xticks(list(range(len(ordered))), [i for i, _ in ordered], rotation=90) +plt.savefig(sys.argv[3] + '/counts.pdf') + +sheet = pd.read_csv(sys.argv[2], dtype=str) +sheet = sheet[~sheet['Lane'].isnull()] +sheet['read_counts'] = [sheetdata[i] for i in sheet['Barcode_ID']] +name = os.path.basename(sys.argv[2]).rsplit('.', 1)[0] +newname = name + '.read_counts.tsv' + +sheet.to_csv(sys.argv[3] + '/' + newname, sep='\t', index=False, header=True) diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 4296abfb..261c11c7 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -1,20 +1,14 @@ #!/bin/bash -l -#SBATCH -J {{job_name}} # cs-assemble -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 64G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 12 -#SBATCH -p {{queue_name}} # qiita +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err +#SBATCH --output cloudspades-isolate_%x-%A_%a.out +#SBATCH --error cloudspades-isolate_%x-%A_%a.err -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. 
source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -24,8 +18,6 @@ function logger () { set -x set -e -# this gets set in the environment from another script. For now let's -# run with that. echo $TMPDIR if [[ -z "${LABELTAG}" ]]; then @@ -39,13 +31,9 @@ if [[ ! -d ${base} ]]; then exit 1 fi -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate activate qiime2-2023.5 - -module load {{modules_to_load}} # gcc_9.3.0 +module load {{modules_to_load}} -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) # assumes 1-based array index, eg --array 1-N sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} @@ -59,8 +47,8 @@ if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then fi mkdir -p ${cs} +pushd {{cloudspades_path}}/assembler/bin -pushd {{spades_path}} ./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ @@ -69,16 +57,15 @@ pushd {{spades_path}} module unload gcc_9.3.0 popd -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate quast - -quast \ - -o ${cs}/quast-scaffolds \ - -t ${SLURM_JOB_CPUS_PER_NODE} \ - ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 +# TODO: Look for alternative method to load quast +#mamba activate quast +#quast \ +# -o ${cs}/quast-scaffolds \ +# -t ${SLURM_JOB_CPUS_PER_NODE} \ +# ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 +# # remove intermediates that currently dont have a downstream use -if [[ -d ${cs}/K21 ]]; then - rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -fi +#if [[ -d ${cs}/K21 ]]; then +# rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +#fi diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index e1c2bb40..636dd5ce 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -6,15 +6,9 @@ #SBATCH -c {{cores_per_task}} # 12 #SBATCH -p {{queue_name}} # qiita -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err +#SBATCH --output cloudspades_%x-%A_%a.out +#SBATCH --error cloudspades_%x-%A_%a.err -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -37,13 +31,9 @@ if [[ ! -d ${base} ]]; then exit 1 fi -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate activate qiime2-2023.5 - -module load {{modules_to_load}} # gcc_9.3.0 +module load {{modules_to_load}} -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) # assumes 1-based array index, eg --array 1-N sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} @@ -57,9 +47,8 @@ if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then fi mkdir -p ${cs} -pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin +pushd {{cloudspades_path}}/assembler/bin -# for now don't use spades.py jinja2 variable ./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ @@ -69,13 +58,14 @@ pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin module unload gcc_9.3.0 popd -mamba activate quast -quast \ - -o ${cs}/quast-scaffolds \ - -t ${SLURM_JOB_CPUS_PER_NODE} \ - ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 +# TODO: Look for alternative method to load quast +#mamba activate quast +#quast \ +# -o ${cs}/quast-scaffolds \ +# -t ${SLURM_JOB_CPUS_PER_NODE} \ +# ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 # remove intermediates that currently dont have a downstream use -if [[ -d ${cs}/K21 ]]; then - rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -fi +#if [[ -d ${cs}/K21 ]]; then +# rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +#fi diff --git a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch new file mode 100644 index 00000000..a4b31114 --- /dev/null +++ b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch @@ -0,0 +1,57 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} # norm +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +#SBATCH --output compute_sequence_counts_%x-%A_%a.out +#SBATCH --error compute_sequence_counts_%x-%A_%a.err + +# NB: output appears normal w/out. +# source activate qiime2-2023.5 + +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e +set -o pipefail + +echo $TMPDIR + +tellread=${TELLREAD_OUTPUT} +if [[ ! -d ${tellread} ]]; then + echo "${tellread} not found" + exit 1 +fi + +if [[ ! -d ${tellread}/Full ]]; then + echo "${tellread}/Full not found" + exit 1 +fi + +if [[ -z {{output_path}} ]]; then + echo "OUTPUT not specified" + exit 1 +fi + +if [[ -z {{sample_sheet}} ]]; then + echo "SAMPLESHEET not specified" + exit 1 +fi + +if [[ ! -f {{sample_sheet}} ]]; then + echo "SAMPLESHEET not found" + exit 1 +fi + +mkdir -p {{output_path}} +wc -l ${tellread}/Full/*_I1_C5[0-9][0-9].fastq.gz.corrected.err_barcode_removed.fastq > {{output_path}}/record_counts.txt +python {{plot_counts_path}} {{output_path}}/record_counts.txt {{sample_sheet}} {{output_path}} + +conda activate qp-knight-lab-processing-2022.03 +python {{create_picklist_path}} {{read_counts_path}} diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index f4161466..30a3a9ba 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -6,22 +6,14 @@ #SBATCH -c {{cores_per_task}} # 1 #SBATCH -p {{queue_name}} # qiita -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err +#SBATCH --output integrate_%x-%A_%a.out +#SBATCH --error integrate_%x-%A_%a.err -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. 
for now we'll leave it hardcoded. -source activate rust function logger () { echo "$(date) :: ${@}"; echo "$(date) :: ${@}" 1>&2; } - # https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html cores=${SLURM_CPUS_PER_TASK} @@ -55,7 +47,7 @@ set -x set -e set -o pipefail -samples=($(cat ${tellread}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${tellread}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} export TMPDIR=$(mktemp -d) @@ -114,8 +106,8 @@ fi # not mask a nonzero exit status (e.g., the python process raising) cat ${i1} | gzip > ${i1out} -mamba activate tellread-integrate -python ${BASE}/integrate-indices-np.py integrate \ +conda activate qp-knight-lab-processing-2022.03 +python {{iinp_script_path}} integrate \ --no-sort \ --r1-in ${r1} \ --r2-in ${r2} \ diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index f842cddf..b8f9d735 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -6,31 +6,26 @@ #SBATCH --time {{wall_time_limit}} # 96:00:00 #SBATCH -p {{queue_name}} # qiita -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL +#SBATCH --output telllink-isolate_%x-%A_%a.out +#SBATCH --error telllink-isolate_%x-%A_%a.err set -x set -e -module load {{modules_to_load}} # singularity_3.6.4 +module load {{modules_to_load}} if [[ -z "${LABELTAG}" ]]; then echo "LABELTAG is not specified" exit 1 fi -base=/panfs/qiita/TELLREAD/${LABELTAG} +base={{output_path}} if [[ ! -d ${base} ]]; then echo "${base} not found" exit 1 fi -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} k=79 diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index 39daa383..234192b2 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -6,34 +6,29 @@ #SBATCH --time {{wall_time_limit}} # 96:00:00 #SBATCH -p {{queue_name}} # qiita -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL +#SBATCH --output telllink_%x-%A_%a.out +#SBATCH --error telllink_%x-%A_%a.err set -x set -e -module load {{modules_to_load}} # singularity_3.6.4 +module load {{modules_to_load}} if [[ -z "${LABELTAG}" ]]; then echo "LABEL is not specified" exit 1 fi -base=/panfs/${USER}/${LABELTAG} +base={{output_path}} if [[ ! 
-d ${base} ]]; then echo "${base} not found" exit 1 fi -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -# leave these hardcoded for now +# TODO: leave these hardcoded for now k=79 lc=35 cores=${SLURM_CPUS_PER_TASK} @@ -62,4 +57,3 @@ mkdir -p ${tl} if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping fi - diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index d5edf855..2cb479e7 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -6,13 +6,8 @@ #SBATCH -c {{cores_per_task}} # 1 #SBATCH -p {{queue_name}} # qiita -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=BEGIN,FAIL - -# for now these can be left hard-coded. -#SBATCH --output %x-%A.out -#SBATCH --error %x-%A.err +#SBATCH --output tellread-cleanup_%x-%A.out +#SBATCH --error tellread-cleanup_%x-%A.err if [[ -z "${OUTPUT}" ]]; then echo "OUTPUT is not specified" diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 89633da9..fe8d39d9 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -1,19 +1,13 @@ #!/bin/bash -l -#SBATCH -J {{job_name}} # tellread -#SBATCH -p {{queue_name}} # qiita -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 4 -#SBATCH --mem {{mem_in_gb}}G # 16G -#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -J {{job_name}} +#SBATCH -p {{queue_name}} +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH --time {{wall_time_limit}} -# for now these can be left hard-coded. -#SBATCH --partition=short -#SBATCH --output %x-%A.out -#SBATCH --error %x-%A.err - -# for now comment these out as qiita is responsible for notifying users. 
-###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=BEGIN,FAIL +#SBATCH --output tellread_%x-%A.out +#SBATCH --error tellread_%x-%A.err function logger () { echo "$(date) :: ${@}"; @@ -86,7 +80,7 @@ fi mkdir -p ${OUTPUT} -module load {{modules_to_load}} # singularity_3.6.4 +module load {{modules_to_load}} {{sing_script_path}} \ -i ${seqrun_path} \ -o ${OUTPUT} \ @@ -96,7 +90,6 @@ module load {{modules_to_load}} # singularity_3.6.4 ${extra} \ -l ${lane} - if [[ -d ${OUTPUT}/Full ]]; then echo "Run appears successful" elif [[ -d ${OUTPUT}/1_demult/Full ]]; then diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index ac7c6d31..d6c61cb0 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -4,7 +4,7 @@ seqrunpath="{{seqrun_path}}" # previously -s option lane="{{lane}}" # previously -l option reference_map="{{reference_map}}" # previously -r option reference_base="{{reference_base}}" # previously -b option -mode="{{mode}}" $ # previously -m option +mode="{{mode}}" # previously -m option # preserve error-checking of parameters to preserve as much of the original # script as possible, even though this could be done in python. @@ -37,7 +37,7 @@ fi safepath=$(echo ${seqrunpath} | sed 's:/*$::') label=$(basename ${safepath}) labeltag=${label}-${tag} -output=/panfs/${USER}/${labeltag} +output={{output_path}} if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then echo "Cannot access the lane" @@ -85,7 +85,12 @@ declare -a s declare -a g # below extended regex might be broken because C5\d\d happens in column 0, not column 1 # of the hacked sample-sheet. -for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) +# for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) + +# new sample-sheet is of form: +# Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Barcode_ID,Sample_Project,Well_description,Lane +# 10283.LS.4.4.2015,10283.LS.4.4.2015,Plate_1,A1,C501,LS_Timeseries_TellSeq_10283,10283.LS.4.4.2015,1 +for sample in $(egrep -o ",C5..," ${samplesheet} | tr -d "," | sort) do echo "sample found: ${sample}" # get references if they exist @@ -168,7 +173,9 @@ if [[ -f ${submitcopy} ]]; then exit 1 fi -echo $@ > ${arguments} +#TODO: Other possible arguments like -r? +echo "-l {{lane}} -s {{seqrun_path}} -i {{tellread_map}} -m {{mode}}" >${arguments} + cp ${0} ${scriptcopy} cp ${submit_script} ${submitcopy} cp ${asm_cloudspades_script} ${asmcscopy} @@ -188,6 +195,9 @@ trjob=$(sbatch \ --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ ${submit_script}) +echo "TRJOB_RETURN_CODE: $?" > {{output_path}}/pids +echo "TRJOB_PID: $trjob" >> {{output_path}}/pids + if [[ ${norm} == "TRUE" ]]; then cp ${norm_script} ${normcopy} chmod gou-w ${normcopy} @@ -197,6 +207,8 @@ if [[ ${norm} == "TRUE" ]]; then -J ${labeltag}-${datetag}-norm-counts \ --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ ${norm_script}) + echo "NORM_COUNTS_JOB_RETURN_CODE: $?" >> {{output_path}}/pids + echo "NORM_COUNTS_JOB_PID: $norm_counts_job" >> {{output_path}}/pids fi integrate_job=$(sbatch \ @@ -207,6 +219,9 @@ integrate_job=$(sbatch \ --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ ${integrate_script}) +echo "INTEGRATE_JOB_RETURN_CODE: $?" 
>> {{output_path}}/pids +echo "INTEGRATE_JOB_PID: $integrate_job" >> {{output_path}}/pids + if [[ ${assemble} == "TRUE" ]]; then csj=$(sbatch \ --parsable \ @@ -215,6 +230,10 @@ if [[ ${assemble} == "TRUE" ]]; then --array 1-${n_samples} \ --export LABELTAG=${labeltag},OUTPUT=${output} \ ${asm_cloudspades_script}) + + echo "CSJ_JOB_RETURN_CODE: $?" >> {{output_path}}/pids + echo "CSJ_JOB_PID: $csj" >> {{output_path}}/pids + tlj=$(sbatch \ --parsable \ --dependency=aftercorr:${integrate_job} \ @@ -222,6 +241,10 @@ if [[ ${assemble} == "TRUE" ]]; then --array 1-${n_samples} \ --export LABELTAG=${labeltag},OUTPUT=${output} \ ${asm_tellink_script}) + + echo "TLJ_JOB_RETURN_CODE: $?" >> {{output_path}}/pids + echo "TLJ_JOB_PID: $tlj" >> {{output_path}}/pids + cleanupdep=${csj}:${tlj} else cleanupdep=${integrate_job} @@ -234,3 +257,6 @@ cleanup=$(sbatch \ --dependency=afterok:${cleanupdep} \ --export OUTPUT=${output} \ ${clean_script}) + +echo "CLEANUP_JOB_RETURN_CODE: $?" >> {{output_path}}/pids +echo "CLEANUP_JOB_PID: $cleanup" >> {{output_path}}/pids diff --git a/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv new file mode 100644 index 00000000..f696f0c9 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv @@ -0,0 +1,41 @@ +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Barcode_96_Well_Position,Barcode_ID,Sample_Project,Well_description,Lane +Person.A.TELLSEQ.R20.microbe,Person.A.TELLSEQ.R20.microbe,TellSeq3_15196_P3,A1,A4,C525,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.microbe,1 +Person.B.TELLSEQ.R24.microbe,Person.B.TELLSEQ.R24.microbe,TellSeq3_15196_P3,B1,B4,C526,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.microbe,1 +Person.C.TELLSEQ.R21.microbe,Person.C.TELLSEQ.R21.microbe,TellSeq3_15196_P3,C1,C4,C527,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.microbe,1 +Person.D.TELLSEQ.R26.microbe,Person.D.TELLSEQ.R26.microbe,TellSeq3_15196_P3,D1,D4,C528,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.microbe,1 +Person.E.TELLSEQ.R19.microbe,Person.E.TELLSEQ.R19.microbe,TellSeq3_15196_P3,E1,E4,C529,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.microbe,1 +Pet.C.TELLSEQ.R23.microbe,Pet.C.TELLSEQ.R23.microbe,TellSeq3_15196_P3,F1,F4,C530,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.microbe,1 +BLANK.TELLSEQ.3.12.H.microbe,BLANK.TELLSEQ.3.12.H.microbe,TellSeq3_15196_P3,G1,G4,C531,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.microbe,1 +Isolate.115.R1.microbe,Isolate.115.R1.microbe,TellSeq3_15196_P1,H1,H4,C532,TellSeq3_15196_P3,Isolate.115.R1.microbe,1 +Zymo.Mock.Community.R1.microbe,Zymo.Mock.Community.R1.microbe,TellSeq3_15196_P1,A2,A5,C533,TellSeq3_15196_P3,Zymo.Mock.Community.R1.microbe,1 +E.coli.QC.DNA.R1.microbe,E.coli.QC.DNA.R1.microbe,TellSeq3_15196_P1,B2,B5,C534,TellSeq3_15196_P3,E.coli.QC.DNA.R1.microbe,1 +Person.A.TELLSEQ.R20.purified.microbe,Person.A.TELLSEQ.R20.purified.microbe,TellSeq3_15196_P3,C2,C5,C535,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.purified.microbe,1 +Person.B.TELLSEQ.R24.purified.microbe,Person.B.TELLSEQ.R24.purified.microbe,TellSeq3_15196_P3,D2,D5,C536,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.purified.microbe,1 +Person.C.TELLSEQ.R21.purified.microbe,Person.C.TELLSEQ.R21.purified.microbe,TellSeq3_15196_P3,E2,E5,C537,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.purified.microbe,1 +Person.D.TELLSEQ.R26.purified.microbe,Person.D.TELLSEQ.R26.purified.microbe,TellSeq3_15196_P3,F2,F5,C538,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.purified.microbe,1 
+Person.E.TELLSEQ.R19.purified.microbe,Person.E.TELLSEQ.R19.purified.microbe,TellSeq3_15196_P3,G2,G5,C539,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.purified.microbe,1 +Pet.C.TELLSEQ.R23.purified.microbe,Pet.C.TELLSEQ.R23.purified.microbe,TellSeq3_15196_P3,H2,H5,C540,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.purified.microbe,1 +BLANK.TELLSEQ.3.12.H.purified.microbe,BLANK.TELLSEQ.3.12.H.purified.microbe,TellSeq3_15196_P3,A3,A6,C541,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.purified.microbe,1 +Isolate.115.R2.microbe,Isolate.115.R2.microbe,TellSeq3_15196_P1,B3,B6,C542,TellSeq3_15196_P3,Isolate.115.R2.microbe,1 +Zymo.Mock.Community.R2.microbe,Zymo.Mock.Community.R2.microbe,TellSeq3_15196_P1,C3,C6,C543,TellSeq3_15196_P3,Zymo.Mock.Community.R2.microbe,1 +E.coli.QC.DNA.R2.microbe,E.coli.QC.DNA.R2.microbe,TellSeq3_15196_P1,D3,D6,C544,TellSeq3_15196_P3,E.coli.QC.DNA.R2.microbe,1 +Person.A.TELLSEQ.R20.std,Person.A.TELLSEQ.R20.std,TellSeq3_15196_P3,A1,A1,C501,TellSeq3_15196,Person.A.TELLSEQ.R20.std,1 +Person.B.TELLSEQ.R24.std,Person.B.TELLSEQ.R24.std,TellSeq3_15196_P3,B1,B1,C502,TellSeq3_15196,Person.B.TELLSEQ.R24.std,1 +Person.C.TELLSEQ.R21.std,Person.C.TELLSEQ.R21.std,TellSeq3_15196_P3,C1,C1,C503,TellSeq3_15196,Person.C.TELLSEQ.R21.std,1 +Person.D.TELLSEQ.R26.std,Person.D.TELLSEQ.R26.std,TellSeq3_15196_P3,D1,D1,C504,TellSeq3_15196,Person.D.TELLSEQ.R26.std,1 +Person.E.TELLSEQ.R19.std,Person.E.TELLSEQ.R19.std,TellSeq3_15196_P3,E1,E1,C505,TellSeq3_15196,Person.E.TELLSEQ.R19.std,1 +Pet.C.TELLSEQ.R23.std,Pet.C.TELLSEQ.R23.std,TellSeq3_15196_P3,F1,F1,C506,TellSeq3_15196,Pet.C.TELLSEQ.R23.std,1 +BLANK.TELLSEQ.3.12.H.std,BLANK.TELLSEQ.3.12.H.std,TellSeq3_15196_P3,G1,G1,C507,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.std,1 +Isolate.115.R1.std,Isolate.115.R1.std,TellSeq3_15196_P1,H1,H1,C508,TellSeq3_15196,Isolate.115.R1.std,1 +Zymo.Mock.Community.R1.std,Zymo.Mock.Community.R1.std,TellSeq3_15196_P1,A2,A2,C509,TellSeq3_15196,Zymo.Mock.Community.R1.std,1 +E.coli.QC.DNA.R1.std,E.coli.QC.DNA.R1.std,TellSeq3_15196_P1,B2,B2,C510,TellSeq3_15196,E.coli.QC.DNA.R1.std,1 +Person.A.TELLSEQ.R20.purified.std,Person.A.TELLSEQ.R20.purified.std,TellSeq3_15196_P3,C2,C2,C511,TellSeq3_15196,Person.A.TELLSEQ.R20.purified.std,1 +Person.B.TELLSEQ.R24.purified.std,Person.B.TELLSEQ.R24.purified.std,TellSeq3_15196_P3,D2,D2,C512,TellSeq3_15196,Person.B.TELLSEQ.R24.purified.std,1 +Person.C.TELLSEQ.R21.purified.std,Person.C.TELLSEQ.R21.purified.std,TellSeq3_15196_P3,E2,E2,C513,TellSeq3_15196,Person.C.TELLSEQ.R21.purified.std,1 +Person.D.TELLSEQ.R26.purified.std,Person.D.TELLSEQ.R26.purified.std,TellSeq3_15196_P3,F2,F2,C514,TellSeq3_15196,Person.D.TELLSEQ.R26.purified.std,1 +Person.E.TELLSEQ.R19.purified.std,Person.E.TELLSEQ.R19.purified.std,TellSeq3_15196_P3,G2,G2,C515,TellSeq3_15196,Person.E.TELLSEQ.R19.purified.std,1 +Pet.C.TELLSEQ.R23.purified.std,Pet.C.TELLSEQ.R23.purified.std,TellSeq3_15196_P3,H2,H2,C516,TellSeq3_15196,Pet.C.TELLSEQ.R23.purified.std,1 +BLANK.TELLSEQ.3.12.H.purified.std,BLANK.TELLSEQ.3.12.H.purified.std,TellSeq3_15196_P3,A3,A3,C517,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.purified.std,1 +Isolate.115.R2.std,Isolate.115.R2.std,TellSeq3_15196_P1,B3,B3,C518,TellSeq3_15196,Isolate.115.R2.std,1 +Zymo.Mock.Community.R2.std,Zymo.Mock.Community.R2.std,TellSeq3_15196_P1,C3,C3,C519,TellSeq3_15196,Zymo.Mock.Community.R2.std,1 +E.coli.QC.DNA.R2.std,E.coli.QC.DNA.R2.std,TellSeq3_15196_P1,D3,D3,C520,TellSeq3_15196,E.coli.QC.DNA.R2.std,1 From 1d431a3d9cb455a1e598a0762602a2bc465440f7 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 15 Sep 2024 
17:52:59 -0700 Subject: [PATCH 08/47] Manually merged with current master --- sequence_processing_pipeline/Job.py | 229 ++++++++---------- sequence_processing_pipeline/Pipeline.py | 33 +++ .../tests/test_Pipeline.py | 56 +++++ 3 files changed, 186 insertions(+), 132 deletions(-) diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 035d8ba0..6a5d4f86 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -25,6 +25,15 @@ class Job: 'SPECIAL_EXIT', 'STAGE_OUT', 'STOPPED', 'SUSPENDED'] + slurm_status_not_running = (slurm_status_terminated + + slurm_status_successful) + + slurm_status_all_states = (slurm_status_terminated + + slurm_status_successful + + slurm_status_running) + + polling_interval_in_seconds = 60 + def __init__(self, root_dir, output_path, job_name, executable_paths, max_array_length, modules_to_load=None): """ @@ -204,76 +213,96 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} - def _wait_on_job(self, job_id, callback=None): - job_info = {'job_id': None, 'job_name': None, 'job_state': None, - 'elapsed_time': None} - - exit_count = 0 - - while True: - result = self._system_call(f"sacct -P -n --job {job_id} --format " - "JobID,JobName,State,Elapsed,ExitCode") - - if result['return_code'] != 0: - # sacct did not successfully submit the job. - raise ExecFailedError(result['stderr']) - - # [-1] remove the extra \n - jobs_data = result['stdout'].split('\n')[:-1] - states = dict() - estatuses = dict() - for i, jd in enumerate(jobs_data): - jid, jname, jstate, etime, estatus = jd.split('|') - if jid.endswith('.extern') or jid.endswith('.batch'): - continue + def wait_on_job_ids(self, job_ids, callback=None): + ''' + Wait for the given job-ids to finish running before returning. + :param job_ids: A list of Slurm job-ids + :param callback: Set callback function that receives status updates. + :return: A dictionary of job-ids and their current statuses. + ''' - if i == 0: - job_info['job_id'] = jid - job_info['job_name'] = jname - job_info['elapsed_time'] = etime - job_info['exit_status'] = estatus + # wait_on_job_ids was broken out of submit_job() and updated to monitor + # multiple job ids. This will allow multiple jobs to be submitted to + # Slurm in parallel and a single wait_on_job_ids() can wait on all of + # them before returning, optionally submitting callbacks for each + # job-id. + + def query_slurm(job_ids): + # internal function query_slurm encapsulates the handling of + # squeue. + count = 0 + while True: + result = self._system_call("squeue -t all -j " + f"{','.join(job_ids)} " + "-o '%F,%A,%T'") + + if result['return_code'] == 0: + # there was no issue w/squeue, break this loop and + # continue. + break + else: + # there was a likely intermittent issue w/squeue. Pause + # and wait before trying a few more times. If the problem + # persists then report the error and exit. 
+ count += 1 - if jstate not in states: - states[jstate] = 0 - states[jstate] += 1 + if count > 3: + raise ExecFailedError(result['stderr']) - if estatus not in estatuses: - estatuses[estatus] = 0 - estatuses[estatus] += 1 + sleep(60) - job_info['job_state'] = f'{states}' - job_info['exit_status'] = f'{estatuses}' + lines = result['stdout'].split('\n') + lines.pop(0) # remove header + lines = [x.split(',') for x in lines if x != ''] - if callback is not None: - callback(jid=job_id, status=f'{states}') + jobs = {} + child_jobs = {} + for job_id, unique_id, state in lines: + jobs[unique_id] = state - logging.debug("Job info: %s" % job_info) + if unique_id != job_id: + child_jobs[unique_id] = job_id # job is a child job - # if job is completed after having run or exited after having - # run, then stop waiting. - if not set(states) - {'COMPLETED', 'FAILED', 'CANCELLED'}: - # break - exit_count += 1 + return jobs, child_jobs - if exit_count > 4: + while True: + jobs, child_jobs = query_slurm(job_ids) + + for jid in job_ids: + logging.debug("JOB %s: %s" % (jid, jobs[jid])) + if callback is not None: + callback(jid=jid, status=jobs[jid]) + + children = [x for x in child_jobs if child_jobs[x] == jid] + if len(children) == 0: + logging.debug("\tNO CHILDREN") + for cid in children: + logging.debug("\tCHILD JOB %s: %s" % (cid, jobs[cid])) + status = [jobs[x] in Job.slurm_status_not_running for x in job_ids] + + if set(status) == {True}: + # all jobs either completed successfully or terminated. break - sleep(10) + sleep(Job.polling_interval_in_seconds) - return job_info, states, estatuses + return jobs def submit_job(self, script_path, job_parameters=None, - script_parameters=None, exec_from=None, callback=None): + script_parameters=None, wait=True, + exec_from=None, callback=None): """ - Submit a Torque job script and optionally wait for it to finish. - :param script_path: The path to a Torque job (bash) script. + Submit a Slurm job script and optionally wait for it to finish. + :param script_path: The path to a Slurm job (bash) script. :param job_parameters: Optional parameters for scheduler submission. :param script_parameters: Optional parameters for your job script. + :param wait: Set to False to submit job and not wait. :param exec_from: Set working directory to execute command from. :param callback: Set callback function that receives status updates. - :return: Dictionary containing the job's id, name, status, and - elapsed time. Raises PipelineError if job could not be submitted or - if job was unsuccessful. + :return: If wait is True, a dictionary containing the job's id and + status. If wait is False, the Slurm job-id of the submitted + job. Raises PipelineError if job could not be submitted or if + job was unsuccessful. """ if job_parameters: cmd = 'sbatch %s %s' % (job_parameters, script_path) @@ -302,96 +331,32 @@ def submit_job(self, script_path, job_parameters=None, # Just to give some time for everything to be set up properly sleep(10) - job_info, states, estatuses = self._wait_on_job(job_id, - callback=callback) + if wait is False: + # return job_id since that is the only information for this new + # job that we have available. User should expect that this is + # not a dict if they explicitly set wait=False. + return job_id - if job_info['job_id'] is None: - # job was never in the queue - return an error. - if callback is not None: - callback(jid=job_id, status='ERROR') + # the user is expecting a dict with 'job_id' and 'job_state' + # attributes. 
This method will return a dict w/job_ids as keys and + # their job status as values. This must be munged before returning + # to the user. + results = self.wait_on_job_ids([job_id], callback=callback) - raise JobFailedError(f"job {job_id} never appeared in the " - "queue.") + job_result = {'job_id': job_id, 'job_state': results[job_id]} - # job was once in the queue if callback is not None: - callback(jid=job_id, status=job_info['job_state']) + callback(jid=job_id, status=job_result['job_state']) - if set(states) == {'COMPLETED'}: - if 'exit_status' in job_info: - if set(estatuses) == {'0:0'}: - # job completed successfully - return job_info - else: - exit_status = job_info['exit_status'] - raise JobFailedError(f"job {job_id} exited with exit_" - f"status {exit_status}") - else: - # with no other info, assume job completed successfully - return job_info + if job_result['job_state'] == 'COMPLETED': + return job_result else: - # job exited unsuccessfully raise JobFailedError(f"job {job_id} exited with status " - f"{job_info['job_state']}") - - def _wait_on_job_ids(self, job_ids, timeout_in_seconds=None): - """ - Wait on a list of known Slurm job-ids. - :param job_ids: A list of Slurm job-ids - :param timeout_in_seconds: Abort and raise an Error after n seconds. - :return: A list of strings, representing the state of each job. - """ - - # this method is useful for wrapping scripts that spawn child jobs and - # the user wishes to wait until they are all completed before - # continuing. - if not isinstance(job_ids, list): - raise ValueError("job_ids must be a list of valid slurm job ids") - - if set([isinstance(x, int) for x in job_ids]) != {True}: - raise ValueError("job_ids must contain integers") - - if timeout_in_seconds: - if not isinstance(timeout_in_seconds, int): - raise ValueError("timeout_in_seconds must be an integer") - - if timeout_in_seconds < 1: - raise ValueError("timeout_in_seconds must be greater than 0") - - start_time = time() - while True: - if timeout_in_seconds: - if time() - start_time > timeout_in_seconds: - raise PipelineError("timeout reached while waiting for " - "jobs") - - job_states = [] - for job_id in job_ids: - # NB: sacct can support querying on multiple job-ids at once. - # However, this would require extensive rewriting and testing - # of the existing code. Deferring for now. - _, states, _ = self._wait_on_job(job_id) - job_states.append(set(states)) - - # assuming that a Slurm job will never contain states from both - # terminated and successful, this will generate a list containing - # the current state for each job. - result = [set(x) & set(Job.slurm_status_terminated + - Job.slurm_status_successful) for x in job_states] - - if set([bool(x) for x in result]) == {True}: - # all jobs are no longer in a running state. - break - - sleep(10) - - # return the current state of each job. Assume that each set contains - # only one value. - return [''.join(x) for x in result] + f"{job_result['job_state']}") def _group_commands(self, cmds): # break list of commands into chunks of max_array_length (Typically - # 1000 for Torque job arrays). To ensure job arrays are never more + # 1000 for Slurm job arrays). To ensure job arrays are never more # than 1000 jobs long, we'll chain additional commands together, and # evenly distribute them amongst the first 1000. 
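        # An illustrative example of the grouping described above (not part
        # of the original comment): with max_array_length=1000 and 2,500
        # sorted commands, the extra 1,500 commands are distributed over the
        # first 1,000 groups, so the job array stays at 1,000 tasks and each
        # task runs two or three chained commands.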
cmds.sort() diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index e308f38c..88319353 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -14,6 +14,7 @@ from collections import defaultdict from datetime import datetime from xml.etree import ElementTree as ET +from metapool.prep import PREP_MF_COLUMNS logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) @@ -235,6 +236,38 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, self._configure_profile() + def identify_reserved_words(self, words): + ''' + Returns a list of words that should not appear as column names in any + project referenced in the Pipeline's sample-sheet/pre-prep file. + :param words: A list of words that may include reserved words. + :return: A list of words that are already reserved in upper, lower, + and mixed cases. + ''' + + # Only strings used as column names in pre-prep files are currently + # considered 'reserved' as loading a pre-prep file containing these + # column names will fail if one or more of the strings already appears + # as a column name in a study's sample metadata table. + + # This implementation assumes some understanding of metapool's impl, + # specifically how the proper set of prep-info file columns are + # generated. For now the functionality will be defined here as this + # area of metapool is currently in flux. + if self.mapping_file is not None: + reserved = PREP_MF_COLUMNS + else: + # results will be dependent on SheetType and SheetVersion of + # the sample-sheet. Since all columns in a prep-info file are + # lower()ed before writing out to file, the word must be + # reserved in all case forms. e.g.: 'Sample_Well' and 'Sample_well' + # are both forms of 'sample_well'. + reserved = [x.lower() for x in + self.sample_sheet.CARRIED_PREP_COLUMNS] + \ + self.sample_sheet.GENERATED_PREP_COLUMNS + + return list(set([x.lower() for x in words]) & set(reserved)) + def _configure_profile(self): # extract the instrument type from self.run_dir and the assay type # from self.sample_sheet (or self.mapping_file). diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index 37abb5b9..fff4b07d 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -28,6 +28,7 @@ def setUp(self): makedirs(self.output_file_path, exist_ok=True) self.maxDiff = None self.good_sample_sheet_path = self.path('good-sample-sheet.csv') + self.good_legacy_sheet_path = self.path('mgv90_test_sheet.csv') self.mp_sheet_path = self.path('multi-project-sheet.csv') self.bad_sample_sheet_path = self.path('duplicate_sample-sample-sheet' '.csv') @@ -1630,6 +1631,38 @@ def test_parse_project_name(self): obs = pipeline._parse_project_name(test, t_set == 'True') self.assertEqual(obs, exp) + def test_identify_reserved_words(self): + pipeline = Pipeline(self.good_config_file, self.good_run_id, + self.good_sample_sheet_path, None, + self.output_file_path, self.qiita_id, + Pipeline.METAGENOMIC_PTYPE) + + # assert that arbitrary strings are not reserved. + obs = pipeline.identify_reserved_words(['NOT_A_RESERVED_WORD', + 'ANOTHER_WORD']) + self.assertEqual(obs, []) + + # assert that 'well_id_384' is a reserved word. 
+ obs = pipeline.identify_reserved_words(['well_id_384', + 'NOT_A_RESERVED_WORD']) + + self.assertEqual(obs, ['well_id_384']) + + # create new pipeline using a/legacy (v90) metagenomic sample-sheet. + pipeline = Pipeline(self.good_config_file, self.good_run_id, + self.good_legacy_sheet_path, None, + self.output_file_path, self.qiita_id, + Pipeline.METAGENOMIC_PTYPE) + + # assert that for legacy sample-sheets, well_id_384 is NOT a reserved + # word and the appropriate reserved word is 'Sample_well'. + obs = pipeline.identify_reserved_words(['well_id_384', + 'NOT_A_RESERVED_WORD', + 'Sample_well', + 'Sample_Well']) + + self.assertEqual(obs, ['sample_well']) + class TestAmpliconPipeline(unittest.TestCase): def setUp(self): @@ -2339,6 +2372,29 @@ def test_process_run_info_file(self): # These are indirectly tested as generate_dummy_sample_sheet() is # called by Pipeline's constructor. + def test_identify_reserved_words(self): + pipeline = Pipeline(self.good_config_file, + self.good_run_id, + None, + self.good_mapping_file_path, + self.output_file_path, + self.qiita_id, + Pipeline.AMPLICON_PTYPE) + + # assert that arbitrary strings are not reserved. + obs = pipeline.identify_reserved_words(['NOT_A_RESERVED_WORD', + 'ANOTHER_WORD']) + self.assertEqual(obs, []) + + # assert that Sample_Well is okay for current pre-prep files but + # well_id_384 is reserved. Show that all forms of tm300_8_tool are + # also reserved. + obs = pipeline.identify_reserved_words(['Sample_Well', + 'TM300_8_Tool', + 'tm300_8_tool', + 'well_id_384']) + self.assertEqual(set(obs), {'tm300_8_tool', 'well_id_384'}) + class TestInstrumentUtils(unittest.TestCase): def setUp(self): From 7a84cd04bf976340278b2b2c18ba53062d222d28 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 15 Sep 2024 17:53:43 -0700 Subject: [PATCH 09/47] Manually merged with master --- .../tests/data/mgv90_test_sheet.csv | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv diff --git a/sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv b/sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv new file mode 100644 index 00000000..ded82519 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv @@ -0,0 +1,40 @@ +[Header],,,,,,,,,, +IEMFileVersion,4,,,,,,,,, +SheetType,standard_metag,,,,,,,,, +SheetVersion,90,,,,,,,,, +Investigator Name,Caballero,,,,,,,,, +Experiment Name,RKL0042,,,,,,,,, +Date,2/26/20,,,,,,,,, +Workflow,GenerateFASTQ,,,,,,,,, +Application,FASTQ Only,,,,,,,,, +Assay,Metagenomic,,,,,,,,, +Description,,,,,,,,,, +Chemistry,Default,,,,,,,,, +,,,,,,,,,, +[Reads],,,,,,,,,, +150,,,,,,,,,, +150,,,,,,,,,, +,,,,,,,,,, +[Settings],,,,,,,,,, +ReverseComplement,0,,,,,,,,, +,,,,,,,,,, +[Data],,,,,,,,,, +Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Well_description +1,sample1,sample1,FooBar_666_p1,A1,iTru7_107_07,CCGACTAT,iTru5_01_A,ACCGACAA,Project_1111,s1 +1,sample2,sample2,FooBar_666_p1,A2,iTru7_107_08,CCGACTAT,iTru5_01_A,CTTCGCAA,Project_1111,s2 +3,sample1,sample1,FooBar_666_p1,A3,iTru7_107_09,GCCTTGTT,iTru5_01_A,AACACCAC,Project_1111,s1 +3,sample2,sample2,FooBar_666_p1,A4,iTru7_107_10,AACTTGCC,iTru5_01_A,CGTATCTC,Project_1111,s2 +3,sample3,sample3,FooBar_666_p1,A5,iTru7_107_11,CAATGTGG,iTru5_01_A,GGTACGAA,Trojecp_666,s5 +3,sample4,sample4,FooBar_666_p1,B6,iTru7_107_12,AAGGCTGA,iTru5_01_A,CGATCGAT,Trojecp_666,s6 
+3,sample5,sample5,FooBar_666_p1,B8,iTru7_107_13,TTACCGAG,iTru5_01_A,AAGACACC,Trojecp_666,s7 +,,,,,,,,,, +[Bioinformatics],,,,,,,,,, +Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,, +Project_1111,1111,False,AACC,GGTT,False,Knight Lab Kapa HP,Eqiiperiment,,, +Trojecp_666,666,False,AACC,GGTT,False,Knight Lab Kapa HP,SomethingWitty,,, +,,,,,,,,,, +[Contact],,,,,,,,,, +Email,Sample_Project,,,,,,,,, +test@lol.com,Project_1111,,,,,,,,, +tester@rofl.com,Trojecp_666,,,,,,,,, +,,,,,,,,,, From 02364037b384be8d836afa13a54e5bba9b048c79 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 17 Sep 2024 10:18:14 -0700 Subject: [PATCH 10/47] Updates based on testing in qiita-rc --- sequence_processing_pipeline/Commands.py | 5 ++++- sequence_processing_pipeline/Job.py | 1 - sequence_processing_pipeline/TRConvertJob.py | 22 ++++++++++--------- .../tests/test_ConvertJob.py | 2 +- .../tests/test_FastQCJob.py | 2 +- .../tests/test_NuQCJob.py | 2 +- 6 files changed, 19 insertions(+), 15 deletions(-) diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index b2cd5e41..cce7c605 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -22,7 +22,8 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, # is now the following: # add one more level to account for project_names nested under ConvertJob # dir. - fastq_paths = glob.glob(data_location_path + '*/*/*.fastq.gz') + # this will ignore the _I1_ reads that appear in the integrated result. + fastq_paths = glob.glob(data_location_path + '/*/*_R?_001.fastq.gz') # convert from GB and halve as we sum R1 max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2) @@ -114,6 +115,8 @@ def demux(id_map, fp, out_d, task, maxtask): qual = iter(fp) for i, s, d, q in zip(id_, seq, dumb, qual): + # NB: This appears to not be causing the removal of the metadata + # either. 
fname_encoded, id_ = i.split(delimiter, 1) if fname_encoded not in openfps: diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 6a5d4f86..1c1a7593 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -9,7 +9,6 @@ import logging from inspect import stack import re -from time import time class Job: diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 98d9c18d..c4ca29c4 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -116,7 +116,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.cloudspades_path = "/home/qiita_test/qiita-spots/spades-cloudspades-0.1" self.cloudspades_wall_time_limit = "24:00:00" self.counts_cores_per_task = "1" - self.counts_create_picklist_path = "/home/qiita_test/qiita-spots/create_picklist.py", + self.counts_create_picklist_path = "/home/qiita_test/qiita-spots/create_picklist.py" self.counts_mem_in_gb = "8" self.counts_node_count = "1" self.counts_other_file = '20230906_FS10001773_68_BTR67708-1611.read_counts.tsv' @@ -154,6 +154,8 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.main_reference_base = "" self.main_reference_map = "" + self._generate_job_scripts() + def _generate_job_scripts(self): scripts = [ { @@ -189,7 +191,7 @@ def _generate_job_scripts(self): "wall_time_limit": self.integrate_wall_time_limit, "mem_in_gb": self.integrate_mem_in_gb, "node_count": self.integrate_node_count, - "cores_per_task": self.integtrate_cores_per_task, + "cores_per_task": self.integrate_cores_per_task, "iinp_script_path": self.integrate_indicies_script_path, "queue_name": self.queue_name } @@ -368,7 +370,7 @@ def run(self, callback=None): # Get a list of Slurm job ids that we need to wait on and text # descriptions of what they are. - jids = [(results[x[2], x[0]]) for x in child_processes if + jids = [(results[x[2]], x[0]) for x in child_processes if x[2] in results] # ensure the jids are casted to integers before passing them. @@ -377,7 +379,7 @@ def run(self, callback=None): for (jid, description), status in zip(jids, statuses): if status not in Job.slurm_status_successful: raise PipelineError(f"process '{description}' ({jid}) " - f"failed ({status}") + f"failed ({status})") # post-process working directory to make it appear like results # generated by ConvertJob @@ -412,7 +414,7 @@ def run(self, callback=None): for root, dirs, files in walk(integrated_files_path): for _file in files: fastq_file = join(root, _file) - self._post_process_file(fastq_file, self.lane, self.mapping) + self._post_process_file(fastq_file, self.mapping) # move project folders from integrated directory to working_dir. contents = listdir(integrated_files_path) @@ -430,7 +432,7 @@ def parse_logs(self): def parse_job_script(job_script_path): raise PipelineError("parsing job script not implemented.") - def _post_process_file(self, fastq_file, lane, mapping): + def _post_process_file(self, fastq_file, mapping): # generate names of the form generated by bcl-convert/bcl2fastq: # _S#_L00#__001.fastq.gz # see: @@ -455,10 +457,10 @@ def _post_process_file(self, fastq_file, lane, mapping): # generate the new filename for the fastq file, and reorganize the # files by project. 
- new_name = "%s_S%d_L%s_%s_001.fastq.gz" % (sample_name, - sample_index, - str(lane).zfill(3), - read_type) + new_name = "%s_S%d_%s_%s_001.fastq.gz" % (sample_name, + sample_index, + self.lane, + read_type) # ensure that the project directory exists before we rename and move # the file to that location. diff --git a/sequence_processing_pipeline/tests/test_ConvertJob.py b/sequence_processing_pipeline/tests/test_ConvertJob.py index df81fdcf..a6ebad23 100644 --- a/sequence_processing_pipeline/tests/test_ConvertJob.py +++ b/sequence_processing_pipeline/tests/test_ConvertJob.py @@ -952,7 +952,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) error_msg = ("This job died.\n2024-01-01T12:12:12Z thread 99999 ERROR:" diff --git a/sequence_processing_pipeline/tests/test_FastQCJob.py b/sequence_processing_pipeline/tests/test_FastQCJob.py index 28fe52cb..a2291296 100644 --- a/sequence_processing_pipeline/tests/test_FastQCJob.py +++ b/sequence_processing_pipeline/tests/test_FastQCJob.py @@ -1121,7 +1121,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) try: diff --git a/sequence_processing_pipeline/tests/test_NuQCJob.py b/sequence_processing_pipeline/tests/test_NuQCJob.py index 5164575c..88d4ef1c 100644 --- a/sequence_processing_pipeline/tests/test_NuQCJob.py +++ b/sequence_processing_pipeline/tests/test_NuQCJob.py @@ -992,7 +992,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. 
self.assertTrue(job._toggle_force_job_fail()) try: From 64583a22a8269387ce0c4e607df13b43ef0cd523 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 17 Sep 2024 10:41:20 -0700 Subject: [PATCH 11/47] flake8 --- sequence_processing_pipeline/TRConvertJob.py | 34 ++++++---- .../contrib/create_picklist.py | 65 ++++++++++--------- .../contrib/integrate-indices-np.py | 12 ++-- .../contrib/plot_counts.py | 2 +- 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index c4ca29c4..81b414d5 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -113,18 +113,24 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.cloudspades_mem_in_gb = "128" self.cloudspades_modules = ["gcc_9.3.0"] self.cloudspades_node_count = "1" - self.cloudspades_path = "/home/qiita_test/qiita-spots/spades-cloudspades-0.1" + self.cloudspades_path = ("/home/qiita_test/qiita-spots/spades-" + "cloudspades-0.1") self.cloudspades_wall_time_limit = "24:00:00" self.counts_cores_per_task = "1" - self.counts_create_picklist_path = "/home/qiita_test/qiita-spots/create_picklist.py" + self.counts_create_picklist_path = ("/home/qiita_test/qiita-spots/" + "create_picklist.py") self.counts_mem_in_gb = "8" self.counts_node_count = "1" - self.counts_other_file = '20230906_FS10001773_68_BTR67708-1611.read_counts.tsv' - self.counts_plot_counts_path = "/home/qiita_test/qiita-spots/plot_counts.py" - self.counts_sample_sheet = "/home/qiita_test/qiita-spots/20230906_FS10001773_68_BTR67708-1611.csv" + self.counts_other_file = ('20230906_FS10001773_68_BTR67708-1611.' + 'read_counts.tsv') + self.counts_plot_counts_path = ("/home/qiita_test/qiita-spots/'" + "'plot_counts.py") + self.counts_sample_sheet = ("/home/qiita_test/qiita-spots/" + "20230906_FS10001773_68_BTR67708-1611.csv") self.counts_wall_time_limit = "24:00:00" self.cs_isolate_mem_in_gb = "64" - self.integrate_indicies_script_path = "/home/qiita_test/qiita-spots/integrate-indices-np.py" + self.integrate_indicies_script_path = ("/home/qiita_test/qiita-spots/" + "integrate-indices-np.py") self.integrate_mem_in_gb = "8" self.integrate_node_count = "1" self.integrate_wall_time_limit = "24:00:00" @@ -134,21 +140,26 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.tellink_mem_in_gb = "160" self.tellink_modules = ["singularity_3.6.4"] self.tellink_node_count = "1" - self.tellink_sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" + self.tellink_sing_path = ("/projects/long_read_collab/code/tellseq/" + "release_v1.11/tellink-release/" + "run_tellink_sing.sh") self.tellink_wall_time_limit = "96:00:00" self.tellread_cores_per_task = "4" self.tellread_mem_in_gb = "16" self.tellread_modules = ["singularity_3.6.4"] self.tellread_node_count = "1" - self.tellread_sing_script_path = "$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh" + self.tellread_sing_script_path = ("$HOME/qiita-spots/tellread-release" + "-novaseqX/run_tellread_sing.sh") self.tellread_wall_time_limit = "96:00:00" self.tl_cores_per_task = "16" self.tl_isolate_node_count = "1" self.tl_isolate_wall_time_limit = "96:00:00" self.tl_mem_in_gb = "160" - self.main_map = "/home/qiita_test/qiita-spots/20230906_FS10001773_68_BTR67708-1611.csv" + self.main_map = ("/home/qiita_test/qiita-spots/20230906_FS10001773_" + "68_BTR67708-1611.csv") self.main_mode = 
"metagenomic" - self.main_seqrun_path = "/sequencing/seqmount/KL_iSeq_Runs/20230906_FS10001773_68_BTR67708-1611" + self.main_seqrun_path = ("/sequencing/seqmount/KL_iSeq_Runs/20230906" + "_FS10001773_68_BTR67708-1611") # TODO: Address reference_map and reference_base self.main_reference_base = "" @@ -208,7 +219,8 @@ def _generate_job_scripts(self): "plot_counts_path": self.counts_plot_counts_path, "output_path": self.tellread_output_path, "create_picklist_path": self.counts_create_picklist_path, - "read_counts_path": join(self.tellread_output_path, self.counts_other_file), + "read_counts_path": join(self.tellread_output_path, + self.counts_other_file), "queue_name": self.queue_name } }, diff --git a/sequence_processing_pipeline/contrib/create_picklist.py b/sequence_processing_pipeline/contrib/create_picklist.py index 44906872..a1d6a1d0 100644 --- a/sequence_processing_pipeline/contrib/create_picklist.py +++ b/sequence_processing_pipeline/contrib/create_picklist.py @@ -1,19 +1,16 @@ import os -from scipy.stats import mannwhitneyu, zscore -from sklearn.linear_model import LogisticRegression -from contextlib import suppress -import pandas as pd -from metapool.metapool import * -from metapool import (make_sample_sheet, requires_dilution, dilute_gDNA, - find_threshold, autopool, extract_stats_metadata) +# from metapool.metapool import * from sys import argv +import pandas as pd +import matplotlib.pyplot as plt +from metapool.metapool import (read_survival, make_2D_array, + calculate_iseqnorm_pooling_volumes, + format_pooling_echo_pick_list) +import seaborn as sns input_sheet_filename = argv[1] -#input_sheet_filename = input_sheet_filename.rsplit('.', 1)[0] + '.read_counts.tsv' -#instead construct the needed path and pass it. -plate_df_w_reads = pd.read_csv(input_sheet_filename, - sep='\t') +plate_df_w_reads = pd.read_csv(input_sheet_filename, sep='\t') plate_df_w_reads['Blank'] = [True if 'blank' in s.lower() else False for s in plate_df_w_reads['Sample_Name']] reads_column = 'read_counts' @@ -23,36 +20,45 @@ f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8)) # evenness plot -rmax = int(round(plate_df_w_reads[reads_column].max(),-2)) -survival_df = pd.concat([read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] == True, - reads_column], label='Blanks',rmax=rmax), - read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] == False, - reads_column], label='Samples',rmax=rmax)]) +rmax = int(round(plate_df_w_reads[reads_column].max(), -2)) + +foo = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is True, + reads_column], + label='Blanks', + rmax=rmax) + +bar = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is False, + reads_column], + label='Samples', + rmax=rmax) + +survival_df = pd.concat([foo, bar]) ax3.set_xlabel(reads_column) ax3.set_ylabel('Samples') -survival_df.plot(color = ['coral','steelblue'],ax=ax1) +survival_df.plot(color=['coral', 'steelblue'], ax=ax1) ax1.set_xlabel(reads_column) ax1.set_ylabel('Samples') -##Histogram -sns.histplot(plate_df_w_reads[reads_column],ax=ax3) - -#Boxplot -sns.boxplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax = ax4); -sns.stripplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax = ax4, - size=3,color='black',alpha=0.5) +# Histogram +sns.histplot(plate_df_w_reads[reads_column], ax=ax3) +# Boxplot +sns.boxplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4) +sns.stripplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4, + size=3, color='black', alpha=0.5) 
plt.tight_layout() plt.savefig(input_sheet_filename + '.comboplot.pdf') -#plate_df_w_reads = plate_df_w_reads[plate_df_w_reads[reads_column] > 0] -plate_df_normalized = calculate_iseqnorm_pooling_volumes(plate_df_w_reads,dynamic_range=20, - normalization_column=reads_column) +pdfn = calculate_iseqnorm_pooling_volumes(plate_df_w_reads, + dynamic_range=20, + normalization_column=reads_column) plt.savefig(input_sheet_filename + '.normalizedplot.pdf') -vols = make_2D_array(plate_df_normalized, data_col='iSeq normpool volume', well_col=well_col).astype(float) +vols = make_2D_array(pdfn, + data_col='iSeq normpool volume', + well_col=well_col).astype(float) # Write the picklist as .csv picklist_fp = input_sheet_filename + '.picklist.csv' @@ -61,5 +67,6 @@ print("Warning! This file exists already.") picklist = format_pooling_echo_pick_list(vols, max_vol_per_well=30000) -with open(picklist_fp,'w') as f: + +with open(picklist_fp, 'w') as f: f.write(picklist) diff --git a/sequence_processing_pipeline/contrib/integrate-indices-np.py b/sequence_processing_pipeline/contrib/integrate-indices-np.py index 9500cff9..b1be83a6 100644 --- a/sequence_processing_pipeline/contrib/integrate-indices-np.py +++ b/sequence_processing_pipeline/contrib/integrate-indices-np.py @@ -6,10 +6,10 @@ # the ordering stems is determined external to the data being sorted. To # determine order, all barcodes must be read in to gather the complete # barcode <-> record association; if only partial data is read then -# associations to barcodes may be missed, and we cannot perform an insertion sort -# efficiently as we're writing to disk. Once we know an order for the records, -# we (currently) read in the entirety of the subsequent data (R1 then R2), -# reorder, and write. Performing this in blocks to minimize memory may be +# associations to barcodes may be missed, and we cannot perform an insertion +# sort efficiently as we're writing to disk. Once we know an order for the +# records, we (currently) read in the entirety of the subsequent data (R1 then +# R2), reorder, and write. Performing this in blocks to minimize memory may be # possible, but we have to assume access is random as a grouping barcode # may be with any record along the file. # @@ -291,7 +291,9 @@ def integrate(r1_in, r2_in, i1_in, r1_out, r2_out, threads, no_sort): # some downstream programs hate this, so let's add if needed. 
if r1_sniff.endswith(b'/1'): if not r2_sniff.endswith(b'/2'): - raise ValueError(f'unexpected endings: {r1_sniff.decode("utf-8")} {r2_sniff.decode("utf-8")}') + raise ValueError('unexpected endings: ' + f'{r1_sniff.decode("utf-8")} ' + f'{r2_sniff.decode("utf-8")}') orient_r1 = '' orient_r2 = '' else: diff --git a/sequence_processing_pipeline/contrib/plot_counts.py b/sequence_processing_pipeline/contrib/plot_counts.py index 76c822ee..ecab9e49 100644 --- a/sequence_processing_pipeline/contrib/plot_counts.py +++ b/sequence_processing_pipeline/contrib/plot_counts.py @@ -7,7 +7,7 @@ ex = re.compile(r'_I1_(C5\d\d).fastq.gz.corrected.err_barcode_removed.fastq') # remove total line from wc -data = [l.strip().split(' ') for l in open(sys.argv[1])][:-1] +data = [x.strip().split(' ') for x in open(sys.argv[1])][:-1] plotdata = [(ex.search(i).groups()[0], int(v) / 4) for v, i in data] sheetdata = dict(plotdata) From e7e7c5456aca1a53a8897a24adf1ebe23f9317d4 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 18 Sep 2024 21:35:10 -0700 Subject: [PATCH 12/47] Small fixes --- sequence_processing_pipeline/Job.py | 5 +++++ sequence_processing_pipeline/TRConvertJob.py | 7 +++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 1c1a7593..2d64b039 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -226,6 +226,9 @@ def wait_on_job_ids(self, job_ids, callback=None): # them before returning, optionally submitting callbacks for each # job-id. + # ensure all ids are strings to ensure proper working w/join(). + job_ids = [str(x) for x in job_ids] + def query_slurm(job_ids): # internal function query_slurm encapsulates the handling of # squeue. @@ -257,6 +260,8 @@ def query_slurm(job_ids): jobs = {} child_jobs = {} for job_id, unique_id, state in lines: + # ensure unique_id is of type string for downstream use. + unique_id = str(unique_id) jobs[unique_id] = state if unique_id != job_id: diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 81b414d5..572f3feb 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -96,8 +96,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, # As the sample-sheet is validated by the Pipeline object before # being passed to TRConvertJob, additional validation isn't needed. - self._generate_job_scripts() - # TODO: generate a sample-mapping to map C#s to fake sample-names and # fake projects. Process sample-sheet later. self.mapping = self._generate_sample_mapping() @@ -386,9 +384,10 @@ def run(self, callback=None): x[2] in results] # ensure the jids are casted to integers before passing them. - statuses = self._wait_on_job_ids([int(x[0]) for x in jids]) + statuses = self.wait_on_job_ids([int(x[0]) for x in jids]) - for (jid, description), status in zip(jids, statuses): + for jid, description in jids: + status = statuses[jid] if status not in Job.slurm_status_successful: raise PipelineError(f"process '{description}' ({jid}) " f"failed ({status})") From 497738f3d4e7439e4a8beea48282de09c002c8fb Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 24 Sep 2024 15:34:11 -0700 Subject: [PATCH 13/47] Refactor KISSLoader to be more DRY. 
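With KISSLoader now shared from Job.py, job classes such as NuQCJob and
TRConvertJob can build their jinja2 environments the same way. A minimal
usage sketch follows; it is not taken from the patch itself, and the
template name and render values are illustrative stand-ins for whatever a
given Job subclass actually supplies:

    from jinja2 import Environment
    from sequence_processing_pipeline.Job import KISSLoader

    # The loader resolves template names relative to the package's
    # templates/ directory, regardless of the caller's working directory.
    jinja_env = Environment(loader=KISSLoader('templates'))

    # Render one of the sbatch templates with job-specific values.
    template = jinja_env.get_template('tellread-cleanup.sbatch')
    script_text = template.render(job_name='cleanup',
                                  wall_time_limit='24:00:00',
                                  mem_in_gb='8',
                                  node_count='1',
                                  cores_per_task='1',
                                  queue_name='qiita')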
--- sequence_processing_pipeline/Job.py | 21 ++++++++++++++++ sequence_processing_pipeline/NuQCJob.py | 24 ++----------------- sequence_processing_pipeline/TRConvertJob.py | 25 +++----------------- 3 files changed, 26 insertions(+), 44 deletions(-) diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 2d64b039..59d9cea2 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -1,3 +1,6 @@ +from jinja2 import BaseLoader, TemplateNotFound +from os.path import getmtime +import pathlib from itertools import zip_longest from os import makedirs, walk from os.path import basename, exists, split, join @@ -11,6 +14,24 @@ import re +# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader +class KISSLoader(BaseLoader): + def __init__(self, path): + # pin the path for loader to the location sequence_processing_pipeline + # (the location of this file), along w/the relative path to the + # templates directory. + self.path = join(pathlib.Path(__file__).parent.resolve(), path) + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) + + class Job: slurm_status_terminated = ['BOOT_FAIL', 'CANCELLED', 'DEADLINE', 'FAILED', 'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index b1c27900..0ffacb1a 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -1,8 +1,7 @@ -from jinja2 import BaseLoader, TemplateNotFound from metapool import load_sample_sheet from os import stat, makedirs, rename -from os.path import join, basename, dirname, exists, abspath, getmtime -from sequence_processing_pipeline.Job import Job +from os.path import join, basename, dirname, exists, abspath +from sequence_processing_pipeline.Job import Job, KISSLoader from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) from sequence_processing_pipeline.Pipeline import Pipeline @@ -14,25 +13,6 @@ import glob import re from sys import executable -import pathlib - - -# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader -class KISSLoader(BaseLoader): - def __init__(self, path): - # pin the path for loader to the location sequence_processing_pipeline - # (the location of this file), along w/the relative path to the - # templates directory. 
- self.path = join(pathlib.Path(__file__).parent.resolve(), path) - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path) as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) logging.basicConfig(level=logging.DEBUG) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 572f3feb..54542fef 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,31 +1,12 @@ -from jinja2 import BaseLoader, TemplateNotFound, Environment -from os.path import split, join, exists, getmtime -from sequence_processing_pipeline.Job import Job +from jinja2 import Environment +from os.path import split, join, exists +from sequence_processing_pipeline.Job import Job, KISSLoader from sequence_processing_pipeline.PipelineError import PipelineError -import pathlib from os import rename, walk, chmod, listdir, makedirs from shutil import move, rmtree from re import match -# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader -class KISSLoader(BaseLoader): - def __init__(self, path): - # pin the path for loader to the location sequence_processing_pipeline - # (the location of this file), along w/the relative path to the - # templates directory. - self.path = join(pathlib.Path(__file__).parent.resolve(), path) - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path) as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) - - class TRConvertJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, From ca71c1db29b73add9efb2169663871c074f77d66 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 1 Oct 2024 14:32:52 -0700 Subject: [PATCH 14/47] Pipeline.py updated to support changes in qp-klp --- sequence_processing_pipeline/Pipeline.py | 120 ++++++++++-------- sequence_processing_pipeline/TRConvertJob.py | 103 ++++++++++++++- .../tests/test_Pipeline.py | 105 ++++++--------- 3 files changed, 206 insertions(+), 122 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 88319353..3dd19371 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -133,25 +133,19 @@ class Pipeline: assay_types = [AMPLICON_ATYPE, METAGENOMIC_ATYPE, METATRANSCRIPTOMIC_ATYPE] - def __init__(self, configuration_file_path, run_id, sample_sheet_path, - mapping_file_path, output_path, qiita_job_id, pipeline_type): + def __init__(self, configuration_file_path, run_id, input_file_path, + output_path, qiita_job_id, pipeline_type): """ Initialize Pipeline object w/configuration information. :param configuration_file_path: Path to configuration.json file. :param run_id: Used w/search_paths to locate input run_directory. - :param sample_sheet_path: Path to sample-sheet. - :param mapping_file_path: Path to mapping file. + :param input_file_path: Path to sample-sheet or pre-prep file. :param output_path: Path where all pipeline-generated files live. :param qiita_job_id: Qiita Job ID creating this Pipeline. :param pipeline_type: Pipeline type ('Amplicon', 'Metagenomic', etc.) 
""" - if sample_sheet_path is not None and mapping_file_path is not None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") - - if sample_sheet_path is None and mapping_file_path is None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") + if input_file_path is None: + raise PipelineError("user_input_file_path cannot be None") if pipeline_type not in Pipeline.pipeline_types: raise PipelineError(f"'{type}' is not a valid pipeline type.") @@ -196,21 +190,33 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, self.qiita_job_id = qiita_job_id self.pipeline = [] - if sample_sheet_path: - self.search_paths = self.configuration['search_paths'] - self.sample_sheet = self._validate_sample_sheet(sample_sheet_path) - self.mapping_file = None - else: + # this method will catch a run directory as well as its products + # directory, which also has the same name. Hence, return the + # shortest matching path as that will at least return the right + # path between the two. + results = [] + + if pipeline_type == Pipeline.AMPLICON_PTYPE: self.search_paths = self.configuration['amplicon_search_paths'] - self.mapping_file = self._validate_mapping_file(mapping_file_path) - # unlike _validate_sample_sheet() which returns a SampleSheet - # object that stores the path to the file it was created from, - # _validate_mapping_file() just returns a DataFrame. Store the - # path to the original mapping file itself as well. - self.mapping_file_path = mapping_file_path - self.sample_sheet = None + else: + self.search_paths = self.configuration['search_paths'] - self.run_dir = self._search_for_run_dir() + for search_path in self.search_paths: + logging.debug(f'Searching {search_path} for {self.run_id}') + for entry in listdir(search_path): + some_path = join(search_path, entry) + # ensure some_path never ends in '/' + some_path = some_path.rstrip('/') + if isdir(some_path) and some_path.endswith(self.run_id): + logging.debug(f'Found {some_path}') + results.append(some_path) + + if results: + results.sort(key=lambda s: len(s)) + self.run_dir = results[0] + else: + raise PipelineError(f"A run-dir for '{self.run_id}' could not be " + "found") # required files for successful operation # both RTAComplete.txt and RunInfo.xml should reside in the root of @@ -228,14 +234,44 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, except PermissionError: raise PipelineError('RunInfo.xml is present, but not readable') - if self.mapping_file is not None: + self.input_file_path = input_file_path + + if pipeline_type == Pipeline.AMPLICON_PTYPE: + # assume input_file_path references a pre-prep (mapping) file. + + self.mapping_file = self._validate_mapping_file(input_file_path) + # unlike _validate_sample_sheet() which returns a SampleSheet + # object that stores the path to the file it was created from, + # _validate_mapping_file() just returns a DataFrame. Store the + # path to the original mapping file itself as well. + # create dummy sample-sheet output_fp = join(output_path, 'dummy_sample_sheet.csv') self.generate_dummy_sample_sheet(self.run_dir, output_fp) self.sample_sheet = output_fp + else: + # assume user_input_file_path references a sample-sheet. 
+ self.sample_sheet = self._validate_sample_sheet(input_file_path) + self.mapping_file = None self._configure_profile() + def get_software_configuration(self, software): + if software is None or software == "": + raise ValueError(f"'{software}' is not a valid value") + + key_order = ['profile', 'configuration', software] + + config = self.config_profile + + for key in key_order: + if key in config: + config = config[key] + else: + raise PipelineError(f"'{key}' is not defined in configuration") + + return config + def identify_reserved_words(self, words): ''' Returns a list of words that should not appear as column names in any @@ -254,7 +290,7 @@ def identify_reserved_words(self, words): # specifically how the proper set of prep-info file columns are # generated. For now the functionality will be defined here as this # area of metapool is currently in flux. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: reserved = PREP_MF_COLUMNS else: # results will be dependent on SheetType and SheetVersion of @@ -351,30 +387,6 @@ def _configure_profile(self): self.config_profile = selected_profile - def _search_for_run_dir(self): - # this method will catch a run directory as well as its products - # directory, which also has the same name. Hence, return the - # shortest matching path as that will at least return the right - # path between the two. - results = [] - - for search_path in self.search_paths: - logging.debug(f'Searching {search_path} for {self.run_id}') - for entry in listdir(search_path): - some_path = join(search_path, entry) - # ensure some_path never ends in '/' - some_path = some_path.rstrip('/') - if isdir(some_path) and some_path.endswith(self.run_id): - logging.debug(f'Found {some_path}') - results.append(some_path) - - if results: - results.sort(key=lambda s: len(s)) - return results[0] - - raise PipelineError(f"A run-dir for '{self.run_id}' could not be " - "found") - def _directory_check(self, directory_path, create=False): if exists(directory_path): logging.debug("directory '%s' exists." % directory_path) @@ -551,7 +563,7 @@ def generate_sample_info_files(self, addl_info=None): :param addl_info: A df of (sample-name, project-name) pairs. :return: A list of paths to sample-information-files. """ - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: # Generate a list of BLANKs for each project. df = self.mapping_file[['sample_name', 'project_name']] else: @@ -623,7 +635,7 @@ def get_sample_ids(self): # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: results = list(self.mapping_file.sample_name) else: results = [x.Sample_ID for x in self.sample_sheet.samples] @@ -638,7 +650,7 @@ def get_sample_names(self, project_name=None): ''' # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: return self._get_sample_names_from_mapping_file(project_name) else: return self._get_sample_names_from_sample_sheet(project_name) @@ -737,7 +749,7 @@ def get_project_info(self, short_names=False): # defined in both cases. 
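With the signature change, callers pass a single input_file_path for both assay types; the pipeline_type argument now decides whether that path is treated as a sample-sheet or as a pre-prep (mapping) file. A sketch of the two call patterns (paths, run id, and job id are placeholders):

    from sequence_processing_pipeline.Pipeline import Pipeline

    # metagenomic run: input_file_path points at a sample-sheet.
    meta_pipeline = Pipeline('configuration.json', '211021_A00000_0000_SAMPLE',
                             'good_sheet1.csv', '/tmp/output',
                             'some-qiita-job-id', Pipeline.METAGENOMIC_PTYPE)

    # amplicon run: the same argument carries the pre-prep (mapping) file.
    amp_pipeline = Pipeline('configuration.json', '211021_A00000_0000_SAMPLE',
                            'good_mapping_file.txt', '/tmp/output',
                            'some-qiita-job-id', Pipeline.AMPLICON_PTYPE)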
results = [] - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: if 'contains_replicates' in self.mapping_file: contains_replicates = True else: diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 54542fef..7a9e9a19 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -5,7 +5,35 @@ from os import rename, walk, chmod, listdir, makedirs from shutil import move, rmtree from re import match - +from metapool import load_sample_sheet + +""" +Note in tellread.sbatch, {{lane}} needs to be: + +if [[ ${LANE} == "L001" ]]; then + lane=s_1 +elif [[ ${LANE} == "L002" ]]; then + lane=s_2 +elif [[ ${LANE} == "L003" ]]; then + lane=s_3 +elif [[ ${LANE} == "L004" ]]; then + lane=s_4 +elif [[ ${LANE} == "L005" ]]; then + lane=s_5 +elif [[ ${LANE} == "L006" ]]; then + lane=s_6 +elif [[ ${LANE} == "L007" ]]; then + lane=s_7 +elif [[ ${LANE} == "L008" ]]; then + lane=s_8 +else + echo "Unrecognized lane: ${LANE}" + exit 1 +fi + +make sure compute_sequence_counts_for_normalization2.sbatch gets {{tellread_output}} as defined in $TELLREAD_OUTPUT in tellread.sh + +""" class TRConvertJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, @@ -146,6 +174,36 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self._generate_job_scripts() + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. + return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + def _generate_job_scripts(self): scripts = [ { @@ -417,6 +475,49 @@ def run(self, callback=None): # delete the original output directory. 
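The L001 -> s_1 translation called out in the tellread.sbatch note above is a fixed mapping over eight lanes; in Python it reduces to a small helper (the function name is illustrative, not part of this patch):

    def tellread_lane_token(lane_label):
        """Map an Illumina lane label such as 'L001' to TellRead's 's_1' form."""
        if len(lane_label) == 4 and lane_label.startswith('L'):
            lane_number = int(lane_label[1:])
            if 1 <= lane_number <= 8:
                return f"s_{lane_number}"
        raise ValueError(f"Unrecognized lane: {lane_label}")

    # e.g. tellread_lane_token('L003') == 's_3'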
rmtree(join(self.output_path, 'output')) + def run2(self, callback=None): + norm = True + assemble = True + + + + tr_job = self.submit_job('tr.script') + if tr_job['job_state'] != 'COMPLETED': + raise ValueError("TR JOB (%s) FAILED" % tr_job['job_id']) + + if norm is True: + ''' + cp ${norm_script} ${normcopy} + chmod gou-w ${normcopy} + ''' + nc_job = self.submit_job('norm_script') + if nc_job['job_state'] != 'COMPLETED': + raise ValueError("BC JOB (%s) FAILED" % nc_job['job_id']) + + int_job = self.submit_job('integrate.script') + if int_job['job_state'] != 'COMPLETED': + raise ValueError("INT JOB (%s) FAILED" % int_job['job_id']) + + if assemble is True: + # NB assemble jobs rely on successful integrate job + csj_job = self.submit_job('csj_script') + if csj_job['job_state'] != 'COMPLETED': + raise ValueError("CSJ JOB (%s) FAILED" % csj_job['job_id']) + + tlj_job = self.submit_job('tlj_script') + if tlj_job['job_state'] != 'COMPLETED': + raise ValueError("TLJ JOB (%s) FAILED" % tlj_job['job_id']) + + cleanup_job = self.submit_job('cleanup.script') + if cleanup_job['job_state'] != 'COMPLETED': + raise ValueError("CLEANUP JOB (%s) FAILED" % cleanup_job['job_id']) + + + + + + + def parse_logs(self): raise PipelineError("parsing logs not implemented.") diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index fff4b07d..9f8ea4f5 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -118,7 +118,7 @@ def test_validate_mapping_file_numeric_ids(self): with NamedTemporaryFile() as tmp: self._make_mapping_file(tmp.name) exp = ['1.0', '1e-3'] - pipeline = Pipeline(self.good_config_file, self.good_run_id, None, + pipeline = Pipeline(self.good_config_file, self.good_run_id, tmp.name, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -131,7 +131,7 @@ def test_validate_mapping_file_numeric_ids(self): def test_get_sample_names_from_sample_sheet(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.mp_sheet_path, None, + self.mp_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -156,7 +156,7 @@ def test_get_sample_names_from_sample_sheet(self): def test_get_orig_names_from_sheet_with_replicates(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -176,7 +176,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -188,7 +188,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -200,7 +200,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, bu" "t not readable"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) 
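run2() above repeats the same submit-and-check pattern for every step. One possible restructuring, sketched here using submit_job() exactly as this file already calls it, drives the chain from a list so the optional steps can be toggled without duplicated blocks (method name is illustrative):

    def run_chain(self, norm=True, assemble=True):
        # (step label, script name, run this step?) -- script names mirror run2().
        steps = [('TR', 'tr.script', True),
                 ('NORM', 'norm_script', norm),
                 ('INTEGRATE', 'integrate.script', True),
                 ('CSJ', 'csj_script', assemble),
                 ('TLJ', 'tlj_script', assemble),
                 ('CLEANUP', 'cleanup.script', True)]

        for label, script, enabled in steps:
            if not enabled:
                continue
            job = self.submit_job(script)
            if job['job_state'] != 'COMPLETED':
                raise ValueError("%s JOB (%s) FAILED" % (label, job['job_id']))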
self.make_runinfo_file_readable() @@ -210,7 +210,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -227,7 +227,7 @@ def test_creation(self): " valid sample-sheet."): Pipeline(self.good_config_file, self.good_run_id, - self.bad_assay_type_path, None, + self.bad_assay_type_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -235,7 +235,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -246,7 +246,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -257,7 +257,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -268,7 +268,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -278,7 +278,7 @@ def test_creation(self): "not a valid json file"): Pipeline(self.good_sample_sheet_path, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -301,7 +301,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -323,7 +323,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -346,7 +346,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -357,7 +357,7 @@ def test_sample_sheet_validation(self): # contained w/in its 'message' member. try: Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) except PipelineError as e: @@ -367,7 +367,7 @@ def test_sample_sheet_validation(self): # test unsuccessful validation of a bad sample-sheet with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - self.bad_sample_sheet_path, None, + self.bad_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) self.assertEqual(str(e.exception), ('Sample-sheet contains errors:\n' @@ -379,7 +379,6 @@ def test_generate_sample_information_files(self): # test sample-information-file generation. 
pipeline = Pipeline(self.good_config_file, self.good_run_id, self.good_sample_sheet_path, - None, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -493,7 +492,6 @@ def test_generate_sample_information_files_with_additional_meta(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, self.good_sample_sheet_path, - None, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1025,7 +1023,7 @@ def test_get_sample_ids(self): 'EP400448B04', 'EP479894B04'] # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1501,7 +1499,7 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1529,7 +1527,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1560,7 +1558,7 @@ def test_get_project_info(self): self.assertEqual(sorted(obs_project_names), sorted(exp_project_names)) pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1572,7 +1570,7 @@ def test_get_project_info(self): def test_configuration_profiles(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1601,7 +1599,7 @@ def test_configuration_profiles(self): def test_parse_project_name(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1633,7 +1631,7 @@ def test_parse_project_name(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1650,7 +1648,7 @@ def test_identify_reserved_words(self): # create new pipeline using a/legacy (v90) metagenomic sample-sheet. 
pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_legacy_sheet_path, None, + self.good_legacy_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1743,7 +1741,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1755,7 +1753,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1766,7 +1764,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, " "but not readable"): - Pipeline(self.good_config_file, self.good_run_id, None, + Pipeline(self.good_config_file, self.good_run_id, self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.make_runinfo_file_readable() @@ -1776,7 +1774,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1791,7 +1789,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1802,7 +1800,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1813,7 +1811,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1824,7 +1822,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1832,7 +1830,7 @@ def test_mapping_file_validation(self): # test successful validation of a good mapping-file. try: Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) except PipelineError as e: @@ -1842,7 +1840,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_missing_column, + self.mf_missing_column, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ('Mapping-file is missing ' @@ -1852,7 +1850,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. 
with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_duplicate_sample, + self.mf_duplicate_sample, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ("Mapping-file contains duplicate " @@ -1879,7 +1877,6 @@ def test_is_sample_sheet(self): def test_generate_sample_information_files(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2101,7 +2098,6 @@ def test_get_sample_ids(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2229,7 +2225,6 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2253,7 +2248,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -2270,33 +2265,12 @@ def test_get_project_info(self): self.assertDictEqual(obs_d, exp_d) break - def test_additional_constuctor_check(self): - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - None, None, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - self.sample_sheet_path, - self.good_mapping_file_path, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - def test_dummy_sheet_generation(self): # generate a RunInfo.xml file w/only one indexed read. self.create_runinfo_file(four_reads=False) _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2315,7 +2289,6 @@ def test_dummy_sheet_generation(self): _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2335,7 +2308,6 @@ def test_dummy_sheet_generation(self): def test_process_run_info_file(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2375,7 +2347,6 @@ def test_process_run_info_file(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, From 6cdc7ba986b0171ecd9b1ce52efc8be77e611eed Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 1 Oct 2024 14:44:34 -0700 Subject: [PATCH 15/47] Version 2.0 of TellSeq support. Version 2.0 of TellSeq support removes the master tellread.sh script and the drop-in replacement TRConvertJob.py for Job()s that wrap individual steps in the original script. These steps can be used in whole or in part in varying order in the refactored SPP plugin (qp-klp). 
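Since the per-step Job classes replace the single master script, the calling plugin composes them explicitly. A rough sketch of one such ordering follows; the constructor arguments are abbreviated placeholders, not the real qp-klp call sites:

    from sequence_processing_pipeline.TellReadJob import TellReadJob
    from sequence_processing_pipeline.TRNormCountsJob import TRNormCountsJob
    from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob

    # placeholder values; the real ones come from the pipeline configuration.
    common = dict(run_dir='/sequencing/my_run', output_path='/tmp/working_dir',
                  sample_sheet_path='sheet.csv', queue_name='qiita',
                  node_count=1, wall_time_limit=1440, jmem=8,
                  modules_to_load=[], qiita_job_id='job-id',
                  max_array_length=1000,
                  indicies_script_path='integrate-indices-np.py',
                  label='my-run', reference_base=None, reference_map=None)

    tellread = TellReadJob(tmp1_path='/tmp/working_dir/tmp1',
                           sing_script_path='run_tellread_sing.sh',
                           lane=1, **common)
    tellread.run()

    # counts-based normalization is optional; integration is not.
    TRNormCountsJob(**common).run()
    TRIntegrateJob(**common).run()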
--- sequence_processing_pipeline/TRConvertJob.py | 582 ------------------ .../TRIntegrateJob.py | 139 +++++ .../TRNormCountsJob.py | 142 +++++ sequence_processing_pipeline/TellReadJob.py | 181 ++++++ .../templates/cloudspades-isolate.sbatch | 25 +- .../templates/cloudspades.sbatch | 25 +- ...e_sequence_counts_for_normalization.sbatch | 33 +- .../templates/integrate.sbatch | 41 +- .../templates/telllink-isolate.sbatch | 30 +- .../templates/telllink.sbatch | 30 +- .../templates/tellread-cleanup.sbatch | 7 +- .../templates/tellread.sbatch | 83 +-- .../templates/tellread.sh | 262 -------- 13 files changed, 510 insertions(+), 1070 deletions(-) delete mode 100644 sequence_processing_pipeline/TRConvertJob.py create mode 100644 sequence_processing_pipeline/TRIntegrateJob.py create mode 100644 sequence_processing_pipeline/TRNormCountsJob.py create mode 100644 sequence_processing_pipeline/TellReadJob.py delete mode 100755 sequence_processing_pipeline/templates/tellread.sh diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py deleted file mode 100644 index 7a9e9a19..00000000 --- a/sequence_processing_pipeline/TRConvertJob.py +++ /dev/null @@ -1,582 +0,0 @@ -from jinja2 import Environment -from os.path import split, join, exists -from sequence_processing_pipeline.Job import Job, KISSLoader -from sequence_processing_pipeline.PipelineError import PipelineError -from os import rename, walk, chmod, listdir, makedirs -from shutil import move, rmtree -from re import match -from metapool import load_sample_sheet - -""" -Note in tellread.sbatch, {{lane}} needs to be: - -if [[ ${LANE} == "L001" ]]; then - lane=s_1 -elif [[ ${LANE} == "L002" ]]; then - lane=s_2 -elif [[ ${LANE} == "L003" ]]; then - lane=s_3 -elif [[ ${LANE} == "L004" ]]; then - lane=s_4 -elif [[ ${LANE} == "L005" ]]; then - lane=s_5 -elif [[ ${LANE} == "L006" ]]; then - lane=s_6 -elif [[ ${LANE} == "L007" ]]; then - lane=s_7 -elif [[ ${LANE} == "L008" ]]; then - lane=s_8 -else - echo "Unrecognized lane: ${LANE}" - exit 1 -fi - -make sure compute_sequence_counts_for_normalization2.sbatch gets {{tellread_output}} as defined in $TELLREAD_OUTPUT in tellread.sh - -""" - -class TRConvertJob(Job): - def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, - node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, - modules_to_load, qiita_job_id): - """ - TRConvertJob provides a convenient way to run bcl-convert or bcl2fastq - on a directory BCL files to generate Fastq files. - :param run_dir: The 'run' directory that contains BCL files. - :param output_path: Path where all pipeline-generated files live. - :param sample_sheet_path: The path to a sample-sheet. - :param queue_name: The name of the Torque queue to use for processing. - :param node_count: The number of nodes to request. - :param nprocs: The maximum number of parallel processes to use. - :param wall_time_limit: A hard time limit (in min) to bound processing. - :param bcl_tool_path: The path to either bcl2fastq or bcl-convert. - :param modules_to_load: A list of Linux module names to load - :param qiita_job_id: identify Torque jobs using qiita_job_id - """ - super().__init__(run_dir, - output_path, - 'ConvertJob', - [bcl_tool_path], - 1000, - modules_to_load=modules_to_load) - - # for metagenomics pipelines, sample_sheet_path will reflect a real - # sample_sheet file. For amplicon pipelines, sample_sheet_path will - # reference a dummy sample_sheet file. 
- self.sample_sheet_path = sample_sheet_path - self.queue_name = queue_name - self.node_count = node_count - self.nprocs = nprocs - self.wall_time_limit = wall_time_limit - self.pmem = pmem - self.bcl_tool = bcl_tool_path - self.qiita_job_id = qiita_job_id - self.suffix = 'fastq.gz' - - self.tellread_output_path = join(self.output_path, 'output') - makedirs(self.tellread_output_path) - - self.tmp1_path = join(self.tellread_output_path, 'tmp1') - - makedirs(self.tmp1_path) - - # for projects that use sequence_processing_pipeline as a dependency, - # jinja_env must be set to sequence_processing_pipeline's root path, - # rather than the project's root path. - self.jinja_env = Environment(loader=KISSLoader('templates'), - # set Jinja2 comment strings to be - # anything other than '{#' and '#}', - # which can be used in shell scripts. - comment_start_string='%%%%%%%%%%', - comment_end_string='%%%%%%%%%%') - - tmp = False - for executable_name in ['bcl2fastq', 'bcl-convert']: - if executable_name in self.bcl_tool: - tmp = True - break - - if not tmp: - raise PipelineError(f'{self.bcl_tool} is not the path to a known' - 'executable') - - self._file_check(self.sample_sheet_path) - - # As the sample-sheet is validated by the Pipeline object before - # being passed to TRConvertJob, additional validation isn't needed. - - # TODO: generate a sample-mapping to map C#s to fake sample-names and - # fake projects. Process sample-sheet later. - self.mapping = self._generate_sample_mapping() - - # TODO: hardcode lane at 'L001' - self.lane = 'L001' - - self.clean_wall_time_limit = "24:00:00" - self.clean_mem_in_gb = "8" - self.clean_node_count = "1" - self.clean_cores_per_task = "1" - self.cloudspades_cores_per_task = "12" - self.cloudspades_mem_in_gb = "128" - self.cloudspades_modules = ["gcc_9.3.0"] - self.cloudspades_node_count = "1" - self.cloudspades_path = ("/home/qiita_test/qiita-spots/spades-" - "cloudspades-0.1") - self.cloudspades_wall_time_limit = "24:00:00" - self.counts_cores_per_task = "1" - self.counts_create_picklist_path = ("/home/qiita_test/qiita-spots/" - "create_picklist.py") - self.counts_mem_in_gb = "8" - self.counts_node_count = "1" - self.counts_other_file = ('20230906_FS10001773_68_BTR67708-1611.' 
- 'read_counts.tsv') - self.counts_plot_counts_path = ("/home/qiita_test/qiita-spots/'" - "'plot_counts.py") - self.counts_sample_sheet = ("/home/qiita_test/qiita-spots/" - "20230906_FS10001773_68_BTR67708-1611.csv") - self.counts_wall_time_limit = "24:00:00" - self.cs_isolate_mem_in_gb = "64" - self.integrate_indicies_script_path = ("/home/qiita_test/qiita-spots/" - "integrate-indices-np.py") - self.integrate_mem_in_gb = "8" - self.integrate_node_count = "1" - self.integrate_wall_time_limit = "24:00:00" - self.integrate_cores_per_task = "1" - self.queue_name = "qiita" - self.tellink_cores_per_task = "16" - self.tellink_mem_in_gb = "160" - self.tellink_modules = ["singularity_3.6.4"] - self.tellink_node_count = "1" - self.tellink_sing_path = ("/projects/long_read_collab/code/tellseq/" - "release_v1.11/tellink-release/" - "run_tellink_sing.sh") - self.tellink_wall_time_limit = "96:00:00" - self.tellread_cores_per_task = "4" - self.tellread_mem_in_gb = "16" - self.tellread_modules = ["singularity_3.6.4"] - self.tellread_node_count = "1" - self.tellread_sing_script_path = ("$HOME/qiita-spots/tellread-release" - "-novaseqX/run_tellread_sing.sh") - self.tellread_wall_time_limit = "96:00:00" - self.tl_cores_per_task = "16" - self.tl_isolate_node_count = "1" - self.tl_isolate_wall_time_limit = "96:00:00" - self.tl_mem_in_gb = "160" - self.main_map = ("/home/qiita_test/qiita-spots/20230906_FS10001773_" - "68_BTR67708-1611.csv") - self.main_mode = "metagenomic" - self.main_seqrun_path = ("/sequencing/seqmount/KL_iSeq_Runs/20230906" - "_FS10001773_68_BTR67708-1611") - - # TODO: Address reference_map and reference_base - self.main_reference_base = "" - self.main_reference_map = "" - - self._generate_job_scripts() - - def _process_sample_sheet(self): - sheet = load_sample_sheet(self.sample_sheet_path) - - if not sheet.validate_and_scrub_sample_sheet(): - s = "Sample sheet %s is not valid." % self.sample_sheet_path - raise PipelineError(s) - - header = sheet.Header - chemistry = header['chemistry'] - - if header['Assay'] not in Pipeline.assay_types: - s = "Assay value '%s' is not recognized." % header['Assay'] - raise PipelineError(s) - - sample_ids = [] - for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) - - bioinformatics = sheet.Bioinformatics - - # reorganize the data into a list of dictionaries, one for each row. - # the ordering of the rows will be preserved in the order of the list. - lst = bioinformatics.to_dict('records') - - # human-filtering jobs are scoped by project. Each job requires - # particular knowledge of the project. 
- return {'chemistry': chemistry, - 'projects': lst, - 'sample_ids': sample_ids} - - def _generate_job_scripts(self): - scripts = [ - { - "template": "cloudspades.sbatch", - "params": { - "job_name": "cs-assemble", - "wall_time_limit": self.wall_time_limit, - "mem_in_gb": self.cloudspades_mem_in_gb, - "node_count": self.cloudspades_node_count, - "cores_per_task": self.cloudspades_cores_per_task, - "queue_name": self.queue_name, - "modules_to_load": ' '.join(self.cloudspades_modules), - "cloudspades_path": self.cloudspades_path - } - }, - { - "template": "cloudspades-isolate.sbatch", - "params": { - "job_name": "cs-assemble", - "wall_time_limit": self.cloudspades_wall_time_limit, - "mem_in_gb": self.cs_isolate_mem_in_gb, - "node_count": self.cloudspades_node_count, - "cores_per_task": self.cloudspades_cores_per_task, - "queue_name": self.queue_name, - "modules_to_load": ' '.join(self.cloudspades_modules), - "cloudspades_path": self.cloudspades_path - } - }, - { - "template": "integrate.sbatch", - "params": { - "job_name": "integrate", - "wall_time_limit": self.integrate_wall_time_limit, - "mem_in_gb": self.integrate_mem_in_gb, - "node_count": self.integrate_node_count, - "cores_per_task": self.integrate_cores_per_task, - "iinp_script_path": self.integrate_indicies_script_path, - "queue_name": self.queue_name - } - }, - { - "template": "compute_sequence_counts_for_normalization.sbatch", - "params": { - "job_name": "norm", - "wall_time_limit": self.counts_wall_time_limit, - "mem_in_gb": self.counts_mem_in_gb, - "node_count": self.counts_node_count, - "cores_per_task": self.counts_cores_per_task, - "sample_sheet": self.counts_sample_sheet, - "plot_counts_path": self.counts_plot_counts_path, - "output_path": self.tellread_output_path, - "create_picklist_path": self.counts_create_picklist_path, - "read_counts_path": join(self.tellread_output_path, - self.counts_other_file), - "queue_name": self.queue_name - } - }, - { - "template": "telllink.sbatch", - "params": { - "job_name": "tellink", - "wall_time_limit": self.tellink_wall_time_limit, - "mem_in_gb": self.tellink_mem_in_gb, - "node_count": self.tellink_node_count, - "cores_per_task": self.tellink_cores_per_task, - "queue_name": self.queue_name, - "modules_to_load": ' '.join(self.tellink_modules), - "output_path": self.tellread_output_path, - "sing_path": self.tellink_sing_path - } - }, - { - "template": "telllink-isolate.sbatch", - "params": { - "job_name": "tellink-isolate", - "wall_time_limit": self.tellink_wall_time_limit, - "node_count": self.tl_isolate_node_count, - "cores_per_task": self.tl_cores_per_task, - "mem_in_gb": self.tl_mem_in_gb, - "queue_name": self.queue_name, - "modules_to_load": ' '.join(self.tellink_modules), - "output_path": self.tellread_output_path, - "sing_path": self.tellink_sing_path - } - }, - { - "template": "tellread.sbatch", - "params": { - "job_name": "tellread", - "wall_time_limit": self.tellread_wall_time_limit, - "mem_in_gb": self.tellread_mem_in_gb, - "node_count": self.tellread_node_count, - "tmp_dir": self.tmp1_path, - "cores_per_task": self.tellread_cores_per_task, - "queue_name": self.queue_name, - "sing_script_path": self.tellread_sing_script_path, - "modules_to_load": ' '.join(self.tellread_modules) - } - }, - { - "template": "tellread-cleanup.sbatch", - "params": { - "job_name": "cleanup", - "wall_time_limit": self.clean_wall_time_limit, - "mem_in_gb": self.clean_mem_in_gb, - "node_count": self.clean_node_count, - "cores_per_task": self.clean_cores_per_task, - "queue_name": self.queue_name - } - }, - 
# these hardcoded paths for tellread.sh need to be replaced with - # the lane number and run-directory path, and the lane and the - # mode from the user input. Note that we also need to process the - # upcoming sample-sheet in order to generate the mapping we need - # as well. - { - "template": "tellread.sh", - "params": { - "tellread_map": self.main_map, - "seqrun_path": self.main_seqrun_path, - "output_path": self.tellread_output_path, - "lane": self.lane, - "reference_map": self.main_reference_map, - "reference_base": self.main_reference_base, - "mode": self.main_mode - } - } - ] - - for script in scripts: - template = self.jinja_env.get_template(script["template"]) - params = script["params"] - job_script_path = join(self.output_path, script["template"]) - - with open(job_script_path, 'w') as f: - f.write(template.render(**params)) - # TODO: Change from 777 to something more appropriate. - chmod(job_script_path, 0o777) - - def run(self, callback=None): - """ - Run BCL2Fastq/BCLConvert conversion - :param callback: optional function taking two parameters (id, status) - that is called when a running process's status is - changed. - :return: - """ - - # Unlike other Jobs that submit a Slurm script and wait for the job - # to complete, this Job() will execute an existing shell script that - # spawns all the jobs that perform the actual work. - - # tellread.sh performs some work that requires it to run on a compute - # node. Since Job()s run on the interactive node, an interactive - # shell on a compute node must be requested for this script to run on. - - # define 'sjob' here for clarity. This should be more than adequate - # resources to run the tellread.sh script and exit as it does not wait - # on its children to complete. - - # as with the original scripts, the scripts generated by Jinja2 will - # live in the current working directory. Hence, the script will always - # exist at ./tellread.sh provided it was created successfully. - sjob = "srun -N 1 -n 1 -p qiita --mem 4g --time 1:00:00 --pty bash -l" - command = (f"{sjob}; pushd .;cd {self.output_path}; ./tellread.sh; " - "popd; exit") - - if not exists(join(self.output_path, 'tellread.sh')): - raise PipelineError("tellread.sh script could not be found.") - - res = self._system_call(command) - - if res['return_code'] != 0: - raise PipelineError("tellread.sh script did not execute correctly") - - # once _system_call() returns and tellread.sh executed correctly, then - # a pids file should exist in the output subdirectory. - pids_fp = join(self.output_path, 'output', 'pids') - if not exists(pids_fp): - raise PipelineError("TRConvertJob could not locate a pids file") - - with open(pids_fp, 'r') as f: - lines = f.readlines() - lines = [x.strip().split(': ') for x in lines] - results = {k: v for (k, v) in lines} - - child_processes = [('main tellread', 'TRJOB_RETURN_CODE', - 'TRJOB_PID', True), - ('counts', 'NORM_COUNTS_JOB_RETURN_CODE', - 'NORM_COUNTS_JOB_PID', False), - ('integrate', 'INTEGRATE_JOB_RETURN_CODE', - 'INTEGRATE_JOB_PID', True), - ('csj', 'CSJ_JOB_RETURN_CODE', - 'CSJ_JOB_PID', False), - ('tlj', 'TLJ_JOB_RETURN_CODE', - 'TLJ_JOB_PID', False), - ('cleanup', 'CLEANUP_JOB_RETURN_CODE', - 'CLEANUP_JOB_PID', True)] - - # Iterate through all the TellRead script's known child processes. - # Some children will be optional depending on the parameters given, - # while others are required. The Job() should immediately raise an - # error if any child (optional or not) exits unsuccessfully, however. 
- for name, code, _, is_required in child_processes: - if code in results: - if results[code] != '0': - raise PipelineError(f"An error ({results[code]}) occurred " - f"running {name} subprocess") - else: - if is_required: - raise PipelineError(f"The {name} subprocess did not " - "execute correctly") - - # Get a list of Slurm job ids that we need to wait on and text - # descriptions of what they are. - jids = [(results[x[2]], x[0]) for x in child_processes if - x[2] in results] - - # ensure the jids are casted to integers before passing them. - statuses = self.wait_on_job_ids([int(x[0]) for x in jids]) - - for jid, description in jids: - status = statuses[jid] - if status not in Job.slurm_status_successful: - raise PipelineError(f"process '{description}' ({jid}) " - f"failed ({status})") - - # post-process working directory to make it appear like results - # generated by ConvertJob - - integrated_files_path = join(self.output_path, 'output', "integrated") - - if not exists(integrated_files_path): - raise ValueError(f"{integrated_files_path} does not exist") - - # move integrated directory to TRConvertJob directory, co-level with - # output directory. This makes it easier to delete the rest of the - # output that we don't need. - - # move err and out logs into logs subdirectory. - for root, dirs, files in walk(self.output_path): - for _file in files: - _path = join(root, _file) - if _path.endswith('.err'): - move(_path, join(self.output_path, 'logs')) - elif _path.endswith('.out'): - move(_path, join(self.output_path, 'logs')) - # don't go below one level. - break - - # save two logs and move them into standard Job logs directory. - move(join(self.output_path, 'output', 'log'), - join(self.output_path, 'logs')) - move(join(self.output_path, 'output', 'output.log'), - join(self.output_path, 'logs')) - - # rename the files and move them into project directories. - for root, dirs, files in walk(integrated_files_path): - for _file in files: - fastq_file = join(root, _file) - self._post_process_file(fastq_file, self.mapping) - - # move project folders from integrated directory to working_dir. - contents = listdir(integrated_files_path) - for name in contents: - move(join(integrated_files_path, name), - self.output_path) - - # delete the original output directory. 
- rmtree(join(self.output_path, 'output')) - - def run2(self, callback=None): - norm = True - assemble = True - - - - tr_job = self.submit_job('tr.script') - if tr_job['job_state'] != 'COMPLETED': - raise ValueError("TR JOB (%s) FAILED" % tr_job['job_id']) - - if norm is True: - ''' - cp ${norm_script} ${normcopy} - chmod gou-w ${normcopy} - ''' - nc_job = self.submit_job('norm_script') - if nc_job['job_state'] != 'COMPLETED': - raise ValueError("BC JOB (%s) FAILED" % nc_job['job_id']) - - int_job = self.submit_job('integrate.script') - if int_job['job_state'] != 'COMPLETED': - raise ValueError("INT JOB (%s) FAILED" % int_job['job_id']) - - if assemble is True: - # NB assemble jobs rely on successful integrate job - csj_job = self.submit_job('csj_script') - if csj_job['job_state'] != 'COMPLETED': - raise ValueError("CSJ JOB (%s) FAILED" % csj_job['job_id']) - - tlj_job = self.submit_job('tlj_script') - if tlj_job['job_state'] != 'COMPLETED': - raise ValueError("TLJ JOB (%s) FAILED" % tlj_job['job_id']) - - cleanup_job = self.submit_job('cleanup.script') - if cleanup_job['job_state'] != 'COMPLETED': - raise ValueError("CLEANUP JOB (%s) FAILED" % cleanup_job['job_id']) - - - - - - - - def parse_logs(self): - raise PipelineError("parsing logs not implemented.") - - @staticmethod - def parse_job_script(job_script_path): - raise PipelineError("parsing job script not implemented.") - - def _post_process_file(self, fastq_file, mapping): - # generate names of the form generated by bcl-convert/bcl2fastq: - # _S#_L00#__001.fastq.gz - # see: - # https://help.basespace.illumina.com/files-used-by-basespace/ - # fastq-files - _dir, _file = split(fastq_file) - - # ex: integrated/C544.R2.fastq.gz - m = match(r"(C5\d\d)\.([R,I]\d)\.fastq.gz", _file) - - if m is None: - raise ValueError(f"The filename '{_file}' is not of a " - "recognizable form") - - adapter_id = m[1] - read_type = m[2] - - if adapter_id not in mapping: - raise ValueError(f"{adapter_id} is not present in mapping") - - sample_name, sample_index, project_name = mapping[adapter_id] - - # generate the new filename for the fastq file, and reorganize the - # files by project. - new_name = "%s_S%d_%s_%s_001.fastq.gz" % (sample_name, - sample_index, - self.lane, - read_type) - - # ensure that the project directory exists before we rename and move - # the file to that location. - makedirs(join(_dir, project_name), exist_ok=True) - - # if there's an error renaming and moving the file, let it pass up to - # the user. - final_path = join(_dir, project_name, new_name) - rename(fastq_file, final_path) - return final_path - - def _generate_sample_mapping(self): - # this generates a sample mapping for the C501-C596 adapters used by - # the vendor to a sample-name and project. In production use this - # mapping would need to be created from the future sample-sheet. 
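The removed _post_process_file() above encodes the bcl2fastq-style naming convention (<sample>_S<index>_<lane>_<read>_001.fastq.gz). A standalone sketch of that renaming step, matching the same C5xx adapter pattern; the function name and returned project-relative path are illustrative:

    from os.path import basename, join
    from re import match

    def bcl_style_name(tellread_fastq, mapping, lane='L001'):
        """Translate e.g. 'C544.R2.fastq.gz' to 'MySample44_S44_L001_R2_001.fastq.gz'."""
        m = match(r"(C5\d\d)\.([RI]\d)\.fastq.gz", basename(tellread_fastq))
        if m is None:
            raise ValueError(f"unrecognized filename: {tellread_fastq}")
        sample_name, sample_index, project_name = mapping[m[1]]
        new_name = "%s_S%d_%s_%s_001.fastq.gz" % (sample_name, sample_index,
                                                  lane, m[2])
        # files are grouped into per-project subdirectories.
        return join(project_name, new_name)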
- project_names = ['Project1', 'Project2', 'Project3'] - sample_mapping = {} - - for sample_index in range(1, 97): - adapter_id = "C%s" % str(sample_index + 500) - sample_name = "MySample%d" % sample_index - project_name = project_names[sample_index % 3] - sample_mapping[adapter_id] = (sample_name, sample_index, - project_name) - - return sample_mapping diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py new file mode 100644 index 00000000..076a15fe --- /dev/null +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -0,0 +1,139 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class TRIntegrateJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, indicies_script_path, label, + reference_base, reference_map, cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: None + :param indicies_script_path: None + :param label: None + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.indicies_script_path = indicies_script_path + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. 
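    # NB (sketch, not shipped behavior): the str(int(jmem)) conversion below
    # raises on values such as "4.5" even though the preceding comment allows
    # a floating point value; a float-tolerant variant would be
    # str(int(float(jmem))).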
+ self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.label = label + + if self.reference_base != None or self.reference_map != None: + tag = 'reference-based' + else: + tag = 'reference-free' + + self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. 
+ return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'integrate.sbatch') + template = self.jinja_env.get_template("integrate2.sbatch") + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "integrate", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "iinp_script_path": self.indicies_script_path, + "queue_name": self.queue_name, + "output_dir": self.output_path})) + + return job_script_path diff --git a/sequence_processing_pipeline/TRNormCountsJob.py b/sequence_processing_pipeline/TRNormCountsJob.py new file mode 100644 index 00000000..09e36a67 --- /dev/null +++ b/sequence_processing_pipeline/TRNormCountsJob.py @@ -0,0 +1,142 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class TRNormCountsJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, indicies_script_path, label, + reference_base, reference_map, cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: None + :param indicies_script_path: None + :param label: None + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.indicies_script_path = indicies_script_path + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. 
+ self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.label = label + + if self.reference_base != None or self.reference_map != None: + tag = 'reference-based' + else: + tag = 'reference-free' + + self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. 
+ return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'compute_sequence_counts_for_normalization.sbatch') + template = self.jinja_env.get_template("compute_sequence_counts_for_normalization2.sbatch") + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "#job_name": "integrate", + "#wall_time_limit": self.wall_time_limit, + "#mem_in_gb": self.jmem, + "#node_count": self.node_count, + "#cores_per_task": self.cores_per_task, + "#queue_name": self.queue_name, + "#output_path": self.output_path, + "read_counts_path": "TODO", + "sample_sheet": "TODO", + "tellread_output": "TODO" + })) + + return job_script_path diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py new file mode 100644 index 00000000..859974a4 --- /dev/null +++ b/sequence_processing_pipeline/TellReadJob.py @@ -0,0 +1,181 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class TellReadJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, indicies_script_path, label, + reference_base, reference_map, tmp1_path, sing_script_path, + lane, cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: None + :param indicies_script_path: None + :param label: None + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.indicies_script_path = indicies_script_path + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.label = label + self.sing_script_path = sing_script_path + self.tmp1_path = tmp1_path + + # force self.lane_number to be int. raise an Error if it's not. 
+ tmp = int(lane) + if tmp < 1 or tmp > 8: + raise ValueError(f"'{tmp}' is not a valid lane number") + self.lane_number = tmp + + if self.reference_base != None or self.reference_map != None: + tag = 'reference-based' + else: + tag = 'reference-free' + + self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + '-c ${sbatch_cores}', + '--mem ${sbatch_mem}', + '--time ${wall}'] + + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TellReadJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TellReadJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. + return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'integrate.sbatch') + template = self.jinja_env.get_template("tellread2.sbatch") + + # generate a comma separated list of sample-ids from the tuples stored + # in self.sample_ids. + + # NB: the current sample-sheet format used for TellRead doesn't include + # sample-names and sample-ids, only sample_id. e.g. C501,C502,etc. + # Hence, when a final sample sheet format is ready, it may be prudent + # to switch this to pull values from the expected sample-names column + # instead. + samples = ','.join([id[0] for id in self.sample_ids]) + + # since we haven't included support for reference_map yet, whenever a + # reference is not included, the mapping against the list of sample_ids + # is ['NONE', 'NONE', ..., 'NONE']. + refs = ','.join(['NONE' for _ in self.sample_ids]) + + extra = "" + + # if reference_base is added in the future and is defined, exta needs + # to be f"-f {reference_base}". 
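    # sketch only: when reference_base support is added, the flag could be
    # built here instead of being left commented out, e.g.:
    #     extra = f"-f {self.reference_base}" if self.reference_base else ""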
+ # extra = "-f ${REFBASE}" + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "tellread", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "sing_script_path": self.sing_script_path, + "tmp_dir": self.tmp1_path, + "modules_to_load": ' '.join(self.modules_to_load), + "lane": f"s_{self.lane_number}", + # TODO: make sure this is the 'ConvertJob/output' directory + "output": self.output_path, + "rundir_path": self.root_dir, + "samples": samples, + "refs": refs, + "extra": extra + })) + + return job_script_path diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 261c11c7..1ac51b2e 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -10,35 +10,18 @@ #SBATCH --error cloudspades-isolate_%x-%A_%a.err source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} set -x set -e -echo $TMPDIR - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=${OUTPUT} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - module load {{modules_to_load}} -samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) # assumes 1-based array index, eg --array 1-N sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -cs=${base}/cloudspades-isolate/${sample} +cs={{output_path}}/cloudspades-isolate/${sample} if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then if [[ -d ${cs} ]]; then @@ -51,8 +34,8 @@ pushd {{cloudspades_path}}/assembler/bin ./spades.py \ -o ${cs} \ - --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + --gemcode1-1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 module unload gcc_9.3.0 popd diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index 636dd5ce..72efb140 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -10,35 +10,18 @@ #SBATCH --error cloudspades_%x-%A_%a.err source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} set -x set -e -echo $TMPDIR - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=${OUTPUT} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - module load {{modules_to_load}} -samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) # assumes 1-based array index, eg --array 1-N sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -cs=${base}/cloudspades/${sample} +cs={{output_path}}/cloudspades/${sample} if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then if [[ -d ${cs} ]]; then @@ -51,8 +34,8 @@ pushd {{cloudspades_path}}/assembler/bin ./spades.py \ -o ${cs} \ - --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + --gemcode1-1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ --meta \ -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 module unload gcc_9.3.0 diff --git a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch index a4b31114..ab8af109 100644 --- a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch +++ b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch @@ -12,45 +12,14 @@ # NB: output appears normal w/out. # source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - set -x set -e set -o pipefail echo $TMPDIR -tellread=${TELLREAD_OUTPUT} -if [[ ! -d ${tellread} ]]; then - echo "${tellread} not found" - exit 1 -fi - -if [[ ! -d ${tellread}/Full ]]; then - echo "${tellread}/Full not found" - exit 1 -fi - -if [[ -z {{output_path}} ]]; then - echo "OUTPUT not specified" - exit 1 -fi - -if [[ -z {{sample_sheet}} ]]; then - echo "SAMPLESHEET not specified" - exit 1 -fi - -if [[ ! -f {{sample_sheet}} ]]; then - echo "SAMPLESHEET not found" - exit 1 -fi - mkdir -p {{output_path}} -wc -l ${tellread}/Full/*_I1_C5[0-9][0-9].fastq.gz.corrected.err_barcode_removed.fastq > {{output_path}}/record_counts.txt +wc -l {{tellread_output}}/Full/*_I1_C5[0-9][0-9].fastq.gz.corrected.err_barcode_removed.fastq > {{output_path}}/record_counts.txt python {{plot_counts_path}} {{output_path}}/record_counts.txt {{sample_sheet}} {{output_path}} conda activate qp-knight-lab-processing-2022.03 diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 30a3a9ba..8c767382 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -9,47 +9,26 @@ #SBATCH --output integrate_%x-%A_%a.out #SBATCH --error integrate_%x-%A_%a.err -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -# https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html -cores=${SLURM_CPUS_PER_TASK} - +# NB SLURM_ARRAY_TASK_ID is exported by Slurm if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then echo "Not operating in an array" exit 1 fi +# NB SLURM_ARRAY_TASK_MIN is exported by Slurm if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then echo "Line extraction assumes 1-based index" exit 1 fi -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -if [[ -z ${BASE} ]]; then - echo "BASE not specified" - exit 1 -fi - -tellread=${OUTPUT} -if [[ ! -d ${tellread} ]]; then - echo "${tellread} not found" - exit 1 -fi - set -x set -e set -o pipefail -samples=($(cat ${tellread}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_dir}}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} +# NB TMPDIR IS CREATED IN CURRENT DIRECTORY. CURRENT DIRECTORY MUST BE CORRECT. 
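+# The EXIT trap below is what keeps failed array tasks from leaving scratch
+# data behind: with `set -e` in effect the script can abort at any point, and
+# the trap still runs cleanup() to remove whatever mktemp -d allocated.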
export TMPDIR=$(mktemp -d) function cleanup { echo "Removing $TMPDIR" @@ -59,8 +38,8 @@ function cleanup { trap cleanup EXIT files=${TMPDIR}/integration.files -/bin/ls -1 ${tellread}/Full/*corrected.err_barcode_removed.fastq > ${files} -mkdir -p ${tellread}/integrated +/bin/ls -1 {{output_dir}}/Full/*corrected.err_barcode_removed.fastq > ${files} +mkdir -p {{output_dir}}/integrated if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then echo "Multiple matches for ${sample} R1" @@ -80,9 +59,9 @@ fi r1=$(grep -m 1 "_R1_${sample}" ${files}) r2=$(grep -m 1 "_R2_${sample}" ${files}) i1=$(grep -m 1 "_I1_${sample}" ${files}) -r1out=${tellread}/integrated/${sample}.R1.fastq.gz -r2out=${tellread}/integrated/${sample}.R2.fastq.gz -i1out=${tellread}/integrated/${sample}.I1.fastq.gz +r1out={{output_dir}}/integrated/${sample}.R1.fastq.gz +r2out={{output_dir}}/integrated/${sample}.R2.fastq.gz +i1out={{output_dir}}/integrated/${sample}.I1.fastq.gz if [[ ! -s ${r1} ]]; then echo "${r1} is empty, cannot integrate" @@ -114,4 +93,4 @@ python {{iinp_script_path}} integrate \ --i1-in ${i1} \ --r1-out ${r1out} \ --r2-out ${r2out} \ - --threads ${cores} + --threads ${SLURM_CPUS_PER_TASK} diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index b8f9d735..90e04012 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -14,25 +14,13 @@ set -e module load {{modules_to_load}} -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base={{output_path}} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} k=79 lc=35 -cores=${SLURM_CPUS_PER_TASK} -tl=${base}/tell-link-isolate/${sample} +tl={{output_path}}/tell-link-isolate/${sample} if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then if [[ -d ${tl} ]]; then rm -fr ${tl} @@ -42,16 +30,16 @@ fi mkdir -p ${tl} {{sing_path}} \ - -r1 ${base}/integrated/${sample}.R1.fastq.gz \ - -r2 ${base}/integrated/${sample}.R2.fastq.gz \ - -i1 ${base}/integrated/${sample}.I1.fastq.gz \ - -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -r1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ + -r2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ + -i1 {{output_path}}}/integrated/${sample}.I1.fastq.gz \ + -o ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc} \ -k ${k} \ -lc ${lc} \ -p ${sample} \ - -j ${cores} + -j ${SLURM_CPUS_PER_TASK} # remove temporary data -if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +if [[ -d ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping fi diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index 234192b2..efdf0578 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -14,26 +14,14 @@ set -e module load {{modules_to_load}} -if [[ -z "${LABELTAG}" ]]; then - echo "LABEL is not specified" - exit 1 -fi - -base={{output_path}} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} # TODO: leave these hardcoded for now k=79 lc=35 -cores=${SLURM_CPUS_PER_TASK} -tl=${base}/tell-link/${sample} +tl={{output_path}}/tell-link/${sample} if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then if [[ -d ${tl} ]]; then rm -fr ${tl} @@ -43,17 +31,17 @@ fi mkdir -p ${tl} {{sing_path}} \ - -r1 ${base}/integrated/${sample}.R1.fastq.gz \ - -r2 ${base}/integrated/${sample}.R2.fastq.gz \ - -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -r1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ + -r2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ + -i1 {{output_path}}/integrated/${sample}.I1.fastq.gz \ -d metagenomics \ - -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -o ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc} \ -k ${k} \ -lc ${lc} \ -p ${sample} \ - -j ${cores} + -j ${SLURM_CPUS_PER_TASK} # remove temporary data -if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +if [[ -d ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping fi diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index 2cb479e7..e5b0873e 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -9,10 +9,5 @@ #SBATCH --output tellread-cleanup_%x-%A.out #SBATCH --error tellread-cleanup_%x-%A.err -if [[ -z "${OUTPUT}" ]]; then - echo "OUTPUT is not specified" - exit 1 -fi - # remove unused large outputs -rm -rf ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full +rm -rf {{OUTPUT}}/biosample_format {{OUTPUT}}/1_demult {{OUTPUT}}/Full diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index fe8d39d9..da439836 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -9,90 +9,27 @@ #SBATCH --output tellread_%x-%A.out #SBATCH --error tellread_%x-%A.err -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -set -x - -if [[ -z "${N_SAMPLES}" ]]; then - echo "N_SAMPLES is not specified" - exit 1 -fi - -if [[ -z "${SEQRUNPATH}" ]]; then - echo "SEQRUNPATH is not specified" - exit 1 -fi - -if [[ -z "${LANE}" ]]; then - echo "LANE is not specified" - exit 1 -fi - -if [[ -z "${SAMPLES}" ]]; then - echo "SAMPLES is not specified" - exit 1 -fi - -if [[ -z "${REFS}" ]]; then - echo "REFS is not specified" - exit 1 -fi - -if [[ -z "${OUTPUT}" ]]; then - echo "OUTPUT is not specified" - exit 1 -fi +set -x export TMPDIR={{tmp_dir}} mkdir -p ${TMPDIR} export TMPDIR=$(mktemp -d) -seqrun_path=${SEQRUNPATH} - -if [[ ${LANE} == "L001" ]]; then - lane=s_1 -elif [[ ${LANE} == "L002" ]]; then - lane=s_2 -elif [[ ${LANE} == "L003" ]]; then - lane=s_3 -elif [[ ${LANE} == "L004" ]]; then - lane=s_4 -elif [[ ${LANE} == "L005" ]]; then - lane=s_5 -elif [[ ${LANE} == "L006" ]]; then - lane=s_6 -elif [[ ${LANE} == "L007" ]]; then - lane=s_7 -elif [[ ${LANE} == "L008" ]]; then - lane=s_8 -else - echo "Unrecognized lane: ${LANE}" - exit 1 -fi - -# yes, hard coded, not great but progress. -extra="" -if [[ ! 
-z ${REFBASE} ]]; then - extra="-f ${REFBASE}" -fi -mkdir -p ${OUTPUT} +mkdir -p {{output}} module load {{modules_to_load}} {{sing_script_path}} \ - -i ${seqrun_path} \ - -o ${OUTPUT} \ - -s $(echo ${SAMPLES} | tr -d '"') \ - -g $(echo ${REFS} | tr -d '"') \ + -i {{rundir_path}} \ + -o {{output}} \ + -s $(echo {{samples}} | tr -d '"') \ + -g $(echo {{refs}} | tr -d '"') \ -j ${SLURM_JOB_CPUS_PER_NODE} \ - ${extra} \ - -l ${lane} + {{extra}} \ + -l {{lane}} -if [[ -d ${OUTPUT}/Full ]]; then +if [[ -d {{output}}/Full ]]; then echo "Run appears successful" -elif [[ -d ${OUTPUT}/1_demult/Full ]]; then +elif [[ -d {{output}}/1_demult/Full ]]; then echo "Run appears unsuccessful but has output" exit 1 else diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh deleted file mode 100755 index d6c61cb0..00000000 --- a/sequence_processing_pipeline/templates/tellread.sh +++ /dev/null @@ -1,262 +0,0 @@ -#!/bin/bash -samplesheet="{{tellread_map}}" # previously -i option -seqrunpath="{{seqrun_path}}" # previously -s option -lane="{{lane}}" # previously -l option -reference_map="{{reference_map}}" # previously -r option -reference_base="{{reference_base}}" # previously -b option -mode="{{mode}}" # previously -m option - -# preserve error-checking of parameters to preserve as much of the original -# script as possible, even though this could be done in python. - -# https://unix.stackexchange.com/a/621007 -: ${seqrunpath:?Missing -s} -: ${lane:?Missing -i} - -if [[ ! -z ${reference_map} || ! -z ${reference_base} ]]; then - if [[ -z ${reference_map} ]]; then - echo "-b used without -r" - exit 1 - fi - if [[ -z ${reference_base} ]]; then - echo "-r used without -b" - exit 1 - fi - if [[ ! -d ${reference_base} ]]; then - echo "reference base not found" - exit 1 - fi - - tag=reference-based -else - tag=reference-free -fi - -# trim trailing slash -# https://stackoverflow.com/a/32845647/19741 -safepath=$(echo ${seqrunpath} | sed 's:/*$::') -label=$(basename ${safepath}) -labeltag=${label}-${tag} -output={{output_path}} - -if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then - echo "Cannot access the lane" - exit 1 -fi - -# for now this can stay here to keep greater compatibility with the original script. -# however these fields should eventually be parameters that can be configured in the config file. - -if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then - sbatch_cores=2 - sbatch_mem=8G - norm=TRUE - wall=24:00:00 - mode=NA -elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then - sbatch_cores=2 - sbatch_mem=8G - norm=TRUE - wall=24:00:00 - mode=NA -else - sbatch_cores=16 - sbatch_mem=160G - norm=FALSE - assemble=TRUE - wall=48:00:00 -fi - -if [[ ${mode} == "isolate" ]]; then - ISOLATE_MODE=TRUE -elif [[ ${mode} == "metagenomic" ]]; then - ISOLATE_MODE=FALSE -elif [[ ${mode} == "NA" ]]; then - ISOLATE_MODE=FALSE -else - echo "unknown mode: ${mode}" - exit 1 -fi - -set -e -set -o pipefail - -declare -a s -declare -a g -# below extended regex might be broken because C5\d\d happens in column 0, not column 1 -# of the hacked sample-sheet. 
-# for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) - -# new sample-sheet is of form: -# Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Barcode_ID,Sample_Project,Well_description,Lane -# 10283.LS.4.4.2015,10283.LS.4.4.2015,Plate_1,A1,C501,LS_Timeseries_TellSeq_10283,10283.LS.4.4.2015,1 -for sample in $(egrep -o ",C5..," ${samplesheet} | tr -d "," | sort) -do - echo "sample found: ${sample}" - # get references if they exist - if [[ -f ${reference_map} ]]; then - if $(grep -Fq ${sample} ${reference_map}); then - ref=$(grep -m 1 ${sample} ${reference_map} | cut -f 2 -d"," | tr -d "\n") - if [[ ${ref} != "NONE" ]]; then - if [[ ! -d "${reference_base}/${ref}" ]]; then - echo "${reference_base}/${ref}" - echo "${ref} not found" - exit 1 - fi - g[${#g[@]}]=${ref} - s[${#s[@]}]=${sample} - fi - fi - else - g[${#g[@]}]=NONE - s[${#s[@]}]=${sample} - fi -done -n_samples=${#s[@]} - -# https://stackoverflow.com/a/17841619/19741 -function join_by { local IFS="$1"; shift; echo "$*"; } -s=$(join_by , "${s[@]}") -g=$(join_by , "${g[@]}") - -base=$(dirname ${0}) -submit_script=$(dirname ${0})/tellread.sbatch -integrate_script=$(dirname ${0})/integrate.sbatch -norm_script=$(dirname ${0})/compute_sequence_counts_for_normalization.sbatch -asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch -clean_script=$(dirname ${0})/tellread-cleanup.sbatch - -if [[ ${ISOLATE_MODE} == "TRUE" ]]; then - asm_tellink_script=$(dirname ${0})/telllink-isolate.sbatch - asm_cloudspades_script=$(dirname ${0})/cloudspades-isolate.sbatch -else - asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch - asm_tellink_script=$(dirname ${0})/telllink.sbatch -fi - -if [[ ! -f ${submit_script} ]]; then - echo "Cannot access submit script" - exit 1 -fi -if [[ ! -f ${asm_cloudspades_script} ]]; then - echo "Cannot access cloudspades assembly script" - exit 1 -fi -if [[ ! -f ${asm_tellink_script} ]]; then - echo "Cannot access tell-link assembly script" - exit 1 -fi -if [[ ! -f ${integrate_script} ]]; then - echo "Cannot access integrate script" - exit 1 -fi -if [[ ! -f ${clean_script} ]]; then - echo "Cannot access clean script" - exit 1 -fi - -datetag=$(date "+%Y.%m.%d") -scriptcopy=$(pwd)/tellread_script-${datetag}.sh -submitcopy=$(pwd)/tellread_submission-${datetag}.sbatch -asmcscopy=$(pwd)/assembly_submission_cloudspades-${datetag}.sbatch -asmtlcopy=$(pwd)/assembly_submission_tell-link-${datetag}.sbatch -normcopy=$(pwd)/norm_submission-${datetag}.sbatch -intcopy=$(pwd)/integrate_submission-${datetag}.sbatch -cleancopy=$(pwd)/tellread-cleanup-${datetag}.sbatch -arguments=$(pwd)/provided_script_arguments.txt -if [[ -f ${scriptcopy} ]]; then - echo "Existing script copy ${scriptcopy} found, not overwriting, delete to resubmit" - exit 1 -fi -if [[ -f ${submitcopy} ]]; then - echo "Existing submission ${submitcopy} found, not overwriting, delete to resubmit" - exit 1 -fi - -#TODO: Other possible arguments like -r? 
-echo "-l {{lane}} -s {{seqrun_path}} -i {{tellread_map}} -m {{mode}}" >${arguments} - -cp ${0} ${scriptcopy} -cp ${submit_script} ${submitcopy} -cp ${asm_cloudspades_script} ${asmcscopy} -cp ${asm_tellink_script} ${asmtlcopy} -cp ${integrate_script} ${intcopy} -cp ${clean_script} ${cleancopy} -chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cleancopy} - -set -x - -trjob=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag} \ - -c ${sbatch_cores} \ - --mem ${sbatch_mem} \ - --time ${wall} \ - --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ - ${submit_script}) - -echo "TRJOB_RETURN_CODE: $?" > {{output_path}}/pids -echo "TRJOB_PID: $trjob" >> {{output_path}}/pids - -if [[ ${norm} == "TRUE" ]]; then - cp ${norm_script} ${normcopy} - chmod gou-w ${normcopy} - norm_counts_job=$(sbatch \ - --parsable \ - --dependency=afterok:${trjob} \ - -J ${labeltag}-${datetag}-norm-counts \ - --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ - ${norm_script}) - echo "NORM_COUNTS_JOB_RETURN_CODE: $?" >> {{output_path}}/pids - echo "NORM_COUNTS_JOB_PID: $norm_counts_job" >> {{output_path}}/pids -fi - -integrate_job=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag}-integrate \ - --dependency=afterok:${trjob} \ - --array 1-${n_samples} \ - --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ - ${integrate_script}) - -echo "INTEGRATE_JOB_RETURN_CODE: $?" >> {{output_path}}/pids -echo "INTEGRATE_JOB_PID: $integrate_job" >> {{output_path}}/pids - -if [[ ${assemble} == "TRUE" ]]; then - csj=$(sbatch \ - --parsable \ - --dependency=aftercorr:${integrate_job} \ - -J ${labeltag}-${datetag}-cloudspades \ - --array 1-${n_samples} \ - --export LABELTAG=${labeltag},OUTPUT=${output} \ - ${asm_cloudspades_script}) - - echo "CSJ_JOB_RETURN_CODE: $?" >> {{output_path}}/pids - echo "CSJ_JOB_PID: $csj" >> {{output_path}}/pids - - tlj=$(sbatch \ - --parsable \ - --dependency=aftercorr:${integrate_job} \ - -J ${labeltag}-${datetag}-tell-link \ - --array 1-${n_samples} \ - --export LABELTAG=${labeltag},OUTPUT=${output} \ - ${asm_tellink_script}) - - echo "TLJ_JOB_RETURN_CODE: $?" >> {{output_path}}/pids - echo "TLJ_JOB_PID: $tlj" >> {{output_path}}/pids - - cleanupdep=${csj}:${tlj} -else - cleanupdep=${integrate_job} - echo "Not assembling" -fi - -cleanup=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag}-cleanup \ - --dependency=afterok:${cleanupdep} \ - --export OUTPUT=${output} \ - ${clean_script}) - -echo "CLEANUP_JOB_RETURN_CODE: $?" >> {{output_path}}/pids -echo "CLEANUP_JOB_PID: $cleanup" >> {{output_path}}/pids From 6818d440fbdc949fc829b8720525cf348d08d363 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Thu, 3 Oct 2024 18:37:41 -0700 Subject: [PATCH 16/47] Creation tests added for new TellReadJob() class. 
--- sequence_processing_pipeline/TellReadJob.py | 41 +-- .../templates/tellread.sbatch | 5 +- .../cloudspades-isolate.sbatch | 84 ------- .../data/tellread_output/cloudspades.sbatch | 81 ------ .../data/tellread_output/integrate.sbatch | 125 ---------- .../tellread_output/telllink-isolate.sbatch | 62 ----- .../data/tellread_output/telllink.sbatch | 64 ----- .../tellread_output/tellread-cleanup.sbatch | 23 -- .../data/tellread_output/tellread.sbatch | 108 -------- .../tests/data/tellread_output/tellread.sh | 236 ------------------ .../tests/test_TellReadJob.py | 99 ++++++++ 11 files changed, 123 insertions(+), 805 deletions(-) delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread.sh create mode 100644 sequence_processing_pipeline/tests/test_TellReadJob.py diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 859974a4..2f7905d5 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -6,6 +6,7 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet +from datetime import datetime logging.basicConfig(level=logging.DEBUG) @@ -14,9 +15,9 @@ class TellReadJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, tmp1_path, sing_script_path, - lane, cores_per_task=4): + qiita_job_id, label, reference_base, + reference_map, tmp1_path, sing_script_path, lane, + cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. @@ -29,8 +30,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param jmem: String representing total memory limit for entire job. 
:param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id - :param max_array_length: None - :param indicies_script_path: None :param label: None :param reference_base: None :param reference_map: None @@ -38,9 +37,9 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, """ super().__init__(run_dir, output_path, - 'TRIntegrateJob', + 'TellReadJob', [], - max_array_length, + 1, modules_to_load=modules_to_load) self.sample_sheet_path = sample_sheet_path @@ -51,7 +50,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.node_count = node_count self.wall_time_limit = wall_time_limit self.cores_per_task = cores_per_task - self.indicies_script_path = indicies_script_path self.reference_base = reference_base self.reference_map = reference_map @@ -59,9 +57,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, # raise an Error if jmem is not a valid floating point value. self.jmem = str(int(jmem)) self.qiita_job_id = qiita_job_id - self.sample_count = len(self.sample_ids) self.jinja_env = Environment(loader=KISSLoader('templates')) - self.label = label self.sing_script_path = sing_script_path self.tmp1_path = tmp1_path @@ -71,12 +67,14 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, raise ValueError(f"'{tmp}' is not a valid lane number") self.lane_number = tmp - if self.reference_base != None or self.reference_map != None: + # TODO: Need examples of these being not None + if self.reference_base is not None or self.reference_map is not None: tag = 'reference-based' else: tag = 'reference-free' - self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + date = datetime.today().strftime('%Y.%m.%d') + self.job_name = (f"{label}-{tag}-{date}-tellread") def run(self, callback=None): job_script_path = self._generate_job_script() @@ -96,9 +94,12 @@ def run(self, callback=None): except JobFailedError as e: # When a job has failed, parse the logs generated by this specific # job to return a more descriptive message to the user. - info = self.parse_logs() + # TODO: We need more examples of failed jobs before we can create + # a parser for the logs. + # info = self.parse_logs() # prepend just the message component of the Error. - info.insert(0, str(e)) + # info.insert(0, str(e)) + info = str(e) raise JobFailedError('\n'.join(info)) logging.debug(f'TellReadJob {self.job_info["job_id"]} completed') @@ -134,8 +135,8 @@ def _process_sample_sheet(self): 'sample_ids': sample_ids} def _generate_job_script(self): - job_script_path = join(self.output_path, 'integrate.sbatch') - template = self.jinja_env.get_template("tellread2.sbatch") + job_script_path = join(self.output_path, 'tellread_test.sbatch') + template = self.jinja_env.get_template("tellread.sbatch") # generate a comma separated list of sample-ids from the tuples stored # in self.sample_ids. @@ -154,7 +155,7 @@ def _generate_job_script(self): extra = "" - # if reference_base is added in the future and is defined, exta needs + # if reference_base is added in the future and is defined, extra needs # to be f"-f {reference_base}". 
# extra = "-f ${REFBASE}" @@ -170,8 +171,7 @@ def _generate_job_script(self): "tmp_dir": self.tmp1_path, "modules_to_load": ' '.join(self.modules_to_load), "lane": f"s_{self.lane_number}", - # TODO: make sure this is the 'ConvertJob/output' directory - "output": self.output_path, + "output": join(self.output_path, "output"), "rundir_path": self.root_dir, "samples": samples, "refs": refs, @@ -179,3 +179,6 @@ def _generate_job_script(self): })) return job_script_path + + def parse_logs(self): + raise PipelineError("parse_logs() not implemented for TellReadJob") diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index da439836..7d044bb7 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -16,15 +16,14 @@ mkdir -p ${TMPDIR} export TMPDIR=$(mktemp -d) mkdir -p {{output}} - + module load {{modules_to_load}} {{sing_script_path}} \ -i {{rundir_path}} \ -o {{output}} \ -s $(echo {{samples}} | tr -d '"') \ -g $(echo {{refs}} | tr -d '"') \ - -j ${SLURM_JOB_CPUS_PER_NODE} \ - {{extra}} \ + -j ${SLURM_JOB_CPUS_PER_NODE} {{extra}} \ -l {{lane}} if [[ -d {{output}}/Full ]]; then diff --git a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch deleted file mode 100644 index 7ec58058..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash -l -#SBATCH -J cs-assemble # cs-assemble -#SBATCH --time 24:00:00 # 24:00:00 -#SBATCH --mem 64G # 64G -#SBATCH -N 1 # 1 -#SBATCH -c 12 # 12 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. -source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -set -x -set -e - -# this gets set in the environment from another script. For now let's -# run with that. -echo $TMPDIR - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=${OUTPUT} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate activate qiime2-2023.5 - -module load gcc_9.3.0 # gcc_9.3.0 - -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) - -# assumes 1-based array index, eg --array 1-N -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -cs=${base}/cloudspades-isolate/${sample} - -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${cs} ]]; then - rm -fr ${cs} - fi -fi - -mkdir -p ${cs} - -pushd ~/spades-cloudspades-paper/assembler/ -./spades.py \ - -o ${cs} \ - --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ - -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 -module unload gcc_9.3.0 -popd - -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. 
-mamba activate quast - -quast \ - -o ${cs}/quast-scaffolds \ - -t ${SLURM_JOB_CPUS_PER_NODE} \ - ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 - -# remove intermediates that currently dont have a downstream use -if [[ -d ${cs}/K21 ]]; then - rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch deleted file mode 100644 index d16dc2b0..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -l -#SBATCH -J cs-assemble # cs-assemble -#SBATCH --time 24:00:00 # 24:00:00 -#SBATCH --mem 128G # 128G -#SBATCH -N 1 # 1 -#SBATCH -c 12 # 12 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. -source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -set -x -set -e - -echo $TMPDIR - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=${OUTPUT} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate activate qiime2-2023.5 - -module load gcc_9.3.0 # gcc_9.3.0 - -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) - -# assumes 1-based array index, eg --array 1-N -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -cs=${base}/cloudspades/${sample} - -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${cs} ]]; then - rm -fr ${cs} - fi -fi - -mkdir -p ${cs} -pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin - -# for now don't use spades.py jinja2 variable -./spades.py \ - -o ${cs} \ - --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ - --meta \ - -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 -module unload gcc_9.3.0 -popd - -mamba activate quast -quast \ - -o ${cs}/quast-scaffolds \ - -t ${SLURM_JOB_CPUS_PER_NODE} \ - ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 - -# remove intermediates that currently dont have a downstream use -if [[ -d ${cs}/K21 ]]; then - rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch deleted file mode 100644 index 6947c226..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch +++ /dev/null @@ -1,125 +0,0 @@ -#!/bin/bash -l -#SBATCH -J integrate # integrate -#SBATCH --time 24:00:00 # 24:00:00 -#SBATCH --mem 8G # 8G -#SBATCH -N 1 # 1 -#SBATCH -c 1 # 1 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. 
-###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. -source activate rust -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - - -# https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html -cores=${SLURM_CPUS_PER_TASK} - -if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then - echo "Not operating in an array" - exit 1 -fi - -if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then - echo "Line extraction assumes 1-based index" - exit 1 -fi - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -if [[ -z ${BASE} ]]; then - echo "BASE not specified" - exit 1 -fi - -tellread=${OUTPUT} -if [[ ! -d ${tellread} ]]; then - echo "${tellread} not found" - exit 1 -fi - -set -x -set -e -set -o pipefail - -samples=($(cat ${tellread}/sample_index_list_${LABELTAG}.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -export TMPDIR=$(mktemp -d) -function cleanup { - echo "Removing $TMPDIR" - rm -r $TMPDIR - unset TMPDIR -} -trap cleanup EXIT - -files=${TMPDIR}/integration.files -/bin/ls -1 ${tellread}/Full/*corrected.err_barcode_removed.fastq > ${files} -mkdir -p ${tellread}/integrated - -if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R1" - exit 1 -fi - -if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R2" - exit 1 -fi - -if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} I1" - exit 1 -fi - -r1=$(grep -m 1 "_R1_${sample}" ${files}) -r2=$(grep -m 1 "_R2_${sample}" ${files}) -i1=$(grep -m 1 "_I1_${sample}" ${files}) -r1out=${tellread}/integrated/${sample}.R1.fastq.gz -r2out=${tellread}/integrated/${sample}.R2.fastq.gz -i1out=${tellread}/integrated/${sample}.I1.fastq.gz - -if [[ ! -s ${r1} ]]; then - echo "${r1} is empty, cannot integrate" - if [[ -s ${r2} ]]; then - echo "R1 and R2 are inconsistent" - exit 1 - fi - if [[ -s ${i1} ]]; then - echo "R1 and I1 are inconsistent" - exit 1 - fi - - # reflect the empties so Qiita can know of them - touch ${r1out} - touch ${r2out} - touch ${i1out} - exit 0 -fi - -# this can probably be backgrounded but then you have to get creative to -# not mask a nonzero exit status (e.g., the python process raising) -cat ${i1} | gzip > ${i1out} - -mamba activate tellread-integrate -python ${BASE}/integrate-indices-np.py integrate \ - --no-sort \ - --r1-in ${r1} \ - --r2-in ${r2} \ - --i1-in ${i1} \ - --r1-out ${r1out} \ - --r2-out ${r2out} \ - --threads ${cores} \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch deleted file mode 100644 index 6a23331e..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash -l -#SBATCH -J tellink-isolate # tellink-isolate -#SBATCH -N 1 # 1 -#SBATCH -c 16 # 16 -#SBATCH --mem 160G # 160G -#SBATCH --time 96:00:00 # 96:00:00 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. 
-###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -set -x -set -e - -module load singularity_3.6.4 # singularity_3.6.4 - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=/panfs/qiita/TELLREAD/${LABELTAG} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -k=79 -lc=35 -cores=${SLURM_CPUS_PER_TASK} - -tl=${base}/tell-link-isolate/${sample} -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${tl} ]]; then - rm -fr ${tl} - fi -fi - -mkdir -p ${tl} - -/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh \ - -r1 ${base}/integrated/${sample}.R1.fastq.gz \ - -r2 ${base}/integrated/${sample}.R2.fastq.gz \ - -i1 ${base}/integrated/${sample}.I1.fastq.gz \ - -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ - -k ${k} \ - -lc ${lc} \ - -p ${sample} \ - -j ${cores} - -# remove temporary data -if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping -fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch deleted file mode 100644 index b6033b24..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -l -#SBATCH -J tellink # tellink -#SBATCH --mem 160G # 160G -#SBATCH -N 1 # 1 -#SBATCH -c 16 # 16 -#SBATCH --time 96:00:00 # 96:00:00 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -set -x -set -e - -module load singularity_3.6.4 # singularity_3.6.4 - -if [[ -z "${LABELTAG}" ]]; then - echo "LABEL is not specified" - exit 1 -fi - -base=/panfs/${USER}/${LABELTAG} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -# leave these hardcoded for now -k=79 -lc=35 -cores=${SLURM_CPUS_PER_TASK} - -tl=${base}/tell-link/${sample} -if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${tl} ]]; then - rm -fr ${tl} - fi -fi - -mkdir -p ${tl} - -/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh \ - -r1 ${base}/integrated/${sample}.R1.fastq.gz \ - -r2 ${base}/integrated/${sample}.R2.fastq.gz \ - -i1 ${base}/integrated/${sample}.I1.fastq.gz \ - -d metagenomics \ - -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ - -k ${k} \ - -lc ${lc} \ - -p ${sample} \ - -j ${cores} - -# remove temporary data -if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping -fi diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch deleted file mode 100644 index 56bc3360..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -l -#SBATCH -J cleanup # cleanup -#SBATCH --time 24:00:00 # 24:00:00 -#SBATCH --mem 8G # 8G -#SBATCH -N 1 # 1 -#SBATCH -c 1 # 1 -#SBATCH -p qiita # qiita - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=BEGIN,FAIL - -# for now these can be left hard-coded. -#SBATCH --output %x-%A.out -#SBATCH --error %x-%A.err - -if [[ -z "${OUTPUT}" ]]; then - echo "OUTPUT is not specified" - exit 1 -fi - -# remove unused large outputs -rm -rf ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch deleted file mode 100644 index ab0647f8..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash -l -#SBATCH -J tellread # tellread -#SBATCH -p qiita # qiita -#SBATCH -N 1 # 1 -#SBATCH -c 4 # 4 -#SBATCH --mem 16G # 16G -#SBATCH --time 96:00:00 # 96:00:00 - -# for now these can be left hard-coded. -#SBATCH --partition=short -#SBATCH --output %x-%A.out -#SBATCH --error %x-%A.err - -# for now comment these out as qiita is responsible for notifying users. 
-###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=BEGIN,FAIL - -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -set -x - -if [[ -z "${N_SAMPLES}" ]]; then - echo "N_SAMPLES is not specified" - exit 1 -fi - -if [[ -z "${SEQRUNPATH}" ]]; then - echo "SEQRUNPATH is not specified" - exit 1 -fi - -if [[ -z "${LANE}" ]]; then - echo "LANE is not specified" - exit 1 -fi - -if [[ -z "${SAMPLES}" ]]; then - echo "SAMPLES is not specified" - exit 1 -fi - -if [[ -z "${REFS}" ]]; then - echo "REFS is not specified" - exit 1 -fi - -if [[ -z "${OUTPUT}" ]]; then - echo "OUTPUT is not specified" - exit 1 -fi - -export TMPDIR="/panfs/${USER}/tmp" -mkdir -p ${TMPDIR} -export TMPDIR=$(mktemp -d) -seqrun_path=${SEQRUNPATH} - -if [[ ${LANE} == "L001" ]]; then - lane=s_1 -elif [[ ${LANE} == "L002" ]]; then - lane=s_2 -elif [[ ${LANE} == "L003" ]]; then - lane=s_3 -elif [[ ${LANE} == "L004" ]]; then - lane=s_4 -elif [[ ${LANE} == "L005" ]]; then - lane=s_5 -elif [[ ${LANE} == "L006" ]]; then - lane=s_6 -elif [[ ${LANE} == "L007" ]]; then - lane=s_7 -elif [[ ${LANE} == "L008" ]]; then - lane=s_8 -else - echo "Unrecognized lane: ${LANE}" - exit 1 -fi - -# yes, hard coded, not great but progress. -extra="" -if [[ ! -z ${REFBASE} ]]; then - extra="-f ${REFBASE}" -fi - -mkdir -p ${OUTPUT} - -module load singularity_3.6.4 # singularity_3.6.4 -$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ - -i ${seqrun_path} \ - -o ${OUTPUT} \ - -s $(echo ${SAMPLES} | tr -d '"') \ - -g $(echo ${REFS} | tr -d '"') \ - -j ${SLURM_JOB_CPUS_PER_NODE} \ - ${extra} \ - -l ${lane} - - -if [[ -d ${OUTPUT}/Full ]]; then - echo "Run appears successful" -elif [[ -d ${OUTPUT}/1_demult/Full ]]; then - echo "Run appears unsuccessful but has output" - exit 1 -else - echo "Run appears unsuccessful" - exit 1 -fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh deleted file mode 100644 index 90b4e1ce..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh +++ /dev/null @@ -1,236 +0,0 @@ -#!/bin/bash -samplesheet="/home/qiita_test/qiita-spots/tellread_mapping.csv" # previously -i option -seqrunpath="/sequencing/igm_runs/240216_LH00444_0058_A22357VLT4" # previously -s option -lane="L008" # previously -l option -reference_map="" # previously -r option -reference_base="" # previously -b option -mode="metagenomic" $ # previously -m option - -# preserve error-checking of parameters to preserve as much of the original -# script as possible, even though this could be done in python. - -# https://unix.stackexchange.com/a/621007 -: ${seqrunpath:?Missing -s} -: ${lane:?Missing -i} - -if [[ ! -z ${reference_map} || ! -z ${reference_base} ]]; then - if [[ -z ${reference_map} ]]; then - echo "-b used without -r" - exit 1 - fi - if [[ -z ${reference_base} ]]; then - echo "-r used without -b" - exit 1 - fi - if [[ ! -d ${reference_base} ]]; then - echo "reference base not found" - exit 1 - fi - - tag=reference-based -else - tag=reference-free -fi - -# trim trailing slash -# https://stackoverflow.com/a/32845647/19741 -safepath=$(echo ${seqrunpath} | sed 's:/*$::') -label=$(basename ${safepath}) -labeltag=${label}-${tag} -output=/panfs/${USER}/${labeltag} - -if [[ ! 
-d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then - echo "Cannot access the lane" - exit 1 -fi - -# for now this can stay here to keep greater compatibility with the original script. -# however these fields should eventually be parameters that can be configured in the config file. - -if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then - sbatch_cores=2 - sbatch_mem=8G - norm=TRUE - wall=24:00:00 - mode=NA -elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then - sbatch_cores=2 - sbatch_mem=8G - norm=TRUE - wall=24:00:00 - mode=NA -else - sbatch_cores=16 - sbatch_mem=160G - norm=FALSE - assemble=TRUE - wall=48:00:00 -fi - -if [[ ${mode} == "isolate" ]]; then - ISOLATE_MODE=TRUE -elif [[ ${mode} == "metagenomic" ]]; then - ISOLATE_MODE=FALSE -elif [[ ${mode} == "NA" ]]; then - ISOLATE_MODE=FALSE -else - echo "unknown mode: ${mode}" - exit 1 -fi - -set -e -set -o pipefail - -declare -a s -declare -a g -# below extended regex might be broken because C5\d\d happens in column 0, not column 1 -# of the hacked sample-sheet. -for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) -do - echo "sample found: ${sample}" - # get references if they exist - if [[ -f ${reference_map} ]]; then - if $(grep -Fq ${sample} ${reference_map}); then - ref=$(grep -m 1 ${sample} ${reference_map} | cut -f 2 -d"," | tr -d "\n") - if [[ ${ref} != "NONE" ]]; then - if [[ ! -d "${reference_base}/${ref}" ]]; then - echo "${reference_base}/${ref}" - echo "${ref} not found" - exit 1 - fi - g[${#g[@]}]=${ref} - s[${#s[@]}]=${sample} - fi - fi - else - g[${#g[@]}]=NONE - s[${#s[@]}]=${sample} - fi -done -n_samples=${#s[@]} - -# https://stackoverflow.com/a/17841619/19741 -function join_by { local IFS="$1"; shift; echo "$*"; } -s=$(join_by , "${s[@]}") -g=$(join_by , "${g[@]}") - -base=$(dirname ${0}) -submit_script=$(dirname ${0})/tellread.sbatch -integrate_script=$(dirname ${0})/integrate.sbatch -norm_script=$(dirname ${0})/compute_sequence_counts_for_normalization.sbatch -asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch -clean_script=$(dirname ${0})/tellread-cleanup.sbatch - -if [[ ${ISOLATE_MODE} == "TRUE" ]]; then - asm_tellink_script=$(dirname ${0})/telllink-isolate.sbatch - asm_cloudspades_script=$(dirname ${0})/cloudspades-isolate.sbatch -else - asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch - asm_tellink_script=$(dirname ${0})/telllink.sbatch -fi - -if [[ ! -f ${submit_script} ]]; then - echo "Cannot access submit script" - exit 1 -fi -if [[ ! -f ${asm_cloudspades_script} ]]; then - echo "Cannot access cloudspades assembly script" - exit 1 -fi -if [[ ! -f ${asm_tellink_script} ]]; then - echo "Cannot access tell-link assembly script" - exit 1 -fi -if [[ ! -f ${integrate_script} ]]; then - echo "Cannot access integrate script" - exit 1 -fi -if [[ ! 
-f ${clean_script} ]]; then - echo "Cannot access clean script" - exit 1 -fi - -datetag=$(date "+%Y.%m.%d") -scriptcopy=$(pwd)/tellread_script-${datetag}.sh -submitcopy=$(pwd)/tellread_submission-${datetag}.sbatch -asmcscopy=$(pwd)/assembly_submission_cloudspades-${datetag}.sbatch -asmtlcopy=$(pwd)/assembly_submission_tell-link-${datetag}.sbatch -normcopy=$(pwd)/norm_submission-${datetag}.sbatch -intcopy=$(pwd)/integrate_submission-${datetag}.sbatch -cleancopy=$(pwd)/tellread-cleanup-${datetag}.sbatch -arguments=$(pwd)/provided_script_arguments.txt -if [[ -f ${scriptcopy} ]]; then - echo "Existing script copy ${scriptcopy} found, not overwriting, delete to resubmit" - exit 1 -fi -if [[ -f ${submitcopy} ]]; then - echo "Existing submission ${submitcopy} found, not overwriting, delete to resubmit" - exit 1 -fi - -echo $@ > ${arguments} -cp ${0} ${scriptcopy} -cp ${submit_script} ${submitcopy} -cp ${asm_cloudspades_script} ${asmcscopy} -cp ${asm_tellink_script} ${asmtlcopy} -cp ${integrate_script} ${intcopy} -cp ${clean_script} ${cleancopy} -chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cleancopy} - -set -x - -trjob=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag} \ - -c ${sbatch_cores} \ - --mem ${sbatch_mem} \ - --time ${wall} \ - --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ - ${submit_script}) - -if [[ ${norm} == "TRUE" ]]; then - cp ${norm_script} ${normcopy} - chmod gou-w ${normcopy} - norm_counts_job=$(sbatch \ - --parsable \ - --dependency=afterok:${trjob} \ - -J ${labeltag}-${datetag}-norm-counts \ - --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ - ${norm_script}) -fi - -integrate_job=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag}-integrate \ - --dependency=afterok:${trjob} \ - --array 1-${n_samples} \ - --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ - ${integrate_script}) - -if [[ ${assemble} == "TRUE" ]]; then - csj=$(sbatch \ - --parsable \ - --dependency=aftercorr:${integrate_job} \ - -J ${labeltag}-${datetag}-cloudspades \ - --array 1-${n_samples} \ - --export LABELTAG=${labeltag},OUTPUT=${output} \ - ${asm_cloudspades_script}) - tlj=$(sbatch \ - --parsable \ - --dependency=aftercorr:${integrate_job} \ - -J ${labeltag}-${datetag}-tell-link \ - --array 1-${n_samples} \ - --export LABELTAG=${labeltag},OUTPUT=${output} \ - ${asm_tellink_script}) - cleanupdep=${csj}:${tlj} -else - cleanupdep=${integrate_job} - echo "Not assembling" -fi - -cleanup=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag}-cleanup \ - --dependency=afterok:${cleanupdep} \ - --export OUTPUT=${output} \ - ${clean_script}) \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py new file mode 100644 index 00000000..b9659267 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -0,0 +1,99 @@ +from os.path import join, abspath +from sequence_processing_pipeline.TellReadJob import TellReadJob +from functools import partial +import unittest + + +class TestTellReadJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. 
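+        # self.obs is the sbatch script the job under test generates at run
+        # time, while self.exp is the pre-generated copy kept under
+        # tests/data/tellread_output for line-by-line comparison.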
+ self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TellReadJob', 'tellread_test.sbatch') + self.exp = self.path('data', 'tellread_output', 'tellread_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + # TODO: Revisit w/a new directory named as expected for a + # TellSeq-produced run-directory. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + # TODO: Revisit w/a proper sample-sheet once spec is near finalized. + self.sample_sheet_path = self.path('data', 'good-sample-sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TellReadJob", "output", + "tmp1") + # reflects location of script on host. + self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + + def test_creation(self): + # confirm only sensible lane numbers are allowed. + with self.assertRaisesRegex(ValueError, + "'-1' is not a valid lane number"): + TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, + self.queue_name, self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.label, self.reference_base, self.reference_map, + self.tmp1_path, self.sing_script_path, -1, + self.cores_per_task) + + with self.assertRaisesRegex(ValueError, + "'0' is not a valid lane number"): + TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, + self.queue_name, self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.label, self.reference_base, self.reference_map, + self.tmp1_path, self.sing_script_path, 0, + self.cores_per_task) + + with self.assertRaisesRegex(ValueError, + "'9' is not a valid lane number"): + TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, + self.queue_name, self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.label, self.reference_base, self.reference_map, + self.tmp1_path, self.sing_script_path, 9, + self.cores_per_task) + + # test basic good-path + job = TellReadJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.label, self.reference_base, self.reference_map, + self.tmp1_path, self.sing_script_path, self.lane, + self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + print("OBS: %s" % obs_line) + print("EXP: %s" % exp_line) + print("") + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() From baf35ea4fd51b71cb4e09d881680da09cdf8afc6 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 6 Oct 2024 15:43:33 -0700 Subject: [PATCH 17/47] flake8 --- sequence_processing_pipeline/TRIntegrateJob.py | 10 +++++----- sequence_processing_pipeline/TRNormCountsJob.py | 8 +++++--- 
sequence_processing_pipeline/tests/test_TellReadJob.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 076a15fe..25cec68a 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -15,7 +15,7 @@ class TRIntegrateJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, cores_per_task=4): + reference_base, reference_map, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. @@ -33,7 +33,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param label: None :param reference_base: None :param reference_map: None - :param cores_per_task: (Optional) # of CPU cores per node to request. + :param cores_per_task: # of CPU cores per node to request. """ super().__init__(run_dir, output_path, @@ -62,7 +62,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.jinja_env = Environment(loader=KISSLoader('templates')) self.label = label - if self.reference_base != None or self.reference_map != None: + if self.reference_base is not None or self.reference_map is not None: tag = 'reference-based' else: tag = 'reference-free' @@ -122,8 +122,8 @@ def _process_sample_sheet(self): 'sample_ids': sample_ids} def _generate_job_script(self): - job_script_path = join(self.output_path, 'integrate.sbatch') - template = self.jinja_env.get_template("integrate2.sbatch") + job_script_path = join(self.output_path, 'integrate_test.sbatch') + template = self.jinja_env.get_template("integrate.sbatch") with open(job_script_path, mode="w", encoding="utf-8") as f: f.write(template.render({ diff --git a/sequence_processing_pipeline/TRNormCountsJob.py b/sequence_processing_pipeline/TRNormCountsJob.py index 09e36a67..a3603bcd 100644 --- a/sequence_processing_pipeline/TRNormCountsJob.py +++ b/sequence_processing_pipeline/TRNormCountsJob.py @@ -62,7 +62,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.jinja_env = Environment(loader=KISSLoader('templates')) self.label = label - if self.reference_base != None or self.reference_map != None: + if self.reference_base is not None or self.reference_map is not None: tag = 'reference-based' else: tag = 'reference-free' @@ -122,8 +122,10 @@ def _process_sample_sheet(self): 'sample_ids': sample_ids} def _generate_job_script(self): - job_script_path = join(self.output_path, 'compute_sequence_counts_for_normalization.sbatch') - template = self.jinja_env.get_template("compute_sequence_counts_for_normalization2.sbatch") + job_script_path = join(self.output_path, "compute_sequence_counts_for" + "_normalization.sbatch") + template = self.jinja_env.get_template("compute_sequence_counts_for_" + "normalization2.sbatch") with open(job_script_path, mode="w", encoding="utf-8") as f: f.write(template.render({ diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py index b9659267..6cc12632 100644 --- a/sequence_processing_pipeline/tests/test_TellReadJob.py +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -1,4 +1,4 @@ -from os.path import join, abspath +from os.path import join from 
sequence_processing_pipeline.TellReadJob import TellReadJob from functools import partial import unittest From 56fc5be5d598de3ba44c2930ca83ed851552e7c4 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 6 Oct 2024 18:39:33 -0700 Subject: [PATCH 18/47] New sample files added --- .../tellread_output/integrate_test.sbatch | 96 +++++++++++++++++++ .../data/tellread_output/tellread_test.sbatch | 37 +++++++ 2 files changed, 133 insertions(+) create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch diff --git a/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch new file mode 100644 index 00000000..3cdc891f --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch @@ -0,0 +1,96 @@ +#!/bin/bash -l +#SBATCH -J integrate # integrate +#SBATCH --time 96:00:00 # 24:00:00 +#SBATCH --mem 16G # 8G +#SBATCH -N 1 # 1 +#SBATCH -c 4 # 1 +#SBATCH -p qiita # qiita + +#SBATCH --output integrate_%x-%A_%a.out +#SBATCH --error integrate_%x-%A_%a.err + +# NB SLURM_ARRAY_TASK_ID is exported by Slurm +if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then + echo "Not operating in an array" + exit 1 +fi + +# NB SLURM_ARRAY_TASK_MIN is exported by Slurm +if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then + echo "Line extraction assumes 1-based index" + exit 1 +fi + +set -x +set -e +set -o pipefail + +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list_output.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +# NB TMPDIR IS CREATED IN CURRENT DIRECTORY. CURRENT DIRECTORY MUST BE CORRECT. +export TMPDIR=$(mktemp -d) +function cleanup { + echo "Removing $TMPDIR" + rm -r $TMPDIR + unset TMPDIR +} +trap cleanup EXIT + +files=${TMPDIR}/integration.files +/bin/ls -1 sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/Full/*corrected.err_barcode_removed.fastq > ${files} +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated + +if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R1" + exit 1 +fi + +if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R2" + exit 1 +fi + +if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} I1" + exit 1 +fi + +r1=$(grep -m 1 "_R1_${sample}" ${files}) +r2=$(grep -m 1 "_R2_${sample}" ${files}) +i1=$(grep -m 1 "_I1_${sample}" ${files}) +r1out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R1.fastq.gz +r2out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R2.fastq.gz +i1out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.I1.fastq.gz + +if [[ ! 
-s ${r1} ]]; then + echo "${r1} is empty, cannot integrate" + if [[ -s ${r2} ]]; then + echo "R1 and R2 are inconsistent" + exit 1 + fi + if [[ -s ${i1} ]]; then + echo "R1 and I1 are inconsistent" + exit 1 + fi + + # reflect the empties so Qiita can know of them + touch ${r1out} + touch ${r2out} + touch ${i1out} + exit 0 +fi + +# this can probably be backgrounded but then you have to get creative to +# not mask a nonzero exit status (e.g., the python process raising) +cat ${i1} | gzip > ${i1out} + +conda activate qp-knight-lab-processing-2022.03 +python hello integrate \ + --no-sort \ + --r1-in ${r1} \ + --r2-in ${r2} \ + --i1-in ${i1} \ + --r1-out ${r1out} \ + --r2-out ${r2out} \ + --threads ${SLURM_CPUS_PER_TASK} \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch new file mode 100644 index 00000000..a008937b --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch @@ -0,0 +1,37 @@ +#!/bin/bash -l +#SBATCH -J tellread +#SBATCH -p qiita +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH --mem 16G +#SBATCH --time 96:00:00 + +#SBATCH --output tellread_%x-%A.out +#SBATCH --error tellread_%x-%A.err + +set -x + +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/tmp1 +mkdir -p ${TMPDIR} +export TMPDIR=$(mktemp -d) + +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output + +module load singularity_3.6.4 +$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ + -i sequence_processing_pipeline/tests/data/sample_run_directories/150629_SN1001_0511_AH5L7GBCXX \ + -o sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output \ + -s $(echo 
CDPH-SAL__Salmonella__Typhi__MDL-143,CDPH-SAL_Salmonella_Typhi_MDL-144,CDPH-SAL_Salmonella_Typhi_MDL-145,CDPH-SAL_Salmonella_Typhi_MDL-146,CDPH-SAL_Salmonella_Typhi_MDL-147,CDPH-SAL_Salmonella_Typhi_MDL-148,CDPH-SAL_Salmonella_Typhi_MDL-149,CDPH-SAL_Salmonella_Typhi_MDL-150,CDPH-SAL_Salmonella_Typhi_MDL-151,CDPH-SAL_Salmonella_Typhi_MDL-152,CDPH-SAL_Salmonella_Typhi_MDL-153,CDPH-SAL_Salmonella_Typhi_MDL-154,CDPH-SAL_Salmonella_Typhi_MDL-155,CDPH-SAL_Salmonella_Typhi_MDL-156,CDPH-SAL_Salmonella_Typhi_MDL-157,CDPH-SAL_Salmonella_Typhi_MDL-158,CDPH-SAL_Salmonella_Typhi_MDL-159,CDPH-SAL_Salmonella_Typhi_MDL-160,CDPH-SAL_Salmonella_Typhi_MDL-161,CDPH-SAL_Salmonella_Typhi_MDL-162,CDPH-SAL_Salmonella_Typhi_MDL-163,CDPH-SAL_Salmonella_Typhi_MDL-164,CDPH-SAL_Salmonella_Typhi_MDL-165,CDPH-SAL_Salmonella_Typhi_MDL-166,CDPH-SAL_Salmonella_Typhi_MDL-167,CDPH-SAL_Salmonella_Typhi_MDL-168,P21_E_coli_ELI344,P21_E_coli_ELI345,P21_E_coli_ELI347,P21_E_coli_ELI348,P21_E_coli_ELI349,P21_E_coli_ELI350,P21_E_coli_ELI351,P21_E_coli_ELI352,P21_E_coli_ELI353,P21_E_coli_ELI354,P21_E_coli_ELI355,P21_E_coli_ELI357,P21_E_coli_ELI358,P21_E_coli_ELI359,P21_E_coli_ELI361,P21_E_coli_ELI362,P21_E_coli_ELI363,P21_E_coli_ELI364,P21_E_coli_ELI365,P21_E_coli_ELI366,P21_E_coli_ELI367,P21_E_coli_ELI368,P21_E_coli_ELI369,stALE_E_coli_A1_F21_I1_R1,stALE_E_coli_A2_F21_I1_R1,stALE_E_coli_A3_F18_I1_R1,stALE_E_coli_A3_F40_I1_R1,stALE_E_coli_A4_F21_I1_R1,stALE_E_coli_A4_F21_I1_R2,stALE_E_coli_A4_F42_I1_R1,stALE_E_coli_A5_F21_I1_R1,stALE_E_coli_A5_F42_I1_R1,stALE_E_coli_A6_F21_I1_R1,stALE_E_coli_A6_F43_I1_R1,stALE_E_coli_A7_F21_I1_R1,stALE_E_coli_A7_F42_I1_R1,stALE_E_coli_A8_F20_I1_R1,stALE_E_coli_A8_F42_I1_R1,stALE_E_coli_A9_F21_I1_R1,stALE_E_coli_A9_F44_I1_R1,stALE_E_coli_A10_F21_I1_R1,stALE_E_coli_A10_F43_I1_R1,stALE_E_coli_A10_F131_I1_R1,stALE_E_coli_A11_F21_I1_R1,stALE_E_coli_A11_F43_I1_R1,stALE_E_coli_A11_F119_I1_R1,stALE_E_coli_A12_F21_I1_R1,stALE_E_coli_A12_F43_I1_R1,stALE_E_coli_A12_F136_I1_R1,stALE_E_coli_A13_F20_I1_R1,stALE_E_coli_A13_F42_I1_R1,stALE_E_coli_A13_F121_I1_R1,stALE_E_coli_A14_F20_I1_R1,stALE_E_coli_A14_F42_I1_R1,stALE_E_coli_A14_F133_I1_R1,stALE_E_coli_A15_F21_I1_R1,stALE_E_coli_A15_F42_I1_R1,stALE_E_coli_A15_F117_I1_R1,stALE_E_coli_A16_F20_I1_R1,stALE_E_coli_A16_F42_I1_R1,stALE_E_coli_A16_F134_I1_R1,stALE_E_coli_A17_F21_I1_R1,stALE_E_coli_A17_F118_I1_R1,stALE_E_coli_A18_F18_I1_R1,stALE_E_coli_A18_F39_I1_R1,stALE_E_coli_A18_F130_I1_R1,3A,4A,BLANK_40_12G,BLANK_40_12H,Pputida_JBEI__HGL_Pputida_107_BP6,Pputida_JBEI__HGL_Pputida_108_BP7,Pputida_JBEI__HGL_Pputida_109_BP8,Pputida_JBEI__HGL_Pputida_110_M2,Pputida_JBEI__HGL_Pputida_111_M5,Pputida_TALE__HGL_Pputida_112,Pputida_TALE__HGL_Pputida_113,Pputida_TALE__HGL_Pputida_114,Pputida_TALE__HGL_Pputida_115,Pputida_TALE__HGL_Pputida_116,Pputida_TALE__HGL_Pputida_117,Pputida_TALE__HGL_Pputida_118,Pputida_TALE__HGL_Pputida_119,Pputida_TALE__HGL_Pputida_120,Pputida_TALE__HGL_Pputida_121,Pputida_TALE__HGL_Pputida_122,Pputida_TALE__HGL_Pputida_123,Pputida_TALE__HGL_Pputida_124,Pputida_TALE__HGL_Pputida_125,Pputida_TALE__HGL_Pputida_126,Pputida_TALE__HGL_Pputida_127,Pputida_TALE__HGL_Pputida_128,Pputida_TALE__HGL_Pputida_129,Pputida_TALE__HGL_Pputida_130,Pputida_TALE__HGL_Pputida_131,Pputida_TALE__HGL_Pputida_132,Pputida_TALE__HGL_Pputida_133,Pputida_TALE__HGL_Pputida_134,Pputida_TALE__HGL_Pputida_135,Pputida_TALE__HGL_Pputida_136,Pputida_TALE__HGL_Pputida_137,Pputida_TALE__HGL_Pputida_138,Pputida_TALE__HGL_Pputida_139,Pputida_TALE__HGL_Pputida_140,Pputida_TALE__HGL_Pputida_141,P
putida_TALE__HGL_Pputida_142,Pputida_TALE__HGL_Pputida_143,Pputida_TALE__HGL_Pputida_144,Pputida_PALE__HGL_Pputida_145,Pputida_PALE__HGL_Pputida_146,Pputida_PALE__HGL_Pputida_147,Pputida_PALE__HGL_Pputida_148,Pputida_PALE__HGL_Pputida_149,Pputida_PALE__HGL_Pputida_150,Pputida_PALE__HGL_Pputida_151,Pputida_PALE__HGL_Pputida_152,Pputida_PALE__HGL_Pputida_153,Pputida_PALE__HGL_Pputida_154,Pputida_PALE__HGL_Pputida_155,Pputida_PALE__HGL_Pputida_156,Pputida_PALE__HGL_Pputida_157,Pputida_PALE__HGL_Pputida_158,Pputida_PALE__HGL_Pputida_159,Pputida_PALE__HGL_Pputida_160,Pputida_PALE__HGL_Pputida_161,Pputida_PALE__HGL_Pputida_162,Pputida_PALE__HGL_Pputida_163,Pputida_PALE__HGL_Pputida_164,Pputida_PALE__HGL_Pputida_165,Pputida_PALE__HGL_Pputida_166,Pputida_PALE__HGL_Pputida_167,Pputida_PALE__HGL_Pputida_168,Pputida_PALE__HGL_Pputida_169,Pputida_PALE__HGL_Pputida_170,Pputida_PALE__HGL_Pputida_171,Pputida_PALE__HGL_Pputida_172,Pputida_PALE__HGL_Pputida_173,Pputida_PALE__HGL_Pputida_174,Pputida_PALE__HGL_Pputida_175,Pputida_PALE__HGL_Pputida_176,JM-Metabolic__GN0_2005,JM-Metabolic__GN0_2007,JM-Metabolic__GN0_2009,JM-Metabolic__GN0_2094,JM-Metabolic__GN0_2099,JM-Metabolic__GN0_2148,JM-Metabolic__GN0_2165,JM-Metabolic__GN0_2169,JM-Metabolic__GN0_2172,JM-Metabolic__GN0_2175,JM-Metabolic__GN0_2183,JM-Metabolic__GN0_2215,JM-Metabolic__GN0_2254,JM-Metabolic__GN0_2277,JM-Metabolic__GN0_2290,JM-Metabolic__GN0_2337,JM-Metabolic__GN0_2317,JM-Metabolic__GN0_2354,JM-Metabolic__GN0_2375,JM-Metabolic__GN0_2380,JM-Metabolic__GN0_2393,JM-Metabolic__GN0_2404,5B,6A,BLANK_41_12G,BLANK_41_12H,Deoxyribose_PALE_ALE__MG1655_BOP27_4_14,Deoxyribose_PALE_ALE__MG1655_BOP27_4_23,Deoxyribose_PALE_ALE__MG1655_BOP27_4_48,Deoxyribose_PALE_ALE__MG1655_BOP27_6_21,Deoxyribose_PALE_ALE__MG1655_BOP27_6_35,Deoxyribose_PALE_ALE__MG1655_BOP27_10_13,Deoxyribose_PALE_ALE__MG1655_BOP27_10_28,Deoxyribose_PALE_ALE__MG1655_BOP27_10_51,Deoxyribose_PALE_ALE__MG1655_Lib4_18_19,Deoxyribose_PALE_ALE__MG1655_Lib4_18_59,Deoxyribose_PALE_ALE__MG1655_Lib4_18_35,Deoxyribose_PALE_ALE__MG1655_Lib4_20_16,Deoxyribose_PALE_ALE__MG1655_Lib4_20_43,Deoxyribose_PALE_ALE__MG1655_Lib4_20_71,Deoxyribose_PALE_ALE__MG1655_Lib4_22_16,Deoxyribose_PALE_ALE__MG1655_Lib4_22_28,Deoxyribose_PALE_ALE__MG1655_Lib4_22_52,Deoxyribose_PALE_ALE__MG1655_Lib4_24_9,Deoxyribose_PALE_ALE__MG1655_Lib4_24_24,Deoxyribose_PALE_ALE__MG1655_Lib4_24_52,Deoxyribose_PALE_ALE__MG1655_Lib4_26_6,Deoxyribose_PALE_ALE__MG1655_Lib4_26_27,Deoxyribose_PALE_ALE__MG1655_Lib4_26_69,Deoxyribose_PALE_ALE__MG1655_Lib4_28_13,Deoxyribose_PALE_ALE__MG1655_Lib4_28_28,Deoxyribose_PALE_ALE__MG1655_Lib4_28_53,Deoxyribose_PALE_ALE__MG1655_Lib4_30_7,Deoxyribose_PALE_ALE__MG1655_Lib4_30_22,Deoxyribose_PALE_ALE__MG1655_Lib4_30_60,Deoxyribose_PALE_ALE__MG1655_Lib4_32_6,Deoxyribose_PALE_ALE__MG1655_Lib4_32_20,Deoxyribose_PALE_ALE__MG1655_Lib4_32_56,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_24,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_57,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_69,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_50,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_61,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_22,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_36,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_46,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_41,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_51,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5
075_WT_17_58,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_64,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_55,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_63,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_46,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_51,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_49,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_57,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_24,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_42,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_62,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_21,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_41,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_50,JM-Metabolic__GN02514,JM-Metabolic__GN02529,JM-Metabolic__GN02531,JM-Metabolic__GN02567,JM-Metabolic__GN02590,JM-Metabolic__GN02657,JM-Metabolic__GN02748,JM-Metabolic__GN02766,JM-Metabolic__GN02769,JM-Metabolic__GN02787,JM-Metabolic__GN03132,JM-Metabolic__GN03218,JM-Metabolic__GN03252,JM-Metabolic__GN03409,JM-Metabolic__GN04014,JM-Metabolic__GN04094,JM-Metabolic__GN04255,JM-Metabolic__GN04306,JM-Metabolic__GN04428,JM-Metabolic__GN04488,JM-Metabolic__GN04540,JM-Metabolic__GN04563,JM-Metabolic__GN04612,JM-Metabolic__GN04665,JM-Metabolic__GN04682,JM-Metabolic__GN05002,JM-Metabolic__GN05109,JM-Metabolic__GN05128,JM-Metabolic__GN05367,JM-Metabolic__GN05377,7A,8A,BLANK_42_12G,BLANK_42_12H,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0326,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0327,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0328,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0329,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0330,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0352,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0353,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0354,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0355,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0356,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0357,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0364,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0366,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0367,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0368,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0369,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0370,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0371,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0372,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0373,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0374,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0375,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0376,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0377,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0378,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0380,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0381,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0382,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0383,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0384,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0385,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0386,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0387,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0388,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0389,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0390,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0391,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0392,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0393,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0394,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0395,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0396,JM-MEC__Staphyloco
ccus_aureusstrain_BERTI-B0397,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0398,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0399,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0400,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0401,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0402,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0403,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0404,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0405,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0406,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0407,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0408,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0409,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0417,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0418,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0419,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0420,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0421,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0473,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0474,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0483,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0484,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0485,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0486,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0516,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0517,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0518,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0519,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0520,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0521,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0522,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0523,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0524,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0525,JM-MEC__Staphylococcus_aureusstrain_BERTI-R08624,JM-MEC__Staphylococcus_aureusstrain_BERTI-R08704,JM-MEC__Staphylococcus_aureusstrain_BERTI-R10727,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11044,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11078,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11101,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11102,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11103,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11135,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11153,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11154,JM-Metabolic__GN02424,JM-Metabolic__GN02446,JM-Metabolic__GN02449,JM-Metabolic__GN02487,JM-Metabolic__GN02501,ISB,GFR,BLANK_43_12G,BLANK_43_12H,RMA_KHP_rpoS_Mage_Q97D,RMA_KHP_rpoS_Mage_Q97L,RMA_KHP_rpoS_Mage_Q97N,RMA_KHP_rpoS_Mage_Q97E,JBI_KHP_HGL_021,JBI_KHP_HGL_022,JBI_KHP_HGL_023,JBI_KHP_HGL_024,JBI_KHP_HGL_025,JBI_KHP_HGL_026,JBI_KHP_HGL_027,JBI_KHP_HGL_028_Amitesh_soxR,JBI_KHP_HGL_029_Amitesh_oxyR,JBI_KHP_HGL_030_Amitesh_soxR_oxyR,JBI_KHP_HGL_031_Amitesh_rpoS,BLANK1_1A,BLANK1_1B,BLANK1_1C,BLANK1_1D,BLANK1_1E,BLANK1_1F,BLANK1_1G,BLANK1_1H,AP581451B02,EP256645B01,EP112567B02,EP337425B01,LP127890A01,EP159692B04,EP987683A01,AP959450A03,SP464350A04,C9,ep256643b01,EP121011B01,AP616837B04,SP506933A04,EP159695B01,EP256644B01,SP511289A02,EP305735B04,SP415030A01,AP549681B02,AP549678B01,EP260544B04,EP202452B01,EP282276B04,SP531696A04,SP515443A04,SP515763A04,EP184255B04,SP503615A02,EP260543B04,EP768748A04,AP309872B03,AP568785B04,EP721390A04,EP940013A01,EP291979B04,EP182065B04,EP128904B02,EP915769A04,SP464352A03,SP365864A04,SP511294A04,EP061002B01,SP410793A01,SP232077A04,EP128910B01,AP531397B04,EP043583B01,EP230245B01,EP606652B04,EP207041B01,EP727972A04,EP291980B04,EP087938B02,SP471496A04,SP573823A04,EP393718B01,SP612496A01,EP032410B02,EP073216B01,EP410046B01,SP561451A04,EP320438B01,SP612495A04,EP446604B03,EP446602B01,EP182243B02,EP333541B04,EP238034B01,AP298002B02,EP455759B04,EP207042B0
4,LP128479A01,LP128476A01,EP316863B03,C20,lp127896a01,SP491907A02,EP182060B03,EP422407B01,SP573859A04,SP584547A02,EP182346B04,AP668631B04,EP451428B04,LP128538A01,SP490298A02,SP573860A01,EP032412B02,EP163771B01,LP169879A01,EP729433A02,EP447940B04,SP584551A08,EP216516B04,EP023808B02,BLANK2_2A,BLANK2_2B,BLANK2_2C,BLANK2_2D,BLANK2_2E,BLANK2_2F,BLANK2_2G,BLANK2_2H,SP573843A04,EP683835A01,SP573824A04,SP335002A04,SP478193A02,SP232311A04,SP415021A02,SP231630A02,SP641029A02,SP232310A04,EP617442B01,EP587478B04,EP447928B04,EP587475B04,EP675042B01,EP554513B02,EP702221B04,AP568787B02,EP054632B01,EP121013B01,EP649418A02,EP573313B01,LP154981A01,AP470859B01,LP154986A01,AP732307B04,EP533426B03,EP587476B04,AP696363B02,EP587477B04,SP683466A02,EP554518B04,EP533429B04,EP431570B01,EP202095B04,EP504030B04,EP207036B01,EP393717B01,SP491898A02,EP484973B04,EP479794B02,EP554515B04,SP631994A04,EP921593A04,AP787247B04,EP090129B04,EP447975B02,EP212214B01,EP410042B01,SP404409A02,SP247340A04,AP029018B01,EP872341A01,AP062219B03,EP790020A02,EP808112A04,SP404403A02,EP073160B01,EP012991B03,SP317297A02,EP656055A04,EP649623A01,EP790019A01,SP257519A04,EP808104A01,EP808106A01,SP231629A02,EP675044A01,EP657260A01,EP808110A04,AP032413B04,EP843906A04,AP173305B04,SP231628A02,AP173301B04,SP404405A02,EP649653A04,EP718687A04,AP905750A02,EP738468A01,C6,EP890157A02,SP353893A02,EP944059A02,EP970005A01,EP927461A04,EP808111A03,EP927459A04,SP317293A02,SP235186A04,SP399724A04,EP738469A01,SP284095A03,C5,EP337325B04,EP759450A04,BLANK3_3A,BLANK3_3B,BLANK3_3C,BLANK3_3D,BLANK3_3E,BLANK3_3F,BLANK3_3G,BLANK3_3H,AP006367B02,EP929277A02,AP324642B04,EP786631A04,EP657385A04,SP235189A01,EP448041B04,SP231631A02,SP280481A02,AP032412B04,EP649737A03,AP967057A04,EP876243A04,SP229387A04,EP667743A04,SP246941A01,AP745799A04,SP205732A02,SP230382A04,SP230380A02,SP230381A01,SP205754A01,EP606662B04,AP780167B02,EP447927B04,C18,LP191039A01,EP606663B04,EP573296B01,EP447926B04,LP127767A01,EP479266B04,LP128543A01,EP479270B03,EP921594A04,EP554501B04,EP542577B04,EP487995B04,EP542578B04,EP573310B01,EP244366B01,EP533389B03,EP244360B01,AP911328B01,AP481403B02,22_001_801_552_503_00,EP372981B04,EP447929B04,SP573849A04,SP577399A02,EP606656B03,LP166715A01,AP668628B04,C14,EP446610B02,EP339061B02,SP681591A04,EP393712B02,EP410041B01,SP453872A01,22_001_710_503_791_00,LP128540A01,EP339053B02,EP617443B01,EP190307B01,AP795068B04,LP128541A01,EP584756B04,SP284096A02,EP431562B04,EP685640B01,EP339059B02,EP431575B01,EP379938B01,EP529635B02,EP554506B04,EP455757B04,SP491900A02,LP196272A01,SP704319A04,EP617441B01,AP687591B04,SP640978A02,EP981129A02,EP455763B04,EP339057B02,SP491897A02,EP980752B04,LP128539A01,EP996831B04,EP273332B04,EP483291B04,EP393715B01,EP617440B01,EP729434A01,SP645141A03,BLANK4_4A,BLANK4_4B,BLANK4_4C,BLANK4_4D,BLANK4_4E,BLANK4_4F,BLANK4_4G,BLANK4_4H,SP232114A04,EP393714B01,EP533388B01,EP724905B01,EP282108B01,EP282107B01,EP001625B01,EP073209B02,SP232079A01,EP772145A02,AP771472A04,AP223470B01,SP404412A02,EP772143A02,SP408629A01,EP749735A07,EP846485A01,EP808109A01,SP416130A04,EP882752A01,AP953594A02,AP046324B02,AP891020A04,EP790023A01,EP657386A01,EP805337A01,EP927458A04,AP173299B04,EP768164A02,EP886422A01,AP103463B01,AP744361A02,AP065292B01,SP257517A04,EP790021A04,EP675075A04,SP388683A02,SP232309A01,EP899038A04,EP636802A01,AP046327B02,EP905975A04,SP410796A02,EP784608A01,EP808105A01,SP331134A04,EP718688A01,SP232270A02,EP970001A01,EP001624B01,EP868682A01,EP927462A02,C3,EP890158A02,EP023801B04,EP400447B04,EP385379B01,EP385387B01,EP385384B01,SP754514A04,SP415025A01,SP415023A02,EP400448
B04,EP479894B04 | tr -d '"') \ + -g $(echo NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NO
NE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} \ + -l s_1 + +if [[ -d sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/Full ]]; then + echo "Run appears successful" +elif [[ -d sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/1_demult/Full ]]; then + echo "Run appears unsuccessful but has output" + exit 1 +else + echo "Run appears unsuccessful" + exit 1 +fi \ No newline at end of file From 33726511939b8f7281df46a06829232bff0c7134 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 9 Oct 2024 14:03:20 -0700 Subject: [PATCH 19/47] Added optional parameter to Pipeline() class. Added optional parameter to Pipeline() class that overwrites the values in the lane column of a sample-sheet's data section. This functionality used to reside in the qp-klp plugin and is a common usage pattern. This allows SPP to override the value in a sample-sheet's lane column with the value provided by the user at submission time. --- sequence_processing_pipeline/Pipeline.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 3dd19371..fa5f5c83 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -134,7 +134,7 @@ class Pipeline: assay_types = [AMPLICON_ATYPE, METAGENOMIC_ATYPE, METATRANSCRIPTOMIC_ATYPE] def __init__(self, configuration_file_path, run_id, input_file_path, - output_path, qiita_job_id, pipeline_type): + output_path, qiita_job_id, pipeline_type, lane_number=None): """ Initialize Pipeline object w/configuration information. :param configuration_file_path: Path to configuration.json file. @@ -143,6 +143,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, :param output_path: Path where all pipeline-generated files live. :param qiita_job_id: Qiita Job ID creating this Pipeline. :param pipeline_type: Pipeline type ('Amplicon', 'Metagenomic', etc.) + :param lane_number: (Optional) overwrite lane_number in input_file. """ if input_file_path is None: raise PipelineError("user_input_file_path cannot be None") @@ -249,11 +250,29 @@ def __init__(self, configuration_file_path, run_id, input_file_path, output_fp = join(output_path, 'dummy_sample_sheet.csv') self.generate_dummy_sample_sheet(self.run_dir, output_fp) self.sample_sheet = output_fp + + # Optional lane_number parameter is ignored for Amplicon + # runs, as the only valid value is 1. else: # assume user_input_file_path references a sample-sheet. self.sample_sheet = self._validate_sample_sheet(input_file_path) self.mapping_file = None + if lane_number is not None: + # confirm that the lane_number is a reasonable value. + lane_number = int(lane_number) + if lane_number < 1 or lane_number > 8: + raise ValueError(f"'{lane_number}' is not a valid name" + " number") + + # create/overwrite the value for Lane. + for sample in self.sample_sheet.Samples: + sample.Lane = lane_number + + # overwrite the original file. 
+ with open(input_file_path, 'w') as f: + self.sample_sheet.write(f) + self._configure_profile() def get_software_configuration(self, software): From d883b7babe2c5cd1762624ae0b0410733f00ee3a Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 9 Oct 2024 15:07:51 -0700 Subject: [PATCH 20/47] bugfix --- sequence_processing_pipeline/Pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index fa5f5c83..977aed0e 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -266,7 +266,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, " number") # create/overwrite the value for Lane. - for sample in self.sample_sheet.Samples: + for sample in self.sample_sheet.samples: sample.Lane = lane_number # overwrite the original file. From a075cd9df65dacbe2112a285a448fba677a8f74a Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 13 Oct 2024 22:30:02 -0700 Subject: [PATCH 21/47] Fixes error Fixes error found when post-processing adapter-trimmed fastq files. All files were being moved into one of the project sub-folders, rather than into their associated folders. This appears to be due to recent implementation change. All files are now moved into their correct folder. --- sequence_processing_pipeline/NuQCJob.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index 0ffacb1a..07261e39 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -1,6 +1,6 @@ from metapool import load_sample_sheet from os import stat, makedirs, rename -from os.path import join, basename, dirname, exists, abspath +from os.path import join, basename, dirname, exists, abspath, split from sequence_processing_pipeline.Job import Job, KISSLoader from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) @@ -104,6 +104,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.minimum_bytes = 3100 self.fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}' r'\.fastq\.gz$') + self.interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.interleave\.fastq\.gz$') self.html_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.html$') self.json_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.json$') @@ -170,7 +171,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): substr = regex.search(file_name) if substr is None: raise ValueError(f"{file_name} does not follow naming " - " pattern.") + "pattern.") else: # check if found substring is a member of this # project. Note sample-name != sample-id @@ -190,8 +191,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): for fp in files_to_move: move(fp, dst) - @staticmethod - def _move_trimmed_files(project_name, output_path): + def _move_trimmed_files(self, project_name, output_path): ''' Given output_path, move all fastqs to a new subdir named project_name. :param project_name: The name of the new folder to be created. @@ -205,8 +205,15 @@ def _move_trimmed_files(project_name, output_path): # this directory shouldn't already exist. 
makedirs(join(output_path, project_name), exist_ok=False) + sample_ids = [x[0] for x in self.sample_ids if x[1] == project_name] + for trimmed_file in list(glob.glob(pattern)): - move(trimmed_file, join(output_path, project_name)) + file_name = split(trimmed_file)[1] + substr = self.interleave_fastq_regex.search(file_name) + if substr is not None: + # only move the sample_ids in this project. + if substr[1] in sample_ids: + move(trimmed_file, join(output_path, project_name)) else: raise ValueError(f"'{output_path}' does not exist") @@ -258,7 +265,6 @@ def run(self, callback=None): for project in self.project_data: project_name = project['Sample_Project'] needs_human_filtering = project['HumanFiltering'] - source_dir = join(self.output_path, project_name) pattern = f"{source_dir}/*.fastq.gz" completed_files = list(glob.glob(pattern)) @@ -270,7 +276,7 @@ def run(self, callback=None): 'only-adapter-filtered') if exists(trimmed_only_path): - NuQCJob._move_trimmed_files(project_name, trimmed_only_path) + self._move_trimmed_files(project_name, trimmed_only_path) if needs_human_filtering is True: filtered_directory = join(source_dir, 'filtered_sequences') From 62734d8802e6b0fddc52de13ce6236efa11e7a3f Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 16 Oct 2024 20:51:14 -0700 Subject: [PATCH 22/47] Rewrote test --- sequence_processing_pipeline/NuQCJob.py | 7 +- .../tests/test_NuQCJob.py | 135 ++++++++++-------- setup.py | 6 +- 3 files changed, 86 insertions(+), 62 deletions(-) diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index 1f17a46f..89b3106a 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -108,7 +108,9 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.minimum_bytes = 3100 self.fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}' r'\.fastq\.gz$') - self.interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.interleave\.fastq\.gz$') + self.interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d' + r'_\d{3}\.interleave\.fastq' + r'\.gz$') self.html_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.html$') self.json_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.json$') @@ -209,7 +211,8 @@ def _move_trimmed_files(self, project_name, output_path): # this directory shouldn't already exist. makedirs(join(output_path, project_name), exist_ok=False) - sample_ids = [x[0] for x in self.sample_ids if x[1] == project_name] + sample_ids = [x[0] for x in self.sample_ids + if x[1] == project_name] for trimmed_file in list(glob.glob(pattern)): file_name = split(trimmed_file)[1] diff --git a/sequence_processing_pipeline/tests/test_NuQCJob.py b/sequence_processing_pipeline/tests/test_NuQCJob.py index a7e04ec6..177c8162 100644 --- a/sequence_processing_pipeline/tests/test_NuQCJob.py +++ b/sequence_processing_pipeline/tests/test_NuQCJob.py @@ -9,7 +9,7 @@ ) from os import makedirs, remove from metapool import load_sample_sheet -import glob +from os import walk class TestNuQCJob(unittest.TestCase): @@ -2166,10 +2166,56 @@ def test_generate_mmi_filter_cmds_w_annotate_fastq(self): self.assertEqual(obs, exp) def test_move_trimmed(self): - # Note: this test does not make use of the output_dir that other - # tests use. + # create a NuQCJob() object, but do not call run(). + # instead we will manually create some files to test with. 
+ double_db_paths = ["db_path/mmi_1.db", "db_path/mmi_2.db"] + job = NuQCJob( + self.fastq_root_path, + self.output_path, + self.good_sample_sheet_path, + double_db_paths, + "queue_name", + 1, + 1440, + "8", + "fastp", + "minimap2", + "samtools", + [], + self.qiita_job_id, + 1000, + "", + self.movi_path, + self.gres_value, + self.pmls_path, + ['BX'] + ) - for dummy_fp in SAMPLE_DIR: + sample_dir = [ + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/NPH_15288/fastp_reports_dir/html/EP890158A02_S58_L001_" + "R1_001.html", + "NuQCJob/NPH_15288/fastp_reports_dir/json/EP023801B04_S27_L001_" + "R1_001.json", + "NuQCJob/process_all_fastq_files.sh", + "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981." + "completed", + "NuQCJob/logs/slurm-1897981_1.out", + "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R2_001.interleave.fastq.gz', + ] + + for dummy_fp in sample_dir: dummy_fp = self.path(dummy_fp) dummy_path = dirname(dummy_fp) makedirs(dummy_path, exist_ok=True) @@ -2178,38 +2224,33 @@ def test_move_trimmed(self): trimmed_only_path = self.path("NuQCJob", "only-adapter-filtered") - NuQCJob._move_trimmed_files("NPH_15288", trimmed_only_path) - - new_path = join(trimmed_only_path, "NPH_15288") - pattern = f"{new_path}/*.fastq.gz" - - exp = [ - ( - "only-adapter-filtered/NPH_15288/359180345_S58_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R2_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R2_001." - "fastq.gz" - ), - ] - - for trimmed_file in list(glob.glob(pattern)): - trimmed_file = trimmed_file.split("NuQCJob/")[-1] - if trimmed_file not in exp: - self.assertIn(trimmed_file, exp) + # test _move_trimmed_files() by verifying that only the interleave + # fastq files from the NYU project are moved. + job._move_trimmed_files("NYU_BMS_Melanoma_13059", trimmed_only_path) + + new_path = join(trimmed_only_path, "NYU_BMS_Melanoma_13059") + + exp = { + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R2_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R2_001.interleave.fastq.gz' + } + + obs = [] + for root, dirs, files in walk(new_path): + for some_file in files: + some_path = join(root, some_file) + some_path = some_path.replace(self.path(""), "") + obs.append(some_path) + + # confirm that only the samples in NYU_BMS_Melanoma_13059 were + # moved. 
+ self.assertEqual(set(obs), exp) def _helper(self, regex, good_names, bad_names): for good_name in good_names: @@ -2221,27 +2262,5 @@ def _helper(self, regex, good_names, bad_names): self.assertIsNone(substr, msg=f"Regex failed on {bad_name}") -SAMPLE_DIR = [ - "NuQCJob/only-adapter-filtered/359180345_S58_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R2_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R2_001.fastq.gz", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180354_S22_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180338_S51_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180345_S58_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180337_S27_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180353_S17_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180353_S17_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180337_S27_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180345_S58_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180338_S51_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180354_S22_L001_R1_001.json", - "NuQCJob/process_all_fastq_files.sh", - "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981.completed", - "NuQCJob/logs/slurm-1897981_1.out", - "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", -] - if __name__ == "__main__": unittest.main() diff --git a/setup.py b/setup.py index 99103fbb..e0e94196 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,10 @@ install_requires=[ 'click', 'requests', 'pandas', 'flake8', 'nose', 'coverage', 'pgzip', 'jinja2', - 'metapool @ https://github.com/biocore/' - 'metagenomics_pooling_notebook/archive/master.zip' + # 'metapool @ https://github.com/biocore/' + # 'metagenomics_pooling_notebook/archive/master.zip' + 'metapool @ https://codeload.github.com/charles-cowart/metagenomics' + '_pooling_notebook/zip/refs/heads/fake_tellread' ], entry_points={ 'console_scripts': ['demux=sequence_processing_pipeline.scripts.cli' From 45131f114f748e1494f11ba5e6723b5a914e33d3 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sat, 2 Nov 2024 13:13:12 -0700 Subject: [PATCH 23/47] Updated branch to use new DFSheet() functionality --- sequence_processing_pipeline/Pipeline.py | 11 ++++------- setup.py | 4 +++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 977aed0e..86b01cdf 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -15,6 +15,7 @@ from datetime import datetime from xml.etree import ElementTree as ET from metapool.prep import PREP_MF_COLUMNS +from metapool import set_lane_number_in_sheet logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) @@ -265,13 +266,9 @@ def __init__(self, configuration_file_path, run_id, input_file_path, raise ValueError(f"'{lane_number}' is not a valid name" " number") - # create/overwrite the value for Lane. - for sample in self.sample_sheet.samples: - sample.Lane = lane_number - - # overwrite the original file. - with open(input_file_path, 'w') as f: - self.sample_sheet.write(f) + # overwrite sample-sheet w/DFSheets processed version + # with overwritten Lane number. 
+ set_lane_number_in_sheet(input_file_path, lane_number) self._configure_profile() diff --git a/setup.py b/setup.py index e0e94196..e7894aab 100644 --- a/setup.py +++ b/setup.py @@ -45,8 +45,10 @@ 'pgzip', 'jinja2', # 'metapool @ https://github.com/biocore/' # 'metagenomics_pooling_notebook/archive/master.zip' + # sample_sheet_update branch contains all of the changes in the + # fake_tellread branch + DFSheet. 'metapool @ https://codeload.github.com/charles-cowart/metagenomics' - '_pooling_notebook/zip/refs/heads/fake_tellread' + '_pooling_notebook/zip/refs/heads/sample_sheet_update' ], entry_points={ 'console_scripts': ['demux=sequence_processing_pipeline.scripts.cli' From 4665ee8d89744af16c0c2b8ccb94f85ba38bca90 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 3 Nov 2024 22:57:54 -0800 Subject: [PATCH 24/47] Updated to recent changes in metapool --- sequence_processing_pipeline/Pipeline.py | 8 ++--- sequence_processing_pipeline/TellReadJob.py | 6 ++-- .../tests/test_TellReadJob.py | 30 +------------------ 3 files changed, 9 insertions(+), 35 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 86b01cdf..04d96f0a 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -255,10 +255,6 @@ def __init__(self, configuration_file_path, run_id, input_file_path, # Optional lane_number parameter is ignored for Amplicon # runs, as the only valid value is 1. else: - # assume user_input_file_path references a sample-sheet. - self.sample_sheet = self._validate_sample_sheet(input_file_path) - self.mapping_file = None - if lane_number is not None: # confirm that the lane_number is a reasonable value. lane_number = int(lane_number) @@ -270,6 +266,10 @@ def __init__(self, configuration_file_path, run_id, input_file_path, # with overwritten Lane number. set_lane_number_in_sheet(input_file_path, lane_number) + # assume user_input_file_path references a sample-sheet. + self.sample_sheet = self._validate_sample_sheet(input_file_path) + self.mapping_file = None + self._configure_profile() def get_software_configuration(self, software): diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 2f7905d5..ad01ef8f 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -16,8 +16,7 @@ class TellReadJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, qiita_job_id, label, reference_base, - reference_map, tmp1_path, sing_script_path, lane, - cores_per_task): + reference_map, tmp1_path, sing_script_path, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. @@ -61,6 +60,9 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.sing_script_path = sing_script_path self.tmp1_path = tmp1_path + sheet = load_sample_sheet(self.sample_sheet_path) + lane = sheet.samples[0].Lane + # force self.lane_number to be int. raise an Error if it's not. 
tmp = int(lane) if tmp < 1 or tmp > 8: diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py index 6cc12632..801947e8 100644 --- a/sequence_processing_pipeline/tests/test_TellReadJob.py +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -43,41 +43,13 @@ def setUp(self): self.cores_per_task = "4" def test_creation(self): - # confirm only sensible lane numbers are allowed. - with self.assertRaisesRegex(ValueError, - "'-1' is not a valid lane number"): - TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, - self.queue_name, self.node_count, self.wall_time_limit, - self.jmem, self.modules_to_load, self.qiita_job_id, - self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, -1, - self.cores_per_task) - - with self.assertRaisesRegex(ValueError, - "'0' is not a valid lane number"): - TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, - self.queue_name, self.node_count, self.wall_time_limit, - self.jmem, self.modules_to_load, self.qiita_job_id, - self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, 0, - self.cores_per_task) - - with self.assertRaisesRegex(ValueError, - "'9' is not a valid lane number"): - TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, - self.queue_name, self.node_count, self.wall_time_limit, - self.jmem, self.modules_to_load, self.qiita_job_id, - self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, 9, - self.cores_per_task) - # test basic good-path job = TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, self.queue_name, self.node_count, self.wall_time_limit, self.jmem, self.modules_to_load, self.qiita_job_id, self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, self.lane, + self.tmp1_path, self.sing_script_path, self.cores_per_task) job._generate_job_script() From 3542df37bd5970d1e6abc1f10eee3e568b3323bc Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 6 Nov 2024 18:11:58 -0800 Subject: [PATCH 25/47] Update from testing --- sequence_processing_pipeline/TellReadJob.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index ad01ef8f..e36888db 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -16,7 +16,7 @@ class TellReadJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, qiita_job_id, label, reference_base, - reference_map, tmp1_path, sing_script_path, cores_per_task): + reference_map, sing_script_path, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. 
@@ -58,7 +58,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.qiita_job_id = qiita_job_id self.jinja_env = Environment(loader=KISSLoader('templates')) self.sing_script_path = sing_script_path - self.tmp1_path = tmp1_path sheet = load_sample_sheet(self.sample_sheet_path) lane = sheet.samples[0].Lane @@ -122,7 +121,9 @@ def _process_sample_sheet(self): sample_ids = [] for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + sample_ids.append((sample['Sample_ID'], + sample['Sample_Project'], + sample['barcode_id'])) bioinformatics = sheet.Bioinformatics @@ -143,12 +144,12 @@ def _generate_job_script(self): # generate a comma separated list of sample-ids from the tuples stored # in self.sample_ids. - # NB: the current sample-sheet format used for TellRead doesn't include - # sample-names and sample-ids, only sample_id. e.g. C501,C502,etc. - # Hence, when a final sample sheet format is ready, it may be prudent - # to switch this to pull values from the expected sample-names column - # instead. - samples = ','.join([id[0] for id in self.sample_ids]) + # NB: Proposed sample-sheets will have traditional Sample_ID and + # Sample_Name columns as well as a new value named barcode_id. It's + # this column that will contain the 'C50n' values needed to be + # supplied to tellread. Later we will use this mapping to rename the + # files from C50n...fastq.gz to sample-name...fastq.gz. + samples = ','.join([id[2] for id in self.sample_ids]) # since we haven't included support for reference_map yet, whenever a # reference is not included, the mapping against the list of sample_ids @@ -170,7 +171,7 @@ def _generate_job_script(self): "cores_per_task": self.cores_per_task, "queue_name": self.queue_name, "sing_script_path": self.sing_script_path, - "tmp_dir": self.tmp1_path, + "tmp_dir": join(self.output_path, "output", "tmp1"), "modules_to_load": ' '.join(self.modules_to_load), "lane": f"s_{self.lane_number}", "output": join(self.output_path, "output"), From c2c3b06b3f5babf357530dc1fd67848580f4dbe0 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 13 Nov 2024 18:01:22 -0800 Subject: [PATCH 26/47] Updates to TRIntegrateJob based on testing --- .../TRIntegrateJob.py | 52 +++++-- .../TRNormCountsJob.py | 7 +- sequence_processing_pipeline/TellReadJob.py | 22 +-- .../templates/integrate.sbatch | 130 +++++++----------- .../templates/tellread.sbatch | 17 +-- 5 files changed, 106 insertions(+), 122 deletions(-) diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 25cec68a..3b1e8561 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -6,6 +6,8 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet +from os import makedirs +from shutil import copy logging.basicConfig(level=logging.DEBUG) @@ -14,8 +16,9 @@ class TRIntegrateJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, cores_per_task): + qiita_job_id, max_array_length, integrate_script_path, + sil_path, raw_fastq_dir, reference_base, reference_map, + cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. 
@@ -29,8 +32,8 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id :param max_array_length: None - :param indicies_script_path: None - :param label: None + :param integrate_script_path: None + :param sil_path: A path to a confidential file mapping C5xx, adapters. :param reference_base: None :param reference_map: None :param cores_per_task: # of CPU cores per node to request. @@ -50,7 +53,10 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.node_count = node_count self.wall_time_limit = wall_time_limit self.cores_per_task = cores_per_task - self.indicies_script_path = indicies_script_path + self.integrate_script_path = integrate_script_path + self.sil_path = sil_path + self.raw_fastq_dir = raw_fastq_dir + self.tmp_dir = join(self.output_path, 'tmp') self.reference_base = reference_base self.reference_map = reference_map @@ -60,17 +66,31 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.qiita_job_id = qiita_job_id self.sample_count = len(self.sample_ids) self.jinja_env = Environment(loader=KISSLoader('templates')) - self.label = label + self.job_name = (f"integrate_{self.qiita_job_id}") - if self.reference_base is not None or self.reference_map is not None: - tag = 'reference-based' - else: - tag = 'reference-free' - - self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + with open(self.sil_path, 'r') as f: + # obtain the number of unique barcode_ids as determined by + # TellReadJob() in order to set up an array job of the + # proper length. + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.barcode_id_count = len(lines) def run(self, callback=None): job_script_path = self._generate_job_script() + + # copy sil_path to TRIntegrate working directory and rename to a + # predictable name. + copy(self.sil_path, join(self.output_path, 'sample_index_list.txt')) + + # generate the tailored subset of adapter to barcode_id based on + # the proprietary lists owned by the manufacturer and supplied by + # the caller, and the barcode ids found in the sample-sheet. 
+ self._generate_sample_index_list() + + makedirs(self.tmp_dir) + params = ['--parsable', f'-J {self.job_name}', f'--array 1-{self.sample_count}'] @@ -132,8 +152,14 @@ def _generate_job_script(self): "mem_in_gb": self.jmem, "node_count": self.node_count, "cores_per_task": self.cores_per_task, - "iinp_script_path": self.indicies_script_path, + "integrate_script_path": self.integrate_script_path, "queue_name": self.queue_name, + "barcode_id_count": self.barcode_id_count, + "raw_fastq_dir": self.raw_fastq_dir, + "tmp_dir": self.tmp_dir, "output_dir": self.output_path})) return job_script_path + + def parse_logs(self): + raise PipelineError("parse_logs() not implemented for TRIntegrateJob") diff --git a/sequence_processing_pipeline/TRNormCountsJob.py b/sequence_processing_pipeline/TRNormCountsJob.py index a3603bcd..6887994a 100644 --- a/sequence_processing_pipeline/TRNormCountsJob.py +++ b/sequence_processing_pipeline/TRNormCountsJob.py @@ -62,12 +62,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.jinja_env = Environment(loader=KISSLoader('templates')) self.label = label - if self.reference_base is not None or self.reference_map is not None: - tag = 'reference-based' - else: - tag = 'reference-free' - - self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + self.job_name = (f"norm_counts_{self.qiita_job_id}") def run(self, callback=None): job_script_path = self._generate_job_script() diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index e36888db..322cca17 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -15,7 +15,7 @@ class TellReadJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, label, reference_base, + qiita_job_id, reference_base, reference_map, sing_script_path, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq @@ -29,7 +29,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param jmem: String representing total memory limit for entire job. :param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id - :param label: None :param reference_base: None :param reference_map: None :param cores_per_task: (Optional) # of CPU cores per node to request. @@ -75,15 +74,13 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, tag = 'reference-free' date = datetime.today().strftime('%Y.%m.%d') - self.job_name = (f"{label}-{tag}-{date}-tellread") + self.job_name = (f"{self.qiita_job_id}-{tag}-{date}-tellread") def run(self, callback=None): job_script_path = self._generate_job_script() - params = ['--parsable', - f'-J {self.job_name}', - '-c ${sbatch_cores}', - '--mem ${sbatch_mem}', - '--time ${wall}'] + + # everything is in the job script so there are no additional params. 
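        # (The '--parsable', '-J', '-c ${sbatch_cores}', '--mem ${sbatch_mem}'
        # and '--time ${wall}' arguments removed above are assumed to be
        # covered by the #SBATCH directives rendered into tellread.sbatch, so
        # submit_job() now only needs the script path itself.)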
+ params = [] try: self.job_info = self.submit_job(job_script_path, @@ -171,10 +168,15 @@ def _generate_job_script(self): "cores_per_task": self.cores_per_task, "queue_name": self.queue_name, "sing_script_path": self.sing_script_path, - "tmp_dir": join(self.output_path, "output", "tmp1"), "modules_to_load": ' '.join(self.modules_to_load), "lane": f"s_{self.lane_number}", - "output": join(self.output_path, "output"), + # NB: Note that we no longer create a sub-directory under the + # working directory for TellRead to create all its output + # folders and files. This means it is creating folders and + # files in the same directory that has our sbatch script and + # logs directory. Currently there are no name collisions, + # however. + "output": self.output_path, "rundir_path": self.root_dir, "samples": samples, "refs": refs, diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 8c767382..92dcfe87 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -1,96 +1,66 @@ #!/bin/bash -l -#SBATCH -J {{job_name}} # integrate -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 8G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 1 -#SBATCH -p {{queue_name}} # qiita +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{barcode_id_count}} +#SBATCH --output {{output_dir}}/logs/integrate_%x_%A_%a.out +#SBATCH --error {{output_dir}}/logs/integrate_%x_%A_%a.err -#SBATCH --output integrate_%x-%A_%a.out -#SBATCH --error integrate_%x-%A_%a.err - -# NB SLURM_ARRAY_TASK_ID is exported by Slurm -if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then - echo "Not operating in an array" - exit 1 -fi - -# NB SLURM_ARRAY_TASK_MIN is exported by Slurm -if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then - echo "Line extraction assumes 1-based index" - exit 1 -fi - -set -x +set -x set -e -set -o pipefail -samples=($(cat {{output_dir}}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -# NB TMPDIR IS CREATED IN CURRENT DIRECTORY. CURRENT DIRECTORY MUST BE CORRECT. -export TMPDIR=$(mktemp -d) -function cleanup { - echo "Removing $TMPDIR" - rm -r $TMPDIR - unset TMPDIR -} -trap cleanup EXIT +export TMPDIR={{tmp_dir}} -files=${TMPDIR}/integration.files -/bin/ls -1 {{output_dir}}/Full/*corrected.err_barcode_removed.fastq > ${files} -mkdir -p {{output_dir}}/integrated +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R1" - exit 1 -fi +echo "Processing sample ${sample}..." 
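# (Note on the sample selection above: sample_index_list.txt is the copy of
# the confidential barcode map made by TRIntegrateJob.run(); it is assumed
# here to be tab-separated with the C5xx barcode_id in the second column,
# hence 'cut -f 2'. SLURM_ARRAY_TASK_ID runs 1..{{barcode_id_count}} while
# bash arrays are 0-indexed, hence the '- 1' when selecting ${samples[...]}.)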
-if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R2" - exit 1 -fi +# make temp directory +export TMPDIR={{tmp_dir}} +mkdir -p $TMPDIR -if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} I1" - exit 1 -fi -r1=$(grep -m 1 "_R1_${sample}" ${files}) -r2=$(grep -m 1 "_R2_${sample}" ${files}) -i1=$(grep -m 1 "_I1_${sample}" ${files}) -r1out={{output_dir}}/integrated/${sample}.R1.fastq.gz -r2out={{output_dir}}/integrated/${sample}.R2.fastq.gz -i1out={{output_dir}}/integrated/${sample}.I1.fastq.gz +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. -if [[ ! -s ${r1} ]]; then - echo "${r1} is empty, cannot integrate" - if [[ -s ${r2} ]]; then - echo "R1 and R2 are inconsistent" - exit 1 - fi - if [[ -s ${i1} ]]; then - echo "R1 and I1 are inconsistent" - exit 1 - fi +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. +# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in={{raw_fastq_dir}}/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in={{raw_fastq_dir}}/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in={{raw_fastq_dir}}/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq - # reflect the empties so Qiita can know of them - touch ${r1out} - touch ${r2out} - touch ${i1out} - exit 0 -fi +# create output directory +mkdir -p {{output_dir}}/integrated + +# generate output file names +r1_out={{output_dir}}/integrated/${sample}.R1.fastq.gz +r2_out={{output_dir}}/integrated/${sample}.R2.fastq.gz +i1_out={{output_dir}}/integrated/${sample}.I1.fastq.gz -# this can probably be backgrounded but then you have to get creative to -# not mask a nonzero exit status (e.g., the python process raising) -cat ${i1} | gzip > ${i1out} +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} +# generate integrated R1 and R2 fastq.gz files. 
conda activate qp-knight-lab-processing-2022.03 -python {{iinp_script_path}} integrate \ - --no-sort \ - --r1-in ${r1} \ - --r2-in ${r2} \ - --i1-in ${i1} \ - --r1-out ${r1out} \ - --r2-out ${r2out} \ - --threads ${SLURM_CPUS_PER_TASK} + +python {{integrate_script_path}} integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads {{cores_per_task}} diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 7d044bb7..89cae33f 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -6,17 +6,11 @@ #SBATCH --mem {{mem_in_gb}}G #SBATCH --time {{wall_time_limit}} -#SBATCH --output tellread_%x-%A.out -#SBATCH --error tellread_%x-%A.err +#SBATCH --output {{output}}/logs/tellread_%x-%A.out +#SBATCH --error {{output}}/logs/tellread_%x-%A.err set -x -export TMPDIR={{tmp_dir}} -mkdir -p ${TMPDIR} -export TMPDIR=$(mktemp -d) - -mkdir -p {{output}} - module load {{modules_to_load}} {{sing_script_path}} \ -i {{rundir_path}} \ @@ -27,11 +21,8 @@ module load {{modules_to_load}} -l {{lane}} if [[ -d {{output}}/Full ]]; then - echo "Run appears successful" -elif [[ -d {{output}}/1_demult/Full ]]; then - echo "Run appears unsuccessful but has output" - exit 1 + echo "tellread.sbatch successful" else - echo "Run appears unsuccessful" + echo "tellread.sbatch unsuccessful" exit 1 fi From 49f16732fce1df4bd105b57bdced5a2de2e7012c Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 13 Nov 2024 18:16:14 -0800 Subject: [PATCH 27/47] Updated sample config file --- .../iseq_metagenomic.json | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json index 089e82f1..c82c76b0 100644 --- a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json +++ b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json @@ -3,6 +3,25 @@ "instrument_type": "iseq", "assay_type": "Metagenomic", "configuration": { + "tell-seq": { + "label": "my_label", + "reference_base": "", + "reference_map": "", + "sing_script_path": "/my_path/tellread-release-novaseqX/run_tellread_sing.sh", + "nodes": 1, + "lane": 1, + "sample_index_list": "/my_path/sample_index_list_1.txt", + "queue": "qiita", + "wallclock_time_in_minutes": 1440, + "modules_to_load": ["singularity_3.6.4"], + "integrate_script_path": "/my_path/integrate-indices-np.py", + "tellread_mem_limit": "16", + "tellread_cores": "4", + "normcount_cores": "1", + "integrate_cores": "1", + "normcount_mem_limit": "8", + "integrate_mem_limit": "8" + }, "bcl2fastq": { "nodes": 1, "nprocs": 16, From efc0849a007be79cddf8ce9a250d84aa165589f3 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 13 Nov 2024 19:48:42 -0800 Subject: [PATCH 28/47] Replaced legacy exit check for tellread --- .../templates/tellread.sbatch | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 89cae33f..f46e0798 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -20,9 +20,33 @@ module load {{modules_to_load}} -j 
${SLURM_JOB_CPUS_PER_NODE} {{extra}} \ -l {{lane}} -if [[ -d {{output}}/Full ]]; then - echo "tellread.sbatch successful" -else - echo "tellread.sbatch unsuccessful" - exit 1 -fi +# instead of testing for the presence of '{{output}}/Full', we will review +# the changed timestamps for all the files in '{{output}}/Full' and when +# we can demonstrate that they haven't changed in an arbitrary period of time +# we will consider the work completed. + +# get the timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 From 47278653a0cf8104038394419a961f4799dac411 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Fri, 15 Nov 2024 21:29:21 -0800 Subject: [PATCH 29/47] recent updates --- sequence_processing_pipeline/Job.py | 1 + sequence_processing_pipeline/TRIntegrateJob.py | 2 +- sequence_processing_pipeline/TellReadJob.py | 4 ++-- sequence_processing_pipeline/util.py | 12 +++++++++++- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 59d9cea2..36248ab3 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -12,6 +12,7 @@ import logging from inspect import stack import re +from time import time # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 3b1e8561..4e273055 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -7,7 +7,7 @@ from .PipelineError import PipelineError from metapool import load_sample_sheet from os import makedirs -from shutil import copy +from shutil import copyfile logging.basicConfig(level=logging.DEBUG) diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 322cca17..ee5054de 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -73,8 +73,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, else: tag = 'reference-free' - date = datetime.today().strftime('%Y.%m.%d') - self.job_name = (f"{self.qiita_job_id}-{tag}-{date}-tellread") + self.job_name = (f"{self.qiita_job_id}-tellread") def run(self, callback=None): job_script_path = self._generate_job_script() @@ -187,3 +186,4 @@ def _generate_job_script(self): def parse_logs(self): raise PipelineError("parse_logs() not implemented for TellReadJob") + diff --git a/sequence_processing_pipeline/util.py b/sequence_processing_pipeline/util.py index d9586f81..c5b3cdef 100644 --- a/sequence_processing_pipeline/util.py +++ b/sequence_processing_pipeline/util.py @@ -1,7 +1,17 @@ import re -PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +#PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') + +# The above will truncate 
on the first _R1_ found, which only works when _R1_ or _R2_ +# appears exactly once in a file path. When the wet-lab incorporates these same strings +# in their sample-names as descriptive metadata, this assumption is broken. +# For all raw fastq files being used as input into NuQCJob, we can assume they end +# in the following convention. Per Illumina spec, all fastq files end in _001 and we +# preserve this convention even at the cost of renaming output files from TRIntegrateJob. +# PAIR_DOT is kept as is, but may be removed later because for the purposes of SPP, no input +# should ever be named with dots instead of underscores. +PAIR_UNDERSCORE = (re.compile(r'_R1_001.fastq.gz'), '_R1_001.fastq.gz', '_R2_001.fastq.gz') PAIR_DOT = (re.compile(r'\.R1\.'), '.R1.', '.R2.') PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT) From ba1399f8f621c5856134cbc6271ba8da71d2f9e5 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 19 Nov 2024 15:20:21 -0800 Subject: [PATCH 30/47] Updated tests --- sequence_processing_pipeline/Commands.py | 2 +- sequence_processing_pipeline/Job.py | 1 - sequence_processing_pipeline/Pipeline.py | 5 +- .../TRIntegrateJob.py | 3 +- sequence_processing_pipeline/TellReadJob.py | 8 -- .../templates/tellread.sbatch | 3 +- .../data/tellread_output/tellread_test.sbatch | 55 ++++--- .../data/tellseq_metag_dummy_sample_sheet.csv | 135 ++++++++++++++++++ .../tests/test_TellReadJob.py | 12 +- .../tests/test_commands.py | 36 +++-- .../tests/test_util.py | 32 ++--- sequence_processing_pipeline/util.py | 23 +-- 12 files changed, 231 insertions(+), 84 deletions(-) create mode 100644 sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index cce7c605..642e49cf 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -87,7 +87,7 @@ def demux(id_map, fp, out_d, task, maxtask): """Split infile data based in provided map""" delimiter = '::MUX::' mode = 'wt' - ext = '.fastq.gz' + ext = '_001.fastq.gz' sep = '/' rec = '@' diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 36248ab3..59d9cea2 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -12,7 +12,6 @@ import logging from inspect import stack import re -from time import time # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 04d96f0a..2b9f3fa2 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -15,7 +15,6 @@ from datetime import datetime from xml.etree import ElementTree as ET from metapool.prep import PREP_MF_COLUMNS -from metapool import set_lane_number_in_sheet logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) @@ -264,7 +263,9 @@ def __init__(self, configuration_file_path, run_id, input_file_path, # overwrite sample-sheet w/DFSheets processed version # with overwritten Lane number. - set_lane_number_in_sheet(input_file_path, lane_number) + sheet = load_sample_sheet(input_file_path) + with open(input_file_path, 'w') as f: + sheet.write(f, lane=lane_number) # assume user_input_file_path references a sample-sheet. 
self.sample_sheet = self._validate_sample_sheet(input_file_path) diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 4e273055..875a1988 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -82,7 +82,8 @@ def run(self, callback=None): # copy sil_path to TRIntegrate working directory and rename to a # predictable name. - copy(self.sil_path, join(self.output_path, 'sample_index_list.txt')) + copyfile(self.sil_path, + join(self.output_path, 'sample_index_list.txt')) # generate the tailored subset of adapter to barcode_id based on # the proprietary lists owned by the manufacturer and supplied by diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index ee5054de..3b3bf314 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -6,7 +6,6 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet -from datetime import datetime logging.basicConfig(level=logging.DEBUG) @@ -67,12 +66,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, raise ValueError(f"'{tmp}' is not a valid lane number") self.lane_number = tmp - # TODO: Need examples of these being not None - if self.reference_base is not None or self.reference_map is not None: - tag = 'reference-based' - else: - tag = 'reference-free' - self.job_name = (f"{self.qiita_job_id}-tellread") def run(self, callback=None): @@ -186,4 +179,3 @@ def _generate_job_script(self): def parse_logs(self): raise PipelineError("parse_logs() not implemented for TellReadJob") - diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index f46e0798..66d9d9fd 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -23,7 +23,7 @@ module load {{modules_to_load}} # instead of testing for the presence of '{{output}}/Full', we will review # the changed timestamps for all the files in '{{output}}/Full' and when # we can demonstrate that they haven't changed in an arbitrary period of time -# we will consider the work completed. +# we will consider the work completed. # get the timestamp for the most recently changed file in directory '.' @@ -50,3 +50,4 @@ done # if we've reached this point then we've exceeded our hard-limit for waiting. # return w/an error. 
exit 1 + diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch index a008937b..fb099cf3 100644 --- a/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch @@ -6,32 +6,47 @@ #SBATCH --mem 16G #SBATCH --time 96:00:00 -#SBATCH --output tellread_%x-%A.out -#SBATCH --error tellread_%x-%A.err +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.err set -x -export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/tmp1 -mkdir -p ${TMPDIR} -export TMPDIR=$(mktemp -d) - -mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output - module load singularity_3.6.4 $HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ -i sequence_processing_pipeline/tests/data/sample_run_directories/150629_SN1001_0511_AH5L7GBCXX \ - -o sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output \ - -s $(echo CDPH-SAL__Salmonella__Typhi__MDL-143,CDPH-SAL_Salmonella_Typhi_MDL-144,CDPH-SAL_Salmonella_Typhi_MDL-145,CDPH-SAL_Salmonella_Typhi_MDL-146,CDPH-SAL_Salmonella_Typhi_MDL-147,CDPH-SAL_Salmonella_Typhi_MDL-148,CDPH-SAL_Salmonella_Typhi_MDL-149,CDPH-SAL_Salmonella_Typhi_MDL-150,CDPH-SAL_Salmonella_Typhi_MDL-151,CDPH-SAL_Salmonella_Typhi_MDL-152,CDPH-SAL_Salmonella_Typhi_MDL-153,CDPH-SAL_Salmonella_Typhi_MDL-154,CDPH-SAL_Salmonella_Typhi_MDL-155,CDPH-SAL_Salmonella_Typhi_MDL-156,CDPH-SAL_Salmonella_Typhi_MDL-157,CDPH-SAL_Salmonella_Typhi_MDL-158,CDPH-SAL_Salmonella_Typhi_MDL-159,CDPH-SAL_Salmonella_Typhi_MDL-160,CDPH-SAL_Salmonella_Typhi_MDL-161,CDPH-SAL_Salmonella_Typhi_MDL-162,CDPH-SAL_Salmonella_Typhi_MDL-163,CDPH-SAL_Salmonella_Typhi_MDL-164,CDPH-SAL_Salmonella_Typhi_MDL-165,CDPH-SAL_Salmonella_Typhi_MDL-166,CDPH-SAL_Salmonella_Typhi_MDL-167,CDPH-SAL_Salmonella_Typhi_MDL-168,P21_E_coli_ELI344,P21_E_coli_ELI345,P21_E_coli_ELI347,P21_E_coli_ELI348,P21_E_coli_ELI349,P21_E_coli_ELI350,P21_E_coli_ELI351,P21_E_coli_ELI352,P21_E_coli_ELI353,P21_E_coli_ELI354,P21_E_coli_ELI355,P21_E_coli_ELI357,P21_E_coli_ELI358,P21_E_coli_ELI359,P21_E_coli_ELI361,P21_E_coli_ELI362,P21_E_coli_ELI363,P21_E_coli_ELI364,P21_E_coli_ELI365,P21_E_coli_ELI366,P21_E_coli_ELI367,P21_E_coli_ELI368,P21_E_coli_ELI369,stALE_E_coli_A1_F21_I1_R1,stALE_E_coli_A2_F21_I1_R1,stALE_E_coli_A3_F18_I1_R1,stALE_E_coli_A3_F40_I1_R1,stALE_E_coli_A4_F21_I1_R1,stALE_E_coli_A4_F21_I1_R2,stALE_E_coli_A4_F42_I1_R1,stALE_E_coli_A5_F21_I1_R1,stALE_E_coli_A5_F42_I1_R1,stALE_E_coli_A6_F21_I1_R1,stALE_E_coli_A6_F43_I1_R1,stALE_E_coli_A7_F21_I1_R1,stALE_E_coli_A7_F42_I1_R1,stALE_E_coli_A8_F20_I1_R1,stALE_E_coli_A8_F42_I1_R1,stALE_E_coli_A9_F21_I1_R1,stALE_E_coli_A9_F44_I1_R1,stALE_E_coli_A10_F21_I1_R1,stALE_E_coli_A10_F43_I1_R1,stALE_E_coli_A10_F131_I1_R1,stALE_E_coli_A11_F21_I1_R1,stALE_E_coli_A11_F43_I1_R1,stALE_E_coli_A11_F119_I1_R1,stALE_E_coli_A12_F21_I1_R1,stALE_E_coli_A12_F43_I1_R1,stALE_E_coli_A12_F136_I1_R1,stALE_E_coli_A13_F20_I1_R1,stALE_E_coli_A13_F42_I1_R1,stALE_E_coli_A13_F121_I1_R1,stALE_E_coli_A14_F20_I1_R1,stALE_E_coli_A14_F42_I1_R1,stALE_E_coli_A14_F133_I1_R1,stALE_E_coli_A15_F21_I1_R1,stALE_E_coli_A15_F42_I1_R1,stALE_E_coli
_A15_F117_I1_R1,stALE_E_coli_A16_F20_I1_R1,stALE_E_coli_A16_F42_I1_R1,stALE_E_coli_A16_F134_I1_R1,stALE_E_coli_A17_F21_I1_R1,stALE_E_coli_A17_F118_I1_R1,stALE_E_coli_A18_F18_I1_R1,stALE_E_coli_A18_F39_I1_R1,stALE_E_coli_A18_F130_I1_R1,3A,4A,BLANK_40_12G,BLANK_40_12H,Pputida_JBEI__HGL_Pputida_107_BP6,Pputida_JBEI__HGL_Pputida_108_BP7,Pputida_JBEI__HGL_Pputida_109_BP8,Pputida_JBEI__HGL_Pputida_110_M2,Pputida_JBEI__HGL_Pputida_111_M5,Pputida_TALE__HGL_Pputida_112,Pputida_TALE__HGL_Pputida_113,Pputida_TALE__HGL_Pputida_114,Pputida_TALE__HGL_Pputida_115,Pputida_TALE__HGL_Pputida_116,Pputida_TALE__HGL_Pputida_117,Pputida_TALE__HGL_Pputida_118,Pputida_TALE__HGL_Pputida_119,Pputida_TALE__HGL_Pputida_120,Pputida_TALE__HGL_Pputida_121,Pputida_TALE__HGL_Pputida_122,Pputida_TALE__HGL_Pputida_123,Pputida_TALE__HGL_Pputida_124,Pputida_TALE__HGL_Pputida_125,Pputida_TALE__HGL_Pputida_126,Pputida_TALE__HGL_Pputida_127,Pputida_TALE__HGL_Pputida_128,Pputida_TALE__HGL_Pputida_129,Pputida_TALE__HGL_Pputida_130,Pputida_TALE__HGL_Pputida_131,Pputida_TALE__HGL_Pputida_132,Pputida_TALE__HGL_Pputida_133,Pputida_TALE__HGL_Pputida_134,Pputida_TALE__HGL_Pputida_135,Pputida_TALE__HGL_Pputida_136,Pputida_TALE__HGL_Pputida_137,Pputida_TALE__HGL_Pputida_138,Pputida_TALE__HGL_Pputida_139,Pputida_TALE__HGL_Pputida_140,Pputida_TALE__HGL_Pputida_141,Pputida_TALE__HGL_Pputida_142,Pputida_TALE__HGL_Pputida_143,Pputida_TALE__HGL_Pputida_144,Pputida_PALE__HGL_Pputida_145,Pputida_PALE__HGL_Pputida_146,Pputida_PALE__HGL_Pputida_147,Pputida_PALE__HGL_Pputida_148,Pputida_PALE__HGL_Pputida_149,Pputida_PALE__HGL_Pputida_150,Pputida_PALE__HGL_Pputida_151,Pputida_PALE__HGL_Pputida_152,Pputida_PALE__HGL_Pputida_153,Pputida_PALE__HGL_Pputida_154,Pputida_PALE__HGL_Pputida_155,Pputida_PALE__HGL_Pputida_156,Pputida_PALE__HGL_Pputida_157,Pputida_PALE__HGL_Pputida_158,Pputida_PALE__HGL_Pputida_159,Pputida_PALE__HGL_Pputida_160,Pputida_PALE__HGL_Pputida_161,Pputida_PALE__HGL_Pputida_162,Pputida_PALE__HGL_Pputida_163,Pputida_PALE__HGL_Pputida_164,Pputida_PALE__HGL_Pputida_165,Pputida_PALE__HGL_Pputida_166,Pputida_PALE__HGL_Pputida_167,Pputida_PALE__HGL_Pputida_168,Pputida_PALE__HGL_Pputida_169,Pputida_PALE__HGL_Pputida_170,Pputida_PALE__HGL_Pputida_171,Pputida_PALE__HGL_Pputida_172,Pputida_PALE__HGL_Pputida_173,Pputida_PALE__HGL_Pputida_174,Pputida_PALE__HGL_Pputida_175,Pputida_PALE__HGL_Pputida_176,JM-Metabolic__GN0_2005,JM-Metabolic__GN0_2007,JM-Metabolic__GN0_2009,JM-Metabolic__GN0_2094,JM-Metabolic__GN0_2099,JM-Metabolic__GN0_2148,JM-Metabolic__GN0_2165,JM-Metabolic__GN0_2169,JM-Metabolic__GN0_2172,JM-Metabolic__GN0_2175,JM-Metabolic__GN0_2183,JM-Metabolic__GN0_2215,JM-Metabolic__GN0_2254,JM-Metabolic__GN0_2277,JM-Metabolic__GN0_2290,JM-Metabolic__GN0_2337,JM-Metabolic__GN0_2317,JM-Metabolic__GN0_2354,JM-Metabolic__GN0_2375,JM-Metabolic__GN0_2380,JM-Metabolic__GN0_2393,JM-Metabolic__GN0_2404,5B,6A,BLANK_41_12G,BLANK_41_12H,Deoxyribose_PALE_ALE__MG1655_BOP27_4_14,Deoxyribose_PALE_ALE__MG1655_BOP27_4_23,Deoxyribose_PALE_ALE__MG1655_BOP27_4_48,Deoxyribose_PALE_ALE__MG1655_BOP27_6_21,Deoxyribose_PALE_ALE__MG1655_BOP27_6_35,Deoxyribose_PALE_ALE__MG1655_BOP27_10_13,Deoxyribose_PALE_ALE__MG1655_BOP27_10_28,Deoxyribose_PALE_ALE__MG1655_BOP27_10_51,Deoxyribose_PALE_ALE__MG1655_Lib4_18_19,Deoxyribose_PALE_ALE__MG1655_Lib4_18_59,Deoxyribose_PALE_ALE__MG1655_Lib4_18_35,Deoxyribose_PALE_ALE__MG1655_Lib4_20_16,Deoxyribose_PALE_ALE__MG1655_Lib4_20_43,Deoxyribose_PALE_ALE__MG1655_Lib4_20_71,Deoxyribose_PALE_ALE__MG1655_Lib4_22_16,Deoxyribose_PALE_ALE__MG1655_
Lib4_22_28,Deoxyribose_PALE_ALE__MG1655_Lib4_22_52,Deoxyribose_PALE_ALE__MG1655_Lib4_24_9,Deoxyribose_PALE_ALE__MG1655_Lib4_24_24,Deoxyribose_PALE_ALE__MG1655_Lib4_24_52,Deoxyribose_PALE_ALE__MG1655_Lib4_26_6,Deoxyribose_PALE_ALE__MG1655_Lib4_26_27,Deoxyribose_PALE_ALE__MG1655_Lib4_26_69,Deoxyribose_PALE_ALE__MG1655_Lib4_28_13,Deoxyribose_PALE_ALE__MG1655_Lib4_28_28,Deoxyribose_PALE_ALE__MG1655_Lib4_28_53,Deoxyribose_PALE_ALE__MG1655_Lib4_30_7,Deoxyribose_PALE_ALE__MG1655_Lib4_30_22,Deoxyribose_PALE_ALE__MG1655_Lib4_30_60,Deoxyribose_PALE_ALE__MG1655_Lib4_32_6,Deoxyribose_PALE_ALE__MG1655_Lib4_32_20,Deoxyribose_PALE_ALE__MG1655_Lib4_32_56,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_24,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_57,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_69,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_50,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_61,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_22,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_36,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_46,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_41,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_51,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_58,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_64,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_55,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_63,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_46,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_51,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_49,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_57,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_24,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_42,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_62,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_21,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_41,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_50,JM-Metabolic__GN02514,JM-Metabolic__GN02529,JM-Metabolic__GN02531,JM-Metabolic__GN02567,JM-Metabolic__GN02590,JM-Metabolic__GN02657,JM-Metabolic__GN02748,JM-Metabolic__GN02766,JM-Metabolic__GN02769,JM-Metabolic__GN02787,JM-Metabolic__GN03132,JM-Metabolic__GN03218,JM-Metabolic__GN03252,JM-Metabolic__GN03409,JM-Metabolic__GN04014,JM-Metabolic__GN04094,JM-Metabolic__GN04255,JM-Metabolic__GN04306,JM-Metabolic__GN04428,JM-Metabolic__GN04488,JM-Metabolic__GN04540,JM-Metabolic__GN04563,JM-Metabolic__GN04612,JM-Metabolic__GN04665,JM-Metabolic__GN04682,JM-Metabolic__GN05002,JM-Metabolic__GN05109,JM-Metabolic__GN05128,JM-Metabolic__GN05367,JM-Metabolic__GN05377,7A,8A,BLANK_42_12G,BLANK_42_12H,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0326,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0327,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0328,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0329,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0330,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0352,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0353,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0354,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0355,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0356,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0357,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0364,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0366,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0367,JM-MEC__Staphylococcus_aur
eusstrain_BERTI-B0368,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0369,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0370,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0371,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0372,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0373,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0374,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0375,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0376,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0377,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0378,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0380,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0381,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0382,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0383,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0384,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0385,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0386,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0387,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0388,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0389,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0390,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0391,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0392,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0393,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0394,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0395,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0396,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0397,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0398,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0399,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0400,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0401,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0402,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0403,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0404,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0405,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0406,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0407,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0408,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0409,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0417,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0418,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0419,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0420,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0421,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0473,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0474,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0483,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0484,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0485,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0486,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0516,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0517,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0518,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0519,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0520,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0521,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0522,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0523,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0524,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0525,JM-MEC__Staphylococcus_aureusstrain_BERTI-R08624,JM-MEC__Staphylococcus_aureusstrain_BERTI-R08704,JM-MEC__Staphylococcus_aureusstrain_BERTI-R10727,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11044,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11078,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11101,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11102,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11103,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11135,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11153,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11154,JM-Metabolic__GN02424,JM-Metabolic__GN02446,JM-Metabolic__GN02449,
JM-Metabolic__GN02487,JM-Metabolic__GN02501,ISB,GFR,BLANK_43_12G,BLANK_43_12H,RMA_KHP_rpoS_Mage_Q97D,RMA_KHP_rpoS_Mage_Q97L,RMA_KHP_rpoS_Mage_Q97N,RMA_KHP_rpoS_Mage_Q97E,JBI_KHP_HGL_021,JBI_KHP_HGL_022,JBI_KHP_HGL_023,JBI_KHP_HGL_024,JBI_KHP_HGL_025,JBI_KHP_HGL_026,JBI_KHP_HGL_027,JBI_KHP_HGL_028_Amitesh_soxR,JBI_KHP_HGL_029_Amitesh_oxyR,JBI_KHP_HGL_030_Amitesh_soxR_oxyR,JBI_KHP_HGL_031_Amitesh_rpoS,BLANK1_1A,BLANK1_1B,BLANK1_1C,BLANK1_1D,BLANK1_1E,BLANK1_1F,BLANK1_1G,BLANK1_1H,AP581451B02,EP256645B01,EP112567B02,EP337425B01,LP127890A01,EP159692B04,EP987683A01,AP959450A03,SP464350A04,C9,ep256643b01,EP121011B01,AP616837B04,SP506933A04,EP159695B01,EP256644B01,SP511289A02,EP305735B04,SP415030A01,AP549681B02,AP549678B01,EP260544B04,EP202452B01,EP282276B04,SP531696A04,SP515443A04,SP515763A04,EP184255B04,SP503615A02,EP260543B04,EP768748A04,AP309872B03,AP568785B04,EP721390A04,EP940013A01,EP291979B04,EP182065B04,EP128904B02,EP915769A04,SP464352A03,SP365864A04,SP511294A04,EP061002B01,SP410793A01,SP232077A04,EP128910B01,AP531397B04,EP043583B01,EP230245B01,EP606652B04,EP207041B01,EP727972A04,EP291980B04,EP087938B02,SP471496A04,SP573823A04,EP393718B01,SP612496A01,EP032410B02,EP073216B01,EP410046B01,SP561451A04,EP320438B01,SP612495A04,EP446604B03,EP446602B01,EP182243B02,EP333541B04,EP238034B01,AP298002B02,EP455759B04,EP207042B04,LP128479A01,LP128476A01,EP316863B03,C20,lp127896a01,SP491907A02,EP182060B03,EP422407B01,SP573859A04,SP584547A02,EP182346B04,AP668631B04,EP451428B04,LP128538A01,SP490298A02,SP573860A01,EP032412B02,EP163771B01,LP169879A01,EP729433A02,EP447940B04,SP584551A08,EP216516B04,EP023808B02,BLANK2_2A,BLANK2_2B,BLANK2_2C,BLANK2_2D,BLANK2_2E,BLANK2_2F,BLANK2_2G,BLANK2_2H,SP573843A04,EP683835A01,SP573824A04,SP335002A04,SP478193A02,SP232311A04,SP415021A02,SP231630A02,SP641029A02,SP232310A04,EP617442B01,EP587478B04,EP447928B04,EP587475B04,EP675042B01,EP554513B02,EP702221B04,AP568787B02,EP054632B01,EP121013B01,EP649418A02,EP573313B01,LP154981A01,AP470859B01,LP154986A01,AP732307B04,EP533426B03,EP587476B04,AP696363B02,EP587477B04,SP683466A02,EP554518B04,EP533429B04,EP431570B01,EP202095B04,EP504030B04,EP207036B01,EP393717B01,SP491898A02,EP484973B04,EP479794B02,EP554515B04,SP631994A04,EP921593A04,AP787247B04,EP090129B04,EP447975B02,EP212214B01,EP410042B01,SP404409A02,SP247340A04,AP029018B01,EP872341A01,AP062219B03,EP790020A02,EP808112A04,SP404403A02,EP073160B01,EP012991B03,SP317297A02,EP656055A04,EP649623A01,EP790019A01,SP257519A04,EP808104A01,EP808106A01,SP231629A02,EP675044A01,EP657260A01,EP808110A04,AP032413B04,EP843906A04,AP173305B04,SP231628A02,AP173301B04,SP404405A02,EP649653A04,EP718687A04,AP905750A02,EP738468A01,C6,EP890157A02,SP353893A02,EP944059A02,EP970005A01,EP927461A04,EP808111A03,EP927459A04,SP317293A02,SP235186A04,SP399724A04,EP738469A01,SP284095A03,C5,EP337325B04,EP759450A04,BLANK3_3A,BLANK3_3B,BLANK3_3C,BLANK3_3D,BLANK3_3E,BLANK3_3F,BLANK3_3G,BLANK3_3H,AP006367B02,EP929277A02,AP324642B04,EP786631A04,EP657385A04,SP235189A01,EP448041B04,SP231631A02,SP280481A02,AP032412B04,EP649737A03,AP967057A04,EP876243A04,SP229387A04,EP667743A04,SP246941A01,AP745799A04,SP205732A02,SP230382A04,SP230380A02,SP230381A01,SP205754A01,EP606662B04,AP780167B02,EP447927B04,C18,LP191039A01,EP606663B04,EP573296B01,EP447926B04,LP127767A01,EP479266B04,LP128543A01,EP479270B03,EP921594A04,EP554501B04,EP542577B04,EP487995B04,EP542578B04,EP573310B01,EP244366B01,EP533389B03,EP244360B01,AP911328B01,AP481403B02,22_001_801_552_503_00,EP372981B04,EP447929B04,SP573849A04,SP577399A02,EP606656B03,LP166715A01,AP668628B04,C14,E
P446610B02,EP339061B02,SP681591A04,EP393712B02,EP410041B01,SP453872A01,22_001_710_503_791_00,LP128540A01,EP339053B02,EP617443B01,EP190307B01,AP795068B04,LP128541A01,EP584756B04,SP284096A02,EP431562B04,EP685640B01,EP339059B02,EP431575B01,EP379938B01,EP529635B02,EP554506B04,EP455757B04,SP491900A02,LP196272A01,SP704319A04,EP617441B01,AP687591B04,SP640978A02,EP981129A02,EP455763B04,EP339057B02,SP491897A02,EP980752B04,LP128539A01,EP996831B04,EP273332B04,EP483291B04,EP393715B01,EP617440B01,EP729434A01,SP645141A03,BLANK4_4A,BLANK4_4B,BLANK4_4C,BLANK4_4D,BLANK4_4E,BLANK4_4F,BLANK4_4G,BLANK4_4H,SP232114A04,EP393714B01,EP533388B01,EP724905B01,EP282108B01,EP282107B01,EP001625B01,EP073209B02,SP232079A01,EP772145A02,AP771472A04,AP223470B01,SP404412A02,EP772143A02,SP408629A01,EP749735A07,EP846485A01,EP808109A01,SP416130A04,EP882752A01,AP953594A02,AP046324B02,AP891020A04,EP790023A01,EP657386A01,EP805337A01,EP927458A04,AP173299B04,EP768164A02,EP886422A01,AP103463B01,AP744361A02,AP065292B01,SP257517A04,EP790021A04,EP675075A04,SP388683A02,SP232309A01,EP899038A04,EP636802A01,AP046327B02,EP905975A04,SP410796A02,EP784608A01,EP808105A01,SP331134A04,EP718688A01,SP232270A02,EP970001A01,EP001624B01,EP868682A01,EP927462A02,C3,EP890158A02,EP023801B04,EP400447B04,EP385379B01,EP385387B01,EP385384B01,SP754514A04,SP415025A01,SP415023A02,EP400448B04,EP479894B04 | tr -d '"') \ - -g $(echo NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,N
ONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE | tr -d '"') \ + -o sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob \ + -s $(echo C501,C509,C502,C510,C503,C511,C504,C512,C505,C513,C506,C514,C507,C515,C508,C516,C517,C525,C518,C526,C519,C527,C520,C528,C521,C529,C522,C530,C523,C531,C524,C532,C533,C541,C534,C542,C535,C543,C536,C544,C537,C545,C538,C546,C539,C547,C540,C548,C549,C557,C550,C558,C551,C559,C552,C560,C553,C561,C554,C562,C555,C563,C556,C564,C565,C573,C566,C574,C567,C575,C568,C576,C569,C577,C570,C578,C571,C579,C572,C580,C581,C589,C582,C590,C583,C591,C584,C592,C585,C593,C586,C594,C587,C595,C588,C596 | tr -d '"') \ + -g $(echo NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE | tr -d '"') \ -j ${SLURM_JOB_CPUS_PER_NODE} \ -l s_1 -if [[ -d sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/Full ]]; then - echo "Run appears successful" -elif [[ -d sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/1_demult/Full ]]; then - echo "Run appears unsuccessful but has output" - exit 1 -else - echo "Run appears unsuccessful" - exit 1 -fi \ No newline at end of file +# instead of testing for the presence of 'sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full', we will review +# the changed timestamps for all the files in 
'sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full' and when +# we can demonstrate that they haven't changed in an arbitrary period of time +# we will consider the work completed. + +# get the timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 diff --git a/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv new file mode 100644 index 00000000..105330fd --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv @@ -0,0 +1,135 @@ +[Header],,,,,,,, +IEMFileVersion,1,,,,,,, +SheetType,tellseq_metag,,,,,,, +SheetVersion,10,,,,,,, +Investigator Name,Knight,,,,,,, +Experiment Name,RKL0151,,,,,,, +Date,5/6/24,,,,,,, +Workflow,GenerateFASTQ,,,,,,, +Application,FASTQ Only,,,,,,, +Assay,Metagenomic,,,,,,, +Description,,,,,,,, +Chemistry,Default,,,,,,, +,,,,,,,, +[Reads],,,,,,,, +151,,,,,,,, +151,,,,,,,, +,,,,,,,, +[Settings],,,,,,,, +ReverseComplement,0,,,,,,, +,,,,,,,, +[Data],,,,,,,, +Sample_ID,Sample_Name,Sample_Plate,well_id_384,barcode_id,Sample_Project,Well_description,Lane, +LS_8_10_2013_SRE,LS.8.10.2013.SRE,LS_Donor_SS_Samples_P1,A1,C501,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013.SRE,1, +LS_12_17_2014_SRE,LS.12.17.2014.SRE,LS_Donor_SS_Samples_P1,B1,C509,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.17.2014.SRE,1, +LS_4_4_2015_SRE,LS.4.4.2015.SRE,LS_Donor_SS_Samples_P1,C1,C502,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.4.2015.SRE,1, +LS_2_23_2015_SRE,LS.2.23.2015.SRE,LS_Donor_SS_Samples_P1,D1,C510,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.23.2015.SRE,1, +LS_9_28_2014_SRE,LS.9.28.2014.SRE,LS_Donor_SS_Samples_P1,E1,C503,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.28.2014.SRE,1, +LS_12_14_2013_SRE,LS.12.14.2013.SRE,LS_Donor_SS_Samples_P1,F1,C511,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.14.2013.SRE,1, +LS_4_7_2013_SRE,LS.4.7.2013.SRE,LS_Donor_SS_Samples_P1,G1,C504,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.7.2013.SRE,1, +LS_7_14_2013_SRE,LS.7.14.2013.SRE,LS_Donor_SS_Samples_P1,H1,C512,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.14.2013.SRE,1, +LS_10_27_2013_SRE,LS.10.27.2013.SRE,LS_Donor_SS_Samples_P1,I1,C505,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.10.27.2013.SRE,1, +LS_1_19_2014_SRE,LS.1.19.2014.SRE,LS_Donor_SS_Samples_P1,J1,C513,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014.SRE,1, +LS_9_3_2013_SRE,LS.9.3.2013.SRE,LS_Donor_SS_Samples_P1,K1,C506,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.3.2013.SRE,1, +LS_2_25_2013_SRE,LS.2.25.2013.SRE,LS_Donor_SS_Samples_P1,L1,C514,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.25.2013.SRE,1, 
+LS_7_26_2015_SRE,LS.7.26.2015.SRE,LS_Donor_SS_Samples_P1,M1,C507,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.26.2015.SRE,1, +LS_2_17_2014_SRE,LS.2.17.2014.SRE,LS_Donor_SS_Samples_P1,N1,C515,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.17.2014.SRE,1, +LS_6_29_2015_SRE,LS.6.29.2015.SRE,LS_Donor_SS_Samples_P1,O1,C508,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015.SRE,1, +LS_3_24_2015_SRE,LS.3.24.2015.SRE,LS_Donor_SS_Samples_P1,P1,C516,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015.SRE,1, +LS_1_6_2015_SRE,LS.1.6.2015.SRE,LS_Donor_SS_Samples_P1,A2,C517,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.6.2015.SRE,1, +T_LS_7_15_15B_SRE,T.LS.7.15.15B.SRE,LS_Donor_SS_Samples_P1,B2,C525,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B.SRE,1, +LS_6_9_2013_SRE,LS.6.9.2013.SRE,LS_Donor_SS_Samples_P1,C2,C518,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.9.2013.SRE,1, +Person A_SRE,Person A.SRE,LS_Donor_SS_Samples_P1,D2,C526,Tellseq_Shortread_Metagenomic_Analysis_10283,Person A.SRE,1, +LS_8_22_2014_R2_SRE,LS.8.22.2014.R2.SRE,LS_Donor_SS_Samples_P1,E2,C519,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R2.SRE,1, +Person B_SRE,Person B.SRE,LS_Donor_SS_Samples_P1,F2,C527,Tellseq_Shortread_Metagenomic_Analysis_10283,Person B.SRE,1, +LS_8_22_2014_R1_SRE,LS.8.22.2014.R1.SRE,LS_Donor_SS_Samples_P1,G2,C520,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R1.SRE,1, +Person C_SRE,Person C.SRE,LS_Donor_SS_Samples_P1,H2,C528,Tellseq_Shortread_Metagenomic_Analysis_10283,Person C.SRE,1, +LS_12_28_2011_SRE,LS.12.28.2011.SRE,LS_Donor_SS_Samples_P1,I2,C521,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.28.2011.SRE,1, +Person D_SRE,Person D.SRE,LS_Donor_SS_Samples_P1,J2,C529,Tellseq_Shortread_Metagenomic_Analysis_10283,Person D.SRE,1, +LS_5_4_2014_SRE,LS.5.4.2014.SRE,LS_Donor_SS_Samples_P1,K2,C522,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.5.4.2014.SRE,1, +45208_1_1,45208.1.1,UROBIOME_TEST_MF_SAMPLES_P2,L2,C530,Tellseq_Shortread_Metagenomic_Analysis_10283,45208.1.1,1, +LS_11_6_2012_SRE,LS.11.6.2012.SRE,LS_Donor_SS_Samples_P1,M2,C523,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.6.2012.SRE,1, +45248_2_2,45248.2.2,UROBIOME_TEST_MF_SAMPLES_P2,N2,C531,Tellseq_Shortread_Metagenomic_Analysis_10283,45248.2.2,1, +LS_4_3_2012_SRE,LS.4.3.2012.SRE,LS_Donor_SS_Samples_P1,O2,C524,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.3.2012.SRE,1, +45261_2_1,45261.2.1,UROBIOME_TEST_MF_SAMPLES_P2,P2,C532,Tellseq_Shortread_Metagenomic_Analysis_10283,45261.2.1,1, +45272_11_2,45272.11.2,UROBIOME_TEST_MF_SAMPLES_P2,A3,C533,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.11.2,1, +T_LS_7_12_15A,T.LS.7.12.15A,Larry_Smarr_Plus_Donor_Samples_P3,B3,C541,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.12.15A,1, +45316_8_1,45316.8.1,UROBIOME_TEST_MF_SAMPLES_P2,C3,C534,Tellseq_Shortread_Metagenomic_Analysis_10283,45316.8.1,1, +T_LS_7_8_15A,T.LS.7.8.15A,Larry_Smarr_Plus_Donor_Samples_P3,D3,C542,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.8.15A,1, +45327_7_2,45327.7.2,UROBIOME_TEST_MF_SAMPLES_P2,E3,C535,Tellseq_Shortread_Metagenomic_Analysis_10283,45327.7.2,1, +LS_8_10_2013,LS.8.10.2013,LS_Time_Series_ABSQ_P4,F3,C543,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013,1, +45272_1_swab_2,45272.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,G3,C536,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.1.swab.2,1, +LS_6_29_2015,LS.6.29.2015,LS_Time_Series_ABSQ_P4,H3,C544,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015,1, 
+45326_1_swab_2,45326.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,I3,C537,Tellseq_Shortread_Metagenomic_Analysis_10283,45326.1.swab.2,1, +LS_3_8_2015,LS.3.8.2015,LS_Time_Series_ABSQ_P4,J3,C545,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.8.2015,1, +T_LS_7_19_15A,T.LS.7.19.15A,Larry_Smarr_Plus_Donor_Samples_P3,K3,C538,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15A,1, +LS_4_29_2013,LS.4.29.2013,LS_Time_Series_ABSQ_P4,L3,C546,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.29.2013,1, +T_LS_7_15_15B,T.LS.7.15.15B,Larry_Smarr_Plus_Donor_Samples_P3,M3,C539,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B,1, +LS_11_16_2014,LS.11.16.2014,LS_Time_Series_ABSQ_P4,N3,C547,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.16.2014,1, +T_LS_7_19_15B,T.LS.7.19.15B,Larry_Smarr_Plus_Donor_Samples_P3,O3,C540,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15B,1, +LS_1_19_2014,LS.1.19.2014,LS_Time_Series_ABSQ_P4,P3,C548,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014,1, +LS_3_24_2015,LS.3.24.2015,LS_Time_Series_ABSQ_P4,A4,C549,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015,1, +LS_2_8_2013,LS.2.8.2013,LS_Time_Series_ABSQ_P4,B4,C557,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.8.2013,1, +LS_11_10_2013,LS.11.10.2013,LS_Time_Series_ABSQ_P4,C4,C550,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.10.2013,1, +Marine_Sediment_0_2cm_R1,Marine.Sediment.0.2cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,D4,C558,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.0.2cm.R1,1, +LS_3_23_2014,LS.3.23.2014,LS_Time_Series_ABSQ_P4,E4,C551,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.23.2014,1, +Marine_Sediment_5_7cm_R1,Marine.Sediment.5.7cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,F4,C559,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.5.7cm.R1,1, +LS_1_14_2015,LS.1.14.2015,LS_Time_Series_ABSQ_P4,G4,C552,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.14.2015,1, +Marine_Sediment_10_12cm_R2,Marine.Sediment.10.12cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,H4,C560,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.10.12cm.R2,1, +LS_8_25_2014,LS.8.25.2014,LS_Time_Series_ABSQ_P4,I4,C553,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.25.2014,1, +Marine_Sediment_15_17cm_R1,Marine.Sediment.15.17cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,J4,C561,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.15.17cm.R1,1, +LS_1_26_2013,LS.1.26.2013,LS_Time_Series_ABSQ_P4,K4,C554,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.26.2013,1, +Marine_Sediment_20_22cm_R1,Marine.Sediment.20.22cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,L4,C562,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.20.22cm.R1,1, +LS_6_16_2014,LS.6.16.2014,LS_Time_Series_ABSQ_P4,M4,C555,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.16.2014,1, +Marine_Sediment_25_27cm_R2,Marine.Sediment.25.27cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,N4,C563,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.25.27cm.R2,1, +LS_7_27_2014,LS.7.27.2014,LS_Time_Series_ABSQ_P4,O4,C556,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.27.2014,1, +Marine_Sediment_30_32cm_R3,Marine.Sediment.30.32cm.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,P4,C564,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.30.32cm.R3,1, +Person_A_R3,Person.A.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,A5,C565,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.A.R3,1, 
+Soil_SynCom_T4_2_Tube5,Soil.SynCom.T4.2.Tube5,16_member_community_native_soil_P6,B5,C573,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.2.Tube5,1, +Person_B_R2,Person.B.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,C5,C566,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.B.R2,1, +A21,A21,Tumor_Community_P7,D5,C574,Tellseq_Shortread_Metagenomic_Analysis_10283,A21,1, +Person_C_R4,Person.C.R4,MarineSediment_Donor_LarrySmarr_NoProK_P5,E5,C567,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.C.R4,1, +A23,A23,Tumor_Community_P7,F5,C575,Tellseq_Shortread_Metagenomic_Analysis_10283,A23,1, +Person_D_R2,Person.D.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,G5,C568,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.D.R2,1, +A27,A27,Tumor_Community_P7,H5,C576,Tellseq_Shortread_Metagenomic_Analysis_10283,A27,1, +Soil_SynCom_T1_2_Tube1,Soil.SynCom.T1.2.Tube1,16_member_community_native_soil_P6,I5,C569,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T1.2.Tube1,1, +A30,A30,Tumor_Community_P7,J5,C577,Tellseq_Shortread_Metagenomic_Analysis_10283,A30,1, +Soil _SynCom_T2_2_Tube2,Soil .SynCom.T2.2.Tube2,16_member_community_native_soil_P6,K5,C570,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil .SynCom.T2.2.Tube2,1, +A31,A31,Tumor_Community_P7,L5,C578,Tellseq_Shortread_Metagenomic_Analysis_10283,A31,1, +Soil_SynCom_T3_2_Tube3,Soil.SynCom.T3.2.Tube3,16_member_community_native_soil_P6,M5,C571,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T3.2.Tube3,1, +S1_T1_A,S1.T1.A,Tumor_Community_P7,N5,C579,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.A,1, +Soil_SynCom_T4_1_Tube4,Soil.SynCom.T4.1.Tube4,16_member_community_native_soil_P6,O5,C572,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.1.Tube4,1, +S2_T1_B_A,S2.T1.B.A,Tumor_Community_P7,P5,C580,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.B.A,1, +S2_T1_01BH1_Y_A,S2.T1.01BH1.Y.A,Tumor_Community_P7,A6,C581,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.01BH1.Y.A,1, +S1_T1_1CIM_A,S1.T1.1CIM.A,Tumor_Community_P7,B6,C589,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.1CIM.A,1, +S2_MT1_1HBI_Y_A,S2.MT1.1HBI.Y.A,Tumor_Community_P7,C6,C582,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.1HBI.Y.A,1, +S1_M1_B_1CIM_A,S1.M1.B.1CIM.A,Tumor_Community_P7,D6,C590,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.M1.B.1CIM.A,1, +S1_T1_B_LBM_A,S1.T1.B.LBM.A,Tumor_Community_P7,E6,C583,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.B.LBM.A,1, +BLANK_K15_cancer_patient,BLANK.K15.cancer.patient,Tumor_Community_P7,F6,C591,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.K15.cancer.patient,1, +S2_MT1_LBM_A,S2.MT1.LBM.A,Tumor_Community_P7,G6,C584,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.LBM.A,1, +BLANK_M15_cancer_patient,BLANK.M15.cancer.patient,Tumor_Community_P7,H6,C592,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.M15.cancer.patient,1, +S2_T1_A,S2.T1.A,Tumor_Community_P7,I6,C585,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.A,1, +BLANK_O15_cancer_patient,BLANK.O15.cancer.patient,Tumor_Community_P7,J6,C593,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.O15.cancer.patient,1, +1CIM_M_CNTL_A,1CIM.M.CNTL.A,Tumor_Community_P7,K6,C586,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.M.CNTL.A,1, +BLANK_A17_cancer_patient,BLANK.A17.cancer.patient,Tumor_Community_P7,L6,C594,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.A17.cancer.patient,1, +1CIM_G_CNTL_A,1CIM.G.CNTL.A,Tumor_Community_P7,M6,C587,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.G.CNTL.A,1, 
+BLANK_C17_cancer_patient,BLANK.C17.cancer.patient,Tumor_Community_P7,N6,C595,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.C17.cancer.patient,1, +GC_1HCOM_A,GC.1HCOM.A,Tumor_Community_P7,O6,C588,Tellseq_Shortread_Metagenomic_Analysis_10283,GC.1HCOM.A,1, +BLANK_E17_cancer_patient,BLANK.E17.cancer.patient,Tumor_Community_P7,P6,C596,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.E17.cancer.patient,1, +,,,,,,,, +[Bioinformatics],,,,,,,, +Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates +Tellseq_Shortread_Metagenomic_Analysis_10283,10283,TRUE,GATCGGAAGAGCACACGTCTGAACTCCAGTCAC,GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT,TRUE,tellseq,tellseq metagenomics,FALSE +,,,,,,,, +[Contact],,,,,,,, +Sample_Project,Email,,,,,,, +Tellseq_Shortread_Metagenomic_Analysis_10283,cbrenchy@gmail.com,,,,,,, +,,,,,,,, +[SampleContext],,,,,,,, +sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,,,, +BLANK.K15.cancer.patient,control blank,10283,,,,,, +BLANK.M15.cancer.patient,control blank,10283,,,,,, +BLANK.O15.cancer.patient,control blank,10283,,,,,, +BLANK.A17.cancer.patient,control blank,10283,,,,,, +BLANK.C17.cancer.patient,control blank,10283,,,,,, +BLANK.E17.cancer.patient,control blank,10283,,,,,, \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py index 801947e8..440192c8 100644 --- a/sequence_processing_pipeline/tests/test_TellReadJob.py +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -23,7 +23,9 @@ def setUp(self): self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') # TODO: Revisit w/a proper sample-sheet once spec is near finalized. 
- self.sample_sheet_path = self.path('data', 'good-sample-sheet.csv') + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') self.queue_name = "qiita" self.node_count = "1" @@ -48,9 +50,8 @@ def test_creation(self): self.sample_sheet_path, self.queue_name, self.node_count, self.wall_time_limit, self.jmem, self.modules_to_load, self.qiita_job_id, - self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, - self.cores_per_task) + self.reference_base, self.reference_map, + self.sing_script_path, self.cores_per_task) job._generate_job_script() @@ -61,9 +62,6 @@ def test_creation(self): exp_lines = f.readlines() for obs_line, exp_line in zip(obs_lines, exp_lines): - print("OBS: %s" % obs_line) - print("EXP: %s" % exp_line) - print("") self.assertEqual(obs_line, exp_line) diff --git a/sequence_processing_pipeline/tests/test_commands.py b/sequence_processing_pipeline/tests/test_commands.py index f58bb176..3919ef43 100644 --- a/sequence_processing_pipeline/tests/test_commands.py +++ b/sequence_processing_pipeline/tests/test_commands.py @@ -16,12 +16,12 @@ def test_split_similar_size_bins(self, glob, stat): class MockStat: st_size = 2 ** 28 # 256MB - mockglob = ['/foo/bar/a_R1_.fastq.gz', - '/foo/bar/b_R2_.fastq.gz', - '/foo/bar/a_R2_.fastq.gz', - '/foo/baz/c_R2_.fastq.gz', - '/foo/baz/c_R1_.fastq.gz', - '/foo/bar/b_R1_.fastq.gz'] + mockglob = ['/foo/bar/a_R1_001.fastq.gz', + '/foo/bar/b_R2_001.fastq.gz', + '/foo/bar/a_R2_001.fastq.gz', + '/foo/baz/c_R2_001.fastq.gz', + '/foo/baz/c_R1_001.fastq.gz', + '/foo/bar/b_R1_001.fastq.gz'] with TemporaryDirectory() as tmp: exp = (2, 1073741824) @@ -30,9 +30,12 @@ class MockStat: obs = split_similar_size_bins('foo', 1, tmp + '/prefix') self.assertEqual(obs, exp) - exp_1 = ('/foo/bar/a_R1_.fastq.gz\t/foo/bar/a_R2_.fastq.gz\tbar\n' - '/foo/bar/b_R1_.fastq.gz\t/foo/bar/b_R2_.fastq.gz\tbar\n') - exp_2 = '/foo/baz/c_R1_.fastq.gz\t/foo/baz/c_R2_.fastq.gz\tbaz\n' + exp_1 = ('/foo/bar/a_R1_001.fastq.gz\t/foo/bar/a_R2_001.fastq.gz' + '\tbar\n' + '/foo/bar/b_R1_001.fastq.gz\t/foo/bar/b_R2_001.fastq.gz' + '\tbar\n') + exp_2 = ('/foo/baz/c_R1_001.fastq.gz\t/foo/baz/c_R2_001.fastq.gz' + '\tbaz\n') obs_1 = open(tmp + '/prefix-1').read() self.assertEqual(obs_1, exp_1) @@ -71,9 +74,16 @@ def test_demux(self): demux(id_map, infile, tmp, task, maxtask) - obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1.fastq.gz'), + foo = join(tmp, 'Project_12345') + from os import walk + for root, dirs, files in walk(foo): + for _file in files: + _path = join(root, _file) + print(_path) + + obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1_001.fastq.gz'), 'rt').read() - obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2.fastq.gz'), + obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2_001.fastq.gz'), 'rt').read() exp = '\n'.join(exp_data_r1) + '\n' self.assertEqual(obs_r1, exp) @@ -81,8 +91,8 @@ def test_demux(self): exp = '\n'.join(exp_data_r2) + '\n' self.assertEqual(obs_r2, exp) - self.assertFalse(os.path.exists(join(tmp, 'a_R1.fastq.gz'))) - self.assertFalse(os.path.exists(join(tmp, 'a_R2.fastq.gz'))) + self.assertFalse(os.path.exists(join(tmp, 'a_R1_001.fastq.gz'))) + self.assertFalse(os.path.exists(join(tmp, 'a_R2_001.fastq.gz'))) if __name__ == '__main__': diff --git a/sequence_processing_pipeline/tests/test_util.py b/sequence_processing_pipeline/tests/test_util.py index 136dc9a0..e5073101 100644 --- a/sequence_processing_pipeline/tests/test_util.py +++ b/sequence_processing_pipeline/tests/test_util.py @@ 
-4,24 +4,18 @@ class TestUtil(unittest.TestCase): def test_iter_paired_files(self): - tests = [(['a_R1_foo', - 'b_R2_bar', - 'a_R2_baz', - 'b_R1_bing'], - [('a_R1_foo', 'a_R2_baz'), - ('b_R1_bing', 'b_R2_bar')]), - (['a.R1.foo', - 'b.R2.bar', - 'a.R2.baz', - 'b.R1.bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b.R1.bing', 'b.R2.bar')]), - (['a.R1.foo', - 'b_R2_bar', - 'a.R2.baz', - 'b_R1_bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b_R1_bing', 'b_R2_bar')])] + # tuples of randomly ordered fastq files and thier expected + # sorted and organized output from iter_paired_files(). + + # underscore filenames updated to require '_001.fastq.gz'. + # legacy dot filenames test remains as-is. + tests = [(['b_R2_001.fastq.gz', 'a_R1_001.fastq.gz', + 'a_R2_001.fastq.gz', 'b_R1_001.fastq.gz'], + [('a_R1_001.fastq.gz', 'a_R2_001.fastq.gz'), + ('b_R1_001.fastq.gz', 'b_R2_001.fastq.gz')]), + (['a.R1.foo', 'b.R2.bar', 'a.R2.baz', 'b.R1.bing'], + [('a.R1.foo', 'a.R2.baz'), ('b.R1.bing', 'b.R2.bar')])] + for files, exp in tests: obs = list(iter_paired_files(files)) self.assertEqual(obs, exp) @@ -42,7 +36,7 @@ def test_iter_paired_files_bad_pair(self): list(iter_paired_files(files)) def test_iter_paired_files_mismatch_prefix(self): - files = ['a_R1_foo', 'ab_R2_foo'] + files = ['a_R1_001.fastq.gz', 'ab_R2_001.fastq.gz'] with self.assertRaisesRegex(ValueError, "Mismatch prefixes"): list(iter_paired_files(files)) diff --git a/sequence_processing_pipeline/util.py b/sequence_processing_pipeline/util.py index c5b3cdef..e19bf98a 100644 --- a/sequence_processing_pipeline/util.py +++ b/sequence_processing_pipeline/util.py @@ -1,17 +1,18 @@ import re -#PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') - -# The above will truncate on the first _R1_ found, which only works when _R1_ or _R2_ -# appears exactly once in a file path. When the wet-lab incorporates these same strings -# in their sample-names as descriptive metadata, this assumption is broken. -# For all raw fastq files being used as input into NuQCJob, we can assume they end -# in the following convention. Per Illumina spec, all fastq files end in _001 and we -# preserve this convention even at the cost of renaming output files from TRIntegrateJob. -# PAIR_DOT is kept as is, but may be removed later because for the purposes of SPP, no input -# should ever be named with dots instead of underscores. -PAIR_UNDERSCORE = (re.compile(r'_R1_001.fastq.gz'), '_R1_001.fastq.gz', '_R2_001.fastq.gz') +# PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +# The above will truncate on the first _R1_ found, which only works when _R1_ +# or _R2_ appears exactly once in a file path. When the wet-lab incorporates +# these same strings in their sample-names as descriptive metadata, this +# assumption is broken. For all raw fastq files being used as input into +# NuQCJob, we can assume they end in the following convention. Per Illumina +# spec, all fastq files end in _001 and we preserve this convention even at +# the cost of renaming output files from TRIntegrateJob. +# PAIR_DOT is kept as is, but may be removed later because for the purposes of +# SPP, no input should ever be named with dots instead of underscores. 
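+# Illustrative example (hypothetical filenames): a sample named
+# 'stool_R1_sub10' yields 'stool_R1_sub10_S1_L001_R1_001.fastq.gz', which
+# contains '_R1_' twice. Matching on the full '_R1_001.fastq.gz' suffix pairs
+# it with 'stool_R1_sub10_S1_L001_R2_001.fastq.gz' instead of truncating at
+# the first embedded '_R1_'.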
+PAIR_UNDERSCORE = (re.compile(r'_R1_001.fastq.gz'), + '_R1_001.fastq.gz', '_R2_001.fastq.gz') PAIR_DOT = (re.compile(r'\.R1\.'), '.R1.', '.R2.') PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT) From fd1809b893d7b6ee3d0edbfc8d0fd2e9cc0927bc Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 19 Nov 2024 15:21:42 -0800 Subject: [PATCH 31/47] Update setup.py to point to merged metapool updates --- setup.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index e7894aab..99103fbb 100644 --- a/setup.py +++ b/setup.py @@ -43,12 +43,8 @@ install_requires=[ 'click', 'requests', 'pandas', 'flake8', 'nose', 'coverage', 'pgzip', 'jinja2', - # 'metapool @ https://github.com/biocore/' - # 'metagenomics_pooling_notebook/archive/master.zip' - # sample_sheet_update branch contains all of the changes in the - # fake_tellread branch + DFSheet. - 'metapool @ https://codeload.github.com/charles-cowart/metagenomics' - '_pooling_notebook/zip/refs/heads/sample_sheet_update' + 'metapool @ https://github.com/biocore/' + 'metagenomics_pooling_notebook/archive/master.zip' ], entry_points={ 'console_scripts': ['demux=sequence_processing_pipeline.scripts.cli' From 96f3cffad5f72fbe305e46f8f4eeebd4f3b73c22 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 16:48:41 -0800 Subject: [PATCH 32/47] New tests for slurm polling --- sequence_processing_pipeline/FastQCJob.py | 2 +- .../GenPrepFileJob.py | 2 +- sequence_processing_pipeline/Job.py | 156 +++++++++------- .../scripts/fake_squeue.py | 101 ++++++++++ .../tests/test_Job.py | 173 +++++++++++++++++- 5 files changed, 365 insertions(+), 69 deletions(-) create mode 100755 sequence_processing_pipeline/scripts/fake_squeue.py diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 5e0bf4fc..889ef75d 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -255,7 +255,7 @@ def run(self, callback=None): cmd = ' '.join(cmd_head + input_path_list + cmd_tail) - results = self._system_call(cmd, callback=callback) + results = Job._system_call(cmd, callback=callback) if results['return_code'] != 0: raise PipelineError("multiqc encountered an error") diff --git a/sequence_processing_pipeline/GenPrepFileJob.py b/sequence_processing_pipeline/GenPrepFileJob.py index 49e8f651..0bb2c52c 100644 --- a/sequence_processing_pipeline/GenPrepFileJob.py +++ b/sequence_processing_pipeline/GenPrepFileJob.py @@ -159,7 +159,7 @@ def run(self, callback=None): # currently that is how it's done. Hence, self.output_directory # and the path to run_dir might be different locations than the # others. - res = self._system_call(' '.join(command), callback=callback) + res = Job._system_call(' '.join(command), callback=callback) if res['return_code'] != 0: raise PipelineError("Seqpro encountered an error") diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 59d9cea2..7a771908 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -12,6 +12,7 @@ import logging from inspect import stack import re +from collections import Counter # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -233,6 +234,41 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} + def query_slurm(self, job_ids): + # query_slurm encapsulates the handling of squeue. 
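+        # Illustrative example (hypothetical ids): with -o '%i,%T', squeue is
+        # expected to print a header plus one line per job or array-task,
+        # e.g.:
+        #   JOBID,STATE
+        #   1234567_0,COMPLETED
+        #   1234567_1,RUNNING
+        #   1234568,FAILED
+        # The header is dropped below and the remainder is parsed into a
+        # {id: state} dict.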
+ count = 0 + while True: + result = self._system_call("squeue -t all -j " + f"{','.join(job_ids)} " + "-o '%i,%T'") + + if result['return_code'] == 0: + # there was no issue w/squeue, break this loop and + # continue. + break + else: + # there was likely an intermittent issue w/squeue. Pause + # and wait before trying a few more times. If the problem + # persists then report the error and exit. + count += 1 + + if count > 3: + raise ExecFailedError(result['stderr']) + + sleep(60) + + lines = result['stdout'].split('\n') + lines.pop(0) # remove header + lines = [x.split(',') for x in lines if x != ''] + + jobs = {} + for job_id, state in lines: + # ensure unique_id is of type string for downstream use. + job_id = str(job_id) + jobs[job_id] = state + + return jobs + def wait_on_job_ids(self, job_ids, callback=None): ''' Wait for the given job-ids to finish running before returning. @@ -250,65 +286,27 @@ def wait_on_job_ids(self, job_ids, callback=None): # ensure all ids are strings to ensure proper working w/join(). job_ids = [str(x) for x in job_ids] - def query_slurm(job_ids): - # internal function query_slurm encapsulates the handling of - # squeue. - count = 0 - while True: - result = self._system_call("squeue -t all -j " - f"{','.join(job_ids)} " - "-o '%F,%A,%T'") - - if result['return_code'] == 0: - # there was no issue w/squeue, break this loop and - # continue. - break - else: - # there was a likely intermittent issue w/squeue. Pause - # and wait before trying a few more times. If the problem - # persists then report the error and exit. - count += 1 - - if count > 3: - raise ExecFailedError(result['stderr']) - - sleep(60) - - lines = result['stdout'].split('\n') - lines.pop(0) # remove header - lines = [x.split(',') for x in lines if x != ''] - - jobs = {} - child_jobs = {} - for job_id, unique_id, state in lines: - # ensure unique_id is of type string for downstream use. - unique_id = str(unique_id) - jobs[unique_id] = state - - if unique_id != job_id: - child_jobs[unique_id] = job_id # job is a child job - - return jobs, child_jobs - while True: - jobs, child_jobs = query_slurm(job_ids) - - for jid in job_ids: - logging.debug("JOB %s: %s" % (jid, jobs[jid])) - if callback is not None: - callback(jid=jid, status=jobs[jid]) - - children = [x for x in child_jobs if child_jobs[x] == jid] - if len(children) == 0: - logging.debug("\tNO CHILDREN") - for cid in children: - logging.debug("\tCHILD JOB %s: %s" % (cid, jobs[cid])) - status = [jobs[x] in Job.slurm_status_not_running for x in job_ids] - - if set(status) == {True}: - # all jobs either completed successfully or terminated. + # Because query_slurm only returns state on the job-ids we specify, + # the wait process is a simple check to see whether any of the + # states are 'running' states or not. + jobs = self.query_slurm(job_ids) + + # jobs will be a dict of job-ids or array-ids for jobs that + # are array-jobs. the value of jobs[id] will be a state e.g.: + # 'RUNNING', 'FAILED', 'COMPLETED'. + states = [jobs[x] in Job.slurm_status_not_running for x in jobs] + + if set(states) == {True}: + # if all the states are either FAILED or COMPLETED + # then the set of those states no matter how many + # array-jobs there were will ultimately be the set of + # {True}. If not then that means there are still jobs + # that are running. 
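+                # e.g. (hypothetical values): jobs == {'1234567_0':
+                # 'COMPLETED', '1234567_1': 'FAILED'} gives states ==
+                # [True, True], so polling ends; any remaining 'RUNNING'
+                # entry contributes a False and the loop sleeps and polls
+                # squeue again.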
break + logging.debug(f"sleeping {Job.polling_interval_in_seconds} " + "seconds...") sleep(Job.polling_interval_in_seconds) return jobs @@ -366,18 +364,50 @@ def submit_job(self, script_path, job_parameters=None, # attributes. This method will return a dict w/job_ids as keys and # their job status as values. This must be munged before returning # to the user. - results = self.wait_on_job_ids([job_id], callback=callback) + results = Job.wait_on_job_ids([job_id], callback=callback) - job_result = {'job_id': job_id, 'job_state': results[job_id]} + if job_id in results: + # job is a non-array job + job_result = {'job_id': job_id, 'job_state': results[job_id]} + else: + # job is an array job + # assume all array jobs in this case will be associated w/job_id. + counts = Counter() + for array_id in results: + counts[results[array_id]] += 1 + + # for array jobs we won't be returning a string representing the + # state of a single job. Instead we're returning a dictionary of + # the number of unique states the set of array-jobs ended up in and + # the number for each one. + job_result = {'job_id': job_id, 'job_state': dict(counts)} if callback is not None: - callback(jid=job_id, status=job_result['job_state']) + if isinstance(job_result['job_state'], dict): + # this is an array job + states = [] + for key in counts: + states.append(f"{key}: {counts[key]}") + + callback(jid=job_id, status=", ".join(states)) + + else: + # this is a standard job + callback(jid=job_id, status=job_result['job_state']) - if job_result['job_state'] == 'COMPLETED': - return job_result + if isinstance(job_result['job_state'], dict): + states = list(job_result['job_state'].keys()) + if states == ['COMPLETED']: + return job_result + else: + raise JobFailedError(f"job {job_id} exited with jobs in the " + f"following states: {', '.join(states)}") else: - raise JobFailedError(f"job {job_id} exited with status " - f"{job_result['job_state']}") + if job_result['job_state'] == 'COMPLETED': + return job_result + else: + raise JobFailedError(f"job {job_id} exited with status " + f"{job_result['job_state']}") def _group_commands(self, cmds): # break list of commands into chunks of max_array_length (Typically diff --git a/sequence_processing_pipeline/scripts/fake_squeue.py b/sequence_processing_pipeline/scripts/fake_squeue.py new file mode 100755 index 00000000..6c8511ce --- /dev/null +++ b/sequence_processing_pipeline/scripts/fake_squeue.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +from json import load, dumps +from os.path import exists, join +from sys import argv +from random import randint, choice + + +def print_state(state): + # Note that %i will appear w/column name 'JOBID' in actual squeue output. + # this is because %i shows the array-id if it's an array job and what we + # consider the regular job-id if it's not an array job. 
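+    # Illustrative state (hypothetical values): given
+    #   {"1234567": {"job_id": "1234567",
+    #                "array_ids": {"1234567_0": 2, "1234567_1": 0},
+    #                "endgame": {"1234567_0": "COMPLETED",
+    #                            "1234567_1": "FAILED"}},
+    #    "1234568": {"job_id": "1234568", "countdown": 3,
+    #                "endgame": "COMPLETED"}}
+    # this prints "1234567_0,RUNNING", "1234567_1,FAILED" and
+    # "1234568,RUNNING" after the header.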
+ print("JOBID,STATE") + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job + for array_id in state[job_id]['array_ids']: + if state[job_id]['array_ids'][array_id] <= 0: + end_state = state[job_id]['endgame'][array_id] + else: + end_state = 'RUNNING' + + print(f"{array_id},{end_state}") + else: + # this is a non-array job + if state[job_id]['countdown'] <= 0: + end_state = state[job_id]['endgame'] + else: + end_state = 'RUNNING' + + print(f"{job_id},{end_state}") + + +def generate_output(job_ids): + results = {} + + for job_id in job_ids: + is_successful = choice([True, False]) + is_array_job = choice([True, False]) + + if is_array_job: + result = {'job_id': job_id} + result['array_ids'] = {} + result['endgame'] = {} + + for i in range(0, randint(5, 15)): + array_id = "%s_%d" % (job_id, i) + result['array_ids'][array_id] = randint(3, 7) + result['array_ids'][array_id] = randint(3, 7) + if is_successful: + # all array jobs must be successful + result['endgame'][array_id] = "COMPLETED" + else: + # some jobs may succeed but some may fail + result['endgame'][array_id] = choice( + ['COMPLETED', 'FAILED']) + results[job_id] = result + else: + result = {'job_id': job_id} + result['countdown'] = randint(3, 7) + result['endgame'] = choice(['COMPLETED', 'FAILED']) + results[job_id] = result + + return results + + +def save_state(state, file_path): + with open(file_path, 'w') as f: + print(dumps(state, indent=2), file=f) + + +def load_state(file_path): + with open(file_path, 'r') as f: + return load(f) + + +if __name__ == "__main__": + # "squeue -t all -j " f"{','.join(job_ids)} " "-o '%i,%T'" + job_ids = argv[4].split(',') + + state_file_path = join("sequence_processing_pipeline", "scripts", + "my_state.json") + + state = generate_output(job_ids) + + if exists(state_file_path): + state = load_state(state_file_path) + else: + state = generate_output(job_ids) + + print_state(state) + + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job. + for array_id in state[job_id]['array_ids']: + state[job_id]['array_ids'][array_id] -= 1 + else: + # this is a standard job. + state[job_id]['countdown'] -= 1 + + save_state(state, state_file_path) diff --git a/sequence_processing_pipeline/tests/test_Job.py b/sequence_processing_pipeline/tests/test_Job.py index 7aa5889a..e7d58d66 100644 --- a/sequence_processing_pipeline/tests/test_Job.py +++ b/sequence_processing_pipeline/tests/test_Job.py @@ -1,10 +1,10 @@ import unittest from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import PipelineError -from os.path import abspath, join, dirname -from os import makedirs +from os.path import abspath, join, dirname, split, isdir +from os import makedirs, chmod, remove from functools import partial -from shutil import rmtree +from shutil import rmtree, copyfile import re @@ -14,7 +14,10 @@ def setUp(self): def tearDown(self): for some_path in self.remove_these: - rmtree(some_path) + if isdir(some_path): + rmtree(some_path) + else: + remove(some_path) def test_system_call(self): package_root = abspath('./sequence_processing_pipeline') @@ -123,6 +126,168 @@ def test_extract_project_names_from_fastq_dir(self): obs = job.extract_project_names_from_fastq_dir(tmp) self.assertEqual(obs, ['NPH_15288']) + def test_query_slurm(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. 
+ # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567', '1234568', '1234569', '1234570'] + jobs = job.query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. + aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_query_slurm_single_job(self): + # perform test_query_slurm() but with a single job only. + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. + # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. 
+ copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567'] + jobs = job.query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. + aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_wait_on_job_ids(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1', '2', '3', '4'] + + # to shorten the test time, set polling_interval_in_seconds to be + # lower than one minute. + Job.polling_interval_in_seconds = 10 + results = job.wait_on_job_ids(job_ids) + + # calling query_slurm one more time after wait_on_job_ids() is called + # will technically advance the counter one more, which means that this + # doesn't confirm that wait_on_job_ids() doesn't return before EVERY + # single job is either COMPLETED or FAILED. However it does confirm + # that wait_on_job_ids() doesn't return once the FIRST completed array + # job is either COMPLETED or FAILED while others are still RUNNING. + # This was previously an issue. + obs = job.query_slurm(job_ids) + + for array_id in obs: + state = obs[array_id] + # w/out relying on states defined in Job, simply confirm all are + # either COMPLETED or FAILED. + self.assertIn(state, ['COMPLETED', 'FAILED']) + + # since wait_on_job_ids() now returns the same data structure as + # query_slurm(), they should be equal. 
+ self.assertDictEqual(obs, results) + if __name__ == '__main__': unittest.main() From 84edad5be0b48a92ea1f67c308310fbb477a98b0 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 18:03:18 -0800 Subject: [PATCH 33/47] Updates --- README.rst => README.md | 2 +- sequence_processing_pipeline/Commands.py | 2 -- sequence_processing_pipeline/FastQCJob.py | 2 +- sequence_processing_pipeline/GenPrepFileJob.py | 2 +- sequence_processing_pipeline/Job.py | 9 +++++---- sequence_processing_pipeline/NuQCJob.py | 16 ++++++++-------- sequence_processing_pipeline/TRIntegrateJob.py | 13 ++++++++++++- sequence_processing_pipeline/TellReadJob.py | 13 ++++++++++++- .../templates/cloudspades-isolate.sbatch | 4 ++-- .../templates/cloudspades.sbatch | 4 ++-- ...pute_sequence_counts_for_normalization.sbatch | 4 ++-- .../templates/integrate.sbatch | 1 + .../templates/telllink-isolate.sbatch | 4 ++-- .../templates/telllink.sbatch | 4 ++-- .../templates/tellread-cleanup.sbatch | 4 ++-- sequence_processing_pipeline/tests/test_Job.py | 6 +++--- .../tests/test_commands.py | 11 ----------- 17 files changed, 56 insertions(+), 45 deletions(-) rename README.rst => README.md (91%) diff --git a/README.rst b/README.md similarity index 91% rename from README.rst rename to README.md index 190ebba4..d9ef9b6c 100644 --- a/README.rst +++ b/README.md @@ -14,7 +14,7 @@ git clone https://github.com/biocore/mg-scripts.git Create a Python3 Conda environment in which to run the notebook: ```bash -conda create -n sp_pipeline 'python==3.9' numpy pandas click scipy matplotlib fastq-pair +conda create --yes -n spp python=${{ matrix.python-version }} scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair ``` Activate the Conda environment: diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index 642e49cf..130ac28d 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -115,8 +115,6 @@ def demux(id_map, fp, out_d, task, maxtask): qual = iter(fp) for i, s, d, q in zip(id_, seq, dumb, qual): - # NB: This appears to not be causing the removal of the metadata - # either. fname_encoded, id_ = i.split(delimiter, 1) if fname_encoded not in openfps: diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 889ef75d..5e0bf4fc 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -255,7 +255,7 @@ def run(self, callback=None): cmd = ' '.join(cmd_head + input_path_list + cmd_tail) - results = Job._system_call(cmd, callback=callback) + results = self._system_call(cmd, callback=callback) if results['return_code'] != 0: raise PipelineError("multiqc encountered an error") diff --git a/sequence_processing_pipeline/GenPrepFileJob.py b/sequence_processing_pipeline/GenPrepFileJob.py index 0bb2c52c..49e8f651 100644 --- a/sequence_processing_pipeline/GenPrepFileJob.py +++ b/sequence_processing_pipeline/GenPrepFileJob.py @@ -159,7 +159,7 @@ def run(self, callback=None): # currently that is how it's done. Hence, self.output_directory # and the path to run_dir might be different locations than the # others. 
- res = Job._system_call(' '.join(command), callback=callback) + res = self._system_call(' '.join(command), callback=callback) if res['return_code'] != 0: raise PipelineError("Seqpro encountered an error") diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 7a771908..55f287db 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -54,6 +54,7 @@ class Job: slurm_status_running) polling_interval_in_seconds = 60 + squeue_retry_in_seconds = 10 def __init__(self, root_dir, output_path, job_name, executable_paths, max_array_length, modules_to_load=None): @@ -234,7 +235,7 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} - def query_slurm(self, job_ids): + def _query_slurm(self, job_ids): # query_slurm encapsulates the handling of squeue. count = 0 while True: @@ -255,7 +256,7 @@ def query_slurm(self, job_ids): if count > 3: raise ExecFailedError(result['stderr']) - sleep(60) + sleep(Job.squeue_retry_in_seconds) lines = result['stdout'].split('\n') lines.pop(0) # remove header @@ -290,7 +291,7 @@ def wait_on_job_ids(self, job_ids, callback=None): # Because query_slurm only returns state on the job-ids we specify, # the wait process is a simple check to see whether any of the # states are 'running' states or not. - jobs = self.query_slurm(job_ids) + jobs = self._query_slurm(job_ids) # jobs will be a dict of job-ids or array-ids for jobs that # are array-jobs. the value of jobs[id] will be a state e.g.: @@ -364,7 +365,7 @@ def submit_job(self, script_path, job_parameters=None, # attributes. This method will return a dict w/job_ids as keys and # their job status as values. This must be munged before returning # to the user. 
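+        # e.g. (hypothetical ids): results may come back as
+        #   {'1234567': 'COMPLETED'} for a plain job, or as
+        #   {'1234567_0': 'COMPLETED', '1234567_1': 'FAILED'} for an array
+        # job, which is tallied below into {'COMPLETED': 1, 'FAILED': 1}.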
- results = Job.wait_on_job_ids([job_id], callback=callback) + results = self.wait_on_job_ids([job_id], callback=callback) if job_id in results: # job is a non-array job diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index c0c51897..83bdf551 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -10,7 +10,7 @@ from sequence_processing_pipeline.Commands import split_similar_size_bins from sequence_processing_pipeline.util import iter_paired_files from jinja2 import Environment -import glob +from glob import glob import re from sys import executable @@ -150,7 +150,7 @@ def _filter_empty_fastq_files(self, filtered_directory, ''' empty_list = [] - files = glob.glob(join(filtered_directory, f'*.{self.suffix}')) + files = glob(join(filtered_directory, f'*.{self.suffix}')) for r1, r2 in iter_paired_files(files): full_path = join(filtered_directory, r1) @@ -214,7 +214,7 @@ def _move_trimmed_files(self, project_name, output_path): sample_ids = [x[0] for x in self.sample_ids if x[1] == project_name] - for trimmed_file in list(glob.glob(pattern)): + for trimmed_file in list(glob(pattern)): file_name = split(trimmed_file)[1] substr = self.interleave_fastq_regex.search(file_name) if substr is not None: @@ -274,7 +274,7 @@ def run(self, callback=None): needs_human_filtering = project['HumanFiltering'] source_dir = join(self.output_path, project_name) pattern = f"{source_dir}/*.fastq.gz" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) # if the 'only-adapter-filtered' directory exists, move the files # into a unique location so that files from multiple projects @@ -319,7 +319,7 @@ def run(self, callback=None): # move all html files underneath the subdirectory for this project. pattern = f"{old_html_path}/*.html" - completed_htmls = list(glob.glob(pattern)) + completed_htmls = list(glob(pattern)) self._move_helper(completed_htmls, # Tissue_1_Super_Trizol_S19_L001_R1_001.html self.html_regex, @@ -328,7 +328,7 @@ def run(self, callback=None): # move all json files underneath the subdirectory for this project. pattern = f"{old_json_path}/*.json" - completed_jsons = list(glob.glob(pattern)) + completed_jsons = list(glob(pattern)) self._move_helper(completed_jsons, # Tissue_1_Super_Trizol_S19_L001_R1_001.json self.json_regex, @@ -346,7 +346,7 @@ def _confirm_job_completed(self): # since NuQCJob processes across all projects in a run, there isn't # a need to iterate by project_name and job_id. 
pattern = f"{self.output_path}/hds-{self.qiita_job_id}.*.completed" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) if completed_files: return True @@ -503,7 +503,7 @@ def _generate_job_script(self, max_bucket_size): def parse_logs(self): log_path = join(self.output_path, 'logs') # sorted lists give predictable results - files = sorted(glob.glob(join(log_path, '*.out'))) + files = sorted(glob(join(log_path, '*.out'))) msgs = [] for some_file in files: diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 875a1988..9bb36a86 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -8,6 +8,7 @@ from metapool import load_sample_sheet from os import makedirs from shutil import copyfile +from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -163,4 +164,14 @@ def _generate_job_script(self): return job_script_path def parse_logs(self): - raise PipelineError("parse_logs() not implemented for TRIntegrateJob") + log_path = join(self.output_path, 'logs') + # sorted lists give predictable results + files = sorted(glob(join(log_path, '*.out'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if 'error:' in line.lower()] + + return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 3b3bf314..5be1cbd0 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -6,6 +6,7 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet +from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -178,4 +179,14 @@ def _generate_job_script(self): return job_script_path def parse_logs(self): - raise PipelineError("parse_logs() not implemented for TellReadJob") + log_path = join(self.output_path, 'logs') + # sorted lists give predictable results + files = sorted(glob(join(log_path, '*.out'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if 'error:' in line.lower()] + + return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 1ac51b2e..96426613 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -6,8 +6,8 @@ #SBATCH -c {{cores_per_task}} #SBATCH -p {{queue_name}} -#SBATCH --output cloudspades-isolate_%x-%A_%a.out -#SBATCH --error cloudspades-isolate_%x-%A_%a.err +#SBATCH --output {{output}}/logs/cloudspades-isolate_%x-%A_%a.out +#SBATCH --error {{output}}/logs/cloudspades-isolate_%x-%A_%a.err source activate qiime2-2023.5 diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index 72efb140..7a658892 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -6,8 +6,8 @@ #SBATCH -c {{cores_per_task}} # 12 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output cloudspades_%x-%A_%a.out -#SBATCH --error cloudspades_%x-%A_%a.err +#SBATCH --output {{output}}/logs/cloudspades_%x-%A_%a.out +#SBATCH --error 
{{output}}/logs/cloudspades_%x-%A_%a.err source activate qiime2-2023.5 diff --git a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch index ab8af109..9414fd4c 100644 --- a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch +++ b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch @@ -6,8 +6,8 @@ #SBATCH -c {{cores_per_task}} # 1 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output compute_sequence_counts_%x-%A_%a.out -#SBATCH --error compute_sequence_counts_%x-%A_%a.err +#SBATCH --output {{output}}/logs/compute_sequence_counts_%x-%A_%a.out +#SBATCH --error {{output}}/logs/compute_sequence_counts_%x-%A_%a.err # NB: output appears normal w/out. # source activate qiime2-2023.5 diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 92dcfe87..68ebce5e 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -6,6 +6,7 @@ #SBATCH -c {{cores_per_task}} #SBATCH -p {{queue_name}} #SBATCH --array=1-{{barcode_id_count}} + #SBATCH --output {{output_dir}}/logs/integrate_%x_%A_%a.out #SBATCH --error {{output_dir}}/logs/integrate_%x_%A_%a.err diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index 90e04012..eab0b380 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -6,8 +6,8 @@ #SBATCH --time {{wall_time_limit}} # 96:00:00 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output telllink-isolate_%x-%A_%a.out -#SBATCH --error telllink-isolate_%x-%A_%a.err +#SBATCH --output {{output}}/logs/telllink-isolate_%x-%A_%a.out +#SBATCH --error {{output}}/logs/telllink-isolate_%x-%A_%a.err set -x set -e diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index efdf0578..16be25a4 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -6,8 +6,8 @@ #SBATCH --time {{wall_time_limit}} # 96:00:00 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output telllink_%x-%A_%a.out -#SBATCH --error telllink_%x-%A_%a.err +#SBATCH --output {{output}}/logs/telllink_%x-%A_%a.out +#SBATCH --error {{output}}/logs/telllink_%x-%A_%a.err set -x set -e diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index e5b0873e..3c31219d 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -6,8 +6,8 @@ #SBATCH -c {{cores_per_task}} # 1 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output tellread-cleanup_%x-%A.out -#SBATCH --error tellread-cleanup_%x-%A.err +#SBATCH --output {{output}}/logs/cleanup_%x-%A.out +#SBATCH --error {{output}}/logs/cleanup_%x-%A.err # remove unused large outputs rm -rf {{OUTPUT}}/biosample_format {{OUTPUT}}/1_demult {{OUTPUT}}/Full diff --git a/sequence_processing_pipeline/tests/test_Job.py b/sequence_processing_pipeline/tests/test_Job.py index e7d58d66..192709b6 100644 --- a/sequence_processing_pipeline/tests/test_Job.py +++ b/sequence_processing_pipeline/tests/test_Job.py 
@@ -159,7 +159,7 @@ def test_query_slurm(self): 'my_state.json')) job_ids = ['1234567', '1234568', '1234569', '1234570'] - jobs = job.query_slurm(job_ids) + jobs = job._query_slurm(job_ids) # jobs is a dictionary of unique array_ids and/or job-ids for non- # array jobs. The faked squeue reports anywhere between five and @@ -219,7 +219,7 @@ def test_query_slurm_single_job(self): 'my_state.json')) job_ids = ['1234567'] - jobs = job.query_slurm(job_ids) + jobs = job._query_slurm(job_ids) # jobs is a dictionary of unique array_ids and/or job-ids for non- # array jobs. The faked squeue reports anywhere between five and @@ -276,7 +276,7 @@ def test_wait_on_job_ids(self): # that wait_on_job_ids() doesn't return once the FIRST completed array # job is either COMPLETED or FAILED while others are still RUNNING. # This was previously an issue. - obs = job.query_slurm(job_ids) + obs = job._query_slurm(job_ids) for array_id in obs: state = obs[array_id] diff --git a/sequence_processing_pipeline/tests/test_commands.py b/sequence_processing_pipeline/tests/test_commands.py index 3919ef43..4e0d0491 100644 --- a/sequence_processing_pipeline/tests/test_commands.py +++ b/sequence_processing_pipeline/tests/test_commands.py @@ -59,10 +59,6 @@ def test_demux(self): '@2::MUX::bing/2', 'ATGC', '+', '!!!!', '']) infile = io.StringIO(infile_data) - exp_data_r1 = '\n'.join(['@baz/1', 'ATGC', '+', '!!!!', - '@bing/1', 'ATGC', '+', '!!!!', '']) - exp_data_r2 = '\n'.join(['@baz/2', 'ATGC', '+', '!!!!', - '@bing/2', 'ATGC', '+', '!!!!', '']) exp_data_r1 = ['@baz/1', 'ATGC', '+', '!!!!', '@bing/1', 'ATGC', '+', '!!!!'] @@ -74,13 +70,6 @@ def test_demux(self): demux(id_map, infile, tmp, task, maxtask) - foo = join(tmp, 'Project_12345') - from os import walk - for root, dirs, files in walk(foo): - for _file in files: - _path = join(root, _file) - print(_path) - obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1_001.fastq.gz'), 'rt').read() obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2_001.fastq.gz'), From 8691147e7803a0e7a59f5ae9a68b8b857f8abb7a Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 19:00:59 -0800 Subject: [PATCH 34/47] Updates --- .../TRIntegrateJob.py | 10 +- .../tests/data/fake_sample_index_list.txt | 96 +++++++++++++++++++ .../tellread_output/integrate_test.sbatch | 96 ------------------- .../data/tellseq_output/integrate_test.sbatch | 67 +++++++++++++ .../tellread_test.sbatch | 0 .../tests/test_TRIntegrateJob.py | 72 ++++++++++++++ .../tests/test_TellReadJob.py | 5 +- 7 files changed, 241 insertions(+), 105 deletions(-) create mode 100644 sequence_processing_pipeline/tests/data/fake_sample_index_list.txt delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch rename sequence_processing_pipeline/tests/data/{tellread_output => tellseq_output}/tellread_test.sbatch (100%) create mode 100644 sequence_processing_pipeline/tests/test_TRIntegrateJob.py diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 9bb36a86..6994f2ad 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -17,9 +17,8 @@ class TRIntegrateJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, integrate_script_path, - sil_path, raw_fastq_dir, 
reference_base, reference_map, - cores_per_task): + qiita_job_id, integrate_script_path, sil_path, raw_fastq_dir, + reference_base, reference_map, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. @@ -32,7 +31,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param jmem: String representing total memory limit for entire job. :param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id - :param max_array_length: None :param integrate_script_path: None :param sil_path: A path to a confidential file mapping C5xx, adapters. :param reference_base: None @@ -43,7 +41,9 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, output_path, 'TRIntegrateJob', [], - max_array_length, + # max_array_length and self.max_array_length are + # not used by TRIntegrateJob. + -1, modules_to_load=modules_to_load) self.sample_sheet_path = sample_sheet_path diff --git a/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt new file mode 100644 index 00000000..1c4345ef --- /dev/null +++ b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt @@ -0,0 +1,96 @@ +CCCCCACCAA C501 NONE PE +AACCCCCACA C502 NONE PE +CCAACACACC C503 NONE PE +AACACCCCCA C504 NONE PE +CAAAACCCCC C505 NONE PE +ACACACCACC C506 NONE PE +AACCCACACC C507 NONE PE +CAAAAAAAAA C508 NONE PE +AAACCACCCC C509 NONE PE +ACACCCCCCC C510 NONE PE +AAACACCACA C511 NONE PE +CAAAACCCCA C512 NONE PE +ACCCCAACCC C513 NONE PE +CAACCAACAC C514 NONE PE +CCCCCACCCA C515 NONE PE +CCAACCCCCA C516 NONE PE +CAAAACACCC C517 NONE PE +ACACACCAAA C518 NONE PE +CCACCAAAAA C519 NONE PE +AAACCCCCCC C520 NONE PE +AACAACCCCA C521 NONE PE +CAACAACAAC C522 NONE PE +CACCACCAAA C523 NONE PE +CACAACAAAC C524 NONE PE +AACACCCACC C525 NONE PE +CAAAACACAA C526 NONE PE +AAAAAAAAAA C527 NONE PE +CCAACCCCCA C528 NONE PE +CAACCCCAAA C529 NONE PE +ACCCAACCCA C530 NONE PE +CACACCAAAC C531 NONE PE +CAACAAAAAC C532 NONE PE +CCAAAAAAAC C533 NONE PE +ACCAACACAC C534 NONE PE +CCAAAACACC C535 NONE PE +CACCCAAACC C536 NONE PE +CAAACCAAAC C537 NONE PE +CACAAACACA C538 NONE PE +ACCCACCCCC C539 NONE PE +AACCACACAC C540 NONE PE +CACCCCCACA C541 NONE PE +CACAACCACC C542 NONE PE +AAAAACAAAA C543 NONE PE +CCACACAAAC C544 NONE PE +AAACAAACAC C545 NONE PE +ACCCCCACCC C546 NONE PE +ACACCCCAAA C547 NONE PE +CAAACCAAAC C548 NONE PE +AAACCACCAA C549 NONE PE +CAACCAAAAC C550 NONE PE +ACACCCCCCC C551 NONE PE +CACCCACCAC C552 NONE PE +ACCCCAACCC C553 NONE PE +AAACCCAACA C554 NONE PE +ACCACACAAA C555 NONE PE +ACCCACCACC C556 NONE PE +CCCCAACCAA C557 NONE PE +CAAAAACACC C558 NONE PE +ACCACAAAAC C559 NONE PE +ACCCCCCCAA C560 NONE PE +CCACAAAACA C561 NONE PE +CAAACCCACC C562 NONE PE +ACACCACAAA C563 NONE PE +ACCAACAAAA C564 NONE PE +CCCAACAAAA C565 NONE PE +CACCCCCCCA C566 NONE PE +AAACCCACCA C567 NONE PE +CACACCACAA C568 NONE PE +CCAAACCCCA C569 NONE PE +CACCCCACCC C570 NONE PE +AAACCCCCAA C571 NONE PE +ACACCACACC C572 NONE PE +ACAAAACACC C573 NONE PE +CACAAACCAC C574 NONE PE +ACCCCCACAA C575 NONE PE +CCCCAAACCC C576 NONE PE +AAAACACAAC C577 NONE PE +AACCCAACCA C578 NONE PE +AAACACACAC C579 NONE PE +AAACACCACC C580 NONE PE +AACCCCAACA C581 NONE PE +CCACCCAAAC C582 NONE PE +CCAAAACAAC C583 NONE PE +ACCAACAAAC C584 NONE PE +AAACACCACC C585 NONE PE +AACCACACAC C586 NONE PE +CACAACAAAA C587 NONE PE +AACCAAAAAC C588 NONE PE +ACCAAACCAA 
C589 NONE PE +ACAAACACAC C590 NONE PE +ACCACACCAA C591 NONE PE +AAAAACAACC C592 NONE PE +CACACAACAC C593 NONE PE +CCCCCAACCC C594 NONE PE +ACACAAAACC C595 NONE PE +CCACCACACC C596 NONE PE diff --git a/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch deleted file mode 100644 index 3cdc891f..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash -l -#SBATCH -J integrate # integrate -#SBATCH --time 96:00:00 # 24:00:00 -#SBATCH --mem 16G # 8G -#SBATCH -N 1 # 1 -#SBATCH -c 4 # 1 -#SBATCH -p qiita # qiita - -#SBATCH --output integrate_%x-%A_%a.out -#SBATCH --error integrate_%x-%A_%a.err - -# NB SLURM_ARRAY_TASK_ID is exported by Slurm -if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then - echo "Not operating in an array" - exit 1 -fi - -# NB SLURM_ARRAY_TASK_MIN is exported by Slurm -if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then - echo "Line extraction assumes 1-based index" - exit 1 -fi - -set -x -set -e -set -o pipefail - -samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list_output.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -# NB TMPDIR IS CREATED IN CURRENT DIRECTORY. CURRENT DIRECTORY MUST BE CORRECT. -export TMPDIR=$(mktemp -d) -function cleanup { - echo "Removing $TMPDIR" - rm -r $TMPDIR - unset TMPDIR -} -trap cleanup EXIT - -files=${TMPDIR}/integration.files -/bin/ls -1 sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/Full/*corrected.err_barcode_removed.fastq > ${files} -mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated - -if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R1" - exit 1 -fi - -if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R2" - exit 1 -fi - -if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} I1" - exit 1 -fi - -r1=$(grep -m 1 "_R1_${sample}" ${files}) -r2=$(grep -m 1 "_R2_${sample}" ${files}) -i1=$(grep -m 1 "_I1_${sample}" ${files}) -r1out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R1.fastq.gz -r2out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R2.fastq.gz -i1out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.I1.fastq.gz - -if [[ ! 
-s ${r1} ]]; then - echo "${r1} is empty, cannot integrate" - if [[ -s ${r2} ]]; then - echo "R1 and R2 are inconsistent" - exit 1 - fi - if [[ -s ${i1} ]]; then - echo "R1 and I1 are inconsistent" - exit 1 - fi - - # reflect the empties so Qiita can know of them - touch ${r1out} - touch ${r2out} - touch ${i1out} - exit 0 -fi - -# this can probably be backgrounded but then you have to get creative to -# not mask a nonzero exit status (e.g., the python process raising) -cat ${i1} | gzip > ${i1out} - -conda activate qp-knight-lab-processing-2022.03 -python hello integrate \ - --no-sort \ - --r1-in ${r1} \ - --r2-in ${r2} \ - --i1-in ${i1} \ - --r1-out ${r1out} \ - --r2-out ${r2out} \ - --threads ${SLURM_CPUS_PER_TASK} \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch new file mode 100644 index 00000000..f7a53198 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch @@ -0,0 +1,67 @@ +#!/bin/bash -l +#SBATCH -J integrate +#SBATCH --time 96:00:00 +#SBATCH --mem 16G +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH -p qiita +#SBATCH --array=1-96 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.err + +set -x +set -e + +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp + +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "Processing sample ${sample}..." + +# make temp directory +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp +mkdir -p $TMPDIR + + +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. + +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. 
+# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq + +# create output directory +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated + +# generate output file names +r1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R1.fastq.gz +r2_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R2.fastq.gz +i1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.I1.fastq.gz + +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} + +# generate integrated R1 and R2 fastq.gz files. +conda activate qp-knight-lab-processing-2022.03 + +python sequence_processing_pipeline/contrib/integrate-indices-np.py integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads 4 \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch similarity index 100% rename from sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch rename to sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch diff --git a/sequence_processing_pipeline/tests/test_TRIntegrateJob.py b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py new file mode 100644 index 00000000..c01cb59f --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py @@ -0,0 +1,72 @@ +from os.path import join +from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob +from functools import partial +import unittest + + +class TestTRIntegrateJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TRIntegrateJob', 'integrate_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. 
+ self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TRIntegrateJob", "output", + "tmp1") + # reflects location of script on host. + self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + self.integrate_script_path = join(package_root, "contrib", + "integrate-indices-np.py") + self.sil_path = self.path('data', 'fake_sample_index_list.txt') + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + + def test_creation(self): + # test basic good-path + job = TRIntegrateJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.integrate_script_path, + self.sil_path, self.raw_fastq_dir, + self.reference_base, self.reference_map, + self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py index 440192c8..4c6a75c2 100644 --- a/sequence_processing_pipeline/tests/test_TellReadJob.py +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -11,18 +11,15 @@ def setUp(self): # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', 'TellReadJob', 'tellread_test.sbatch') - self.exp = self.path('data', 'tellread_output', 'tellread_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'tellread_test.sbatch') # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already # exists. - # TODO: Revisit w/a new directory named as expected for a - # TellSeq-produced run-directory. self.run_dir = self.path('data', 'sample_run_directories', '150629_SN1001_0511_AH5L7GBCXX') self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') - # TODO: Revisit w/a proper sample-sheet once spec is near finalized. 
self.sample_sheet_path = self.path('data', 'tellseq_metag_dummy_sample_' 'sheet.csv') From 487fc0cb25f7c4d672bbfe19405ad41452931fe7 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 19:07:11 -0800 Subject: [PATCH 35/47] flake8 --- .../tests/test_TRIntegrateJob.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sequence_processing_pipeline/tests/test_TRIntegrateJob.py b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py index c01cb59f..17ded346 100644 --- a/sequence_processing_pipeline/tests/test_TRIntegrateJob.py +++ b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py @@ -48,13 +48,13 @@ def setUp(self): def test_creation(self): # test basic good-path job = TRIntegrateJob(self.run_dir, self.output_path, - self.sample_sheet_path, self.queue_name, - self.node_count, self.wall_time_limit, - self.jmem, self.modules_to_load, self.qiita_job_id, - self.integrate_script_path, - self.sil_path, self.raw_fastq_dir, - self.reference_base, self.reference_map, - self.cores_per_task) + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, + self.qiita_job_id, self.integrate_script_path, + self.sil_path, self.raw_fastq_dir, + self.reference_base, self.reference_map, + self.cores_per_task) job._generate_job_script() From 4276fc71f4a6b7df9964a4f40370eab6805b7d93 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 19:20:13 -0800 Subject: [PATCH 36/47] flake8 post merger --- sequence_processing_pipeline/Pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index f81a0bea..2867fa2f 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -144,7 +144,7 @@ class Pipeline: AMPLICON_ATYPE = 'TruSeq HT' assay_types = [AMPLICON_ATYPE, METAGENOMIC_ATYPE, METATRANSCRIPTOMIC_ATYPE] - + @staticmethod def make_sif_fname(run_id, full_project_name): # TODO: the problem with this structure is that there's no clear way From 77c10b91e9eeade9171202d72d2edd6ee31cfc78 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 21:31:29 -0800 Subject: [PATCH 37/47] Fixed older test --- sequence_processing_pipeline/Pipeline.py | 24 +++++++---- .../tests/test_Pipeline.py | 42 +++++++------------ 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 2867fa2f..2be36fe7 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -804,8 +804,6 @@ def _parse_project_name(self, project_name, short_names): return proj_info[PROJECT_SHORT_NAME_KEY], proj_info[QIITA_ID_KEY] def get_project_info(self, short_names=False): - # test for self.mapping_file, since self.sample_sheet will be - # defined in both cases. 
results = [] if self.pipeline_type == Pipeline.AMPLICON_PTYPE: @@ -820,25 +818,35 @@ def get_project_info(self, short_names=False): {p: parse_project_name(p) for p in sample_project_map} else: projects_info = self.sample_sheet.get_projects_details() - # endif mapping_file if short_names: proj_name_key = PROJECT_SHORT_NAME_KEY else: proj_name_key = PROJECT_FULL_NAME_KEY - # endif + for curr_project_info in projects_info.values(): curr_dict = { _PROJECT_NAME_KEY: curr_project_info[proj_name_key], QIITA_ID_KEY: curr_project_info[QIITA_ID_KEY] } - if contains_replicates is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: + # this is a mapping file: curr_contains_reps = contains_replicates else: - curr_contains_reps = \ - curr_project_info.get(CONTAINS_REPLICATES_KEY, False) - # endif + bi_df = self.sample_sheet.Bioinformatics + if CONTAINS_REPLICATES_KEY in bi_df.columns.tolist(): + # subselect rows in [Bioinformatics] based on whether they + # match the project name. + df = bi_df.loc[bi_df['Sample_Project'] == + curr_project_info[proj_name_key]] + # since only one project can match by definition, convert + # to dict and extract the needed value. + curr_contains_reps = df.iloc[0].to_dict()[ + CONTAINS_REPLICATES_KEY] + else: + curr_contains_reps = False + curr_dict[CONTAINS_REPLICATES_KEY] = curr_contains_reps results.append(curr_dict) # next project diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index c1a9c08e..0af31439 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -531,35 +531,24 @@ def test_generate_sample_information_files_with_additional_meta(self): # get the path for the NYU_BMS_Melanoma dataset. sif_path = [x for x in sif_path if 'NYU_BMS_Melanoma' in x][0] - # we expect one more BLANK than before. - exp_lines = 34 - - exp_first_line = ('BLANK1.1A\t2021-10-21\t193\t' - 'Control\tNegative\tSterile w' - 'ater blank\turban biome\tres' - 'earch facility\tsterile wate' - 'r\tmisc environment\tUSA:CA:' - 'San Diego\tBLANK1.1A\t32.5\t' - '-117.25\tcontrol blank\tmeta' - 'genome\t256318\tBLANK1.1A\tN' - 'YU_BMS_Melanoma\tTRUE\t' - 'UCSD\tFALSE') - - # the new last sample should be BLANK999.999A. - exp_last_line = ('BLANK999.999A\t2021-10-21\t193\t' - 'Control\tNegative\tSterile w' - 'ater blank\turban biome\tres' - 'earch facility\tsterile wate' - 'r\tmisc environment\tUSA:CA:' - 'San Diego\tBLANK999.999A\t32.5\t' - '-117.25\tcontrol blank\tmeta' - 'genome\t256318\tBLANK999.999A\tN' - 'YU_BMS_Melanoma\tTRUE\t' - 'UCSD\tFALSE') + exp_first_line = ("BLANK1.1A\t2021-10-21\t193\t" + "Control\tNegative\tSterile water blank\t" + "Sterile water blank\turban biome\t" + "research facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\t" + "BLANK1.1A\t32.5\t-117.25\tcontrol blank\t" + "metagenome\t256318\tBLANK1.1A\t" + "NYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") + + exp_last_line = ("BLANK4.4H\t2021-10-21\t193\tControl\tNegative\t" + "Sterile water blank\tSterile water blank\t" + "urban biome\tresearch facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\tBLANK4.4H\t" + "32.5\t-117.25\tcontrol blank\tmetagenome\t256318\t" + "BLANK4.4H\tNYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") with open(sif_path, 'r') as f: obs_lines = f.readlines() - self.assertEqual(len(obs_lines), exp_lines) # confirm that each file contains the expected header. 
header = obs_lines[0].strip() @@ -574,6 +563,7 @@ def test_generate_sample_information_files_with_additional_meta(self): # confirm that the last line of each file is as expected. obs = obs_lines[-1].strip() exp = exp_last_line + self.assertEqual(obs, exp) def test_get_sample_ids(self): From a67a8a8a181a03bc2953006b47e5e0293ebbf3bb Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sat, 23 Nov 2024 18:12:53 -0800 Subject: [PATCH 38/47] Minor update --- sequence_processing_pipeline/ConvertJob.py | 8 ++++- sequence_processing_pipeline/Pipeline.py | 38 ++++++++++++++-------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/sequence_processing_pipeline/ConvertJob.py b/sequence_processing_pipeline/ConvertJob.py index 122a4987..17b8c3b3 100644 --- a/sequence_processing_pipeline/ConvertJob.py +++ b/sequence_processing_pipeline/ConvertJob.py @@ -156,7 +156,13 @@ def run(self, callback=None): exec_from=self.log_path, callback=callback) - self.copy_controls_between_projects() + # ConvertJob() is used to process Amplicon as well as Meta*Omic + # runs. Amplicon runs use a dummy sample-sheet generated by + # Pipeline(). For these types of sheets we can't copy controls + # between projects because demuxing is not performed here. + _, sheet_name = split(self.sample_sheet_path) + if sheet_name != 'dummy_sample_sheet.csv': + self.copy_controls_between_projects() except JobFailedError as e: # When a job has failed, parse the logs generated by this specific diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 2be36fe7..9a30c2a0 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -230,6 +230,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, self.run_id = run_id self.qiita_job_id = qiita_job_id self.pipeline = [] + self.assay_type = None # this method will catch a run directory as well as its products # directory, which also has the same name. Hence, return the @@ -239,6 +240,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, if pipeline_type == Pipeline.AMPLICON_PTYPE: self.search_paths = self.configuration['amplicon_search_paths'] + self.assay_type = Pipeline.AMPLICON_ATYPE else: self.search_paths = self.configuration['search_paths'] @@ -289,7 +291,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, # create dummy sample-sheet output_fp = join(output_path, 'dummy_sample_sheet.csv') self.generate_dummy_sample_sheet(self.run_dir, output_fp) - self.sample_sheet = output_fp + self.dummy_sheet_path = output_fp # Optional lane_number parameter is ignored for Amplicon # runs, as the only valid value is 1. @@ -311,8 +313,26 @@ def __init__(self, configuration_file_path, run_id, input_file_path, self.sample_sheet = self._validate_sample_sheet(input_file_path) self.mapping_file = None + if self.assay_type is None: + # set self.assay_type for non-amplicon types. + assay_type = self.sample_sheet.Header['Assay'] + if assay_type not in Pipeline.assay_types: + raise ValueError(f"'{assay_type} is not a valid Assay type") + self.assay_type = assay_type + self._configure_profile() + def get_sample_sheet_path(self): + """ + Returns path to a sample-sheet or dummy sample-sheet for amplicon runs. + """ + if self.assay_type == Pipeline.AMPLICON_ATYPE: + # assume self.dummy_sheet_path has been created for amplicon runs. + return self.dummy_sheet_path + else: + # assume input_file_path is a sample-sheet for non-amplicon runs. 
+ return self.input_file_path + def get_software_configuration(self, software): if software is None or software == "": raise ValueError(f"'{software}' is not a valid value") @@ -366,15 +386,6 @@ def _configure_profile(self): # from self.sample_sheet (or self.mapping_file). instr_type = InstrumentUtils.get_instrument_type(self.run_dir) - if isinstance(self.sample_sheet, str): - # if self.sample_sheet is a file instead of a KLSampleSheet() - # type, then this is an Amplicon run. - assay_type = Pipeline.AMPLICON_ATYPE - else: - assay_type = self.sample_sheet.Header['Assay'] - if assay_type not in Pipeline.assay_types: - raise ValueError(f"'{assay_type} is not a valid Assay type") - # open the configuration profiles directory as specified by # profiles_path in the configuration.json file. parse each json into # a nested dictionary keyed by (instrument-type, assay-type) as @@ -434,13 +445,14 @@ def _configure_profile(self): i_type = profile['profile']['instrument_type'] a_type = profile['profile']['assay_type'] - if i_type == instr_type and a_type == assay_type: + if i_type == instr_type and a_type == self.assay_type: selected_profile = profile break if selected_profile is None: - raise ValueError(f"a matching profile ({instr_type}, {assay_type}" - ") was not found. Please notify an administrator") + raise ValueError(f"a matching profile ({instr_type}, " + f"{self.assay_type}) was not found. Please notify" + " an administrator") self.config_profile = selected_profile From eb3600113157a5ca1ead0a3bf4a35758a39644b2 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 24 Nov 2024 12:36:03 -0800 Subject: [PATCH 39/47] Remove lengthy comment --- sequence_processing_pipeline/templates/tellread.sbatch | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 66d9d9fd..f038a568 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -20,11 +20,6 @@ module load {{modules_to_load}} -j ${SLURM_JOB_CPUS_PER_NODE} {{extra}} \ -l {{lane}} -# instead of testing for the presence of '{{output}}/Full', we will review -# the changed timestamps for all the files in '{{output}}/Full' and when -# we can demonstrate that they haven't changed in an arbitrary period of time -# we will consider the work completed. - # get the timestamp for the most recently changed file in directory '.' # hard-limit for wait time set to ~ 8 hours. 
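Editor's note on the hunk above: the comments left in tellread.sbatch describe a completion check based on file modification times rather than on the presence of the output directory — the script waits until nothing under the output tree has changed for a while, bounded by a hard cap of roughly eight hours. A minimal sketch of that idea follows; the variable names, the 30-minute quiet period, and the one-minute polling interval are illustrative assumptions, not the template's actual implementation.

```bash
# Sketch only: treat the run as complete once no file under '.' has changed
# for a quiet period, bounded by a ~8 hour hard limit. Names/values assumed.
hard_limit=$((8 * 60 * 60))   # ~8 hours, in seconds
quiet_period=1800             # 30 minutes with no changes => done
elapsed=0

while [[ ${elapsed} -lt ${hard_limit} ]]; do
    # epoch timestamp of the most recently modified file under '.'
    newest=$(find . -type f -printf '%T@\n' | sort -n | tail -1 | cut -d. -f1)
    newest=${newest:-$(date +%s)}   # if no files exist yet, keep waiting
    now=$(date +%s)
    if [[ $((now - newest)) -ge ${quiet_period} ]]; then
        echo "output quiescent for ${quiet_period}s; considering work complete"
        break
    fi
    sleep 60
    elapsed=$((elapsed + 60))
done
```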
From d69a0c3202deb7eab41b5f3e0df32f31df561522 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 24 Nov 2024 13:18:48 -0800 Subject: [PATCH 40/47] fix test --- .../tests/data/tellseq_output/tellread_test.sbatch | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch index fb099cf3..9dc3ccff 100644 --- a/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch +++ b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch @@ -20,11 +20,6 @@ $HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ -j ${SLURM_JOB_CPUS_PER_NODE} \ -l s_1 -# instead of testing for the presence of 'sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full', we will review -# the changed timestamps for all the files in 'sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full' and when -# we can demonstrate that they haven't changed in an arbitrary period of time -# we will consider the work completed. - # get the timestamp for the most recently changed file in directory '.' # hard-limit for wait time set to ~ 8 hours. From 1649d647e1fac67df3331cbe6e42f60a9e03120d Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Mon, 25 Nov 2024 14:52:16 -0800 Subject: [PATCH 41/47] Updates based on feedback --- README.md | 2 +- sequence_processing_pipeline/TellReadJob.py | 4 -- .../templates/cloudspades-isolate.sbatch | 54 ------------------- .../templates/cloudspades.sbatch | 54 ------------------- .../templates/telllink-isolate.sbatch | 45 ---------------- .../templates/telllink.sbatch | 47 ---------------- 6 files changed, 1 insertion(+), 205 deletions(-) delete mode 100644 sequence_processing_pipeline/templates/cloudspades-isolate.sbatch delete mode 100644 sequence_processing_pipeline/templates/cloudspades.sbatch delete mode 100644 sequence_processing_pipeline/templates/telllink-isolate.sbatch delete mode 100644 sequence_processing_pipeline/templates/telllink.sbatch diff --git a/README.md b/README.md index d9ef9b6c..594e11aa 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ git clone https://github.com/biocore/mg-scripts.git Create a Python3 Conda environment in which to run the notebook: ```bash -conda create --yes -n spp python=${{ matrix.python-version }} scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair +conda create --yes -n spp python='python=3.9' scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair ``` Activate the Conda environment: diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 5be1cbd0..75e3b958 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -148,10 +148,6 @@ def _generate_job_script(self): extra = "" - # if reference_base is added in the future and is defined, extra needs - # to be f"-f {reference_base}". 
- # extra = "-f ${REFBASE}" - with open(job_script_path, mode="w", encoding="utf-8") as f: f.write(template.render({ "job_name": "tellread", diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch deleted file mode 100644 index 96426613..00000000 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} -#SBATCH --time {{wall_time_limit}} -#SBATCH --mem {{mem_in_gb}}G -#SBATCH -N {{node_count}} -#SBATCH -c {{cores_per_task}} -#SBATCH -p {{queue_name}} - -#SBATCH --output {{output}}/logs/cloudspades-isolate_%x-%A_%a.out -#SBATCH --error {{output}}/logs/cloudspades-isolate_%x-%A_%a.err - -source activate qiime2-2023.5 - -set -x -set -e - -module load {{modules_to_load}} - -samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) - -# assumes 1-based array index, eg --array 1-N -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -cs={{output_path}}/cloudspades-isolate/${sample} - -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${cs} ]]; then - rm -fr ${cs} - fi -fi - -mkdir -p ${cs} -pushd {{cloudspades_path}}/assembler/bin - -./spades.py \ - -o ${cs} \ - --gemcode1-1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ - -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 -module unload gcc_9.3.0 -popd - -# TODO: Look for alternative method to load quast -#mamba activate quast - -#quast \ -# -o ${cs}/quast-scaffolds \ -# -t ${SLURM_JOB_CPUS_PER_NODE} \ -# ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 -# -# remove intermediates that currently dont have a downstream use -#if [[ -d ${cs}/K21 ]]; then -# rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -#fi diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch deleted file mode 100644 index 7a658892..00000000 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} # cs-assemble -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 128G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 12 -#SBATCH -p {{queue_name}} # qiita - -#SBATCH --output {{output}}/logs/cloudspades_%x-%A_%a.out -#SBATCH --error {{output}}/logs/cloudspades_%x-%A_%a.err - -source activate qiime2-2023.5 - -set -x -set -e - -module load {{modules_to_load}} - -samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) - -# assumes 1-based array index, eg --array 1-N -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -cs={{output_path}}/cloudspades/${sample} - -if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${cs} ]]; then - rm -fr ${cs} - fi -fi - -mkdir -p ${cs} -pushd {{cloudspades_path}}/assembler/bin - -./spades.py \ - -o ${cs} \ - --gemcode1-1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ - --meta \ - -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 -module unload gcc_9.3.0 -popd - -# TODO: Look for alternative method to load quast -#mamba activate quast -#quast \ -# -o ${cs}/quast-scaffolds \ -# -t ${SLURM_JOB_CPUS_PER_NODE} \ -# ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 - -# remove intermediates that currently dont have a downstream use -#if [[ -d ${cs}/K21 ]]; then -# rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -#fi diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch deleted file mode 100644 index eab0b380..00000000 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} # tellink-isolate -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 16 -#SBATCH --mem {{mem_in_gb}}G # 160G -#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -p {{queue_name}} # qiita - -#SBATCH --output {{output}}/logs/telllink-isolate_%x-%A_%a.out -#SBATCH --error {{output}}/logs/telllink-isolate_%x-%A_%a.err - -set -x -set -e - -module load {{modules_to_load}} - -samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -k=79 -lc=35 - -tl={{output_path}}/tell-link-isolate/${sample} -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${tl} ]]; then - rm -fr ${tl} - fi -fi - -mkdir -p ${tl} - -{{sing_path}} \ - -r1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ - -r2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ - -i1 {{output_path}}}/integrated/${sample}.I1.fastq.gz \ - -o ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc} \ - -k ${k} \ - -lc ${lc} \ - -p ${sample} \ - -j ${SLURM_CPUS_PER_TASK} - -# remove temporary data -if [[ -d ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping -fi diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch deleted file mode 100644 index 16be25a4..00000000 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} # tellink -#SBATCH --mem {{mem_in_gb}}G # 160G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 16 -#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -p {{queue_name}} # qiita - -#SBATCH --output {{output}}/logs/telllink_%x-%A_%a.out -#SBATCH --error {{output}}/logs/telllink_%x-%A_%a.err - -set -x -set -e - -module load {{modules_to_load}} - -samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -# TODO: leave these hardcoded for now -k=79 -lc=35 - -tl={{output_path}}/tell-link/${sample} -if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${tl} ]]; then - rm -fr ${tl} - fi -fi - -mkdir -p ${tl} - -{{sing_path}} \ - -r1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ - -r2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ - -i1 {{output_path}}/integrated/${sample}.I1.fastq.gz \ - -d metagenomics \ - -o ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc} \ - -k ${k} \ - -lc ${lc} \ - -p ${sample} \ - -j ${SLURM_CPUS_PER_TASK} - -# remove temporary data -if [[ -d ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping -fi From 65690393e674ba4e94bf82ff1e703a15beb7a166 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Mon, 25 Nov 2024 15:05:36 -0800 Subject: [PATCH 42/47] Update based on feedback --- .../TRNormCountsJob.py | 139 ------------------ 1 file changed, 139 deletions(-) delete mode 100644 sequence_processing_pipeline/TRNormCountsJob.py diff --git a/sequence_processing_pipeline/TRNormCountsJob.py b/sequence_processing_pipeline/TRNormCountsJob.py deleted file mode 100644 index 6887994a..00000000 --- a/sequence_processing_pipeline/TRNormCountsJob.py +++ /dev/null @@ -1,139 +0,0 @@ -from os.path import join -from .Job import Job, KISSLoader -from .PipelineError import JobFailedError -import logging -from jinja2 import Environment -from .Pipeline import Pipeline -from .PipelineError import PipelineError -from metapool import load_sample_sheet - - -logging.basicConfig(level=logging.DEBUG) - - -class TRNormCountsJob(Job): - def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, - node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, cores_per_task=4): - """ - ConvertJob provides a convenient way to run bcl-convert or bcl2fastq - on a directory BCL files to generate Fastq files. - :param run_dir: The 'run' directory that contains BCL files. - :param output_path: Path where all pipeline-generated files live. - :param sample_sheet_path: The path to a sample-sheet. - :param queue_name: The name of the Torque queue to use for processing. - :param node_count: The number of nodes to request. - :param wall_time_limit: A hard time limit (in min) to bound processing. - :param jmem: String representing total memory limit for entire job. - :param modules_to_load: A list of Linux module names to load - :param qiita_job_id: identify Torque jobs using qiita_job_id - :param max_array_length: None - :param indicies_script_path: None - :param label: None - :param reference_base: None - :param reference_map: None - :param cores_per_task: (Optional) # of CPU cores per node to request. - """ - super().__init__(run_dir, - output_path, - 'TRIntegrateJob', - [], - max_array_length, - modules_to_load=modules_to_load) - - self.sample_sheet_path = sample_sheet_path - self._file_check(self.sample_sheet_path) - metadata = self._process_sample_sheet() - self.sample_ids = metadata['sample_ids'] - self.queue_name = queue_name - self.node_count = node_count - self.wall_time_limit = wall_time_limit - self.cores_per_task = cores_per_task - self.indicies_script_path = indicies_script_path - - self.reference_base = reference_base - self.reference_map = reference_map - - # raise an Error if jmem is not a valid floating point value. 
- self.jmem = str(int(jmem)) - self.qiita_job_id = qiita_job_id - self.sample_count = len(self.sample_ids) - self.jinja_env = Environment(loader=KISSLoader('templates')) - self.label = label - - self.job_name = (f"norm_counts_{self.qiita_job_id}") - - def run(self, callback=None): - job_script_path = self._generate_job_script() - params = ['--parsable', - f'-J {self.job_name}', - f'--array 1-{self.sample_count}'] - try: - self.job_info = self.submit_job(job_script_path, - job_parameters=' '.join(params), - exec_from=None, - callback=callback) - - logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') - except JobFailedError as e: - # When a job has failed, parse the logs generated by this specific - # job to return a more descriptive message to the user. - info = self.parse_logs() - # prepend just the message component of the Error. - info.insert(0, str(e)) - raise JobFailedError('\n'.join(info)) - - logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') - - def _process_sample_sheet(self): - sheet = load_sample_sheet(self.sample_sheet_path) - - if not sheet.validate_and_scrub_sample_sheet(): - s = "Sample sheet %s is not valid." % self.sample_sheet_path - raise PipelineError(s) - - header = sheet.Header - chemistry = header['chemistry'] - - if header['Assay'] not in Pipeline.assay_types: - s = "Assay value '%s' is not recognized." % header['Assay'] - raise PipelineError(s) - - sample_ids = [] - for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) - - bioinformatics = sheet.Bioinformatics - - # reorganize the data into a list of dictionaries, one for each row. - # the ordering of the rows will be preserved in the order of the list. - lst = bioinformatics.to_dict('records') - - # human-filtering jobs are scoped by project. Each job requires - # particular knowledge of the project. 
- return {'chemistry': chemistry, - 'projects': lst, - 'sample_ids': sample_ids} - - def _generate_job_script(self): - job_script_path = join(self.output_path, "compute_sequence_counts_for" - "_normalization.sbatch") - template = self.jinja_env.get_template("compute_sequence_counts_for_" - "normalization2.sbatch") - - with open(job_script_path, mode="w", encoding="utf-8") as f: - f.write(template.render({ - "#job_name": "integrate", - "#wall_time_limit": self.wall_time_limit, - "#mem_in_gb": self.jmem, - "#node_count": self.node_count, - "#cores_per_task": self.cores_per_task, - "#queue_name": self.queue_name, - "#output_path": self.output_path, - "read_counts_path": "TODO", - "sample_sheet": "TODO", - "tellread_output": "TODO" - })) - - return job_script_path From 81922b56c42a446323efb80587d2af4ee800ae76 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 27 Nov 2024 19:23:39 -0800 Subject: [PATCH 43/47] Added renamed file --- sequence_processing_pipeline/NormCountsJob.py | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 sequence_processing_pipeline/NormCountsJob.py diff --git a/sequence_processing_pipeline/NormCountsJob.py b/sequence_processing_pipeline/NormCountsJob.py new file mode 100644 index 00000000..1909420d --- /dev/null +++ b/sequence_processing_pipeline/NormCountsJob.py @@ -0,0 +1,139 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class NormCountsJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, indicies_script_path, label, + reference_base, reference_map, cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: None + :param indicies_script_path: None + :param label: None + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.indicies_script_path = indicies_script_path + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. 
+ self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.label = label + + self.job_name = (f"norm_counts_{self.qiita_job_id}") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. + return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, "compute_sequence_counts_for" + "_normalization.sbatch") + template = self.jinja_env.get_template("compute_sequence_counts_for_" + "normalization2.sbatch") + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "#job_name": "integrate", + "#wall_time_limit": self.wall_time_limit, + "#mem_in_gb": self.jmem, + "#node_count": self.node_count, + "#cores_per_task": self.cores_per_task, + "#queue_name": self.queue_name, + "#output_path": self.output_path, + "read_counts_path": "TODO", + "sample_sheet": "TODO", + "tellread_output": "TODO" + })) + + return job_script_path From 01d77d6b66aac3ad2aa08d4228d895c8255b059b Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 1 Dec 2024 16:47:24 -0800 Subject: [PATCH 44/47] Refactored sequence counting job Request from Antonio to make TRNormCountsJob more generalized for current and upcoming work. TRNormCountsJob replaced w/SeqCountsJob: * takes a list of paths to fastq and/or fastq.gz files. * runs seqtk to count sequences and bases in parallel. * aggregator code produces a json file of counts from log output. 
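Editor's note on the commit below: each SLURM array task echoes the path of the fastq/fastq.gz file it was assigned and then runs `seqtk size` on it, so every .out log contains two lines — the file path, followed by a tab-separated sequence count and base count. The aggregator walks those logs and rolls them up into a single JSON file of per-file counts; it can also be run standalone via the `__main__` block of the new aggregate_counts.py. The directory and output paths in the sketch below are placeholders.

```bash
# One array task's work: the path is echoed by seq_counts.sbatch, then
# seqtk size prints "<num_sequences>\t<num_bases>" for that file.
seqtk size TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq
# 64464162        8345327641

# Roll all per-task .out logs up into one JSON of per-file counts
# (paths here are placeholders):
python sequence_processing_pipeline/aggregate_counts.py \
    /path/to/SeqCountsJob/logs \
    /path/to/SeqCountsJob/aggregate_counts.json
```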
--- sequence_processing_pipeline/NormCountsJob.py | 139 ------------------ sequence_processing_pipeline/SeqCountsJob.py | 138 +++++++++++++++++ .../aggregate_counts.py | 40 +++++ ...e_sequence_counts_for_normalization.sbatch | 26 ---- .../templates/seq_counts.sbatch | 25 ++++ .../tests/data/aggregate_counts_results.json | 36 +++++ .../tests/data/files_to_count.txt | 8 + .../tests/data/seq_counts.sbatch | 25 ++++ .../seq_counts_logs/seq_count_2679966_1.err | 3 + .../seq_counts_logs/seq_count_2679966_1.out | 2 + .../seq_counts_logs/seq_count_2679966_2.err | 3 + .../seq_counts_logs/seq_count_2679966_2.out | 2 + .../seq_counts_logs/seq_count_2679966_3.err | 3 + .../seq_counts_logs/seq_count_2679966_3.out | 2 + .../seq_counts_logs/seq_count_2679966_4.err | 3 + .../seq_counts_logs/seq_count_2679966_4.out | 2 + .../seq_counts_logs/seq_count_2679966_5.err | 3 + .../seq_counts_logs/seq_count_2679966_5.out | 2 + .../seq_counts_logs/seq_count_2679966_6.err | 3 + .../seq_counts_logs/seq_count_2679966_6.out | 2 + .../seq_counts_logs/seq_count_2679966_7.err | 3 + .../seq_counts_logs/seq_count_2679966_7.out | 2 + .../seq_counts_logs/seq_count_2679966_8.err | 3 + .../seq_counts_logs/seq_count_2679966_8.out | 2 + .../tests/test_SeqCountsJob.py | 72 +++++++++ 25 files changed, 384 insertions(+), 165 deletions(-) delete mode 100644 sequence_processing_pipeline/NormCountsJob.py create mode 100644 sequence_processing_pipeline/SeqCountsJob.py create mode 100644 sequence_processing_pipeline/aggregate_counts.py delete mode 100644 sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch create mode 100644 sequence_processing_pipeline/templates/seq_counts.sbatch create mode 100644 sequence_processing_pipeline/tests/data/aggregate_counts_results.json create mode 100644 sequence_processing_pipeline/tests/data/files_to_count.txt create mode 100644 sequence_processing_pipeline/tests/data/seq_counts.sbatch create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out create mode 100644 
sequence_processing_pipeline/tests/test_SeqCountsJob.py diff --git a/sequence_processing_pipeline/NormCountsJob.py b/sequence_processing_pipeline/NormCountsJob.py deleted file mode 100644 index 1909420d..00000000 --- a/sequence_processing_pipeline/NormCountsJob.py +++ /dev/null @@ -1,139 +0,0 @@ -from os.path import join -from .Job import Job, KISSLoader -from .PipelineError import JobFailedError -import logging -from jinja2 import Environment -from .Pipeline import Pipeline -from .PipelineError import PipelineError -from metapool import load_sample_sheet - - -logging.basicConfig(level=logging.DEBUG) - - -class NormCountsJob(Job): - def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, - node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, cores_per_task=4): - """ - ConvertJob provides a convenient way to run bcl-convert or bcl2fastq - on a directory BCL files to generate Fastq files. - :param run_dir: The 'run' directory that contains BCL files. - :param output_path: Path where all pipeline-generated files live. - :param sample_sheet_path: The path to a sample-sheet. - :param queue_name: The name of the Torque queue to use for processing. - :param node_count: The number of nodes to request. - :param wall_time_limit: A hard time limit (in min) to bound processing. - :param jmem: String representing total memory limit for entire job. - :param modules_to_load: A list of Linux module names to load - :param qiita_job_id: identify Torque jobs using qiita_job_id - :param max_array_length: None - :param indicies_script_path: None - :param label: None - :param reference_base: None - :param reference_map: None - :param cores_per_task: (Optional) # of CPU cores per node to request. - """ - super().__init__(run_dir, - output_path, - 'TRIntegrateJob', - [], - max_array_length, - modules_to_load=modules_to_load) - - self.sample_sheet_path = sample_sheet_path - self._file_check(self.sample_sheet_path) - metadata = self._process_sample_sheet() - self.sample_ids = metadata['sample_ids'] - self.queue_name = queue_name - self.node_count = node_count - self.wall_time_limit = wall_time_limit - self.cores_per_task = cores_per_task - self.indicies_script_path = indicies_script_path - - self.reference_base = reference_base - self.reference_map = reference_map - - # raise an Error if jmem is not a valid floating point value. - self.jmem = str(int(jmem)) - self.qiita_job_id = qiita_job_id - self.sample_count = len(self.sample_ids) - self.jinja_env = Environment(loader=KISSLoader('templates')) - self.label = label - - self.job_name = (f"norm_counts_{self.qiita_job_id}") - - def run(self, callback=None): - job_script_path = self._generate_job_script() - params = ['--parsable', - f'-J {self.job_name}', - f'--array 1-{self.sample_count}'] - try: - self.job_info = self.submit_job(job_script_path, - job_parameters=' '.join(params), - exec_from=None, - callback=callback) - - logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') - except JobFailedError as e: - # When a job has failed, parse the logs generated by this specific - # job to return a more descriptive message to the user. - info = self.parse_logs() - # prepend just the message component of the Error. 
- info.insert(0, str(e)) - raise JobFailedError('\n'.join(info)) - - logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') - - def _process_sample_sheet(self): - sheet = load_sample_sheet(self.sample_sheet_path) - - if not sheet.validate_and_scrub_sample_sheet(): - s = "Sample sheet %s is not valid." % self.sample_sheet_path - raise PipelineError(s) - - header = sheet.Header - chemistry = header['chemistry'] - - if header['Assay'] not in Pipeline.assay_types: - s = "Assay value '%s' is not recognized." % header['Assay'] - raise PipelineError(s) - - sample_ids = [] - for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) - - bioinformatics = sheet.Bioinformatics - - # reorganize the data into a list of dictionaries, one for each row. - # the ordering of the rows will be preserved in the order of the list. - lst = bioinformatics.to_dict('records') - - # human-filtering jobs are scoped by project. Each job requires - # particular knowledge of the project. - return {'chemistry': chemistry, - 'projects': lst, - 'sample_ids': sample_ids} - - def _generate_job_script(self): - job_script_path = join(self.output_path, "compute_sequence_counts_for" - "_normalization.sbatch") - template = self.jinja_env.get_template("compute_sequence_counts_for_" - "normalization2.sbatch") - - with open(job_script_path, mode="w", encoding="utf-8") as f: - f.write(template.render({ - "#job_name": "integrate", - "#wall_time_limit": self.wall_time_limit, - "#mem_in_gb": self.jmem, - "#node_count": self.node_count, - "#cores_per_task": self.cores_per_task, - "#queue_name": self.queue_name, - "#output_path": self.output_path, - "read_counts_path": "TODO", - "sample_sheet": "TODO", - "tellread_output": "TODO" - })) - - return job_script_path diff --git a/sequence_processing_pipeline/SeqCountsJob.py b/sequence_processing_pipeline/SeqCountsJob.py new file mode 100644 index 00000000..51f8e276 --- /dev/null +++ b/sequence_processing_pipeline/SeqCountsJob.py @@ -0,0 +1,138 @@ +from os.path import join, split +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from os import walk +from json import dumps + + +logging.basicConfig(level=logging.DEBUG) + + +class SeqCountsJob(Job): + def __init__(self, run_dir, output_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, files_to_count_path, + cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: A hard-limit for array-sizes + :param files_to_count_path: A path to a list of file-paths to count. + :param cores_per_task: (Optional) # of CPU cores per node to request. 
+ """ + super().__init__(run_dir, + output_path, + 'SeqCountsJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.jinja_env = Environment(loader=KISSLoader('templates')) + + self.job_name = (f"seq_counts_{self.qiita_job_id}") + self.files_to_count_path = files_to_count_path + + with open(self.files_to_count_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.file_count = len(lines) + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'SeqCountsJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + self._aggregate_counts() + + logging.debug(f'SeqCountJob {self.job_info["job_id"]} completed') + + def _generate_job_script(self): + job_script_path = join(self.output_path, "seq_counts.sbatch") + template = self.jinja_env.get_template("seq_counts.sbatch") + + # got to make files_to_count.txt and put it in the output directory + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "seq_counts", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "file_count": self.file_count, + "output_path": self.output_path + })) + + return job_script_path + + def parse_logs(self): + # TODO + pass + + def _aggregate_counts(self): + def extract_metadata(fp): + with open(fp, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % fp) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + results = {} + + for root, dirs, files in walk(self.log_path): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + results_path = join(self.output_path, 'aggregate_counts.json') + + with open(results_path, 'w') as f: + print(dumps(results, indent=2), file=f) + + return results_path diff --git a/sequence_processing_pipeline/aggregate_counts.py b/sequence_processing_pipeline/aggregate_counts.py new file mode 100644 index 00000000..ace90212 --- /dev/null +++ b/sequence_processing_pipeline/aggregate_counts.py @@ -0,0 +1,40 @@ +from os import walk +from sys import argv +from os.path import join, split +from json import dumps + + +def extract_metadata(log_output_file_path): + with open(log_output_file_path, 'r') as f: + lines = f.readlines() + lines = 
[x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % log_output_file_path) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + +def aggregate_counts(fp): + results = {} + + for root, dirs, files in walk(fp): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + return results + + +if __name__ == '__main__': + results = aggregate_counts(argv[1]) + with open(argv[2], 'w') as f: + print(dumps(results, indent=2), file=f) diff --git a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch deleted file mode 100644 index 9414fd4c..00000000 --- a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} # norm -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 8G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 1 -#SBATCH -p {{queue_name}} # qiita - -#SBATCH --output {{output}}/logs/compute_sequence_counts_%x-%A_%a.out -#SBATCH --error {{output}}/logs/compute_sequence_counts_%x-%A_%a.err - -# NB: output appears normal w/out. -# source activate qiime2-2023.5 - -set -x -set -e -set -o pipefail - -echo $TMPDIR - -mkdir -p {{output_path}} -wc -l {{tellread_output}}/Full/*_I1_C5[0-9][0-9].fastq.gz.corrected.err_barcode_removed.fastq > {{output_path}}/record_counts.txt -python {{plot_counts_path}} {{output_path}}/record_counts.txt {{sample_sheet}} {{output_path}} - -conda activate qp-knight-lab-processing-2022.03 -python {{create_picklist_path}} {{read_counts_path}} diff --git a/sequence_processing_pipeline/templates/seq_counts.sbatch b/sequence_processing_pipeline/templates/seq_counts.sbatch new file mode 100644 index 00000000..f44bd5b9 --- /dev/null +++ b/sequence_processing_pipeline/templates/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{file_count}} + +#SBATCH --output {{output_path}}/logs/%x_%A_%a.out +#SBATCH --error {{output_path}}/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p {{output_path}}/logs + +files=($(cat {{output_path}}/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/tests/data/aggregate_counts_results.json b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json new file mode 100644 index 00000000..1cae0f05 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json @@ -0,0 +1,36 @@ +{ + "REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full": { + "TellReadJob_I1_C520.fastq.gz.erroneous.fastq": { + "seq_counts": 2139633, + "base_pairs": 38513394 + }, + "TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8345327641 + }, + 
"TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9293296513 + }, + "TellReadJob_I1_C519.fastq.gz.erroneous.fastq": { + "seq_counts": 1932116, + "base_pairs": 34778088 + }, + "TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 1160354916 + }, + "TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8370238082 + }, + "TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9317943166 + }, + "TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 1267182504 + } + } +} diff --git a/sequence_processing_pipeline/tests/data/files_to_count.txt b/sequence_processing_pipeline/tests/data/files_to_count.txt new file mode 100644 index 00000000..8d7ce4b1 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/files_to_count.txt @@ -0,0 +1,8 @@ +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts.sbatch b/sequence_processing_pipeline/tests/data/seq_counts.sbatch new file mode 100644 index 00000000..cc73187c --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J seq_counts +#SBATCH --time 1440 +#SBATCH --mem 8G +#SBATCH -N 1 +#SBATCH -c 1 +#SBATCH -p qiita +#SBATCH --array=1-8 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs + +files=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ 
b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out new file mode 100644 index 00000000..50a46674 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8345327641 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out new file mode 100644 index 00000000..87ad9f55 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9293296513 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out new file mode 100644 index 00000000..a22d9f8d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +1932116 34778088 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out new file mode 100644 index 00000000..0b35614a --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9317943166 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out new file mode 100644 index 00000000..887522ae --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 1267182504 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out new file mode 100644 index 00000000..a4fbd555 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8370238082 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out new file mode 100644 index 00000000..6c6a9c06 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 1160354916 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out new file mode 100644 index 00000000..9be52329 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +2139633 38513394 diff --git a/sequence_processing_pipeline/tests/test_SeqCountsJob.py b/sequence_processing_pipeline/tests/test_SeqCountsJob.py new file mode 100644 index 00000000..d0fee2cc --- /dev/null +++ b/sequence_processing_pipeline/tests/test_SeqCountsJob.py @@ -0,0 +1,72 @@ +from os.path import join +from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob +from functools import partial +import unittest + + +class TestSeqCountsJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. 
+ self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.files_to_count_path = self.path("data", "files_to_count.txt") + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "1440" + self.jmem = "8" + self.modules_to_load = [] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.cores_per_task = "1" + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + self.max_array_length = 100 + self.exp_sbatch_output = self.path("data", "seq_counts.sbatch") + self.exp_results = self.path("data", + "aggregate_counts_results.json") + + def test_creation(self): + def compare_files(obs, exp): + with open(obs, 'r') as f: + obs_lines = f.readlines() + obs_lines = [x.strip() for x in obs_lines] + obs_lines = [x for x in obs_lines if x != ''] + + with open(exp, 'r') as f: + exp_lines = f.readlines() + exp_lines = [x.strip() for x in exp_lines] + exp_lines = [x for x in exp_lines if x != ''] + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + # test basic good-path + job = SeqCountsJob(self.run_dir, self.output_path, self.queue_name, + self.node_count, self.wall_time_limit, self.jmem, + self.modules_to_load, self.qiita_job_id, + self.max_array_length, self.files_to_count_path, + self.cores_per_task) + + obs = job._generate_job_script() + + compare_files(obs, self.exp_sbatch_output) + + # hack log path so that it points to test data directory rather than + # the output directory for a run we didn't run(). + job.log_path = self.path("data", "seq_counts_logs") + + obs = job._aggregate_counts() + + compare_files(obs, self.exp_results) + + +if __name__ == '__main__': + unittest.main() From b718e8b790005801689068fccd4395b20539aaa6 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 1 Dec 2024 18:10:37 -0800 Subject: [PATCH 45/47] Update test based on randomness in output generation --- sequence_processing_pipeline/tests/test_SeqCountsJob.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sequence_processing_pipeline/tests/test_SeqCountsJob.py b/sequence_processing_pipeline/tests/test_SeqCountsJob.py index d0fee2cc..d641c3b2 100644 --- a/sequence_processing_pipeline/tests/test_SeqCountsJob.py +++ b/sequence_processing_pipeline/tests/test_SeqCountsJob.py @@ -2,6 +2,7 @@ from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob from functools import partial import unittest +from json import load as json_load class TestSeqCountsJob(unittest.TestCase): @@ -63,9 +64,10 @@ def compare_files(obs, exp): # the output directory for a run we didn't run(). 
job.log_path = self.path("data", "seq_counts_logs") - obs = job._aggregate_counts() + obs = json_load(open(job._aggregate_counts(), 'r')) + exp = json_load(open(self.exp_results, 'r')) - compare_files(obs, self.exp_results) + self.assertDictEqual(obs, exp) if __name__ == '__main__': From a0ffb81090ecf837e2fde71be44df864e15b2b54 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 1 Dec 2024 19:58:50 -0800 Subject: [PATCH 46/47] Updates based on feedback --- sequence_processing_pipeline/Commands.py | 9 +++++++-- sequence_processing_pipeline/tests/test_commands.py | 8 ++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index 130ac28d..ae971fc9 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -23,7 +23,12 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, # add one more level to account for project_names nested under ConvertJob # dir. # this will ignore the _I1_ reads that appear in the integrated result. - fastq_paths = glob.glob(data_location_path + '/*/*_R?_001.fastq.gz') + fastq_paths = glob.glob(data_location_path + '*/*/*.fastq.gz') + + # case-specific filter for TellSeq output directories that also contain + # _I1_ files. Ensure paths are still sorted afterwards. + fastq_paths = [x for x in fastq_paths if '_I1_001.fastq.gz' not in x] + fastq_paths = sorted(fastq_paths) # convert from GB and halve as we sum R1 max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2) @@ -87,7 +92,7 @@ def demux(id_map, fp, out_d, task, maxtask): """Split infile data based in provided map""" delimiter = '::MUX::' mode = 'wt' - ext = '_001.fastq.gz' + ext = '.fastq.gz' sep = '/' rec = '@' diff --git a/sequence_processing_pipeline/tests/test_commands.py b/sequence_processing_pipeline/tests/test_commands.py index 4e0d0491..ac8a4bd9 100644 --- a/sequence_processing_pipeline/tests/test_commands.py +++ b/sequence_processing_pipeline/tests/test_commands.py @@ -70,9 +70,9 @@ def test_demux(self): demux(id_map, infile, tmp, task, maxtask) - obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1_001.fastq.gz'), + obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1.fastq.gz'), 'rt').read() - obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2_001.fastq.gz'), + obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2.fastq.gz'), 'rt').read() exp = '\n'.join(exp_data_r1) + '\n' self.assertEqual(obs_r1, exp) @@ -80,8 +80,8 @@ def test_demux(self): exp = '\n'.join(exp_data_r2) + '\n' self.assertEqual(obs_r2, exp) - self.assertFalse(os.path.exists(join(tmp, 'a_R1_001.fastq.gz'))) - self.assertFalse(os.path.exists(join(tmp, 'a_R2_001.fastq.gz'))) + self.assertFalse(os.path.exists(join(tmp, 'a_R1.fastq.gz'))) + self.assertFalse(os.path.exists(join(tmp, 'a_R2.fastq.gz'))) if __name__ == '__main__': From 0b7ce90342435a14aa76cf42a29c2807b9029af1 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 1 Dec 2024 21:11:06 -0800 Subject: [PATCH 47/47] Common parse_log() method made default --- sequence_processing_pipeline/ConvertJob.py | 1 + sequence_processing_pipeline/FastQCJob.py | 17 ----------------- sequence_processing_pipeline/Job.py | 13 ++++++++++++- sequence_processing_pipeline/NuQCJob.py | 13 ------------- sequence_processing_pipeline/SeqCountsJob.py | 13 +++++++++++-- sequence_processing_pipeline/TRIntegrateJob.py | 14 -------------- sequence_processing_pipeline/TellReadJob.py | 14 -------------- 7 files changed, 24 insertions(+), 61 
deletions(-) diff --git a/sequence_processing_pipeline/ConvertJob.py b/sequence_processing_pipeline/ConvertJob.py index 17b8c3b3..dc9b36aa 100644 --- a/sequence_processing_pipeline/ConvertJob.py +++ b/sequence_processing_pipeline/ConvertJob.py @@ -175,6 +175,7 @@ def run(self, callback=None): logging.info(f'Successful job: {job_info}') def parse_logs(self): + # overrides Job.parse_logs() w/tailored parse for specific logs. log_path = join(self.output_path, 'Logs') errors = join(log_path, 'Errors.log') diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 5e0bf4fc..8db0440b 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -6,7 +6,6 @@ from functools import partial from json import dumps import logging -import glob class FastQCJob(Job): @@ -305,19 +304,3 @@ def _generate_job_script(self): with open(sh_details_fp, 'w') as f: f.write('\n'.join(self.commands)) - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - files = sorted(glob.glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - # note 'error' is not same - # requirement as found in QCJob. - # ('error:'). This is a very - # generalized filter. - if 'error' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 55f287db..4121bd7f 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -13,6 +13,7 @@ from inspect import stack import re from collections import Counter +from glob import glob # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -126,7 +127,17 @@ def run(self): raise PipelineError("Base class run() method not implemented.") def parse_logs(self): - raise PipelineError("Base class parse_logs() method not implemented.") + # by default, look for anything to parse in the logs directory. 
+ log_path = join(self.output_path, 'logs') + files = sorted(glob(join(log_path, '*'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if 'error:' in line.lower()] + + return [msg.strip() for msg in msgs] def _which(self, file_path, modules_to_load=None): """ diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index 83bdf551..0e05b41d 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -499,16 +499,3 @@ def _generate_job_script(self, max_bucket_size): pmls_path=self.pmls_path)) return job_script_path - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - # sorted lists give predictable results - files = sorted(glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - if 'error:' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/SeqCountsJob.py b/sequence_processing_pipeline/SeqCountsJob.py index 51f8e276..f080bd00 100644 --- a/sequence_processing_pipeline/SeqCountsJob.py +++ b/sequence_processing_pipeline/SeqCountsJob.py @@ -5,6 +5,7 @@ from jinja2 import Environment from os import walk from json import dumps +from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -101,8 +102,16 @@ def _generate_job_script(self): return job_script_path def parse_logs(self): - # TODO - pass + # overrides Job.parse_logs() w/tailored parse for specific logs. + files = sorted(glob(join(self.log_path, '*.err'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if line.startswith("[E::stk_size]")] + + return [msg.strip() for msg in msgs] def _aggregate_counts(self): def extract_metadata(fp): diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 6994f2ad..7b8740b4 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -8,7 +8,6 @@ from metapool import load_sample_sheet from os import makedirs from shutil import copyfile -from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -162,16 +161,3 @@ def _generate_job_script(self): "output_dir": self.output_path})) return job_script_path - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - # sorted lists give predictable results - files = sorted(glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - if 'error:' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 75e3b958..3d68d4c8 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -6,7 +6,6 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet -from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -173,16 +172,3 @@ def _generate_job_script(self): })) return job_script_path - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - # sorted lists give predictable results - files = sorted(glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() 
- if 'error:' in line.lower()] - - return [msg.strip() for msg in msgs]
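
Editor's note on the final patch above: PATCH 47/47 moves log parsing into a shared `Job.parse_logs()` default (scan everything under `logs/` for lines containing `error:`), removes the near-identical overrides from FastQCJob, NuQCJob, TRIntegrateJob, and TellReadJob, and keeps tailored overrides only where the logs need a different filter (ConvertJob's `Logs/Errors.log`, SeqCountsJob's `[E::stk_size]` lines from seqtk). The following is a minimal, self-contained sketch of that inherit-or-override pattern using toy class names (`BaseJob`, `SeqCountsLikeJob` are illustrative stand-ins, not the package's actual classes or constructor signatures):

```python
from glob import glob
from os.path import join


class BaseJob:
    """Toy stand-in for Job: provides the shared default parse_logs()."""

    def __init__(self, output_path):
        self.output_path = output_path

    def parse_logs(self):
        # default behavior: scan every file under <output_path>/logs and
        # collect any line containing 'error:' (case-insensitive).
        # sorted() keeps results predictable across runs.
        files = sorted(glob(join(self.output_path, 'logs', '*')))
        msgs = []

        for some_file in files:
            with open(some_file, 'r') as f:
                msgs += [line for line in f.readlines()
                         if 'error:' in line.lower()]

        return [msg.strip() for msg in msgs]


class SeqCountsLikeJob(BaseJob):
    """Toy subclass: overrides the default with a seqtk-specific filter."""

    def __init__(self, output_path, log_path):
        super().__init__(output_path)
        self.log_path = log_path

    def parse_logs(self):
        # only .err files matter here, and only seqtk's size-error lines.
        files = sorted(glob(join(self.log_path, '*.err')))
        msgs = []

        for some_file in files:
            with open(some_file, 'r') as f:
                msgs += [line for line in f.readlines()
                         if line.startswith("[E::stk_size]")]

        return [msg.strip() for msg in msgs]
```

Under this arrangement a job class that previously carried a copy of the generic loop now simply inherits `parse_logs()` from the base class, and only classes whose logs have a distinct error signature carry an override; that is what allows the patch to delete 61 lines while adding 24.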