From 09dc316822592105d5f595c6a45e1d0856ac9b4e Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 16:43:27 -0700 Subject: [PATCH 01/47] initial add --- sequence_processing_pipeline/TRConvertJob.py | 212 +++++++++++++ .../templates/cloudspades-isolate.sbatch | 110 +++++++ .../templates/cloudspades.sbatch | 115 ++++++++ .../templates/integrate.sbatch | 120 ++++++++ .../templates/telllink-isolate.sbatch | 60 ++++ .../templates/telllink.sbatch | 61 ++++ .../templates/tellread-cleanup.sbatch | 19 ++ .../templates/tellread.sbatch | 105 +++++++ .../templates/tellread.sh | 279 ++++++++++++++++++ 9 files changed, 1081 insertions(+) create mode 100644 sequence_processing_pipeline/TRConvertJob.py create mode 100644 sequence_processing_pipeline/templates/cloudspades-isolate.sbatch create mode 100644 sequence_processing_pipeline/templates/cloudspades.sbatch create mode 100644 sequence_processing_pipeline/templates/integrate.sbatch create mode 100644 sequence_processing_pipeline/templates/telllink-isolate.sbatch create mode 100644 sequence_processing_pipeline/templates/telllink.sbatch create mode 100644 sequence_processing_pipeline/templates/tellread-cleanup.sbatch create mode 100644 sequence_processing_pipeline/templates/tellread.sbatch create mode 100755 sequence_processing_pipeline/templates/tellread.sh diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py new file mode 100644 index 00000000..5d277609 --- /dev/null +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -0,0 +1,212 @@ +from os.path import join, exists +from sequence_processing_pipeline.Job import Job +from sequence_processing_pipeline.PipelineError import (PipelineError, + JobFailedError) +import logging +import re + + +class TRConvertJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, + modules_to_load, qiita_job_id): + """ + TRConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param nprocs: The maximum number of parallel processes to use. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param bcl_tool_path: The path to either bcl2fastq or bcl-convert. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + """ + super().__init__(run_dir, + output_path, + 'TRConvertJob', + [bcl_tool_path], + 1000, + modules_to_load=modules_to_load) + + # for metagenomics pipelines, sample_sheet_path will reflect a real + # sample_sheet file. For amplicon pipelines, sample_sheet_path will + # reference a dummy sample_sheet file. 
+ self.sample_sheet_path = sample_sheet_path + self.queue_name = queue_name + self.node_count = node_count + self.nprocs = nprocs + self.wall_time_limit = wall_time_limit + self.pmem = pmem + self.bcl_tool = bcl_tool_path + self.qiita_job_id = qiita_job_id + self.job_script_path = join(self.output_path, f"{self.job_name}.sh") + self.suffix = 'fastq.gz' + + tmp = False + for executable_name in ['bcl2fastq', 'bcl-convert']: + if executable_name in self.bcl_tool: + tmp = True + break + + if not tmp: + raise PipelineError(f'{self.bcl_tool} is not the path to a known' + 'executable') + + self._file_check(self.sample_sheet_path) + + # As the sample-sheet is validated by the Pipeline object before + # being passed to TRConvertJob, additional validation isn't needed. + + self._generate_job_script() + + def _generate_job_script(self): + """ + Generate a Torque job script for processing supplied root_directory. + :return: The path to the newly-created job-script. + """ + lines = [] + + lines.append("#!/bin/bash") + lines.append(f"#SBATCH --job-name {self.qiita_job_id}_{self.job_name}") + lines.append(f"#SBATCH -p {self.queue_name}") + lines.append(f'#SBATCH -N {self.node_count}') + lines.append(f'#SBATCH -n {self.nprocs}') + lines.append("#SBATCH --time %d" % self.wall_time_limit) + + # send an email to the list of users defined below when a job starts, + # terminates, or aborts. This is used to confirm that the package's + # own reporting mechanism is reporting correctly. + lines.append("#SBATCH --mail-type=ALL") + + # list of users to be contacted independently of this package's + # notification system, when a job starts, terminates, or gets aborted. + lines.append("#SBATCH --mail-user qiita.help@gmail.com") + + lines.append(f"#SBATCH --mem-per-cpu {self.pmem}") + + lines.append("set -x") + lines.append('date') + lines.append('hostname') + lines.append(f'cd {self.root_dir}') + + if self.modules_to_load: + lines.append("module load " + ' '.join(self.modules_to_load)) + + # Assume that the bcl-convert tool is named 'bcl-convert' and choose + # accordingly. + if 'bcl-convert' in self.bcl_tool: + lines.append(('%s ' + '--sample-sheet "%s" ' + '--output-directory %s ' + '--bcl-input-directory . ' + '--bcl-num-decompression-threads 16 ' + '--bcl-num-conversion-threads 16 ' + '--bcl-num-compression-threads 16 ' + '--bcl-num-parallel-tiles 16 ' + '--bcl-sampleproject-subdirectories true ' + '--force') % (self.bcl_tool, + self.sample_sheet_path, + self.output_path)) + + # equivalent cp for bcl-conversion (see below) needed. + else: + lines.append(('%s ' + '--sample-sheet "%s" ' + '--minimum-trimmed-read-length 1 ' + '--mask-short-adapter-reads 1 ' + '-R . ' + '-o %s ' + '--loading-threads 16 ' + '--processing-threads 16 ' + '--writing-threads 16 ' + '--create-fastq-for-index-reads ' + '--ignore-missing-positions ') % + (self.bcl_tool, + self.sample_sheet_path, + self.output_path)) + + with open(self.job_script_path, 'w') as f: + for line in lines: + # remove long spaces in some lines. + line = re.sub(r'\s+', ' ', line) + f.write(f"{line}\n") + + def run(self, callback=None): + """ + Run BCL2Fastq/BCLConvert conversion + :param callback: optional function taking two parameters (id, status) + that is called when a running process's status is + changed. 
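+ A minimal sketch of such a callback (illustrative only, not
+ part of this module): callback=lambda jid, status: logging.info(f'{jid}: {status}')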
+ :return: + """ + try: + job_info = self.submit_job(self.job_script_path, + exec_from=self.log_path, + callback=callback) + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.info(f'Successful job: {job_info}') + + def parse_logs(self): + log_path = join(self.output_path, 'Logs') + errors = join(log_path, 'Errors.log') + + msgs = [] + + if not exists(errors): + # we do not raise an Error in this case because it's expected that + # parse_logs() will be called in response to an exceptional + # condition. + msgs.append(f"'{errors} does not exist") + + with open(errors, 'r') as f: + lines = f.readlines() + for line in [x.strip() for x in lines]: + msgs.append(line) + + return msgs + + @staticmethod + def parse_job_script(job_script_path): + # Returns run-directory and sample-sheet path from a job-script. + + if not exists(job_script_path): + raise ValueError(f"'{job_script_path}' is not a valid path") + + with open(job_script_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + + # As this code creates this file, we can expect it to be of a certain + # format. + if lines[0] != '#!/bin/bash': + raise ValueError(f"'{job_script_path}' is not a valid path") + + result = {} + + m = re.match('^cd (.*)$', lines[12]) + + if m: + result['run_directory'] = m.group(1) + else: + raise ValueError("could not detect run_directory in " + f"'{job_script_path}'") + + m = re.match('^bcl-convert --sample-sheet "(.*?)" ', lines[14]) + + if m: + result['sample_sheet_path'] = m.group(1) + else: + raise ValueError("could not detect sample-sheet path in " + f"'{job_script_path}'") + + return result diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch new file mode 100644 index 00000000..5d0e5015 --- /dev/null +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -0,0 +1,110 @@ +#!/bin/bash -l +#SBATCH -J cs-assemble +#SBATCH --time 24:00:00 +#SBATCH --mem 64gb +#SBATCH -N 1 +#SBATCH -c 12 +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH -p qiita + +### --gres=gpu:1 +source activate qiime2-2023.5 +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e + +echo $TMPDIR + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=${OUTPUT} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +mamba activate activate qiime2-2023.5 +module load gcc_9.3.0 + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) + +# assumes 1-based array index, eg --array 1-N +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +cs=${base}/cloudspades-isolate/${sample} + +if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${cs} ]]; then + rm -fr ${cs} + fi +fi + +#acs=${base}/cloudspades/${sample}-ariadne +#acscs=${acs}/assembled +mkdir -p ${cs} +#mkdir -p ${acs} +#mkdir -p ${acscs} + +pushd ~/spades-cloudspades-paper/assembler/ +./spades.py \ + -o ${cs} \ + --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 +module unload gcc_9.3.0 +popd + +mamba activate quast +quast \ + -o ${cs}/quast-scaffolds \ + -t ${SLURM_JOB_CPUS_PER_NODE} \ + ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 + +# remove intermediates that currently dont have a downstream use +if [[ -d ${cs}/K21 ]]; then + rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +fi + +#pushd $HOME/2023.08.29-ariadne/ariadne +#mamba activate ariadne-gcc8.5.0 +#OLD_LDD_LIBRARY_PATH=${LDD_LIBRARY_PATH} +#export LDD_LIBRARY_PATH=${HOME}/miniconda3/envs/ariadne-gcc8.5.0/include/:${LDD_LIBRARY_PATH} +## parameters from Lauren Mak, 9.25.23 +#./spades.py \ +# -o ${acs} \ +# --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ +# --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ +# --assembly-graph ${cs}/assembly_graph_with_scaffolds.gfa \ +# --meta \ +# --only-assembler \ +# --search-distance 5000 --size-cutoff 6 -k 55 \ +# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acs}/stdoutstderr.log 2>&1 +#export LDD_LIBRARY_PATH=${OLD_LDD_LIBRARY_PATH} +#popd +# +#pushd ~/spades-cloudspades-paper/assembler/ +#module load gcc_9.3.0 +#./spades.py \ +# -o ${acscs} \ +# --gemcode1-1 ${acs}/K55/5000.R1.fastq \ +# --gemcode1-2 ${acs}/K55/5000.R2.fastq \ +# --meta \ +# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acscs}/stdoutstderr.log 2>&1 +#module unload gcc_9.3.0 +#popd +# +#mamba activate quast +#quast \ +# -o ${acscs}/quast-scaffolds \ +# -t ${SLURM_JOB_CPUS_PER_NODE} \ +# ${acscs}/scaffolds.fasta > ${acscs}/quast-stdoutstderr.log 2>&1 diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch new file mode 100644 index 00000000..fbc30ae2 --- /dev/null +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -0,0 +1,115 @@ +#!/bin/bash -l +#SBATCH -J cs-assemble +#SBATCH --time 24:00:00 +#SBATCH --mem 128gb +#SBATCH -N 1 +#SBATCH -c 12 +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH -p qiita + +### --gres=gpu:1 +source activate qiime2-2023.5 +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e + +echo $TMPDIR + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=${OUTPUT} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +mamba activate activate qiime2-2023.5 +module load gcc_9.3.0 + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) + +# assumes 1-based array index, eg --array 1-N +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +cs=${base}/cloudspades/${sample} + +if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${cs} ]]; then + rm -fr ${cs} + fi +fi + +#acs=${base}/cloudspades/${sample}-ariadne +#acscs=${acs}/assembled +mkdir -p ${cs} +#mkdir -p ${acs} +#mkdir -p ${acscs} + +#pushd ~/spades-cloudspades-paper/assembler/ +#pushd /home/mcdonadt/cloudspades-0.1/spades-cloudspades-0.1/assembler/bin +#pushd /home/qiita/CHARLIE/TELLREAD/spades-cloudspades-0.1/assembler/bin +pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin + +./spades.py \ + -o ${cs} \ + --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + --meta \ + -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 +module unload gcc_9.3.0 +popd + +mamba activate quast +quast \ + -o ${cs}/quast-scaffolds \ + -t ${SLURM_JOB_CPUS_PER_NODE} \ + ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 + +# remove intermediates that currently dont have a downstream use +if [[ -d ${cs}/K21 ]]; then + rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +fi + +#pushd $HOME/2023.08.29-ariadne/ariadne +#mamba activate ariadne-gcc8.5.0 +#OLD_LDD_LIBRARY_PATH=${LDD_LIBRARY_PATH} +#export LDD_LIBRARY_PATH=${HOME}/miniconda3/envs/ariadne-gcc8.5.0/include/:${LDD_LIBRARY_PATH} +## parameters from Lauren Mak, 9.25.23 +#./spades.py \ +# -o ${acs} \ +# --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ +# --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ +# --assembly-graph ${cs}/assembly_graph_with_scaffolds.gfa \ +# --meta \ +# --only-assembler \ +# --search-distance 5000 --size-cutoff 6 -k 55 \ +# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acs}/stdoutstderr.log 2>&1 +#export LDD_LIBRARY_PATH=${OLD_LDD_LIBRARY_PATH} +#popd +# +#pushd ~/spades-cloudspades-paper/assembler/ +#module load gcc_9.3.0 +#./spades.py \ +# -o ${acscs} \ +# --gemcode1-1 ${acs}/K55/5000.R1.fastq \ +# --gemcode1-2 ${acs}/K55/5000.R2.fastq \ +# --meta \ +# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acscs}/stdoutstderr.log 2>&1 +#module unload gcc_9.3.0 +#popd +# +#mamba activate quast +#quast \ +# -o ${acscs}/quast-scaffolds \ +# -t ${SLURM_JOB_CPUS_PER_NODE} \ +# ${acscs}/scaffolds.fasta > ${acscs}/quast-stdoutstderr.log 2>&1 diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch new file mode 100644 index 00000000..4d7af5aa --- /dev/null +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -0,0 +1,120 @@ +#!/bin/bash -l +#SBATCH -J integrate +#SBATCH --time 24:00:00 +#SBATCH --mem 8gb +#SBATCH -N 1 +#SBATCH -c 1 +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH -p qiita + +source activate rust +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +# https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html +cores=${SLURM_CPUS_PER_TASK} + +if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then + echo "Not operating in an array" + exit 1 +fi + +if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then + echo "Line extraction assumes 1-based index" + exit 1 +fi + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +if [[ -z ${BASE} ]]; then + echo "BASE not specified" + exit 1 +fi + +tellread=${OUTPUT} +if [[ ! 
-d ${tellread} ]]; then + echo "${tellread} not found" + exit 1 +fi + +set -x +set -e +set -o pipefail + +samples=($(cat ${tellread}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR=$(mktemp -d) +function cleanup { + echo "Removing $TMPDIR" + rm -r $TMPDIR + unset TMPDIR +} +trap cleanup EXIT + +files=${TMPDIR}/integration.files +/bin/ls -1 ${tellread}/Full/*corrected.err_barcode_removed.fastq > ${files} +mkdir -p ${tellread}/integrated + +if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R1" + exit 1 +fi + +if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R2" + exit 1 +fi + +if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} I1" + exit 1 +fi + +r1=$(grep -m 1 "_R1_${sample}" ${files}) +r2=$(grep -m 1 "_R2_${sample}" ${files}) +i1=$(grep -m 1 "_I1_${sample}" ${files}) +r1out=${tellread}/integrated/${sample}.R1.fastq.gz +r2out=${tellread}/integrated/${sample}.R2.fastq.gz +i1out=${tellread}/integrated/${sample}.I1.fastq.gz + +if [[ ! -s ${r1} ]]; then + echo "${r1} is empty, cannot integrate" + if [[ -s ${r2} ]]; then + echo "R1 and R2 are inconsistent" + exit 1 + fi + if [[ -s ${i1} ]]; then + echo "R1 and I1 are inconsistent" + exit 1 + fi + + # reflect the empties so Qiita can know of them + touch ${r1out} + touch ${r2out} + touch ${i1out} + exit 0 +fi + +# this can probably be backgrounded but then you have to get creative to +# not mask a nonzero exit status (e.g., the python process raising) +cat ${i1} | gzip > ${i1out} + +mamba activate tellread-integrate +#python ${BASE}/integrate-indices-np.py integrate \ +python ${BASE}/integrate-indices-np.py integrate \ + --no-sort \ + --r1-in ${r1} \ + --r2-in ${r2} \ + --i1-in ${i1} \ + --r1-out ${r1out} \ + --r2-out ${r2out} \ + --threads ${cores} diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch new file mode 100644 index 00000000..9f778757 --- /dev/null +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -0,0 +1,60 @@ +#!/bin/bash -l +#SBATCH --mem 160G +#SBATCH -N 1 +#SBATCH -c 16 +#SBATCH -t 96:00:00 +#SBATCH -J tellink-isolate +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH -p qiita + +set -x +set -e + +module load singularity_3.6.4 + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +#base=/panfs/dtmcdonald/${LABELTAG} +base=/panfs/qiita/TELLREAD/${LABELTAG} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +k=79 +lc=35 +cores=${SLURM_CPUS_PER_TASK} + +tl=${base}/tell-link-isolate/${sample} +if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${tl} ]]; then + rm -fr ${tl} + fi +fi + +mkdir -p ${tl} + +HOME_PATH=/projects/long_read_collab/code/tellseq/release_v1.11/ +${HOME_PATH}/tellink-release/run_tellink_sing.sh \ + -r1 ${base}/integrated/${sample}.R1.fastq.gz \ + -r2 ${base}/integrated/${sample}.R2.fastq.gz \ + -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -k ${k} \ + -lc ${lc} \ + -p ${sample} \ + -j ${cores} + +# remove temporary data +if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +fi diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch new file mode 100644 index 00000000..64a69072 --- /dev/null +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -0,0 +1,61 @@ +#!/bin/bash -l +#SBATCH --mem 160G +#SBATCH -N 1 +#SBATCH -c 16 +#SBATCH -t 96:00:00 +#SBATCH -J tellink +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH -p qiita + +set -x +set -e + +module load singularity_3.6.4 + +if [[ -z "${LABELTAG}" ]]; then + echo "LABEL is not specified" + exit 1 +fi + +base=/panfs/${USER}/${LABELTAG} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +k=79 +lc=35 +cores=${SLURM_CPUS_PER_TASK} + +tl=${base}/tell-link/${sample} +if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${tl} ]]; then + rm -fr ${tl} + fi +fi + +mkdir -p ${tl} + +HOME_PATH=/projects/long_read_collab/code/tellseq/release_v1.11/ +${HOME_PATH}/tellink-release/run_tellink_sing.sh \ + -r1 ${base}/integrated/${sample}.R1.fastq.gz \ + -r2 ${base}/integrated/${sample}.R2.fastq.gz \ + -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -d metagenomics \ + -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -k ${k} \ + -lc ${lc} \ + -p ${sample} \ + -j ${cores} + +# remove temporary data +if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +fi + diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch new file mode 100644 index 00000000..a8808822 --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -0,0 +1,19 @@ +#!/bin/bash -l +#SBATCH -J cleanup +#SBATCH --time 24:00:00 +#SBATCH --mem 8gb +#SBATCH -N 1 +#SBATCH -c 1 +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=FAIL +#SBATCH --output %x-%A.out +#SBATCH --error %x-%A.err +#SBATCH -p qiita + +if [[ -z "${OUTPUT}" ]]; then + echo "OUTPUT is not specified" + exit 1 +fi + +# remove unused large outputs +rm -fr ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch new file mode 100644 index 00000000..be5ef9e7 --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -0,0 +1,105 @@ +#!/bin/bash -l + +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH --mem 16G +#SBATCH --partition=short +#SBATCH -t 96:00:00 +#SBATCH -J tellread +#SBATCH --output 
%x-%A.out +#SBATCH --error %x-%A.err +#SBATCH --mail-user=qiita.help@gmail.com +#SBATCH --mail-type=BEGIN,FAIL +#SBATCH -p qiita + +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x + +if [[ -z "${N_SAMPLES}" ]]; then + echo "N_SAMPLES is not specified" + exit 1 +fi + +if [[ -z "${SEQRUNPATH}" ]]; then + echo "SEQRUNPATH is not specified" + exit 1 +fi + +if [[ -z "${LANE}" ]]; then + echo "LANE is not specified" + exit 1 +fi + +if [[ -z "${SAMPLES}" ]]; then + echo "SAMPLES is not specified" + exit 1 +fi + +if [[ -z "${REFS}" ]]; then + echo "REFS is not specified" + exit 1 +fi + +if [[ -z "${OUTPUT}" ]]; then + echo "OUTPUT is not specified" + exit 1 +fi + +export TMPDIR=/panfs/${USER}/tmp +mkdir -p ${TMPDIR} +export TMPDIR=$(mktemp -d) +seqrun_path=${SEQRUNPATH} + +if [[ ${LANE} == "L001" ]]; then + lane=s_1 +elif [[ ${LANE} == "L002" ]]; then + lane=s_2 +elif [[ ${LANE} == "L003" ]]; then + lane=s_3 +elif [[ ${LANE} == "L004" ]]; then + lane=s_4 +elif [[ ${LANE} == "L005" ]]; then + lane=s_5 +elif [[ ${LANE} == "L006" ]]; then + lane=s_6 +elif [[ ${LANE} == "L007" ]]; then + lane=s_7 +elif [[ ${LANE} == "L008" ]]; then + lane=s_8 +else + echo "Unrecognized lane: ${LANE}" + exit 1 +fi + +# yes, hard coded, not great but progress. +extra="" +if [[ ! -z ${REFBASE} ]]; then + extra="-f ${REFBASE}" +fi + +mkdir -p ${OUTPUT} + +module load singularity_3.6.4 +$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ + -i ${seqrun_path} \ + -o ${OUTPUT} \ + -s $(echo ${SAMPLES} | tr -d '"') \ + -g $(echo ${REFS} | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} \ + ${extra} \ + -l ${lane} + + +if [[ -d ${OUTPUT}/Full ]]; then + echo "Run appears successful" +elif [[ -d ${OUTPUT}/1_demult/Full ]]; then + echo "Run appears unsuccessful but has output" + exit 1 +else + echo "Run appears unsuccessful" + exit 1 +fi diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh new file mode 100755 index 00000000..78b8862a --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +script_name=${0##*/} + +function help () { + echo "Submit for TELL-read" + echo "" + echo "Usage: ${script_name} -s -l [-r reference_map] [-b reference_base]" + echo "" + echo -e "\t-s\tPath to the sequencing run." + echo -e "\t-i\tThe sample sheet." + echo -e "\t-l\tThe lane to process." + echo -e "\t-r\tA file specifying reference genomes to use [OPTIONAL]" + echo -e "\t-b\tReference genome base directory [OPTIONAL]" + echo -e "\t-m\tMode, isolate or metagenomic [OPTIONAL]" + echo "" +} + +# references right now are only used for techdev + +# derived from https://www.redhat.com/sysadmin/arguments-options-bash-scripts +while getopts "hs:i:l:r:b:m:" option; do + case ${option} in + h) + help + exit;; + s) seqrunpath=${OPTARG};; + l) lane=${OPTARG};; + r) reference_map=${OPTARG};; + b) reference_base=${OPTARG};; + m) mode=${OPTARG};; + \?) + echo "Error: Invalid option" + exit;; + *) + echo "Error: Invalid option" + exit;; + esac +done + +# nifty +# https://unix.stackexchange.com/a/621007 +: ${seqrunpath:?Missing -s} +: ${lane:?Missing -i} + +if [[ ! -z ${reference_map} || ! -z ${reference_base} ]]; then + if [[ -z ${reference_map} ]]; then + echo "-b used without -r" + exit 1 + fi + if [[ -z ${reference_base} ]]; then + echo "-r used without -b" + exit 1 + fi + if [[ ! 
-d ${reference_base} ]]; then + echo "reference base not found" + exit 1 + fi + + tag=reference-based +else + tag=reference-free +fi + +samplesheet="/home/qiita_test/qiita-spots/tellread_mapping.csv" + +# trim trailing slash +# https://stackoverflow.com/a/32845647/19741 +safepath=$(echo ${seqrunpath} | sed 's:/*$::') +label=$(basename ${safepath}) +labeltag=${label}-${tag} +output=/panfs/${USER}/${labeltag} + +if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then + echo "Cannot access the lane" + exit 1 +fi + +if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then + echo "FOO" + sbatch_cores=2 + sbatch_mem=8G + norm=TRUE + wall=24:00:00 + mode=NA +elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then + echo "BAR" + sbatch_cores=2 + sbatch_mem=8G + norm=TRUE + wall=24:00:00 + mode=NA +else + echo "BAZ" + sbatch_cores=16 + sbatch_mem=160G + norm=FALSE + assemble=TRUE + wall=48:00:00 +fi + +if [[ ${mode} == "isolate" ]]; then + ISOLATE_MODE=TRUE +elif [[ ${mode} == "metagenomic" ]]; then + ISOLATE_MODE=FALSE +elif [[ ${mode} == "NA" ]]; then + ISOLATE_MODE=FALSE +else + echo "unknown mode: ${mode}" + exit 1 +fi + +set -e +set -o pipefail + +declare -a s +declare -a g +# below extended regex might be broken because C5\d\d happens in column 0, not column 1 +# of the hacked sample-sheet. +#for sample in $(egrep -o ",C5[0-9][0-9]," ${samplesheet} | tr -d "," | sort) +for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) +do + echo "SAMPLE: ${sample}" + # get references if they exist + if [[ -f ${reference_map} ]]; then + if $(grep -Fq ${sample} ${reference_map}); then + ref=$(grep -m 1 ${sample} ${reference_map} | cut -f 2 -d"," | tr -d "\n") + if [[ ${ref} != "NONE" ]]; then + if [[ ! -d "${reference_base}/${ref}" ]]; then + echo "${reference_base}/${ref}" + echo "${ref} not found" + exit 1 + fi + g[${#g[@]}]=${ref} + s[${#s[@]}]=${sample} + fi + fi + else + g[${#g[@]}]=NONE + s[${#s[@]}]=${sample} + fi +done +n_samples=${#s[@]} + +echo "Submitting:" +echo "S: ${s[@]}" +echo "G: ${g[@]}" + +# https://stackoverflow.com/a/17841619/19741 +function join_by { local IFS="$1"; shift; echo "$*"; } +s=$(join_by , "${s[@]}") +g=$(join_by , "${g[@]}") + +base=$(dirname ${0}) +submit_script=$(dirname ${0})/tellread.sbatch +integrate_script=$(dirname ${0})/integrate.sbatch +norm_script=$(dirname ${0})/compute_sequence_counts_for_normalization.sbatch +asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch +clean_script=$(dirname ${0})/tellread-cleanup.sbatch + +if [[ ${ISOLATE_MODE} == "TRUE" ]]; then + asm_tellink_script=$(dirname ${0})/telllink-isolate.sbatch + asm_cloudspades_script=$(dirname ${0})/cloudspades-isolate.sbatch +else + asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch + asm_tellink_script=$(dirname ${0})/telllink.sbatch +fi + +if [[ ! -f ${submit_script} ]]; then + echo "Cannot access submit script" + exit 1 +fi +if [[ ! -f ${asm_cloudspades_script} ]]; then + echo "Cannot access cloudspades assembly script" + exit 1 +fi +if [[ ! -f ${asm_tellink_script} ]]; then + echo "Cannot access tell-link assembly script" + exit 1 +fi +if [[ ! -f ${integrate_script} ]]; then + echo "Cannot access integrate script" + exit 1 +fi +if [[ ! 
-f ${clean_script} ]]; then + echo "Cannot access clean script" + exit 1 +fi + +datetag=$(date "+%Y.%m.%d") +scriptcopy=$(pwd)/tellread_script-${datetag}.sh +submitcopy=$(pwd)/tellread_submission-${datetag}.sbatch +asmcscopy=$(pwd)/assembly_submission_cloudspades-${datetag}.sbatch +asmtlcopy=$(pwd)/assembly_submission_tell-link-${datetag}.sbatch +normcopy=$(pwd)/norm_submission-${datetag}.sbatch +intcopy=$(pwd)/integrate_submission-${datetag}.sbatch +cleancopy=$(pwd)/tellread-cleanup-${datetag}.sbatch +arguments=$(pwd)/provided_script_arguments.txt +if [[ -f ${scriptcopy} ]]; then + echo "Existing script copy ${scriptcopy} found, not overwriting, delete to resubmit" + exit 1 +fi +if [[ -f ${submitcopy} ]]; then + echo "Existing submission ${submitcopy} found, not overwriting, delete to resubmit" + exit 1 +fi + +# CHARLIE +echo $@ > ${arguments} +cp ${0} ${scriptcopy} +cp ${submit_script} ${submitcopy} +cp ${asm_cloudspades_script} ${asmcscopy} +cp ${asm_tellink_script} ${asmtlcopy} +cp ${integrate_script} ${intcopy} +cp ${clean_script} ${cleancopy} +chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cleancopy} + +set -x + +echo "C" + +trjob=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag} \ + -c ${sbatch_cores} \ + --mem ${sbatch_mem} \ + --time ${wall} \ + --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ + ${submit_script}) + +echo "D" + +if [[ ${norm} == "TRUE" ]]; then + cp ${norm_script} ${normcopy} + chmod gou-w ${normcopy} + norm_counts_job=$(sbatch \ + --parsable \ + --dependency=afterok:${trjob} \ + -J ${labeltag}-${datetag}-norm-counts \ + --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ + ${norm_script}) +fi + +echo "E" +integrate_job=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag}-integrate \ + --dependency=afterok:${trjob} \ + --array 1-${n_samples} \ + --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ + ${integrate_script}) + +if [[ ${assemble} == "TRUE" ]]; then + csj=$(sbatch \ + --parsable \ + --dependency=aftercorr:${integrate_job} \ + -J ${labeltag}-${datetag}-cloudspades \ + --array 1-${n_samples} \ + --export LABELTAG=${labeltag},OUTPUT=${output} \ + ${asm_cloudspades_script}) + tlj=$(sbatch \ + --parsable \ + --dependency=aftercorr:${integrate_job} \ + -J ${labeltag}-${datetag}-tell-link \ + --array 1-${n_samples} \ + --export LABELTAG=${labeltag},OUTPUT=${output} \ + ${asm_tellink_script}) + cleanupdep=${csj}:${tlj} +else + cleanupdep=${integrate_job} + echo "Not assembling" +fi + +cleanup=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag}-cleanup \ + --dependency=afterok:${cleanupdep} \ + --export OUTPUT=${output} \ + ${clean_script}) From 3406cbfa1de9f946c774c981efca085239015fb8 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 16:47:54 -0700 Subject: [PATCH 02/47] initial cleanup --- .../templates/cloudspades-isolate.sbatch | 39 ----------------- .../templates/cloudspades.sbatch | 43 ------------------- .../templates/integrate.sbatch | 1 - .../templates/telllink-isolate.sbatch | 1 - .../templates/tellread.sh | 5 +-- 5 files changed, 1 insertion(+), 88 deletions(-) diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 5d0e5015..cf18a094 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ 
b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -10,7 +10,6 @@ #SBATCH --mail-type=FAIL #SBATCH -p qiita -### --gres=gpu:1 source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -49,11 +48,7 @@ if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then fi fi -#acs=${base}/cloudspades/${sample}-ariadne -#acscs=${acs}/assembled mkdir -p ${cs} -#mkdir -p ${acs} -#mkdir -p ${acscs} pushd ~/spades-cloudspades-paper/assembler/ ./spades.py \ @@ -74,37 +69,3 @@ quast \ if [[ -d ${cs}/K21 ]]; then rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp fi - -#pushd $HOME/2023.08.29-ariadne/ariadne -#mamba activate ariadne-gcc8.5.0 -#OLD_LDD_LIBRARY_PATH=${LDD_LIBRARY_PATH} -#export LDD_LIBRARY_PATH=${HOME}/miniconda3/envs/ariadne-gcc8.5.0/include/:${LDD_LIBRARY_PATH} -## parameters from Lauren Mak, 9.25.23 -#./spades.py \ -# -o ${acs} \ -# --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ -# --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ -# --assembly-graph ${cs}/assembly_graph_with_scaffolds.gfa \ -# --meta \ -# --only-assembler \ -# --search-distance 5000 --size-cutoff 6 -k 55 \ -# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acs}/stdoutstderr.log 2>&1 -#export LDD_LIBRARY_PATH=${OLD_LDD_LIBRARY_PATH} -#popd -# -#pushd ~/spades-cloudspades-paper/assembler/ -#module load gcc_9.3.0 -#./spades.py \ -# -o ${acscs} \ -# --gemcode1-1 ${acs}/K55/5000.R1.fastq \ -# --gemcode1-2 ${acs}/K55/5000.R2.fastq \ -# --meta \ -# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acscs}/stdoutstderr.log 2>&1 -#module unload gcc_9.3.0 -#popd -# -#mamba activate quast -#quast \ -# -o ${acscs}/quast-scaffolds \ -# -t ${SLURM_JOB_CPUS_PER_NODE} \ -# ${acscs}/scaffolds.fasta > ${acscs}/quast-stdoutstderr.log 2>&1 diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index fbc30ae2..f80f6626 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -10,7 +10,6 @@ #SBATCH --mail-type=FAIL #SBATCH -p qiita -### --gres=gpu:1 source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -49,15 +48,7 @@ if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then fi fi -#acs=${base}/cloudspades/${sample}-ariadne -#acscs=${acs}/assembled mkdir -p ${cs} -#mkdir -p ${acs} -#mkdir -p ${acscs} - -#pushd ~/spades-cloudspades-paper/assembler/ -#pushd /home/mcdonadt/cloudspades-0.1/spades-cloudspades-0.1/assembler/bin -#pushd /home/qiita/CHARLIE/TELLREAD/spades-cloudspades-0.1/assembler/bin pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin ./spades.py \ @@ -79,37 +70,3 @@ quast \ if [[ -d ${cs}/K21 ]]; then rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp fi - -#pushd $HOME/2023.08.29-ariadne/ariadne -#mamba activate ariadne-gcc8.5.0 -#OLD_LDD_LIBRARY_PATH=${LDD_LIBRARY_PATH} -#export LDD_LIBRARY_PATH=${HOME}/miniconda3/envs/ariadne-gcc8.5.0/include/:${LDD_LIBRARY_PATH} -## parameters from Lauren Mak, 9.25.23 -#./spades.py \ -# -o ${acs} \ -# --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ -# --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ -# --assembly-graph ${cs}/assembly_graph_with_scaffolds.gfa \ -# --meta \ -# --only-assembler \ -# --search-distance 5000 --size-cutoff 6 -k 55 \ -# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acs}/stdoutstderr.log 2>&1 -#export LDD_LIBRARY_PATH=${OLD_LDD_LIBRARY_PATH} -#popd -# -#pushd ~/spades-cloudspades-paper/assembler/ -#module load gcc_9.3.0 -#./spades.py \ -# -o ${acscs} \ -# --gemcode1-1 ${acs}/K55/5000.R1.fastq \ -# --gemcode1-2 ${acs}/K55/5000.R2.fastq \ -# --meta \ -# -t ${SLURM_JOB_CPUS_PER_NODE} > ${acscs}/stdoutstderr.log 2>&1 -#module unload gcc_9.3.0 -#popd -# -#mamba activate quast -#quast \ -# -o ${acscs}/quast-scaffolds \ -# -t ${SLURM_JOB_CPUS_PER_NODE} \ -# ${acscs}/scaffolds.fasta > ${acscs}/quast-stdoutstderr.log 2>&1 diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 4d7af5aa..acdf1224 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -109,7 +109,6 @@ fi cat ${i1} | gzip > ${i1out} mamba activate tellread-integrate -#python ${BASE}/integrate-indices-np.py integrate \ python ${BASE}/integrate-indices-np.py integrate \ --no-sort \ --r1-in ${r1} \ diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index 9f778757..85d061c2 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -20,7 +20,6 @@ if [[ -z "${LABELTAG}" ]]; then exit 1 fi -#base=/panfs/dtmcdonald/${LABELTAG} base=/panfs/qiita/TELLREAD/${LABELTAG} if [[ ! -d ${base} ]]; then echo "${base} not found" diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index 78b8862a..628f1e99 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -38,7 +38,6 @@ while getopts "hs:i:l:r:b:m:" option; do esac done -# nifty # https://unix.stackexchange.com/a/621007 : ${seqrunpath:?Missing -s} : ${lane:?Missing -i} @@ -116,8 +115,7 @@ set -o pipefail declare -a s declare -a g # below extended regex might be broken because C5\d\d happens in column 0, not column 1 -# of the hacked sample-sheet. -#for sample in $(egrep -o ",C5[0-9][0-9]," ${samplesheet} | tr -d "," | sort) +# of the hacked sample-sheet. 
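+# Hypothetical illustration of the pattern below (sample-sheet row format assumed):
+# a row such as 'C501,NONE' is reduced by egrep -o "^C5.*," | tr -d "," to 'C501',
+# while a row whose C5xx identifier is not at the start of the line is skipped.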
for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) do echo "SAMPLE: ${sample}" @@ -205,7 +203,6 @@ if [[ -f ${submitcopy} ]]; then exit 1 fi -# CHARLIE echo $@ > ${arguments} cp ${0} ${scriptcopy} cp ${submit_script} ${submitcopy} From c5540f78ced5b44f90c97def11e7ffdf61ce0166 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 20:33:23 -0700 Subject: [PATCH 03/47] first pass at converting TELLREAD scripts --- sequence_processing_pipeline/TRConvertJob.py | 68 +++++++++++++++++++ .../templates/cloudspades-isolate.sbatch | 36 ++++++---- .../templates/cloudspades.sbatch | 21 +++--- .../templates/integrate.sbatch | 21 +++--- .../templates/telllink-isolate.sbatch | 25 ++++--- .../templates/telllink.sbatch | 25 ++++--- .../templates/tellread-cleanup.sbatch | 20 +++--- .../templates/tellread.sbatch | 25 ++++--- .../templates/tellread.sh | 64 ++++------------- 9 files changed, 184 insertions(+), 121 deletions(-) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 5d277609..81d6bda8 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -6,6 +6,74 @@ import re +tellread.sh +# {{CHARLIE_TELLREAD_MAP}} = samplesheet to telread.sh (-i option) must equal "/home/qiita_test/qiita-spots/tellread_mapping.csv" + +tellread.sbatch +#SBATCH -J {{job_name}} # tellread +#SBATCH -p {{queue_name}} # qiita +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 4 +#SBATCH --mem {{mem_in_gb}}G # 16G +#SBATCH --time {{wall_time_limit}} # 96:00:00 +{{CHARLIE_TMPDIR}} = /panfs/${USER}/tmp - replace with something in the work directory +{{CHARLIE_TELLREAD_SING_SCRIPT_PATH}} = $HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh +{{modules_to_load}} # singularity_3.6.4 + +tellink-isolate.sbatch +#SBATCH -J {{job_name}} # tellink-isolate +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 16 +#SBATCH --mem {{mem_in_gb}}G # 160G +#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -p {{queue_name}} # qiita + +{{TELLLINK_SING_PATH}}=/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh +{{modules_to_load}} # singularity_3.6.4 + +telllink.sbatch +#SBATCH -J {{job_name}} # tellink +#SBATCH --mem {{mem_in_gb}}G # 160G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 16 +#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -p {{queue_name}} # qiita +{{modules_to_load}} # singularity_3.6.4 +{{TELLLINK_SING_PATH}}=/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh + +integrate.sbatch (should this be renamed?) 
+#SBATCH -J {{job_name}} # integrate +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +cloudspades-isolate.sbatch: +#SBATCH -J {{job_name}} # cs-assemble +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 64G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 12 +#SBATCH -p {{queue_name}} # qiita + +module load {{modules_to_load}} # gcc_9.3.0 + +{{CHARLIE_SPADES_PATH}} = ~/spades-cloudspades-paper/assembler/spades.py + + +tellread-cleanup.sbatch +#SBATCH -J {{job_name}} # cleanup +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + + + + + class TRConvertJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index cf18a094..390a7f90 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -1,15 +1,20 @@ #!/bin/bash -l -#SBATCH -J cs-assemble -#SBATCH --time 24:00:00 -#SBATCH --mem 64gb -#SBATCH -N 1 -#SBATCH -c 12 +#SBATCH -J {{job_name}} # cs-assemble +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 64G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 12 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL -#SBATCH -p qiita +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -19,6 +24,8 @@ function logger () { set -x set -e +# this gets set in the environment from another script. For now let's +# run with that. echo $TMPDIR if [[ -z "${LABELTAG}" ]]; then @@ -32,8 +39,11 @@ if [[ ! -d ${base} ]]; then exit 1 fi +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. mamba activate activate qiime2-2023.5 -module load gcc_9.3.0 + +module load {{modules_to_load}} # gcc_9.3.0 samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) @@ -50,8 +60,7 @@ fi mkdir -p ${cs} -pushd ~/spades-cloudspades-paper/assembler/ -./spades.py \ +pushd {{CHARLIE_SPADES_PATH}} \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ @@ -59,7 +68,10 @@ pushd ~/spades-cloudspades-paper/assembler/ module unload gcc_9.3.0 popd -mamba activate quast +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. 
+mamba activate quast + quast \ -o ${cs}/quast-scaffolds \ -t ${SLURM_JOB_CPUS_PER_NODE} \ diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index f80f6626..a9f1ec45 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -1,15 +1,20 @@ #!/bin/bash -l -#SBATCH -J cs-assemble -#SBATCH --time 24:00:00 -#SBATCH --mem 128gb -#SBATCH -N 1 -#SBATCH -c 12 +#SBATCH -J {{job_name}} # cs-assemble +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 128G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 12 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL -#SBATCH -p qiita +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index acdf1224..787da4b2 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -1,15 +1,20 @@ #!/bin/bash -l -#SBATCH -J integrate -#SBATCH --time 24:00:00 -#SBATCH --mem 8gb -#SBATCH -N 1 -#SBATCH -c 1 -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL +#SBATCH -J {{job_name}} # integrate +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH -p qiita +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. source activate rust function logger () { echo "$(date) :: ${@}"; diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index 85d061c2..0f08c0a3 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -1,19 +1,23 @@ #!/bin/bash -l -#SBATCH --mem 160G -#SBATCH -N 1 -#SBATCH -c 16 -#SBATCH -t 96:00:00 -#SBATCH -J tellink-isolate +#SBATCH -J {{job_name}} # tellink-isolate +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 16 +#SBATCH --mem {{mem_in_gb}}G # 160G +#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL -#SBATCH -p qiita + +# for now comment these out as qiita is responsible for notifying users. 
+###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL set -x set -e -module load singularity_3.6.4 +module load {{modules_to_load}} # singularity_3.6.4 if [[ -z "${LABELTAG}" ]]; then echo "LABELTAG is not specified" @@ -42,8 +46,7 @@ fi mkdir -p ${tl} -HOME_PATH=/projects/long_read_collab/code/tellseq/release_v1.11/ -${HOME_PATH}/tellink-release/run_tellink_sing.sh \ +{{TELLLINK_SING_PATH}} \ -r1 ${base}/integrated/${sample}.R1.fastq.gz \ -r2 ${base}/integrated/${sample}.R2.fastq.gz \ -i1 ${base}/integrated/${sample}.I1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index 64a69072..591ac69d 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -1,19 +1,23 @@ #!/bin/bash -l -#SBATCH --mem 160G -#SBATCH -N 1 -#SBATCH -c 16 -#SBATCH -t 96:00:00 -#SBATCH -J tellink +#SBATCH -J {{job_name}} # tellink +#SBATCH --mem {{mem_in_gb}}G # 160G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 16 +#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -p {{queue_name}} # qiita + +# for now these can be left hard-coded. #SBATCH --output %x-%A_%a.out #SBATCH --error %x-%A_%a.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL -#SBATCH -p qiita + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL set -x set -e -module load singularity_3.6.4 +module load {{modules_to_load}} # singularity_3.6.4 if [[ -z "${LABELTAG}" ]]; then echo "LABEL is not specified" @@ -42,8 +46,7 @@ fi mkdir -p ${tl} -HOME_PATH=/projects/long_read_collab/code/tellseq/release_v1.11/ -${HOME_PATH}/tellink-release/run_tellink_sing.sh \ +{{TELLLINK_SING_PATH}} \ -r1 ${base}/integrated/${sample}.R1.fastq.gz \ -r2 ${base}/integrated/${sample}.R2.fastq.gz \ -i1 ${base}/integrated/${sample}.I1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index a8808822..f3388ef7 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -1,14 +1,18 @@ #!/bin/bash -l -#SBATCH -J cleanup -#SBATCH --time 24:00:00 -#SBATCH --mem 8gb -#SBATCH -N 1 -#SBATCH -c 1 -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=FAIL +#SBATCH -J {{job_name}} # cleanup +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=BEGIN,FAIL + +# for now these can be left hard-coded. 
#SBATCH --output %x-%A.out #SBATCH --error %x-%A.err -#SBATCH -p qiita if [[ -z "${OUTPUT}" ]]; then echo "OUTPUT is not specified" diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index be5ef9e7..800503f0 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -1,16 +1,19 @@ #!/bin/bash -l +#SBATCH -J {{job_name}} # tellread +#SBATCH -p {{queue_name}} # qiita +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 4 +#SBATCH --mem {{mem_in_gb}}G # 16G +#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -N 1 -#SBATCH -c 4 -#SBATCH --mem 16G +# for now these can be left hard-coded. #SBATCH --partition=short -#SBATCH -t 96:00:00 -#SBATCH -J tellread #SBATCH --output %x-%A.out #SBATCH --error %x-%A.err -#SBATCH --mail-user=qiita.help@gmail.com -#SBATCH --mail-type=BEGIN,FAIL -#SBATCH -p qiita + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=BEGIN,FAIL function logger () { echo "$(date) :: ${@}"; @@ -49,7 +52,7 @@ if [[ -z "${OUTPUT}" ]]; then exit 1 fi -export TMPDIR=/panfs/${USER}/tmp +export TMPDIR={{CHARLIE_TMPDIR}} mkdir -p ${TMPDIR} export TMPDIR=$(mktemp -d) seqrun_path=${SEQRUNPATH} @@ -83,8 +86,8 @@ fi mkdir -p ${OUTPUT} -module load singularity_3.6.4 -$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ +module load {{modules_to_load}} # singularity_3.6.4 +{{CHARLIE_TELLREAD_SING_SCRIPT_PATH}} \ -i ${seqrun_path} \ -o ${OUTPUT} \ -s $(echo ${SAMPLES} | tr -d '"') \ diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index 628f1e99..ffaf726e 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -1,42 +1,13 @@ #!/bin/bash +samplesheet={{CHARLIE_TELLREAD_MAP}} # previously -i option +seqrunpath={{CHARLIE_SEQRUNPATH}} # previously -s option +lane={{CHARLIE_LANE}} # previously -l option +reference_map={{CHARLIE_REFERENCE_MAP}} # previously -r option +reference_base={{CHARLIE_REFERENCE_BASE}} # previously -b option +mode={{CHARLIE_MODE}} $ # previously -m option -script_name=${0##*/} - -function help () { - echo "Submit for TELL-read" - echo "" - echo "Usage: ${script_name} -s -l [-r reference_map] [-b reference_base]" - echo "" - echo -e "\t-s\tPath to the sequencing run." - echo -e "\t-i\tThe sample sheet." - echo -e "\t-l\tThe lane to process." - echo -e "\t-r\tA file specifying reference genomes to use [OPTIONAL]" - echo -e "\t-b\tReference genome base directory [OPTIONAL]" - echo -e "\t-m\tMode, isolate or metagenomic [OPTIONAL]" - echo "" -} - -# references right now are only used for techdev - -# derived from https://www.redhat.com/sysadmin/arguments-options-bash-scripts -while getopts "hs:i:l:r:b:m:" option; do - case ${option} in - h) - help - exit;; - s) seqrunpath=${OPTARG};; - l) lane=${OPTARG};; - r) reference_map=${OPTARG};; - b) reference_base=${OPTARG};; - m) mode=${OPTARG};; - \?) - echo "Error: Invalid option" - exit;; - *) - echo "Error: Invalid option" - exit;; - esac -done +# preserve error-checking of parameters to preserve as much of the original +# script as possible, even though this could be done in python. 
# https://unix.stackexchange.com/a/621007 : ${seqrunpath:?Missing -s} @@ -61,8 +32,6 @@ else tag=reference-free fi -samplesheet="/home/qiita_test/qiita-spots/tellread_mapping.csv" - # trim trailing slash # https://stackoverflow.com/a/32845647/19741 safepath=$(echo ${seqrunpath} | sed 's:/*$::') @@ -75,22 +44,22 @@ if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then exit 1 fi +# for now this can stay here to keep greater compatibility with the original script. +# however these fields should eventually be parameters that can be configured in the config file. + if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then - echo "FOO" sbatch_cores=2 sbatch_mem=8G norm=TRUE wall=24:00:00 mode=NA elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then - echo "BAR" sbatch_cores=2 sbatch_mem=8G norm=TRUE wall=24:00:00 mode=NA else - echo "BAZ" sbatch_cores=16 sbatch_mem=160G norm=FALSE @@ -118,7 +87,7 @@ declare -a g # of the hacked sample-sheet. for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) do - echo "SAMPLE: ${sample}" + echo "sample found: ${sample}" # get references if they exist if [[ -f ${reference_map} ]]; then if $(grep -Fq ${sample} ${reference_map}); then @@ -140,10 +109,6 @@ do done n_samples=${#s[@]} -echo "Submitting:" -echo "S: ${s[@]}" -echo "G: ${g[@]}" - # https://stackoverflow.com/a/17841619/19741 function join_by { local IFS="$1"; shift; echo "$*"; } s=$(join_by , "${s[@]}") @@ -214,8 +179,6 @@ chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cle set -x -echo "C" - trjob=$(sbatch \ --parsable \ -J ${labeltag}-${datetag} \ @@ -225,8 +188,6 @@ trjob=$(sbatch \ --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ ${submit_script}) -echo "D" - if [[ ${norm} == "TRUE" ]]; then cp ${norm_script} ${normcopy} chmod gou-w ${normcopy} @@ -238,7 +199,6 @@ if [[ ${norm} == "TRUE" ]]; then ${norm_script}) fi -echo "E" integrate_job=$(sbatch \ --parsable \ -J ${labeltag}-${datetag}-integrate \ From 5bac0ffd2793140136fe04c22c5d2f9e92c46310 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 22:14:28 -0700 Subject: [PATCH 04/47] Second pass at integrating tellread scripts --- sequence_processing_pipeline/TRConvertJob.py | 283 ++++++++++++++---- .../templates/cloudspades-isolate.sbatch | 2 +- .../templates/cloudspades.sbatch | 6 +- .../templates/integrate.sbatch | 1 + .../templates/telllink-isolate.sbatch | 2 +- .../templates/telllink.sbatch | 3 +- .../templates/tellread-cleanup.sbatch | 2 +- .../templates/tellread.sbatch | 4 +- .../templates/tellread.sh | 12 +- 9 files changed, 248 insertions(+), 67 deletions(-) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 81d6bda8..f5250139 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,74 +1,56 @@ -from os.path import join, exists +from jinja2 import BaseLoader, TemplateNotFound +from metapool import load_sample_sheet +from os import stat, makedirs, rename +from os.path import join, basename, dirname, exists, abspath, getmtime from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) +from sequence_processing_pipeline.Pipeline import Pipeline +from shutil import move import logging +from sequence_processing_pipeline.Commands import split_similar_size_bins 
+from sequence_processing_pipeline.util import iter_paired_files +from jinja2 import Environment +import glob import re +from sys import executable +import pathlib + + +# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader +class KISSLoader(BaseLoader): + def __init__(self, path): + # pin the path for loader to the location sequence_processing_pipeline + # (the location of this file), along w/the relative path to the + # templates directory. + self.path = join(pathlib.Path(__file__).parent.resolve(), path) + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) -tellread.sh -# {{CHARLIE_TELLREAD_MAP}} = samplesheet to telread.sh (-i option) must equal "/home/qiita_test/qiita-spots/tellread_mapping.csv" -tellread.sbatch -#SBATCH -J {{job_name}} # tellread -#SBATCH -p {{queue_name}} # qiita -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 4 -#SBATCH --mem {{mem_in_gb}}G # 16G -#SBATCH --time {{wall_time_limit}} # 96:00:00 -{{CHARLIE_TMPDIR}} = /panfs/${USER}/tmp - replace with something in the work directory -{{CHARLIE_TELLREAD_SING_SCRIPT_PATH}} = $HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh -{{modules_to_load}} # singularity_3.6.4 -tellink-isolate.sbatch -#SBATCH -J {{job_name}} # tellink-isolate -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 16 -#SBATCH --mem {{mem_in_gb}}G # 160G -#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -p {{queue_name}} # qiita -{{TELLLINK_SING_PATH}}=/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh -{{modules_to_load}} # singularity_3.6.4 +from os.path import join, exists +from sequence_processing_pipeline.Job import Job +from sequence_processing_pipeline.PipelineError import (PipelineError, + JobFailedError) +import logging +import re + -telllink.sbatch -#SBATCH -J {{job_name}} # tellink -#SBATCH --mem {{mem_in_gb}}G # 160G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 16 -#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -p {{queue_name}} # qiita -{{modules_to_load}} # singularity_3.6.4 -{{TELLLINK_SING_PATH}}=/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh -integrate.sbatch (should this be renamed?) 
-#SBATCH -J {{job_name}} # integrate -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 8G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 1 -#SBATCH -p {{queue_name}} # qiita -cloudspades-isolate.sbatch: -#SBATCH -J {{job_name}} # cs-assemble -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 64G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 12 -#SBATCH -p {{queue_name}} # qiita -module load {{modules_to_load}} # gcc_9.3.0 -{{CHARLIE_SPADES_PATH}} = ~/spades-cloudspades-paper/assembler/spades.py -tellread-cleanup.sbatch -#SBATCH -J {{job_name}} # cleanup -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 8G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 1 -#SBATCH -p {{queue_name}} # qiita @@ -113,6 +95,11 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.job_script_path = join(self.output_path, f"{self.job_name}.sh") self.suffix = 'fastq.gz' + # for projects that use sequence_processing_pipeline as a dependency, + # jinja_env must be set to sequence_processing_pipeline's root path, + # rather than the project's root path. + self.jinja_env = Environment(loader=KISSLoader('templates')) + tmp = False for executable_name in ['bcl2fastq', 'bcl-convert']: if executable_name in self.bcl_tool: @@ -130,6 +117,194 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self._generate_job_script() + def _generate_script_one(self): + template = self.jinja_env.get_template("tellread.sh") + + tellread_map = "/home/qiita_test/qiita-spots/tellread_mapping.csv" + seqrun_path = "/sequencing/igm_runs/240216_LH00444_0058_A22357VLT4" + lane = 'L008' + reference_map = "" + reference_base = "" + mode = "metagenomic" + + return template.render(tellread_map=tellread_map, + seqrun_path=seqrun_path, + lane=lane, + reference_map=reference_map, + reference_base=reference_base, + mode=mode) + + def _generate_script_two(self): + template = self.jinja_env.get_template("tellread-cleanup.sbatch") + + job_name = "cleanup" + wall_time_limit = "24:00:00" + mem_in_gb = "8" + node_count = "1" + cores_per_task = "1" + queue_name = "qiita" + + return template.render(job_name=job_name, + wall_time_limit=wall_time_limit, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + queue_name=queue_name) + + def _generate_script_three(self): + template = self.jinja_env.get_template("tellread.sbatch") + job_name = "tellread" + wall_time_limit = "96:00:00" + mem_in_gb = "16" + node_count = "1" + cores_per_task = "4" + queue_name = "qiita" + tellread_sbatch_tmp_dir = "/panfs/${USER}/tmp" + tr_sing_script_path = "$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh" + modules_to_load = ["singularity_3.6.4"] + + return template.render(job_name=job_name, + wall_time_limit=wall_time_limit, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + queue_name=queue_name, + tmp_dir=tellread_sbatch_tmp_dir, + sing_script_path=tr_sing_script_path, + modules_to_load=' '.join(modules_to_load)) + + def _generate_script_four(self): + template = self.jinja_env.get_template("telllink-isolate.sbatch") + + job_name = "tellink-isolate" + wall_time_limit = "96:00:00" + node_count = "1" + cores_per_task = "16" + mem_in_gb = "160" + queue_name = "qiita" + modules_to_load = ["singularity_3.6.4"] + sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" + + return 
template.render(job_name=job_name, + wall_time_limit=wall_time_limit, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + queue_name=queue_name, + modules_to_load=' '.join(modules_to_load), + sing_path=sing_path) + + def _generate_script_five(self): + template = self.jinja_env.get_template("telllink.sbatch") + + job_name = "tellink" + mem_in_gb = "160" + node_count = "1" + cores_per_task = "16" + wall_time_limit = "96:00:00" + queue_name = "qiita" + modules_to_load = ["singularity_3.6.4"] + sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" + + return template.render(job_name=job_name, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + wall_time_limit=wall_time_limit, + queue_name=queue_name, + modules_to_load=' '.join(modules_to_load), + sing_path=sing_path) + + def _generate_script_six(self): + template = self.jinja_env.get_template("integrate.sbatch") + + job_name = "integrate" + mem_in_gb = "8" + node_count = "1" + cores_per_task = "1" + wall_time_limit = "24:00:00" + queue_name = "qiita" + + return template.render(job_name=job_name, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + wall_time_limit=wall_time_limit, + queue_name=queue_name) + + def _generate_script_seven(self): + template = self.jinja_env.get_template("cloudspades-isolate.sbatch") + + job_name = "cs-assemble" + mem_in_gb = "64" + node_count = "1" + cores_per_task = "12" + wall_time_limit = "24:00:00" + queue_name = "qiita" + modules_to_load = ["gcc_9.3.0"] + spades_path = "~/spades-cloudspades-paper/assembler/spades.py" + + return template.render(job_name=job_name, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + wall_time_limit=wall_time_limit, + queue_name=queue_name, + modules_to_load=' '.join(modules_to_load), + spades_path=spades_path) + + def _generate_script_eight(self): + template = self.jinja_env.get_template("cloudspades.sbatch") + + job_name = "cs-assemble" + wall_time_limit = "24:00:00" + mem_in_gb = "128" + node_count = "1" + cores_per_task = "12" + queue_name = "qiita" + modules_to_load = ["gcc_9.3.0"] + spades_path = "TBD" # for now pass but don't use this spades_path var. + + return template.render(job_name=job_name, + mem_in_gb=mem_in_gb, + node_count=node_count, + cores_per_task=cores_per_task, + wall_time_limit=wall_time_limit, + queue_name=queue_name, + modules_to_load=' '.join(modules_to_load), + spades_path=spades_path) + + def _generate_job_scripts(self): + scripts = [ + { + "template": self.jinja_env.get_template("cloudspades.sbatch"), + "params": { + "job_name": "cs-assemble", + "wall_time_limit": "24:00:00", + "mem_in_gb": "128", + "node_count": "1", + "cores_per_task": "12", + "queue_name": "qiita", + "modules_to_load": ' '.join(["gcc_9.3.0"]), + "spades_path": "TBD" + } + + }, + {}, + {} + + ] + + for script in scripts: + template = self.jinja_env.get_template(script["template"]) + params = script["params"] + result = template.render(**params) + + + + + + def _generate_job_script(self): """ Generate a Torque job script for processing supplied root_directory. 
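[Reviewer note, not part of the patch] The KISSLoader/Environment pattern introduced above can be exercised outside of TRConvertJob with a few lines of Python. The sketch below mirrors the render loop in _generate_job_scripts(); it substitutes Jinja2's stock FileSystemLoader for KISSLoader (which differs only in pinning the search path relative to this package), and the template name, parameter values, and output path are illustrative only.

    # Minimal standalone sketch; assumes it is run from the repository root.
    # FileSystemLoader stands in here for the KISSLoader defined in this diff.
    from os.path import join
    from jinja2 import Environment, FileSystemLoader

    jinja_env = Environment(
        loader=FileSystemLoader('sequence_processing_pipeline/templates'))
    template = jinja_env.get_template('tellread-cleanup.sbatch')

    rendered = template.render(job_name='cleanup',
                               wall_time_limit='24:00:00',
                               mem_in_gb='8',
                               node_count='1',
                               cores_per_task='1',
                               queue_name='qiita')

    # write the rendered sbatch script somewhere inspectable
    with open(join('/tmp', 'tellread-cleanup.sbatch'), 'w') as f:
        f.write(rendered)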
diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 390a7f90..f8a2b000 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -60,7 +60,7 @@ fi mkdir -p ${cs} -pushd {{CHARLIE_SPADES_PATH}} \ +pushd {{spades_path}} \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index a9f1ec45..96673309 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -37,8 +37,11 @@ if [[ ! -d ${base} ]]; then exit 1 fi +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. mamba activate activate qiime2-2023.5 -module load gcc_9.3.0 + +module load {{modules_to_load}} # gcc_9.3.0 samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) @@ -56,6 +59,7 @@ fi mkdir -p ${cs} pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin +# for now don't use {{spades.py}} ./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 787da4b2..f4161466 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -21,6 +21,7 @@ function logger () { echo "$(date) :: ${@}" 1>&2; } + # https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html cores=${SLURM_CPUS_PER_TASK} diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index 0f08c0a3..f842cddf 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -46,7 +46,7 @@ fi mkdir -p ${tl} -{{TELLLINK_SING_PATH}} \ +{{sing_path}} \ -r1 ${base}/integrated/${sample}.R1.fastq.gz \ -r2 ${base}/integrated/${sample}.R2.fastq.gz \ -i1 ${base}/integrated/${sample}.I1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index 591ac69d..39daa383 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -33,6 +33,7 @@ fi samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} +# leave these hardcoded for now k=79 lc=35 cores=${SLURM_CPUS_PER_TASK} @@ -46,7 +47,7 @@ fi mkdir -p ${tl} -{{TELLLINK_SING_PATH}} \ +{{sing_path}} \ -r1 ${base}/integrated/${sample}.R1.fastq.gz \ -r2 ${base}/integrated/${sample}.R2.fastq.gz \ -i1 ${base}/integrated/${sample}.I1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index f3388ef7..d5edf855 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -20,4 +20,4 @@ if [[ -z "${OUTPUT}" ]]; then fi # remove unused large 
outputs -rm -fr ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full +rm -rf ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 800503f0..89633da9 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -52,7 +52,7 @@ if [[ -z "${OUTPUT}" ]]; then exit 1 fi -export TMPDIR={{CHARLIE_TMPDIR}} +export TMPDIR={{tmp_dir}} mkdir -p ${TMPDIR} export TMPDIR=$(mktemp -d) seqrun_path=${SEQRUNPATH} @@ -87,7 +87,7 @@ fi mkdir -p ${OUTPUT} module load {{modules_to_load}} # singularity_3.6.4 -{{CHARLIE_TELLREAD_SING_SCRIPT_PATH}} \ +{{sing_script_path}} \ -i ${seqrun_path} \ -o ${OUTPUT} \ -s $(echo ${SAMPLES} | tr -d '"') \ diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index ffaf726e..5584b6c0 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -1,10 +1,10 @@ #!/bin/bash -samplesheet={{CHARLIE_TELLREAD_MAP}} # previously -i option -seqrunpath={{CHARLIE_SEQRUNPATH}} # previously -s option -lane={{CHARLIE_LANE}} # previously -l option -reference_map={{CHARLIE_REFERENCE_MAP}} # previously -r option -reference_base={{CHARLIE_REFERENCE_BASE}} # previously -b option -mode={{CHARLIE_MODE}} $ # previously -m option +samplesheet={{tellread_map}} # previously -i option +seqrunpath={{seqrun_path}} # previously -s option +lane={{lane}} # previously -l option +reference_map={{reference_map}} # previously -r option +reference_base={{reference_base}} # previously -b option +mode={{mode}} $ # previously -m option # preserve error-checking of parameters to preserve as much of the original # script as possible, even though this could be done in python. 
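[Reviewer note, not part of the patch] With the CHARLIE_* placeholders above renamed to plain Jinja2 variables, a quick way to check the renames is to list the variables each template expects the caller to supply and compare them against the params passed by TRConvertJob. This is a minimal sketch using jinja2.meta; the templates directory path is assumed, and tellread.sh is deliberately skipped because shell constructs such as ${#s[@]} collide with Jinja2's default '{#' comment delimiter.

    # Minimal sketch; assumes it is run from the repository root.
    from glob import glob
    from jinja2 import Environment, meta

    env = Environment()

    for path in sorted(glob('sequence_processing_pipeline/templates/*.sbatch')):
        with open(path) as f:
            ast = env.parse(f.read())
        # find_undeclared_variables() reports the names the template uses but
        # does not define, i.e. the values the render() call must provide.
        print(path, sorted(meta.find_undeclared_variables(ast)))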
From 16ec4170f4667dae6a23cb414fbd7a9202f38fa5 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 11 Aug 2024 23:11:11 -0700 Subject: [PATCH 05/47] third pass adding tellread --- sequence_processing_pipeline/TRConvertJob.py | 377 +++++-------------- 1 file changed, 102 insertions(+), 275 deletions(-) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index f5250139..47b72c58 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,20 +1,11 @@ -from jinja2 import BaseLoader, TemplateNotFound -from metapool import load_sample_sheet -from os import stat, makedirs, rename -from os.path import join, basename, dirname, exists, abspath, getmtime +from jinja2 import BaseLoader, TemplateNotFound, Environment +from os.path import join, exists, getmtime from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) -from sequence_processing_pipeline.Pipeline import Pipeline -from shutil import move import logging -from sequence_processing_pipeline.Commands import split_similar_size_bins -from sequence_processing_pipeline.util import iter_paired_files -from jinja2 import Environment -import glob -import re -from sys import executable import pathlib +import re # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -35,27 +26,6 @@ def get_source(self, environment, template): return source, path, lambda: mtime == getmtime(path) - - -from os.path import join, exists -from sequence_processing_pipeline.Job import Job -from sequence_processing_pipeline.PipelineError import (PipelineError, - JobFailedError) -import logging -import re - - - - - - - - - - - - - class TRConvertJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, @@ -92,7 +62,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.pmem = pmem self.bcl_tool = bcl_tool_path self.qiita_job_id = qiita_job_id - self.job_script_path = join(self.output_path, f"{self.job_name}.sh") self.suffix = 'fastq.gz' # for projects that use sequence_processing_pipeline as a dependency, @@ -115,169 +84,12 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, # As the sample-sheet is validated by the Pipeline object before # being passed to TRConvertJob, additional validation isn't needed. 
- self._generate_job_script() - - def _generate_script_one(self): - template = self.jinja_env.get_template("tellread.sh") - - tellread_map = "/home/qiita_test/qiita-spots/tellread_mapping.csv" - seqrun_path = "/sequencing/igm_runs/240216_LH00444_0058_A22357VLT4" - lane = 'L008' - reference_map = "" - reference_base = "" - mode = "metagenomic" - - return template.render(tellread_map=tellread_map, - seqrun_path=seqrun_path, - lane=lane, - reference_map=reference_map, - reference_base=reference_base, - mode=mode) - - def _generate_script_two(self): - template = self.jinja_env.get_template("tellread-cleanup.sbatch") - - job_name = "cleanup" - wall_time_limit = "24:00:00" - mem_in_gb = "8" - node_count = "1" - cores_per_task = "1" - queue_name = "qiita" - - return template.render(job_name=job_name, - wall_time_limit=wall_time_limit, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - queue_name=queue_name) - - def _generate_script_three(self): - template = self.jinja_env.get_template("tellread.sbatch") - job_name = "tellread" - wall_time_limit = "96:00:00" - mem_in_gb = "16" - node_count = "1" - cores_per_task = "4" - queue_name = "qiita" - tellread_sbatch_tmp_dir = "/panfs/${USER}/tmp" - tr_sing_script_path = "$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh" - modules_to_load = ["singularity_3.6.4"] - - return template.render(job_name=job_name, - wall_time_limit=wall_time_limit, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - queue_name=queue_name, - tmp_dir=tellread_sbatch_tmp_dir, - sing_script_path=tr_sing_script_path, - modules_to_load=' '.join(modules_to_load)) - - def _generate_script_four(self): - template = self.jinja_env.get_template("telllink-isolate.sbatch") - - job_name = "tellink-isolate" - wall_time_limit = "96:00:00" - node_count = "1" - cores_per_task = "16" - mem_in_gb = "160" - queue_name = "qiita" - modules_to_load = ["singularity_3.6.4"] - sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" - - return template.render(job_name=job_name, - wall_time_limit=wall_time_limit, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - queue_name=queue_name, - modules_to_load=' '.join(modules_to_load), - sing_path=sing_path) - - def _generate_script_five(self): - template = self.jinja_env.get_template("telllink.sbatch") - - job_name = "tellink" - mem_in_gb = "160" - node_count = "1" - cores_per_task = "16" - wall_time_limit = "96:00:00" - queue_name = "qiita" - modules_to_load = ["singularity_3.6.4"] - sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" - - return template.render(job_name=job_name, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - wall_time_limit=wall_time_limit, - queue_name=queue_name, - modules_to_load=' '.join(modules_to_load), - sing_path=sing_path) - - def _generate_script_six(self): - template = self.jinja_env.get_template("integrate.sbatch") - - job_name = "integrate" - mem_in_gb = "8" - node_count = "1" - cores_per_task = "1" - wall_time_limit = "24:00:00" - queue_name = "qiita" - - return template.render(job_name=job_name, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - wall_time_limit=wall_time_limit, - queue_name=queue_name) - - def _generate_script_seven(self): - template = self.jinja_env.get_template("cloudspades-isolate.sbatch") - - job_name = "cs-assemble" - mem_in_gb = "64" 
- node_count = "1" - cores_per_task = "12" - wall_time_limit = "24:00:00" - queue_name = "qiita" - modules_to_load = ["gcc_9.3.0"] - spades_path = "~/spades-cloudspades-paper/assembler/spades.py" - - return template.render(job_name=job_name, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - wall_time_limit=wall_time_limit, - queue_name=queue_name, - modules_to_load=' '.join(modules_to_load), - spades_path=spades_path) - - def _generate_script_eight(self): - template = self.jinja_env.get_template("cloudspades.sbatch") - - job_name = "cs-assemble" - wall_time_limit = "24:00:00" - mem_in_gb = "128" - node_count = "1" - cores_per_task = "12" - queue_name = "qiita" - modules_to_load = ["gcc_9.3.0"] - spades_path = "TBD" # for now pass but don't use this spades_path var. - - return template.render(job_name=job_name, - mem_in_gb=mem_in_gb, - node_count=node_count, - cores_per_task=cores_per_task, - wall_time_limit=wall_time_limit, - queue_name=queue_name, - modules_to_load=' '.join(modules_to_load), - spades_path=spades_path) + self._generate_job_scripts() def _generate_job_scripts(self): scripts = [ { - "template": self.jinja_env.get_template("cloudspades.sbatch"), + "template": "cloudspades.sbatch", "params": { "job_name": "cs-assemble", "wall_time_limit": "24:00:00", @@ -288,94 +100,109 @@ def _generate_job_scripts(self): "modules_to_load": ' '.join(["gcc_9.3.0"]), "spades_path": "TBD" } - }, - {}, - {} - + { + "template": "cloudspades-isolate.sbatch", + "params": { + "job_name": "cs-assemble", + "wall_time_limit": "24:00:00", + "mem_in_gb": "64", + "node_count": "1", + "cores_per_task": "12", + "queue_name": "qiita", + "modules_to_load": ' '.join(["gcc_9.3.0"]), + "spades_path": "~/spades-cloudspades-paper/assembler/" + "spades.py" + } + }, + { + "template": "integrate.sbatch", + "params": { + "job_name": "integrate", + "wall_time_limit": "24:00:00", + "mem_in_gb": "8", + "node_count": "1", + "cores_per_task": "1", + "queue_name": "qiita" + } + }, + { + "template": "telllink.sbatch", + "params": { + "job_name": "telllink", + "wall_time_limit": "96:00:00", + "mem_in_gb": "160", + "node_count": "1", + "cores_per_task": "16", + "queue_name": "qiita", + "modules_to_load": ' '.join(["singularity_3.6.4"]), + "sing_path": "/projects/long_read_collab/code/tellseq/" + "release_v1.11/tellink-release/" + "run_tellink_sing.sh" + } + }, + { + "template": "telllink-isolate.sbatch", + "params": { + "job_name": "tellink-isolate", + "wall_time_limit": "96:00:00", + "node_count": "1", + "cores_per_task": "16", + "mem_in_gb": "160", + "queue_name": "qiita", + "modules_to_load": ' '.join(["singularity_3.6.4"]), + "sing_path": "/projects/long_read_collab/code/tellseq/" + "release_v1.11/tellink-release/" + "run_tellink_sing.sh" + } + }, + { + "template": "tellread.sbatch", + "params": { + "job_name": "tellread", + "wall_time_limit": "96:00:00", + "mem_in_gb": "16", + "node_count": "1", + "cores_per_task": "4", + "queue_name": "qiita", + "tellread_sbatch_tmp_dir": "/panfs/${USER}/tmp", + "tr_sing_script_path": "$HOME/qiita-spots/tellread-release" + "-novaseqX/run_tellread_sing.sh", + "modules_to_load": ' '.join(["singularity_3.6.4"]) + } + }, + { + "template": "tellread-cleanup.sbatch", + "params": { + "job_name": "cleanup", + "wall_time_limit": "24:00:00", + "mem_in_gb": "8", + "node_count": "1", + "cores_per_task": "1", + "queue_name": "qiita" + } + }, + { + "template": "", + "params": { + "tellread_map": "/home/qiita_test/qiita-spots/" + "tellread_mapping.csv", + "seqrun_path": 
"/sequencing/igm_runs/" + "240216_LH00444_0058_A22357VLT4", + "lane": 'L008', + "reference_map": "", + "reference_base": "", + "mode": "metagenomic" + } + } ] for script in scripts: template = self.jinja_env.get_template(script["template"]) params = script["params"] - result = template.render(**params) - - - - - - - def _generate_job_script(self): - """ - Generate a Torque job script for processing supplied root_directory. - :return: The path to the newly-created job-script. - """ - lines = [] - - lines.append("#!/bin/bash") - lines.append(f"#SBATCH --job-name {self.qiita_job_id}_{self.job_name}") - lines.append(f"#SBATCH -p {self.queue_name}") - lines.append(f'#SBATCH -N {self.node_count}') - lines.append(f'#SBATCH -n {self.nprocs}') - lines.append("#SBATCH --time %d" % self.wall_time_limit) - - # send an email to the list of users defined below when a job starts, - # terminates, or aborts. This is used to confirm that the package's - # own reporting mechanism is reporting correctly. - lines.append("#SBATCH --mail-type=ALL") - - # list of users to be contacted independently of this package's - # notification system, when a job starts, terminates, or gets aborted. - lines.append("#SBATCH --mail-user qiita.help@gmail.com") - - lines.append(f"#SBATCH --mem-per-cpu {self.pmem}") - - lines.append("set -x") - lines.append('date') - lines.append('hostname') - lines.append(f'cd {self.root_dir}') - - if self.modules_to_load: - lines.append("module load " + ' '.join(self.modules_to_load)) - - # Assume that the bcl-convert tool is named 'bcl-convert' and choose - # accordingly. - if 'bcl-convert' in self.bcl_tool: - lines.append(('%s ' - '--sample-sheet "%s" ' - '--output-directory %s ' - '--bcl-input-directory . ' - '--bcl-num-decompression-threads 16 ' - '--bcl-num-conversion-threads 16 ' - '--bcl-num-compression-threads 16 ' - '--bcl-num-parallel-tiles 16 ' - '--bcl-sampleproject-subdirectories true ' - '--force') % (self.bcl_tool, - self.sample_sheet_path, - self.output_path)) - - # equivalent cp for bcl-conversion (see below) needed. - else: - lines.append(('%s ' - '--sample-sheet "%s" ' - '--minimum-trimmed-read-length 1 ' - '--mask-short-adapter-reads 1 ' - '-R . ' - '-o %s ' - '--loading-threads 16 ' - '--processing-threads 16 ' - '--writing-threads 16 ' - '--create-fastq-for-index-reads ' - '--ignore-missing-positions ') % - (self.bcl_tool, - self.sample_sheet_path, - self.output_path)) - - with open(self.job_script_path, 'w') as f: - for line in lines: - # remove long spaces in some lines. 
- line = re.sub(r'\s+', ' ', line) - f.write(f"{line}\n") + job_script_path = join(self.output_path, script["template"]) + with open(job_script_path, 'w') as f: + f.write(template.render(**params)) def run(self, callback=None): """ From 74cab5d6ca468ea6281c62bb17c5aad5d03540be Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 13 Aug 2024 19:47:09 -0700 Subject: [PATCH 06/47] fourth pass --- sequence_processing_pipeline/TRConvertJob.py | 105 +++----- .../templates/cloudspades-isolate.sbatch | 3 +- .../templates/cloudspades.sbatch | 2 +- .../templates/tellread.sh | 14 +- .../cloudspades-isolate.sbatch | 84 +++++++ .../data/tellread_output/cloudspades.sbatch | 81 ++++++ .../data/tellread_output/integrate.sbatch | 125 ++++++++++ .../tellread_output/telllink-isolate.sbatch | 62 +++++ .../data/tellread_output/telllink.sbatch | 64 +++++ .../tellread_output/tellread-cleanup.sbatch | 23 ++ .../data/tellread_output/tellread.sbatch | 108 ++++++++ .../tests/data/tellread_output/tellread.sh | 236 ++++++++++++++++++ 12 files changed, 829 insertions(+), 78 deletions(-) create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread.sh diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 47b72c58..8cc1a14a 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,11 +1,8 @@ from jinja2 import BaseLoader, TemplateNotFound, Environment from os.path import join, exists, getmtime from sequence_processing_pipeline.Job import Job -from sequence_processing_pipeline.PipelineError import (PipelineError, - JobFailedError) -import logging +from sequence_processing_pipeline.PipelineError import PipelineError import pathlib -import re # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -63,11 +60,17 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.bcl_tool = bcl_tool_path self.qiita_job_id = qiita_job_id self.suffix = 'fastq.gz' + self.job_script_path = None # for projects that use sequence_processing_pipeline as a dependency, # jinja_env must be set to sequence_processing_pipeline's root path, # rather than the project's root path. - self.jinja_env = Environment(loader=KISSLoader('templates')) + self.jinja_env = Environment(loader=KISSLoader('templates'), + # set Jinja2 comment strings to be + # anything other than '{#' and '#}', + # which can be used in shell scripts. 
+ comment_start_string='%%%%%%%%%%', + comment_end_string='%%%%%%%%%%') tmp = False for executable_name in ['bcl2fastq', 'bcl-convert']: @@ -112,7 +115,6 @@ def _generate_job_scripts(self): "queue_name": "qiita", "modules_to_load": ' '.join(["gcc_9.3.0"]), "spades_path": "~/spades-cloudspades-paper/assembler/" - "spades.py" } }, { @@ -129,7 +131,7 @@ def _generate_job_scripts(self): { "template": "telllink.sbatch", "params": { - "job_name": "telllink", + "job_name": "tellink", "wall_time_limit": "96:00:00", "mem_in_gb": "160", "node_count": "1", @@ -163,11 +165,12 @@ def _generate_job_scripts(self): "wall_time_limit": "96:00:00", "mem_in_gb": "16", "node_count": "1", + "tmp_dir": "/panfs/${USER}/tmp", "cores_per_task": "4", "queue_name": "qiita", "tellread_sbatch_tmp_dir": "/panfs/${USER}/tmp", - "tr_sing_script_path": "$HOME/qiita-spots/tellread-release" - "-novaseqX/run_tellread_sing.sh", + "sing_script_path": "$HOME/qiita-spots/tellread-release" + "-novaseqX/run_tellread_sing.sh", "modules_to_load": ' '.join(["singularity_3.6.4"]) } }, @@ -183,7 +186,7 @@ def _generate_job_scripts(self): } }, { - "template": "", + "template": "tellread.sh", "params": { "tellread_map": "/home/qiita_test/qiita-spots/" "tellread_mapping.csv", @@ -201,9 +204,13 @@ def _generate_job_scripts(self): template = self.jinja_env.get_template(script["template"]) params = script["params"] job_script_path = join(self.output_path, script["template"]) + with open(job_script_path, 'w') as f: f.write(template.render(**params)) + if script['template'] == "tellread.sh": + self.job_script_path = job_script_path + def run(self, callback=None): """ Run BCL2Fastq/BCLConvert conversion @@ -212,71 +219,31 @@ def run(self, callback=None): changed. :return: """ - try: - job_info = self.submit_job(self.job_script_path, - exec_from=self.log_path, - callback=callback) - except JobFailedError as e: - # When a job has failed, parse the logs generated by this specific - # job to return a more descriptive message to the user. - info = self.parse_logs() - # prepend just the message component of the Error. - info.insert(0, str(e)) - raise JobFailedError('\n'.join(info)) - logging.info(f'Successful job: {job_info}') + # Unlike other Jobs that submit a Slurm script and wait for the job + # to complete, this Job will execute the tellread.sh shell script. + # It is this script that does all of the Slurm job creation. This Job + # will need another means to tell when a job has completed + # successfully. - def parse_logs(self): - log_path = join(self.output_path, 'Logs') - errors = join(log_path, 'Errors.log') + command = ("./tellread.sh -s /sequencing/igm_runs/240216_LH00444" + "_0058_A22357VLT4 -i ./samplesheet.csv -l L008 -m " + "metagenomic") - msgs = [] + if self.job_script_path: + res = self._system_call(command) + else: + raise PipelineError("tellread.sh script could not be found.") - if not exists(errors): - # we do not raise an Error in this case because it's expected that - # parse_logs() will be called in response to an exceptional - # condition. 
- msgs.append(f"'{errors} does not exist") + if res['return_code'] != 0: + raise PipelineError("tellread.sh script did not execute correctly") - with open(errors, 'r') as f: - lines = f.readlines() - for line in [x.strip() for x in lines]: - msgs.append(line) + # res['stdout'] + # res['stderr'] - return msgs + def parse_logs(self): + raise PipelineError("parsing logs not implemented.") @staticmethod def parse_job_script(job_script_path): - # Returns run-directory and sample-sheet path from a job-script. - - if not exists(job_script_path): - raise ValueError(f"'{job_script_path}' is not a valid path") - - with open(job_script_path, 'r') as f: - lines = f.readlines() - lines = [x.strip() for x in lines] - - # As this code creates this file, we can expect it to be of a certain - # format. - if lines[0] != '#!/bin/bash': - raise ValueError(f"'{job_script_path}' is not a valid path") - - result = {} - - m = re.match('^cd (.*)$', lines[12]) - - if m: - result['run_directory'] = m.group(1) - else: - raise ValueError("could not detect run_directory in " - f"'{job_script_path}'") - - m = re.match('^bcl-convert --sample-sheet "(.*?)" ', lines[14]) - - if m: - result['sample_sheet_path'] = m.group(1) - else: - raise ValueError("could not detect sample-sheet path in " - f"'{job_script_path}'") - - return result + raise PipelineError("parsing job script not implemented.") diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index f8a2b000..4296abfb 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -60,7 +60,8 @@ fi mkdir -p ${cs} -pushd {{spades_path}} \ +pushd {{spades_path}} +./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index 96673309..e1c2bb40 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -59,7 +59,7 @@ fi mkdir -p ${cs} pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin -# for now don't use {{spades.py}} +# for now don't use spades.py jinja2 variable ./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index 5584b6c0..ac7c6d31 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -1,10 +1,10 @@ #!/bin/bash -samplesheet={{tellread_map}} # previously -i option -seqrunpath={{seqrun_path}} # previously -s option -lane={{lane}} # previously -l option -reference_map={{reference_map}} # previously -r option -reference_base={{reference_base}} # previously -b option -mode={{mode}} $ # previously -m option +samplesheet="{{tellread_map}}" # previously -i option +seqrunpath="{{seqrun_path}}" # previously -s option +lane="{{lane}}" # previously -l option +reference_map="{{reference_map}}" # previously -r option +reference_base="{{reference_base}}" # previously -b option +mode="{{mode}}" $ # previously -m option # preserve error-checking of parameters to preserve as much of the original # script as possible, even though this could be done in python. 
@@ -34,7 +34,7 @@ fi # trim trailing slash # https://stackoverflow.com/a/32845647/19741 -safepath=$(echo ${seqrunpath} | sed 's:/*$::') +safepath=$(echo ${seqrunpath} | sed 's:/*$::') label=$(basename ${safepath}) labeltag=${label}-${tag} output=/panfs/${USER}/${labeltag} diff --git a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch new file mode 100644 index 00000000..7ec58058 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch @@ -0,0 +1,84 @@ +#!/bin/bash -l +#SBATCH -J cs-assemble # cs-assemble +#SBATCH --time 24:00:00 # 24:00:00 +#SBATCH --mem 64G # 64G +#SBATCH -N 1 # 1 +#SBATCH -c 12 # 12 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. +source activate qiime2-2023.5 +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e + +# this gets set in the environment from another script. For now let's +# run with that. +echo $TMPDIR + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=${OUTPUT} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. +mamba activate activate qiime2-2023.5 + +module load gcc_9.3.0 # gcc_9.3.0 + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) + +# assumes 1-based array index, eg --array 1-N +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +cs=${base}/cloudspades-isolate/${sample} + +if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${cs} ]]; then + rm -fr ${cs} + fi +fi + +mkdir -p ${cs} + +pushd ~/spades-cloudspades-paper/assembler/ +./spades.py \ + -o ${cs} \ + --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 +module unload gcc_9.3.0 +popd + +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. +mamba activate quast + +quast \ + -o ${cs}/quast-scaffolds \ + -t ${SLURM_JOB_CPUS_PER_NODE} \ + ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 + +# remove intermediates that currently dont have a downstream use +if [[ -d ${cs}/K21 ]]; then + rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch new file mode 100644 index 00000000..d16dc2b0 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch @@ -0,0 +1,81 @@ +#!/bin/bash -l +#SBATCH -J cs-assemble # cs-assemble +#SBATCH --time 24:00:00 # 24:00:00 +#SBATCH --mem 128G # 128G +#SBATCH -N 1 # 1 +#SBATCH -c 12 # 12 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. 
+#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. +source activate qiime2-2023.5 +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e + +echo $TMPDIR + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=${OUTPUT} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. +# for now we will leave it hardcoded. +mamba activate activate qiime2-2023.5 + +module load gcc_9.3.0 # gcc_9.3.0 + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) + +# assumes 1-based array index, eg --array 1-N +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +cs=${base}/cloudspades/${sample} + +if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${cs} ]]; then + rm -fr ${cs} + fi +fi + +mkdir -p ${cs} +pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin + +# for now don't use spades.py jinja2 variable +./spades.py \ + -o ${cs} \ + --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + --meta \ + -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 +module unload gcc_9.3.0 +popd + +mamba activate quast +quast \ + -o ${cs}/quast-scaffolds \ + -t ${SLURM_JOB_CPUS_PER_NODE} \ + ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 + +# remove intermediates that currently dont have a downstream use +if [[ -d ${cs}/K21 ]]; then + rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch new file mode 100644 index 00000000..6947c226 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch @@ -0,0 +1,125 @@ +#!/bin/bash -l +#SBATCH -J integrate # integrate +#SBATCH --time 24:00:00 # 24:00:00 +#SBATCH --mem 8G # 8G +#SBATCH -N 1 # 1 +#SBATCH -c 1 # 1 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. +source activate rust +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + + +# https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html +cores=${SLURM_CPUS_PER_TASK} + +if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then + echo "Not operating in an array" + exit 1 +fi + +if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then + echo "Line extraction assumes 1-based index" + exit 1 +fi + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +if [[ -z ${BASE} ]]; then + echo "BASE not specified" + exit 1 +fi + +tellread=${OUTPUT} +if [[ ! 
-d ${tellread} ]]; then + echo "${tellread} not found" + exit 1 +fi + +set -x +set -e +set -o pipefail + +samples=($(cat ${tellread}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR=$(mktemp -d) +function cleanup { + echo "Removing $TMPDIR" + rm -r $TMPDIR + unset TMPDIR +} +trap cleanup EXIT + +files=${TMPDIR}/integration.files +/bin/ls -1 ${tellread}/Full/*corrected.err_barcode_removed.fastq > ${files} +mkdir -p ${tellread}/integrated + +if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R1" + exit 1 +fi + +if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R2" + exit 1 +fi + +if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} I1" + exit 1 +fi + +r1=$(grep -m 1 "_R1_${sample}" ${files}) +r2=$(grep -m 1 "_R2_${sample}" ${files}) +i1=$(grep -m 1 "_I1_${sample}" ${files}) +r1out=${tellread}/integrated/${sample}.R1.fastq.gz +r2out=${tellread}/integrated/${sample}.R2.fastq.gz +i1out=${tellread}/integrated/${sample}.I1.fastq.gz + +if [[ ! -s ${r1} ]]; then + echo "${r1} is empty, cannot integrate" + if [[ -s ${r2} ]]; then + echo "R1 and R2 are inconsistent" + exit 1 + fi + if [[ -s ${i1} ]]; then + echo "R1 and I1 are inconsistent" + exit 1 + fi + + # reflect the empties so Qiita can know of them + touch ${r1out} + touch ${r2out} + touch ${i1out} + exit 0 +fi + +# this can probably be backgrounded but then you have to get creative to +# not mask a nonzero exit status (e.g., the python process raising) +cat ${i1} | gzip > ${i1out} + +mamba activate tellread-integrate +python ${BASE}/integrate-indices-np.py integrate \ + --no-sort \ + --r1-in ${r1} \ + --r2-in ${r2} \ + --i1-in ${i1} \ + --r1-out ${r1out} \ + --r2-out ${r2out} \ + --threads ${cores} \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch new file mode 100644 index 00000000..6a23331e --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch @@ -0,0 +1,62 @@ +#!/bin/bash -l +#SBATCH -J tellink-isolate # tellink-isolate +#SBATCH -N 1 # 1 +#SBATCH -c 16 # 16 +#SBATCH --mem 160G # 160G +#SBATCH --time 96:00:00 # 96:00:00 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +set -x +set -e + +module load singularity_3.6.4 # singularity_3.6.4 + +if [[ -z "${LABELTAG}" ]]; then + echo "LABELTAG is not specified" + exit 1 +fi + +base=/panfs/qiita/TELLREAD/${LABELTAG} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +k=79 +lc=35 +cores=${SLURM_CPUS_PER_TASK} + +tl=${base}/tell-link-isolate/${sample} +if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${tl} ]]; then + rm -fr ${tl} + fi +fi + +mkdir -p ${tl} + +/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh \ + -r1 ${base}/integrated/${sample}.R1.fastq.gz \ + -r2 ${base}/integrated/${sample}.R2.fastq.gz \ + -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -k ${k} \ + -lc ${lc} \ + -p ${sample} \ + -j ${cores} + +# remove temporary data +if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch new file mode 100644 index 00000000..b6033b24 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch @@ -0,0 +1,64 @@ +#!/bin/bash -l +#SBATCH -J tellink # tellink +#SBATCH --mem 160G # 160G +#SBATCH -N 1 # 1 +#SBATCH -c 16 # 16 +#SBATCH --time 96:00:00 # 96:00:00 +#SBATCH -p qiita # qiita + +# for now these can be left hard-coded. +#SBATCH --output %x-%A_%a.out +#SBATCH --error %x-%A_%a.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=FAIL + +set -x +set -e + +module load singularity_3.6.4 # singularity_3.6.4 + +if [[ -z "${LABELTAG}" ]]; then + echo "LABEL is not specified" + exit 1 +fi + +base=/panfs/${USER}/${LABELTAG} +if [[ ! -d ${base} ]]; then + echo "${base} not found" + exit 1 +fi + +samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +# leave these hardcoded for now +k=79 +lc=35 +cores=${SLURM_CPUS_PER_TASK} + +tl=${base}/tell-link/${sample} +if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then + if [[ -d ${tl} ]]; then + rm -fr ${tl} + fi +fi + +mkdir -p ${tl} + +/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh \ + -r1 ${base}/integrated/${sample}.R1.fastq.gz \ + -r2 ${base}/integrated/${sample}.R2.fastq.gz \ + -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -d metagenomics \ + -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -k ${k} \ + -lc ${lc} \ + -p ${sample} \ + -j ${cores} + +# remove temporary data +if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +fi diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch new file mode 100644 index 00000000..56bc3360 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch @@ -0,0 +1,23 @@ +#!/bin/bash -l +#SBATCH -J cleanup # cleanup +#SBATCH --time 24:00:00 # 24:00:00 +#SBATCH --mem 8G # 8G +#SBATCH -N 1 # 1 +#SBATCH -c 1 # 1 +#SBATCH -p qiita # qiita + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=BEGIN,FAIL + +# for now these can be left hard-coded. 
+#SBATCH --output %x-%A.out +#SBATCH --error %x-%A.err + +if [[ -z "${OUTPUT}" ]]; then + echo "OUTPUT is not specified" + exit 1 +fi + +# remove unused large outputs +rm -rf ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch new file mode 100644 index 00000000..ab0647f8 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch @@ -0,0 +1,108 @@ +#!/bin/bash -l +#SBATCH -J tellread # tellread +#SBATCH -p qiita # qiita +#SBATCH -N 1 # 1 +#SBATCH -c 4 # 4 +#SBATCH --mem 16G # 16G +#SBATCH --time 96:00:00 # 96:00:00 + +# for now these can be left hard-coded. +#SBATCH --partition=short +#SBATCH --output %x-%A.out +#SBATCH --error %x-%A.err + +# for now comment these out as qiita is responsible for notifying users. +###SBATCH --mail-user=qiita.help@gmail.com +###SBATCH --mail-type=BEGIN,FAIL + +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x + +if [[ -z "${N_SAMPLES}" ]]; then + echo "N_SAMPLES is not specified" + exit 1 +fi + +if [[ -z "${SEQRUNPATH}" ]]; then + echo "SEQRUNPATH is not specified" + exit 1 +fi + +if [[ -z "${LANE}" ]]; then + echo "LANE is not specified" + exit 1 +fi + +if [[ -z "${SAMPLES}" ]]; then + echo "SAMPLES is not specified" + exit 1 +fi + +if [[ -z "${REFS}" ]]; then + echo "REFS is not specified" + exit 1 +fi + +if [[ -z "${OUTPUT}" ]]; then + echo "OUTPUT is not specified" + exit 1 +fi + +export TMPDIR="/panfs/${USER}/tmp" +mkdir -p ${TMPDIR} +export TMPDIR=$(mktemp -d) +seqrun_path=${SEQRUNPATH} + +if [[ ${LANE} == "L001" ]]; then + lane=s_1 +elif [[ ${LANE} == "L002" ]]; then + lane=s_2 +elif [[ ${LANE} == "L003" ]]; then + lane=s_3 +elif [[ ${LANE} == "L004" ]]; then + lane=s_4 +elif [[ ${LANE} == "L005" ]]; then + lane=s_5 +elif [[ ${LANE} == "L006" ]]; then + lane=s_6 +elif [[ ${LANE} == "L007" ]]; then + lane=s_7 +elif [[ ${LANE} == "L008" ]]; then + lane=s_8 +else + echo "Unrecognized lane: ${LANE}" + exit 1 +fi + +# yes, hard coded, not great but progress. +extra="" +if [[ ! 
-z ${REFBASE} ]]; then + extra="-f ${REFBASE}" +fi + +mkdir -p ${OUTPUT} + +module load singularity_3.6.4 # singularity_3.6.4 +$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ + -i ${seqrun_path} \ + -o ${OUTPUT} \ + -s $(echo ${SAMPLES} | tr -d '"') \ + -g $(echo ${REFS} | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} \ + ${extra} \ + -l ${lane} + + +if [[ -d ${OUTPUT}/Full ]]; then + echo "Run appears successful" +elif [[ -d ${OUTPUT}/1_demult/Full ]]; then + echo "Run appears unsuccessful but has output" + exit 1 +else + echo "Run appears unsuccessful" + exit 1 +fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh new file mode 100644 index 00000000..90b4e1ce --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh @@ -0,0 +1,236 @@ +#!/bin/bash +samplesheet="/home/qiita_test/qiita-spots/tellread_mapping.csv" # previously -i option +seqrunpath="/sequencing/igm_runs/240216_LH00444_0058_A22357VLT4" # previously -s option +lane="L008" # previously -l option +reference_map="" # previously -r option +reference_base="" # previously -b option +mode="metagenomic" $ # previously -m option + +# preserve error-checking of parameters to preserve as much of the original +# script as possible, even though this could be done in python. + +# https://unix.stackexchange.com/a/621007 +: ${seqrunpath:?Missing -s} +: ${lane:?Missing -i} + +if [[ ! -z ${reference_map} || ! -z ${reference_base} ]]; then + if [[ -z ${reference_map} ]]; then + echo "-b used without -r" + exit 1 + fi + if [[ -z ${reference_base} ]]; then + echo "-r used without -b" + exit 1 + fi + if [[ ! -d ${reference_base} ]]; then + echo "reference base not found" + exit 1 + fi + + tag=reference-based +else + tag=reference-free +fi + +# trim trailing slash +# https://stackoverflow.com/a/32845647/19741 +safepath=$(echo ${seqrunpath} | sed 's:/*$::') +label=$(basename ${safepath}) +labeltag=${label}-${tag} +output=/panfs/${USER}/${labeltag} + +if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then + echo "Cannot access the lane" + exit 1 +fi + +# for now this can stay here to keep greater compatibility with the original script. +# however these fields should eventually be parameters that can be configured in the config file. + +if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then + sbatch_cores=2 + sbatch_mem=8G + norm=TRUE + wall=24:00:00 + mode=NA +elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then + sbatch_cores=2 + sbatch_mem=8G + norm=TRUE + wall=24:00:00 + mode=NA +else + sbatch_cores=16 + sbatch_mem=160G + norm=FALSE + assemble=TRUE + wall=48:00:00 +fi + +if [[ ${mode} == "isolate" ]]; then + ISOLATE_MODE=TRUE +elif [[ ${mode} == "metagenomic" ]]; then + ISOLATE_MODE=FALSE +elif [[ ${mode} == "NA" ]]; then + ISOLATE_MODE=FALSE +else + echo "unknown mode: ${mode}" + exit 1 +fi + +set -e +set -o pipefail + +declare -a s +declare -a g +# below extended regex might be broken because C5\d\d happens in column 0, not column 1 +# of the hacked sample-sheet. +for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) +do + echo "sample found: ${sample}" + # get references if they exist + if [[ -f ${reference_map} ]]; then + if $(grep -Fq ${sample} ${reference_map}); then + ref=$(grep -m 1 ${sample} ${reference_map} | cut -f 2 -d"," | tr -d "\n") + if [[ ${ref} != "NONE" ]]; then + if [[ ! 
-d "${reference_base}/${ref}" ]]; then + echo "${reference_base}/${ref}" + echo "${ref} not found" + exit 1 + fi + g[${#g[@]}]=${ref} + s[${#s[@]}]=${sample} + fi + fi + else + g[${#g[@]}]=NONE + s[${#s[@]}]=${sample} + fi +done +n_samples=${#s[@]} + +# https://stackoverflow.com/a/17841619/19741 +function join_by { local IFS="$1"; shift; echo "$*"; } +s=$(join_by , "${s[@]}") +g=$(join_by , "${g[@]}") + +base=$(dirname ${0}) +submit_script=$(dirname ${0})/tellread.sbatch +integrate_script=$(dirname ${0})/integrate.sbatch +norm_script=$(dirname ${0})/compute_sequence_counts_for_normalization.sbatch +asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch +clean_script=$(dirname ${0})/tellread-cleanup.sbatch + +if [[ ${ISOLATE_MODE} == "TRUE" ]]; then + asm_tellink_script=$(dirname ${0})/telllink-isolate.sbatch + asm_cloudspades_script=$(dirname ${0})/cloudspades-isolate.sbatch +else + asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch + asm_tellink_script=$(dirname ${0})/telllink.sbatch +fi + +if [[ ! -f ${submit_script} ]]; then + echo "Cannot access submit script" + exit 1 +fi +if [[ ! -f ${asm_cloudspades_script} ]]; then + echo "Cannot access cloudspades assembly script" + exit 1 +fi +if [[ ! -f ${asm_tellink_script} ]]; then + echo "Cannot access tell-link assembly script" + exit 1 +fi +if [[ ! -f ${integrate_script} ]]; then + echo "Cannot access integrate script" + exit 1 +fi +if [[ ! -f ${clean_script} ]]; then + echo "Cannot access clean script" + exit 1 +fi + +datetag=$(date "+%Y.%m.%d") +scriptcopy=$(pwd)/tellread_script-${datetag}.sh +submitcopy=$(pwd)/tellread_submission-${datetag}.sbatch +asmcscopy=$(pwd)/assembly_submission_cloudspades-${datetag}.sbatch +asmtlcopy=$(pwd)/assembly_submission_tell-link-${datetag}.sbatch +normcopy=$(pwd)/norm_submission-${datetag}.sbatch +intcopy=$(pwd)/integrate_submission-${datetag}.sbatch +cleancopy=$(pwd)/tellread-cleanup-${datetag}.sbatch +arguments=$(pwd)/provided_script_arguments.txt +if [[ -f ${scriptcopy} ]]; then + echo "Existing script copy ${scriptcopy} found, not overwriting, delete to resubmit" + exit 1 +fi +if [[ -f ${submitcopy} ]]; then + echo "Existing submission ${submitcopy} found, not overwriting, delete to resubmit" + exit 1 +fi + +echo $@ > ${arguments} +cp ${0} ${scriptcopy} +cp ${submit_script} ${submitcopy} +cp ${asm_cloudspades_script} ${asmcscopy} +cp ${asm_tellink_script} ${asmtlcopy} +cp ${integrate_script} ${intcopy} +cp ${clean_script} ${cleancopy} +chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cleancopy} + +set -x + +trjob=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag} \ + -c ${sbatch_cores} \ + --mem ${sbatch_mem} \ + --time ${wall} \ + --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ + ${submit_script}) + +if [[ ${norm} == "TRUE" ]]; then + cp ${norm_script} ${normcopy} + chmod gou-w ${normcopy} + norm_counts_job=$(sbatch \ + --parsable \ + --dependency=afterok:${trjob} \ + -J ${labeltag}-${datetag}-norm-counts \ + --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ + ${norm_script}) +fi + +integrate_job=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag}-integrate \ + --dependency=afterok:${trjob} \ + --array 1-${n_samples} \ + --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ + ${integrate_script}) + +if [[ ${assemble} == "TRUE" ]]; then + csj=$(sbatch \ + --parsable \ + 
--dependency=aftercorr:${integrate_job} \ + -J ${labeltag}-${datetag}-cloudspades \ + --array 1-${n_samples} \ + --export LABELTAG=${labeltag},OUTPUT=${output} \ + ${asm_cloudspades_script}) + tlj=$(sbatch \ + --parsable \ + --dependency=aftercorr:${integrate_job} \ + -J ${labeltag}-${datetag}-tell-link \ + --array 1-${n_samples} \ + --export LABELTAG=${labeltag},OUTPUT=${output} \ + ${asm_tellink_script}) + cleanupdep=${csj}:${tlj} +else + cleanupdep=${integrate_job} + echo "Not assembling" +fi + +cleanup=$(sbatch \ + --parsable \ + -J ${labeltag}-${datetag}-cleanup \ + --dependency=afterok:${cleanupdep} \ + --export OUTPUT=${output} \ + ${clean_script}) \ No newline at end of file From a6bde1e8232f1442a357f604e5918773e27f647a Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 4 Sep 2024 20:36:42 -0700 Subject: [PATCH 07/47] Fifth pass, tested on qiita-rc and then refactored. --- README.rst | 6 + sequence_processing_pipeline/Job.py | 199 ++++++--- sequence_processing_pipeline/TRConvertJob.py | 398 ++++++++++++++---- .../contrib/create_picklist.py | 65 +++ .../contrib/integrate-indices-np.py | 330 +++++++++++++++ .../contrib/plot_counts.py | 27 ++ .../templates/cloudspades-isolate.sbatch | 55 +-- .../templates/cloudspades.sbatch | 38 +- ...e_sequence_counts_for_normalization.sbatch | 57 +++ .../templates/integrate.sbatch | 18 +- .../templates/telllink-isolate.sbatch | 15 +- .../templates/telllink.sbatch | 18 +- .../templates/tellread-cleanup.sbatch | 9 +- .../templates/tellread.sbatch | 25 +- .../templates/tellread.sh | 34 +- .../20230906_FS10001773_68_BTR67708-1611.csv | 41 ++ 16 files changed, 1072 insertions(+), 263 deletions(-) create mode 100644 sequence_processing_pipeline/contrib/create_picklist.py create mode 100644 sequence_processing_pipeline/contrib/integrate-indices-np.py create mode 100644 sequence_processing_pipeline/contrib/plot_counts.py create mode 100644 sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch create mode 100644 sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv diff --git a/README.rst b/README.rst index c51fcd0c..190ebba4 100644 --- a/README.rst +++ b/README.rst @@ -62,3 +62,9 @@ Please note that the setting 'minimap2_databases' is expected to be a list of pa For NuQCJob, minimap2_databases is expected to be the path to a directory containing two subdirectories: 'metagenomic' and 'metatranscriptomic'. Each directory should contain or symlink to the appropriate .mmi files needed for that Assay type. + +Additional TellSeq-related notes: +'spades-cloudspades-0.1', 'tellread-release-novaseqX' or similar directories must be placed in a location available to SPP. +Their paths should be made known to SPP in the configuration files. (See examples for details). +Additional scripts found in sequence_processing_pipeline/contrib were contributed by Daniel and Omar and can be similarly located and configured. 
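+For example, assuming the directories were staged under one shared resources
+location, the layout might look like the following (illustrative paths only;
+the actual locations are whatever is recorded in your configuration files)::
+
+    /shared/spp-resources/spades-cloudspades-0.1
+    /shared/spp-resources/tellread-release-novaseqX
+    /shared/spp-resources/contrib/integrate-indices-np.py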
+ diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index af04ef9c..035d8ba0 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -9,9 +9,22 @@ import logging from inspect import stack import re +from time import time class Job: + slurm_status_terminated = ['BOOT_FAIL', 'CANCELLED', 'DEADLINE', 'FAILED', + 'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', + 'REVOKED', 'TIMEOUT'] + + slurm_status_successful = ['COMPLETED'] + + slurm_status_running = ['COMPLETING', 'CONFIGURING', 'PENDING', 'REQUEUED', + 'REQUEUE_FED', 'REQUEUE_HOLD', 'RESIZING', + 'RESV_DEL_HOLD', 'RUNNING', 'SIGNALING', + 'SPECIAL_EXIT', 'STAGE_OUT', 'STOPPED', + 'SUSPENDED'] + def __init__(self, root_dir, output_path, job_name, executable_paths, max_array_length, modules_to_load=None): """ @@ -191,53 +204,13 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} - def submit_job(self, script_path, job_parameters=None, - script_parameters=None, wait=True, - exec_from=None, callback=None): - """ - Submit a Torque job script and optionally wait for it to finish. - :param script_path: The path to a Torque job (bash) script. - :param job_parameters: Optional parameters for scheduler submission. - :param script_parameters: Optional parameters for your job script. - :param wait: Set to False to submit job and not wait. - :param exec_from: Set working directory to execute command from. - :param callback: Set callback function that receives status updates. - :return: Dictionary containing the job's id, name, status, and - elapsed time. Raises PipelineError if job could not be submitted or - if job was unsuccessful. - """ - if job_parameters: - cmd = 'sbatch %s %s' % (job_parameters, script_path) - else: - cmd = 'sbatch %s' % (script_path) - - if script_parameters: - cmd += ' %s' % script_parameters - - if exec_from: - cmd = f'cd {exec_from};' + cmd - - logging.debug("job scheduler call: %s" % cmd) - - if self.force_job_fail: - raise JobFailedError("This job died.") - - # if system_call does not raise a PipelineError(), then the scheduler - # successfully submitted the job. In this case, it should return - # the id of the job in stdout. 
- results = self._system_call(cmd) - stdout = results['stdout'] - - job_id = stdout.strip().split()[-1] - + def _wait_on_job(self, job_id, callback=None): job_info = {'job_id': None, 'job_name': None, 'job_state': None, 'elapsed_time': None} - # Just to give some time for everything to be set up properly - sleep(10) exit_count = 0 - while wait: + while True: result = self._system_call(f"sacct -P -n --job {job_id} --format " "JobID,JobName,State,Elapsed,ExitCode") @@ -287,28 +260,52 @@ def submit_job(self, script_path, job_parameters=None, sleep(10) - if job_info['job_id'] is not None: - # job was once in the queue - if callback is not None: - callback(jid=job_id, status=job_info['job_state']) - - if set(states) == {'COMPLETED'}: - if 'exit_status' in job_info: - if set(estatuses) == {'0:0'}: - # job completed successfully - return job_info - else: - exit_status = job_info['exit_status'] - raise JobFailedError(f"job {job_id} exited with exit_" - f"status {exit_status}") - else: - # with no other info, assume job completed successfully - return job_info - else: - # job exited unsuccessfully - raise JobFailedError(f"job {job_id} exited with status " - f"{job_info['job_state']}") + return job_info, states, estatuses + + def submit_job(self, script_path, job_parameters=None, + script_parameters=None, exec_from=None, callback=None): + """ + Submit a Torque job script and optionally wait for it to finish. + :param script_path: The path to a Torque job (bash) script. + :param job_parameters: Optional parameters for scheduler submission. + :param script_parameters: Optional parameters for your job script. + :param exec_from: Set working directory to execute command from. + :param callback: Set callback function that receives status updates. + :return: Dictionary containing the job's id, name, status, and + elapsed time. Raises PipelineError if job could not be submitted or + if job was unsuccessful. + """ + if job_parameters: + cmd = 'sbatch %s %s' % (job_parameters, script_path) else: + cmd = 'sbatch %s' % (script_path) + + if script_parameters: + cmd += ' %s' % script_parameters + + if exec_from: + cmd = f'cd {exec_from};' + cmd + + logging.debug("job scheduler call: %s" % cmd) + + if self.force_job_fail: + raise JobFailedError("This job died.") + + # if system_call does not raise a PipelineError(), then the scheduler + # successfully submitted the job. In this case, it should return + # the id of the job in stdout. + results = self._system_call(cmd) + stdout = results['stdout'] + + job_id = stdout.strip().split()[-1] + + # Just to give some time for everything to be set up properly + sleep(10) + + job_info, states, estatuses = self._wait_on_job(job_id, + callback=callback) + + if job_info['job_id'] is None: # job was never in the queue - return an error. 
if callback is not None: callback(jid=job_id, status='ERROR') @@ -316,6 +313,82 @@ def submit_job(self, script_path, job_parameters=None, raise JobFailedError(f"job {job_id} never appeared in the " "queue.") + # job was once in the queue + if callback is not None: + callback(jid=job_id, status=job_info['job_state']) + + if set(states) == {'COMPLETED'}: + if 'exit_status' in job_info: + if set(estatuses) == {'0:0'}: + # job completed successfully + return job_info + else: + exit_status = job_info['exit_status'] + raise JobFailedError(f"job {job_id} exited with exit_" + f"status {exit_status}") + else: + # with no other info, assume job completed successfully + return job_info + else: + # job exited unsuccessfully + raise JobFailedError(f"job {job_id} exited with status " + f"{job_info['job_state']}") + + def _wait_on_job_ids(self, job_ids, timeout_in_seconds=None): + """ + Wait on a list of known Slurm job-ids. + :param job_ids: A list of Slurm job-ids + :param timeout_in_seconds: Abort and raise an Error after n seconds. + :return: A list of strings, representing the state of each job. + """ + + # this method is useful for wrapping scripts that spawn child jobs and + # the user wishes to wait until they are all completed before + # continuing. + if not isinstance(job_ids, list): + raise ValueError("job_ids must be a list of valid slurm job ids") + + if set([isinstance(x, int) for x in job_ids]) != {True}: + raise ValueError("job_ids must contain integers") + + if timeout_in_seconds: + if not isinstance(timeout_in_seconds, int): + raise ValueError("timeout_in_seconds must be an integer") + + if timeout_in_seconds < 1: + raise ValueError("timeout_in_seconds must be greater than 0") + + start_time = time() + while True: + if timeout_in_seconds: + if time() - start_time > timeout_in_seconds: + raise PipelineError("timeout reached while waiting for " + "jobs") + + job_states = [] + for job_id in job_ids: + # NB: sacct can support querying on multiple job-ids at once. + # However, this would require extensive rewriting and testing + # of the existing code. Deferring for now. + _, states, _ = self._wait_on_job(job_id) + job_states.append(set(states)) + + # assuming that a Slurm job will never contain states from both + # terminated and successful, this will generate a list containing + # the current state for each job. + result = [set(x) & set(Job.slurm_status_terminated + + Job.slurm_status_successful) for x in job_states] + + if set([bool(x) for x in result]) == {True}: + # all jobs are no longer in a running state. + break + + sleep(10) + + # return the current state of each job. Assume that each set contains + # only one value. + return [''.join(x) for x in result] + def _group_commands(self, cmds): # break list of commands into chunks of max_array_length (Typically # 1000 for Torque job arrays). 
To ensure job arrays are never more diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 8cc1a14a..98d9c18d 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,8 +1,11 @@ from jinja2 import BaseLoader, TemplateNotFound, Environment -from os.path import join, exists, getmtime +from os.path import split, join, exists, getmtime from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import PipelineError import pathlib +from os import rename, walk, chmod, listdir, makedirs +from shutil import move, rmtree +from re import match # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -43,7 +46,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, """ super().__init__(run_dir, output_path, - 'TRConvertJob', + 'ConvertJob', [bcl_tool_path], 1000, modules_to_load=modules_to_load) @@ -60,7 +63,13 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.bcl_tool = bcl_tool_path self.qiita_job_id = qiita_job_id self.suffix = 'fastq.gz' - self.job_script_path = None + + self.tellread_output_path = join(self.output_path, 'output') + makedirs(self.tellread_output_path) + + self.tmp1_path = join(self.tellread_output_path, 'tmp1') + + makedirs(self.tmp1_path) # for projects that use sequence_processing_pipeline as a dependency, # jinja_env must be set to sequence_processing_pipeline's root path, @@ -89,113 +98,186 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self._generate_job_scripts() + # TODO: generate a sample-mapping to map C#s to fake sample-names and + # fake projects. Process sample-sheet later. + self.mapping = self._generate_sample_mapping() + + # TODO: hardcode lane at 'L001' + self.lane = 'L001' + + self.clean_wall_time_limit = "24:00:00" + self.clean_mem_in_gb = "8" + self.clean_node_count = "1" + self.clean_cores_per_task = "1" + self.cloudspades_cores_per_task = "12" + self.cloudspades_mem_in_gb = "128" + self.cloudspades_modules = ["gcc_9.3.0"] + self.cloudspades_node_count = "1" + self.cloudspades_path = "/home/qiita_test/qiita-spots/spades-cloudspades-0.1" + self.cloudspades_wall_time_limit = "24:00:00" + self.counts_cores_per_task = "1" + self.counts_create_picklist_path = "/home/qiita_test/qiita-spots/create_picklist.py", + self.counts_mem_in_gb = "8" + self.counts_node_count = "1" + self.counts_other_file = '20230906_FS10001773_68_BTR67708-1611.read_counts.tsv' + self.counts_plot_counts_path = "/home/qiita_test/qiita-spots/plot_counts.py" + self.counts_sample_sheet = "/home/qiita_test/qiita-spots/20230906_FS10001773_68_BTR67708-1611.csv" + self.counts_wall_time_limit = "24:00:00" + self.cs_isolate_mem_in_gb = "64" + self.integrate_indicies_script_path = "/home/qiita_test/qiita-spots/integrate-indices-np.py" + self.integrate_mem_in_gb = "8" + self.integrate_node_count = "1" + self.integrate_wall_time_limit = "24:00:00" + self.integrate_cores_per_task = "1" + self.queue_name = "qiita" + self.tellink_cores_per_task = "16" + self.tellink_mem_in_gb = "160" + self.tellink_modules = ["singularity_3.6.4"] + self.tellink_node_count = "1" + self.tellink_sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" + self.tellink_wall_time_limit = "96:00:00" + self.tellread_cores_per_task = "4" + self.tellread_mem_in_gb = "16" + self.tellread_modules = ["singularity_3.6.4"] + 
self.tellread_node_count = "1"
+        self.tellread_sing_script_path = "$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh"
+        self.tellread_wall_time_limit = "96:00:00"
+        self.tl_cores_per_task = "16"
+        self.tl_isolate_node_count = "1"
+        self.tl_isolate_wall_time_limit = "96:00:00"
+        self.tl_mem_in_gb = "160"
+        self.main_map = "/home/qiita_test/qiita-spots/20230906_FS10001773_68_BTR67708-1611.csv"
+        self.main_mode = "metagenomic"
+        self.main_seqrun_path = "/sequencing/seqmount/KL_iSeq_Runs/20230906_FS10001773_68_BTR67708-1611"
+
+        # TODO: Address reference_map and reference_base
+        self.main_reference_base = ""
+        self.main_reference_map = ""
+
     def _generate_job_scripts(self):
         scripts = [
             {
                 "template": "cloudspades.sbatch",
                 "params": {
                     "job_name": "cs-assemble",
-                    "wall_time_limit": "24:00:00",
-                    "mem_in_gb": "128",
-                    "node_count": "1",
-                    "cores_per_task": "12",
-                    "queue_name": "qiita",
-                    "modules_to_load": ' '.join(["gcc_9.3.0"]),
-                    "spades_path": "TBD"
+                    "wall_time_limit": self.wall_time_limit,
+                    "mem_in_gb": self.cloudspades_mem_in_gb,
+                    "node_count": self.cloudspades_node_count,
+                    "cores_per_task": self.cloudspades_cores_per_task,
+                    "queue_name": self.queue_name,
+                    "modules_to_load": ' '.join(self.cloudspades_modules),
+                    "cloudspades_path": self.cloudspades_path
                 }
             },
             {
                 "template": "cloudspades-isolate.sbatch",
                 "params": {
                     "job_name": "cs-assemble",
-                    "wall_time_limit": "24:00:00",
-                    "mem_in_gb": "64",
-                    "node_count": "1",
-                    "cores_per_task": "12",
-                    "queue_name": "qiita",
-                    "modules_to_load": ' '.join(["gcc_9.3.0"]),
-                    "spades_path": "~/spades-cloudspades-paper/assembler/"
+                    "wall_time_limit": self.cloudspades_wall_time_limit,
+                    "mem_in_gb": self.cs_isolate_mem_in_gb,
+                    "node_count": self.cloudspades_node_count,
+                    "cores_per_task": self.cloudspades_cores_per_task,
+                    "queue_name": self.queue_name,
+                    "modules_to_load": ' '.join(self.cloudspades_modules),
+                    "cloudspades_path": self.cloudspades_path
                 }
             },
             {
                 "template": "integrate.sbatch",
                 "params": {
                     "job_name": "integrate",
-                    "wall_time_limit": "24:00:00",
-                    "mem_in_gb": "8",
-                    "node_count": "1",
-                    "cores_per_task": "1",
-                    "queue_name": "qiita"
+                    "wall_time_limit": self.integrate_wall_time_limit,
+                    "mem_in_gb": self.integrate_mem_in_gb,
+                    "node_count": self.integrate_node_count,
+                    "cores_per_task": self.integrate_cores_per_task,
+                    "iinp_script_path": self.integrate_indicies_script_path,
+                    "queue_name": self.queue_name
+                }
+            },
+            {
+                "template": "compute_sequence_counts_for_normalization.sbatch",
+                "params": {
+                    "job_name": "norm",
+                    "wall_time_limit": self.counts_wall_time_limit,
+                    "mem_in_gb": self.counts_mem_in_gb,
+                    "node_count": self.counts_node_count,
+                    "cores_per_task": self.counts_cores_per_task,
+                    "sample_sheet": self.counts_sample_sheet,
+                    "plot_counts_path": self.counts_plot_counts_path,
+                    "output_path": self.tellread_output_path,
+                    "create_picklist_path": self.counts_create_picklist_path,
+                    "read_counts_path": join(self.tellread_output_path, self.counts_other_file),
+                    "queue_name": self.queue_name
                 }
             },
             {
                 "template": "telllink.sbatch",
                 "params": {
                     "job_name": "tellink",
-                    "wall_time_limit": "96:00:00",
-                    "mem_in_gb": "160",
-                    "node_count": "1",
-                    "cores_per_task": "16",
-                    "queue_name": "qiita",
-                    "modules_to_load": ' '.join(["singularity_3.6.4"]),
-                    "sing_path": "/projects/long_read_collab/code/tellseq/"
-                                 "release_v1.11/tellink-release/"
-                                 "run_tellink_sing.sh"
+                    "wall_time_limit": self.tellink_wall_time_limit,
+                    "mem_in_gb": self.tellink_mem_in_gb,
+                    "node_count": self.tellink_node_count,
+                    "cores_per_task": 
self.tellink_cores_per_task, + "queue_name": self.queue_name, + "modules_to_load": ' '.join(self.tellink_modules), + "output_path": self.tellread_output_path, + "sing_path": self.tellink_sing_path } }, { "template": "telllink-isolate.sbatch", "params": { "job_name": "tellink-isolate", - "wall_time_limit": "96:00:00", - "node_count": "1", - "cores_per_task": "16", - "mem_in_gb": "160", - "queue_name": "qiita", - "modules_to_load": ' '.join(["singularity_3.6.4"]), - "sing_path": "/projects/long_read_collab/code/tellseq/" - "release_v1.11/tellink-release/" - "run_tellink_sing.sh" + "wall_time_limit": self.tellink_wall_time_limit, + "node_count": self.tl_isolate_node_count, + "cores_per_task": self.tl_cores_per_task, + "mem_in_gb": self.tl_mem_in_gb, + "queue_name": self.queue_name, + "modules_to_load": ' '.join(self.tellink_modules), + "output_path": self.tellread_output_path, + "sing_path": self.tellink_sing_path } }, { "template": "tellread.sbatch", "params": { "job_name": "tellread", - "wall_time_limit": "96:00:00", - "mem_in_gb": "16", - "node_count": "1", - "tmp_dir": "/panfs/${USER}/tmp", - "cores_per_task": "4", - "queue_name": "qiita", - "tellread_sbatch_tmp_dir": "/panfs/${USER}/tmp", - "sing_script_path": "$HOME/qiita-spots/tellread-release" - "-novaseqX/run_tellread_sing.sh", - "modules_to_load": ' '.join(["singularity_3.6.4"]) + "wall_time_limit": self.tellread_wall_time_limit, + "mem_in_gb": self.tellread_mem_in_gb, + "node_count": self.tellread_node_count, + "tmp_dir": self.tmp1_path, + "cores_per_task": self.tellread_cores_per_task, + "queue_name": self.queue_name, + "sing_script_path": self.tellread_sing_script_path, + "modules_to_load": ' '.join(self.tellread_modules) } }, { "template": "tellread-cleanup.sbatch", "params": { "job_name": "cleanup", - "wall_time_limit": "24:00:00", - "mem_in_gb": "8", - "node_count": "1", - "cores_per_task": "1", - "queue_name": "qiita" + "wall_time_limit": self.clean_wall_time_limit, + "mem_in_gb": self.clean_mem_in_gb, + "node_count": self.clean_node_count, + "cores_per_task": self.clean_cores_per_task, + "queue_name": self.queue_name } }, + # these hardcoded paths for tellread.sh need to be replaced with + # the lane number and run-directory path, and the lane and the + # mode from the user input. Note that we also need to process the + # upcoming sample-sheet in order to generate the mapping we need + # as well. { "template": "tellread.sh", "params": { - "tellread_map": "/home/qiita_test/qiita-spots/" - "tellread_mapping.csv", - "seqrun_path": "/sequencing/igm_runs/" - "240216_LH00444_0058_A22357VLT4", - "lane": 'L008', - "reference_map": "", - "reference_base": "", - "mode": "metagenomic" + "tellread_map": self.main_map, + "seqrun_path": self.main_seqrun_path, + "output_path": self.tellread_output_path, + "lane": self.lane, + "reference_map": self.main_reference_map, + "reference_base": self.main_reference_base, + "mode": self.main_mode } } ] @@ -207,9 +289,8 @@ def _generate_job_scripts(self): with open(job_script_path, 'w') as f: f.write(template.render(**params)) - - if script['template'] == "tellread.sh": - self.job_script_path = job_script_path + # TODO: Change from 777 to something more appropriate. + chmod(job_script_path, 0o777) def run(self, callback=None): """ @@ -221,25 +302,126 @@ def run(self, callback=None): """ # Unlike other Jobs that submit a Slurm script and wait for the job - # to complete, this Job will execute the tellread.sh shell script. - # It is this script that does all of the Slurm job creation. 
This Job
-        # will need another means to tell when a job has completed
-        # successfully.
-
-        command = ("./tellread.sh -s /sequencing/igm_runs/240216_LH00444"
-                   "_0058_A22357VLT4 -i ./samplesheet.csv -l L008 -m "
-                   "metagenomic")
-
-        if self.job_script_path:
-            res = self._system_call(command)
-        else:
+        # to complete, this Job() will execute an existing shell script that
+        # spawns all the jobs that perform the actual work.
+
+        # tellread.sh performs some work that requires it to run on a compute
+        # node. Since Job()s run on the interactive node, an interactive
+        # shell on a compute node must be requested for this script to run on.
+
+        # define 'sjob' here for clarity. This should be more than adequate
+        # resources to run the tellread.sh script and exit as it does not wait
+        # on its children to complete.
+
+        # as with the original scripts, the scripts generated by Jinja2 will
+        # live in the current working directory. Hence, the script will always
+        # exist at ./tellread.sh provided it was created successfully.
+        sjob = "srun -N 1 -n 1 -p qiita --mem 4g --time 1:00:00 --pty bash -l"
+        command = (f"{sjob}; pushd .;cd {self.output_path}; ./tellread.sh; "
+                   "popd; exit")
+
+        if not exists(join(self.output_path, 'tellread.sh')):
             raise PipelineError("tellread.sh script could not be found.")

+        res = self._system_call(command)
+
         if res['return_code'] != 0:
             raise PipelineError("tellread.sh script did not execute correctly")

-        # res['stdout']
-        # res['stderr']
+        # once _system_call() returns and tellread.sh executed correctly, then
+        # a pids file should exist in the output subdirectory.
+        pids_fp = join(self.output_path, 'output', 'pids')
+        if not exists(pids_fp):
+            raise PipelineError("TRConvertJob could not locate a pids file")
+
+        with open(pids_fp, 'r') as f:
+            lines = f.readlines()
+            lines = [x.strip().split(': ') for x in lines]
+            results = {k: v for (k, v) in lines}
+
+        child_processes = [('main tellread', 'TRJOB_RETURN_CODE',
+                            'TRJOB_PID', True),
+                           ('counts', 'NORM_COUNTS_JOB_RETURN_CODE',
+                            'NORM_COUNTS_JOB_PID', False),
+                           ('integrate', 'INTEGRATE_JOB_RETURN_CODE',
+                            'INTEGRATE_JOB_PID', True),
+                           ('csj', 'CSJ_JOB_RETURN_CODE',
+                            'CSJ_JOB_PID', False),
+                           ('tlj', 'TLJ_JOB_RETURN_CODE',
+                            'TLJ_JOB_PID', False),
+                           ('cleanup', 'CLEANUP_JOB_RETURN_CODE',
+                            'CLEANUP_JOB_PID', True)]
+
+        # Iterate through all the TellRead script's known child processes.
+        # Some children will be optional depending on the parameters given,
+        # while others are required. The Job() should immediately raise an
+        # error if any child (optional or not) exits unsuccessfully, however.
+        for name, code, _, is_required in child_processes:
+            if code in results:
+                if results[code] != '0':
+                    raise PipelineError(f"An error ({results[code]}) occurred "
+                                        f"running {name} subprocess")
+            else:
+                if is_required:
+                    raise PipelineError(f"The {name} subprocess did not "
+                                        "execute correctly")
+
+        # Get a list of Slurm job ids that we need to wait on and text
+        # descriptions of what they are.
+        jids = [(results[x[2]], x[0]) for x in child_processes if
+                x[2] in results]
+
+        # ensure the jids are cast to integers before passing them.
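+        # NB: each entry in 'jids' is a (slurm-job-id, description) tuple,
+        # e.g. ('1234567', 'main tellread'); the comprehension below extracts
+        # just the numeric ids for _wait_on_job_ids().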
+ statuses = self._wait_on_job_ids([int(x[0]) for x in jids]) + + for (jid, description), status in zip(jids, statuses): + if status not in Job.slurm_status_successful: + raise PipelineError(f"process '{description}' ({jid}) " + f"failed ({status}") + + # post-process working directory to make it appear like results + # generated by ConvertJob + + integrated_files_path = join(self.output_path, 'output', "integrated") + + if not exists(integrated_files_path): + raise ValueError(f"{integrated_files_path} does not exist") + + # move integrated directory to TRConvertJob directory, co-level with + # output directory. This makes it easier to delete the rest of the + # output that we don't need. + + # move err and out logs into logs subdirectory. + for root, dirs, files in walk(self.output_path): + for _file in files: + _path = join(root, _file) + if _path.endswith('.err'): + move(_path, join(self.output_path, 'logs')) + elif _path.endswith('.out'): + move(_path, join(self.output_path, 'logs')) + # don't go below one level. + break + + # save two logs and move them into standard Job logs directory. + move(join(self.output_path, 'output', 'log'), + join(self.output_path, 'logs')) + move(join(self.output_path, 'output', 'output.log'), + join(self.output_path, 'logs')) + + # rename the files and move them into project directories. + for root, dirs, files in walk(integrated_files_path): + for _file in files: + fastq_file = join(root, _file) + self._post_process_file(fastq_file, self.lane, self.mapping) + + # move project folders from integrated directory to working_dir. + contents = listdir(integrated_files_path) + for name in contents: + move(join(integrated_files_path, name), + self.output_path) + + # delete the original output directory. + rmtree(join(self.output_path, 'output')) def parse_logs(self): raise PipelineError("parsing logs not implemented.") @@ -247,3 +429,59 @@ def parse_logs(self): @staticmethod def parse_job_script(job_script_path): raise PipelineError("parsing job script not implemented.") + + def _post_process_file(self, fastq_file, lane, mapping): + # generate names of the form generated by bcl-convert/bcl2fastq: + # _S#_L00#__001.fastq.gz + # see: + # https://help.basespace.illumina.com/files-used-by-basespace/ + # fastq-files + _dir, _file = split(fastq_file) + + # ex: integrated/C544.R2.fastq.gz + m = match(r"(C5\d\d)\.([R,I]\d)\.fastq.gz", _file) + + if m is None: + raise ValueError(f"The filename '{_file}' is not of a " + "recognizable form") + + adapter_id = m[1] + read_type = m[2] + + if adapter_id not in mapping: + raise ValueError(f"{adapter_id} is not present in mapping") + + sample_name, sample_index, project_name = mapping[adapter_id] + + # generate the new filename for the fastq file, and reorganize the + # files by project. + new_name = "%s_S%d_L%s_%s_001.fastq.gz" % (sample_name, + sample_index, + str(lane).zfill(3), + read_type) + + # ensure that the project directory exists before we rename and move + # the file to that location. + makedirs(join(_dir, project_name), exist_ok=True) + + # if there's an error renaming and moving the file, let it pass up to + # the user. + final_path = join(_dir, project_name, new_name) + rename(fastq_file, final_path) + return final_path + + def _generate_sample_mapping(self): + # this generates a sample mapping for the C501-C596 adapters used by + # the vendor to a sample-name and project. In production use this + # mapping would need to be created from the future sample-sheet. 
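+        # e.g. under this placeholder scheme, sample_mapping['C501'] maps to
+        # ('MySample1', 1, 'Project2'): a (sample-name, sample-index,
+        # project-name) tuple keyed on the vendor adapter id.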
+ project_names = ['Project1', 'Project2', 'Project3'] + sample_mapping = {} + + for sample_index in range(1, 97): + adapter_id = "C%s" % str(sample_index + 500) + sample_name = "MySample%d" % sample_index + project_name = project_names[sample_index % 3] + sample_mapping[adapter_id] = (sample_name, sample_index, + project_name) + + return sample_mapping diff --git a/sequence_processing_pipeline/contrib/create_picklist.py b/sequence_processing_pipeline/contrib/create_picklist.py new file mode 100644 index 00000000..44906872 --- /dev/null +++ b/sequence_processing_pipeline/contrib/create_picklist.py @@ -0,0 +1,65 @@ +import os +from scipy.stats import mannwhitneyu, zscore +from sklearn.linear_model import LogisticRegression +from contextlib import suppress +import pandas as pd +from metapool.metapool import * +from metapool import (make_sample_sheet, requires_dilution, dilute_gDNA, + find_threshold, autopool, extract_stats_metadata) +from sys import argv + +input_sheet_filename = argv[1] +#input_sheet_filename = input_sheet_filename.rsplit('.', 1)[0] + '.read_counts.tsv' +#instead construct the needed path and pass it. + +plate_df_w_reads = pd.read_csv(input_sheet_filename, + sep='\t') +plate_df_w_reads['Blank'] = [True if 'blank' in s.lower() else False + for s in plate_df_w_reads['Sample_Name']] +reads_column = 'read_counts' + +well_col = 'Sample_Well' +assert reads_column in plate_df_w_reads.columns + +f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8)) +# evenness plot +rmax = int(round(plate_df_w_reads[reads_column].max(),-2)) +survival_df = pd.concat([read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] == True, + reads_column], label='Blanks',rmax=rmax), + read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] == False, + reads_column], label='Samples',rmax=rmax)]) + +ax3.set_xlabel(reads_column) +ax3.set_ylabel('Samples') +survival_df.plot(color = ['coral','steelblue'],ax=ax1) +ax1.set_xlabel(reads_column) +ax1.set_ylabel('Samples') + +##Histogram +sns.histplot(plate_df_w_reads[reads_column],ax=ax3) + +#Boxplot +sns.boxplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax = ax4); +sns.stripplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax = ax4, + size=3,color='black',alpha=0.5) + + +plt.tight_layout() +plt.savefig(input_sheet_filename + '.comboplot.pdf') + +#plate_df_w_reads = plate_df_w_reads[plate_df_w_reads[reads_column] > 0] +plate_df_normalized = calculate_iseqnorm_pooling_volumes(plate_df_w_reads,dynamic_range=20, + normalization_column=reads_column) +plt.savefig(input_sheet_filename + '.normalizedplot.pdf') + +vols = make_2D_array(plate_df_normalized, data_col='iSeq normpool volume', well_col=well_col).astype(float) + +# Write the picklist as .csv +picklist_fp = input_sheet_filename + '.picklist.csv' + +if os.path.isfile(picklist_fp): + print("Warning! This file exists already.") + +picklist = format_pooling_echo_pick_list(vols, max_vol_per_well=30000) +with open(picklist_fp,'w') as f: + f.write(picklist) diff --git a/sequence_processing_pipeline/contrib/integrate-indices-np.py b/sequence_processing_pipeline/contrib/integrate-indices-np.py new file mode 100644 index 00000000..9500cff9 --- /dev/null +++ b/sequence_processing_pipeline/contrib/integrate-indices-np.py @@ -0,0 +1,330 @@ +# Why +# 1) cloudspades requires the index reads be inline in the record header +# 2) Ariadne requires the data are sorted by the barcodes +# +# Inlining is easy. 
Sorting is complex as the amount of data is large, and +# the ordering stems is determined external to the data being sorted. To +# determine order, all barcodes must be read in to gather the complete +# barcode <-> record association; if only partial data is read then +# associations to barcodes may be missed, and we cannot perform an insertion sort +# efficiently as we're writing to disk. Once we know an order for the records, +# we (currently) read in the entirety of the subsequent data (R1 then R2), +# reorder, and write. Performing this in blocks to minimize memory may be +# possible, but we have to assume access is random as a grouping barcode +# may be with any record along the file. +# +# A variety of approaches were considered, including: +# - indexing portions in a hashtable, reading inputs multiple times, and +# writing in blocks. This was tested in both rust and python. The amount of +# memory was large, and keeping it under control would be many many many +# passes over data on disk or in memory +# - using pandas to do the grouping, which possibly avoids the memory burden +# of a hashmap. it didn't +# - using mmap files. No go, these are large and we have to walk over them +# a lot. +# +# Parsing this stuff adds a lot of overhead in Python. It will add some, if not +# a lot, in rust as well -- our test data had 65M sequences. So the current +# approach operates in the raw file data itself, using regex's to parse +# individual records. We use numpy for sorting and getting record orders. +# This is memory expensive but so far much less than the other approaches tried +# and it does not require multiple passes over files. We bottleneck on write +# IO, so to mitigate that, we are using a parallel gzip (pgzip), which still +# bottlenecks but gets better throughput. +# +# There probably are smarter ways to do this to reduce the memory burden. +# Right now, it's O(N) where N is the number of records. We load R1 and R2 +# separately though so we at least halve the memory use. As for doing it +# faster, at the moment we appear to saturate time on gzip. Easiest solution +# would be to increase the number of threads, but then again, this process +# is expected to run in an array, and filesystem can only take so much. +# +# In addition to the inline tests, md5 checks to verify all record IDs are +# present in both R1 / R2, and relative to original input. Spot checks on +# an arbitrary set of records were performed on R1 / R2 to verify no apparent +# unusual modification. And spot checks were performed to verify that correct +# barcodes are incorporating as expected in output. +# +# author: Daniel McDonald (d3mcdonald@eng.ucsd.edu) +import numpy as np +import click +import re +import io +import pgzip +import gzip + + +RECORD = re.compile(rb'@\S+\n[ATGCN]+\n\+\n\S+\n') +BARCODE = re.compile(rb'@\S+\n([ATGCN]+)\n\+\n\S+\n') + + +def gather_order(i1_in_fp): + """Determine record order + + This is a fancy way of saying: get all the barcodes, and sort them. + + We return the order of the sorted records, the unique barcodes, + and the bounds for what barcode associated with what record + """ + # determine barcode length + _ = i1_in_fp.readline() + b = i1_in_fp.readline() + rec_len = len(b.strip()) + i1_in_fp.seek(0) + + # we need larger data in memory later anyway... + i1 = i1_in_fp.read() + start = 0 + end = len(i1) + + # get the number of records. 
we completely assume non-multiline fastq here + newlines = i1.count(b'\n') + assert newlines % 4 == 0 + barcodes = np.empty(newlines // 4, dtype='|S%d' % rec_len) + + # walk all index records + # grab each barcode + idx = 0 + while start < end: + barcode_result = BARCODE.search(i1, pos=start) + barcode = barcode_result.groups()[0] + assert len(barcode) == rec_len # get angry if the barcode is weird + + barcodes[idx] = barcode + idx += 1 + start = barcode_result.end() + + # we no longer need the raw data so let's toss it + del i1 + + # determine the record order of a lexicographic sort + # gather the unique barcodes so we can use them later, and the bounding + # points in the sorted set + record_order = barcodes.argsort() + barcodes = barcodes[record_order] + unique_barcodes, barcode_bounds = np.unique(barcodes, return_index=True) + + return record_order, unique_barcodes, barcode_bounds + + +def test_gather_order(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + exp_order = np.array([0, 2, 1, 4, 3, 5, 6]) + exp_unique = np.array([b'ATGC', b'TTGG', b'TTTT']) + exp_bounds = np.array([0, 2, 4]) + + assert (order == exp_order).all() + assert (unique == exp_unique).all() + assert (bounds == exp_bounds).all() + + +def troll_and_write(order, unique, bounds, in_, out_): + """Walk over the raw data, spit out barcode amended records in order + + - read all data + - get index boundaries for each record + - pull out each record in order according to the barcode data + - associate the barcode + - write + """ + + data = in_.read() + boundaries = np.empty([order.size, 2], dtype=np.uint64) + + stop = 0 + for idx in range(order.size): + rec = RECORD.search(data, pos=stop) + start, stop = rec.span() + boundaries[idx] = np.array([start, stop], dtype=np.uint64) + + current_barcode_idx = 0 + current_barcode = unique[current_barcode_idx] + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + for order_idx, record_idx in enumerate(order): + if order_idx >= current_barcode_bound_end: + current_barcode_idx += 1 + + if current_barcode_idx >= bounds.size: + raise ValueError("should not happen?") + current_barcode = unique[current_barcode_idx] + + if current_barcode_idx + 1 >= bounds.size: + # run to the end + current_barcode_bound_end = order.size + else: + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + start, stop = boundaries[record_idx] + record = data[start:stop] + + # in a one-off, these might pass by chance. It would be real weird + # for them to always pass for all records in a large file. 
+ # n.b., b'foo'[0] is int, because yay, so we use a slice to maintain + # a human readable character to test against as most mortals haven't + # memorized the ascii table + assert record[:1] == b'@' + assert record[-1:] == b'\n' + + with_barcode = insert_barcode(record, current_barcode) + out_.write(with_barcode) + + +def test_troll_and_write(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + # we assume records are in the same order, as that has previously been + # observed w/ tellread and is the normal expectation + r1data = [b'@foo', b'AATGC', b'+', b'!!!!!', + b'@bar', b'ATTGG', b'+', b'!!!!!', + b'@baz', b'AATGC', b'+', b'!!!!!', + b'@oof', b'ATTTT', b'+', b'!!!!!', + b'@rab', b'ATTGG', b'+', b'!!!!!', + b'@zab', b'ATTTT', b'+', b'!!!!!', + b'@ofo', b'ATTTT', b'+', b'!!!!!', b''] + r1 = io.BytesIO(b'\n'.join(r1data)) + r1out = io.BytesIO() + troll_and_write(order, unique, bounds, r1, r1out) + r1out.seek(0) + + r1exp = [b'@foo BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@baz BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@bar BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@rab BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@oof BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@zab BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@ofo BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b''] + r1exp = b'\n'.join(r1exp) + assert r1exp == r1out.read() + + +def create_tag(t): + return b'BX:Z:%s-1' % t + + +def create_tag_no_suffix(t): + return b'BX:Z:%s' % t + + +def insert_barcode(record, barcode): + """Get the current ID, smash the needed tag in""" + # @foo\nATGC\n+\n!!!!\n + id_, remainder = record.split(b'\n', 1) + tag = create_tag(barcode) + return b'%s %s\n%s' % (id_, tag, remainder) + + +def readfq(fp): + if fp.mode == 'rb': + strip = bytes.strip + else: + strip = str.strip + + id_ = iter(fp) + seq = iter(fp) + dumb = iter(fp) + qual = iter(fp) + for rec in zip(id_, seq, dumb, qual): + yield list(map(strip, rec)) + + +def writefq(rec, out): + for item in rec: + out.write(item) + out.write(b'\n') + + +@click.group() +def cli(): + pass + + +@cli.command() +def tests(): + test_gather_order() + test_troll_and_write() + + +@cli.command() +@click.option('--r1-in', type=click.Path(exists=True), required=True) +@click.option('--r2-in', type=click.Path(exists=True), required=True) +@click.option('--i1-in', type=click.Path(exists=True), required=True) +@click.option('--r1-out', type=click.Path(exists=False), required=True) +@click.option('--r2-out', type=click.Path(exists=False), required=True) +@click.option('--threads', type=int, required=False, default=1) +@click.option('--no-sort', is_flag=True, default=False) +def integrate(r1_in, r2_in, i1_in, r1_out, r2_out, threads, no_sort): + r1_in_fp = open(r1_in, 'rb') + r2_in_fp = open(r2_in, 'rb') + i1_in_fp = open(i1_in, 'rb') + + if no_sort: + r1_out_fp = gzip.open(r1_out, mode='wb') + r2_out_fp = gzip.open(r2_out, mode='wb') + + r1_sniff = r1_in_fp.readline().strip() + r2_sniff = r2_in_fp.readline().strip() + r1_in_fp.seek(0) + r2_in_fp.seek(0) + + # outputs from tellread don't seem to have orientation information + # some downstream programs hate this, so let's add if needed. 
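+        # e.g. a record id already ending in b'/1' is left as-is, while ids
+        # without a suffix get b'/1' (R1) or b'/2' (R2) appended below.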
+ if r1_sniff.endswith(b'/1'): + if not r2_sniff.endswith(b'/2'): + raise ValueError(f'unexpected endings: {r1_sniff.decode("utf-8")} {r2_sniff.decode("utf-8")}') + orient_r1 = '' + orient_r2 = '' + else: + assert b'/1' not in r1_sniff + + orient_r1 = b'/1' + orient_r2 = b'/2' + + for (r1, r2, i1) in zip(*map(readfq, [r1_in_fp, r2_in_fp, i1_in_fp])): + assert r1[0] == r2[0] + assert r1[0] == i1[0] + + tag = create_tag_no_suffix(i1[1]) + r1[0] = b"%s%s %s" % (r1[0], orient_r1, tag) + r2[0] = b"%s%s %s" % (r2[0], orient_r2, tag) + writefq(r1, r1_out_fp) + writefq(r2, r2_out_fp) + r1_out_fp.close() + r2_out_fp.close() + else: + # 200MB is what they use in their readme... + r1_out_fp = pgzip.open(r1_out, mode='wb', thread=threads, + blocksize=2*10**8) + r2_out_fp = pgzip.open(r2_out, mode='wb', thread=threads, + blocksize=2*10**8) + + order, unique, bounds = gather_order(i1_in_fp) + + for in_, out_ in zip([r1_in_fp, r2_in_fp], [r1_out_fp, r2_out_fp]): + troll_and_write(order, unique, bounds, in_, out_) + in_.close() + out_.close() + + +if __name__ == '__main__': + cli() diff --git a/sequence_processing_pipeline/contrib/plot_counts.py b/sequence_processing_pipeline/contrib/plot_counts.py new file mode 100644 index 00000000..76c822ee --- /dev/null +++ b/sequence_processing_pipeline/contrib/plot_counts.py @@ -0,0 +1,27 @@ +import matplotlib.pyplot as plt +import re +import sys +import os +import pandas as pd + +ex = re.compile(r'_I1_(C5\d\d).fastq.gz.corrected.err_barcode_removed.fastq') + +# remove total line from wc +data = [l.strip().split(' ') for l in open(sys.argv[1])][:-1] +plotdata = [(ex.search(i).groups()[0], int(v) / 4) for v, i in data] +sheetdata = dict(plotdata) + +ordered = sorted(plotdata, key=lambda x: x[1]) +f = plt.figure(figsize=(16, 8)) +plt.bar([i for i, _ in ordered], [v for _, v in ordered]) +plt.ylabel('I1 reads') +plt.xticks(list(range(len(ordered))), [i for i, _ in ordered], rotation=90) +plt.savefig(sys.argv[3] + '/counts.pdf') + +sheet = pd.read_csv(sys.argv[2], dtype=str) +sheet = sheet[~sheet['Lane'].isnull()] +sheet['read_counts'] = [sheetdata[i] for i in sheet['Barcode_ID']] +name = os.path.basename(sys.argv[2]).rsplit('.', 1)[0] +newname = name + '.read_counts.tsv' + +sheet.to_csv(sys.argv[3] + '/' + newname, sep='\t', index=False, header=True) diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 4296abfb..261c11c7 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -1,20 +1,14 @@ #!/bin/bash -l -#SBATCH -J {{job_name}} # cs-assemble -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 64G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 12 -#SBATCH -p {{queue_name}} # qiita +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err +#SBATCH --output cloudspades-isolate_%x-%A_%a.out +#SBATCH --error cloudspades-isolate_%x-%A_%a.err -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. 
source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -24,8 +18,6 @@ function logger () { set -x set -e -# this gets set in the environment from another script. For now let's -# run with that. echo $TMPDIR if [[ -z "${LABELTAG}" ]]; then @@ -39,13 +31,9 @@ if [[ ! -d ${base} ]]; then exit 1 fi -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate activate qiime2-2023.5 - -module load {{modules_to_load}} # gcc_9.3.0 +module load {{modules_to_load}} -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) # assumes 1-based array index, eg --array 1-N sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} @@ -59,8 +47,8 @@ if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then fi mkdir -p ${cs} +pushd {{cloudspades_path}}/assembler/bin -pushd {{spades_path}} ./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ @@ -69,16 +57,15 @@ pushd {{spades_path}} module unload gcc_9.3.0 popd -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate quast - -quast \ - -o ${cs}/quast-scaffolds \ - -t ${SLURM_JOB_CPUS_PER_NODE} \ - ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 +# TODO: Look for alternative method to load quast +#mamba activate quast +#quast \ +# -o ${cs}/quast-scaffolds \ +# -t ${SLURM_JOB_CPUS_PER_NODE} \ +# ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 +# # remove intermediates that currently dont have a downstream use -if [[ -d ${cs}/K21 ]]; then - rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -fi +#if [[ -d ${cs}/K21 ]]; then +# rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +#fi diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index e1c2bb40..636dd5ce 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -6,15 +6,9 @@ #SBATCH -c {{cores_per_task}} # 12 #SBATCH -p {{queue_name}} # qiita -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err +#SBATCH --output cloudspades_%x-%A_%a.out +#SBATCH --error cloudspades_%x-%A_%a.err -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. source activate qiime2-2023.5 function logger () { echo "$(date) :: ${@}"; @@ -37,13 +31,9 @@ if [[ ! -d ${base} ]]; then exit 1 fi -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate activate qiime2-2023.5 - -module load {{modules_to_load}} # gcc_9.3.0 +module load {{modules_to_load}} -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) # assumes 1-based array index, eg --array 1-N sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} @@ -57,9 +47,8 @@ if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then fi mkdir -p ${cs} -pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin +pushd {{cloudspades_path}}/assembler/bin -# for now don't use spades.py jinja2 variable ./spades.py \ -o ${cs} \ --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ @@ -69,13 +58,14 @@ pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin module unload gcc_9.3.0 popd -mamba activate quast -quast \ - -o ${cs}/quast-scaffolds \ - -t ${SLURM_JOB_CPUS_PER_NODE} \ - ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 +# TODO: Look for alternative method to load quast +#mamba activate quast +#quast \ +# -o ${cs}/quast-scaffolds \ +# -t ${SLURM_JOB_CPUS_PER_NODE} \ +# ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 # remove intermediates that currently dont have a downstream use -if [[ -d ${cs}/K21 ]]; then - rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -fi +#if [[ -d ${cs}/K21 ]]; then +# rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp +#fi diff --git a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch new file mode 100644 index 00000000..a4b31114 --- /dev/null +++ b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch @@ -0,0 +1,57 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} # norm +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +#SBATCH --output compute_sequence_counts_%x-%A_%a.out +#SBATCH --error compute_sequence_counts_%x-%A_%a.err + +# NB: output appears normal w/out. +# source activate qiime2-2023.5 + +function logger () { + echo "$(date) :: ${@}"; + echo "$(date) :: ${@}" 1>&2; +} + +set -x +set -e +set -o pipefail + +echo $TMPDIR + +tellread=${TELLREAD_OUTPUT} +if [[ ! -d ${tellread} ]]; then + echo "${tellread} not found" + exit 1 +fi + +if [[ ! -d ${tellread}/Full ]]; then + echo "${tellread}/Full not found" + exit 1 +fi + +if [[ -z {{output_path}} ]]; then + echo "OUTPUT not specified" + exit 1 +fi + +if [[ -z {{sample_sheet}} ]]; then + echo "SAMPLESHEET not specified" + exit 1 +fi + +if [[ ! -f {{sample_sheet}} ]]; then + echo "SAMPLESHEET not found" + exit 1 +fi + +mkdir -p {{output_path}} +wc -l ${tellread}/Full/*_I1_C5[0-9][0-9].fastq.gz.corrected.err_barcode_removed.fastq > {{output_path}}/record_counts.txt +python {{plot_counts_path}} {{output_path}}/record_counts.txt {{sample_sheet}} {{output_path}} + +conda activate qp-knight-lab-processing-2022.03 +python {{create_picklist_path}} {{read_counts_path}} diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index f4161466..30a3a9ba 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -6,22 +6,14 @@ #SBATCH -c {{cores_per_task}} # 1 #SBATCH -p {{queue_name}} # qiita -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err +#SBATCH --output integrate_%x-%A_%a.out +#SBATCH --error integrate_%x-%A_%a.err -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. 
for now we'll leave it hardcoded. -source activate rust function logger () { echo "$(date) :: ${@}"; echo "$(date) :: ${@}" 1>&2; } - # https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html cores=${SLURM_CPUS_PER_TASK} @@ -55,7 +47,7 @@ set -x set -e set -o pipefail -samples=($(cat ${tellread}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${tellread}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} export TMPDIR=$(mktemp -d) @@ -114,8 +106,8 @@ fi # not mask a nonzero exit status (e.g., the python process raising) cat ${i1} | gzip > ${i1out} -mamba activate tellread-integrate -python ${BASE}/integrate-indices-np.py integrate \ +conda activate qp-knight-lab-processing-2022.03 +python {{iinp_script_path}} integrate \ --no-sort \ --r1-in ${r1} \ --r2-in ${r2} \ diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index f842cddf..b8f9d735 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -6,31 +6,26 @@ #SBATCH --time {{wall_time_limit}} # 96:00:00 #SBATCH -p {{queue_name}} # qiita -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL +#SBATCH --output telllink-isolate_%x-%A_%a.out +#SBATCH --error telllink-isolate_%x-%A_%a.err set -x set -e -module load {{modules_to_load}} # singularity_3.6.4 +module load {{modules_to_load}} if [[ -z "${LABELTAG}" ]]; then echo "LABELTAG is not specified" exit 1 fi -base=/panfs/qiita/TELLREAD/${LABELTAG} +base={{output_path}} if [[ ! -d ${base} ]]; then echo "${base} not found" exit 1 fi -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} k=79 diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index 39daa383..234192b2 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -6,34 +6,29 @@ #SBATCH --time {{wall_time_limit}} # 96:00:00 #SBATCH -p {{queue_name}} # qiita -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL +#SBATCH --output telllink_%x-%A_%a.out +#SBATCH --error telllink_%x-%A_%a.err set -x set -e -module load {{modules_to_load}} # singularity_3.6.4 +module load {{modules_to_load}} if [[ -z "${LABELTAG}" ]]; then echo "LABEL is not specified" exit 1 fi -base=/panfs/${USER}/${LABELTAG} +base={{output_path}} if [[ ! 
-d ${base} ]]; then echo "${base} not found" exit 1 fi -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) +samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -# leave these hardcoded for now +# TODO: leave these hardcoded for now k=79 lc=35 cores=${SLURM_CPUS_PER_TASK} @@ -62,4 +57,3 @@ mkdir -p ${tl} if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping fi - diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index d5edf855..2cb479e7 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -6,13 +6,8 @@ #SBATCH -c {{cores_per_task}} # 1 #SBATCH -p {{queue_name}} # qiita -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=BEGIN,FAIL - -# for now these can be left hard-coded. -#SBATCH --output %x-%A.out -#SBATCH --error %x-%A.err +#SBATCH --output tellread-cleanup_%x-%A.out +#SBATCH --error tellread-cleanup_%x-%A.err if [[ -z "${OUTPUT}" ]]; then echo "OUTPUT is not specified" diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 89633da9..fe8d39d9 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -1,19 +1,13 @@ #!/bin/bash -l -#SBATCH -J {{job_name}} # tellread -#SBATCH -p {{queue_name}} # qiita -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 4 -#SBATCH --mem {{mem_in_gb}}G # 16G -#SBATCH --time {{wall_time_limit}} # 96:00:00 +#SBATCH -J {{job_name}} +#SBATCH -p {{queue_name}} +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH --time {{wall_time_limit}} -# for now these can be left hard-coded. -#SBATCH --partition=short -#SBATCH --output %x-%A.out -#SBATCH --error %x-%A.err - -# for now comment these out as qiita is responsible for notifying users. 
-###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=BEGIN,FAIL +#SBATCH --output tellread_%x-%A.out +#SBATCH --error tellread_%x-%A.err function logger () { echo "$(date) :: ${@}"; @@ -86,7 +80,7 @@ fi mkdir -p ${OUTPUT} -module load {{modules_to_load}} # singularity_3.6.4 +module load {{modules_to_load}} {{sing_script_path}} \ -i ${seqrun_path} \ -o ${OUTPUT} \ @@ -96,7 +90,6 @@ module load {{modules_to_load}} # singularity_3.6.4 ${extra} \ -l ${lane} - if [[ -d ${OUTPUT}/Full ]]; then echo "Run appears successful" elif [[ -d ${OUTPUT}/1_demult/Full ]]; then diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh index ac7c6d31..d6c61cb0 100755 --- a/sequence_processing_pipeline/templates/tellread.sh +++ b/sequence_processing_pipeline/templates/tellread.sh @@ -4,7 +4,7 @@ seqrunpath="{{seqrun_path}}" # previously -s option lane="{{lane}}" # previously -l option reference_map="{{reference_map}}" # previously -r option reference_base="{{reference_base}}" # previously -b option -mode="{{mode}}" $ # previously -m option +mode="{{mode}}" # previously -m option # preserve error-checking of parameters to preserve as much of the original # script as possible, even though this could be done in python. @@ -37,7 +37,7 @@ fi safepath=$(echo ${seqrunpath} | sed 's:/*$::') label=$(basename ${safepath}) labeltag=${label}-${tag} -output=/panfs/${USER}/${labeltag} +output={{output_path}} if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then echo "Cannot access the lane" @@ -85,7 +85,12 @@ declare -a s declare -a g # below extended regex might be broken because C5\d\d happens in column 0, not column 1 # of the hacked sample-sheet. -for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) +# for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) + +# new sample-sheet is of form: +# Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Barcode_ID,Sample_Project,Well_description,Lane +# 10283.LS.4.4.2015,10283.LS.4.4.2015,Plate_1,A1,C501,LS_Timeseries_TellSeq_10283,10283.LS.4.4.2015,1 +for sample in $(egrep -o ",C5..," ${samplesheet} | tr -d "," | sort) do echo "sample found: ${sample}" # get references if they exist @@ -168,7 +173,9 @@ if [[ -f ${submitcopy} ]]; then exit 1 fi -echo $@ > ${arguments} +#TODO: Other possible arguments like -r? +echo "-l {{lane}} -s {{seqrun_path}} -i {{tellread_map}} -m {{mode}}" >${arguments} + cp ${0} ${scriptcopy} cp ${submit_script} ${submitcopy} cp ${asm_cloudspades_script} ${asmcscopy} @@ -188,6 +195,9 @@ trjob=$(sbatch \ --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ ${submit_script}) +echo "TRJOB_RETURN_CODE: $?" > {{output_path}}/pids +echo "TRJOB_PID: $trjob" >> {{output_path}}/pids + if [[ ${norm} == "TRUE" ]]; then cp ${norm_script} ${normcopy} chmod gou-w ${normcopy} @@ -197,6 +207,8 @@ if [[ ${norm} == "TRUE" ]]; then -J ${labeltag}-${datetag}-norm-counts \ --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ ${norm_script}) + echo "NORM_COUNTS_JOB_RETURN_CODE: $?" >> {{output_path}}/pids + echo "NORM_COUNTS_JOB_PID: $norm_counts_job" >> {{output_path}}/pids fi integrate_job=$(sbatch \ @@ -207,6 +219,9 @@ integrate_job=$(sbatch \ --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ ${integrate_script}) +echo "INTEGRATE_JOB_RETURN_CODE: $?" 
>> {{output_path}}/pids +echo "INTEGRATE_JOB_PID: $integrate_job" >> {{output_path}}/pids + if [[ ${assemble} == "TRUE" ]]; then csj=$(sbatch \ --parsable \ @@ -215,6 +230,10 @@ if [[ ${assemble} == "TRUE" ]]; then --array 1-${n_samples} \ --export LABELTAG=${labeltag},OUTPUT=${output} \ ${asm_cloudspades_script}) + + echo "CSJ_JOB_RETURN_CODE: $?" >> {{output_path}}/pids + echo "CSJ_JOB_PID: $csj" >> {{output_path}}/pids + tlj=$(sbatch \ --parsable \ --dependency=aftercorr:${integrate_job} \ @@ -222,6 +241,10 @@ if [[ ${assemble} == "TRUE" ]]; then --array 1-${n_samples} \ --export LABELTAG=${labeltag},OUTPUT=${output} \ ${asm_tellink_script}) + + echo "TLJ_JOB_RETURN_CODE: $?" >> {{output_path}}/pids + echo "TLJ_JOB_PID: $tlj" >> {{output_path}}/pids + cleanupdep=${csj}:${tlj} else cleanupdep=${integrate_job} @@ -234,3 +257,6 @@ cleanup=$(sbatch \ --dependency=afterok:${cleanupdep} \ --export OUTPUT=${output} \ ${clean_script}) + +echo "CLEANUP_JOB_RETURN_CODE: $?" >> {{output_path}}/pids +echo "CLEANUP_JOB_PID: $cleanup" >> {{output_path}}/pids diff --git a/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv new file mode 100644 index 00000000..f696f0c9 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv @@ -0,0 +1,41 @@ +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Barcode_96_Well_Position,Barcode_ID,Sample_Project,Well_description,Lane +Person.A.TELLSEQ.R20.microbe,Person.A.TELLSEQ.R20.microbe,TellSeq3_15196_P3,A1,A4,C525,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.microbe,1 +Person.B.TELLSEQ.R24.microbe,Person.B.TELLSEQ.R24.microbe,TellSeq3_15196_P3,B1,B4,C526,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.microbe,1 +Person.C.TELLSEQ.R21.microbe,Person.C.TELLSEQ.R21.microbe,TellSeq3_15196_P3,C1,C4,C527,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.microbe,1 +Person.D.TELLSEQ.R26.microbe,Person.D.TELLSEQ.R26.microbe,TellSeq3_15196_P3,D1,D4,C528,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.microbe,1 +Person.E.TELLSEQ.R19.microbe,Person.E.TELLSEQ.R19.microbe,TellSeq3_15196_P3,E1,E4,C529,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.microbe,1 +Pet.C.TELLSEQ.R23.microbe,Pet.C.TELLSEQ.R23.microbe,TellSeq3_15196_P3,F1,F4,C530,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.microbe,1 +BLANK.TELLSEQ.3.12.H.microbe,BLANK.TELLSEQ.3.12.H.microbe,TellSeq3_15196_P3,G1,G4,C531,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.microbe,1 +Isolate.115.R1.microbe,Isolate.115.R1.microbe,TellSeq3_15196_P1,H1,H4,C532,TellSeq3_15196_P3,Isolate.115.R1.microbe,1 +Zymo.Mock.Community.R1.microbe,Zymo.Mock.Community.R1.microbe,TellSeq3_15196_P1,A2,A5,C533,TellSeq3_15196_P3,Zymo.Mock.Community.R1.microbe,1 +E.coli.QC.DNA.R1.microbe,E.coli.QC.DNA.R1.microbe,TellSeq3_15196_P1,B2,B5,C534,TellSeq3_15196_P3,E.coli.QC.DNA.R1.microbe,1 +Person.A.TELLSEQ.R20.purified.microbe,Person.A.TELLSEQ.R20.purified.microbe,TellSeq3_15196_P3,C2,C5,C535,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.purified.microbe,1 +Person.B.TELLSEQ.R24.purified.microbe,Person.B.TELLSEQ.R24.purified.microbe,TellSeq3_15196_P3,D2,D5,C536,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.purified.microbe,1 +Person.C.TELLSEQ.R21.purified.microbe,Person.C.TELLSEQ.R21.purified.microbe,TellSeq3_15196_P3,E2,E5,C537,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.purified.microbe,1 +Person.D.TELLSEQ.R26.purified.microbe,Person.D.TELLSEQ.R26.purified.microbe,TellSeq3_15196_P3,F2,F5,C538,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.purified.microbe,1 
+Person.E.TELLSEQ.R19.purified.microbe,Person.E.TELLSEQ.R19.purified.microbe,TellSeq3_15196_P3,G2,G5,C539,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.purified.microbe,1 +Pet.C.TELLSEQ.R23.purified.microbe,Pet.C.TELLSEQ.R23.purified.microbe,TellSeq3_15196_P3,H2,H5,C540,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.purified.microbe,1 +BLANK.TELLSEQ.3.12.H.purified.microbe,BLANK.TELLSEQ.3.12.H.purified.microbe,TellSeq3_15196_P3,A3,A6,C541,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.purified.microbe,1 +Isolate.115.R2.microbe,Isolate.115.R2.microbe,TellSeq3_15196_P1,B3,B6,C542,TellSeq3_15196_P3,Isolate.115.R2.microbe,1 +Zymo.Mock.Community.R2.microbe,Zymo.Mock.Community.R2.microbe,TellSeq3_15196_P1,C3,C6,C543,TellSeq3_15196_P3,Zymo.Mock.Community.R2.microbe,1 +E.coli.QC.DNA.R2.microbe,E.coli.QC.DNA.R2.microbe,TellSeq3_15196_P1,D3,D6,C544,TellSeq3_15196_P3,E.coli.QC.DNA.R2.microbe,1 +Person.A.TELLSEQ.R20.std,Person.A.TELLSEQ.R20.std,TellSeq3_15196_P3,A1,A1,C501,TellSeq3_15196,Person.A.TELLSEQ.R20.std,1 +Person.B.TELLSEQ.R24.std,Person.B.TELLSEQ.R24.std,TellSeq3_15196_P3,B1,B1,C502,TellSeq3_15196,Person.B.TELLSEQ.R24.std,1 +Person.C.TELLSEQ.R21.std,Person.C.TELLSEQ.R21.std,TellSeq3_15196_P3,C1,C1,C503,TellSeq3_15196,Person.C.TELLSEQ.R21.std,1 +Person.D.TELLSEQ.R26.std,Person.D.TELLSEQ.R26.std,TellSeq3_15196_P3,D1,D1,C504,TellSeq3_15196,Person.D.TELLSEQ.R26.std,1 +Person.E.TELLSEQ.R19.std,Person.E.TELLSEQ.R19.std,TellSeq3_15196_P3,E1,E1,C505,TellSeq3_15196,Person.E.TELLSEQ.R19.std,1 +Pet.C.TELLSEQ.R23.std,Pet.C.TELLSEQ.R23.std,TellSeq3_15196_P3,F1,F1,C506,TellSeq3_15196,Pet.C.TELLSEQ.R23.std,1 +BLANK.TELLSEQ.3.12.H.std,BLANK.TELLSEQ.3.12.H.std,TellSeq3_15196_P3,G1,G1,C507,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.std,1 +Isolate.115.R1.std,Isolate.115.R1.std,TellSeq3_15196_P1,H1,H1,C508,TellSeq3_15196,Isolate.115.R1.std,1 +Zymo.Mock.Community.R1.std,Zymo.Mock.Community.R1.std,TellSeq3_15196_P1,A2,A2,C509,TellSeq3_15196,Zymo.Mock.Community.R1.std,1 +E.coli.QC.DNA.R1.std,E.coli.QC.DNA.R1.std,TellSeq3_15196_P1,B2,B2,C510,TellSeq3_15196,E.coli.QC.DNA.R1.std,1 +Person.A.TELLSEQ.R20.purified.std,Person.A.TELLSEQ.R20.purified.std,TellSeq3_15196_P3,C2,C2,C511,TellSeq3_15196,Person.A.TELLSEQ.R20.purified.std,1 +Person.B.TELLSEQ.R24.purified.std,Person.B.TELLSEQ.R24.purified.std,TellSeq3_15196_P3,D2,D2,C512,TellSeq3_15196,Person.B.TELLSEQ.R24.purified.std,1 +Person.C.TELLSEQ.R21.purified.std,Person.C.TELLSEQ.R21.purified.std,TellSeq3_15196_P3,E2,E2,C513,TellSeq3_15196,Person.C.TELLSEQ.R21.purified.std,1 +Person.D.TELLSEQ.R26.purified.std,Person.D.TELLSEQ.R26.purified.std,TellSeq3_15196_P3,F2,F2,C514,TellSeq3_15196,Person.D.TELLSEQ.R26.purified.std,1 +Person.E.TELLSEQ.R19.purified.std,Person.E.TELLSEQ.R19.purified.std,TellSeq3_15196_P3,G2,G2,C515,TellSeq3_15196,Person.E.TELLSEQ.R19.purified.std,1 +Pet.C.TELLSEQ.R23.purified.std,Pet.C.TELLSEQ.R23.purified.std,TellSeq3_15196_P3,H2,H2,C516,TellSeq3_15196,Pet.C.TELLSEQ.R23.purified.std,1 +BLANK.TELLSEQ.3.12.H.purified.std,BLANK.TELLSEQ.3.12.H.purified.std,TellSeq3_15196_P3,A3,A3,C517,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.purified.std,1 +Isolate.115.R2.std,Isolate.115.R2.std,TellSeq3_15196_P1,B3,B3,C518,TellSeq3_15196,Isolate.115.R2.std,1 +Zymo.Mock.Community.R2.std,Zymo.Mock.Community.R2.std,TellSeq3_15196_P1,C3,C3,C519,TellSeq3_15196,Zymo.Mock.Community.R2.std,1 +E.coli.QC.DNA.R2.std,E.coli.QC.DNA.R2.std,TellSeq3_15196_P1,D3,D3,C520,TellSeq3_15196,E.coli.QC.DNA.R2.std,1 From 1d431a3d9cb455a1e598a0762602a2bc465440f7 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 15 Sep 2024 
17:52:59 -0700 Subject: [PATCH 08/47] Manually merged with current master --- sequence_processing_pipeline/Job.py | 229 ++++++++---------- sequence_processing_pipeline/Pipeline.py | 33 +++ .../tests/test_Pipeline.py | 56 +++++ 3 files changed, 186 insertions(+), 132 deletions(-) diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 035d8ba0..6a5d4f86 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -25,6 +25,15 @@ class Job: 'SPECIAL_EXIT', 'STAGE_OUT', 'STOPPED', 'SUSPENDED'] + slurm_status_not_running = (slurm_status_terminated + + slurm_status_successful) + + slurm_status_all_states = (slurm_status_terminated + + slurm_status_successful + + slurm_status_running) + + polling_interval_in_seconds = 60 + def __init__(self, root_dir, output_path, job_name, executable_paths, max_array_length, modules_to_load=None): """ @@ -204,76 +213,96 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} - def _wait_on_job(self, job_id, callback=None): - job_info = {'job_id': None, 'job_name': None, 'job_state': None, - 'elapsed_time': None} - - exit_count = 0 - - while True: - result = self._system_call(f"sacct -P -n --job {job_id} --format " - "JobID,JobName,State,Elapsed,ExitCode") - - if result['return_code'] != 0: - # sacct did not successfully submit the job. - raise ExecFailedError(result['stderr']) - - # [-1] remove the extra \n - jobs_data = result['stdout'].split('\n')[:-1] - states = dict() - estatuses = dict() - for i, jd in enumerate(jobs_data): - jid, jname, jstate, etime, estatus = jd.split('|') - if jid.endswith('.extern') or jid.endswith('.batch'): - continue + def wait_on_job_ids(self, job_ids, callback=None): + ''' + Wait for the given job-ids to finish running before returning. + :param job_ids: A list of Slurm job-ids + :param callback: Set callback function that receives status updates. + :return: A dictionary of job-ids and their current statuses. + ''' - if i == 0: - job_info['job_id'] = jid - job_info['job_name'] = jname - job_info['elapsed_time'] = etime - job_info['exit_status'] = estatus + # wait_on_job_ids was broken out of submit_job() and updated to monitor + # multiple job ids. This will allow multiple jobs to be submitted to + # Slurm in parallel and a single wait_on_job_ids() can wait on all of + # them before returning, optionally submitting callbacks for each + # job-id. + + def query_slurm(job_ids): + # internal function query_slurm encapsulates the handling of + # squeue. + count = 0 + while True: + result = self._system_call("squeue -t all -j " + f"{','.join(job_ids)} " + "-o '%F,%A,%T'") + + if result['return_code'] == 0: + # there was no issue w/squeue, break this loop and + # continue. + break + else: + # there was a likely intermittent issue w/squeue. Pause + # and wait before trying a few more times. If the problem + # persists then report the error and exit. 
+ count += 1 - if jstate not in states: - states[jstate] = 0 - states[jstate] += 1 + if count > 3: + raise ExecFailedError(result['stderr']) - if estatus not in estatuses: - estatuses[estatus] = 0 - estatuses[estatus] += 1 + sleep(60) - job_info['job_state'] = f'{states}' - job_info['exit_status'] = f'{estatuses}' + lines = result['stdout'].split('\n') + lines.pop(0) # remove header + lines = [x.split(',') for x in lines if x != ''] - if callback is not None: - callback(jid=job_id, status=f'{states}') + jobs = {} + child_jobs = {} + for job_id, unique_id, state in lines: + jobs[unique_id] = state - logging.debug("Job info: %s" % job_info) + if unique_id != job_id: + child_jobs[unique_id] = job_id # job is a child job - # if job is completed after having run or exited after having - # run, then stop waiting. - if not set(states) - {'COMPLETED', 'FAILED', 'CANCELLED'}: - # break - exit_count += 1 + return jobs, child_jobs - if exit_count > 4: + while True: + jobs, child_jobs = query_slurm(job_ids) + + for jid in job_ids: + logging.debug("JOB %s: %s" % (jid, jobs[jid])) + if callback is not None: + callback(jid=jid, status=jobs[jid]) + + children = [x for x in child_jobs if child_jobs[x] == jid] + if len(children) == 0: + logging.debug("\tNO CHILDREN") + for cid in children: + logging.debug("\tCHILD JOB %s: %s" % (cid, jobs[cid])) + status = [jobs[x] in Job.slurm_status_not_running for x in job_ids] + + if set(status) == {True}: + # all jobs either completed successfully or terminated. break - sleep(10) + sleep(Job.polling_interval_in_seconds) - return job_info, states, estatuses + return jobs def submit_job(self, script_path, job_parameters=None, - script_parameters=None, exec_from=None, callback=None): + script_parameters=None, wait=True, + exec_from=None, callback=None): """ - Submit a Torque job script and optionally wait for it to finish. - :param script_path: The path to a Torque job (bash) script. + Submit a Slurm job script and optionally wait for it to finish. + :param script_path: The path to a Slurm job (bash) script. :param job_parameters: Optional parameters for scheduler submission. :param script_parameters: Optional parameters for your job script. + :param wait: Set to False to submit job and not wait. :param exec_from: Set working directory to execute command from. :param callback: Set callback function that receives status updates. - :return: Dictionary containing the job's id, name, status, and - elapsed time. Raises PipelineError if job could not be submitted or - if job was unsuccessful. + :return: If wait is True, a dictionary containing the job's id and + status. If wait is False, the Slurm job-id of the submitted + job. Raises PipelineError if job could not be submitted or if + job was unsuccessful. """ if job_parameters: cmd = 'sbatch %s %s' % (job_parameters, script_path) @@ -302,96 +331,32 @@ def submit_job(self, script_path, job_parameters=None, # Just to give some time for everything to be set up properly sleep(10) - job_info, states, estatuses = self._wait_on_job(job_id, - callback=callback) + if wait is False: + # return job_id since that is the only information for this new + # job that we have available. User should expect that this is + # not a dict if they explicitly set wait=False. + return job_id - if job_info['job_id'] is None: - # job was never in the queue - return an error. - if callback is not None: - callback(jid=job_id, status='ERROR') + # the user is expecting a dict with 'job_id' and 'job_state' + # attributes. 
This method will return a dict w/job_ids as keys and + # their job status as values. This must be munged before returning + # to the user. + results = self.wait_on_job_ids([job_id], callback=callback) - raise JobFailedError(f"job {job_id} never appeared in the " - "queue.") + job_result = {'job_id': job_id, 'job_state': results[job_id]} - # job was once in the queue if callback is not None: - callback(jid=job_id, status=job_info['job_state']) + callback(jid=job_id, status=job_result['job_state']) - if set(states) == {'COMPLETED'}: - if 'exit_status' in job_info: - if set(estatuses) == {'0:0'}: - # job completed successfully - return job_info - else: - exit_status = job_info['exit_status'] - raise JobFailedError(f"job {job_id} exited with exit_" - f"status {exit_status}") - else: - # with no other info, assume job completed successfully - return job_info + if job_result['job_state'] == 'COMPLETED': + return job_result else: - # job exited unsuccessfully raise JobFailedError(f"job {job_id} exited with status " - f"{job_info['job_state']}") - - def _wait_on_job_ids(self, job_ids, timeout_in_seconds=None): - """ - Wait on a list of known Slurm job-ids. - :param job_ids: A list of Slurm job-ids - :param timeout_in_seconds: Abort and raise an Error after n seconds. - :return: A list of strings, representing the state of each job. - """ - - # this method is useful for wrapping scripts that spawn child jobs and - # the user wishes to wait until they are all completed before - # continuing. - if not isinstance(job_ids, list): - raise ValueError("job_ids must be a list of valid slurm job ids") - - if set([isinstance(x, int) for x in job_ids]) != {True}: - raise ValueError("job_ids must contain integers") - - if timeout_in_seconds: - if not isinstance(timeout_in_seconds, int): - raise ValueError("timeout_in_seconds must be an integer") - - if timeout_in_seconds < 1: - raise ValueError("timeout_in_seconds must be greater than 0") - - start_time = time() - while True: - if timeout_in_seconds: - if time() - start_time > timeout_in_seconds: - raise PipelineError("timeout reached while waiting for " - "jobs") - - job_states = [] - for job_id in job_ids: - # NB: sacct can support querying on multiple job-ids at once. - # However, this would require extensive rewriting and testing - # of the existing code. Deferring for now. - _, states, _ = self._wait_on_job(job_id) - job_states.append(set(states)) - - # assuming that a Slurm job will never contain states from both - # terminated and successful, this will generate a list containing - # the current state for each job. - result = [set(x) & set(Job.slurm_status_terminated + - Job.slurm_status_successful) for x in job_states] - - if set([bool(x) for x in result]) == {True}: - # all jobs are no longer in a running state. - break - - sleep(10) - - # return the current state of each job. Assume that each set contains - # only one value. - return [''.join(x) for x in result] + f"{job_result['job_state']}") def _group_commands(self, cmds): # break list of commands into chunks of max_array_length (Typically - # 1000 for Torque job arrays). To ensure job arrays are never more + # 1000 for Slurm job arrays). To ensure job arrays are never more # than 1000 jobs long, we'll chain additional commands together, and # evenly distribute them amongst the first 1000. 
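        # An illustrative example of the grouping described above (not part
        # of the original comment): with max_array_length=1000 and 2,500
        # sorted commands, the extra 1,500 commands are distributed over the
        # first 1,000 groups, so the job array stays at 1,000 tasks and each
        # task runs two or three chained commands.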
cmds.sort() diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index e308f38c..88319353 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -14,6 +14,7 @@ from collections import defaultdict from datetime import datetime from xml.etree import ElementTree as ET +from metapool.prep import PREP_MF_COLUMNS logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) @@ -235,6 +236,38 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, self._configure_profile() + def identify_reserved_words(self, words): + ''' + Returns a list of words that should not appear as column names in any + project referenced in the Pipeline's sample-sheet/pre-prep file. + :param words: A list of words that may include reserved words. + :return: A list of words that are already reserved in upper, lower, + and mixed cases. + ''' + + # Only strings used as column names in pre-prep files are currently + # considered 'reserved' as loading a pre-prep file containing these + # column names will fail if one or more of the strings already appears + # as a column name in a study's sample metadata table. + + # This implementation assumes some understanding of metapool's impl, + # specifically how the proper set of prep-info file columns are + # generated. For now the functionality will be defined here as this + # area of metapool is currently in flux. + if self.mapping_file is not None: + reserved = PREP_MF_COLUMNS + else: + # results will be dependent on SheetType and SheetVersion of + # the sample-sheet. Since all columns in a prep-info file are + # lower()ed before writing out to file, the word must be + # reserved in all case forms. e.g.: 'Sample_Well' and 'Sample_well' + # are both forms of 'sample_well'. + reserved = [x.lower() for x in + self.sample_sheet.CARRIED_PREP_COLUMNS] + \ + self.sample_sheet.GENERATED_PREP_COLUMNS + + return list(set([x.lower() for x in words]) & set(reserved)) + def _configure_profile(self): # extract the instrument type from self.run_dir and the assay type # from self.sample_sheet (or self.mapping_file). diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index 37abb5b9..fff4b07d 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -28,6 +28,7 @@ def setUp(self): makedirs(self.output_file_path, exist_ok=True) self.maxDiff = None self.good_sample_sheet_path = self.path('good-sample-sheet.csv') + self.good_legacy_sheet_path = self.path('mgv90_test_sheet.csv') self.mp_sheet_path = self.path('multi-project-sheet.csv') self.bad_sample_sheet_path = self.path('duplicate_sample-sample-sheet' '.csv') @@ -1630,6 +1631,38 @@ def test_parse_project_name(self): obs = pipeline._parse_project_name(test, t_set == 'True') self.assertEqual(obs, exp) + def test_identify_reserved_words(self): + pipeline = Pipeline(self.good_config_file, self.good_run_id, + self.good_sample_sheet_path, None, + self.output_file_path, self.qiita_id, + Pipeline.METAGENOMIC_PTYPE) + + # assert that arbitrary strings are not reserved. + obs = pipeline.identify_reserved_words(['NOT_A_RESERVED_WORD', + 'ANOTHER_WORD']) + self.assertEqual(obs, []) + + # assert that 'well_id_384' is a reserved word. 
+ obs = pipeline.identify_reserved_words(['well_id_384', + 'NOT_A_RESERVED_WORD']) + + self.assertEqual(obs, ['well_id_384']) + + # create new pipeline using a/legacy (v90) metagenomic sample-sheet. + pipeline = Pipeline(self.good_config_file, self.good_run_id, + self.good_legacy_sheet_path, None, + self.output_file_path, self.qiita_id, + Pipeline.METAGENOMIC_PTYPE) + + # assert that for legacy sample-sheets, well_id_384 is NOT a reserved + # word and the appropriate reserved word is 'Sample_well'. + obs = pipeline.identify_reserved_words(['well_id_384', + 'NOT_A_RESERVED_WORD', + 'Sample_well', + 'Sample_Well']) + + self.assertEqual(obs, ['sample_well']) + class TestAmpliconPipeline(unittest.TestCase): def setUp(self): @@ -2339,6 +2372,29 @@ def test_process_run_info_file(self): # These are indirectly tested as generate_dummy_sample_sheet() is # called by Pipeline's constructor. + def test_identify_reserved_words(self): + pipeline = Pipeline(self.good_config_file, + self.good_run_id, + None, + self.good_mapping_file_path, + self.output_file_path, + self.qiita_id, + Pipeline.AMPLICON_PTYPE) + + # assert that arbitrary strings are not reserved. + obs = pipeline.identify_reserved_words(['NOT_A_RESERVED_WORD', + 'ANOTHER_WORD']) + self.assertEqual(obs, []) + + # assert that Sample_Well is okay for current pre-prep files but + # well_id_384 is reserved. Show that all forms of tm300_8_tool are + # also reserved. + obs = pipeline.identify_reserved_words(['Sample_Well', + 'TM300_8_Tool', + 'tm300_8_tool', + 'well_id_384']) + self.assertEqual(set(obs), {'tm300_8_tool', 'well_id_384'}) + class TestInstrumentUtils(unittest.TestCase): def setUp(self): From 7a84cd04bf976340278b2b2c18ba53062d222d28 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 15 Sep 2024 17:53:43 -0700 Subject: [PATCH 09/47] Manually merged with master --- .../tests/data/mgv90_test_sheet.csv | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv diff --git a/sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv b/sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv new file mode 100644 index 00000000..ded82519 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/mgv90_test_sheet.csv @@ -0,0 +1,40 @@ +[Header],,,,,,,,,, +IEMFileVersion,4,,,,,,,,, +SheetType,standard_metag,,,,,,,,, +SheetVersion,90,,,,,,,,, +Investigator Name,Caballero,,,,,,,,, +Experiment Name,RKL0042,,,,,,,,, +Date,2/26/20,,,,,,,,, +Workflow,GenerateFASTQ,,,,,,,,, +Application,FASTQ Only,,,,,,,,, +Assay,Metagenomic,,,,,,,,, +Description,,,,,,,,,, +Chemistry,Default,,,,,,,,, +,,,,,,,,,, +[Reads],,,,,,,,,, +150,,,,,,,,,, +150,,,,,,,,,, +,,,,,,,,,, +[Settings],,,,,,,,,, +ReverseComplement,0,,,,,,,,, +,,,,,,,,,, +[Data],,,,,,,,,, +Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Well_description +1,sample1,sample1,FooBar_666_p1,A1,iTru7_107_07,CCGACTAT,iTru5_01_A,ACCGACAA,Project_1111,s1 +1,sample2,sample2,FooBar_666_p1,A2,iTru7_107_08,CCGACTAT,iTru5_01_A,CTTCGCAA,Project_1111,s2 +3,sample1,sample1,FooBar_666_p1,A3,iTru7_107_09,GCCTTGTT,iTru5_01_A,AACACCAC,Project_1111,s1 +3,sample2,sample2,FooBar_666_p1,A4,iTru7_107_10,AACTTGCC,iTru5_01_A,CGTATCTC,Project_1111,s2 +3,sample3,sample3,FooBar_666_p1,A5,iTru7_107_11,CAATGTGG,iTru5_01_A,GGTACGAA,Trojecp_666,s5 +3,sample4,sample4,FooBar_666_p1,B6,iTru7_107_12,AAGGCTGA,iTru5_01_A,CGATCGAT,Trojecp_666,s6 
+3,sample5,sample5,FooBar_666_p1,B8,iTru7_107_13,TTACCGAG,iTru5_01_A,AAGACACC,Trojecp_666,s7 +,,,,,,,,,, +[Bioinformatics],,,,,,,,,, +Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,,, +Project_1111,1111,False,AACC,GGTT,False,Knight Lab Kapa HP,Eqiiperiment,,, +Trojecp_666,666,False,AACC,GGTT,False,Knight Lab Kapa HP,SomethingWitty,,, +,,,,,,,,,, +[Contact],,,,,,,,,, +Email,Sample_Project,,,,,,,,, +test@lol.com,Project_1111,,,,,,,,, +tester@rofl.com,Trojecp_666,,,,,,,,, +,,,,,,,,,, From 02364037b384be8d836afa13a54e5bba9b048c79 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 17 Sep 2024 10:18:14 -0700 Subject: [PATCH 10/47] Updates based on testing in qiita-rc --- sequence_processing_pipeline/Commands.py | 5 ++++- sequence_processing_pipeline/Job.py | 1 - sequence_processing_pipeline/TRConvertJob.py | 22 ++++++++++--------- .../tests/test_ConvertJob.py | 2 +- .../tests/test_FastQCJob.py | 2 +- .../tests/test_NuQCJob.py | 2 +- 6 files changed, 19 insertions(+), 15 deletions(-) diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index b2cd5e41..cce7c605 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -22,7 +22,8 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, # is now the following: # add one more level to account for project_names nested under ConvertJob # dir. - fastq_paths = glob.glob(data_location_path + '*/*/*.fastq.gz') + # this will ignore the _I1_ reads that appear in the integrated result. + fastq_paths = glob.glob(data_location_path + '/*/*_R?_001.fastq.gz') # convert from GB and halve as we sum R1 max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2) @@ -114,6 +115,8 @@ def demux(id_map, fp, out_d, task, maxtask): qual = iter(fp) for i, s, d, q in zip(id_, seq, dumb, qual): + # NB: This appears to not be causing the removal of the metadata + # either. 
fname_encoded, id_ = i.split(delimiter, 1) if fname_encoded not in openfps: diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 6a5d4f86..1c1a7593 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -9,7 +9,6 @@ import logging from inspect import stack import re -from time import time class Job: diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 98d9c18d..c4ca29c4 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -116,7 +116,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.cloudspades_path = "/home/qiita_test/qiita-spots/spades-cloudspades-0.1" self.cloudspades_wall_time_limit = "24:00:00" self.counts_cores_per_task = "1" - self.counts_create_picklist_path = "/home/qiita_test/qiita-spots/create_picklist.py", + self.counts_create_picklist_path = "/home/qiita_test/qiita-spots/create_picklist.py" self.counts_mem_in_gb = "8" self.counts_node_count = "1" self.counts_other_file = '20230906_FS10001773_68_BTR67708-1611.read_counts.tsv' @@ -154,6 +154,8 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.main_reference_base = "" self.main_reference_map = "" + self._generate_job_scripts() + def _generate_job_scripts(self): scripts = [ { @@ -189,7 +191,7 @@ def _generate_job_scripts(self): "wall_time_limit": self.integrate_wall_time_limit, "mem_in_gb": self.integrate_mem_in_gb, "node_count": self.integrate_node_count, - "cores_per_task": self.integtrate_cores_per_task, + "cores_per_task": self.integrate_cores_per_task, "iinp_script_path": self.integrate_indicies_script_path, "queue_name": self.queue_name } @@ -368,7 +370,7 @@ def run(self, callback=None): # Get a list of Slurm job ids that we need to wait on and text # descriptions of what they are. - jids = [(results[x[2], x[0]]) for x in child_processes if + jids = [(results[x[2]], x[0]) for x in child_processes if x[2] in results] # ensure the jids are casted to integers before passing them. @@ -377,7 +379,7 @@ def run(self, callback=None): for (jid, description), status in zip(jids, statuses): if status not in Job.slurm_status_successful: raise PipelineError(f"process '{description}' ({jid}) " - f"failed ({status}") + f"failed ({status})") # post-process working directory to make it appear like results # generated by ConvertJob @@ -412,7 +414,7 @@ def run(self, callback=None): for root, dirs, files in walk(integrated_files_path): for _file in files: fastq_file = join(root, _file) - self._post_process_file(fastq_file, self.lane, self.mapping) + self._post_process_file(fastq_file, self.mapping) # move project folders from integrated directory to working_dir. contents = listdir(integrated_files_path) @@ -430,7 +432,7 @@ def parse_logs(self): def parse_job_script(job_script_path): raise PipelineError("parsing job script not implemented.") - def _post_process_file(self, fastq_file, lane, mapping): + def _post_process_file(self, fastq_file, mapping): # generate names of the form generated by bcl-convert/bcl2fastq: # _S#_L00#__001.fastq.gz # see: @@ -455,10 +457,10 @@ def _post_process_file(self, fastq_file, lane, mapping): # generate the new filename for the fastq file, and reorganize the # files by project. 
- new_name = "%s_S%d_L%s_%s_001.fastq.gz" % (sample_name, - sample_index, - str(lane).zfill(3), - read_type) + new_name = "%s_S%d_%s_%s_001.fastq.gz" % (sample_name, + sample_index, + self.lane, + read_type) # ensure that the project directory exists before we rename and move # the file to that location. diff --git a/sequence_processing_pipeline/tests/test_ConvertJob.py b/sequence_processing_pipeline/tests/test_ConvertJob.py index df81fdcf..a6ebad23 100644 --- a/sequence_processing_pipeline/tests/test_ConvertJob.py +++ b/sequence_processing_pipeline/tests/test_ConvertJob.py @@ -952,7 +952,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) error_msg = ("This job died.\n2024-01-01T12:12:12Z thread 99999 ERROR:" diff --git a/sequence_processing_pipeline/tests/test_FastQCJob.py b/sequence_processing_pipeline/tests/test_FastQCJob.py index 28fe52cb..a2291296 100644 --- a/sequence_processing_pipeline/tests/test_FastQCJob.py +++ b/sequence_processing_pipeline/tests/test_FastQCJob.py @@ -1121,7 +1121,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) try: diff --git a/sequence_processing_pipeline/tests/test_NuQCJob.py b/sequence_processing_pipeline/tests/test_NuQCJob.py index 5164575c..88d4ef1c 100644 --- a/sequence_processing_pipeline/tests/test_NuQCJob.py +++ b/sequence_processing_pipeline/tests/test_NuQCJob.py @@ -992,7 +992,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. 
self.assertTrue(job._toggle_force_job_fail()) try: From 64583a22a8269387ce0c4e607df13b43ef0cd523 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 17 Sep 2024 10:41:20 -0700 Subject: [PATCH 11/47] flake8 --- sequence_processing_pipeline/TRConvertJob.py | 34 ++++++---- .../contrib/create_picklist.py | 65 ++++++++++--------- .../contrib/integrate-indices-np.py | 12 ++-- .../contrib/plot_counts.py | 2 +- 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index c4ca29c4..81b414d5 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -113,18 +113,24 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.cloudspades_mem_in_gb = "128" self.cloudspades_modules = ["gcc_9.3.0"] self.cloudspades_node_count = "1" - self.cloudspades_path = "/home/qiita_test/qiita-spots/spades-cloudspades-0.1" + self.cloudspades_path = ("/home/qiita_test/qiita-spots/spades-" + "cloudspades-0.1") self.cloudspades_wall_time_limit = "24:00:00" self.counts_cores_per_task = "1" - self.counts_create_picklist_path = "/home/qiita_test/qiita-spots/create_picklist.py" + self.counts_create_picklist_path = ("/home/qiita_test/qiita-spots/" + "create_picklist.py") self.counts_mem_in_gb = "8" self.counts_node_count = "1" - self.counts_other_file = '20230906_FS10001773_68_BTR67708-1611.read_counts.tsv' - self.counts_plot_counts_path = "/home/qiita_test/qiita-spots/plot_counts.py" - self.counts_sample_sheet = "/home/qiita_test/qiita-spots/20230906_FS10001773_68_BTR67708-1611.csv" + self.counts_other_file = ('20230906_FS10001773_68_BTR67708-1611.' + 'read_counts.tsv') + self.counts_plot_counts_path = ("/home/qiita_test/qiita-spots/'" + "'plot_counts.py") + self.counts_sample_sheet = ("/home/qiita_test/qiita-spots/" + "20230906_FS10001773_68_BTR67708-1611.csv") self.counts_wall_time_limit = "24:00:00" self.cs_isolate_mem_in_gb = "64" - self.integrate_indicies_script_path = "/home/qiita_test/qiita-spots/integrate-indices-np.py" + self.integrate_indicies_script_path = ("/home/qiita_test/qiita-spots/" + "integrate-indices-np.py") self.integrate_mem_in_gb = "8" self.integrate_node_count = "1" self.integrate_wall_time_limit = "24:00:00" @@ -134,21 +140,26 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.tellink_mem_in_gb = "160" self.tellink_modules = ["singularity_3.6.4"] self.tellink_node_count = "1" - self.tellink_sing_path = "/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh" + self.tellink_sing_path = ("/projects/long_read_collab/code/tellseq/" + "release_v1.11/tellink-release/" + "run_tellink_sing.sh") self.tellink_wall_time_limit = "96:00:00" self.tellread_cores_per_task = "4" self.tellread_mem_in_gb = "16" self.tellread_modules = ["singularity_3.6.4"] self.tellread_node_count = "1" - self.tellread_sing_script_path = "$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh" + self.tellread_sing_script_path = ("$HOME/qiita-spots/tellread-release" + "-novaseqX/run_tellread_sing.sh") self.tellread_wall_time_limit = "96:00:00" self.tl_cores_per_task = "16" self.tl_isolate_node_count = "1" self.tl_isolate_wall_time_limit = "96:00:00" self.tl_mem_in_gb = "160" - self.main_map = "/home/qiita_test/qiita-spots/20230906_FS10001773_68_BTR67708-1611.csv" + self.main_map = ("/home/qiita_test/qiita-spots/20230906_FS10001773_" + "68_BTR67708-1611.csv") self.main_mode = 
"metagenomic" - self.main_seqrun_path = "/sequencing/seqmount/KL_iSeq_Runs/20230906_FS10001773_68_BTR67708-1611" + self.main_seqrun_path = ("/sequencing/seqmount/KL_iSeq_Runs/20230906" + "_FS10001773_68_BTR67708-1611") # TODO: Address reference_map and reference_base self.main_reference_base = "" @@ -208,7 +219,8 @@ def _generate_job_scripts(self): "plot_counts_path": self.counts_plot_counts_path, "output_path": self.tellread_output_path, "create_picklist_path": self.counts_create_picklist_path, - "read_counts_path": join(self.tellread_output_path, self.counts_other_file), + "read_counts_path": join(self.tellread_output_path, + self.counts_other_file), "queue_name": self.queue_name } }, diff --git a/sequence_processing_pipeline/contrib/create_picklist.py b/sequence_processing_pipeline/contrib/create_picklist.py index 44906872..a1d6a1d0 100644 --- a/sequence_processing_pipeline/contrib/create_picklist.py +++ b/sequence_processing_pipeline/contrib/create_picklist.py @@ -1,19 +1,16 @@ import os -from scipy.stats import mannwhitneyu, zscore -from sklearn.linear_model import LogisticRegression -from contextlib import suppress -import pandas as pd -from metapool.metapool import * -from metapool import (make_sample_sheet, requires_dilution, dilute_gDNA, - find_threshold, autopool, extract_stats_metadata) +# from metapool.metapool import * from sys import argv +import pandas as pd +import matplotlib.pyplot as plt +from metapool.metapool import (read_survival, make_2D_array, + calculate_iseqnorm_pooling_volumes, + format_pooling_echo_pick_list) +import seaborn as sns input_sheet_filename = argv[1] -#input_sheet_filename = input_sheet_filename.rsplit('.', 1)[0] + '.read_counts.tsv' -#instead construct the needed path and pass it. -plate_df_w_reads = pd.read_csv(input_sheet_filename, - sep='\t') +plate_df_w_reads = pd.read_csv(input_sheet_filename, sep='\t') plate_df_w_reads['Blank'] = [True if 'blank' in s.lower() else False for s in plate_df_w_reads['Sample_Name']] reads_column = 'read_counts' @@ -23,36 +20,45 @@ f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8)) # evenness plot -rmax = int(round(plate_df_w_reads[reads_column].max(),-2)) -survival_df = pd.concat([read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] == True, - reads_column], label='Blanks',rmax=rmax), - read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] == False, - reads_column], label='Samples',rmax=rmax)]) +rmax = int(round(plate_df_w_reads[reads_column].max(), -2)) + +foo = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is True, + reads_column], + label='Blanks', + rmax=rmax) + +bar = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is False, + reads_column], + label='Samples', + rmax=rmax) + +survival_df = pd.concat([foo, bar]) ax3.set_xlabel(reads_column) ax3.set_ylabel('Samples') -survival_df.plot(color = ['coral','steelblue'],ax=ax1) +survival_df.plot(color=['coral', 'steelblue'], ax=ax1) ax1.set_xlabel(reads_column) ax1.set_ylabel('Samples') -##Histogram -sns.histplot(plate_df_w_reads[reads_column],ax=ax3) - -#Boxplot -sns.boxplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax = ax4); -sns.stripplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax = ax4, - size=3,color='black',alpha=0.5) +# Histogram +sns.histplot(plate_df_w_reads[reads_column], ax=ax3) +# Boxplot +sns.boxplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4) +sns.stripplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4, + size=3, color='black', alpha=0.5) 
plt.tight_layout() plt.savefig(input_sheet_filename + '.comboplot.pdf') -#plate_df_w_reads = plate_df_w_reads[plate_df_w_reads[reads_column] > 0] -plate_df_normalized = calculate_iseqnorm_pooling_volumes(plate_df_w_reads,dynamic_range=20, - normalization_column=reads_column) +pdfn = calculate_iseqnorm_pooling_volumes(plate_df_w_reads, + dynamic_range=20, + normalization_column=reads_column) plt.savefig(input_sheet_filename + '.normalizedplot.pdf') -vols = make_2D_array(plate_df_normalized, data_col='iSeq normpool volume', well_col=well_col).astype(float) +vols = make_2D_array(pdfn, + data_col='iSeq normpool volume', + well_col=well_col).astype(float) # Write the picklist as .csv picklist_fp = input_sheet_filename + '.picklist.csv' @@ -61,5 +67,6 @@ print("Warning! This file exists already.") picklist = format_pooling_echo_pick_list(vols, max_vol_per_well=30000) -with open(picklist_fp,'w') as f: + +with open(picklist_fp, 'w') as f: f.write(picklist) diff --git a/sequence_processing_pipeline/contrib/integrate-indices-np.py b/sequence_processing_pipeline/contrib/integrate-indices-np.py index 9500cff9..b1be83a6 100644 --- a/sequence_processing_pipeline/contrib/integrate-indices-np.py +++ b/sequence_processing_pipeline/contrib/integrate-indices-np.py @@ -6,10 +6,10 @@ # the ordering stems is determined external to the data being sorted. To # determine order, all barcodes must be read in to gather the complete # barcode <-> record association; if only partial data is read then -# associations to barcodes may be missed, and we cannot perform an insertion sort -# efficiently as we're writing to disk. Once we know an order for the records, -# we (currently) read in the entirety of the subsequent data (R1 then R2), -# reorder, and write. Performing this in blocks to minimize memory may be +# associations to barcodes may be missed, and we cannot perform an insertion +# sort efficiently as we're writing to disk. Once we know an order for the +# records, we (currently) read in the entirety of the subsequent data (R1 then +# R2), reorder, and write. Performing this in blocks to minimize memory may be # possible, but we have to assume access is random as a grouping barcode # may be with any record along the file. # @@ -291,7 +291,9 @@ def integrate(r1_in, r2_in, i1_in, r1_out, r2_out, threads, no_sort): # some downstream programs hate this, so let's add if needed. 
if r1_sniff.endswith(b'/1'): if not r2_sniff.endswith(b'/2'): - raise ValueError(f'unexpected endings: {r1_sniff.decode("utf-8")} {r2_sniff.decode("utf-8")}') + raise ValueError('unexpected endings: ' + f'{r1_sniff.decode("utf-8")} ' + f'{r2_sniff.decode("utf-8")}') orient_r1 = '' orient_r2 = '' else: diff --git a/sequence_processing_pipeline/contrib/plot_counts.py b/sequence_processing_pipeline/contrib/plot_counts.py index 76c822ee..ecab9e49 100644 --- a/sequence_processing_pipeline/contrib/plot_counts.py +++ b/sequence_processing_pipeline/contrib/plot_counts.py @@ -7,7 +7,7 @@ ex = re.compile(r'_I1_(C5\d\d).fastq.gz.corrected.err_barcode_removed.fastq') # remove total line from wc -data = [l.strip().split(' ') for l in open(sys.argv[1])][:-1] +data = [x.strip().split(' ') for x in open(sys.argv[1])][:-1] plotdata = [(ex.search(i).groups()[0], int(v) / 4) for v, i in data] sheetdata = dict(plotdata) From e7e7c5456aca1a53a8897a24adf1ebe23f9317d4 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 18 Sep 2024 21:35:10 -0700 Subject: [PATCH 12/47] Small fixes --- sequence_processing_pipeline/Job.py | 5 +++++ sequence_processing_pipeline/TRConvertJob.py | 7 +++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 1c1a7593..2d64b039 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -226,6 +226,9 @@ def wait_on_job_ids(self, job_ids, callback=None): # them before returning, optionally submitting callbacks for each # job-id. + # ensure all ids are strings to ensure proper working w/join(). + job_ids = [str(x) for x in job_ids] + def query_slurm(job_ids): # internal function query_slurm encapsulates the handling of # squeue. @@ -257,6 +260,8 @@ def query_slurm(job_ids): jobs = {} child_jobs = {} for job_id, unique_id, state in lines: + # ensure unique_id is of type string for downstream use. + unique_id = str(unique_id) jobs[unique_id] = state if unique_id != job_id: diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 81b414d5..572f3feb 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -96,8 +96,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, # As the sample-sheet is validated by the Pipeline object before # being passed to TRConvertJob, additional validation isn't needed. - self._generate_job_scripts() - # TODO: generate a sample-mapping to map C#s to fake sample-names and # fake projects. Process sample-sheet later. self.mapping = self._generate_sample_mapping() @@ -386,9 +384,10 @@ def run(self, callback=None): x[2] in results] # ensure the jids are casted to integers before passing them. - statuses = self._wait_on_job_ids([int(x[0]) for x in jids]) + statuses = self.wait_on_job_ids([int(x[0]) for x in jids]) - for (jid, description), status in zip(jids, statuses): + for jid, description in jids: + status = statuses[jid] if status not in Job.slurm_status_successful: raise PipelineError(f"process '{description}' ({jid}) " f"failed ({status})") From 497738f3d4e7439e4a8beea48282de09c002c8fb Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 24 Sep 2024 15:34:11 -0700 Subject: [PATCH 13/47] Refactor KISSLoader to be more DRY. 
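With KISSLoader now shared from Job.py, job classes such as NuQCJob and
TRConvertJob can build their jinja2 environments the same way. A minimal
usage sketch follows; it is not taken from the patch itself, and the
template name and render values are illustrative stand-ins for whatever a
given Job subclass actually supplies:

    from jinja2 import Environment
    from sequence_processing_pipeline.Job import KISSLoader

    # The loader resolves template names relative to the package's
    # templates/ directory, regardless of the caller's working directory.
    jinja_env = Environment(loader=KISSLoader('templates'))

    # Render one of the sbatch templates with job-specific values.
    template = jinja_env.get_template('tellread-cleanup.sbatch')
    script_text = template.render(job_name='cleanup',
                                  wall_time_limit='24:00:00',
                                  mem_in_gb='8',
                                  node_count='1',
                                  cores_per_task='1',
                                  queue_name='qiita')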
--- sequence_processing_pipeline/Job.py | 21 ++++++++++++++++ sequence_processing_pipeline/NuQCJob.py | 24 ++----------------- sequence_processing_pipeline/TRConvertJob.py | 25 +++----------------- 3 files changed, 26 insertions(+), 44 deletions(-) diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 2d64b039..59d9cea2 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -1,3 +1,6 @@ +from jinja2 import BaseLoader, TemplateNotFound +from os.path import getmtime +import pathlib from itertools import zip_longest from os import makedirs, walk from os.path import basename, exists, split, join @@ -11,6 +14,24 @@ import re +# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader +class KISSLoader(BaseLoader): + def __init__(self, path): + # pin the path for loader to the location sequence_processing_pipeline + # (the location of this file), along w/the relative path to the + # templates directory. + self.path = join(pathlib.Path(__file__).parent.resolve(), path) + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) + + class Job: slurm_status_terminated = ['BOOT_FAIL', 'CANCELLED', 'DEADLINE', 'FAILED', 'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index b1c27900..0ffacb1a 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -1,8 +1,7 @@ -from jinja2 import BaseLoader, TemplateNotFound from metapool import load_sample_sheet from os import stat, makedirs, rename -from os.path import join, basename, dirname, exists, abspath, getmtime -from sequence_processing_pipeline.Job import Job +from os.path import join, basename, dirname, exists, abspath +from sequence_processing_pipeline.Job import Job, KISSLoader from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) from sequence_processing_pipeline.Pipeline import Pipeline @@ -14,25 +13,6 @@ import glob import re from sys import executable -import pathlib - - -# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader -class KISSLoader(BaseLoader): - def __init__(self, path): - # pin the path for loader to the location sequence_processing_pipeline - # (the location of this file), along w/the relative path to the - # templates directory. 
- self.path = join(pathlib.Path(__file__).parent.resolve(), path) - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path) as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) logging.basicConfig(level=logging.DEBUG) diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 572f3feb..54542fef 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -1,31 +1,12 @@ -from jinja2 import BaseLoader, TemplateNotFound, Environment -from os.path import split, join, exists, getmtime -from sequence_processing_pipeline.Job import Job +from jinja2 import Environment +from os.path import split, join, exists +from sequence_processing_pipeline.Job import Job, KISSLoader from sequence_processing_pipeline.PipelineError import PipelineError -import pathlib from os import rename, walk, chmod, listdir, makedirs from shutil import move, rmtree from re import match -# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader -class KISSLoader(BaseLoader): - def __init__(self, path): - # pin the path for loader to the location sequence_processing_pipeline - # (the location of this file), along w/the relative path to the - # templates directory. - self.path = join(pathlib.Path(__file__).parent.resolve(), path) - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path) as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) - - class TRConvertJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, From ca71c1db29b73add9efb2169663871c074f77d66 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 1 Oct 2024 14:32:52 -0700 Subject: [PATCH 14/47] Pipeline.py updated to support changes in qp-klp --- sequence_processing_pipeline/Pipeline.py | 120 ++++++++++-------- sequence_processing_pipeline/TRConvertJob.py | 103 ++++++++++++++- .../tests/test_Pipeline.py | 105 ++++++--------- 3 files changed, 206 insertions(+), 122 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 88319353..3dd19371 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -133,25 +133,19 @@ class Pipeline: assay_types = [AMPLICON_ATYPE, METAGENOMIC_ATYPE, METATRANSCRIPTOMIC_ATYPE] - def __init__(self, configuration_file_path, run_id, sample_sheet_path, - mapping_file_path, output_path, qiita_job_id, pipeline_type): + def __init__(self, configuration_file_path, run_id, input_file_path, + output_path, qiita_job_id, pipeline_type): """ Initialize Pipeline object w/configuration information. :param configuration_file_path: Path to configuration.json file. :param run_id: Used w/search_paths to locate input run_directory. - :param sample_sheet_path: Path to sample-sheet. - :param mapping_file_path: Path to mapping file. + :param input_file_path: Path to sample-sheet or pre-prep file. :param output_path: Path where all pipeline-generated files live. :param qiita_job_id: Qiita Job ID creating this Pipeline. :param pipeline_type: Pipeline type ('Amplicon', 'Metagenomic', etc.) 
""" - if sample_sheet_path is not None and mapping_file_path is not None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") - - if sample_sheet_path is None and mapping_file_path is None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") + if input_file_path is None: + raise PipelineError("user_input_file_path cannot be None") if pipeline_type not in Pipeline.pipeline_types: raise PipelineError(f"'{type}' is not a valid pipeline type.") @@ -196,21 +190,33 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, self.qiita_job_id = qiita_job_id self.pipeline = [] - if sample_sheet_path: - self.search_paths = self.configuration['search_paths'] - self.sample_sheet = self._validate_sample_sheet(sample_sheet_path) - self.mapping_file = None - else: + # this method will catch a run directory as well as its products + # directory, which also has the same name. Hence, return the + # shortest matching path as that will at least return the right + # path between the two. + results = [] + + if pipeline_type == Pipeline.AMPLICON_PTYPE: self.search_paths = self.configuration['amplicon_search_paths'] - self.mapping_file = self._validate_mapping_file(mapping_file_path) - # unlike _validate_sample_sheet() which returns a SampleSheet - # object that stores the path to the file it was created from, - # _validate_mapping_file() just returns a DataFrame. Store the - # path to the original mapping file itself as well. - self.mapping_file_path = mapping_file_path - self.sample_sheet = None + else: + self.search_paths = self.configuration['search_paths'] - self.run_dir = self._search_for_run_dir() + for search_path in self.search_paths: + logging.debug(f'Searching {search_path} for {self.run_id}') + for entry in listdir(search_path): + some_path = join(search_path, entry) + # ensure some_path never ends in '/' + some_path = some_path.rstrip('/') + if isdir(some_path) and some_path.endswith(self.run_id): + logging.debug(f'Found {some_path}') + results.append(some_path) + + if results: + results.sort(key=lambda s: len(s)) + self.run_dir = results[0] + else: + raise PipelineError(f"A run-dir for '{self.run_id}' could not be " + "found") # required files for successful operation # both RTAComplete.txt and RunInfo.xml should reside in the root of @@ -228,14 +234,44 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, except PermissionError: raise PipelineError('RunInfo.xml is present, but not readable') - if self.mapping_file is not None: + self.input_file_path = input_file_path + + if pipeline_type == Pipeline.AMPLICON_PTYPE: + # assume input_file_path references a pre-prep (mapping) file. + + self.mapping_file = self._validate_mapping_file(input_file_path) + # unlike _validate_sample_sheet() which returns a SampleSheet + # object that stores the path to the file it was created from, + # _validate_mapping_file() just returns a DataFrame. Store the + # path to the original mapping file itself as well. + # create dummy sample-sheet output_fp = join(output_path, 'dummy_sample_sheet.csv') self.generate_dummy_sample_sheet(self.run_dir, output_fp) self.sample_sheet = output_fp + else: + # assume user_input_file_path references a sample-sheet. 
+ self.sample_sheet = self._validate_sample_sheet(input_file_path) + self.mapping_file = None self._configure_profile() + def get_software_configuration(self, software): + if software is None or software == "": + raise ValueError(f"'{software}' is not a valid value") + + key_order = ['profile', 'configuration', software] + + config = self.config_profile + + for key in key_order: + if key in config: + config = config[key] + else: + raise PipelineError(f"'{key}' is not defined in configuration") + + return config + def identify_reserved_words(self, words): ''' Returns a list of words that should not appear as column names in any @@ -254,7 +290,7 @@ def identify_reserved_words(self, words): # specifically how the proper set of prep-info file columns are # generated. For now the functionality will be defined here as this # area of metapool is currently in flux. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: reserved = PREP_MF_COLUMNS else: # results will be dependent on SheetType and SheetVersion of @@ -351,30 +387,6 @@ def _configure_profile(self): self.config_profile = selected_profile - def _search_for_run_dir(self): - # this method will catch a run directory as well as its products - # directory, which also has the same name. Hence, return the - # shortest matching path as that will at least return the right - # path between the two. - results = [] - - for search_path in self.search_paths: - logging.debug(f'Searching {search_path} for {self.run_id}') - for entry in listdir(search_path): - some_path = join(search_path, entry) - # ensure some_path never ends in '/' - some_path = some_path.rstrip('/') - if isdir(some_path) and some_path.endswith(self.run_id): - logging.debug(f'Found {some_path}') - results.append(some_path) - - if results: - results.sort(key=lambda s: len(s)) - return results[0] - - raise PipelineError(f"A run-dir for '{self.run_id}' could not be " - "found") - def _directory_check(self, directory_path, create=False): if exists(directory_path): logging.debug("directory '%s' exists." % directory_path) @@ -551,7 +563,7 @@ def generate_sample_info_files(self, addl_info=None): :param addl_info: A df of (sample-name, project-name) pairs. :return: A list of paths to sample-information-files. """ - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: # Generate a list of BLANKs for each project. df = self.mapping_file[['sample_name', 'project_name']] else: @@ -623,7 +635,7 @@ def get_sample_ids(self): # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: results = list(self.mapping_file.sample_name) else: results = [x.Sample_ID for x in self.sample_sheet.samples] @@ -638,7 +650,7 @@ def get_sample_names(self, project_name=None): ''' # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: return self._get_sample_names_from_mapping_file(project_name) else: return self._get_sample_names_from_sample_sheet(project_name) @@ -737,7 +749,7 @@ def get_project_info(self, short_names=False): # defined in both cases. 
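With the signature change, callers pass a single input_file_path for both assay types; the pipeline_type argument now decides whether that path is treated as a sample-sheet or as a pre-prep (mapping) file. A sketch of the two call patterns (paths, run id, and job id are placeholders):

    from sequence_processing_pipeline.Pipeline import Pipeline

    # metagenomic run: input_file_path points at a sample-sheet.
    meta_pipeline = Pipeline('configuration.json', '211021_A00000_0000_SAMPLE',
                             'good_sheet1.csv', '/tmp/output',
                             'some-qiita-job-id', Pipeline.METAGENOMIC_PTYPE)

    # amplicon run: the same argument carries the pre-prep (mapping) file.
    amp_pipeline = Pipeline('configuration.json', '211021_A00000_0000_SAMPLE',
                            'good_mapping_file.txt', '/tmp/output',
                            'some-qiita-job-id', Pipeline.AMPLICON_PTYPE)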
results = [] - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: if 'contains_replicates' in self.mapping_file: contains_replicates = True else: diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py index 54542fef..7a9e9a19 100644 --- a/sequence_processing_pipeline/TRConvertJob.py +++ b/sequence_processing_pipeline/TRConvertJob.py @@ -5,7 +5,35 @@ from os import rename, walk, chmod, listdir, makedirs from shutil import move, rmtree from re import match - +from metapool import load_sample_sheet + +""" +Note in tellread.sbatch, {{lane}} needs to be: + +if [[ ${LANE} == "L001" ]]; then + lane=s_1 +elif [[ ${LANE} == "L002" ]]; then + lane=s_2 +elif [[ ${LANE} == "L003" ]]; then + lane=s_3 +elif [[ ${LANE} == "L004" ]]; then + lane=s_4 +elif [[ ${LANE} == "L005" ]]; then + lane=s_5 +elif [[ ${LANE} == "L006" ]]; then + lane=s_6 +elif [[ ${LANE} == "L007" ]]; then + lane=s_7 +elif [[ ${LANE} == "L008" ]]; then + lane=s_8 +else + echo "Unrecognized lane: ${LANE}" + exit 1 +fi + +make sure compute_sequence_counts_for_normalization2.sbatch gets {{tellread_output}} as defined in $TELLREAD_OUTPUT in tellread.sh + +""" class TRConvertJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, @@ -146,6 +174,36 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self._generate_job_scripts() + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. + return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + def _generate_job_scripts(self): scripts = [ { @@ -417,6 +475,49 @@ def run(self, callback=None): # delete the original output directory. 
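The L001 -> s_1 translation called out in the tellread.sbatch note above is a fixed mapping over eight lanes; in Python it reduces to a small helper (the function name is illustrative, not part of this patch):

    def tellread_lane_token(lane_label):
        """Map an Illumina lane label such as 'L001' to TellRead's 's_1' form."""
        if len(lane_label) == 4 and lane_label.startswith('L'):
            lane_number = int(lane_label[1:])
            if 1 <= lane_number <= 8:
                return f"s_{lane_number}"
        raise ValueError(f"Unrecognized lane: {lane_label}")

    # e.g. tellread_lane_token('L003') == 's_3'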
rmtree(join(self.output_path, 'output')) + def run2(self, callback=None): + norm = True + assemble = True + + + + tr_job = self.submit_job('tr.script') + if tr_job['job_state'] != 'COMPLETED': + raise ValueError("TR JOB (%s) FAILED" % tr_job['job_id']) + + if norm is True: + ''' + cp ${norm_script} ${normcopy} + chmod gou-w ${normcopy} + ''' + nc_job = self.submit_job('norm_script') + if nc_job['job_state'] != 'COMPLETED': + raise ValueError("BC JOB (%s) FAILED" % nc_job['job_id']) + + int_job = self.submit_job('integrate.script') + if int_job['job_state'] != 'COMPLETED': + raise ValueError("INT JOB (%s) FAILED" % int_job['job_id']) + + if assemble is True: + # NB assemble jobs rely on successful integrate job + csj_job = self.submit_job('csj_script') + if csj_job['job_state'] != 'COMPLETED': + raise ValueError("CSJ JOB (%s) FAILED" % csj_job['job_id']) + + tlj_job = self.submit_job('tlj_script') + if tlj_job['job_state'] != 'COMPLETED': + raise ValueError("TLJ JOB (%s) FAILED" % tlj_job['job_id']) + + cleanup_job = self.submit_job('cleanup.script') + if cleanup_job['job_state'] != 'COMPLETED': + raise ValueError("CLEANUP JOB (%s) FAILED" % cleanup_job['job_id']) + + + + + + + def parse_logs(self): raise PipelineError("parsing logs not implemented.") diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index fff4b07d..9f8ea4f5 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -118,7 +118,7 @@ def test_validate_mapping_file_numeric_ids(self): with NamedTemporaryFile() as tmp: self._make_mapping_file(tmp.name) exp = ['1.0', '1e-3'] - pipeline = Pipeline(self.good_config_file, self.good_run_id, None, + pipeline = Pipeline(self.good_config_file, self.good_run_id, tmp.name, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -131,7 +131,7 @@ def test_validate_mapping_file_numeric_ids(self): def test_get_sample_names_from_sample_sheet(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.mp_sheet_path, None, + self.mp_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -156,7 +156,7 @@ def test_get_sample_names_from_sample_sheet(self): def test_get_orig_names_from_sheet_with_replicates(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -176,7 +176,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -188,7 +188,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -200,7 +200,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, bu" "t not readable"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) 
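run2() above repeats the same submit-and-check pattern for every step. One possible restructuring, sketched here using submit_job() exactly as this file already calls it, drives the chain from a list so the optional steps can be toggled without duplicated blocks (method name is illustrative):

    def run_chain(self, norm=True, assemble=True):
        # (step label, script name, run this step?) -- script names mirror run2().
        steps = [('TR', 'tr.script', True),
                 ('NORM', 'norm_script', norm),
                 ('INTEGRATE', 'integrate.script', True),
                 ('CSJ', 'csj_script', assemble),
                 ('TLJ', 'tlj_script', assemble),
                 ('CLEANUP', 'cleanup.script', True)]

        for label, script, enabled in steps:
            if not enabled:
                continue
            job = self.submit_job(script)
            if job['job_state'] != 'COMPLETED':
                raise ValueError("%s JOB (%s) FAILED" % (label, job['job_id']))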
self.make_runinfo_file_readable() @@ -210,7 +210,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -227,7 +227,7 @@ def test_creation(self): " valid sample-sheet."): Pipeline(self.good_config_file, self.good_run_id, - self.bad_assay_type_path, None, + self.bad_assay_type_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -235,7 +235,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -246,7 +246,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -257,7 +257,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -268,7 +268,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -278,7 +278,7 @@ def test_creation(self): "not a valid json file"): Pipeline(self.good_sample_sheet_path, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -301,7 +301,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -323,7 +323,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -346,7 +346,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -357,7 +357,7 @@ def test_sample_sheet_validation(self): # contained w/in its 'message' member. try: Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) except PipelineError as e: @@ -367,7 +367,7 @@ def test_sample_sheet_validation(self): # test unsuccessful validation of a bad sample-sheet with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - self.bad_sample_sheet_path, None, + self.bad_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) self.assertEqual(str(e.exception), ('Sample-sheet contains errors:\n' @@ -379,7 +379,6 @@ def test_generate_sample_information_files(self): # test sample-information-file generation. 
pipeline = Pipeline(self.good_config_file, self.good_run_id, self.good_sample_sheet_path, - None, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -493,7 +492,6 @@ def test_generate_sample_information_files_with_additional_meta(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, self.good_sample_sheet_path, - None, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1025,7 +1023,7 @@ def test_get_sample_ids(self): 'EP400448B04', 'EP479894B04'] # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1501,7 +1499,7 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1529,7 +1527,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1560,7 +1558,7 @@ def test_get_project_info(self): self.assertEqual(sorted(obs_project_names), sorted(exp_project_names)) pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1572,7 +1570,7 @@ def test_get_project_info(self): def test_configuration_profiles(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1601,7 +1599,7 @@ def test_configuration_profiles(self): def test_parse_project_name(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1633,7 +1631,7 @@ def test_parse_project_name(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1650,7 +1648,7 @@ def test_identify_reserved_words(self): # create new pipeline using a/legacy (v90) metagenomic sample-sheet. 
pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_legacy_sheet_path, None, + self.good_legacy_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1743,7 +1741,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1755,7 +1753,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1766,7 +1764,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, " "but not readable"): - Pipeline(self.good_config_file, self.good_run_id, None, + Pipeline(self.good_config_file, self.good_run_id, self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.make_runinfo_file_readable() @@ -1776,7 +1774,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1791,7 +1789,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1802,7 +1800,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1813,7 +1811,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1824,7 +1822,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1832,7 +1830,7 @@ def test_mapping_file_validation(self): # test successful validation of a good mapping-file. try: Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) except PipelineError as e: @@ -1842,7 +1840,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_missing_column, + self.mf_missing_column, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ('Mapping-file is missing ' @@ -1852,7 +1850,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. 
with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_duplicate_sample, + self.mf_duplicate_sample, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ("Mapping-file contains duplicate " @@ -1879,7 +1877,6 @@ def test_is_sample_sheet(self): def test_generate_sample_information_files(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2101,7 +2098,6 @@ def test_get_sample_ids(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2229,7 +2225,6 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2253,7 +2248,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -2270,33 +2265,12 @@ def test_get_project_info(self): self.assertDictEqual(obs_d, exp_d) break - def test_additional_constuctor_check(self): - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - None, None, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - self.sample_sheet_path, - self.good_mapping_file_path, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - def test_dummy_sheet_generation(self): # generate a RunInfo.xml file w/only one indexed read. self.create_runinfo_file(four_reads=False) _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2315,7 +2289,6 @@ def test_dummy_sheet_generation(self): _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2335,7 +2308,6 @@ def test_dummy_sheet_generation(self): def test_process_run_info_file(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2375,7 +2347,6 @@ def test_process_run_info_file(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, From 6cdc7ba986b0171ecd9b1ce52efc8be77e611eed Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 1 Oct 2024 14:44:34 -0700 Subject: [PATCH 15/47] Version 2.0 of TellSeq support. Version 2.0 of TellSeq support removes the master tellread.sh script and the drop-in replacement TRConvertJob.py for Job()s that wrap individual steps in the original script. These steps can be used in whole or in part in varying order in the refactored SPP plugin (qp-klp). 
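Since the per-step Job classes replace the single master script, the calling plugin composes them explicitly. A rough sketch of one such ordering follows; the constructor arguments are abbreviated placeholders, not the real qp-klp call sites:

    from sequence_processing_pipeline.TellReadJob import TellReadJob
    from sequence_processing_pipeline.TRNormCountsJob import TRNormCountsJob
    from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob

    # placeholder values; the real ones come from the pipeline configuration.
    common = dict(run_dir='/sequencing/my_run', output_path='/tmp/working_dir',
                  sample_sheet_path='sheet.csv', queue_name='qiita',
                  node_count=1, wall_time_limit=1440, jmem=8,
                  modules_to_load=[], qiita_job_id='job-id',
                  max_array_length=1000,
                  indicies_script_path='integrate-indices-np.py',
                  label='my-run', reference_base=None, reference_map=None)

    tellread = TellReadJob(tmp1_path='/tmp/working_dir/tmp1',
                           sing_script_path='run_tellread_sing.sh',
                           lane=1, **common)
    tellread.run()

    # counts-based normalization is optional; integration is not.
    TRNormCountsJob(**common).run()
    TRIntegrateJob(**common).run()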
--- sequence_processing_pipeline/TRConvertJob.py | 582 ------------------ .../TRIntegrateJob.py | 139 +++++ .../TRNormCountsJob.py | 142 +++++ sequence_processing_pipeline/TellReadJob.py | 181 ++++++ .../templates/cloudspades-isolate.sbatch | 25 +- .../templates/cloudspades.sbatch | 25 +- ...e_sequence_counts_for_normalization.sbatch | 33 +- .../templates/integrate.sbatch | 41 +- .../templates/telllink-isolate.sbatch | 30 +- .../templates/telllink.sbatch | 30 +- .../templates/tellread-cleanup.sbatch | 7 +- .../templates/tellread.sbatch | 83 +-- .../templates/tellread.sh | 262 -------- 13 files changed, 510 insertions(+), 1070 deletions(-) delete mode 100644 sequence_processing_pipeline/TRConvertJob.py create mode 100644 sequence_processing_pipeline/TRIntegrateJob.py create mode 100644 sequence_processing_pipeline/TRNormCountsJob.py create mode 100644 sequence_processing_pipeline/TellReadJob.py delete mode 100755 sequence_processing_pipeline/templates/tellread.sh diff --git a/sequence_processing_pipeline/TRConvertJob.py b/sequence_processing_pipeline/TRConvertJob.py deleted file mode 100644 index 7a9e9a19..00000000 --- a/sequence_processing_pipeline/TRConvertJob.py +++ /dev/null @@ -1,582 +0,0 @@ -from jinja2 import Environment -from os.path import split, join, exists -from sequence_processing_pipeline.Job import Job, KISSLoader -from sequence_processing_pipeline.PipelineError import PipelineError -from os import rename, walk, chmod, listdir, makedirs -from shutil import move, rmtree -from re import match -from metapool import load_sample_sheet - -""" -Note in tellread.sbatch, {{lane}} needs to be: - -if [[ ${LANE} == "L001" ]]; then - lane=s_1 -elif [[ ${LANE} == "L002" ]]; then - lane=s_2 -elif [[ ${LANE} == "L003" ]]; then - lane=s_3 -elif [[ ${LANE} == "L004" ]]; then - lane=s_4 -elif [[ ${LANE} == "L005" ]]; then - lane=s_5 -elif [[ ${LANE} == "L006" ]]; then - lane=s_6 -elif [[ ${LANE} == "L007" ]]; then - lane=s_7 -elif [[ ${LANE} == "L008" ]]; then - lane=s_8 -else - echo "Unrecognized lane: ${LANE}" - exit 1 -fi - -make sure compute_sequence_counts_for_normalization2.sbatch gets {{tellread_output}} as defined in $TELLREAD_OUTPUT in tellread.sh - -""" - -class TRConvertJob(Job): - def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, - node_count, nprocs, wall_time_limit, pmem, bcl_tool_path, - modules_to_load, qiita_job_id): - """ - TRConvertJob provides a convenient way to run bcl-convert or bcl2fastq - on a directory BCL files to generate Fastq files. - :param run_dir: The 'run' directory that contains BCL files. - :param output_path: Path where all pipeline-generated files live. - :param sample_sheet_path: The path to a sample-sheet. - :param queue_name: The name of the Torque queue to use for processing. - :param node_count: The number of nodes to request. - :param nprocs: The maximum number of parallel processes to use. - :param wall_time_limit: A hard time limit (in min) to bound processing. - :param bcl_tool_path: The path to either bcl2fastq or bcl-convert. - :param modules_to_load: A list of Linux module names to load - :param qiita_job_id: identify Torque jobs using qiita_job_id - """ - super().__init__(run_dir, - output_path, - 'ConvertJob', - [bcl_tool_path], - 1000, - modules_to_load=modules_to_load) - - # for metagenomics pipelines, sample_sheet_path will reflect a real - # sample_sheet file. For amplicon pipelines, sample_sheet_path will - # reference a dummy sample_sheet file. 
- self.sample_sheet_path = sample_sheet_path - self.queue_name = queue_name - self.node_count = node_count - self.nprocs = nprocs - self.wall_time_limit = wall_time_limit - self.pmem = pmem - self.bcl_tool = bcl_tool_path - self.qiita_job_id = qiita_job_id - self.suffix = 'fastq.gz' - - self.tellread_output_path = join(self.output_path, 'output') - makedirs(self.tellread_output_path) - - self.tmp1_path = join(self.tellread_output_path, 'tmp1') - - makedirs(self.tmp1_path) - - # for projects that use sequence_processing_pipeline as a dependency, - # jinja_env must be set to sequence_processing_pipeline's root path, - # rather than the project's root path. - self.jinja_env = Environment(loader=KISSLoader('templates'), - # set Jinja2 comment strings to be - # anything other than '{#' and '#}', - # which can be used in shell scripts. - comment_start_string='%%%%%%%%%%', - comment_end_string='%%%%%%%%%%') - - tmp = False - for executable_name in ['bcl2fastq', 'bcl-convert']: - if executable_name in self.bcl_tool: - tmp = True - break - - if not tmp: - raise PipelineError(f'{self.bcl_tool} is not the path to a known' - 'executable') - - self._file_check(self.sample_sheet_path) - - # As the sample-sheet is validated by the Pipeline object before - # being passed to TRConvertJob, additional validation isn't needed. - - # TODO: generate a sample-mapping to map C#s to fake sample-names and - # fake projects. Process sample-sheet later. - self.mapping = self._generate_sample_mapping() - - # TODO: hardcode lane at 'L001' - self.lane = 'L001' - - self.clean_wall_time_limit = "24:00:00" - self.clean_mem_in_gb = "8" - self.clean_node_count = "1" - self.clean_cores_per_task = "1" - self.cloudspades_cores_per_task = "12" - self.cloudspades_mem_in_gb = "128" - self.cloudspades_modules = ["gcc_9.3.0"] - self.cloudspades_node_count = "1" - self.cloudspades_path = ("/home/qiita_test/qiita-spots/spades-" - "cloudspades-0.1") - self.cloudspades_wall_time_limit = "24:00:00" - self.counts_cores_per_task = "1" - self.counts_create_picklist_path = ("/home/qiita_test/qiita-spots/" - "create_picklist.py") - self.counts_mem_in_gb = "8" - self.counts_node_count = "1" - self.counts_other_file = ('20230906_FS10001773_68_BTR67708-1611.' 
- 'read_counts.tsv') - self.counts_plot_counts_path = ("/home/qiita_test/qiita-spots/'" - "'plot_counts.py") - self.counts_sample_sheet = ("/home/qiita_test/qiita-spots/" - "20230906_FS10001773_68_BTR67708-1611.csv") - self.counts_wall_time_limit = "24:00:00" - self.cs_isolate_mem_in_gb = "64" - self.integrate_indicies_script_path = ("/home/qiita_test/qiita-spots/" - "integrate-indices-np.py") - self.integrate_mem_in_gb = "8" - self.integrate_node_count = "1" - self.integrate_wall_time_limit = "24:00:00" - self.integrate_cores_per_task = "1" - self.queue_name = "qiita" - self.tellink_cores_per_task = "16" - self.tellink_mem_in_gb = "160" - self.tellink_modules = ["singularity_3.6.4"] - self.tellink_node_count = "1" - self.tellink_sing_path = ("/projects/long_read_collab/code/tellseq/" - "release_v1.11/tellink-release/" - "run_tellink_sing.sh") - self.tellink_wall_time_limit = "96:00:00" - self.tellread_cores_per_task = "4" - self.tellread_mem_in_gb = "16" - self.tellread_modules = ["singularity_3.6.4"] - self.tellread_node_count = "1" - self.tellread_sing_script_path = ("$HOME/qiita-spots/tellread-release" - "-novaseqX/run_tellread_sing.sh") - self.tellread_wall_time_limit = "96:00:00" - self.tl_cores_per_task = "16" - self.tl_isolate_node_count = "1" - self.tl_isolate_wall_time_limit = "96:00:00" - self.tl_mem_in_gb = "160" - self.main_map = ("/home/qiita_test/qiita-spots/20230906_FS10001773_" - "68_BTR67708-1611.csv") - self.main_mode = "metagenomic" - self.main_seqrun_path = ("/sequencing/seqmount/KL_iSeq_Runs/20230906" - "_FS10001773_68_BTR67708-1611") - - # TODO: Address reference_map and reference_base - self.main_reference_base = "" - self.main_reference_map = "" - - self._generate_job_scripts() - - def _process_sample_sheet(self): - sheet = load_sample_sheet(self.sample_sheet_path) - - if not sheet.validate_and_scrub_sample_sheet(): - s = "Sample sheet %s is not valid." % self.sample_sheet_path - raise PipelineError(s) - - header = sheet.Header - chemistry = header['chemistry'] - - if header['Assay'] not in Pipeline.assay_types: - s = "Assay value '%s' is not recognized." % header['Assay'] - raise PipelineError(s) - - sample_ids = [] - for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) - - bioinformatics = sheet.Bioinformatics - - # reorganize the data into a list of dictionaries, one for each row. - # the ordering of the rows will be preserved in the order of the list. - lst = bioinformatics.to_dict('records') - - # human-filtering jobs are scoped by project. Each job requires - # particular knowledge of the project. 
- return {'chemistry': chemistry, - 'projects': lst, - 'sample_ids': sample_ids} - - def _generate_job_scripts(self): - scripts = [ - { - "template": "cloudspades.sbatch", - "params": { - "job_name": "cs-assemble", - "wall_time_limit": self.wall_time_limit, - "mem_in_gb": self.cloudspades_mem_in_gb, - "node_count": self.cloudspades_node_count, - "cores_per_task": self.cloudspades_cores_per_task, - "queue_name": self.queue_name, - "modules_to_load": ' '.join(self.cloudspades_modules), - "cloudspades_path": self.cloudspades_path - } - }, - { - "template": "cloudspades-isolate.sbatch", - "params": { - "job_name": "cs-assemble", - "wall_time_limit": self.cloudspades_wall_time_limit, - "mem_in_gb": self.cs_isolate_mem_in_gb, - "node_count": self.cloudspades_node_count, - "cores_per_task": self.cloudspades_cores_per_task, - "queue_name": self.queue_name, - "modules_to_load": ' '.join(self.cloudspades_modules), - "cloudspades_path": self.cloudspades_path - } - }, - { - "template": "integrate.sbatch", - "params": { - "job_name": "integrate", - "wall_time_limit": self.integrate_wall_time_limit, - "mem_in_gb": self.integrate_mem_in_gb, - "node_count": self.integrate_node_count, - "cores_per_task": self.integrate_cores_per_task, - "iinp_script_path": self.integrate_indicies_script_path, - "queue_name": self.queue_name - } - }, - { - "template": "compute_sequence_counts_for_normalization.sbatch", - "params": { - "job_name": "norm", - "wall_time_limit": self.counts_wall_time_limit, - "mem_in_gb": self.counts_mem_in_gb, - "node_count": self.counts_node_count, - "cores_per_task": self.counts_cores_per_task, - "sample_sheet": self.counts_sample_sheet, - "plot_counts_path": self.counts_plot_counts_path, - "output_path": self.tellread_output_path, - "create_picklist_path": self.counts_create_picklist_path, - "read_counts_path": join(self.tellread_output_path, - self.counts_other_file), - "queue_name": self.queue_name - } - }, - { - "template": "telllink.sbatch", - "params": { - "job_name": "tellink", - "wall_time_limit": self.tellink_wall_time_limit, - "mem_in_gb": self.tellink_mem_in_gb, - "node_count": self.tellink_node_count, - "cores_per_task": self.tellink_cores_per_task, - "queue_name": self.queue_name, - "modules_to_load": ' '.join(self.tellink_modules), - "output_path": self.tellread_output_path, - "sing_path": self.tellink_sing_path - } - }, - { - "template": "telllink-isolate.sbatch", - "params": { - "job_name": "tellink-isolate", - "wall_time_limit": self.tellink_wall_time_limit, - "node_count": self.tl_isolate_node_count, - "cores_per_task": self.tl_cores_per_task, - "mem_in_gb": self.tl_mem_in_gb, - "queue_name": self.queue_name, - "modules_to_load": ' '.join(self.tellink_modules), - "output_path": self.tellread_output_path, - "sing_path": self.tellink_sing_path - } - }, - { - "template": "tellread.sbatch", - "params": { - "job_name": "tellread", - "wall_time_limit": self.tellread_wall_time_limit, - "mem_in_gb": self.tellread_mem_in_gb, - "node_count": self.tellread_node_count, - "tmp_dir": self.tmp1_path, - "cores_per_task": self.tellread_cores_per_task, - "queue_name": self.queue_name, - "sing_script_path": self.tellread_sing_script_path, - "modules_to_load": ' '.join(self.tellread_modules) - } - }, - { - "template": "tellread-cleanup.sbatch", - "params": { - "job_name": "cleanup", - "wall_time_limit": self.clean_wall_time_limit, - "mem_in_gb": self.clean_mem_in_gb, - "node_count": self.clean_node_count, - "cores_per_task": self.clean_cores_per_task, - "queue_name": self.queue_name - } - }, - 
# these hardcoded paths for tellread.sh need to be replaced with - # the lane number and run-directory path, and the lane and the - # mode from the user input. Note that we also need to process the - # upcoming sample-sheet in order to generate the mapping we need - # as well. - { - "template": "tellread.sh", - "params": { - "tellread_map": self.main_map, - "seqrun_path": self.main_seqrun_path, - "output_path": self.tellread_output_path, - "lane": self.lane, - "reference_map": self.main_reference_map, - "reference_base": self.main_reference_base, - "mode": self.main_mode - } - } - ] - - for script in scripts: - template = self.jinja_env.get_template(script["template"]) - params = script["params"] - job_script_path = join(self.output_path, script["template"]) - - with open(job_script_path, 'w') as f: - f.write(template.render(**params)) - # TODO: Change from 777 to something more appropriate. - chmod(job_script_path, 0o777) - - def run(self, callback=None): - """ - Run BCL2Fastq/BCLConvert conversion - :param callback: optional function taking two parameters (id, status) - that is called when a running process's status is - changed. - :return: - """ - - # Unlike other Jobs that submit a Slurm script and wait for the job - # to complete, this Job() will execute an existing shell script that - # spawns all the jobs that perform the actual work. - - # tellread.sh performs some work that requires it to run on a compute - # node. Since Job()s run on the interactive node, an interactive - # shell on a compute node must be requested for this script to run on. - - # define 'sjob' here for clarity. This should be more than adequate - # resources to run the tellread.sh script and exit as it does not wait - # on its children to complete. - - # as with the original scripts, the scripts generated by Jinja2 will - # live in the current working directory. Hence, the script will always - # exist at ./tellread.sh provided it was created successfully. - sjob = "srun -N 1 -n 1 -p qiita --mem 4g --time 1:00:00 --pty bash -l" - command = (f"{sjob}; pushd .;cd {self.output_path}; ./tellread.sh; " - "popd; exit") - - if not exists(join(self.output_path, 'tellread.sh')): - raise PipelineError("tellread.sh script could not be found.") - - res = self._system_call(command) - - if res['return_code'] != 0: - raise PipelineError("tellread.sh script did not execute correctly") - - # once _system_call() returns and tellread.sh executed correctly, then - # a pids file should exist in the output subdirectory. - pids_fp = join(self.output_path, 'output', 'pids') - if not exists(pids_fp): - raise PipelineError("TRConvertJob could not locate a pids file") - - with open(pids_fp, 'r') as f: - lines = f.readlines() - lines = [x.strip().split(': ') for x in lines] - results = {k: v for (k, v) in lines} - - child_processes = [('main tellread', 'TRJOB_RETURN_CODE', - 'TRJOB_PID', True), - ('counts', 'NORM_COUNTS_JOB_RETURN_CODE', - 'NORM_COUNTS_JOB_PID', False), - ('integrate', 'INTEGRATE_JOB_RETURN_CODE', - 'INTEGRATE_JOB_PID', True), - ('csj', 'CSJ_JOB_RETURN_CODE', - 'CSJ_JOB_PID', False), - ('tlj', 'TLJ_JOB_RETURN_CODE', - 'TLJ_JOB_PID', False), - ('cleanup', 'CLEANUP_JOB_RETURN_CODE', - 'CLEANUP_JOB_PID', True)] - - # Iterate through all the TellRead script's known child processes. - # Some children will be optional depending on the parameters given, - # while others are required. The Job() should immediately raise an - # error if any child (optional or not) exits unsuccessfully, however. 
- for name, code, _, is_required in child_processes: - if code in results: - if results[code] != '0': - raise PipelineError(f"An error ({results[code]}) occurred " - f"running {name} subprocess") - else: - if is_required: - raise PipelineError(f"The {name} subprocess did not " - "execute correctly") - - # Get a list of Slurm job ids that we need to wait on and text - # descriptions of what they are. - jids = [(results[x[2]], x[0]) for x in child_processes if - x[2] in results] - - # ensure the jids are casted to integers before passing them. - statuses = self.wait_on_job_ids([int(x[0]) for x in jids]) - - for jid, description in jids: - status = statuses[jid] - if status not in Job.slurm_status_successful: - raise PipelineError(f"process '{description}' ({jid}) " - f"failed ({status})") - - # post-process working directory to make it appear like results - # generated by ConvertJob - - integrated_files_path = join(self.output_path, 'output', "integrated") - - if not exists(integrated_files_path): - raise ValueError(f"{integrated_files_path} does not exist") - - # move integrated directory to TRConvertJob directory, co-level with - # output directory. This makes it easier to delete the rest of the - # output that we don't need. - - # move err and out logs into logs subdirectory. - for root, dirs, files in walk(self.output_path): - for _file in files: - _path = join(root, _file) - if _path.endswith('.err'): - move(_path, join(self.output_path, 'logs')) - elif _path.endswith('.out'): - move(_path, join(self.output_path, 'logs')) - # don't go below one level. - break - - # save two logs and move them into standard Job logs directory. - move(join(self.output_path, 'output', 'log'), - join(self.output_path, 'logs')) - move(join(self.output_path, 'output', 'output.log'), - join(self.output_path, 'logs')) - - # rename the files and move them into project directories. - for root, dirs, files in walk(integrated_files_path): - for _file in files: - fastq_file = join(root, _file) - self._post_process_file(fastq_file, self.mapping) - - # move project folders from integrated directory to working_dir. - contents = listdir(integrated_files_path) - for name in contents: - move(join(integrated_files_path, name), - self.output_path) - - # delete the original output directory. 
- rmtree(join(self.output_path, 'output')) - - def run2(self, callback=None): - norm = True - assemble = True - - - - tr_job = self.submit_job('tr.script') - if tr_job['job_state'] != 'COMPLETED': - raise ValueError("TR JOB (%s) FAILED" % tr_job['job_id']) - - if norm is True: - ''' - cp ${norm_script} ${normcopy} - chmod gou-w ${normcopy} - ''' - nc_job = self.submit_job('norm_script') - if nc_job['job_state'] != 'COMPLETED': - raise ValueError("BC JOB (%s) FAILED" % nc_job['job_id']) - - int_job = self.submit_job('integrate.script') - if int_job['job_state'] != 'COMPLETED': - raise ValueError("INT JOB (%s) FAILED" % int_job['job_id']) - - if assemble is True: - # NB assemble jobs rely on successful integrate job - csj_job = self.submit_job('csj_script') - if csj_job['job_state'] != 'COMPLETED': - raise ValueError("CSJ JOB (%s) FAILED" % csj_job['job_id']) - - tlj_job = self.submit_job('tlj_script') - if tlj_job['job_state'] != 'COMPLETED': - raise ValueError("TLJ JOB (%s) FAILED" % tlj_job['job_id']) - - cleanup_job = self.submit_job('cleanup.script') - if cleanup_job['job_state'] != 'COMPLETED': - raise ValueError("CLEANUP JOB (%s) FAILED" % cleanup_job['job_id']) - - - - - - - - def parse_logs(self): - raise PipelineError("parsing logs not implemented.") - - @staticmethod - def parse_job_script(job_script_path): - raise PipelineError("parsing job script not implemented.") - - def _post_process_file(self, fastq_file, mapping): - # generate names of the form generated by bcl-convert/bcl2fastq: - # _S#_L00#__001.fastq.gz - # see: - # https://help.basespace.illumina.com/files-used-by-basespace/ - # fastq-files - _dir, _file = split(fastq_file) - - # ex: integrated/C544.R2.fastq.gz - m = match(r"(C5\d\d)\.([R,I]\d)\.fastq.gz", _file) - - if m is None: - raise ValueError(f"The filename '{_file}' is not of a " - "recognizable form") - - adapter_id = m[1] - read_type = m[2] - - if adapter_id not in mapping: - raise ValueError(f"{adapter_id} is not present in mapping") - - sample_name, sample_index, project_name = mapping[adapter_id] - - # generate the new filename for the fastq file, and reorganize the - # files by project. - new_name = "%s_S%d_%s_%s_001.fastq.gz" % (sample_name, - sample_index, - self.lane, - read_type) - - # ensure that the project directory exists before we rename and move - # the file to that location. - makedirs(join(_dir, project_name), exist_ok=True) - - # if there's an error renaming and moving the file, let it pass up to - # the user. - final_path = join(_dir, project_name, new_name) - rename(fastq_file, final_path) - return final_path - - def _generate_sample_mapping(self): - # this generates a sample mapping for the C501-C596 adapters used by - # the vendor to a sample-name and project. In production use this - # mapping would need to be created from the future sample-sheet. 
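The removed _post_process_file() above encodes the bcl2fastq-style naming convention (<sample>_S<index>_<lane>_<read>_001.fastq.gz). A standalone sketch of that renaming step, matching the same C5xx adapter pattern; the function name and returned project-relative path are illustrative:

    from os.path import basename, join
    from re import match

    def bcl_style_name(tellread_fastq, mapping, lane='L001'):
        """Translate e.g. 'C544.R2.fastq.gz' to 'MySample44_S44_L001_R2_001.fastq.gz'."""
        m = match(r"(C5\d\d)\.([RI]\d)\.fastq.gz", basename(tellread_fastq))
        if m is None:
            raise ValueError(f"unrecognized filename: {tellread_fastq}")
        sample_name, sample_index, project_name = mapping[m[1]]
        new_name = "%s_S%d_%s_%s_001.fastq.gz" % (sample_name, sample_index,
                                                  lane, m[2])
        # files are grouped into per-project subdirectories.
        return join(project_name, new_name)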
- project_names = ['Project1', 'Project2', 'Project3'] - sample_mapping = {} - - for sample_index in range(1, 97): - adapter_id = "C%s" % str(sample_index + 500) - sample_name = "MySample%d" % sample_index - project_name = project_names[sample_index % 3] - sample_mapping[adapter_id] = (sample_name, sample_index, - project_name) - - return sample_mapping diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py new file mode 100644 index 00000000..076a15fe --- /dev/null +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -0,0 +1,139 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class TRIntegrateJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, indicies_script_path, label, + reference_base, reference_map, cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: None + :param indicies_script_path: None + :param label: None + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.indicies_script_path = indicies_script_path + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. 
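    # NB (sketch, not shipped behavior): the str(int(jmem)) conversion below
    # raises on values such as "4.5" even though the preceding comment allows
    # a floating point value; a float-tolerant variant would be
    # str(int(float(jmem))).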
+ self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.label = label + + if self.reference_base != None or self.reference_map != None: + tag = 'reference-based' + else: + tag = 'reference-free' + + self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. 
+ return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'integrate.sbatch') + template = self.jinja_env.get_template("integrate2.sbatch") + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "integrate", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "iinp_script_path": self.indicies_script_path, + "queue_name": self.queue_name, + "output_dir": self.output_path})) + + return job_script_path diff --git a/sequence_processing_pipeline/TRNormCountsJob.py b/sequence_processing_pipeline/TRNormCountsJob.py new file mode 100644 index 00000000..09e36a67 --- /dev/null +++ b/sequence_processing_pipeline/TRNormCountsJob.py @@ -0,0 +1,142 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class TRNormCountsJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, indicies_script_path, label, + reference_base, reference_map, cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: None + :param indicies_script_path: None + :param label: None + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.indicies_script_path = indicies_script_path + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. 
+ self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.label = label + + if self.reference_base != None or self.reference_map != None: + tag = 'reference-based' + else: + tag = 'reference-free' + + self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. 
+ return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'compute_sequence_counts_for_normalization.sbatch') + template = self.jinja_env.get_template("compute_sequence_counts_for_normalization2.sbatch") + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "#job_name": "integrate", + "#wall_time_limit": self.wall_time_limit, + "#mem_in_gb": self.jmem, + "#node_count": self.node_count, + "#cores_per_task": self.cores_per_task, + "#queue_name": self.queue_name, + "#output_path": self.output_path, + "read_counts_path": "TODO", + "sample_sheet": "TODO", + "tellread_output": "TODO" + })) + + return job_script_path diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py new file mode 100644 index 00000000..859974a4 --- /dev/null +++ b/sequence_processing_pipeline/TellReadJob.py @@ -0,0 +1,181 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class TellReadJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, indicies_script_path, label, + reference_base, reference_map, tmp1_path, sing_script_path, + lane, cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: None + :param indicies_script_path: None + :param label: None + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.indicies_script_path = indicies_script_path + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.label = label + self.sing_script_path = sing_script_path + self.tmp1_path = tmp1_path + + # force self.lane_number to be int. raise an Error if it's not. 
+ tmp = int(lane) + if tmp < 1 or tmp > 8: + raise ValueError(f"'{tmp}' is not a valid lane number") + self.lane_number = tmp + + if self.reference_base != None or self.reference_map != None: + tag = 'reference-based' + else: + tag = 'reference-free' + + self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + '-c ${sbatch_cores}', + '--mem ${sbatch_mem}', + '--time ${wall}'] + + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TellReadJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TellReadJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. + return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'integrate.sbatch') + template = self.jinja_env.get_template("tellread2.sbatch") + + # generate a comma separated list of sample-ids from the tuples stored + # in self.sample_ids. + + # NB: the current sample-sheet format used for TellRead doesn't include + # sample-names and sample-ids, only sample_id. e.g. C501,C502,etc. + # Hence, when a final sample sheet format is ready, it may be prudent + # to switch this to pull values from the expected sample-names column + # instead. + samples = ','.join([id[0] for id in self.sample_ids]) + + # since we haven't included support for reference_map yet, whenever a + # reference is not included, the mapping against the list of sample_ids + # is ['NONE', 'NONE', ..., 'NONE']. + refs = ','.join(['NONE' for _ in self.sample_ids]) + + extra = "" + + # if reference_base is added in the future and is defined, exta needs + # to be f"-f {reference_base}". 
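    # sketch only: when reference_base support is added, the flag could be
    # built here instead of being left commented out, e.g.:
    #     extra = f"-f {self.reference_base}" if self.reference_base else ""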
+ # extra = "-f ${REFBASE}" + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "tellread", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "sing_script_path": self.sing_script_path, + "tmp_dir": self.tmp1_path, + "modules_to_load": ' '.join(self.modules_to_load), + "lane": f"s_{self.lane_number}", + # TODO: make sure this is the 'ConvertJob/output' directory + "output": self.output_path, + "rundir_path": self.root_dir, + "samples": samples, + "refs": refs, + "extra": extra + })) + + return job_script_path diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 261c11c7..1ac51b2e 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -10,35 +10,18 @@ #SBATCH --error cloudspades-isolate_%x-%A_%a.err source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} set -x set -e -echo $TMPDIR - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=${OUTPUT} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - module load {{modules_to_load}} -samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) # assumes 1-based array index, eg --array 1-N sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -cs=${base}/cloudspades-isolate/${sample} +cs={{output_path}}/cloudspades-isolate/${sample} if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then if [[ -d ${cs} ]]; then @@ -51,8 +34,8 @@ pushd {{cloudspades_path}}/assembler/bin ./spades.py \ -o ${cs} \ - --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + --gemcode1-1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 module unload gcc_9.3.0 popd diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index 636dd5ce..72efb140 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -10,35 +10,18 @@ #SBATCH --error cloudspades_%x-%A_%a.err source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} set -x set -e -echo $TMPDIR - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=${OUTPUT} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - module load {{modules_to_load}} -samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) # assumes 1-based array index, eg --array 1-N sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -cs=${base}/cloudspades/${sample} +cs={{output_path}}/cloudspades/${sample} if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then if [[ -d ${cs} ]]; then @@ -51,8 +34,8 @@ pushd {{cloudspades_path}}/assembler/bin ./spades.py \ -o ${cs} \ - --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ + --gemcode1-1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ + --gemcode1-2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ --meta \ -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 module unload gcc_9.3.0 diff --git a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch index a4b31114..ab8af109 100644 --- a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch +++ b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch @@ -12,45 +12,14 @@ # NB: output appears normal w/out. # source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - set -x set -e set -o pipefail echo $TMPDIR -tellread=${TELLREAD_OUTPUT} -if [[ ! -d ${tellread} ]]; then - echo "${tellread} not found" - exit 1 -fi - -if [[ ! -d ${tellread}/Full ]]; then - echo "${tellread}/Full not found" - exit 1 -fi - -if [[ -z {{output_path}} ]]; then - echo "OUTPUT not specified" - exit 1 -fi - -if [[ -z {{sample_sheet}} ]]; then - echo "SAMPLESHEET not specified" - exit 1 -fi - -if [[ ! -f {{sample_sheet}} ]]; then - echo "SAMPLESHEET not found" - exit 1 -fi - mkdir -p {{output_path}} -wc -l ${tellread}/Full/*_I1_C5[0-9][0-9].fastq.gz.corrected.err_barcode_removed.fastq > {{output_path}}/record_counts.txt +wc -l {{tellread_output}}/Full/*_I1_C5[0-9][0-9].fastq.gz.corrected.err_barcode_removed.fastq > {{output_path}}/record_counts.txt python {{plot_counts_path}} {{output_path}}/record_counts.txt {{sample_sheet}} {{output_path}} conda activate qp-knight-lab-processing-2022.03 diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 30a3a9ba..8c767382 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -9,47 +9,26 @@ #SBATCH --output integrate_%x-%A_%a.out #SBATCH --error integrate_%x-%A_%a.err -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -# https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html -cores=${SLURM_CPUS_PER_TASK} - +# NB SLURM_ARRAY_TASK_ID is exported by Slurm if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then echo "Not operating in an array" exit 1 fi +# NB SLURM_ARRAY_TASK_MIN is exported by Slurm if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then echo "Line extraction assumes 1-based index" exit 1 fi -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -if [[ -z ${BASE} ]]; then - echo "BASE not specified" - exit 1 -fi - -tellread=${OUTPUT} -if [[ ! -d ${tellread} ]]; then - echo "${tellread} not found" - exit 1 -fi - set -x set -e set -o pipefail -samples=($(cat ${tellread}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_dir}}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} +# NB TMPDIR IS CREATED IN CURRENT DIRECTORY. CURRENT DIRECTORY MUST BE CORRECT. 
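+# The EXIT trap below is what keeps failed array tasks from leaving scratch
+# data behind: with `set -e` in effect the script can abort at any point, and
+# the trap still runs cleanup() to remove whatever mktemp -d allocated.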
export TMPDIR=$(mktemp -d) function cleanup { echo "Removing $TMPDIR" @@ -59,8 +38,8 @@ function cleanup { trap cleanup EXIT files=${TMPDIR}/integration.files -/bin/ls -1 ${tellread}/Full/*corrected.err_barcode_removed.fastq > ${files} -mkdir -p ${tellread}/integrated +/bin/ls -1 {{output_dir}}/Full/*corrected.err_barcode_removed.fastq > ${files} +mkdir -p {{output_dir}}/integrated if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then echo "Multiple matches for ${sample} R1" @@ -80,9 +59,9 @@ fi r1=$(grep -m 1 "_R1_${sample}" ${files}) r2=$(grep -m 1 "_R2_${sample}" ${files}) i1=$(grep -m 1 "_I1_${sample}" ${files}) -r1out=${tellread}/integrated/${sample}.R1.fastq.gz -r2out=${tellread}/integrated/${sample}.R2.fastq.gz -i1out=${tellread}/integrated/${sample}.I1.fastq.gz +r1out={{output_dir}}/integrated/${sample}.R1.fastq.gz +r2out={{output_dir}}/integrated/${sample}.R2.fastq.gz +i1out={{output_dir}}/integrated/${sample}.I1.fastq.gz if [[ ! -s ${r1} ]]; then echo "${r1} is empty, cannot integrate" @@ -114,4 +93,4 @@ python {{iinp_script_path}} integrate \ --i1-in ${i1} \ --r1-out ${r1out} \ --r2-out ${r2out} \ - --threads ${cores} + --threads ${SLURM_CPUS_PER_TASK} diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index b8f9d735..90e04012 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -14,25 +14,13 @@ set -e module load {{modules_to_load}} -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base={{output_path}} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} k=79 lc=35 -cores=${SLURM_CPUS_PER_TASK} -tl=${base}/tell-link-isolate/${sample} +tl={{output_path}}/tell-link-isolate/${sample} if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then if [[ -d ${tl} ]]; then rm -fr ${tl} @@ -42,16 +30,16 @@ fi mkdir -p ${tl} {{sing_path}} \ - -r1 ${base}/integrated/${sample}.R1.fastq.gz \ - -r2 ${base}/integrated/${sample}.R2.fastq.gz \ - -i1 ${base}/integrated/${sample}.I1.fastq.gz \ - -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -r1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ + -r2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ + -i1 {{output_path}}}/integrated/${sample}.I1.fastq.gz \ + -o ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc} \ -k ${k} \ -lc ${lc} \ -p ${sample} \ - -j ${cores} + -j ${SLURM_CPUS_PER_TASK} # remove temporary data -if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +if [[ -d ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping fi diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index 234192b2..efdf0578 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -14,26 +14,14 @@ set -e module load {{modules_to_load}} -if [[ -z "${LABELTAG}" ]]; then - echo "LABEL is not specified" - exit 1 -fi - -base={{output_path}} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -samples=($(cat ${base}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} # TODO: leave these hardcoded for now k=79 lc=35 -cores=${SLURM_CPUS_PER_TASK} -tl=${base}/tell-link/${sample} +tl={{output_path}}/tell-link/${sample} if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then if [[ -d ${tl} ]]; then rm -fr ${tl} @@ -43,17 +31,17 @@ fi mkdir -p ${tl} {{sing_path}} \ - -r1 ${base}/integrated/${sample}.R1.fastq.gz \ - -r2 ${base}/integrated/${sample}.R2.fastq.gz \ - -i1 ${base}/integrated/${sample}.I1.fastq.gz \ + -r1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ + -r2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ + -i1 {{output_path}}/integrated/${sample}.I1.fastq.gz \ -d metagenomics \ - -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ + -o ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc} \ -k ${k} \ -lc ${lc} \ -p ${sample} \ - -j ${cores} + -j ${SLURM_CPUS_PER_TASK} # remove temporary data -if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping +if [[ -d ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then + rm -fr ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping fi diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index 2cb479e7..e5b0873e 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -9,10 +9,5 @@ #SBATCH --output tellread-cleanup_%x-%A.out #SBATCH --error tellread-cleanup_%x-%A.err -if [[ -z "${OUTPUT}" ]]; then - echo "OUTPUT is not specified" - exit 1 -fi - # remove unused large outputs -rm -rf ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full +rm -rf {{OUTPUT}}/biosample_format {{OUTPUT}}/1_demult {{OUTPUT}}/Full diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index fe8d39d9..da439836 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -9,90 +9,27 @@ #SBATCH --output tellread_%x-%A.out #SBATCH --error tellread_%x-%A.err -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -set -x - -if [[ -z "${N_SAMPLES}" ]]; then - echo "N_SAMPLES is not specified" - exit 1 -fi - -if [[ -z "${SEQRUNPATH}" ]]; then - echo "SEQRUNPATH is not specified" - exit 1 -fi - -if [[ -z "${LANE}" ]]; then - echo "LANE is not specified" - exit 1 -fi - -if [[ -z "${SAMPLES}" ]]; then - echo "SAMPLES is not specified" - exit 1 -fi - -if [[ -z "${REFS}" ]]; then - echo "REFS is not specified" - exit 1 -fi - -if [[ -z "${OUTPUT}" ]]; then - echo "OUTPUT is not specified" - exit 1 -fi +set -x export TMPDIR={{tmp_dir}} mkdir -p ${TMPDIR} export TMPDIR=$(mktemp -d) -seqrun_path=${SEQRUNPATH} - -if [[ ${LANE} == "L001" ]]; then - lane=s_1 -elif [[ ${LANE} == "L002" ]]; then - lane=s_2 -elif [[ ${LANE} == "L003" ]]; then - lane=s_3 -elif [[ ${LANE} == "L004" ]]; then - lane=s_4 -elif [[ ${LANE} == "L005" ]]; then - lane=s_5 -elif [[ ${LANE} == "L006" ]]; then - lane=s_6 -elif [[ ${LANE} == "L007" ]]; then - lane=s_7 -elif [[ ${LANE} == "L008" ]]; then - lane=s_8 -else - echo "Unrecognized lane: ${LANE}" - exit 1 -fi - -# yes, hard coded, not great but progress. -extra="" -if [[ ! 
-z ${REFBASE} ]]; then - extra="-f ${REFBASE}" -fi -mkdir -p ${OUTPUT} +mkdir -p {{output}} module load {{modules_to_load}} {{sing_script_path}} \ - -i ${seqrun_path} \ - -o ${OUTPUT} \ - -s $(echo ${SAMPLES} | tr -d '"') \ - -g $(echo ${REFS} | tr -d '"') \ + -i {{rundir_path}} \ + -o {{output}} \ + -s $(echo {{samples}} | tr -d '"') \ + -g $(echo {{refs}} | tr -d '"') \ -j ${SLURM_JOB_CPUS_PER_NODE} \ - ${extra} \ - -l ${lane} + {{extra}} \ + -l {{lane}} -if [[ -d ${OUTPUT}/Full ]]; then +if [[ -d {{output}}/Full ]]; then echo "Run appears successful" -elif [[ -d ${OUTPUT}/1_demult/Full ]]; then +elif [[ -d {{output}}/1_demult/Full ]]; then echo "Run appears unsuccessful but has output" exit 1 else diff --git a/sequence_processing_pipeline/templates/tellread.sh b/sequence_processing_pipeline/templates/tellread.sh deleted file mode 100755 index d6c61cb0..00000000 --- a/sequence_processing_pipeline/templates/tellread.sh +++ /dev/null @@ -1,262 +0,0 @@ -#!/bin/bash -samplesheet="{{tellread_map}}" # previously -i option -seqrunpath="{{seqrun_path}}" # previously -s option -lane="{{lane}}" # previously -l option -reference_map="{{reference_map}}" # previously -r option -reference_base="{{reference_base}}" # previously -b option -mode="{{mode}}" # previously -m option - -# preserve error-checking of parameters to preserve as much of the original -# script as possible, even though this could be done in python. - -# https://unix.stackexchange.com/a/621007 -: ${seqrunpath:?Missing -s} -: ${lane:?Missing -i} - -if [[ ! -z ${reference_map} || ! -z ${reference_base} ]]; then - if [[ -z ${reference_map} ]]; then - echo "-b used without -r" - exit 1 - fi - if [[ -z ${reference_base} ]]; then - echo "-r used without -b" - exit 1 - fi - if [[ ! -d ${reference_base} ]]; then - echo "reference base not found" - exit 1 - fi - - tag=reference-based -else - tag=reference-free -fi - -# trim trailing slash -# https://stackoverflow.com/a/32845647/19741 -safepath=$(echo ${seqrunpath} | sed 's:/*$::') -label=$(basename ${safepath}) -labeltag=${label}-${tag} -output={{output_path}} - -if [[ ! -d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then - echo "Cannot access the lane" - exit 1 -fi - -# for now this can stay here to keep greater compatibility with the original script. -# however these fields should eventually be parameters that can be configured in the config file. - -if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then - sbatch_cores=2 - sbatch_mem=8G - norm=TRUE - wall=24:00:00 - mode=NA -elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then - sbatch_cores=2 - sbatch_mem=8G - norm=TRUE - wall=24:00:00 - mode=NA -else - sbatch_cores=16 - sbatch_mem=160G - norm=FALSE - assemble=TRUE - wall=48:00:00 -fi - -if [[ ${mode} == "isolate" ]]; then - ISOLATE_MODE=TRUE -elif [[ ${mode} == "metagenomic" ]]; then - ISOLATE_MODE=FALSE -elif [[ ${mode} == "NA" ]]; then - ISOLATE_MODE=FALSE -else - echo "unknown mode: ${mode}" - exit 1 -fi - -set -e -set -o pipefail - -declare -a s -declare -a g -# below extended regex might be broken because C5\d\d happens in column 0, not column 1 -# of the hacked sample-sheet. 
-# for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) - -# new sample-sheet is of form: -# Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Barcode_ID,Sample_Project,Well_description,Lane -# 10283.LS.4.4.2015,10283.LS.4.4.2015,Plate_1,A1,C501,LS_Timeseries_TellSeq_10283,10283.LS.4.4.2015,1 -for sample in $(egrep -o ",C5..," ${samplesheet} | tr -d "," | sort) -do - echo "sample found: ${sample}" - # get references if they exist - if [[ -f ${reference_map} ]]; then - if $(grep -Fq ${sample} ${reference_map}); then - ref=$(grep -m 1 ${sample} ${reference_map} | cut -f 2 -d"," | tr -d "\n") - if [[ ${ref} != "NONE" ]]; then - if [[ ! -d "${reference_base}/${ref}" ]]; then - echo "${reference_base}/${ref}" - echo "${ref} not found" - exit 1 - fi - g[${#g[@]}]=${ref} - s[${#s[@]}]=${sample} - fi - fi - else - g[${#g[@]}]=NONE - s[${#s[@]}]=${sample} - fi -done -n_samples=${#s[@]} - -# https://stackoverflow.com/a/17841619/19741 -function join_by { local IFS="$1"; shift; echo "$*"; } -s=$(join_by , "${s[@]}") -g=$(join_by , "${g[@]}") - -base=$(dirname ${0}) -submit_script=$(dirname ${0})/tellread.sbatch -integrate_script=$(dirname ${0})/integrate.sbatch -norm_script=$(dirname ${0})/compute_sequence_counts_for_normalization.sbatch -asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch -clean_script=$(dirname ${0})/tellread-cleanup.sbatch - -if [[ ${ISOLATE_MODE} == "TRUE" ]]; then - asm_tellink_script=$(dirname ${0})/telllink-isolate.sbatch - asm_cloudspades_script=$(dirname ${0})/cloudspades-isolate.sbatch -else - asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch - asm_tellink_script=$(dirname ${0})/telllink.sbatch -fi - -if [[ ! -f ${submit_script} ]]; then - echo "Cannot access submit script" - exit 1 -fi -if [[ ! -f ${asm_cloudspades_script} ]]; then - echo "Cannot access cloudspades assembly script" - exit 1 -fi -if [[ ! -f ${asm_tellink_script} ]]; then - echo "Cannot access tell-link assembly script" - exit 1 -fi -if [[ ! -f ${integrate_script} ]]; then - echo "Cannot access integrate script" - exit 1 -fi -if [[ ! -f ${clean_script} ]]; then - echo "Cannot access clean script" - exit 1 -fi - -datetag=$(date "+%Y.%m.%d") -scriptcopy=$(pwd)/tellread_script-${datetag}.sh -submitcopy=$(pwd)/tellread_submission-${datetag}.sbatch -asmcscopy=$(pwd)/assembly_submission_cloudspades-${datetag}.sbatch -asmtlcopy=$(pwd)/assembly_submission_tell-link-${datetag}.sbatch -normcopy=$(pwd)/norm_submission-${datetag}.sbatch -intcopy=$(pwd)/integrate_submission-${datetag}.sbatch -cleancopy=$(pwd)/tellread-cleanup-${datetag}.sbatch -arguments=$(pwd)/provided_script_arguments.txt -if [[ -f ${scriptcopy} ]]; then - echo "Existing script copy ${scriptcopy} found, not overwriting, delete to resubmit" - exit 1 -fi -if [[ -f ${submitcopy} ]]; then - echo "Existing submission ${submitcopy} found, not overwriting, delete to resubmit" - exit 1 -fi - -#TODO: Other possible arguments like -r? 
-echo "-l {{lane}} -s {{seqrun_path}} -i {{tellread_map}} -m {{mode}}" >${arguments} - -cp ${0} ${scriptcopy} -cp ${submit_script} ${submitcopy} -cp ${asm_cloudspades_script} ${asmcscopy} -cp ${asm_tellink_script} ${asmtlcopy} -cp ${integrate_script} ${intcopy} -cp ${clean_script} ${cleancopy} -chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cleancopy} - -set -x - -trjob=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag} \ - -c ${sbatch_cores} \ - --mem ${sbatch_mem} \ - --time ${wall} \ - --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ - ${submit_script}) - -echo "TRJOB_RETURN_CODE: $?" > {{output_path}}/pids -echo "TRJOB_PID: $trjob" >> {{output_path}}/pids - -if [[ ${norm} == "TRUE" ]]; then - cp ${norm_script} ${normcopy} - chmod gou-w ${normcopy} - norm_counts_job=$(sbatch \ - --parsable \ - --dependency=afterok:${trjob} \ - -J ${labeltag}-${datetag}-norm-counts \ - --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ - ${norm_script}) - echo "NORM_COUNTS_JOB_RETURN_CODE: $?" >> {{output_path}}/pids - echo "NORM_COUNTS_JOB_PID: $norm_counts_job" >> {{output_path}}/pids -fi - -integrate_job=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag}-integrate \ - --dependency=afterok:${trjob} \ - --array 1-${n_samples} \ - --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ - ${integrate_script}) - -echo "INTEGRATE_JOB_RETURN_CODE: $?" >> {{output_path}}/pids -echo "INTEGRATE_JOB_PID: $integrate_job" >> {{output_path}}/pids - -if [[ ${assemble} == "TRUE" ]]; then - csj=$(sbatch \ - --parsable \ - --dependency=aftercorr:${integrate_job} \ - -J ${labeltag}-${datetag}-cloudspades \ - --array 1-${n_samples} \ - --export LABELTAG=${labeltag},OUTPUT=${output} \ - ${asm_cloudspades_script}) - - echo "CSJ_JOB_RETURN_CODE: $?" >> {{output_path}}/pids - echo "CSJ_JOB_PID: $csj" >> {{output_path}}/pids - - tlj=$(sbatch \ - --parsable \ - --dependency=aftercorr:${integrate_job} \ - -J ${labeltag}-${datetag}-tell-link \ - --array 1-${n_samples} \ - --export LABELTAG=${labeltag},OUTPUT=${output} \ - ${asm_tellink_script}) - - echo "TLJ_JOB_RETURN_CODE: $?" >> {{output_path}}/pids - echo "TLJ_JOB_PID: $tlj" >> {{output_path}}/pids - - cleanupdep=${csj}:${tlj} -else - cleanupdep=${integrate_job} - echo "Not assembling" -fi - -cleanup=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag}-cleanup \ - --dependency=afterok:${cleanupdep} \ - --export OUTPUT=${output} \ - ${clean_script}) - -echo "CLEANUP_JOB_RETURN_CODE: $?" >> {{output_path}}/pids -echo "CLEANUP_JOB_PID: $cleanup" >> {{output_path}}/pids From 6818d440fbdc949fc829b8720525cf348d08d363 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Thu, 3 Oct 2024 18:37:41 -0700 Subject: [PATCH 16/47] Creation tests added for new TellReadJob() class. 
--- sequence_processing_pipeline/TellReadJob.py | 41 +-- .../templates/tellread.sbatch | 5 +- .../cloudspades-isolate.sbatch | 84 ------- .../data/tellread_output/cloudspades.sbatch | 81 ------ .../data/tellread_output/integrate.sbatch | 125 ---------- .../tellread_output/telllink-isolate.sbatch | 62 ----- .../data/tellread_output/telllink.sbatch | 64 ----- .../tellread_output/tellread-cleanup.sbatch | 23 -- .../data/tellread_output/tellread.sbatch | 108 -------- .../tests/data/tellread_output/tellread.sh | 236 ------------------ .../tests/test_TellReadJob.py | 99 ++++++++ 11 files changed, 123 insertions(+), 805 deletions(-) delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread.sh create mode 100644 sequence_processing_pipeline/tests/test_TellReadJob.py diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 859974a4..2f7905d5 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -6,6 +6,7 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet +from datetime import datetime logging.basicConfig(level=logging.DEBUG) @@ -14,9 +15,9 @@ class TellReadJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, tmp1_path, sing_script_path, - lane, cores_per_task=4): + qiita_job_id, label, reference_base, + reference_map, tmp1_path, sing_script_path, lane, + cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. @@ -29,8 +30,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param jmem: String representing total memory limit for entire job. 
:param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id - :param max_array_length: None - :param indicies_script_path: None :param label: None :param reference_base: None :param reference_map: None @@ -38,9 +37,9 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, """ super().__init__(run_dir, output_path, - 'TRIntegrateJob', + 'TellReadJob', [], - max_array_length, + 1, modules_to_load=modules_to_load) self.sample_sheet_path = sample_sheet_path @@ -51,7 +50,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.node_count = node_count self.wall_time_limit = wall_time_limit self.cores_per_task = cores_per_task - self.indicies_script_path = indicies_script_path self.reference_base = reference_base self.reference_map = reference_map @@ -59,9 +57,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, # raise an Error if jmem is not a valid floating point value. self.jmem = str(int(jmem)) self.qiita_job_id = qiita_job_id - self.sample_count = len(self.sample_ids) self.jinja_env = Environment(loader=KISSLoader('templates')) - self.label = label self.sing_script_path = sing_script_path self.tmp1_path = tmp1_path @@ -71,12 +67,14 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, raise ValueError(f"'{tmp}' is not a valid lane number") self.lane_number = tmp - if self.reference_base != None or self.reference_map != None: + # TODO: Need examples of these being not None + if self.reference_base is not None or self.reference_map is not None: tag = 'reference-based' else: tag = 'reference-free' - self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + date = datetime.today().strftime('%Y.%m.%d') + self.job_name = (f"{label}-{tag}-{date}-tellread") def run(self, callback=None): job_script_path = self._generate_job_script() @@ -96,9 +94,12 @@ def run(self, callback=None): except JobFailedError as e: # When a job has failed, parse the logs generated by this specific # job to return a more descriptive message to the user. - info = self.parse_logs() + # TODO: We need more examples of failed jobs before we can create + # a parser for the logs. + # info = self.parse_logs() # prepend just the message component of the Error. - info.insert(0, str(e)) + # info.insert(0, str(e)) + info = str(e) raise JobFailedError('\n'.join(info)) logging.debug(f'TellReadJob {self.job_info["job_id"]} completed') @@ -134,8 +135,8 @@ def _process_sample_sheet(self): 'sample_ids': sample_ids} def _generate_job_script(self): - job_script_path = join(self.output_path, 'integrate.sbatch') - template = self.jinja_env.get_template("tellread2.sbatch") + job_script_path = join(self.output_path, 'tellread_test.sbatch') + template = self.jinja_env.get_template("tellread.sbatch") # generate a comma separated list of sample-ids from the tuples stored # in self.sample_ids. @@ -154,7 +155,7 @@ def _generate_job_script(self): extra = "" - # if reference_base is added in the future and is defined, exta needs + # if reference_base is added in the future and is defined, extra needs # to be f"-f {reference_base}". 
# extra = "-f ${REFBASE}" @@ -170,8 +171,7 @@ def _generate_job_script(self): "tmp_dir": self.tmp1_path, "modules_to_load": ' '.join(self.modules_to_load), "lane": f"s_{self.lane_number}", - # TODO: make sure this is the 'ConvertJob/output' directory - "output": self.output_path, + "output": join(self.output_path, "output"), "rundir_path": self.root_dir, "samples": samples, "refs": refs, @@ -179,3 +179,6 @@ def _generate_job_script(self): })) return job_script_path + + def parse_logs(self): + raise PipelineError("parse_logs() not implemented for TellReadJob") diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index da439836..7d044bb7 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -16,15 +16,14 @@ mkdir -p ${TMPDIR} export TMPDIR=$(mktemp -d) mkdir -p {{output}} - + module load {{modules_to_load}} {{sing_script_path}} \ -i {{rundir_path}} \ -o {{output}} \ -s $(echo {{samples}} | tr -d '"') \ -g $(echo {{refs}} | tr -d '"') \ - -j ${SLURM_JOB_CPUS_PER_NODE} \ - {{extra}} \ + -j ${SLURM_JOB_CPUS_PER_NODE} {{extra}} \ -l {{lane}} if [[ -d {{output}}/Full ]]; then diff --git a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch deleted file mode 100644 index 7ec58058..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades-isolate.sbatch +++ /dev/null @@ -1,84 +0,0 @@ -#!/bin/bash -l -#SBATCH -J cs-assemble # cs-assemble -#SBATCH --time 24:00:00 # 24:00:00 -#SBATCH --mem 64G # 64G -#SBATCH -N 1 # 1 -#SBATCH -c 12 # 12 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. -source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -set -x -set -e - -# this gets set in the environment from another script. For now let's -# run with that. -echo $TMPDIR - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=${OUTPUT} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate activate qiime2-2023.5 - -module load gcc_9.3.0 # gcc_9.3.0 - -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) - -# assumes 1-based array index, eg --array 1-N -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -cs=${base}/cloudspades-isolate/${sample} - -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${cs} ]]; then - rm -fr ${cs} - fi -fi - -mkdir -p ${cs} - -pushd ~/spades-cloudspades-paper/assembler/ -./spades.py \ - -o ${cs} \ - --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ - -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 -module unload gcc_9.3.0 -popd - -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. 
-mamba activate quast - -quast \ - -o ${cs}/quast-scaffolds \ - -t ${SLURM_JOB_CPUS_PER_NODE} \ - ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 - -# remove intermediates that currently dont have a downstream use -if [[ -d ${cs}/K21 ]]; then - rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch deleted file mode 100644 index d16dc2b0..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/cloudspades.sbatch +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -l -#SBATCH -J cs-assemble # cs-assemble -#SBATCH --time 24:00:00 # 24:00:00 -#SBATCH --mem 128G # 128G -#SBATCH -N 1 # 1 -#SBATCH -c 12 # 12 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. -source activate qiime2-2023.5 -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -set -x -set -e - -echo $TMPDIR - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=${OUTPUT} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -# mamba is a new environment we'll have to address. perhaps it's possible to change this when using qp-klp. -# for now we will leave it hardcoded. -mamba activate activate qiime2-2023.5 - -module load gcc_9.3.0 # gcc_9.3.0 - -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) - -# assumes 1-based array index, eg --array 1-N -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -cs=${base}/cloudspades/${sample} - -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${cs} ]]; then - rm -fr ${cs} - fi -fi - -mkdir -p ${cs} -pushd /home/qiita_test/qiita-spots/spades-cloudspades-0.1/assembler/bin - -# for now don't use spades.py jinja2 variable -./spades.py \ - -o ${cs} \ - --gemcode1-1 ${base}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 ${base}/integrated/${sample}.R2.fastq.gz \ - --meta \ - -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 -module unload gcc_9.3.0 -popd - -mamba activate quast -quast \ - -o ${cs}/quast-scaffolds \ - -t ${SLURM_JOB_CPUS_PER_NODE} \ - ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 - -# remove intermediates that currently dont have a downstream use -if [[ -d ${cs}/K21 ]]; then - rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch deleted file mode 100644 index 6947c226..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/integrate.sbatch +++ /dev/null @@ -1,125 +0,0 @@ -#!/bin/bash -l -#SBATCH -J integrate # integrate -#SBATCH --time 24:00:00 # 24:00:00 -#SBATCH --mem 8G # 8G -#SBATCH -N 1 # 1 -#SBATCH -c 1 # 1 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. 
-###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -# like mamba, source activate is an issue we'll have to address. for now we'll leave it hardcoded. -source activate rust -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - - -# https://docs.hpc.shef.ac.uk/en/latest/referenceinfo/scheduler/SLURM/SLURM-environment-variables.html -cores=${SLURM_CPUS_PER_TASK} - -if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then - echo "Not operating in an array" - exit 1 -fi - -if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then - echo "Line extraction assumes 1-based index" - exit 1 -fi - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -if [[ -z ${BASE} ]]; then - echo "BASE not specified" - exit 1 -fi - -tellread=${OUTPUT} -if [[ ! -d ${tellread} ]]; then - echo "${tellread} not found" - exit 1 -fi - -set -x -set -e -set -o pipefail - -samples=($(cat ${tellread}/sample_index_list_${LABELTAG}.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -export TMPDIR=$(mktemp -d) -function cleanup { - echo "Removing $TMPDIR" - rm -r $TMPDIR - unset TMPDIR -} -trap cleanup EXIT - -files=${TMPDIR}/integration.files -/bin/ls -1 ${tellread}/Full/*corrected.err_barcode_removed.fastq > ${files} -mkdir -p ${tellread}/integrated - -if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R1" - exit 1 -fi - -if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R2" - exit 1 -fi - -if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} I1" - exit 1 -fi - -r1=$(grep -m 1 "_R1_${sample}" ${files}) -r2=$(grep -m 1 "_R2_${sample}" ${files}) -i1=$(grep -m 1 "_I1_${sample}" ${files}) -r1out=${tellread}/integrated/${sample}.R1.fastq.gz -r2out=${tellread}/integrated/${sample}.R2.fastq.gz -i1out=${tellread}/integrated/${sample}.I1.fastq.gz - -if [[ ! -s ${r1} ]]; then - echo "${r1} is empty, cannot integrate" - if [[ -s ${r2} ]]; then - echo "R1 and R2 are inconsistent" - exit 1 - fi - if [[ -s ${i1} ]]; then - echo "R1 and I1 are inconsistent" - exit 1 - fi - - # reflect the empties so Qiita can know of them - touch ${r1out} - touch ${r2out} - touch ${i1out} - exit 0 -fi - -# this can probably be backgrounded but then you have to get creative to -# not mask a nonzero exit status (e.g., the python process raising) -cat ${i1} | gzip > ${i1out} - -mamba activate tellread-integrate -python ${BASE}/integrate-indices-np.py integrate \ - --no-sort \ - --r1-in ${r1} \ - --r2-in ${r2} \ - --i1-in ${i1} \ - --r1-out ${r1out} \ - --r2-out ${r2out} \ - --threads ${cores} \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch deleted file mode 100644 index 6a23331e..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/telllink-isolate.sbatch +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash -l -#SBATCH -J tellink-isolate # tellink-isolate -#SBATCH -N 1 # 1 -#SBATCH -c 16 # 16 -#SBATCH --mem 160G # 160G -#SBATCH --time 96:00:00 # 96:00:00 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. 
-###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -set -x -set -e - -module load singularity_3.6.4 # singularity_3.6.4 - -if [[ -z "${LABELTAG}" ]]; then - echo "LABELTAG is not specified" - exit 1 -fi - -base=/panfs/qiita/TELLREAD/${LABELTAG} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -k=79 -lc=35 -cores=${SLURM_CPUS_PER_TASK} - -tl=${base}/tell-link-isolate/${sample} -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${tl} ]]; then - rm -fr ${tl} - fi -fi - -mkdir -p ${tl} - -/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh \ - -r1 ${base}/integrated/${sample}.R1.fastq.gz \ - -r2 ${base}/integrated/${sample}.R2.fastq.gz \ - -i1 ${base}/integrated/${sample}.I1.fastq.gz \ - -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ - -k ${k} \ - -lc ${lc} \ - -p ${sample} \ - -j ${cores} - -# remove temporary data -if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping -fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch deleted file mode 100644 index b6033b24..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/telllink.sbatch +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -l -#SBATCH -J tellink # tellink -#SBATCH --mem 160G # 160G -#SBATCH -N 1 # 1 -#SBATCH -c 16 # 16 -#SBATCH --time 96:00:00 # 96:00:00 -#SBATCH -p qiita # qiita - -# for now these can be left hard-coded. -#SBATCH --output %x-%A_%a.out -#SBATCH --error %x-%A_%a.err - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=FAIL - -set -x -set -e - -module load singularity_3.6.4 # singularity_3.6.4 - -if [[ -z "${LABELTAG}" ]]; then - echo "LABEL is not specified" - exit 1 -fi - -base=/panfs/${USER}/${LABELTAG} -if [[ ! -d ${base} ]]; then - echo "${base} not found" - exit 1 -fi - -samples=($(cat ${base}/sample_index_list_${LABELTAG}.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -# leave these hardcoded for now -k=79 -lc=35 -cores=${SLURM_CPUS_PER_TASK} - -tl=${base}/tell-link/${sample} -if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${tl} ]]; then - rm -fr ${tl} - fi -fi - -mkdir -p ${tl} - -/projects/long_read_collab/code/tellseq/release_v1.11/tellink-release/run_tellink_sing.sh \ - -r1 ${base}/integrated/${sample}.R1.fastq.gz \ - -r2 ${base}/integrated/${sample}.R2.fastq.gz \ - -i1 ${base}/integrated/${sample}.I1.fastq.gz \ - -d metagenomics \ - -o ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc} \ - -k ${k} \ - -lc ${lc} \ - -p ${sample} \ - -j ${cores} - -# remove temporary data -if [[ -d ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/${LABELTAG}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping -fi diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch deleted file mode 100644 index 56bc3360..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/tellread-cleanup.sbatch +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -l -#SBATCH -J cleanup # cleanup -#SBATCH --time 24:00:00 # 24:00:00 -#SBATCH --mem 8G # 8G -#SBATCH -N 1 # 1 -#SBATCH -c 1 # 1 -#SBATCH -p qiita # qiita - -# for now comment these out as qiita is responsible for notifying users. -###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=BEGIN,FAIL - -# for now these can be left hard-coded. -#SBATCH --output %x-%A.out -#SBATCH --error %x-%A.err - -if [[ -z "${OUTPUT}" ]]; then - echo "OUTPUT is not specified" - exit 1 -fi - -# remove unused large outputs -rm -rf ${OUTPUT}/biosample_format ${OUTPUT}/1_demult ${OUTPUT}/Full \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch deleted file mode 100644 index ab0647f8..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sbatch +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash -l -#SBATCH -J tellread # tellread -#SBATCH -p qiita # qiita -#SBATCH -N 1 # 1 -#SBATCH -c 4 # 4 -#SBATCH --mem 16G # 16G -#SBATCH --time 96:00:00 # 96:00:00 - -# for now these can be left hard-coded. -#SBATCH --partition=short -#SBATCH --output %x-%A.out -#SBATCH --error %x-%A.err - -# for now comment these out as qiita is responsible for notifying users. 
-###SBATCH --mail-user=qiita.help@gmail.com -###SBATCH --mail-type=BEGIN,FAIL - -function logger () { - echo "$(date) :: ${@}"; - echo "$(date) :: ${@}" 1>&2; -} - -set -x - -if [[ -z "${N_SAMPLES}" ]]; then - echo "N_SAMPLES is not specified" - exit 1 -fi - -if [[ -z "${SEQRUNPATH}" ]]; then - echo "SEQRUNPATH is not specified" - exit 1 -fi - -if [[ -z "${LANE}" ]]; then - echo "LANE is not specified" - exit 1 -fi - -if [[ -z "${SAMPLES}" ]]; then - echo "SAMPLES is not specified" - exit 1 -fi - -if [[ -z "${REFS}" ]]; then - echo "REFS is not specified" - exit 1 -fi - -if [[ -z "${OUTPUT}" ]]; then - echo "OUTPUT is not specified" - exit 1 -fi - -export TMPDIR="/panfs/${USER}/tmp" -mkdir -p ${TMPDIR} -export TMPDIR=$(mktemp -d) -seqrun_path=${SEQRUNPATH} - -if [[ ${LANE} == "L001" ]]; then - lane=s_1 -elif [[ ${LANE} == "L002" ]]; then - lane=s_2 -elif [[ ${LANE} == "L003" ]]; then - lane=s_3 -elif [[ ${LANE} == "L004" ]]; then - lane=s_4 -elif [[ ${LANE} == "L005" ]]; then - lane=s_5 -elif [[ ${LANE} == "L006" ]]; then - lane=s_6 -elif [[ ${LANE} == "L007" ]]; then - lane=s_7 -elif [[ ${LANE} == "L008" ]]; then - lane=s_8 -else - echo "Unrecognized lane: ${LANE}" - exit 1 -fi - -# yes, hard coded, not great but progress. -extra="" -if [[ ! -z ${REFBASE} ]]; then - extra="-f ${REFBASE}" -fi - -mkdir -p ${OUTPUT} - -module load singularity_3.6.4 # singularity_3.6.4 -$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ - -i ${seqrun_path} \ - -o ${OUTPUT} \ - -s $(echo ${SAMPLES} | tr -d '"') \ - -g $(echo ${REFS} | tr -d '"') \ - -j ${SLURM_JOB_CPUS_PER_NODE} \ - ${extra} \ - -l ${lane} - - -if [[ -d ${OUTPUT}/Full ]]; then - echo "Run appears successful" -elif [[ -d ${OUTPUT}/1_demult/Full ]]; then - echo "Run appears unsuccessful but has output" - exit 1 -else - echo "Run appears unsuccessful" - exit 1 -fi \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh b/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh deleted file mode 100644 index 90b4e1ce..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/tellread.sh +++ /dev/null @@ -1,236 +0,0 @@ -#!/bin/bash -samplesheet="/home/qiita_test/qiita-spots/tellread_mapping.csv" # previously -i option -seqrunpath="/sequencing/igm_runs/240216_LH00444_0058_A22357VLT4" # previously -s option -lane="L008" # previously -l option -reference_map="" # previously -r option -reference_base="" # previously -b option -mode="metagenomic" $ # previously -m option - -# preserve error-checking of parameters to preserve as much of the original -# script as possible, even though this could be done in python. - -# https://unix.stackexchange.com/a/621007 -: ${seqrunpath:?Missing -s} -: ${lane:?Missing -i} - -if [[ ! -z ${reference_map} || ! -z ${reference_base} ]]; then - if [[ -z ${reference_map} ]]; then - echo "-b used without -r" - exit 1 - fi - if [[ -z ${reference_base} ]]; then - echo "-r used without -b" - exit 1 - fi - if [[ ! -d ${reference_base} ]]; then - echo "reference base not found" - exit 1 - fi - - tag=reference-based -else - tag=reference-free -fi - -# trim trailing slash -# https://stackoverflow.com/a/32845647/19741 -safepath=$(echo ${seqrunpath} | sed 's:/*$::') -label=$(basename ${safepath}) -labeltag=${label}-${tag} -output=/panfs/${USER}/${labeltag} - -if [[ ! 
-d ${seqrunpath}/Data/Intensities/BaseCalls/${lane} ]]; then - echo "Cannot access the lane" - exit 1 -fi - -# for now this can stay here to keep greater compatibility with the original script. -# however these fields should eventually be parameters that can be configured in the config file. - -if [[ ${seqrunpath} == *"_iSeq_Runs"* ]]; then - sbatch_cores=2 - sbatch_mem=8G - norm=TRUE - wall=24:00:00 - mode=NA -elif [[ ${seqrunpath} == *"_MiSeq_Runs"* ]]; then - sbatch_cores=2 - sbatch_mem=8G - norm=TRUE - wall=24:00:00 - mode=NA -else - sbatch_cores=16 - sbatch_mem=160G - norm=FALSE - assemble=TRUE - wall=48:00:00 -fi - -if [[ ${mode} == "isolate" ]]; then - ISOLATE_MODE=TRUE -elif [[ ${mode} == "metagenomic" ]]; then - ISOLATE_MODE=FALSE -elif [[ ${mode} == "NA" ]]; then - ISOLATE_MODE=FALSE -else - echo "unknown mode: ${mode}" - exit 1 -fi - -set -e -set -o pipefail - -declare -a s -declare -a g -# below extended regex might be broken because C5\d\d happens in column 0, not column 1 -# of the hacked sample-sheet. -for sample in $(egrep -o "^C5.*," ${samplesheet} | tr -d "," | sort) -do - echo "sample found: ${sample}" - # get references if they exist - if [[ -f ${reference_map} ]]; then - if $(grep -Fq ${sample} ${reference_map}); then - ref=$(grep -m 1 ${sample} ${reference_map} | cut -f 2 -d"," | tr -d "\n") - if [[ ${ref} != "NONE" ]]; then - if [[ ! -d "${reference_base}/${ref}" ]]; then - echo "${reference_base}/${ref}" - echo "${ref} not found" - exit 1 - fi - g[${#g[@]}]=${ref} - s[${#s[@]}]=${sample} - fi - fi - else - g[${#g[@]}]=NONE - s[${#s[@]}]=${sample} - fi -done -n_samples=${#s[@]} - -# https://stackoverflow.com/a/17841619/19741 -function join_by { local IFS="$1"; shift; echo "$*"; } -s=$(join_by , "${s[@]}") -g=$(join_by , "${g[@]}") - -base=$(dirname ${0}) -submit_script=$(dirname ${0})/tellread.sbatch -integrate_script=$(dirname ${0})/integrate.sbatch -norm_script=$(dirname ${0})/compute_sequence_counts_for_normalization.sbatch -asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch -clean_script=$(dirname ${0})/tellread-cleanup.sbatch - -if [[ ${ISOLATE_MODE} == "TRUE" ]]; then - asm_tellink_script=$(dirname ${0})/telllink-isolate.sbatch - asm_cloudspades_script=$(dirname ${0})/cloudspades-isolate.sbatch -else - asm_cloudspades_script=$(dirname ${0})/cloudspades.sbatch - asm_tellink_script=$(dirname ${0})/telllink.sbatch -fi - -if [[ ! -f ${submit_script} ]]; then - echo "Cannot access submit script" - exit 1 -fi -if [[ ! -f ${asm_cloudspades_script} ]]; then - echo "Cannot access cloudspades assembly script" - exit 1 -fi -if [[ ! -f ${asm_tellink_script} ]]; then - echo "Cannot access tell-link assembly script" - exit 1 -fi -if [[ ! -f ${integrate_script} ]]; then - echo "Cannot access integrate script" - exit 1 -fi -if [[ ! 
-f ${clean_script} ]]; then - echo "Cannot access clean script" - exit 1 -fi - -datetag=$(date "+%Y.%m.%d") -scriptcopy=$(pwd)/tellread_script-${datetag}.sh -submitcopy=$(pwd)/tellread_submission-${datetag}.sbatch -asmcscopy=$(pwd)/assembly_submission_cloudspades-${datetag}.sbatch -asmtlcopy=$(pwd)/assembly_submission_tell-link-${datetag}.sbatch -normcopy=$(pwd)/norm_submission-${datetag}.sbatch -intcopy=$(pwd)/integrate_submission-${datetag}.sbatch -cleancopy=$(pwd)/tellread-cleanup-${datetag}.sbatch -arguments=$(pwd)/provided_script_arguments.txt -if [[ -f ${scriptcopy} ]]; then - echo "Existing script copy ${scriptcopy} found, not overwriting, delete to resubmit" - exit 1 -fi -if [[ -f ${submitcopy} ]]; then - echo "Existing submission ${submitcopy} found, not overwriting, delete to resubmit" - exit 1 -fi - -echo $@ > ${arguments} -cp ${0} ${scriptcopy} -cp ${submit_script} ${submitcopy} -cp ${asm_cloudspades_script} ${asmcscopy} -cp ${asm_tellink_script} ${asmtlcopy} -cp ${integrate_script} ${intcopy} -cp ${clean_script} ${cleancopy} -chmod gou-w ${scriptcopy} ${submitcopy} ${asmcopy} ${intcopy} ${arguments} ${cleancopy} - -set -x - -trjob=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag} \ - -c ${sbatch_cores} \ - --mem ${sbatch_mem} \ - --time ${wall} \ - --export BASE=${base},N_SAMPLES=${n_samples},SEQRUNPATH=${seqrunpath},LANE=${lane},REFMAP=${reference_map},REFBASE=${reference_base},OUTPUT=${output},SAMPLES=\"${s}\",REFS=\"${g}\" \ - ${submit_script}) - -if [[ ${norm} == "TRUE" ]]; then - cp ${norm_script} ${normcopy} - chmod gou-w ${normcopy} - norm_counts_job=$(sbatch \ - --parsable \ - --dependency=afterok:${trjob} \ - -J ${labeltag}-${datetag}-norm-counts \ - --export BASE=${base},TELLREAD_OUTPUT=${output},OUTPUT=$(pwd),SAMPLESHEET=${samplesheet} \ - ${norm_script}) -fi - -integrate_job=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag}-integrate \ - --dependency=afterok:${trjob} \ - --array 1-${n_samples} \ - --export BASE=${base},LABELTAG=${labeltag},OUTPUT=${output} \ - ${integrate_script}) - -if [[ ${assemble} == "TRUE" ]]; then - csj=$(sbatch \ - --parsable \ - --dependency=aftercorr:${integrate_job} \ - -J ${labeltag}-${datetag}-cloudspades \ - --array 1-${n_samples} \ - --export LABELTAG=${labeltag},OUTPUT=${output} \ - ${asm_cloudspades_script}) - tlj=$(sbatch \ - --parsable \ - --dependency=aftercorr:${integrate_job} \ - -J ${labeltag}-${datetag}-tell-link \ - --array 1-${n_samples} \ - --export LABELTAG=${labeltag},OUTPUT=${output} \ - ${asm_tellink_script}) - cleanupdep=${csj}:${tlj} -else - cleanupdep=${integrate_job} - echo "Not assembling" -fi - -cleanup=$(sbatch \ - --parsable \ - -J ${labeltag}-${datetag}-cleanup \ - --dependency=afterok:${cleanupdep} \ - --export OUTPUT=${output} \ - ${clean_script}) \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py new file mode 100644 index 00000000..b9659267 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -0,0 +1,99 @@ +from os.path import join, abspath +from sequence_processing_pipeline.TellReadJob import TellReadJob +from functools import partial +import unittest + + +class TestTellReadJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. 
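+        # self.obs is the sbatch script the job under test generates at run
+        # time, while self.exp is the pre-generated copy kept under
+        # tests/data/tellread_output for line-by-line comparison.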
+ self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TellReadJob', 'tellread_test.sbatch') + self.exp = self.path('data', 'tellread_output', 'tellread_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + # TODO: Revisit w/a new directory named as expected for a + # TellSeq-produced run-directory. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + # TODO: Revisit w/a proper sample-sheet once spec is near finalized. + self.sample_sheet_path = self.path('data', 'good-sample-sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TellReadJob", "output", + "tmp1") + # reflects location of script on host. + self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + + def test_creation(self): + # confirm only sensible lane numbers are allowed. + with self.assertRaisesRegex(ValueError, + "'-1' is not a valid lane number"): + TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, + self.queue_name, self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.label, self.reference_base, self.reference_map, + self.tmp1_path, self.sing_script_path, -1, + self.cores_per_task) + + with self.assertRaisesRegex(ValueError, + "'0' is not a valid lane number"): + TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, + self.queue_name, self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.label, self.reference_base, self.reference_map, + self.tmp1_path, self.sing_script_path, 0, + self.cores_per_task) + + with self.assertRaisesRegex(ValueError, + "'9' is not a valid lane number"): + TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, + self.queue_name, self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.label, self.reference_base, self.reference_map, + self.tmp1_path, self.sing_script_path, 9, + self.cores_per_task) + + # test basic good-path + job = TellReadJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.label, self.reference_base, self.reference_map, + self.tmp1_path, self.sing_script_path, self.lane, + self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + print("OBS: %s" % obs_line) + print("EXP: %s" % exp_line) + print("") + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() From baf35ea4fd51b71cb4e09d881680da09cdf8afc6 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 6 Oct 2024 15:43:33 -0700 Subject: [PATCH 17/47] flake8 --- sequence_processing_pipeline/TRIntegrateJob.py | 10 +++++----- sequence_processing_pipeline/TRNormCountsJob.py | 8 +++++--- 
sequence_processing_pipeline/tests/test_TellReadJob.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 076a15fe..25cec68a 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -15,7 +15,7 @@ class TRIntegrateJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, cores_per_task=4): + reference_base, reference_map, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. @@ -33,7 +33,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param label: None :param reference_base: None :param reference_map: None - :param cores_per_task: (Optional) # of CPU cores per node to request. + :param cores_per_task: # of CPU cores per node to request. """ super().__init__(run_dir, output_path, @@ -62,7 +62,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.jinja_env = Environment(loader=KISSLoader('templates')) self.label = label - if self.reference_base != None or self.reference_map != None: + if self.reference_base is not None or self.reference_map is not None: tag = 'reference-based' else: tag = 'reference-free' @@ -122,8 +122,8 @@ def _process_sample_sheet(self): 'sample_ids': sample_ids} def _generate_job_script(self): - job_script_path = join(self.output_path, 'integrate.sbatch') - template = self.jinja_env.get_template("integrate2.sbatch") + job_script_path = join(self.output_path, 'integrate_test.sbatch') + template = self.jinja_env.get_template("integrate.sbatch") with open(job_script_path, mode="w", encoding="utf-8") as f: f.write(template.render({ diff --git a/sequence_processing_pipeline/TRNormCountsJob.py b/sequence_processing_pipeline/TRNormCountsJob.py index 09e36a67..a3603bcd 100644 --- a/sequence_processing_pipeline/TRNormCountsJob.py +++ b/sequence_processing_pipeline/TRNormCountsJob.py @@ -62,7 +62,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.jinja_env = Environment(loader=KISSLoader('templates')) self.label = label - if self.reference_base != None or self.reference_map != None: + if self.reference_base is not None or self.reference_map is not None: tag = 'reference-based' else: tag = 'reference-free' @@ -122,8 +122,10 @@ def _process_sample_sheet(self): 'sample_ids': sample_ids} def _generate_job_script(self): - job_script_path = join(self.output_path, 'compute_sequence_counts_for_normalization.sbatch') - template = self.jinja_env.get_template("compute_sequence_counts_for_normalization2.sbatch") + job_script_path = join(self.output_path, "compute_sequence_counts_for" + "_normalization.sbatch") + template = self.jinja_env.get_template("compute_sequence_counts_for_" + "normalization2.sbatch") with open(job_script_path, mode="w", encoding="utf-8") as f: f.write(template.render({ diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py index b9659267..6cc12632 100644 --- a/sequence_processing_pipeline/tests/test_TellReadJob.py +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -1,4 +1,4 @@ -from os.path import join, abspath +from os.path import join from 
sequence_processing_pipeline.TellReadJob import TellReadJob from functools import partial import unittest From 56fc5be5d598de3ba44c2930ca83ed851552e7c4 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 6 Oct 2024 18:39:33 -0700 Subject: [PATCH 18/47] New sample files added --- .../tellread_output/integrate_test.sbatch | 96 +++++++++++++++++++ .../data/tellread_output/tellread_test.sbatch | 37 +++++++ 2 files changed, 133 insertions(+) create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch diff --git a/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch new file mode 100644 index 00000000..3cdc891f --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch @@ -0,0 +1,96 @@ +#!/bin/bash -l +#SBATCH -J integrate # integrate +#SBATCH --time 96:00:00 # 24:00:00 +#SBATCH --mem 16G # 8G +#SBATCH -N 1 # 1 +#SBATCH -c 4 # 1 +#SBATCH -p qiita # qiita + +#SBATCH --output integrate_%x-%A_%a.out +#SBATCH --error integrate_%x-%A_%a.err + +# NB SLURM_ARRAY_TASK_ID is exported by Slurm +if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then + echo "Not operating in an array" + exit 1 +fi + +# NB SLURM_ARRAY_TASK_MIN is exported by Slurm +if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then + echo "Line extraction assumes 1-based index" + exit 1 +fi + +set -x +set -e +set -o pipefail + +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list_output.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +# NB TMPDIR IS CREATED IN CURRENT DIRECTORY. CURRENT DIRECTORY MUST BE CORRECT. +export TMPDIR=$(mktemp -d) +function cleanup { + echo "Removing $TMPDIR" + rm -r $TMPDIR + unset TMPDIR +} +trap cleanup EXIT + +files=${TMPDIR}/integration.files +/bin/ls -1 sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/Full/*corrected.err_barcode_removed.fastq > ${files} +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated + +if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R1" + exit 1 +fi + +if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} R2" + exit 1 +fi + +if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then + echo "Multiple matches for ${sample} I1" + exit 1 +fi + +r1=$(grep -m 1 "_R1_${sample}" ${files}) +r2=$(grep -m 1 "_R2_${sample}" ${files}) +i1=$(grep -m 1 "_I1_${sample}" ${files}) +r1out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R1.fastq.gz +r2out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R2.fastq.gz +i1out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.I1.fastq.gz + +if [[ ! 
-s ${r1} ]]; then + echo "${r1} is empty, cannot integrate" + if [[ -s ${r2} ]]; then + echo "R1 and R2 are inconsistent" + exit 1 + fi + if [[ -s ${i1} ]]; then + echo "R1 and I1 are inconsistent" + exit 1 + fi + + # reflect the empties so Qiita can know of them + touch ${r1out} + touch ${r2out} + touch ${i1out} + exit 0 +fi + +# this can probably be backgrounded but then you have to get creative to +# not mask a nonzero exit status (e.g., the python process raising) +cat ${i1} | gzip > ${i1out} + +conda activate qp-knight-lab-processing-2022.03 +python hello integrate \ + --no-sort \ + --r1-in ${r1} \ + --r2-in ${r2} \ + --i1-in ${i1} \ + --r1-out ${r1out} \ + --r2-out ${r2out} \ + --threads ${SLURM_CPUS_PER_TASK} \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch new file mode 100644 index 00000000..a008937b --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch @@ -0,0 +1,37 @@ +#!/bin/bash -l +#SBATCH -J tellread +#SBATCH -p qiita +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH --mem 16G +#SBATCH --time 96:00:00 + +#SBATCH --output tellread_%x-%A.out +#SBATCH --error tellread_%x-%A.err + +set -x + +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/tmp1 +mkdir -p ${TMPDIR} +export TMPDIR=$(mktemp -d) + +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output + +module load singularity_3.6.4 +$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ + -i sequence_processing_pipeline/tests/data/sample_run_directories/150629_SN1001_0511_AH5L7GBCXX \ + -o sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output \ + -s $(echo 
CDPH-SAL__Salmonella__Typhi__MDL-143,CDPH-SAL_Salmonella_Typhi_MDL-144,CDPH-SAL_Salmonella_Typhi_MDL-145,CDPH-SAL_Salmonella_Typhi_MDL-146,CDPH-SAL_Salmonella_Typhi_MDL-147,CDPH-SAL_Salmonella_Typhi_MDL-148,CDPH-SAL_Salmonella_Typhi_MDL-149,CDPH-SAL_Salmonella_Typhi_MDL-150,CDPH-SAL_Salmonella_Typhi_MDL-151,CDPH-SAL_Salmonella_Typhi_MDL-152,CDPH-SAL_Salmonella_Typhi_MDL-153,CDPH-SAL_Salmonella_Typhi_MDL-154,CDPH-SAL_Salmonella_Typhi_MDL-155,CDPH-SAL_Salmonella_Typhi_MDL-156,CDPH-SAL_Salmonella_Typhi_MDL-157,CDPH-SAL_Salmonella_Typhi_MDL-158,CDPH-SAL_Salmonella_Typhi_MDL-159,CDPH-SAL_Salmonella_Typhi_MDL-160,CDPH-SAL_Salmonella_Typhi_MDL-161,CDPH-SAL_Salmonella_Typhi_MDL-162,CDPH-SAL_Salmonella_Typhi_MDL-163,CDPH-SAL_Salmonella_Typhi_MDL-164,CDPH-SAL_Salmonella_Typhi_MDL-165,CDPH-SAL_Salmonella_Typhi_MDL-166,CDPH-SAL_Salmonella_Typhi_MDL-167,CDPH-SAL_Salmonella_Typhi_MDL-168,P21_E_coli_ELI344,P21_E_coli_ELI345,P21_E_coli_ELI347,P21_E_coli_ELI348,P21_E_coli_ELI349,P21_E_coli_ELI350,P21_E_coli_ELI351,P21_E_coli_ELI352,P21_E_coli_ELI353,P21_E_coli_ELI354,P21_E_coli_ELI355,P21_E_coli_ELI357,P21_E_coli_ELI358,P21_E_coli_ELI359,P21_E_coli_ELI361,P21_E_coli_ELI362,P21_E_coli_ELI363,P21_E_coli_ELI364,P21_E_coli_ELI365,P21_E_coli_ELI366,P21_E_coli_ELI367,P21_E_coli_ELI368,P21_E_coli_ELI369,stALE_E_coli_A1_F21_I1_R1,stALE_E_coli_A2_F21_I1_R1,stALE_E_coli_A3_F18_I1_R1,stALE_E_coli_A3_F40_I1_R1,stALE_E_coli_A4_F21_I1_R1,stALE_E_coli_A4_F21_I1_R2,stALE_E_coli_A4_F42_I1_R1,stALE_E_coli_A5_F21_I1_R1,stALE_E_coli_A5_F42_I1_R1,stALE_E_coli_A6_F21_I1_R1,stALE_E_coli_A6_F43_I1_R1,stALE_E_coli_A7_F21_I1_R1,stALE_E_coli_A7_F42_I1_R1,stALE_E_coli_A8_F20_I1_R1,stALE_E_coli_A8_F42_I1_R1,stALE_E_coli_A9_F21_I1_R1,stALE_E_coli_A9_F44_I1_R1,stALE_E_coli_A10_F21_I1_R1,stALE_E_coli_A10_F43_I1_R1,stALE_E_coli_A10_F131_I1_R1,stALE_E_coli_A11_F21_I1_R1,stALE_E_coli_A11_F43_I1_R1,stALE_E_coli_A11_F119_I1_R1,stALE_E_coli_A12_F21_I1_R1,stALE_E_coli_A12_F43_I1_R1,stALE_E_coli_A12_F136_I1_R1,stALE_E_coli_A13_F20_I1_R1,stALE_E_coli_A13_F42_I1_R1,stALE_E_coli_A13_F121_I1_R1,stALE_E_coli_A14_F20_I1_R1,stALE_E_coli_A14_F42_I1_R1,stALE_E_coli_A14_F133_I1_R1,stALE_E_coli_A15_F21_I1_R1,stALE_E_coli_A15_F42_I1_R1,stALE_E_coli_A15_F117_I1_R1,stALE_E_coli_A16_F20_I1_R1,stALE_E_coli_A16_F42_I1_R1,stALE_E_coli_A16_F134_I1_R1,stALE_E_coli_A17_F21_I1_R1,stALE_E_coli_A17_F118_I1_R1,stALE_E_coli_A18_F18_I1_R1,stALE_E_coli_A18_F39_I1_R1,stALE_E_coli_A18_F130_I1_R1,3A,4A,BLANK_40_12G,BLANK_40_12H,Pputida_JBEI__HGL_Pputida_107_BP6,Pputida_JBEI__HGL_Pputida_108_BP7,Pputida_JBEI__HGL_Pputida_109_BP8,Pputida_JBEI__HGL_Pputida_110_M2,Pputida_JBEI__HGL_Pputida_111_M5,Pputida_TALE__HGL_Pputida_112,Pputida_TALE__HGL_Pputida_113,Pputida_TALE__HGL_Pputida_114,Pputida_TALE__HGL_Pputida_115,Pputida_TALE__HGL_Pputida_116,Pputida_TALE__HGL_Pputida_117,Pputida_TALE__HGL_Pputida_118,Pputida_TALE__HGL_Pputida_119,Pputida_TALE__HGL_Pputida_120,Pputida_TALE__HGL_Pputida_121,Pputida_TALE__HGL_Pputida_122,Pputida_TALE__HGL_Pputida_123,Pputida_TALE__HGL_Pputida_124,Pputida_TALE__HGL_Pputida_125,Pputida_TALE__HGL_Pputida_126,Pputida_TALE__HGL_Pputida_127,Pputida_TALE__HGL_Pputida_128,Pputida_TALE__HGL_Pputida_129,Pputida_TALE__HGL_Pputida_130,Pputida_TALE__HGL_Pputida_131,Pputida_TALE__HGL_Pputida_132,Pputida_TALE__HGL_Pputida_133,Pputida_TALE__HGL_Pputida_134,Pputida_TALE__HGL_Pputida_135,Pputida_TALE__HGL_Pputida_136,Pputida_TALE__HGL_Pputida_137,Pputida_TALE__HGL_Pputida_138,Pputida_TALE__HGL_Pputida_139,Pputida_TALE__HGL_Pputida_140,Pputida_TALE__HGL_Pputida_141,P
putida_TALE__HGL_Pputida_142,Pputida_TALE__HGL_Pputida_143,Pputida_TALE__HGL_Pputida_144,Pputida_PALE__HGL_Pputida_145,Pputida_PALE__HGL_Pputida_146,Pputida_PALE__HGL_Pputida_147,Pputida_PALE__HGL_Pputida_148,Pputida_PALE__HGL_Pputida_149,Pputida_PALE__HGL_Pputida_150,Pputida_PALE__HGL_Pputida_151,Pputida_PALE__HGL_Pputida_152,Pputida_PALE__HGL_Pputida_153,Pputida_PALE__HGL_Pputida_154,Pputida_PALE__HGL_Pputida_155,Pputida_PALE__HGL_Pputida_156,Pputida_PALE__HGL_Pputida_157,Pputida_PALE__HGL_Pputida_158,Pputida_PALE__HGL_Pputida_159,Pputida_PALE__HGL_Pputida_160,Pputida_PALE__HGL_Pputida_161,Pputida_PALE__HGL_Pputida_162,Pputida_PALE__HGL_Pputida_163,Pputida_PALE__HGL_Pputida_164,Pputida_PALE__HGL_Pputida_165,Pputida_PALE__HGL_Pputida_166,Pputida_PALE__HGL_Pputida_167,Pputida_PALE__HGL_Pputida_168,Pputida_PALE__HGL_Pputida_169,Pputida_PALE__HGL_Pputida_170,Pputida_PALE__HGL_Pputida_171,Pputida_PALE__HGL_Pputida_172,Pputida_PALE__HGL_Pputida_173,Pputida_PALE__HGL_Pputida_174,Pputida_PALE__HGL_Pputida_175,Pputida_PALE__HGL_Pputida_176,JM-Metabolic__GN0_2005,JM-Metabolic__GN0_2007,JM-Metabolic__GN0_2009,JM-Metabolic__GN0_2094,JM-Metabolic__GN0_2099,JM-Metabolic__GN0_2148,JM-Metabolic__GN0_2165,JM-Metabolic__GN0_2169,JM-Metabolic__GN0_2172,JM-Metabolic__GN0_2175,JM-Metabolic__GN0_2183,JM-Metabolic__GN0_2215,JM-Metabolic__GN0_2254,JM-Metabolic__GN0_2277,JM-Metabolic__GN0_2290,JM-Metabolic__GN0_2337,JM-Metabolic__GN0_2317,JM-Metabolic__GN0_2354,JM-Metabolic__GN0_2375,JM-Metabolic__GN0_2380,JM-Metabolic__GN0_2393,JM-Metabolic__GN0_2404,5B,6A,BLANK_41_12G,BLANK_41_12H,Deoxyribose_PALE_ALE__MG1655_BOP27_4_14,Deoxyribose_PALE_ALE__MG1655_BOP27_4_23,Deoxyribose_PALE_ALE__MG1655_BOP27_4_48,Deoxyribose_PALE_ALE__MG1655_BOP27_6_21,Deoxyribose_PALE_ALE__MG1655_BOP27_6_35,Deoxyribose_PALE_ALE__MG1655_BOP27_10_13,Deoxyribose_PALE_ALE__MG1655_BOP27_10_28,Deoxyribose_PALE_ALE__MG1655_BOP27_10_51,Deoxyribose_PALE_ALE__MG1655_Lib4_18_19,Deoxyribose_PALE_ALE__MG1655_Lib4_18_59,Deoxyribose_PALE_ALE__MG1655_Lib4_18_35,Deoxyribose_PALE_ALE__MG1655_Lib4_20_16,Deoxyribose_PALE_ALE__MG1655_Lib4_20_43,Deoxyribose_PALE_ALE__MG1655_Lib4_20_71,Deoxyribose_PALE_ALE__MG1655_Lib4_22_16,Deoxyribose_PALE_ALE__MG1655_Lib4_22_28,Deoxyribose_PALE_ALE__MG1655_Lib4_22_52,Deoxyribose_PALE_ALE__MG1655_Lib4_24_9,Deoxyribose_PALE_ALE__MG1655_Lib4_24_24,Deoxyribose_PALE_ALE__MG1655_Lib4_24_52,Deoxyribose_PALE_ALE__MG1655_Lib4_26_6,Deoxyribose_PALE_ALE__MG1655_Lib4_26_27,Deoxyribose_PALE_ALE__MG1655_Lib4_26_69,Deoxyribose_PALE_ALE__MG1655_Lib4_28_13,Deoxyribose_PALE_ALE__MG1655_Lib4_28_28,Deoxyribose_PALE_ALE__MG1655_Lib4_28_53,Deoxyribose_PALE_ALE__MG1655_Lib4_30_7,Deoxyribose_PALE_ALE__MG1655_Lib4_30_22,Deoxyribose_PALE_ALE__MG1655_Lib4_30_60,Deoxyribose_PALE_ALE__MG1655_Lib4_32_6,Deoxyribose_PALE_ALE__MG1655_Lib4_32_20,Deoxyribose_PALE_ALE__MG1655_Lib4_32_56,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_24,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_57,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_69,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_50,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_61,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_22,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_36,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_46,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_41,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_51,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5
075_WT_17_58,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_64,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_55,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_63,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_46,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_51,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_49,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_57,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_24,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_42,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_62,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_21,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_41,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_50,JM-Metabolic__GN02514,JM-Metabolic__GN02529,JM-Metabolic__GN02531,JM-Metabolic__GN02567,JM-Metabolic__GN02590,JM-Metabolic__GN02657,JM-Metabolic__GN02748,JM-Metabolic__GN02766,JM-Metabolic__GN02769,JM-Metabolic__GN02787,JM-Metabolic__GN03132,JM-Metabolic__GN03218,JM-Metabolic__GN03252,JM-Metabolic__GN03409,JM-Metabolic__GN04014,JM-Metabolic__GN04094,JM-Metabolic__GN04255,JM-Metabolic__GN04306,JM-Metabolic__GN04428,JM-Metabolic__GN04488,JM-Metabolic__GN04540,JM-Metabolic__GN04563,JM-Metabolic__GN04612,JM-Metabolic__GN04665,JM-Metabolic__GN04682,JM-Metabolic__GN05002,JM-Metabolic__GN05109,JM-Metabolic__GN05128,JM-Metabolic__GN05367,JM-Metabolic__GN05377,7A,8A,BLANK_42_12G,BLANK_42_12H,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0326,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0327,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0328,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0329,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0330,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0352,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0353,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0354,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0355,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0356,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0357,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0364,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0366,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0367,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0368,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0369,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0370,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0371,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0372,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0373,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0374,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0375,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0376,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0377,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0378,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0380,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0381,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0382,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0383,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0384,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0385,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0386,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0387,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0388,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0389,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0390,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0391,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0392,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0393,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0394,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0395,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0396,JM-MEC__Staphyloco
ccus_aureusstrain_BERTI-B0397,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0398,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0399,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0400,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0401,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0402,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0403,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0404,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0405,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0406,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0407,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0408,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0409,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0417,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0418,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0419,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0420,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0421,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0473,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0474,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0483,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0484,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0485,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0486,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0516,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0517,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0518,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0519,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0520,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0521,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0522,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0523,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0524,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0525,JM-MEC__Staphylococcus_aureusstrain_BERTI-R08624,JM-MEC__Staphylococcus_aureusstrain_BERTI-R08704,JM-MEC__Staphylococcus_aureusstrain_BERTI-R10727,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11044,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11078,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11101,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11102,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11103,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11135,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11153,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11154,JM-Metabolic__GN02424,JM-Metabolic__GN02446,JM-Metabolic__GN02449,JM-Metabolic__GN02487,JM-Metabolic__GN02501,ISB,GFR,BLANK_43_12G,BLANK_43_12H,RMA_KHP_rpoS_Mage_Q97D,RMA_KHP_rpoS_Mage_Q97L,RMA_KHP_rpoS_Mage_Q97N,RMA_KHP_rpoS_Mage_Q97E,JBI_KHP_HGL_021,JBI_KHP_HGL_022,JBI_KHP_HGL_023,JBI_KHP_HGL_024,JBI_KHP_HGL_025,JBI_KHP_HGL_026,JBI_KHP_HGL_027,JBI_KHP_HGL_028_Amitesh_soxR,JBI_KHP_HGL_029_Amitesh_oxyR,JBI_KHP_HGL_030_Amitesh_soxR_oxyR,JBI_KHP_HGL_031_Amitesh_rpoS,BLANK1_1A,BLANK1_1B,BLANK1_1C,BLANK1_1D,BLANK1_1E,BLANK1_1F,BLANK1_1G,BLANK1_1H,AP581451B02,EP256645B01,EP112567B02,EP337425B01,LP127890A01,EP159692B04,EP987683A01,AP959450A03,SP464350A04,C9,ep256643b01,EP121011B01,AP616837B04,SP506933A04,EP159695B01,EP256644B01,SP511289A02,EP305735B04,SP415030A01,AP549681B02,AP549678B01,EP260544B04,EP202452B01,EP282276B04,SP531696A04,SP515443A04,SP515763A04,EP184255B04,SP503615A02,EP260543B04,EP768748A04,AP309872B03,AP568785B04,EP721390A04,EP940013A01,EP291979B04,EP182065B04,EP128904B02,EP915769A04,SP464352A03,SP365864A04,SP511294A04,EP061002B01,SP410793A01,SP232077A04,EP128910B01,AP531397B04,EP043583B01,EP230245B01,EP606652B04,EP207041B01,EP727972A04,EP291980B04,EP087938B02,SP471496A04,SP573823A04,EP393718B01,SP612496A01,EP032410B02,EP073216B01,EP410046B01,SP561451A04,EP320438B01,SP612495A04,EP446604B03,EP446602B01,EP182243B02,EP333541B04,EP238034B01,AP298002B02,EP455759B04,EP207042B0
4,LP128479A01,LP128476A01,EP316863B03,C20,lp127896a01,SP491907A02,EP182060B03,EP422407B01,SP573859A04,SP584547A02,EP182346B04,AP668631B04,EP451428B04,LP128538A01,SP490298A02,SP573860A01,EP032412B02,EP163771B01,LP169879A01,EP729433A02,EP447940B04,SP584551A08,EP216516B04,EP023808B02,BLANK2_2A,BLANK2_2B,BLANK2_2C,BLANK2_2D,BLANK2_2E,BLANK2_2F,BLANK2_2G,BLANK2_2H,SP573843A04,EP683835A01,SP573824A04,SP335002A04,SP478193A02,SP232311A04,SP415021A02,SP231630A02,SP641029A02,SP232310A04,EP617442B01,EP587478B04,EP447928B04,EP587475B04,EP675042B01,EP554513B02,EP702221B04,AP568787B02,EP054632B01,EP121013B01,EP649418A02,EP573313B01,LP154981A01,AP470859B01,LP154986A01,AP732307B04,EP533426B03,EP587476B04,AP696363B02,EP587477B04,SP683466A02,EP554518B04,EP533429B04,EP431570B01,EP202095B04,EP504030B04,EP207036B01,EP393717B01,SP491898A02,EP484973B04,EP479794B02,EP554515B04,SP631994A04,EP921593A04,AP787247B04,EP090129B04,EP447975B02,EP212214B01,EP410042B01,SP404409A02,SP247340A04,AP029018B01,EP872341A01,AP062219B03,EP790020A02,EP808112A04,SP404403A02,EP073160B01,EP012991B03,SP317297A02,EP656055A04,EP649623A01,EP790019A01,SP257519A04,EP808104A01,EP808106A01,SP231629A02,EP675044A01,EP657260A01,EP808110A04,AP032413B04,EP843906A04,AP173305B04,SP231628A02,AP173301B04,SP404405A02,EP649653A04,EP718687A04,AP905750A02,EP738468A01,C6,EP890157A02,SP353893A02,EP944059A02,EP970005A01,EP927461A04,EP808111A03,EP927459A04,SP317293A02,SP235186A04,SP399724A04,EP738469A01,SP284095A03,C5,EP337325B04,EP759450A04,BLANK3_3A,BLANK3_3B,BLANK3_3C,BLANK3_3D,BLANK3_3E,BLANK3_3F,BLANK3_3G,BLANK3_3H,AP006367B02,EP929277A02,AP324642B04,EP786631A04,EP657385A04,SP235189A01,EP448041B04,SP231631A02,SP280481A02,AP032412B04,EP649737A03,AP967057A04,EP876243A04,SP229387A04,EP667743A04,SP246941A01,AP745799A04,SP205732A02,SP230382A04,SP230380A02,SP230381A01,SP205754A01,EP606662B04,AP780167B02,EP447927B04,C18,LP191039A01,EP606663B04,EP573296B01,EP447926B04,LP127767A01,EP479266B04,LP128543A01,EP479270B03,EP921594A04,EP554501B04,EP542577B04,EP487995B04,EP542578B04,EP573310B01,EP244366B01,EP533389B03,EP244360B01,AP911328B01,AP481403B02,22_001_801_552_503_00,EP372981B04,EP447929B04,SP573849A04,SP577399A02,EP606656B03,LP166715A01,AP668628B04,C14,EP446610B02,EP339061B02,SP681591A04,EP393712B02,EP410041B01,SP453872A01,22_001_710_503_791_00,LP128540A01,EP339053B02,EP617443B01,EP190307B01,AP795068B04,LP128541A01,EP584756B04,SP284096A02,EP431562B04,EP685640B01,EP339059B02,EP431575B01,EP379938B01,EP529635B02,EP554506B04,EP455757B04,SP491900A02,LP196272A01,SP704319A04,EP617441B01,AP687591B04,SP640978A02,EP981129A02,EP455763B04,EP339057B02,SP491897A02,EP980752B04,LP128539A01,EP996831B04,EP273332B04,EP483291B04,EP393715B01,EP617440B01,EP729434A01,SP645141A03,BLANK4_4A,BLANK4_4B,BLANK4_4C,BLANK4_4D,BLANK4_4E,BLANK4_4F,BLANK4_4G,BLANK4_4H,SP232114A04,EP393714B01,EP533388B01,EP724905B01,EP282108B01,EP282107B01,EP001625B01,EP073209B02,SP232079A01,EP772145A02,AP771472A04,AP223470B01,SP404412A02,EP772143A02,SP408629A01,EP749735A07,EP846485A01,EP808109A01,SP416130A04,EP882752A01,AP953594A02,AP046324B02,AP891020A04,EP790023A01,EP657386A01,EP805337A01,EP927458A04,AP173299B04,EP768164A02,EP886422A01,AP103463B01,AP744361A02,AP065292B01,SP257517A04,EP790021A04,EP675075A04,SP388683A02,SP232309A01,EP899038A04,EP636802A01,AP046327B02,EP905975A04,SP410796A02,EP784608A01,EP808105A01,SP331134A04,EP718688A01,SP232270A02,EP970001A01,EP001624B01,EP868682A01,EP927462A02,C3,EP890158A02,EP023801B04,EP400447B04,EP385379B01,EP385387B01,EP385384B01,SP754514A04,SP415025A01,SP415023A02,EP400448
B04,EP479894B04 | tr -d '"') \ + -g $(echo NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NO
NE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} \ + -l s_1 + +if [[ -d sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/Full ]]; then + echo "Run appears successful" +elif [[ -d sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/1_demult/Full ]]; then + echo "Run appears unsuccessful but has output" + exit 1 +else + echo "Run appears unsuccessful" + exit 1 +fi \ No newline at end of file From 33726511939b8f7281df46a06829232bff0c7134 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 9 Oct 2024 14:03:20 -0700 Subject: [PATCH 19/47] Added optional parameter to Pipeline() class. Added optional parameter to Pipeline() class that overwrites the values in the lane column of a sample-sheet's data section. This functionality used to reside in the qp-klp plugin and is a common usage pattern. This allows SPP to override the value in a sample-sheet's lane column with the value provided by the user at submission time. --- sequence_processing_pipeline/Pipeline.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 3dd19371..fa5f5c83 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -134,7 +134,7 @@ class Pipeline: assay_types = [AMPLICON_ATYPE, METAGENOMIC_ATYPE, METATRANSCRIPTOMIC_ATYPE] def __init__(self, configuration_file_path, run_id, input_file_path, - output_path, qiita_job_id, pipeline_type): + output_path, qiita_job_id, pipeline_type, lane_number=None): """ Initialize Pipeline object w/configuration information. :param configuration_file_path: Path to configuration.json file. @@ -143,6 +143,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, :param output_path: Path where all pipeline-generated files live. :param qiita_job_id: Qiita Job ID creating this Pipeline. :param pipeline_type: Pipeline type ('Amplicon', 'Metagenomic', etc.) + :param lane_number: (Optional) overwrite lane_number in input_file. """ if input_file_path is None: raise PipelineError("user_input_file_path cannot be None") @@ -249,11 +250,29 @@ def __init__(self, configuration_file_path, run_id, input_file_path, output_fp = join(output_path, 'dummy_sample_sheet.csv') self.generate_dummy_sample_sheet(self.run_dir, output_fp) self.sample_sheet = output_fp + + # Optional lane_number parameter is ignored for Amplicon + # runs, as the only valid value is 1. else: # assume user_input_file_path references a sample-sheet. self.sample_sheet = self._validate_sample_sheet(input_file_path) self.mapping_file = None + if lane_number is not None: + # confirm that the lane_number is a reasonable value. + lane_number = int(lane_number) + if lane_number < 1 or lane_number > 8: + raise ValueError(f"'{lane_number}' is not a valid name" + " number") + + # create/overwrite the value for Lane. + for sample in self.sample_sheet.Samples: + sample.Lane = lane_number + + # overwrite the original file. 
+ with open(input_file_path, 'w') as f: + self.sample_sheet.write(f) + self._configure_profile() def get_software_configuration(self, software): From d883b7babe2c5cd1762624ae0b0410733f00ee3a Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 9 Oct 2024 15:07:51 -0700 Subject: [PATCH 20/47] bugfix --- sequence_processing_pipeline/Pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index fa5f5c83..977aed0e 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -266,7 +266,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, " number") # create/overwrite the value for Lane. - for sample in self.sample_sheet.Samples: + for sample in self.sample_sheet.samples: sample.Lane = lane_number # overwrite the original file. From a075cd9df65dacbe2112a285a448fba677a8f74a Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 13 Oct 2024 22:30:02 -0700 Subject: [PATCH 21/47] Fixes error Fixes error found when post-processing adapter-trimmed fastq files. All files were being moved into one of the project sub-folders, rather than into their associated folders. This appears to be due to recent implementation change. All files are now moved into their correct folder. --- sequence_processing_pipeline/NuQCJob.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index 0ffacb1a..07261e39 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -1,6 +1,6 @@ from metapool import load_sample_sheet from os import stat, makedirs, rename -from os.path import join, basename, dirname, exists, abspath +from os.path import join, basename, dirname, exists, abspath, split from sequence_processing_pipeline.Job import Job, KISSLoader from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) @@ -104,6 +104,7 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.minimum_bytes = 3100 self.fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}' r'\.fastq\.gz$') + self.interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.interleave\.fastq\.gz$') self.html_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.html$') self.json_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.json$') @@ -170,7 +171,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): substr = regex.search(file_name) if substr is None: raise ValueError(f"{file_name} does not follow naming " - " pattern.") + "pattern.") else: # check if found substring is a member of this # project. Note sample-name != sample-id @@ -190,8 +191,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): for fp in files_to_move: move(fp, dst) - @staticmethod - def _move_trimmed_files(project_name, output_path): + def _move_trimmed_files(self, project_name, output_path): ''' Given output_path, move all fastqs to a new subdir named project_name. :param project_name: The name of the new folder to be created. @@ -205,8 +205,15 @@ def _move_trimmed_files(project_name, output_path): # this directory shouldn't already exist. 
makedirs(join(output_path, project_name), exist_ok=False) + sample_ids = [x[0] for x in self.sample_ids if x[1] == project_name] + for trimmed_file in list(glob.glob(pattern)): - move(trimmed_file, join(output_path, project_name)) + file_name = split(trimmed_file)[1] + substr = self.interleave_fastq_regex.search(file_name) + if substr is not None: + # only move the sample_ids in this project. + if substr[1] in sample_ids: + move(trimmed_file, join(output_path, project_name)) else: raise ValueError(f"'{output_path}' does not exist") @@ -258,7 +265,6 @@ def run(self, callback=None): for project in self.project_data: project_name = project['Sample_Project'] needs_human_filtering = project['HumanFiltering'] - source_dir = join(self.output_path, project_name) pattern = f"{source_dir}/*.fastq.gz" completed_files = list(glob.glob(pattern)) @@ -270,7 +276,7 @@ def run(self, callback=None): 'only-adapter-filtered') if exists(trimmed_only_path): - NuQCJob._move_trimmed_files(project_name, trimmed_only_path) + self._move_trimmed_files(project_name, trimmed_only_path) if needs_human_filtering is True: filtered_directory = join(source_dir, 'filtered_sequences') From 62734d8802e6b0fddc52de13ce6236efa11e7a3f Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 16 Oct 2024 20:51:14 -0700 Subject: [PATCH 22/47] Rewrote test --- sequence_processing_pipeline/NuQCJob.py | 7 +- .../tests/test_NuQCJob.py | 135 ++++++++++-------- setup.py | 6 +- 3 files changed, 86 insertions(+), 62 deletions(-) diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index 1f17a46f..89b3106a 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -108,7 +108,9 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.minimum_bytes = 3100 self.fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}' r'\.fastq\.gz$') - self.interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.interleave\.fastq\.gz$') + self.interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d' + r'_\d{3}\.interleave\.fastq' + r'\.gz$') self.html_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.html$') self.json_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.json$') @@ -209,7 +211,8 @@ def _move_trimmed_files(self, project_name, output_path): # this directory shouldn't already exist. makedirs(join(output_path, project_name), exist_ok=False) - sample_ids = [x[0] for x in self.sample_ids if x[1] == project_name] + sample_ids = [x[0] for x in self.sample_ids + if x[1] == project_name] for trimmed_file in list(glob.glob(pattern)): file_name = split(trimmed_file)[1] diff --git a/sequence_processing_pipeline/tests/test_NuQCJob.py b/sequence_processing_pipeline/tests/test_NuQCJob.py index a7e04ec6..177c8162 100644 --- a/sequence_processing_pipeline/tests/test_NuQCJob.py +++ b/sequence_processing_pipeline/tests/test_NuQCJob.py @@ -9,7 +9,7 @@ ) from os import makedirs, remove from metapool import load_sample_sheet -import glob +from os import walk class TestNuQCJob(unittest.TestCase): @@ -2166,10 +2166,56 @@ def test_generate_mmi_filter_cmds_w_annotate_fastq(self): self.assertEqual(obs, exp) def test_move_trimmed(self): - # Note: this test does not make use of the output_dir that other - # tests use. + # create a NuQCJob() object, but do not call run(). + # instead we will manually create some files to test with. 
+ double_db_paths = ["db_path/mmi_1.db", "db_path/mmi_2.db"] + job = NuQCJob( + self.fastq_root_path, + self.output_path, + self.good_sample_sheet_path, + double_db_paths, + "queue_name", + 1, + 1440, + "8", + "fastp", + "minimap2", + "samtools", + [], + self.qiita_job_id, + 1000, + "", + self.movi_path, + self.gres_value, + self.pmls_path, + ['BX'] + ) - for dummy_fp in SAMPLE_DIR: + sample_dir = [ + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/NPH_15288/fastp_reports_dir/html/EP890158A02_S58_L001_" + "R1_001.html", + "NuQCJob/NPH_15288/fastp_reports_dir/json/EP023801B04_S27_L001_" + "R1_001.json", + "NuQCJob/process_all_fastq_files.sh", + "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981." + "completed", + "NuQCJob/logs/slurm-1897981_1.out", + "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R2_001.interleave.fastq.gz', + ] + + for dummy_fp in sample_dir: dummy_fp = self.path(dummy_fp) dummy_path = dirname(dummy_fp) makedirs(dummy_path, exist_ok=True) @@ -2178,38 +2224,33 @@ def test_move_trimmed(self): trimmed_only_path = self.path("NuQCJob", "only-adapter-filtered") - NuQCJob._move_trimmed_files("NPH_15288", trimmed_only_path) - - new_path = join(trimmed_only_path, "NPH_15288") - pattern = f"{new_path}/*.fastq.gz" - - exp = [ - ( - "only-adapter-filtered/NPH_15288/359180345_S58_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R2_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R2_001." - "fastq.gz" - ), - ] - - for trimmed_file in list(glob.glob(pattern)): - trimmed_file = trimmed_file.split("NuQCJob/")[-1] - if trimmed_file not in exp: - self.assertIn(trimmed_file, exp) + # test _move_trimmed_files() by verifying that only the interleave + # fastq files from the NYU project are moved. + job._move_trimmed_files("NYU_BMS_Melanoma_13059", trimmed_only_path) + + new_path = join(trimmed_only_path, "NYU_BMS_Melanoma_13059") + + exp = { + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R2_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R2_001.interleave.fastq.gz' + } + + obs = [] + for root, dirs, files in walk(new_path): + for some_file in files: + some_path = join(root, some_file) + some_path = some_path.replace(self.path(""), "") + obs.append(some_path) + + # confirm that only the samples in NYU_BMS_Melanoma_13059 were + # moved. 
+ self.assertEqual(set(obs), exp) def _helper(self, regex, good_names, bad_names): for good_name in good_names: @@ -2221,27 +2262,5 @@ def _helper(self, regex, good_names, bad_names): self.assertIsNone(substr, msg=f"Regex failed on {bad_name}") -SAMPLE_DIR = [ - "NuQCJob/only-adapter-filtered/359180345_S58_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R2_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R2_001.fastq.gz", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180354_S22_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180338_S51_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180345_S58_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180337_S27_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180353_S17_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180353_S17_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180337_S27_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180345_S58_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180338_S51_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180354_S22_L001_R1_001.json", - "NuQCJob/process_all_fastq_files.sh", - "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981.completed", - "NuQCJob/logs/slurm-1897981_1.out", - "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", -] - if __name__ == "__main__": unittest.main() diff --git a/setup.py b/setup.py index 99103fbb..e0e94196 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,10 @@ install_requires=[ 'click', 'requests', 'pandas', 'flake8', 'nose', 'coverage', 'pgzip', 'jinja2', - 'metapool @ https://github.com/biocore/' - 'metagenomics_pooling_notebook/archive/master.zip' + # 'metapool @ https://github.com/biocore/' + # 'metagenomics_pooling_notebook/archive/master.zip' + 'metapool @ https://codeload.github.com/charles-cowart/metagenomics' + '_pooling_notebook/zip/refs/heads/fake_tellread' ], entry_points={ 'console_scripts': ['demux=sequence_processing_pipeline.scripts.cli' From 45131f114f748e1494f11ba5e6723b5a914e33d3 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sat, 2 Nov 2024 13:13:12 -0700 Subject: [PATCH 23/47] Updated branch to use new DFSheet() functionality --- sequence_processing_pipeline/Pipeline.py | 11 ++++------- setup.py | 4 +++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 977aed0e..86b01cdf 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -15,6 +15,7 @@ from datetime import datetime from xml.etree import ElementTree as ET from metapool.prep import PREP_MF_COLUMNS +from metapool import set_lane_number_in_sheet logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) @@ -265,13 +266,9 @@ def __init__(self, configuration_file_path, run_id, input_file_path, raise ValueError(f"'{lane_number}' is not a valid name" " number") - # create/overwrite the value for Lane. - for sample in self.sample_sheet.samples: - sample.Lane = lane_number - - # overwrite the original file. - with open(input_file_path, 'w') as f: - self.sample_sheet.write(f) + # overwrite sample-sheet w/DFSheets processed version + # with overwritten Lane number. 
+ set_lane_number_in_sheet(input_file_path, lane_number) self._configure_profile() diff --git a/setup.py b/setup.py index e0e94196..e7894aab 100644 --- a/setup.py +++ b/setup.py @@ -45,8 +45,10 @@ 'pgzip', 'jinja2', # 'metapool @ https://github.com/biocore/' # 'metagenomics_pooling_notebook/archive/master.zip' + # sample_sheet_update branch contains all of the changes in the + # fake_tellread branch + DFSheet. 'metapool @ https://codeload.github.com/charles-cowart/metagenomics' - '_pooling_notebook/zip/refs/heads/fake_tellread' + '_pooling_notebook/zip/refs/heads/sample_sheet_update' ], entry_points={ 'console_scripts': ['demux=sequence_processing_pipeline.scripts.cli' From 4665ee8d89744af16c0c2b8ccb94f85ba38bca90 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 3 Nov 2024 22:57:54 -0800 Subject: [PATCH 24/47] Updated to recent changes in metapool --- sequence_processing_pipeline/Pipeline.py | 8 ++--- sequence_processing_pipeline/TellReadJob.py | 6 ++-- .../tests/test_TellReadJob.py | 30 +------------------ 3 files changed, 9 insertions(+), 35 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 86b01cdf..04d96f0a 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -255,10 +255,6 @@ def __init__(self, configuration_file_path, run_id, input_file_path, # Optional lane_number parameter is ignored for Amplicon # runs, as the only valid value is 1. else: - # assume user_input_file_path references a sample-sheet. - self.sample_sheet = self._validate_sample_sheet(input_file_path) - self.mapping_file = None - if lane_number is not None: # confirm that the lane_number is a reasonable value. lane_number = int(lane_number) @@ -270,6 +266,10 @@ def __init__(self, configuration_file_path, run_id, input_file_path, # with overwritten Lane number. set_lane_number_in_sheet(input_file_path, lane_number) + # assume user_input_file_path references a sample-sheet. + self.sample_sheet = self._validate_sample_sheet(input_file_path) + self.mapping_file = None + self._configure_profile() def get_software_configuration(self, software): diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 2f7905d5..ad01ef8f 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -16,8 +16,7 @@ class TellReadJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, qiita_job_id, label, reference_base, - reference_map, tmp1_path, sing_script_path, lane, - cores_per_task): + reference_map, tmp1_path, sing_script_path, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. @@ -61,6 +60,9 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.sing_script_path = sing_script_path self.tmp1_path = tmp1_path + sheet = load_sample_sheet(self.sample_sheet_path) + lane = sheet.samples[0].Lane + # force self.lane_number to be int. raise an Error if it's not. 
tmp = int(lane) if tmp < 1 or tmp > 8: diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py index 6cc12632..801947e8 100644 --- a/sequence_processing_pipeline/tests/test_TellReadJob.py +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -43,41 +43,13 @@ def setUp(self): self.cores_per_task = "4" def test_creation(self): - # confirm only sensible lane numbers are allowed. - with self.assertRaisesRegex(ValueError, - "'-1' is not a valid lane number"): - TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, - self.queue_name, self.node_count, self.wall_time_limit, - self.jmem, self.modules_to_load, self.qiita_job_id, - self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, -1, - self.cores_per_task) - - with self.assertRaisesRegex(ValueError, - "'0' is not a valid lane number"): - TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, - self.queue_name, self.node_count, self.wall_time_limit, - self.jmem, self.modules_to_load, self.qiita_job_id, - self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, 0, - self.cores_per_task) - - with self.assertRaisesRegex(ValueError, - "'9' is not a valid lane number"): - TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, - self.queue_name, self.node_count, self.wall_time_limit, - self.jmem, self.modules_to_load, self.qiita_job_id, - self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, 9, - self.cores_per_task) - # test basic good-path job = TellReadJob(self.run_dir, self.output_path, self.sample_sheet_path, self.queue_name, self.node_count, self.wall_time_limit, self.jmem, self.modules_to_load, self.qiita_job_id, self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, self.lane, + self.tmp1_path, self.sing_script_path, self.cores_per_task) job._generate_job_script() From 3542df37bd5970d1e6abc1f10eee3e568b3323bc Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 6 Nov 2024 18:11:58 -0800 Subject: [PATCH 25/47] Update from testing --- sequence_processing_pipeline/TellReadJob.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index ad01ef8f..e36888db 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -16,7 +16,7 @@ class TellReadJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, qiita_job_id, label, reference_base, - reference_map, tmp1_path, sing_script_path, cores_per_task): + reference_map, sing_script_path, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. 
@@ -58,7 +58,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.qiita_job_id = qiita_job_id self.jinja_env = Environment(loader=KISSLoader('templates')) self.sing_script_path = sing_script_path - self.tmp1_path = tmp1_path sheet = load_sample_sheet(self.sample_sheet_path) lane = sheet.samples[0].Lane @@ -122,7 +121,9 @@ def _process_sample_sheet(self): sample_ids = [] for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + sample_ids.append((sample['Sample_ID'], + sample['Sample_Project'], + sample['barcode_id'])) bioinformatics = sheet.Bioinformatics @@ -143,12 +144,12 @@ def _generate_job_script(self): # generate a comma separated list of sample-ids from the tuples stored # in self.sample_ids. - # NB: the current sample-sheet format used for TellRead doesn't include - # sample-names and sample-ids, only sample_id. e.g. C501,C502,etc. - # Hence, when a final sample sheet format is ready, it may be prudent - # to switch this to pull values from the expected sample-names column - # instead. - samples = ','.join([id[0] for id in self.sample_ids]) + # NB: Proposed sample-sheets will have traditional Sample_ID and + # Sample_Name columns as well as a new value named barcode_id. It's + # this column that will contain the 'C50n' values needed to be + # supplied to tellread. Later we will use this mapping to rename the + # files from C50n...fastq.gz to sample-name...fastq.gz. + samples = ','.join([id[2] for id in self.sample_ids]) # since we haven't included support for reference_map yet, whenever a # reference is not included, the mapping against the list of sample_ids @@ -170,7 +171,7 @@ def _generate_job_script(self): "cores_per_task": self.cores_per_task, "queue_name": self.queue_name, "sing_script_path": self.sing_script_path, - "tmp_dir": self.tmp1_path, + "tmp_dir": join(self.output_path, "output", "tmp1"), "modules_to_load": ' '.join(self.modules_to_load), "lane": f"s_{self.lane_number}", "output": join(self.output_path, "output"), From c2c3b06b3f5babf357530dc1fd67848580f4dbe0 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 13 Nov 2024 18:01:22 -0800 Subject: [PATCH 26/47] Updates to TRIntegrateJob based on testing --- .../TRIntegrateJob.py | 52 +++++-- .../TRNormCountsJob.py | 7 +- sequence_processing_pipeline/TellReadJob.py | 22 +-- .../templates/integrate.sbatch | 130 +++++++----------- .../templates/tellread.sbatch | 17 +-- 5 files changed, 106 insertions(+), 122 deletions(-) diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 25cec68a..3b1e8561 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -6,6 +6,8 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet +from os import makedirs +from shutil import copy logging.basicConfig(level=logging.DEBUG) @@ -14,8 +16,9 @@ class TRIntegrateJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, cores_per_task): + qiita_job_id, max_array_length, integrate_script_path, + sil_path, raw_fastq_dir, reference_base, reference_map, + cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. 
@@ -29,8 +32,8 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id :param max_array_length: None - :param indicies_script_path: None - :param label: None + :param integrate_script_path: None + :param sil_path: A path to a confidential file mapping C5xx, adapters. :param reference_base: None :param reference_map: None :param cores_per_task: # of CPU cores per node to request. @@ -50,7 +53,10 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.node_count = node_count self.wall_time_limit = wall_time_limit self.cores_per_task = cores_per_task - self.indicies_script_path = indicies_script_path + self.integrate_script_path = integrate_script_path + self.sil_path = sil_path + self.raw_fastq_dir = raw_fastq_dir + self.tmp_dir = join(self.output_path, 'tmp') self.reference_base = reference_base self.reference_map = reference_map @@ -60,17 +66,31 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.qiita_job_id = qiita_job_id self.sample_count = len(self.sample_ids) self.jinja_env = Environment(loader=KISSLoader('templates')) - self.label = label + self.job_name = (f"integrate_{self.qiita_job_id}") - if self.reference_base is not None or self.reference_map is not None: - tag = 'reference-based' - else: - tag = 'reference-free' - - self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + with open(self.sil_path, 'r') as f: + # obtain the number of unique barcode_ids as determined by + # TellReadJob() in order to set up an array job of the + # proper length. + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.barcode_id_count = len(lines) def run(self, callback=None): job_script_path = self._generate_job_script() + + # copy sil_path to TRIntegrate working directory and rename to a + # predictable name. + copy(self.sil_path, join(self.output_path, 'sample_index_list.txt')) + + # generate the tailored subset of adapter to barcode_id based on + # the proprietary lists owned by the manufacturer and supplied by + # the caller, and the barcode ids found in the sample-sheet. 
+ self._generate_sample_index_list() + + makedirs(self.tmp_dir) + params = ['--parsable', f'-J {self.job_name}', f'--array 1-{self.sample_count}'] @@ -132,8 +152,14 @@ def _generate_job_script(self): "mem_in_gb": self.jmem, "node_count": self.node_count, "cores_per_task": self.cores_per_task, - "iinp_script_path": self.indicies_script_path, + "integrate_script_path": self.integrate_script_path, "queue_name": self.queue_name, + "barcode_id_count": self.barcode_id_count, + "raw_fastq_dir": self.raw_fastq_dir, + "tmp_dir": self.tmp_dir, "output_dir": self.output_path})) return job_script_path + + def parse_logs(self): + raise PipelineError("parse_logs() not implemented for TRIntegrateJob") diff --git a/sequence_processing_pipeline/TRNormCountsJob.py b/sequence_processing_pipeline/TRNormCountsJob.py index a3603bcd..6887994a 100644 --- a/sequence_processing_pipeline/TRNormCountsJob.py +++ b/sequence_processing_pipeline/TRNormCountsJob.py @@ -62,12 +62,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, self.jinja_env = Environment(loader=KISSLoader('templates')) self.label = label - if self.reference_base is not None or self.reference_map is not None: - tag = 'reference-based' - else: - tag = 'reference-free' - - self.job_name = (f"{self.label}-{tag}-THIS_IS_A_DATE-integrate") + self.job_name = (f"norm_counts_{self.qiita_job_id}") def run(self, callback=None): job_script_path = self._generate_job_script() diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index e36888db..322cca17 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -15,7 +15,7 @@ class TellReadJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, label, reference_base, + qiita_job_id, reference_base, reference_map, sing_script_path, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq @@ -29,7 +29,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param jmem: String representing total memory limit for entire job. :param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id - :param label: None :param reference_base: None :param reference_map: None :param cores_per_task: (Optional) # of CPU cores per node to request. @@ -75,15 +74,13 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, tag = 'reference-free' date = datetime.today().strftime('%Y.%m.%d') - self.job_name = (f"{label}-{tag}-{date}-tellread") + self.job_name = (f"{self.qiita_job_id}-{tag}-{date}-tellread") def run(self, callback=None): job_script_path = self._generate_job_script() - params = ['--parsable', - f'-J {self.job_name}', - '-c ${sbatch_cores}', - '--mem ${sbatch_mem}', - '--time ${wall}'] + + # everything is in the job script so there are no additional params. 
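        # (The '--parsable', '-J', '-c ${sbatch_cores}', '--mem ${sbatch_mem}'
        # and '--time ${wall}' arguments removed above are assumed to be
        # covered by the #SBATCH directives rendered into tellread.sbatch, so
        # submit_job() now only needs the script path itself.)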
+ params = [] try: self.job_info = self.submit_job(job_script_path, @@ -171,10 +168,15 @@ def _generate_job_script(self): "cores_per_task": self.cores_per_task, "queue_name": self.queue_name, "sing_script_path": self.sing_script_path, - "tmp_dir": join(self.output_path, "output", "tmp1"), "modules_to_load": ' '.join(self.modules_to_load), "lane": f"s_{self.lane_number}", - "output": join(self.output_path, "output"), + # NB: Note that we no longer create a sub-directory under the + # working directory for TellRead to create all its output + # folders and files. This means it is creating folders and + # files in the same directory that has our sbatch script and + # logs directory. Currently there are no name collisions, + # however. + "output": self.output_path, "rundir_path": self.root_dir, "samples": samples, "refs": refs, diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 8c767382..92dcfe87 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -1,96 +1,66 @@ #!/bin/bash -l -#SBATCH -J {{job_name}} # integrate -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 8G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 1 -#SBATCH -p {{queue_name}} # qiita +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{barcode_id_count}} +#SBATCH --output {{output_dir}}/logs/integrate_%x_%A_%a.out +#SBATCH --error {{output_dir}}/logs/integrate_%x_%A_%a.err -#SBATCH --output integrate_%x-%A_%a.out -#SBATCH --error integrate_%x-%A_%a.err - -# NB SLURM_ARRAY_TASK_ID is exported by Slurm -if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then - echo "Not operating in an array" - exit 1 -fi - -# NB SLURM_ARRAY_TASK_MIN is exported by Slurm -if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then - echo "Line extraction assumes 1-based index" - exit 1 -fi - -set -x +set -x set -e -set -o pipefail -samples=($(cat {{output_dir}}/sample_index_list_output.txt | cut -f 2)) +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -# NB TMPDIR IS CREATED IN CURRENT DIRECTORY. CURRENT DIRECTORY MUST BE CORRECT. -export TMPDIR=$(mktemp -d) -function cleanup { - echo "Removing $TMPDIR" - rm -r $TMPDIR - unset TMPDIR -} -trap cleanup EXIT +export TMPDIR={{tmp_dir}} -files=${TMPDIR}/integration.files -/bin/ls -1 {{output_dir}}/Full/*corrected.err_barcode_removed.fastq > ${files} -mkdir -p {{output_dir}}/integrated +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} -if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R1" - exit 1 -fi +echo "Processing sample ${sample}..." 
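# (Note on the sample selection above: sample_index_list.txt is the copy of
# the confidential barcode map made by TRIntegrateJob.run(); it is assumed
# here to be tab-separated with the C5xx barcode_id in the second column,
# hence 'cut -f 2'. SLURM_ARRAY_TASK_ID runs 1..{{barcode_id_count}} while
# bash arrays are 0-indexed, hence the '- 1' when selecting ${samples[...]}.)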
-if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R2" - exit 1 -fi +# make temp directory +export TMPDIR={{tmp_dir}} +mkdir -p $TMPDIR -if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} I1" - exit 1 -fi -r1=$(grep -m 1 "_R1_${sample}" ${files}) -r2=$(grep -m 1 "_R2_${sample}" ${files}) -i1=$(grep -m 1 "_I1_${sample}" ${files}) -r1out={{output_dir}}/integrated/${sample}.R1.fastq.gz -r2out={{output_dir}}/integrated/${sample}.R2.fastq.gz -i1out={{output_dir}}/integrated/${sample}.I1.fastq.gz +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. -if [[ ! -s ${r1} ]]; then - echo "${r1} is empty, cannot integrate" - if [[ -s ${r2} ]]; then - echo "R1 and R2 are inconsistent" - exit 1 - fi - if [[ -s ${i1} ]]; then - echo "R1 and I1 are inconsistent" - exit 1 - fi +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. +# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in={{raw_fastq_dir}}/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in={{raw_fastq_dir}}/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in={{raw_fastq_dir}}/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq - # reflect the empties so Qiita can know of them - touch ${r1out} - touch ${r2out} - touch ${i1out} - exit 0 -fi +# create output directory +mkdir -p {{output_dir}}/integrated + +# generate output file names +r1_out={{output_dir}}/integrated/${sample}.R1.fastq.gz +r2_out={{output_dir}}/integrated/${sample}.R2.fastq.gz +i1_out={{output_dir}}/integrated/${sample}.I1.fastq.gz -# this can probably be backgrounded but then you have to get creative to -# not mask a nonzero exit status (e.g., the python process raising) -cat ${i1} | gzip > ${i1out} +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} +# generate integrated R1 and R2 fastq.gz files. 
conda activate qp-knight-lab-processing-2022.03 -python {{iinp_script_path}} integrate \ - --no-sort \ - --r1-in ${r1} \ - --r2-in ${r2} \ - --i1-in ${i1} \ - --r1-out ${r1out} \ - --r2-out ${r2out} \ - --threads ${SLURM_CPUS_PER_TASK} + +python {{integrate_script_path}} integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads {{cores_per_task}} diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 7d044bb7..89cae33f 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -6,17 +6,11 @@ #SBATCH --mem {{mem_in_gb}}G #SBATCH --time {{wall_time_limit}} -#SBATCH --output tellread_%x-%A.out -#SBATCH --error tellread_%x-%A.err +#SBATCH --output {{output}}/logs/tellread_%x-%A.out +#SBATCH --error {{output}}/logs/tellread_%x-%A.err set -x -export TMPDIR={{tmp_dir}} -mkdir -p ${TMPDIR} -export TMPDIR=$(mktemp -d) - -mkdir -p {{output}} - module load {{modules_to_load}} {{sing_script_path}} \ -i {{rundir_path}} \ @@ -27,11 +21,8 @@ module load {{modules_to_load}} -l {{lane}} if [[ -d {{output}}/Full ]]; then - echo "Run appears successful" -elif [[ -d {{output}}/1_demult/Full ]]; then - echo "Run appears unsuccessful but has output" - exit 1 + echo "tellread.sbatch successful" else - echo "Run appears unsuccessful" + echo "tellread.sbatch unsuccessful" exit 1 fi From 49f16732fce1df4bd105b57bdced5a2de2e7012c Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 13 Nov 2024 18:16:14 -0800 Subject: [PATCH 27/47] Updated sample config file --- .../iseq_metagenomic.json | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json index 089e82f1..c82c76b0 100644 --- a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json +++ b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json @@ -3,6 +3,25 @@ "instrument_type": "iseq", "assay_type": "Metagenomic", "configuration": { + "tell-seq": { + "label": "my_label", + "reference_base": "", + "reference_map": "", + "sing_script_path": "/my_path/tellread-release-novaseqX/run_tellread_sing.sh", + "nodes": 1, + "lane": 1, + "sample_index_list": "/my_path/sample_index_list_1.txt", + "queue": "qiita", + "wallclock_time_in_minutes": 1440, + "modules_to_load": ["singularity_3.6.4"], + "integrate_script_path": "/my_path/integrate-indices-np.py", + "tellread_mem_limit": "16", + "tellread_cores": "4", + "normcount_cores": "1", + "integrate_cores": "1", + "normcount_mem_limit": "8", + "integrate_mem_limit": "8" + }, "bcl2fastq": { "nodes": 1, "nprocs": 16, From efc0849a007be79cddf8ce9a250d84aa165589f3 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 13 Nov 2024 19:48:42 -0800 Subject: [PATCH 28/47] Replaced legacy exit check for tellread --- .../templates/tellread.sbatch | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 89cae33f..f46e0798 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -20,9 +20,33 @@ module load {{modules_to_load}} -j 
${SLURM_JOB_CPUS_PER_NODE} {{extra}} \ -l {{lane}} -if [[ -d {{output}}/Full ]]; then - echo "tellread.sbatch successful" -else - echo "tellread.sbatch unsuccessful" - exit 1 -fi +# instead of testing for the presence of '{{output}}/Full', we will review +# the changed timestamps for all the files in '{{output}}/Full' and when +# we can demonstrate that they haven't changed in an arbitrary period of time +# we will consider the work completed. + +# get the timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 From 47278653a0cf8104038394419a961f4799dac411 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Fri, 15 Nov 2024 21:29:21 -0800 Subject: [PATCH 29/47] recent updates --- sequence_processing_pipeline/Job.py | 1 + sequence_processing_pipeline/TRIntegrateJob.py | 2 +- sequence_processing_pipeline/TellReadJob.py | 4 ++-- sequence_processing_pipeline/util.py | 12 +++++++++++- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 59d9cea2..36248ab3 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -12,6 +12,7 @@ import logging from inspect import stack import re +from time import time # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 3b1e8561..4e273055 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -7,7 +7,7 @@ from .PipelineError import PipelineError from metapool import load_sample_sheet from os import makedirs -from shutil import copy +from shutil import copyfile logging.basicConfig(level=logging.DEBUG) diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 322cca17..ee5054de 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -73,8 +73,7 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, else: tag = 'reference-free' - date = datetime.today().strftime('%Y.%m.%d') - self.job_name = (f"{self.qiita_job_id}-{tag}-{date}-tellread") + self.job_name = (f"{self.qiita_job_id}-tellread") def run(self, callback=None): job_script_path = self._generate_job_script() @@ -187,3 +186,4 @@ def _generate_job_script(self): def parse_logs(self): raise PipelineError("parse_logs() not implemented for TellReadJob") + diff --git a/sequence_processing_pipeline/util.py b/sequence_processing_pipeline/util.py index d9586f81..c5b3cdef 100644 --- a/sequence_processing_pipeline/util.py +++ b/sequence_processing_pipeline/util.py @@ -1,7 +1,17 @@ import re -PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +#PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') + +# The above will truncate 
on the first _R1_ found, which only works when _R1_ or _R2_ +# appears exactly once in a file path. When the wet-lab incorporates these same strings +# in their sample-names as descriptive metadata, this assumption is broken. +# For all raw fastq files being used as input into NuQCJob, we can assume they end +# in the following convention. Per Illumina spec, all fastq files end in _001 and we +# preserve this convention even at the cost of renaming output files from TRIntegrateJob. +# PAIR_DOT is kept as is, but may be removed later because for the purposes of SPP, no input +# should ever be named with dots instead of underscores. +PAIR_UNDERSCORE = (re.compile(r'_R1_001.fastq.gz'), '_R1_001.fastq.gz', '_R2_001.fastq.gz') PAIR_DOT = (re.compile(r'\.R1\.'), '.R1.', '.R2.') PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT) From ba1399f8f621c5856134cbc6271ba8da71d2f9e5 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 19 Nov 2024 15:20:21 -0800 Subject: [PATCH 30/47] Updated tests --- sequence_processing_pipeline/Commands.py | 2 +- sequence_processing_pipeline/Job.py | 1 - sequence_processing_pipeline/Pipeline.py | 5 +- .../TRIntegrateJob.py | 3 +- sequence_processing_pipeline/TellReadJob.py | 8 -- .../templates/tellread.sbatch | 3 +- .../data/tellread_output/tellread_test.sbatch | 55 ++++--- .../data/tellseq_metag_dummy_sample_sheet.csv | 135 ++++++++++++++++++ .../tests/test_TellReadJob.py | 12 +- .../tests/test_commands.py | 36 +++-- .../tests/test_util.py | 32 ++--- sequence_processing_pipeline/util.py | 23 +-- 12 files changed, 231 insertions(+), 84 deletions(-) create mode 100644 sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index cce7c605..642e49cf 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -87,7 +87,7 @@ def demux(id_map, fp, out_d, task, maxtask): """Split infile data based in provided map""" delimiter = '::MUX::' mode = 'wt' - ext = '.fastq.gz' + ext = '_001.fastq.gz' sep = '/' rec = '@' diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 36248ab3..59d9cea2 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -12,7 +12,6 @@ import logging from inspect import stack import re -from time import time # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 04d96f0a..2b9f3fa2 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -15,7 +15,6 @@ from datetime import datetime from xml.etree import ElementTree as ET from metapool.prep import PREP_MF_COLUMNS -from metapool import set_lane_number_in_sheet logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) @@ -264,7 +263,9 @@ def __init__(self, configuration_file_path, run_id, input_file_path, # overwrite sample-sheet w/DFSheets processed version # with overwritten Lane number. - set_lane_number_in_sheet(input_file_path, lane_number) + sheet = load_sample_sheet(input_file_path) + with open(input_file_path, 'w') as f: + sheet.write(f, lane=lane_number) # assume user_input_file_path references a sample-sheet. 
self.sample_sheet = self._validate_sample_sheet(input_file_path) diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 4e273055..875a1988 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -82,7 +82,8 @@ def run(self, callback=None): # copy sil_path to TRIntegrate working directory and rename to a # predictable name. - copy(self.sil_path, join(self.output_path, 'sample_index_list.txt')) + copyfile(self.sil_path, + join(self.output_path, 'sample_index_list.txt')) # generate the tailored subset of adapter to barcode_id based on # the proprietary lists owned by the manufacturer and supplied by diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index ee5054de..3b3bf314 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -6,7 +6,6 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet -from datetime import datetime logging.basicConfig(level=logging.DEBUG) @@ -67,12 +66,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, raise ValueError(f"'{tmp}' is not a valid lane number") self.lane_number = tmp - # TODO: Need examples of these being not None - if self.reference_base is not None or self.reference_map is not None: - tag = 'reference-based' - else: - tag = 'reference-free' - self.job_name = (f"{self.qiita_job_id}-tellread") def run(self, callback=None): @@ -186,4 +179,3 @@ def _generate_job_script(self): def parse_logs(self): raise PipelineError("parse_logs() not implemented for TellReadJob") - diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index f46e0798..66d9d9fd 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -23,7 +23,7 @@ module load {{modules_to_load}} # instead of testing for the presence of '{{output}}/Full', we will review # the changed timestamps for all the files in '{{output}}/Full' and when # we can demonstrate that they haven't changed in an arbitrary period of time -# we will consider the work completed. +# we will consider the work completed. # get the timestamp for the most recently changed file in directory '.' @@ -50,3 +50,4 @@ done # if we've reached this point then we've exceeded our hard-limit for waiting. # return w/an error. 
exit 1 + diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch index a008937b..fb099cf3 100644 --- a/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch +++ b/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch @@ -6,32 +6,47 @@ #SBATCH --mem 16G #SBATCH --time 96:00:00 -#SBATCH --output tellread_%x-%A.out -#SBATCH --error tellread_%x-%A.err +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.err set -x -export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/tmp1 -mkdir -p ${TMPDIR} -export TMPDIR=$(mktemp -d) - -mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output - module load singularity_3.6.4 $HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ -i sequence_processing_pipeline/tests/data/sample_run_directories/150629_SN1001_0511_AH5L7GBCXX \ - -o sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output \ - -s $(echo CDPH-SAL__Salmonella__Typhi__MDL-143,CDPH-SAL_Salmonella_Typhi_MDL-144,CDPH-SAL_Salmonella_Typhi_MDL-145,CDPH-SAL_Salmonella_Typhi_MDL-146,CDPH-SAL_Salmonella_Typhi_MDL-147,CDPH-SAL_Salmonella_Typhi_MDL-148,CDPH-SAL_Salmonella_Typhi_MDL-149,CDPH-SAL_Salmonella_Typhi_MDL-150,CDPH-SAL_Salmonella_Typhi_MDL-151,CDPH-SAL_Salmonella_Typhi_MDL-152,CDPH-SAL_Salmonella_Typhi_MDL-153,CDPH-SAL_Salmonella_Typhi_MDL-154,CDPH-SAL_Salmonella_Typhi_MDL-155,CDPH-SAL_Salmonella_Typhi_MDL-156,CDPH-SAL_Salmonella_Typhi_MDL-157,CDPH-SAL_Salmonella_Typhi_MDL-158,CDPH-SAL_Salmonella_Typhi_MDL-159,CDPH-SAL_Salmonella_Typhi_MDL-160,CDPH-SAL_Salmonella_Typhi_MDL-161,CDPH-SAL_Salmonella_Typhi_MDL-162,CDPH-SAL_Salmonella_Typhi_MDL-163,CDPH-SAL_Salmonella_Typhi_MDL-164,CDPH-SAL_Salmonella_Typhi_MDL-165,CDPH-SAL_Salmonella_Typhi_MDL-166,CDPH-SAL_Salmonella_Typhi_MDL-167,CDPH-SAL_Salmonella_Typhi_MDL-168,P21_E_coli_ELI344,P21_E_coli_ELI345,P21_E_coli_ELI347,P21_E_coli_ELI348,P21_E_coli_ELI349,P21_E_coli_ELI350,P21_E_coli_ELI351,P21_E_coli_ELI352,P21_E_coli_ELI353,P21_E_coli_ELI354,P21_E_coli_ELI355,P21_E_coli_ELI357,P21_E_coli_ELI358,P21_E_coli_ELI359,P21_E_coli_ELI361,P21_E_coli_ELI362,P21_E_coli_ELI363,P21_E_coli_ELI364,P21_E_coli_ELI365,P21_E_coli_ELI366,P21_E_coli_ELI367,P21_E_coli_ELI368,P21_E_coli_ELI369,stALE_E_coli_A1_F21_I1_R1,stALE_E_coli_A2_F21_I1_R1,stALE_E_coli_A3_F18_I1_R1,stALE_E_coli_A3_F40_I1_R1,stALE_E_coli_A4_F21_I1_R1,stALE_E_coli_A4_F21_I1_R2,stALE_E_coli_A4_F42_I1_R1,stALE_E_coli_A5_F21_I1_R1,stALE_E_coli_A5_F42_I1_R1,stALE_E_coli_A6_F21_I1_R1,stALE_E_coli_A6_F43_I1_R1,stALE_E_coli_A7_F21_I1_R1,stALE_E_coli_A7_F42_I1_R1,stALE_E_coli_A8_F20_I1_R1,stALE_E_coli_A8_F42_I1_R1,stALE_E_coli_A9_F21_I1_R1,stALE_E_coli_A9_F44_I1_R1,stALE_E_coli_A10_F21_I1_R1,stALE_E_coli_A10_F43_I1_R1,stALE_E_coli_A10_F131_I1_R1,stALE_E_coli_A11_F21_I1_R1,stALE_E_coli_A11_F43_I1_R1,stALE_E_coli_A11_F119_I1_R1,stALE_E_coli_A12_F21_I1_R1,stALE_E_coli_A12_F43_I1_R1,stALE_E_coli_A12_F136_I1_R1,stALE_E_coli_A13_F20_I1_R1,stALE_E_coli_A13_F42_I1_R1,stALE_E_coli_A13_F121_I1_R1,stALE_E_coli_A14_F20_I1_R1,stALE_E_coli_A14_F42_I1_R1,stALE_E_coli_A14_F133_I1_R1,stALE_E_coli_A15_F21_I1_R1,stALE_E_coli_A15_F42_I1_R1,stALE_E_coli
_A15_F117_I1_R1,stALE_E_coli_A16_F20_I1_R1,stALE_E_coli_A16_F42_I1_R1,stALE_E_coli_A16_F134_I1_R1,stALE_E_coli_A17_F21_I1_R1,stALE_E_coli_A17_F118_I1_R1,stALE_E_coli_A18_F18_I1_R1,stALE_E_coli_A18_F39_I1_R1,stALE_E_coli_A18_F130_I1_R1,3A,4A,BLANK_40_12G,BLANK_40_12H,Pputida_JBEI__HGL_Pputida_107_BP6,Pputida_JBEI__HGL_Pputida_108_BP7,Pputida_JBEI__HGL_Pputida_109_BP8,Pputida_JBEI__HGL_Pputida_110_M2,Pputida_JBEI__HGL_Pputida_111_M5,Pputida_TALE__HGL_Pputida_112,Pputida_TALE__HGL_Pputida_113,Pputida_TALE__HGL_Pputida_114,Pputida_TALE__HGL_Pputida_115,Pputida_TALE__HGL_Pputida_116,Pputida_TALE__HGL_Pputida_117,Pputida_TALE__HGL_Pputida_118,Pputida_TALE__HGL_Pputida_119,Pputida_TALE__HGL_Pputida_120,Pputida_TALE__HGL_Pputida_121,Pputida_TALE__HGL_Pputida_122,Pputida_TALE__HGL_Pputida_123,Pputida_TALE__HGL_Pputida_124,Pputida_TALE__HGL_Pputida_125,Pputida_TALE__HGL_Pputida_126,Pputida_TALE__HGL_Pputida_127,Pputida_TALE__HGL_Pputida_128,Pputida_TALE__HGL_Pputida_129,Pputida_TALE__HGL_Pputida_130,Pputida_TALE__HGL_Pputida_131,Pputida_TALE__HGL_Pputida_132,Pputida_TALE__HGL_Pputida_133,Pputida_TALE__HGL_Pputida_134,Pputida_TALE__HGL_Pputida_135,Pputida_TALE__HGL_Pputida_136,Pputida_TALE__HGL_Pputida_137,Pputida_TALE__HGL_Pputida_138,Pputida_TALE__HGL_Pputida_139,Pputida_TALE__HGL_Pputida_140,Pputida_TALE__HGL_Pputida_141,Pputida_TALE__HGL_Pputida_142,Pputida_TALE__HGL_Pputida_143,Pputida_TALE__HGL_Pputida_144,Pputida_PALE__HGL_Pputida_145,Pputida_PALE__HGL_Pputida_146,Pputida_PALE__HGL_Pputida_147,Pputida_PALE__HGL_Pputida_148,Pputida_PALE__HGL_Pputida_149,Pputida_PALE__HGL_Pputida_150,Pputida_PALE__HGL_Pputida_151,Pputida_PALE__HGL_Pputida_152,Pputida_PALE__HGL_Pputida_153,Pputida_PALE__HGL_Pputida_154,Pputida_PALE__HGL_Pputida_155,Pputida_PALE__HGL_Pputida_156,Pputida_PALE__HGL_Pputida_157,Pputida_PALE__HGL_Pputida_158,Pputida_PALE__HGL_Pputida_159,Pputida_PALE__HGL_Pputida_160,Pputida_PALE__HGL_Pputida_161,Pputida_PALE__HGL_Pputida_162,Pputida_PALE__HGL_Pputida_163,Pputida_PALE__HGL_Pputida_164,Pputida_PALE__HGL_Pputida_165,Pputida_PALE__HGL_Pputida_166,Pputida_PALE__HGL_Pputida_167,Pputida_PALE__HGL_Pputida_168,Pputida_PALE__HGL_Pputida_169,Pputida_PALE__HGL_Pputida_170,Pputida_PALE__HGL_Pputida_171,Pputida_PALE__HGL_Pputida_172,Pputida_PALE__HGL_Pputida_173,Pputida_PALE__HGL_Pputida_174,Pputida_PALE__HGL_Pputida_175,Pputida_PALE__HGL_Pputida_176,JM-Metabolic__GN0_2005,JM-Metabolic__GN0_2007,JM-Metabolic__GN0_2009,JM-Metabolic__GN0_2094,JM-Metabolic__GN0_2099,JM-Metabolic__GN0_2148,JM-Metabolic__GN0_2165,JM-Metabolic__GN0_2169,JM-Metabolic__GN0_2172,JM-Metabolic__GN0_2175,JM-Metabolic__GN0_2183,JM-Metabolic__GN0_2215,JM-Metabolic__GN0_2254,JM-Metabolic__GN0_2277,JM-Metabolic__GN0_2290,JM-Metabolic__GN0_2337,JM-Metabolic__GN0_2317,JM-Metabolic__GN0_2354,JM-Metabolic__GN0_2375,JM-Metabolic__GN0_2380,JM-Metabolic__GN0_2393,JM-Metabolic__GN0_2404,5B,6A,BLANK_41_12G,BLANK_41_12H,Deoxyribose_PALE_ALE__MG1655_BOP27_4_14,Deoxyribose_PALE_ALE__MG1655_BOP27_4_23,Deoxyribose_PALE_ALE__MG1655_BOP27_4_48,Deoxyribose_PALE_ALE__MG1655_BOP27_6_21,Deoxyribose_PALE_ALE__MG1655_BOP27_6_35,Deoxyribose_PALE_ALE__MG1655_BOP27_10_13,Deoxyribose_PALE_ALE__MG1655_BOP27_10_28,Deoxyribose_PALE_ALE__MG1655_BOP27_10_51,Deoxyribose_PALE_ALE__MG1655_Lib4_18_19,Deoxyribose_PALE_ALE__MG1655_Lib4_18_59,Deoxyribose_PALE_ALE__MG1655_Lib4_18_35,Deoxyribose_PALE_ALE__MG1655_Lib4_20_16,Deoxyribose_PALE_ALE__MG1655_Lib4_20_43,Deoxyribose_PALE_ALE__MG1655_Lib4_20_71,Deoxyribose_PALE_ALE__MG1655_Lib4_22_16,Deoxyribose_PALE_ALE__MG1655_
Lib4_22_28,Deoxyribose_PALE_ALE__MG1655_Lib4_22_52,Deoxyribose_PALE_ALE__MG1655_Lib4_24_9,Deoxyribose_PALE_ALE__MG1655_Lib4_24_24,Deoxyribose_PALE_ALE__MG1655_Lib4_24_52,Deoxyribose_PALE_ALE__MG1655_Lib4_26_6,Deoxyribose_PALE_ALE__MG1655_Lib4_26_27,Deoxyribose_PALE_ALE__MG1655_Lib4_26_69,Deoxyribose_PALE_ALE__MG1655_Lib4_28_13,Deoxyribose_PALE_ALE__MG1655_Lib4_28_28,Deoxyribose_PALE_ALE__MG1655_Lib4_28_53,Deoxyribose_PALE_ALE__MG1655_Lib4_30_7,Deoxyribose_PALE_ALE__MG1655_Lib4_30_22,Deoxyribose_PALE_ALE__MG1655_Lib4_30_60,Deoxyribose_PALE_ALE__MG1655_Lib4_32_6,Deoxyribose_PALE_ALE__MG1655_Lib4_32_20,Deoxyribose_PALE_ALE__MG1655_Lib4_32_56,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_24,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_57,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_1_69,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_50,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_3_61,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_22,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_36,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_5_46,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_41,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_7_51,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_58,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_17_64,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_55,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_19_63,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_23,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_46,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_21_51,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_25,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_49,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_29_57,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_24,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_42,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_31_62,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_21,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_41,AB5075_AZM_TALE_in_MHB_A_baumannii_AB5075_WT_33_50,JM-Metabolic__GN02514,JM-Metabolic__GN02529,JM-Metabolic__GN02531,JM-Metabolic__GN02567,JM-Metabolic__GN02590,JM-Metabolic__GN02657,JM-Metabolic__GN02748,JM-Metabolic__GN02766,JM-Metabolic__GN02769,JM-Metabolic__GN02787,JM-Metabolic__GN03132,JM-Metabolic__GN03218,JM-Metabolic__GN03252,JM-Metabolic__GN03409,JM-Metabolic__GN04014,JM-Metabolic__GN04094,JM-Metabolic__GN04255,JM-Metabolic__GN04306,JM-Metabolic__GN04428,JM-Metabolic__GN04488,JM-Metabolic__GN04540,JM-Metabolic__GN04563,JM-Metabolic__GN04612,JM-Metabolic__GN04665,JM-Metabolic__GN04682,JM-Metabolic__GN05002,JM-Metabolic__GN05109,JM-Metabolic__GN05128,JM-Metabolic__GN05367,JM-Metabolic__GN05377,7A,8A,BLANK_42_12G,BLANK_42_12H,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0326,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0327,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0328,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0329,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0330,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0352,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0353,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0354,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0355,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0356,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0357,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0364,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0366,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0367,JM-MEC__Staphylococcus_aur
eusstrain_BERTI-B0368,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0369,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0370,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0371,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0372,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0373,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0374,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0375,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0376,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0377,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0378,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0380,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0381,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0382,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0383,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0384,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0385,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0386,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0387,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0388,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0389,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0390,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0391,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0392,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0393,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0394,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0395,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0396,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0397,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0398,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0399,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0400,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0401,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0402,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0403,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0404,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0405,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0406,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0407,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0408,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0409,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0417,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0418,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0419,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0420,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0421,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0473,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0474,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0483,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0484,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0485,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0486,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0516,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0517,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0518,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0519,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0520,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0521,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0522,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0523,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0524,JM-MEC__Staphylococcus_aureusstrain_BERTI-B0525,JM-MEC__Staphylococcus_aureusstrain_BERTI-R08624,JM-MEC__Staphylococcus_aureusstrain_BERTI-R08704,JM-MEC__Staphylococcus_aureusstrain_BERTI-R10727,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11044,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11078,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11101,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11102,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11103,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11135,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11153,JM-MEC__Staphylococcus_aureusstrain_BERTI-R11154,JM-Metabolic__GN02424,JM-Metabolic__GN02446,JM-Metabolic__GN02449,
JM-Metabolic__GN02487,JM-Metabolic__GN02501,ISB,GFR,BLANK_43_12G,BLANK_43_12H,RMA_KHP_rpoS_Mage_Q97D,RMA_KHP_rpoS_Mage_Q97L,RMA_KHP_rpoS_Mage_Q97N,RMA_KHP_rpoS_Mage_Q97E,JBI_KHP_HGL_021,JBI_KHP_HGL_022,JBI_KHP_HGL_023,JBI_KHP_HGL_024,JBI_KHP_HGL_025,JBI_KHP_HGL_026,JBI_KHP_HGL_027,JBI_KHP_HGL_028_Amitesh_soxR,JBI_KHP_HGL_029_Amitesh_oxyR,JBI_KHP_HGL_030_Amitesh_soxR_oxyR,JBI_KHP_HGL_031_Amitesh_rpoS,BLANK1_1A,BLANK1_1B,BLANK1_1C,BLANK1_1D,BLANK1_1E,BLANK1_1F,BLANK1_1G,BLANK1_1H,AP581451B02,EP256645B01,EP112567B02,EP337425B01,LP127890A01,EP159692B04,EP987683A01,AP959450A03,SP464350A04,C9,ep256643b01,EP121011B01,AP616837B04,SP506933A04,EP159695B01,EP256644B01,SP511289A02,EP305735B04,SP415030A01,AP549681B02,AP549678B01,EP260544B04,EP202452B01,EP282276B04,SP531696A04,SP515443A04,SP515763A04,EP184255B04,SP503615A02,EP260543B04,EP768748A04,AP309872B03,AP568785B04,EP721390A04,EP940013A01,EP291979B04,EP182065B04,EP128904B02,EP915769A04,SP464352A03,SP365864A04,SP511294A04,EP061002B01,SP410793A01,SP232077A04,EP128910B01,AP531397B04,EP043583B01,EP230245B01,EP606652B04,EP207041B01,EP727972A04,EP291980B04,EP087938B02,SP471496A04,SP573823A04,EP393718B01,SP612496A01,EP032410B02,EP073216B01,EP410046B01,SP561451A04,EP320438B01,SP612495A04,EP446604B03,EP446602B01,EP182243B02,EP333541B04,EP238034B01,AP298002B02,EP455759B04,EP207042B04,LP128479A01,LP128476A01,EP316863B03,C20,lp127896a01,SP491907A02,EP182060B03,EP422407B01,SP573859A04,SP584547A02,EP182346B04,AP668631B04,EP451428B04,LP128538A01,SP490298A02,SP573860A01,EP032412B02,EP163771B01,LP169879A01,EP729433A02,EP447940B04,SP584551A08,EP216516B04,EP023808B02,BLANK2_2A,BLANK2_2B,BLANK2_2C,BLANK2_2D,BLANK2_2E,BLANK2_2F,BLANK2_2G,BLANK2_2H,SP573843A04,EP683835A01,SP573824A04,SP335002A04,SP478193A02,SP232311A04,SP415021A02,SP231630A02,SP641029A02,SP232310A04,EP617442B01,EP587478B04,EP447928B04,EP587475B04,EP675042B01,EP554513B02,EP702221B04,AP568787B02,EP054632B01,EP121013B01,EP649418A02,EP573313B01,LP154981A01,AP470859B01,LP154986A01,AP732307B04,EP533426B03,EP587476B04,AP696363B02,EP587477B04,SP683466A02,EP554518B04,EP533429B04,EP431570B01,EP202095B04,EP504030B04,EP207036B01,EP393717B01,SP491898A02,EP484973B04,EP479794B02,EP554515B04,SP631994A04,EP921593A04,AP787247B04,EP090129B04,EP447975B02,EP212214B01,EP410042B01,SP404409A02,SP247340A04,AP029018B01,EP872341A01,AP062219B03,EP790020A02,EP808112A04,SP404403A02,EP073160B01,EP012991B03,SP317297A02,EP656055A04,EP649623A01,EP790019A01,SP257519A04,EP808104A01,EP808106A01,SP231629A02,EP675044A01,EP657260A01,EP808110A04,AP032413B04,EP843906A04,AP173305B04,SP231628A02,AP173301B04,SP404405A02,EP649653A04,EP718687A04,AP905750A02,EP738468A01,C6,EP890157A02,SP353893A02,EP944059A02,EP970005A01,EP927461A04,EP808111A03,EP927459A04,SP317293A02,SP235186A04,SP399724A04,EP738469A01,SP284095A03,C5,EP337325B04,EP759450A04,BLANK3_3A,BLANK3_3B,BLANK3_3C,BLANK3_3D,BLANK3_3E,BLANK3_3F,BLANK3_3G,BLANK3_3H,AP006367B02,EP929277A02,AP324642B04,EP786631A04,EP657385A04,SP235189A01,EP448041B04,SP231631A02,SP280481A02,AP032412B04,EP649737A03,AP967057A04,EP876243A04,SP229387A04,EP667743A04,SP246941A01,AP745799A04,SP205732A02,SP230382A04,SP230380A02,SP230381A01,SP205754A01,EP606662B04,AP780167B02,EP447927B04,C18,LP191039A01,EP606663B04,EP573296B01,EP447926B04,LP127767A01,EP479266B04,LP128543A01,EP479270B03,EP921594A04,EP554501B04,EP542577B04,EP487995B04,EP542578B04,EP573310B01,EP244366B01,EP533389B03,EP244360B01,AP911328B01,AP481403B02,22_001_801_552_503_00,EP372981B04,EP447929B04,SP573849A04,SP577399A02,EP606656B03,LP166715A01,AP668628B04,C14,E
P446610B02,EP339061B02,SP681591A04,EP393712B02,EP410041B01,SP453872A01,22_001_710_503_791_00,LP128540A01,EP339053B02,EP617443B01,EP190307B01,AP795068B04,LP128541A01,EP584756B04,SP284096A02,EP431562B04,EP685640B01,EP339059B02,EP431575B01,EP379938B01,EP529635B02,EP554506B04,EP455757B04,SP491900A02,LP196272A01,SP704319A04,EP617441B01,AP687591B04,SP640978A02,EP981129A02,EP455763B04,EP339057B02,SP491897A02,EP980752B04,LP128539A01,EP996831B04,EP273332B04,EP483291B04,EP393715B01,EP617440B01,EP729434A01,SP645141A03,BLANK4_4A,BLANK4_4B,BLANK4_4C,BLANK4_4D,BLANK4_4E,BLANK4_4F,BLANK4_4G,BLANK4_4H,SP232114A04,EP393714B01,EP533388B01,EP724905B01,EP282108B01,EP282107B01,EP001625B01,EP073209B02,SP232079A01,EP772145A02,AP771472A04,AP223470B01,SP404412A02,EP772143A02,SP408629A01,EP749735A07,EP846485A01,EP808109A01,SP416130A04,EP882752A01,AP953594A02,AP046324B02,AP891020A04,EP790023A01,EP657386A01,EP805337A01,EP927458A04,AP173299B04,EP768164A02,EP886422A01,AP103463B01,AP744361A02,AP065292B01,SP257517A04,EP790021A04,EP675075A04,SP388683A02,SP232309A01,EP899038A04,EP636802A01,AP046327B02,EP905975A04,SP410796A02,EP784608A01,EP808105A01,SP331134A04,EP718688A01,SP232270A02,EP970001A01,EP001624B01,EP868682A01,EP927462A02,C3,EP890158A02,EP023801B04,EP400447B04,EP385379B01,EP385387B01,EP385384B01,SP754514A04,SP415025A01,SP415023A02,EP400448B04,EP479894B04 | tr -d '"') \ - -g $(echo NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,N
ONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE | tr -d '"') \ + -o sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob \ + -s $(echo C501,C509,C502,C510,C503,C511,C504,C512,C505,C513,C506,C514,C507,C515,C508,C516,C517,C525,C518,C526,C519,C527,C520,C528,C521,C529,C522,C530,C523,C531,C524,C532,C533,C541,C534,C542,C535,C543,C536,C544,C537,C545,C538,C546,C539,C547,C540,C548,C549,C557,C550,C558,C551,C559,C552,C560,C553,C561,C554,C562,C555,C563,C556,C564,C565,C573,C566,C574,C567,C575,C568,C576,C569,C577,C570,C578,C571,C579,C572,C580,C581,C589,C582,C590,C583,C591,C584,C592,C585,C593,C586,C594,C587,C595,C588,C596 | tr -d '"') \ + -g $(echo NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE | tr -d '"') \ -j ${SLURM_JOB_CPUS_PER_NODE} \ -l s_1 -if [[ -d sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/Full ]]; then - echo "Run appears successful" -elif [[ -d sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/output/1_demult/Full ]]; then - echo "Run appears unsuccessful but has output" - exit 1 -else - echo "Run appears unsuccessful" - exit 1 -fi \ No newline at end of file +# instead of testing for the presence of 'sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full', we will review +# the changed timestamps for all the files in 
'sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full' and when +# we can demonstrate that they haven't changed in an arbitrary period of time +# we will consider the work completed. + +# get the timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 diff --git a/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv new file mode 100644 index 00000000..105330fd --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv @@ -0,0 +1,135 @@ +[Header],,,,,,,, +IEMFileVersion,1,,,,,,, +SheetType,tellseq_metag,,,,,,, +SheetVersion,10,,,,,,, +Investigator Name,Knight,,,,,,, +Experiment Name,RKL0151,,,,,,, +Date,5/6/24,,,,,,, +Workflow,GenerateFASTQ,,,,,,, +Application,FASTQ Only,,,,,,, +Assay,Metagenomic,,,,,,, +Description,,,,,,,, +Chemistry,Default,,,,,,, +,,,,,,,, +[Reads],,,,,,,, +151,,,,,,,, +151,,,,,,,, +,,,,,,,, +[Settings],,,,,,,, +ReverseComplement,0,,,,,,, +,,,,,,,, +[Data],,,,,,,, +Sample_ID,Sample_Name,Sample_Plate,well_id_384,barcode_id,Sample_Project,Well_description,Lane, +LS_8_10_2013_SRE,LS.8.10.2013.SRE,LS_Donor_SS_Samples_P1,A1,C501,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013.SRE,1, +LS_12_17_2014_SRE,LS.12.17.2014.SRE,LS_Donor_SS_Samples_P1,B1,C509,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.17.2014.SRE,1, +LS_4_4_2015_SRE,LS.4.4.2015.SRE,LS_Donor_SS_Samples_P1,C1,C502,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.4.2015.SRE,1, +LS_2_23_2015_SRE,LS.2.23.2015.SRE,LS_Donor_SS_Samples_P1,D1,C510,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.23.2015.SRE,1, +LS_9_28_2014_SRE,LS.9.28.2014.SRE,LS_Donor_SS_Samples_P1,E1,C503,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.28.2014.SRE,1, +LS_12_14_2013_SRE,LS.12.14.2013.SRE,LS_Donor_SS_Samples_P1,F1,C511,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.14.2013.SRE,1, +LS_4_7_2013_SRE,LS.4.7.2013.SRE,LS_Donor_SS_Samples_P1,G1,C504,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.7.2013.SRE,1, +LS_7_14_2013_SRE,LS.7.14.2013.SRE,LS_Donor_SS_Samples_P1,H1,C512,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.14.2013.SRE,1, +LS_10_27_2013_SRE,LS.10.27.2013.SRE,LS_Donor_SS_Samples_P1,I1,C505,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.10.27.2013.SRE,1, +LS_1_19_2014_SRE,LS.1.19.2014.SRE,LS_Donor_SS_Samples_P1,J1,C513,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014.SRE,1, +LS_9_3_2013_SRE,LS.9.3.2013.SRE,LS_Donor_SS_Samples_P1,K1,C506,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.3.2013.SRE,1, +LS_2_25_2013_SRE,LS.2.25.2013.SRE,LS_Donor_SS_Samples_P1,L1,C514,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.25.2013.SRE,1, 
+LS_7_26_2015_SRE,LS.7.26.2015.SRE,LS_Donor_SS_Samples_P1,M1,C507,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.26.2015.SRE,1, +LS_2_17_2014_SRE,LS.2.17.2014.SRE,LS_Donor_SS_Samples_P1,N1,C515,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.17.2014.SRE,1, +LS_6_29_2015_SRE,LS.6.29.2015.SRE,LS_Donor_SS_Samples_P1,O1,C508,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015.SRE,1, +LS_3_24_2015_SRE,LS.3.24.2015.SRE,LS_Donor_SS_Samples_P1,P1,C516,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015.SRE,1, +LS_1_6_2015_SRE,LS.1.6.2015.SRE,LS_Donor_SS_Samples_P1,A2,C517,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.6.2015.SRE,1, +T_LS_7_15_15B_SRE,T.LS.7.15.15B.SRE,LS_Donor_SS_Samples_P1,B2,C525,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B.SRE,1, +LS_6_9_2013_SRE,LS.6.9.2013.SRE,LS_Donor_SS_Samples_P1,C2,C518,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.9.2013.SRE,1, +Person A_SRE,Person A.SRE,LS_Donor_SS_Samples_P1,D2,C526,Tellseq_Shortread_Metagenomic_Analysis_10283,Person A.SRE,1, +LS_8_22_2014_R2_SRE,LS.8.22.2014.R2.SRE,LS_Donor_SS_Samples_P1,E2,C519,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R2.SRE,1, +Person B_SRE,Person B.SRE,LS_Donor_SS_Samples_P1,F2,C527,Tellseq_Shortread_Metagenomic_Analysis_10283,Person B.SRE,1, +LS_8_22_2014_R1_SRE,LS.8.22.2014.R1.SRE,LS_Donor_SS_Samples_P1,G2,C520,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R1.SRE,1, +Person C_SRE,Person C.SRE,LS_Donor_SS_Samples_P1,H2,C528,Tellseq_Shortread_Metagenomic_Analysis_10283,Person C.SRE,1, +LS_12_28_2011_SRE,LS.12.28.2011.SRE,LS_Donor_SS_Samples_P1,I2,C521,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.28.2011.SRE,1, +Person D_SRE,Person D.SRE,LS_Donor_SS_Samples_P1,J2,C529,Tellseq_Shortread_Metagenomic_Analysis_10283,Person D.SRE,1, +LS_5_4_2014_SRE,LS.5.4.2014.SRE,LS_Donor_SS_Samples_P1,K2,C522,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.5.4.2014.SRE,1, +45208_1_1,45208.1.1,UROBIOME_TEST_MF_SAMPLES_P2,L2,C530,Tellseq_Shortread_Metagenomic_Analysis_10283,45208.1.1,1, +LS_11_6_2012_SRE,LS.11.6.2012.SRE,LS_Donor_SS_Samples_P1,M2,C523,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.6.2012.SRE,1, +45248_2_2,45248.2.2,UROBIOME_TEST_MF_SAMPLES_P2,N2,C531,Tellseq_Shortread_Metagenomic_Analysis_10283,45248.2.2,1, +LS_4_3_2012_SRE,LS.4.3.2012.SRE,LS_Donor_SS_Samples_P1,O2,C524,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.3.2012.SRE,1, +45261_2_1,45261.2.1,UROBIOME_TEST_MF_SAMPLES_P2,P2,C532,Tellseq_Shortread_Metagenomic_Analysis_10283,45261.2.1,1, +45272_11_2,45272.11.2,UROBIOME_TEST_MF_SAMPLES_P2,A3,C533,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.11.2,1, +T_LS_7_12_15A,T.LS.7.12.15A,Larry_Smarr_Plus_Donor_Samples_P3,B3,C541,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.12.15A,1, +45316_8_1,45316.8.1,UROBIOME_TEST_MF_SAMPLES_P2,C3,C534,Tellseq_Shortread_Metagenomic_Analysis_10283,45316.8.1,1, +T_LS_7_8_15A,T.LS.7.8.15A,Larry_Smarr_Plus_Donor_Samples_P3,D3,C542,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.8.15A,1, +45327_7_2,45327.7.2,UROBIOME_TEST_MF_SAMPLES_P2,E3,C535,Tellseq_Shortread_Metagenomic_Analysis_10283,45327.7.2,1, +LS_8_10_2013,LS.8.10.2013,LS_Time_Series_ABSQ_P4,F3,C543,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013,1, +45272_1_swab_2,45272.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,G3,C536,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.1.swab.2,1, +LS_6_29_2015,LS.6.29.2015,LS_Time_Series_ABSQ_P4,H3,C544,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015,1, 
+45326_1_swab_2,45326.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,I3,C537,Tellseq_Shortread_Metagenomic_Analysis_10283,45326.1.swab.2,1, +LS_3_8_2015,LS.3.8.2015,LS_Time_Series_ABSQ_P4,J3,C545,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.8.2015,1, +T_LS_7_19_15A,T.LS.7.19.15A,Larry_Smarr_Plus_Donor_Samples_P3,K3,C538,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15A,1, +LS_4_29_2013,LS.4.29.2013,LS_Time_Series_ABSQ_P4,L3,C546,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.29.2013,1, +T_LS_7_15_15B,T.LS.7.15.15B,Larry_Smarr_Plus_Donor_Samples_P3,M3,C539,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B,1, +LS_11_16_2014,LS.11.16.2014,LS_Time_Series_ABSQ_P4,N3,C547,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.16.2014,1, +T_LS_7_19_15B,T.LS.7.19.15B,Larry_Smarr_Plus_Donor_Samples_P3,O3,C540,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15B,1, +LS_1_19_2014,LS.1.19.2014,LS_Time_Series_ABSQ_P4,P3,C548,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014,1, +LS_3_24_2015,LS.3.24.2015,LS_Time_Series_ABSQ_P4,A4,C549,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015,1, +LS_2_8_2013,LS.2.8.2013,LS_Time_Series_ABSQ_P4,B4,C557,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.8.2013,1, +LS_11_10_2013,LS.11.10.2013,LS_Time_Series_ABSQ_P4,C4,C550,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.10.2013,1, +Marine_Sediment_0_2cm_R1,Marine.Sediment.0.2cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,D4,C558,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.0.2cm.R1,1, +LS_3_23_2014,LS.3.23.2014,LS_Time_Series_ABSQ_P4,E4,C551,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.23.2014,1, +Marine_Sediment_5_7cm_R1,Marine.Sediment.5.7cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,F4,C559,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.5.7cm.R1,1, +LS_1_14_2015,LS.1.14.2015,LS_Time_Series_ABSQ_P4,G4,C552,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.14.2015,1, +Marine_Sediment_10_12cm_R2,Marine.Sediment.10.12cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,H4,C560,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.10.12cm.R2,1, +LS_8_25_2014,LS.8.25.2014,LS_Time_Series_ABSQ_P4,I4,C553,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.25.2014,1, +Marine_Sediment_15_17cm_R1,Marine.Sediment.15.17cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,J4,C561,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.15.17cm.R1,1, +LS_1_26_2013,LS.1.26.2013,LS_Time_Series_ABSQ_P4,K4,C554,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.26.2013,1, +Marine_Sediment_20_22cm_R1,Marine.Sediment.20.22cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,L4,C562,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.20.22cm.R1,1, +LS_6_16_2014,LS.6.16.2014,LS_Time_Series_ABSQ_P4,M4,C555,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.16.2014,1, +Marine_Sediment_25_27cm_R2,Marine.Sediment.25.27cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,N4,C563,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.25.27cm.R2,1, +LS_7_27_2014,LS.7.27.2014,LS_Time_Series_ABSQ_P4,O4,C556,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.27.2014,1, +Marine_Sediment_30_32cm_R3,Marine.Sediment.30.32cm.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,P4,C564,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.30.32cm.R3,1, +Person_A_R3,Person.A.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,A5,C565,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.A.R3,1, 
+Soil_SynCom_T4_2_Tube5,Soil.SynCom.T4.2.Tube5,16_member_community_native_soil_P6,B5,C573,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.2.Tube5,1, +Person_B_R2,Person.B.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,C5,C566,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.B.R2,1, +A21,A21,Tumor_Community_P7,D5,C574,Tellseq_Shortread_Metagenomic_Analysis_10283,A21,1, +Person_C_R4,Person.C.R4,MarineSediment_Donor_LarrySmarr_NoProK_P5,E5,C567,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.C.R4,1, +A23,A23,Tumor_Community_P7,F5,C575,Tellseq_Shortread_Metagenomic_Analysis_10283,A23,1, +Person_D_R2,Person.D.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,G5,C568,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.D.R2,1, +A27,A27,Tumor_Community_P7,H5,C576,Tellseq_Shortread_Metagenomic_Analysis_10283,A27,1, +Soil_SynCom_T1_2_Tube1,Soil.SynCom.T1.2.Tube1,16_member_community_native_soil_P6,I5,C569,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T1.2.Tube1,1, +A30,A30,Tumor_Community_P7,J5,C577,Tellseq_Shortread_Metagenomic_Analysis_10283,A30,1, +Soil _SynCom_T2_2_Tube2,Soil .SynCom.T2.2.Tube2,16_member_community_native_soil_P6,K5,C570,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil .SynCom.T2.2.Tube2,1, +A31,A31,Tumor_Community_P7,L5,C578,Tellseq_Shortread_Metagenomic_Analysis_10283,A31,1, +Soil_SynCom_T3_2_Tube3,Soil.SynCom.T3.2.Tube3,16_member_community_native_soil_P6,M5,C571,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T3.2.Tube3,1, +S1_T1_A,S1.T1.A,Tumor_Community_P7,N5,C579,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.A,1, +Soil_SynCom_T4_1_Tube4,Soil.SynCom.T4.1.Tube4,16_member_community_native_soil_P6,O5,C572,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.1.Tube4,1, +S2_T1_B_A,S2.T1.B.A,Tumor_Community_P7,P5,C580,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.B.A,1, +S2_T1_01BH1_Y_A,S2.T1.01BH1.Y.A,Tumor_Community_P7,A6,C581,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.01BH1.Y.A,1, +S1_T1_1CIM_A,S1.T1.1CIM.A,Tumor_Community_P7,B6,C589,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.1CIM.A,1, +S2_MT1_1HBI_Y_A,S2.MT1.1HBI.Y.A,Tumor_Community_P7,C6,C582,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.1HBI.Y.A,1, +S1_M1_B_1CIM_A,S1.M1.B.1CIM.A,Tumor_Community_P7,D6,C590,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.M1.B.1CIM.A,1, +S1_T1_B_LBM_A,S1.T1.B.LBM.A,Tumor_Community_P7,E6,C583,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.B.LBM.A,1, +BLANK_K15_cancer_patient,BLANK.K15.cancer.patient,Tumor_Community_P7,F6,C591,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.K15.cancer.patient,1, +S2_MT1_LBM_A,S2.MT1.LBM.A,Tumor_Community_P7,G6,C584,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.LBM.A,1, +BLANK_M15_cancer_patient,BLANK.M15.cancer.patient,Tumor_Community_P7,H6,C592,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.M15.cancer.patient,1, +S2_T1_A,S2.T1.A,Tumor_Community_P7,I6,C585,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.A,1, +BLANK_O15_cancer_patient,BLANK.O15.cancer.patient,Tumor_Community_P7,J6,C593,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.O15.cancer.patient,1, +1CIM_M_CNTL_A,1CIM.M.CNTL.A,Tumor_Community_P7,K6,C586,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.M.CNTL.A,1, +BLANK_A17_cancer_patient,BLANK.A17.cancer.patient,Tumor_Community_P7,L6,C594,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.A17.cancer.patient,1, +1CIM_G_CNTL_A,1CIM.G.CNTL.A,Tumor_Community_P7,M6,C587,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.G.CNTL.A,1, 
+BLANK_C17_cancer_patient,BLANK.C17.cancer.patient,Tumor_Community_P7,N6,C595,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.C17.cancer.patient,1, +GC_1HCOM_A,GC.1HCOM.A,Tumor_Community_P7,O6,C588,Tellseq_Shortread_Metagenomic_Analysis_10283,GC.1HCOM.A,1, +BLANK_E17_cancer_patient,BLANK.E17.cancer.patient,Tumor_Community_P7,P6,C596,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.E17.cancer.patient,1, +,,,,,,,, +[Bioinformatics],,,,,,,, +Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates +Tellseq_Shortread_Metagenomic_Analysis_10283,10283,TRUE,GATCGGAAGAGCACACGTCTGAACTCCAGTCAC,GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT,TRUE,tellseq,tellseq metagenomics,FALSE +,,,,,,,, +[Contact],,,,,,,, +Sample_Project,Email,,,,,,, +Tellseq_Shortread_Metagenomic_Analysis_10283,cbrenchy@gmail.com,,,,,,, +,,,,,,,, +[SampleContext],,,,,,,, +sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,,,, +BLANK.K15.cancer.patient,control blank,10283,,,,,, +BLANK.M15.cancer.patient,control blank,10283,,,,,, +BLANK.O15.cancer.patient,control blank,10283,,,,,, +BLANK.A17.cancer.patient,control blank,10283,,,,,, +BLANK.C17.cancer.patient,control blank,10283,,,,,, +BLANK.E17.cancer.patient,control blank,10283,,,,,, \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py index 801947e8..440192c8 100644 --- a/sequence_processing_pipeline/tests/test_TellReadJob.py +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -23,7 +23,9 @@ def setUp(self): self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') # TODO: Revisit w/a proper sample-sheet once spec is near finalized. 
- self.sample_sheet_path = self.path('data', 'good-sample-sheet.csv') + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') self.queue_name = "qiita" self.node_count = "1" @@ -48,9 +50,8 @@ def test_creation(self): self.sample_sheet_path, self.queue_name, self.node_count, self.wall_time_limit, self.jmem, self.modules_to_load, self.qiita_job_id, - self.label, self.reference_base, self.reference_map, - self.tmp1_path, self.sing_script_path, - self.cores_per_task) + self.reference_base, self.reference_map, + self.sing_script_path, self.cores_per_task) job._generate_job_script() @@ -61,9 +62,6 @@ def test_creation(self): exp_lines = f.readlines() for obs_line, exp_line in zip(obs_lines, exp_lines): - print("OBS: %s" % obs_line) - print("EXP: %s" % exp_line) - print("") self.assertEqual(obs_line, exp_line) diff --git a/sequence_processing_pipeline/tests/test_commands.py b/sequence_processing_pipeline/tests/test_commands.py index f58bb176..3919ef43 100644 --- a/sequence_processing_pipeline/tests/test_commands.py +++ b/sequence_processing_pipeline/tests/test_commands.py @@ -16,12 +16,12 @@ def test_split_similar_size_bins(self, glob, stat): class MockStat: st_size = 2 ** 28 # 256MB - mockglob = ['/foo/bar/a_R1_.fastq.gz', - '/foo/bar/b_R2_.fastq.gz', - '/foo/bar/a_R2_.fastq.gz', - '/foo/baz/c_R2_.fastq.gz', - '/foo/baz/c_R1_.fastq.gz', - '/foo/bar/b_R1_.fastq.gz'] + mockglob = ['/foo/bar/a_R1_001.fastq.gz', + '/foo/bar/b_R2_001.fastq.gz', + '/foo/bar/a_R2_001.fastq.gz', + '/foo/baz/c_R2_001.fastq.gz', + '/foo/baz/c_R1_001.fastq.gz', + '/foo/bar/b_R1_001.fastq.gz'] with TemporaryDirectory() as tmp: exp = (2, 1073741824) @@ -30,9 +30,12 @@ class MockStat: obs = split_similar_size_bins('foo', 1, tmp + '/prefix') self.assertEqual(obs, exp) - exp_1 = ('/foo/bar/a_R1_.fastq.gz\t/foo/bar/a_R2_.fastq.gz\tbar\n' - '/foo/bar/b_R1_.fastq.gz\t/foo/bar/b_R2_.fastq.gz\tbar\n') - exp_2 = '/foo/baz/c_R1_.fastq.gz\t/foo/baz/c_R2_.fastq.gz\tbaz\n' + exp_1 = ('/foo/bar/a_R1_001.fastq.gz\t/foo/bar/a_R2_001.fastq.gz' + '\tbar\n' + '/foo/bar/b_R1_001.fastq.gz\t/foo/bar/b_R2_001.fastq.gz' + '\tbar\n') + exp_2 = ('/foo/baz/c_R1_001.fastq.gz\t/foo/baz/c_R2_001.fastq.gz' + '\tbaz\n') obs_1 = open(tmp + '/prefix-1').read() self.assertEqual(obs_1, exp_1) @@ -71,9 +74,16 @@ def test_demux(self): demux(id_map, infile, tmp, task, maxtask) - obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1.fastq.gz'), + foo = join(tmp, 'Project_12345') + from os import walk + for root, dirs, files in walk(foo): + for _file in files: + _path = join(root, _file) + print(_path) + + obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1_001.fastq.gz'), 'rt').read() - obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2.fastq.gz'), + obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2_001.fastq.gz'), 'rt').read() exp = '\n'.join(exp_data_r1) + '\n' self.assertEqual(obs_r1, exp) @@ -81,8 +91,8 @@ def test_demux(self): exp = '\n'.join(exp_data_r2) + '\n' self.assertEqual(obs_r2, exp) - self.assertFalse(os.path.exists(join(tmp, 'a_R1.fastq.gz'))) - self.assertFalse(os.path.exists(join(tmp, 'a_R2.fastq.gz'))) + self.assertFalse(os.path.exists(join(tmp, 'a_R1_001.fastq.gz'))) + self.assertFalse(os.path.exists(join(tmp, 'a_R2_001.fastq.gz'))) if __name__ == '__main__': diff --git a/sequence_processing_pipeline/tests/test_util.py b/sequence_processing_pipeline/tests/test_util.py index 136dc9a0..e5073101 100644 --- a/sequence_processing_pipeline/tests/test_util.py +++ b/sequence_processing_pipeline/tests/test_util.py @@ 
-4,24 +4,18 @@ class TestUtil(unittest.TestCase): def test_iter_paired_files(self): - tests = [(['a_R1_foo', - 'b_R2_bar', - 'a_R2_baz', - 'b_R1_bing'], - [('a_R1_foo', 'a_R2_baz'), - ('b_R1_bing', 'b_R2_bar')]), - (['a.R1.foo', - 'b.R2.bar', - 'a.R2.baz', - 'b.R1.bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b.R1.bing', 'b.R2.bar')]), - (['a.R1.foo', - 'b_R2_bar', - 'a.R2.baz', - 'b_R1_bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b_R1_bing', 'b_R2_bar')])] + # tuples of randomly ordered fastq files and thier expected + # sorted and organized output from iter_paired_files(). + + # underscore filenames updated to require '_001.fastq.gz'. + # legacy dot filenames test remains as-is. + tests = [(['b_R2_001.fastq.gz', 'a_R1_001.fastq.gz', + 'a_R2_001.fastq.gz', 'b_R1_001.fastq.gz'], + [('a_R1_001.fastq.gz', 'a_R2_001.fastq.gz'), + ('b_R1_001.fastq.gz', 'b_R2_001.fastq.gz')]), + (['a.R1.foo', 'b.R2.bar', 'a.R2.baz', 'b.R1.bing'], + [('a.R1.foo', 'a.R2.baz'), ('b.R1.bing', 'b.R2.bar')])] + for files, exp in tests: obs = list(iter_paired_files(files)) self.assertEqual(obs, exp) @@ -42,7 +36,7 @@ def test_iter_paired_files_bad_pair(self): list(iter_paired_files(files)) def test_iter_paired_files_mismatch_prefix(self): - files = ['a_R1_foo', 'ab_R2_foo'] + files = ['a_R1_001.fastq.gz', 'ab_R2_001.fastq.gz'] with self.assertRaisesRegex(ValueError, "Mismatch prefixes"): list(iter_paired_files(files)) diff --git a/sequence_processing_pipeline/util.py b/sequence_processing_pipeline/util.py index c5b3cdef..e19bf98a 100644 --- a/sequence_processing_pipeline/util.py +++ b/sequence_processing_pipeline/util.py @@ -1,17 +1,18 @@ import re -#PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') - -# The above will truncate on the first _R1_ found, which only works when _R1_ or _R2_ -# appears exactly once in a file path. When the wet-lab incorporates these same strings -# in their sample-names as descriptive metadata, this assumption is broken. -# For all raw fastq files being used as input into NuQCJob, we can assume they end -# in the following convention. Per Illumina spec, all fastq files end in _001 and we -# preserve this convention even at the cost of renaming output files from TRIntegrateJob. -# PAIR_DOT is kept as is, but may be removed later because for the purposes of SPP, no input -# should ever be named with dots instead of underscores. -PAIR_UNDERSCORE = (re.compile(r'_R1_001.fastq.gz'), '_R1_001.fastq.gz', '_R2_001.fastq.gz') +# PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +# The above will truncate on the first _R1_ found, which only works when _R1_ +# or _R2_ appears exactly once in a file path. When the wet-lab incorporates +# these same strings in their sample-names as descriptive metadata, this +# assumption is broken. For all raw fastq files being used as input into +# NuQCJob, we can assume they end in the following convention. Per Illumina +# spec, all fastq files end in _001 and we preserve this convention even at +# the cost of renaming output files from TRIntegrateJob. +# PAIR_DOT is kept as is, but may be removed later because for the purposes of +# SPP, no input should ever be named with dots instead of underscores. 
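+# Illustrative example (hypothetical filenames): a sample named
+# 'stool_R1_sub10' yields 'stool_R1_sub10_S1_L001_R1_001.fastq.gz', which
+# contains '_R1_' twice. Matching on the full '_R1_001.fastq.gz' suffix pairs
+# it with 'stool_R1_sub10_S1_L001_R2_001.fastq.gz' instead of truncating at
+# the first embedded '_R1_'.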
+PAIR_UNDERSCORE = (re.compile(r'_R1_001.fastq.gz'), + '_R1_001.fastq.gz', '_R2_001.fastq.gz') PAIR_DOT = (re.compile(r'\.R1\.'), '.R1.', '.R2.') PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT) From fd1809b893d7b6ee3d0edbfc8d0fd2e9cc0927bc Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Tue, 19 Nov 2024 15:21:42 -0800 Subject: [PATCH 31/47] Update setup.py to point to merged metapool updates --- setup.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index e7894aab..99103fbb 100644 --- a/setup.py +++ b/setup.py @@ -43,12 +43,8 @@ install_requires=[ 'click', 'requests', 'pandas', 'flake8', 'nose', 'coverage', 'pgzip', 'jinja2', - # 'metapool @ https://github.com/biocore/' - # 'metagenomics_pooling_notebook/archive/master.zip' - # sample_sheet_update branch contains all of the changes in the - # fake_tellread branch + DFSheet. - 'metapool @ https://codeload.github.com/charles-cowart/metagenomics' - '_pooling_notebook/zip/refs/heads/sample_sheet_update' + 'metapool @ https://github.com/biocore/' + 'metagenomics_pooling_notebook/archive/master.zip' ], entry_points={ 'console_scripts': ['demux=sequence_processing_pipeline.scripts.cli' From 96f3cffad5f72fbe305e46f8f4eeebd4f3b73c22 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 16:48:41 -0800 Subject: [PATCH 32/47] New tests for slurm polling --- sequence_processing_pipeline/FastQCJob.py | 2 +- .../GenPrepFileJob.py | 2 +- sequence_processing_pipeline/Job.py | 156 +++++++++------- .../scripts/fake_squeue.py | 101 ++++++++++ .../tests/test_Job.py | 173 +++++++++++++++++- 5 files changed, 365 insertions(+), 69 deletions(-) create mode 100755 sequence_processing_pipeline/scripts/fake_squeue.py diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 5e0bf4fc..889ef75d 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -255,7 +255,7 @@ def run(self, callback=None): cmd = ' '.join(cmd_head + input_path_list + cmd_tail) - results = self._system_call(cmd, callback=callback) + results = Job._system_call(cmd, callback=callback) if results['return_code'] != 0: raise PipelineError("multiqc encountered an error") diff --git a/sequence_processing_pipeline/GenPrepFileJob.py b/sequence_processing_pipeline/GenPrepFileJob.py index 49e8f651..0bb2c52c 100644 --- a/sequence_processing_pipeline/GenPrepFileJob.py +++ b/sequence_processing_pipeline/GenPrepFileJob.py @@ -159,7 +159,7 @@ def run(self, callback=None): # currently that is how it's done. Hence, self.output_directory # and the path to run_dir might be different locations than the # others. - res = self._system_call(' '.join(command), callback=callback) + res = Job._system_call(' '.join(command), callback=callback) if res['return_code'] != 0: raise PipelineError("Seqpro encountered an error") diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 59d9cea2..7a771908 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -12,6 +12,7 @@ import logging from inspect import stack import re +from collections import Counter # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -233,6 +234,41 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} + def query_slurm(self, job_ids): + # query_slurm encapsulates the handling of squeue. 
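+        # Illustrative example (hypothetical ids): with -o '%i,%T', squeue is
+        # expected to print a header plus one line per job or array-task,
+        # e.g.:
+        #   JOBID,STATE
+        #   1234567_0,COMPLETED
+        #   1234567_1,RUNNING
+        #   1234568,FAILED
+        # The header is dropped below and the remainder is parsed into a
+        # {id: state} dict.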
+ count = 0 + while True: + result = self._system_call("squeue -t all -j " + f"{','.join(job_ids)} " + "-o '%i,%T'") + + if result['return_code'] == 0: + # there was no issue w/squeue, break this loop and + # continue. + break + else: + # there was likely an intermittent issue w/squeue. Pause + # and wait before trying a few more times. If the problem + # persists then report the error and exit. + count += 1 + + if count > 3: + raise ExecFailedError(result['stderr']) + + sleep(60) + + lines = result['stdout'].split('\n') + lines.pop(0) # remove header + lines = [x.split(',') for x in lines if x != ''] + + jobs = {} + for job_id, state in lines: + # ensure unique_id is of type string for downstream use. + job_id = str(job_id) + jobs[job_id] = state + + return jobs + def wait_on_job_ids(self, job_ids, callback=None): ''' Wait for the given job-ids to finish running before returning. @@ -250,65 +286,27 @@ def wait_on_job_ids(self, job_ids, callback=None): # ensure all ids are strings to ensure proper working w/join(). job_ids = [str(x) for x in job_ids] - def query_slurm(job_ids): - # internal function query_slurm encapsulates the handling of - # squeue. - count = 0 - while True: - result = self._system_call("squeue -t all -j " - f"{','.join(job_ids)} " - "-o '%F,%A,%T'") - - if result['return_code'] == 0: - # there was no issue w/squeue, break this loop and - # continue. - break - else: - # there was a likely intermittent issue w/squeue. Pause - # and wait before trying a few more times. If the problem - # persists then report the error and exit. - count += 1 - - if count > 3: - raise ExecFailedError(result['stderr']) - - sleep(60) - - lines = result['stdout'].split('\n') - lines.pop(0) # remove header - lines = [x.split(',') for x in lines if x != ''] - - jobs = {} - child_jobs = {} - for job_id, unique_id, state in lines: - # ensure unique_id is of type string for downstream use. - unique_id = str(unique_id) - jobs[unique_id] = state - - if unique_id != job_id: - child_jobs[unique_id] = job_id # job is a child job - - return jobs, child_jobs - while True: - jobs, child_jobs = query_slurm(job_ids) - - for jid in job_ids: - logging.debug("JOB %s: %s" % (jid, jobs[jid])) - if callback is not None: - callback(jid=jid, status=jobs[jid]) - - children = [x for x in child_jobs if child_jobs[x] == jid] - if len(children) == 0: - logging.debug("\tNO CHILDREN") - for cid in children: - logging.debug("\tCHILD JOB %s: %s" % (cid, jobs[cid])) - status = [jobs[x] in Job.slurm_status_not_running for x in job_ids] - - if set(status) == {True}: - # all jobs either completed successfully or terminated. + # Because query_slurm only returns state on the job-ids we specify, + # the wait process is a simple check to see whether any of the + # states are 'running' states or not. + jobs = self.query_slurm(job_ids) + + # jobs will be a dict of job-ids or array-ids for jobs that + # are array-jobs. the value of jobs[id] will be a state e.g.: + # 'RUNNING', 'FAILED', 'COMPLETED'. + states = [jobs[x] in Job.slurm_status_not_running for x in jobs] + + if set(states) == {True}: + # if all the states are either FAILED or COMPLETED + # then the set of those states no matter how many + # array-jobs there were will ultimately be the set of + # {True}. If not then that means there are still jobs + # that are running. 
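+                # e.g. (hypothetical values): jobs == {'1234567_0':
+                # 'COMPLETED', '1234567_1': 'FAILED'} gives states ==
+                # [True, True], so polling ends; any remaining 'RUNNING'
+                # entry contributes a False and the loop sleeps and polls
+                # squeue again.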
break + logging.debug(f"sleeping {Job.polling_interval_in_seconds} " + "seconds...") sleep(Job.polling_interval_in_seconds) return jobs @@ -366,18 +364,50 @@ def submit_job(self, script_path, job_parameters=None, # attributes. This method will return a dict w/job_ids as keys and # their job status as values. This must be munged before returning # to the user. - results = self.wait_on_job_ids([job_id], callback=callback) + results = Job.wait_on_job_ids([job_id], callback=callback) - job_result = {'job_id': job_id, 'job_state': results[job_id]} + if job_id in results: + # job is a non-array job + job_result = {'job_id': job_id, 'job_state': results[job_id]} + else: + # job is an array job + # assume all array jobs in this case will be associated w/job_id. + counts = Counter() + for array_id in results: + counts[results[array_id]] += 1 + + # for array jobs we won't be returning a string representing the + # state of a single job. Instead we're returning a dictionary of + # the number of unique states the set of array-jobs ended up in and + # the number for each one. + job_result = {'job_id': job_id, 'job_state': dict(counts)} if callback is not None: - callback(jid=job_id, status=job_result['job_state']) + if isinstance(job_result['job_state'], dict): + # this is an array job + states = [] + for key in counts: + states.append(f"{key}: {counts[key]}") + + callback(jid=job_id, status=", ".join(states)) + + else: + # this is a standard job + callback(jid=job_id, status=job_result['job_state']) - if job_result['job_state'] == 'COMPLETED': - return job_result + if isinstance(job_result['job_state'], dict): + states = list(job_result['job_state'].keys()) + if states == ['COMPLETED']: + return job_result + else: + raise JobFailedError(f"job {job_id} exited with jobs in the " + f"following states: {', '.join(states)}") else: - raise JobFailedError(f"job {job_id} exited with status " - f"{job_result['job_state']}") + if job_result['job_state'] == 'COMPLETED': + return job_result + else: + raise JobFailedError(f"job {job_id} exited with status " + f"{job_result['job_state']}") def _group_commands(self, cmds): # break list of commands into chunks of max_array_length (Typically diff --git a/sequence_processing_pipeline/scripts/fake_squeue.py b/sequence_processing_pipeline/scripts/fake_squeue.py new file mode 100755 index 00000000..6c8511ce --- /dev/null +++ b/sequence_processing_pipeline/scripts/fake_squeue.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +from json import load, dumps +from os.path import exists, join +from sys import argv +from random import randint, choice + + +def print_state(state): + # Note that %i will appear w/column name 'JOBID' in actual squeue output. + # this is because %i shows the array-id if it's an array job and what we + # consider the regular job-id if it's not an array job. 
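+    # Illustrative state (hypothetical values): given
+    #   {"1234567": {"job_id": "1234567",
+    #                "array_ids": {"1234567_0": 2, "1234567_1": 0},
+    #                "endgame": {"1234567_0": "COMPLETED",
+    #                            "1234567_1": "FAILED"}},
+    #    "1234568": {"job_id": "1234568", "countdown": 3,
+    #                "endgame": "COMPLETED"}}
+    # this prints "1234567_0,RUNNING", "1234567_1,FAILED" and
+    # "1234568,RUNNING" after the header.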
+ print("JOBID,STATE") + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job + for array_id in state[job_id]['array_ids']: + if state[job_id]['array_ids'][array_id] <= 0: + end_state = state[job_id]['endgame'][array_id] + else: + end_state = 'RUNNING' + + print(f"{array_id},{end_state}") + else: + # this is a non-array job + if state[job_id]['countdown'] <= 0: + end_state = state[job_id]['endgame'] + else: + end_state = 'RUNNING' + + print(f"{job_id},{end_state}") + + +def generate_output(job_ids): + results = {} + + for job_id in job_ids: + is_successful = choice([True, False]) + is_array_job = choice([True, False]) + + if is_array_job: + result = {'job_id': job_id} + result['array_ids'] = {} + result['endgame'] = {} + + for i in range(0, randint(5, 15)): + array_id = "%s_%d" % (job_id, i) + result['array_ids'][array_id] = randint(3, 7) + result['array_ids'][array_id] = randint(3, 7) + if is_successful: + # all array jobs must be successful + result['endgame'][array_id] = "COMPLETED" + else: + # some jobs may succeed but some may fail + result['endgame'][array_id] = choice( + ['COMPLETED', 'FAILED']) + results[job_id] = result + else: + result = {'job_id': job_id} + result['countdown'] = randint(3, 7) + result['endgame'] = choice(['COMPLETED', 'FAILED']) + results[job_id] = result + + return results + + +def save_state(state, file_path): + with open(file_path, 'w') as f: + print(dumps(state, indent=2), file=f) + + +def load_state(file_path): + with open(file_path, 'r') as f: + return load(f) + + +if __name__ == "__main__": + # "squeue -t all -j " f"{','.join(job_ids)} " "-o '%i,%T'" + job_ids = argv[4].split(',') + + state_file_path = join("sequence_processing_pipeline", "scripts", + "my_state.json") + + state = generate_output(job_ids) + + if exists(state_file_path): + state = load_state(state_file_path) + else: + state = generate_output(job_ids) + + print_state(state) + + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job. + for array_id in state[job_id]['array_ids']: + state[job_id]['array_ids'][array_id] -= 1 + else: + # this is a standard job. + state[job_id]['countdown'] -= 1 + + save_state(state, state_file_path) diff --git a/sequence_processing_pipeline/tests/test_Job.py b/sequence_processing_pipeline/tests/test_Job.py index 7aa5889a..e7d58d66 100644 --- a/sequence_processing_pipeline/tests/test_Job.py +++ b/sequence_processing_pipeline/tests/test_Job.py @@ -1,10 +1,10 @@ import unittest from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import PipelineError -from os.path import abspath, join, dirname -from os import makedirs +from os.path import abspath, join, dirname, split, isdir +from os import makedirs, chmod, remove from functools import partial -from shutil import rmtree +from shutil import rmtree, copyfile import re @@ -14,7 +14,10 @@ def setUp(self): def tearDown(self): for some_path in self.remove_these: - rmtree(some_path) + if isdir(some_path): + rmtree(some_path) + else: + remove(some_path) def test_system_call(self): package_root = abspath('./sequence_processing_pipeline') @@ -123,6 +126,168 @@ def test_extract_project_names_from_fastq_dir(self): obs = job.extract_project_names_from_fastq_dir(tmp) self.assertEqual(obs, ['NPH_15288']) + def test_query_slurm(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. 
+ # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567', '1234568', '1234569', '1234570'] + jobs = job.query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. + aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_query_slurm_single_job(self): + # perform test_query_slurm() but with a single job only. + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. + # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. 
+ copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567'] + jobs = job.query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. + aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_wait_on_job_ids(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1', '2', '3', '4'] + + # to shorten the test time, set polling_interval_in_seconds to be + # lower than one minute. + Job.polling_interval_in_seconds = 10 + results = job.wait_on_job_ids(job_ids) + + # calling query_slurm one more time after wait_on_job_ids() is called + # will technically advance the counter one more, which means that this + # doesn't confirm that wait_on_job_ids() doesn't return before EVERY + # single job is either COMPLETED or FAILED. However it does confirm + # that wait_on_job_ids() doesn't return once the FIRST completed array + # job is either COMPLETED or FAILED while others are still RUNNING. + # This was previously an issue. + obs = job.query_slurm(job_ids) + + for array_id in obs: + state = obs[array_id] + # w/out relying on states defined in Job, simply confirm all are + # either COMPLETED or FAILED. + self.assertIn(state, ['COMPLETED', 'FAILED']) + + # since wait_on_job_ids() now returns the same data structure as + # query_slurm(), they should be equal. 
+ self.assertDictEqual(obs, results) + if __name__ == '__main__': unittest.main() From 84edad5be0b48a92ea1f67c308310fbb477a98b0 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 18:03:18 -0800 Subject: [PATCH 33/47] Updates --- README.rst => README.md | 2 +- sequence_processing_pipeline/Commands.py | 2 -- sequence_processing_pipeline/FastQCJob.py | 2 +- sequence_processing_pipeline/GenPrepFileJob.py | 2 +- sequence_processing_pipeline/Job.py | 9 +++++---- sequence_processing_pipeline/NuQCJob.py | 16 ++++++++-------- sequence_processing_pipeline/TRIntegrateJob.py | 13 ++++++++++++- sequence_processing_pipeline/TellReadJob.py | 13 ++++++++++++- .../templates/cloudspades-isolate.sbatch | 4 ++-- .../templates/cloudspades.sbatch | 4 ++-- ...pute_sequence_counts_for_normalization.sbatch | 4 ++-- .../templates/integrate.sbatch | 1 + .../templates/telllink-isolate.sbatch | 4 ++-- .../templates/telllink.sbatch | 4 ++-- .../templates/tellread-cleanup.sbatch | 4 ++-- sequence_processing_pipeline/tests/test_Job.py | 6 +++--- .../tests/test_commands.py | 11 ----------- 17 files changed, 56 insertions(+), 45 deletions(-) rename README.rst => README.md (91%) diff --git a/README.rst b/README.md similarity index 91% rename from README.rst rename to README.md index 190ebba4..d9ef9b6c 100644 --- a/README.rst +++ b/README.md @@ -14,7 +14,7 @@ git clone https://github.com/biocore/mg-scripts.git Create a Python3 Conda environment in which to run the notebook: ```bash -conda create -n sp_pipeline 'python==3.9' numpy pandas click scipy matplotlib fastq-pair +conda create --yes -n spp python=${{ matrix.python-version }} scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair ``` Activate the Conda environment: diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index 642e49cf..130ac28d 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -115,8 +115,6 @@ def demux(id_map, fp, out_d, task, maxtask): qual = iter(fp) for i, s, d, q in zip(id_, seq, dumb, qual): - # NB: This appears to not be causing the removal of the metadata - # either. fname_encoded, id_ = i.split(delimiter, 1) if fname_encoded not in openfps: diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 889ef75d..5e0bf4fc 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -255,7 +255,7 @@ def run(self, callback=None): cmd = ' '.join(cmd_head + input_path_list + cmd_tail) - results = Job._system_call(cmd, callback=callback) + results = self._system_call(cmd, callback=callback) if results['return_code'] != 0: raise PipelineError("multiqc encountered an error") diff --git a/sequence_processing_pipeline/GenPrepFileJob.py b/sequence_processing_pipeline/GenPrepFileJob.py index 0bb2c52c..49e8f651 100644 --- a/sequence_processing_pipeline/GenPrepFileJob.py +++ b/sequence_processing_pipeline/GenPrepFileJob.py @@ -159,7 +159,7 @@ def run(self, callback=None): # currently that is how it's done. Hence, self.output_directory # and the path to run_dir might be different locations than the # others. 
- res = Job._system_call(' '.join(command), callback=callback) + res = self._system_call(' '.join(command), callback=callback) if res['return_code'] != 0: raise PipelineError("Seqpro encountered an error") diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 7a771908..55f287db 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -54,6 +54,7 @@ class Job: slurm_status_running) polling_interval_in_seconds = 60 + squeue_retry_in_seconds = 10 def __init__(self, root_dir, output_path, job_name, executable_paths, max_array_length, modules_to_load=None): @@ -234,7 +235,7 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} - def query_slurm(self, job_ids): + def _query_slurm(self, job_ids): # query_slurm encapsulates the handling of squeue. count = 0 while True: @@ -255,7 +256,7 @@ def query_slurm(self, job_ids): if count > 3: raise ExecFailedError(result['stderr']) - sleep(60) + sleep(Job.squeue_retry_in_seconds) lines = result['stdout'].split('\n') lines.pop(0) # remove header @@ -290,7 +291,7 @@ def wait_on_job_ids(self, job_ids, callback=None): # Because query_slurm only returns state on the job-ids we specify, # the wait process is a simple check to see whether any of the # states are 'running' states or not. - jobs = self.query_slurm(job_ids) + jobs = self._query_slurm(job_ids) # jobs will be a dict of job-ids or array-ids for jobs that # are array-jobs. the value of jobs[id] will be a state e.g.: @@ -364,7 +365,7 @@ def submit_job(self, script_path, job_parameters=None, # attributes. This method will return a dict w/job_ids as keys and # their job status as values. This must be munged before returning # to the user. 
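+        # e.g. (hypothetical ids): results may come back as
+        #   {'1234567': 'COMPLETED'} for a plain job, or as
+        #   {'1234567_0': 'COMPLETED', '1234567_1': 'FAILED'} for an array
+        # job, which is tallied below into {'COMPLETED': 1, 'FAILED': 1}.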
- results = Job.wait_on_job_ids([job_id], callback=callback) + results = self.wait_on_job_ids([job_id], callback=callback) if job_id in results: # job is a non-array job diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index c0c51897..83bdf551 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -10,7 +10,7 @@ from sequence_processing_pipeline.Commands import split_similar_size_bins from sequence_processing_pipeline.util import iter_paired_files from jinja2 import Environment -import glob +from glob import glob import re from sys import executable @@ -150,7 +150,7 @@ def _filter_empty_fastq_files(self, filtered_directory, ''' empty_list = [] - files = glob.glob(join(filtered_directory, f'*.{self.suffix}')) + files = glob(join(filtered_directory, f'*.{self.suffix}')) for r1, r2 in iter_paired_files(files): full_path = join(filtered_directory, r1) @@ -214,7 +214,7 @@ def _move_trimmed_files(self, project_name, output_path): sample_ids = [x[0] for x in self.sample_ids if x[1] == project_name] - for trimmed_file in list(glob.glob(pattern)): + for trimmed_file in list(glob(pattern)): file_name = split(trimmed_file)[1] substr = self.interleave_fastq_regex.search(file_name) if substr is not None: @@ -274,7 +274,7 @@ def run(self, callback=None): needs_human_filtering = project['HumanFiltering'] source_dir = join(self.output_path, project_name) pattern = f"{source_dir}/*.fastq.gz" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) # if the 'only-adapter-filtered' directory exists, move the files # into a unique location so that files from multiple projects @@ -319,7 +319,7 @@ def run(self, callback=None): # move all html files underneath the subdirectory for this project. pattern = f"{old_html_path}/*.html" - completed_htmls = list(glob.glob(pattern)) + completed_htmls = list(glob(pattern)) self._move_helper(completed_htmls, # Tissue_1_Super_Trizol_S19_L001_R1_001.html self.html_regex, @@ -328,7 +328,7 @@ def run(self, callback=None): # move all json files underneath the subdirectory for this project. pattern = f"{old_json_path}/*.json" - completed_jsons = list(glob.glob(pattern)) + completed_jsons = list(glob(pattern)) self._move_helper(completed_jsons, # Tissue_1_Super_Trizol_S19_L001_R1_001.json self.json_regex, @@ -346,7 +346,7 @@ def _confirm_job_completed(self): # since NuQCJob processes across all projects in a run, there isn't # a need to iterate by project_name and job_id. 
pattern = f"{self.output_path}/hds-{self.qiita_job_id}.*.completed" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) if completed_files: return True @@ -503,7 +503,7 @@ def _generate_job_script(self, max_bucket_size): def parse_logs(self): log_path = join(self.output_path, 'logs') # sorted lists give predictable results - files = sorted(glob.glob(join(log_path, '*.out'))) + files = sorted(glob(join(log_path, '*.out'))) msgs = [] for some_file in files: diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 875a1988..9bb36a86 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -8,6 +8,7 @@ from metapool import load_sample_sheet from os import makedirs from shutil import copyfile +from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -163,4 +164,14 @@ def _generate_job_script(self): return job_script_path def parse_logs(self): - raise PipelineError("parse_logs() not implemented for TRIntegrateJob") + log_path = join(self.output_path, 'logs') + # sorted lists give predictable results + files = sorted(glob(join(log_path, '*.out'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if 'error:' in line.lower()] + + return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 3b3bf314..5be1cbd0 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -6,6 +6,7 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet +from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -178,4 +179,14 @@ def _generate_job_script(self): return job_script_path def parse_logs(self): - raise PipelineError("parse_logs() not implemented for TellReadJob") + log_path = join(self.output_path, 'logs') + # sorted lists give predictable results + files = sorted(glob(join(log_path, '*.out'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if 'error:' in line.lower()] + + return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch index 1ac51b2e..96426613 100644 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch @@ -6,8 +6,8 @@ #SBATCH -c {{cores_per_task}} #SBATCH -p {{queue_name}} -#SBATCH --output cloudspades-isolate_%x-%A_%a.out -#SBATCH --error cloudspades-isolate_%x-%A_%a.err +#SBATCH --output {{output}}/logs/cloudspades-isolate_%x-%A_%a.out +#SBATCH --error {{output}}/logs/cloudspades-isolate_%x-%A_%a.err source activate qiime2-2023.5 diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch index 72efb140..7a658892 100644 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ b/sequence_processing_pipeline/templates/cloudspades.sbatch @@ -6,8 +6,8 @@ #SBATCH -c {{cores_per_task}} # 12 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output cloudspades_%x-%A_%a.out -#SBATCH --error cloudspades_%x-%A_%a.err +#SBATCH --output {{output}}/logs/cloudspades_%x-%A_%a.out +#SBATCH --error 
{{output}}/logs/cloudspades_%x-%A_%a.err source activate qiime2-2023.5 diff --git a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch index ab8af109..9414fd4c 100644 --- a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch +++ b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch @@ -6,8 +6,8 @@ #SBATCH -c {{cores_per_task}} # 1 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output compute_sequence_counts_%x-%A_%a.out -#SBATCH --error compute_sequence_counts_%x-%A_%a.err +#SBATCH --output {{output}}/logs/compute_sequence_counts_%x-%A_%a.out +#SBATCH --error {{output}}/logs/compute_sequence_counts_%x-%A_%a.err # NB: output appears normal w/out. # source activate qiime2-2023.5 diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch index 92dcfe87..68ebce5e 100644 --- a/sequence_processing_pipeline/templates/integrate.sbatch +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -6,6 +6,7 @@ #SBATCH -c {{cores_per_task}} #SBATCH -p {{queue_name}} #SBATCH --array=1-{{barcode_id_count}} + #SBATCH --output {{output_dir}}/logs/integrate_%x_%A_%a.out #SBATCH --error {{output_dir}}/logs/integrate_%x_%A_%a.err diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch index 90e04012..eab0b380 100644 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ b/sequence_processing_pipeline/templates/telllink-isolate.sbatch @@ -6,8 +6,8 @@ #SBATCH --time {{wall_time_limit}} # 96:00:00 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output telllink-isolate_%x-%A_%a.out -#SBATCH --error telllink-isolate_%x-%A_%a.err +#SBATCH --output {{output}}/logs/telllink-isolate_%x-%A_%a.out +#SBATCH --error {{output}}/logs/telllink-isolate_%x-%A_%a.err set -x set -e diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch index efdf0578..16be25a4 100644 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ b/sequence_processing_pipeline/templates/telllink.sbatch @@ -6,8 +6,8 @@ #SBATCH --time {{wall_time_limit}} # 96:00:00 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output telllink_%x-%A_%a.out -#SBATCH --error telllink_%x-%A_%a.err +#SBATCH --output {{output}}/logs/telllink_%x-%A_%a.out +#SBATCH --error {{output}}/logs/telllink_%x-%A_%a.err set -x set -e diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch index e5b0873e..3c31219d 100644 --- a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -6,8 +6,8 @@ #SBATCH -c {{cores_per_task}} # 1 #SBATCH -p {{queue_name}} # qiita -#SBATCH --output tellread-cleanup_%x-%A.out -#SBATCH --error tellread-cleanup_%x-%A.err +#SBATCH --output {{output}}/logs/cleanup_%x-%A.out +#SBATCH --error {{output}}/logs/cleanup_%x-%A.err # remove unused large outputs rm -rf {{OUTPUT}}/biosample_format {{OUTPUT}}/1_demult {{OUTPUT}}/Full diff --git a/sequence_processing_pipeline/tests/test_Job.py b/sequence_processing_pipeline/tests/test_Job.py index e7d58d66..192709b6 100644 --- a/sequence_processing_pipeline/tests/test_Job.py +++ b/sequence_processing_pipeline/tests/test_Job.py 
@@ -159,7 +159,7 @@ def test_query_slurm(self): 'my_state.json')) job_ids = ['1234567', '1234568', '1234569', '1234570'] - jobs = job.query_slurm(job_ids) + jobs = job._query_slurm(job_ids) # jobs is a dictionary of unique array_ids and/or job-ids for non- # array jobs. The faked squeue reports anywhere between five and @@ -219,7 +219,7 @@ def test_query_slurm_single_job(self): 'my_state.json')) job_ids = ['1234567'] - jobs = job.query_slurm(job_ids) + jobs = job._query_slurm(job_ids) # jobs is a dictionary of unique array_ids and/or job-ids for non- # array jobs. The faked squeue reports anywhere between five and @@ -276,7 +276,7 @@ def test_wait_on_job_ids(self): # that wait_on_job_ids() doesn't return once the FIRST completed array # job is either COMPLETED or FAILED while others are still RUNNING. # This was previously an issue. - obs = job.query_slurm(job_ids) + obs = job._query_slurm(job_ids) for array_id in obs: state = obs[array_id] diff --git a/sequence_processing_pipeline/tests/test_commands.py b/sequence_processing_pipeline/tests/test_commands.py index 3919ef43..4e0d0491 100644 --- a/sequence_processing_pipeline/tests/test_commands.py +++ b/sequence_processing_pipeline/tests/test_commands.py @@ -59,10 +59,6 @@ def test_demux(self): '@2::MUX::bing/2', 'ATGC', '+', '!!!!', '']) infile = io.StringIO(infile_data) - exp_data_r1 = '\n'.join(['@baz/1', 'ATGC', '+', '!!!!', - '@bing/1', 'ATGC', '+', '!!!!', '']) - exp_data_r2 = '\n'.join(['@baz/2', 'ATGC', '+', '!!!!', - '@bing/2', 'ATGC', '+', '!!!!', '']) exp_data_r1 = ['@baz/1', 'ATGC', '+', '!!!!', '@bing/1', 'ATGC', '+', '!!!!'] @@ -74,13 +70,6 @@ def test_demux(self): demux(id_map, infile, tmp, task, maxtask) - foo = join(tmp, 'Project_12345') - from os import walk - for root, dirs, files in walk(foo): - for _file in files: - _path = join(root, _file) - print(_path) - obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1_001.fastq.gz'), 'rt').read() obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2_001.fastq.gz'), From 8691147e7803a0e7a59f5ae9a68b8b857f8abb7a Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 19:00:59 -0800 Subject: [PATCH 34/47] Updates --- .../TRIntegrateJob.py | 10 +- .../tests/data/fake_sample_index_list.txt | 96 +++++++++++++++++++ .../tellread_output/integrate_test.sbatch | 96 ------------------- .../data/tellseq_output/integrate_test.sbatch | 67 +++++++++++++ .../tellread_test.sbatch | 0 .../tests/test_TRIntegrateJob.py | 72 ++++++++++++++ .../tests/test_TellReadJob.py | 5 +- 7 files changed, 241 insertions(+), 105 deletions(-) create mode 100644 sequence_processing_pipeline/tests/data/fake_sample_index_list.txt delete mode 100644 sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch create mode 100644 sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch rename sequence_processing_pipeline/tests/data/{tellread_output => tellseq_output}/tellread_test.sbatch (100%) create mode 100644 sequence_processing_pipeline/tests/test_TRIntegrateJob.py diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 9bb36a86..6994f2ad 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -17,9 +17,8 @@ class TRIntegrateJob(Job): def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, integrate_script_path, - sil_path, raw_fastq_dir, 
reference_base, reference_map, - cores_per_task): + qiita_job_id, integrate_script_path, sil_path, raw_fastq_dir, + reference_base, reference_map, cores_per_task): """ ConvertJob provides a convenient way to run bcl-convert or bcl2fastq on a directory BCL files to generate Fastq files. @@ -32,7 +31,6 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, :param jmem: String representing total memory limit for entire job. :param modules_to_load: A list of Linux module names to load :param qiita_job_id: identify Torque jobs using qiita_job_id - :param max_array_length: None :param integrate_script_path: None :param sil_path: A path to a confidential file mapping C5xx, adapters. :param reference_base: None @@ -43,7 +41,9 @@ def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, output_path, 'TRIntegrateJob', [], - max_array_length, + # max_array_length and self.max_array_length are + # not used by TRIntegrateJob. + -1, modules_to_load=modules_to_load) self.sample_sheet_path = sample_sheet_path diff --git a/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt new file mode 100644 index 00000000..1c4345ef --- /dev/null +++ b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt @@ -0,0 +1,96 @@ +CCCCCACCAA C501 NONE PE +AACCCCCACA C502 NONE PE +CCAACACACC C503 NONE PE +AACACCCCCA C504 NONE PE +CAAAACCCCC C505 NONE PE +ACACACCACC C506 NONE PE +AACCCACACC C507 NONE PE +CAAAAAAAAA C508 NONE PE +AAACCACCCC C509 NONE PE +ACACCCCCCC C510 NONE PE +AAACACCACA C511 NONE PE +CAAAACCCCA C512 NONE PE +ACCCCAACCC C513 NONE PE +CAACCAACAC C514 NONE PE +CCCCCACCCA C515 NONE PE +CCAACCCCCA C516 NONE PE +CAAAACACCC C517 NONE PE +ACACACCAAA C518 NONE PE +CCACCAAAAA C519 NONE PE +AAACCCCCCC C520 NONE PE +AACAACCCCA C521 NONE PE +CAACAACAAC C522 NONE PE +CACCACCAAA C523 NONE PE +CACAACAAAC C524 NONE PE +AACACCCACC C525 NONE PE +CAAAACACAA C526 NONE PE +AAAAAAAAAA C527 NONE PE +CCAACCCCCA C528 NONE PE +CAACCCCAAA C529 NONE PE +ACCCAACCCA C530 NONE PE +CACACCAAAC C531 NONE PE +CAACAAAAAC C532 NONE PE +CCAAAAAAAC C533 NONE PE +ACCAACACAC C534 NONE PE +CCAAAACACC C535 NONE PE +CACCCAAACC C536 NONE PE +CAAACCAAAC C537 NONE PE +CACAAACACA C538 NONE PE +ACCCACCCCC C539 NONE PE +AACCACACAC C540 NONE PE +CACCCCCACA C541 NONE PE +CACAACCACC C542 NONE PE +AAAAACAAAA C543 NONE PE +CCACACAAAC C544 NONE PE +AAACAAACAC C545 NONE PE +ACCCCCACCC C546 NONE PE +ACACCCCAAA C547 NONE PE +CAAACCAAAC C548 NONE PE +AAACCACCAA C549 NONE PE +CAACCAAAAC C550 NONE PE +ACACCCCCCC C551 NONE PE +CACCCACCAC C552 NONE PE +ACCCCAACCC C553 NONE PE +AAACCCAACA C554 NONE PE +ACCACACAAA C555 NONE PE +ACCCACCACC C556 NONE PE +CCCCAACCAA C557 NONE PE +CAAAAACACC C558 NONE PE +ACCACAAAAC C559 NONE PE +ACCCCCCCAA C560 NONE PE +CCACAAAACA C561 NONE PE +CAAACCCACC C562 NONE PE +ACACCACAAA C563 NONE PE +ACCAACAAAA C564 NONE PE +CCCAACAAAA C565 NONE PE +CACCCCCCCA C566 NONE PE +AAACCCACCA C567 NONE PE +CACACCACAA C568 NONE PE +CCAAACCCCA C569 NONE PE +CACCCCACCC C570 NONE PE +AAACCCCCAA C571 NONE PE +ACACCACACC C572 NONE PE +ACAAAACACC C573 NONE PE +CACAAACCAC C574 NONE PE +ACCCCCACAA C575 NONE PE +CCCCAAACCC C576 NONE PE +AAAACACAAC C577 NONE PE +AACCCAACCA C578 NONE PE +AAACACACAC C579 NONE PE +AAACACCACC C580 NONE PE +AACCCCAACA C581 NONE PE +CCACCCAAAC C582 NONE PE +CCAAAACAAC C583 NONE PE +ACCAACAAAC C584 NONE PE +AAACACCACC C585 NONE PE +AACCACACAC C586 NONE PE +CACAACAAAA C587 NONE PE +AACCAAAAAC C588 NONE PE +ACCAAACCAA 
C589 NONE PE +ACAAACACAC C590 NONE PE +ACCACACCAA C591 NONE PE +AAAAACAACC C592 NONE PE +CACACAACAC C593 NONE PE +CCCCCAACCC C594 NONE PE +ACACAAAACC C595 NONE PE +CCACCACACC C596 NONE PE diff --git a/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch b/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch deleted file mode 100644 index 3cdc891f..00000000 --- a/sequence_processing_pipeline/tests/data/tellread_output/integrate_test.sbatch +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash -l -#SBATCH -J integrate # integrate -#SBATCH --time 96:00:00 # 24:00:00 -#SBATCH --mem 16G # 8G -#SBATCH -N 1 # 1 -#SBATCH -c 4 # 1 -#SBATCH -p qiita # qiita - -#SBATCH --output integrate_%x-%A_%a.out -#SBATCH --error integrate_%x-%A_%a.err - -# NB SLURM_ARRAY_TASK_ID is exported by Slurm -if [[ -z ${SLURM_ARRAY_TASK_ID} ]]; then - echo "Not operating in an array" - exit 1 -fi - -# NB SLURM_ARRAY_TASK_MIN is exported by Slurm -if [[ ${SLURM_ARRAY_TASK_MIN} -eq 0 ]]; then - echo "Line extraction assumes 1-based index" - exit 1 -fi - -set -x -set -e -set -o pipefail - -samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list_output.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -# NB TMPDIR IS CREATED IN CURRENT DIRECTORY. CURRENT DIRECTORY MUST BE CORRECT. -export TMPDIR=$(mktemp -d) -function cleanup { - echo "Removing $TMPDIR" - rm -r $TMPDIR - unset TMPDIR -} -trap cleanup EXIT - -files=${TMPDIR}/integration.files -/bin/ls -1 sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/Full/*corrected.err_barcode_removed.fastq > ${files} -mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated - -if [[ $(grep -c "_R1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R1" - exit 1 -fi - -if [[ $(grep -c "_R2_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} R2" - exit 1 -fi - -if [[ $(grep -c "_I1_${sample}" ${files}) -ne 1 ]]; then - echo "Multiple matches for ${sample} I1" - exit 1 -fi - -r1=$(grep -m 1 "_R1_${sample}" ${files}) -r2=$(grep -m 1 "_R2_${sample}" ${files}) -i1=$(grep -m 1 "_I1_${sample}" ${files}) -r1out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R1.fastq.gz -r2out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R2.fastq.gz -i1out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.I1.fastq.gz - -if [[ ! 
-s ${r1} ]]; then - echo "${r1} is empty, cannot integrate" - if [[ -s ${r2} ]]; then - echo "R1 and R2 are inconsistent" - exit 1 - fi - if [[ -s ${i1} ]]; then - echo "R1 and I1 are inconsistent" - exit 1 - fi - - # reflect the empties so Qiita can know of them - touch ${r1out} - touch ${r2out} - touch ${i1out} - exit 0 -fi - -# this can probably be backgrounded but then you have to get creative to -# not mask a nonzero exit status (e.g., the python process raising) -cat ${i1} | gzip > ${i1out} - -conda activate qp-knight-lab-processing-2022.03 -python hello integrate \ - --no-sort \ - --r1-in ${r1} \ - --r2-in ${r2} \ - --i1-in ${i1} \ - --r1-out ${r1out} \ - --r2-out ${r2out} \ - --threads ${SLURM_CPUS_PER_TASK} \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch new file mode 100644 index 00000000..f7a53198 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch @@ -0,0 +1,67 @@ +#!/bin/bash -l +#SBATCH -J integrate +#SBATCH --time 96:00:00 +#SBATCH --mem 16G +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH -p qiita +#SBATCH --array=1-96 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.err + +set -x +set -e + +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp + +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "Processing sample ${sample}..." + +# make temp directory +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp +mkdir -p $TMPDIR + + +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. + +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. 
+# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq + +# create output directory +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated + +# generate output file names +r1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R1.fastq.gz +r2_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R2.fastq.gz +i1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.I1.fastq.gz + +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} + +# generate integrated R1 and R2 fastq.gz files. +conda activate qp-knight-lab-processing-2022.03 + +python sequence_processing_pipeline/contrib/integrate-indices-np.py integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads 4 \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch similarity index 100% rename from sequence_processing_pipeline/tests/data/tellread_output/tellread_test.sbatch rename to sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch diff --git a/sequence_processing_pipeline/tests/test_TRIntegrateJob.py b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py new file mode 100644 index 00000000..c01cb59f --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py @@ -0,0 +1,72 @@ +from os.path import join +from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob +from functools import partial +import unittest + + +class TestTRIntegrateJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TRIntegrateJob', 'integrate_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. 
+ self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TRIntegrateJob", "output", + "tmp1") + # reflects location of script on host. + self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + self.integrate_script_path = join(package_root, "contrib", + "integrate-indices-np.py") + self.sil_path = self.path('data', 'fake_sample_index_list.txt') + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + + def test_creation(self): + # test basic good-path + job = TRIntegrateJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.integrate_script_path, + self.sil_path, self.raw_fastq_dir, + self.reference_base, self.reference_map, + self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py index 440192c8..4c6a75c2 100644 --- a/sequence_processing_pipeline/tests/test_TellReadJob.py +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -11,18 +11,15 @@ def setUp(self): # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', 'TellReadJob', 'tellread_test.sbatch') - self.exp = self.path('data', 'tellread_output', 'tellread_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'tellread_test.sbatch') # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already # exists. - # TODO: Revisit w/a new directory named as expected for a - # TellSeq-produced run-directory. self.run_dir = self.path('data', 'sample_run_directories', '150629_SN1001_0511_AH5L7GBCXX') self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') - # TODO: Revisit w/a proper sample-sheet once spec is near finalized. 
self.sample_sheet_path = self.path('data', 'tellseq_metag_dummy_sample_' 'sheet.csv') From 487fc0cb25f7c4d672bbfe19405ad41452931fe7 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 19:07:11 -0800 Subject: [PATCH 35/47] flake8 --- .../tests/test_TRIntegrateJob.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sequence_processing_pipeline/tests/test_TRIntegrateJob.py b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py index c01cb59f..17ded346 100644 --- a/sequence_processing_pipeline/tests/test_TRIntegrateJob.py +++ b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py @@ -48,13 +48,13 @@ def setUp(self): def test_creation(self): # test basic good-path job = TRIntegrateJob(self.run_dir, self.output_path, - self.sample_sheet_path, self.queue_name, - self.node_count, self.wall_time_limit, - self.jmem, self.modules_to_load, self.qiita_job_id, - self.integrate_script_path, - self.sil_path, self.raw_fastq_dir, - self.reference_base, self.reference_map, - self.cores_per_task) + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, + self.qiita_job_id, self.integrate_script_path, + self.sil_path, self.raw_fastq_dir, + self.reference_base, self.reference_map, + self.cores_per_task) job._generate_job_script() From 4276fc71f4a6b7df9964a4f40370eab6805b7d93 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 19:20:13 -0800 Subject: [PATCH 36/47] flake8 post merger --- sequence_processing_pipeline/Pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index f81a0bea..2867fa2f 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -144,7 +144,7 @@ class Pipeline: AMPLICON_ATYPE = 'TruSeq HT' assay_types = [AMPLICON_ATYPE, METAGENOMIC_ATYPE, METATRANSCRIPTOMIC_ATYPE] - + @staticmethod def make_sif_fname(run_id, full_project_name): # TODO: the problem with this structure is that there's no clear way From 77c10b91e9eeade9171202d72d2edd6ee31cfc78 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 20 Nov 2024 21:31:29 -0800 Subject: [PATCH 37/47] Fixed older test --- sequence_processing_pipeline/Pipeline.py | 24 +++++++---- .../tests/test_Pipeline.py | 42 +++++++------------ 2 files changed, 32 insertions(+), 34 deletions(-) diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 2867fa2f..2be36fe7 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -804,8 +804,6 @@ def _parse_project_name(self, project_name, short_names): return proj_info[PROJECT_SHORT_NAME_KEY], proj_info[QIITA_ID_KEY] def get_project_info(self, short_names=False): - # test for self.mapping_file, since self.sample_sheet will be - # defined in both cases. 
results = [] if self.pipeline_type == Pipeline.AMPLICON_PTYPE: @@ -820,25 +818,35 @@ def get_project_info(self, short_names=False): {p: parse_project_name(p) for p in sample_project_map} else: projects_info = self.sample_sheet.get_projects_details() - # endif mapping_file if short_names: proj_name_key = PROJECT_SHORT_NAME_KEY else: proj_name_key = PROJECT_FULL_NAME_KEY - # endif + for curr_project_info in projects_info.values(): curr_dict = { _PROJECT_NAME_KEY: curr_project_info[proj_name_key], QIITA_ID_KEY: curr_project_info[QIITA_ID_KEY] } - if contains_replicates is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: + # this is a mapping file: curr_contains_reps = contains_replicates else: - curr_contains_reps = \ - curr_project_info.get(CONTAINS_REPLICATES_KEY, False) - # endif + bi_df = self.sample_sheet.Bioinformatics + if CONTAINS_REPLICATES_KEY in bi_df.columns.tolist(): + # subselect rows in [Bioinformatics] based on whether they + # match the project name. + df = bi_df.loc[bi_df['Sample_Project'] == + curr_project_info[proj_name_key]] + # since only one project can match by definition, convert + # to dict and extract the needed value. + curr_contains_reps = df.iloc[0].to_dict()[ + CONTAINS_REPLICATES_KEY] + else: + curr_contains_reps = False + curr_dict[CONTAINS_REPLICATES_KEY] = curr_contains_reps results.append(curr_dict) # next project diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index c1a9c08e..0af31439 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -531,35 +531,24 @@ def test_generate_sample_information_files_with_additional_meta(self): # get the path for the NYU_BMS_Melanoma dataset. sif_path = [x for x in sif_path if 'NYU_BMS_Melanoma' in x][0] - # we expect one more BLANK than before. - exp_lines = 34 - - exp_first_line = ('BLANK1.1A\t2021-10-21\t193\t' - 'Control\tNegative\tSterile w' - 'ater blank\turban biome\tres' - 'earch facility\tsterile wate' - 'r\tmisc environment\tUSA:CA:' - 'San Diego\tBLANK1.1A\t32.5\t' - '-117.25\tcontrol blank\tmeta' - 'genome\t256318\tBLANK1.1A\tN' - 'YU_BMS_Melanoma\tTRUE\t' - 'UCSD\tFALSE') - - # the new last sample should be BLANK999.999A. - exp_last_line = ('BLANK999.999A\t2021-10-21\t193\t' - 'Control\tNegative\tSterile w' - 'ater blank\turban biome\tres' - 'earch facility\tsterile wate' - 'r\tmisc environment\tUSA:CA:' - 'San Diego\tBLANK999.999A\t32.5\t' - '-117.25\tcontrol blank\tmeta' - 'genome\t256318\tBLANK999.999A\tN' - 'YU_BMS_Melanoma\tTRUE\t' - 'UCSD\tFALSE') + exp_first_line = ("BLANK1.1A\t2021-10-21\t193\t" + "Control\tNegative\tSterile water blank\t" + "Sterile water blank\turban biome\t" + "research facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\t" + "BLANK1.1A\t32.5\t-117.25\tcontrol blank\t" + "metagenome\t256318\tBLANK1.1A\t" + "NYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") + + exp_last_line = ("BLANK4.4H\t2021-10-21\t193\tControl\tNegative\t" + "Sterile water blank\tSterile water blank\t" + "urban biome\tresearch facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\tBLANK4.4H\t" + "32.5\t-117.25\tcontrol blank\tmetagenome\t256318\t" + "BLANK4.4H\tNYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") with open(sif_path, 'r') as f: obs_lines = f.readlines() - self.assertEqual(len(obs_lines), exp_lines) # confirm that each file contains the expected header. 
header = obs_lines[0].strip() @@ -574,6 +563,7 @@ def test_generate_sample_information_files_with_additional_meta(self): # confirm that the last line of each file is as expected. obs = obs_lines[-1].strip() exp = exp_last_line + self.assertEqual(obs, exp) def test_get_sample_ids(self): From a67a8a8a181a03bc2953006b47e5e0293ebbf3bb Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sat, 23 Nov 2024 18:12:53 -0800 Subject: [PATCH 38/47] Minor update --- sequence_processing_pipeline/ConvertJob.py | 8 ++++- sequence_processing_pipeline/Pipeline.py | 38 ++++++++++++++-------- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/sequence_processing_pipeline/ConvertJob.py b/sequence_processing_pipeline/ConvertJob.py index 122a4987..17b8c3b3 100644 --- a/sequence_processing_pipeline/ConvertJob.py +++ b/sequence_processing_pipeline/ConvertJob.py @@ -156,7 +156,13 @@ def run(self, callback=None): exec_from=self.log_path, callback=callback) - self.copy_controls_between_projects() + # ConvertJob() is used to process Amplicon as well as Meta*Omic + # runs. Amplicon runs use a dummy sample-sheet generated by + # Pipeline(). For these types of sheets we can't copy controls + # between projects because demuxing is not performed here. + _, sheet_name = split(self.sample_sheet_path) + if sheet_name != 'dummy_sample_sheet.csv': + self.copy_controls_between_projects() except JobFailedError as e: # When a job has failed, parse the logs generated by this specific diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 2be36fe7..9a30c2a0 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -230,6 +230,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, self.run_id = run_id self.qiita_job_id = qiita_job_id self.pipeline = [] + self.assay_type = None # this method will catch a run directory as well as its products # directory, which also has the same name. Hence, return the @@ -239,6 +240,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, if pipeline_type == Pipeline.AMPLICON_PTYPE: self.search_paths = self.configuration['amplicon_search_paths'] + self.assay_type = Pipeline.AMPLICON_ATYPE else: self.search_paths = self.configuration['search_paths'] @@ -289,7 +291,7 @@ def __init__(self, configuration_file_path, run_id, input_file_path, # create dummy sample-sheet output_fp = join(output_path, 'dummy_sample_sheet.csv') self.generate_dummy_sample_sheet(self.run_dir, output_fp) - self.sample_sheet = output_fp + self.dummy_sheet_path = output_fp # Optional lane_number parameter is ignored for Amplicon # runs, as the only valid value is 1. @@ -311,8 +313,26 @@ def __init__(self, configuration_file_path, run_id, input_file_path, self.sample_sheet = self._validate_sample_sheet(input_file_path) self.mapping_file = None + if self.assay_type is None: + # set self.assay_type for non-amplicon types. + assay_type = self.sample_sheet.Header['Assay'] + if assay_type not in Pipeline.assay_types: + raise ValueError(f"'{assay_type} is not a valid Assay type") + self.assay_type = assay_type + self._configure_profile() + def get_sample_sheet_path(self): + """ + Returns path to a sample-sheet or dummy sample-sheet for amplicon runs. + """ + if self.assay_type == Pipeline.AMPLICON_ATYPE: + # assume self.dummy_sheet_path has been created for amplicon runs. + return self.dummy_sheet_path + else: + # assume input_file_path is a sample-sheet for non-amplicon runs. 
+ return self.input_file_path + def get_software_configuration(self, software): if software is None or software == "": raise ValueError(f"'{software}' is not a valid value") @@ -366,15 +386,6 @@ def _configure_profile(self): # from self.sample_sheet (or self.mapping_file). instr_type = InstrumentUtils.get_instrument_type(self.run_dir) - if isinstance(self.sample_sheet, str): - # if self.sample_sheet is a file instead of a KLSampleSheet() - # type, then this is an Amplicon run. - assay_type = Pipeline.AMPLICON_ATYPE - else: - assay_type = self.sample_sheet.Header['Assay'] - if assay_type not in Pipeline.assay_types: - raise ValueError(f"'{assay_type} is not a valid Assay type") - # open the configuration profiles directory as specified by # profiles_path in the configuration.json file. parse each json into # a nested dictionary keyed by (instrument-type, assay-type) as @@ -434,13 +445,14 @@ def _configure_profile(self): i_type = profile['profile']['instrument_type'] a_type = profile['profile']['assay_type'] - if i_type == instr_type and a_type == assay_type: + if i_type == instr_type and a_type == self.assay_type: selected_profile = profile break if selected_profile is None: - raise ValueError(f"a matching profile ({instr_type}, {assay_type}" - ") was not found. Please notify an administrator") + raise ValueError(f"a matching profile ({instr_type}, " + f"{self.assay_type}) was not found. Please notify" + " an administrator") self.config_profile = selected_profile From eb3600113157a5ca1ead0a3bf4a35758a39644b2 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 24 Nov 2024 12:36:03 -0800 Subject: [PATCH 39/47] Remove lengthy comment --- sequence_processing_pipeline/templates/tellread.sbatch | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch index 66d9d9fd..f038a568 100644 --- a/sequence_processing_pipeline/templates/tellread.sbatch +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -20,11 +20,6 @@ module load {{modules_to_load}} -j ${SLURM_JOB_CPUS_PER_NODE} {{extra}} \ -l {{lane}} -# instead of testing for the presence of '{{output}}/Full', we will review -# the changed timestamps for all the files in '{{output}}/Full' and when -# we can demonstrate that they haven't changed in an arbitrary period of time -# we will consider the work completed. - # get the timestamp for the most recently changed file in directory '.' # hard-limit for wait time set to ~ 8 hours. 
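Editor's note on the hunk above: the comments left in tellread.sbatch describe a completion check based on file modification times rather than on the presence of the output directory — the script waits until nothing under the output tree has changed for a while, bounded by a hard cap of roughly eight hours. A minimal sketch of that idea follows; the variable names, the 30-minute quiet period, and the one-minute polling interval are illustrative assumptions, not the template's actual implementation.

```bash
# Sketch only: treat the run as complete once no file under '.' has changed
# for a quiet period, bounded by a ~8 hour hard limit. Names/values assumed.
hard_limit=$((8 * 60 * 60))   # ~8 hours, in seconds
quiet_period=1800             # 30 minutes with no changes => done
elapsed=0

while [[ ${elapsed} -lt ${hard_limit} ]]; do
    # epoch timestamp of the most recently modified file under '.'
    newest=$(find . -type f -printf '%T@\n' | sort -n | tail -1 | cut -d. -f1)
    newest=${newest:-$(date +%s)}   # if no files exist yet, keep waiting
    now=$(date +%s)
    if [[ $((now - newest)) -ge ${quiet_period} ]]; then
        echo "output quiescent for ${quiet_period}s; considering work complete"
        break
    fi
    sleep 60
    elapsed=$((elapsed + 60))
done
```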
From d69a0c3202deb7eab41b5f3e0df32f31df561522 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 24 Nov 2024 13:18:48 -0800 Subject: [PATCH 40/47] fix test --- .../tests/data/tellseq_output/tellread_test.sbatch | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch index fb099cf3..9dc3ccff 100644 --- a/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch +++ b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch @@ -20,11 +20,6 @@ $HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ -j ${SLURM_JOB_CPUS_PER_NODE} \ -l s_1 -# instead of testing for the presence of 'sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full', we will review -# the changed timestamps for all the files in 'sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full' and when -# we can demonstrate that they haven't changed in an arbitrary period of time -# we will consider the work completed. - # get the timestamp for the most recently changed file in directory '.' # hard-limit for wait time set to ~ 8 hours. From 1649d647e1fac67df3331cbe6e42f60a9e03120d Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Mon, 25 Nov 2024 14:52:16 -0800 Subject: [PATCH 41/47] Updates based on feedback --- README.md | 2 +- sequence_processing_pipeline/TellReadJob.py | 4 -- .../templates/cloudspades-isolate.sbatch | 54 ------------------- .../templates/cloudspades.sbatch | 54 ------------------- .../templates/telllink-isolate.sbatch | 45 ---------------- .../templates/telllink.sbatch | 47 ---------------- 6 files changed, 1 insertion(+), 205 deletions(-) delete mode 100644 sequence_processing_pipeline/templates/cloudspades-isolate.sbatch delete mode 100644 sequence_processing_pipeline/templates/cloudspades.sbatch delete mode 100644 sequence_processing_pipeline/templates/telllink-isolate.sbatch delete mode 100644 sequence_processing_pipeline/templates/telllink.sbatch diff --git a/README.md b/README.md index d9ef9b6c..594e11aa 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ git clone https://github.com/biocore/mg-scripts.git Create a Python3 Conda environment in which to run the notebook: ```bash -conda create --yes -n spp python=${{ matrix.python-version }} scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair +conda create --yes -n spp python='python=3.9' scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair ``` Activate the Conda environment: diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 5be1cbd0..75e3b958 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -148,10 +148,6 @@ def _generate_job_script(self): extra = "" - # if reference_base is added in the future and is defined, extra needs - # to be f"-f {reference_base}". 
- # extra = "-f ${REFBASE}" - with open(job_script_path, mode="w", encoding="utf-8") as f: f.write(template.render({ "job_name": "tellread", diff --git a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch b/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch deleted file mode 100644 index 96426613..00000000 --- a/sequence_processing_pipeline/templates/cloudspades-isolate.sbatch +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} -#SBATCH --time {{wall_time_limit}} -#SBATCH --mem {{mem_in_gb}}G -#SBATCH -N {{node_count}} -#SBATCH -c {{cores_per_task}} -#SBATCH -p {{queue_name}} - -#SBATCH --output {{output}}/logs/cloudspades-isolate_%x-%A_%a.out -#SBATCH --error {{output}}/logs/cloudspades-isolate_%x-%A_%a.err - -source activate qiime2-2023.5 - -set -x -set -e - -module load {{modules_to_load}} - -samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) - -# assumes 1-based array index, eg --array 1-N -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -cs={{output_path}}/cloudspades-isolate/${sample} - -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${cs} ]]; then - rm -fr ${cs} - fi -fi - -mkdir -p ${cs} -pushd {{cloudspades_path}}/assembler/bin - -./spades.py \ - -o ${cs} \ - --gemcode1-1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ - -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 -module unload gcc_9.3.0 -popd - -# TODO: Look for alternative method to load quast -#mamba activate quast - -#quast \ -# -o ${cs}/quast-scaffolds \ -# -t ${SLURM_JOB_CPUS_PER_NODE} \ -# ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 -# -# remove intermediates that currently dont have a downstream use -#if [[ -d ${cs}/K21 ]]; then -# rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -#fi diff --git a/sequence_processing_pipeline/templates/cloudspades.sbatch b/sequence_processing_pipeline/templates/cloudspades.sbatch deleted file mode 100644 index 7a658892..00000000 --- a/sequence_processing_pipeline/templates/cloudspades.sbatch +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} # cs-assemble -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 128G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 12 -#SBATCH -p {{queue_name}} # qiita - -#SBATCH --output {{output}}/logs/cloudspades_%x-%A_%a.out -#SBATCH --error {{output}}/logs/cloudspades_%x-%A_%a.err - -source activate qiime2-2023.5 - -set -x -set -e - -module load {{modules_to_load}} - -samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) - -# assumes 1-based array index, eg --array 1-N -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -cs={{output_path}}/cloudspades/${sample} - -if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${cs} ]]; then - rm -fr ${cs} - fi -fi - -mkdir -p ${cs} -pushd {{cloudspades_path}}/assembler/bin - -./spades.py \ - -o ${cs} \ - --gemcode1-1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ - --gemcode1-2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ - --meta \ - -t ${SLURM_JOB_CPUS_PER_NODE} > ${cs}/stdoutstderr.log 2>&1 -module unload gcc_9.3.0 -popd - -# TODO: Look for alternative method to load quast -#mamba activate quast -#quast \ -# -o ${cs}/quast-scaffolds \ -# -t ${SLURM_JOB_CPUS_PER_NODE} \ -# ${cs}/scaffolds.fasta > ${cs}/quast-stdoutstderr.log 2>&1 - -# remove intermediates that currently dont have a downstream use -#if [[ -d ${cs}/K21 ]]; then -# rm -fr ${cs}/K21 ${cs}/K33 ${cs}/K55 ${cs}/corrected ${cs}/tmp -#fi diff --git a/sequence_processing_pipeline/templates/telllink-isolate.sbatch b/sequence_processing_pipeline/templates/telllink-isolate.sbatch deleted file mode 100644 index eab0b380..00000000 --- a/sequence_processing_pipeline/templates/telllink-isolate.sbatch +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} # tellink-isolate -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 16 -#SBATCH --mem {{mem_in_gb}}G # 160G -#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -p {{queue_name}} # qiita - -#SBATCH --output {{output}}/logs/telllink-isolate_%x-%A_%a.out -#SBATCH --error {{output}}/logs/telllink-isolate_%x-%A_%a.err - -set -x -set -e - -module load {{modules_to_load}} - -samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -k=79 -lc=35 - -tl={{output_path}}/tell-link-isolate/${sample} -if [[ ! -z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${tl} ]]; then - rm -fr ${tl} - fi -fi - -mkdir -p ${tl} - -{{sing_path}} \ - -r1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ - -r2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ - -i1 {{output_path}}}/integrated/${sample}.I1.fastq.gz \ - -o ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc} \ - -k ${k} \ - -lc ${lc} \ - -p ${sample} \ - -j ${SLURM_CPUS_PER_TASK} - -# remove temporary data -if [[ -d ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping -fi diff --git a/sequence_processing_pipeline/templates/telllink.sbatch b/sequence_processing_pipeline/templates/telllink.sbatch deleted file mode 100644 index 16be25a4..00000000 --- a/sequence_processing_pipeline/templates/telllink.sbatch +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} # tellink -#SBATCH --mem {{mem_in_gb}}G # 160G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 16 -#SBATCH --time {{wall_time_limit}} # 96:00:00 -#SBATCH -p {{queue_name}} # qiita - -#SBATCH --output {{output}}/logs/telllink_%x-%A_%a.out -#SBATCH --error {{output}}/logs/telllink_%x-%A_%a.err - -set -x -set -e - -module load {{modules_to_load}} - -samples=($(cat {{output_path}}/sample_index_list_output.txt | cut -f 2)) -sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} - -# TODO: leave these hardcoded for now -k=79 -lc=35 - -tl={{output_path}}/tell-link/${sample} -if [[ ! 
-z ${FORCE} && ${FORCE} == "TRUE" ]]; then - if [[ -d ${tl} ]]; then - rm -fr ${tl} - fi -fi - -mkdir -p ${tl} - -{{sing_path}} \ - -r1 {{output_path}}/integrated/${sample}.R1.fastq.gz \ - -r2 {{output_path}}/integrated/${sample}.R2.fastq.gz \ - -i1 {{output_path}}/integrated/${sample}.I1.fastq.gz \ - -d metagenomics \ - -o ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc} \ - -k ${k} \ - -lc ${lc} \ - -p ${sample} \ - -j ${SLURM_CPUS_PER_TASK} - -# remove temporary data -if [[ -d ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping ]]; then - rm -fr ${tl}/{{LABELTAG}}-link_${sample}_global_${k}_local_${lc}/${sample}/__skipping -fi From 65690393e674ba4e94bf82ff1e703a15beb7a166 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Mon, 25 Nov 2024 15:05:36 -0800 Subject: [PATCH 42/47] Update based on feedback --- .../TRNormCountsJob.py | 139 ------------------ 1 file changed, 139 deletions(-) delete mode 100644 sequence_processing_pipeline/TRNormCountsJob.py diff --git a/sequence_processing_pipeline/TRNormCountsJob.py b/sequence_processing_pipeline/TRNormCountsJob.py deleted file mode 100644 index 6887994a..00000000 --- a/sequence_processing_pipeline/TRNormCountsJob.py +++ /dev/null @@ -1,139 +0,0 @@ -from os.path import join -from .Job import Job, KISSLoader -from .PipelineError import JobFailedError -import logging -from jinja2 import Environment -from .Pipeline import Pipeline -from .PipelineError import PipelineError -from metapool import load_sample_sheet - - -logging.basicConfig(level=logging.DEBUG) - - -class TRNormCountsJob(Job): - def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, - node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, cores_per_task=4): - """ - ConvertJob provides a convenient way to run bcl-convert or bcl2fastq - on a directory BCL files to generate Fastq files. - :param run_dir: The 'run' directory that contains BCL files. - :param output_path: Path where all pipeline-generated files live. - :param sample_sheet_path: The path to a sample-sheet. - :param queue_name: The name of the Torque queue to use for processing. - :param node_count: The number of nodes to request. - :param wall_time_limit: A hard time limit (in min) to bound processing. - :param jmem: String representing total memory limit for entire job. - :param modules_to_load: A list of Linux module names to load - :param qiita_job_id: identify Torque jobs using qiita_job_id - :param max_array_length: None - :param indicies_script_path: None - :param label: None - :param reference_base: None - :param reference_map: None - :param cores_per_task: (Optional) # of CPU cores per node to request. - """ - super().__init__(run_dir, - output_path, - 'TRIntegrateJob', - [], - max_array_length, - modules_to_load=modules_to_load) - - self.sample_sheet_path = sample_sheet_path - self._file_check(self.sample_sheet_path) - metadata = self._process_sample_sheet() - self.sample_ids = metadata['sample_ids'] - self.queue_name = queue_name - self.node_count = node_count - self.wall_time_limit = wall_time_limit - self.cores_per_task = cores_per_task - self.indicies_script_path = indicies_script_path - - self.reference_base = reference_base - self.reference_map = reference_map - - # raise an Error if jmem is not a valid floating point value. 
- self.jmem = str(int(jmem)) - self.qiita_job_id = qiita_job_id - self.sample_count = len(self.sample_ids) - self.jinja_env = Environment(loader=KISSLoader('templates')) - self.label = label - - self.job_name = (f"norm_counts_{self.qiita_job_id}") - - def run(self, callback=None): - job_script_path = self._generate_job_script() - params = ['--parsable', - f'-J {self.job_name}', - f'--array 1-{self.sample_count}'] - try: - self.job_info = self.submit_job(job_script_path, - job_parameters=' '.join(params), - exec_from=None, - callback=callback) - - logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') - except JobFailedError as e: - # When a job has failed, parse the logs generated by this specific - # job to return a more descriptive message to the user. - info = self.parse_logs() - # prepend just the message component of the Error. - info.insert(0, str(e)) - raise JobFailedError('\n'.join(info)) - - logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') - - def _process_sample_sheet(self): - sheet = load_sample_sheet(self.sample_sheet_path) - - if not sheet.validate_and_scrub_sample_sheet(): - s = "Sample sheet %s is not valid." % self.sample_sheet_path - raise PipelineError(s) - - header = sheet.Header - chemistry = header['chemistry'] - - if header['Assay'] not in Pipeline.assay_types: - s = "Assay value '%s' is not recognized." % header['Assay'] - raise PipelineError(s) - - sample_ids = [] - for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) - - bioinformatics = sheet.Bioinformatics - - # reorganize the data into a list of dictionaries, one for each row. - # the ordering of the rows will be preserved in the order of the list. - lst = bioinformatics.to_dict('records') - - # human-filtering jobs are scoped by project. Each job requires - # particular knowledge of the project. 
- return {'chemistry': chemistry, - 'projects': lst, - 'sample_ids': sample_ids} - - def _generate_job_script(self): - job_script_path = join(self.output_path, "compute_sequence_counts_for" - "_normalization.sbatch") - template = self.jinja_env.get_template("compute_sequence_counts_for_" - "normalization2.sbatch") - - with open(job_script_path, mode="w", encoding="utf-8") as f: - f.write(template.render({ - "#job_name": "integrate", - "#wall_time_limit": self.wall_time_limit, - "#mem_in_gb": self.jmem, - "#node_count": self.node_count, - "#cores_per_task": self.cores_per_task, - "#queue_name": self.queue_name, - "#output_path": self.output_path, - "read_counts_path": "TODO", - "sample_sheet": "TODO", - "tellread_output": "TODO" - })) - - return job_script_path From 81922b56c42a446323efb80587d2af4ee800ae76 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Wed, 27 Nov 2024 19:23:39 -0800 Subject: [PATCH 43/47] Added renamed file --- sequence_processing_pipeline/NormCountsJob.py | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 sequence_processing_pipeline/NormCountsJob.py diff --git a/sequence_processing_pipeline/NormCountsJob.py b/sequence_processing_pipeline/NormCountsJob.py new file mode 100644 index 00000000..1909420d --- /dev/null +++ b/sequence_processing_pipeline/NormCountsJob.py @@ -0,0 +1,139 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class NormCountsJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, indicies_script_path, label, + reference_base, reference_map, cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: None + :param indicies_script_path: None + :param label: None + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.indicies_script_path = indicies_script_path + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. 
+ self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.label = label + + self.job_name = (f"norm_counts_{self.qiita_job_id}") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. + return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, "compute_sequence_counts_for" + "_normalization.sbatch") + template = self.jinja_env.get_template("compute_sequence_counts_for_" + "normalization2.sbatch") + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "#job_name": "integrate", + "#wall_time_limit": self.wall_time_limit, + "#mem_in_gb": self.jmem, + "#node_count": self.node_count, + "#cores_per_task": self.cores_per_task, + "#queue_name": self.queue_name, + "#output_path": self.output_path, + "read_counts_path": "TODO", + "sample_sheet": "TODO", + "tellread_output": "TODO" + })) + + return job_script_path From 01d77d6b66aac3ad2aa08d4228d895c8255b059b Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 1 Dec 2024 16:47:24 -0800 Subject: [PATCH 44/47] Refactored sequence counting job Request from Antonio to make TRNormCountsJob more generalized for current and upcoming work. TRNormCountsJob replaced w/SeqCountsJob: * takes a list of paths to fastq and/or fastq.gz files. * runs seqtk to count sequences and bases in parallel. * aggregator code produces a json file of counts from log output. 
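Editor's note on the commit below: each SLURM array task echoes the path of the fastq/fastq.gz file it was assigned and then runs `seqtk size` on it, so every .out log contains two lines — the file path, followed by a tab-separated sequence count and base count. The aggregator walks those logs and rolls them up into a single JSON file of per-file counts; it can also be run standalone via the `__main__` block of the new aggregate_counts.py. The directory and output paths in the sketch below are placeholders.

```bash
# One array task's work: the path is echoed by seq_counts.sbatch, then
# seqtk size prints "<num_sequences>\t<num_bases>" for that file.
seqtk size TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq
# 64464162        8345327641

# Roll all per-task .out logs up into one JSON of per-file counts
# (paths here are placeholders):
python sequence_processing_pipeline/aggregate_counts.py \
    /path/to/SeqCountsJob/logs \
    /path/to/SeqCountsJob/aggregate_counts.json
```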
--- sequence_processing_pipeline/NormCountsJob.py | 139 ------------------ sequence_processing_pipeline/SeqCountsJob.py | 138 +++++++++++++++++ .../aggregate_counts.py | 40 +++++ ...e_sequence_counts_for_normalization.sbatch | 26 ---- .../templates/seq_counts.sbatch | 25 ++++ .../tests/data/aggregate_counts_results.json | 36 +++++ .../tests/data/files_to_count.txt | 8 + .../tests/data/seq_counts.sbatch | 25 ++++ .../seq_counts_logs/seq_count_2679966_1.err | 3 + .../seq_counts_logs/seq_count_2679966_1.out | 2 + .../seq_counts_logs/seq_count_2679966_2.err | 3 + .../seq_counts_logs/seq_count_2679966_2.out | 2 + .../seq_counts_logs/seq_count_2679966_3.err | 3 + .../seq_counts_logs/seq_count_2679966_3.out | 2 + .../seq_counts_logs/seq_count_2679966_4.err | 3 + .../seq_counts_logs/seq_count_2679966_4.out | 2 + .../seq_counts_logs/seq_count_2679966_5.err | 3 + .../seq_counts_logs/seq_count_2679966_5.out | 2 + .../seq_counts_logs/seq_count_2679966_6.err | 3 + .../seq_counts_logs/seq_count_2679966_6.out | 2 + .../seq_counts_logs/seq_count_2679966_7.err | 3 + .../seq_counts_logs/seq_count_2679966_7.out | 2 + .../seq_counts_logs/seq_count_2679966_8.err | 3 + .../seq_counts_logs/seq_count_2679966_8.out | 2 + .../tests/test_SeqCountsJob.py | 72 +++++++++ 25 files changed, 384 insertions(+), 165 deletions(-) delete mode 100644 sequence_processing_pipeline/NormCountsJob.py create mode 100644 sequence_processing_pipeline/SeqCountsJob.py create mode 100644 sequence_processing_pipeline/aggregate_counts.py delete mode 100644 sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch create mode 100644 sequence_processing_pipeline/templates/seq_counts.sbatch create mode 100644 sequence_processing_pipeline/tests/data/aggregate_counts_results.json create mode 100644 sequence_processing_pipeline/tests/data/files_to_count.txt create mode 100644 sequence_processing_pipeline/tests/data/seq_counts.sbatch create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err create mode 100644 sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out create mode 100644 
sequence_processing_pipeline/tests/test_SeqCountsJob.py diff --git a/sequence_processing_pipeline/NormCountsJob.py b/sequence_processing_pipeline/NormCountsJob.py deleted file mode 100644 index 1909420d..00000000 --- a/sequence_processing_pipeline/NormCountsJob.py +++ /dev/null @@ -1,139 +0,0 @@ -from os.path import join -from .Job import Job, KISSLoader -from .PipelineError import JobFailedError -import logging -from jinja2 import Environment -from .Pipeline import Pipeline -from .PipelineError import PipelineError -from metapool import load_sample_sheet - - -logging.basicConfig(level=logging.DEBUG) - - -class NormCountsJob(Job): - def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, - node_count, wall_time_limit, jmem, modules_to_load, - qiita_job_id, max_array_length, indicies_script_path, label, - reference_base, reference_map, cores_per_task=4): - """ - ConvertJob provides a convenient way to run bcl-convert or bcl2fastq - on a directory BCL files to generate Fastq files. - :param run_dir: The 'run' directory that contains BCL files. - :param output_path: Path where all pipeline-generated files live. - :param sample_sheet_path: The path to a sample-sheet. - :param queue_name: The name of the Torque queue to use for processing. - :param node_count: The number of nodes to request. - :param wall_time_limit: A hard time limit (in min) to bound processing. - :param jmem: String representing total memory limit for entire job. - :param modules_to_load: A list of Linux module names to load - :param qiita_job_id: identify Torque jobs using qiita_job_id - :param max_array_length: None - :param indicies_script_path: None - :param label: None - :param reference_base: None - :param reference_map: None - :param cores_per_task: (Optional) # of CPU cores per node to request. - """ - super().__init__(run_dir, - output_path, - 'TRIntegrateJob', - [], - max_array_length, - modules_to_load=modules_to_load) - - self.sample_sheet_path = sample_sheet_path - self._file_check(self.sample_sheet_path) - metadata = self._process_sample_sheet() - self.sample_ids = metadata['sample_ids'] - self.queue_name = queue_name - self.node_count = node_count - self.wall_time_limit = wall_time_limit - self.cores_per_task = cores_per_task - self.indicies_script_path = indicies_script_path - - self.reference_base = reference_base - self.reference_map = reference_map - - # raise an Error if jmem is not a valid floating point value. - self.jmem = str(int(jmem)) - self.qiita_job_id = qiita_job_id - self.sample_count = len(self.sample_ids) - self.jinja_env = Environment(loader=KISSLoader('templates')) - self.label = label - - self.job_name = (f"norm_counts_{self.qiita_job_id}") - - def run(self, callback=None): - job_script_path = self._generate_job_script() - params = ['--parsable', - f'-J {self.job_name}', - f'--array 1-{self.sample_count}'] - try: - self.job_info = self.submit_job(job_script_path, - job_parameters=' '.join(params), - exec_from=None, - callback=callback) - - logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') - except JobFailedError as e: - # When a job has failed, parse the logs generated by this specific - # job to return a more descriptive message to the user. - info = self.parse_logs() - # prepend just the message component of the Error. 
- info.insert(0, str(e)) - raise JobFailedError('\n'.join(info)) - - logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') - - def _process_sample_sheet(self): - sheet = load_sample_sheet(self.sample_sheet_path) - - if not sheet.validate_and_scrub_sample_sheet(): - s = "Sample sheet %s is not valid." % self.sample_sheet_path - raise PipelineError(s) - - header = sheet.Header - chemistry = header['chemistry'] - - if header['Assay'] not in Pipeline.assay_types: - s = "Assay value '%s' is not recognized." % header['Assay'] - raise PipelineError(s) - - sample_ids = [] - for sample in sheet.samples: - sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) - - bioinformatics = sheet.Bioinformatics - - # reorganize the data into a list of dictionaries, one for each row. - # the ordering of the rows will be preserved in the order of the list. - lst = bioinformatics.to_dict('records') - - # human-filtering jobs are scoped by project. Each job requires - # particular knowledge of the project. - return {'chemistry': chemistry, - 'projects': lst, - 'sample_ids': sample_ids} - - def _generate_job_script(self): - job_script_path = join(self.output_path, "compute_sequence_counts_for" - "_normalization.sbatch") - template = self.jinja_env.get_template("compute_sequence_counts_for_" - "normalization2.sbatch") - - with open(job_script_path, mode="w", encoding="utf-8") as f: - f.write(template.render({ - "#job_name": "integrate", - "#wall_time_limit": self.wall_time_limit, - "#mem_in_gb": self.jmem, - "#node_count": self.node_count, - "#cores_per_task": self.cores_per_task, - "#queue_name": self.queue_name, - "#output_path": self.output_path, - "read_counts_path": "TODO", - "sample_sheet": "TODO", - "tellread_output": "TODO" - })) - - return job_script_path diff --git a/sequence_processing_pipeline/SeqCountsJob.py b/sequence_processing_pipeline/SeqCountsJob.py new file mode 100644 index 00000000..51f8e276 --- /dev/null +++ b/sequence_processing_pipeline/SeqCountsJob.py @@ -0,0 +1,138 @@ +from os.path import join, split +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from os import walk +from json import dumps + + +logging.basicConfig(level=logging.DEBUG) + + +class SeqCountsJob(Job): + def __init__(self, run_dir, output_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, files_to_count_path, + cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: A hard-limit for array-sizes + :param files_to_count_path: A path to a list of file-paths to count. + :param cores_per_task: (Optional) # of CPU cores per node to request. 
+ """ + super().__init__(run_dir, + output_path, + 'SeqCountsJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.jinja_env = Environment(loader=KISSLoader('templates')) + + self.job_name = (f"seq_counts_{self.qiita_job_id}") + self.files_to_count_path = files_to_count_path + + with open(self.files_to_count_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.file_count = len(lines) + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'SeqCountsJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + self._aggregate_counts() + + logging.debug(f'SeqCountJob {self.job_info["job_id"]} completed') + + def _generate_job_script(self): + job_script_path = join(self.output_path, "seq_counts.sbatch") + template = self.jinja_env.get_template("seq_counts.sbatch") + + # got to make files_to_count.txt and put it in the output directory + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "seq_counts", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "file_count": self.file_count, + "output_path": self.output_path + })) + + return job_script_path + + def parse_logs(self): + # TODO + pass + + def _aggregate_counts(self): + def extract_metadata(fp): + with open(fp, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % fp) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + results = {} + + for root, dirs, files in walk(self.log_path): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + results_path = join(self.output_path, 'aggregate_counts.json') + + with open(results_path, 'w') as f: + print(dumps(results, indent=2), file=f) + + return results_path diff --git a/sequence_processing_pipeline/aggregate_counts.py b/sequence_processing_pipeline/aggregate_counts.py new file mode 100644 index 00000000..ace90212 --- /dev/null +++ b/sequence_processing_pipeline/aggregate_counts.py @@ -0,0 +1,40 @@ +from os import walk +from sys import argv +from os.path import join, split +from json import dumps + + +def extract_metadata(log_output_file_path): + with open(log_output_file_path, 'r') as f: + lines = f.readlines() + lines = 
[x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % log_output_file_path) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + +def aggregate_counts(fp): + results = {} + + for root, dirs, files in walk(fp): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + return results + + +if __name__ == '__main__': + results = aggregate_counts(argv[1]) + with open(argv[2], 'w') as f: + print(dumps(results, indent=2), file=f) diff --git a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch b/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch deleted file mode 100644 index 9414fd4c..00000000 --- a/sequence_processing_pipeline/templates/compute_sequence_counts_for_normalization.sbatch +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -l -#SBATCH -J {{job_name}} # norm -#SBATCH --time {{wall_time_limit}} # 24:00:00 -#SBATCH --mem {{mem_in_gb}}G # 8G -#SBATCH -N {{node_count}} # 1 -#SBATCH -c {{cores_per_task}} # 1 -#SBATCH -p {{queue_name}} # qiita - -#SBATCH --output {{output}}/logs/compute_sequence_counts_%x-%A_%a.out -#SBATCH --error {{output}}/logs/compute_sequence_counts_%x-%A_%a.err - -# NB: output appears normal w/out. -# source activate qiime2-2023.5 - -set -x -set -e -set -o pipefail - -echo $TMPDIR - -mkdir -p {{output_path}} -wc -l {{tellread_output}}/Full/*_I1_C5[0-9][0-9].fastq.gz.corrected.err_barcode_removed.fastq > {{output_path}}/record_counts.txt -python {{plot_counts_path}} {{output_path}}/record_counts.txt {{sample_sheet}} {{output_path}} - -conda activate qp-knight-lab-processing-2022.03 -python {{create_picklist_path}} {{read_counts_path}} diff --git a/sequence_processing_pipeline/templates/seq_counts.sbatch b/sequence_processing_pipeline/templates/seq_counts.sbatch new file mode 100644 index 00000000..f44bd5b9 --- /dev/null +++ b/sequence_processing_pipeline/templates/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{file_count}} + +#SBATCH --output {{output_path}}/logs/%x_%A_%a.out +#SBATCH --error {{output_path}}/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p {{output_path}}/logs + +files=($(cat {{output_path}}/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/tests/data/aggregate_counts_results.json b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json new file mode 100644 index 00000000..1cae0f05 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json @@ -0,0 +1,36 @@ +{ + "REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full": { + "TellReadJob_I1_C520.fastq.gz.erroneous.fastq": { + "seq_counts": 2139633, + "base_pairs": 38513394 + }, + "TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8345327641 + }, + 
"TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9293296513 + }, + "TellReadJob_I1_C519.fastq.gz.erroneous.fastq": { + "seq_counts": 1932116, + "base_pairs": 34778088 + }, + "TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 1160354916 + }, + "TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8370238082 + }, + "TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9317943166 + }, + "TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 1267182504 + } + } +} diff --git a/sequence_processing_pipeline/tests/data/files_to_count.txt b/sequence_processing_pipeline/tests/data/files_to_count.txt new file mode 100644 index 00000000..8d7ce4b1 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/files_to_count.txt @@ -0,0 +1,8 @@ +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts.sbatch b/sequence_processing_pipeline/tests/data/seq_counts.sbatch new file mode 100644 index 00000000..cc73187c --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J seq_counts +#SBATCH --time 1440 +#SBATCH --mem 8G +#SBATCH -N 1 +#SBATCH -c 1 +#SBATCH -p qiita +#SBATCH --array=1-8 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs + +files=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ 
b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out new file mode 100644 index 00000000..50a46674 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8345327641 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out new file mode 100644 index 00000000..87ad9f55 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9293296513 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out new file mode 100644 index 00000000..a22d9f8d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +1932116 34778088 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out new file mode 100644 index 00000000..0b35614a --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9317943166 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out new file mode 100644 index 00000000..887522ae --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 1267182504 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out new file mode 100644 index 00000000..a4fbd555 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8370238082 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out new file mode 100644 index 00000000..6c6a9c06 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 1160354916 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out new file mode 100644 index 00000000..9be52329 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +2139633 38513394 diff --git a/sequence_processing_pipeline/tests/test_SeqCountsJob.py b/sequence_processing_pipeline/tests/test_SeqCountsJob.py new file mode 100644 index 00000000..d0fee2cc --- /dev/null +++ b/sequence_processing_pipeline/tests/test_SeqCountsJob.py @@ -0,0 +1,72 @@ +from os.path import join +from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob +from functools import partial +import unittest + + +class TestSeqCountsJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. 
+ self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.files_to_count_path = self.path("data", "files_to_count.txt") + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "1440" + self.jmem = "8" + self.modules_to_load = [] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.cores_per_task = "1" + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + self.max_array_length = 100 + self.exp_sbatch_output = self.path("data", "seq_counts.sbatch") + self.exp_results = self.path("data", + "aggregate_counts_results.json") + + def test_creation(self): + def compare_files(obs, exp): + with open(obs, 'r') as f: + obs_lines = f.readlines() + obs_lines = [x.strip() for x in obs_lines] + obs_lines = [x for x in obs_lines if x != ''] + + with open(exp, 'r') as f: + exp_lines = f.readlines() + exp_lines = [x.strip() for x in exp_lines] + exp_lines = [x for x in exp_lines if x != ''] + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + # test basic good-path + job = SeqCountsJob(self.run_dir, self.output_path, self.queue_name, + self.node_count, self.wall_time_limit, self.jmem, + self.modules_to_load, self.qiita_job_id, + self.max_array_length, self.files_to_count_path, + self.cores_per_task) + + obs = job._generate_job_script() + + compare_files(obs, self.exp_sbatch_output) + + # hack log path so that it points to test data directory rather than + # the output directory for a run we didn't run(). + job.log_path = self.path("data", "seq_counts_logs") + + obs = job._aggregate_counts() + + compare_files(obs, self.exp_results) + + +if __name__ == '__main__': + unittest.main() From b718e8b790005801689068fccd4395b20539aaa6 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 1 Dec 2024 18:10:37 -0800 Subject: [PATCH 45/47] Update test based on randomness in output generation --- sequence_processing_pipeline/tests/test_SeqCountsJob.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sequence_processing_pipeline/tests/test_SeqCountsJob.py b/sequence_processing_pipeline/tests/test_SeqCountsJob.py index d0fee2cc..d641c3b2 100644 --- a/sequence_processing_pipeline/tests/test_SeqCountsJob.py +++ b/sequence_processing_pipeline/tests/test_SeqCountsJob.py @@ -2,6 +2,7 @@ from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob from functools import partial import unittest +from json import load as json_load class TestSeqCountsJob(unittest.TestCase): @@ -63,9 +64,10 @@ def compare_files(obs, exp): # the output directory for a run we didn't run(). 
job.log_path = self.path("data", "seq_counts_logs") - obs = job._aggregate_counts() + obs = json_load(open(job._aggregate_counts(), 'r')) + exp = json_load(open(self.exp_results, 'r')) - compare_files(obs, self.exp_results) + self.assertDictEqual(obs, exp) if __name__ == '__main__': From a0ffb81090ecf837e2fde71be44df864e15b2b54 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 1 Dec 2024 19:58:50 -0800 Subject: [PATCH 46/47] Updates based on feedback --- sequence_processing_pipeline/Commands.py | 9 +++++++-- sequence_processing_pipeline/tests/test_commands.py | 8 ++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index 130ac28d..ae971fc9 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -23,7 +23,12 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, # add one more level to account for project_names nested under ConvertJob # dir. # this will ignore the _I1_ reads that appear in the integrated result. - fastq_paths = glob.glob(data_location_path + '/*/*_R?_001.fastq.gz') + fastq_paths = glob.glob(data_location_path + '*/*/*.fastq.gz') + + # case-specific filter for TellSeq output directories that also contain + # _I1_ files. Ensure paths are still sorted afterwards. + fastq_paths = [x for x in fastq_paths if '_I1_001.fastq.gz' not in x] + fastq_paths = sorted(fastq_paths) # convert from GB and halve as we sum R1 max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2) @@ -87,7 +92,7 @@ def demux(id_map, fp, out_d, task, maxtask): """Split infile data based in provided map""" delimiter = '::MUX::' mode = 'wt' - ext = '_001.fastq.gz' + ext = '.fastq.gz' sep = '/' rec = '@' diff --git a/sequence_processing_pipeline/tests/test_commands.py b/sequence_processing_pipeline/tests/test_commands.py index 4e0d0491..ac8a4bd9 100644 --- a/sequence_processing_pipeline/tests/test_commands.py +++ b/sequence_processing_pipeline/tests/test_commands.py @@ -70,9 +70,9 @@ def test_demux(self): demux(id_map, infile, tmp, task, maxtask) - obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1_001.fastq.gz'), + obs_r1 = gzip.open(join(tmp, 'Project_12345', 'b_R1.fastq.gz'), 'rt').read() - obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2_001.fastq.gz'), + obs_r2 = gzip.open(join(tmp, 'Project_12345', 'b_R2.fastq.gz'), 'rt').read() exp = '\n'.join(exp_data_r1) + '\n' self.assertEqual(obs_r1, exp) @@ -80,8 +80,8 @@ def test_demux(self): exp = '\n'.join(exp_data_r2) + '\n' self.assertEqual(obs_r2, exp) - self.assertFalse(os.path.exists(join(tmp, 'a_R1_001.fastq.gz'))) - self.assertFalse(os.path.exists(join(tmp, 'a_R2_001.fastq.gz'))) + self.assertFalse(os.path.exists(join(tmp, 'a_R1.fastq.gz'))) + self.assertFalse(os.path.exists(join(tmp, 'a_R2.fastq.gz'))) if __name__ == '__main__': From 0b7ce90342435a14aa76cf42a29c2807b9029af1 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 1 Dec 2024 21:11:06 -0800 Subject: [PATCH 47/47] Common parse_log() method made default --- sequence_processing_pipeline/ConvertJob.py | 1 + sequence_processing_pipeline/FastQCJob.py | 17 ----------------- sequence_processing_pipeline/Job.py | 13 ++++++++++++- sequence_processing_pipeline/NuQCJob.py | 13 ------------- sequence_processing_pipeline/SeqCountsJob.py | 13 +++++++++++-- sequence_processing_pipeline/TRIntegrateJob.py | 14 -------------- sequence_processing_pipeline/TellReadJob.py | 14 -------------- 7 files changed, 24 insertions(+), 61 
deletions(-) diff --git a/sequence_processing_pipeline/ConvertJob.py b/sequence_processing_pipeline/ConvertJob.py index 17b8c3b3..dc9b36aa 100644 --- a/sequence_processing_pipeline/ConvertJob.py +++ b/sequence_processing_pipeline/ConvertJob.py @@ -175,6 +175,7 @@ def run(self, callback=None): logging.info(f'Successful job: {job_info}') def parse_logs(self): + # overrides Job.parse_logs() w/tailored parse for specific logs. log_path = join(self.output_path, 'Logs') errors = join(log_path, 'Errors.log') diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 5e0bf4fc..8db0440b 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -6,7 +6,6 @@ from functools import partial from json import dumps import logging -import glob class FastQCJob(Job): @@ -305,19 +304,3 @@ def _generate_job_script(self): with open(sh_details_fp, 'w') as f: f.write('\n'.join(self.commands)) - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - files = sorted(glob.glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - # note 'error' is not same - # requirement as found in QCJob. - # ('error:'). This is a very - # generalized filter. - if 'error' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index 55f287db..4121bd7f 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -13,6 +13,7 @@ from inspect import stack import re from collections import Counter +from glob import glob # taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader @@ -126,7 +127,17 @@ def run(self): raise PipelineError("Base class run() method not implemented.") def parse_logs(self): - raise PipelineError("Base class parse_logs() method not implemented.") + # by default, look for anything to parse in the logs directory. 
+ log_path = join(self.output_path, 'logs') + files = sorted(glob(join(log_path, '*'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if 'error:' in line.lower()] + + return [msg.strip() for msg in msgs] def _which(self, file_path, modules_to_load=None): """ diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index 83bdf551..0e05b41d 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -499,16 +499,3 @@ def _generate_job_script(self, max_bucket_size): pmls_path=self.pmls_path)) return job_script_path - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - # sorted lists give predictable results - files = sorted(glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - if 'error:' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/SeqCountsJob.py b/sequence_processing_pipeline/SeqCountsJob.py index 51f8e276..f080bd00 100644 --- a/sequence_processing_pipeline/SeqCountsJob.py +++ b/sequence_processing_pipeline/SeqCountsJob.py @@ -5,6 +5,7 @@ from jinja2 import Environment from os import walk from json import dumps +from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -101,8 +102,16 @@ def _generate_job_script(self): return job_script_path def parse_logs(self): - # TODO - pass + # overrides Job.parse_logs() w/tailored parse for specific logs. + files = sorted(glob(join(self.log_path, '*.err'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if line.startswith("[E::stk_size]")] + + return [msg.strip() for msg in msgs] def _aggregate_counts(self): def extract_metadata(fp): diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py index 6994f2ad..7b8740b4 100644 --- a/sequence_processing_pipeline/TRIntegrateJob.py +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -8,7 +8,6 @@ from metapool import load_sample_sheet from os import makedirs from shutil import copyfile -from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -162,16 +161,3 @@ def _generate_job_script(self): "output_dir": self.output_path})) return job_script_path - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - # sorted lists give predictable results - files = sorted(glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - if 'error:' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py index 75e3b958..3d68d4c8 100644 --- a/sequence_processing_pipeline/TellReadJob.py +++ b/sequence_processing_pipeline/TellReadJob.py @@ -6,7 +6,6 @@ from .Pipeline import Pipeline from .PipelineError import PipelineError from metapool import load_sample_sheet -from glob import glob logging.basicConfig(level=logging.DEBUG) @@ -173,16 +172,3 @@ def _generate_job_script(self): })) return job_script_path - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - # sorted lists give predictable results - files = sorted(glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() 
- if 'error:' in line.lower()] - - return [msg.strip() for msg in msgs]
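
Editor's note on the final patch above: PATCH 47/47 moves log parsing into a shared `Job.parse_logs()` default (scan everything under `logs/` for lines containing `error:`), removes the near-identical overrides from FastQCJob, NuQCJob, TRIntegrateJob, and TellReadJob, and keeps tailored overrides only where the logs need a different filter (ConvertJob's `Logs/Errors.log`, SeqCountsJob's `[E::stk_size]` lines from seqtk). The following is a minimal, self-contained sketch of that inherit-or-override pattern using toy class names (`BaseJob`, `SeqCountsLikeJob` are illustrative stand-ins, not the package's actual classes or constructor signatures):

```python
from glob import glob
from os.path import join


class BaseJob:
    """Toy stand-in for Job: provides the shared default parse_logs()."""

    def __init__(self, output_path):
        self.output_path = output_path

    def parse_logs(self):
        # default behavior: scan every file under <output_path>/logs and
        # collect any line containing 'error:' (case-insensitive).
        # sorted() keeps results predictable across runs.
        files = sorted(glob(join(self.output_path, 'logs', '*')))
        msgs = []

        for some_file in files:
            with open(some_file, 'r') as f:
                msgs += [line for line in f.readlines()
                         if 'error:' in line.lower()]

        return [msg.strip() for msg in msgs]


class SeqCountsLikeJob(BaseJob):
    """Toy subclass: overrides the default with a seqtk-specific filter."""

    def __init__(self, output_path, log_path):
        super().__init__(output_path)
        self.log_path = log_path

    def parse_logs(self):
        # only .err files matter here, and only seqtk's size-error lines.
        files = sorted(glob(join(self.log_path, '*.err')))
        msgs = []

        for some_file in files:
            with open(some_file, 'r') as f:
                msgs += [line for line in f.readlines()
                         if line.startswith("[E::stk_size]")]

        return [msg.strip() for msg in msgs]
```

Under this arrangement a job class that previously carried a copy of the generic loop now simply inherits `parse_logs()` from the base class, and only classes whose logs have a distinct error signature carry an override; that is what allows the patch to delete 61 lines while adding 24.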