diff --git a/README.md b/README.md index c51fcd0c..594e11aa 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ git clone https://github.com/biocore/mg-scripts.git Create a Python3 Conda environment in which to run the notebook: ```bash -conda create -n sp_pipeline 'python==3.9' numpy pandas click scipy matplotlib fastq-pair +conda create --yes -n spp python='python=3.9' scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair ``` Activate the Conda environment: @@ -62,3 +62,9 @@ Please note that the setting 'minimap2_databases' is expected to be a list of pa For NuQCJob, minimap2_databases is expected to be the path to a directory containing two subdirectories: 'metagenomic' and 'metatranscriptomic'. Each directory should contain or symlink to the appropriate .mmi files needed for that Assay type. + +Additional TellSeq-related notes: +'spades-cloudspades-0.1', 'tellread-release-novaseqX' or similar directories must be placed in a location available to SPP. +Their paths should be made known to SPP in the configuration files. (See examples for details). +Additional scripts found in sequence_processing_pipeline/contrib were contributed by Daniel and Omar and can be similarly located and configured. + diff --git a/sequence_processing_pipeline/Commands.py b/sequence_processing_pipeline/Commands.py index b2cd5e41..ae971fc9 100644 --- a/sequence_processing_pipeline/Commands.py +++ b/sequence_processing_pipeline/Commands.py @@ -22,8 +22,14 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb, # is now the following: # add one more level to account for project_names nested under ConvertJob # dir. + # this will ignore the _I1_ reads that appear in the integrated result. fastq_paths = glob.glob(data_location_path + '*/*/*.fastq.gz') + # case-specific filter for TellSeq output directories that also contain + # _I1_ files. Ensure paths are still sorted afterwards. + fastq_paths = [x for x in fastq_paths if '_I1_001.fastq.gz' not in x] + fastq_paths = sorted(fastq_paths) + # convert from GB and halve as we sum R1 max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2) diff --git a/sequence_processing_pipeline/ConvertJob.py b/sequence_processing_pipeline/ConvertJob.py index 122a4987..dc9b36aa 100644 --- a/sequence_processing_pipeline/ConvertJob.py +++ b/sequence_processing_pipeline/ConvertJob.py @@ -156,7 +156,13 @@ def run(self, callback=None): exec_from=self.log_path, callback=callback) - self.copy_controls_between_projects() + # ConvertJob() is used to process Amplicon as well as Meta*Omic + # runs. Amplicon runs use a dummy sample-sheet generated by + # Pipeline(). For these types of sheets we can't copy controls + # between projects because demuxing is not performed here. + _, sheet_name = split(self.sample_sheet_path) + if sheet_name != 'dummy_sample_sheet.csv': + self.copy_controls_between_projects() except JobFailedError as e: # When a job has failed, parse the logs generated by this specific @@ -169,6 +175,7 @@ def run(self, callback=None): logging.info(f'Successful job: {job_info}') def parse_logs(self): + # overrides Job.parse_logs() w/tailored parse for specific logs. 
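# ConvertJob keeps its logs under a capitalized 'Logs' directory with a
# dedicated 'Errors.log', rather than the lowercase 'logs/*' files scanned
# by the default Job.parse_logs() further down.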
log_path = join(self.output_path, 'Logs') errors = join(log_path, 'Errors.log') diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 5e0bf4fc..8db0440b 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -6,7 +6,6 @@ from functools import partial from json import dumps import logging -import glob class FastQCJob(Job): @@ -305,19 +304,3 @@ def _generate_job_script(self): with open(sh_details_fp, 'w') as f: f.write('\n'.join(self.commands)) - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - files = sorted(glob.glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - # note 'error' is not same - # requirement as found in QCJob. - # ('error:'). This is a very - # generalized filter. - if 'error' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/Job.py b/sequence_processing_pipeline/Job.py index b650543e..4121bd7f 100644 --- a/sequence_processing_pipeline/Job.py +++ b/sequence_processing_pipeline/Job.py @@ -1,3 +1,6 @@ +from jinja2 import BaseLoader, TemplateNotFound +from os.path import getmtime +import pathlib from itertools import zip_longest from os import makedirs, walk from os.path import basename, exists, split, join @@ -9,6 +12,26 @@ import logging from inspect import stack import re +from collections import Counter +from glob import glob + + +# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader +class KISSLoader(BaseLoader): + def __init__(self, path): + # pin the path for loader to the location sequence_processing_pipeline + # (the location of this file), along w/the relative path to the + # templates directory. + self.path = join(pathlib.Path(__file__).parent.resolve(), path) + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) class Job: @@ -32,6 +55,7 @@ class Job: slurm_status_running) polling_interval_in_seconds = 60 + squeue_retry_in_seconds = 10 def __init__(self, root_dir, output_path, job_name, executable_paths, max_array_length, modules_to_load=None): @@ -103,7 +127,17 @@ def run(self): raise PipelineError("Base class run() method not implemented.") def parse_logs(self): - raise PipelineError("Base class parse_logs() method not implemented.") + # by default, look for anything to parse in the logs directory. + log_path = join(self.output_path, 'logs') + files = sorted(glob(join(log_path, '*'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if 'error:' in line.lower()] + + return [msg.strip() for msg in msgs] def _which(self, file_path, modules_to_load=None): """ @@ -212,6 +246,41 @@ def _system_call(self, cmd, allow_return_codes=[], callback=None): return {'stdout': stdout, 'stderr': stderr, 'return_code': return_code} + def _query_slurm(self, job_ids): + # query_slurm encapsulates the handling of squeue. + count = 0 + while True: + result = self._system_call("squeue -t all -j " + f"{','.join(job_ids)} " + "-o '%i,%T'") + + if result['return_code'] == 0: + # there was no issue w/squeue, break this loop and + # continue. + break + else: + # there was likely an intermittent issue w/squeue. 
Pause + # and wait before trying a few more times. If the problem + # persists then report the error and exit. + count += 1 + + if count > 3: + raise ExecFailedError(result['stderr']) + + sleep(Job.squeue_retry_in_seconds) + + lines = result['stdout'].split('\n') + lines.pop(0) # remove header + lines = [x.split(',') for x in lines if x != ''] + + jobs = {} + for job_id, state in lines: + # ensure unique_id is of type string for downstream use. + job_id = str(job_id) + jobs[job_id] = state + + return jobs + def wait_on_job_ids(self, job_ids, callback=None): ''' Wait for the given job-ids to finish running before returning. @@ -229,63 +298,27 @@ def wait_on_job_ids(self, job_ids, callback=None): # ensure all ids are strings to ensure proper working w/join(). job_ids = [str(x) for x in job_ids] - def query_slurm(job_ids): - # internal function query_slurm encapsulates the handling of - # squeue. - count = 0 - while True: - result = self._system_call("squeue -t all -j " - f"{','.join(job_ids)} " - "-o '%F,%A,%T'") - - if result['return_code'] == 0: - # there was no issue w/squeue, break this loop and - # continue. - break - else: - # there was a likely intermittent issue w/squeue. Pause - # and wait before trying a few more times. If the problem - # persists then report the error and exit. - count += 1 - - if count > 3: - raise ExecFailedError(result['stderr']) - - sleep(60) - - lines = result['stdout'].split('\n') - lines.pop(0) # remove header - lines = [x.split(',') for x in lines if x != ''] - - jobs = {} - child_jobs = {} - for job_id, unique_id, state in lines: - jobs[unique_id] = state - - if unique_id != job_id: - child_jobs[unique_id] = job_id # job is a child job - - return jobs, child_jobs - while True: - jobs, child_jobs = query_slurm(job_ids) - - for jid in job_ids: - logging.debug("JOB %s: %s" % (jid, jobs[jid])) - if callback is not None: - callback(jid=jid, status=jobs[jid]) - - children = [x for x in child_jobs if child_jobs[x] == jid] - if len(children) == 0: - logging.debug("\tNO CHILDREN") - for cid in children: - logging.debug("\tCHILD JOB %s: %s" % (cid, jobs[cid])) - status = [jobs[x] in Job.slurm_status_not_running for x in job_ids] - - if set(status) == {True}: - # all jobs either completed successfully or terminated. + # Because query_slurm only returns state on the job-ids we specify, + # the wait process is a simple check to see whether any of the + # states are 'running' states or not. + jobs = self._query_slurm(job_ids) + + # jobs will be a dict of job-ids or array-ids for jobs that + # are array-jobs. the value of jobs[id] will be a state e.g.: + # 'RUNNING', 'FAILED', 'COMPLETED'. + states = [jobs[x] in Job.slurm_status_not_running for x in jobs] + + if set(states) == {True}: + # if all the states are either FAILED or COMPLETED + # then the set of those states no matter how many + # array-jobs there were will ultimately be the set of + # {True}. If not then that means there are still jobs + # that are running. break + logging.debug(f"sleeping {Job.polling_interval_in_seconds} " + "seconds...") sleep(Job.polling_interval_in_seconds) return jobs @@ -345,16 +378,48 @@ def submit_job(self, script_path, job_parameters=None, # to the user. 
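A minimal standalone sketch of the squeue parsing and state tally described here (the sample stdout is assumed, not taken from a real run):

```python
from collections import Counter

# assumed sample stdout from: squeue -t all -j 1234567 -o '%i,%T'
stdout = ("JOBID,STATE\n"
          "1234567_1,COMPLETED\n"
          "1234567_2,COMPLETED\n"
          "1234567_3,FAILED\n")

lines = stdout.split('\n')
lines.pop(0)  # remove header
lines = [x.split(',') for x in lines if x != '']
jobs = {str(job_id): state for job_id, state in lines}

# array jobs are summarized as a tally of terminal states rather than a
# single state string.
counts = Counter(jobs.values())
print(dict(counts))  # {'COMPLETED': 2, 'FAILED': 1}
```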
results = self.wait_on_job_ids([job_id], callback=callback) - job_result = {'job_id': job_id, 'job_state': results[job_id]} + if job_id in results: + # job is a non-array job + job_result = {'job_id': job_id, 'job_state': results[job_id]} + else: + # job is an array job + # assume all array jobs in this case will be associated w/job_id. + counts = Counter() + for array_id in results: + counts[results[array_id]] += 1 + + # for array jobs we won't be returning a string representing the + # state of a single job. Instead we're returning a dictionary of + # the number of unique states the set of array-jobs ended up in and + # the number for each one. + job_result = {'job_id': job_id, 'job_state': dict(counts)} if callback is not None: - callback(jid=job_id, status=job_result['job_state']) + if isinstance(job_result['job_state'], dict): + # this is an array job + states = [] + for key in counts: + states.append(f"{key}: {counts[key]}") - if job_result['job_state'] == 'COMPLETED': - return job_result + callback(jid=job_id, status=", ".join(states)) + + else: + # this is a standard job + callback(jid=job_id, status=job_result['job_state']) + + if isinstance(job_result['job_state'], dict): + states = list(job_result['job_state'].keys()) + if states == ['COMPLETED']: + return job_result + else: + raise JobFailedError(f"job {job_id} exited with jobs in the " + f"following states: {', '.join(states)}") else: - raise JobFailedError(f"job {job_id} exited with status " - f"{job_result['job_state']}") + if job_result['job_state'] == 'COMPLETED': + return job_result + else: + raise JobFailedError(f"job {job_id} exited with status " + f"{job_result['job_state']}") def _group_commands(self, cmds): # break list of commands into chunks of max_array_length (Typically diff --git a/sequence_processing_pipeline/NuQCJob.py b/sequence_processing_pipeline/NuQCJob.py index b2ffd3a9..0e05b41d 100644 --- a/sequence_processing_pipeline/NuQCJob.py +++ b/sequence_processing_pipeline/NuQCJob.py @@ -1,8 +1,7 @@ -from jinja2 import BaseLoader, TemplateNotFound from metapool import load_sample_sheet from os import stat, makedirs, rename -from os.path import join, basename, dirname, exists, abspath, getmtime -from sequence_processing_pipeline.Job import Job +from os.path import join, basename, dirname, exists, abspath, split +from sequence_processing_pipeline.Job import Job, KISSLoader from sequence_processing_pipeline.PipelineError import (PipelineError, JobFailedError) from sequence_processing_pipeline.Pipeline import Pipeline @@ -11,28 +10,9 @@ from sequence_processing_pipeline.Commands import split_similar_size_bins from sequence_processing_pipeline.util import iter_paired_files from jinja2 import Environment -import glob +from glob import glob import re from sys import executable -import pathlib - - -# taken from https://jinja.palletsprojects.com/en/3.0.x/api/#jinja2.BaseLoader -class KISSLoader(BaseLoader): - def __init__(self, path): - # pin the path for loader to the location sequence_processing_pipeline - # (the location of this file), along w/the relative path to the - # templates directory. 
- self.path = join(pathlib.Path(__file__).parent.resolve(), path) - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path) as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) logging.basicConfig(level=logging.DEBUG) @@ -128,6 +108,9 @@ def __init__(self, fastq_root_dir, output_path, sample_sheet_path, self.minimum_bytes = 3100 self.fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}' r'\.fastq\.gz$') + self.interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d' + r'_\d{3}\.interleave\.fastq' + r'\.gz$') self.html_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.html$') self.json_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d_\d{3}\.json$') @@ -167,7 +150,7 @@ def _filter_empty_fastq_files(self, filtered_directory, ''' empty_list = [] - files = glob.glob(join(filtered_directory, f'*.{self.suffix}')) + files = glob(join(filtered_directory, f'*.{self.suffix}')) for r1, r2 in iter_paired_files(files): full_path = join(filtered_directory, r1) @@ -194,7 +177,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): substr = regex.search(file_name) if substr is None: raise ValueError(f"{file_name} does not follow naming " - " pattern.") + "pattern.") else: # check if found substring is a member of this # project. Note sample-name != sample-id @@ -214,8 +197,7 @@ def _move_helper(self, completed_files, regex, samples_in_project, dst): for fp in files_to_move: move(fp, dst) - @staticmethod - def _move_trimmed_files(project_name, output_path): + def _move_trimmed_files(self, project_name, output_path): ''' Given output_path, move all fastqs to a new subdir named project_name. :param project_name: The name of the new folder to be created. @@ -229,8 +211,16 @@ def _move_trimmed_files(project_name, output_path): # this directory shouldn't already exist. makedirs(join(output_path, project_name), exist_ok=False) - for trimmed_file in list(glob.glob(pattern)): - move(trimmed_file, join(output_path, project_name)) + sample_ids = [x[0] for x in self.sample_ids + if x[1] == project_name] + + for trimmed_file in list(glob(pattern)): + file_name = split(trimmed_file)[1] + substr = self.interleave_fastq_regex.search(file_name) + if substr is not None: + # only move the sample_ids in this project. + if substr[1] in sample_ids: + move(trimmed_file, join(output_path, project_name)) else: raise ValueError(f"'{output_path}' does not exist") @@ -282,10 +272,9 @@ def run(self, callback=None): for project in self.project_data: project_name = project['Sample_Project'] needs_human_filtering = project['HumanFiltering'] - source_dir = join(self.output_path, project_name) pattern = f"{source_dir}/*.fastq.gz" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) # if the 'only-adapter-filtered' directory exists, move the files # into a unique location so that files from multiple projects @@ -294,7 +283,7 @@ def run(self, callback=None): 'only-adapter-filtered') if exists(trimmed_only_path): - NuQCJob._move_trimmed_files(project_name, trimmed_only_path) + self._move_trimmed_files(project_name, trimmed_only_path) if needs_human_filtering is True: filtered_directory = join(source_dir, 'filtered_sequences') @@ -330,7 +319,7 @@ def run(self, callback=None): # move all html files underneath the subdirectory for this project. 
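A small sketch of the prefix matching used to scope files to a project, shown here with the new interleave regex; the html/json moves just below use the same idea (the file name is hypothetical, patterned after the Tissue_1_... examples in the comments):

```python
import re

interleave_fastq_regex = re.compile(r'^(.*)_S\d{1,4}_L\d{3}_R\d'
                                    r'_\d{3}\.interleave\.fastq\.gz$')

# hypothetical trimmed-file name
file_name = 'Tissue_1_Super_Trizol_S19_L001_R1_001.interleave.fastq.gz'
substr = interleave_fastq_regex.search(file_name)
print(substr[1])  # 'Tissue_1_Super_Trizol' -> checked against sample_ids
```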
pattern = f"{old_html_path}/*.html" - completed_htmls = list(glob.glob(pattern)) + completed_htmls = list(glob(pattern)) self._move_helper(completed_htmls, # Tissue_1_Super_Trizol_S19_L001_R1_001.html self.html_regex, @@ -339,7 +328,7 @@ def run(self, callback=None): # move all json files underneath the subdirectory for this project. pattern = f"{old_json_path}/*.json" - completed_jsons = list(glob.glob(pattern)) + completed_jsons = list(glob(pattern)) self._move_helper(completed_jsons, # Tissue_1_Super_Trizol_S19_L001_R1_001.json self.json_regex, @@ -357,7 +346,7 @@ def _confirm_job_completed(self): # since NuQCJob processes across all projects in a run, there isn't # a need to iterate by project_name and job_id. pattern = f"{self.output_path}/hds-{self.qiita_job_id}.*.completed" - completed_files = list(glob.glob(pattern)) + completed_files = list(glob(pattern)) if completed_files: return True @@ -510,16 +499,3 @@ def _generate_job_script(self, max_bucket_size): pmls_path=self.pmls_path)) return job_script_path - - def parse_logs(self): - log_path = join(self.output_path, 'logs') - # sorted lists give predictable results - files = sorted(glob.glob(join(log_path, '*.out'))) - msgs = [] - - for some_file in files: - with open(some_file, 'r') as f: - msgs += [line for line in f.readlines() - if 'error:' in line.lower()] - - return [msg.strip() for msg in msgs] diff --git a/sequence_processing_pipeline/Pipeline.py b/sequence_processing_pipeline/Pipeline.py index 21b460c2..9a30c2a0 100644 --- a/sequence_processing_pipeline/Pipeline.py +++ b/sequence_processing_pipeline/Pipeline.py @@ -173,25 +173,20 @@ def get_qiita_id_from_sif_fp(fp): hacky_name_pieces_dict = parse_project_name(temp_name) return hacky_name_pieces_dict[QIITA_ID_KEY] - def __init__(self, configuration_file_path, run_id, sample_sheet_path, - mapping_file_path, output_path, qiita_job_id, pipeline_type): + def __init__(self, configuration_file_path, run_id, input_file_path, + output_path, qiita_job_id, pipeline_type, lane_number=None): """ Initialize Pipeline object w/configuration information. :param configuration_file_path: Path to configuration.json file. :param run_id: Used w/search_paths to locate input run_directory. - :param sample_sheet_path: Path to sample-sheet. - :param mapping_file_path: Path to mapping file. + :param input_file_path: Path to sample-sheet or pre-prep file. :param output_path: Path where all pipeline-generated files live. :param qiita_job_id: Qiita Job ID creating this Pipeline. :param pipeline_type: Pipeline type ('Amplicon', 'Metagenomic', etc.) + :param lane_number: (Optional) overwrite lane_number in input_file. 
""" - if sample_sheet_path is not None and mapping_file_path is not None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") - - if sample_sheet_path is None and mapping_file_path is None: - raise PipelineError("sample_sheet_path or mapping_file_path " - "must be defined, but not both.") + if input_file_path is None: + raise PipelineError("user_input_file_path cannot be None") if pipeline_type not in Pipeline.pipeline_types: raise PipelineError(f"'{type}' is not a valid pipeline type.") @@ -235,22 +230,36 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, self.run_id = run_id self.qiita_job_id = qiita_job_id self.pipeline = [] + self.assay_type = None - if sample_sheet_path: - self.search_paths = self.configuration['search_paths'] - self.sample_sheet = self._validate_sample_sheet(sample_sheet_path) - self.mapping_file = None - else: + # this method will catch a run directory as well as its products + # directory, which also has the same name. Hence, return the + # shortest matching path as that will at least return the right + # path between the two. + results = [] + + if pipeline_type == Pipeline.AMPLICON_PTYPE: self.search_paths = self.configuration['amplicon_search_paths'] - self.mapping_file = self._validate_mapping_file(mapping_file_path) - # unlike _validate_sample_sheet() which returns a SampleSheet - # object that stores the path to the file it was created from, - # _validate_mapping_file() just returns a DataFrame. Store the - # path to the original mapping file itself as well. - self.mapping_file_path = mapping_file_path - self.sample_sheet = None + self.assay_type = Pipeline.AMPLICON_ATYPE + else: + self.search_paths = self.configuration['search_paths'] + + for search_path in self.search_paths: + logging.debug(f'Searching {search_path} for {self.run_id}') + for entry in listdir(search_path): + some_path = join(search_path, entry) + # ensure some_path never ends in '/' + some_path = some_path.rstrip('/') + if isdir(some_path) and some_path.endswith(self.run_id): + logging.debug(f'Found {some_path}') + results.append(some_path) - self.run_dir = self._search_for_run_dir() + if results: + results.sort(key=lambda s: len(s)) + self.run_dir = results[0] + else: + raise PipelineError(f"A run-dir for '{self.run_id}' could not be " + "found") # required files for successful operation # both RTAComplete.txt and RunInfo.xml should reside in the root of @@ -268,14 +277,78 @@ def __init__(self, configuration_file_path, run_id, sample_sheet_path, except PermissionError: raise PipelineError('RunInfo.xml is present, but not readable') - if self.mapping_file is not None: + self.input_file_path = input_file_path + + if pipeline_type == Pipeline.AMPLICON_PTYPE: + # assume input_file_path references a pre-prep (mapping) file. + + self.mapping_file = self._validate_mapping_file(input_file_path) + # unlike _validate_sample_sheet() which returns a SampleSheet + # object that stores the path to the file it was created from, + # _validate_mapping_file() just returns a DataFrame. Store the + # path to the original mapping file itself as well. + # create dummy sample-sheet output_fp = join(output_path, 'dummy_sample_sheet.csv') self.generate_dummy_sample_sheet(self.run_dir, output_fp) - self.sample_sheet = output_fp + self.dummy_sheet_path = output_fp + + # Optional lane_number parameter is ignored for Amplicon + # runs, as the only valid value is 1. 
+ else: + if lane_number is not None: + # confirm that the lane_number is a reasonable value. + lane_number = int(lane_number) + if lane_number < 1 or lane_number > 8: + raise ValueError(f"'{lane_number}' is not a valid name" + " number") + + # overwrite sample-sheet w/DFSheets processed version + # with overwritten Lane number. + sheet = load_sample_sheet(input_file_path) + with open(input_file_path, 'w') as f: + sheet.write(f, lane=lane_number) + + # assume user_input_file_path references a sample-sheet. + self.sample_sheet = self._validate_sample_sheet(input_file_path) + self.mapping_file = None + + if self.assay_type is None: + # set self.assay_type for non-amplicon types. + assay_type = self.sample_sheet.Header['Assay'] + if assay_type not in Pipeline.assay_types: + raise ValueError(f"'{assay_type} is not a valid Assay type") + self.assay_type = assay_type self._configure_profile() + def get_sample_sheet_path(self): + """ + Returns path to a sample-sheet or dummy sample-sheet for amplicon runs. + """ + if self.assay_type == Pipeline.AMPLICON_ATYPE: + # assume self.dummy_sheet_path has been created for amplicon runs. + return self.dummy_sheet_path + else: + # assume input_file_path is a sample-sheet for non-amplicon runs. + return self.input_file_path + + def get_software_configuration(self, software): + if software is None or software == "": + raise ValueError(f"'{software}' is not a valid value") + + key_order = ['profile', 'configuration', software] + + config = self.config_profile + + for key in key_order: + if key in config: + config = config[key] + else: + raise PipelineError(f"'{key}' is not defined in configuration") + + return config + def identify_reserved_words(self, words): ''' Returns a list of words that should not appear as column names in any @@ -294,7 +367,7 @@ def identify_reserved_words(self, words): # specifically how the proper set of prep-info file columns are # generated. For now the functionality will be defined here as this # area of metapool is currently in flux. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: reserved = PREP_MF_COLUMNS else: # results will be dependent on SheetType and SheetVersion of @@ -313,15 +386,6 @@ def _configure_profile(self): # from self.sample_sheet (or self.mapping_file). instr_type = InstrumentUtils.get_instrument_type(self.run_dir) - if isinstance(self.sample_sheet, str): - # if self.sample_sheet is a file instead of a KLSampleSheet() - # type, then this is an Amplicon run. - assay_type = Pipeline.AMPLICON_ATYPE - else: - assay_type = self.sample_sheet.Header['Assay'] - if assay_type not in Pipeline.assay_types: - raise ValueError(f"'{assay_type} is not a valid Assay type") - # open the configuration profiles directory as specified by # profiles_path in the configuration.json file. parse each json into # a nested dictionary keyed by (instrument-type, assay-type) as @@ -381,40 +445,17 @@ def _configure_profile(self): i_type = profile['profile']['instrument_type'] a_type = profile['profile']['assay_type'] - if i_type == instr_type and a_type == assay_type: + if i_type == instr_type and a_type == self.assay_type: selected_profile = profile break if selected_profile is None: - raise ValueError(f"a matching profile ({instr_type}, {assay_type}" - ") was not found. Please notify an administrator") + raise ValueError(f"a matching profile ({instr_type}, " + f"{self.assay_type}) was not found. 
Please notify" + " an administrator") self.config_profile = selected_profile - def _search_for_run_dir(self): - # this method will catch a run directory as well as its products - # directory, which also has the same name. Hence, return the - # shortest matching path as that will at least return the right - # path between the two. - results = [] - - for search_path in self.search_paths: - logging.debug(f'Searching {search_path} for {self.run_id}') - for entry in listdir(search_path): - some_path = join(search_path, entry) - # ensure some_path never ends in '/' - some_path = some_path.rstrip('/') - if isdir(some_path) and some_path.endswith(self.run_id): - logging.debug(f'Found {some_path}') - results.append(some_path) - - if results: - results.sort(key=lambda s: len(s)) - return results[0] - - raise PipelineError(f"A run-dir for '{self.run_id}' could not be " - "found") - def _directory_check(self, directory_path, create=False): if exists(directory_path): logging.debug("directory '%s' exists." % directory_path) @@ -591,7 +632,7 @@ def generate_sample_info_files(self, addl_info=None): :param addl_info: A df of (sample-name, project-name) pairs. :return: A list of paths to sample-information-files. """ - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: # Generate a list of BLANKs for each project. temp_df = self.mapping_file[[SAMPLE_NAME_KEY, _PROJECT_NAME_KEY]] temp_df_as_dicts_list = temp_df.to_dict(orient='records') @@ -670,7 +711,7 @@ def get_sample_ids(self): # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: results = list(self.mapping_file.sample_name) else: results = [x.Sample_ID for x in self.sample_sheet.samples] @@ -685,7 +726,7 @@ def get_sample_names(self, project_name=None): ''' # test for self.mapping_file, since self.sample_sheet will be # defined in both cases. - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: return self._get_sample_names_from_mapping_file(project_name) else: return self._get_sample_names_from_sample_sheet(project_name) @@ -775,12 +816,9 @@ def _parse_project_name(self, project_name, short_names): return proj_info[PROJECT_SHORT_NAME_KEY], proj_info[QIITA_ID_KEY] def get_project_info(self, short_names=False): - # test for self.mapping_file, since self.sample_sheet will be - # defined in both cases. 
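A minimal pandas sketch of the per-project contains_replicates lookup added further down (project names are hypothetical and CONTAINS_REPLICATES_KEY is assumed to resolve to 'contains_replicates'):

```python
import pandas as pd

# hypothetical [Bioinformatics] section of a sample-sheet
bi_df = pd.DataFrame([
    {'Sample_Project': 'Project_A_12345', 'contains_replicates': True},
    {'Sample_Project': 'Project_B_67890', 'contains_replicates': False},
])

# subselect the single row matching a project and pull out the flag
df = bi_df.loc[bi_df['Sample_Project'] == 'Project_A_12345']
print(df.iloc[0].to_dict()['contains_replicates'])  # True
```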
results = [] - contains_replicates = None - if self.mapping_file is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: if CONTAINS_REPLICATES_KEY in self.mapping_file: contains_replicates = True else: @@ -792,25 +830,35 @@ def get_project_info(self, short_names=False): {p: parse_project_name(p) for p in sample_project_map} else: projects_info = self.sample_sheet.get_projects_details() - # endif mapping_file if short_names: proj_name_key = PROJECT_SHORT_NAME_KEY else: proj_name_key = PROJECT_FULL_NAME_KEY - # endif + for curr_project_info in projects_info.values(): curr_dict = { _PROJECT_NAME_KEY: curr_project_info[proj_name_key], QIITA_ID_KEY: curr_project_info[QIITA_ID_KEY] } - if contains_replicates is not None: + if self.pipeline_type == Pipeline.AMPLICON_PTYPE: + # this is a mapping file: curr_contains_reps = contains_replicates else: - curr_contains_reps = \ - curr_project_info.get(CONTAINS_REPLICATES_KEY, False) - # endif + bi_df = self.sample_sheet.Bioinformatics + if CONTAINS_REPLICATES_KEY in bi_df.columns.tolist(): + # subselect rows in [Bioinformatics] based on whether they + # match the project name. + df = bi_df.loc[bi_df['Sample_Project'] == + curr_project_info[proj_name_key]] + # since only one project can match by definition, convert + # to dict and extract the needed value. + curr_contains_reps = df.iloc[0].to_dict()[ + CONTAINS_REPLICATES_KEY] + else: + curr_contains_reps = False + curr_dict[CONTAINS_REPLICATES_KEY] = curr_contains_reps results.append(curr_dict) # next project diff --git a/sequence_processing_pipeline/SeqCountsJob.py b/sequence_processing_pipeline/SeqCountsJob.py new file mode 100644 index 00000000..f080bd00 --- /dev/null +++ b/sequence_processing_pipeline/SeqCountsJob.py @@ -0,0 +1,147 @@ +from os.path import join, split +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from os import walk +from json import dumps +from glob import glob + + +logging.basicConfig(level=logging.DEBUG) + + +class SeqCountsJob(Job): + def __init__(self, run_dir, output_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, max_array_length, files_to_count_path, + cores_per_task=4): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param max_array_length: A hard-limit for array-sizes + :param files_to_count_path: A path to a list of file-paths to count. + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'SeqCountsJob', + [], + max_array_length, + modules_to_load=modules_to_load) + + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + + # raise an Error if jmem is not a valid floating point value. 
+ self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.jinja_env = Environment(loader=KISSLoader('templates')) + + self.job_name = (f"seq_counts_{self.qiita_job_id}") + self.files_to_count_path = files_to_count_path + + with open(self.files_to_count_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.file_count = len(lines) + + def run(self, callback=None): + job_script_path = self._generate_job_script() + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'SeqCountsJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + self._aggregate_counts() + + logging.debug(f'SeqCountJob {self.job_info["job_id"]} completed') + + def _generate_job_script(self): + job_script_path = join(self.output_path, "seq_counts.sbatch") + template = self.jinja_env.get_template("seq_counts.sbatch") + + # got to make files_to_count.txt and put it in the output directory + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "seq_counts", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "file_count": self.file_count, + "output_path": self.output_path + })) + + return job_script_path + + def parse_logs(self): + # overrides Job.parse_logs() w/tailored parse for specific logs. 
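# only lines tagged '[E::stk_size]' in the per-task *.err logs
# (presumably emitted by 'seqtk size' while counting) are surfaced
# to the user.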
+ files = sorted(glob(join(self.log_path, '*.err'))) + msgs = [] + + for some_file in files: + with open(some_file, 'r') as f: + msgs += [line for line in f.readlines() + if line.startswith("[E::stk_size]")] + + return [msg.strip() for msg in msgs] + + def _aggregate_counts(self): + def extract_metadata(fp): + with open(fp, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % fp) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + results = {} + + for root, dirs, files in walk(self.log_path): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + results_path = join(self.output_path, 'aggregate_counts.json') + + with open(results_path, 'w') as f: + print(dumps(results, indent=2), file=f) + + return results_path diff --git a/sequence_processing_pipeline/TRIntegrateJob.py b/sequence_processing_pipeline/TRIntegrateJob.py new file mode 100644 index 00000000..7b8740b4 --- /dev/null +++ b/sequence_processing_pipeline/TRIntegrateJob.py @@ -0,0 +1,163 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet +from os import makedirs +from shutil import copyfile + + +logging.basicConfig(level=logging.DEBUG) + + +class TRIntegrateJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, integrate_script_path, sil_path, raw_fastq_dir, + reference_base, reference_map, cores_per_task): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param integrate_script_path: None + :param sil_path: A path to a confidential file mapping C5xx, adapters. + :param reference_base: None + :param reference_map: None + :param cores_per_task: # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TRIntegrateJob', + [], + # max_array_length and self.max_array_length are + # not used by TRIntegrateJob. 
+ -1, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + self.integrate_script_path = integrate_script_path + self.sil_path = sil_path + self.raw_fastq_dir = raw_fastq_dir + self.tmp_dir = join(self.output_path, 'tmp') + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.sample_count = len(self.sample_ids) + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.job_name = (f"integrate_{self.qiita_job_id}") + + with open(self.sil_path, 'r') as f: + # obtain the number of unique barcode_ids as determined by + # TellReadJob() in order to set up an array job of the + # proper length. + lines = f.readlines() + lines = [x.strip() for x in lines] + lines = [x for x in lines if x != ''] + self.barcode_id_count = len(lines) + + def run(self, callback=None): + job_script_path = self._generate_job_script() + + # copy sil_path to TRIntegrate working directory and rename to a + # predictable name. + copyfile(self.sil_path, + join(self.output_path, 'sample_index_list.txt')) + + # generate the tailored subset of adapter to barcode_id based on + # the proprietary lists owned by the manufacturer and supplied by + # the caller, and the barcode ids found in the sample-sheet. + self._generate_sample_index_list() + + makedirs(self.tmp_dir) + + params = ['--parsable', + f'-J {self.job_name}', + f'--array 1-{self.sample_count}'] + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TRIntegrateJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + info = self.parse_logs() + # prepend just the message component of the Error. + info.insert(0, str(e)) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TRIntegrateJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], sample['Sample_Project'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. 
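# in TRIntegrateJob the sample_ids feed self.sample_count, which sizes
# the slurm array submitted in run().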
+ return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'integrate_test.sbatch') + template = self.jinja_env.get_template("integrate.sbatch") + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "integrate", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "integrate_script_path": self.integrate_script_path, + "queue_name": self.queue_name, + "barcode_id_count": self.barcode_id_count, + "raw_fastq_dir": self.raw_fastq_dir, + "tmp_dir": self.tmp_dir, + "output_dir": self.output_path})) + + return job_script_path diff --git a/sequence_processing_pipeline/TellReadJob.py b/sequence_processing_pipeline/TellReadJob.py new file mode 100644 index 00000000..3d68d4c8 --- /dev/null +++ b/sequence_processing_pipeline/TellReadJob.py @@ -0,0 +1,174 @@ +from os.path import join +from .Job import Job, KISSLoader +from .PipelineError import JobFailedError +import logging +from jinja2 import Environment +from .Pipeline import Pipeline +from .PipelineError import PipelineError +from metapool import load_sample_sheet + + +logging.basicConfig(level=logging.DEBUG) + + +class TellReadJob(Job): + def __init__(self, run_dir, output_path, sample_sheet_path, queue_name, + node_count, wall_time_limit, jmem, modules_to_load, + qiita_job_id, reference_base, + reference_map, sing_script_path, cores_per_task): + """ + ConvertJob provides a convenient way to run bcl-convert or bcl2fastq + on a directory BCL files to generate Fastq files. + :param run_dir: The 'run' directory that contains BCL files. + :param output_path: Path where all pipeline-generated files live. + :param sample_sheet_path: The path to a sample-sheet. + :param queue_name: The name of the Torque queue to use for processing. + :param node_count: The number of nodes to request. + :param wall_time_limit: A hard time limit (in min) to bound processing. + :param jmem: String representing total memory limit for entire job. + :param modules_to_load: A list of Linux module names to load + :param qiita_job_id: identify Torque jobs using qiita_job_id + :param reference_base: None + :param reference_map: None + :param cores_per_task: (Optional) # of CPU cores per node to request. + """ + super().__init__(run_dir, + output_path, + 'TellReadJob', + [], + 1, + modules_to_load=modules_to_load) + + self.sample_sheet_path = sample_sheet_path + self._file_check(self.sample_sheet_path) + metadata = self._process_sample_sheet() + self.sample_ids = metadata['sample_ids'] + self.queue_name = queue_name + self.node_count = node_count + self.wall_time_limit = wall_time_limit + self.cores_per_task = cores_per_task + + self.reference_base = reference_base + self.reference_map = reference_map + + # raise an Error if jmem is not a valid floating point value. + self.jmem = str(int(jmem)) + self.qiita_job_id = qiita_job_id + self.jinja_env = Environment(loader=KISSLoader('templates')) + self.sing_script_path = sing_script_path + + sheet = load_sample_sheet(self.sample_sheet_path) + lane = sheet.samples[0].Lane + + # force self.lane_number to be int. raise an Error if it's not. 
+ tmp = int(lane) + if tmp < 1 or tmp > 8: + raise ValueError(f"'{tmp}' is not a valid lane number") + self.lane_number = tmp + + self.job_name = (f"{self.qiita_job_id}-tellread") + + def run(self, callback=None): + job_script_path = self._generate_job_script() + + # everything is in the job script so there are no additional params. + params = [] + + try: + self.job_info = self.submit_job(job_script_path, + job_parameters=' '.join(params), + exec_from=None, + callback=callback) + + logging.debug(f'TellReadJob Job Info: {self.job_info}') + except JobFailedError as e: + # When a job has failed, parse the logs generated by this specific + # job to return a more descriptive message to the user. + # TODO: We need more examples of failed jobs before we can create + # a parser for the logs. + # info = self.parse_logs() + # prepend just the message component of the Error. + # info.insert(0, str(e)) + info = str(e) + raise JobFailedError('\n'.join(info)) + + logging.debug(f'TellReadJob {self.job_info["job_id"]} completed') + + def _process_sample_sheet(self): + sheet = load_sample_sheet(self.sample_sheet_path) + + if not sheet.validate_and_scrub_sample_sheet(): + s = "Sample sheet %s is not valid." % self.sample_sheet_path + raise PipelineError(s) + + header = sheet.Header + chemistry = header['chemistry'] + + if header['Assay'] not in Pipeline.assay_types: + s = "Assay value '%s' is not recognized." % header['Assay'] + raise PipelineError(s) + + sample_ids = [] + for sample in sheet.samples: + sample_ids.append((sample['Sample_ID'], + sample['Sample_Project'], + sample['barcode_id'])) + + bioinformatics = sheet.Bioinformatics + + # reorganize the data into a list of dictionaries, one for each row. + # the ordering of the rows will be preserved in the order of the list. + lst = bioinformatics.to_dict('records') + + # human-filtering jobs are scoped by project. Each job requires + # particular knowledge of the project. + return {'chemistry': chemistry, + 'projects': lst, + 'sample_ids': sample_ids} + + def _generate_job_script(self): + job_script_path = join(self.output_path, 'tellread_test.sbatch') + template = self.jinja_env.get_template("tellread.sbatch") + + # generate a comma separated list of sample-ids from the tuples stored + # in self.sample_ids. + + # NB: Proposed sample-sheets will have traditional Sample_ID and + # Sample_Name columns as well as a new value named barcode_id. It's + # this column that will contain the 'C50n' values needed to be + # supplied to tellread. Later we will use this mapping to rename the + # files from C50n...fastq.gz to sample-name...fastq.gz. + samples = ','.join([id[2] for id in self.sample_ids]) + + # since we haven't included support for reference_map yet, whenever a + # reference is not included, the mapping against the list of sample_ids + # is ['NONE', 'NONE', ..., 'NONE']. + refs = ','.join(['NONE' for _ in self.sample_ids]) + + extra = "" + + with open(job_script_path, mode="w", encoding="utf-8") as f: + f.write(template.render({ + "job_name": "tellread", + "wall_time_limit": self.wall_time_limit, + "mem_in_gb": self.jmem, + "node_count": self.node_count, + "cores_per_task": self.cores_per_task, + "queue_name": self.queue_name, + "sing_script_path": self.sing_script_path, + "modules_to_load": ' '.join(self.modules_to_load), + "lane": f"s_{self.lane_number}", + # NB: Note that we no longer create a sub-directory under the + # working directory for TellRead to create all its output + # folders and files. 
This means it is creating folders and + # files in the same directory that has our sbatch script and + # logs directory. Currently there are no name collisions, + # however. + "output": self.output_path, + "rundir_path": self.root_dir, + "samples": samples, + "refs": refs, + "extra": extra + })) + + return job_script_path diff --git a/sequence_processing_pipeline/aggregate_counts.py b/sequence_processing_pipeline/aggregate_counts.py new file mode 100644 index 00000000..ace90212 --- /dev/null +++ b/sequence_processing_pipeline/aggregate_counts.py @@ -0,0 +1,40 @@ +from os import walk +from sys import argv +from os.path import join, split +from json import dumps + + +def extract_metadata(log_output_file_path): + with open(log_output_file_path, 'r') as f: + lines = f.readlines() + lines = [x.strip() for x in lines] + if len(lines) != 2: + raise ValueError("error processing %s" % log_output_file_path) + _dir, _file = split(lines[0]) + seq_counts, base_pairs = lines[1].split('\t') + return _dir, _file, int(seq_counts), int(base_pairs) + + +def aggregate_counts(fp): + results = {} + + for root, dirs, files in walk(fp): + for _file in files: + if _file.endswith('.out'): + log_output_file = join(root, _file) + _dir, _file, seq_counts, base_pairs = \ + extract_metadata(log_output_file) + + if _dir not in results: + results[_dir] = {} + + results[_dir][_file] = {'seq_counts': seq_counts, + 'base_pairs': base_pairs} + + return results + + +if __name__ == '__main__': + results = aggregate_counts(argv[1]) + with open(argv[2], 'w') as f: + print(dumps(results, indent=2), file=f) diff --git a/sequence_processing_pipeline/contrib/create_picklist.py b/sequence_processing_pipeline/contrib/create_picklist.py new file mode 100644 index 00000000..a1d6a1d0 --- /dev/null +++ b/sequence_processing_pipeline/contrib/create_picklist.py @@ -0,0 +1,72 @@ +import os +# from metapool.metapool import * +from sys import argv +import pandas as pd +import matplotlib.pyplot as plt +from metapool.metapool import (read_survival, make_2D_array, + calculate_iseqnorm_pooling_volumes, + format_pooling_echo_pick_list) +import seaborn as sns + +input_sheet_filename = argv[1] + +plate_df_w_reads = pd.read_csv(input_sheet_filename, sep='\t') +plate_df_w_reads['Blank'] = [True if 'blank' in s.lower() else False + for s in plate_df_w_reads['Sample_Name']] +reads_column = 'read_counts' + +well_col = 'Sample_Well' +assert reads_column in plate_df_w_reads.columns + +f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8)) +# evenness plot +rmax = int(round(plate_df_w_reads[reads_column].max(), -2)) + +foo = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is True, + reads_column], + label='Blanks', + rmax=rmax) + +bar = read_survival(plate_df_w_reads.loc[plate_df_w_reads['Blank'] is False, + reads_column], + label='Samples', + rmax=rmax) + +survival_df = pd.concat([foo, bar]) + +ax3.set_xlabel(reads_column) +ax3.set_ylabel('Samples') +survival_df.plot(color=['coral', 'steelblue'], ax=ax1) +ax1.set_xlabel(reads_column) +ax1.set_ylabel('Samples') + +# Histogram +sns.histplot(plate_df_w_reads[reads_column], ax=ax3) + +# Boxplot +sns.boxplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4) +sns.stripplot(x="Blank", y=reads_column, data=plate_df_w_reads, ax=ax4, + size=3, color='black', alpha=0.5) + +plt.tight_layout() +plt.savefig(input_sheet_filename + '.comboplot.pdf') + +pdfn = calculate_iseqnorm_pooling_volumes(plate_df_w_reads, + dynamic_range=20, + normalization_column=reads_column) 
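# the pooling-volume normalization plot (drawn on the current figure,
# presumably by the call above) is saved separately from the combo plot.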
+plt.savefig(input_sheet_filename + '.normalizedplot.pdf') + +vols = make_2D_array(pdfn, + data_col='iSeq normpool volume', + well_col=well_col).astype(float) + +# Write the picklist as .csv +picklist_fp = input_sheet_filename + '.picklist.csv' + +if os.path.isfile(picklist_fp): + print("Warning! This file exists already.") + +picklist = format_pooling_echo_pick_list(vols, max_vol_per_well=30000) + +with open(picklist_fp, 'w') as f: + f.write(picklist) diff --git a/sequence_processing_pipeline/contrib/integrate-indices-np.py b/sequence_processing_pipeline/contrib/integrate-indices-np.py new file mode 100644 index 00000000..b1be83a6 --- /dev/null +++ b/sequence_processing_pipeline/contrib/integrate-indices-np.py @@ -0,0 +1,332 @@ +# Why +# 1) cloudspades requires the index reads be inline in the record header +# 2) Ariadne requires the data are sorted by the barcodes +# +# Inlining is easy. Sorting is complex as the amount of data is large, and +# the ordering stems is determined external to the data being sorted. To +# determine order, all barcodes must be read in to gather the complete +# barcode <-> record association; if only partial data is read then +# associations to barcodes may be missed, and we cannot perform an insertion +# sort efficiently as we're writing to disk. Once we know an order for the +# records, we (currently) read in the entirety of the subsequent data (R1 then +# R2), reorder, and write. Performing this in blocks to minimize memory may be +# possible, but we have to assume access is random as a grouping barcode +# may be with any record along the file. +# +# A variety of approaches were considered, including: +# - indexing portions in a hashtable, reading inputs multiple times, and +# writing in blocks. This was tested in both rust and python. The amount of +# memory was large, and keeping it under control would be many many many +# passes over data on disk or in memory +# - using pandas to do the grouping, which possibly avoids the memory burden +# of a hashmap. it didn't +# - using mmap files. No go, these are large and we have to walk over them +# a lot. +# +# Parsing this stuff adds a lot of overhead in Python. It will add some, if not +# a lot, in rust as well -- our test data had 65M sequences. So the current +# approach operates in the raw file data itself, using regex's to parse +# individual records. We use numpy for sorting and getting record orders. +# This is memory expensive but so far much less than the other approaches tried +# and it does not require multiple passes over files. We bottleneck on write +# IO, so to mitigate that, we are using a parallel gzip (pgzip), which still +# bottlenecks but gets better throughput. +# +# There probably are smarter ways to do this to reduce the memory burden. +# Right now, it's O(N) where N is the number of records. We load R1 and R2 +# separately though so we at least halve the memory use. As for doing it +# faster, at the moment we appear to saturate time on gzip. Easiest solution +# would be to increase the number of threads, but then again, this process +# is expected to run in an array, and filesystem can only take so much. +# +# In addition to the inline tests, md5 checks to verify all record IDs are +# present in both R1 / R2, and relative to original input. Spot checks on +# an arbitrary set of records were performed on R1 / R2 to verify no apparent +# unusual modification. And spot checks were performed to verify that correct +# barcodes are incorporating as expected in output. 
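#
# As a small worked example of the ordering step (toy data in the style
# of test_gather_order() below; relative order of equal barcodes may
# vary with the sort):
#   barcodes = np.array([b'ATGC', b'TTGG', b'ATGC', b'TTTT'])
#   order = barcodes.argsort()            # -> [0, 2, 1, 3]
#   barcodes = barcodes[order]            # -> ATGC, ATGC, TTGG, TTTT
#   unique, bounds = np.unique(barcodes, return_index=True)
#   # unique -> [ATGC, TTGG, TTTT]; bounds -> [0, 2, 3]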
+# +# author: Daniel McDonald (d3mcdonald@eng.ucsd.edu) +import numpy as np +import click +import re +import io +import pgzip +import gzip + + +RECORD = re.compile(rb'@\S+\n[ATGCN]+\n\+\n\S+\n') +BARCODE = re.compile(rb'@\S+\n([ATGCN]+)\n\+\n\S+\n') + + +def gather_order(i1_in_fp): + """Determine record order + + This is a fancy way of saying: get all the barcodes, and sort them. + + We return the order of the sorted records, the unique barcodes, + and the bounds for what barcode associated with what record + """ + # determine barcode length + _ = i1_in_fp.readline() + b = i1_in_fp.readline() + rec_len = len(b.strip()) + i1_in_fp.seek(0) + + # we need larger data in memory later anyway... + i1 = i1_in_fp.read() + start = 0 + end = len(i1) + + # get the number of records. we completely assume non-multiline fastq here + newlines = i1.count(b'\n') + assert newlines % 4 == 0 + barcodes = np.empty(newlines // 4, dtype='|S%d' % rec_len) + + # walk all index records + # grab each barcode + idx = 0 + while start < end: + barcode_result = BARCODE.search(i1, pos=start) + barcode = barcode_result.groups()[0] + assert len(barcode) == rec_len # get angry if the barcode is weird + + barcodes[idx] = barcode + idx += 1 + start = barcode_result.end() + + # we no longer need the raw data so let's toss it + del i1 + + # determine the record order of a lexicographic sort + # gather the unique barcodes so we can use them later, and the bounding + # points in the sorted set + record_order = barcodes.argsort() + barcodes = barcodes[record_order] + unique_barcodes, barcode_bounds = np.unique(barcodes, return_index=True) + + return record_order, unique_barcodes, barcode_bounds + + +def test_gather_order(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + exp_order = np.array([0, 2, 1, 4, 3, 5, 6]) + exp_unique = np.array([b'ATGC', b'TTGG', b'TTTT']) + exp_bounds = np.array([0, 2, 4]) + + assert (order == exp_order).all() + assert (unique == exp_unique).all() + assert (bounds == exp_bounds).all() + + +def troll_and_write(order, unique, bounds, in_, out_): + """Walk over the raw data, spit out barcode amended records in order + + - read all data + - get index boundaries for each record + - pull out each record in order according to the barcode data + - associate the barcode + - write + """ + + data = in_.read() + boundaries = np.empty([order.size, 2], dtype=np.uint64) + + stop = 0 + for idx in range(order.size): + rec = RECORD.search(data, pos=stop) + start, stop = rec.span() + boundaries[idx] = np.array([start, stop], dtype=np.uint64) + + current_barcode_idx = 0 + current_barcode = unique[current_barcode_idx] + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + for order_idx, record_idx in enumerate(order): + if order_idx >= current_barcode_bound_end: + current_barcode_idx += 1 + + if current_barcode_idx >= bounds.size: + raise ValueError("should not happen?") + current_barcode = unique[current_barcode_idx] + + if current_barcode_idx + 1 >= bounds.size: + # run to the end + current_barcode_bound_end = order.size + else: + current_barcode_bound_end = bounds[current_barcode_idx + 1] + + start, stop = boundaries[record_idx] + record = data[start:stop] + + # in a one-off, these might pass by chance. 
It would be real weird + # for them to always pass for all records in a large file. + # n.b., b'foo'[0] is int, because yay, so we use a slice to maintain + # a human readable character to test against as most mortals haven't + # memorized the ascii table + assert record[:1] == b'@' + assert record[-1:] == b'\n' + + with_barcode = insert_barcode(record, current_barcode) + out_.write(with_barcode) + + +def test_troll_and_write(): + i1data = [b'@foo', b'ATGC', b'+', b'!!!!', + b'@bar', b'TTGG', b'+', b'!!!!', + b'@baz', b'ATGC', b'+', b'!!!!', + b'@oof', b'TTTT', b'+', b'!!!!', + b'@rab', b'TTGG', b'+', b'!!!!', + b'@zab', b'TTTT', b'+', b'!!!!', + b'@ofo', b'TTTT', b'+', b'!!!!', b''] + + i1 = io.BytesIO(b'\n'.join(i1data)) + order, unique, bounds = gather_order(i1) + + # we assume records are in the same order, as that has previously been + # observed w/ tellread and is the normal expectation + r1data = [b'@foo', b'AATGC', b'+', b'!!!!!', + b'@bar', b'ATTGG', b'+', b'!!!!!', + b'@baz', b'AATGC', b'+', b'!!!!!', + b'@oof', b'ATTTT', b'+', b'!!!!!', + b'@rab', b'ATTGG', b'+', b'!!!!!', + b'@zab', b'ATTTT', b'+', b'!!!!!', + b'@ofo', b'ATTTT', b'+', b'!!!!!', b''] + r1 = io.BytesIO(b'\n'.join(r1data)) + r1out = io.BytesIO() + troll_and_write(order, unique, bounds, r1, r1out) + r1out.seek(0) + + r1exp = [b'@foo BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@baz BX:Z:ATGC-1', b'AATGC', b'+', b'!!!!!', + b'@bar BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@rab BX:Z:TTGG-1', b'ATTGG', b'+', b'!!!!!', + b'@oof BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@zab BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b'@ofo BX:Z:TTTT-1', b'ATTTT', b'+', b'!!!!!', + b''] + r1exp = b'\n'.join(r1exp) + assert r1exp == r1out.read() + + +def create_tag(t): + return b'BX:Z:%s-1' % t + + +def create_tag_no_suffix(t): + return b'BX:Z:%s' % t + + +def insert_barcode(record, barcode): + """Get the current ID, smash the needed tag in""" + # @foo\nATGC\n+\n!!!!\n + id_, remainder = record.split(b'\n', 1) + tag = create_tag(barcode) + return b'%s %s\n%s' % (id_, tag, remainder) + + +def readfq(fp): + if fp.mode == 'rb': + strip = bytes.strip + else: + strip = str.strip + + id_ = iter(fp) + seq = iter(fp) + dumb = iter(fp) + qual = iter(fp) + for rec in zip(id_, seq, dumb, qual): + yield list(map(strip, rec)) + + +def writefq(rec, out): + for item in rec: + out.write(item) + out.write(b'\n') + + +@click.group() +def cli(): + pass + + +@cli.command() +def tests(): + test_gather_order() + test_troll_and_write() + + +@cli.command() +@click.option('--r1-in', type=click.Path(exists=True), required=True) +@click.option('--r2-in', type=click.Path(exists=True), required=True) +@click.option('--i1-in', type=click.Path(exists=True), required=True) +@click.option('--r1-out', type=click.Path(exists=False), required=True) +@click.option('--r2-out', type=click.Path(exists=False), required=True) +@click.option('--threads', type=int, required=False, default=1) +@click.option('--no-sort', is_flag=True, default=False) +def integrate(r1_in, r2_in, i1_in, r1_out, r2_out, threads, no_sort): + r1_in_fp = open(r1_in, 'rb') + r2_in_fp = open(r2_in, 'rb') + i1_in_fp = open(i1_in, 'rb') + + if no_sort: + r1_out_fp = gzip.open(r1_out, mode='wb') + r2_out_fp = gzip.open(r2_out, mode='wb') + + r1_sniff = r1_in_fp.readline().strip() + r2_sniff = r2_in_fp.readline().strip() + r1_in_fp.seek(0) + r2_in_fp.seek(0) + + # outputs from tellread don't seem to have orientation information + # some downstream programs hate this, so let's add if needed. 
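The branch that follows appends `/1`/`/2` only when the sniffed headers lack an orientation suffix, then glues on the `BX:Z:` tag built by `create_tag_no_suffix()`. A toy illustration of the resulting header (the read ID and barcode here are invented):

```python
# illustration only; mirrors r1[0] = b"%s%s %s" % (r1[0], orient_r1, tag)
read_id = b'@record123'
barcode = b'ATGCATGCAT'            # sequence of the matching I1 record

def tag_header(read_id, orient, barcode):
    return b'%s%s BX:Z:%s' % (read_id, orient, barcode)

print(tag_header(read_id, b'/1', barcode))
# b'@record123/1 BX:Z:ATGCATGCAT'
```

One detail worth double-checking in the branch below: when the inputs already end in `/1`, `orient_r1` and `orient_r2` are set to the str `''` rather than `b''`; since `%s` in bytes formatting rejects str arguments, that path would likely raise a TypeError if it were ever exercised.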
+ if r1_sniff.endswith(b'/1'): + if not r2_sniff.endswith(b'/2'): + raise ValueError('unexpected endings: ' + f'{r1_sniff.decode("utf-8")} ' + f'{r2_sniff.decode("utf-8")}') + orient_r1 = '' + orient_r2 = '' + else: + assert b'/1' not in r1_sniff + + orient_r1 = b'/1' + orient_r2 = b'/2' + + for (r1, r2, i1) in zip(*map(readfq, [r1_in_fp, r2_in_fp, i1_in_fp])): + assert r1[0] == r2[0] + assert r1[0] == i1[0] + + tag = create_tag_no_suffix(i1[1]) + r1[0] = b"%s%s %s" % (r1[0], orient_r1, tag) + r2[0] = b"%s%s %s" % (r2[0], orient_r2, tag) + writefq(r1, r1_out_fp) + writefq(r2, r2_out_fp) + r1_out_fp.close() + r2_out_fp.close() + else: + # 200MB is what they use in their readme... + r1_out_fp = pgzip.open(r1_out, mode='wb', thread=threads, + blocksize=2*10**8) + r2_out_fp = pgzip.open(r2_out, mode='wb', thread=threads, + blocksize=2*10**8) + + order, unique, bounds = gather_order(i1_in_fp) + + for in_, out_ in zip([r1_in_fp, r2_in_fp], [r1_out_fp, r2_out_fp]): + troll_and_write(order, unique, bounds, in_, out_) + in_.close() + out_.close() + + +if __name__ == '__main__': + cli() diff --git a/sequence_processing_pipeline/contrib/plot_counts.py b/sequence_processing_pipeline/contrib/plot_counts.py new file mode 100644 index 00000000..ecab9e49 --- /dev/null +++ b/sequence_processing_pipeline/contrib/plot_counts.py @@ -0,0 +1,27 @@ +import matplotlib.pyplot as plt +import re +import sys +import os +import pandas as pd + +ex = re.compile(r'_I1_(C5\d\d).fastq.gz.corrected.err_barcode_removed.fastq') + +# remove total line from wc +data = [x.strip().split(' ') for x in open(sys.argv[1])][:-1] +plotdata = [(ex.search(i).groups()[0], int(v) / 4) for v, i in data] +sheetdata = dict(plotdata) + +ordered = sorted(plotdata, key=lambda x: x[1]) +f = plt.figure(figsize=(16, 8)) +plt.bar([i for i, _ in ordered], [v for _, v in ordered]) +plt.ylabel('I1 reads') +plt.xticks(list(range(len(ordered))), [i for i, _ in ordered], rotation=90) +plt.savefig(sys.argv[3] + '/counts.pdf') + +sheet = pd.read_csv(sys.argv[2], dtype=str) +sheet = sheet[~sheet['Lane'].isnull()] +sheet['read_counts'] = [sheetdata[i] for i in sheet['Barcode_ID']] +name = os.path.basename(sys.argv[2]).rsplit('.', 1)[0] +newname = name + '.read_counts.tsv' + +sheet.to_csv(sys.argv[3] + '/' + newname, sep='\t', index=False, header=True) diff --git a/sequence_processing_pipeline/scripts/fake_squeue.py b/sequence_processing_pipeline/scripts/fake_squeue.py new file mode 100755 index 00000000..6c8511ce --- /dev/null +++ b/sequence_processing_pipeline/scripts/fake_squeue.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +from json import load, dumps +from os.path import exists, join +from sys import argv +from random import randint, choice + + +def print_state(state): + # Note that %i will appear w/column name 'JOBID' in actual squeue output. + # this is because %i shows the array-id if it's an array job and what we + # consider the regular job-id if it's not an array job. 
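On the consuming side, output in this `JOBID,STATE` shape is what `Job._query_slurm()` reads back from `squeue -o '%i,%T'`. A minimal sketch of turning that text into the `{job_or_array_id: state}` mapping the tests further below rely on (the real method may differ in details such as retries and quoting):

```python
def parse_squeue_csv(stdout):
    # sketch only; assumes one "id,state" pair per line after a header row
    jobs = {}
    for line in stdout.strip().splitlines():
        line = line.strip().strip("'")
        if not line or line.upper().startswith('JOBID'):
            continue                       # skip the JOBID,STATE header
        job_id, state = line.split(',', 1)
        jobs[job_id] = state
    return jobs


example = "JOBID,STATE\n1234567_1,RUNNING\n1234567_2,COMPLETED\n1234568,FAILED\n"
assert parse_squeue_csv(example) == {'1234567_1': 'RUNNING',
                                     '1234567_2': 'COMPLETED',
                                     '1234568': 'FAILED'}
```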
+ print("JOBID,STATE") + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job + for array_id in state[job_id]['array_ids']: + if state[job_id]['array_ids'][array_id] <= 0: + end_state = state[job_id]['endgame'][array_id] + else: + end_state = 'RUNNING' + + print(f"{array_id},{end_state}") + else: + # this is a non-array job + if state[job_id]['countdown'] <= 0: + end_state = state[job_id]['endgame'] + else: + end_state = 'RUNNING' + + print(f"{job_id},{end_state}") + + +def generate_output(job_ids): + results = {} + + for job_id in job_ids: + is_successful = choice([True, False]) + is_array_job = choice([True, False]) + + if is_array_job: + result = {'job_id': job_id} + result['array_ids'] = {} + result['endgame'] = {} + + for i in range(0, randint(5, 15)): + array_id = "%s_%d" % (job_id, i) + result['array_ids'][array_id] = randint(3, 7) + result['array_ids'][array_id] = randint(3, 7) + if is_successful: + # all array jobs must be successful + result['endgame'][array_id] = "COMPLETED" + else: + # some jobs may succeed but some may fail + result['endgame'][array_id] = choice( + ['COMPLETED', 'FAILED']) + results[job_id] = result + else: + result = {'job_id': job_id} + result['countdown'] = randint(3, 7) + result['endgame'] = choice(['COMPLETED', 'FAILED']) + results[job_id] = result + + return results + + +def save_state(state, file_path): + with open(file_path, 'w') as f: + print(dumps(state, indent=2), file=f) + + +def load_state(file_path): + with open(file_path, 'r') as f: + return load(f) + + +if __name__ == "__main__": + # "squeue -t all -j " f"{','.join(job_ids)} " "-o '%i,%T'" + job_ids = argv[4].split(',') + + state_file_path = join("sequence_processing_pipeline", "scripts", + "my_state.json") + + state = generate_output(job_ids) + + if exists(state_file_path): + state = load_state(state_file_path) + else: + state = generate_output(job_ids) + + print_state(state) + + for job_id in state: + if 'array_ids' in state[job_id]: + # this is an array job. + for array_id in state[job_id]['array_ids']: + state[job_id]['array_ids'][array_id] -= 1 + else: + # this is a standard job. + state[job_id]['countdown'] -= 1 + + save_state(state, state_file_path) diff --git a/sequence_processing_pipeline/templates/integrate.sbatch b/sequence_processing_pipeline/templates/integrate.sbatch new file mode 100644 index 00000000..68ebce5e --- /dev/null +++ b/sequence_processing_pipeline/templates/integrate.sbatch @@ -0,0 +1,67 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{barcode_id_count}} + +#SBATCH --output {{output_dir}}/logs/integrate_%x_%A_%a.out +#SBATCH --error {{output_dir}}/logs/integrate_%x_%A_%a.err + +set -x +set -e + +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR={{tmp_dir}} + +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat {{output_dir}}/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "Processing sample ${sample}..." + +# make temp directory +export TMPDIR={{tmp_dir}} +mkdir -p $TMPDIR + + +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. 
+ +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. +# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in={{raw_fastq_dir}}/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in={{raw_fastq_dir}}/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in={{raw_fastq_dir}}/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq + +# create output directory +mkdir -p {{output_dir}}/integrated + +# generate output file names +r1_out={{output_dir}}/integrated/${sample}.R1.fastq.gz +r2_out={{output_dir}}/integrated/${sample}.R2.fastq.gz +i1_out={{output_dir}}/integrated/${sample}.I1.fastq.gz + +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} + +# generate integrated R1 and R2 fastq.gz files. +conda activate qp-knight-lab-processing-2022.03 + +python {{integrate_script_path}} integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads {{cores_per_task}} diff --git a/sequence_processing_pipeline/templates/seq_counts.sbatch b/sequence_processing_pipeline/templates/seq_counts.sbatch new file mode 100644 index 00000000..f44bd5b9 --- /dev/null +++ b/sequence_processing_pipeline/templates/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH --time {{wall_time_limit}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH -p {{queue_name}} +#SBATCH --array=1-{{file_count}} + +#SBATCH --output {{output_path}}/logs/%x_%A_%a.out +#SBATCH --error {{output_path}}/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p {{output_path}}/logs + +files=($(cat {{output_path}}/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/templates/tellread-cleanup.sbatch b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch new file mode 100644 index 00000000..3c31219d --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread-cleanup.sbatch @@ -0,0 +1,13 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} # cleanup +#SBATCH --time {{wall_time_limit}} # 24:00:00 +#SBATCH --mem {{mem_in_gb}}G # 8G +#SBATCH -N {{node_count}} # 1 +#SBATCH -c {{cores_per_task}} # 1 +#SBATCH -p {{queue_name}} # qiita + +#SBATCH --output {{output}}/logs/cleanup_%x-%A.out +#SBATCH --error {{output}}/logs/cleanup_%x-%A.err + +# remove unused large outputs +rm -rf {{OUTPUT}}/biosample_format {{OUTPUT}}/1_demult {{OUTPUT}}/Full diff --git a/sequence_processing_pipeline/templates/tellread.sbatch b/sequence_processing_pipeline/templates/tellread.sbatch new file mode 100644 index 00000000..f038a568 --- /dev/null +++ b/sequence_processing_pipeline/templates/tellread.sbatch @@ -0,0 +1,48 @@ +#!/bin/bash -l +#SBATCH -J {{job_name}} +#SBATCH -p {{queue_name}} +#SBATCH -N {{node_count}} +#SBATCH -c {{cores_per_task}} +#SBATCH --mem {{mem_in_gb}}G +#SBATCH --time {{wall_time_limit}} + +#SBATCH --output {{output}}/logs/tellread_%x-%A.out +#SBATCH --error {{output}}/logs/tellread_%x-%A.err + +set -x + +module load {{modules_to_load}} +{{sing_script_path}} \ + -i {{rundir_path}} \ + -o {{output}} \ + -s $(echo {{samples}} | tr -d '"') \ + -g $(echo {{refs}} | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} {{extra}} \ + -l {{lane}} + +# get the 
timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find {{output}}/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 + diff --git a/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv new file mode 100644 index 00000000..f696f0c9 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/20230906_FS10001773_68_BTR67708-1611.csv @@ -0,0 +1,41 @@ +Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Barcode_96_Well_Position,Barcode_ID,Sample_Project,Well_description,Lane +Person.A.TELLSEQ.R20.microbe,Person.A.TELLSEQ.R20.microbe,TellSeq3_15196_P3,A1,A4,C525,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.microbe,1 +Person.B.TELLSEQ.R24.microbe,Person.B.TELLSEQ.R24.microbe,TellSeq3_15196_P3,B1,B4,C526,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.microbe,1 +Person.C.TELLSEQ.R21.microbe,Person.C.TELLSEQ.R21.microbe,TellSeq3_15196_P3,C1,C4,C527,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.microbe,1 +Person.D.TELLSEQ.R26.microbe,Person.D.TELLSEQ.R26.microbe,TellSeq3_15196_P3,D1,D4,C528,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.microbe,1 +Person.E.TELLSEQ.R19.microbe,Person.E.TELLSEQ.R19.microbe,TellSeq3_15196_P3,E1,E4,C529,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.microbe,1 +Pet.C.TELLSEQ.R23.microbe,Pet.C.TELLSEQ.R23.microbe,TellSeq3_15196_P3,F1,F4,C530,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.microbe,1 +BLANK.TELLSEQ.3.12.H.microbe,BLANK.TELLSEQ.3.12.H.microbe,TellSeq3_15196_P3,G1,G4,C531,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.microbe,1 +Isolate.115.R1.microbe,Isolate.115.R1.microbe,TellSeq3_15196_P1,H1,H4,C532,TellSeq3_15196_P3,Isolate.115.R1.microbe,1 +Zymo.Mock.Community.R1.microbe,Zymo.Mock.Community.R1.microbe,TellSeq3_15196_P1,A2,A5,C533,TellSeq3_15196_P3,Zymo.Mock.Community.R1.microbe,1 +E.coli.QC.DNA.R1.microbe,E.coli.QC.DNA.R1.microbe,TellSeq3_15196_P1,B2,B5,C534,TellSeq3_15196_P3,E.coli.QC.DNA.R1.microbe,1 +Person.A.TELLSEQ.R20.purified.microbe,Person.A.TELLSEQ.R20.purified.microbe,TellSeq3_15196_P3,C2,C5,C535,TellSeq3_15196_P3,Person.A.TELLSEQ.R20.purified.microbe,1 +Person.B.TELLSEQ.R24.purified.microbe,Person.B.TELLSEQ.R24.purified.microbe,TellSeq3_15196_P3,D2,D5,C536,TellSeq3_15196_P3,Person.B.TELLSEQ.R24.purified.microbe,1 +Person.C.TELLSEQ.R21.purified.microbe,Person.C.TELLSEQ.R21.purified.microbe,TellSeq3_15196_P3,E2,E5,C537,TellSeq3_15196_P3,Person.C.TELLSEQ.R21.purified.microbe,1 +Person.D.TELLSEQ.R26.purified.microbe,Person.D.TELLSEQ.R26.purified.microbe,TellSeq3_15196_P3,F2,F5,C538,TellSeq3_15196_P3,Person.D.TELLSEQ.R26.purified.microbe,1 +Person.E.TELLSEQ.R19.purified.microbe,Person.E.TELLSEQ.R19.purified.microbe,TellSeq3_15196_P3,G2,G5,C539,TellSeq3_15196_P3,Person.E.TELLSEQ.R19.purified.microbe,1 +Pet.C.TELLSEQ.R23.purified.microbe,Pet.C.TELLSEQ.R23.purified.microbe,TellSeq3_15196_P3,H2,H5,C540,TellSeq3_15196_P3,Pet.C.TELLSEQ.R23.purified.microbe,1 
+BLANK.TELLSEQ.3.12.H.purified.microbe,BLANK.TELLSEQ.3.12.H.purified.microbe,TellSeq3_15196_P3,A3,A6,C541,TellSeq3_15196_P3,BLANK.TELLSEQ.3.12.H.purified.microbe,1 +Isolate.115.R2.microbe,Isolate.115.R2.microbe,TellSeq3_15196_P1,B3,B6,C542,TellSeq3_15196_P3,Isolate.115.R2.microbe,1 +Zymo.Mock.Community.R2.microbe,Zymo.Mock.Community.R2.microbe,TellSeq3_15196_P1,C3,C6,C543,TellSeq3_15196_P3,Zymo.Mock.Community.R2.microbe,1 +E.coli.QC.DNA.R2.microbe,E.coli.QC.DNA.R2.microbe,TellSeq3_15196_P1,D3,D6,C544,TellSeq3_15196_P3,E.coli.QC.DNA.R2.microbe,1 +Person.A.TELLSEQ.R20.std,Person.A.TELLSEQ.R20.std,TellSeq3_15196_P3,A1,A1,C501,TellSeq3_15196,Person.A.TELLSEQ.R20.std,1 +Person.B.TELLSEQ.R24.std,Person.B.TELLSEQ.R24.std,TellSeq3_15196_P3,B1,B1,C502,TellSeq3_15196,Person.B.TELLSEQ.R24.std,1 +Person.C.TELLSEQ.R21.std,Person.C.TELLSEQ.R21.std,TellSeq3_15196_P3,C1,C1,C503,TellSeq3_15196,Person.C.TELLSEQ.R21.std,1 +Person.D.TELLSEQ.R26.std,Person.D.TELLSEQ.R26.std,TellSeq3_15196_P3,D1,D1,C504,TellSeq3_15196,Person.D.TELLSEQ.R26.std,1 +Person.E.TELLSEQ.R19.std,Person.E.TELLSEQ.R19.std,TellSeq3_15196_P3,E1,E1,C505,TellSeq3_15196,Person.E.TELLSEQ.R19.std,1 +Pet.C.TELLSEQ.R23.std,Pet.C.TELLSEQ.R23.std,TellSeq3_15196_P3,F1,F1,C506,TellSeq3_15196,Pet.C.TELLSEQ.R23.std,1 +BLANK.TELLSEQ.3.12.H.std,BLANK.TELLSEQ.3.12.H.std,TellSeq3_15196_P3,G1,G1,C507,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.std,1 +Isolate.115.R1.std,Isolate.115.R1.std,TellSeq3_15196_P1,H1,H1,C508,TellSeq3_15196,Isolate.115.R1.std,1 +Zymo.Mock.Community.R1.std,Zymo.Mock.Community.R1.std,TellSeq3_15196_P1,A2,A2,C509,TellSeq3_15196,Zymo.Mock.Community.R1.std,1 +E.coli.QC.DNA.R1.std,E.coli.QC.DNA.R1.std,TellSeq3_15196_P1,B2,B2,C510,TellSeq3_15196,E.coli.QC.DNA.R1.std,1 +Person.A.TELLSEQ.R20.purified.std,Person.A.TELLSEQ.R20.purified.std,TellSeq3_15196_P3,C2,C2,C511,TellSeq3_15196,Person.A.TELLSEQ.R20.purified.std,1 +Person.B.TELLSEQ.R24.purified.std,Person.B.TELLSEQ.R24.purified.std,TellSeq3_15196_P3,D2,D2,C512,TellSeq3_15196,Person.B.TELLSEQ.R24.purified.std,1 +Person.C.TELLSEQ.R21.purified.std,Person.C.TELLSEQ.R21.purified.std,TellSeq3_15196_P3,E2,E2,C513,TellSeq3_15196,Person.C.TELLSEQ.R21.purified.std,1 +Person.D.TELLSEQ.R26.purified.std,Person.D.TELLSEQ.R26.purified.std,TellSeq3_15196_P3,F2,F2,C514,TellSeq3_15196,Person.D.TELLSEQ.R26.purified.std,1 +Person.E.TELLSEQ.R19.purified.std,Person.E.TELLSEQ.R19.purified.std,TellSeq3_15196_P3,G2,G2,C515,TellSeq3_15196,Person.E.TELLSEQ.R19.purified.std,1 +Pet.C.TELLSEQ.R23.purified.std,Pet.C.TELLSEQ.R23.purified.std,TellSeq3_15196_P3,H2,H2,C516,TellSeq3_15196,Pet.C.TELLSEQ.R23.purified.std,1 +BLANK.TELLSEQ.3.12.H.purified.std,BLANK.TELLSEQ.3.12.H.purified.std,TellSeq3_15196_P3,A3,A3,C517,TellSeq3_15196,BLANK.TELLSEQ.3.12.H.purified.std,1 +Isolate.115.R2.std,Isolate.115.R2.std,TellSeq3_15196_P1,B3,B3,C518,TellSeq3_15196,Isolate.115.R2.std,1 +Zymo.Mock.Community.R2.std,Zymo.Mock.Community.R2.std,TellSeq3_15196_P1,C3,C3,C519,TellSeq3_15196,Zymo.Mock.Community.R2.std,1 +E.coli.QC.DNA.R2.std,E.coli.QC.DNA.R2.std,TellSeq3_15196_P1,D3,D3,C520,TellSeq3_15196,E.coli.QC.DNA.R2.std,1 diff --git a/sequence_processing_pipeline/tests/data/aggregate_counts_results.json b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json new file mode 100644 index 00000000..1cae0f05 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/aggregate_counts_results.json @@ -0,0 +1,36 @@ +{ + "REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full": { + "TellReadJob_I1_C520.fastq.gz.erroneous.fastq": { + 
"seq_counts": 2139633, + "base_pairs": 38513394 + }, + "TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8345327641 + }, + "TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9293296513 + }, + "TellReadJob_I1_C519.fastq.gz.erroneous.fastq": { + "seq_counts": 1932116, + "base_pairs": 34778088 + }, + "TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 1160354916 + }, + "TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 64464162, + "base_pairs": 8370238082 + }, + "TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 9317943166 + }, + "TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq": { + "seq_counts": 70399028, + "base_pairs": 1267182504 + } + } +} diff --git a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json index 089e82f1..c82c76b0 100644 --- a/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json +++ b/sequence_processing_pipeline/tests/data/configuration_profiles/iseq_metagenomic.json @@ -3,6 +3,25 @@ "instrument_type": "iseq", "assay_type": "Metagenomic", "configuration": { + "tell-seq": { + "label": "my_label", + "reference_base": "", + "reference_map": "", + "sing_script_path": "/my_path/tellread-release-novaseqX/run_tellread_sing.sh", + "nodes": 1, + "lane": 1, + "sample_index_list": "/my_path/sample_index_list_1.txt", + "queue": "qiita", + "wallclock_time_in_minutes": 1440, + "modules_to_load": ["singularity_3.6.4"], + "integrate_script_path": "/my_path/integrate-indices-np.py", + "tellread_mem_limit": "16", + "tellread_cores": "4", + "normcount_cores": "1", + "integrate_cores": "1", + "normcount_mem_limit": "8", + "integrate_mem_limit": "8" + }, "bcl2fastq": { "nodes": 1, "nprocs": 16, diff --git a/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt new file mode 100644 index 00000000..1c4345ef --- /dev/null +++ b/sequence_processing_pipeline/tests/data/fake_sample_index_list.txt @@ -0,0 +1,96 @@ +CCCCCACCAA C501 NONE PE +AACCCCCACA C502 NONE PE +CCAACACACC C503 NONE PE +AACACCCCCA C504 NONE PE +CAAAACCCCC C505 NONE PE +ACACACCACC C506 NONE PE +AACCCACACC C507 NONE PE +CAAAAAAAAA C508 NONE PE +AAACCACCCC C509 NONE PE +ACACCCCCCC C510 NONE PE +AAACACCACA C511 NONE PE +CAAAACCCCA C512 NONE PE +ACCCCAACCC C513 NONE PE +CAACCAACAC C514 NONE PE +CCCCCACCCA C515 NONE PE +CCAACCCCCA C516 NONE PE +CAAAACACCC C517 NONE PE +ACACACCAAA C518 NONE PE +CCACCAAAAA C519 NONE PE +AAACCCCCCC C520 NONE PE +AACAACCCCA C521 NONE PE +CAACAACAAC C522 NONE PE +CACCACCAAA C523 NONE PE +CACAACAAAC C524 NONE PE +AACACCCACC C525 NONE PE +CAAAACACAA C526 NONE PE +AAAAAAAAAA C527 NONE PE +CCAACCCCCA C528 NONE PE +CAACCCCAAA C529 NONE PE +ACCCAACCCA C530 NONE PE +CACACCAAAC C531 NONE PE +CAACAAAAAC C532 NONE PE +CCAAAAAAAC C533 NONE PE +ACCAACACAC C534 NONE PE +CCAAAACACC C535 NONE PE +CACCCAAACC C536 NONE PE +CAAACCAAAC C537 NONE PE +CACAAACACA C538 NONE PE +ACCCACCCCC C539 NONE PE +AACCACACAC C540 NONE PE +CACCCCCACA C541 NONE PE +CACAACCACC C542 NONE PE +AAAAACAAAA C543 NONE PE +CCACACAAAC C544 NONE PE +AAACAAACAC C545 NONE PE +ACCCCCACCC C546 NONE PE +ACACCCCAAA C547 NONE PE +CAAACCAAAC 
C548 NONE PE +AAACCACCAA C549 NONE PE +CAACCAAAAC C550 NONE PE +ACACCCCCCC C551 NONE PE +CACCCACCAC C552 NONE PE +ACCCCAACCC C553 NONE PE +AAACCCAACA C554 NONE PE +ACCACACAAA C555 NONE PE +ACCCACCACC C556 NONE PE +CCCCAACCAA C557 NONE PE +CAAAAACACC C558 NONE PE +ACCACAAAAC C559 NONE PE +ACCCCCCCAA C560 NONE PE +CCACAAAACA C561 NONE PE +CAAACCCACC C562 NONE PE +ACACCACAAA C563 NONE PE +ACCAACAAAA C564 NONE PE +CCCAACAAAA C565 NONE PE +CACCCCCCCA C566 NONE PE +AAACCCACCA C567 NONE PE +CACACCACAA C568 NONE PE +CCAAACCCCA C569 NONE PE +CACCCCACCC C570 NONE PE +AAACCCCCAA C571 NONE PE +ACACCACACC C572 NONE PE +ACAAAACACC C573 NONE PE +CACAAACCAC C574 NONE PE +ACCCCCACAA C575 NONE PE +CCCCAAACCC C576 NONE PE +AAAACACAAC C577 NONE PE +AACCCAACCA C578 NONE PE +AAACACACAC C579 NONE PE +AAACACCACC C580 NONE PE +AACCCCAACA C581 NONE PE +CCACCCAAAC C582 NONE PE +CCAAAACAAC C583 NONE PE +ACCAACAAAC C584 NONE PE +AAACACCACC C585 NONE PE +AACCACACAC C586 NONE PE +CACAACAAAA C587 NONE PE +AACCAAAAAC C588 NONE PE +ACCAAACCAA C589 NONE PE +ACAAACACAC C590 NONE PE +ACCACACCAA C591 NONE PE +AAAAACAACC C592 NONE PE +CACACAACAC C593 NONE PE +CCCCCAACCC C594 NONE PE +ACACAAAACC C595 NONE PE +CCACCACACC C596 NONE PE diff --git a/sequence_processing_pipeline/tests/data/files_to_count.txt b/sequence_processing_pipeline/tests/data/files_to_count.txt new file mode 100644 index 00000000..8d7ce4b1 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/files_to_count.txt @@ -0,0 +1,8 @@ +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +/ddn_scratch/qiita_t/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts.sbatch b/sequence_processing_pipeline/tests/data/seq_counts.sbatch new file mode 100644 index 00000000..cc73187c --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts.sbatch @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J seq_counts +#SBATCH --time 1440 +#SBATCH --mem 8G +#SBATCH -N 1 +#SBATCH -c 1 +#SBATCH -p qiita +#SBATCH --array=1-8 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs/%x_%A_%a.err + +set -x +set -e + +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/logs + +files=($(cat 
sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/SeqCountsJob/files_to_count.txt)) +my_file=${files[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "${my_file}" + +conda activate qp-knight-lab-processing-2022.03 + +seqtk size ${my_file} diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out new file mode 100644 index 00000000..50a46674 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_1.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8345327641 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out new file mode 100644 index 00000000..87ad9f55 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_2.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9293296513 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out new file mode 100644 index 00000000..a22d9f8d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_3.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.erroneous.fastq +1932116 34778088 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out new file mode 100644 index 00000000..0b35614a --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_4.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 9317943166 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out new file mode 100644 index 00000000..887522ae --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_5.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.corrected.err_barcode_removed.fastq +70399028 1267182504 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out new file mode 100644 index 00000000..a4fbd555 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_6.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R2_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 8370238082 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err new file mode 100644 index 00000000..47c59651 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. ++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C519.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out new file mode 100644 index 00000000..6c6a9c06 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_7.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C519.fastq.gz.corrected.err_barcode_removed.fastq +64464162 1160354916 diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err new file mode 100644 index 00000000..e9c0cf9d --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.err @@ -0,0 +1,3 @@ +This is an example .err file produced by seq_counts.sbatch. +Additional details removed. 
++ seqtk size REMOVED/working_dir/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_R1_C520.fastq.gz.corrected.err_barcode_removed.fastq diff --git a/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out new file mode 100644 index 00000000..9be52329 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/seq_counts_logs/seq_count_2679966_8.out @@ -0,0 +1,2 @@ +REMOVED/8edbdee2-da52-4278-af40-267185bbcd7e/TellReadJob/Full/TellReadJob_I1_C520.fastq.gz.erroneous.fastq +2139633 38513394 diff --git a/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv new file mode 100644 index 00000000..105330fd --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_metag_dummy_sample_sheet.csv @@ -0,0 +1,135 @@ +[Header],,,,,,,, +IEMFileVersion,1,,,,,,, +SheetType,tellseq_metag,,,,,,, +SheetVersion,10,,,,,,, +Investigator Name,Knight,,,,,,, +Experiment Name,RKL0151,,,,,,, +Date,5/6/24,,,,,,, +Workflow,GenerateFASTQ,,,,,,, +Application,FASTQ Only,,,,,,, +Assay,Metagenomic,,,,,,, +Description,,,,,,,, +Chemistry,Default,,,,,,, +,,,,,,,, +[Reads],,,,,,,, +151,,,,,,,, +151,,,,,,,, +,,,,,,,, +[Settings],,,,,,,, +ReverseComplement,0,,,,,,, +,,,,,,,, +[Data],,,,,,,, +Sample_ID,Sample_Name,Sample_Plate,well_id_384,barcode_id,Sample_Project,Well_description,Lane, +LS_8_10_2013_SRE,LS.8.10.2013.SRE,LS_Donor_SS_Samples_P1,A1,C501,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013.SRE,1, +LS_12_17_2014_SRE,LS.12.17.2014.SRE,LS_Donor_SS_Samples_P1,B1,C509,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.17.2014.SRE,1, +LS_4_4_2015_SRE,LS.4.4.2015.SRE,LS_Donor_SS_Samples_P1,C1,C502,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.4.2015.SRE,1, +LS_2_23_2015_SRE,LS.2.23.2015.SRE,LS_Donor_SS_Samples_P1,D1,C510,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.23.2015.SRE,1, +LS_9_28_2014_SRE,LS.9.28.2014.SRE,LS_Donor_SS_Samples_P1,E1,C503,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.28.2014.SRE,1, +LS_12_14_2013_SRE,LS.12.14.2013.SRE,LS_Donor_SS_Samples_P1,F1,C511,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.14.2013.SRE,1, +LS_4_7_2013_SRE,LS.4.7.2013.SRE,LS_Donor_SS_Samples_P1,G1,C504,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.7.2013.SRE,1, +LS_7_14_2013_SRE,LS.7.14.2013.SRE,LS_Donor_SS_Samples_P1,H1,C512,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.14.2013.SRE,1, +LS_10_27_2013_SRE,LS.10.27.2013.SRE,LS_Donor_SS_Samples_P1,I1,C505,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.10.27.2013.SRE,1, +LS_1_19_2014_SRE,LS.1.19.2014.SRE,LS_Donor_SS_Samples_P1,J1,C513,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014.SRE,1, +LS_9_3_2013_SRE,LS.9.3.2013.SRE,LS_Donor_SS_Samples_P1,K1,C506,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.9.3.2013.SRE,1, +LS_2_25_2013_SRE,LS.2.25.2013.SRE,LS_Donor_SS_Samples_P1,L1,C514,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.25.2013.SRE,1, +LS_7_26_2015_SRE,LS.7.26.2015.SRE,LS_Donor_SS_Samples_P1,M1,C507,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.26.2015.SRE,1, +LS_2_17_2014_SRE,LS.2.17.2014.SRE,LS_Donor_SS_Samples_P1,N1,C515,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.17.2014.SRE,1, +LS_6_29_2015_SRE,LS.6.29.2015.SRE,LS_Donor_SS_Samples_P1,O1,C508,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015.SRE,1, 
+LS_3_24_2015_SRE,LS.3.24.2015.SRE,LS_Donor_SS_Samples_P1,P1,C516,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015.SRE,1, +LS_1_6_2015_SRE,LS.1.6.2015.SRE,LS_Donor_SS_Samples_P1,A2,C517,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.6.2015.SRE,1, +T_LS_7_15_15B_SRE,T.LS.7.15.15B.SRE,LS_Donor_SS_Samples_P1,B2,C525,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B.SRE,1, +LS_6_9_2013_SRE,LS.6.9.2013.SRE,LS_Donor_SS_Samples_P1,C2,C518,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.9.2013.SRE,1, +Person A_SRE,Person A.SRE,LS_Donor_SS_Samples_P1,D2,C526,Tellseq_Shortread_Metagenomic_Analysis_10283,Person A.SRE,1, +LS_8_22_2014_R2_SRE,LS.8.22.2014.R2.SRE,LS_Donor_SS_Samples_P1,E2,C519,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R2.SRE,1, +Person B_SRE,Person B.SRE,LS_Donor_SS_Samples_P1,F2,C527,Tellseq_Shortread_Metagenomic_Analysis_10283,Person B.SRE,1, +LS_8_22_2014_R1_SRE,LS.8.22.2014.R1.SRE,LS_Donor_SS_Samples_P1,G2,C520,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.22.2014.R1.SRE,1, +Person C_SRE,Person C.SRE,LS_Donor_SS_Samples_P1,H2,C528,Tellseq_Shortread_Metagenomic_Analysis_10283,Person C.SRE,1, +LS_12_28_2011_SRE,LS.12.28.2011.SRE,LS_Donor_SS_Samples_P1,I2,C521,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.12.28.2011.SRE,1, +Person D_SRE,Person D.SRE,LS_Donor_SS_Samples_P1,J2,C529,Tellseq_Shortread_Metagenomic_Analysis_10283,Person D.SRE,1, +LS_5_4_2014_SRE,LS.5.4.2014.SRE,LS_Donor_SS_Samples_P1,K2,C522,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.5.4.2014.SRE,1, +45208_1_1,45208.1.1,UROBIOME_TEST_MF_SAMPLES_P2,L2,C530,Tellseq_Shortread_Metagenomic_Analysis_10283,45208.1.1,1, +LS_11_6_2012_SRE,LS.11.6.2012.SRE,LS_Donor_SS_Samples_P1,M2,C523,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.6.2012.SRE,1, +45248_2_2,45248.2.2,UROBIOME_TEST_MF_SAMPLES_P2,N2,C531,Tellseq_Shortread_Metagenomic_Analysis_10283,45248.2.2,1, +LS_4_3_2012_SRE,LS.4.3.2012.SRE,LS_Donor_SS_Samples_P1,O2,C524,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.3.2012.SRE,1, +45261_2_1,45261.2.1,UROBIOME_TEST_MF_SAMPLES_P2,P2,C532,Tellseq_Shortread_Metagenomic_Analysis_10283,45261.2.1,1, +45272_11_2,45272.11.2,UROBIOME_TEST_MF_SAMPLES_P2,A3,C533,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.11.2,1, +T_LS_7_12_15A,T.LS.7.12.15A,Larry_Smarr_Plus_Donor_Samples_P3,B3,C541,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.12.15A,1, +45316_8_1,45316.8.1,UROBIOME_TEST_MF_SAMPLES_P2,C3,C534,Tellseq_Shortread_Metagenomic_Analysis_10283,45316.8.1,1, +T_LS_7_8_15A,T.LS.7.8.15A,Larry_Smarr_Plus_Donor_Samples_P3,D3,C542,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.8.15A,1, +45327_7_2,45327.7.2,UROBIOME_TEST_MF_SAMPLES_P2,E3,C535,Tellseq_Shortread_Metagenomic_Analysis_10283,45327.7.2,1, +LS_8_10_2013,LS.8.10.2013,LS_Time_Series_ABSQ_P4,F3,C543,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.10.2013,1, +45272_1_swab_2,45272.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,G3,C536,Tellseq_Shortread_Metagenomic_Analysis_10283,45272.1.swab.2,1, +LS_6_29_2015,LS.6.29.2015,LS_Time_Series_ABSQ_P4,H3,C544,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.29.2015,1, +45326_1_swab_2,45326.1.swab.2,UROBIOME_TEST_MF_SAMPLES_P2,I3,C537,Tellseq_Shortread_Metagenomic_Analysis_10283,45326.1.swab.2,1, +LS_3_8_2015,LS.3.8.2015,LS_Time_Series_ABSQ_P4,J3,C545,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.8.2015,1, +T_LS_7_19_15A,T.LS.7.19.15A,Larry_Smarr_Plus_Donor_Samples_P3,K3,C538,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15A,1, 
+LS_4_29_2013,LS.4.29.2013,LS_Time_Series_ABSQ_P4,L3,C546,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.4.29.2013,1, +T_LS_7_15_15B,T.LS.7.15.15B,Larry_Smarr_Plus_Donor_Samples_P3,M3,C539,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.15.15B,1, +LS_11_16_2014,LS.11.16.2014,LS_Time_Series_ABSQ_P4,N3,C547,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.16.2014,1, +T_LS_7_19_15B,T.LS.7.19.15B,Larry_Smarr_Plus_Donor_Samples_P3,O3,C540,Tellseq_Shortread_Metagenomic_Analysis_10283,T.LS.7.19.15B,1, +LS_1_19_2014,LS.1.19.2014,LS_Time_Series_ABSQ_P4,P3,C548,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.19.2014,1, +LS_3_24_2015,LS.3.24.2015,LS_Time_Series_ABSQ_P4,A4,C549,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.24.2015,1, +LS_2_8_2013,LS.2.8.2013,LS_Time_Series_ABSQ_P4,B4,C557,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.2.8.2013,1, +LS_11_10_2013,LS.11.10.2013,LS_Time_Series_ABSQ_P4,C4,C550,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.11.10.2013,1, +Marine_Sediment_0_2cm_R1,Marine.Sediment.0.2cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,D4,C558,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.0.2cm.R1,1, +LS_3_23_2014,LS.3.23.2014,LS_Time_Series_ABSQ_P4,E4,C551,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.3.23.2014,1, +Marine_Sediment_5_7cm_R1,Marine.Sediment.5.7cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,F4,C559,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.5.7cm.R1,1, +LS_1_14_2015,LS.1.14.2015,LS_Time_Series_ABSQ_P4,G4,C552,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.14.2015,1, +Marine_Sediment_10_12cm_R2,Marine.Sediment.10.12cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,H4,C560,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.10.12cm.R2,1, +LS_8_25_2014,LS.8.25.2014,LS_Time_Series_ABSQ_P4,I4,C553,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.8.25.2014,1, +Marine_Sediment_15_17cm_R1,Marine.Sediment.15.17cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,J4,C561,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.15.17cm.R1,1, +LS_1_26_2013,LS.1.26.2013,LS_Time_Series_ABSQ_P4,K4,C554,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.1.26.2013,1, +Marine_Sediment_20_22cm_R1,Marine.Sediment.20.22cm.R1,MarineSediment_Donor_LarrySmarr_NoProK_P5,L4,C562,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.20.22cm.R1,1, +LS_6_16_2014,LS.6.16.2014,LS_Time_Series_ABSQ_P4,M4,C555,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.6.16.2014,1, +Marine_Sediment_25_27cm_R2,Marine.Sediment.25.27cm.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,N4,C563,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.25.27cm.R2,1, +LS_7_27_2014,LS.7.27.2014,LS_Time_Series_ABSQ_P4,O4,C556,Tellseq_Shortread_Metagenomic_Analysis_10283,LS.7.27.2014,1, +Marine_Sediment_30_32cm_R3,Marine.Sediment.30.32cm.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,P4,C564,Tellseq_Shortread_Metagenomic_Analysis_10283,Marine.Sediment.30.32cm.R3,1, +Person_A_R3,Person.A.R3,MarineSediment_Donor_LarrySmarr_NoProK_P5,A5,C565,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.A.R3,1, +Soil_SynCom_T4_2_Tube5,Soil.SynCom.T4.2.Tube5,16_member_community_native_soil_P6,B5,C573,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.2.Tube5,1, +Person_B_R2,Person.B.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,C5,C566,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.B.R2,1, +A21,A21,Tumor_Community_P7,D5,C574,Tellseq_Shortread_Metagenomic_Analysis_10283,A21,1, 
+Person_C_R4,Person.C.R4,MarineSediment_Donor_LarrySmarr_NoProK_P5,E5,C567,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.C.R4,1, +A23,A23,Tumor_Community_P7,F5,C575,Tellseq_Shortread_Metagenomic_Analysis_10283,A23,1, +Person_D_R2,Person.D.R2,MarineSediment_Donor_LarrySmarr_NoProK_P5,G5,C568,Tellseq_Shortread_Metagenomic_Analysis_10283,Person.D.R2,1, +A27,A27,Tumor_Community_P7,H5,C576,Tellseq_Shortread_Metagenomic_Analysis_10283,A27,1, +Soil_SynCom_T1_2_Tube1,Soil.SynCom.T1.2.Tube1,16_member_community_native_soil_P6,I5,C569,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T1.2.Tube1,1, +A30,A30,Tumor_Community_P7,J5,C577,Tellseq_Shortread_Metagenomic_Analysis_10283,A30,1, +Soil _SynCom_T2_2_Tube2,Soil .SynCom.T2.2.Tube2,16_member_community_native_soil_P6,K5,C570,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil .SynCom.T2.2.Tube2,1, +A31,A31,Tumor_Community_P7,L5,C578,Tellseq_Shortread_Metagenomic_Analysis_10283,A31,1, +Soil_SynCom_T3_2_Tube3,Soil.SynCom.T3.2.Tube3,16_member_community_native_soil_P6,M5,C571,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T3.2.Tube3,1, +S1_T1_A,S1.T1.A,Tumor_Community_P7,N5,C579,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.A,1, +Soil_SynCom_T4_1_Tube4,Soil.SynCom.T4.1.Tube4,16_member_community_native_soil_P6,O5,C572,Tellseq_Shortread_Metagenomic_Analysis_10283,Soil.SynCom.T4.1.Tube4,1, +S2_T1_B_A,S2.T1.B.A,Tumor_Community_P7,P5,C580,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.B.A,1, +S2_T1_01BH1_Y_A,S2.T1.01BH1.Y.A,Tumor_Community_P7,A6,C581,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.01BH1.Y.A,1, +S1_T1_1CIM_A,S1.T1.1CIM.A,Tumor_Community_P7,B6,C589,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.1CIM.A,1, +S2_MT1_1HBI_Y_A,S2.MT1.1HBI.Y.A,Tumor_Community_P7,C6,C582,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.1HBI.Y.A,1, +S1_M1_B_1CIM_A,S1.M1.B.1CIM.A,Tumor_Community_P7,D6,C590,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.M1.B.1CIM.A,1, +S1_T1_B_LBM_A,S1.T1.B.LBM.A,Tumor_Community_P7,E6,C583,Tellseq_Shortread_Metagenomic_Analysis_10283,S1.T1.B.LBM.A,1, +BLANK_K15_cancer_patient,BLANK.K15.cancer.patient,Tumor_Community_P7,F6,C591,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.K15.cancer.patient,1, +S2_MT1_LBM_A,S2.MT1.LBM.A,Tumor_Community_P7,G6,C584,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.MT1.LBM.A,1, +BLANK_M15_cancer_patient,BLANK.M15.cancer.patient,Tumor_Community_P7,H6,C592,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.M15.cancer.patient,1, +S2_T1_A,S2.T1.A,Tumor_Community_P7,I6,C585,Tellseq_Shortread_Metagenomic_Analysis_10283,S2.T1.A,1, +BLANK_O15_cancer_patient,BLANK.O15.cancer.patient,Tumor_Community_P7,J6,C593,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.O15.cancer.patient,1, +1CIM_M_CNTL_A,1CIM.M.CNTL.A,Tumor_Community_P7,K6,C586,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.M.CNTL.A,1, +BLANK_A17_cancer_patient,BLANK.A17.cancer.patient,Tumor_Community_P7,L6,C594,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.A17.cancer.patient,1, +1CIM_G_CNTL_A,1CIM.G.CNTL.A,Tumor_Community_P7,M6,C587,Tellseq_Shortread_Metagenomic_Analysis_10283,1CIM.G.CNTL.A,1, +BLANK_C17_cancer_patient,BLANK.C17.cancer.patient,Tumor_Community_P7,N6,C595,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.C17.cancer.patient,1, +GC_1HCOM_A,GC.1HCOM.A,Tumor_Community_P7,O6,C588,Tellseq_Shortread_Metagenomic_Analysis_10283,GC.1HCOM.A,1, +BLANK_E17_cancer_patient,BLANK.E17.cancer.patient,Tumor_Community_P7,P6,C596,Tellseq_Shortread_Metagenomic_Analysis_10283,BLANK.E17.cancer.patient,1, +,,,,,,,, 
+[Bioinformatics],,,,,,,, +Sample_Project,QiitaID,BarcodesAreRC,ForwardAdapter,ReverseAdapter,HumanFiltering,library_construction_protocol,experiment_design_description,contains_replicates +Tellseq_Shortread_Metagenomic_Analysis_10283,10283,TRUE,GATCGGAAGAGCACACGTCTGAACTCCAGTCAC,GATCGGAAGAGCGTCGTGTAGGGAAAGGAGTGT,TRUE,tellseq,tellseq metagenomics,FALSE +,,,,,,,, +[Contact],,,,,,,, +Sample_Project,Email,,,,,,, +Tellseq_Shortread_Metagenomic_Analysis_10283,cbrenchy@gmail.com,,,,,,, +,,,,,,,, +[SampleContext],,,,,,,, +sample_name,sample_type,primary_qiita_study,secondary_qiita_studies,,,,, +BLANK.K15.cancer.patient,control blank,10283,,,,,, +BLANK.M15.cancer.patient,control blank,10283,,,,,, +BLANK.O15.cancer.patient,control blank,10283,,,,,, +BLANK.A17.cancer.patient,control blank,10283,,,,,, +BLANK.C17.cancer.patient,control blank,10283,,,,,, +BLANK.E17.cancer.patient,control blank,10283,,,,,, \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch new file mode 100644 index 00000000..f7a53198 --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_output/integrate_test.sbatch @@ -0,0 +1,67 @@ +#!/bin/bash -l +#SBATCH -J integrate +#SBATCH --time 96:00:00 +#SBATCH --mem 16G +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH -p qiita +#SBATCH --array=1-96 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/logs/integrate_%x_%A_%a.err + +set -x +set -e + +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp + +# get list of samples and determine which sample this array instance will work +# on. +samples=($(cat sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/sample_index_list.txt | cut -f 2)) +sample=${samples[$((${SLURM_ARRAY_TASK_ID} - 1))]} + +echo "Processing sample ${sample}..." + +# make temp directory +export TMPDIR=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/tmp +mkdir -p $TMPDIR + + +# TODO: All three input files must be non-zero in length. +# If possible, do this check as part of normal FSR operation. +# Previously this was done right here BEFORE integrating, rather +# than after. + +# NB: non-zero file-length check removed for now. This should be performed +# by FSR after processing is done. 
+# TODO: Make sure raw_fastq_dir is TellReadJob/Full +r1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +r2_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_R2_${sample}.fastq.gz.corrected.err_barcode_removed.fastq +i1_in=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full/TellReadJob_I1_${sample}.fastq.gz.corrected.err_barcode_removed.fastq + +# create output directory +mkdir -p sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated + +# generate output file names +r1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R1.fastq.gz +r2_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.R2.fastq.gz +i1_out=sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TRIntegrateJob/integrated/${sample}.I1.fastq.gz + +# generate 'integrated' I1 fastq.gz file. We do this as part of each array so +# they're done in parallel. +gzip -c ${i1_in} > ${i1_out} + +# generate integrated R1 and R2 fastq.gz files. +conda activate qp-knight-lab-processing-2022.03 + +python sequence_processing_pipeline/contrib/integrate-indices-np.py integrate \ +--no-sort \ +--r1-in ${r1_in} \ +--r2-in ${r2_in} \ +--i1-in ${i1_in} \ +--r1-out ${r1_out} \ +--r2-out ${r2_out} \ +--threads 4 \ No newline at end of file diff --git a/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch new file mode 100644 index 00000000..9dc3ccff --- /dev/null +++ b/sequence_processing_pipeline/tests/data/tellseq_output/tellread_test.sbatch @@ -0,0 +1,47 @@ +#!/bin/bash -l +#SBATCH -J tellread +#SBATCH -p qiita +#SBATCH -N 1 +#SBATCH -c 4 +#SBATCH --mem 16G +#SBATCH --time 96:00:00 + +#SBATCH --output sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.out +#SBATCH --error sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/logs/tellread_%x-%A.err + +set -x + +module load singularity_3.6.4 +$HOME/qiita-spots/tellread-release-novaseqX/run_tellread_sing.sh \ + -i sequence_processing_pipeline/tests/data/sample_run_directories/150629_SN1001_0511_AH5L7GBCXX \ + -o sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob \ + -s $(echo C501,C509,C502,C510,C503,C511,C504,C512,C505,C513,C506,C514,C507,C515,C508,C516,C517,C525,C518,C526,C519,C527,C520,C528,C521,C529,C522,C530,C523,C531,C524,C532,C533,C541,C534,C542,C535,C543,C536,C544,C537,C545,C538,C546,C539,C547,C540,C548,C549,C557,C550,C558,C551,C559,C552,C560,C553,C561,C554,C562,C555,C563,C556,C564,C565,C573,C566,C574,C567,C575,C568,C576,C569,C577,C570,C578,C571,C579,C572,C580,C581,C589,C582,C590,C583,C591,C584,C592,C585,C593,C586,C594,C587,C595,C588,C596 | tr -d '"') \ + -g $(echo 
NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE | tr -d '"') \ + -j ${SLURM_JOB_CPUS_PER_NODE} \ + -l s_1 + +# get the timestamp for the most recently changed file in directory '.' + +# hard-limit for wait time set to ~ 8 hours. +# (4 checks per hour, for 8 hours equals 32 iterations) +for i in $(seq 1 32); +do + before="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + # assume TellReadJob is finished if ctime hasn't changed in 15 minutes + # for any fastq file in the directory. + sleep 900 + after="$(find sequence_processing_pipeline/tests/2caa8226-cf69-45a3-bd40-1e90ec3d18d0/TellReadJob/Full -type f -printf '%T@\n' | sort -n | tail -1)" + + echo "$before $after" + + if [[ "$before" == "$after" ]]; then + echo "DONE" + exit 0 + else + echo "NOT DONE" + fi +done + +# if we've reached this point then we've exceeded our hard-limit for waiting. +# return w/an error. +exit 1 diff --git a/sequence_processing_pipeline/tests/test_ConvertJob.py b/sequence_processing_pipeline/tests/test_ConvertJob.py index e8879864..f394cfb8 100644 --- a/sequence_processing_pipeline/tests/test_ConvertJob.py +++ b/sequence_processing_pipeline/tests/test_ConvertJob.py @@ -955,7 +955,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) error_msg = ("This job died.\n2024-01-01T12:12:12Z thread 99999 ERROR:" diff --git a/sequence_processing_pipeline/tests/test_FastQCJob.py b/sequence_processing_pipeline/tests/test_FastQCJob.py index 28fe52cb..a2291296 100644 --- a/sequence_processing_pipeline/tests/test_FastQCJob.py +++ b/sequence_processing_pipeline/tests/test_FastQCJob.py @@ -1121,7 +1121,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. 
self.assertTrue(job._toggle_force_job_fail()) try: diff --git a/sequence_processing_pipeline/tests/test_Job.py b/sequence_processing_pipeline/tests/test_Job.py index 7aa5889a..192709b6 100644 --- a/sequence_processing_pipeline/tests/test_Job.py +++ b/sequence_processing_pipeline/tests/test_Job.py @@ -1,10 +1,10 @@ import unittest from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import PipelineError -from os.path import abspath, join, dirname -from os import makedirs +from os.path import abspath, join, dirname, split, isdir +from os import makedirs, chmod, remove from functools import partial -from shutil import rmtree +from shutil import rmtree, copyfile import re @@ -14,7 +14,10 @@ def setUp(self): def tearDown(self): for some_path in self.remove_these: - rmtree(some_path) + if isdir(some_path): + rmtree(some_path) + else: + remove(some_path) def test_system_call(self): package_root = abspath('./sequence_processing_pipeline') @@ -123,6 +126,168 @@ def test_extract_project_names_from_fastq_dir(self): obs = job.extract_project_names_from_fastq_dir(tmp) self.assertEqual(obs, ['NPH_15288']) + def test_query_slurm(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. + # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567', '1234568', '1234569', '1234570'] + jobs = job._query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. 
+ aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_query_slurm_single_job(self): + # perform test_query_slurm() but with a single job only. + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + # set up a fake job so that we can test the query_jobs() method. + # it doesn't matter what the parameters are so long as the job + # passes initialization. + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + # locate python binary path + # we have a python script called fake_squeue.py that can simulate + # repeated calls to squeue. It does this by generating a fake random + # set of array job ids for each job id passed to it and records their + # state in my_state.json. Each array job is set to change state from + # RUNNING to either COMPLETED or FAILED between three to seven squeue + # calls. The choice of which job-ids will succeed or fail, as is which + # individual array-ids will succeed or fail is random. + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + + # place the fake squeue file in a place that's known to be in the + # PATH. Make sure this file is removed after this test is complete. + # Also make sure the saved state file is removed. + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1234567'] + jobs = job._query_slurm(job_ids) + + # jobs is a dictionary of unique array_ids and/or job-ids for non- + # array jobs. The faked squeue reports anywhere between five and + # fifteen array jobs for a given job-id. After the first invocation + # all processes should be in the 'RUNNING' state. + # e.g.: "1234567_1": "RUNNING" + + for j in jobs: + self.assertEqual(jobs[j], 'RUNNING') + if '_' in j: + jid, aid = j.split('_') + else: + jid = j + aid = None + + # assert the job id component of the array-id is a valid job id. + self.assertIn(jid, job_ids) + + if aid: + # assert the array-id component of the array-id is between 0 + # and 15 as defined in the fake squeue script. + aid = int(aid) + self.assertLess(aid, 15) + self.assertGreaterEqual(aid, 0) + + def test_wait_on_job_ids(self): + package_root = abspath('./sequence_processing_pipeline') + base_path = partial(join, package_root, 'tests', 'data') + + job = Job(base_path('211021_A00000_0000_SAMPLE'), + base_path('7b9d7d9c-2cd4-4d54-94ac-40e07a713585'), + '200nnn_xnnnnn_nnnn_xxxxxxxxxx', ['ls'], 2, None) + + python_path = split(job._which('python'))[0] + squeue_path = join(python_path, 'squeue') + foo = join(package_root, 'scripts', 'fake_squeue.py') + copyfile(foo, squeue_path) + chmod(squeue_path, 0o755) + self.remove_these.append(squeue_path) + self.remove_these.append(join(package_root, 'scripts', + 'my_state.json')) + + job_ids = ['1', '2', '3', '4'] + + # to shorten the test time, set polling_interval_in_seconds to be + # lower than one minute. + Job.polling_interval_in_seconds = 10 + results = job.wait_on_job_ids(job_ids) + + # calling query_slurm one more time after wait_on_job_ids() is called + # will technically advance the counter one more, which means that this + # doesn't confirm that wait_on_job_ids() doesn't return before EVERY + # single job is either COMPLETED or FAILED. 
However it does confirm + # that wait_on_job_ids() doesn't return once the FIRST completed array + # job is either COMPLETED or FAILED while others are still RUNNING. + # This was previously an issue. + obs = job._query_slurm(job_ids) + + for array_id in obs: + state = obs[array_id] + # w/out relying on states defined in Job, simply confirm all are + # either COMPLETED or FAILED. + self.assertIn(state, ['COMPLETED', 'FAILED']) + + # since wait_on_job_ids() now returns the same data structure as + # query_slurm(), they should be equal. + self.assertDictEqual(obs, results) + if __name__ == '__main__': unittest.main() diff --git a/sequence_processing_pipeline/tests/test_NuQCJob.py b/sequence_processing_pipeline/tests/test_NuQCJob.py index a3cfdada..8737552b 100644 --- a/sequence_processing_pipeline/tests/test_NuQCJob.py +++ b/sequence_processing_pipeline/tests/test_NuQCJob.py @@ -9,7 +9,7 @@ ) from os import makedirs, remove from metapool import load_sample_sheet -import glob +from os import walk class TestNuQCJob(unittest.TestCase): @@ -996,7 +996,7 @@ def test_error_msg_from_logs(self): # an internal method to force submit_job() to raise a JobFailedError # instead of submitting the job w/sbatch and waiting for a failed - # job w/sacct. + # job w/squeue. self.assertTrue(job._toggle_force_job_fail()) try: @@ -2208,10 +2208,56 @@ def test_generate_mmi_filter_cmds_w_annotate_fastq(self): self.assertEqual(obs, exp) def test_move_trimmed(self): - # Note: this test does not make use of the output_dir that other - # tests use. + # create a NuQCJob() object, but do not call run(). + # instead we will manually create some files to test with. + double_db_paths = ["db_path/mmi_1.db", "db_path/mmi_2.db"] + job = NuQCJob( + self.fastq_root_path, + self.output_path, + self.good_sample_sheet_path, + double_db_paths, + "queue_name", + 1, + 1440, + "8", + "fastp", + "minimap2", + "samtools", + [], + self.qiita_job_id, + 1000, + "", + self.movi_path, + self.gres_value, + self.pmls_path, + ['BX'] + ) - for dummy_fp in SAMPLE_DIR: + sample_dir = [ + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP890158A02_S58_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R1_001." + "interleave.fastq.gz", + "NuQCJob/only-adapter-filtered/EP023801B04_S27_L001_R2_001." + "interleave.fastq.gz", + "NuQCJob/NPH_15288/fastp_reports_dir/html/EP890158A02_S58_L001_" + "R1_001.html", + "NuQCJob/NPH_15288/fastp_reports_dir/json/EP023801B04_S27_L001_" + "R1_001.json", + "NuQCJob/process_all_fastq_files.sh", + "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981." + "completed", + "NuQCJob/logs/slurm-1897981_1.out", + "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/CDPH-SAL_' + 'Salmonella_Typhi_MDL-150__S36_L001_R2_001.interleave.fastq.gz', + ] + + for dummy_fp in sample_dir: dummy_fp = self.path(dummy_fp) dummy_path = dirname(dummy_fp) makedirs(dummy_path, exist_ok=True) @@ -2220,38 +2266,33 @@ def test_move_trimmed(self): trimmed_only_path = self.path("NuQCJob", "only-adapter-filtered") - NuQCJob._move_trimmed_files("NPH_15288", trimmed_only_path) - - new_path = join(trimmed_only_path, "NPH_15288") - pattern = f"{new_path}/*.fastq.gz" - - exp = [ - ( - "only-adapter-filtered/NPH_15288/359180345_S58_L001_R1_001." 
- "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R2_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180338_S51_L001_R1_001." - "fastq.gz" - ), - ( - "only-adapter-filtered/NPH_15288/359180337_S27_L001_R2_001." - "fastq.gz" - ), - ] - - for trimmed_file in list(glob.glob(pattern)): - trimmed_file = trimmed_file.split("NuQCJob/")[-1] - if trimmed_file not in exp: - self.assertIn(trimmed_file, exp) + # test _move_trimmed_files() by verifying that only the interleave + # fastq files from the NYU project are moved. + job._move_trimmed_files("NYU_BMS_Melanoma_13059", trimmed_only_path) + + new_path = join(trimmed_only_path, "NYU_BMS_Melanoma_13059") + + exp = { + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R1_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP890158A02' + '_S58_L001_R2_001.interleave.fastq.gz', + 'NuQCJob/only-adapter-filtered/NYU_BMS_Melanoma_13059/EP023801B04' + '_S27_L001_R2_001.interleave.fastq.gz' + } + + obs = [] + for root, dirs, files in walk(new_path): + for some_file in files: + some_path = join(root, some_file) + some_path = some_path.replace(self.path(""), "") + obs.append(some_path) + + # confirm that only the samples in NYU_BMS_Melanoma_13059 were + # moved. + self.assertEqual(set(obs), exp) def _helper(self, regex, good_names, bad_names): for good_name in good_names: @@ -2263,27 +2304,5 @@ def _helper(self, regex, good_names, bad_names): self.assertIsNone(substr, msg=f"Regex failed on {bad_name}") -SAMPLE_DIR = [ - "NuQCJob/only-adapter-filtered/359180345_S58_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R2_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180338_S51_L001_R1_001.fastq.gz", - "NuQCJob/only-adapter-filtered/359180337_S27_L001_R2_001.fastq.gz", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180354_S22_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180338_S51_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180345_S58_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180337_S27_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/html/359180353_S17_L001_R1_001.html", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180353_S17_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180337_S27_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180345_S58_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180338_S51_L001_R1_001.json", - "NuQCJob/NPH_15288/fastp_reports_dir/json/359180354_S22_L001_R1_001.json", - "NuQCJob/process_all_fastq_files.sh", - "NuQCJob/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d.1897981.completed", - "NuQCJob/logs/slurm-1897981_1.out", - "NuQCJob/tmp/hds-a439513a-5fcc-4f29-a1e5-902ee5c1309d-1", -] - if __name__ == "__main__": unittest.main() diff --git a/sequence_processing_pipeline/tests/test_Pipeline.py b/sequence_processing_pipeline/tests/test_Pipeline.py index 53267c7d..0af31439 100644 --- a/sequence_processing_pipeline/tests/test_Pipeline.py +++ b/sequence_processing_pipeline/tests/test_Pipeline.py @@ -140,7 +140,7 @@ def test_validate_mapping_file_numeric_ids(self): with NamedTemporaryFile() as tmp: 
self._make_mapping_file(tmp.name) exp = ['1.0', '1e-3'] - pipeline = Pipeline(self.good_config_file, self.good_run_id, None, + pipeline = Pipeline(self.good_config_file, self.good_run_id, tmp.name, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -153,7 +153,7 @@ def test_validate_mapping_file_numeric_ids(self): def test_get_sample_names_from_sample_sheet(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.mp_sheet_path, None, + self.mp_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -178,7 +178,7 @@ def test_get_sample_names_from_sample_sheet(self): def test_get_orig_names_from_sheet_with_replicates(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -198,7 +198,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -210,7 +210,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -222,7 +222,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, bu" "t not readable"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) self.make_runinfo_file_readable() @@ -232,7 +232,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -249,7 +249,7 @@ def test_creation(self): " valid sample-sheet."): Pipeline(self.good_config_file, self.good_run_id, - self.bad_assay_type_path, None, + self.bad_assay_type_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -257,7 +257,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -268,7 +268,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -279,7 +279,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -290,7 +290,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -300,7 +300,7 @@ def 
test_creation(self): "not a valid json file"): Pipeline(self.good_sample_sheet_path, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -323,7 +323,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -345,7 +345,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -368,7 +368,7 @@ def test_creation(self): "bad.json'"): Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -379,7 +379,7 @@ def test_sample_sheet_validation(self): # contained w/in its 'message' member. try: Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) except PipelineError as e: @@ -389,7 +389,7 @@ def test_sample_sheet_validation(self): # test unsuccessful validation of a bad sample-sheet with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - self.bad_sample_sheet_path, None, + self.bad_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) self.assertEqual(str(e.exception), ('Sample-sheet contains errors:\n' @@ -401,7 +401,6 @@ def test_generate_sample_information_files(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, self.good_sample_sheet_path, - None, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -511,6 +510,62 @@ def test_generate_sample_information_files(self): exp = exp_last_lines[some_name] self.assertEqual(obs, exp) + def test_generate_sample_information_files_with_additional_meta(self): + # TODO: With recent changes is this needed? + # test sample-information-file generation. + pipeline = Pipeline(self.good_config_file, self.good_run_id, + self.good_sample_sheet_path, + self.output_file_path, self.qiita_id, + Pipeline.METAGENOMIC_PTYPE) + + # create a dataframe with duplicate information to pass to + # generate_sample_information_files(). Confirm that the duplicates + # are dropped. Confirm 'NOTBLANK_999A' is also filtered out. + df = pd.DataFrame(data=[('BLANK999_999A', 'NYU_BMS_Melanoma_13059'), + ('BLANK999_999A', 'NYU_BMS_Melanoma_13059'), + ('NOTBLANK_999A', 'NYU_BMS_Melanoma_13059')], + columns=['sample_name', 'project_name']) + + sif_path = pipeline.generate_sample_info_files(addl_info=df) + + # get the path for the NYU_BMS_Melanoma dataset. 
+ sif_path = [x for x in sif_path if 'NYU_BMS_Melanoma' in x][0] + + exp_first_line = ("BLANK1.1A\t2021-10-21\t193\t" + "Control\tNegative\tSterile water blank\t" + "Sterile water blank\turban biome\t" + "research facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\t" + "BLANK1.1A\t32.5\t-117.25\tcontrol blank\t" + "metagenome\t256318\tBLANK1.1A\t" + "NYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") + + exp_last_line = ("BLANK4.4H\t2021-10-21\t193\tControl\tNegative\t" + "Sterile water blank\tSterile water blank\t" + "urban biome\tresearch facility\tsterile water\t" + "misc environment\tUSA:CA:San Diego\tBLANK4.4H\t" + "32.5\t-117.25\tcontrol blank\tmetagenome\t256318\t" + "BLANK4.4H\tNYU_BMS_Melanoma\tTRUE\tUCSD\tFALSE") + + with open(sif_path, 'r') as f: + obs_lines = f.readlines() + + # confirm that each file contains the expected header. + header = obs_lines[0].strip() + self.assertEqual(header, '\t'.join(Pipeline.sif_header)) + + # confirm that the first line of each file is as expected. + obs = obs_lines[1].strip() + exp = exp_first_line + + self.assertEqual(obs, exp) + + # confirm that the last line of each file is as expected. + obs = obs_lines[-1].strip() + exp = exp_last_line + + self.assertEqual(obs, exp) + def test_get_sample_ids(self): exp_sample_ids = ['CDPH-SAL__Salmonella__Typhi__MDL-143', 'CDPH-SAL_Salmonella_Typhi_MDL-144', @@ -980,7 +1035,7 @@ def test_get_sample_ids(self): 'EP400448B04', 'EP479894B04'] # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1456,7 +1511,7 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1484,7 +1539,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1515,7 +1570,7 @@ def test_get_project_info(self): self.assertEqual(sorted(obs_project_names), sorted(exp_project_names)) pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sheet_w_replicates, None, + self.good_sheet_w_replicates, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1527,7 +1582,7 @@ def test_get_project_info(self): def test_configuration_profiles(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1556,7 +1611,7 @@ def test_configuration_profiles(self): def test_parse_project_name(self): # test sample-information-file generation. 
pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1588,7 +1643,7 @@ def test_parse_project_name(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_sample_sheet_path, None, + self.good_sample_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1605,7 +1660,7 @@ def test_identify_reserved_words(self): # create new pipeline using a/legacy (v90) metagenomic sample-sheet. pipeline = Pipeline(self.good_config_file, self.good_run_id, - self.good_legacy_sheet_path, None, + self.good_legacy_sheet_path, self.output_file_path, self.qiita_id, Pipeline.METAGENOMIC_PTYPE) @@ -1698,7 +1753,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RunInfo.xml" "' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1710,7 +1765,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "required file 'RTAComplete" ".txt' is not present."): Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1721,7 +1776,7 @@ def test_required_file_checks(self): with self.assertRaisesRegex(PipelineError, "RunInfo.xml is present, " "but not readable"): - Pipeline(self.good_config_file, self.good_run_id, None, + Pipeline(self.good_config_file, self.good_run_id, self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.make_runinfo_file_readable() @@ -1731,7 +1786,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.bad_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1746,7 +1801,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.invalid_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1757,7 +1812,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(None, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1768,7 +1823,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.invalid_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1779,7 +1834,7 @@ def test_creation(self): with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, None, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -1787,7 +1842,7 @@ def test_mapping_file_validation(self): # test successful validation of a good mapping-file. 
try: Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) except PipelineError as e: @@ -1797,7 +1852,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_missing_column, + self.mf_missing_column, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ('Mapping-file is missing ' @@ -1807,7 +1862,7 @@ def test_mapping_file_validation(self): # test unsuccessful validation of a bad mapping-file. with self.assertRaises(PipelineError) as e: Pipeline(self.good_config_file, self.good_run_id, - None, self.mf_duplicate_sample, + self.mf_duplicate_sample, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) self.assertEqual(str(e.exception), ("Mapping-file contains duplicate " @@ -1834,7 +1889,6 @@ def test_is_sample_sheet(self): def test_generate_sample_information_files(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2056,7 +2110,6 @@ def test_get_sample_ids(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2184,7 +2237,6 @@ def test_get_sample_names(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2208,7 +2260,7 @@ def test_get_project_info(self): # test sample-information-file generation. pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, + self.good_mapping_file_path, self.output_file_path, self.qiita_id, Pipeline.AMPLICON_PTYPE) @@ -2225,33 +2277,12 @@ def test_get_project_info(self): self.assertDictEqual(obs_d, exp_d) break - def test_additional_constuctor_check(self): - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - None, None, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - - with self.assertRaisesRegex(PipelineError, ("sample_sheet_path or " - "mapping_file_path must " - "be defined, but not " - "both.")): - Pipeline(self.good_config_file, self.good_run_id, - self.sample_sheet_path, - self.good_mapping_file_path, - self.output_file_path, - self.qiita_id, Pipeline.AMPLICON_PTYPE) - def test_dummy_sheet_generation(self): # generate a RunInfo.xml file w/only one indexed read. 
self.create_runinfo_file(four_reads=False) _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2270,7 +2301,6 @@ def test_dummy_sheet_generation(self): _ = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2290,7 +2320,6 @@ def test_dummy_sheet_generation(self): def test_process_run_info_file(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, @@ -2330,7 +2359,6 @@ def test_process_run_info_file(self): def test_identify_reserved_words(self): pipeline = Pipeline(self.good_config_file, self.good_run_id, - None, self.good_mapping_file_path, self.output_file_path, self.qiita_id, diff --git a/sequence_processing_pipeline/tests/test_SeqCountsJob.py b/sequence_processing_pipeline/tests/test_SeqCountsJob.py new file mode 100644 index 00000000..d641c3b2 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_SeqCountsJob.py @@ -0,0 +1,74 @@ +from os.path import join +from sequence_processing_pipeline.SeqCountsJob import SeqCountsJob +from functools import partial +import unittest +from json import load as json_load + + +class TestSeqCountsJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.files_to_count_path = self.path("data", "files_to_count.txt") + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "1440" + self.jmem = "8" + self.modules_to_load = [] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.cores_per_task = "1" + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + self.max_array_length = 100 + self.exp_sbatch_output = self.path("data", "seq_counts.sbatch") + self.exp_results = self.path("data", + "aggregate_counts_results.json") + + def test_creation(self): + def compare_files(obs, exp): + with open(obs, 'r') as f: + obs_lines = f.readlines() + obs_lines = [x.strip() for x in obs_lines] + obs_lines = [x for x in obs_lines if x != ''] + + with open(exp, 'r') as f: + exp_lines = f.readlines() + exp_lines = [x.strip() for x in exp_lines] + exp_lines = [x for x in exp_lines if x != ''] + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + # test basic good-path + job = SeqCountsJob(self.run_dir, self.output_path, self.queue_name, + self.node_count, self.wall_time_limit, self.jmem, + self.modules_to_load, self.qiita_job_id, + self.max_array_length, self.files_to_count_path, + self.cores_per_task) + + obs = job._generate_job_script() + + compare_files(obs, self.exp_sbatch_output) + + # hack log path so that it points to test data directory rather than + # the output directory for a run we didn't run(). 
+ job.log_path = self.path("data", "seq_counts_logs") + + obs = json_load(open(job._aggregate_counts(), 'r')) + exp = json_load(open(self.exp_results, 'r')) + + self.assertDictEqual(obs, exp) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_TRIntegrateJob.py b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py new file mode 100644 index 00000000..17ded346 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TRIntegrateJob.py @@ -0,0 +1,72 @@ +from os.path import join +from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob +from functools import partial +import unittest + + +class TestTRIntegrateJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TRIntegrateJob', 'integrate_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'integrate_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TRIntegrateJob", "output", + "tmp1") + # reflects location of script on host. 
+ self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + self.integrate_script_path = join(package_root, "contrib", + "integrate-indices-np.py") + self.sil_path = self.path('data', 'fake_sample_index_list.txt') + self.raw_fastq_dir = join(self.output_path, "TellReadJob", "Full") + + def test_creation(self): + # test basic good-path + job = TRIntegrateJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, + self.qiita_job_id, self.integrate_script_path, + self.sil_path, self.raw_fastq_dir, + self.reference_base, self.reference_map, + self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_TellReadJob.py b/sequence_processing_pipeline/tests/test_TellReadJob.py new file mode 100644 index 00000000..4c6a75c2 --- /dev/null +++ b/sequence_processing_pipeline/tests/test_TellReadJob.py @@ -0,0 +1,66 @@ +from os.path import join +from sequence_processing_pipeline.TellReadJob import TellReadJob +from functools import partial +import unittest + + +class TestTellReadJob(unittest.TestCase): + def setUp(self): + package_root = "sequence_processing_pipeline" + self.path = partial(join, package_root, "tests") + # where 2caa8226-cf69-45a3-bd40-1e90ec3d18d0 is a random qiita job id. + self.obs = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0', + 'TellReadJob', 'tellread_test.sbatch') + self.exp = self.path('data', 'tellseq_output', 'tellread_test.sbatch') + + # where 150629_SN1001_0511_AH5L7GBCXX is a run-directory that already + # exists. + self.run_dir = self.path('data', 'sample_run_directories', + '150629_SN1001_0511_AH5L7GBCXX') + + self.output_path = self.path('2caa8226-cf69-45a3-bd40-1e90ec3d18d0') + + self.sample_sheet_path = self.path('data', + 'tellseq_metag_dummy_sample_' + 'sheet.csv') + + self.queue_name = "qiita" + self.node_count = "1" + self.wall_time_limit = "96:00:00" + self.jmem = "16" + self.modules_to_load = ["singularity_3.6.4"] + self.qiita_job_id = "2caa8226-cf69-45a3-bd40-1e90ec3d18d0" + self.label = "150629_SN1001_0511_AH5L7GBCXX-test" + self.reference_base = "" + self.reference_map = "" + self.tmp1_path = join(self.output_path, "TellReadJob", "output", + "tmp1") + # reflects location of script on host. 
+ self.sing_script_path = ("$HOME/qiita-spots/tellread-release-novaseqX/" + "run_tellread_sing.sh") + self.lane = "1" + self.cores_per_task = "4" + + def test_creation(self): + # test basic good-path + job = TellReadJob(self.run_dir, self.output_path, + self.sample_sheet_path, self.queue_name, + self.node_count, self.wall_time_limit, + self.jmem, self.modules_to_load, self.qiita_job_id, + self.reference_base, self.reference_map, + self.sing_script_path, self.cores_per_task) + + job._generate_job_script() + + with open(self.obs, 'r') as f: + obs_lines = f.readlines() + + with open(self.exp, 'r') as f: + exp_lines = f.readlines() + + for obs_line, exp_line in zip(obs_lines, exp_lines): + self.assertEqual(obs_line, exp_line) + + +if __name__ == '__main__': + unittest.main() diff --git a/sequence_processing_pipeline/tests/test_commands.py b/sequence_processing_pipeline/tests/test_commands.py index f58bb176..ac8a4bd9 100644 --- a/sequence_processing_pipeline/tests/test_commands.py +++ b/sequence_processing_pipeline/tests/test_commands.py @@ -16,12 +16,12 @@ def test_split_similar_size_bins(self, glob, stat): class MockStat: st_size = 2 ** 28 # 256MB - mockglob = ['/foo/bar/a_R1_.fastq.gz', - '/foo/bar/b_R2_.fastq.gz', - '/foo/bar/a_R2_.fastq.gz', - '/foo/baz/c_R2_.fastq.gz', - '/foo/baz/c_R1_.fastq.gz', - '/foo/bar/b_R1_.fastq.gz'] + mockglob = ['/foo/bar/a_R1_001.fastq.gz', + '/foo/bar/b_R2_001.fastq.gz', + '/foo/bar/a_R2_001.fastq.gz', + '/foo/baz/c_R2_001.fastq.gz', + '/foo/baz/c_R1_001.fastq.gz', + '/foo/bar/b_R1_001.fastq.gz'] with TemporaryDirectory() as tmp: exp = (2, 1073741824) @@ -30,9 +30,12 @@ class MockStat: obs = split_similar_size_bins('foo', 1, tmp + '/prefix') self.assertEqual(obs, exp) - exp_1 = ('/foo/bar/a_R1_.fastq.gz\t/foo/bar/a_R2_.fastq.gz\tbar\n' - '/foo/bar/b_R1_.fastq.gz\t/foo/bar/b_R2_.fastq.gz\tbar\n') - exp_2 = '/foo/baz/c_R1_.fastq.gz\t/foo/baz/c_R2_.fastq.gz\tbaz\n' + exp_1 = ('/foo/bar/a_R1_001.fastq.gz\t/foo/bar/a_R2_001.fastq.gz' + '\tbar\n' + '/foo/bar/b_R1_001.fastq.gz\t/foo/bar/b_R2_001.fastq.gz' + '\tbar\n') + exp_2 = ('/foo/baz/c_R1_001.fastq.gz\t/foo/baz/c_R2_001.fastq.gz' + '\tbaz\n') obs_1 = open(tmp + '/prefix-1').read() self.assertEqual(obs_1, exp_1) @@ -56,10 +59,6 @@ def test_demux(self): '@2::MUX::bing/2', 'ATGC', '+', '!!!!', '']) infile = io.StringIO(infile_data) - exp_data_r1 = '\n'.join(['@baz/1', 'ATGC', '+', '!!!!', - '@bing/1', 'ATGC', '+', '!!!!', '']) - exp_data_r2 = '\n'.join(['@baz/2', 'ATGC', '+', '!!!!', - '@bing/2', 'ATGC', '+', '!!!!', '']) exp_data_r1 = ['@baz/1', 'ATGC', '+', '!!!!', '@bing/1', 'ATGC', '+', '!!!!'] diff --git a/sequence_processing_pipeline/tests/test_util.py b/sequence_processing_pipeline/tests/test_util.py index 136dc9a0..e5073101 100644 --- a/sequence_processing_pipeline/tests/test_util.py +++ b/sequence_processing_pipeline/tests/test_util.py @@ -4,24 +4,18 @@ class TestUtil(unittest.TestCase): def test_iter_paired_files(self): - tests = [(['a_R1_foo', - 'b_R2_bar', - 'a_R2_baz', - 'b_R1_bing'], - [('a_R1_foo', 'a_R2_baz'), - ('b_R1_bing', 'b_R2_bar')]), - (['a.R1.foo', - 'b.R2.bar', - 'a.R2.baz', - 'b.R1.bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b.R1.bing', 'b.R2.bar')]), - (['a.R1.foo', - 'b_R2_bar', - 'a.R2.baz', - 'b_R1_bing'], - [('a.R1.foo', 'a.R2.baz'), - ('b_R1_bing', 'b_R2_bar')])] + # tuples of randomly ordered fastq files and thier expected + # sorted and organized output from iter_paired_files(). + + # underscore filenames updated to require '_001.fastq.gz'. 
+ # legacy dot filenames test remains as-is. + tests = [(['b_R2_001.fastq.gz', 'a_R1_001.fastq.gz', + 'a_R2_001.fastq.gz', 'b_R1_001.fastq.gz'], + [('a_R1_001.fastq.gz', 'a_R2_001.fastq.gz'), + ('b_R1_001.fastq.gz', 'b_R2_001.fastq.gz')]), + (['a.R1.foo', 'b.R2.bar', 'a.R2.baz', 'b.R1.bing'], + [('a.R1.foo', 'a.R2.baz'), ('b.R1.bing', 'b.R2.bar')])] + for files, exp in tests: obs = list(iter_paired_files(files)) self.assertEqual(obs, exp) @@ -42,7 +36,7 @@ def test_iter_paired_files_bad_pair(self): list(iter_paired_files(files)) def test_iter_paired_files_mismatch_prefix(self): - files = ['a_R1_foo', 'ab_R2_foo'] + files = ['a_R1_001.fastq.gz', 'ab_R2_001.fastq.gz'] with self.assertRaisesRegex(ValueError, "Mismatch prefixes"): list(iter_paired_files(files)) diff --git a/sequence_processing_pipeline/util.py b/sequence_processing_pipeline/util.py index d9586f81..e19bf98a 100644 --- a/sequence_processing_pipeline/util.py +++ b/sequence_processing_pipeline/util.py @@ -1,7 +1,18 @@ import re -PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +# PAIR_UNDERSCORE = (re.compile(r'_R1_'), '_R1_', '_R2_') +# The above will truncate on the first _R1_ found, which only works when _R1_ +# or _R2_ appears exactly once in a file path. When the wet-lab incorporates +# these same strings in their sample-names as descriptive metadata, this +# assumption is broken. For all raw fastq files being used as input into +# NuQCJob, we can assume they end in the following convention. Per Illumina +# spec, all fastq files end in _001 and we preserve this convention even at +# the cost of renaming output files from TRIntegrateJob. +# PAIR_DOT is kept as is, but may be removed later because for the purposes of +# SPP, no input should ever be named with dots instead of underscores. +PAIR_UNDERSCORE = (re.compile(r'_R1_001.fastq.gz'), + '_R1_001.fastq.gz', '_R2_001.fastq.gz') PAIR_DOT = (re.compile(r'\.R1\.'), '.R1.', '.R2.') PAIR_TESTS = (PAIR_UNDERSCORE, PAIR_DOT)
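
Note on the pairing convention changed in `sequence_processing_pipeline/util.py` above: `iter_paired_files()` now anchors on the full `_R1_001.fastq.gz` / `_R2_001.fastq.gz` suffixes instead of the first `_R1_` substring, so a sample name that itself contains `_R1_` can no longer split the path in the wrong place. The snippet below is a minimal, standalone sketch of that suffix-anchored matching rule, not the pipeline's actual `iter_paired_files()` implementation; `pair_by_suffix` is a hypothetical helper used only for illustration.

```python
from collections import defaultdict


def pair_by_suffix(files, r1_suffix='_R1_001.fastq.gz',
                   r2_suffix='_R2_001.fastq.gz'):
    # Group files by the prefix that precedes the fixed read suffix, so a
    # literal '_R1_' embedded in a sample name does not affect pairing.
    groups = defaultdict(dict)
    for fp in files:
        if fp.endswith(r1_suffix):
            groups[fp[:-len(r1_suffix)]]['R1'] = fp
        elif fp.endswith(r2_suffix):
            groups[fp[:-len(r2_suffix)]]['R2'] = fp
    # Return only complete (R1, R2) pairs.
    return [(pair['R1'], pair['R2']) for pair in groups.values()
            if 'R1' in pair and 'R2' in pair]


# A sample name containing '_R1_' still pairs correctly:
print(pair_by_suffix(['sample_R1_extract_R1_001.fastq.gz',
                      'sample_R1_extract_R2_001.fastq.gz']))
# [('sample_R1_extract_R1_001.fastq.gz', 'sample_R1_extract_R2_001.fastq.gz')]
```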