diff --git a/.travis.yml b/.travis.yml index 945c08f..a7a81e4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,9 +13,15 @@ install: # At this moment this plugin has overlapping dependencies than qiita so installing everything in the same environment - conda create --yes -n qiita_env python=2.7 pip nose flake8 pyzmq networkx pyparsing natsort mock future libgfortran - 'pandas>=0.18' 'scipy>0.13.0' 'numpy>=1.7' 'h5py>=2.3.1' + 'pandas>=0.18' 'scipy>0.13.0' 'numpy>=1.7' 'h5py>=2.3.1' matplotlib - source activate qiita_env + - export CONDA_BIN=/home/travis/miniconda3/bin - pip install sphinx sphinx-bootstrap-theme coveralls ipython[all]==2.4.1 + # kneaddata dependencies + - conda install --yes -c bioconda bowtie2 trimmomatic=0.36 fastqc=0.11.5 + - export TRIMMOMATIC_BIN=$(readlink -f $(which trimmomatic)) + - export TRIMMOMATIC_DIR=`dirname $TRIMMOMATIC_BIN` + - export KD_DEMO_DB=$PWD/miniconda3/envs/qiita_env/lib/python2.7/site-packages/kneaddata/tests/data/demo_bowtie2_db/demo_db - pip install https://github.com/biocore/qiita/archive/master.zip --process-dependency-links # installing other deps - conda install --yes -c jjhelmus glpk diff --git a/qp_shotgun/kneaddata/__init__.py b/qp_shotgun/kneaddata/__init__.py new file mode 100644 index 0000000..e354c64 --- /dev/null +++ b/qp_shotgun/kneaddata/__init__.py @@ -0,0 +1,86 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ----------------------------------------------------------------------------- + +from qiita_client import QiitaPlugin, QiitaCommand + +from .kneaddata import kneaddata + +__all__ = ['kneaddata'] + +# Initialize the plugin +plugin = QiitaPlugin( + 'KneadData', '0.5.1', 'KneadData is a tool designed to perform quality ' + 'control on metagenomic and metatranscriptomic sequencing data, ' + 'especially data from microbiome experiments.') + +# Define the KneadData command +req_params = {'input': ('artifact', ['per_sample_FASTQ'])} +opt_params = { + # there are other parameters not included that will be ignored in this + # configuration as we assume that the correct values were set in the env + # by the admin installing the tools: + # trimmomatic + # bowtie2 + # --input # input FASTQ file (add a second argument for paired input files) + # --output # directory to write output files + # --output-prefix # prefix for output files [ DEFAULT : $SAMPLE_kneaddata ] + # --log # filepath for log [ DEFAULT : $OUTPUT_DIR/$SAMPLE_kneaddata.log ] + # --bowtie2 # path to bowtie executable + # --bmtagger # path to bmtagger executable + # --trf # path to TRF executable + 'reference-db': ['choice:["human_genome"]', 'human_genome'], # ref db + 'bypass-trim': ['boolean', 'False'], # bypass the trim step + 'threads': ['integer', '1'], # threads to run + 'processes': ['integer', '1'], # processes to run + 'quality-scores': ['choice:["phred33","phred64"]', 'phred33'], # quality + 'run-bmtagger': ['boolean', 'False'], # run BMTagger instead of Bowtie2 + 'run-trf': ['boolean', 'False'], # run TRF repeat finder tool + 'run-fastqc-start': ['boolean', 'True'], # run FastQC on original data + 'run-fastqc-end': ['boolean', 'True'], # run FastQC on filtered data + 'store-temp-output': ['boolean', 'False'], # store temp output files + 'log-level': ['choice:["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]', + 'DEBUG'], + + # Trimmomatic options + 'max-memory': ['string', '500m'], # max memory in mb [ 
DEFAULT : 500m ] + 'trimmomatic': ['string', '$TRIMMOMATIC_DIR'], + 'trimmomatic-options': ['string', 'ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/' + 'TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 ' + 'SLIDINGWINDOW:4:15 MINLEN:36'], + # Bowtie2 options + # 'bowtie2-options': ['string', '--very-sensitive'] + # BMTagger options + # # TRF options + # 'match': ['integer', '2'], # matching weight + # 'mismatch': ['integer', '7'], # mismatching penalty + # 'delta': ['integer', '7'], # indel penalty + # 'pm': ['integer', '80'], # match probability + # 'pi': ['integer', '10'], # indel probability + # 'minscore': ['integer', '50'], # minimum alignment score to report + # 'maxperiod': ['integer', '500m'] # maximum period size to report + # FastQC options + } +outputs = {'per_sample_FASTQ': 'per_sample_FASTQ'} +dflt_param_set = { + 'Defaults': { + 'reference-db': 'human_genome', 'bypass-trim': False, 'threads': 1, + 'processes': 1, 'quality-scores': 'phred33', 'run-bmtagger': False, + 'run-trf': False, 'run-fastqc-start': True, 'run-fastqc-end': True, + 'store-temp-output': False, 'log-level': 'DEBUG', 'max-memory': '500m', + 'trimmomatic': '"$TRIMMOMATIC_DIR"', + 'trimmomatic-options': '"ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/' + 'TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 ' + 'MINLEN:36"' + # , 'match': 2, 'mismatch': 7, + # 'delta': 7, 'pm': 80, 'pi': 10, 'minscore': 50, 'maxperiod': '500m' + } +} +kneaddata_cmd = QiitaCommand( + "KneadData", "Sequence QC", kneaddata, req_params, opt_params, + outputs, dflt_param_set) +plugin.register_command(kneaddata_cmd) diff --git a/qp_shotgun/kneaddata/kneaddata.py b/qp_shotgun/kneaddata/kneaddata.py new file mode 100644 index 0000000..5e0c716 --- /dev/null +++ b/qp_shotgun/kneaddata/kneaddata.py @@ -0,0 +1,287 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. 
+# +# The full license is in the file LICENSE, distributed with this software. +# ----------------------------------------------------------------------------- + +from itertools import zip_longest +from os.path import basename, join + +from qiita_client import ArtifactInfo + +from qiita_client.util import system_call, get_sample_names_by_run_prefix + + +def make_read_pairs_per_sample(forward_seqs, reverse_seqs, map_file): + """Recovers read pairing information + + Parameters + ---------- + forward_seqs : list of str + The list of forward seqs filepaths + reverse_seqs : list of str + The list of reverse seqs filepaths + map_file : str + The path to the mapping file + + Returns + ------- + samples: list of tup + list of 4-tuples with run prefix, sample name, fwd read fp, rev read fp + + Raises + ------ + ValueError + If the rev is not an empty list and not the same length as the fwd seqs + The prefixes of the run_prefix don't match the file names + + Notes + ----- + At this stage it is required that if reverse sequences are present that all + samples have both a forward and a reverse sequence. However, the read + trimming step can sometimes eliminate all reverse reads, especially in low + coverage samples with poor overall reverse read quality. + """ + + # sort forward seqs + forward_seqs.sort() + + # check that rev seqs are same len + if reverse_seqs: + if len(forward_seqs) != len(reverse_seqs): + raise ValueError('Your reverse and forward files are of different ' + 'length. Forward: %s. Reverse: %s.' 
% + (', '.join(forward_seqs), + ', '.join(reverse_seqs))) + reverse_seqs.sort() + + # get run prefixes + # These are prefixes that should match uniquely to forward reads + # sn_by_rp is dict of samples keyed by run prefixes + sn_by_rp = get_sample_names_by_run_prefix(map_file) + + # make pairings + samples = [] + used_prefixes = set() + for i, (fwd_fp, rev_fp) in enumerate(zip_longest(forward_seqs, + reverse_seqs)): + # fwd_fp is the fwd read filepath + fwd_fn = basename(fwd_fp) + + # iterate over run prefixes and make sure only one matches + run_prefix = None + for rp in sn_by_rp: + if fwd_fn.startswith(rp) and run_prefix is None: + run_prefix = rp + elif fwd_fn.startswith(rp) and run_prefix is not None: + raise ValueError('Multiple run prefixes match this fwd read: ' + '%s' % fwd_fn) + + # make sure that we got one matching run prefix: + if run_prefix is None: + raise ValueError('No run prefix matching this fwd read: %s' + % fwd_fn) + + if run_prefix in used_prefixes: + raise ValueError('This run prefix matches multiple fwd reads: ' + '%s' % run_prefix) + + if rev_fp is None: + samples.append((run_prefix, sn_by_rp[run_prefix], fwd_fp, None)) + else: + rev_fn = basename(rev_fp) + # if we have reverse reads, make sure the matching pair also + # matches the run prefix: + if not rev_fn.startswith(run_prefix): + raise ValueError('Reverse read does not match this run prefix.' 
+ '\nRun prefix: %s\nForward read: %s\n' + 'Reverse read: %s\n' % + (run_prefix, fwd_fn, rev_fn)) + + samples.append((run_prefix, sn_by_rp[run_prefix], fwd_fp, + rev_fp)) + + used_prefixes.add(run_prefix) + + return(samples) + + +def format_kneaddata_params(parameters): + + params = [] + + for param in sorted(parameters): + value = parameters[param] + + if value is True: + params.append('--%s' % param) + continue + elif value is False: + continue + elif value: + params.append('--%s %s' % (param, value)) + continue + + param_string = ' '.join(params) + + return(param_string) + + +def generate_kneaddata_commands(forward_seqs, reverse_seqs, map_file, + out_dir, parameters): + """Generates the KneadData commands + + Parameters + ---------- + forward_seqs : list of str + The list of forward seqs filepaths + reverse_seqs : list of str + The list of reverse seqs filepaths + map_file : str + The path to the mapping file + out_dir : str + The job output directory + parameters : dict + The command's parameters, keyed by parameter name + + Returns + ------- + cmds: list of str + The KneadData commands + samples: list of tup + list of 4-tuples with run prefix, sample name, fwd read fp, rev read fp + + Raises + ------ + ValueError + If the rev is not an empty list and not the same length as the fwd seqs + The prefixes of the run_prefix don't match the file names + + Notes + ----- + Currently this is requiring matched pairs in the make_read_pairs_per_sample + step but implicitly allowing empty reverse reads in the actual command + generation. This behavior may allow support of situations with empty + reverse reads in some samples, for example after trimming and QC. 
+ """ + # making sure the forward and reverse reads are in the same order + + # we match filenames, samples, and run prefixes + samples = make_read_pairs_per_sample(forward_seqs, reverse_seqs, map_file) + + cmds = [] + + param_string = format_kneaddata_params(parameters) + prefixes = [] + for run_prefix, sample, f_fp, r_fp in samples: + prefixes.append(run_prefix) + if r_fp is None: + cmds.append('kneaddata --input "%s" --output "%s" ' + '--output-prefix "%s" %s' % + (f_fp, join(out_dir, run_prefix), + run_prefix, param_string)) + else: + cmds.append('kneaddata --input "%s" --input "%s" --output "%s" ' + '--output-prefix "%s" %s' % + (f_fp, r_fp, join(out_dir, run_prefix), + run_prefix, param_string)) + + return cmds, samples + + +def _run_commands(qclient, job_id, commands, msg): + for i, cmd in enumerate(commands): + qclient.update_job_step(job_id, msg % i) + std_out, std_err, return_value = system_call(cmd) + if return_value != 0: + error_msg = ("Error running KneadData:\nStd out: %s\nStd err: %s" + "\n\nCommand run was:\n%s" + % (std_out, std_err, cmd)) + return False, error_msg + + return True, "" + + +def _per_sample_ainfo(out_dir, samples): + ainfo = [] + for run_prefix, sample, f_fp, r_fp in samples: + sam_out_dir = join(out_dir, run_prefix) + + if r_fp: + ainfo.extend([ + ArtifactInfo('clean paired R1', 'per_sample_FASTQ', + [(join(sam_out_dir, '%s_paired_1.fastq' % + run_prefix), 'per_sample_FASTQ')]), + ArtifactInfo('clean paired R2', 'per_sample_FASTQ', + [(join(sam_out_dir, '%s_paired_2.fastq' % + run_prefix), 'per_sample_FASTQ')]), + ArtifactInfo('clean unpaired R1', 'per_sample_FASTQ', + [(join(sam_out_dir, '%s_unmatched_1.fastq' % + run_prefix), 'per_sample_FASTQ')]), + ArtifactInfo('clean unpaired R2', 'per_sample_FASTQ', + [(join(sam_out_dir, '%s_unmatched_2.fastq' % + run_prefix), 'per_sample_FASTQ')])]) + else: + ainfo.extend([ + ArtifactInfo('cleaned reads', 'per_sample_FASTQ', + [(join(sam_out_dir, '%s.fastq' % + run_prefix), 
'per_sample_FASTQ')])]) + + return ainfo + + +def kneaddata(qclient, job_id, parameters, out_dir): + """Run kneaddata with the given parameters + + Parameters + ---------- + qclient : tgp.qiita_client.QiitaClient + The Qiita server client + job_id : str + The job id + parameters : dict + The parameter values to run split libraries + out_dir : str + The path to the job's output directory + + Returns + ------- + bool, list, str + The results of the job + """ + # Step 1 get the rest of the information need to run kneaddata + qclient.update_job_step(job_id, "Step 1 of 3: Collecting information") + artifact_id = parameters['input'] + + # removing input from parameters so it's not part of the final command + del parameters['input'] + + # Get the artifact filepath information + artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id) + fps = artifact_info['files'] + + # Get the artifact metadata + prep_info = qclient.get('/qiita_db/prep_template/%s/' + % artifact_info['prep_information'][0]) + qiime_map = prep_info['qiime-map'] + + # Step 2 generating command kneaddata + qclient.update_job_step(job_id, "Step 2 of 5: Generating" + " KneadData command") + rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else [] + commands, samples = generate_kneaddata_commands(fps['raw_forward_seqs'], + rs, qiime_map, out_dir, + parameters) + + # Step 3 execute kneaddata + msg = "Step 3 of 5: Executing KneadData job (%d/{0})".format(len(commands)) + success, msg = _run_commands(qclient, job_id, commands, msg) + if not success: + return False, None, msg + + # Step 4 generating artifacts + ainfo = _per_sample_ainfo(out_dir, samples) + + return True, ainfo, "" diff --git a/qp_shotgun/kneaddata/tests/__init__.py b/qp_shotgun/kneaddata/tests/__init__.py new file mode 100644 index 0000000..e0aff71 --- /dev/null +++ b/qp_shotgun/kneaddata/tests/__init__.py @@ -0,0 +1,7 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, 
The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ----------------------------------------------------------------------------- diff --git a/qp_shotgun/kneaddata/tests/test_kneaddata.py b/qp_shotgun/kneaddata/tests/test_kneaddata.py new file mode 100644 index 0000000..a3105f6 --- /dev/null +++ b/qp_shotgun/kneaddata/tests/test_kneaddata.py @@ -0,0 +1,371 @@ +# ----------------------------------------------------------------------------- +# Copyright (c) 2014--, The Qiita Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ----------------------------------------------------------------------------- + +from unittest import main +from os import close, remove +from os.path import exists, isdir, join +from shutil import rmtree, copyfile +from tempfile import mkstemp, mkdtemp +from json import dumps + +from qiita_client.testing import PluginTestCase + +from qp_shotgun.kneaddata import plugin +from qp_shotgun.kneaddata.kneaddata import (make_read_pairs_per_sample, + generate_kneaddata_commands, + format_kneaddata_params, + kneaddata) + + +class KneaddataTests(PluginTestCase): + maxDiff = None + + def setUp(self): + plugin("https://localhost:21174", 'register', 'ignored') + self.params = { + 'reference-db': 'human_genome', 'bypass-trim': False, 'threads': 1, + 'processes': 1, 'quality-scores': 'phred33', 'run-bmtagger': False, + 'run-trf': False, 'run-fastqc-start': True, 'run-fastqc-end': True, + 'store-temp-output': False, 'log-level': 'DEBUG', + 'trimmomatic': '"$TRIMMOMATIC_DIR"', + 'max-memory': '500m', 'trimmomatic-options': '"ILLUMINACLIP:' + '$TRIMMOMATIC_DIR/adapters/TruSeq3-PE-2.fa:2:30:10 LEADING:3 ' + 'TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"' + } + self._clean_up_files = [] + + def tearDown(self): + for fp in 
self._clean_up_files: + if exists(fp): + if isdir(fp): + rmtree(fp) + else: + remove(fp) + + def test_format_kneaddata_params(self): + obs = format_kneaddata_params(self.params) + exp = ('--log-level DEBUG --max-memory 500m --processes 1 ' + '--quality-scores phred33 --reference-db human_genome ' + '--run-fastqc-end --run-fastqc-start --threads 1 ' + '--trimmomatic "$TRIMMOMATIC_DIR" --trimmomatic-options ' + '"ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/TruSeq3-PE-2.fa:' + '2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"') + + self.assertEqual(obs, exp) + + def test_make_read_pairs_per_sample_match_fwd_rev(self): + fd, fp = mkstemp() + close(fd) + with open(fp, 'w') as f: + f.write(MAPPING_FILE) + self._clean_up_files.append(fp) + + fwd_fp = ['./folder/s3_S013_L001_R1.fastq.gz', + './folder/s2_S011_L001_R1.fastq.gz', + './folder/s1_S009_L001_R1.fastq.gz'] + + rev_fp = ['./folder/s3_S013_L001_R2.fastq.gz', + './folder/s2_S011_L001_R2.fastq.gz', + './folder/s1_S009_L001_R2.fastq.gz'] + + exp = [('s1', 'SKB8.640193', './folder/s1_S009_L001_R1.fastq.gz', + './folder/s1_S009_L001_R2.fastq.gz'), + ('s2', 'SKD8.640184', './folder/s2_S011_L001_R1.fastq.gz', + './folder/s2_S011_L001_R2.fastq.gz'), + ('s3', 'SKB7.640196', './folder/s3_S013_L001_R1.fastq.gz', + './folder/s3_S013_L001_R2.fastq.gz')] + + obs = make_read_pairs_per_sample(fwd_fp, rev_fp, fp) + + self.assertEqual(obs, exp) + + def test_make_read_pairs_per_sample_match_fwd_only(self): + fd, fp = mkstemp() + close(fd) + with open(fp, 'w') as f: + f.write(MAPPING_FILE) + self._clean_up_files.append(fp) + + fwd_fp = ['./folder/s3_S013_L001_R1.fastq.gz', + './folder/s2_S011_L001_R1.fastq.gz', + './folder/s1_S009_L001_R1.fastq.gz'] + + rev_fp = [] + + exp = [('s1', 'SKB8.640193', './folder/s1_S009_L001_R1.fastq.gz', + None), + ('s2', 'SKD8.640184', './folder/s2_S011_L001_R1.fastq.gz', + None), + ('s3', 'SKB7.640196', './folder/s3_S013_L001_R1.fastq.gz', + None)] + + obs = make_read_pairs_per_sample(fwd_fp, 
rev_fp, fp) + + self.assertEqual(obs, exp) + + def test_make_read_pairs_per_sample_match_fwd_rev_diffnum(self): + fd, fp = mkstemp() + close(fd) + with open(fp, 'w') as f: + f.write(MAPPING_FILE) + self._clean_up_files.append(fp) + + fwd_fp = ['./folder/s3_S013_L001_R1.fastq.gz', + './folder/s2_S011_L001_R1.fastq.gz', + './folder/s1_S009_L001_R1.fastq.gz'] + + rev_fp = ['./folder/s4_S013_L001_R2.fastq.gz', + './folder/s2_S011_L001_R2.fastq.gz', + './folder/s1_S009_L001_R2.fastq.gz'] + + with self.assertRaises(ValueError): + make_read_pairs_per_sample(fwd_fp, rev_fp, fp) + + def test_make_read_pairs_per_sample_match_fwd_rev_notmatch(self): + fd, fp = mkstemp() + close(fd) + with open(fp, 'w') as f: + f.write(MAPPING_FILE) + self._clean_up_files.append(fp) + + fwd_fp = ['./folder/s3_S013_L001_R1.fastq.gz', + './folder/s2_S011_L001_R1.fastq.gz', + './folder/s1_S009_L001_R1.fastq.gz'] + + rev_fp = ['./folder/s3_S013_L001_R2.fastq.gz', + './folder/s2_S011_L001_R2.fastq.gz'] + + with self.assertRaises(ValueError): + make_read_pairs_per_sample(fwd_fp, rev_fp, fp) + + def test_make_read_pairs_per_sample_match_fwd_no_rp(self): + fd, fp = mkstemp() + close(fd) + with open(fp, 'w') as f: + f.write(MAPPING_FILE) + self._clean_up_files.append(fp) + + fwd_fp = ['./folder/s3_S013_L001_R1.fastq.gz', + './folder/s2_S011_L001_R1.fastq.gz', + './folder/s4_S009_L001_R1.fastq.gz'] + + rev_fp = [] + + with self.assertRaises(ValueError): + make_read_pairs_per_sample(fwd_fp, rev_fp, fp) + + def test_make_read_pairs_per_sample_match_fwd_2match(self): + fd, fp = mkstemp() + close(fd) + with open(fp, 'w') as f: + f.write(MAPPING_FILE) + self._clean_up_files.append(fp) + + fwd_fp = ['./folder/s3_S013_L001_R1.fastq.gz', + './folder/s2_S011_L001_R1.fastq.gz', + './folder/s2_S009_L001_R1.fastq.gz'] + + rev_fp = [] + + with self.assertRaises(ValueError): + make_read_pairs_per_sample(fwd_fp, rev_fp, fp) + + def test_generate_kneaddata_analysis_commands_only_fwd(self): + fd, fp = mkstemp() + 
close(fd) + with open(fp, 'w') as f: + f.write(MAPPING_FILE) + self._clean_up_files.append(fp) + + exp_cmd = [ + 'kneaddata --input "fastq/s1.fastq" ' + '--output "output/s1" --output-prefix "s1" ' + '--log-level DEBUG --max-memory 500m ' + '--processes 1 --quality-scores phred33 --reference-db ' + 'human_genome --run-fastqc-end --run-fastqc-start --threads 1 ' + '--trimmomatic "$TRIMMOMATIC_DIR" ' + '--trimmomatic-options "ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/' + 'TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 ' + 'MINLEN:36"', + 'kneaddata --input "fastq/s2.fastq.gz" ' + '--output "output/s2" --output-prefix "s2" ' + '--log-level DEBUG --max-memory 500m ' + '--processes 1 --quality-scores phred33 --reference-db ' + 'human_genome --run-fastqc-end --run-fastqc-start --threads 1 ' + '--trimmomatic "$TRIMMOMATIC_DIR" ' + '--trimmomatic-options "ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/' + 'TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 ' + 'MINLEN:36"', + 'kneaddata --input "fastq/s3.fastq" ' + '--output "output/s3" --output-prefix "s3" ' + '--log-level DEBUG --max-memory 500m ' + '--processes 1 --quality-scores phred33 --reference-db ' + 'human_genome --run-fastqc-end --run-fastqc-start --threads 1 ' + '--trimmomatic "$TRIMMOMATIC_DIR" ' + '--trimmomatic-options "ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/' + 'TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 ' + 'MINLEN:36"'] + + exp_sample = [('s1', 'SKB8.640193', 'fastq/s1.fastq', None), + ('s2', 'SKD8.640184', 'fastq/s2.fastq.gz', None), + ('s3', 'SKB7.640196', 'fastq/s3.fastq', None)] + + obs_cmd, obs_sample = generate_kneaddata_commands( + ['fastq/s1.fastq', 'fastq/s2.fastq.gz', 'fastq/s3.fastq'], [], + fp, 'output', self.params) + + self.assertEqual(obs_cmd, exp_cmd) + self.assertEqual(obs_sample, exp_sample) + + def test_generate_kneaddata_analysis_commands_forward_reverse(self): + fd, fp = mkstemp() + close(fd) + with open(fp, 'w') as f: + f.write(MAPPING_FILE) + 
self._clean_up_files.append(fp) + + exp_cmd = [ + 'kneaddata --input "fastq/s1.fastq" --input "fastq/s1.R2.fastq" ' + '--output "output/s1" --output-prefix "s1" ' + '--log-level DEBUG --max-memory 500m ' + '--processes 1 --quality-scores phred33 --reference-db ' + 'human_genome --run-fastqc-end --run-fastqc-start --threads 1 ' + '--trimmomatic "$TRIMMOMATIC_DIR" ' + '--trimmomatic-options "ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/' + 'TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 ' + 'MINLEN:36"', + 'kneaddata --input "fastq/s2.fastq.gz" --input ' + '"fastq/s2.R2.fastq.gz" --output "output/s2" --output-prefix "s2" ' + '--log-level DEBUG ' + '--max-memory 500m --processes 1 --quality-scores phred33 ' + '--reference-db human_genome --run-fastqc-end --run-fastqc-start ' + '--threads 1 --trimmomatic "$TRIMMOMATIC_DIR" ' + '--trimmomatic-options "ILLUMINACLIP:$TRIMMOMATIC_DIR/' + 'adapters/TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 ' + 'SLIDINGWINDOW:4:15 MINLEN:36"', + 'kneaddata --input "fastq/s3.fastq" --input "fastq/s3.R2.fastq" ' + '--output "output/s3" --output-prefix "s3" ' + '--log-level DEBUG --max-memory 500m ' + '--processes 1 --quality-scores phred33 --reference-db ' + 'human_genome --run-fastqc-end --run-fastqc-start --threads 1 ' + '--trimmomatic "$TRIMMOMATIC_DIR" ' + '--trimmomatic-options "ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/' + 'TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 ' + 'MINLEN:36"'] + + exp_sample = [ + ('s1', 'SKB8.640193', 'fastq/s1.fastq', 'fastq/s1.R2.fastq'), + ('s2', 'SKD8.640184', 'fastq/s2.fastq.gz', 'fastq/s2.R2.fastq.gz'), + ('s3', 'SKB7.640196', 'fastq/s3.fastq', 'fastq/s3.R2.fastq')] + + obs_cmd, obs_sample = generate_kneaddata_commands( + ['fastq/s1.fastq', 'fastq/s2.fastq.gz', 'fastq/s3.fastq'], + ['fastq/s1.R2.fastq', 'fastq/s2.R2.fastq.gz', 'fastq/s3.R2.fastq'], + fp, 'output', self.params) + + print(obs_sample) + print(exp_sample) + + self.assertEqual(obs_cmd, exp_cmd) + 
self.assertEqual(obs_sample, exp_sample) + + def test_kneaddata(self): + # generating filepaths + in_dir = mkdtemp() + self._clean_up_files.append(in_dir) + + fp1_1 = join(in_dir, 'kd_test_1_R1.fastq.gz') + fp1_2 = join(in_dir, 'kd_test_1_R2.fastq.gz') + copyfile('support_files/kd_test_1_R1.fastq.gz', fp1_1) + copyfile('support_files/kd_test_1_R2.fastq.gz', fp1_2) + + # inserting new prep template + prep_info_dict = { + 'SKB7.640196': { + 'run_prefix': 'kd_test_1'} + } + data = {'prep_info': dumps(prep_info_dict), + # magic #1 = testing study + 'study': 1, + 'data_type': 'Metagenomic'} + pid = self.qclient.post('/apitest/prep_template/', data=data)['prep'] + + # inserting artifacts + data = { + 'filepaths': dumps([ + (fp1_1, 'raw_forward_seqs'), + (fp1_2, 'raw_reverse_seqs')]), + 'type': "per_sample_FASTQ", + 'name': "New test artifact", + 'prep': pid} + + aid = self.qclient.post('/apitest/artifact/', data=data)['artifact'] + + self.params['input'] = aid + # overwriting so the run uses the defaults + self.params['reference-db'] = ('$KD_DEMO_DB') + + data = {'user': 'demo@microbio.me', + 'command': dumps(['KneadData', '0.5.1', 'KneadData']), + 'status': 'running', + 'parameters': dumps(self.params)} + jid = self.qclient.post('/apitest/processing_job/', data=data)['job'] + + out_dir = mkdtemp() + self._clean_up_files.append(out_dir) + + success, ainfo, msg = kneaddata(self.qclient, jid, + self.params, out_dir) + + self.assertEqual("", msg) + self.assertTrue(success) + # we are expecting 16 artifacts per sample eventually + # but for now just the four fastqs + self.assertEqual(4, len(ainfo)) + + obs_fps = [] + for a in ainfo: + self.assertEqual('per_sample_FASTQ', a.artifact_type) + obs_fps.append(a.files) + + exp_fps = [[(join(out_dir, 'kd_test_1', 'kd_test_1_paired_1.fastq'), + 'per_sample_FASTQ')], + [(join(out_dir, 'kd_test_1', 'kd_test_1_paired_2.fastq'), + 'per_sample_FASTQ')], + [(join(out_dir, 'kd_test_1', 'kd_test_1_unmatched_1.fastq'), + 
'per_sample_FASTQ')], + [(join(out_dir, 'kd_test_1', 'kd_test_1_unmatched_2.fastq'), + 'per_sample_FASTQ')]] + + self.assertItemsEqual(exp_fps, obs_fps) + + for f_a in exp_fps: + assert exists(f_a[0][0]) + + +MAPPING_FILE = ( + "#SampleID\tplatform\tbarcode\texperiment_design_description\t" + "library_construction_protocol\tcenter_name\tprimer\trun_prefix\t" + "instrument_model\tDescription\n" + "SKB7.640196\tILLUMINA\tA\tA\tA\tANL\tA\ts3\tIllumina MiSeq\tdesc1\n" + "SKB8.640193\tILLUMINA\tA\tA\tA\tANL\tA\ts1\tIllumina MiSeq\tdesc2\n" + "SKD8.640184\tILLUMINA\tA\tA\tA\tANL\tA\ts2\tIllumina MiSeq\tdesc3\n" +) + +MAPPING_FILE_2 = ( + "#SampleID\tplatform\tbarcode\texperiment_design_description\t" + "library_construction_protocol\tcenter_name\tprimer\t" + "run_prefix\tinstrument_model\tDescription\n" + "SKB7.640196\tILLUMINA\tA\tA\tA\tANL\tA\ts3\tIllumina MiSeq\tdesc1\n" + "SKB8.640193\tILLUMINA\tA\tA\tA\tANL\tA\ts1\tIllumina MiSeq\tdesc2\n" + "SKD8.640184\tILLUMINA\tA\tA\tA\tANL\tA\ts1\tIllumina MiSeq\tdesc3\n" +) + + +if __name__ == '__main__': + main() diff --git a/scripts/configure_shotgun b/scripts/configure_shotgun index 191888e..e6790c6 100644 --- a/scripts/configure_shotgun +++ b/scripts/configure_shotgun @@ -10,7 +10,8 @@ import click -from qp_shotgun.humann2 import plugin +from qp_shotgun.humann2 import plugin as h2_plugin +from qp_shotgun.kneaddata import plugin as kd_plugin @click.command() @@ -21,7 +22,9 @@ def config(env_script, server_cert): """Generates the Qiita configuration files""" if server_cert == 'None': server_cert = None - plugin.generate_config(env_script, 'start_shotgun', + kd_plugin.generate_config(env_script, 'start_shotgun', + server_cert=server_cert) + h2_plugin.generate_config(env_script, 'start_shotgun', server_cert=server_cert) if __name__ == '__main__': diff --git a/setup.py b/setup.py index 4918e7f..6f4d4c9 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,7 @@ from setuptools import setup + __version__ = "0.1.0-dev" @@ -41,11 
+42,15 @@ author_email="qiita.help@gmail.com", url='https://github.com/biocore/qiita', test_suite='nose.collector', - packages=['qp_shotgun', 'qp_shotgun/humann2'], + packages=['qp_shotgun', 'qp_shotgun/humann2', 'qp_shotgun/kneaddata'], package_data={'qp_shotgun': ['support_files/config_file.cfg']}, scripts=['scripts/configure_shotgun', 'scripts/start_shotgun'], extras_require={'test': ["nose >= 0.10.1", "pep8"]}, install_requires=['click >= 3.3', 'future', 'pandas >= 0.15', 'humann2', - 'h5py >= 2.3.1', 'biom-format'], + 'h5py >= 2.3.1', 'biom-format', 'kneaddata >= 0.5.2'], + dependency_links=[('https://bitbucket.org/biobakery/humann2/get/' + '0.9.3.1.tar.gz'), + ('https://bitbucket.org/biobakery/kneaddata/get/' + '0.5.1.tar.gz#egg=kneaddata-0.5.1')], classifiers=classifiers ) diff --git a/support_files/kd_test_1_R1.fastq.gz b/support_files/kd_test_1_R1.fastq.gz new file mode 100644 index 0000000..dca98bc Binary files /dev/null and b/support_files/kd_test_1_R1.fastq.gz differ diff --git a/support_files/kd_test_1_R2.fastq.gz b/support_files/kd_test_1_R2.fastq.gz new file mode 100644 index 0000000..11f0d5a Binary files /dev/null and b/support_files/kd_test_1_R2.fastq.gz differ diff --git a/support_files/kd_test_2_R1.fastq.gz b/support_files/kd_test_2_R1.fastq.gz new file mode 100644 index 0000000..2ebaae0 Binary files /dev/null and b/support_files/kd_test_2_R1.fastq.gz differ diff --git a/support_files/kd_test_2_R2.fastq.gz b/support_files/kd_test_2_R2.fastq.gz new file mode 100644 index 0000000..56f08ef Binary files /dev/null and b/support_files/kd_test_2_R2.fastq.gz differ