qiita-spots · tanaes · Sep 28, 2016 · Sep 29, 2016 · Sep 29, 2016 · Sep 29, 2016
diff --git a/.travis.yml b/.travis.yml
@@ -13,9 +13,15 @@ install:
   # At this moment this plugin has overlapping dependencies than qiita so installing everything in the same environment
   - conda create --yes -n qiita_env python=2.7 pip nose flake8
     pyzmq networkx pyparsing natsort mock future libgfortran
-    'pandas>=0.18' 'scipy>0.13.0' 'numpy>=1.7' 'h5py>=2.3.1'
+    'pandas>=0.18' 'scipy>0.13.0' 'numpy>=1.7' 'h5py>=2.3.1' matplotlib
   - source activate qiita_env
+  - export CONDA_BIN=/home/travis/miniconda3/bin
   - pip install sphinx sphinx-bootstrap-theme coveralls ipython[all]==2.4.1
+  # kneaddata dependencies
+  - conda install --yes -c bioconda bowtie2 trimmomatic=0.36 fastqc=0.11.5
+  - export TRIMMOMATIC_BIN=$(readlink -f $(which trimmomatic))
+  - export TRIMMOMATIC_DIR=`dirname $TRIMMOMATIC_BIN`
+  - export KD_DEMO_DB=$PWD/miniconda3/envs/qiita_env/lib/python2.7/site-packages/kneaddata/tests/data/demo_bowtie2_db/demo_db
   - pip install https://github.com/biocore/qiita/archive/master.zip --process-dependency-links
   # installing other deps
   - conda install --yes -c jjhelmus glpk

diff --git a/qp_shotgun/kneaddata/__init__.py b/qp_shotgun/kneaddata/__init__.py
@@ -0,0 +1,86 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2014--, The Qiita Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# -----------------------------------------------------------------------------
+
+from qiita_client import QiitaPlugin, QiitaCommand
+
+from .kneaddata import kneaddata
+
+__all__ = ['kneaddata']
+
+# Initialize the plugin
+plugin = QiitaPlugin(
+    'KneadData', '0.5.1', 'KneadData is a tool designed to perform quality '
+    'control on metagenomic and metatranscriptomic sequencing data, '
+    'especially data from microbiome experiments.')
+
+# Define the HUMAnN2 command
+req_params = {'input': ('artifact', ['per_sample_FASTQ'])}
+opt_params = {
+    # there are other parameters not included that will be ignored in this
+    # configuration as we assume that the correct values were set in the env
+    # by the admin installing the tools:
+    # trimmomatic
+    # bowtie2
+    # --input # input FASTQ file (add a second argument for paired input files)
+    # --output # directory to write output files
+    # --output-prefix # prefix for output files [ DEFAULT : $SAMPLE_kneaddata ]
+    # --log # filepath for log [ DEFAULT : $OUTPUT_DIR/$SAMPLE_kneaddata.log ]
+    # --bowtie2 # path to bowtie executable
+    # --bmtagger # path to bmtagger exectuable
+    # --trf # path to TRF executable
+    'reference-db': ['choice:["human_genome"]', 'human_genome'],  # ref db
+    'bypass-trim': ['boolean', 'False'],  # bypass the trim step
+    'threads': ['integer', '1'],  # threads to run
+    'processes': ['integer', '1'],  # processes to run
+    'quality-scores': ['choice:["phred33","phred64"]', 'phred33'],  # quality
+    'run-bmtagger': ['boolean', 'False'],  # run BMTagger instead of Bowtie2
+    'run-trf': ['boolean', 'False'],  # run TRF repeat finder tool
+    'run-fastqc-start': ['boolean', 'True'],  # run FastQC on original data
+    'run-fastqc-end': ['boolean', 'True'],  # run FastQC on filtered data
+    'store-temp-output': ['boolean', 'False'],  # store temp output files
+    'log-level': ['choice:["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]',
+                  'DEBUG'],
+
+    # Trimmomatic options
+    'max-memory': ['string', '500m'],  # max memory in mb [ DEFAULT : 500m ]
+    'trimmomatic': ['string', '$TRIMMOMATIC_DIR'],
+    'trimmomatic-options': ['string', 'ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/'
+                            'TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 '
+                            'SLIDINGWINDOW:4:15 MINLEN:36'],
+    # Bowtie2 options
+    # 'bowtie2-options': ['string', '--very-sensitive']
+    # BMTagger options
+    # # TRF options
+    # 'match': ['integer', '2'], # matching weight
+    # 'mismatch': ['integer', '7'], # mismatching penalty
+    # 'delta': ['integer', '7'], # indel penalty
+    # 'pm': ['integer', '80'], # match probability
+    # 'pi': ['integer', '10'], # indel probability
+    # 'minscore': ['integer', '50'], # mimimum alignment score to report
+    # 'maxperiod': ['integer', '500m'] # maximum period size to report
+    # FastQC options
+    }
+outputs = {'per_sample_FASTQ': 'per_sample_FASTQ'}
+dflt_param_set = {
+    'Defaults': {
+        'reference-db': 'human_genome', 'bypass-trim': False, 'threads': 1,
+        'processes': 1, 'quality-scores': 'phred33', 'run-bmtagger': False,
+        'run-trf': False, 'run-fastqc-start': True, 'run-fastqc-end': True,
+        'store-temp-output': False, 'log-level': 'DEBUG', 'max-memory': '500m',
+        'trimmomatic': '"$TRIMMOMATIC_DIR"',
+        'trimmomatic-options': '"ILLUMINACLIP:$TRIMMOMATIC_DIR/adapters/'
+        'TruSeq3-PE-2.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 '
+        'MINLEN:36"'
+        # , 'match': 2, 'mismatch': 7,
+        # 'delta': 7, 'pm': 80, 'pi': 10, 'minscore': 50, 'maxperiod': '500m'
+        }
+}
+kneaddata_cmd = QiitaCommand(
+    "KneadData", "Sequence QC", kneaddata, req_params, opt_params,
+    outputs, dflt_param_set)
+plugin.register_command(kneaddata_cmd)
diff --git a/qp_shotgun/kneaddata/kneaddata.py b/qp_shotgun/kneaddata/kneaddata.py
@@ -0,0 +1,287 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2014--, The Qiita Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# -----------------------------------------------------------------------------
+
+from itertools import zip_longest
+from os.path import basename, join
+
+from qiita_client import ArtifactInfo
+
+from qiita_client.util import system_call, get_sample_names_by_run_prefix
+
+
+def make_read_pairs_per_sample(forward_seqs, reverse_seqs, map_file):
+    """Recovers read pairing information
+
+    Parameters
+    ----------
+    forward_seqs : list of str
+        The list of forward seqs filepaths
+    reverse_seqs : list of str
+        The list of reverse seqs filepaths
+    map_file : str
+        The path to the mapping file
+
+    Returns
+    -------
+    samples: list of tup
+        list of 3-tuples with run prefix, sample name, fwd read fp, rev read fp
+
+    Raises
+    ------
+    ValueError
+        If the rev is not an empty list and the same length than fwd seqs
+        The prefixes of the run_prefix don't match the file names
+
+    Notes
+    -----
+    At this stage it is required that if reverse sequences are present that all
+    samples have both a forward and a reverse sequence. However, the read
+    trimming step can sometimes eliminate all reverse reads, especially in low
+    coverage samples with poor overall reverse read quality.
+    """
+
+    # sort forward seqs
+    forward_seqs.sort()
+
+    # check that rev seqs are same len
+    if reverse_seqs:
+        if len(forward_seqs) != len(reverse_seqs):
+            raise ValueError('Your reverse and forward files are of different '
+                             'length. Forward: %s. Reverse: %s.' %
+                             (', '.join(forward_seqs),
+                              ', '.join(reverse_seqs)))
+        reverse_seqs.sort()
+
+    # get run prefixes
+    # These are prefixes that should match uniquely to forward reads
+    # sn_by_rp is dict of samples keyed by run prefixes
+    sn_by_rp = get_sample_names_by_run_prefix(map_file)
+
+    # make pairings
+    samples = []
+    used_prefixes = set()
+    for i, (fwd_fp, rev_fp) in enumerate(zip_longest(forward_seqs,
+                                                     reverse_seqs)):
+        # fwd_fp is the fwd read filepath
+        fwd_fn = basename(fwd_fp)
+
+        # iterate over run prefixes and make sure only one matches
+        run_prefix = None
+        for rp in sn_by_rp:
+            if fwd_fn.startswith(rp) and run_prefix is None:
+                run_prefix = rp
+            elif fwd_fn.startswith(rp) and run_prefix is not None:
+                raise ValueError('Multiple run prefixes match this fwd read: '
+                                 '%s' % fwd_fn)
+
+        # make sure that we got one matching run prefix:
+        if run_prefix is None:
+            raise ValueError('No run prefix matching this fwd read: %s'
+                             % fwd_fn)
+
+        if run_prefix in used_prefixes:
+            raise ValueError('This run prefix matches multiple fwd reads: '
+                             '%s' % run_prefix)
+
+        if rev_fp is None:
+            samples.append((run_prefix, sn_by_rp[run_prefix], fwd_fp, None))
+        else:
+            rev_fn = basename(rev_fp)
+            # if we have reverse reads, make sure the matching pair also
+            # matches the run prefix:
+            if not rev_fn.startswith(run_prefix):
+                raise ValueError('Reverse read does not match this run prefix.'
+                                 '\nRun prefix: %s\nForward read: %s\n'
+                                 'Reverse read: %s\n' %
+                                 (run_prefix, fwd_fn, rev_fn))
+
+            samples.append((run_prefix, sn_by_rp[run_prefix], fwd_fp,
+                            rev_fp))
+
+        used_prefixes.add(run_prefix)
+
+    return(samples)
+
+
+def format_kneaddata_params(parameters):
+
+    params = []
+
+    for param in sorted(parameters):
+        value = parameters[param]
+
+        if value is True:
+            params.append('--%s' % param)
+            continue
+        elif value is False:
+            continue
+        elif value:
+            params.append('--%s %s' % (param, value))
+            continue
+
+    param_string = ' '.join(params)
+
+    return(param_string)
+
+
+def generate_kneaddata_commands(forward_seqs, reverse_seqs, map_file,
+                                out_dir, parameters):
+    """Generates the KneadData commands
+
+    Parameters
+    ----------
+    forward_seqs : list of str
+        The list of forward seqs filepaths
+    reverse_seqs : list of str
+        The list of reverse seqs filepaths
+    map_file : str
+        The path to the mapping file
+    out_dir : str
+        The job output directory
+    parameters : dict
+        The command's parameters, keyed by parameter name
+
+    Returns
+    -------
+    cmds: list of str
+        The KneadData commands
+    samples: list of tup
+        list of 3-tuples with run prefix, sample name, fwd read fp, rev read fp
+
+    Raises
+    ------
+    ValueError
+        If the rev is not an empty list and the same length than fwd seqs
+        The prefixes of the run_prefix don't match the file names
+
+    Notes
+    -----
+    Currently this is requiring matched pairs in the make_read_pairs_per_sample
+    step but implicitly allowing empty reverse reads in the actual command
+    generation. This behavior may allow support of situations with empty
+    reverse reads in some samples, for example after trimming and QC.
+    """
+    # making sure the forward and reverse reads are in the same order
+
+    # we match filenames, samples, and run prefixes
+    samples = make_read_pairs_per_sample(forward_seqs, reverse_seqs, map_file)
+
+    cmds = []
+
+    param_string = format_kneaddata_params(parameters)
+    prefixes = []
+    for run_prefix, sample, f_fp, r_fp in samples:
+        prefixes.append(run_prefix)
+        if r_fp is None:
+            cmds.append('kneaddata --input "%s" --output "%s" '
+                        '--output-prefix "%s" %s' %
+                        (f_fp, join(out_dir, run_prefix),
+                         run_prefix, param_string))
+        else:
+            cmds.append('kneaddata --input "%s" --input "%s" --output "%s" '
+                        '--output-prefix "%s" %s' %
+                        (f_fp, r_fp, join(out_dir, run_prefix),
+                         run_prefix, param_string))
+
+    return cmds, samples
+
+
+def _run_commands(qclient, job_id, commands, msg):
+    for i, cmd in enumerate(commands):
+        qclient.update_job_step(job_id, msg % i)
+        std_out, std_err, return_value = system_call(cmd)
+        if return_value != 0:
+            error_msg = ("Error running KneadData:\nStd out: %s\nStd err: %s"
+                         "\n\nCommand run was:\n%s"
+                         % (std_out, std_err, cmd))
+            return False, error_msg
+
+    return True, ""
+
+
+def _per_sample_ainfo(out_dir, samples):
+    ainfo = []
+    for run_prefix, sample, f_fp, r_fp in samples:
+        sam_out_dir = join(out_dir, run_prefix)
+
+        if r_fp:
+            ainfo.extend([
+                ArtifactInfo('clean paired R1', 'per_sample_FASTQ',
+                             [(join(sam_out_dir, '%s_paired_1.fastq' %
+                               run_prefix), 'per_sample_FASTQ')]),
+                ArtifactInfo('clean paired R2', 'per_sample_FASTQ',
+                             [(join(sam_out_dir, '%s_paired_2.fastq' %
+                               run_prefix), 'per_sample_FASTQ')]),
+                ArtifactInfo('clean unpaired R1', 'per_sample_FASTQ',
+                             [(join(sam_out_dir, '%s_unmatched_1.fastq' %
+                               run_prefix), 'per_sample_FASTQ')]),
+                ArtifactInfo('clean unpaired R2', 'per_sample_FASTQ',
+                             [(join(sam_out_dir, '%s_unmatched_2.fastq' %
+                               run_prefix), 'per_sample_FASTQ')])])
+        else:
+            ainfo.extend([
+                ArtifactInfo('cleaned reads', 'per_sample_FASTQ',
+                             [(join(sam_out_dir, '%s.fastq' %
+                               run_prefix), 'per_sample_FASTQ')])])
+
+    return ainfo
+
+
+def kneaddata(qclient, job_id, parameters, out_dir):
+    """Run kneaddata with the given parameters
+
+    Parameters
+    ----------
+    qclient : tgp.qiita_client.QiitaClient
+        The Qiita server client
+    job_id : str
+        The job id
+    parameters : dict
+        The parameter values to run split libraries
+    out_dir : str
+        The path to the job's output directory
+
+    Returns
+    -------
+    bool, list, str
+        The results of the job
+    """
+    # Step 1 get the rest of the information need to run kneaddata
+    qclient.update_job_step(job_id, "Step 1 of 3: Collecting information")
+    artifact_id = parameters['input']
+
+    # removing input from parameters so it's not part of the final command
+    del parameters['input']
+
+    # Get the artifact filepath information
+    artifact_info = qclient.get("/qiita_db/artifacts/%s/" % artifact_id)
+    fps = artifact_info['files']
+
+    # Get the artifact metadata
+    prep_info = qclient.get('/qiita_db/prep_template/%s/'
+                            % artifact_info['prep_information'][0])
+    qiime_map = prep_info['qiime-map']
+
+    # Step 2 generating command kneaddata
+    qclient.update_job_step(job_id, "Step 2 of 5: Generating"
+                                    " KneadData command")
+    rs = fps['raw_reverse_seqs'] if 'raw_reverse_seqs' in fps else []
+    commands, samples = generate_kneaddata_commands(fps['raw_forward_seqs'],
+                                                    rs, qiime_map, out_dir,
+                                                    parameters)
+
+    # Step 3 execute kneaddata
+    msg = "Step 3 of 5: Executing KneadData job (%d/{0})".format(len(commands))
+    success, msg = _run_commands(qclient, job_id, commands, msg)
+    if not success:
+        return False, None, msg
+
+    # Step 4 generating artifacts
+    ainfo = _per_sample_ainfo(out_dir, samples)
+
+    return True, ainfo, ""
diff --git a/qp_shotgun/kneaddata/tests/__init__.py b/qp_shotgun/kneaddata/tests/__init__.py
@@ -0,0 +1,7 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2014--, The Qiita Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# -----------------------------------------------------------------------------