ENH: Add interactive quality plots to summarize visualization

This adds interactive quality score box plots in a new tab in the visualization, and is intended as a replacement for q2-dada2's plot-qualities visualizer. This commit contains code that was written by @jakereps and @thermokarst.
qiime2 · Apr 19, 2017 · 44344ac · 44344ac
1 parent 99e2b2e
commit 44344ac
Show file tree

Hide file tree

Showing 21 changed files with 880 additions and 79 deletions.
diff --git a/.gitignore b/.gitignore
@@ -69,3 +69,4 @@ target/
 *.qzv
 
 .DS_Store
+node_modules
diff --git a/.travis.yml b/.travis.yml
@@ -1,3 +1,4 @@
+dist: trusty
 sudo: false
 language: python
 env:

diff --git a/q2_demux/__init__.py b/q2_demux/__init__.py
@@ -6,7 +6,8 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-from ._demux import emp_single, emp_paired, summarize
+from ._demux import emp_single, emp_paired
+from ._summarize import summarize
 from ._version import get_versions
 
 

diff --git a/q2_demux/_demux.py b/q2_demux/_demux.py
@@ -6,30 +6,24 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 
-import os.path
 import gzip
 import yaml
 import itertools
 import collections
 import collections.abc
-import pkg_resources
 import random
 import resource
 
 import skbio
-import pandas as pd
-import seaborn as sns
 import psutil
 
 import qiime2
 from q2_types.per_sample_sequences import (
     SingleLanePerSampleSingleEndFastqDirFmt,
     SingleLanePerSamplePairedEndFastqDirFmt,
-    FastqManifestFormat, YamlFormat, FastqGzFormat)
-import q2templates
+    FastqManifestFormat, YamlFormat)
 
 
-TEMPLATES = pkg_resources.resource_filename('q2_demux', 'assets')
 FastqHeader = collections.namedtuple('FastqHeader', ['id', 'description'])
 
 
@@ -219,51 +213,6 @@ def __iter__(self):
             yield barcode_record, forward_record, reverse_record
 
 
-def summarize(output_dir: str, data: SingleLanePerSampleSingleEndFastqDirFmt) \
-        -> None:
-    per_sample_fastqs = list(data.sequences.iter_views(FastqGzFormat))
-    per_sample_fastq_counts = {}
-    for relpath, view in per_sample_fastqs:
-        seqs = _read_fastq_seqs(str(view))
-        count = 0
-        for seq in seqs:
-            count += 1
-        sample_name = relpath.name.split('_', 1)[0]
-        per_sample_fastq_counts[sample_name] = count
-
-    result = pd.Series(per_sample_fastq_counts)
-    result.name = 'Sequence count'
-    result.index.name = 'Sample name'
-    result.sort_values(inplace=True, ascending=False)
-    result.to_csv(os.path.join(output_dir, 'per-sample-fastq-counts.csv'),
-                  header=True, index=True)
-
-    show_plot = len(per_sample_fastqs) > 1
-    if show_plot:
-        ax = sns.distplot(result, kde=False)
-        ax.set_xlabel('Number of sequences')
-        ax.set_ylabel('Frequency')
-        fig = ax.get_figure()
-        fig.savefig(os.path.join(output_dir, 'demultiplex-summary.png'))
-        fig.savefig(os.path.join(output_dir, 'demultiplex-summary.pdf'))
-
-    html = result.to_frame().to_html(classes='table table-striped table-hover')
-    html = html.replace('border="1"', 'border="0"')
-    index = os.path.join(TEMPLATES, 'index.html')
-    context = {
-        'result_data': {
-            'min': result.min(),
-            'median': result.median(),
-            'mean': result.mean(),
-            'max': result.max(),
-            'sum': result.sum()
-        },
-        'result': html,
-        'show_plot': show_plot
-    }
-    q2templates.render(index, output_dir, context=context)
-
-
 def _make_barcode_map(barcodes, rev_comp_mapping_barcodes):
     barcode_map = {}
     barcode_len = None

diff --git a/q2_demux/_summarize/__init__.py b/q2_demux/_summarize/__init__.py
@@ -0,0 +1,11 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2017, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+# TODO: Remove _PlotQualView once QIIME 2 #220 completed
+from ._visualizer import summarize, _PlotQualView
+
+__all__ = ['summarize', '_PlotQualView']
diff --git a/q2_demux/_summarize/_visualizer.py b/q2_demux/_summarize/_visualizer.py
@@ -0,0 +1,196 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2017, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import collections
+import os
+import pkg_resources
+import shutil
+import random
+
+import pandas as pd
+import seaborn as sns
+import numpy as np
+
+from q2_demux._demux import _read_fastq_seqs
+import q2templates
+
+TEMPLATES = pkg_resources.resource_filename('q2_demux', '_summarize')
+
+
+def _decode_qual_to_phred33(qual_str):
+    # this function is adapted from scikit-bio
+    qual = np.fromstring(qual_str, dtype=np.uint8) - 33
+    return qual
+
+
+# TODO: Remove _PlotQualView once QIIME 2 #220 completed
+class _PlotQualView:
+    """
+    A very simple pass-through view which is made up of a single-end or
+    paired-end directory format with a bool indicating if single or paired.
+    """
+    def __init__(self, directory_format, paired):
+        self.directory_format = directory_format
+        self.paired = paired
+
+
+def _link_sample_n_to_file(files, counts, subsample_ns):
+    results = collections.defaultdict(list)
+    for num in subsample_ns:
+        total = 0
+        for file in files:
+            sample_name = os.path.basename(file).split('_', 1)[0]
+            total += counts[sample_name]
+            if num < total:
+                idx = counts[sample_name] - (total - num)
+                results[file].append(idx)
+                break
+    return results
+
+
+def _subsample_paired(fastq_map):
+    qual_sample = collections.defaultdict(list)
+    for fwd, rev, index in fastq_map:
+        file_pair = zip(_read_fastq_seqs(fwd), _read_fastq_seqs(rev))
+        for i, (fseq, rseq) in enumerate(file_pair):
+            if i == index[0]:
+                qual_sample['forward'].append(_decode_qual_to_phred33(fseq[3]))
+                qual_sample['reverse'].append(_decode_qual_to_phred33(rseq[3]))
+                index.pop(0)
+                if len(index) == 0:
+                    break
+
+    return qual_sample
+
+
+def _subsample_single(fastq_map):
+    qual_sample = collections.defaultdict(list)
+    for file, index in fastq_map:
+        for i, seq in enumerate(_read_fastq_seqs(file)):
+            if i == index[0]:
+                qual_sample['forward'].append(_decode_qual_to_phred33(seq[3]))
+                index.pop(0)
+                if len(index) == 0:
+                    break
+    return qual_sample
+
+
+def _compute_stats_of_df(df):
+    df_stats = df.describe(
+        percentiles=[0.02, 0.09, 0.25, 0.5, 0.75, 0.91, 0.98])
+    drop_cols = df_stats.index.isin(['count', 'std', 'mean', 'min', 'max'])
+    df_stats = df_stats[~drop_cols]
+    return df_stats
+
+
+def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None:
+    paired = data.paired
+    data = data.directory_format
+    dangers = []
+    warnings = []
+
+    manifest = pd.read_csv(os.path.join(str(data), data.manifest.pathspec),
+                           header=0, comment='#')
+    manifest.filename = manifest.filename.apply(
+        lambda x: os.path.join(str(data), x))
+
+    fwd = manifest[manifest.direction == 'forward'].filename.tolist()
+    rev = manifest[manifest.direction == 'reverse'].filename.tolist()
+
+    per_sample_fastq_counts = {}
+    reads = rev if not fwd and rev else fwd
+    for file in reads:
+        count = 0
+        for seq in _read_fastq_seqs(file):
+            count += 1
+        sample_name = os.path.basename(file).split('_', 1)[0]
+        per_sample_fastq_counts[sample_name] = count
+
+    result = pd.Series(per_sample_fastq_counts)
+    result.name = 'Sequence count'
+    result.index.name = 'Sample name'
+    result.sort_values(inplace=True, ascending=False)
+    result.to_csv(os.path.join(output_dir, 'per-sample-fastq-counts.csv'),
+                  header=True, index=True)
+    sequence_count = result.sum()
+
+    if n > sequence_count:
+        n = sequence_count
+        warnings.append('A subsample value was provided that is greater than '
+                        'the amount of sequences across all samples. The plot '
+                        'was generated using all available sequences.')
+
+    subsample_ns = sorted(random.sample(range(sequence_count), n))
+    link = _link_sample_n_to_file(reads, per_sample_fastq_counts, subsample_ns)
+    if paired:
+        sample_map = [(file, rev[fwd.index(file)], link[file])
+                      for file in link]
+        quality_scores = _subsample_paired(sample_map)
+    else:
+        sample_map = [(file, link[file]) for file in link]
+        quality_scores = _subsample_single(sample_map)
+
+    forward_scores = pd.DataFrame(quality_scores['forward'])
+    forward_stats = _compute_stats_of_df(forward_scores)
+
+    if (forward_stats.loc['50%'] > 45).any():
+        dangers.append('Some of the PHRED quality values are out of range. '
+                       'This is likely because an incorrect PHRED offset '
+                       'was chosen on import of your raw data. You can learn '
+                       'how to choose your PHRED offset during import in the '
+                       'importing tutorial.')
+    if paired:
+        reverse_scores = pd.DataFrame(quality_scores['reverse'])
+        reverse_stats = _compute_stats_of_df(reverse_scores)
+
+    show_plot = len(fwd) > 1
+    if show_plot:
+        ax = sns.distplot(result, kde=False)
+        ax.set_xlabel('Number of sequences')
+        ax.set_ylabel('Frequency')
+        fig = ax.get_figure()
+        fig.savefig(os.path.join(output_dir, 'demultiplex-summary.png'))
+        fig.savefig(os.path.join(output_dir, 'demultiplex-summary.pdf'))
+
+    html = result.to_frame().to_html(classes='table table-striped table-hover')
+    html = html.replace('border="1"', 'border="0"')
+    index = os.path.join(TEMPLATES, 'assets', 'index.html')
+    overview_template = os.path.join(TEMPLATES, 'assets', 'overview.html')
+    quality_template = os.path.join(TEMPLATES, 'assets', 'quality-plot.html')
+    context = {
+        'result_data': {
+            'min': result.min(),
+            'median': result.median(),
+            'mean': result.mean(),
+            'max': result.max(),
+            'sum': sequence_count
+        },
+        'result': html,
+        'show_plot': show_plot,
+        'paired': paired,
+        'tabs': [{'title': 'Overview',
+                  'url': 'overview.html'},
+                 {'title': 'Interactive Quality Plot',
+                  'url': 'quality-plot.html'}],
+        'dangers': dangers,
+        'warnings': warnings,
+        'sample_n': n
+    }
+    templates = [index, overview_template, quality_template]
+    q2templates.render(templates, output_dir, context=context)
+
+    shutil.copytree(os.path.join(TEMPLATES, 'assets', 'app'),
+                    os.path.join(output_dir, 'app'))
+
+    with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
+        fh.write("app.init(")
+        forward_stats.to_json(fh)
+        if paired:
+            fh.write(',')
+            reverse_stats.to_json(fh)
+        fh.write(');')
diff --git a/q2_demux/_summarize/assets/app/bundle.js b/q2_demux/_summarize/assets/app/bundle.js
diff --git a/q2_demux/_summarize/assets/app/vendor.bundle.js b/q2_demux/_summarize/assets/app/vendor.bundle.js
diff --git a/q2_demux/_summarize/assets/index.html b/q2_demux/_summarize/assets/index.html
@@ -0,0 +1 @@
+{% extends "tab-parent.html" %}
diff --git a/q2_demux/assets/index.html → q2_demux/_summarize/assets/overview.html b/q2_demux/assets/index.html → q2_demux/_summarize/assets/overview.html
@@ -1,4 +1,4 @@
-{% extends "base.html" %}
+{% extends "tab-child.html" %}
 
 {% block content %}
 <div class="row">