Skip to content

Commit

Permalink
ENH: Add interactive quality plots to summarize visualization
Browse files Browse the repository at this point in the history
This adds interactive quality score box plots in a new tab in the visualization, and is intended as a replacement for q2-dada2's plot-qualities visualizer. This commit contains code that was written by @jakereps and @thermokarst.
  • Loading branch information
jakereps authored and gregcaporaso committed Apr 19, 2017
1 parent 99e2b2e commit 44344ac
Show file tree
Hide file tree
Showing 21 changed files with 880 additions and 79 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,4 @@ target/
*.qzv

.DS_Store
node_modules
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
dist: trusty
sudo: false
language: python
env:
Expand Down
3 changes: 2 additions & 1 deletion q2_demux/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from ._demux import emp_single, emp_paired, summarize
from ._demux import emp_single, emp_paired
from ._summarize import summarize
from ._version import get_versions


Expand Down
53 changes: 1 addition & 52 deletions q2_demux/_demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,24 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os.path
import gzip
import yaml
import itertools
import collections
import collections.abc
import pkg_resources
import random
import resource

import skbio
import pandas as pd
import seaborn as sns
import psutil

import qiime2
from q2_types.per_sample_sequences import (
SingleLanePerSampleSingleEndFastqDirFmt,
SingleLanePerSamplePairedEndFastqDirFmt,
FastqManifestFormat, YamlFormat, FastqGzFormat)
import q2templates
FastqManifestFormat, YamlFormat)


TEMPLATES = pkg_resources.resource_filename('q2_demux', 'assets')
FastqHeader = collections.namedtuple('FastqHeader', ['id', 'description'])


Expand Down Expand Up @@ -219,51 +213,6 @@ def __iter__(self):
yield barcode_record, forward_record, reverse_record


def summarize(output_dir: str, data: SingleLanePerSampleSingleEndFastqDirFmt) \
-> None:
per_sample_fastqs = list(data.sequences.iter_views(FastqGzFormat))
per_sample_fastq_counts = {}
for relpath, view in per_sample_fastqs:
seqs = _read_fastq_seqs(str(view))
count = 0
for seq in seqs:
count += 1
sample_name = relpath.name.split('_', 1)[0]
per_sample_fastq_counts[sample_name] = count

result = pd.Series(per_sample_fastq_counts)
result.name = 'Sequence count'
result.index.name = 'Sample name'
result.sort_values(inplace=True, ascending=False)
result.to_csv(os.path.join(output_dir, 'per-sample-fastq-counts.csv'),
header=True, index=True)

show_plot = len(per_sample_fastqs) > 1
if show_plot:
ax = sns.distplot(result, kde=False)
ax.set_xlabel('Number of sequences')
ax.set_ylabel('Frequency')
fig = ax.get_figure()
fig.savefig(os.path.join(output_dir, 'demultiplex-summary.png'))
fig.savefig(os.path.join(output_dir, 'demultiplex-summary.pdf'))

html = result.to_frame().to_html(classes='table table-striped table-hover')
html = html.replace('border="1"', 'border="0"')
index = os.path.join(TEMPLATES, 'index.html')
context = {
'result_data': {
'min': result.min(),
'median': result.median(),
'mean': result.mean(),
'max': result.max(),
'sum': result.sum()
},
'result': html,
'show_plot': show_plot
}
q2templates.render(index, output_dir, context=context)


def _make_barcode_map(barcodes, rev_comp_mapping_barcodes):
barcode_map = {}
barcode_len = None
Expand Down
11 changes: 11 additions & 0 deletions q2_demux/_summarize/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2017, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
# TODO: Remove _PlotQualView once QIIME 2 #220 completed
from ._visualizer import summarize, _PlotQualView

__all__ = ['summarize', '_PlotQualView']
196 changes: 196 additions & 0 deletions q2_demux/_summarize/_visualizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2017, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import collections
import os
import pkg_resources
import shutil
import random

import pandas as pd
import seaborn as sns
import numpy as np

from q2_demux._demux import _read_fastq_seqs
import q2templates

TEMPLATES = pkg_resources.resource_filename('q2_demux', '_summarize')


def _decode_qual_to_phred33(qual_str):
# this function is adapted from scikit-bio
qual = np.fromstring(qual_str, dtype=np.uint8) - 33
return qual


# TODO: Remove _PlotQualView once QIIME 2 #220 completed
class _PlotQualView:
"""
A very simple pass-through view which is made up of a single-end or
paired-end directory format with a bool indicating if single or paired.
"""
def __init__(self, directory_format, paired):
self.directory_format = directory_format
self.paired = paired


def _link_sample_n_to_file(files, counts, subsample_ns):
results = collections.defaultdict(list)
for num in subsample_ns:
total = 0
for file in files:
sample_name = os.path.basename(file).split('_', 1)[0]
total += counts[sample_name]
if num < total:
idx = counts[sample_name] - (total - num)
results[file].append(idx)
break
return results


def _subsample_paired(fastq_map):
qual_sample = collections.defaultdict(list)
for fwd, rev, index in fastq_map:
file_pair = zip(_read_fastq_seqs(fwd), _read_fastq_seqs(rev))
for i, (fseq, rseq) in enumerate(file_pair):
if i == index[0]:
qual_sample['forward'].append(_decode_qual_to_phred33(fseq[3]))
qual_sample['reverse'].append(_decode_qual_to_phred33(rseq[3]))
index.pop(0)
if len(index) == 0:
break

return qual_sample


def _subsample_single(fastq_map):
qual_sample = collections.defaultdict(list)
for file, index in fastq_map:
for i, seq in enumerate(_read_fastq_seqs(file)):
if i == index[0]:
qual_sample['forward'].append(_decode_qual_to_phred33(seq[3]))
index.pop(0)
if len(index) == 0:
break
return qual_sample


def _compute_stats_of_df(df):
df_stats = df.describe(
percentiles=[0.02, 0.09, 0.25, 0.5, 0.75, 0.91, 0.98])
drop_cols = df_stats.index.isin(['count', 'std', 'mean', 'min', 'max'])
df_stats = df_stats[~drop_cols]
return df_stats


def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None:
paired = data.paired
data = data.directory_format
dangers = []
warnings = []

manifest = pd.read_csv(os.path.join(str(data), data.manifest.pathspec),
header=0, comment='#')
manifest.filename = manifest.filename.apply(
lambda x: os.path.join(str(data), x))

fwd = manifest[manifest.direction == 'forward'].filename.tolist()
rev = manifest[manifest.direction == 'reverse'].filename.tolist()

per_sample_fastq_counts = {}
reads = rev if not fwd and rev else fwd
for file in reads:
count = 0
for seq in _read_fastq_seqs(file):
count += 1
sample_name = os.path.basename(file).split('_', 1)[0]
per_sample_fastq_counts[sample_name] = count

result = pd.Series(per_sample_fastq_counts)
result.name = 'Sequence count'
result.index.name = 'Sample name'
result.sort_values(inplace=True, ascending=False)
result.to_csv(os.path.join(output_dir, 'per-sample-fastq-counts.csv'),
header=True, index=True)
sequence_count = result.sum()

if n > sequence_count:
n = sequence_count
warnings.append('A subsample value was provided that is greater than '
'the amount of sequences across all samples. The plot '
'was generated using all available sequences.')

subsample_ns = sorted(random.sample(range(sequence_count), n))
link = _link_sample_n_to_file(reads, per_sample_fastq_counts, subsample_ns)
if paired:
sample_map = [(file, rev[fwd.index(file)], link[file])
for file in link]
quality_scores = _subsample_paired(sample_map)
else:
sample_map = [(file, link[file]) for file in link]
quality_scores = _subsample_single(sample_map)

forward_scores = pd.DataFrame(quality_scores['forward'])
forward_stats = _compute_stats_of_df(forward_scores)

if (forward_stats.loc['50%'] > 45).any():
dangers.append('Some of the PHRED quality values are out of range. '
'This is likely because an incorrect PHRED offset '
'was chosen on import of your raw data. You can learn '
'how to choose your PHRED offset during import in the '
'importing tutorial.')
if paired:
reverse_scores = pd.DataFrame(quality_scores['reverse'])
reverse_stats = _compute_stats_of_df(reverse_scores)

show_plot = len(fwd) > 1
if show_plot:
ax = sns.distplot(result, kde=False)
ax.set_xlabel('Number of sequences')
ax.set_ylabel('Frequency')
fig = ax.get_figure()
fig.savefig(os.path.join(output_dir, 'demultiplex-summary.png'))
fig.savefig(os.path.join(output_dir, 'demultiplex-summary.pdf'))

html = result.to_frame().to_html(classes='table table-striped table-hover')
html = html.replace('border="1"', 'border="0"')
index = os.path.join(TEMPLATES, 'assets', 'index.html')
overview_template = os.path.join(TEMPLATES, 'assets', 'overview.html')
quality_template = os.path.join(TEMPLATES, 'assets', 'quality-plot.html')
context = {
'result_data': {
'min': result.min(),
'median': result.median(),
'mean': result.mean(),
'max': result.max(),
'sum': sequence_count
},
'result': html,
'show_plot': show_plot,
'paired': paired,
'tabs': [{'title': 'Overview',
'url': 'overview.html'},
{'title': 'Interactive Quality Plot',
'url': 'quality-plot.html'}],
'dangers': dangers,
'warnings': warnings,
'sample_n': n
}
templates = [index, overview_template, quality_template]
q2templates.render(templates, output_dir, context=context)

shutil.copytree(os.path.join(TEMPLATES, 'assets', 'app'),
os.path.join(output_dir, 'app'))

with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
fh.write("app.init(")
forward_stats.to_json(fh)
if paired:
fh.write(',')
reverse_stats.to_json(fh)
fh.write(');')
1 change: 1 addition & 0 deletions q2_demux/_summarize/assets/app/bundle.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions q2_demux/_summarize/assets/app/vendor.bundle.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions q2_demux/_summarize/assets/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{% extends "tab-parent.html" %}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% extends "base.html" %}
{% extends "tab-child.html" %}

{% block content %}
<div class="row">
Expand Down

0 comments on commit 44344ac

Please sign in to comment.