-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ENH: Add interactive quality plots to
summarize
visualization
This adds interactive quality score box plots in a new tab in the visualization, and is intended as a replacement for q2-dada2's plot-qualities visualizer. This commit contains code that was written by @jakereps and @thermokarst.
- Loading branch information
1 parent
99e2b2e
commit 44344ac
Showing
21 changed files
with
880 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -69,3 +69,4 @@ target/ | |
*.qzv | ||
|
||
.DS_Store | ||
node_modules |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
dist: trusty | ||
sudo: false | ||
language: python | ||
env: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2016-2017, QIIME 2 development team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file LICENSE, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
# TODO: Remove _PlotQualView once QIIME 2 #220 completed | ||
from ._visualizer import summarize, _PlotQualView | ||
|
||
__all__ = ['summarize', '_PlotQualView'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2016-2017, QIIME 2 development team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file LICENSE, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
|
||
import collections | ||
import os | ||
import pkg_resources | ||
import shutil | ||
import random | ||
|
||
import pandas as pd | ||
import seaborn as sns | ||
import numpy as np | ||
|
||
from q2_demux._demux import _read_fastq_seqs | ||
import q2templates | ||
|
||
TEMPLATES = pkg_resources.resource_filename('q2_demux', '_summarize') | ||
|
||
|
||
def _decode_qual_to_phred33(qual_str): | ||
# this function is adapted from scikit-bio | ||
qual = np.fromstring(qual_str, dtype=np.uint8) - 33 | ||
return qual | ||
|
||
|
||
# TODO: Remove _PlotQualView once QIIME 2 #220 completed | ||
class _PlotQualView: | ||
""" | ||
A very simple pass-through view which is made up of a single-end or | ||
paired-end directory format with a bool indicating if single or paired. | ||
""" | ||
def __init__(self, directory_format, paired): | ||
self.directory_format = directory_format | ||
self.paired = paired | ||
|
||
|
||
def _link_sample_n_to_file(files, counts, subsample_ns): | ||
results = collections.defaultdict(list) | ||
for num in subsample_ns: | ||
total = 0 | ||
for file in files: | ||
sample_name = os.path.basename(file).split('_', 1)[0] | ||
total += counts[sample_name] | ||
if num < total: | ||
idx = counts[sample_name] - (total - num) | ||
results[file].append(idx) | ||
break | ||
return results | ||
|
||
|
||
def _subsample_paired(fastq_map): | ||
qual_sample = collections.defaultdict(list) | ||
for fwd, rev, index in fastq_map: | ||
file_pair = zip(_read_fastq_seqs(fwd), _read_fastq_seqs(rev)) | ||
for i, (fseq, rseq) in enumerate(file_pair): | ||
if i == index[0]: | ||
qual_sample['forward'].append(_decode_qual_to_phred33(fseq[3])) | ||
qual_sample['reverse'].append(_decode_qual_to_phred33(rseq[3])) | ||
index.pop(0) | ||
if len(index) == 0: | ||
break | ||
|
||
return qual_sample | ||
|
||
|
||
def _subsample_single(fastq_map): | ||
qual_sample = collections.defaultdict(list) | ||
for file, index in fastq_map: | ||
for i, seq in enumerate(_read_fastq_seqs(file)): | ||
if i == index[0]: | ||
qual_sample['forward'].append(_decode_qual_to_phred33(seq[3])) | ||
index.pop(0) | ||
if len(index) == 0: | ||
break | ||
return qual_sample | ||
|
||
|
||
def _compute_stats_of_df(df): | ||
df_stats = df.describe( | ||
percentiles=[0.02, 0.09, 0.25, 0.5, 0.75, 0.91, 0.98]) | ||
drop_cols = df_stats.index.isin(['count', 'std', 'mean', 'min', 'max']) | ||
df_stats = df_stats[~drop_cols] | ||
return df_stats | ||
|
||
|
||
def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None: | ||
paired = data.paired | ||
data = data.directory_format | ||
dangers = [] | ||
warnings = [] | ||
|
||
manifest = pd.read_csv(os.path.join(str(data), data.manifest.pathspec), | ||
header=0, comment='#') | ||
manifest.filename = manifest.filename.apply( | ||
lambda x: os.path.join(str(data), x)) | ||
|
||
fwd = manifest[manifest.direction == 'forward'].filename.tolist() | ||
rev = manifest[manifest.direction == 'reverse'].filename.tolist() | ||
|
||
per_sample_fastq_counts = {} | ||
reads = rev if not fwd and rev else fwd | ||
for file in reads: | ||
count = 0 | ||
for seq in _read_fastq_seqs(file): | ||
count += 1 | ||
sample_name = os.path.basename(file).split('_', 1)[0] | ||
per_sample_fastq_counts[sample_name] = count | ||
|
||
result = pd.Series(per_sample_fastq_counts) | ||
result.name = 'Sequence count' | ||
result.index.name = 'Sample name' | ||
result.sort_values(inplace=True, ascending=False) | ||
result.to_csv(os.path.join(output_dir, 'per-sample-fastq-counts.csv'), | ||
header=True, index=True) | ||
sequence_count = result.sum() | ||
|
||
if n > sequence_count: | ||
n = sequence_count | ||
warnings.append('A subsample value was provided that is greater than ' | ||
'the amount of sequences across all samples. The plot ' | ||
'was generated using all available sequences.') | ||
|
||
subsample_ns = sorted(random.sample(range(sequence_count), n)) | ||
link = _link_sample_n_to_file(reads, per_sample_fastq_counts, subsample_ns) | ||
if paired: | ||
sample_map = [(file, rev[fwd.index(file)], link[file]) | ||
for file in link] | ||
quality_scores = _subsample_paired(sample_map) | ||
else: | ||
sample_map = [(file, link[file]) for file in link] | ||
quality_scores = _subsample_single(sample_map) | ||
|
||
forward_scores = pd.DataFrame(quality_scores['forward']) | ||
forward_stats = _compute_stats_of_df(forward_scores) | ||
|
||
if (forward_stats.loc['50%'] > 45).any(): | ||
dangers.append('Some of the PHRED quality values are out of range. ' | ||
'This is likely because an incorrect PHRED offset ' | ||
'was chosen on import of your raw data. You can learn ' | ||
'how to choose your PHRED offset during import in the ' | ||
'importing tutorial.') | ||
if paired: | ||
reverse_scores = pd.DataFrame(quality_scores['reverse']) | ||
reverse_stats = _compute_stats_of_df(reverse_scores) | ||
|
||
show_plot = len(fwd) > 1 | ||
if show_plot: | ||
ax = sns.distplot(result, kde=False) | ||
ax.set_xlabel('Number of sequences') | ||
ax.set_ylabel('Frequency') | ||
fig = ax.get_figure() | ||
fig.savefig(os.path.join(output_dir, 'demultiplex-summary.png')) | ||
fig.savefig(os.path.join(output_dir, 'demultiplex-summary.pdf')) | ||
|
||
html = result.to_frame().to_html(classes='table table-striped table-hover') | ||
html = html.replace('border="1"', 'border="0"') | ||
index = os.path.join(TEMPLATES, 'assets', 'index.html') | ||
overview_template = os.path.join(TEMPLATES, 'assets', 'overview.html') | ||
quality_template = os.path.join(TEMPLATES, 'assets', 'quality-plot.html') | ||
context = { | ||
'result_data': { | ||
'min': result.min(), | ||
'median': result.median(), | ||
'mean': result.mean(), | ||
'max': result.max(), | ||
'sum': sequence_count | ||
}, | ||
'result': html, | ||
'show_plot': show_plot, | ||
'paired': paired, | ||
'tabs': [{'title': 'Overview', | ||
'url': 'overview.html'}, | ||
{'title': 'Interactive Quality Plot', | ||
'url': 'quality-plot.html'}], | ||
'dangers': dangers, | ||
'warnings': warnings, | ||
'sample_n': n | ||
} | ||
templates = [index, overview_template, quality_template] | ||
q2templates.render(templates, output_dir, context=context) | ||
|
||
shutil.copytree(os.path.join(TEMPLATES, 'assets', 'app'), | ||
os.path.join(output_dir, 'app')) | ||
|
||
with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh: | ||
fh.write("app.init(") | ||
forward_stats.to_json(fh) | ||
if paired: | ||
fh.write(',') | ||
reverse_stats.to_json(fh) | ||
fh.write(');') |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{% extends "tab-parent.html" %} |
2 changes: 1 addition & 1 deletion
2
q2_demux/assets/index.html → q2_demux/_summarize/assets/overview.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
{% extends "base.html" %} | ||
{% extends "tab-child.html" %} | ||
|
||
{% block content %} | ||
<div class="row"> | ||
|
Oops, something went wrong.