Skip to content

Commit

Permalink
ENH: Summarize length warning instead of error + truncation
Browse files Browse the repository at this point in the history
Pair programmed with @Kleptobismol and @thermokarst.
Fixes #47
  • Loading branch information
maxvonhippel authored and gregcaporaso committed Jun 13, 2017
1 parent f10368d commit d59be81
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 70 deletions.
43 changes: 14 additions & 29 deletions q2_demux/_summarize/_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pkg_resources
import shutil
import random
import json

import pandas as pd
import seaborn as sns
Expand Down Expand Up @@ -55,50 +56,39 @@ def _link_sample_n_to_file(files, counts, subsample_ns):

def _subsample_paired(fastq_map):
qual_sample = collections.defaultdict(list)
seq_len = None
min_seq_len = {'forward': float('inf'), 'reverse': float('inf')}
for fwd, rev, index in fastq_map:
file_pair = zip(_read_fastq_seqs(fwd), _read_fastq_seqs(rev))
for i, (fseq, rseq) in enumerate(file_pair):
if seq_len is None:
seq_len = len(fseq[1])
if seq_len != len(fseq[1]):
raise ValueError(inconsistent_length_template
% (seq_len, len(fseq[1])))
if seq_len != len(rseq[1]):
raise ValueError(inconsistent_length_template
% (seq_len, len(rseq[1])))
min_seq_len['forward'] = min(min_seq_len['forward'], len(fseq[1]))
min_seq_len['reverse'] = min(min_seq_len['reverse'], len(rseq[1]))
if i == index[0]:
qual_sample['forward'].append(_decode_qual_to_phred33(fseq[3]))
qual_sample['reverse'].append(_decode_qual_to_phred33(rseq[3]))
index.pop(0)
if len(index) == 0:
break

return qual_sample
return qual_sample, min_seq_len


def _subsample_single(fastq_map):
qual_sample = collections.defaultdict(list)
seq_len = None
min_seq_len = {'forward': float('inf'), 'reverse': None}
for file, index in fastq_map:
for i, seq in enumerate(_read_fastq_seqs(file)):
if seq_len is None:
seq_len = len(seq[1])
if seq_len != len(seq[1]):
raise ValueError(inconsistent_length_template
% (seq_len, len(seq[1])))
min_seq_len['forward'] = min(min_seq_len['forward'], len(seq[1]))
if i == index[0]:
qual_sample['forward'].append(_decode_qual_to_phred33(seq[3]))
index.pop(0)
if len(index) == 0:
break
return qual_sample
return qual_sample, min_seq_len


def _compute_stats_of_df(df):
df_stats = df.describe(
percentiles=[0.02, 0.09, 0.25, 0.5, 0.75, 0.91, 0.98])
drop_cols = df_stats.index.isin(['count', 'std', 'mean', 'min', 'max'])
drop_cols = df_stats.index.isin(['std', 'mean', 'min', 'max'])
df_stats = df_stats[~drop_cols]
return df_stats

Expand Down Expand Up @@ -145,10 +135,10 @@ def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None:
if paired:
sample_map = [(file, rev[fwd.index(file)], link[file])
for file in link]
quality_scores = _subsample_paired(sample_map)
quality_scores, min_seq_len = _subsample_paired(sample_map)
else:
sample_map = [(file, link[file]) for file in link]
quality_scores = _subsample_single(sample_map)
quality_scores, min_seq_len = _subsample_single(sample_map)

forward_scores = pd.DataFrame(quality_scores['forward'])
forward_stats = _compute_stats_of_df(forward_scores)
Expand Down Expand Up @@ -194,7 +184,6 @@ def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None:
'url': 'quality-plot.html'}],
'dangers': dangers,
'warnings': warnings,
'sample_n': n
}
templates = [index, overview_template, quality_template]
q2templates.render(templates, output_dir, context=context)
Expand All @@ -204,15 +193,11 @@ def summarize(output_dir: str, data: _PlotQualView, n: int=10000) -> None:

with open(os.path.join(output_dir, 'data.jsonp'), 'w') as fh:
fh.write("app.init(")
json.dump({'n': int(n), 'totalSeqCount': int(sequence_count),
'minSeqLen': min_seq_len}, fh)
fh.write(',')
forward_stats.to_json(fh)
if paired:
fh.write(',')
reverse_stats.to_json(fh)
fh.write(');')


inconsistent_length_template = ('Observed sequences of length %d '
'and %d while generating a random '
'subsample of sequences. Inconsistent '
'length sequences are not supported '
'in this visualization at this time.')
2 changes: 1 addition & 1 deletion q2_demux/_summarize/assets/app/bundle.js

Large diffs are not rendered by default.

8 changes: 0 additions & 8 deletions q2_demux/_summarize/assets/quality-plot.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@
<div class="col-xs-12 text-center">
<p>
Click and drag on plot to zoom in. Double click to zoom back out to full size.
</p>
<p>
Hover over a box to see the parametric seven-number summary of the quality scores at the corresponding position.
</p>
</div>
Expand All @@ -68,12 +66,6 @@ <h5 class="text-center">Reverse Reads</h5>
{% else %}
<div id="forwardContainer" class="col-xs-12 col-md-6 col-md-offset-3"></div>
{% endif %}
<div class="col-xs-12 text-center">
<p>
These plots were generated using a random sampling of {{ "{:,}".format(sample_n) }} out of {{ "{:,}".format(result_data['sum']) }} sequences without replacement.
Outlier quality scores are not shown in box plots for clarity.
</p>
</div>
</div>
<script src="./app/bundle.js" charset="utf-8"></script>
<script src="./data.jsonp" charset="utf-8"></script>
Expand Down
35 changes: 30 additions & 5 deletions q2_demux/_summarize/assets/src/box.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,15 @@

import * as d3 from 'd3';

export default function plotBoxes(svg, data, x, y) {
export default function plotBoxes(svg, data, x, y, seqProps) {
const halfWidth = (x.range()[1] - x.range()[0]) / (x.domain()[1] - x.domain()[0]) / 2;
const quarterWidth = halfWidth / 2;
const t = svg.transition().duration(750);
const darkBlue = 'steelblue';
const lightBlue = 'skyblue';
const darkRed = '#a94442';
const lightRed = '#ebccd1';
const minSeqLen = seqProps.minSeqLen[data.direction];

const containerUpdate = svg.selectAll('.container')
.data(data);
Expand All @@ -25,12 +30,17 @@ export default function plotBoxes(svg, data, x, y) {
.selection()
.on('mouseover', function mouseover() {
const data = d3.select(this).data();
const position = data[0][0];
const stats = data[0][1];
const inTheDangerZone = stats.count < seqProps.n;
const svg = d3.select(this.parentNode).node();
const plotContainer = d3.select(svg.parentNode);
plotContainer
.select('.panel')
.attr('class', inTheDangerZone ? 'panel panel-danger' : 'panel panel-default');
plotContainer
.select('.panel-heading')
.html(`Parametric seven-number summary for <strong>position ${data[0][0]}</strong>`);
.html(`Parametric seven-number summary for <strong>position ${position}</strong>`);
plotContainer
.select('.stats')
.select('tbody')
Expand All @@ -47,6 +57,21 @@ export default function plotBoxes(svg, data, x, y) {
.selectAll('td')
.data(d => d)
.text(d => d);

let seqLenNote = `The minimum sequence length identified during subsampling was ${minSeqLen} bases`;
if (inTheDangerZone) {
seqLenNote = `This position (${position}) is greater than the minimum sequence length observed
during subsampling (${minSeqLen} bases). As a result, the plot at this position
is not based on data from all of the sequences, so it should be interpreted with
caution when compared to plots for other positions`;
}

plotContainer.select('.random-sampling')
.classed('text-danger', inTheDangerZone)
.html(`The plot at position ${position} was generated using a random
sampling of ${stats.count} out of ${seqProps.totalSeqCount} sequences
without replacement. ${seqLenNote}. Outlier quality scores are
not shown in box plots for clarity.`);
});

const centerUpdate = containers.selectAll('line.center').data(d => [d]);
Expand All @@ -73,18 +98,18 @@ export default function plotBoxes(svg, data, x, y) {
.attr('y', d => y(d[1]['75%']))
.attr('width', halfWidth)
.attr('height', d => (y(d[1]['25%']) - y(d[1]['75%'])))
.attr('fill', 'steelblue')
.attr('fill', d => (d[1]['count'] < seqProps.n ? lightRed : lightBlue))
.attr('stroke-width', 1)
.attr('stroke', 'black')
.selection()
// The two event handlers don't use fat-arrows because we need the lexical `this` in scope
.on('mouseover', function mouseover() {
d3.select(this)
.attr('fill', 'skyblue');
.attr('fill', d => (d[1]['count'] < seqProps.n ? darkRed : darkBlue));
})
.on('mouseout', function mouseout() {
d3.select(this)
.attr('fill', 'steelblue');
.attr('fill', d => (d[1]['count'] < seqProps.n ? lightRed : lightBlue));
});

const medianUpdate = containers.selectAll('line.median').data(d => [d]);
Expand Down
4 changes: 2 additions & 2 deletions q2_demux/_summarize/assets/src/brush.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import * as d3 from 'd3';

import plotBoxes from './box';

export function addBrush(svg, data, x, y, x0, y0, xAxis, yAxis, ticksCounts) {
export function addBrush(svg, data, x, y, x0, y0, xAxis, yAxis, ticksCounts, seqProps) {
const brush = d3.brush();
let idleTimeout;
const idleDelay = 350;
Expand Down Expand Up @@ -44,7 +44,7 @@ export function addBrush(svg, data, x, y, x0, y0, xAxis, yAxis, ticksCounts) {

svg.select('.axis--x').transition(t).call(xAxis);
svg.select('.axis--y').transition(t).call(yAxis);
plotBoxes(svg, data, x, y);
plotBoxes(svg, data, x, y, seqProps);
};

const brushEnded = () => {
Expand Down
4 changes: 2 additions & 2 deletions q2_demux/_summarize/assets/src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@

import initializePlot from './plot';

export const init = (fwdScores, revScores = undefined) => { // eslint-disable-line
export const init = (seqProps, fwdScores, revScores = undefined) => { // eslint-disable-line
const qualScores = { forward: Object.keys(fwdScores).map(key => [+key + 1, fwdScores[key]]) };
if (revScores) {
qualScores.reverse = Object.keys(revScores).map(key => [+key + 1, revScores[key]]);
}
initializePlot(qualScores);
initializePlot(qualScores, seqProps);
};
23 changes: 17 additions & 6 deletions q2_demux/_summarize/assets/src/plot.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@ import * as d3 from 'd3';
import plotBoxes from './box';
import { updateXTicks, addBrush } from './brush';

const plot = (data, props, container) => {
const plot = (data, props, container, seqProps) => {
const plotContainer = d3.select(container);

const svg = plotContainer
.append('svg')
.attr('class', 'col-xs-12')
Expand All @@ -22,6 +21,17 @@ const plot = (data, props, container) => {
.attr('width', props.width + props.margin.left + props.margin.right)
.attr('height', props.height + props.margin.top + props.margin.bottom);

plotContainer
.append('div')
.attr('class', 'col-xs-12')
.append('p')
.attr('class', 'random-sampling')
.html(`These plots were generated using a random sampling of ${seqProps.n}
out of ${seqProps.totalSeqCount} sequences without replacement. The
minimum sequence length identified during subsampling was
${seqProps.minSeqLen[data.direction]} bases. Outlier quality scores are not shown in box
plots for clarity.`);

const panel = plotContainer
.append('div')
.attr('class', 'col-xs-12')
Expand Down Expand Up @@ -87,8 +97,8 @@ const plot = (data, props, container) => {
svg.attr('height', props.height + props.margin.bottom + props.margin.top);
svg.attr('width', x.range()[0] + x.range()[1]);

addBrush(svg, data, x, y, x0, y0, xAxis, yAxis, ticks)
plotBoxes(svg, data, x, y);
addBrush(svg, data, x, y, x0, y0, xAxis, yAxis, ticks, seqProps)
plotBoxes(svg, data, x, y, seqProps);

svg.append('rect')
.attr('width', props.width + props.margin.left + props.margin.right)
Expand Down Expand Up @@ -135,7 +145,7 @@ const plot = (data, props, container) => {
.text('Sequence Base');
};

const initializePlot = (data) => {
const initializePlot = (data, seqProps) => {
const margin = { top: 10, right: 30, bottom: 30, left: 30 };
const width = d3.select('#forwardContainer').node().offsetWidth;
const props = {
Expand All @@ -144,7 +154,8 @@ const initializePlot = (data) => {
height: ((width * 9) / 16) - margin.top - margin.bottom };

Object.keys(data).forEach((direction) => {
plot(data[direction], props, `#${direction}Container`);
data[direction].direction = direction
plot(data[direction], props, `#${direction}Container`, seqProps);
});
};

Expand Down
57 changes: 40 additions & 17 deletions q2_demux/tests/test_demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import unittest.mock as mock
import os.path
import tempfile
import json

import pandas as pd
import skbio
Expand Down Expand Up @@ -776,31 +777,42 @@ def test_phred_score_out_of_range(self):
self.assertIn('<strong>Danger:</strong>', html)

def test_inconsistent_sequence_length_single(self):
sequences = [('@s1/1 abc/1', 'GGGG', '+', 'YYYY'),
('@s2/1 abc/1', 'CCC', '+', 'PPP'),
('@s3/1 abc/1', 'AA', '+', 'PP'),
sequences = [('@s1/1 abc/1', 'GGGGGGG', '+', 'YYYYYYY'),
('@s2/1 abc/1', 'CCCCC', '+', 'PPPPP'),
('@s3/1 abc/1', 'AAA', '+', 'PPP'),
('@s4/1 abc/1', 'T', '+', 'P')]
bsi = BarcodeSequenceFastqIterator(self.barcodes, sequences)

barcode_map = pd.Series(['AAAA', 'AACC'], index=['sample1', 'sample2'])
barcode_map = qiime2.MetadataCategory(barcode_map)

demux_data = emp_single(bsi, barcode_map)
with tempfile.TemporaryDirectory() as output_dir:
with self.assertRaisesRegex(ValueError,
'Observed sequences of length'):
lengths = [1, 3, 5, 7]
for n in range(1, 6):
with tempfile.TemporaryDirectory() as output_dir:
lengths_ = lengths[0:5-n] if n < 4 else [1]
# TODO: Remove _PlotQualView wrapper
summarize(output_dir, _PlotQualView(demux_data,
paired=False), n=2)
paired=False), n=n)
plot_fp = os.path.join(output_dir, 'data.jsonp')
with open(plot_fp, 'r') as fh:
jsonp = fh.read()
json_ = jsonp.replace('app.init(',
'[').replace(');', ']')
payload = json.loads(json_)[0]
self.assertEqual(payload["totalSeqCount"], 4)
self.assertIn(payload["minSeqLen"]["forward"], lengths_)
self.assertEqual(payload["minSeqLen"]["reverse"], None)
self.assertEqual(payload["n"], min(n, 4))

def test_inconsistent_sequence_length_paired(self):
forward = [('@s1/1 abc/1', 'G', '+', 'Y'),
('@s2/1 abc/1', 'CC', '+', 'PP'),
('@s3/1 abc/1', 'AAA', '+', 'PPP'),
('@s4/1 abc/1', 'TTTT', '+', 'PPPP')]
reverse = [('@s1/1 abc/1', 'AAAA', '+', 'YYYY'),
('@s2/1 abc/1', 'TTT', '+', 'PPP'),
('@s3/1 abc/1', 'GG', '+', 'PP'),
('@s2/1 abc/1', 'CCC', '+', 'PPP'),
('@s3/1 abc/1', 'AAAAA', '+', 'PPPPP'),
('@s4/1 abc/1', 'TTTTTTT', '+', 'PPPPPPP')]
reverse = [('@s1/1 abc/1', 'AAAAAAA', '+', 'YYYYYYY'),
('@s2/1 abc/1', 'TTTTT', '+', 'PPPPP'),
('@s3/1 abc/1', 'GGG', '+', 'PPP'),
('@s4/1 abc/1', 'C', '+', 'P')]
bpsi = BarcodePairedSequenceFastqIterator(self.barcodes, forward,
reverse)
Expand All @@ -809,12 +821,23 @@ def test_inconsistent_sequence_length_paired(self):
barcode_map = qiime2.MetadataCategory(barcode_map)

demux_data = emp_paired(bpsi, barcode_map)
with tempfile.TemporaryDirectory() as output_dir:
with self.assertRaisesRegex(ValueError,
'Observed sequences of length'):
lengths = [1, 3, 5, 7]
for n in range(1, 6):
with tempfile.TemporaryDirectory() as output_dir:
lengths_ = lengths[0:5-n] if n < 4 else [1]
# TODO: Remove _PlotQualView wrapper
summarize(output_dir, _PlotQualView(demux_data,
paired=True), n=2)
paired=True), n=n)
plot_fp = os.path.join(output_dir, 'data.jsonp')
with open(plot_fp, 'r') as fh:
jsonp = fh.read()
json_ = jsonp.replace('app.init(',
'[').replace(');', ']')
payload = json.loads(json_)[0]
self.assertEqual(payload["totalSeqCount"], 4)
self.assertIn(payload["minSeqLen"]["forward"], lengths_)
self.assertIn(payload["minSeqLen"]["reverse"], lengths_)
self.assertEqual(payload["n"], min(n, 4))


if __name__ == '__main__':
Expand Down

0 comments on commit d59be81

Please sign in to comment.