Skip to content

Commit

Permalink
BUG/MAINT/TST: code cleanup, refactoring, a few bug fixes, unit test …
Browse files Browse the repository at this point in the history
…additions

Took a pass through the code and unit tests, cleaning up the implementation a
bit and improving error messages. Also refactored some of the code for easier
testing of individual pieces, which uncovered a few bugs:

- every value in `value_map` was being used to compute min/median/max for the
  legend, and default values (if `value_map` was a defaultdict) were ignored
- plotting an empty alignment resulted in a cryptic error
- plotting an alignment with a single sequence produced the wrong y-axis labels

Fixes #765.
  • Loading branch information
jairideout committed Feb 11, 2015
1 parent 0e2aabd commit 89f45fc
Show file tree
Hide file tree
Showing 2 changed files with 210 additions and 115 deletions.
135 changes: 87 additions & 48 deletions skbio/alignment/_alignment.py
Expand Up @@ -1672,8 +1672,8 @@ def heatmap(self, value_map,
Plot the alignment as a heatmap, coloring heatmap cells using the
character mapping in `value_map`. The x-axis is labeled by majority
consensus, and the y-axis is labeled by sequence IDs (every third
sequence ID is displayed).
consensus, and the y-axis is labeled by sequence IDs (first, last, and
every third sequence ID are displayed).
Parameters
----------
Expand All @@ -1684,7 +1684,11 @@ def heatmap(self, value_map,
can, for example, default to ``nan``.
legend_labels : iterable, optional
Labels for the minimum, median, and maximum values in the legend.
Must be an iterable of exactly three strings.
Must be an iterable of exactly three strings. Mininum, median, and
maximum values are computed from the values in `value_map`. Only
values that are actually used to perform a character-to-value
mapping are included when computing the minimum, median, and
maximum. ``nan`` values are ignored.
fig_size : tuple, optional
Size of the figure in inches. If ``None``, defaults to figure size
specified in the user's matplotlibrc file.
Expand All @@ -1708,9 +1712,10 @@ def heatmap(self, value_map,
------
KeyError
If a sequence character in the alignment is not in `value_map` and
`value_map` is not a ``collections.defaultdict``, or if
`sequence_order` contains a sequence ID that isn't in the
alignment.
`value_map` is not a ``collections.defaultdict``.
ValueError
If `sequence_order` isn't the same set of sequence IDs in the
alignment, or if `sequence_order` contains duplicate sequence IDs.
See Also
--------
Expand Down Expand Up @@ -1741,7 +1746,7 @@ def heatmap(self, value_map,
... 'W': 2.65, 'H': 0.61, 'Y': 1.88,
... 'I': 2.22, 'V': 1.32})
Define an Alignment of protein sequences:
Define an alignment of protein sequences:
>>> from skbio import Alignment, Protein
>>> sequences = [Protein('VHLTPEEKSAVTALWGKVNVDEV--', id="seq1"),
Expand All @@ -1750,65 +1755,99 @@ def heatmap(self, value_map,
... Protein('-VLSEGEWQLVLHVWAKVEADIAGH', id="seq4")]
>>> aln = Alignment(sequences)
Plot the Alignment as a heatmap:
Plot the alignment as a heatmap:
>>> fig = aln.heatmap(hydrophobicity_idx, fig_size=(15, 10),
... legend_labels=['Hydrophilic', 'Medium',
... 'Hydrophobic'],
... cmap='Greens')
"""
# cache the sequence length, count, and ids to avoid multiple look-ups
sequence_length = self.sequence_length()
sequence_count = self.sequence_count()
if len(legend_labels) != 3:
raise ValueError(
"`legend_labels` must contain exactly three labels.")

if sequence_order is not None:
if len(sequence_order) != sequence_count:
raise ValueError("Too many objects in passed tuple, you must"
"use", sequence_count, "objects.")
if len(sequence_order) > len(set(sequence_order)):
raise ValueError("The sequence_order must only contain"
"unique elements")
sequence_order_set = set(sequence_order)
if len(sequence_order) != len(sequence_order_set):
raise ValueError(
"Found duplicate sequence ID(s) in `sequence_order`.")
if sequence_order_set != set(self.ids()):
raise ValueError(
"`sequence_order` must contain the same set of sequence "
"IDs in the alignment.")
else:
sequence_order = self.ids()
if len(legend_labels) != 3:
raise ValueError
values = list(value_map.values())

# create an empty data matrix
mtx = np.empty([sequence_length, sequence_count])
# fill the data matrix by iterating over the alignment and mapping
# characters to values
for i in range(sequence_length):
for j, sequence_id in enumerate(sequence_order):
mp = str(self[sequence_id][i])
mtx[i][j] = value_map[mp]

# build the heatmap, this code derived from the Matplotlib Gallery
# http://matplotlib.org/examples/pylab_examples/...
# colorbar_tick_labelling_demo.html

mtx, min_val, median_val, max_val = self._alignment_to_heatmap_matrix(
value_map, sequence_order)

# heatmap code derived from the matplotlib gallery:
# http://matplotlib.org/examples/pylab_examples/
# colorbar_tick_labelling_demo.html
fig, ax = plt.subplots()
if fig_size is not None:
fig.set_size_inches(fig_size)

cax = ax.imshow(mtx.T, interpolation='nearest', cmap=cmap)

# Add colorbar and define tick labels
cbar = fig.colorbar(cax,
ticks=[min(values), np.median(values),
max(values)], orientation='horizontal')
ax.set_yticks([0] + list(range(3, sequence_count - 3, 3)) +
[sequence_count-1])
yt = ax.get_yticks()
ytl = []
for i in range(len(yt)):
ytl.append(sequence_order[yt[i]])
ax.set_yticklabels(ytl)

ax.set_xticks(range(sequence_length))
cax = ax.imshow(mtx, interpolation='nearest', cmap=cmap)

sequence_count = self.sequence_count()
y_ticks = [0] + list(range(3, sequence_count - 3, 3))
if sequence_count > 1:
# we have more than one sequence, so add the index of the last
# sequence
y_ticks += [sequence_count - 1]
y_tick_labels = [sequence_order[tick] for tick in y_ticks]
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_tick_labels)

ax.set_xticks(range(self.sequence_length()))
ax.set_xticklabels(self.majority_consensus(), size=7)

# add colorbar legend
cbar = fig.colorbar(cax, ticks=[min_val, median_val, max_val],
orientation='horizontal')
cbar.ax.set_xticklabels(legend_labels)

return fig

def _alignment_to_heatmap_matrix(self, value_map, sequence_order):
"""Convert alignment to numeric matrix for heatmap plotting."""
if self.is_empty():
raise AlignmentError(
"Cannot create heatmap from an empty alignment.")

sequence_length = self.sequence_length()
if sequence_length < 1:
raise AlignmentError(
"Cannot create heatmap from an alignment without any "
"positions.")

# track the values that are actually used -- this can differ from the
# values in `value_map` (e.g., if `value_map` is a superset or a
# defaultdict). we want to use these values to compute the min, median,
# and max for the colorbar legend labels
values = {}
mtx = np.empty([self.sequence_count(), sequence_length])
for row_idx, sequence_id in enumerate(sequence_order):
seq = str(self[sequence_id])
for col_idx in range(sequence_length):
char = seq[col_idx]
value = value_map[char]
mtx[row_idx][col_idx] = value
values[char] = value

# remove nans as this can mess up the median calculation below.
# numpy.nanmedian was added in 1.9.0, so consider updating this code
# if/when we start supporting numpy >= 1.9.0. scipy.stats.nanmedian is
# deprecated so we can't use that either.
#
# modified from http://stackoverflow.com/a/26475626/3776794
values = np.asarray(values.values())
values = values[~np.isnan(values)]

return mtx, min(values), np.median(values), max(values)

def _validate_lengths(self):
"""Return ``True`` if all sequences same length, ``False`` otherwise
"""
Expand Down
190 changes: 123 additions & 67 deletions skbio/alignment/tests/test_alignment.py
Expand Up @@ -828,76 +828,132 @@ def test_to_phylip_no_positions(self):
with self.assertRaises(SequenceCollectionError):
npt.assert_warns(DeprecationWarning, a.to_phylip)

def test_heatmap_with_defaults(self):
values, sequences, a1 = self.heatmap_set_values()
fig = a1.heatmap(values)
self.heatmap_basic_sanity(fig, ['A', 'A', 'C', 'C', 'C', 'G', 'T'],
['seq1', 'seq2'], ['Minimum', 'Median',
'Maximum'])

def test_heatmap_with_custom(self):
sequences = [DNA('AGTCGGT', id="seq1"),
DNA('CAACGGA', id="seq2"),
DNA('AACCTCT', id="seq3"),
DNA('TACTCGT', id="seq4")]
a1 = Alignment(sequences)
values = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
clabels = ['a', 'b', 'c']
fig = a1.heatmap(values, fig_size=(15, 10), cmap='Blues',
legend_labels=clabels,
sequence_order=('seq4', 'seq3', 'seq2', 'seq1'))
self.heatmap_basic_sanity(fig, ['A', 'A', 'C', 'C', 'G', 'G', 'T'],
['seq4', 'seq1'], clabels)
self.assertEqual(fig.get_figwidth(), 15.0)
self.assertEqual(fig.get_figheight(), 10.0)

def test_heatmap_raises(self):
values, sequences, a1 = self.heatmap_set_values()
def check_heatmap_sanity(self, fig, exp_x_tick_labels, exp_y_tick_labels,
exp_legend_labels):
"""Helper method for testing basic heatmap figure properties."""
axes = fig.get_axes()
self.assertEqual(len(axes), 2)

ax, axc = axes

x_tick_labels = [e.get_text() for e in ax.get_xticklabels()]
self.assertEqual(x_tick_labels, exp_x_tick_labels)

y_tick_labels = [e.get_text() for e in ax.get_yticklabels()]
self.assertEqual(y_tick_labels, exp_y_tick_labels)

legend_labels = [e.get_text() for e in axc.get_xticklabels()]
self.assertEqual(legend_labels, exp_legend_labels)

def test_heatmap_empty(self):
# no seqs
aln = Alignment([])
value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
with self.assertRaises(AlignmentError):
aln.heatmap(value_map)

# no positions
aln = Alignment([DNA(''), RNA('', id="s2")])
with self.assertRaises(AlignmentError):
aln.heatmap(value_map)

def test_heatmap_1_seq(self):
aln = Alignment([RNA('AACGU', id="seq1")])
value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
fig = aln.heatmap(value_map)

self.check_heatmap_sanity(
fig, ['A', 'A', 'C', 'G', 'U'], ['seq1'],
['Minimum', 'Median', 'Maximum'])

def test_heatmap_2_seqs(self):
aln = Alignment([RNA('AACGU', id="seq1"),
RNA('AACGU', id="seq2")])
value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
fig = aln.heatmap(value_map)

self.check_heatmap_sanity(
fig, ['A', 'A', 'C', 'G', 'U'], ['seq1', 'seq2'],
['Minimum', 'Median', 'Maximum'])

def test_heatmap_3_seqs(self):
aln = Alignment([DNA('AACCCGT', id="seq1"),
DNA('ACCCGGT', id="seq2"),
DNA('ACCCGGT', id="seq3")])
value_map = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
fig = aln.heatmap(value_map)

self.check_heatmap_sanity(
fig, ['A', 'C', 'C', 'C', 'G', 'G', 'T'], ['seq1', 'seq3'],
['Minimum', 'Median', 'Maximum'])

def test_heatmap_with_non_defaults(self):
aln = Alignment([DNA('AGTCGGT', id="seq1"),
DNA('CAACGGA', id="seq2"),
DNA('AACCTCT', id="seq3"),
DNA('TACTCGT', id="seq4")])
value_map = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
labels = ['a', 'b', 'c']
fig = aln.heatmap(
value_map, legend_labels=labels, fig_size=(42, 22), cmap='Blues',
sequence_order=('seq4', 'seq3', 'seq1', 'seq2'))

self.check_heatmap_sanity(fig, ['A', 'A', 'C', 'C', 'G', 'G', 'T'],
['seq4', 'seq2'], labels)
self.assertEqual(fig.get_figwidth(), 42.0)
self.assertEqual(fig.get_figheight(), 22.0)

def test_heatmap_invalid_legend_labels(self):
with self.assertRaises(ValueError):
a1.heatmap(values, legend_labels=['a', 'b', 'c', 'd'])

def test_heatmap_errors(self):
sequences = [DNA('AGTCGGT', id="seq1"),
DNA('CAACGGA', id="seq2"),
DNA('AACCTCT', id="seq3"),
DNA('TACTCGT', id="seq4")]
a1 = Alignment(sequences)
values = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
clabels = ['a', 'b', 'c']
self.a1.heatmap({}, legend_labels=['a', 'b', 'c', 'd'])

def test_heatmap_invalid_sequence_order(self):
# duplicate ids
with self.assertRaises(ValueError):
a1.heatmap(values, fig_size=(15, 10), cmap='Blues',
legend_labels=clabels,
sequence_order=('seq1', 'seq2', 'seq3', 'seq3'))
self.a1.heatmap({}, sequence_order=['d1', 'd2', 'd1'])

# provided set of ids doesn't match alignment's set of ids
with self.assertRaises(ValueError):
a1.heatmap(values, fig_size=(15, 10), cmap='Blues',
legend_labels=clabels,
sequence_order=('seq1', 'seq2', 'seq4', 'seq3',
'seq5'))

def heatmap_set_values(self):
sequences = [DNA('AACCCGT', id="seq1"),
DNA('AACCGGT', id="seq2")]
a1 = Alignment(sequences)
values = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
return(values, sequences, a1)

def heatmap_basic_sanity(self, fig, xt, yt, clabels):
axes = fig.get_axes()
self.assertEqual(len(axes), 2)
ax = axes[0]
axc = axes[1]
xticks = []
for tick in ax.get_xticklabels():
xticks.append(tick.get_text())
self.assertEqual(xticks, xt)
yticks = []
for tick in ax.get_yticklabels():
yticks.append(tick.get_text())
self.assertEqual(yticks, yt)
cticks = []
for tick in axc.get_xticklabels():
cticks.append(tick.get_text())
self.assertEqual(clabels, cticks)
self.a1.heatmap({}, sequence_order=['d2', 'd3', 'd0'])

def test_heatmap_missing_character_in_value_map(self):
with self.assertRaises(KeyError):
self.a1.heatmap({})

def test_alignment_to_heatmap_matrix(self):
aln = Alignment([DNA('ACTG', id='d1'),
DNA('A.-G', id='d2'),
DNA('TC-G', id='d3')])
value_map = defaultdict(lambda: np.nan)
value_map.update({'A': 42, 'C': 10.5, 'T': 22.1, 'G': -7.789,
'U': -999.9, 'Z': 42000})
exp_min = -7.789
exp_median = 16.3
exp_max = 42

# sequence order is same as what's in the alignment
mtx, min_val, median_val, max_val = \
aln._alignment_to_heatmap_matrix(value_map, ['d1', 'd2', 'd3'])

exp_mtx = np.array([[42., 10.5, 22.1, -7.789],
[42., np.nan, np.nan, -7.789],
[22.1, 10.5, np.nan, -7.789]])
npt.assert_array_equal(mtx, exp_mtx)
self.assertEqual(min_val, exp_min)
self.assertAlmostEqual(median_val, exp_median)
self.assertEqual(max_val, exp_max)

# sequence order is different from what's in the alignment
mtx, min_val, median_val, max_val = \
aln._alignment_to_heatmap_matrix(value_map, ['d3', 'd1', 'd2'])

exp_mtx = np.array([[22.1, 10.5, np.nan, -7.789],
[42., 10.5, 22.1, -7.789],
[42., np.nan, np.nan, -7.789]])
npt.assert_array_equal(mtx, exp_mtx)
self.assertEqual(min_val, exp_min)
self.assertAlmostEqual(median_val, exp_median)
self.assertEqual(max_val, exp_max)

def test_validate_lengths(self):
self.assertTrue(self.a1._validate_lengths())
Expand Down

0 comments on commit 89f45fc

Please sign in to comment.