BUG/MAINT/TST: code cleanup, refactoring, a few bug fixes, unit test …

…additions Took a pass through the code and unit tests, cleaning up the implementation a bit and improving error messages. Also refactored some of the code for easier testing of individual pieces, which uncovered a few bugs: - every value in `value_map` was being used to compute min/median/max for the legend, and default values (if `value_map` was a defaultdict) were ignored - plotting an empty alignment resulted in a cryptic error - plotting an alignment with a single sequence produced the wrong y-axis labels Fixes #765.
scikit-bio · Feb 11, 2015 · 89f45fc · 89f45fc
1 parent 0e2aabd
commit 89f45fc
Show file tree

Hide file tree

Showing 2 changed files with 210 additions and 115 deletions.
diff --git a/skbio/alignment/_alignment.py b/skbio/alignment/_alignment.py
@@ -1672,8 +1672,8 @@ def heatmap(self, value_map,
 
         Plot the alignment as a heatmap, coloring heatmap cells using the
         character mapping in `value_map`. The x-axis is labeled by majority
-        consensus, and the y-axis is labeled by sequence IDs (every third
-        sequence ID is displayed).
+        consensus, and the y-axis is labeled by sequence IDs (first, last, and
+        every third sequence ID are displayed).
 
         Parameters
         ----------
@@ -1684,7 +1684,11 @@ def heatmap(self, value_map,
             can, for example, default to ``nan``.
         legend_labels : iterable, optional
             Labels for the minimum, median, and maximum values in the legend.
-            Must be an iterable of exactly three strings.
+            Must be an iterable of exactly three strings. Mininum, median, and
+            maximum values are computed from the values in `value_map`. Only
+            values that are actually used to perform a character-to-value
+            mapping are included when computing the minimum, median, and
+            maximum. ``nan`` values are ignored.
         fig_size : tuple, optional
             Size of the figure in inches. If ``None``, defaults to figure size
             specified in the user's matplotlibrc file.
@@ -1708,9 +1712,10 @@ def heatmap(self, value_map,
         ------
         KeyError
             If a sequence character in the alignment is not in `value_map` and
-            `value_map` is not a ``collections.defaultdict``, or if
-            `sequence_order` contains a sequence ID that isn't in the
-            alignment.
+            `value_map` is not a ``collections.defaultdict``.
+        ValueError
+            If `sequence_order` isn't the same set of sequence IDs in the
+            alignment, or if `sequence_order` contains duplicate sequence IDs.
 
         See Also
         --------
@@ -1741,7 +1746,7 @@ def heatmap(self, value_map,
            ...                            'W': 2.65, 'H': 0.61, 'Y': 1.88,
            ...                            'I': 2.22, 'V': 1.32})
 
-           Define an Alignment of protein sequences:
+           Define an alignment of protein sequences:
 
            >>> from skbio import Alignment, Protein
            >>> sequences = [Protein('VHLTPEEKSAVTALWGKVNVDEV--', id="seq1"),
@@ -1750,65 +1755,99 @@ def heatmap(self, value_map,
            ...              Protein('-VLSEGEWQLVLHVWAKVEADIAGH', id="seq4")]
            >>> aln = Alignment(sequences)
 
-           Plot the Alignment as a heatmap:
+           Plot the alignment as a heatmap:
 
            >>> fig = aln.heatmap(hydrophobicity_idx, fig_size=(15, 10),
            ...                   legend_labels=['Hydrophilic', 'Medium',
            ...                                  'Hydrophobic'],
            ...                   cmap='Greens')
 
         """
-        # cache the sequence length, count, and ids to avoid multiple look-ups
-        sequence_length = self.sequence_length()
-        sequence_count = self.sequence_count()
+        if len(legend_labels) != 3:
+            raise ValueError(
+                "`legend_labels` must contain exactly three labels.")
+
         if sequence_order is not None:
-            if len(sequence_order) != sequence_count:
-                raise ValueError("Too many objects in passed tuple, you must"
-                                 "use", sequence_count, "objects.")
-            if len(sequence_order) > len(set(sequence_order)):
-                raise ValueError("The sequence_order must only contain"
-                                 "unique elements")
+            sequence_order_set = set(sequence_order)
+            if len(sequence_order) != len(sequence_order_set):
+                raise ValueError(
+                    "Found duplicate sequence ID(s) in `sequence_order`.")
+            if sequence_order_set != set(self.ids()):
+                raise ValueError(
+                    "`sequence_order` must contain the same set of sequence "
+                    "IDs in the alignment.")
         else:
             sequence_order = self.ids()
-        if len(legend_labels) != 3:
-            raise ValueError
-        values = list(value_map.values())
-
-        # create an empty data matrix
-        mtx = np.empty([sequence_length, sequence_count])
-        # fill the data matrix by iterating over the alignment and mapping
-        # characters to values
-        for i in range(sequence_length):
-            for j, sequence_id in enumerate(sequence_order):
-                mp = str(self[sequence_id][i])
-                mtx[i][j] = value_map[mp]
-
-        # build the heatmap, this code derived from the Matplotlib Gallery
-        # http://matplotlib.org/examples/pylab_examples/...
-        # colorbar_tick_labelling_demo.html
+
+        mtx, min_val, median_val, max_val = self._alignment_to_heatmap_matrix(
+            value_map, sequence_order)
+
+        # heatmap code derived from the matplotlib gallery:
+        #     http://matplotlib.org/examples/pylab_examples/
+        #     colorbar_tick_labelling_demo.html
         fig, ax = plt.subplots()
         if fig_size is not None:
             fig.set_size_inches(fig_size)
 
-        cax = ax.imshow(mtx.T, interpolation='nearest', cmap=cmap)
-
-        # Add colorbar and define tick labels
-        cbar = fig.colorbar(cax,
-                            ticks=[min(values), np.median(values),
-                                   max(values)], orientation='horizontal')
-        ax.set_yticks([0] + list(range(3, sequence_count - 3, 3)) +
-                      [sequence_count-1])
-        yt = ax.get_yticks()
-        ytl = []
-        for i in range(len(yt)):
-            ytl.append(sequence_order[yt[i]])
-        ax.set_yticklabels(ytl)
-
-        ax.set_xticks(range(sequence_length))
+        cax = ax.imshow(mtx, interpolation='nearest', cmap=cmap)
+
+        sequence_count = self.sequence_count()
+        y_ticks = [0] + list(range(3, sequence_count - 3, 3))
+        if sequence_count > 1:
+            # we have more than one sequence, so add the index of the last
+            # sequence
+            y_ticks += [sequence_count - 1]
+        y_tick_labels = [sequence_order[tick] for tick in y_ticks]
+        ax.set_yticks(y_ticks)
+        ax.set_yticklabels(y_tick_labels)
+
+        ax.set_xticks(range(self.sequence_length()))
         ax.set_xticklabels(self.majority_consensus(), size=7)
+
+        # add colorbar legend
+        cbar = fig.colorbar(cax, ticks=[min_val, median_val, max_val],
+                            orientation='horizontal')
         cbar.ax.set_xticklabels(legend_labels)
+
         return fig
 
+    def _alignment_to_heatmap_matrix(self, value_map, sequence_order):
+        """Convert alignment to numeric matrix for heatmap plotting."""
+        if self.is_empty():
+            raise AlignmentError(
+                "Cannot create heatmap from an empty alignment.")
+
+        sequence_length = self.sequence_length()
+        if sequence_length < 1:
+            raise AlignmentError(
+                "Cannot create heatmap from an alignment without any "
+                "positions.")
+
+        # track the values that are actually used -- this can differ from the
+        # values in `value_map` (e.g., if `value_map` is a superset or a
+        # defaultdict). we want to use these values to compute the min, median,
+        # and max for the colorbar legend labels
+        values = {}
+        mtx = np.empty([self.sequence_count(), sequence_length])
+        for row_idx, sequence_id in enumerate(sequence_order):
+            seq = str(self[sequence_id])
+            for col_idx in range(sequence_length):
+                char = seq[col_idx]
+                value = value_map[char]
+                mtx[row_idx][col_idx] = value
+                values[char] = value
+
+        # remove nans as this can mess up the median calculation below.
+        # numpy.nanmedian was added in 1.9.0, so consider updating this code
+        # if/when we start supporting numpy >= 1.9.0. scipy.stats.nanmedian is
+        # deprecated so we can't use that either.
+        #
+        # modified from http://stackoverflow.com/a/26475626/3776794
+        values = np.asarray(values.values())
+        values = values[~np.isnan(values)]
+
+        return mtx, min(values), np.median(values), max(values)
+
     def _validate_lengths(self):
         """Return ``True`` if all sequences same length, ``False`` otherwise
         """

diff --git a/skbio/alignment/tests/test_alignment.py b/skbio/alignment/tests/test_alignment.py
@@ -828,76 +828,132 @@ def test_to_phylip_no_positions(self):
         with self.assertRaises(SequenceCollectionError):
             npt.assert_warns(DeprecationWarning, a.to_phylip)
 
-    def test_heatmap_with_defaults(self):
-        values, sequences, a1 = self.heatmap_set_values()
-        fig = a1.heatmap(values)
-        self.heatmap_basic_sanity(fig, ['A', 'A', 'C', 'C', 'C', 'G', 'T'],
-                                  ['seq1', 'seq2'], ['Minimum', 'Median',
-                                                     'Maximum'])
-
-    def test_heatmap_with_custom(self):
-        sequences = [DNA('AGTCGGT', id="seq1"),
-                     DNA('CAACGGA', id="seq2"),
-                     DNA('AACCTCT', id="seq3"),
-                     DNA('TACTCGT', id="seq4")]
-        a1 = Alignment(sequences)
-        values = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
-        clabels = ['a', 'b', 'c']
-        fig = a1.heatmap(values, fig_size=(15, 10), cmap='Blues',
-                         legend_labels=clabels,
-                         sequence_order=('seq4', 'seq3', 'seq2', 'seq1'))
-        self.heatmap_basic_sanity(fig, ['A', 'A', 'C', 'C', 'G', 'G', 'T'],
-                                  ['seq4', 'seq1'], clabels)
-        self.assertEqual(fig.get_figwidth(), 15.0)
-        self.assertEqual(fig.get_figheight(), 10.0)
-
-    def test_heatmap_raises(self):
-        values, sequences, a1 = self.heatmap_set_values()
+    def check_heatmap_sanity(self, fig, exp_x_tick_labels, exp_y_tick_labels,
+                             exp_legend_labels):
+        """Helper method for testing basic heatmap figure properties."""
+        axes = fig.get_axes()
+        self.assertEqual(len(axes), 2)
+
+        ax, axc = axes
+
+        x_tick_labels = [e.get_text() for e in ax.get_xticklabels()]
+        self.assertEqual(x_tick_labels, exp_x_tick_labels)
+
+        y_tick_labels = [e.get_text() for e in ax.get_yticklabels()]
+        self.assertEqual(y_tick_labels, exp_y_tick_labels)
+
+        legend_labels = [e.get_text() for e in axc.get_xticklabels()]
+        self.assertEqual(legend_labels, exp_legend_labels)
+
+    def test_heatmap_empty(self):
+        # no seqs
+        aln = Alignment([])
+        value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
+        with self.assertRaises(AlignmentError):
+            aln.heatmap(value_map)
+
+        # no positions
+        aln = Alignment([DNA(''), RNA('', id="s2")])
+        with self.assertRaises(AlignmentError):
+            aln.heatmap(value_map)
+
+    def test_heatmap_1_seq(self):
+        aln = Alignment([RNA('AACGU', id="seq1")])
+        value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
+        fig = aln.heatmap(value_map)
+
+        self.check_heatmap_sanity(
+            fig, ['A', 'A', 'C', 'G', 'U'], ['seq1'],
+            ['Minimum', 'Median', 'Maximum'])
+
+    def test_heatmap_2_seqs(self):
+        aln = Alignment([RNA('AACGU', id="seq1"),
+                         RNA('AACGU', id="seq2")])
+        value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
+        fig = aln.heatmap(value_map)
+
+        self.check_heatmap_sanity(
+            fig, ['A', 'A', 'C', 'G', 'U'], ['seq1', 'seq2'],
+            ['Minimum', 'Median', 'Maximum'])
+
+    def test_heatmap_3_seqs(self):
+        aln = Alignment([DNA('AACCCGT', id="seq1"),
+                         DNA('ACCCGGT', id="seq2"),
+                         DNA('ACCCGGT', id="seq3")])
+        value_map = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
+        fig = aln.heatmap(value_map)
+
+        self.check_heatmap_sanity(
+            fig, ['A', 'C', 'C', 'C', 'G', 'G', 'T'], ['seq1', 'seq3'],
+            ['Minimum', 'Median', 'Maximum'])
+
+    def test_heatmap_with_non_defaults(self):
+        aln = Alignment([DNA('AGTCGGT', id="seq1"),
+                         DNA('CAACGGA', id="seq2"),
+                         DNA('AACCTCT', id="seq3"),
+                         DNA('TACTCGT', id="seq4")])
+        value_map = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
+        labels = ['a', 'b', 'c']
+        fig = aln.heatmap(
+            value_map, legend_labels=labels, fig_size=(42, 22), cmap='Blues',
+            sequence_order=('seq4', 'seq3', 'seq1', 'seq2'))
+
+        self.check_heatmap_sanity(fig, ['A', 'A', 'C', 'C', 'G', 'G', 'T'],
+                                  ['seq4', 'seq2'], labels)
+        self.assertEqual(fig.get_figwidth(), 42.0)
+        self.assertEqual(fig.get_figheight(), 22.0)
+
+    def test_heatmap_invalid_legend_labels(self):
         with self.assertRaises(ValueError):
-            a1.heatmap(values, legend_labels=['a', 'b', 'c', 'd'])
-
-    def test_heatmap_errors(self):
-        sequences = [DNA('AGTCGGT', id="seq1"),
-                     DNA('CAACGGA', id="seq2"),
-                     DNA('AACCTCT', id="seq3"),
-                     DNA('TACTCGT', id="seq4")]
-        a1 = Alignment(sequences)
-        values = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
-        clabels = ['a', 'b', 'c']
+            self.a1.heatmap({}, legend_labels=['a', 'b', 'c', 'd'])
+
+    def test_heatmap_invalid_sequence_order(self):
+        # duplicate ids
         with self.assertRaises(ValueError):
-            a1.heatmap(values, fig_size=(15, 10), cmap='Blues',
-                       legend_labels=clabels,
-                       sequence_order=('seq1', 'seq2', 'seq3', 'seq3'))
+            self.a1.heatmap({}, sequence_order=['d1', 'd2', 'd1'])
+
+        # provided set of ids doesn't match alignment's set of ids
         with self.assertRaises(ValueError):
-            a1.heatmap(values, fig_size=(15, 10), cmap='Blues',
-                       legend_labels=clabels,
-                       sequence_order=('seq1', 'seq2', 'seq4', 'seq3',
-                                       'seq5'))
-
-    def heatmap_set_values(self):
-        sequences = [DNA('AACCCGT', id="seq1"),
-                     DNA('AACCGGT', id="seq2")]
-        a1 = Alignment(sequences)
-        values = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
-        return(values, sequences, a1)
-
-    def heatmap_basic_sanity(self, fig, xt, yt, clabels):
-        axes = fig.get_axes()
-        self.assertEqual(len(axes), 2)
-        ax = axes[0]
-        axc = axes[1]
-        xticks = []
-        for tick in ax.get_xticklabels():
-            xticks.append(tick.get_text())
-        self.assertEqual(xticks, xt)
-        yticks = []
-        for tick in ax.get_yticklabels():
-            yticks.append(tick.get_text())
-        self.assertEqual(yticks, yt)
-        cticks = []
-        for tick in axc.get_xticklabels():
-            cticks.append(tick.get_text())
-        self.assertEqual(clabels, cticks)
+            self.a1.heatmap({}, sequence_order=['d2', 'd3', 'd0'])
+
+    def test_heatmap_missing_character_in_value_map(self):
+        with self.assertRaises(KeyError):
+            self.a1.heatmap({})
+
+    def test_alignment_to_heatmap_matrix(self):
+        aln = Alignment([DNA('ACTG', id='d1'),
+                         DNA('A.-G', id='d2'),
+                         DNA('TC-G', id='d3')])
+        value_map = defaultdict(lambda: np.nan)
+        value_map.update({'A': 42, 'C': 10.5, 'T': 22.1, 'G': -7.789,
+                          'U': -999.9, 'Z': 42000})
+        exp_min = -7.789
+        exp_median = 16.3
+        exp_max = 42
+
+        # sequence order is same as what's in the alignment
+        mtx, min_val, median_val, max_val = \
+            aln._alignment_to_heatmap_matrix(value_map, ['d1', 'd2', 'd3'])
+
+        exp_mtx = np.array([[42., 10.5, 22.1, -7.789],
+                            [42., np.nan, np.nan, -7.789],
+                            [22.1, 10.5, np.nan, -7.789]])
+        npt.assert_array_equal(mtx, exp_mtx)
+        self.assertEqual(min_val, exp_min)
+        self.assertAlmostEqual(median_val, exp_median)
+        self.assertEqual(max_val, exp_max)
+
+        # sequence order is different from what's in the alignment
+        mtx, min_val, median_val, max_val = \
+            aln._alignment_to_heatmap_matrix(value_map, ['d3', 'd1', 'd2'])
+
+        exp_mtx = np.array([[22.1, 10.5, np.nan, -7.789],
+                            [42., 10.5, 22.1, -7.789],
+                            [42., np.nan, np.nan, -7.789]])
+        npt.assert_array_equal(mtx, exp_mtx)
+        self.assertEqual(min_val, exp_min)
+        self.assertAlmostEqual(median_val, exp_median)
+        self.assertEqual(max_val, exp_max)
 
     def test_validate_lengths(self):
         self.assertTrue(self.a1._validate_lengths())