scikit-bio · jairideout · Nov 25, 2014 · Nov 25, 2014 · Nov 25, 2014 · Nov 25, 2014
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## Version 0.2.3-dev (changes since 0.2.3 release go here)
 
 ### Features
+* Added ``heatmap`` method to ``skbio.alignment.Alignment`` for creating a heatmap from an alignment ([#765](https://github.com/biocore/scikit-bio/issues/765)).
 
 ### Bug fixes
 * Changed `BiologicalSequence.distance` to raise an error any time two sequences are passed of different lengths regardless of the `distance_fn` being passed. [(#514)](https://github.com/biocore/scikit-bio/issues/514)

diff --git a/skbio/alignment/_alignment.py b/skbio/alignment/_alignment.py
@@ -15,6 +15,7 @@
 
 import numpy as np
 from scipy.stats import entropy
+import matplotlib.pyplot as plt
 
 from skbio._base import SkbioObject
 from skbio.sequence import BiologicalSequence
@@ -1405,6 +1406,194 @@ def sequence_length(self):
         else:
             return len(self._data[0])
 
+    def heatmap(self, value_map,
+                legend_labels=('Minimum', 'Median', 'Maximum'), fig_size=None,
+                cmap=None, sequence_order=None):
+        """Plot alignment as a heatmap.
+
+        Plot the alignment as a heatmap, coloring heatmap cells using the
+        character mapping in `value_map`. The x-axis is labeled by majority
+        consensus, and the y-axis is labeled by sequence IDs (first, last, and
+        every third sequence ID are displayed).
+
+        Parameters
+        ----------
+        value_map : dict, collections.defaultdict
+            Dictionary mapping characters in the alignment to numeric values.
+            ``KeyErrors`` are not caught, so all possible values should be in
+            `value_map`, or it should be a ``collections.defaultdict`` which
+            can, for example, default to ``nan``.
+        legend_labels : iterable, optional
+            Labels for the minimum, median, and maximum values in the legend.
+            Must be an iterable of exactly three strings. Mininum, median, and
+            maximum values are computed from the values in `value_map`. Only
+            values that are actually used to perform a character-to-value
+            mapping are included when computing the minimum, median, and
+            maximum. ``nan`` values are ignored.
+        fig_size : tuple, optional
+            Size of the figure in inches. If ``None``, defaults to figure size
+            specified in the user's matplotlibrc file.
+        cmap : matplotlib colormap, optional
+            See here for choices:
+            http://matplotlib.org/examples/color/colormaps_reference.html
+            If ``None``, defaults to the colormap specified in the user's
+            matplotlibrc file.
+        sequence_order : iterable, optional
+            The order, from top-to-bottom, that the sequences should be
+            plotted in. Must be an iterable containing all sequence IDs in the
+            alignment. If ``None``, sequences are plotted as they are ordered
+            in the alignment.
+
+        Returns
+        -------
+        matplotlib.figure.Figure
+            Figure containing the heatmap and legend of the plotted alignment.
+
+        Raises
+        ------
+        KeyError
+            If a sequence character in the alignment is not in `value_map` and
+            `value_map` is not a ``collections.defaultdict``.
+        ValueError
+            If `sequence_order` isn't the same set of sequence IDs in the
+            alignment, or if `sequence_order` contains duplicate sequence IDs.
+
+        See Also
+        --------
+        majority_consensus
+
+        References
+        ----------
+        .. [1] http://www.genome.jp/aaindex/
+        .. [2] http://www.genome.jp/dbget-bin/www_bget?aaindex:ARGP820101
+
+        Examples
+        --------
+        .. plot::
+
+           Let's visualize an alignment of protein sequences as a heatmap, with
+           amino acids colored by their hydrophobicity.
+
+           Load the AAIndex database [1]_ hydrophobicity index values [2]_:
+
+           >>> from collections import defaultdict
+           >>> import numpy as np
+           >>> hydrophobicity_idx = defaultdict(lambda: np.nan)
+           >>> hydrophobicity_idx.update({'A': 0.61, 'L': 1.53, 'R': 0.60,
+           ...                            'K': 1.15, 'N': 0.06, 'M': 1.18,
+           ...                            'D': 0.46, 'F': 2.02, 'C': 1.07,
+           ...                            'P': 1.95, 'Q': 0., 'S': 0.05,
+           ...                            'E': 0.47, 'T': 0.05, 'G': 0.07,
+           ...                            'W': 2.65, 'H': 0.61, 'Y': 1.88,
+           ...                            'I': 2.22, 'V': 1.32})
+
+           Define an alignment of protein sequences:
+
+           >>> from skbio import Alignment, Protein
+           >>> sequences = [Protein('VHLTPEEKSAVTALWGKVNVDEV--', id="seq1"),
+           ...              Protein('-GLSDGEWQLVLKVWGKVEGDLPGH', id="seq2"),
+           ...              Protein('-GLSDGEWQLVLKVWGKVETDITGH', id="seq3"),
+           ...              Protein('-VLSEGEWQLVLHVWAKVEADIAGH', id="seq4")]
+           >>> aln = Alignment(sequences)
+
+           Plot the alignment as a heatmap:
+
+           >>> fig = aln.heatmap(hydrophobicity_idx, fig_size=(15, 10),
+           ...                   legend_labels=['Hydrophilic', 'Medium',
+           ...                                  'Hydrophobic'],
+           ...                   cmap='Greens')
+
+        """
+        if len(legend_labels) != 3:
+            raise ValueError(
+                "`legend_labels` must contain exactly three labels.")
+
+        if sequence_order is not None:
+            sequence_order_set = set(sequence_order)
+            if len(sequence_order) != len(sequence_order_set):
+                raise ValueError(
+                    "Found duplicate sequence ID(s) in `sequence_order`.")
+            if sequence_order_set != set(self.ids()):
+                raise ValueError(
+                    "`sequence_order` must contain the same set of sequence "
+                    "IDs in the alignment.")
+        else:
+            sequence_order = self.ids()
+
+        mtx, min_val, median_val, max_val = self._alignment_to_heatmap_matrix(
+            value_map, sequence_order)
+
+        # heatmap code derived from the matplotlib gallery:
+        #     http://matplotlib.org/examples/pylab_examples/
+        #     colorbar_tick_labelling_demo.html
+        fig, ax = plt.subplots()
+        if fig_size is not None:
+            fig.set_size_inches(fig_size)
+
+        cax = ax.imshow(mtx, interpolation='nearest', cmap=cmap)
+
+        sequence_count = self.sequence_count()
+        y_ticks = [0] + list(range(3, sequence_count - 3, 3))
+        if sequence_count > 1:
+            # we have more than one sequence, so add the index of the last
+            # sequence
+            y_ticks += [sequence_count - 1]
+        y_tick_labels = [sequence_order[tick] for tick in y_ticks]
+        ax.set_yticks(y_ticks)
+        ax.set_yticklabels(y_tick_labels)
+
+        ax.set_xticks(range(self.sequence_length()))
+        ax.set_xticklabels(self.majority_consensus(), size=7)
+
+        # add colorbar legend
+        cbar = fig.colorbar(cax, ticks=[min_val, median_val, max_val],
+                            orientation='horizontal')
+        cbar.ax.set_xticklabels(legend_labels)
+
+        return fig
+
+    def _alignment_to_heatmap_matrix(self, value_map, sequence_order):
+        """Convert alignment to numeric matrix for heatmap plotting."""
+        if self.is_empty():
+            raise AlignmentError(
+                "Cannot create heatmap from an empty alignment.")
+
+        sequence_length = self.sequence_length()
+        if sequence_length < 1:
+            raise AlignmentError(
+                "Cannot create heatmap from an alignment without any "
+                "positions.")
+
+        # track the values that are actually used -- this can differ from the
+        # values in `value_map` (e.g., if `value_map` is a superset or a
+        # defaultdict). we want to use these values to compute the min, median,
+        # and max for the colorbar legend labels
+        values = {}
+        mtx = np.empty([self.sequence_count(), sequence_length])
+        for row_idx, sequence_id in enumerate(sequence_order):
+            seq = str(self[sequence_id])
+            for col_idx in range(sequence_length):
+                char = seq[col_idx]
+                value = value_map[char]
+                mtx[row_idx][col_idx] = value
+                values[char] = value
+
+        # remove nans as this can mess up the median calculation below.
+        # numpy.nanmedian was added in 1.9.0, so consider updating this code
+        # if/when we start supporting numpy >= 1.9.0. scipy.stats.nanmedian is
+        # deprecated so we can't use that either.
+        #
+        # modified from http://stackoverflow.com/a/26475626/3776794
+        #
+        # note: we need to explicitly create a list from the dict's values in
+        # order to have this work in Python 3. Using values.values() directly,
+        # or with future.utils.viewvalues, creates a numpy array of object
+        # dtype, which makes np.isnan fail.
+        values = np.asarray(list(values.values()))
+        values = values[~np.isnan(values)]
+
+        return mtx, min(values), np.median(values), max(values)
+
     def _validate_lengths(self):
         """Return ``True`` if all sequences same length, ``False`` otherwise
         """

diff --git a/skbio/alignment/tests/test_alignment.py b/skbio/alignment/tests/test_alignment.py
@@ -17,6 +17,7 @@
 import tempfile
 
 import numpy as np
+import numpy.testing as npt
 from scipy.spatial.distance import hamming
 
 from skbio import (BiologicalSequence, DNASequence, RNASequence, DNA, RNA,
@@ -753,6 +754,133 @@ def test_sequence_length(self):
         self.assertEqual(self.a2.sequence_length(), 5)
         self.assertEqual(self.empty.sequence_length(), 0)
 
+    def check_heatmap_sanity(self, fig, exp_x_tick_labels, exp_y_tick_labels,
+                             exp_legend_labels):
+        """Helper method for testing basic heatmap figure properties."""
+        axes = fig.get_axes()
+        self.assertEqual(len(axes), 2)
+
+        ax, axc = axes
+
+        x_tick_labels = [e.get_text() for e in ax.get_xticklabels()]
+        self.assertEqual(x_tick_labels, exp_x_tick_labels)
+
+        y_tick_labels = [e.get_text() for e in ax.get_yticklabels()]
+        self.assertEqual(y_tick_labels, exp_y_tick_labels)
+
+        legend_labels = [e.get_text() for e in axc.get_xticklabels()]
+        self.assertEqual(legend_labels, exp_legend_labels)
+
+    def test_heatmap_empty(self):
+        # no seqs
+        aln = Alignment([])
+        value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
+        with self.assertRaises(AlignmentError):
+            aln.heatmap(value_map)
+
+        # no positions
+        aln = Alignment([DNA(''), RNA('', id="s2")])
+        with self.assertRaises(AlignmentError):
+            aln.heatmap(value_map)
+
+    def test_heatmap_1_seq(self):
+        aln = Alignment([RNA('AACGU', id="seq1")])
+        value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
+        fig = aln.heatmap(value_map)
+
+        self.check_heatmap_sanity(
+            fig, ['A', 'A', 'C', 'G', 'U'], ['seq1'],
+            ['Minimum', 'Median', 'Maximum'])
+
+    def test_heatmap_2_seqs(self):
+        aln = Alignment([RNA('AACGU', id="seq1"),
+                         RNA('AACGU', id="seq2")])
+        value_map = {'A': 0.1, 'C': 1.5, 'U': -0.42, 'G': 0.55, 'T': 99}
+        fig = aln.heatmap(value_map)
+
+        self.check_heatmap_sanity(
+            fig, ['A', 'A', 'C', 'G', 'U'], ['seq1', 'seq2'],
+            ['Minimum', 'Median', 'Maximum'])
+
+    def test_heatmap_3_seqs(self):
+        aln = Alignment([DNA('AACCCGT', id="seq1"),
+                         DNA('ACCCGGT', id="seq2"),
+                         DNA('ACCCGGT', id="seq3")])
+        value_map = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
+        fig = aln.heatmap(value_map)
+
+        self.check_heatmap_sanity(
+            fig, ['A', 'C', 'C', 'C', 'G', 'G', 'T'], ['seq1', 'seq3'],
+            ['Minimum', 'Median', 'Maximum'])
+
+    def test_heatmap_with_non_defaults(self):
+        aln = Alignment([DNA('AGTCGGT', id="seq1"),
+                         DNA('CAACGGA', id="seq2"),
+                         DNA('AACCTCT', id="seq3"),
+                         DNA('TACTCGT', id="seq4")])
+        value_map = {'A': 0.61, 'C': 1.07, 'T': 0.05, 'G': 0.07}
+        labels = ['a', 'b', 'c']
+        fig = aln.heatmap(
+            value_map, legend_labels=labels, fig_size=(42, 22), cmap='Blues',
+            sequence_order=('seq4', 'seq3', 'seq1', 'seq2'))
+
+        self.check_heatmap_sanity(fig, ['A', 'A', 'C', 'C', 'G', 'G', 'T'],
+                                  ['seq4', 'seq2'], labels)
+        self.assertEqual(fig.get_figwidth(), 42.0)
+        self.assertEqual(fig.get_figheight(), 22.0)
+
+    def test_heatmap_invalid_legend_labels(self):
+        with self.assertRaises(ValueError):
+            self.a1.heatmap({}, legend_labels=['a', 'b', 'c', 'd'])
+
+    def test_heatmap_invalid_sequence_order(self):
+        # duplicate ids
+        with self.assertRaises(ValueError):
+            self.a1.heatmap({}, sequence_order=['d1', 'd2', 'd1'])
+
+        # provided set of ids doesn't match alignment's set of ids
+        with self.assertRaises(ValueError):
+            self.a1.heatmap({}, sequence_order=['d2', 'd3', 'd0'])
+
+    def test_heatmap_missing_character_in_value_map(self):
+        with self.assertRaises(KeyError):
+            self.a1.heatmap({})
+
+    def test_alignment_to_heatmap_matrix(self):
+        aln = Alignment([DNA('ACTG', id='d1'),
+                         DNA('A.-G', id='d2'),
+                         DNA('TC-G', id='d3')])
+        value_map = defaultdict(lambda: np.nan)
+        value_map.update({'A': 42, 'C': 10.5, 'T': 22.1, 'G': -7.789,
+                          'U': -999.9, 'Z': 42000})
+        exp_min = -7.789
+        exp_median = 16.3
+        exp_max = 42
+
+        # sequence order is same as what's in the alignment
+        mtx, min_val, median_val, max_val = \
+            aln._alignment_to_heatmap_matrix(value_map, ['d1', 'd2', 'd3'])
+
+        exp_mtx = np.array([[42., 10.5, 22.1, -7.789],
+                            [42., np.nan, np.nan, -7.789],
+                            [22.1, 10.5, np.nan, -7.789]])
+        npt.assert_array_equal(mtx, exp_mtx)
+        self.assertEqual(min_val, exp_min)
+        self.assertAlmostEqual(median_val, exp_median)
+        self.assertEqual(max_val, exp_max)
+
+        # sequence order is different from what's in the alignment
+        mtx, min_val, median_val, max_val = \
+            aln._alignment_to_heatmap_matrix(value_map, ['d3', 'd1', 'd2'])
+
+        exp_mtx = np.array([[22.1, 10.5, np.nan, -7.789],
+                            [42., 10.5, 22.1, -7.789],
+                            [42., np.nan, np.nan, -7.789]])
+        npt.assert_array_equal(mtx, exp_mtx)
+        self.assertEqual(min_val, exp_min)
+        self.assertAlmostEqual(median_val, exp_median)
+        self.assertEqual(max_val, exp_max)
+
     def test_validate_lengths(self):
         self.assertTrue(self.a1._validate_lengths())
         self.assertTrue(self.a2._validate_lengths())