scverse · yugeji · Nov 28, 2023 · Nov 28, 2023 · Nov 28, 2023 · Nov 28, 2023
diff --git a/scanpy/plotting/_anndata.py b/scanpy/plotting/_anndata.py
@@ -2060,7 +2060,7 @@ def _prepare_dataframe(
         # and does not need to be given
         groupby = groupby.copy()  # copy to not modify user passed parameter
         groupby.remove(groupby_index)
-    keys = list(groupby) + list(np.unique(var_names))
+    keys = [*groupby, *np.unique(var_names)]
     obs_tidy = get.obs_df(
         adata, keys=keys, layer=layer, use_raw=use_raw, gene_symbols=gene_symbols
     )

diff --git a/scanpy/plotting/_baseplot_class.py b/scanpy/plotting/_baseplot_class.py
@@ -9,6 +9,7 @@
 from warnings import warn
 
 import numpy as np
+import pandas as pd
 from matplotlib import gridspec
 from matplotlib import pyplot as plt
 
@@ -97,6 +98,7 @@
         var_names: _VarNames | Mapping[str, _VarNames],
         groupby: str | Sequence[str],
         *,
+        groupby_cols: str | Sequence[str] = (),
         use_raw: bool | None = None,
         log: bool = False,
         num_categories: int = 7,
@@ -120,7 +122,10 @@
         self.var_group_positions = var_group_positions
         self.var_group_rotation = var_group_rotation
         self.width, self.height = figsize if figsize is not None else (None, None)
-
+        self.groupby = [groupby] if isinstance(groupby, str) else groupby
+        self.groupby_cols = (
+            [groupby_cols] if isinstance(groupby_cols, str) else groupby_cols
+        )
         self.has_var_groups = (
             True
             if var_group_positions is not None and len(var_group_positions) > 0
@@ -132,13 +137,33 @@
         self.categories, self.obs_tidy = _prepare_dataframe(
             adata,
             self.var_names,
-            groupby,
+            self.groupby,
             use_raw=use_raw,
             log=log,
             num_categories=num_categories,
             layer=layer,
             gene_symbols=gene_symbols,
         )
+        # reset obs_tidy if using groupby_cols
+        if len(self.groupby_cols) > 0:
+            if overlap := (set(self.groupby) & set(self.groupby_cols)):
+                raise ValueError(
+                    f"`groupby` and `groupby_cols` have overlapping elements: {overlap}."
+                )
+            # TODO : Check if we rather need the product of categories ?
+            self.categories_cols = adata.obs.loc[:, self.groupby_cols].nunique().sum()
+            _, self.obs_tidy = _prepare_dataframe(
+                adata,
+                self.var_names,
+                [*self.groupby, *self.groupby_cols],
+                use_raw,
+                log,
+                num_categories,
+                layer=layer,
+                gene_symbols=gene_symbols,
+            )
+        else:
+            self.categories_cols = 0
         if len(self.categories) > self.MAX_NUM_CATEGORIES:
             warn(
                 f"Over {self.MAX_NUM_CATEGORIES} categories found. "
@@ -159,7 +184,6 @@
                 return
 
         self.adata = adata
-        self.groupby = [groupby] if isinstance(groupby, str) else groupby
         self.log = log
         self.kwds = kwds
 
@@ -372,6 +396,11 @@
         _sort = True if sort is not None else False
         _ascending = True if sort == "ascending" else False
         counts_df = self.obs_tidy.index.value_counts(sort=_sort, ascending=_ascending)
+        # could remove the previous line and only use this but this is slower
+        if len(self.groupby_cols) > 0:
+            counts_df = self.adata.obs[self.groupby].value_counts(
+                sort=_sort, ascending=_ascending
+            )
 
         if _sort:
             self.categories_order = counts_df.index
@@ -586,7 +615,7 @@
         self._plot_colorbar(color_legend_ax, normalize)
         return_ax_dict["color_legend_ax"] = color_legend_ax
 
-    def _mainplot(self, ax):
+    def _mainplot(self, ax: Axes):
         y_labels = self.categories
         x_labels = self.var_names
 
@@ -655,7 +684,8 @@
         if self.height is None:
             mainplot_height = len(self.categories) * category_height
             mainplot_width = (
-                len(self.var_names) * category_width + self.group_extra_size
+                len(self.var_names) * category_width * (1 + self.categories_cols)
+                + self.group_extra_size
             )
             if self.are_axes_swapped:
                 mainplot_height, mainplot_width = mainplot_width, mainplot_height
@@ -857,6 +887,37 @@
         self.make_figure()
         plt.savefig(filename, bbox_inches=bbox_inches, **kwargs)
 
+    def _convert_tidy_to_stacked(self, values_df: pd.DataFrame) -> pd.DataFrame:
+        """\
+        Move the `groupby_cols` part of the row labels of `values_df` into columns.
+
+        The index of `values_df` is expected to hold one string per group, made of
+        the `groupby` and `groupby_cols` category values joined by `"_"`. Labels are
+        split on `"_"`, so category values that contain `"_"` themselves are not
+        supported.
+
+        Returns a frame with one row per `groupby` combination and one column per
+        (variable, `groupby_cols` combination) pair, named `<variable>_<column group>`.
+        """
+        label = values_df.index.name
+        stacked_df = values_df.reset_index()
+        stacked_df.index = pd.MultiIndex.from_tuples(
+            stacked_df[label].str.split("_").tolist(),
+            # unpack rather than `+`: either attribute may be a list or a tuple
+            names=[*self.groupby, *self.groupby_cols],
+        )
+        stacked_df = stacked_df.drop(label, axis=1).unstack(level=self.groupby_cols)
+
+        # recreate the original formatting of values_df: flat string labels
+        values_df = stacked_df.reset_index(drop=True)
+        # several remaining row levels are re-joined by "_"; a single level holds
+        # plain strings, which the empty separator leaves unchanged
+        row_sep = "_" if isinstance(stacked_df.index, pd.MultiIndex) else ""
+        values_df.index = [row_sep.join(map(str, key)) for key in stacked_df.index]
+        # column keys are (variable, *column-group values) tuples after unstacking
+        values_df.columns = ["_".join(map(str, key)) for key in stacked_df.columns]
+        return values_df
+
     def _reorder_categories_after_dendrogram(self, dendrogram) -> None:
         """\
         Function used by plotting functions that need to reorder the the groupby

diff --git a/scanpy/plotting/_docs.py b/scanpy/plotting/_docs.py
@@ -203,6 +203,8 @@
     then the `var_group_labels` and `var_group_positions` are set.
 groupby
     The key of the observation grouping to consider.
+groupby_cols
+    The key(s) of the observation grouping used to split each variable into one column per category.
 use_raw
     Use `raw` attribute of `adata` if present.
 log

diff --git a/scanpy/plotting/_dotplot.py b/scanpy/plotting/_dotplot.py
@@ -141,6 +141,7 @@
         var_names: _VarNames | Mapping[str, _VarNames],
         groupby: str | Sequence[str],
         *,
+        groupby_cols: str | Sequence[str] = (),
         use_raw: bool | None = None,
         log: bool = False,
         num_categories: int = 7,
@@ -169,6 +170,7 @@
             adata,
             var_names,
             groupby,
+            groupby_cols=groupby_cols,
             use_raw=use_raw,
             log=log,
             num_categories=num_categories,
@@ -204,6 +206,8 @@
                 obs_bool.groupby(level=0, observed=True).sum()
                 / obs_bool.groupby(level=0, observed=True).count()
             )
+            if len(groupby_cols) > 0:
+                dot_size_df = self._convert_tidy_to_stacked(dot_size_df)
 
         if dot_color_df is None:
             # 2. compute mean expression value value
@@ -227,6 +231,8 @@
                 pass
             else:
                 logg.warning("Unknown type for standard_scale, ignored")
+            if len(groupby_cols) > 0:
+                dot_color_df = self._convert_tidy_to_stacked(dot_color_df)
         else:
             # check that both matrices have the same shape
             if dot_color_df.shape != dot_size_df.shape:
@@ -568,7 +574,7 @@
             self._plot_colorbar(color_legend_ax, normalize)
             return_ax_dict["color_legend_ax"] = color_legend_ax
 
-    def _mainplot(self, ax):
+    def _mainplot(self, ax: Axes):
         # work on a copy of the dataframes. This is to avoid changes
         # on the original data frames after repetitive calls to the
         # DotPlot object, for example once with swap_axes and other without
@@ -737,7 +743,7 @@
         mean_flat = dot_color.values.flatten()
         cmap = plt.get_cmap(cmap)
         if dot_max is None:
-            dot_max = np.ceil(max(frac) * 10) / 10
+            dot_max = np.ceil(np.nanmax(frac) * 10) / 10
         else:
             if dot_max < 0 or dot_max > 1:
                 raise ValueError("`dot_max` value has to be between 0 and 1")
@@ -758,6 +764,8 @@
         # rescale size to match smallest_dot and largest_dot
         size = size * (largest_dot - smallest_dot) + smallest_dot
         normalize = check_colornorm(vmin, vmax, vcenter, norm)
+        # circumvent unexpected behavior with nan in matplotlib
+        normalize(mean_flat[~np.isnan(mean_flat)])
 
         if color_on == "square":
             if edge_color is None:
@@ -871,6 +879,7 @@
     var_names: _VarNames | Mapping[str, _VarNames],
     groupby: str | Sequence[str],
     *,
+    groupby_cols: str | Sequence[str] = (),
     use_raw: bool | None = None,
     log: bool = False,
     num_categories: int = 7,
@@ -907,6 +916,7 @@
     Makes a *dot plot* of the expression values of `var_names`.
 
     For each var_name and each `groupby` category a dot is plotted.
+    Columns can optionally be grouped by specifying `groupby_cols`.
     Each dot represents two values: mean expression within each category
     (visualized by color) and fraction of cells expressing the `var_name` in the
     category (visualized by the size of the dot). If `groupby` is not given,
@@ -1013,7 +1023,8 @@
     dp = DotPlot(
         adata,
         var_names,
-        groupby,
+        groupby=groupby,
+        groupby_cols=groupby_cols,
         use_raw=use_raw,
         log=log,
         num_categories=num_categories,

diff --git a/scanpy/plotting/_matrixplot.py b/scanpy/plotting/_matrixplot.py
@@ -122,6 +122,7 @@
         var_names: _VarNames | Mapping[str, _VarNames],
         groupby: str | Sequence[str],
         *,
+        groupby_cols: str | Sequence[str] = (),
         use_raw: bool | None = None,
         log: bool = False,
         num_categories: int = 7,
@@ -147,6 +148,7 @@
             adata,
             var_names,
             groupby,
+            groupby_cols=groupby_cols,
             use_raw=use_raw,
             log=log,
             num_categories=num_categories,
@@ -189,6 +191,9 @@
             else:
                 logg.warning("Unknown type for standard_scale, ignored")
 
+            if len(groupby_cols) > 0:
+                values_df = self._convert_tidy_to_stacked(values_df)
+
         self.values_df = values_df
 
         self.cmap = self.DEFAULT_COLORMAP
@@ -252,7 +257,7 @@
 
         return self
 
-    def _mainplot(self, ax):
+    def _mainplot(self, ax: Axes):
         # work on a copy of the dataframes. This is to avoid changes
         # on the original data frames after repetitive calls to the
         # MatrixPlot object, for example once with swap_axes and other without
@@ -339,6 +344,7 @@
     var_names: _VarNames | Mapping[str, _VarNames],
     groupby: str | Sequence[str],
     *,
+    groupby_cols: str | Sequence[str] = (),
     use_raw: bool | None = None,
     log: bool = False,
     num_categories: int = 7,
@@ -367,6 +373,7 @@
 ) -> MatrixPlot | dict[str, Axes] | None:
     """\
     Creates a heatmap of the mean expression values per group of each var_names.
+    Columns can optionally be grouped by specifying `groupby_cols`.
 
     This function provides a convenient interface to the :class:`~scanpy.pl.MatrixPlot`
     class. If you need more flexibility, you should use :class:`~scanpy.pl.MatrixPlot`
@@ -432,6 +439,7 @@
         adata,
         var_names,
         groupby=groupby,
+        groupby_cols=groupby_cols,
         use_raw=use_raw,
         log=log,
         num_categories=num_categories,