ENH: include Graph.describe() to describe neighbourhood values #717

Merged: 10 commits merged on Jun 5, 2024
Changes from 6 commits
113 changes: 113 additions & 0 deletions libpysal/graph/_utils.py
@@ -9,6 +9,15 @@
GPD_013 = Version(geopandas.__version__) >= Version("0.13")
PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")

try:
from numba import njit # noqa: E401

HAS_NUMBA = True
except ModuleNotFoundError:
from libpysal.common import jit as njit

HAS_NUMBA = False
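
When numba is missing, ``libpysal.common.jit`` stands in for ``njit``. Conceptually the fallback is a no-op decorator that returns the wrapped function unchanged; a rough sketch of that idea (the actual implementation in ``libpysal.common`` may differ):

```python
def jit(*dec_args, **dec_kwargs):
    """No-op stand-in for numba.njit: return the wrapped function unchanged."""
    # support bare use (@jit) as well as parameterised use (@jit(nopython=True))
    if len(dec_args) == 1 and callable(dec_args[0]) and not dec_kwargs:
        return dec_args[0]

    def decorator(func):
        return func

    return decorator
```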


class CoplanarError(ValueError):
"""Custom ValueError raised when coplanar points are detected."""
@@ -274,3 +283,107 @@
.reindex(ids, level=1)
.reset_index()
)


@njit
def _mode(values, index): # noqa: ARG001
"""Custom mode function for numba."""
array = np.sort(values.ravel())
mask = np.empty(array.shape, dtype=np.bool_)
mask[:1] = True
mask[1:] = array[1:] != array[:-1]
unique = array[mask]
idx = np.nonzero(mask)[0]
idx = np.append(idx, mask.size)
counts = np.diff(idx)
return unique[np.argmax(counts)]
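
The mode is found by sorting the values and run-length counting the sorted array, which keeps the function numba-compilable. A quick plain-NumPy trace of the same logic on toy values (the sample array is an assumption):

```python
import numpy as np

arr = np.sort(np.array([3, 1, 3, 2, 3, 2]))                  # [1, 2, 2, 3, 3, 3]
mask = np.empty(arr.shape, dtype=np.bool_)
mask[:1] = True
mask[1:] = arr[1:] != arr[:-1]                               # True at the start of each run
unique = arr[mask]                                           # [1, 2, 3]
counts = np.diff(np.append(np.nonzero(mask)[0], mask.size))  # run lengths: [1, 2, 3]
print(unique[np.argmax(counts)])                             # 3, the most frequent value
```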


@njit
def _limit_range(values, index, low, high): # noqa: ARG001
nan_tracker = np.isnan(values)

if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
lower, higher = np.percentile(values, (low, high))
else:
return ~nan_tracker

return (lower <= values) & (values <= higher)


def _compute_stats(grouper, to_compute: list[str] | None = None):
"""Fast compute of "count", "mean", "median", "std", "min", "max", \\
"sum", "nunique" and "mode" within a grouper object. Using numba.

Parameters
----------
grouper : pandas.GroupBy
GroupBy object over which the aggregations are performed.

[Review comment, Member] Should this be pandas.Grouper?
[Reply, Contributor Author] I think pandas.Grouper is a different type of object, used for filtering columns. I used the name "grouper" because it is used in other functions, and the type is GroupBy because pandas groupby returns a GroupBy object.

to_compute : List[str]
A list of stats functions to pass to ``groupby.agg``.

Returns
-------
DataFrame
"""

if not HAS_NUMBA:
warnings.warn(
"The numba package is used extensively in this module"
" to accelerate the computation of graphs. Without numba,"
" these computations may become unduly slow on large data.",
stacklevel=3,
)

if to_compute is None:
to_compute = [
"count",
"mean",
"median",
"std",
"min",
"max",
"sum",
"nunique",
"mode",
]
agg_to_compute = [f for f in to_compute if f != "mode"]
stat_ = grouper.agg(agg_to_compute)
if "mode" in to_compute:
if HAS_NUMBA:
stat_["mode"] = grouper.agg(_mode, engine="numba")
else:
stat_["mode"] = grouper.agg(lambda x: _mode(x.values, x.index))

return stat_
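
As an illustration of how ``_compute_stats`` is called, a minimal sketch with a toy GroupBy (the sample values and focal ids are assumptions):

```python
import numpy as np
import pandas as pd

# neighbourhood values keyed by focal node position
values = pd.Series([1.0, 2.0, 2.0, 5.0, 7.0])
focal = np.array([0, 0, 0, 1, 1])

stats = _compute_stats(values.groupby(focal), to_compute=["count", "mean", "mode"])
# one row per focal id, one column per requested statistic
```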


def _percentile_filtration_grouper(y, graph_adjacency_index, q=(25, 75)):
"""Carry out a filtration of graph neighbours \\
based on the quantiles of ``y``, specified in ``q``"""
if not HAS_NUMBA:
warnings.warn(
"The numba package is used extensively in this module"
" to accelerate the computation of graphs. Without numba,"
" these computations may become unduly slow on large data.",
stacklevel=3,
)

# need to reset the index since the numba transform has an indexing issue
grouper = (
y.take(graph_adjacency_index.codes[-1])
.reset_index(drop=True)
.groupby(graph_adjacency_index.codes[0])
)
if HAS_NUMBA:
to_keep = grouper.transform(
_limit_range, q[0], q[1], engine="numba"
).values.astype(bool)
else:
to_keep = grouper.transform(
lambda x: _limit_range(x.values, x.index, q[0], q[1])
).values.astype(bool)
filtered_grouper = y.take(graph_adjacency_index.codes[-1][to_keep]).groupby(
graph_adjacency_index.codes[0][to_keep]
)
return filtered_grouper
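
In effect, this helper trims each focal node's neighbour values to the requested percentile range and regroups what remains. A small sketch with a toy adjacency (the index and values are assumptions):

```python
import pandas as pd

# focal 0 has neighbours 1, 2, 3; focal 1 has neighbours 0 and 2
adjacency_index = pd.MultiIndex.from_arrays(
    [[0, 0, 0, 1, 1], [1, 2, 3, 0, 2]], names=["focal", "neighbor"]
)
y = pd.Series([10.0, 1.0, 5.0, 9.0], index=[0, 1, 2, 3])

grouper = _percentile_filtration_grouper(y, adjacency_index, q=(25, 75))
print(grouper.mean())  # neighbourhood means after trimming values outside the IQR
```
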
65 changes: 65 additions & 0 deletions libpysal/graph/base.py
@@ -24,8 +24,10 @@
from ._spatial_lag import _lag_spatial
from ._triangulation import _delaunay, _gabriel, _relative_neighborhood, _voronoi
from ._utils import (
_compute_stats,
_evaluate_index,
_neighbor_dict_to_edges,
_percentile_filtration_grouper,
_resolve_islands,
_sparse_to_arrays,
)
@@ -1993,6 +1995,69 @@
"""
return self._adjacency.groupby(level=0).agg(func)

def describe(
self,
y: np.typing.NDArray[np.float_] | pd.Series,
q: tuple[float, float] | None = None,
statistics: list[str] | None = None,
) -> pd.DataFrame:
"""Describe the distribution of ``y`` values within the neighbors of each node.

Given the graph, computes the descriptive statistics of values within the
neighbourhood of each node. Optionally, the values can be limited to a certain
quantile range before computing the statistics.

Notes
-----
The index of ``y`` must match the index of the graph.

Weight values do not affect the calculations, only adjacency does.

Returns nan for all isolates.

[Review comment, Member] Maybe 'nan' is OK here, but also maybe NaN or numpy.nan (or something else?) Probably not a big deal either way.
[Reply, Contributor Author] Changed to numpy.nan.


The numba package is used extensively in this function
to accelerate the computation of statistics.
Without numba, these computations may become slow on large data.

Parameters
----------
y : NDArray[np.float_] | Series
A 1D array of numeric values to be described.
q : tuple[float, float] | None, optional
Tuple of percentages for the percentiles to compute.
Values must be between 0 and 100 inclusive. When set, values below and above
the percentiles will be discarded before computation of the statistics.
The percentiles are computed for each neighborhood. By default None.
statistics : List[str] | None
A list of stats functions to compute. If None, compute all
available functions - "count", "mean", "median",
"std", "min", "max", "sum", "nunique", "mode". By default None.

Returns
-------
DataFrame
A DataFrame with descriptive statistics.
"""

if not isinstance(y, pd.Series):
y = pd.Series(y)

if q is None:
grouper = y.take(self._adjacency.index.codes[1]).groupby(
self._adjacency.index.codes[0]
)
else:
grouper = _percentile_filtration_grouper(y, self._adjacency.index, q=q)

stat_ = _compute_stats(grouper, statistics)

stat_.index = self.unique_ids
if isinstance(stat_, pd.Series):
stat_.name = None

# NA isolates
stat_.loc[self.isolates] = np.nan
return stat_
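
A brief usage sketch of the new method, mirroring the tests below (dataset and statistic choices are illustrative):

```python
import geodatasets
import geopandas as gpd
from libpysal import graph

nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
contig = graph.Graph.build_contiguity(nybb, rook=False)

# describe the distribution of polygon areas within each borough's neighbourhood
stats = contig.describe(nybb.geometry.area, statistics=["count", "mean", "sum"])

# limit each neighbourhood to its interquartile range before computing statistics
iqr_stats = contig.describe(nybb.geometry.area, q=(25, 75))
```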


def _arrange_arrays(heads, tails, weights, ids=None):
"""
65 changes: 65 additions & 0 deletions libpysal/graph/tests/test_base.py
@@ -93,6 +93,7 @@ def setup_method(self):
self.g_str_unodered = graph.Graph.from_weights_dict(self.W_dict_str_unordered)

self.nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
self.guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))

def test_init(self):
g = graph.Graph(self.adjacency_int_binary)
@@ -1129,3 +1130,67 @@ def test_aggregate(self):
contig.aggregate(lambda x: np.exp(np.sum(x))),
expected,
)

def test_describe(self):
contig = graph.Graph.build_knn(self.guerry.geometry.centroid, k=5)
y = self.guerry.geometry.area
stats = contig.describe(y)
pd.testing.assert_series_equal(
stats["count"],
contig.cardinalities,
check_names=False,
check_dtype=False,
)
pd.testing.assert_series_equal(
stats["sum"],
pd.Series(contig.lag(y), index=contig.unique_ids),
check_names=False,
)
r_contig = contig.transform("R")
pd.testing.assert_series_equal(
stats["mean"],
pd.Series(r_contig.lag(y), index=contig.unique_ids),
check_names=False,
)
# compute only some statistics
specific_stats = contig.describe(y, statistics=["count", "sum", "mean"])
# assert that only the specified statistics are computed
assert list(specific_stats.columns) == ["count", "sum", "mean"]

pd.testing.assert_frame_equal(
specific_stats[["count", "sum", "mean"]], stats[["count", "sum", "mean"]]
)

percentile_stats = contig.describe(y, q=(25, 75))

for i in contig.unique_ids:
neigh_vals = y[contig[i].index.values]
low, high = neigh_vals.describe()[["25%", "75%"]]
neigh_vals = neigh_vals[(low <= neigh_vals) & (neigh_vals <= high)]
expected = neigh_vals.describe()[["count", "mean", "std", "min", "max"]]
res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
pd.testing.assert_series_equal(res, expected, check_names=False)

# test with isolates and string index
nybb_contig = graph.Graph.build_contiguity(self.nybb, rook=False)
stats = nybb_contig.describe(
self.nybb.geometry.area, statistics=["count", "sum"]
)
# all isolate values should be nan
assert stats.loc["Staten Island"].isna().all()

# fill NA for easier comparison; NA values have already been checked above
stats = stats.fillna(0)

pd.testing.assert_series_equal(
stats["sum"],
pd.Series(nybb_contig.lag(self.nybb.geometry.area), index=self.nybb.index),
check_names=False,
)

pd.testing.assert_series_equal(
stats["count"].sort_index(),
nybb_contig.cardinalities.sort_index(),
check_dtype=False,
check_names=False,
)