From aedb687a658399bb5357237db559f28b2ae252da Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 14:22:51 +0200
Subject: [PATCH 01/10] summary statistics for neighbourhood values

---
 libpysal/graph/_utils.py          | 99 +++++++++++++++++++++++++++++++
 libpysal/graph/base.py            | 60 +++++++++++++++++++
 libpysal/graph/tests/test_base.py | 40 +++++++++++++
 3 files changed, 199 insertions(+)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index 320826cee..e62627533 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -9,6 +9,15 @@
 GPD_013 = Version(geopandas.__version__) >= Version("0.13")
 PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")
 
+try:
+    from numba import njit  # noqa: E401
+
+    HAS_NUMBA = True
+except ModuleNotFoundError:
+    from libpysal.common import jit as njit
+
+    HAS_NUMBA = False
+
 
 class CoplanarError(ValueError):
     """Custom ValueError raised when coplanar points are detected."""
@@ -274,3 +283,93 @@ def _reorder_adjtable_by_ids(adjtable, ids):
         .reindex(ids, level=1)
         .reset_index()
     )
+
+
+@njit
+def _mode(values, index):
+    """Custom mode function for numba."""
+    array = np.sort(values.ravel())
+    mask = np.empty(array.shape, dtype=np.bool_)
+    mask[:1] = True
+    mask[1:] = array[1:] != array[:-1]
+    unique = array[mask]
+    idx = np.nonzero(mask)[0]
+    idx = np.append(idx, mask.size)
+    counts = np.diff(idx)
+    return unique[np.argmax(counts)]
+
+@njit
+def _limit_range(values, index, low, high):
+    nan_tracker = np.isnan(values)
+
+    if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
+        lower, higher = np.percentile(values, (low, high))
+    else:
+        return ~nan_tracker
+
+    return (lower <= values) & (values <= higher)
+
+def _compute_stats(grouper, to_compute:list[str]|None=None):
+    """Fast compute of "count", "mean", "median", "std", "min", "max", \\
+    "sum", "nunique" and "mode" within a grouper object. Using numba.
+
+    Parameters
+    ----------
+    grouper : pandas.GroupBy
+        Groupby Object which specifies the aggregations to be performed.
+    to_compute : List[str]
+        A list of stats functions to pass to groupby.agg
+
+    Returns
+    -------
+    DataFrame
+    """
+
+    if not HAS_NUMBA:
+        warnings.warn(
+            "The numba package is used extensively in this module"
+            " to accelerate the computation of graphs. Without numba,"
+            " these computations may become unduly slow on large data.",
+            stacklevel=3,
+        )
+
+    if to_compute is None:
+        to_compute = ["count", "mean", "median",
+                      "std", "min", "max", "sum", "nunique", "mode"]
+    agg_to_compute = [f for f in to_compute if f != 'mode']
+    stat_ = grouper.agg(agg_to_compute)
+    if 'mode' in to_compute:
+        if HAS_NUMBA:
+            stat_["mode"] = grouper.agg(_mode, engine='numba')
+        else:
+            stat_["mode"] = grouper.agg(lambda x: _mode(x.values, x.index))
+
+    return stat_
+
+
+def _percentile_filtration_grouper(y, graph_adjacency_index, q=(25, 75)):
+    """Carry out a filtration of graph neighbours \\
+    based on the quantiles of ``y``, specified in ``q``"""
+    if not HAS_NUMBA:
+        warnings.warn(
+            "The numba package is used extensively in this module"
+            " to accelerate the computation of graphs. Without numba,"
+            " these computations may become unduly slow on large data.",
+            stacklevel=3,
+        )
+
+    ## need to reset since numba transform has an indexing issue
+    grouper = y.take(graph_adjacency_index.codes[-1]).reset_index(drop=True).groupby(
+        graph_adjacency_index.codes[0]
+    )
+    if HAS_NUMBA:
+        to_keep = grouper.transform(_limit_range, q[0], q[1],
+                                    engine='numba').values.astype(bool)
+    else:
+        to_keep = grouper.transform(
+            lambda x: _limit_range(x.values, x.index, q[0], q[1])
+        ).values.astype(bool)
+    filtered_grouper = y.take(graph_adjacency_index.codes[-1][to_keep]).groupby(
+        graph_adjacency_index.codes[0][to_keep]
+    )
+    return filtered_grouper
diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 03f51f547..9cac46e23 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -28,6 +28,8 @@
     _neighbor_dict_to_edges,
     _resolve_islands,
     _sparse_to_arrays,
+    _compute_stats,
+    _percentile_filtration_grouper,
 )
 from .io._gal import _read_gal, _to_gal
 from .io._gwt import _read_gwt, _to_gwt
@@ -1993,6 +1995,64 @@ def aggregate(self, func):
         """
         return self._adjacency.groupby(level=0).agg(func)
 
+    def describe(
+        self,
+        y: np.typing.NDArray[np.float_] | pd.Series,
+        q: tuple[float, float] | None = None,
+        statistics: list[str] | None = None,
+    ) -> pd.DataFrame:
+        """Describe the distribution of ``y`` values within the graph.
+
+        Given the graph, computes the descriptive statistics of values within the
+        neighbourhood of each node. Optionally, the values can be limited to a certain
+        quantile range before computing the statistics.
+
+        Notes
+        -----
+        The index of ``y`` must match the index of the graph.
+
+        Weight values do not affect the calculations, only adjacency does.
+
+        The numba package is used extensively in this function
+        to accelerate the computation of statistics.
+        Without numba, these computations may become slow on large data.
+
+        Parameters
+        ----------
+        y : NDArray[np.float_] | Series
+            A 1D array of numeric values to be described.
+        q : tuple[float, float] | None, optional
+            Tuple of percentages for the percentiles to compute.
+            Values must be between 0 and 100 inclusive. When set, values below and above
+            the percentiles will be discarded before computation of the average.
+            The percentiles are computed for each neighborhood. By default None.
+        statistics : List[str] | None
+            A list of stats functions to compute. If None, compute all
+            available functions - "count", "mean", "median",
+            "std", "min", "max", "sum", "nunique", "mode". By default None.
+
+        Returns
+        -------
+        DataFrame
+            A DataFrame with descriptive statistics.
+        """
+
+        if not isinstance(y, pd.Series):
+            y = pd.Series(y)
+
+        if q is None:
+            grouper = y.take(self._adjacency.index.codes[1]).groupby(
+                self._adjacency.index.codes[0]
+            )
+        else:
+            grouper = _percentile_filtration_grouper(y, self._adjacency.index, q=q)
+
+        stat_ = _compute_stats(grouper, statistics)
+
+        # NA isolates
+        stat_.loc[self.isolates] = np.nan
+        return stat_
+
 
 def _arrange_arrays(heads, tails, weights, ids=None):
     """
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index 2d57fd6fc..179ff9e32 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -93,6 +93,8 @@ def setup_method(self):
         self.g_str_unodered = graph.Graph.from_weights_dict(self.W_dict_str_unordered)
 
         self.nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
+        self.guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))
+
 
     def test_init(self):
         g = graph.Graph(self.adjacency_int_binary)
@@ -1129,3 +1131,41 @@ def test_aggregate(self):
             contig.aggregate(lambda x: np.exp(np.sum(x))),
             expected,
         )
+
+    def test_describe(self):
+        contig = graph.Graph.build_contiguity(
+            self.guerry, rook=False).higher_order(
+            k=3, lower_order=True).assign_self_weight()
+        y = self.guerry.geometry.area
+        stats = contig.describe(y)
+        pd.testing.assert_series_equal(stats['count'],
+                                       contig.cardinalities,
+                                       check_index_type=False,
+                                       check_names=False)
+        pd.testing.assert_series_equal(stats['sum'],
+                                       pd.Series(contig.lag(y),
+                                                 index=contig.unique_ids),
+                                       check_index_type=False,
+                                       check_names=False)
+        r_contig = contig.transform('R')
+        pd.testing.assert_series_equal(stats['mean'],
+                                       pd.Series(r_contig.lag(y),
+                                                 index=contig.unique_ids),
+                                       check_index_type=False,
+                                       check_names=False)
+        ## compute only some statistics
+        specific_stats = contig.describe(y, statistics=['count', 'sum', 'mean'])
+        pd.testing.assert_frame_equal(specific_stats[['count', 'sum', 'mean']],
+                                      stats[['count', 'sum', 'mean']])
+
+        percentile_stats = contig.describe(y, q=(25, 75))
+
+        for i in contig.unique_ids:
+            neigh_vals = y[contig[i].index.values]
+            low, high = neigh_vals.describe()[['25%', '75%']]
+            neigh_vals = neigh_vals[(low <= neigh_vals) & (neigh_vals <= high)]
+            expected = neigh_vals.describe()[['count', 'mean', 'std', 'min', 'max']]
+            res = percentile_stats.loc[i][['count', 'mean', 'std', 'min', 'max']]
+            pd.testing.assert_series_equal(res, expected, check_names=False)
+
+
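A minimal usage sketch of the `describe` method this patch introduces, mirroring the tests above (the `geoda guerry` dataset and the chosen statistics are purely illustrative):

    import geodatasets
    import geopandas as gpd

    from libpysal import graph

    guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))
    contig = graph.Graph.build_contiguity(guerry, rook=False)

    # every available statistic of the neighbourhood values of each focal geometry
    stats = contig.describe(guerry.geometry.area)

    # a subset of statistics, computed after trimming each neighbourhood
    # to its 25th-75th percentile range
    trimmed = contig.describe(
        guerry.geometry.area, q=(25, 75), statistics=["count", "mean", "std"]
    )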
From cb4b1e0cf8b5dfee608ad2daad90bd62493cc664 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 14:25:19 +0200
Subject: [PATCH 02/10] summary statistics for neighbourhood values

---
 libpysal/graph/_utils.py | 4 ++--
 libpysal/graph/base.py   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index e62627533..2710142a1 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -286,7 +286,7 @@ def _reorder_adjtable_by_ids(adjtable, ids):
 
 
 @njit
-def _mode(values, index):
+def _mode(values, index):  # noqa: ARG001
     """Custom mode function for numba."""
     array = np.sort(values.ravel())
     mask = np.empty(array.shape, dtype=np.bool_)
@@ -299,7 +299,7 @@ def _mode(values, index):  # noqa: ARG001
     return unique[np.argmax(counts)]
 
 @njit
-def _limit_range(values, index, low, high):
+def _limit_range(values, index, low, high):  # noqa: ARG001
     nan_tracker = np.isnan(values)
 
     if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 9cac46e23..d7152ab78 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -24,12 +24,12 @@
 from ._spatial_lag import _lag_spatial
 from ._triangulation import _delaunay, _gabriel, _relative_neighborhood, _voronoi
 from ._utils import (
+    _compute_stats,
     _evaluate_index,
     _neighbor_dict_to_edges,
+    _percentile_filtration_grouper,
     _resolve_islands,
     _sparse_to_arrays,
-    _compute_stats,
-    _percentile_filtration_grouper,
 )
 from .io._gal import _read_gal, _to_gal
 from .io._gwt import _read_gwt, _to_gwt
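The run-length trick inside `_mode` above is compact enough to be easy to misread, so here is the same sequence of array operations traced on a small, arbitrary input (plain NumPy, outside numba):

    import numpy as np

    values = np.array([2.0, 7.0, 7.0, 3.0, 7.0, 2.0])

    array = np.sort(values.ravel())        # [2. 2. 3. 7. 7. 7.]
    mask = np.empty(array.shape, dtype=np.bool_)
    mask[:1] = True
    mask[1:] = array[1:] != array[:-1]     # True at the first occurrence of each value
    unique = array[mask]                   # [2. 3. 7.]
    idx = np.append(np.nonzero(mask)[0], mask.size)  # run starts + sentinel: [0 2 3 6]
    counts = np.diff(idx)                  # run lengths: [2 1 3]
    print(unique[np.argmax(counts)])       # 7.0 -- the most frequent value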
From 188f02b9b5e042c4bf821014a07b5d68b4e24568 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 14:46:06 +0200
Subject: [PATCH 03/10] formatting

---
 libpysal/graph/_utils.py          | 40 ++++++++++++++-------
 libpysal/graph/tests/test_base.py | 58 +++++++++++++++++--------------
 2 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index 2710142a1..1b6fe9d36 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -298,6 +298,7 @@ def _mode(values, index):  # noqa: ARG001
     counts = np.diff(idx)
     return unique[np.argmax(counts)]
 
+
 @njit
 def _limit_range(values, index, low, high):  # noqa: ARG001
     nan_tracker = np.isnan(values)
@@ -309,7 +310,8 @@ def _limit_range(values, index, low, high):  # noqa: ARG001
 
     return (lower <= values) & (values <= higher)
 
-def _compute_stats(grouper, to_compute:list[str]|None=None):
+
+def _compute_stats(grouper, to_compute: list[str] | None = None):
     """Fast compute of "count", "mean", "median", "std", "min", "max", \\
     "sum", "nunique" and "mode" within a grouper object. Using numba.
@@ -334,13 +336,22 @@ def _compute_stats(grouper, to_compute: list[str] | None = None):
             stacklevel=3,
         )
 
     if to_compute is None:
-        to_compute = ["count", "mean", "median",
-                      "std", "min", "max", "sum", "nunique", "mode"]
+        to_compute = [
+            "count",
+            "mean",
+            "median",
+            "std",
+            "min",
+            "max",
+            "sum",
+            "nunique",
+            "mode",
+        ]
     agg_to_compute = [f for f in to_compute if f != "mode"]
     stat_ = grouper.agg(agg_to_compute)
     if "mode" in to_compute:
         if HAS_NUMBA:
             stat_["mode"] = grouper.agg(_mode, engine="numba")
         else:
             stat_["mode"] = grouper.agg(lambda x: _mode(x.values, x.index))
@@ -359,17 +370,20 @@ def _percentile_filtration_grouper(y, graph_adjacency_index, q=(25, 75)):
         )
 
     ## need to reset since numba transform has an indexing issue
-    grouper = y.take(graph_adjacency_index.codes[-1]).reset_index(drop=True).groupby(
-        graph_adjacency_index.codes[0]
+    grouper = (
+        y.take(graph_adjacency_index.codes[-1])
+        .reset_index(drop=True)
+        .groupby(graph_adjacency_index.codes[0])
     )
     if HAS_NUMBA:
-        to_keep = grouper.transform(_limit_range, q[0], q[1],
-                                    engine='numba').values.astype(bool)
+        to_keep = grouper.transform(
+            _limit_range, q[0], q[1], engine="numba"
+        ).values.astype(bool)
     else:
         to_keep = grouper.transform(
             lambda x: _limit_range(x.values, x.index, q[0], q[1])
-        ).values.astype(bool)
+        ).values.astype(bool)
     filtered_grouper = y.take(graph_adjacency_index.codes[-1][to_keep]).groupby(
-        graph_adjacency_index.codes[0][to_keep]
-    )
+        graph_adjacency_index.codes[0][to_keep]
+    )
     return filtered_grouper
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index 179ff9e32..7c47b1940 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -95,7 +95,6 @@ def setup_method(self):
         self.nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
         self.guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))
 
-
     def test_init(self):
         g = graph.Graph(self.adjacency_int_binary)
         assert isinstance(g, graph.Graph)
@@ -1133,39 +1132,44 @@ def test_aggregate(self):
         )
 
     def test_describe(self):
-        contig = graph.Graph.build_contiguity(
-            self.guerry, rook=False).higher_order(
-            k=3, lower_order=True).assign_self_weight()
+        contig = (
+            graph.Graph.build_contiguity(self.guerry, rook=False)
+            .higher_order(k=3, lower_order=True)
+            .assign_self_weight()
+        )
         y = self.guerry.geometry.area
         stats = contig.describe(y)
-        pd.testing.assert_series_equal(stats['count'],
-                                       contig.cardinalities,
-                                       check_index_type=False,
-                                       check_names=False)
-        pd.testing.assert_series_equal(stats['sum'],
-                                       pd.Series(contig.lag(y),
-                                                 index=contig.unique_ids),
-                                       check_index_type=False,
-                                       check_names=False)
-        r_contig = contig.transform('R')
-        pd.testing.assert_series_equal(stats['mean'],
-                                       pd.Series(r_contig.lag(y),
-                                                 index=contig.unique_ids),
-                                       check_index_type=False,
-                                       check_names=False)
+        pd.testing.assert_series_equal(
+            stats["count"],
+            contig.cardinalities,
+            check_index_type=False,
+            check_names=False,
+        )
+        pd.testing.assert_series_equal(
+            stats["sum"],
+            pd.Series(contig.lag(y), index=contig.unique_ids),
+            check_index_type=False,
+            check_names=False,
+        )
+        r_contig = contig.transform("R")
+        pd.testing.assert_series_equal(
+            stats["mean"],
+            pd.Series(r_contig.lag(y), index=contig.unique_ids),
+            check_index_type=False,
+            check_names=False,
+        )
         ## compute only some statistics
-        specific_stats = contig.describe(y, statistics=['count', 'sum', 'mean'])
-        pd.testing.assert_frame_equal(specific_stats[['count', 'sum', 'mean']],
-                                      stats[['count', 'sum', 'mean']])
+        specific_stats = contig.describe(y, statistics=["count", "sum", "mean"])
+        pd.testing.assert_frame_equal(
+            specific_stats[["count", "sum", "mean"]], stats[["count", "sum", "mean"]]
+        )
 
         percentile_stats = contig.describe(y, q=(25, 75))
 
         for i in contig.unique_ids:
             neigh_vals = y[contig[i].index.values]
-            low, high = neigh_vals.describe()[['25%', '75%']]
+            low, high = neigh_vals.describe()[["25%", "75%"]]
             neigh_vals = neigh_vals[(low <= neigh_vals) & (neigh_vals <= high)]
-            expected = neigh_vals.describe()[['count', 'mean', 'std', 'min', 'max']]
-            res = percentile_stats.loc[i][['count', 'mean', 'std', 'min', 'max']]
+            expected = neigh_vals.describe()[["count", "mean", "std", "min", "max"]]
+            res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
             pd.testing.assert_series_equal(res, expected, check_names=False)
-
-

From 045e7927dc8b317229064194a981cb543e97d1e5 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 15:40:49 +0200
Subject: [PATCH 04/10] Apply suggestions from code review

Co-authored-by: Martin Fleischmann
---
 libpysal/graph/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index d7152ab78..f409659ac 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2001,7 +2001,7 @@ def describe(
         q: tuple[float, float] | None = None,
         statistics: list[str] | None = None,
     ) -> pd.DataFrame:
-        """Describe the distribution of ``y`` values within the graph.
+        """Describe the distribution of ``y`` values within the neighbors of each node.
 
         Given the graph, computes the descriptive statistics of values within the
         neighbourhood of each node. Optionally, the values can be limited to a certain
         quantile range before computing the statistics.
@@ -2024,7 +2024,7 @@ def describe(
         q : tuple[float, float] | None, optional
             Tuple of percentages for the percentiles to compute.
             Values must be between 0 and 100 inclusive. When set, values below and above
-            the percentiles will be discarded before computation of the average.
+            the percentiles will be discarded before computation of the statistics.
             The percentiles are computed for each neighborhood. By default None.
         statistics : List[str] | None
             A list of stats functions to compute. If None, compute all

From b634b388557a8ba34637957eb305f0bc8c2479ea Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 16:42:42 +0200
Subject: [PATCH 05/10] extra tests

---
 libpysal/graph/base.py            |  5 +++++
 libpysal/graph/tests/test_base.py | 36 ++++++++++++++++++++-------
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index f409659ac..e6dbe68bb 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2013,6 +2013,8 @@ def describe(
 
         Weight values do not affect the calculations, only adjacency does.
 
+        Returns nan for all isolates.
+
         The numba package is used extensively in this function
         to accelerate the computation of statistics.
         Without numba, these computations may become slow on large data.
@@ -2049,6 +2051,9 @@ def describe(
 
         stat_ = _compute_stats(grouper, statistics)
 
+        stat_.index = self.unique_ids
+        if isinstance(stat_, pd.Series):
+            stat_.name = None
         # NA isolates
         stat_.loc[self.isolates] = np.nan
         return stat_
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index 7c47b1940..bf3a3b3ac 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -1132,34 +1132,30 @@ def test_aggregate(self):
         )
 
     def test_describe(self):
-        contig = (
-            graph.Graph.build_contiguity(self.guerry, rook=False)
-            .higher_order(k=3, lower_order=True)
-            .assign_self_weight()
-        )
+        contig = graph.Graph.build_knn(self.guerry.geometry.centroid, k=5)
         y = self.guerry.geometry.area
         stats = contig.describe(y)
         pd.testing.assert_series_equal(
             stats["count"],
             contig.cardinalities,
-            check_index_type=False,
             check_names=False,
         )
         pd.testing.assert_series_equal(
             stats["sum"],
             pd.Series(contig.lag(y), index=contig.unique_ids),
-            check_index_type=False,
             check_names=False,
         )
         r_contig = contig.transform("R")
         pd.testing.assert_series_equal(
             stats["mean"],
             pd.Series(r_contig.lag(y), index=contig.unique_ids),
-            check_index_type=False,
             check_names=False,
         )
         ## compute only some statistics
         specific_stats = contig.describe(y, statistics=["count", "sum", "mean"])
+        ## assert only the specified values are computed
+        assert list(specific_stats.columns) == ["count", "sum", "mean"]
+
         pd.testing.assert_frame_equal(
             specific_stats[["count", "sum", "mean"]], stats[["count", "sum", "mean"]]
         )
@@ -1173,3 +1169,27 @@ def test_describe(self):
             expected = neigh_vals.describe()[["count", "mean", "std", "min", "max"]]
             res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
             pd.testing.assert_series_equal(res, expected, check_names=False)
+
+        # test with isolates and string index
+        nybb_contig = graph.Graph.build_contiguity(self.nybb, rook=False)
+        stats = nybb_contig.describe(
+            self.nybb.geometry.area, statistics=["count", "sum"]
+        )
+        ## all isolate values should be nan
+        assert stats.loc["Staten Island"].isna().all()
+
+        # for easier comparison; NA values have already been checked above
+        stats = stats.fillna(0)
+
+        pd.testing.assert_series_equal(
+            stats["sum"],
+            pd.Series(nybb_contig.lag(self.nybb.geometry.area), index=self.nybb.index),
+            check_names=False,
+        )
+
+        pd.testing.assert_series_equal(
+            stats["count"].sort_index(),
+            nybb_contig.cardinalities.sort_index(),
+            check_dtype=False,
+            check_names=False,
+        )
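The isolate behaviour exercised by the new tests can also be seen directly. A short sketch, assuming the `nybb` dataset used in the test suite, where Staten Island has no queen-contiguity neighbours:

    import geodatasets
    import geopandas as gpd

    from libpysal import graph

    nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
    nybb_contig = graph.Graph.build_contiguity(nybb, rook=False)

    stats = nybb_contig.describe(nybb.geometry.area, statistics=["count", "sum"])
    print(stats.loc["Staten Island"])  # count NaN, sum NaN -- an isolate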
From 805e5c2630986bb41411316560d0368cfb211276 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 16:54:09 +0200
Subject: [PATCH 06/10] testing fix

---
 libpysal/graph/tests/test_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index bf3a3b3ac..7b94c66d7 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -1139,6 +1139,7 @@ def test_describe(self):
             stats["count"],
             contig.cardinalities,
             check_names=False,
+            check_dtype=False,
         )
         pd.testing.assert_series_equal(
             stats["sum"],

From 3a0ebed1eceb3866d248ab2ae2425936ca9e6fab Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 19:24:22 +0200
Subject: [PATCH 07/10] Apply suggestions from code review

Co-authored-by: James Gaboardi
---
 libpysal/graph/_utils.py | 2 +-
 libpysal/graph/base.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index 1b6fe9d36..e17d9a7a4 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -319,7 +319,7 @@ def _compute_stats(grouper, to_compute: list[str] | None = None):
     ----------
     grouper : pandas.GroupBy
         Groupby Object which specifies the aggregations to be performed.
-    to_compute : List[str]
+    to_compute : list[str]
         A list of stats functions to pass to groupby.agg
 
     Returns
     -------
diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index e6dbe68bb..62bf9994e 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2028,7 +2028,7 @@ def describe(
             Values must be between 0 and 100 inclusive. When set, values below and above
             the percentiles will be discarded before computation of the statistics.
             The percentiles are computed for each neighborhood. By default None.
-        statistics : List[str] | None
+        statistics : list[str] | None
             A list of stats functions to compute. If None, compute all
             available functions - "count", "mean", "median",
             "std", "min", "max", "sum", "nunique", "mode". By default None.

From 8f3a3007e6b5fc4066766c2418cd20edceb1f85a Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Wed, 5 Jun 2024 10:59:51 +0200
Subject: [PATCH 08/10] docstring

---
 libpysal/graph/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 62bf9994e..9aa4551a3 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2013,7 +2013,7 @@ def describe(
 
         Weight values do not affect the calculations, only adjacency does.
 
-        Returns nan for all isolates.
+        Returns numpy.nan for all isolates.
 
         The numba package is used extensively in this function
         to accelerate the computation of statistics.
From 4cfa9435f88475a9c6d86bb24d05b6c624d2da1a Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Wed, 5 Jun 2024 12:59:15 +0200
Subject: [PATCH 09/10] ndarray test

---
 libpysal/graph/base.py            |  2 +-
 libpysal/graph/tests/test_base.py | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 9aa4551a3..91c8277d6 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2040,7 +2040,7 @@ def describe(
         """
 
         if not isinstance(y, pd.Series):
-            y = pd.Series(y)
+            y = pd.Series(y, index=self.unique_ids)
 
         if q is None:
             grouper = y.take(self._adjacency.index.codes[1]).groupby(
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index 7b94c66d7..fd96c56e5 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -1194,3 +1194,17 @@ def test_describe(self):
             check_dtype=False,
             check_names=False,
         )
+
+        ## test passing ndarray
+        stats1 = nybb_contig.describe(self.nybb.geometry.area, statistics=["sum"])[
+            "sum"
+        ]
+        stats2 = nybb_contig.describe(
+            self.nybb.geometry.area.values, statistics=["sum"]
+        )["sum"]
+        pd.testing.assert_series_equal(
+            stats1,
+            stats2,
+            check_dtype=False,
+            check_names=False,
+        )

From 879f3f53629fad61daadf3184e3b7abc476089b6 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Wed, 5 Jun 2024 15:09:24 +0200
Subject: [PATCH 10/10] NA equivalence and more filtration tests

---
 libpysal/graph/_utils.py          |  2 +-
 libpysal/graph/base.py            |  3 +++
 libpysal/graph/tests/test_base.py | 13 +++++++++++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index e17d9a7a4..fe10c6739 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -304,7 +304,7 @@ def _limit_range(values, index, low, high):  # noqa: ARG001
     nan_tracker = np.isnan(values)
 
     if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
-        lower, higher = np.percentile(values, (low, high))
+        lower, higher = np.nanpercentile(values, (low, high))
     else:
         return ~nan_tracker
 
diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 91c8277d6..4c9529e76 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2042,6 +2042,9 @@ def describe(
         if not isinstance(y, pd.Series):
             y = pd.Series(y, index=self.unique_ids)
 
+        if (y.index != self.unique_ids).all():
+            raise ValueError("The values index is not aligned with the graph index.")
+
         if q is None:
             grouper = y.take(self._adjacency.index.codes[1]).groupby(
                 self._adjacency.index.codes[0]
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index fd96c56e5..100450f8f 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -1171,6 +1171,13 @@ def test_describe(self):
             res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
             pd.testing.assert_series_equal(res, expected, check_names=False)
 
+        ## test NA equivalence between filtration and pandas
+        nan_areas = y.copy()
+        nan_areas.iloc[range(0, len(y), 3),] = np.nan
+        res1 = contig.describe(nan_areas, statistics=["count"])["count"]
+        res2 = contig.describe(nan_areas, statistics=["count"], q=(0, 100))["count"]
+        pd.testing.assert_series_equal(res1, res2)
+
         # test with isolates and string index
         nybb_contig = graph.Graph.build_contiguity(self.nybb, rook=False)
         stats = nybb_contig.describe(
@@ -1208,3 +1215,9 @@ def test_describe(self):
             check_dtype=False,
             check_names=False,
         )
+
+        ## test index alignment
+        with pytest.raises(
match="The values index is not aligned with the graph index." + ): + nybb_contig.describe(self.nybb.geometry.area.reset_index(drop=True))