From aedb687a658399bb5357237db559f28b2ae252da Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 14:22:51 +0200
Subject: [PATCH 01/10] summary statistics for neighbourhood values

---
 libpysal/graph/_utils.py          | 99 +++++++++++++++++++++++++++++++
 libpysal/graph/base.py            | 60 +++++++++++++++++++
 libpysal/graph/tests/test_base.py | 40 +++++++++++++
 3 files changed, 199 insertions(+)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index 320826cee..e62627533 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -9,6 +9,15 @@
 GPD_013 = Version(geopandas.__version__) >= Version("0.13")
 PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")
 
+try:
+    from numba import njit  # noqa: E401
+
+    HAS_NUMBA = True
+except ModuleNotFoundError:
+    from libpysal.common import jit as njit
+
+    HAS_NUMBA = False
+
 
 class CoplanarError(ValueError):
     """Custom ValueError raised when coplanar points are detected."""
@@ -274,3 +283,93 @@ def _reorder_adjtable_by_ids(adjtable, ids):
         .reindex(ids, level=1)
         .reset_index()
     )
+
+
+@njit
+def _mode(values, index):
+    """Custom mode function for numba."""
+    array = np.sort(values.ravel())
+    mask = np.empty(array.shape, dtype=np.bool_)
+    mask[:1] = True
+    mask[1:] = array[1:] != array[:-1]
+    unique = array[mask]
+    idx = np.nonzero(mask)[0]
+    idx = np.append(idx, mask.size)
+    counts = np.diff(idx)
+    return unique[np.argmax(counts)]
+
+@njit
+def _limit_range(values, index, low, high):
+    nan_tracker = np.isnan(values)
+
+    if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
+        lower, higher = np.percentile(values, (low, high))
+    else:
+        return ~nan_tracker
+
+    return (lower <= values) & (values <= higher)
+
+def _compute_stats(grouper, to_compute:list[str]|None=None):
+    """Fast compute of "count", "mean", "median", "std", "min", "max", \\
+    "sum", "nunique" and "mode" within a grouper object. Using numba.
+
+    Parameters
+    ----------
+    grouper : pandas.GroupBy
+        Groupby Object which specifies the aggregations to be performed.
+    to_compute : List[str]
+        A list of stats functions to pass to groupby.agg
+
+    Returns
+    -------
+    DataFrame
+    """
+
+    if not HAS_NUMBA:
+        warnings.warn(
+            "The numba package is used extensively in this module"
+            " to accelerate the computation of graphs. Without numba,"
+            " these computations may become unduly slow on large data.",
+            stacklevel=3,
+        )
+
+    if to_compute is None:
+        to_compute = ["count", "mean", "median",
+                      "std", "min", "max", "sum", "nunique", "mode"]
+    agg_to_compute = [f for f in to_compute if f != 'mode']
+    stat_ = grouper.agg(agg_to_compute)
+    if 'mode' in to_compute:
+        if HAS_NUMBA:
+            stat_["mode"] = grouper.agg(_mode, engine='numba')
+        else:
+            stat_["mode"] = grouper.agg(lambda x: _mode(x.values, x.index))
+
+    return stat_
+
+
+def _percentile_filtration_grouper(y, graph_adjacency_index, q=(25, 75)):
+    """Carry out a filtration of graph neighbours \\
+    based on the quantiles of ``y``, specified in ``q``"""
+    if not HAS_NUMBA:
+        warnings.warn(
+            "The numba package is used extensively in this module"
+            " to accelerate the computation of graphs. Without numba,"
+            " these computations may become unduly slow on large data.",
+            stacklevel=3,
+        )
+
+    ## need to reset since numba transform has an indexing issue
+    grouper = y.take(graph_adjacency_index.codes[-1]).reset_index(drop=True).groupby(
+        graph_adjacency_index.codes[0]
+    )
+    if HAS_NUMBA:
+        to_keep = grouper.transform(_limit_range, q[0], q[1],
+                                    engine='numba').values.astype(bool)
+    else:
+        to_keep = grouper.transform(
+            lambda x: _limit_range(x.values, x.index, q[0], q[1])
+        ).values.astype(bool)
+    filtered_grouper = y.take(graph_adjacency_index.codes[-1][to_keep]).groupby(
+        graph_adjacency_index.codes[0][to_keep]
+    )
+    return filtered_grouper
diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 03f51f547..9cac46e23 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -28,6 +28,8 @@
     _neighbor_dict_to_edges,
     _resolve_islands,
     _sparse_to_arrays,
+    _compute_stats,
+    _percentile_filtration_grouper,
 )
 from .io._gal import _read_gal, _to_gal
 from .io._gwt import _read_gwt, _to_gwt
@@ -1993,6 +1995,64 @@ def aggregate(self, func):
         """
         return self._adjacency.groupby(level=0).agg(func)
 
+    def describe(
+        self,
+        y: np.typing.NDArray[np.float_] | pd.Series,
+        q: tuple[float, float] | None = None,
+        statistics: list[str] | None = None,
+    ) -> pd.DataFrame:
+        """Describe the distribution of ``y`` values within the graph.
+
+        Given the graph, computes the descriptive statistics of values within the
+        neighbourhood of each node. Optionally, the values can be limited to a certain
+        quantile range before computing the statistics.
+
+        Notes
+        -----
+        The index of ``y`` must match the index of the graph.
+
+        Weight values do not affect the calculations, only adjacency does.
+
+        The numba package is used extensively in this function
+        to accelerate the computation of statistics.
+        Without numba, these computations may become slow on large data.
+
+        Parameters
+        ----------
+        y : NDArray[np.float_] | Series
+            A 1D array of numeric values to be described.
+        q : tuple[float, float] | None, optional
+            Tuple of percentages for the percentiles to compute.
+            Values must be between 0 and 100 inclusive. When set, values below and above
+            the percentiles will be discarded before computation of the average.
+            The percentiles are computed for each neighborhood. By default None.
+        statistics : List[str] | None
+            A list of stats functions to compute. If None, compute all
+            available functions - "count", "mean", "median",
+            "std", "min", "max", "sum", "nunique", "mode". By default None.
+
+        Returns
+        -------
+        DataFrame
+            A DataFrame with descriptive statistics.
+        """
+
+        if not isinstance(y, pd.Series):
+            y = pd.Series(y)
+
+        if q is None:
+            grouper = y.take(self._adjacency.index.codes[1]).groupby(
+                self._adjacency.index.codes[0]
+            )
+        else:
+            grouper = _percentile_filtration_grouper(y, self._adjacency.index, q=q)
+
+        stat_ = _compute_stats(grouper, statistics)
+
+        # NA isolates
+        stat_.loc[self.isolates] = np.nan
+        return stat_
+
 
 def _arrange_arrays(heads, tails, weights, ids=None):
     """
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index 2d57fd6fc..179ff9e32 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -93,6 +93,8 @@ def setup_method(self):
         self.g_str_unodered = graph.Graph.from_weights_dict(self.W_dict_str_unordered)
 
         self.nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
+        self.guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))
+
 
     def test_init(self):
         g = graph.Graph(self.adjacency_int_binary)
@@ -1129,3 +1131,41 @@ def test_aggregate(self):
             contig.aggregate(lambda x: np.exp(np.sum(x))),
             expected,
         )
+
+    def test_describe(self):
+        contig = graph.Graph.build_contiguity(
+            self.guerry, rook=False).higher_order(
+            k=3, lower_order=True).assign_self_weight()
+        y = self.guerry.geometry.area
+        stats = contig.describe(y)
+        pd.testing.assert_series_equal(stats['count'],
+                                       contig.cardinalities,
+                                       check_index_type=False,
+                                       check_names=False)
+        pd.testing.assert_series_equal(stats['sum'],
+                                       pd.Series(contig.lag(y),
+                                                 index=contig.unique_ids),
+                                       check_index_type=False,
+                                       check_names=False)
+        r_contig = contig.transform('R')
+        pd.testing.assert_series_equal(stats['mean'],
+                                       pd.Series(r_contig.lag(y),
+                                                 index=contig.unique_ids),
+                                       check_index_type=False,
+                                       check_names=False)
+        ## compute only some statistics
+        specific_stats = contig.describe(y, statistics=['count', 'sum', 'mean'])
+        pd.testing.assert_frame_equal(specific_stats[['count', 'sum', 'mean']],
+                                      stats[['count', 'sum', 'mean']])
+
+        percentile_stats = contig.describe(y, q=(25, 75))
+
+        for i in contig.unique_ids:
+            neigh_vals = y[contig[i].index.values]
+            low, high = neigh_vals.describe()[['25%', '75%']]
+            neigh_vals = neigh_vals[(low <= neigh_vals) & (neigh_vals <= high)]
+            expected = neigh_vals.describe()[['count', 'mean', 'std', 'min', 'max']]
+            res = percentile_stats.loc[i][['count', 'mean', 'std', 'min', 'max']]
+            pd.testing.assert_series_equal(res, expected, check_names=False)
+
+
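A minimal usage sketch of the `describe` method this patch introduces, mirroring the tests above (the `geoda guerry` dataset and the chosen statistics are purely illustrative):

    import geodatasets
    import geopandas as gpd

    from libpysal import graph

    guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))
    contig = graph.Graph.build_contiguity(guerry, rook=False)

    # every available statistic of the neighbourhood values of each focal geometry
    stats = contig.describe(guerry.geometry.area)

    # a subset of statistics, computed after trimming each neighbourhood
    # to its 25th-75th percentile range
    trimmed = contig.describe(
        guerry.geometry.area, q=(25, 75), statistics=["count", "mean", "std"]
    )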
From cb4b1e0cf8b5dfee608ad2daad90bd62493cc664 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 14:25:19 +0200
Subject: [PATCH 02/10] summary statistics for neighbourhood values

---
 libpysal/graph/_utils.py | 4 ++--
 libpysal/graph/base.py   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index e62627533..2710142a1 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -286,7 +286,7 @@ def _reorder_adjtable_by_ids(adjtable, ids):
 
 
 @njit
-def _mode(values, index):
+def _mode(values, index):  # noqa: ARG001
     """Custom mode function for numba."""
     array = np.sort(values.ravel())
     mask = np.empty(array.shape, dtype=np.bool_)
@@ -299,7 +299,7 @@ def _mode(values, index):  # noqa: ARG001
     return unique[np.argmax(counts)]
 
 @njit
-def _limit_range(values, index, low, high):
+def _limit_range(values, index, low, high):  # noqa: ARG001
     nan_tracker = np.isnan(values)
 
     if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 9cac46e23..d7152ab78 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -24,12 +24,12 @@
 from ._spatial_lag import _lag_spatial
 from ._triangulation import _delaunay, _gabriel, _relative_neighborhood, _voronoi
 from ._utils import (
+    _compute_stats,
     _evaluate_index,
     _neighbor_dict_to_edges,
+    _percentile_filtration_grouper,
     _resolve_islands,
     _sparse_to_arrays,
-    _compute_stats,
-    _percentile_filtration_grouper,
 )
 from .io._gal import _read_gal, _to_gal
 from .io._gwt import _read_gwt, _to_gwt
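The run-length trick inside `_mode` above is compact enough to be easy to misread, so here is the same sequence of array operations traced on a small, arbitrary input (plain NumPy, outside numba):

    import numpy as np

    values = np.array([2.0, 7.0, 7.0, 3.0, 7.0, 2.0])

    array = np.sort(values.ravel())        # [2. 2. 3. 7. 7. 7.]
    mask = np.empty(array.shape, dtype=np.bool_)
    mask[:1] = True
    mask[1:] = array[1:] != array[:-1]     # True at the first occurrence of each value
    unique = array[mask]                   # [2. 3. 7.]
    idx = np.append(np.nonzero(mask)[0], mask.size)  # run starts + sentinel: [0 2 3 6]
    counts = np.diff(idx)                  # run lengths: [2 1 3]
    print(unique[np.argmax(counts)])       # 7.0 -- the most frequent value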
From 188f02b9b5e042c4bf821014a07b5d68b4e24568 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 14:46:06 +0200
Subject: [PATCH 03/10] formatting

---
 libpysal/graph/_utils.py          | 40 ++++++++++++++-------
 libpysal/graph/tests/test_base.py | 58 +++++++++++++++++--------------
 2 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index 2710142a1..1b6fe9d36 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -298,6 +298,7 @@ def _mode(values, index):  # noqa: ARG001
     counts = np.diff(idx)
     return unique[np.argmax(counts)]
 
+
 @njit
 def _limit_range(values, index, low, high):  # noqa: ARG001
     nan_tracker = np.isnan(values)
@@ -309,7 +310,8 @@ def _limit_range(values, index, low, high):  # noqa: ARG001
 
     return (lower <= values) & (values <= higher)
 
-def _compute_stats(grouper, to_compute:list[str]|None=None):
+
+def _compute_stats(grouper, to_compute: list[str] | None = None):
     """Fast compute of "count", "mean", "median", "std", "min", "max", \\
     "sum", "nunique" and "mode" within a grouper object. Using numba.
@@ -334,13 +336,22 @@ def _compute_stats(grouper, to_compute: list[str] | None = None):
             stacklevel=3,
         )
 
     if to_compute is None:
-        to_compute = ["count", "mean", "median",
-                      "std", "min", "max", "sum", "nunique", "mode"]
+        to_compute = [
+            "count",
+            "mean",
+            "median",
+            "std",
+            "min",
+            "max",
+            "sum",
+            "nunique",
+            "mode",
+        ]
     agg_to_compute = [f for f in to_compute if f != "mode"]
     stat_ = grouper.agg(agg_to_compute)
     if "mode" in to_compute:
         if HAS_NUMBA:
             stat_["mode"] = grouper.agg(_mode, engine="numba")
         else:
             stat_["mode"] = grouper.agg(lambda x: _mode(x.values, x.index))
@@ -359,17 +370,20 @@ def _percentile_filtration_grouper(y, graph_adjacency_index, q=(25, 75)):
         )
 
     ## need to reset since numba transform has an indexing issue
-    grouper = y.take(graph_adjacency_index.codes[-1]).reset_index(drop=True).groupby(
-        graph_adjacency_index.codes[0]
+    grouper = (
+        y.take(graph_adjacency_index.codes[-1])
+        .reset_index(drop=True)
+        .groupby(graph_adjacency_index.codes[0])
     )
     if HAS_NUMBA:
-        to_keep = grouper.transform(_limit_range, q[0], q[1],
-                                    engine='numba').values.astype(bool)
+        to_keep = grouper.transform(
+            _limit_range, q[0], q[1], engine="numba"
+        ).values.astype(bool)
     else:
         to_keep = grouper.transform(
             lambda x: _limit_range(x.values, x.index, q[0], q[1])
-        ).values.astype(bool)
+        ).values.astype(bool)
     filtered_grouper = y.take(graph_adjacency_index.codes[-1][to_keep]).groupby(
-        graph_adjacency_index.codes[0][to_keep]
-    )
+        graph_adjacency_index.codes[0][to_keep]
+    )
     return filtered_grouper
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index 179ff9e32..7c47b1940 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -95,7 +95,6 @@ def setup_method(self):
         self.nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
         self.guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))
 
-
     def test_init(self):
         g = graph.Graph(self.adjacency_int_binary)
         assert isinstance(g, graph.Graph)
@@ -1133,39 +1132,44 @@ def test_aggregate(self):
         )
 
     def test_describe(self):
-        contig = graph.Graph.build_contiguity(
-            self.guerry, rook=False).higher_order(
-            k=3, lower_order=True).assign_self_weight()
+        contig = (
+            graph.Graph.build_contiguity(self.guerry, rook=False)
+            .higher_order(k=3, lower_order=True)
+            .assign_self_weight()
+        )
         y = self.guerry.geometry.area
         stats = contig.describe(y)
-        pd.testing.assert_series_equal(stats['count'],
-                                       contig.cardinalities,
-                                       check_index_type=False,
-                                       check_names=False)
-        pd.testing.assert_series_equal(stats['sum'],
-                                       pd.Series(contig.lag(y),
-                                                 index=contig.unique_ids),
-                                       check_index_type=False,
-                                       check_names=False)
-        r_contig = contig.transform('R')
-        pd.testing.assert_series_equal(stats['mean'],
-                                       pd.Series(r_contig.lag(y),
-                                                 index=contig.unique_ids),
-                                       check_index_type=False,
-                                       check_names=False)
+        pd.testing.assert_series_equal(
+            stats["count"],
+            contig.cardinalities,
+            check_index_type=False,
+            check_names=False,
+        )
+        pd.testing.assert_series_equal(
+            stats["sum"],
+            pd.Series(contig.lag(y), index=contig.unique_ids),
+            check_index_type=False,
+            check_names=False,
+        )
+        r_contig = contig.transform("R")
+        pd.testing.assert_series_equal(
+            stats["mean"],
+            pd.Series(r_contig.lag(y), index=contig.unique_ids),
+            check_index_type=False,
+            check_names=False,
+        )
         ## compute only some statistics
-        specific_stats = contig.describe(y, statistics=['count', 'sum', 'mean'])
-        pd.testing.assert_frame_equal(specific_stats[['count', 'sum', 'mean']],
-                                      stats[['count', 'sum', 'mean']])
+        specific_stats = contig.describe(y, statistics=["count", "sum", "mean"])
+        pd.testing.assert_frame_equal(
+            specific_stats[["count", "sum", "mean"]], stats[["count", "sum", "mean"]]
+        )
 
         percentile_stats = contig.describe(y, q=(25, 75))
 
         for i in contig.unique_ids:
             neigh_vals = y[contig[i].index.values]
-            low, high = neigh_vals.describe()[['25%', '75%']]
+            low, high = neigh_vals.describe()[["25%", "75%"]]
             neigh_vals = neigh_vals[(low <= neigh_vals) & (neigh_vals <= high)]
-            expected = neigh_vals.describe()[['count', 'mean', 'std', 'min', 'max']]
-            res = percentile_stats.loc[i][['count', 'mean', 'std', 'min', 'max']]
+            expected = neigh_vals.describe()[["count", "mean", "std", "min", "max"]]
+            res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
             pd.testing.assert_series_equal(res, expected, check_names=False)
-
-

From 045e7927dc8b317229064194a981cb543e97d1e5 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 15:40:49 +0200
Subject: [PATCH 04/10] Apply suggestions from code review

Co-authored-by: Martin Fleischmann
---
 libpysal/graph/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index d7152ab78..f409659ac 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2001,7 +2001,7 @@ def describe(
         q: tuple[float, float] | None = None,
         statistics: list[str] | None = None,
     ) -> pd.DataFrame:
-        """Describe the distribution of ``y`` values within the graph.
+        """Describe the distribution of ``y`` values within the neighbors of each node.
 
         Given the graph, computes the descriptive statistics of values within the
         neighbourhood of each node. Optionally, the values can be limited to a certain
         quantile range before computing the statistics.
@@ -2024,7 +2024,7 @@ def describe(
         q : tuple[float, float] | None, optional
             Tuple of percentages for the percentiles to compute.
             Values must be between 0 and 100 inclusive. When set, values below and above
-            the percentiles will be discarded before computation of the average.
+            the percentiles will be discarded before computation of the statistics.
             The percentiles are computed for each neighborhood. By default None.
         statistics : List[str] | None
             A list of stats functions to compute. If None, compute all

From b634b388557a8ba34637957eb305f0bc8c2479ea Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 16:42:42 +0200
Subject: [PATCH 05/10] extra tests

---
 libpysal/graph/base.py            |  5 +++++
 libpysal/graph/tests/test_base.py | 36 ++++++++++++++++++++-------
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index f409659ac..e6dbe68bb 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2013,6 +2013,8 @@ def describe(
 
         Weight values do not affect the calculations, only adjacency does.
 
+        Returns nan for all isolates.
+
         The numba package is used extensively in this function
         to accelerate the computation of statistics.
         Without numba, these computations may become slow on large data.
@@ -2049,6 +2051,9 @@ def describe(
 
         stat_ = _compute_stats(grouper, statistics)
 
+        stat_.index = self.unique_ids
+        if isinstance(stat_, pd.Series):
+            stat_.name = None
         # NA isolates
         stat_.loc[self.isolates] = np.nan
         return stat_
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index 7c47b1940..bf3a3b3ac 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -1132,34 +1132,30 @@ def test_aggregate(self):
         )
 
     def test_describe(self):
-        contig = (
-            graph.Graph.build_contiguity(self.guerry, rook=False)
-            .higher_order(k=3, lower_order=True)
-            .assign_self_weight()
-        )
+        contig = graph.Graph.build_knn(self.guerry.geometry.centroid, k=5)
         y = self.guerry.geometry.area
         stats = contig.describe(y)
         pd.testing.assert_series_equal(
             stats["count"],
             contig.cardinalities,
-            check_index_type=False,
             check_names=False,
         )
         pd.testing.assert_series_equal(
             stats["sum"],
             pd.Series(contig.lag(y), index=contig.unique_ids),
-            check_index_type=False,
             check_names=False,
         )
         r_contig = contig.transform("R")
         pd.testing.assert_series_equal(
             stats["mean"],
             pd.Series(r_contig.lag(y), index=contig.unique_ids),
-            check_index_type=False,
             check_names=False,
         )
         ## compute only some statistics
         specific_stats = contig.describe(y, statistics=["count", "sum", "mean"])
+        ## assert only the specified values are computed
+        assert list(specific_stats.columns) == ["count", "sum", "mean"]
+
         pd.testing.assert_frame_equal(
             specific_stats[["count", "sum", "mean"]], stats[["count", "sum", "mean"]]
         )
@@ -1173,3 +1169,27 @@ def test_describe(self):
             expected = neigh_vals.describe()[["count", "mean", "std", "min", "max"]]
             res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
             pd.testing.assert_series_equal(res, expected, check_names=False)
+
+        # test with isolates and string index
+        nybb_contig = graph.Graph.build_contiguity(self.nybb, rook=False)
+        stats = nybb_contig.describe(
+            self.nybb.geometry.area, statistics=["count", "sum"]
+        )
+        ## all isolate values should be nan
+        assert stats.loc["Staten Island"].isna().all()
+
+        # for easier comparison; NA values have already been checked above
+        stats = stats.fillna(0)
+
+        pd.testing.assert_series_equal(
+            stats["sum"],
+            pd.Series(nybb_contig.lag(self.nybb.geometry.area), index=self.nybb.index),
+            check_names=False,
+        )
+
+        pd.testing.assert_series_equal(
+            stats["count"].sort_index(),
+            nybb_contig.cardinalities.sort_index(),
+            check_dtype=False,
+            check_names=False,
+        )
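The isolate behaviour exercised by the new tests can also be seen directly. A short sketch, assuming the `nybb` dataset used in the test suite, where Staten Island has no queen-contiguity neighbours:

    import geodatasets
    import geopandas as gpd

    from libpysal import graph

    nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
    nybb_contig = graph.Graph.build_contiguity(nybb, rook=False)

    stats = nybb_contig.describe(nybb.geometry.area, statistics=["count", "sum"])
    print(stats.loc["Staten Island"])  # count NaN, sum NaN -- an isolate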
From 805e5c2630986bb41411316560d0368cfb211276 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 16:54:09 +0200
Subject: [PATCH 06/10] testing fix

---
 libpysal/graph/tests/test_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index bf3a3b3ac..7b94c66d7 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -1139,6 +1139,7 @@ def test_describe(self):
             stats["count"],
             contig.cardinalities,
             check_names=False,
+            check_dtype=False,
         )
         pd.testing.assert_series_equal(
             stats["sum"],

From 3a0ebed1eceb3866d248ab2ae2425936ca9e6fab Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Tue, 4 Jun 2024 19:24:22 +0200
Subject: [PATCH 07/10] Apply suggestions from code review

Co-authored-by: James Gaboardi
---
 libpysal/graph/_utils.py | 2 +-
 libpysal/graph/base.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index 1b6fe9d36..e17d9a7a4 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -319,7 +319,7 @@ def _compute_stats(grouper, to_compute: list[str] | None = None):
     ----------
     grouper : pandas.GroupBy
         Groupby Object which specifies the aggregations to be performed.
-    to_compute : List[str]
+    to_compute : list[str]
         A list of stats functions to pass to groupby.agg
 
     Returns
     -------
diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index e6dbe68bb..62bf9994e 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2028,7 +2028,7 @@ def describe(
             Values must be between 0 and 100 inclusive. When set, values below and above
             the percentiles will be discarded before computation of the statistics.
             The percentiles are computed for each neighborhood. By default None.
-        statistics : List[str] | None
+        statistics : list[str] | None
             A list of stats functions to compute. If None, compute all
             available functions - "count", "mean", "median",
             "std", "min", "max", "sum", "nunique", "mode". By default None.

From 8f3a3007e6b5fc4066766c2418cd20edceb1f85a Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Wed, 5 Jun 2024 10:59:51 +0200
Subject: [PATCH 08/10] docstring

---
 libpysal/graph/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 62bf9994e..9aa4551a3 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2013,7 +2013,7 @@ def describe(
 
         Weight values do not affect the calculations, only adjacency does.
 
-        Returns nan for all isolates.
+        Returns numpy.nan for all isolates.
 
         The numba package is used extensively in this function
         to accelerate the computation of statistics.
From 4cfa9435f88475a9c6d86bb24d05b6c624d2da1a Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Wed, 5 Jun 2024 12:59:15 +0200
Subject: [PATCH 09/10] ndarray test

---
 libpysal/graph/base.py            |  2 +-
 libpysal/graph/tests/test_base.py | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 9aa4551a3..91c8277d6 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2040,7 +2040,7 @@ def describe(
         """
 
         if not isinstance(y, pd.Series):
-            y = pd.Series(y)
+            y = pd.Series(y, index=self.unique_ids)
 
         if q is None:
             grouper = y.take(self._adjacency.index.codes[1]).groupby(
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index 7b94c66d7..fd96c56e5 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -1194,3 +1194,17 @@ def test_describe(self):
             check_dtype=False,
             check_names=False,
         )
+
+        ## test passing ndarray
+        stats1 = nybb_contig.describe(self.nybb.geometry.area, statistics=["sum"])[
+            "sum"
+        ]
+        stats2 = nybb_contig.describe(
+            self.nybb.geometry.area.values, statistics=["sum"]
+        )["sum"]
+        pd.testing.assert_series_equal(
+            stats1,
+            stats2,
+            check_dtype=False,
+            check_names=False,
+        )

From 879f3f53629fad61daadf3184e3b7abc476089b6 Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev
Date: Wed, 5 Jun 2024 15:09:24 +0200
Subject: [PATCH 10/10] NA equivalence and more filtration tests

---
 libpysal/graph/_utils.py          |  2 +-
 libpysal/graph/base.py            |  3 +++
 libpysal/graph/tests/test_base.py | 13 +++++++++++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index e17d9a7a4..fe10c6739 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -304,7 +304,7 @@ def _limit_range(values, index, low, high):  # noqa: ARG001
     nan_tracker = np.isnan(values)
 
     if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
-        lower, higher = np.percentile(values, (low, high))
+        lower, higher = np.nanpercentile(values, (low, high))
     else:
         return ~nan_tracker
 
diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 91c8277d6..4c9529e76 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -2042,6 +2042,9 @@ def describe(
         if not isinstance(y, pd.Series):
             y = pd.Series(y, index=self.unique_ids)
 
+        if (y.index != self.unique_ids).all():
+            raise ValueError("The values index is not aligned with the graph index.")
+
         if q is None:
             grouper = y.take(self._adjacency.index.codes[1]).groupby(
                 self._adjacency.index.codes[0]
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index fd96c56e5..100450f8f 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -1171,6 +1171,13 @@ def test_describe(self):
             res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
             pd.testing.assert_series_equal(res, expected, check_names=False)
 
+        ## test NA equivalence between filtration and pandas
+        nan_areas = y.copy()
+        nan_areas.iloc[range(0, len(y), 3),] = np.nan
+        res1 = contig.describe(nan_areas, statistics=["count"])["count"]
+        res2 = contig.describe(nan_areas, statistics=["count"], q=(0, 100))["count"]
+        pd.testing.assert_series_equal(res1, res2)
+
         # test with isolates and string index
         nybb_contig = graph.Graph.build_contiguity(self.nybb, rook=False)
         stats = nybb_contig.describe(
@@ -1208,3 +1215,9 @@ def test_describe(self):
             check_dtype=False,
             check_names=False,
         )
+
+        ## test index alignment
+        with pytest.raises(
match="The values index is not aligned with the graph index." + ): + nybb_contig.describe(self.nybb.geometry.area.reset_index(drop=True))