ENH: describe as a replacement of AverageCharacter (#570)

* ENH: describe as a replacement of AverageCharacter * perf + fixup * limit_range array input * Apply suggestions from code review Co-authored-by: James Gaboardi <jgaboardi@gmail.com> * rename to y * move to diversity * move assert_result to its own file * add tests * skip mode tests on oldest * use agg * conftest --------- Co-authored-by: James Gaboardi <jgaboardi@gmail.com>
pysal · Apr 24, 2024 · cef9290 · cef9290
1 parent cf73295
commit cef9290
Show file tree

Hide file tree

Showing 10 changed files with 225 additions and 20 deletions.
diff --git a/momepy/__init__.py b/momepy/__init__.py
@@ -9,6 +9,7 @@
 from .elements import *
 from .functional._dimension import *
 from .functional._distribution import *
+from .functional._diversity import *
 from .functional._elements import *
 from .functional._shape import *
 from .graph import *

diff --git a/momepy/dimension.py b/momepy/dimension.py
@@ -445,7 +445,7 @@ def __init__(
                 values_list = data.loc[neighbours]
 
                 if rng:
-                    values_list = limit_range(values_list, rng=rng)
+                    values_list = limit_range(values_list.values, rng=rng)
                 if "mean" in mode:
                     means.append(np.mean(values_list))
                 if "median" in mode:

diff --git a/momepy/diversity.py b/momepy/diversity.py
@@ -209,7 +209,7 @@ def __init__(self, gdf, values, spatial_weights, unique_id, rng=None, verbose=Tr
                 values_list = data.loc[neighbours]
 
                 if rng:
-                    values_list = limit_range(values_list, rng=rng)
+                    values_list = limit_range(values_list.values, rng=rng)
                 results_list.append(Theil(values_list).T)
             else:
                 results_list.append(np.nan)

diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py
@@ -0,0 +1,85 @@
+import numpy as np
+from libpysal.graph import Graph
+from numpy.typing import NDArray
+from pandas import DataFrame, Series
+from scipy import stats
+
+from ..utils import limit_range
+
+__all__ = ["describe"]
+
+
+def describe(
+    y: NDArray[np.float_] | Series,
+    graph: Graph,
+    q: tuple[float, float] | None = None,
+    include_mode: bool = False,
+) -> DataFrame:
+    """Describe the distribution of values within a set neighbourhood.
+
+    Given the graph, computes the descriptive statisitcs of values within the
+    neighbourhood of each node. Optionally, the values can be limited to a certain
+    quantile range before computing the statistics.
+
+    Notes
+    -----
+    The index of ``values`` must match the index along which the ``graph`` is
+    built.
+
+    Parameters
+    ----------
+    y : NDArray[np.float_] | Series
+        An 1D array of numeric values to be described.
+    graph : libpysal.graph.Graph
+        Graph representing spatial relationships between elements.
+    q : tuple[float, float] | None, optional
+        Tuple of percentages for the percentiles to compute. Values must be between 0
+        and 100 inclusive. When set, values below and above the percentiles will be
+        discarded before computation of the average. The percentiles are computed for
+        each neighborhood. By default None.
+    include_mode : False
+        Compute mode along with other statistics. Default is False. Mode is
+        computationally expensive and not useful for continous variables.
+
+    Returns
+    -------
+    DataFrame
+        A DataFrame with descriptive statistics.
+    """
+
+    def _describe(values, q, include_mode=False):
+        """Helper function to calculate average."""
+        values = limit_range(values.values, q)
+
+        results = [
+            np.mean(values),
+            np.median(values),
+            np.std(values),
+            np.min(values),
+            np.max(values),
+            np.sum(values),
+        ]
+        if include_mode:
+            results.append(stats.mode(values, keepdims=False)[0])
+        return results
+
+    if not isinstance(y, Series):
+        y = Series(y)
+
+    grouper = y.take(graph._adjacency.index.codes[1]).groupby(
+        graph._adjacency.index.codes[0]
+    )
+
+    if q is None:
+        stat_ = grouper.agg(["mean", "median", "std", "min", "max", "sum"])
+        if include_mode:
+            stat_["mode"] = grouper.agg(lambda x: stats.mode(x, keepdims=False)[0])
+    else:
+        agg = grouper.agg(_describe, q=q, include_mode=include_mode)
+        stat_ = DataFrame(zip(*agg, strict=True)).T
+        cols = ["mean", "median", "std", "min", "max", "sum"]
+        if include_mode:
+            cols.append("mode")
+        stat_.columns = cols
+
+    return stat_
diff --git a/momepy/functional/tests/conftest.py b/momepy/functional/tests/conftest.py
@@ -0,0 +1,11 @@
+import pandas as pd
+import pytest
+from pandas.testing import assert_index_equal
+
+
+def assert_result(result, expected, geometry, **kwargs):
+    """Check the expected values and types of the result."""
+    for key, value in expected.items():
+        assert getattr(result, key)() == pytest.approx(value)
+    assert isinstance(result, pd.Series)
+    assert_index_equal(result.index, geometry.index, **kwargs)
diff --git a/momepy/functional/tests/test_distribution.py b/momepy/functional/tests/test_distribution.py
@@ -4,7 +4,7 @@
 
 import momepy as mm
 
-from .test_shape import assert_result
+from .conftest import assert_result
 
 
 class TestDistribution:

diff --git a/momepy/functional/tests/test_diversity.py b/momepy/functional/tests/test_diversity.py
@@ -0,0 +1,117 @@
+import geopandas as gpd
+import pytest
+from libpysal.graph import Graph
+from packaging.version import Version
+from pandas.testing import assert_frame_equal
+
+import momepy as mm
+
+from .conftest import assert_result
+
+GPD_013 = Version(gpd.__version__) >= Version("0.13")
+
+
+class TestDistribution:
+    def setup_method(self):
+        test_file_path = mm.datasets.get_path("bubenec")
+        self.df_buildings = gpd.read_file(test_file_path, layer="buildings")
+        self.graph = Graph.build_knn(self.df_buildings.centroid, k=3)
+
+    def test_describe(self):
+        area = self.df_buildings.area
+        r = mm.describe(area, self.graph)
+
+        expected_mean = {
+            "mean": 587.3761020554495,
+            "sum": 84582.15869598472,
+            "min": 50.44045729583316,
+            "max": 1187.2662413659234,
+        }
+        assert_result(r["mean"], expected_mean, self.df_buildings, exact=False)
+
+        expected_median = {
+            "mean": 577.4640489818667,
+            "sum": 83154.8230533888,
+            "min": 50.43336175017242,
+            "max": 1225.8094201694726,
+        }
+        assert_result(r["median"], expected_median, self.df_buildings, exact=False)
+
+        expected_std = {
+            "mean": 255.59307136480083,
+            "sum": 36805.40227653132,
+            "min": 0.05050450812944085,
+            "max": 1092.484902679786,
+        }
+        assert_result(r["std"], expected_std, self.df_buildings, exact=False)
+
+        expected_min = {
+            "mean": 349.53354434499295,
+            "sum": 50332.830385678986,
+            "min": 50.39387578315866,
+            "max": 761.0313042971973,
+        }
+        assert_result(r["min"], expected_min, self.df_buildings, exact=False)
+
+        expected_max = {
+            "mean": 835.1307128394886,
+            "sum": 120258.82264888636,
+            "min": 50.49413435416841,
+            "max": 2127.7522277389035,
+        }
+        assert_result(r["max"], expected_max, self.df_buildings, exact=False)
+
+        expected_sum = {
+            "mean": 1762.128306166348,
+            "sum": 253746.47608795413,
+            "min": 151.32137188749948,
+            "max": 3561.79872409777,
+        }
+        assert_result(r["sum"], expected_sum, self.df_buildings, exact=False)
+
+    def test_describe_quantile(self):
+        graph = Graph.build_knn(self.df_buildings.centroid, k=15)
+        area = self.df_buildings.area
+        r = mm.describe(area, graph, q=(25, 75))
+
+        expected_mean = {
+            "mean": 601.6960154385389,
+            "sum": 86644.2262231496,
+            "min": 250.25984637364323,
+            "max": 901.0028506943196,
+        }
+        assert_result(r["mean"], expected_mean, self.df_buildings, exact=False)
+
+    @pytest.mark.skipif(not GPD_013, reason="get_coordinates() not available")
+    def test_describe_mode(self):
+        corners = mm.corners(self.df_buildings)
+        r = mm.describe(corners, self.graph, include_mode=True)
+
+        expected = {
+            "mean": 6.152777777777778,
+            "sum": 886,
+            "min": 4,
+            "max": 17,
+        }
+        assert_result(r["mode"], expected, self.df_buildings, exact=False)
+
+    @pytest.mark.skipif(not GPD_013, reason="get_coordinates() not available")
+    def test_describe_quantile_mode(self):
+        graph = Graph.build_knn(self.df_buildings.centroid, k=15)
+        corners = mm.corners(self.df_buildings)
+        r = mm.describe(corners, graph, q=(25, 75), include_mode=True)
+
+        expected = {
+            "mean": 6.958333333333333,
+            "sum": 1002.0,
+            "min": 4.0,
+            "max": 12,
+        }
+        assert_result(r["mode"], expected, self.df_buildings, exact=False)
+
+    def test_describe_array(self):
+        area = self.df_buildings.area
+        r = mm.describe(area, self.graph)
+        r2 = mm.describe(area.values, self.graph)
+
+        assert_frame_equal(r, r2)
diff --git a/momepy/functional/tests/test_shape.py b/momepy/functional/tests/test_shape.py
@@ -3,19 +3,13 @@
 import pandas as pd
 import pytest
 from packaging.version import Version
-from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
+from pandas.testing import assert_frame_equal, assert_series_equal
 
 import momepy as mm
 
-GPD_013 = Version(gpd.__version__) >= Version("0.13")
-
+from .conftest import assert_result
 
-def assert_result(result, expected, geometry, **kwargs):
-    """Check the expected values and types of the result."""
-    for key, value in expected.items():
-        assert getattr(result, key)() == pytest.approx(value)
-    assert isinstance(result, pd.Series)
-    assert_index_equal(result.index, geometry.index, **kwargs)
+GPD_013 = Version(gpd.__version__) >= Version("0.13")
 
 
 class TestShape:

diff --git a/momepy/tests/test_utils.py b/momepy/tests/test_utils.py
@@ -196,9 +196,9 @@ def test_nx_to_gdf_osmnx(self):
         assert len(lines) == 16
 
     def test_limit_range(self):
-        assert list(mm.limit_range(range(10), rng=(25, 75))) == [2, 3, 4, 5, 6, 7]
-        assert list(mm.limit_range(range(10), rng=(10, 90))) == [1, 2, 3, 4, 5, 6, 7, 8]
-        assert list(mm.limit_range([0, 1], rng=(25, 75))) == [0, 1]
+        assert list(mm.limit_range(np.arange(10), rng=(25, 75))) == [2, 3, 4, 5, 6, 7]
+        assert list(mm.limit_range(np.arange(10), rng=(10, 90))) == [1, 2, 3, 4, 5, 6, 7, 8]
+        assert list(mm.limit_range(np.array([0, 1]), rng=(25, 75))) == [0, 1]
         assert list(
             mm.limit_range(np.array([0, 1, 2, 3, 4, np.nan]), rng=(25, 75))
         ) == [1, 2, 3]

diff --git a/momepy/utils.py b/momepy/utils.py
@@ -469,7 +469,6 @@ def limit_range(vals, rng):
         The limited array.
     """
 
-    vals = np.asarray(vals)
     nan_tracker = np.isnan(vals)
 
     if (len(vals) > 2) and (not nan_tracker.all()):
@@ -479,11 +478,9 @@ def limit_range(vals, rng):
             method = {"interpolation": "nearest"}
         rng = sorted(rng)
         if nan_tracker.any():
-            lower = np.nanpercentile(vals, rng[0], **method)
-            higher = np.nanpercentile(vals, rng[1], **method)
+            lower, higher = np.nanpercentile(vals, rng, **method)
         else:
-            lower = np.percentile(vals, rng[0], **method)
-            higher = np.percentile(vals, rng[1], **method)
+            lower, higher = np.percentile(vals, rng, **method)
         vals = vals[(lower <= vals) & (vals <= higher)]
 
     return vals