From 889fb44d74f99c34acb491fd041f50b60d3a960b Mon Sep 17 00:00:00 2001
From: Krasen Samardzhiev <krasensam@gmail.com>
Date: Mon, 13 May 2024 21:40:53 +0200
Subject: [PATCH] ENH: add describe_reached as a replacement of Reached (#575)

* functional reached calculations

* update pandas version in latest ubuntu image

* added test versioning

* changes based on PR

* changes based on PR

* env rollback

* pr changes and nan stat test

* typing and test versioning check

* Apply suggestions from code review

Co-authored-by: Martin Fleischmann <martin@martinfleischmann.net>

* documentation changes

---------

Co-authored-by: Martin Fleischmann <martin@martinfleischmann.net>
---
 momepy/functional/_diversity.py           | 293 ++++++++++++++++++----
 momepy/functional/tests/conftest.py       |   3 +-
 momepy/functional/tests/test_diversity.py | 245 +++++++++++++++++-
 3 files changed, 484 insertions(+), 57 deletions(-)

diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py
index 5ee80bf9..f7be66be 100644
--- a/momepy/functional/_diversity.py
+++ b/momepy/functional/_diversity.py
@@ -1,11 +1,119 @@
 import warnings
 
 import numpy as np
+import pandas as pd
 from libpysal.graph import Graph
 from numpy.typing import NDArray
+from packaging.version import Version
 from pandas import DataFrame, Series
 
-__all__ = ["describe"]
+try:
+    from numba import njit
+
+    HAS_NUMBA = True
+except (ModuleNotFoundError, ImportError):
+    HAS_NUMBA = False
+    from libpysal.common import jit as njit
+
+__all__ = ["describe", "describe_reached"]
+
+
+@njit
+def _mode(array):
+    """Custom mode function for numba."""
+    array = np.sort(array.ravel())
+    mask = np.empty(array.shape, dtype=np.bool_)
+    mask[:1] = True
+    mask[1:] = array[1:] != array[:-1]
+    unique = array[mask]
+    idx = np.nonzero(mask)[0]
+    idx = np.append(idx, mask.size)
+    counts = np.diff(idx)
+    return unique[np.argmax(counts)]
+
+
+@njit
+def _describe(values, q, include_mode=False):
+    """Helper function to calculate average."""
+    nan_tracker = np.isnan(values)
+
+    if nan_tracker.all():
+        return [
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+        ]
+
+    else:
+        values = values[~np.isnan(values)]
+
+    if len(values) > 2:
+        lower, higher = np.percentile(values, q)
+        values = values[(lower <= values) & (values <= higher)]
+
+    mean = np.mean(values)
+    n = values.shape[0]
+    if n == 1:
+        std = np.nan
+    else:
+        # for pandas compatability
+        std = np.sqrt(np.sum(np.abs(values - mean) ** 2) / (n - 1))
+
+    results = [
+        n,
+        mean,
+        np.median(values),
+        std,
+        np.min(values),
+        np.max(values),
+        np.sum(values),
+    ]
+
+    if include_mode:
+        results.append(_mode(values))
+
+    return results
+
+
+def _compute_stats(grouper, q=None, include_mode=False):
+    """
+    Fast compute of "count", "mean", "median", "std", "min", "max", "sum" statistics,
+    with optional mode and quartile limits.
+
+    Parameters
+    ----------
+    grouper : pandas.GroupBy
+        Groupby Object which specifies the aggregations to be performed.
+    q : tuple[float, float] | None, optional
+        Tuple of percentages for the percentiles to compute. Values must be between 0
+        and 100 inclusive. When set, values below and above the percentiles will be
+        discarded before computation of the average. The percentiles are computed for
+        each neighborhood. By default None.
+    include_mode : False
+        Compute mode along with other statistics. Default is False. Mode is
+        computationally expensive and not useful for continous variables.
+
+    Returns
+    -------
+    DataFrame
+    """
+    if q is None:
+        stat_ = grouper.agg(["count", "mean", "median", "std", "min", "max", "sum"])
+        if include_mode:
+            stat_["mode"] = grouper.agg(lambda x: _mode(x.values))
+    else:
+        agg = grouper.agg(lambda x: _describe(x.values, q=q, include_mode=include_mode))
+        stat_ = DataFrame(np.stack(agg.values), index=agg.index)
+        cols = ["count", "mean", "median", "std", "min", "max", "sum"]
+        if include_mode:
+            cols.append("mode")
+        stat_.columns = cols
+
+    return stat_
 
 
 def describe(
@@ -16,7 +124,7 @@ def describe(
 ) -> DataFrame:
     """Describe the distribution of values within a set neighbourhood.
 
-    Given the graph, computes the descriptive statisitcs of values within the
+    Given the graph, computes the descriptive statistics of values within the
     neighbourhood of each node. Optionally, the values can be limited to a certain
     quantile range before computing the statistics.
 
@@ -48,9 +156,8 @@ def describe(
     DataFrame
         A DataFrame with descriptive statistics.
     """
-    try:
-        from numba import njit
-    except (ModuleNotFoundError, ImportError):
+
+    if not HAS_NUMBA:
         warnings.warn(
             "The numba package is used extensively in this function to accelerate the"
             " computation of statistics but it is not installed or  cannot be imported."
@@ -58,44 +165,6 @@ def describe(
             UserWarning,
             stacklevel=2,
         )
-        from libpysal.common import jit as njit
-
-    @njit
-    def _mode(array):
-        """Custom mode function for numba."""
-        array = np.sort(array.ravel())
-        mask = np.empty(array.shape, dtype=np.bool_)
-        mask[:1] = True
-        mask[1:] = array[1:] != array[:-1]
-        unique = array[mask]
-        idx = np.nonzero(mask)[0]
-        idx = np.append(idx, mask.size)
-        counts = np.diff(idx)
-        return unique[np.argmax(counts)]
-
-    @njit
-    def _describe(values, q, include_mode=False):
-        """Helper function to calculate average."""
-        nan_tracker = np.isnan(values)
-
-        if (len(values) > 2) and (not nan_tracker.all()):
-            if nan_tracker.any():
-                lower, higher = np.nanpercentile(values, q)
-            else:
-                lower, higher = np.percentile(values, q)
-            values = values[(lower <= values) & (values <= higher)]
-
-        results = [
-            np.mean(values),
-            np.median(values),
-            np.std(values),
-            np.min(values),
-            np.max(values),
-            np.sum(values),
-        ]
-        if include_mode:
-            results.append(_mode(values))
-        return results
 
     if not isinstance(y, Series):
         y = Series(y)
@@ -104,16 +173,132 @@ def _describe(values, q, include_mode=False):
         graph._adjacency.index.codes[0]
     )
 
-    if q is None:
-        stat_ = grouper.agg(["mean", "median", "std", "min", "max", "sum"])
-        if include_mode:
-            stat_["mode"] = grouper.agg(lambda x: _mode(x.values))
+    return _compute_stats(grouper, q, include_mode)
+
+
+def describe_reached(
+    y: np.ndarray | Series,
+    graph_index: np.ndarray | Series,
+    result_index: pd.Index = None,
+    graph: Graph = None,
+    q: tuple | list | None = None,
+    include_mode: bool = False,
+) -> DataFrame:
+    """Describe the distribution of values reached on a neighbourhood graph.
+
+    Given a neighborhood graph or a grouping, computes the descriptive statistics
+    of values reached. Optionally, the values can be limited to a certain
+    quantile range before computing the statistics.
+
+    The statistics calculated are count, sum, mean, median, std.
+    Optionally, mode can be calculated, or the statistics can be calculated in
+    quantiles ``q``.
+
+    The neighbourhood is defined in ``graph``. If ``graph`` is ``None``,
+    the function will assume topological distance ``0`` (element itself)
+    and ``result_index`` is required in order to arrange the results.
+    If ``graph``, the results are arranged according to the spatial weights ordering.
+
+    Adapted from :cite:`hermosilla2012` and :cite:`feliciotti2018`.
+
+    Notes
+    -----
+    The numba package is used extensively in this function to accelerate the computation
+    of statistics. Without numba, these computations may become slow on large data.
+
+    Parameters
+    ----------
+    y : Series | numpy.array
+        A Series or numpy.array containing values to analyse.
+    graph_index : Series | numpy.array
+        The unique ID that specifies the aggregation
+        of ``y`` objects to ``graph`` groups.
+    result_index : pd.Index (default None)
+        An index that specifies how to order the results when ``graph`` is None.
+        When ``graph`` is given, the index is derived from its unique IDs.
+    graph : libpysal.graph.Graph (default None)
+        A spatial weights matrix of the element ``y`` is grouped into.
+    q : tuple[float, float] | None, optional
+        Tuple of percentages for the percentiles to compute. Values must be between 0
+        and 100 inclusive. When set, values below and above the percentiles will be
+        discarded before computation of the average. The percentiles are computed for
+        each neighborhood. By default None.
+    include_mode : False
+        Compute mode along with other statistics. Default is False. Mode is
+        computationally expensive and not useful for continous variables.
+
+    Returns
+    -------
+    DataFrame
+
+    Examples
+    --------
+    >>> res = mm.describe_reached(
+    ...         tessellation['area'], tessellation['nID'] , graph=streets_q1
+    ...     )
+    >>> streets["tessalations_reached"] = res['count']
+    >>> streets["tessalations_reached_area"] = res['sum']
+
+    """
+
+    if Version(pd.__version__) <= Version("2.1.0"):
+        raise ImportError("pandas 2.1.0 or newer is required to use this function.")
+
+    if not HAS_NUMBA:
+        warnings.warn(
+            "The numba package is used extensively in this function to accelerate the"
+            " computation of statistics but it is not installed or  cannot be imported."
+            " Without numba, these computations may become slow on large data.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    param_err = ValueError(
+        "One of result_index or graph has to be specified, but not both."
+    )
+    # case where params are none
+    if (result_index is None) and (graph is None):
+        raise param_err
+    elif result_index is None:
+        result_index = graph.unique_ids
+    elif graph is None:
+        result_index = result_index
+    # case where both params are passed
     else:
-        agg = grouper.agg(lambda x: _describe(x.values, q=q, include_mode=include_mode))
-        stat_ = DataFrame(zip(*agg, strict=True)).T
-        cols = ["mean", "median", "std", "min", "max", "sum"]
-        if include_mode:
-            cols.append("mode")
-        stat_.columns = cols
+        raise param_err
 
-    return stat_
+    # series indice needs renaming, since multiindices
+    # without explicit names cannot be joined
+    if isinstance(y, np.ndarray):
+        y = pd.Series(y, name="obs_index")
+    else:
+        y = y.rename_axis("obs_index")
+
+    if isinstance(graph_index, np.ndarray):
+        graph_index = pd.Series(graph_index, name="neighbor")
+    else:
+        graph_index = graph_index.rename("neighbor")
+
+    # aggregate data
+    if graph is None:
+        grouper = y.groupby(graph_index)
+
+    else:
+        df_multi_index = y.to_frame().set_index(graph_index, append=True).swaplevel()
+        combined_index = graph.adjacency.index.join(df_multi_index.index).dropna()
+        grouper = y.loc[combined_index.get_level_values(-1)].groupby(
+            combined_index.get_level_values(0)
+        )
+
+    stats = _compute_stats(grouper, q, include_mode)
+
+    result = pd.DataFrame(
+        np.full((result_index.shape[0], stats.shape[1]), np.nan), index=result_index
+    )
+    result.loc[stats.index.values] = stats.values
+    result.columns = stats.columns
+    # fill only counts with zeros, other stats are NA
+    result.loc[:, "count"] = result.loc[:, "count"].fillna(0)
+    result.index.names = result_index.names
+
+    return result
diff --git a/momepy/functional/tests/conftest.py b/momepy/functional/tests/conftest.py
index d8a7b62f..2f1fff4c 100644
--- a/momepy/functional/tests/conftest.py
+++ b/momepy/functional/tests/conftest.py
@@ -4,7 +4,8 @@
 
 
 def assert_result(result, expected, geometry, **kwargs):
-    """Check the expected values and types of the result."""
+    """Check the expected values and types of the result.
+    Note: ''count'' refers to the number of non-NAs in the result."""
     for key, value in expected.items():
         assert getattr(result, key)() == pytest.approx(value)
     assert isinstance(result, pd.Series)
diff --git a/momepy/functional/tests/test_diversity.py b/momepy/functional/tests/test_diversity.py
index 2fe06319..78e472c0 100644
--- a/momepy/functional/tests/test_diversity.py
+++ b/momepy/functional/tests/test_diversity.py
@@ -1,20 +1,44 @@
 import geopandas as gpd
+import numpy as np
+import pandas as pd
 import pytest
 from libpysal.graph import Graph
 from packaging.version import Version
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal, assert_series_equal
 
 import momepy as mm
 
 from .conftest import assert_result
 
 GPD_013 = Version(gpd.__version__) >= Version("0.13")
+PD_210 = Version(pd.__version__) >= Version("2.1.0")
 
 
-class TestDistribution:
+class TestDescribe:
     def setup_method(self):
         test_file_path = mm.datasets.get_path("bubenec")
         self.df_buildings = gpd.read_file(test_file_path, layer="buildings")
+        self.df_streets = gpd.read_file(test_file_path, layer="streets")
+        self.df_tessellation = gpd.read_file(test_file_path, layer="tessellation")
+        self.df_streets["nID"] = mm.unique_id(self.df_streets)
+        self.df_buildings["height"] = np.linspace(10.0, 30.0, 144)
+        self.df_tessellation["area"] = self.df_tessellation.geometry.area
+        self.df_buildings["area"] = self.df_buildings.geometry.area
+        self.df_buildings["fl_area"] = mm.FloorArea(self.df_buildings, "height").series
+        self.df_buildings["nID"] = mm.get_network_id(
+            self.df_buildings, self.df_streets, "nID"
+        )
+        blocks = mm.Blocks(
+            self.df_tessellation, self.df_streets, self.df_buildings, "bID", "uID"
+        )
+        self.blocks = blocks.blocks
+        self.df_buildings["bID"] = blocks.buildings_id
+        self.df_tessellation["bID"] = blocks.tessellation_id
+        self.graph_sw = (
+            Graph.build_contiguity(self.df_streets.set_index("nID"), rook=False)
+            .higher_order(k=2, lower_order=True)
+            .assign_self_weight()
+        )
         self.graph = Graph.build_knn(self.df_buildings.centroid, k=3)
 
     def test_describe(self):
@@ -115,3 +139,220 @@ def test_describe_array(self):
         r2 = mm.describe(area.values, self.graph)
 
         assert_frame_equal(r, r2)
+
+    @pytest.mark.skipif(
+        not PD_210, reason="aggregation is different in previous pandas versions"
+    )
+    def test_describe_reached_input(self):
+        with pytest.raises(
+            ValueError,
+            match=("One of result_index or graph has to be specified, but not both."),
+        ):
+            mm.describe_reached(self.df_buildings[["area"]], self.df_buildings["nID"])
+
+        with pytest.raises(
+            ValueError,
+            match=("One of result_index or graph has to be specified, but not both."),
+        ):
+            mm.describe_reached(
+                self.df_buildings[["area"]],
+                self.df_buildings["nID"],
+                result_index=self.df_streets.index,
+                graph=self.graph_sw,
+            )
+
+    @pytest.mark.skipif(
+        not PD_210, reason="aggregation is different in previous pandas versions"
+    )
+    def test_describe_reached(self):
+        df = mm.describe_reached(
+            self.df_buildings["area"],
+            self.df_buildings["nID"],
+            self.df_streets.index,
+        )
+
+        # not testing std, there are different implementations:
+        # OO momepy uses ddof=0, functional momepy - ddof=1
+        expected_area_sum = {
+            "min": 618.4470363735187,
+            "max": 18085.458977113314,
+            "count": 22,
+            "mean": 4765.589763157915,
+        }
+        expected_area_mean = {
+            "min": 218.962248810988,
+            "max": 1808.5458977113315,
+            "count": 22,
+            "mean": 746.7028417890866,
+        }
+        expected_area_count = {
+            "min": 0,
+            "max": 18,
+            "count": 35,
+            "mean": 4.114285714285714,
+        }
+        assert_result(df["count"], expected_area_count, self.df_streets)
+        assert_result(df["sum"], expected_area_sum, self.df_streets)
+        assert_result(df["mean"], expected_area_mean, self.df_streets)
+
+        df = mm.describe_reached(
+            self.df_buildings["fl_area"].values,
+            self.df_buildings["nID"],
+            self.df_streets.index,
+        )
+
+        expected_fl_area_sum = {
+            "min": 1894.492021221426,
+            "max": 79169.31385861782,
+            "count": 22,
+            "mean": 26494.88163951223,
+        }
+        expected_fl_area_mean = {
+            "min": 939.9069666320963,
+            "max": 7916.931385861784,
+            "count": 22,
+            "mean": 3995.8307750062318,
+        }
+        expected_fl_area_count = {
+            "min": 0,
+            "max": 18,
+            "count": 35,
+            "mean": 4.114285714285714,
+        }
+        assert_result(df["count"], expected_fl_area_count, self.df_streets)
+        assert_result(df["sum"], expected_fl_area_sum, self.df_streets)
+        assert_result(df["mean"], expected_fl_area_mean, self.df_streets)
+
+    @pytest.mark.skipif(
+        not PD_210, reason="aggregation is different in previous pandas versions"
+    )
+    def test_describe_reached_sw(self):
+        df_sw = mm.describe_reached(
+            self.df_buildings["fl_area"], self.df_buildings["nID"], graph=self.graph_sw
+        )
+
+        # not using assert_result since the method
+        # is returning an aggregation, indexed based on nID
+        assert max(df_sw["count"]) == 138
+        expected = {"min": 6, "max": 138, "count": 35, "mean": 67.8}
+        assert_result(df_sw["count"], expected, self.df_streets, check_names=False)
+
+    @pytest.mark.skipif(
+        not PD_210, reason="aggregation is different in previous pandas versions"
+    )
+    def test_describe_reached_input_equality(self):
+        island_result_df = mm.describe_reached(
+            self.df_buildings["area"], self.df_buildings["nID"], self.df_streets.index
+        )
+        island_result_series = mm.describe_reached(
+            self.df_buildings["area"], self.df_buildings["nID"], self.df_streets.index
+        )
+        island_result_ndarray = mm.describe_reached(
+            self.df_buildings["area"].values,
+            self.df_buildings["nID"].values,
+            self.df_streets.index,
+        )
+
+        assert np.allclose(
+            island_result_df.values, island_result_series.values, equal_nan=True
+        )
+        assert np.allclose(
+            island_result_df.values, island_result_ndarray.values, equal_nan=True
+        )
+
+    @pytest.mark.skipif(
+        not PD_210, reason="aggregation is different in previous pandas versions"
+    )
+    def test_na_results(self):
+        nan_areas = self.df_buildings["area"]
+        nan_areas.iloc[range(0, len(self.df_buildings), 3),] = np.nan
+
+        pandas_agg_vals = mm.describe_reached(
+            nan_areas,
+            self.df_buildings["nID"],
+            self.df_streets.index,
+        )
+
+        numba_agg_vals = mm.describe_reached(
+            nan_areas, self.df_buildings["nID"], self.df_streets.index, q=(0, 100)
+        )
+
+        assert_frame_equal(pandas_agg_vals, numba_agg_vals)
+
+
+class TestDescribeEquality:
+    def setup_method(self):
+        test_file_path = mm.datasets.get_path("bubenec")
+        self.df_buildings = gpd.read_file(test_file_path, layer="buildings")
+        self.df_streets = gpd.read_file(test_file_path, layer="streets")
+        self.df_tessellation = gpd.read_file(test_file_path, layer="tessellation")
+        self.df_streets["nID"] = mm.unique_id(self.df_streets)
+        self.df_buildings["height"] = np.linspace(10.0, 30.0, 144)
+        self.df_tessellation["area"] = self.df_tessellation.geometry.area
+        self.df_buildings["area"] = self.df_buildings.geometry.area
+        self.df_buildings["fl_area"] = mm.FloorArea(self.df_buildings, "height").series
+        self.df_buildings["nID"] = mm.get_network_id(
+            self.df_buildings, self.df_streets, "nID"
+        )
+        blocks = mm.Blocks(
+            self.df_tessellation, self.df_streets, self.df_buildings, "bID", "uID"
+        )
+        self.blocks = blocks.blocks
+        self.df_buildings["bID"] = blocks.buildings_id
+        self.df_tessellation["bID"] = blocks.tessellation_id
+        self.graph_sw = (
+            Graph.build_contiguity(self.df_streets.set_index("nID"), rook=False)
+            .higher_order(k=2, lower_order=True)
+            .assign_self_weight()
+        )
+        self.graph = Graph.build_knn(self.df_buildings.centroid, k=3)
+
+    @pytest.mark.skipif(
+        not PD_210, reason="aggregation is different in previous pandas versions"
+    )
+    def test_describe_reached_equality(self):
+        new_df = mm.describe_reached(
+            self.df_buildings["area"], self.df_buildings["nID"], self.df_streets.index
+        )
+
+        new_count = new_df["count"]
+        old_count = mm.Reached(self.df_streets, self.df_buildings, "nID", "nID").series
+        assert_series_equal(new_count, old_count, check_names=False, check_dtype=False)
+
+        new_area = new_df["sum"]
+        old_area = mm.Reached(
+            self.df_streets, self.df_buildings, "nID", "nID", mode="sum"
+        ).series
+        assert_series_equal(new_area, old_area, check_names=False, check_dtype=False)
+
+        new_area_mean = new_df["mean"]
+        old_area_mean = mm.Reached(
+            self.df_streets, self.df_buildings, "nID", "nID", mode="mean"
+        ).series
+        assert_series_equal(
+            new_area_mean, old_area_mean, check_names=False, check_dtype=False
+        )
+
+    @pytest.mark.skipif(
+        not PD_210, reason="aggregation is different in previous pandas versions"
+    )
+    def test_describe_reached_equality_sw(self):
+        new_df = mm.describe_reached(
+            self.df_buildings["fl_area"], self.df_buildings["nID"], graph=self.graph_sw
+        )
+
+        new_fl_area = new_df["sum"]
+
+        sw = mm.sw_high(k=2, gdf=self.df_streets)
+        old_fl_area = mm.Reached(
+            self.df_streets,
+            self.df_buildings,
+            "nID",
+            "nID",
+            spatial_weights=sw,
+            mode="sum",
+            values="fl_area",
+        ).series
+        assert_series_equal(
+            new_fl_area, old_fl_area, check_names=False, check_dtype=False
+        )