From 889fb44d74f99c34acb491fd041f50b60d3a960b Mon Sep 17 00:00:00 2001 From: Krasen Samardzhiev Date: Mon, 13 May 2024 21:40:53 +0200 Subject: [PATCH] ENH: add describe_reached as a replacement of Reached (#575) * functional reached calculations * update pandas version in latest ubuntu image * added test versioning * changes based on PR * changes based on PR * env rollback * pr changes and nan stat test * typing and test versioning check * Apply suggestions from code review Co-authored-by: Martin Fleischmann * documentation changes --------- Co-authored-by: Martin Fleischmann --- momepy/functional/_diversity.py | 293 ++++++++++++++++++---- momepy/functional/tests/conftest.py | 3 +- momepy/functional/tests/test_diversity.py | 245 +++++++++++++++++- 3 files changed, 484 insertions(+), 57 deletions(-) diff --git a/momepy/functional/_diversity.py b/momepy/functional/_diversity.py index 5ee80bf9..f7be66be 100644 --- a/momepy/functional/_diversity.py +++ b/momepy/functional/_diversity.py @@ -1,11 +1,119 @@ import warnings import numpy as np +import pandas as pd from libpysal.graph import Graph from numpy.typing import NDArray +from packaging.version import Version from pandas import DataFrame, Series -__all__ = ["describe"] +try: + from numba import njit + + HAS_NUMBA = True +except (ModuleNotFoundError, ImportError): + HAS_NUMBA = False + from libpysal.common import jit as njit + +__all__ = ["describe", "describe_reached"] + + +@njit +def _mode(array): + """Custom mode function for numba.""" + array = np.sort(array.ravel()) + mask = np.empty(array.shape, dtype=np.bool_) + mask[:1] = True + mask[1:] = array[1:] != array[:-1] + unique = array[mask] + idx = np.nonzero(mask)[0] + idx = np.append(idx, mask.size) + counts = np.diff(idx) + return unique[np.argmax(counts)] + + +@njit +def _describe(values, q, include_mode=False): + """Helper function to calculate average.""" + nan_tracker = np.isnan(values) + + if nan_tracker.all(): + return [ + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ] + + else: + values = values[~np.isnan(values)] + + if len(values) > 2: + lower, higher = np.percentile(values, q) + values = values[(lower <= values) & (values <= higher)] + + mean = np.mean(values) + n = values.shape[0] + if n == 1: + std = np.nan + else: + # for pandas compatability + std = np.sqrt(np.sum(np.abs(values - mean) ** 2) / (n - 1)) + + results = [ + n, + mean, + np.median(values), + std, + np.min(values), + np.max(values), + np.sum(values), + ] + + if include_mode: + results.append(_mode(values)) + + return results + + +def _compute_stats(grouper, q=None, include_mode=False): + """ + Fast compute of "count", "mean", "median", "std", "min", "max", "sum" statistics, + with optional mode and quartile limits. + + Parameters + ---------- + grouper : pandas.GroupBy + Groupby Object which specifies the aggregations to be performed. + q : tuple[float, float] | None, optional + Tuple of percentages for the percentiles to compute. Values must be between 0 + and 100 inclusive. When set, values below and above the percentiles will be + discarded before computation of the average. The percentiles are computed for + each neighborhood. By default None. + include_mode : False + Compute mode along with other statistics. Default is False. Mode is + computationally expensive and not useful for continous variables. + + Returns + ------- + DataFrame + """ + if q is None: + stat_ = grouper.agg(["count", "mean", "median", "std", "min", "max", "sum"]) + if include_mode: + stat_["mode"] = grouper.agg(lambda x: _mode(x.values)) + else: + agg = grouper.agg(lambda x: _describe(x.values, q=q, include_mode=include_mode)) + stat_ = DataFrame(np.stack(agg.values), index=agg.index) + cols = ["count", "mean", "median", "std", "min", "max", "sum"] + if include_mode: + cols.append("mode") + stat_.columns = cols + + return stat_ def describe( @@ -16,7 +124,7 @@ def describe( ) -> DataFrame: """Describe the distribution of values within a set neighbourhood. - Given the graph, computes the descriptive statisitcs of values within the + Given the graph, computes the descriptive statistics of values within the neighbourhood of each node. Optionally, the values can be limited to a certain quantile range before computing the statistics. @@ -48,9 +156,8 @@ def describe( DataFrame A DataFrame with descriptive statistics. """ - try: - from numba import njit - except (ModuleNotFoundError, ImportError): + + if not HAS_NUMBA: warnings.warn( "The numba package is used extensively in this function to accelerate the" " computation of statistics but it is not installed or cannot be imported." @@ -58,44 +165,6 @@ def describe( UserWarning, stacklevel=2, ) - from libpysal.common import jit as njit - - @njit - def _mode(array): - """Custom mode function for numba.""" - array = np.sort(array.ravel()) - mask = np.empty(array.shape, dtype=np.bool_) - mask[:1] = True - mask[1:] = array[1:] != array[:-1] - unique = array[mask] - idx = np.nonzero(mask)[0] - idx = np.append(idx, mask.size) - counts = np.diff(idx) - return unique[np.argmax(counts)] - - @njit - def _describe(values, q, include_mode=False): - """Helper function to calculate average.""" - nan_tracker = np.isnan(values) - - if (len(values) > 2) and (not nan_tracker.all()): - if nan_tracker.any(): - lower, higher = np.nanpercentile(values, q) - else: - lower, higher = np.percentile(values, q) - values = values[(lower <= values) & (values <= higher)] - - results = [ - np.mean(values), - np.median(values), - np.std(values), - np.min(values), - np.max(values), - np.sum(values), - ] - if include_mode: - results.append(_mode(values)) - return results if not isinstance(y, Series): y = Series(y) @@ -104,16 +173,132 @@ def _describe(values, q, include_mode=False): graph._adjacency.index.codes[0] ) - if q is None: - stat_ = grouper.agg(["mean", "median", "std", "min", "max", "sum"]) - if include_mode: - stat_["mode"] = grouper.agg(lambda x: _mode(x.values)) + return _compute_stats(grouper, q, include_mode) + + +def describe_reached( + y: np.ndarray | Series, + graph_index: np.ndarray | Series, + result_index: pd.Index = None, + graph: Graph = None, + q: tuple | list | None = None, + include_mode: bool = False, +) -> DataFrame: + """Describe the distribution of values reached on a neighbourhood graph. + + Given a neighborhood graph or a grouping, computes the descriptive statistics + of values reached. Optionally, the values can be limited to a certain + quantile range before computing the statistics. + + The statistics calculated are count, sum, mean, median, std. + Optionally, mode can be calculated, or the statistics can be calculated in + quantiles ``q``. + + The neighbourhood is defined in ``graph``. If ``graph`` is ``None``, + the function will assume topological distance ``0`` (element itself) + and ``result_index`` is required in order to arrange the results. + If ``graph``, the results are arranged according to the spatial weights ordering. + + Adapted from :cite:`hermosilla2012` and :cite:`feliciotti2018`. + + Notes + ----- + The numba package is used extensively in this function to accelerate the computation + of statistics. Without numba, these computations may become slow on large data. + + Parameters + ---------- + y : Series | numpy.array + A Series or numpy.array containing values to analyse. + graph_index : Series | numpy.array + The unique ID that specifies the aggregation + of ``y`` objects to ``graph`` groups. + result_index : pd.Index (default None) + An index that specifies how to order the results when ``graph`` is None. + When ``graph`` is given, the index is derived from its unique IDs. + graph : libpysal.graph.Graph (default None) + A spatial weights matrix of the element ``y`` is grouped into. + q : tuple[float, float] | None, optional + Tuple of percentages for the percentiles to compute. Values must be between 0 + and 100 inclusive. When set, values below and above the percentiles will be + discarded before computation of the average. The percentiles are computed for + each neighborhood. By default None. + include_mode : False + Compute mode along with other statistics. Default is False. Mode is + computationally expensive and not useful for continous variables. + + Returns + ------- + DataFrame + + Examples + -------- + >>> res = mm.describe_reached( + ... tessellation['area'], tessellation['nID'] , graph=streets_q1 + ... ) + >>> streets["tessalations_reached"] = res['count'] + >>> streets["tessalations_reached_area"] = res['sum'] + + """ + + if Version(pd.__version__) <= Version("2.1.0"): + raise ImportError("pandas 2.1.0 or newer is required to use this function.") + + if not HAS_NUMBA: + warnings.warn( + "The numba package is used extensively in this function to accelerate the" + " computation of statistics but it is not installed or cannot be imported." + " Without numba, these computations may become slow on large data.", + UserWarning, + stacklevel=2, + ) + + param_err = ValueError( + "One of result_index or graph has to be specified, but not both." + ) + # case where params are none + if (result_index is None) and (graph is None): + raise param_err + elif result_index is None: + result_index = graph.unique_ids + elif graph is None: + result_index = result_index + # case where both params are passed else: - agg = grouper.agg(lambda x: _describe(x.values, q=q, include_mode=include_mode)) - stat_ = DataFrame(zip(*agg, strict=True)).T - cols = ["mean", "median", "std", "min", "max", "sum"] - if include_mode: - cols.append("mode") - stat_.columns = cols + raise param_err - return stat_ + # series indice needs renaming, since multiindices + # without explicit names cannot be joined + if isinstance(y, np.ndarray): + y = pd.Series(y, name="obs_index") + else: + y = y.rename_axis("obs_index") + + if isinstance(graph_index, np.ndarray): + graph_index = pd.Series(graph_index, name="neighbor") + else: + graph_index = graph_index.rename("neighbor") + + # aggregate data + if graph is None: + grouper = y.groupby(graph_index) + + else: + df_multi_index = y.to_frame().set_index(graph_index, append=True).swaplevel() + combined_index = graph.adjacency.index.join(df_multi_index.index).dropna() + grouper = y.loc[combined_index.get_level_values(-1)].groupby( + combined_index.get_level_values(0) + ) + + stats = _compute_stats(grouper, q, include_mode) + + result = pd.DataFrame( + np.full((result_index.shape[0], stats.shape[1]), np.nan), index=result_index + ) + result.loc[stats.index.values] = stats.values + result.columns = stats.columns + # fill only counts with zeros, other stats are NA + result.loc[:, "count"] = result.loc[:, "count"].fillna(0) + result.index.names = result_index.names + + return result diff --git a/momepy/functional/tests/conftest.py b/momepy/functional/tests/conftest.py index d8a7b62f..2f1fff4c 100644 --- a/momepy/functional/tests/conftest.py +++ b/momepy/functional/tests/conftest.py @@ -4,7 +4,8 @@ def assert_result(result, expected, geometry, **kwargs): - """Check the expected values and types of the result.""" + """Check the expected values and types of the result. + Note: ''count'' refers to the number of non-NAs in the result.""" for key, value in expected.items(): assert getattr(result, key)() == pytest.approx(value) assert isinstance(result, pd.Series) diff --git a/momepy/functional/tests/test_diversity.py b/momepy/functional/tests/test_diversity.py index 2fe06319..78e472c0 100644 --- a/momepy/functional/tests/test_diversity.py +++ b/momepy/functional/tests/test_diversity.py @@ -1,20 +1,44 @@ import geopandas as gpd +import numpy as np +import pandas as pd import pytest from libpysal.graph import Graph from packaging.version import Version -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal import momepy as mm from .conftest import assert_result GPD_013 = Version(gpd.__version__) >= Version("0.13") +PD_210 = Version(pd.__version__) >= Version("2.1.0") -class TestDistribution: +class TestDescribe: def setup_method(self): test_file_path = mm.datasets.get_path("bubenec") self.df_buildings = gpd.read_file(test_file_path, layer="buildings") + self.df_streets = gpd.read_file(test_file_path, layer="streets") + self.df_tessellation = gpd.read_file(test_file_path, layer="tessellation") + self.df_streets["nID"] = mm.unique_id(self.df_streets) + self.df_buildings["height"] = np.linspace(10.0, 30.0, 144) + self.df_tessellation["area"] = self.df_tessellation.geometry.area + self.df_buildings["area"] = self.df_buildings.geometry.area + self.df_buildings["fl_area"] = mm.FloorArea(self.df_buildings, "height").series + self.df_buildings["nID"] = mm.get_network_id( + self.df_buildings, self.df_streets, "nID" + ) + blocks = mm.Blocks( + self.df_tessellation, self.df_streets, self.df_buildings, "bID", "uID" + ) + self.blocks = blocks.blocks + self.df_buildings["bID"] = blocks.buildings_id + self.df_tessellation["bID"] = blocks.tessellation_id + self.graph_sw = ( + Graph.build_contiguity(self.df_streets.set_index("nID"), rook=False) + .higher_order(k=2, lower_order=True) + .assign_self_weight() + ) self.graph = Graph.build_knn(self.df_buildings.centroid, k=3) def test_describe(self): @@ -115,3 +139,220 @@ def test_describe_array(self): r2 = mm.describe(area.values, self.graph) assert_frame_equal(r, r2) + + @pytest.mark.skipif( + not PD_210, reason="aggregation is different in previous pandas versions" + ) + def test_describe_reached_input(self): + with pytest.raises( + ValueError, + match=("One of result_index or graph has to be specified, but not both."), + ): + mm.describe_reached(self.df_buildings[["area"]], self.df_buildings["nID"]) + + with pytest.raises( + ValueError, + match=("One of result_index or graph has to be specified, but not both."), + ): + mm.describe_reached( + self.df_buildings[["area"]], + self.df_buildings["nID"], + result_index=self.df_streets.index, + graph=self.graph_sw, + ) + + @pytest.mark.skipif( + not PD_210, reason="aggregation is different in previous pandas versions" + ) + def test_describe_reached(self): + df = mm.describe_reached( + self.df_buildings["area"], + self.df_buildings["nID"], + self.df_streets.index, + ) + + # not testing std, there are different implementations: + # OO momepy uses ddof=0, functional momepy - ddof=1 + expected_area_sum = { + "min": 618.4470363735187, + "max": 18085.458977113314, + "count": 22, + "mean": 4765.589763157915, + } + expected_area_mean = { + "min": 218.962248810988, + "max": 1808.5458977113315, + "count": 22, + "mean": 746.7028417890866, + } + expected_area_count = { + "min": 0, + "max": 18, + "count": 35, + "mean": 4.114285714285714, + } + assert_result(df["count"], expected_area_count, self.df_streets) + assert_result(df["sum"], expected_area_sum, self.df_streets) + assert_result(df["mean"], expected_area_mean, self.df_streets) + + df = mm.describe_reached( + self.df_buildings["fl_area"].values, + self.df_buildings["nID"], + self.df_streets.index, + ) + + expected_fl_area_sum = { + "min": 1894.492021221426, + "max": 79169.31385861782, + "count": 22, + "mean": 26494.88163951223, + } + expected_fl_area_mean = { + "min": 939.9069666320963, + "max": 7916.931385861784, + "count": 22, + "mean": 3995.8307750062318, + } + expected_fl_area_count = { + "min": 0, + "max": 18, + "count": 35, + "mean": 4.114285714285714, + } + assert_result(df["count"], expected_fl_area_count, self.df_streets) + assert_result(df["sum"], expected_fl_area_sum, self.df_streets) + assert_result(df["mean"], expected_fl_area_mean, self.df_streets) + + @pytest.mark.skipif( + not PD_210, reason="aggregation is different in previous pandas versions" + ) + def test_describe_reached_sw(self): + df_sw = mm.describe_reached( + self.df_buildings["fl_area"], self.df_buildings["nID"], graph=self.graph_sw + ) + + # not using assert_result since the method + # is returning an aggregation, indexed based on nID + assert max(df_sw["count"]) == 138 + expected = {"min": 6, "max": 138, "count": 35, "mean": 67.8} + assert_result(df_sw["count"], expected, self.df_streets, check_names=False) + + @pytest.mark.skipif( + not PD_210, reason="aggregation is different in previous pandas versions" + ) + def test_describe_reached_input_equality(self): + island_result_df = mm.describe_reached( + self.df_buildings["area"], self.df_buildings["nID"], self.df_streets.index + ) + island_result_series = mm.describe_reached( + self.df_buildings["area"], self.df_buildings["nID"], self.df_streets.index + ) + island_result_ndarray = mm.describe_reached( + self.df_buildings["area"].values, + self.df_buildings["nID"].values, + self.df_streets.index, + ) + + assert np.allclose( + island_result_df.values, island_result_series.values, equal_nan=True + ) + assert np.allclose( + island_result_df.values, island_result_ndarray.values, equal_nan=True + ) + + @pytest.mark.skipif( + not PD_210, reason="aggregation is different in previous pandas versions" + ) + def test_na_results(self): + nan_areas = self.df_buildings["area"] + nan_areas.iloc[range(0, len(self.df_buildings), 3),] = np.nan + + pandas_agg_vals = mm.describe_reached( + nan_areas, + self.df_buildings["nID"], + self.df_streets.index, + ) + + numba_agg_vals = mm.describe_reached( + nan_areas, self.df_buildings["nID"], self.df_streets.index, q=(0, 100) + ) + + assert_frame_equal(pandas_agg_vals, numba_agg_vals) + + +class TestDescribeEquality: + def setup_method(self): + test_file_path = mm.datasets.get_path("bubenec") + self.df_buildings = gpd.read_file(test_file_path, layer="buildings") + self.df_streets = gpd.read_file(test_file_path, layer="streets") + self.df_tessellation = gpd.read_file(test_file_path, layer="tessellation") + self.df_streets["nID"] = mm.unique_id(self.df_streets) + self.df_buildings["height"] = np.linspace(10.0, 30.0, 144) + self.df_tessellation["area"] = self.df_tessellation.geometry.area + self.df_buildings["area"] = self.df_buildings.geometry.area + self.df_buildings["fl_area"] = mm.FloorArea(self.df_buildings, "height").series + self.df_buildings["nID"] = mm.get_network_id( + self.df_buildings, self.df_streets, "nID" + ) + blocks = mm.Blocks( + self.df_tessellation, self.df_streets, self.df_buildings, "bID", "uID" + ) + self.blocks = blocks.blocks + self.df_buildings["bID"] = blocks.buildings_id + self.df_tessellation["bID"] = blocks.tessellation_id + self.graph_sw = ( + Graph.build_contiguity(self.df_streets.set_index("nID"), rook=False) + .higher_order(k=2, lower_order=True) + .assign_self_weight() + ) + self.graph = Graph.build_knn(self.df_buildings.centroid, k=3) + + @pytest.mark.skipif( + not PD_210, reason="aggregation is different in previous pandas versions" + ) + def test_describe_reached_equality(self): + new_df = mm.describe_reached( + self.df_buildings["area"], self.df_buildings["nID"], self.df_streets.index + ) + + new_count = new_df["count"] + old_count = mm.Reached(self.df_streets, self.df_buildings, "nID", "nID").series + assert_series_equal(new_count, old_count, check_names=False, check_dtype=False) + + new_area = new_df["sum"] + old_area = mm.Reached( + self.df_streets, self.df_buildings, "nID", "nID", mode="sum" + ).series + assert_series_equal(new_area, old_area, check_names=False, check_dtype=False) + + new_area_mean = new_df["mean"] + old_area_mean = mm.Reached( + self.df_streets, self.df_buildings, "nID", "nID", mode="mean" + ).series + assert_series_equal( + new_area_mean, old_area_mean, check_names=False, check_dtype=False + ) + + @pytest.mark.skipif( + not PD_210, reason="aggregation is different in previous pandas versions" + ) + def test_describe_reached_equality_sw(self): + new_df = mm.describe_reached( + self.df_buildings["fl_area"], self.df_buildings["nID"], graph=self.graph_sw + ) + + new_fl_area = new_df["sum"] + + sw = mm.sw_high(k=2, gdf=self.df_streets) + old_fl_area = mm.Reached( + self.df_streets, + self.df_buildings, + "nID", + "nID", + spatial_weights=sw, + mode="sum", + values="fl_area", + ).series + assert_series_equal( + new_fl_area, old_fl_area, check_names=False, check_dtype=False + )