Skip to content

Commit

Permalink
ENH: describe as a replacement of AverageCharacter (#570)
Browse files Browse the repository at this point in the history
* ENH: describe as a replacement of AverageCharacter

* perf + fixup

* limit_range array input

* Apply suggestions from code review

Co-authored-by: James Gaboardi <jgaboardi@gmail.com>

* rename to y

* move to diversity

* move assert_result to its own file

* add tests

* skip mode tests on oldest

* use agg

* conftest

---------

Co-authored-by: James Gaboardi <jgaboardi@gmail.com>
  • Loading branch information
martinfleis and jGaboardi committed Apr 24, 2024
1 parent cf73295 commit cef9290
Show file tree
Hide file tree
Showing 10 changed files with 225 additions and 20 deletions.
1 change: 1 addition & 0 deletions momepy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .elements import *
from .functional._dimension import *
from .functional._distribution import *
from .functional._diversity import *
from .functional._elements import *
from .functional._shape import *
from .graph import *
Expand Down
2 changes: 1 addition & 1 deletion momepy/dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def __init__(
values_list = data.loc[neighbours]

if rng:
values_list = limit_range(values_list, rng=rng)
values_list = limit_range(values_list.values, rng=rng)
if "mean" in mode:
means.append(np.mean(values_list))
if "median" in mode:
Expand Down
2 changes: 1 addition & 1 deletion momepy/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def __init__(self, gdf, values, spatial_weights, unique_id, rng=None, verbose=Tr
values_list = data.loc[neighbours]

if rng:
values_list = limit_range(values_list, rng=rng)
values_list = limit_range(values_list.values, rng=rng)
results_list.append(Theil(values_list).T)
else:
results_list.append(np.nan)
Expand Down
85 changes: 85 additions & 0 deletions momepy/functional/_diversity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import numpy as np
from libpysal.graph import Graph
from numpy.typing import NDArray
from pandas import DataFrame, Series
from scipy import stats

from ..utils import limit_range

__all__ = ["describe"]


def describe(
y: NDArray[np.float_] | Series,
graph: Graph,
q: tuple[float, float] | None = None,
include_mode: bool = False,
) -> DataFrame:
"""Describe the distribution of values within a set neighbourhood.
Given the graph, computes the descriptive statisitcs of values within the
neighbourhood of each node. Optionally, the values can be limited to a certain
quantile range before computing the statistics.
Notes
-----
The index of ``values`` must match the index along which the ``graph`` is
built.
Parameters
----------
y : NDArray[np.float_] | Series
An 1D array of numeric values to be described.
graph : libpysal.graph.Graph
Graph representing spatial relationships between elements.
q : tuple[float, float] | None, optional
Tuple of percentages for the percentiles to compute. Values must be between 0
and 100 inclusive. When set, values below and above the percentiles will be
discarded before computation of the average. The percentiles are computed for
each neighborhood. By default None.
include_mode : False
Compute mode along with other statistics. Default is False. Mode is
computationally expensive and not useful for continous variables.
Returns
-------
DataFrame
A DataFrame with descriptive statistics.
"""

def _describe(values, q, include_mode=False):
"""Helper function to calculate average."""
values = limit_range(values.values, q)

results = [
np.mean(values),
np.median(values),
np.std(values),
np.min(values),
np.max(values),
np.sum(values),
]
if include_mode:
results.append(stats.mode(values, keepdims=False)[0])
return results

if not isinstance(y, Series):
y = Series(y)

grouper = y.take(graph._adjacency.index.codes[1]).groupby(
graph._adjacency.index.codes[0]
)

if q is None:
stat_ = grouper.agg(["mean", "median", "std", "min", "max", "sum"])
if include_mode:
stat_["mode"] = grouper.agg(lambda x: stats.mode(x, keepdims=False)[0])
else:
agg = grouper.agg(_describe, q=q, include_mode=include_mode)
stat_ = DataFrame(zip(*agg, strict=True)).T
cols = ["mean", "median", "std", "min", "max", "sum"]
if include_mode:
cols.append("mode")
stat_.columns = cols

return stat_
11 changes: 11 additions & 0 deletions momepy/functional/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pandas as pd
import pytest
from pandas.testing import assert_index_equal


def assert_result(result, expected, geometry, **kwargs):
"""Check the expected values and types of the result."""
for key, value in expected.items():
assert getattr(result, key)() == pytest.approx(value)
assert isinstance(result, pd.Series)
assert_index_equal(result.index, geometry.index, **kwargs)
2 changes: 1 addition & 1 deletion momepy/functional/tests/test_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import momepy as mm

from .test_shape import assert_result
from .conftest import assert_result


class TestDistribution:
Expand Down
117 changes: 117 additions & 0 deletions momepy/functional/tests/test_diversity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import geopandas as gpd
import pytest
from libpysal.graph import Graph
from packaging.version import Version
from pandas.testing import assert_frame_equal

import momepy as mm

from .conftest import assert_result

GPD_013 = Version(gpd.__version__) >= Version("0.13")


class TestDistribution:
def setup_method(self):
test_file_path = mm.datasets.get_path("bubenec")
self.df_buildings = gpd.read_file(test_file_path, layer="buildings")
self.graph = Graph.build_knn(self.df_buildings.centroid, k=3)

def test_describe(self):
area = self.df_buildings.area
r = mm.describe(area, self.graph)

expected_mean = {
"mean": 587.3761020554495,
"sum": 84582.15869598472,
"min": 50.44045729583316,
"max": 1187.2662413659234,
}
assert_result(r["mean"], expected_mean, self.df_buildings, exact=False)

expected_median = {
"mean": 577.4640489818667,
"sum": 83154.8230533888,
"min": 50.43336175017242,
"max": 1225.8094201694726,
}
assert_result(r["median"], expected_median, self.df_buildings, exact=False)

expected_std = {
"mean": 255.59307136480083,
"sum": 36805.40227653132,
"min": 0.05050450812944085,
"max": 1092.484902679786,
}
assert_result(r["std"], expected_std, self.df_buildings, exact=False)

expected_min = {
"mean": 349.53354434499295,
"sum": 50332.830385678986,
"min": 50.39387578315866,
"max": 761.0313042971973,
}
assert_result(r["min"], expected_min, self.df_buildings, exact=False)

expected_max = {
"mean": 835.1307128394886,
"sum": 120258.82264888636,
"min": 50.49413435416841,
"max": 2127.7522277389035,
}
assert_result(r["max"], expected_max, self.df_buildings, exact=False)

expected_sum = {
"mean": 1762.128306166348,
"sum": 253746.47608795413,
"min": 151.32137188749948,
"max": 3561.79872409777,
}
assert_result(r["sum"], expected_sum, self.df_buildings, exact=False)

def test_describe_quantile(self):
graph = Graph.build_knn(self.df_buildings.centroid, k=15)
area = self.df_buildings.area
r = mm.describe(area, graph, q=(25, 75))

expected_mean = {
"mean": 601.6960154385389,
"sum": 86644.2262231496,
"min": 250.25984637364323,
"max": 901.0028506943196,
}
assert_result(r["mean"], expected_mean, self.df_buildings, exact=False)

@pytest.mark.skipif(not GPD_013, reason="get_coordinates() not available")
def test_describe_mode(self):
corners = mm.corners(self.df_buildings)
r = mm.describe(corners, self.graph, include_mode=True)

expected = {
"mean": 6.152777777777778,
"sum": 886,
"min": 4,
"max": 17,
}
assert_result(r["mode"], expected, self.df_buildings, exact=False)

@pytest.mark.skipif(not GPD_013, reason="get_coordinates() not available")
def test_describe_quantile_mode(self):
graph = Graph.build_knn(self.df_buildings.centroid, k=15)
corners = mm.corners(self.df_buildings)
r = mm.describe(corners, graph, q=(25, 75), include_mode=True)

expected = {
"mean": 6.958333333333333,
"sum": 1002.0,
"min": 4.0,
"max": 12,
}
assert_result(r["mode"], expected, self.df_buildings, exact=False)

def test_describe_array(self):
area = self.df_buildings.area
r = mm.describe(area, self.graph)
r2 = mm.describe(area.values, self.graph)

assert_frame_equal(r, r2)
12 changes: 3 additions & 9 deletions momepy/functional/tests/test_shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,13 @@
import pandas as pd
import pytest
from packaging.version import Version
from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
from pandas.testing import assert_frame_equal, assert_series_equal

import momepy as mm

GPD_013 = Version(gpd.__version__) >= Version("0.13")

from .conftest import assert_result

def assert_result(result, expected, geometry, **kwargs):
"""Check the expected values and types of the result."""
for key, value in expected.items():
assert getattr(result, key)() == pytest.approx(value)
assert isinstance(result, pd.Series)
assert_index_equal(result.index, geometry.index, **kwargs)
GPD_013 = Version(gpd.__version__) >= Version("0.13")


class TestShape:
Expand Down
6 changes: 3 additions & 3 deletions momepy/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,9 @@ def test_nx_to_gdf_osmnx(self):
assert len(lines) == 16

def test_limit_range(self):
assert list(mm.limit_range(range(10), rng=(25, 75))) == [2, 3, 4, 5, 6, 7]
assert list(mm.limit_range(range(10), rng=(10, 90))) == [1, 2, 3, 4, 5, 6, 7, 8]
assert list(mm.limit_range([0, 1], rng=(25, 75))) == [0, 1]
assert list(mm.limit_range(np.arange(10), rng=(25, 75))) == [2, 3, 4, 5, 6, 7]
assert list(mm.limit_range(np.arange(10), rng=(10, 90))) == [1, 2, 3, 4, 5, 6, 7, 8]
assert list(mm.limit_range(np.array([0, 1]), rng=(25, 75))) == [0, 1]
assert list(
mm.limit_range(np.array([0, 1, 2, 3, 4, np.nan]), rng=(25, 75))
) == [1, 2, 3]
Expand Down
7 changes: 2 additions & 5 deletions momepy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,6 @@ def limit_range(vals, rng):
The limited array.
"""

vals = np.asarray(vals)
nan_tracker = np.isnan(vals)

if (len(vals) > 2) and (not nan_tracker.all()):
Expand All @@ -479,11 +478,9 @@ def limit_range(vals, rng):
method = {"interpolation": "nearest"}
rng = sorted(rng)
if nan_tracker.any():
lower = np.nanpercentile(vals, rng[0], **method)
higher = np.nanpercentile(vals, rng[1], **method)
lower, higher = np.nanpercentile(vals, rng, **method)
else:
lower = np.percentile(vals, rng[0], **method)
higher = np.percentile(vals, rng[1], **method)
lower, higher = np.percentile(vals, rng, **method)
vals = vals[(lower <= vals) & (vals <= higher)]

return vals
Expand Down

0 comments on commit cef9290

Please sign in to comment.