Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: describe as a replacement of AverageCharacter #570

Merged
merged 11 commits into from
Apr 24, 2024
1 change: 1 addition & 0 deletions momepy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .elements import *
from .functional._dimension import *
from .functional._distribution import *
from .functional._diversity import *
from .functional._elements import *
from .functional._shape import *
from .graph import *
Expand Down
2 changes: 1 addition & 1 deletion momepy/dimension.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ def __init__(
values_list = data.loc[neighbours]

if rng:
values_list = limit_range(values_list, rng=rng)
values_list = limit_range(values_list.values, rng=rng)
if "mean" in mode:
means.append(np.mean(values_list))
if "median" in mode:
Expand Down
2 changes: 1 addition & 1 deletion momepy/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def __init__(self, gdf, values, spatial_weights, unique_id, rng=None, verbose=Tr
values_list = data.loc[neighbours]

if rng:
values_list = limit_range(values_list, rng=rng)
values_list = limit_range(values_list.values, rng=rng)
results_list.append(Theil(values_list).T)
else:
results_list.append(np.nan)
Expand Down
85 changes: 85 additions & 0 deletions momepy/functional/_diversity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import numpy as np
from libpysal.graph import Graph
from numpy.typing import NDArray
from pandas import DataFrame, Series
from scipy import stats

from ..utils import limit_range

# Names exported when this module is star-imported.
__all__ = ["describe"]


def describe(
    y: NDArray[np.float64] | Series,
    graph: Graph,
    q: tuple[float, float] | None = None,
    include_mode: bool = False,
) -> DataFrame:
    """Describe the distribution of values within a set neighbourhood.

    Given the graph, computes the descriptive statistics of values within the
    neighbourhood of each node. Optionally, the values can be limited to a certain
    quantile range before computing the statistics.

    Notes
    -----
    The index of ``y`` must match the index along which the ``graph`` is
    built. Values are looked up by position using the integer codes of the
    graph adjacency index.

    When ``q`` is ``None`` the standard deviation comes from the pandas group-by
    ``"std"`` aggregation; when ``q`` is set it comes from ``numpy.std``. The two
    use different default degrees of freedom (``ddof=1`` and ``ddof=0``
    respectively), so the ``std`` column is not directly comparable between the
    two modes.

    Parameters
    ----------
    y : NDArray[np.float64] | Series
        A 1D array of numeric values to be described.
    graph : libpysal.graph.Graph
        Graph representing spatial relationships between elements.
    q : tuple[float, float] | None, optional
        Tuple of percentages for the percentiles to compute. Values must be between 0
        and 100 inclusive. When set, values below and above the percentiles will be
        discarded before computation of the statistics. The percentiles are computed
        for each neighborhood. By default None.
    include_mode : bool, optional
        Compute mode along with other statistics. Default is False. Mode is
        computationally expensive and not useful for continuous variables.

    Returns
    -------
    DataFrame
        A DataFrame with descriptive statistics in the columns ``mean``,
        ``median``, ``std``, ``min``, ``max`` and ``sum``, plus ``mode`` when
        ``include_mode`` is True.
    """

    def _describe(values, q, include_mode=False):
        """Compute the statistics of one neighbourhood limited to the ``q`` range."""
        values = limit_range(values.values, q)

        results = [
            np.mean(values),
            np.median(values),
            np.std(values),
            np.min(values),
            np.max(values),
            np.sum(values),
        ]
        if include_mode:
            results.append(stats.mode(values, keepdims=False)[0])
        return results

    if not isinstance(y, Series):
        y = Series(y)

    # Pick the value of every neighbour (second index level) and group those
    # values by the focal node (first index level) they belong to.
    grouper = y.take(graph._adjacency.index.codes[1]).groupby(
        graph._adjacency.index.codes[0]
    )

    if q is None:
        stat_ = grouper.agg(["mean", "median", "std", "min", "max", "sum"])
        if include_mode:
            stat_["mode"] = grouper.agg(lambda x: stats.mode(x, keepdims=False)[0])
    else:
        # Each group yields a list of statistics; transpose the lists into
        # one column per statistic.
        agg = grouper.agg(_describe, q=q, include_mode=include_mode)
        stat_ = DataFrame(zip(*agg, strict=True)).T
        cols = ["mean", "median", "std", "min", "max", "sum"]
        if include_mode:
            cols.append("mode")
        stat_.columns = cols

    return stat_
11 changes: 11 additions & 0 deletions momepy/functional/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pandas as pd
import pytest
from pandas.testing import assert_index_equal


def assert_result(result, expected, geometry, **kwargs):
    """Verify the aggregate values, type and index of ``result``.

    Every key of ``expected`` names a no-argument method of ``result``; the
    value it returns must approximately equal the mapped value. ``result`` must
    then be a ``pandas.Series`` whose index equals ``geometry.index``. Extra
    keyword arguments are forwarded to ``assert_index_equal``.
    """
    for method_name, target in expected.items():
        aggregate = getattr(result, method_name)
        assert aggregate() == pytest.approx(target)
    assert isinstance(result, pd.Series)
    assert_index_equal(result.index, geometry.index, **kwargs)
2 changes: 1 addition & 1 deletion momepy/functional/tests/test_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import momepy as mm

from .test_shape import assert_result
from .conftest import assert_result


class TestDistribution:
Expand Down
117 changes: 117 additions & 0 deletions momepy/functional/tests/test_diversity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import geopandas as gpd
import pytest
from libpysal.graph import Graph
from packaging.version import Version
from pandas.testing import assert_frame_equal

import momepy as mm

from .conftest import assert_result

# True when the installed geopandas is at least 0.13; tests guarded by this flag
# are skipped otherwise because get_coordinates() is not available.
GPD_013 = Version(gpd.__version__) >= Version("0.13")


class TestDistribution:
    """Tests of ``mm.describe`` on the bundled ``bubenec`` buildings layer."""

    def setup_method(self):
        """Load the buildings and build a k=3 nearest-neighbour graph on centroids."""
        buildings = gpd.read_file(mm.datasets.get_path("bubenec"), layer="buildings")
        self.df_buildings = buildings
        self.graph = Graph.build_knn(buildings.centroid, k=3)

    def test_describe(self):
        """Check every default statistic column against its reference aggregates."""
        r = mm.describe(self.df_buildings.area, self.graph)

        # Reference aggregates keyed by the described column, in checking order.
        expected_by_column = {
            "mean": {
                "mean": 587.3761020554495,
                "sum": 84582.15869598472,
                "min": 50.44045729583316,
                "max": 1187.2662413659234,
            },
            "median": {
                "mean": 577.4640489818667,
                "sum": 83154.8230533888,
                "min": 50.43336175017242,
                "max": 1225.8094201694726,
            },
            "std": {
                "mean": 255.59307136480083,
                "sum": 36805.40227653132,
                "min": 0.05050450812944085,
                "max": 1092.484902679786,
            },
            "min": {
                "mean": 349.53354434499295,
                "sum": 50332.830385678986,
                "min": 50.39387578315866,
                "max": 761.0313042971973,
            },
            "max": {
                "mean": 835.1307128394886,
                "sum": 120258.82264888636,
                "min": 50.49413435416841,
                "max": 2127.7522277389035,
            },
            "sum": {
                "mean": 1762.128306166348,
                "sum": 253746.47608795413,
                "min": 151.32137188749948,
                "max": 3561.79872409777,
            },
        }
        for column, expected in expected_by_column.items():
            assert_result(r[column], expected, self.df_buildings, exact=False)

    def test_describe_quantile(self):
        """Check the ``mean`` column when values are limited to the 25-75 range."""
        knn15 = Graph.build_knn(self.df_buildings.centroid, k=15)
        r = mm.describe(self.df_buildings.area, knn15, q=(25, 75))

        expected_mean = {
            "mean": 601.6960154385389,
            "sum": 86644.2262231496,
            "min": 250.25984637364323,
            "max": 901.0028506943196,
        }
        assert_result(r["mean"], expected_mean, self.df_buildings, exact=False)

    @pytest.mark.skipif(not GPD_013, reason="get_coordinates() not available")
    def test_describe_mode(self):
        """Check the ``mode`` column computed from building corner counts."""
        corner_counts = mm.corners(self.df_buildings)
        r = mm.describe(corner_counts, self.graph, include_mode=True)

        expected = {
            "mean": 6.152777777777778,
            "sum": 886,
            "min": 4,
            "max": 17,
        }
        assert_result(r["mode"], expected, self.df_buildings, exact=False)

    @pytest.mark.skipif(not GPD_013, reason="get_coordinates() not available")
    def test_describe_quantile_mode(self):
        """Check the ``mode`` column when the 25-75 range limit is also applied."""
        knn15 = Graph.build_knn(self.df_buildings.centroid, k=15)
        corner_counts = mm.corners(self.df_buildings)
        r = mm.describe(corner_counts, knn15, q=(25, 75), include_mode=True)

        expected = {
            "mean": 6.958333333333333,
            "sum": 1002.0,
            "min": 4.0,
            "max": 12,
        }
        assert_result(r["mode"], expected, self.df_buildings, exact=False)

    def test_describe_array(self):
        """A plain array input must give the same frame as the Series input."""
        area = self.df_buildings.area
        from_series = mm.describe(area, self.graph)
        from_array = mm.describe(area.values, self.graph)

        assert_frame_equal(from_series, from_array)
12 changes: 3 additions & 9 deletions momepy/functional/tests/test_shape.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,13 @@
import pandas as pd
import pytest
from packaging.version import Version
from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
from pandas.testing import assert_frame_equal, assert_series_equal

import momepy as mm

GPD_013 = Version(gpd.__version__) >= Version("0.13")

from .conftest import assert_result

def assert_result(result, expected, geometry, **kwargs):
"""Check the expected values and types of the result."""
for key, value in expected.items():
assert getattr(result, key)() == pytest.approx(value)
assert isinstance(result, pd.Series)
assert_index_equal(result.index, geometry.index, **kwargs)
GPD_013 = Version(gpd.__version__) >= Version("0.13")


class TestShape:
Expand Down
6 changes: 3 additions & 3 deletions momepy/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,9 @@ def test_nx_to_gdf_osmnx(self):
assert len(lines) == 16

def test_limit_range(self):
assert list(mm.limit_range(range(10), rng=(25, 75))) == [2, 3, 4, 5, 6, 7]
assert list(mm.limit_range(range(10), rng=(10, 90))) == [1, 2, 3, 4, 5, 6, 7, 8]
assert list(mm.limit_range([0, 1], rng=(25, 75))) == [0, 1]
assert list(mm.limit_range(np.arange(10), rng=(25, 75))) == [2, 3, 4, 5, 6, 7]
jGaboardi marked this conversation as resolved.
Show resolved Hide resolved
assert list(mm.limit_range(np.arange(10), rng=(10, 90))) == [1, 2, 3, 4, 5, 6, 7, 8]
assert list(mm.limit_range(np.array([0, 1]), rng=(25, 75))) == [0, 1]
assert list(
mm.limit_range(np.array([0, 1, 2, 3, 4, np.nan]), rng=(25, 75))
) == [1, 2, 3]
Expand Down
7 changes: 2 additions & 5 deletions momepy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,6 @@ def limit_range(vals, rng):
The limited array.
"""

vals = np.asarray(vals)
nan_tracker = np.isnan(vals)

if (len(vals) > 2) and (not nan_tracker.all()):
Expand All @@ -479,11 +478,9 @@ def limit_range(vals, rng):
method = {"interpolation": "nearest"}
rng = sorted(rng)
if nan_tracker.any():
lower = np.nanpercentile(vals, rng[0], **method)
higher = np.nanpercentile(vals, rng[1], **method)
lower, higher = np.nanpercentile(vals, rng, **method)
else:
lower = np.percentile(vals, rng[0], **method)
higher = np.percentile(vals, rng[1], **method)
lower, higher = np.percentile(vals, rng, **method)
vals = vals[(lower <= vals) & (vals <= higher)]

return vals
Expand Down
Loading