Skip to content

Commit

Permalink
ENH: add describe_reached as a replacement of Reached (#575)
Browse files Browse the repository at this point in the history
* functional reached calculations

* update pandas version in latest ubuntu image

* added test versioning

* changes based on PR

* changes based on PR

* env rollback

* pr changes and nan stat test

* typing and test versioning check

* Apply suggestions from code review

Co-authored-by: Martin Fleischmann <martin@martinfleischmann.net>

* documentation changes

---------

Co-authored-by: Martin Fleischmann <martin@martinfleischmann.net>
  • Loading branch information
u3ks and martinfleis committed May 13, 2024
1 parent 7c1d30a commit 889fb44
Show file tree
Hide file tree
Showing 3 changed files with 484 additions and 57 deletions.
293 changes: 239 additions & 54 deletions momepy/functional/_diversity.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,119 @@
import warnings

import numpy as np
import pandas as pd
from libpysal.graph import Graph
from numpy.typing import NDArray
from packaging.version import Version
from pandas import DataFrame, Series

__all__ = ["describe"]
try:
from numba import njit

HAS_NUMBA = True
except (ModuleNotFoundError, ImportError):
HAS_NUMBA = False
from libpysal.common import jit as njit

__all__ = ["describe", "describe_reached"]


@njit
def _mode(array):
    """Custom mode function for numba.

    Returns the most frequent value in ``array``. Ties are resolved in
    favour of the smallest value, because the array is sorted first and
    ``np.argmax`` returns the first maximal count. Implemented manually
    since higher-level mode helpers are not available under numba.
    """
    # Sort a flattened copy so that equal values become adjacent.
    array = np.sort(array.ravel())
    # mask marks the first occurrence of each distinct value.
    mask = np.empty(array.shape, dtype=np.bool_)
    mask[:1] = True
    mask[1:] = array[1:] != array[:-1]
    unique = array[mask]
    # Positions of first occurrences; appending the array size lets
    # np.diff yield the run length (count) of every unique value.
    idx = np.nonzero(mask)[0]
    idx = np.append(idx, mask.size)
    counts = np.diff(idx)
    return unique[np.argmax(counts)]


@njit
def _describe(values, q, include_mode=False):
    """Helper function to calculate average.

    Computes [count, mean, median, std, min, max, sum] (plus mode when
    ``include_mode`` is True) of ``values`` after dropping NaNs and,
    when more than two values remain, limiting them to the percentile
    range ``q``.

    NOTE: compiled with numba's ``njit`` (or libpysal's ``jit`` fallback),
    so only numba-supported constructs may be used here.
    """
    nan_tracker = np.isnan(values)

    if nan_tracker.all():
        # No valid observations: every statistic is undefined. Keep the
        # result length consistent with the non-NaN branch (7 stats plus
        # mode when requested) so downstream stacking of the per-group
        # results does not produce ragged rows.
        results = [
            np.nan,
            np.nan,
            np.nan,
            np.nan,
            np.nan,
            np.nan,
            np.nan,
        ]
        if include_mode:
            results.append(np.nan)
        return results

    values = values[~nan_tracker]

    if len(values) > 2:
        # Restrict to the interpercentile range defined by q, e.g. (25, 75).
        lower, higher = np.percentile(values, q)
        values = values[(lower <= values) & (values <= higher)]

    mean = np.mean(values)
    n = values.shape[0]
    if n == 1:
        # Sample std of a single observation is undefined; pandas yields NaN.
        std = np.nan
    else:
        # for pandas compatibility: sample standard deviation (ddof=1)
        std = np.sqrt(np.sum(np.abs(values - mean) ** 2) / (n - 1))

    results = [
        n,
        mean,
        np.median(values),
        std,
        np.min(values),
        np.max(values),
        np.sum(values),
    ]

    if include_mode:
        results.append(_mode(values))

    return results


def _compute_stats(grouper, q=None, include_mode=False):
"""
Fast compute of "count", "mean", "median", "std", "min", "max", "sum" statistics,
with optional mode and quartile limits.
Parameters
----------
grouper : pandas.GroupBy
Groupby Object which specifies the aggregations to be performed.
q : tuple[float, float] | None, optional
Tuple of percentages for the percentiles to compute. Values must be between 0
and 100 inclusive. When set, values below and above the percentiles will be
discarded before computation of the average. The percentiles are computed for
each neighborhood. By default None.
include_mode : False
Compute mode along with other statistics. Default is False. Mode is
computationally expensive and not useful for continous variables.
Returns
-------
DataFrame
"""
if q is None:
stat_ = grouper.agg(["count", "mean", "median", "std", "min", "max", "sum"])
if include_mode:
stat_["mode"] = grouper.agg(lambda x: _mode(x.values))
else:
agg = grouper.agg(lambda x: _describe(x.values, q=q, include_mode=include_mode))
stat_ = DataFrame(np.stack(agg.values), index=agg.index)
cols = ["count", "mean", "median", "std", "min", "max", "sum"]
if include_mode:
cols.append("mode")
stat_.columns = cols

return stat_


def describe(
Expand All @@ -16,7 +124,7 @@ def describe(
) -> DataFrame:
"""Describe the distribution of values within a set neighbourhood.
Given the graph, computes the descriptive statisitcs of values within the
Given the graph, computes the descriptive statistics of values within the
neighbourhood of each node. Optionally, the values can be limited to a certain
quantile range before computing the statistics.
Expand Down Expand Up @@ -48,54 +156,15 @@ def describe(
DataFrame
A DataFrame with descriptive statistics.
"""
try:
from numba import njit
except (ModuleNotFoundError, ImportError):

if not HAS_NUMBA:
warnings.warn(
"The numba package is used extensively in this function to accelerate the"
" computation of statistics but it is not installed or cannot be imported."
" Without numba, these computations may become slow on large data.",
UserWarning,
stacklevel=2,
)
from libpysal.common import jit as njit

@njit
def _mode(array):
"""Custom mode function for numba."""
array = np.sort(array.ravel())
mask = np.empty(array.shape, dtype=np.bool_)
mask[:1] = True
mask[1:] = array[1:] != array[:-1]
unique = array[mask]
idx = np.nonzero(mask)[0]
idx = np.append(idx, mask.size)
counts = np.diff(idx)
return unique[np.argmax(counts)]

@njit
def _describe(values, q, include_mode=False):
"""Helper function to calculate average."""
nan_tracker = np.isnan(values)

if (len(values) > 2) and (not nan_tracker.all()):
if nan_tracker.any():
lower, higher = np.nanpercentile(values, q)
else:
lower, higher = np.percentile(values, q)
values = values[(lower <= values) & (values <= higher)]

results = [
np.mean(values),
np.median(values),
np.std(values),
np.min(values),
np.max(values),
np.sum(values),
]
if include_mode:
results.append(_mode(values))
return results

if not isinstance(y, Series):
y = Series(y)
Expand All @@ -104,16 +173,132 @@ def _describe(values, q, include_mode=False):
graph._adjacency.index.codes[0]
)

if q is None:
stat_ = grouper.agg(["mean", "median", "std", "min", "max", "sum"])
if include_mode:
stat_["mode"] = grouper.agg(lambda x: _mode(x.values))
return _compute_stats(grouper, q, include_mode)


def describe_reached(
y: np.ndarray | Series,
graph_index: np.ndarray | Series,
result_index: pd.Index = None,
graph: Graph = None,
q: tuple | list | None = None,
include_mode: bool = False,
) -> DataFrame:
"""Describe the distribution of values reached on a neighbourhood graph.
Given a neighborhood graph or a grouping, computes the descriptive statistics
of values reached. Optionally, the values can be limited to a certain
quantile range before computing the statistics.
The statistics calculated are count, sum, mean, median, std.
Optionally, mode can be calculated, or the statistics can be calculated in
quantiles ``q``.
The neighbourhood is defined in ``graph``. If ``graph`` is ``None``,
the function will assume topological distance ``0`` (element itself)
and ``result_index`` is required in order to arrange the results.
If ``graph``, the results are arranged according to the spatial weights ordering.
Adapted from :cite:`hermosilla2012` and :cite:`feliciotti2018`.
Notes
-----
The numba package is used extensively in this function to accelerate the computation
of statistics. Without numba, these computations may become slow on large data.
Parameters
----------
y : Series | numpy.array
A Series or numpy.array containing values to analyse.
graph_index : Series | numpy.array
The unique ID that specifies the aggregation
of ``y`` objects to ``graph`` groups.
result_index : pd.Index (default None)
An index that specifies how to order the results when ``graph`` is None.
When ``graph`` is given, the index is derived from its unique IDs.
graph : libpysal.graph.Graph (default None)
A spatial weights matrix of the element ``y`` is grouped into.
q : tuple[float, float] | None, optional
Tuple of percentages for the percentiles to compute. Values must be between 0
and 100 inclusive. When set, values below and above the percentiles will be
discarded before computation of the average. The percentiles are computed for
each neighborhood. By default None.
include_mode : False
Compute mode along with other statistics. Default is False. Mode is
computationally expensive and not useful for continous variables.
Returns
-------
DataFrame
Examples
--------
>>> res = mm.describe_reached(
... tessellation['area'], tessellation['nID'] , graph=streets_q1
... )
>>> streets["tessalations_reached"] = res['count']
>>> streets["tessalations_reached_area"] = res['sum']
"""

if Version(pd.__version__) <= Version("2.1.0"):
raise ImportError("pandas 2.1.0 or newer is required to use this function.")

if not HAS_NUMBA:
warnings.warn(
"The numba package is used extensively in this function to accelerate the"
" computation of statistics but it is not installed or cannot be imported."
" Without numba, these computations may become slow on large data.",
UserWarning,
stacklevel=2,
)

param_err = ValueError(
"One of result_index or graph has to be specified, but not both."
)
# case where params are none
if (result_index is None) and (graph is None):
raise param_err
elif result_index is None:
result_index = graph.unique_ids
elif graph is None:
result_index = result_index
# case where both params are passed
else:
agg = grouper.agg(lambda x: _describe(x.values, q=q, include_mode=include_mode))
stat_ = DataFrame(zip(*agg, strict=True)).T
cols = ["mean", "median", "std", "min", "max", "sum"]
if include_mode:
cols.append("mode")
stat_.columns = cols
raise param_err

return stat_
# series indice needs renaming, since multiindices
# without explicit names cannot be joined
if isinstance(y, np.ndarray):
y = pd.Series(y, name="obs_index")
else:
y = y.rename_axis("obs_index")

if isinstance(graph_index, np.ndarray):
graph_index = pd.Series(graph_index, name="neighbor")
else:
graph_index = graph_index.rename("neighbor")

# aggregate data
if graph is None:
grouper = y.groupby(graph_index)

else:
df_multi_index = y.to_frame().set_index(graph_index, append=True).swaplevel()
combined_index = graph.adjacency.index.join(df_multi_index.index).dropna()
grouper = y.loc[combined_index.get_level_values(-1)].groupby(
combined_index.get_level_values(0)
)

stats = _compute_stats(grouper, q, include_mode)

result = pd.DataFrame(
np.full((result_index.shape[0], stats.shape[1]), np.nan), index=result_index
)
result.loc[stats.index.values] = stats.values
result.columns = stats.columns
# fill only counts with zeros, other stats are NA
result.loc[:, "count"] = result.loc[:, "count"].fillna(0)
result.index.names = result_index.names

return result
3 changes: 2 additions & 1 deletion momepy/functional/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@


def assert_result(result, expected, geometry, **kwargs):
"""Check the expected values and types of the result."""
"""Check the expected values and types of the result.
Note: ''count'' refers to the number of non-NAs in the result."""
for key, value in expected.items():
assert getattr(result, key)() == pytest.approx(value)
assert isinstance(result, pd.Series)
Expand Down
Loading

0 comments on commit 889fb44

Please sign in to comment.