ENH: include Graph.describe() to describe neighbourhood values #717

Merged: 10 commits, merged Jun 5, 2024
113 changes: 113 additions & 0 deletions libpysal/graph/_utils.py
@@ -9,6 +9,15 @@
GPD_013 = Version(geopandas.__version__) >= Version("0.13")
PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")

try:
from numba import njit # noqa: E401

HAS_NUMBA = True
except ModuleNotFoundError:
from libpysal.common import jit as njit

HAS_NUMBA = False


class CoplanarError(ValueError):
"""Custom ValueError raised when coplanar points are detected."""
@@ -274,3 +283,107 @@
.reindex(ids, level=1)
.reset_index()
)


@njit
def _mode(values, index): # noqa: ARG001
"""Custom mode function for numba."""
array = np.sort(values.ravel())
mask = np.empty(array.shape, dtype=np.bool_)
mask[:1] = True
mask[1:] = array[1:] != array[:-1]
unique = array[mask]
idx = np.nonzero(mask)[0]
idx = np.append(idx, mask.size)
counts = np.diff(idx)
return unique[np.argmax(counts)]
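
For reference, a plain-numpy walk-through of the sort-and-mask trick used in _mode above; the sample values are made up for illustration:

import numpy as np

values = np.array([3.0, 1.0, 3.0, 2.0, 3.0, 1.0])
array = np.sort(values.ravel())              # [1, 1, 2, 3, 3, 3]
mask = np.empty(array.shape, dtype=np.bool_)
mask[:1] = True                              # the first element always starts a run
mask[1:] = array[1:] != array[:-1]           # True wherever a new value begins
unique = array[mask]                         # [1, 2, 3]
idx = np.append(np.nonzero(mask)[0], mask.size)
counts = np.diff(idx)                        # run lengths: [2, 1, 3]
print(unique[np.argmax(counts)])             # 3.0, the most frequent value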


@njit
def _limit_range(values, index, low, high): # noqa: ARG001
nan_tracker = np.isnan(values)

if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
lower, higher = np.nanpercentile(values, (low, high))
else:
return ~nan_tracker

return (lower <= values) & (values <= higher)
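
A small illustration of the mask that _limit_range produces for a single neighbourhood, written with numpy only; the sample values are invented:

import numpy as np

neigh_vals = np.array([1.0, 5.0, 9.0, 13.0, 100.0])
low, high = np.nanpercentile(neigh_vals, (25, 75))   # 5.0 and 13.0
keep = (low <= neigh_vals) & (neigh_vals <= high)
print(neigh_vals[keep])                               # [ 5.  9. 13.]; the outlier 100.0 is dropped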


def _compute_stats(grouper, to_compute: list[str] | None = None):
    """Fast computation of "count", "mean", "median", "std", "min", "max", \\
    "sum", "nunique" and "mode" within a grouper object, using numba when available.

Parameters
----------
    grouper : pandas.GroupBy
        Groupby object which specifies the aggregations to be performed.

Member: Should this be pandas.Grouper?

Contributor Author: I think pandas.Grouper is a different type of object, one that is used for filtering columns. I used the name "grouper" since it is used in other functions, and the type is GroupBy, since pandas groupby returns a GroupBy object.
    to_compute : list[str] | None
A list of stats functions to pass to groupby.agg

Returns
-------
DataFrame
"""

if not HAS_NUMBA:
warnings.warn(
"The numba package is used extensively in this module"
" to accelerate the computation of graphs. Without numba,"
" these computations may become unduly slow on large data.",
stacklevel=3,
)

if to_compute is None:
to_compute = [
"count",
"mean",
"median",
"std",
"min",
"max",
"sum",
"nunique",
"mode",
]
agg_to_compute = [f for f in to_compute if f != "mode"]
stat_ = grouper.agg(agg_to_compute)
if "mode" in to_compute:
if HAS_NUMBA:
stat_["mode"] = grouper.agg(_mode, engine="numba")
else:
stat_["mode"] = grouper.agg(lambda x: _mode(x.values, x.index))

return stat_
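
A usage sketch for _compute_stats with a toy groupby object; the data are invented for illustration and the call assumes either numba or the pure-Python jit fallback is importable:

import numpy as np
import pandas as pd

# neighbour values grouped by focal observation (two focals, three neighbours each)
y = pd.Series([1.0, 2.0, 2.0, 5.0, 7.0, 7.0])
focal = np.array([0, 0, 0, 1, 1, 1])
grouper = y.groupby(focal)

stats = _compute_stats(grouper, to_compute=["count", "mean", "mode"])
print(stats)
#    count      mean  mode
# 0      3  1.666667   2.0
# 1      3  6.333333   7.0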


def _percentile_filtration_grouper(y, graph_adjacency_index, q=(25, 75)):
"""Carry out a filtration of graph neighbours \\
based on the quantiles of ``y``, specified in ``q``"""
if not HAS_NUMBA:
warnings.warn(
"The numba package is used extensively in this module"
" to accelerate the computation of graphs. Without numba,"
" these computations may become unduly slow on large data.",
stacklevel=3,
)

## need to reset since numba transform has an indexing issue
grouper = (
y.take(graph_adjacency_index.codes[-1])
.reset_index(drop=True)
.groupby(graph_adjacency_index.codes[0])
)
if HAS_NUMBA:
to_keep = grouper.transform(
_limit_range, q[0], q[1], engine="numba"
).values.astype(bool)
else:
to_keep = grouper.transform(
lambda x: _limit_range(x.values, x.index, q[0], q[1])
).values.astype(bool)
filtered_grouper = y.take(graph_adjacency_index.codes[-1][to_keep]).groupby(
graph_adjacency_index.codes[0][to_keep]
)
return filtered_grouper
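
A hedged sketch of driving the filtration grouper directly with a hypothetical five-edge adjacency index; all ids and values below are made up:

import pandas as pd

adjacency_index = pd.MultiIndex.from_tuples(
    [("a", "a"), ("a", "b"), ("a", "c"), ("b", "a"), ("b", "c")],
    names=["focal", "neighbor"],
)
y = pd.Series([1.0, 10.0, 100.0], index=["a", "b", "c"])

grouper = _percentile_filtration_grouper(y, adjacency_index, q=(25, 75))
print(grouper.mean())  # mean of the values kept within each focal's neighbourhood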
68 changes: 68 additions & 0 deletions libpysal/graph/base.py
@@ -24,8 +24,10 @@
from ._spatial_lag import _lag_spatial
from ._triangulation import _delaunay, _gabriel, _relative_neighborhood, _voronoi
from ._utils import (
_compute_stats,
_evaluate_index,
_neighbor_dict_to_edges,
_percentile_filtration_grouper,
_resolve_islands,
_sparse_to_arrays,
)
@@ -1993,6 +1995,72 @@
"""
return self._adjacency.groupby(level=0).agg(func)

def describe(
self,
y: np.typing.NDArray[np.float_] | pd.Series,
q: tuple[float, float] | None = None,
statistics: list[str] | None = None,
) -> pd.DataFrame:
"""Describe the distribution of ``y`` values within the neighbors of each node.

Given the graph, computes the descriptive statistics of values within the
neighbourhood of each node. Optionally, the values can be limited to a certain
quantile range before computing the statistics.

Notes
-----
        The index of ``y`` must match the index of the graph.

Weight values do not affect the calculations, only adjacency does.

Returns numpy.nan for all isolates.

The numba package is used extensively in this function
to accelerate the computation of statistics.
Without numba, these computations may become slow on large data.

Parameters
----------
y : NDArray[np.float_] | Series
            A 1D array of numeric values to be described.
q : tuple[float, float] | None, optional
Tuple of percentages for the percentiles to compute.
Values must be between 0 and 100 inclusive. When set, values below and above
the percentiles will be discarded before computation of the statistics.
The percentiles are computed for each neighborhood. By default None.
statistics : list[str] | None
A list of stats functions to compute. If None, compute all
available functions - "count", "mean", "median",
"std", "min", "max", "sum", "nunique", "mode". By default None.

Returns
-------
DataFrame
A DataFrame with descriptive statistics.
"""

if not isinstance(y, pd.Series):
y = pd.Series(y, index=self.unique_ids)

if (y.index != self.unique_ids).all():
raise ValueError("The values index is not aligned with the graph index.")

if q is None:
grouper = y.take(self._adjacency.index.codes[1]).groupby(
self._adjacency.index.codes[0]
)
else:
grouper = _percentile_filtration_grouper(y, self._adjacency.index, q=q)

stat_ = _compute_stats(grouper, statistics)

stat_.index = self.unique_ids
if isinstance(stat_, pd.Series):
stat_.name = None

# NA isolates
stat_.loc[self.isolates] = np.nan
return stat_
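
A usage sketch mirroring the tests below; it assumes geopandas and geodatasets are installed and that numba is optional:

import geodatasets
import geopandas as gpd
from libpysal import graph

nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
contiguity = graph.Graph.build_contiguity(nybb, rook=False)

# descriptive statistics of polygon areas within each borough's neighbourhood
stats = contiguity.describe(nybb.geometry.area, statistics=["count", "mean", "sum"])

# restrict every neighbourhood to its interquartile range before describing it
iqr_stats = contiguity.describe(nybb.geometry.area, q=(25, 75))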


def _arrange_arrays(heads, tails, weights, ids=None):
"""
92 changes: 92 additions & 0 deletions libpysal/graph/tests/test_base.py
@@ -93,6 +93,7 @@ def setup_method(self):
self.g_str_unodered = graph.Graph.from_weights_dict(self.W_dict_str_unordered)

self.nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
self.guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))

def test_init(self):
g = graph.Graph(self.adjacency_int_binary)
@@ -1129,3 +1130,94 @@ def test_aggregate(self):
contig.aggregate(lambda x: np.exp(np.sum(x))),
expected,
)

def test_describe(self):
contig = graph.Graph.build_knn(self.guerry.geometry.centroid, k=5)
y = self.guerry.geometry.area
stats = contig.describe(y)
pd.testing.assert_series_equal(
stats["count"],
contig.cardinalities,
check_names=False,
check_dtype=False,
)
pd.testing.assert_series_equal(
stats["sum"],
pd.Series(contig.lag(y), index=contig.unique_ids),
check_names=False,
)
r_contig = contig.transform("R")
pd.testing.assert_series_equal(
stats["mean"],
pd.Series(r_contig.lag(y), index=contig.unique_ids),
check_names=False,
)
## compute only some statistics
specific_stats = contig.describe(y, statistics=["count", "sum", "mean"])
## assert only the specified values are computed
assert list(specific_stats.columns) == ["count", "sum", "mean"]

pd.testing.assert_frame_equal(
specific_stats[["count", "sum", "mean"]], stats[["count", "sum", "mean"]]
)

percentile_stats = contig.describe(y, q=(25, 75))

for i in contig.unique_ids:
neigh_vals = y[contig[i].index.values]
low, high = neigh_vals.describe()[["25%", "75%"]]
neigh_vals = neigh_vals[(low <= neigh_vals) & (neigh_vals <= high)]
expected = neigh_vals.describe()[["count", "mean", "std", "min", "max"]]
res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
pd.testing.assert_series_equal(res, expected, check_names=False)

## test NA equivalence between filtration and pandas
nan_areas = y.copy()
nan_areas.iloc[range(0, len(y), 3),] = np.nan
res1 = contig.describe(y, statistics=["count"])["count"]
res2 = contig.describe(y, statistics=["count"], q=(0, 100))["count"]
pd.testing.assert_series_equal(res1, res2)

# test with isolates and string index
nybb_contig = graph.Graph.build_contiguity(self.nybb, rook=False)
stats = nybb_contig.describe(
self.nybb.geometry.area, statistics=["count", "sum"]
)
## all isolate values should be nan
assert stats.loc["Staten Island"].isna().all()

        # fill NA with 0 for easier comparison; the NA values were already checked above
stats = stats.fillna(0)

pd.testing.assert_series_equal(
stats["sum"],
pd.Series(nybb_contig.lag(self.nybb.geometry.area), index=self.nybb.index),
check_names=False,
)

pd.testing.assert_series_equal(
stats["count"].sort_index(),
nybb_contig.cardinalities.sort_index(),
check_dtype=False,
check_names=False,
)

## test passing ndarray
stats1 = nybb_contig.describe(self.nybb.geometry.area, statistics=["sum"])[
"sum"
]
stats2 = nybb_contig.describe(
self.nybb.geometry.area.values, statistics=["sum"]
)["sum"]
pd.testing.assert_series_equal(
stats1,
stats2,
check_dtype=False,
check_names=False,
)

## test index alignment
with pytest.raises(
ValueError, match="The values index is not aligned with the graph index."
):
nybb_contig.describe(self.nybb.geometry.area.reset_index(drop=True))