ENH: include Graph.describe() to describe neighbourhood values #717

Merged: 10 commits merged on Jun 5, 2024
Changes from 6 commits
113 changes: 113 additions & 0 deletions libpysal/graph/_utils.py
@@ -9,6 +9,15 @@
GPD_013 = Version(geopandas.__version__) >= Version("0.13")
PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")

try:
from numba import njit # noqa: E401

HAS_NUMBA = True
except ModuleNotFoundError:
from libpysal.common import jit as njit

HAS_NUMBA = False
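
When numba is missing, ``libpysal.common.jit`` stands in for ``njit``. Conceptually the fallback is a no-op decorator that returns the wrapped function unchanged; a rough sketch of that idea (the actual implementation in ``libpysal.common`` may differ):

```python
def jit(*dec_args, **dec_kwargs):
    """No-op stand-in for numba.njit: return the wrapped function unchanged."""
    # support bare use (@jit) as well as parameterised use (@jit(nopython=True))
    if len(dec_args) == 1 and callable(dec_args[0]) and not dec_kwargs:
        return dec_args[0]

    def decorator(func):
        return func

    return decorator
```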


class CoplanarError(ValueError):
"""Custom ValueError raised when coplanar points are detected."""
@@ -274,3 +283,107 @@
.reindex(ids, level=1)
.reset_index()
)


@njit
def _mode(values, index): # noqa: ARG001
"""Custom mode function for numba."""
array = np.sort(values.ravel())
mask = np.empty(array.shape, dtype=np.bool_)
mask[:1] = True
mask[1:] = array[1:] != array[:-1]
unique = array[mask]
idx = np.nonzero(mask)[0]
idx = np.append(idx, mask.size)
counts = np.diff(idx)
return unique[np.argmax(counts)]
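
The mode is found by sorting the values and run-length counting the sorted array, which keeps the function numba-compilable. A quick plain-NumPy trace of the same logic on toy values (the sample array is an assumption):

```python
import numpy as np

arr = np.sort(np.array([3, 1, 3, 2, 3, 2]))                  # [1, 2, 2, 3, 3, 3]
mask = np.empty(arr.shape, dtype=np.bool_)
mask[:1] = True
mask[1:] = arr[1:] != arr[:-1]                               # True at the start of each run
unique = arr[mask]                                           # [1, 2, 3]
counts = np.diff(np.append(np.nonzero(mask)[0], mask.size))  # run lengths: [1, 2, 3]
print(unique[np.argmax(counts)])                             # 3, the most frequent value
```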


@njit
def _limit_range(values, index, low, high): # noqa: ARG001
nan_tracker = np.isnan(values)

if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
lower, higher = np.percentile(values, (low, high))
else:
return ~nan_tracker

return (lower <= values) & (values <= higher)


def _compute_stats(grouper, to_compute: list[str] | None = None):
"""Fast compute of "count", "mean", "median", "std", "min", "max", \\
"sum", "nunique" and "mode" within a grouper object. Using numba.

Parameters
----------
grouper : pandas.GroupBy
GroupBy object over which the aggregations are performed.

[Review comment, Member] Should this be pandas.Grouper?
[Reply, Contributor Author] I think pandas.Grouper is a different type of object, used for filtering columns. I used the name "grouper" because it is used in other functions, and the type is GroupBy because pandas groupby returns a GroupBy object.

to_compute : List[str]
A list of stats functions to pass to ``groupby.agg``.

Returns
-------
DataFrame
"""

if not HAS_NUMBA:
warnings.warn(
"The numba package is used extensively in this module"
" to accelerate the computation of graphs. Without numba,"
" these computations may become unduly slow on large data.",
stacklevel=3,
)

if to_compute is None:
to_compute = [
"count",
"mean",
"median",
"std",
"min",
"max",
"sum",
"nunique",
"mode",
]
agg_to_compute = [f for f in to_compute if f != "mode"]
stat_ = grouper.agg(agg_to_compute)
if "mode" in to_compute:
if HAS_NUMBA:
stat_["mode"] = grouper.agg(_mode, engine="numba")
else:
stat_["mode"] = grouper.agg(lambda x: _mode(x.values, x.index))

return stat_
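
As an illustration of how ``_compute_stats`` is called, a minimal sketch with a toy GroupBy (the sample values and focal ids are assumptions):

```python
import numpy as np
import pandas as pd

# neighbourhood values keyed by focal node position
values = pd.Series([1.0, 2.0, 2.0, 5.0, 7.0])
focal = np.array([0, 0, 0, 1, 1])

stats = _compute_stats(values.groupby(focal), to_compute=["count", "mean", "mode"])
# one row per focal id, one column per requested statistic
```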


def _percentile_filtration_grouper(y, graph_adjacency_index, q=(25, 75)):
"""Carry out a filtration of graph neighbours \\
based on the quantiles of ``y``, specified in ``q``"""
if not HAS_NUMBA:
warnings.warn(
"The numba package is used extensively in this module"
" to accelerate the computation of graphs. Without numba,"
" these computations may become unduly slow on large data.",
stacklevel=3,
)

# need to reset the index since the numba transform has an indexing issue
grouper = (
y.take(graph_adjacency_index.codes[-1])
.reset_index(drop=True)
.groupby(graph_adjacency_index.codes[0])
)
if HAS_NUMBA:
to_keep = grouper.transform(
_limit_range, q[0], q[1], engine="numba"
).values.astype(bool)
else:
to_keep = grouper.transform(
lambda x: _limit_range(x.values, x.index, q[0], q[1])
).values.astype(bool)
filtered_grouper = y.take(graph_adjacency_index.codes[-1][to_keep]).groupby(
graph_adjacency_index.codes[0][to_keep]
)
return filtered_grouper
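
In effect, this helper trims each focal node's neighbour values to the requested percentile range and regroups what remains. A small sketch with a toy adjacency (the index and values are assumptions):

```python
import pandas as pd

# focal 0 has neighbours 1, 2, 3; focal 1 has neighbours 0 and 2
adjacency_index = pd.MultiIndex.from_arrays(
    [[0, 0, 0, 1, 1], [1, 2, 3, 0, 2]], names=["focal", "neighbor"]
)
y = pd.Series([10.0, 1.0, 5.0, 9.0], index=[0, 1, 2, 3])

grouper = _percentile_filtration_grouper(y, adjacency_index, q=(25, 75))
print(grouper.mean())  # neighbourhood means after trimming values outside the IQR
```
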
65 changes: 65 additions & 0 deletions libpysal/graph/base.py
@@ -24,8 +24,10 @@
from ._spatial_lag import _lag_spatial
from ._triangulation import _delaunay, _gabriel, _relative_neighborhood, _voronoi
from ._utils import (
_compute_stats,
_evaluate_index,
_neighbor_dict_to_edges,
_percentile_filtration_grouper,
_resolve_islands,
_sparse_to_arrays,
)
@@ -1993,6 +1995,69 @@
"""
return self._adjacency.groupby(level=0).agg(func)

def describe(
self,
y: np.typing.NDArray[np.float_] | pd.Series,
q: tuple[float, float] | None = None,
statistics: list[str] | None = None,
) -> pd.DataFrame:
"""Describe the distribution of ``y`` values within the neighbors of each node.

Given the graph, computes the descriptive statistics of values within the
neighbourhood of each node. Optionally, the values can be limited to a certain
quantile range before computing the statistics.

Notes
-----
The index of ``y`` must match the index of the graph.

Weight values do not affect the calculations, only adjacency does.

Returns nan for all isolates.

[Review comment, Member] Maybe 'nan' is OK here, but also maybe NaN or numpy.nan (or something else?) Probably not a big deal either way.
[Reply, Contributor Author] Changed to numpy.nan.


The numba package is used extensively in this function
to accelerate the computation of statistics.
Without numba, these computations may become slow on large data.

Parameters
----------
y : NDArray[np.float_] | Series
A 1D array of numeric values to be described.
q : tuple[float, float] | None, optional
Tuple of percentages for the percentiles to compute.
Values must be between 0 and 100 inclusive. When set, values below and above
the percentiles will be discarded before computation of the statistics.
The percentiles are computed for each neighborhood. By default None.
statistics : List[str] | None
A list of stats functions to compute. If None, compute all
available functions - "count", "mean", "median",
"std", "min", "max", "sum", "nunique", "mode". By default None.

Returns
-------
DataFrame
A DataFrame with descriptive statistics.
"""

if not isinstance(y, pd.Series):
y = pd.Series(y)

if q is None:
grouper = y.take(self._adjacency.index.codes[1]).groupby(
self._adjacency.index.codes[0]
)
else:
grouper = _percentile_filtration_grouper(y, self._adjacency.index, q=q)

stat_ = _compute_stats(grouper, statistics)

stat_.index = self.unique_ids
if isinstance(stat_, pd.Series):
stat_.name = None

# NA isolates
stat_.loc[self.isolates] = np.nan
return stat_
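
A brief usage sketch of the new method, mirroring the tests below (dataset and statistic choices are illustrative):

```python
import geodatasets
import geopandas as gpd
from libpysal import graph

nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
contig = graph.Graph.build_contiguity(nybb, rook=False)

# describe the distribution of polygon areas within each borough's neighbourhood
stats = contig.describe(nybb.geometry.area, statistics=["count", "mean", "sum"])

# limit each neighbourhood to its interquartile range before computing statistics
iqr_stats = contig.describe(nybb.geometry.area, q=(25, 75))
```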


def _arrange_arrays(heads, tails, weights, ids=None):
"""
65 changes: 65 additions & 0 deletions libpysal/graph/tests/test_base.py
@@ -93,6 +93,7 @@ def setup_method(self):
self.g_str_unodered = graph.Graph.from_weights_dict(self.W_dict_str_unordered)

self.nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
self.guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))

def test_init(self):
g = graph.Graph(self.adjacency_int_binary)
@@ -1129,3 +1130,67 @@ def test_aggregate(self):
contig.aggregate(lambda x: np.exp(np.sum(x))),
expected,
)

def test_describe(self):
contig = graph.Graph.build_knn(self.guerry.geometry.centroid, k=5)
y = self.guerry.geometry.area
stats = contig.describe(y)
pd.testing.assert_series_equal(
stats["count"],
contig.cardinalities,
check_names=False,
check_dtype=False,
)
pd.testing.assert_series_equal(
stats["sum"],
pd.Series(contig.lag(y), index=contig.unique_ids),
check_names=False,
)
r_contig = contig.transform("R")
pd.testing.assert_series_equal(
stats["mean"],
pd.Series(r_contig.lag(y), index=contig.unique_ids),
check_names=False,
)
# compute only some statistics
specific_stats = contig.describe(y, statistics=["count", "sum", "mean"])
# assert that only the specified statistics are computed
assert list(specific_stats.columns) == ["count", "sum", "mean"]

pd.testing.assert_frame_equal(
specific_stats[["count", "sum", "mean"]], stats[["count", "sum", "mean"]]
)

percentile_stats = contig.describe(y, q=(25, 75))

for i in contig.unique_ids:
neigh_vals = y[contig[i].index.values]
low, high = neigh_vals.describe()[["25%", "75%"]]
neigh_vals = neigh_vals[(low <= neigh_vals) & (neigh_vals <= high)]
expected = neigh_vals.describe()[["count", "mean", "std", "min", "max"]]
res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
pd.testing.assert_series_equal(res, expected, check_names=False)

# test with isolates and string index
nybb_contig = graph.Graph.build_contiguity(self.nybb, rook=False)
stats = nybb_contig.describe(
self.nybb.geometry.area, statistics=["count", "sum"]
)
# all isolate values should be nan
assert stats.loc["Staten Island"].isna().all()

# fill NA for easier comparison; NA values have already been checked above
stats = stats.fillna(0)

pd.testing.assert_series_equal(
stats["sum"],
pd.Series(nybb_contig.lag(self.nybb.geometry.area), index=self.nybb.index),
check_names=False,
)

pd.testing.assert_series_equal(
stats["count"].sort_index(),
nybb_contig.cardinalities.sort_index(),
check_dtype=False,
check_names=False,
)