ENH: include Graph.describe() to describe neighbourhood values #717

Merged
merged 10 commits on Jun 5, 2024
Changes from 3 commits
113 changes: 113 additions & 0 deletions libpysal/graph/_utils.py
@@ -9,6 +9,15 @@
GPD_013 = Version(geopandas.__version__) >= Version("0.13")
PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")

try:
    from numba import njit  # noqa: E401

    HAS_NUMBA = True
except ModuleNotFoundError:
    from libpysal.common import jit as njit

    HAS_NUMBA = False


class CoplanarError(ValueError):
    """Custom ValueError raised when coplanar points are detected."""
@@ -274,3 +283,107 @@ def _reorder_adjtable_by_ids(adjtable, ids):
        .reindex(ids, level=1)
        .reset_index()
    )


@njit
def _mode(values, index):  # noqa: ARG001
    """Custom mode function for numba."""
    array = np.sort(values.ravel())
    # mark the first occurrence of each unique value in the sorted array
    mask = np.empty(array.shape, dtype=np.bool_)
    mask[:1] = True
    mask[1:] = array[1:] != array[:-1]
    unique = array[mask]
    # counts of each unique value are the gaps between first-occurrence indices
    idx = np.nonzero(mask)[0]
    idx = np.append(idx, mask.size)
    counts = np.diff(idx)
    return unique[np.argmax(counts)]


@njit
def _limit_range(values, index, low, high):  # noqa: ARG001
    """Return a boolean mask keeping values within the (low, high) percentile range.

    When there are too few non-NaN values to compute percentiles, all non-NaN
    values are kept."""
    nan_tracker = np.isnan(values)

    if (not nan_tracker.all()) & (len(values[~nan_tracker]) > 2):
        lower, higher = np.percentile(values, (low, high))
    else:
        return ~nan_tracker

    return (lower <= values) & (values <= higher)


def _compute_stats(grouper, to_compute: list[str] | None = None):
    """Fast computation of "count", "mean", "median", "std", "min", "max", \\
    "sum", "nunique" and "mode" within a groupby object, using numba if available.

    Parameters
    ----------
    grouper : pandas.GroupBy
[Review comment — Member] Should this be pandas.Grouper?

[Reply — Contributor (author)] pandas.Grouper is a different type of object, used for
filtering columns. I used the name "grouper" because it is used in other functions here,
and the type is GroupBy because pandas' groupby returns a GroupBy object.
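A tiny illustration of that distinction (a sketch with made-up data, not part of the PR):

import pandas as pd

df = pd.DataFrame({"key": [0, 0, 1], "val": [1.0, 2.0, 3.0]})

gb = df.groupby("key")["val"]  # what _compute_stats receives: a SeriesGroupBy object
print(type(gb))                # pandas.core.groupby.generic.SeriesGroupBy

spec = pd.Grouper(key="key")   # pandas.Grouper only specifies how to group
print(type(df.groupby(spec)))  # passing it to groupby yields a DataFrameGroupBy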

        GroupBy object over which the aggregations are performed.
    to_compute : list[str] | None
        A list of statistics to pass to ``groupby.agg``. If None, all supported
        statistics ("count", "mean", "median", "std", "min", "max", "sum",
        "nunique", "mode") are computed.

    Returns
    -------
    DataFrame
    """

    if not HAS_NUMBA:
        warnings.warn(
            "The numba package is used extensively in this module"
            " to accelerate the computation of graphs. Without numba,"
            " these computations may become unduly slow on large data.",
            stacklevel=3,
        )

    if to_compute is None:
        to_compute = [
            "count",
            "mean",
            "median",
            "std",
            "min",
            "max",
            "sum",
            "nunique",
            "mode",
        ]
    agg_to_compute = [f for f in to_compute if f != "mode"]
    stat_ = grouper.agg(agg_to_compute)
    if "mode" in to_compute:
        if HAS_NUMBA:
            stat_["mode"] = grouper.agg(_mode, engine="numba")
        else:
            stat_["mode"] = grouper.agg(lambda x: _mode(x.values, x.index))

    return stat_


def _percentile_filtration_grouper(y, graph_adjacency_index, q=(25, 75)):
"""Carry out a filtration of graph neighbours \\
based on the quantiles of ``y``, specified in ``q``"""
if not HAS_NUMBA:
warnings.warn(
"The numba package is used extensively in this module"
" to accelerate the computation of graphs. Without numba,"
" these computations may become unduly slow on large data.",
stacklevel=3,
)

## need to reset since numba transform has an indexing issue
grouper = (
y.take(graph_adjacency_index.codes[-1])
.reset_index(drop=True)
.groupby(graph_adjacency_index.codes[0])
)
if HAS_NUMBA:
to_keep = grouper.transform(
_limit_range, q[0], q[1], engine="numba"
).values.astype(bool)
else:
to_keep = grouper.transform(
lambda x: _limit_range(x.values, x.index, q[0], q[1])
).values.astype(bool)
filtered_grouper = y.take(graph_adjacency_index.codes[-1][to_keep]).groupby(
graph_adjacency_index.codes[0][to_keep]
)
return filtered_grouper
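A small usage sketch of the two helpers added above (with a hypothetical toy adjacency;
inside the library they are driven by Graph.describe rather than called directly):

import pandas as pd
from libpysal.graph._utils import _compute_stats, _percentile_filtration_grouper

# toy (focal, neighbor) adjacency index and one value per observation
focal = [0, 0, 1, 1, 2]
neighbor = [1, 2, 0, 2, 0]
adj_index = pd.MultiIndex.from_arrays([focal, neighbor], names=["focal", "neighbor"])
y = pd.Series([10.0, 20.0, 30.0], index=[0, 1, 2])

# group the neighbours' values by their focal observation, mirroring what
# _compute_stats receives from Graph.describe
grouper = y.take(adj_index.codes[-1]).reset_index(drop=True).groupby(adj_index.codes[0])
print(_compute_stats(grouper, ["count", "mean", "sum"]))

# the same, but limited to each neighbourhood's 25th-75th percentile range first
# (neighbourhoods with too few values are kept as-is by _limit_range)
filtered = _percentile_filtration_grouper(y, adj_index, q=(25, 75))
print(_compute_stats(filtered, ["count", "mean"]))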
60 changes: 60 additions & 0 deletions libpysal/graph/base.py
@@ -24,8 +24,10 @@
from ._spatial_lag import _lag_spatial
from ._triangulation import _delaunay, _gabriel, _relative_neighborhood, _voronoi
from ._utils import (
    _compute_stats,
    _evaluate_index,
    _neighbor_dict_to_edges,
    _percentile_filtration_grouper,
    _resolve_islands,
    _sparse_to_arrays,
)
@@ -1993,6 +1995,64 @@ def aggregate(self, func):
"""
return self._adjacency.groupby(level=0).agg(func)

    def describe(
        self,
        y: np.typing.NDArray[np.float_] | pd.Series,
        q: tuple[float, float] | None = None,
        statistics: list[str] | None = None,
    ) -> pd.DataFrame:
        """Describe the distribution of ``y`` values within the graph.

        Given the graph, computes the descriptive statistics of values within the
        neighbourhood of each node. Optionally, the values can be limited to a certain
        quantile range before computing the statistics.

        Notes
        -----
        The index of ``y`` must match the index of the graph.

        Weight values do not affect the calculations, only adjacency does.

        The numba package is used extensively in this function
        to accelerate the computation of statistics.
        Without numba, these computations may become slow on large data.

        Parameters
        ----------
        y : NDArray[np.float_] | Series
            A 1D array of numeric values to be described.
        q : tuple[float, float] | None, optional
            Tuple of percentages for the percentiles to compute.
            Values must be between 0 and 100 inclusive. When set, values below and
            above the percentiles will be discarded before the statistics are
            computed. The percentiles are computed for each neighborhood.
            By default None.
        statistics : list[str] | None
            A list of statistics to compute. If None, compute all
            available functions - "count", "mean", "median",
            "std", "min", "max", "sum", "nunique", "mode". By default None.

        Returns
        -------
        DataFrame
            A DataFrame with descriptive statistics.
        """
        if not isinstance(y, pd.Series):
            y = pd.Series(y)

        if q is None:
            grouper = y.take(self._adjacency.index.codes[1]).groupby(
                self._adjacency.index.codes[0]
            )
        else:
            grouper = _percentile_filtration_grouper(y, self._adjacency.index, q=q)

        stat_ = _compute_stats(grouper, statistics)

        # NA isolates
        stat_.loc[self.isolates] = np.nan
        return stat_


def _arrange_arrays(heads, tails, weights, ids=None):
"""
Expand Down
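A brief usage sketch of the new Graph.describe method (assumes geopandas and
geodatasets are installed; mirrors the test setup further below):

import geopandas as gpd
import geodatasets
from libpysal import graph

guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))
contig = graph.Graph.build_contiguity(guerry, rook=False)
y = guerry.geometry.area

stats = contig.describe(y)                               # all statistics, one row per focal geometry
subset = contig.describe(y, statistics=["mean", "std"])  # only selected statistics
trimmed = contig.describe(y, q=(25, 75))                 # trim each neighbourhood to its interquartile range first
print(stats.head())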
44 changes: 44 additions & 0 deletions libpysal/graph/tests/test_base.py
@@ -93,6 +93,7 @@ def setup_method(self):
        self.g_str_unodered = graph.Graph.from_weights_dict(self.W_dict_str_unordered)

        self.nybb = gpd.read_file(geodatasets.get_path("nybb")).set_index("BoroName")
        self.guerry = gpd.read_file(geodatasets.get_path("geoda guerry"))

    def test_init(self):
        g = graph.Graph(self.adjacency_int_binary)
@@ -1129,3 +1130,46 @@ def test_aggregate(self):
            contig.aggregate(lambda x: np.exp(np.sum(x))),
            expected,
        )

    def test_describe(self):
        contig = (
            graph.Graph.build_contiguity(self.guerry, rook=False)
            .higher_order(k=3, lower_order=True)
            .assign_self_weight()
        )
        y = self.guerry.geometry.area
        stats = contig.describe(y)
        pd.testing.assert_series_equal(
            stats["count"],
            contig.cardinalities,
            check_index_type=False,
            check_names=False,
        )
        pd.testing.assert_series_equal(
            stats["sum"],
            pd.Series(contig.lag(y), index=contig.unique_ids),
            check_index_type=False,
            check_names=False,
        )
        r_contig = contig.transform("R")
        pd.testing.assert_series_equal(
            stats["mean"],
            pd.Series(r_contig.lag(y), index=contig.unique_ids),
            check_index_type=False,
            check_names=False,
        )
        # compute only some statistics
        specific_stats = contig.describe(y, statistics=["count", "sum", "mean"])
        pd.testing.assert_frame_equal(
            specific_stats[["count", "sum", "mean"]], stats[["count", "sum", "mean"]]
        )

        percentile_stats = contig.describe(y, q=(25, 75))

        for i in contig.unique_ids:
            neigh_vals = y[contig[i].index.values]
            low, high = neigh_vals.describe()[["25%", "75%"]]
            neigh_vals = neigh_vals[(low <= neigh_vals) & (neigh_vals <= high)]
            expected = neigh_vals.describe()[["count", "mean", "std", "min", "max"]]
            res = percentile_stats.loc[i][["count", "mean", "std", "min", "max"]]
            pd.testing.assert_series_equal(res, expected, check_names=False)
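A hand check of the "mode" column that could be appended inside test_describe (a
sketch only, reusing the contig, y, and stats objects defined above; with continuous
area values every neighbourhood value is unique, so both the custom _mode and pandas'
Series.mode fall back to the smallest neighbourhood value):

        i = contig.unique_ids[0]
        neigh_vals = y[contig[i].index.values]  # i's neighbourhood values (incl. self weight)
        # with all-unique values both reducers pick the smallest, so they should agree
        assert stats["mode"].loc[i] == neigh_vals.mode().iloc[0]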