Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Categorical spatial lag using the Graph #716

Merged
merged 17 commits into from
Jun 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 160 additions & 4 deletions libpysal/graph/_spatial_lag.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,181 @@
def _lag_spatial(graph, y):
import numpy as np
import pandas as pd


def _lag_spatial(graph, y, categorical=False, ties="raise"):
    """Spatial lag operator

    Constructs spatial lag based on neighbor relations of the graph.

    For continuous ``y`` the lag is the product of the graph's sparse
    adjacency matrix and ``y``: the average of each observation's neighbors
    if the graph is row standardized, otherwise their weighted sum. For
    categorical ``y`` the lag of an observation is the most frequent label
    among its neighbors.

    Parameters
    ----------
    graph : Graph
        libpysal.graph.Graph
    y : array-like
        Values with one entry per observation in ``graph``. Lists and
        tuples are coerced to numpy arrays.
    categorical : bool
        True if y is categorical, False if y is continuous. The categorical
        path is also taken automatically when the dtype of ``y`` is
        categorical, object, bool or string.
    ties : {'raise', 'random', 'tryself'}, optional
        Policy on how to break ties when a focal unit has multiple
        modes for a categorical lag.

        - 'raise': This will raise an exception if ties are
          encountered to alert the user (Default).
        - 'random': modal label ties will be broken randomly.
        - 'tryself': check if focal label breaks the tie between label
          modes. If the focal label does not break the modal tie, the
          tie will be broken randomly. If the focal unit has a
          self-weight, focal label is not used to break any tie,
          rather any tie will be broken randomly.

    Returns
    -------
    numpy.array
        array of numeric|categorical values for the spatial lag

    Raises
    ------
    ValueError
        If the length of ``y`` does not match the number of observations,
        if ``ties`` is not a supported option (categorical lag only), or if
        ``ties='raise'`` and at least one focal unit has tied modes.

    Examples
    --------
    >>> from libpysal.graph._spatial_lag import _lag_spatial
    >>> import numpy as np
    >>> from libpysal.weights.util import lat2W
    >>> from libpysal.graph import Graph
    >>> graph = Graph.from_W(lat2W(3,3))
    >>> y = np.arange(9)
    >>> _lag_spatial(graph, y)
    array([ 4.,  6.,  6., 10., 16., 14., 10., 18., 12.])

    Row standardization

    >>> w = lat2W(3,3)
    >>> w.transform = 'r'
    >>> graph = Graph.from_W(w)
    >>> y = np.arange(9)
    >>> _lag_spatial(graph, y)
    array([2.        , 2.        , 3.        , 3.33333333, 4.        ,
           4.66666667, 5.        , 6.        , 6.        ])

    Categorical Lag (no ties)

    >>> y = np.array([*'ababcbcbc'])
    >>> _lag_spatial(graph, y, categorical=True)
    array(['b', 'a', 'b', 'c', 'b', 'c', 'b', 'c', 'b'], dtype=object)

    Handling ties

    >>> y[3] = 'a'
    >>> np.random.seed(12345)
    >>> _lag_spatial(graph, y, categorical=True, ties='random')
    array(['a', 'a', 'b', 'c', 'b', 'c', 'b', 'c', 'b'], dtype=object)
    >>> _lag_spatial(graph, y, categorical=True, ties='random')
    array(['b', 'a', 'b', 'c', 'b', 'c', 'b', 'c', 'b'], dtype=object)
    >>> _lag_spatial(graph, y, categorical=True, ties='tryself')
    array(['a', 'a', 'b', 'c', 'b', 'c', 'a', 'c', 'b'], dtype=object)

    """
    sp = graph.sparse
    if len(y) != sp.shape[0]:
        raise ValueError(
            "The length of `y` needs to match the number of observations "
            f"in Graph. Expected {sp.shape[0]}, got {len(y)}."
        )

    # coerce plain sequences to an array so that `.dtype` is available
    if isinstance(y, (list, tuple)):
        y = np.array(y)

    if (
        isinstance(y.dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(y.dtype)
        or pd.api.types.is_bool_dtype(y.dtype)
        or pd.api.types.is_string_dtype(y.dtype)
    ):
        categorical = True

    if not categorical:
        return sp @ y

    # Reject an unsupported policy before doing any of the groupby work.
    if ties not in ("raise", "random", "tryself"):
        raise ValueError(
            f"Received option ties='{ties}', but only options "
            "'raise','random','tryself' are supported."
        )

    # The adjacency index holds observation *ids*, not positions. Attach the
    # graph ids to positional input so the label lookups below are correct
    # for non-integer or non-consecutive ids as well.
    # NOTE(review): assumes `graph.unique_ids` follows the same observation
    # order as `graph.sparse` - confirm against Graph.
    if not isinstance(y, pd.Series):
        y = pd.Series(y, index=graph.unique_ids)

    focal_ids = graph.adjacency.index.get_level_values(0)
    neighbor_ids = graph.adjacency.index.get_level_values(1)

    df = pd.DataFrame(data=graph.adjacency)
    # `.to_numpy()` drops the Series index so the assignment is positional
    # instead of being aligned against the frame's (focal, neighbor) index.
    df["neighbor_label"] = y.loc[neighbor_ids].to_numpy()
    df["own_label"] = y.loc[focal_ids].to_numpy()
    df["neighbor_idx"] = neighbor_ids
    df["focal_idx"] = focal_ids

    # number of neighbors per (focal, label), then grouped per focal unit
    label_counts = (
        df.groupby(["focal", "neighbor_label"]).count().groupby(level="focal")
    )
    n_ties = label_counts.apply(_check_ties).sum()
    if n_ties and ties == "raise":
        raise ValueError(
            f"There are {n_ties} ties that must be broken "
            f"to define the categorical "
            "spatial lag for these observations. To address this "
            "issue, consider setting `ties='tryself'` "
            "or `ties='random'` or consult the documentation "
            "about ties and the categorical spatial lag."
        )

    # Either there are no ties, or a tie-breaking policy was requested.
    # `sort=False` keeps focal units in adjacency order rather than sorted
    # id order (presumably the order of `graph.unique_ids` - TODO confirm),
    # so the result lines up with `y`.
    by_focal = df.groupby(by=["focal"], sort=False)
    if ties == "tryself":
        return by_focal.apply(_get_categorical_lag, ties="tryself").values
    # 'random', or 'raise' with no ties present (the draw is then trivial)
    return by_focal.apply(_get_categorical_lag).values


def _check_ties(focal):
    """Report whether a focal unit has more than one modal neighbor label.

    Parameters
    ----------
    focal : pandas.DataFrame
        Rows belonging to a single focal unit. Only the ``weight`` column is
        read; as called from ``_lag_spatial`` it holds the number of
        neighbors carrying each label.

    Returns
    -------
    bool
        True if the maximum of ``weight`` is attained by more than one row.
    """
    weights = focal.weight
    n_at_max = (weights == weights.max()).sum()
    # cast so callers get a plain Python bool rather than a numpy scalar
    return bool(n_at_max > 1)


def _get_categorical_lag(focal, ties="random"):
    """Reduction to determine categorical spatial lag for a focal unit.

    Parameters
    ----------
    focal : pandas.DataFrame
        Rows for a single focal unit, one per neighbor. The columns
        ``neighbor_label``, ``own_label``, ``focal_idx`` and ``neighbor_idx``
        are read.
    ties : {'raise', 'random', 'tryself'}, optional
        Policy on how to break ties when a focal unit has multiple
        modes for a categorical lag.

        - 'raise': raise an exception if the focal unit has tied modes.
        - 'random': break ties randomly (Default).
        - 'tryself': check if focal label breaks the tie between label
          modes. If the focal label does not break the modal tie, the
          tie will be broken randomly. If the focal unit has a
          self-weight, focal label is not used to break any tie,
          rather any tie will be broken randomly.

    Returns
    -------
    str|int|float
        Label for the value of the categorical lag

    Raises
    ------
    ValueError
        If ``ties`` is not one of the supported options, or if
        ``ties='raise'`` and the focal unit has more than one modal label.
    """
    # Previously an unsupported policy (and 'raise') fell through every
    # branch and silently produced None as the lag value.
    if ties not in ("raise", "random", "tryself"):
        raise ValueError(
            f"Received option ties='{ties}', but only options "
            "'raise','random','tryself' are supported."
        )

    labels, counts = np.unique(focal.neighbor_label, return_counts=True)
    modes = labels[counts == counts.max()]

    if ties == "raise" and len(modes) > 1:
        raise ValueError(
            f"The focal unit has {len(modes)} tied modal labels and "
            "ties='raise' was requested."
        )

    if ties == "tryself":
        # a focal unit that lists itself as a neighbor already contributed
        # its own label to the counts, so it may not break the tie again
        has_self_weight = focal.focal_idx.values[0] in focal.neighbor_idx.values
        if not has_self_weight:
            own_label = focal.own_label.values[0]
            if own_label in modes:  # focal breaks tie
                return own_label

    # A single mode makes this draw trivial; it is kept (rather than
    # returning modes[0]) so the unique-mode path behaves exactly as before.
    return np.random.choice(modes, 1)[0]
29 changes: 21 additions & 8 deletions libpysal/graph/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1613,24 +1613,37 @@ def higher_order(self, k=2, shortest_path=True, diagonal=False, lower_order=Fals
ids=self.unique_ids,
)

def lag(self, y, categorical=False, ties="raise"):
    """Spatial lag operator

    Constructs spatial lag based on neighbor relations of the graph.
    All arguments are forwarded unchanged to ``_lag_spatial``.

    Parameters
    ----------
    y : array-like
        array-like (N,) shape where N is equal to number of observations
        in self.
    categorical : bool
        True if y is categorical, False if y is continuous.
    ties : {'raise', 'random', 'tryself'}, optional
        Policy on how to break ties when a focal unit has multiple
        modes for a categorical lag.

        - 'raise': This will raise an exception if ties are
          encountered to alert the user (Default).
        - 'random': modal label ties will be broken randomly.
        - 'tryself': check if focal label breaks the tie between label
          modes. If the focal label does not break the modal tie, the
          tie will be broken randomly. If the focal unit has a
          self-weight, focal label is not used to break any tie,
          rather any tie will be broken randomly.

    Returns
    -------
    numpy.ndarray
        array of numeric|categorical values for the spatial lag
    """
    lagged = _lag_spatial(self, y, categorical=categorical, ties=ties)
    return lagged

def to_parquet(self, path, **kwargs):
"""Save Graph to a Apache Parquet
Expand Down
36 changes: 36 additions & 0 deletions libpysal/graph/tests/test_spatial_lag.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pytest

from libpysal import graph
from libpysal.graph._spatial_lag import _lag_spatial
Expand All @@ -16,6 +17,10 @@ def setup_method(self):
self.weights = {"a": [1.0], "b": [1.0, 1.0], "c": [1.0], "d": []}
self.g = graph.Graph.from_dicts(self.neighbors, self.weights)
self.y = np.array([0, 1, 2, 3])
self.yc = np.array([*"ababcbcbc"])
w = lat2W(3, 3)
w.transform = "r"
self.gc = graph.Graph.from_W(w)

def test_lag_spatial(self):
yl = _lag_spatial(self.g, self.y)
Expand All @@ -29,3 +34,34 @@ def test_lag_spatial(self):
yl = _lag_spatial(g_row, y)
ylc = np.array([2.0, 2.0, 3.0, 3.33333333, 4.0, 4.66666667, 5.0, 6.0, 6.0])
np.testing.assert_array_almost_equal(yl, ylc)

def test_lag_spatial_categorical(self):
# Categorical lag on self.gc with the label vector self.yc, covering the
# no-tie case, random tie-breaking, 'tryself' tie-breaking and a focal unit
# that lists itself as a neighbor.
#
# String labels are passed without categorical=True here; the expected
# result is still the array of neighbor labels.
yl = _lag_spatial(self.gc, self.yc)
ylc = np.array(["b", "a", "b", "c", "b", "c", "b", "c", "b"], dtype=object)
np.testing.assert_array_equal(yl, ylc)
self.yc[3] = "a" # create ties
# Seed numpy's global RNG: the expected arrays below are tied to this seed
# and (presumably) to the order of the three calls that follow, so do not
# reorder them.
np.random.seed(12345)
yl = _lag_spatial(self.gc, self.yc, categorical=True, ties="random")
ylc = np.array(["a", "a", "b", "c", "b", "c", "b", "c", "b"], dtype=object)
yl1 = _lag_spatial(self.gc, self.yc, categorical=True, ties="random")
yls = _lag_spatial(self.gc, self.yc, categorical=True, ties="tryself")
np.testing.assert_array_equal(yl, ylc)
yl1c = np.array(["b", "a", "b", "c", "b", "c", "b", "c", "b"], dtype=object)
np.testing.assert_array_equal(yl1, yl1c)
ylsc = np.array(["a", "a", "b", "c", "b", "c", "a", "c", "b"], dtype=object)
np.testing.assert_array_equal(yls, ylsc)
# self-weight
neighbors = self.gc.neighbors
neighbors[0] = (0, 3, 1) # add self neighbor for observation 0
gc = graph.Graph.from_dicts(neighbors)
# With label "b" at position 3, either "a" or "b" is accepted for
# observation 0.
self.yc[3] = "b"
yls = _lag_spatial(gc, self.yc, categorical=True, ties="tryself")
assert yls[0] in ["b", "a"]
# With label "a" at position 3, the lag of observation 0 must be "a".
self.yc[3] = "a"
yls = _lag_spatial(gc, self.yc, categorical=True, ties="tryself")
assert yls[0] == "a"

def test_ties_raise(self):
    """The default ``ties='raise'`` policy reports the number of ties."""
    self.yc[3] = "a"  # create ties
    expected_message = "There are 2 ties that must be broken"
    with pytest.raises(ValueError, match=expected_message):
        _lag_spatial(self.gc, self.yc, categorical=True)
Loading