import numpy as np
import pandas as pd

# Tie-breaking policies understood by the categorical lag.
_TIE_POLICIES = ("raise", "random", "tryself")


def _lag_spatial(graph, y, categorical=False, ties="raise"):
    """Spatial lag operator

    Constructs the spatial lag of ``y`` based on the neighbor relations of
    the graph. For continuous ``y`` the lag is the product of the graph's
    sparse weights matrix with ``y``. For categorical ``y`` the lag of a
    focal unit is the most frequent label among its neighbors (every
    neighbor counts once; the weight values themselves are not used).

    Parameters
    ----------
    graph : Graph
        libpysal.graph.Graph
    y : array-like
        Values, one per observation, in the same order as the observations
        of the graph. Any sequence without a ``dtype`` attribute (list,
        tuple, ...) is coerced to a numpy array.
    categorical : bool
        True if y is categorical, False if y is continuous. Categorical,
        object, bool and string dtypes are always treated as categorical,
        whatever the value of this flag.
    ties : {'raise', 'random', 'tryself'}, optional
        Policy on how to break ties when a focal unit has multiple
        modes for a categorical lag. Ignored for continuous ``y``.

        - 'raise': raise an exception if ties are encountered, to alert
          the user (default).
        - 'random': modal label ties will be broken randomly.
        - 'tryself': check if the focal label breaks the tie between label
          modes. If the focal label does not break the modal tie, the
          tie will be broken randomly. If the focal unit has a
          self-weight, the focal label is not used to break any tie,
          rather any tie will be broken randomly.

    Returns
    -------
    numpy.array
        array of numeric|categorical values for the spatial lag

    Raises
    ------
    ValueError
        If the length of ``y`` does not match the number of observations,
        if ``ties`` is not a supported option (categorical lag only), or if
        ties are present and ``ties='raise'``.

    Notes
    -----
    Random tie-breaking draws from numpy's global random state
    (``np.random.choice``); seed it with ``np.random.seed`` for
    reproducible results.

    ``Graph.lag(y, categorical=False, ties="raise")`` forwards its
    arguments unchanged to this function.

    Examples
    --------
    >>> from libpysal.graph._spatial_lag import _lag_spatial
    >>> import numpy as np
    >>> from libpysal.weights.util import lat2W
    >>> from libpysal.graph import Graph
    >>> graph = Graph.from_W(lat2W(3,3))
    >>> y = np.arange(9)
    >>> _lag_spatial(graph, y)
    array([ 4.,  6.,  6., 10., 16., 14., 10., 18., 12.])

    Row standardization

    >>> w = lat2W(3,3)
    >>> w.transform = 'r'
    >>> graph = Graph.from_W(w)
    >>> y = np.arange(9)
    >>> _lag_spatial(graph, y)
    array([2.        , 2.        , 3.        , 3.33333333, 4.        ,
           4.66666667, 5.        , 6.        , 6.        ])

    Categorical Lag (no ties)

    >>> y = np.array([*'ababcbcbc'])
    >>> _lag_spatial(graph, y, categorical=True)
    array(['b', 'a', 'b', 'c', 'b', 'c', 'b', 'c', 'b'], dtype=object)

    Handling ties

    >>> y[3] = 'a'
    >>> np.random.seed(12345)
    >>> _lag_spatial(graph, y, categorical=True, ties='random')
    array(['a', 'a', 'b', 'c', 'b', 'c', 'b', 'c', 'b'], dtype=object)
    >>> _lag_spatial(graph, y, categorical=True, ties='random')
    array(['b', 'a', 'b', 'c', 'b', 'c', 'b', 'c', 'b'], dtype=object)
    >>> _lag_spatial(graph, y, categorical=True, ties='tryself')
    array(['a', 'a', 'b', 'c', 'b', 'c', 'a', 'c', 'b'], dtype=object)

    """
    sp = graph.sparse
    if len(y) != sp.shape[0]:
        raise ValueError(
            "The length of `y` needs to match the number of observations "
            f"in Graph. Expected {sp.shape[0]}, got {len(y)}."
        )

    # Coerce plain sequences (list, tuple, ...) so the dtype checks below work.
    if not hasattr(y, "dtype"):
        y = np.asarray(y)

    if (
        isinstance(y.dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(y.dtype)
        or pd.api.types.is_bool_dtype(y.dtype)
        or pd.api.types.is_string_dtype(y.dtype)
    ):
        categorical = True

    if not categorical:
        return sp @ y

    # Validate the policy before doing any group-by work.
    if ties not in _TIE_POLICIES:
        raise ValueError(
            f"Received option ties='{ties}', but only options "
            "'raise','random','tryself' are supported."
        )

    adjacency = graph.adjacency
    focal_ids = adjacency.index.get_level_values(0)
    neighbor_ids = adjacency.index.get_level_values(1)

    # `y` is positional (as in `sp @ y`), so translate graph ids into
    # positions instead of indexing `y` with the ids themselves; the latter
    # only works when the ids happen to be the integers 0..n-1.
    # NOTE(review): assumes observation order equals the first-appearance
    # order of focal ids in the adjacency -- confirm against Graph.unique_ids.
    unique_ids = focal_ids.unique()
    focal_pos = unique_ids.get_indexer(focal_ids)
    neighbor_pos = unique_ids.get_indexer(neighbor_ids)
    if (neighbor_pos < 0).any():
        raise ValueError(
            "The adjacency of the Graph references neighbors that are not "
            "present as focal observations."
        )

    labels = np.asarray(y)
    df = pd.DataFrame(data=adjacency)
    df["neighbor_label"] = labels[neighbor_pos]
    df["own_label"] = labels[focal_pos]
    df["neighbor_idx"] = neighbor_ids
    df["focal_idx"] = focal_ids

    # Count neighbors per (focal, label) and flag focal units whose top
    # count is shared by more than one label.
    label_counts = (
        df.groupby(["focal", "neighbor_label"]).count().groupby(level="focal")
    )
    n_ties = label_counts.apply(_check_ties).sum()
    if n_ties and ties == "raise":
        raise ValueError(
            f"There are {n_ties} ties that must be broken "
            f"to define the categorical "
            "spatial lag for these observations. To address this "
            "issue, consider setting `ties='tryself'` "
            "or `ties='random'` or consult the documentation "
            "about ties and the categorical spatial lag."
        )

    # Either there are no ties, or the user asked for random|tryself.
    # sort=False keeps the groups in the order of the observations rather
    # than in sorted-id order, so the result lines up with `y`.
    by_focal = df.groupby(level="focal", sort=False)
    policy = "tryself" if ties == "tryself" else "random"
    return by_focal.apply(_get_categorical_lag, ties=policy).values


def _check_ties(focal):
    """Reduction to determine if a focal unit has multiple modes for neighbor labels.

    Parameters
    ----------
    focal : pandas.DataFrame
        Rows belonging to a single focal unit, one row per neighbor label,
        whose ``weight`` column holds the number of neighbors carrying
        that label.

    Returns
    -------
    bool
        True if the highest count is shared by more than one label.
    """
    counts = focal.weight
    return bool((counts == counts.max()).sum() > 1)


def _get_categorical_lag(focal, ties="random"):
    """Reduction to determine categorical spatial lag for a focal unit.

    Parameters
    ----------
    focal : pandas.DataFrame
        Adjacency rows of a single focal unit with the additional columns
        ``neighbor_label``, ``own_label``, ``neighbor_idx`` and
        ``focal_idx``.
    ties : {'raise', 'random', 'tryself'}, optional
        Policy on how to break ties when a focal unit has multiple
        modes for a categorical lag.

        - 'raise': raise an exception if a tie is encountered.
        - 'random': break ties randomly (default).
        - 'tryself': check if the focal label breaks the tie between label
          modes. If the focal label does not break the modal tie, the
          tie will be broken randomly. If the focal unit has a
          self-weight, the focal label is not used to break any tie,
          rather any tie will be broken randomly.

    Returns
    -------
    str|int|float
        Label for the value of the categorical lag

    Raises
    ------
    ValueError
        If ``ties`` is not a supported option, or if ``ties='raise'`` and
        the focal unit has more than one modal label.
    """
    if ties not in _TIE_POLICIES:
        raise ValueError(
            f"Received option ties='{ties}', but only options "
            "'raise','random','tryself' are supported."
        )

    labels, counts = np.unique(focal.neighbor_label, return_counts=True)
    modes = labels[counts == counts.max()]

    if ties == "raise":
        if len(modes) > 1:
            raise ValueError(
                f"Focal unit {focal.focal_idx.values[0]!r} has {len(modes)} "
                "modal neighbor labels; set `ties='tryself'` or "
                "`ties='random'` to break the tie."
            )
        return modes[0]

    if ties == "tryself":
        # The focal label may only arbitrate when the focal unit is not
        # already counted among its own neighbors.
        self_weight = focal.focal_idx.values[0] in focal.neighbor_idx.values
        if not self_weight:
            self_label = focal.own_label.values[0]
            if self_label in modes:  # focal breaks tie
                return self_label

    # A single mode makes this choice deterministic.
    return np.random.choice(modes, 1)[0]