Merge pull request #548 from ljwolf/geographs

Intercept Coincident points
pysal · Aug 25, 2023 · 3c686b7 · 3c686b7
2 parents b5d5dc8 + c4383e1
commit 3c686b7
Show file tree

Hide file tree

Showing 9 changed files with 464 additions and 202 deletions.
diff --git a/libpysal/graph/_kernel.py b/libpysal/graph/_kernel.py
@@ -1,7 +1,8 @@
 import numpy
+import pandas
 from scipy import sparse, optimize, spatial, stats
 
-from ._utils import _validate_geometry_input
+from ._utils import _validate_geometry_input, _build_coincidence_lookup, _induce_cliques, _jitter_geoms, _sparse_to_arrays
 
 _VALID_GEOMETRY_TYPES = ["Point"]
 
@@ -49,7 +50,7 @@ def _identity(distances, bandwidth):
     "boxcar": _boxcar,
     "discrete": _boxcar,
     "identity": _identity,
-    None: _identity,
+    None: _identity
 }
 
 
@@ -61,6 +62,7 @@ def _kernel(
     k=None,
     ids=None,
     p=2,
+    taper=True,
 ):
     """
     Compute a kernel function over a distance matrix.
@@ -127,17 +129,22 @@ def _kernel(
             D = sparse.csc_array(coordinates)
     if bandwidth is None:
         bandwidth = numpy.percentile(D.data, 25)
-    elif bandwidth == "opt":
-        bandwidth = _optimize_bandwidth(D, kernel)
+    elif bandwidth == "auto":
+        if (kernel == 'identity') or (kernel is None):
+            bandwidth = numpy.nan # ignored by identity
+        else:
+            bandwidth = _optimize_bandwidth(D, kernel)
     if callable(kernel):
         smooth = kernel(D.data, bandwidth)
     else:
         smooth = _kernel_functions[kernel](D.data, bandwidth)
 
     sp = sparse.csc_array((smooth, D.indices, D.indptr), dtype=smooth.dtype)
-    sp.eliminate_zeros()
+
+    if taper:
+        sp.eliminate_zeros()
 
-    return sp, ids
+    return _sparse_to_arrays(sp)
 
     # TODO: ensure isloates are properly handled
     # TODO: handle co-located points
@@ -146,34 +153,66 @@ def _kernel(
 def _knn(
     coordinates,
     metric="euclidean",
-    k=None,
+    k=1,
     p=2,
+    coincident="raise"
 ):
-    n_samples, _ = coordinates.shape
 
-    if metric == "haversine":
-        # sklearn haversine works with (lat,lng) in radians...
-        coordinates = numpy.fliplr(numpy.deg2rad(coordinates))
-    query = _prepare_tree_query(coordinates, metric, p=p)
-    D_linear, ixs = query(coordinates, k=k + 1)
-    self_ix, neighbor_ix = ixs[:, 0], ixs[:, 1:]
-    D_linear = D_linear[:, 1:]
-    self_ix_flat = numpy.repeat(self_ix, k)
-    neighbor_ix_flat = neighbor_ix.flatten()
-    D_linear_flat = D_linear.flatten()
-    if metric == "haversine":
-        D_linear_flat * 6371  # express haversine distances in kilometers
-    D = sparse.csc_array(
-        (D_linear_flat, (self_ix_flat, neighbor_ix_flat)),
-        shape=(n_samples, n_samples),
+    coordinates, ids, geoms = _validate_geometry_input(
+        coordinates, ids=None, valid_geometry_types=_VALID_GEOMETRY_TYPES
     )
-    return D
+    n_coincident, coincident_lut = _build_coincidence_lookup(geoms)
+    max_at_one_site = coincident_lut.input_index.str.len().max()
+    n_samples, _ = coordinates.shape
 
-    # TODO: ensure isloates are properly handled
-    # TODO: handle co-located points
-    # TODO: haversine requires lat lan coords so we need to check if the gdf is in the
-    # correct CRS (or None) and if an array is given, that it is bounded by -180,180 and -90,90
-    # and explanation that the result is in kilometres
+    if max_at_one_site <= k:
+
+        if metric == "haversine":
+            # sklearn haversine works with (lat,lng) in radians...
+            coordinates = numpy.fliplr(numpy.deg2rad(coordinates))
+        query = _prepare_tree_query(coordinates, metric, p=p)
+        D_linear, ixs = query(coordinates, k=k + 1)
+        self_ix, neighbor_ix = ixs[:, 0], ixs[:, 1:]
+        D_linear = D_linear[:, 1:]
+        self_ix_flat = numpy.repeat(self_ix, k)
+        neighbor_ix_flat = neighbor_ix.flatten()
+        D_linear_flat = D_linear.flatten()
+        if metric == "haversine":
+            D_linear_flat * 6371  # express haversine distances in kilometers
+        D = sparse.csc_array(
+            (D_linear_flat, (self_ix_flat, neighbor_ix_flat)),
+            shape=(n_samples, n_samples),
+        )
+        return _sparse_to_arrays(D)
+
+        # TODO: ensure isloates are properly handled
+        # TODO: handle co-located points
+        # TODO: haversine requires lat lan coords so we need to check if the gdf is in the
+        # correct CRS (or None) and if an array is given, that it is bounded by -180,180 and -90,90
+        # and explanation that the result is in kilometres
+    else:
+        if coincident == "raise":
+            raise ValueError(
+                f"There are {len(coincident_lut)} "
+                f"unique locations in the dataset, but {len(geoms)} observations."
+                f"At least one of these sites has {max_at_one_site} points, more than the"
+                f" {k} nearest neighbors requested. This means there are more than {k} points"
+                " in the same location, which makes this graph type undefined. To address "
+                " this issue, consider setting `coincident='clique' or consult the "
+                "documentation about coincident points."
+                )
+        if coincident == "jitter":
+            # force re-jittering over and over again until the coincidence is broken
+            return _knn(_jitter_geoms(coordinates, geoms)[-1], metric=metric, k=k, p=p, coincident='jitter')
+        # implicit coincident == "clique"
+        heads, tails, weights = _knn(coincident_lut.geometry, metric=metric, k=k, p=p, coincident='raise')
+        adjtable = pandas.DataFrame.from_dict(
+            dict(
+                focal = heads, neighbor = tails, weight = weights
+            )
+        )
+        adjtable = _induce_cliques(adjtable, coincident_lut, fill_value=0)
+        return adjtable.focal.values, adjtable.neighbor.values, adjtable.weight.values
 
 
 def _distance_band(coordinates, threshold, ids=None):
@@ -182,7 +221,7 @@ def _distance_band(coordinates, threshold, ids=None):
     )
     tree = spatial.KDTree(coordinates)
     dist = tree.sparse_distance_matrix(tree, threshold, output_type="ndarray")
-    return sparse.csr_array((dist["v"], (dist["i"], dist["j"])))
+    return dist['i'], dist['j'], dist["v"]
 
     # TODO: handle co-located points
     # TODO: ensure isloates are properly handled
@@ -228,7 +267,10 @@ def _optimize_bandwidth(D, kernel):
     uniform distribution of kernel values, which is a good proxy for a
     "moderate" level of smoothing.
     """
-    kernel_function = _kernel_functions[kernel]
+    kernel_function = _kernel_functions.get(kernel, kernel)
+    assert callable(kernel_function), (
+        f"kernel {kernel} was not in supported kernel types {_kernel_functions.keys()} or callable"
+    )
 
     def _loss(bandwidth, D=D, kernel_function=kernel_function):
         Ku = kernel_function(D.data, bandwidth)

diff --git a/libpysal/graph/_set_ops.py b/libpysal/graph/_set_ops.py
@@ -39,6 +39,9 @@ def __iand__(self, other):
     def __ior__(self, other):
         raise TypeError("weights are immutable")
 
+    def __len__(self):
+        return self.n_edges
+
 
 # TODO: performance test the pandas implementation
 def intersects(left, right):