diff --git a/libpysal/graph/_kernel.py b/libpysal/graph/_kernel.py
index 39a9a6e80..37aa13a58 100644
--- a/libpysal/graph/_kernel.py
+++ b/libpysal/graph/_kernel.py
@@ -1,7 +1,8 @@
 import numpy
+import pandas
 from scipy import sparse, optimize, spatial, stats
 
-from ._utils import _validate_geometry_input
+from ._utils import _validate_geometry_input, _build_coincidence_lookup, _induce_cliques, _jitter_geoms, _sparse_to_arrays
 
 _VALID_GEOMETRY_TYPES = ["Point"]
 
@@ -49,7 +50,7 @@ def _identity(distances, bandwidth):
     "boxcar": _boxcar,
     "discrete": _boxcar,
     "identity": _identity,
-    None: _identity,
+    None: _identity
 }
 
@@ -61,6 +62,7 @@ def _kernel(
     k=None,
     ids=None,
     p=2,
+    taper=True,
 ):
     """
     Compute a kernel function over a distance matrix.
@@ -127,17 +129,22 @@ def _kernel(
         D = sparse.csc_array(coordinates)
     if bandwidth is None:
         bandwidth = numpy.percentile(D.data, 25)
-    elif bandwidth == "opt":
-        bandwidth = _optimize_bandwidth(D, kernel)
+    elif bandwidth == "auto":
+        if (kernel == 'identity') or (kernel is None):
+            bandwidth = numpy.nan  # ignored by identity
+        else:
+            bandwidth = _optimize_bandwidth(D, kernel)
     if callable(kernel):
         smooth = kernel(D.data, bandwidth)
     else:
         smooth = _kernel_functions[kernel](D.data, bandwidth)
     sp = sparse.csc_array((smooth, D.indices, D.indptr), dtype=smooth.dtype)
-    sp.eliminate_zeros()
+
+    if taper:
+        sp.eliminate_zeros()
 
-    return sp, ids
+    return _sparse_to_arrays(sp, ids)
 
     # TODO: ensure isloates are properly handled
     # TODO: handle co-located points
 
@@ -146,34 +153,66 @@
 def _knn(
     coordinates,
     metric="euclidean",
-    k=None,
+    k=1,
     p=2,
+    coincident="raise"
 ):
-    n_samples, _ = coordinates.shape
-    if metric == "haversine":
-        # sklearn haversine works with (lat,lng) in radians...
-        coordinates = numpy.fliplr(numpy.deg2rad(coordinates))
-    query = _prepare_tree_query(coordinates, metric, p=p)
-    D_linear, ixs = query(coordinates, k=k + 1)
-    self_ix, neighbor_ix = ixs[:, 0], ixs[:, 1:]
-    D_linear = D_linear[:, 1:]
-    self_ix_flat = numpy.repeat(self_ix, k)
-    neighbor_ix_flat = neighbor_ix.flatten()
-    D_linear_flat = D_linear.flatten()
-    if metric == "haversine":
-        D_linear_flat * 6371 # express haversine distances in kilometers
-    D = sparse.csc_array(
-        (D_linear_flat, (self_ix_flat, neighbor_ix_flat)),
-        shape=(n_samples, n_samples),
+    coordinates, ids, geoms = _validate_geometry_input(
+        coordinates, ids=None, valid_geometry_types=_VALID_GEOMETRY_TYPES
     )
-    return D
+    n_coincident, coincident_lut = _build_coincidence_lookup(geoms)
+    max_at_one_site = coincident_lut.input_index.str.len().max()
+    n_samples, _ = coordinates.shape
 
-    # TODO: ensure isloates are properly handled
-    # TODO: handle co-located points
-    # TODO: haversine requires lat lan coords so we need to check if the gdf is in the
-    # correct CRS (or None) and if an array is given, that it is bounded by -180,180 and -90,90
-    # and explanation that the result is in kilometres
+    if max_at_one_site <= k:
+
+        if metric == "haversine":
+            # sklearn haversine works with (lat,lng) in radians...
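+            # (the input array stores (x, y) = (lng, lat) in degrees, hence the
+            # column flip and the degree-to-radian conversion on the next line)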
+            coordinates = numpy.fliplr(numpy.deg2rad(coordinates))
+        query = _prepare_tree_query(coordinates, metric, p=p)
+        D_linear, ixs = query(coordinates, k=k + 1)
+        self_ix, neighbor_ix = ixs[:, 0], ixs[:, 1:]
+        D_linear = D_linear[:, 1:]
+        self_ix_flat = numpy.repeat(self_ix, k)
+        neighbor_ix_flat = neighbor_ix.flatten()
+        D_linear_flat = D_linear.flatten()
+        if metric == "haversine":
+            D_linear_flat *= 6371  # express haversine distances in kilometers
+        D = sparse.csc_array(
+            (D_linear_flat, (self_ix_flat, neighbor_ix_flat)),
+            shape=(n_samples, n_samples),
+        )
+        return _sparse_to_arrays(D, ids)
+
+        # TODO: ensure isolates are properly handled
+        # TODO: handle co-located points
+        # TODO: haversine requires lat, lon coords, so we need to check that the gdf
+        # is in the correct CRS (or None) and, if an array is given, that it is
+        # bounded by -180,180 and -90,90; also explain that the result is in kilometres
+    else:
+        if coincident == "raise":
+            raise ValueError(
+                f"There are {len(coincident_lut)} "
+                f"unique locations in the dataset, but {len(geoms)} observations. "
+                f"At least one of these sites has {max_at_one_site} points, more than "
+                f"the {k} nearest neighbors requested. This means there are more than "
+                f"{k} points in the same location, which makes this graph type "
+                "undefined. To address this issue, consider setting "
+                "`coincident='clique'` or consult the documentation about "
+                "coincident points."
+            )
+        if coincident == "jitter":
+            # jitter the coordinates and retry; the recursion keeps re-jittering
+            # until the coincidence is broken
+            return _knn(
+                _jitter_geoms(coordinates, geoms)[-1],
+                metric=metric, k=k, p=p, coincident="jitter"
+            )
+        # implicit coincident == "clique"
+        heads, tails, weights = _knn(
+            coincident_lut.geometry, metric=metric, k=k, p=p, coincident="raise"
+        )
+        adjtable = pandas.DataFrame.from_dict(
+            dict(focal=heads, neighbor=tails, weight=weights)
+        )
+        adjtable = _induce_cliques(adjtable, coincident_lut, fill_value=0)
+        return adjtable.focal.values, adjtable.neighbor.values, adjtable.weight.values
 
 
 def _distance_band(coordinates, threshold, ids=None):
@@ -182,7 +221,7 @@ def _distance_band(coordinates, threshold, ids=None):
     )
     tree = spatial.KDTree(coordinates)
     dist = tree.sparse_distance_matrix(tree, threshold, output_type="ndarray")
-    return sparse.csr_array((dist["v"], (dist["i"], dist["j"])))
+    return dist["i"], dist["j"], dist["v"]
 
     # TODO: handle co-located points
     # TODO: ensure isloates are properly handled
 
@@ -228,7 +267,10 @@ def _optimize_bandwidth(D, kernel):
     uniform distribution of kernel values, which is a good proxy for a
     "moderate" level of smoothing.
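+
+    For example (a sketch, with ``D`` a sparse distance matrix as used here):
+
+        bandwidth = _optimize_bandwidth(D, "gaussian")
+        smooth = _kernel_functions["gaussian"](D.data, bandwidth)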
""" - kernel_function = _kernel_functions[kernel] + kernel_function = _kernel_functions.get(kernel, kernel) + assert callable(kernel_function), ( + f"kernel {kernel} was not in supported kernel types {_kernel_functions.keys()} or callable" + ) def _loss(bandwidth, D=D, kernel_function=kernel_function): Ku = kernel_function(D.data, bandwidth) diff --git a/libpysal/graph/_set_ops.py b/libpysal/graph/_set_ops.py index 40a320b7c..d5e732683 100644 --- a/libpysal/graph/_set_ops.py +++ b/libpysal/graph/_set_ops.py @@ -39,6 +39,9 @@ def __iand__(self, other): def __ior__(self, other): raise TypeError("weights are immutable") + def __len__(self): + return self.n_edges + # TODO: performance test the pandas implementation def intersects(left, right): diff --git a/libpysal/graph/_triangulation.py b/libpysal/graph/_triangulation.py index 71ccda133..81bc51a16 100644 --- a/libpysal/graph/_triangulation.py +++ b/libpysal/graph/_triangulation.py @@ -2,13 +2,19 @@ import numpy import pandas -from scipy import spatial +from scipy import spatial, sparse from ._contiguity import _vertex_set_intersection -from ._kernel import _kernel_functions -from ._utils import _validate_geometry_input - +from ._kernel import _kernel, _optimize_bandwidth, _kernel_functions +from ._utils import ( + _validate_geometry_input, + _build_coincidence_lookup, + _induce_cliques, + _jitter_geoms, + _vec_euclidean_distances + ) from libpysal.cg import voronoi_frames +from functools import wraps try: from numba import njit # noqa E401 @@ -23,8 +29,75 @@ Serge Rey (sjsrey@gmail.com) """ +# This is in the module, rather than in `utils`, to ensure that it +# can access `_VALID_GEOMETRY_TYPES` without defining a nested decorator. +def _validate_coincident(triangulator): + """This is a decorator that validates input for coincident points""" + @wraps(triangulator) + def tri_with_validation(coordinates, ids=None, coincident='raise', kernel=None, bandwidth=None, **kwargs): + coordinates, ids, geoms = _validate_geometry_input( + coordinates, ids=ids, valid_geometry_types=_VALID_GEOMETRY_TYPES + ) + n_coincident, coincident_lut = _build_coincidence_lookup(geoms) + if n_coincident > 0: + if coincident == "raise": + raise ValueError( + f"There are {len(coincident_lut)} " + f"unique locations in the dataset, but {len(geoms)} observations." + "This means there are multiple points in the same location, which" + " is undefined for this graph type. To address this issue, consider setting " + " `coincident='clique' or consult the documentation about coincident points." + ) + elif coincident == "jitter": + coordinates, geoms = _jitter_geoms(coordinates, geoms) + elif coincident == "clique": + input_coordinates, input_ids, input_geoms = coordinates, ids, geoms + coordinates, ids, geoms = _validate_geometry_input( + coincident_lut.geometry, ids=coincident_lut.index, valid_geometry_types=_VALID_GEOMETRY_TYPES + ) + else: + raise ValueError( + f"Recieved option `coincident='{coincident}', but only options 'raise','clique','jitter' are suppported." 
+        heads_ix, tails_ix = triangulator(coordinates, **kwargs)
+
+        heads, tails = ids[heads_ix], ids[tails_ix]
+
+        if kernel is None:
+            weights = numpy.ones(heads_ix.shape, dtype=numpy.int8)
+        else:
+            distances = _vec_euclidean_distances(coordinates[heads_ix], coordinates[tails_ix]).squeeze()
+            sparse_D = sparse.csc_array((distances, (heads_ix, tails_ix)))
+            if bandwidth == "auto":
+                bandwidth = _optimize_bandwidth(sparse_D, kernel)
+            # _kernel returns (heads, tails, weights) arrays; keep the weights
+            weights = _kernel(
+                sparse_D,
+                metric='precomputed',
+                kernel=kernel,
+                bandwidth=bandwidth,
+                taper=False
+            )[-1]
+        adjtable = pandas.DataFrame.from_dict(
+            dict(focal=heads, neighbor=tails, weight=weights)
+        )
+
+        if (n_coincident > 0) & (coincident == "clique"):
+            # the kernel is only used to compute a fill value for the clique;
+            # in the case of the voronoi weights, using boxcar with an infinite
+            # bandwidth also gives the correct fill value: 1.
+            fill_value = _kernel_functions[kernel](numpy.array([0]), bandwidth).item()
+            adjtable = _induce_cliques(adjtable, coincident_lut, fill_value=fill_value)
+            # from here, how to ensure ordering?
+        return adjtable.focal.values, adjtable.neighbor.values, adjtable.weight.values
+    return tri_with_validation
 
 
-def _delaunay(coordinates, ids=None, bandwidth=numpy.inf, kernel="boxcar"):
+@_validate_coincident
+def _delaunay(coordinates):
     """
     Constructor of the Delaunay graph of a set of input points.
     Relies on scipy.spatial.Delaunay and numba to quickly construct
@@ -79,31 +152,14 @@ def _delaunay(coordinates, ids=None, bandwidth=numpy.inf, kernel="boxcar"):
         " to accelerate the computation of graphs. Without numba,"
         " these computations may become unduly slow on large data."
     )
-    coordinates, ids, _ = _validate_geometry_input(
-        coordinates, ids=ids, valid_geometry_types=_VALID_GEOMETRY_TYPES
-    )
 
     edges, _ = _voronoi_edges(coordinates)
+    heads_ix, tails_ix = edges.T
 
-    ids = numpy.asarray(ids)
-    head, tail = ids[edges[:, 0]], ids[edges[:, 1]]
-
-    if bandwidth is numpy.inf and kernel == "boxcar":
-        weights = numpy.ones(head.shape, dtype=numpy.int8)
-    else:
-        distances = ((coordinates[edges[:, 0]] - coordinates[edges[:, 1]]) ** 2).sum(
-            axis=1
-        ).squeeze() ** 0.5
-        weights = _kernel_functions[kernel](distances, bandwidth)
-
-    # TODO: check for coincident points which result in
-    # dropped points from the triangulation and the
-    # misalignment of the weights and the attribute array
-
-    return head, tail, weights
+    return heads_ix, tails_ix
 
 
-def _gabriel(coordinates, ids=None, bandwidth=numpy.inf, kernel="boxcar"):
+@_validate_coincident
+def _gabriel(coordinates):
     """
     Constructs the Gabriel graph of a set of points. This graph is a subset of
     the Delaunay triangulation where only "short" links are retained. This
@@ -148,35 +204,19 @@ def _gabriel(coordinates, ids=None, bandwidth=numpy.inf, kernel="boxcar"):
         " to accelerate the computation of graphs. Without numba,"
         " these computations may become unduly slow on large data."
     )
-    coordinates, ids, _ = _validate_geometry_input(
-        coordinates, ids=ids, valid_geometry_types=_VALID_GEOMETRY_TYPES
-    )
     edges, dt = _voronoi_edges(coordinates)
     droplist = _filter_gabriel(
         edges,
         dt.points,
     )
-    output = numpy.row_stack(list(set(map(tuple, edges)).difference(set(droplist))))
-    ids = numpy.asarray(ids)
-    head, tail = ids[output[:, 0]], ids[output[:, 1]]
-
-    if bandwidth is numpy.inf and kernel == "boxcar":
-        weights = numpy.ones(head.shape, dtype=numpy.int8)
-    else:
-        distances = ((coordinates[output[:, 0]] - coordinates[output[:, 1]]) ** 2).sum(
-            axis=1
-        ).squeeze() ** 0.5
-        weights = _kernel_functions[kernel](distances, bandwidth)
-
-    # TODO: check for coincident points which result in
-    # dropped points from the triangulation and the
-    # misalignment of the weights and the attribute array
+    heads_ix, tails_ix = numpy.row_stack(
+        list(set(map(tuple, edges)).difference(set(droplist)))
+    ).T
 
-    return head, tail, weights
+    return heads_ix, tails_ix
 
 
-def _relative_neighborhood(coordinates, ids=None, bandwidth=numpy.inf, kernel="boxcar"):
+@_validate_coincident
+def _relative_neighborhood(coordinates):
     """
     Constructs the Relative Neighborhood graph from a set of points.
     This graph is a subset of the Delaunay triangulation, where only
@@ -219,26 +259,18 @@ def _relative_neighborhood(coordinates, ids=None, bandwidth=numpy.inf, kernel="b
         " to accelerate the computation of graphs. Without numba,"
         " these computations may become unduly slow on large data."
     )
-    coordinates, ids, _ = _validate_geometry_input(
-        coordinates, ids=ids, valid_geometry_types=_VALID_GEOMETRY_TYPES
-    )
+
     edges, dt = _voronoi_edges(coordinates)
     output, _ = _filter_relativehood(edges, dt.points, return_dkmax=False)
 
-    head_ix, tail_ix, distance = zip(*output)
-
-    head, tail = ids[numpy.asarray(head_ix)], ids[numpy.asarray(tail_ix)]
+    heads_ix, tails_ix, distance = zip(*output)
+    heads_ix, tails_ix = numpy.asarray(heads_ix), numpy.asarray(tails_ix)
 
-    if bandwidth is numpy.inf and kernel == "boxcar":
-        weights = numpy.ones(head.shape, dtype=numpy.int8)
-    else:
-        weights = _kernel_functions[kernel](numpy.array(distance), bandwidth)
+    return heads_ix, tails_ix
 
-    return head, tail, weights
-
-
-def _voronoi(coordinates, ids=None, clip="extent", rook=True):
+@_validate_coincident
+def _voronoi(coordinates, clip="extent", rook=True):
     """
     Compute contiguity weights according to a clipped
     Voronoi diagram.
@@ -290,13 +322,10 @@ def _voronoi(coordinates, ids=None, clip="extent", rook=True):
     delaunay triangulations in many applied contexts and
     generally will remove "long" links in the delaunay graph.
     """
-    coordinates, ids, _ = _validate_geometry_input(
-        coordinates, ids=ids, valid_geometry_types=_VALID_GEOMETRY_TYPES
-    )
-
     cells, _ = voronoi_frames(coordinates, clip=clip)
-    return _vertex_set_intersection(cells, rook=rook, ids=ids)
+    heads_ix, tails_ix, weights = _vertex_set_intersection(cells, rook=rook)
+    return heads_ix, tails_ix
 
 
 #### utilities
@@ -342,7 +371,6 @@ def _filter_gabriel(edges, coordinates):
     in order to construct the Gabriel graph.
     """
     edge_pointer = 0
-    edges.max()
     n_edges = len(edges)
     to_drop = []
     while edge_pointer < n_edges:
@@ -382,8 +410,7 @@ def _filter_relativehood(edges, coordinates, return_dkmax=False):
     3. for each edge of the delaunay (i,j), prune
        if any dkmax is greater than d(i,j)
     """
-    n = edges.max()
-    len(edges)
+    n_edges = len(edges)
     out = []
     r = []
     for edge in edges:
@@ -393,7 +420,7 @@ def _filter_relativehood(edges, coordinates, return_dkmax=False):
         dkmax = 0
         dij = ((pi - pj) ** 2).sum() ** 0.5
         prune = False
-        for k in range(n):
+        for k in range(n_edges):
             pk = coordinates[k]
             dik = ((pi - pk) ** 2).sum() ** 0.5
             djk = ((pj - pk) ** 2).sum() ** 0.5
@@ -420,4 +447,4 @@ def _voronoi_edges(coordinates):
         .drop_duplicates()
         .values
     )
-    return edges, dt
+    return edges, dt
\ No newline at end of file
diff --git a/libpysal/graph/_utils.py b/libpysal/graph/_utils.py
index 7a5cc9090..8775602b9 100644
--- a/libpysal/graph/_utils.py
+++ b/libpysal/graph/_utils.py
@@ -2,6 +2,80 @@
 import numpy as np
 import pandas as pd
 import shapely
+import geopandas
+from itertools import permutations
+
+
+def _sparse_to_arrays(sparray, ids=None):
+    if ids is None:
+        maxdim = np.maximum(*sparray.shape)
+        ids = np.arange(maxdim)
+    head_ix, tail_ix = sparray.nonzero()
+    return ids[head_ix], ids[tail_ix], sparray.data
+
+
+def _jitter_geoms(coordinates, geoms, seed=None):
+    """
+    Jitter geometries based on the smallest required movements to induce
+    uniqueness. For each point, this samples a radius and angle uniformly
+    at random from the unit circle, rescales it to a circle of values that
+    are extremely small relative to the precision of the input, and
+    then displaces the point. For a non-euclidean geometry, like latitude-
+    longitude coordinates, this will distort according to a plate carrée
+    projection, jittering slightly more in the x direction than the y direction.
+    """
+    rng = np.random.default_rng(seed=seed)
+    dtype = coordinates.dtype
+    if dtype not in (np.float32, np.float64):
+        # jittering requires us to cast ints to float
+        # and the rng.random generator only works with float32 and float64
+        dtype = np.float32
+    # the resolution is the approximate difference between two floats
+    # that can be resolved at the given dtype.
+    resolution = np.finfo(dtype).resolution
+    r = rng.random(size=coordinates.shape[0], dtype=dtype) ** .5 * resolution
+    theta = rng.random(size=coordinates.shape[0], dtype=dtype) * np.pi * 2
+    # converting from polar to cartesian
+    dx = r * np.sin(theta)
+    dy = r * np.cos(theta)
+    # then adding the displacements
+    coordinates = coordinates + np.column_stack((dx, dy))
+    geoms = geopandas.GeoSeries(geopandas.points_from_xy(*coordinates.T, crs=geoms.crs))
+    return coordinates, geoms
+
+
+def _induce_cliques(adjtable, clique_to_members, fill_value=1):
+    """
+    induce cliques into the input graph. This connects everything within a
+    clique together, as well as connecting all things outside of the clique
+    to all members of the clique. For example, if observations 1 and 2 are
+    coincident at one site and observation 0 neighbors that site, the output
+    relates 0 to both 1 and 2 (copying the site-level weight) and links 1 and
+    2 to one another with ``fill_value`` as the weight.
+
+    This does not guarantee/understand ordering of the *output* adjacency table.
+    """
+    adj_across_clique = adjtable.merge(
+        clique_to_members['input_index'], left_index=True, right_index=True
+    ).explode('input_index').rename(
+        columns=dict(input_index='subclique_focal')
+    ).merge(
+        clique_to_members['input_index'], left_on='neighbor', right_index=True
+    ).explode('input_index').rename(
+        columns=dict(input_index='subclique_neighbor')
+    ).reset_index().drop(
+        ['focal', 'neighbor', 'index'], axis=1
+    ).rename(
+        columns=dict(subclique_focal="focal", subclique_neighbor='neighbor')
+    )
+    is_multimember_clique = clique_to_members['input_index'].str.len() > 1
+    adj_within_clique = clique_to_members[
+        is_multimember_clique
+    ]['input_index'].apply(
+        lambda x: list(permutations(x, 2))
+    ).explode().apply(
+        pd.Series
+    ).rename(columns={0: "focal", 1: "neighbor"}).assign(weight=fill_value)
+
+    new_adj = pd.concat(
+        (adj_across_clique, adj_within_clique),
+        ignore_index=True, axis=0
+    ).reset_index(drop=True)
+
+    return new_adj
 
 
 def _neighbor_dict_to_edges(neighbors, weights=None):
@@ -23,6 +97,20 @@ def _neighbor_dict_to_edges(neighbors, weights=None):
     return heads, tails, data_array
 
 
+def _build_coincidence_lookup(geoms):
+    """
+    Identify coincident points and create a look-up table for the coincident geometries.
+    """
+    valid_coincident_geom_types = set(("Point",))
+    if not set(geoms.geom_type) <= valid_coincident_geom_types:
+        raise ValueError(
+            f"coincidence checks are only well-defined for geom_types: {valid_coincident_geom_types}"
+        )
+    max_coincident = geoms.geometry.duplicated().sum()
+    lut = geoms.to_frame("geometry").reset_index().groupby("geometry")['index'].agg(list).reset_index()
+    lut = geopandas.GeoDataFrame(lut)
+    return max_coincident, lut.rename(columns=dict(index='input_index'))
+
+
 def _validate_geometry_input(geoms, ids=None, valid_geometry_types=None):
     """
     Ensure that input geometries are always aligned to (and refer back to)
@@ -171,6 +259,13 @@ def lat2Graph(nrows=5, ncols=5, rook=True, id_type="int"):
     return Graph.from_dicts(weights)
 
 
+def _vec_euclidean_distances(X, Y):
+    """
+    compute the euclidean distances along corresponding rows of two arrays
+    """
+    return ((X - Y) ** 2).sum(axis=1) ** .5
+
+
 def _evaluate_index(data):
     """Helper to get ids from any input."""
     return (
diff --git a/libpysal/graph/base.py b/libpysal/graph/base.py
index 7a31f1dcf..707e5ba8a 100644
--- a/libpysal/graph/base.py
+++ b/libpysal/graph/base.py
@@ -445,6 +445,7 @@ def build_triangulation(
     kernel="boxcar",
     clip="extent",
     rook=True,
+    coincident='raise'
 ):
     """Generate Graph from geometry based on triangulation
 
@@ -490,6 +491,11 @@ def build_triangulation(
         If True, two geometries are considered neighbours if they
         share at least one edge. If False, two geometries are considered
         neighbours if they share at least one vertex. By default True
+    coincident: str, optional (default "raise")
+        Method for handling coincident points. Options include
+        ``'raise'`` (raising an exception when coincident points are present),
+        ``'jitter'`` (randomly displace coincident points to make them unique), and
+        ``'clique'`` (induce fully-connected sub-cliques for coincident points).
 
     Returns
     -------
@@ -500,18 +506,18 @@ def build_triangulation(
 
     if method == "delaunay":
         head, tail, weights = _delaunay(
-            data, ids=ids, bandwidth=bandwidth, kernel=kernel
+            data, ids=ids, bandwidth=bandwidth, kernel=kernel, coincident=coincident
         )
     elif method == "gabriel":
         head, tail, weights = _gabriel(
-            data, ids=ids, bandwidth=bandwidth, kernel=kernel
+            data, ids=ids, bandwidth=bandwidth, kernel=kernel, coincident=coincident
        )
     elif method == "relative_neighborhood":
         head, tail, weights = _relative_neighborhood(
-            data, ids=ids, bandwidth=bandwidth, kernel=kernel
+            data, ids=ids, bandwidth=bandwidth, kernel=kernel, coincident=coincident
         )
     elif method == "voronoi":
-        head, tail, weights = _voronoi(data, ids=ids, clip=clip, rook=rook)
+        head, tail, weights = _voronoi(data, ids=ids, clip=clip, rook=rook, coincident=coincident)
     else:
         raise ValueError(
             f"Method '{method}' is not supported. Use one of ['delaunay', "
@@ -873,6 +879,16 @@ def n(self):
         """Number of observations."""
         return self.unique_ids.shape[0]
 
+    @cached_property
+    def n_nodes(self):
+        """Number of nodes in the graph."""
+        return self.unique_ids.shape[0]
+
+    @cached_property
+    def n_edges(self):
+        """Number of edges in the graph."""
+        return self.adjacency.shape[0]
+
     @cached_property
     def pct_nonzero(self):
         """Percentage of nonzero weights."""
@@ -882,8 +898,7 @@ def pct_nonzero(self):
     @cached_property
     def nonzero(self):
         """Number of nonzero weights."""
-        nnz = self.sparse.nnz
-        return nnz
+        return self.n_edges - len(self.isolates)
 
     def asymmetry(self, intrinsic=True):
         """Asymmetry check.
@@ -1053,6 +1068,27 @@ def to_parquet(self, path, **kwargs):
         """
         _to_parquet(self, path, **kwargs)
 
+
+def _arrange_arrays(heads, tails, weights, ids=None):
+    """
+    Rearrange input arrays so that observation indices
+    are well-ordered with respect to the input ids. That is,
+    an "early" identifier should always precede a "later" identifier
+    in the heads; the output is sorted by the heads *first*, and
+    then by the tails within each head.
+    """
+    if ids is None:
+        ids = numpy.unique(numpy.hstack((heads, tails)))
+    lookup = list(ids).index
+    input_df = pandas.DataFrame.from_dict(
+        dict(focal=heads, neighbor=tails, weight=weights)
+    )
+    return input_df.set_index(['focal', 'neighbor']).assign(
+        focal_loc=input_df.focal.apply(lookup).values,
+        neighbor_loc=input_df.neighbor.apply(lookup).values
+    ).sort_values(
+        ['focal_loc', 'neighbor_loc']
+    ).reset_index().drop(
+        ['focal_loc', 'neighbor_loc'], axis=1
+    ).values.T
+
 
 def read_parquet(path, **kwargs):
     """Read Graph from a Apache Parquet
diff --git a/libpysal/graph/tests/conftests.py b/libpysal/graph/tests/conftest.py
similarity index 99%
rename from libpysal/graph/tests/conftests.py
rename to libpysal/graph/tests/conftest.py
index cd5dfaa93..97ad11e49 100644
--- a/libpysal/graph/tests/conftests.py
+++ b/libpysal/graph/tests/conftest.py
@@ -1,3 +1,4 @@
+"""
 import geodatasets
 import geopandas
 import pytest
@@ -30,3 +31,4 @@
 parametrize_external_ids = pytest.mark.parametrize(
     "external_ids", external_ids, ids=["use set_index", "use id vector"]
 )
+"""
\ No newline at end of file
diff --git a/libpysal/graph/tests/test_base.py b/libpysal/graph/tests/test_base.py
index bc0600f4c..6bc27b34d 100644
--- a/libpysal/graph/tests/test_base.py
+++ b/libpysal/graph/tests/test_base.py
@@ -225,13 +225,13 @@ def test_from_sparse(self):
         sp = sparse.coo_array((data, (row, col)), shape=(4, 4))
         G = graph.Graph.from_sparse(sp)
         expected = graph.Graph.from_arrays(row, col, data)
-        assert G == expected
+        assert G == expected, "sparse constructor does not match arrays constructor"
 
         G = graph.Graph.from_sparse(sp.tocsr())
-        assert G == expected
+        assert G == expected, "csr input does not match coo input"
 
         G = graph.Graph.from_sparse(sp.tocsc())
-        assert G == expected
+        assert G == expected, "csc input does not match coo input"
 
         ids = ["zero", "one", "two", "three"]
 
@@ -249,13 +249,13 @@ def test_from_sparse(self):
             sp.tocsr(),
             ids=ids,
         )
-        assert G == expected
+        assert G == expected, "sparse csr with ids does not match arrays constructor"
 
         G = graph.Graph.from_sparse(
             sp.tocsc(),
             ids=ids,
         )
-        assert G == expected
+        assert G == expected, "sparse csc with ids does not match arrays constructor"
 
         dense = np.array(
             [
@@ -297,7 +297,7 @@ def test_from_sparse(self):
             ],
             np.ones(10),
         )
-        assert G == expected
+        assert G == expected, "sparse csr nybb with ids does not match arrays constructor"
         np.testing.assert_array_equal(G.sparse.todense(), sp.todense())
 
     def test_from_arrays(self):
diff --git a/libpysal/graph/tests/test_builders.py b/libpysal/graph/tests/test_builders.py
index dd8f6d3a9..7d6c8ef98 100644
--- a/libpysal/graph/tests/test_builders.py
+++ b/libpysal/graph/tests/test_builders.py
@@ -79,9 +79,10 @@ def test_block_contiguity(self):
 
 class TestTriangulation:
     def setup_method(self):
-        self.gdf = gpd.read_file(geodatasets.get_path("geoda liquor_stores")).explode(
+        gdf = gpd.read_file(geodatasets.get_path("geoda liquor_stores")).explode(
             ignore_index=True
         )
+        self.gdf = gdf[~gdf.geometry.duplicated()]
         self.gdf_str = self.gdf.set_index("placeid")
 
     @pytest.mark.parametrize("method", TRIANGULATIONS)
diff --git a/libpysal/graph/tests/test_triangulation.py b/libpysal/graph/tests/test_triangulation.py
index 14629b758..e672d91cd 100644
--- a/libpysal/graph/tests/test_triangulation.py
+++ b/libpysal/graph/tests/test_triangulation.py
@@ -9,98 +9,154 @@
 - numba/nonumba
 """
 import pandas, geopandas, geodatasets, pytest, shapely, numpy
+from scipy import spatial
 
 from libpysal.graph._triangulation import (
     _delaunay,
     _gabriel,
     _relative_neighborhood,
     _voronoi,
 )
+from libpysal.graph._kernel import _kernel_functions
+from libpysal.graph.base import Graph
 
-# ### TODO: is there any way to remove this duplication
-# ### btw. test_triangulation and test_kernel using the
-# ### conftests.py? We need the same data/kernel combos
-# ### and also need to parameterize on numba existing,
-# ### and also have to test coincident points.
-
-# grocs = geopandas.read_file(
-#     geodatasets.get_path("geoda groceries")
-# )[['OBJECTID', 'geometry']]
-# grocs['strID'] = grocs.OBJECTID.astype(str)
-# grocs['intID'] = grocs.OBJECTID.values
-
-# kernel_functions = list(_kernel_functions.keys())
-
-# def my_kernel(distances, bandwidth):
-#     output = numpy.cos(distances/distances.max())
-#     output[distances < bandwidth] = 0
-#     return output
-
-# kernel_functions.append(my_kernel)
-
-# # optimal, small, and larger than largest distance.
-# # the optimal bandwidth should be smaller than the
-# # max distance for all kernels except identity
-# bandwidths = [None, .05, .4]
-
-# numpy.random.seed(6301)
-# # create a 2-d laplace distribution as a "degenerate"
-# # over-concentrated distribution
-# lap_coords = numpy.random.laplace(size=(200,2))
-
-# # create a 2-d cauchy as a "degenerate"
-# # spatial outlier-y distribution
-# # cau_coords = numpy.random.standard_cauchy(size=(200,2))
-
-# data = (
-#     grocs,
-#     lap_coords
-# )
-
-# parametrize_ids = pytest.mark.parametrize("ids", [None, "strID", "intID"])
-# parametrize_data = pytest.mark.parametrize("data", [grocs, lap_coords, cau_coords], ["groceries", "coords: 2d-laplacian"])
-# parametrize_kernelfunctions = pytest.mark.parametrize(
-#     "kernel",
-#     kernel_functions,
-#     kernel_functions[:-2] + ['None', 'custom callable']
-# )
-# parametrize_bw = pytest.mark.parametrize(
-#     "bandwidth",
-#     bandwidth,
-#     ['optimal', 'small', 'large']
-# )
-# paramterize_graphs = pytest.mark.parametrize(
-#     "graphtype",
-#     (delaunay, gabriel, relative_neighborhood, voronoi),
-#     ['delaunay', 'gabriel' 'relative_neighborhood', 'voronoi']
-# )
-
-# @parametrize_ids
-# @parametrize_data
-# @parametrize_kernelfunctions
-# @paramterize_bw
-# def test_voronoi(data, ids, kernel, bandwidth):
-#     raise NotImplementedError()
-
-# @parametrize_ids
-# @parametrize_data
-# @parametrize_kernelfunctions
-# @paramterize_bw
-# def test_delaunay(data, ids, kernel, bandwidth):
-#     raise NotImplementedError()
-
-# @parametrize_ids
-# @parametrize_data
-# @parametrize_kernelfunctions
-# @paramterize_bw
-# def test_gabriel(data, ids, kernel, bandwidth):
-#     raise NotImplementedError()
-
-# @parametrize_ids
-# @parametrize_data
-# @parametrize_kernelfunctions
-# @paramterize_bw
-# def test_relative_neighborhood(data, ids, kernel, bandwidth):
-#     raise NotImplementedError()
-
-# @paramterize_graphs
-# def test_collinear()
+stores = geopandas.read_file(geodatasets.get_path("geoda liquor_stores")).explode(index_parts=False)
+stores_unique = stores.drop_duplicates(subset='geometry')
+
+kernel_functions = [None] + list(_kernel_functions.keys())
+
+def my_kernel(distances, bandwidth):
+    output = numpy.cos(distances/distances.max())
+    output[distances < bandwidth] = 0
+    return output
+
+kernel_functions.append(my_kernel)
+
+# optimal, small, and larger than largest distance.
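+# the 'auto' bandwidth should be smaller than the max
+# distance for all kernels except identity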
+bandwidths = [None, 'auto', .5]
+
+numpy.random.seed(6301)
+# create a 2-d laplace distribution as a "degenerate"
+# over-concentrated distribution
+lap_coords = numpy.random.laplace(size=(5, 2))
+
+# create a 2-d cauchy as a "degenerate"
+# spatial outlier-y distribution
+cau_coords = numpy.random.standard_cauchy(size=(5, 2))
+
+parametrize_ids = pytest.mark.parametrize(
+    "ids", [None, "id", "placeid"], ids=["no id", "id", "placeid"]
+)
+
+parametrize_kernelfunctions = pytest.mark.parametrize(
+    "kernel",
+    kernel_functions,
+    ids=kernel_functions[:-2] + ['None', 'custom callable']
+)
+parametrize_bw = pytest.mark.parametrize(
+    "bandwidth",
+    bandwidths,
+    ids=["no bandwidth", 'auto', 'fixed']
+)
+parametrize_constructors = pytest.mark.parametrize(
+    "constructor",
+    [_delaunay, _gabriel, _relative_neighborhood, _voronoi],
+    ids=['delaunay', 'gabriel', 'relhood', 'voronoi']
+)
+
+
+@parametrize_constructors
+@parametrize_ids
+@parametrize_kernelfunctions
+@parametrize_bw
+def test_option_combinations(constructor, ids, kernel, bandwidth):
+    """
+    NOTE: This does not check the *validity* of the output, just
+    the structure of the output.
+    """
+    heads, tails, weights = constructor(
+        stores_unique,
+        ids=stores_unique[ids] if ids is not None else None,
+        kernel=kernel,
+        bandwidth=bandwidth
+    )
+    assert heads.dtype == tails.dtype
+    assert heads.dtype == stores_unique.get(ids, stores_unique.index).dtype, 'ids failed to propagate'
+    if kernel is None and bandwidth is None:
+        numpy.testing.assert_array_equal(weights, numpy.ones_like(heads))
+    assert set(zip(heads, tails)) == set(zip(tails, heads)), "all triangulations should be symmetric, but this one is not"
+
+
+def test_correctness_voronoi_clipping():
+    noclip = _voronoi(lap_coords, clip=None, rook=True)
+    extent = _voronoi(lap_coords, clip='extent', rook=True)
+    alpha = _voronoi(lap_coords, clip='ashape', rook=True)
+
+    G_noclip = Graph.from_arrays(*noclip)
+    G_extent = Graph.from_arrays(*extent)
+    G_alpha = Graph.from_arrays(*alpha)
+
+    assert G_alpha < G_extent
+    assert G_extent <= G_noclip
+
+    D = spatial.distance.squareform(spatial.distance.pdist(lap_coords))
+
+    extent_known = [
+        numpy.array([0,0,0,0,1,1,1,2,2,3,3,3,4,4]),
+        numpy.array([1,2,3,4,0,3,4,0,3,0,1,2,0,1]),
+    ]
+    alpha_known = [
+        numpy.array([0,0,0,1,2,2,3,3,4,4]),
+        numpy.array([2,3,4,4,0,3,0,2,0,1])
+    ]
+
+    numpy.testing.assert_array_equal(G_extent.adjacency.index, extent_known[0])
+    numpy.testing.assert_array_equal(G_extent.adjacency.neighbor, extent_known[1])
+
+    numpy.testing.assert_array_equal(G_alpha.adjacency.index, alpha_known[0])
+    numpy.testing.assert_array_equal(G_alpha.adjacency.neighbor, alpha_known[1])
+
+
+def test_correctness_delaunay_family():
+    for i, data in enumerate((cau_coords, lap_coords)):
+        voronoi = _voronoi(data, clip=False)
+        delaunay = _delaunay(data)
+        gabriel = _gabriel(data)
+        relneigh = _relative_neighborhood(data)
+        G_voronoi = Graph.from_arrays(*voronoi)
+        G_delaunay = Graph.from_arrays(*delaunay)
+        G_gabriel = Graph.from_arrays(*gabriel)
+        G_relneigh = Graph.from_arrays(*relneigh)
+
+        if i:
+            known_delaunay = [
+                (0,0,0,1,1,1,2,2,2,2,3,3,3,4,4,4),
+                (1,2,4,0,2,3,0,1,3,4,1,2,4,0,2,3)
+            ]
+            known_gabriel = [
+                (4,0,0,2,1,3,2,0,3,4),
+                (0,4,3,0,4,0,3,2,2,1)
+            ]
+            known_relneigh = [
+                (0,0,0,0,1,1,2,2,3,3,4,4),
+                (1,2,3,4,0,4,0,3,0,2,0,1)
+            ]
+        else:
+            known_delaunay = [
+                (0,0,0,0,1,1,1,2,2,2,3,3,3,4,4,4),
+                (1,2,3,4,0,3,4,0,3,4,0,1,2,0,1,2)
+            ]
+            known_gabriel = [
+                (4,0,0,2,1,3,2,0,3,4),
+                (0,4,3,0,4,0,3,2,2,1)
+            ]
+            known_relneigh = [
+                (0,0,0,0,1,1,2,2,3,3,4,4),
+                (1,2,3,4,0,4,0,3,0,2,0,1)
+            ]
+
+        assert G_voronoi == G_delaunay
+        assert G_gabriel <= G_delaunay
+        assert G_relneigh <= G_delaunay
+        for j, name in enumerate(['delaunay', 'gabriel', 'relneigh']):
+            known = (known_delaunay, known_gabriel, known_relneigh)[j]
+            computed = (delaunay, gabriel, relneigh)[j]
+            numpy.testing.assert_array_equal(
+                numpy.asarray(known),
+                numpy.row_stack((computed[0], computed[1])),
+                err_msg=(
+                    f"computed {name} not equivalent to stored {name} for "
+                    f"{('cauchy', 'laplacian')[i]} coordinates!"
+                )
+            )
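
As a usage sketch of the coincident-point handling introduced above (a minimal, hedged example: `_delaunay` and its `coincident` keyword are as defined in this diff, and the duplicated liquor-stores data mirrors the tests):

```python
import geopandas, geodatasets
from libpysal.graph._triangulation import _delaunay

# once exploded, the liquor-stores data contains duplicate point locations
stores = geopandas.read_file(
    geodatasets.get_path("geoda liquor_stores")
).explode(index_parts=False)

try:
    # default behavior: coincident points raise for triangulation graphs
    heads, tails, weights = _delaunay(stores)
except ValueError as err:
    print(err)

# 'jitter' breaks ties with tiny random displacements, while 'clique'
# triangulates the unique sites and then expands each site into a
# fully-connected subclique over its coincident members
heads, tails, weights = _delaunay(stores, coincident="jitter")
heads, tails, weights = _delaunay(stores, coincident="clique")
```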