Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add STRtree nearest neighbor(s) function #272

Merged
merged 27 commits into from Feb 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
cf5e55c
Rework index tracking in STRtree to use addresses of geometries in tr…
brendan-ward Dec 6, 2020
051204b
Merge branch 'master' into pr261_2
brendan-ward Dec 9, 2020
165e285
PR feedback
brendan-ward Dec 9, 2020
e275253
WIP: initial refactor of STRtree::nearest
brendan-ward Dec 14, 2020
c60e870
Fix accidental line delete, require GEOS >= 3.6 for STRtree::nearest
brendan-ward Dec 14, 2020
88bf099
Handle equidistant / intersected tree geometries correctly
brendan-ward Dec 14, 2020
3820a82
exclude nearest from doctests
brendan-ward Dec 14, 2020
dbc5a71
Fix missed skip of doctest
brendan-ward Dec 14, 2020
713859b
Add nearest benchmarks
brendan-ward Dec 14, 2020
3955e57
Update changelog, remove TODOs
brendan-ward Dec 14, 2020
6c9b2ba
Add prescreening in STRtree::nearest by max distance
brendan-ward Dec 15, 2020
2186e14
Fix bugs in get_bounds for GEOS < 3.7
brendan-ward Dec 15, 2020
4f68f9d
Fix missing return for NAN
brendan-ward Dec 15, 2020
f213290
Try again, moving retval up
brendan-ward Dec 15, 2020
3bb676b
Merge branch 'master' into strtree_nearest_take2
brendan-ward Jan 29, 2021
ddeb142
Add back in changes after merge
brendan-ward Jan 29, 2021
a7f3d7e
PR feedback, esp. nearest => nearest_all
brendan-ward Feb 23, 2021
99617dc
Add STRtree::nearest
brendan-ward Feb 24, 2021
c292daf
docstring cleanup, expand tests, update benchmarks
brendan-ward Feb 24, 2021
985d57f
Merge branch 'master' into strtree_nearest_take2
brendan-ward Feb 24, 2021
95e95d2
Update changelog
brendan-ward Feb 24, 2021
8284081
Fix bounds and box C header entries
brendan-ward Feb 24, 2021
f6de7a6
Merge branch 'master' into strtree_nearest_take2
brendan-ward Feb 25, 2021
11848ac
Add benchmarks for equidistant nearest neighbors
brendan-ward Feb 25, 2021
ae938bc
Add benchmark to manually search for equidistant neighbors
brendan-ward Feb 25, 2021
4585152
Add comment for tree nearest manual benchmark
brendan-ward Feb 25, 2021
f362d7c
Optimization: pass GEOSGeometry query geometry to distance callback
brendan-ward Feb 26, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 6 additions & 3 deletions CHANGELOG.rst
Expand Up @@ -4,9 +4,10 @@ Changelog
Version 0.10 (unreleased)
------------------------

**Highlights of this release**
**Major enhancements**

* ...
* Addition of ``nearest`` and ``nearest_all`` functions to ``STRtree`` for
GEOS >= 3.6 to find the nearest neighbors (#272).

**API Changes**

Expand All @@ -26,7 +27,9 @@ Version 0.10 (unreleased)
Thanks to everyone who contributed to this release!
People with a "+" by their names contributed a patch for the first time.

* ...
* Brendan Ward
* Casper van der Wel
* Joris Van den Bossche


Version 0.9 (2021-01-23)
Expand Down
134 changes: 128 additions & 6 deletions benchmarks/benchmarks.py
Expand Up @@ -8,6 +8,7 @@

class PointPolygonTimeSuite:
"""Benchmarks running on 100000 points and one polygon"""

def setup(self):
self.points = pygeos.points(np.random.random((100000, 2)))
self.polygon = pygeos.polygons(np.random.random((3, 2)))
Expand All @@ -24,6 +25,7 @@ def time_intersection(self):

class IOSuite:
"""Benchmarks I/O operations (WKT and WKB) on a set of 10000 polygons"""

def setup(self):
self.to_write = pygeos.polygons(np.random.random((10000, 100, 2)))
self.to_read_wkt = pygeos.to_wkt(self.to_write)
Expand All @@ -44,6 +46,7 @@ def time_read_from_wkb(self):

class ConstructiveSuite:
"""Benchmarks constructive functions on a set of 10,000 points"""

def setup(self):
self.points = pygeos.points(np.random.random((10000, 2)))

Expand All @@ -66,16 +69,15 @@ class ClipSuite:
def setup(self):
# create irregular polygons by merging overlapping point buffers
self.polygon = pygeos.union_all(
pygeos.buffer(pygeos.points(np.random.random((1000, 2)) * 500), 10)
)
pygeos.buffer(pygeos.points(np.random.random((1000, 2)) * 500), 10)
)
xmin = np.random.random(100) * 100
xmax = xmin + 100
ymin = np.random.random(100) * 100
ymax = ymin + 100
self.bounds = np.array([xmin, ymin, xmax, ymax]).T
self.boxes = pygeos.box(xmin, ymin, xmax, ymax)


def time_clip_by_box(self):
pygeos.intersection(self.polygon, self.boxes)

Expand All @@ -88,7 +90,13 @@ class GetParts:
"""Benchmarks for getting individual parts from 100 multipolygons of 100 polygons each"""

def setup(self):
self.multipolygons = np.array([pygeos.multipolygons(pygeos.polygons(np.random.random((2, 100, 2)))) for i in range(10000)], dtype=object)
self.multipolygons = np.array(
[
pygeos.multipolygons(pygeos.polygons(np.random.random((2, 100, 2))))
for i in range(10000)
],
dtype=object,
)

def time_get_parts(self):
"""Cython implementation of get_parts"""
Expand All @@ -110,9 +118,11 @@ class OverlaySuite:

def setup(self):
# create irregular polygons by merging overlapping point buffers
self.left = pygeos.union_all(pygeos.buffer(pygeos.points(np.random.random((500, 2)) * 500), 15))
self.left = pygeos.union_all(
pygeos.buffer(pygeos.points(np.random.random((500, 2)) * 500), 15)
)
# shift this up and right
self.right = pygeos.apply(self.left, lambda x: x+50)
self.right = pygeos.apply(self.left, lambda x: x + 50)

def time_difference(self):
pygeos.difference(self.left, self.right)
Expand Down Expand Up @@ -174,6 +184,20 @@ def setup(self):
# initialize the tree by making a tiny query first
self.tree.query(pygeos.points(0, 0))

# create points that extend beyond the domain of the above polygons to ensure
# some don't overlap
self.points = pygeos.points((np.random.random((2000, 2)) * 750) - 125)
self.point_tree = pygeos.STRtree(
pygeos.points(np.random.random((2000, 2)) * 750)
)
self.point_tree.query(pygeos.points(0, 0))

# create points on a grid for testing equidistant nearest neighbors
# creates 2025 points
grid_coords = np.mgrid[:45, :45].T.reshape(-1, 2)
self.grid_point_tree = pygeos.STRtree(pygeos.points(grid_coords))
self.grid_points = pygeos.points(grid_coords + 0.5)

def time_tree_create(self):
tree = pygeos.STRtree(self.polygons)
tree.query(pygeos.points(0, 0))
Expand Down Expand Up @@ -207,3 +231,101 @@ def time_tree_query_bulk_covered_by(self):

def time_tree_query_bulk_contains_properly(self):
self.tree.query_bulk(self.polygons, predicate="contains_properly")

def time_tree_nearest_points(self):
self.point_tree.nearest(self.points)

def time_tree_nearest_points_equidistant(self):
self.grid_point_tree.nearest(self.grid_points)

def time_tree_nearest_points_equidistant_manual_all(self):
# This benchmark approximates nearest_all for equidistant results
# starting from singular nearest neighbors and searching for more
# within same distance.

# try to find all equidistant neighbors ourselves given single nearest
# result
l, r = self.grid_point_tree.nearest(self.grid_points)
# calculate distance to nearest neighbor
dist = pygeos.distance(self.grid_points.take(l), self.grid_point_tree.geometries.take(r))
# include a slight epsilon to ensure nearest are within this radius
b = pygeos.buffer(self.grid_points, dist + 1e-8)

# query the tree for others in the same buffer distance
left, right = self.grid_point_tree.query_bulk(b, predicate='intersects')
dist = pygeos.distance(
self.grid_points.take(left), self.grid_point_tree.geometries.take(right)
)

# sort by left, distance
ix = np.lexsort((right, dist, left))
left = left[ix]
right = right[ix]
dist = dist[ix]

run_start = np.r_[True, left[:-1] != left[1:]]
run_counts = np.diff(np.r_[np.nonzero(run_start)[0], left.shape[0]])

mins = dist[run_start]

# spread to rest of array so we can extract out all within each group that match
all_mins = np.repeat(mins, run_counts)
ix = dist == all_mins
left = left[ix]
right = right[ix]
dist = dist[ix]

def time_tree_nearest_all_points(self):
self.point_tree.nearest_all(self.points)

def time_tree_nearest_all_points_equidistant(self):
self.grid_point_tree.nearest_all(self.grid_points)

def time_tree_nearest_all_points_small_max_distance(self):
# returns >300 results
self.point_tree.nearest_all(self.points, max_distance=5)

def time_tree_nearest_all_points_large_max_distance(self):
# measures the overhead of using a distance that would encompass all tree points
self.point_tree.nearest_all(self.points, max_distance=1000)

def time_tree_nearest_poly(self):
self.tree.nearest(self.points)

def time_tree_nearest_all_poly(self):
self.tree.nearest_all(self.points)

def time_tree_nearest_all_poly_small_max_distance(self):
# returns >300 results
self.tree.nearest_all(self.points, max_distance=5)

def time_tree_nearest_all_poly_python(self):
# returns all input points

# use an arbitrary search tolerance that seems appropriate for the density of
# geometries
tolerance = 200
b = pygeos.buffer(self.points, tolerance, quadsegs=1)
left, right = self.tree.query_bulk(b)
dist = pygeos.distance(self.points.take(left), self.polygons.take(right))

# sort by left, distance
ix = np.lexsort((right, dist, left))
left = left[ix]
right = right[ix]
dist = dist[ix]

run_start = np.r_[True, left[:-1] != left[1:]]
run_counts = np.diff(np.r_[np.nonzero(run_start)[0], left.shape[0]])

mins = dist[run_start]

# spread to rest of array so we can extract out all within each group that match
all_mins = np.repeat(mins, run_counts)
ix = dist == all_mins
left = left[ix]
right = right[ix]
dist = dist[ix]

# arrays are now roughly representative of what tree.nearest_all would provide, though
# some nearest_all neighbors may be missed if they are outside tolerance
126 changes: 126 additions & 0 deletions pygeos/strtree.py
@@ -1,6 +1,7 @@
from enum import IntEnum
import numpy as np
from . import lib
from .decorators import requires_geos


__all__ = ["STRtree"]
Expand Down Expand Up @@ -49,6 +50,8 @@ class STRtree:
>>> # Query geometries that overlap envelopes of `geoms`
>>> tree.query_bulk([pygeos.box(2, 2, 4, 4), pygeos.box(5, 5, 6, 6)]).tolist()
[[0, 0, 0, 1, 1], [2, 3, 4, 5, 6]]
>>> tree.nearest([pygeos.points(1,1), pygeos.points(3,5)]).tolist() # doctest: +SKIP
[[0, 1], [1, 4]]
"""

def __init__(self, geometries, leafsize=10):
Expand Down Expand Up @@ -186,3 +189,126 @@ def query_bulk(self, geometry, predicate=None):
predicate = BinaryPredicate[predicate].value

return self._tree.query_bulk(geometry, predicate)

@requires_geos("3.6.0")
def nearest(self, geometry):
"""Returns the index of the nearest item in the tree for each input
geometry.
If there are multiple equidistant or intersected geometries in the tree,
only a single result is returned for each input geometry, based on the
order that tree geometries are visited; this order may be
nondeterministic.
Any geometry that is None or empty in the input geometries is omitted
from the output.
Parameters
----------
geometry : Geometry or array_like
Input geometries to query the tree.
Returns
-------
ndarray with shape (2, n)
caspervdw marked this conversation as resolved.
Show resolved Hide resolved
The first subarray contains input geometry indexes.
The second subarray contains tree geometry indexes.
See also
--------
nearest_all: returns all equidistant geometries and optional distances
Examples
--------
>>> import pygeos
>>> tree = pygeos.STRtree(pygeos.points(np.arange(10), np.arange(10)))
>>> tree.nearest(pygeos.points(1,1)).tolist() # doctest: +SKIP
[[0], [1]]
>>> tree.nearest([pygeos.box(1,1,3,3)]).tolist() # doctest: +SKIP
[[0], [1]]
>>> points = pygeos.points(0.5,0.5)
>>> tree.nearest([None, pygeos.points(10,10)]).tolist() # doctest: +SKIP
[[1], [9]]
"""

geometry = np.asarray(geometry, dtype=object)
if geometry.ndim == 0:
geometry = np.expand_dims(geometry, 0)

return self._tree.nearest(geometry)

@requires_geos("3.6.0")
def nearest_all(self, geometry, max_distance=None, return_distance=False):
"""Returns the index of the nearest item(s) in the tree for each input
geometry.
If there are multiple equidistant or intersected geometries in tree, all
are returned. Tree indexes are returned in the order they are visited
for each input geometry and may not be in ascending index order; no meaningful
order is implied.
The max_distance used to search for nearest items in the tree may have a
significant impact on performance by reducing the number of input geometries
that are evaluated for nearest items in the tree. Only those input geometries
with at least one tree item within +/- max_distance beyond their envelope will
be evaluated.
The distance, if returned, will be 0 for any intersected geometries in the tree.
Any geometry that is None or empty in the input geometries is omitted from
the output.
Parameters
----------
geometry : Geometry or array_like
Input geometries to query the tree.
max_distance : float, optional (default: None)
Maximum distance within which to query for nearest items in tree.
Must be greater than 0.
return_distance : bool, optional (default: False)
If True, will return distances in addition to indexes.
Returns
-------
indices or tuple of (indices, distances)
indices is an ndarray of shape (2,n) and distances (if present) an
ndarray of shape (n).
The first subarray of indices contains input geometry indices.
The second subarray of indices contains tree geometry indices.
See also
--------
nearest: returns singular nearest geometry for each input
Examples
--------
>>> import pygeos
>>> tree = pygeos.STRtree(pygeos.points(np.arange(10), np.arange(10)))
>>> tree.nearest_all(pygeos.points(1,1)).tolist() # doctest: +SKIP
[[0], [1]]
>>> tree.nearest_all([pygeos.box(1,1,3,3)]).tolist() # doctest: +SKIP
[[0, 0, 0], [1, 2, 3]]
>>> points = pygeos.points(0.5,0.5)
>>> index, distance = tree.nearest_all(points, return_distance=True) # doctest: +SKIP
>>> index.tolist() # doctest: +SKIP
[[0, 0], [0, 1]]
>>> distance.round(4).tolist() # doctest: +SKIP
[0.7071, 0.7071]
>>> tree.nearest_all(None).tolist() # doctest: +SKIP
[[], []]
"""

geometry = np.asarray(geometry, dtype=object)
if geometry.ndim == 0:
geometry = np.expand_dims(geometry, 0)

if max_distance is not None and max_distance <= 0:
raise ValueError("max_distance must be greater than 0")

# a distance of 0 means no max_distance is used
max_distance = max_distance or 0

if return_distance:
return self._tree.nearest_all(geometry, max_distance)

return self._tree.nearest_all(geometry, max_distance)[0]