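# Generate place ("neighborhood") polygons from a set of tagged points and a
# street network: the streets are carved into blocks, and each block is
# assigned to the place whose nearby points score highest.
#
# Usage (inferred from the sys.argv unpacking below):
#   python <script>.py <name_file> <line_file> <point_file> > out.geojson
# name_file:  whitespace-delimited "place_id name" lines
# line_file:  GeoJSON of LineString/MultiLineString (or polygon) features
# point_file: whitespace-delimited "place_id lon lat" lines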
from shapely.geometry import Point, MultiPoint, Polygon, MultiPolygon, asShape
from shapely.ops import cascaded_union, polygonize
from shapely.prepared import prep
from rtree import Rtree
import sys, random, json, numpy, math, pickle, os, geojson

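# Tunable knobs: SAMPLE_SIZE is how many nearest indexed points are consulted
# when scoring a block; SCALE_FACTOR is roughly meters per degree of latitude,
# used to put point-to-block distances on a metric scale; MEDIAN_THRESHOLD is
# the multiple of the median point distance beyond which a point is discarded
# as an outlier.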
SAMPLE_SIZE = 20
SCALE_FACTOR = 111111.0
MEDIAN_THRESHOLD = 5.0

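# For a set of (lon, lat) points, find the median center, then each point's
# approximate planar distance from it (longitude scaled by cos(latitude)).
# Returns (aggregate distance, [(distance, point), ...]); results are
# memoized on the sorted point set.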
median_distance_cache = {}
def median_distances(pts, aggregate=numpy.median):
    key = tuple(sorted(pts))
    if key in median_distance_cache: return median_distance_cache[key]
    median = (numpy.median([pt[0] for pt in pts]),
              numpy.median([pt[1] for pt in pts]))
    distances = []
    for pt in pts:
        dist = math.sqrt(((median[0]-pt[0])*math.cos(median[1]*math.pi/180.0))**2+(median[1]-pt[1])**2)
        distances.append((dist, pt))

    median_dist = aggregate([dist for dist, pt in distances])
    median_distance_cache[key] = (median_dist, distances)
    return (median_dist, distances)

def mean_distances(pts):
    return median_distances(pts, numpy.mean)

name_file, line_file, point_file = sys.argv[1:4]

places = {}
names = {}
blocks = {}
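# Parsed names, blocks, and points are cached in a pickle next to the point
# file, so repeated runs skip the parsing and polygonizing below.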
if os.path.exists(point_file + '.cache'):
    print >>sys.stderr, "Reading from %s cache..." % point_file
    names, blocks, places = pickle.load(file(point_file + ".cache"))
    blocks = map(asShape, blocks)
else:
    all_names = {}
    count = 0
    for line in file(name_file):
        place_id, name = line.strip().split(None, 1)
        all_names[int(place_id)] = name
        count += 1
        if count % 1000 == 0:
            print >>sys.stderr, "\rRead %d names from %s." % (count, name_file),
    print >>sys.stderr, "\rRead %d names from %s." % (count, name_file)

    count = 0
    for line in file(point_file):
        place_id, lon, lat = line.strip().split()
        place_id = int(place_id)
        names[place_id] = all_names.get(place_id, "")
        point = (float(lon), float(lat))
        pts = places.setdefault(place_id, set())
        pts.add(point)
        count += 1
        if count % 1000 == 0:
            print >>sys.stderr, "\rRead %d points in %d places." % (count, len(places)),
    print >>sys.stderr, "\rRead %d points in %d places." % (count, len(places))

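    # Discard outliers: any point farther than MEDIAN_THRESHOLD times the
    # median distance from its place's median center is dropped.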
    count = 0
    discarded = 0
    for place_id, pts in places.items():
        count += 1
        print >>sys.stderr, "\rComputing outliers for %d of %d places..." % (count, len(places)),
        median_dist, distances = median_distances(pts)
        keep = [pt for dist, pt in distances if dist < median_dist * MEDIAN_THRESHOLD]
        discarded += len(pts) - len(keep)
        places[place_id] = keep

    print >>sys.stderr, "%d points discarded." % discarded

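    # Read the street network and carve it into candidate "blocks": if any
    # line features are present, polygonize the network; otherwise the input
    # features are assumed to be polygons already.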
    lines = []
    do_polygonize = False
    print >>sys.stderr, "Reading lines from %s..." % line_file,
    for feature in geojson.loads(file(line_file).read()):
        if feature.geometry.type in ('LineString', 'MultiLineString'):
            do_polygonize = True
        lines.append(asShape(feature.geometry.to_dict()))
    print >>sys.stderr, "%d lines read." % len(lines)
    if do_polygonize:
        print >>sys.stderr, "Polygonizing %d lines..." % (len(lines)),
        blocks = [poly.__geo_interface__ for poly in polygonize(lines)]
        print >>sys.stderr, "%d blocks formed." % len(blocks)
    else:
        blocks = [poly.__geo_interface__ for poly in lines]

if not os.path.exists(point_file + '.cache'):
    print >>sys.stderr, "Caching points, blocks, and names ..."
    pickle.dump((names, blocks, places), file(point_file + ".cache", "w"), -1)
    blocks = map(asShape, blocks)

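# Build an R-tree over every surviving point. Index entry i maps back to
# points[i] = (place_id, Point), so queries return positions in that list.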
points = []
place_list = set()
count = 0
for place_id, pts in places.items():
    count += 1
    print >>sys.stderr, "\rPreparing %d of %d places..." % (count, len(places)),
    for pt in pts:
        place_list.add((len(points), pt+pt, None))
        points.append((place_id, Point(pt)))
print >>sys.stderr, "Indexing...",
index = Rtree(place_list)
print >>sys.stderr, "Done."

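# Score a block against nearby places: each of the SAMPLE_SIZE points nearest
# the block's centroid votes for its place, weighted by 1/sqrt(distance in
# meters, floored at 1m). Returns (score, place_id) pairs, best first.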
def score_block(polygon):
    centroid = polygon.centroid
    score = {}
    for item in index.nearest((centroid.x, centroid.y), SAMPLE_SIZE):
        place_id, point = points[item]
        score.setdefault(place_id, 0.0)
        score[place_id] += 1.0 / math.sqrt(max(polygon.distance(point)*SCALE_FACTOR, 1.0))
    return list(reversed(sorted((sc, place_id) for place_id, sc in score.items())))

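# Assign every block to the single highest-scoring place, repairing invalid
# polygons with buffer(0) where possible.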
count = 0
assigned_blocks = {}
for polygon in blocks:
    count += 1
    print >>sys.stderr, "\rScoring %d of %d blocks..." % (count, len(blocks)),
    if polygon.is_empty: continue
    if not polygon.is_valid:
        polygon = polygon.buffer(0)
        if not polygon.is_valid:
            continue
    scores = score_block(polygon)
    winner = scores[0][1]
    assigned_blocks.setdefault(winner, [])
    assigned_blocks[winner].append(polygon)
print >>sys.stderr, "Done."

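# Union each place's assigned blocks into one (possibly multi-) polygon.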
polygons = {}
count = 0
for place_id in places.keys():
    count += 1
    print >>sys.stderr, "\rMerging %d of %d boundaries..." % (count, len(places)),
    if place_id not in assigned_blocks: continue
    polygons[place_id] = cascaded_union(assigned_blocks[place_id])
print >>sys.stderr, "Done."

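# Where a place came out as a MultiPolygon, keep only the component that
# contains the most of the place's own points; the other components become
# "orphans" to be reassigned below.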
count = 0
orphans = []
for place_id, multipolygon in polygons.items():
    count += 1
    print >>sys.stderr, "\rRemoving %d orphans from %d of %d polygons..." % (len(orphans), count, len(polygons)),
    if type(multipolygon) is not MultiPolygon: continue
    polygon_count = [0] * len(multipolygon)
    for i, polygon in enumerate(multipolygon.geoms):
        prepared = prep(polygon)
        for item in index.intersection(polygon.bounds):
            item_id, point = points[item]
            if item_id == place_id and prepared.intersects(point):
                polygon_count[i] += 1
    winner = max((c, i) for (i, c) in enumerate(polygon_count))[1]
    polygons[place_id] = multipolygon.geoms[winner]
    orphans.extend((place_id, p) for i, p in enumerate(multipolygon.geoms) if i != winner)
print >>sys.stderr, "Done."

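# Repeatedly try to attach each orphan to the best-scoring neighboring place
# it actually touches; stop when a pass makes no progress.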
count = 0
total = len(orphans)
retries = 0
unassigned = None
while orphans:
    unassigned = []
    for origin_id, orphan in orphans:
        count += 1
        changed = False
        print >>sys.stderr, "\rReassigning %d of %d orphans..." % (count-retries, total),
        for score, place_id in score_block(orphan):
            if place_id not in polygons:
                # Turns out we just wind up assigning tiny, inappropriate places
                #polygons[place_id] = orphan
                #changed = True
                continue
            elif place_id != origin_id and orphan.intersects(polygons[place_id]):
                polygons[place_id] = polygons[place_id].union(orphan)
                changed = True
            if changed:
                break
        if not changed:
            unassigned.append((origin_id, orphan))
            retries += 1
    if len(unassigned) == len(orphans):
        # give up
        break
    orphans = unassigned
print >>sys.stderr, "%d retried, %d unassigned." % (retries, len(unassigned))

print >>sys.stderr, "Returning remaining orphans to original places."
for origin_id, orphan in orphans:
    if orphan.intersects(polygons[origin_id]):
        polygons[origin_id] = polygons[origin_id].union(orphan)

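# Rebuild each polygon from its exterior ring(s), dropping any interior
# holes, then buffer(0) to clean up self-intersections.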
print >>sys.stderr, "Buffering polygons."
for place_id, polygon in polygons.items():
    if type(polygon) is Polygon:
        polygon = Polygon(polygon.exterior.coords)
    else:
        polygon = MultiPolygon([Polygon(p.exterior.coords) for p in polygon.geoms])
    polygons[place_id] = polygon.buffer(0)
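# Emit a GeoJSON FeatureCollection on stdout; each feature carries the
# place's numeric id (a WOE ID, judging by the property name) and its name.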
print >>sys.stderr, "Writing output."
features = []
for place_id, poly in polygons.items():
    features.append({
        "type": "Feature",
        "id": place_id,
        "geometry": poly.__geo_interface__,
        "properties": {"woe_id": place_id, "name": names.get(place_id, "")}
    })

collection = {
    "type": "FeatureCollection",
    "features": features
}

print json.dumps(collection)