import os
import errno

import numpy as np
import pandas as pd
import geopandas as gpd
import requests
import shapely

from ..utils import constants, gislib

distance = gislib.getDistance
def diff_seconds(t_0, t_1):
return (t_1 - t_0).total_seconds()
def is_multi_user(data):
    return constants.UID in data.columns
def is_multi_trajectory(data):
    return constants.TID in data.columns
def to_matrix(data, columns=None):
if columns is None:
columns = [constants.LATITUDE, constants.LONGITUDE, constants.DATETIME]
columns = columns + list(set(data.columns) - set(columns))
return data[columns].values
def get_columns(data):
columns = [constants.LATITUDE, constants.LONGITUDE, constants.DATETIME]
columns = columns + list(set(data.columns) - set(columns))
return columns
def to_dataframe(data, columns):
    # Rebuild a DataFrame from a matrix, restoring the given column order
df = pd.DataFrame(data, columns=columns)
return df
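# Illustrative round trip (sketch; assumes `tdf` has the standard
# latitude/longitude/datetime columns named in constants):
#     cols = get_columns(tdf)          # canonical column order
#     matrix = to_matrix(tdf)          # numpy array, lat/lng/datetime first
#     df = to_dataframe(matrix, cols)  # back to a plain DataFrame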
def assign_crs(shape, crs):
if crs is not None:
return shape.to_crs(crs)
return shape
def to_geodataframe(df, keep=False, latitude=constants.LATITUDE, longitude=constants.LONGITUDE,
crs=constants.DEFAULT_CRS):
geometry = [shapely.geometry.Point(xy) for xy in zip(df[longitude], df[latitude])]
gdf = gpd.GeoDataFrame(df, crs=crs, geometry=geometry)
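    # The GeoDataFrame constructor above may add a 'geometry' column to the
    # input `df` itself; the drop below undoes that side effect.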
df.drop('geometry', inplace=True, axis=1)
    if not keep:
return gdf.drop([longitude, latitude], axis=1)
return gdf
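# Illustrative usage (sketch; assumes `df` has the latitude/longitude columns
# named by constants.LATITUDE and constants.LONGITUDE):
#     gdf = to_geodataframe(df, keep=True)  # keeps the original lat/lng columns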
def silentremove(filename):
try:
os.remove(filename)
    except OSError as e:
        if e.errno != errno.ENOENT:  # ENOENT: no such file or directory
            raise  # re-raise if a different error occurred
def setattrpandas(df, name, value):
    df._metadata.append(name)
    setattr(df, name, value)
    return df
def group_df_by_time(tdf, freq_str='1D', offset_value=0, offset_unit='hours', add_starting_location=False,
dtformat='%Y-%m-%d %H:%M:%S'):
"""
Split a `TrajDataFrame` into subtrajectories of fixed temporal length (`freq_str`).
:param tdf: TrajDataFrame
`TrajDataFrame` to split.
:param freq_str: str
`freq` parameter of `pd.date_range`. default: '1D' (one day).
    :param offset_value: float
        value of the time offset used to shift the start time of each subtrajectory.
    :param offset_unit: str
        time unit of the offset used to shift the start time of each subtrajectory.
:param add_starting_location: bool
if `True`, the last point of the previous subtrajectory will be appended
at the beginning of the next subtrajectory.
:param dtformat: str
datetime format.
:return: list containing the subtrajectories
"""
    df = tdf.sort_values([constants.DATETIME])
    offset = pd.Timedelta(offset_value, offset_unit)
    t_init = pd.to_datetime(df[constants.DATETIME].min().date())
    t_end = pd.to_datetime(df[constants.DATETIME].max().date()) + pd.Timedelta(days=1)
    rg = pd.date_range((t_init + offset).strftime(dtformat), end=(t_end + offset).strftime(dtformat), freq=freq_str)
    groups = []
    for st, en in zip(rg, rg[1:]):
        dfslice = df.loc[(df[constants.DATETIME] >= st.strftime(dtformat)) &
                         (df[constants.DATETIME] < en.strftime(dtformat))]
        if len(dfslice) > 0:
            groups += [dfslice.reset_index(drop=True)]
    if add_starting_location:
        for i in range(1, len(groups)):
            groups[i] = pd.concat([groups[i - 1][-1:], groups[i]]).reset_index(drop=True)
    return groups
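# Illustrative usage (sketch; assumes `tdf` is a TrajDataFrame):
#     # one sub-trajectory per day, with days starting at 04:00
#     days = group_df_by_time(tdf, freq_str='1D', offset_value=4, offset_unit='hours')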
def frequency_vector(trajectory):
freq = trajectory.groupby([constants.UID, constants.LATITUDE, constants.LONGITUDE]).size().reset_index(
name=constants.FREQUENCY)
return freq.sort_values(by=[constants.UID, constants.FREQUENCY])
def probability_vector(trajectory):
freq = trajectory.groupby([constants.UID, constants.LATITUDE, constants.LONGITUDE]).size().reset_index(
name=constants.FREQUENCY)
    prob = pd.merge(freq, trajectory.groupby(constants.UID).size().reset_index(name=constants.TOTAL_FREQ),
                    on=constants.UID)
prob[constants.PROBABILITY] = prob[constants.FREQUENCY] / prob[constants.TOTAL_FREQ]
return prob[[constants.UID, constants.LATITUDE, constants.LONGITUDE, constants.PROBABILITY]].sort_values(
by=[constants.UID, constants.PROBABILITY])
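# Illustrative usage (sketch; assumes `tdf` has the uid/lat/lng columns named
# in constants):
#     freq = frequency_vector(tdf)    # visit counts per (uid, location)
#     prob = probability_vector(tdf)  # visit counts normalised per uid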
def date_time_precision(dt, precision):
    result = ""
    precision = precision.lower()
    if precision == "year":
        result += str(dt.year)
    elif precision == "month":
        result += str(dt.year) + str(dt.month)
    elif precision == "day":
        result += str(dt.year) + str(dt.month) + str(dt.day)
    elif precision == "hour":
        result += str(dt.year) + str(dt.month) + str(dt.day) + str(dt.hour)
    elif precision == "minute":
        result += str(dt.year) + str(dt.month) + str(dt.day) + str(dt.hour) + str(dt.minute)
    elif precision == "second":
        result += str(dt.year) + str(dt.month) + str(dt.day) + str(dt.hour) + str(dt.minute) + str(dt.second)
    return result
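# Illustrative usage (sketch):
#     from datetime import datetime
#     date_time_precision(datetime(2020, 5, 3, 14, 7, 9), "day")  # -> '202053'
# The components are concatenated without zero-padding, so the result is a
# grouping key rather than a parseable timestamp.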
def bbox_from_points(points, crs=None):
try:
coords = points.total_bounds
except AttributeError:
coords = points
base = shapely.geometry.box(coords[0], coords[1], coords[2], coords[3], ccw=True)
base = gpd.GeoDataFrame(geometry=[base], crs=constants.DEFAULT_CRS)
if crs is None:
return base
return base.to_crs(crs)
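# Illustrative usage (sketch; `points` can be a GeoDataFrame, or a plain
# (min_lng, min_lat, max_lng, max_lat) sequence):
#     bbox = bbox_from_points([11.1, 43.7, 11.3, 43.9])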
def bbox_from_area(area, bbox_side_len=500, crs=None):
    centroid = area.iloc[0].geometry.centroid
    # `bbox_side_len` is expressed in the units of the area's coordinates
    # get the North-East corner
    ne = [centroid.x + bbox_side_len / 2, centroid.y + bbox_side_len / 2]
    # get the South-West corner
    sw = [centroid.x - bbox_side_len / 2, centroid.y - bbox_side_len / 2]
    # build the bbox from the SW and NE corners
    bbox = shapely.geometry.box(sw[0], sw[1], ne[0], ne[1], ccw=True)
    base = gpd.GeoDataFrame(geometry=[bbox], crs=constants.DEFAULT_CRS)
    if crs is None:
        return base
    return base.to_crs(crs)
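# Illustrative usage (sketch; assumes `area` is a one-row GeoDataFrame and
# that `bbox_side_len` is given in the units of its coordinates):
#     bbox = bbox_from_area(area, bbox_side_len=0.01)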
def bbox_from_name(query, which_osm_result=0, crs=None):
"""
Create a GeoDataFrame from a single place name query.
(adapted from https://github.com/gboeing/osmnx)
Parameters
----------
query : string or dict
query string or structured query dict to geocode/download
    which_osm_result : int
        index of the result to return (`which_osm_result=-1` to return all results)
    crs : dict or str
        coordinate reference system to convert the result to
    Returns
    -------
    GeoDataFrame
    Example
    -------
    gdf = bbox_from_name("Florence, Italy")
    """
    nominatim_url = "https://nominatim.openstreetmap.org/search.php"
    response = requests.get(nominatim_url, params={'q': query, 'polygon_geojson': 1, 'format': 'json'})
    data = response.json()
if len(data) > 0:
features = []
for result in data:
# extract data elements from the JSON response
bbox_south, bbox_north, bbox_west, bbox_east = [float(x) for x in result['boundingbox']]
coords = result['geojson']['coordinates']
            try:
                # it is a MultiPolygon
                geometry = shapely.geometry.MultiPolygon(
                    [shapely.geometry.Polygon(c[0], [inner for inner in c[1:]]) for c in coords])
            except TypeError:
                try:
                    # it is a Polygon
                    geometry = shapely.geometry.MultiPolygon(
                        [shapely.geometry.Polygon(coords[0], [inner for inner in coords[1:]])])
                except TypeError:
                    # it is some other geometry type; keep the raw GeoJSON mapping
                    geometry = result['geojson']
place = result['display_name']
features += [{'type': 'Feature',
'geometry': geometry,
'properties': {'place_name': place,
'bbox_north': bbox_north,
'bbox_south': bbox_south,
'bbox_east': bbox_east,
'bbox_west': bbox_west}}]
gdf = gpd.GeoDataFrame.from_features(features)
if crs is None:
gdf.crs = constants.DEFAULT_CRS
else:
gdf = gdf.to_crs(crs)
if which_osm_result >= 0:
gdf = gdf.loc[[which_osm_result]]
else:
gdf = gpd.GeoDataFrame()
return gdf
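# Illustrative usage (sketch; issues a live query to the Nominatim API):
#     florence = bbox_from_name("Florence, Italy")
#     all_results = bbox_from_name("Florence", which_osm_result=-1)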
def nearest(origin, tessellation, col):
"""
    Brute-force search that finds, for each point in a GeoDataFrame, the nearest point in another GeoDataFrame.
    It returns a pandas Series containing the value of `col` for each matching row.
:param origin: GeoDataFrame
:param tessellation: GeoDataFrame
:param col: column containing the value to return from the tessellation
:return: Series
"""
def _nearest(df, points):
near = float("+inf")
point = None
main = (df['geometry'].y, df['geometry'].x)
for index, row in points.iterrows():
p = row['geometry']
d = distance(main, (p.y, p.x))
if d < near:
near = d
point = index
return point
    return tessellation.loc[origin.apply(_nearest, args=(tessellation,), axis=1)][col]
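# Illustrative usage (sketch; `points_gdf` and `tess` are hypothetical
# GeoDataFrames of point geometries, `tess` having a 'tile_ID' column):
#     tile_ids = nearest(points_gdf, tess, 'tile_ID')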
def get_geom_centroid(geom, return_lat_lng=False):
"""
    Compute the centroid of a Polygon or MultiPolygon.
    :param geom: shapely Polygon or MultiPolygon
        geometry whose centroid will be computed.
:param return_lat_lng: bool
if `True`, the first coordinate in the returned list is the centroid's latitude, otherwise it is the longitude.
:return: list
coordinates of `geom`'s centroid.
"""
    lng, lat = geom.centroid.x, geom.centroid.y
if return_lat_lng:
return [lat, lng]
else:
return [lng, lat]
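# Illustrative usage (runnable sketch):
#     square = shapely.geometry.box(0.0, 0.0, 2.0, 2.0)
#     get_geom_centroid(square)                       # -> [1.0, 1.0] (lng, lat)
#     get_geom_centroid(square, return_lat_lng=True)  # -> [1.0, 1.0] (lat, lng)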