/
vectorized_raster_interpolation.py
589 lines (435 loc) · 23.1 KB
/
vectorized_raster_interpolation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
"""Functions to perform Population Interpolation using a vectorized version of
a raster file. This is a generic framework that can be used to distribute
population more accurately in harmonized spatial structures.
Inspired by Reibel, Michael, and Aditya Agrawal. "Areal interpolation of population counts using pre-classified land cover data." Population Research and Policy Review 26.5-6 (2007): 619-633.
Note: This study was financed in part by the Coordenação de Aperfeiçoamento de Pessoal de Nível Superior - Brasil (CAPES) - Process number 88881.170553/2018-01
"""
__author__ = "Renan X. Cortes <renanc@ucr.edu>, Sergio J. Rey <sergio.rey@ucr.edu> and Elijah Knaap <elijah.knaap@ucr.edu>"
import json
import pandas as pd
from geopandas import GeoDataFrame
import geopandas as gpd
import numpy as np
import rasterio
from rasterio.mask import mask
import rasterstats as rs
import warnings
from tqdm.auto import tqdm
from tobler.util.util import _check_presence_of_crs
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Poisson, Gaussian, NegativeBinomial
__all__ = [
"getFeatures",
"fast_append_profile_in_gdf",
"return_weights_from_regression",
"return_weights_from_xgboost",
"create_lon_lat",
"create_non_zero_population_by_pixels_locations",
"calculate_interpolated_polygon_population_from_correspondence_table",
"calculate_interpolated_population_from_correspondence_table",
]
def getFeatures(gdf):
"""Function to parse features from GeoDataFrame in such a manner that
rasterio wants them.
Notes
-----
This function was obtained at https://automating-gis-processes.github.io/CSC18/lessons/L6/clipping-raster.html.
"""
return [json.loads(gdf.to_json())["features"][0]["geometry"]]
def fast_append_profile_in_gdf(geodataframe, raster_path, force_crs_match=True):
"""Function that appends the columns of the profile in a geopandas
according to a given raster taking advantage of rasterstats.
geodataframe : geopandas.GeoDataFrame
geodataframe that has overlay with the raster. The variables of the profile will be appended in this data.
If some polygon do not overlay the raster, consider a preprocessing step using the function subset_gdf_polygons_from_raster.
raster_path : str
the path to the associated raster image.
force_crs_match : bool, Default is True.
Wheter the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
It is recommended to let this argument as True.
Notes
-----
The generated geodataframe will input the value 0 for each Type that is not present in the raster for each polygon.
"""
_check_presence_of_crs(geodataframe)
if force_crs_match:
with rasterio.open(raster_path) as raster:
# raster =
with warnings.catch_warnings():
warnings.simplefilter("ignore")
geodataframe = geodataframe.to_crs(crs=raster.crs.data)
else:
warnings.warn(
"The GeoDataFrame is not being reprojected. The clipping might be being performing on unmatching polygon to the raster."
)
zonal_gjson = rs.zonal_stats(
geodataframe, raster_path, prefix="Type_", geojson_out=True, categorical=True
)
zonal_ppt_gdf = GeoDataFrame.from_features(zonal_gjson)
return zonal_ppt_gdf
def return_weights_from_regression(
geodataframe,
raster_path,
pop_string,
codes=[21, 22, 23, 24],
likelihood="poisson",
formula_string=None,
n_pixels_option_values=256,
force_crs_match=True,
na_value=255,
ReLU=True,
):
"""Function that returns the weights of each land type according to NLCD
types/codes.
Parameters
----------
geodataframe : geopandas.GeoDataFrame
used to build regression
raster_path : str
the path to the associated raster image.
formula_string : str
patsy-style model formula
pop_string : str
the name of the variable on geodataframe that the regression shall be conducted
codes : list
an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
likelihood : str, {'Poisson', 'Gaussian'}
the likelihood assumed for the dependent variable (population). It can be 'Poisson' or 'Gaussian'.
With the 'Poisson' a Generalized Linear Model with log as link function will be fitted and 'Gaussian' an ordinary least squares will be fitted.
n_pixels_option_values : int
number of options of the pixel values of rasterior. Default is 256.
force_crs_match : bool. Default is True.
Wheter the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
It is recommended to let this argument as True.
na_value : int. Default is 255.
The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.
ReLU : bool. Default is True.
Whether the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.
Notes
-----
1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.
"""
_check_presence_of_crs(geodataframe)
if na_value in codes:
raise ValueError("codes should not assume the na_value value.")
if not likelihood in ["poisson", "gaussian"]:
raise ValueError("likelihood must one of 'poisson', 'gaussian'")
profiled_df = fast_append_profile_in_gdf(
geodataframe[["geometry", pop_string]], raster_path, force_crs_match
) # Use only two columns to build the weights (this avoids error, if the original dataset has already types appended on it).
# If the list is unsorted, the codes will be sorted to guarantee that the position of the weights will match
codes.sort()
if not formula_string:
# Formula WITHOUT intercept
str_codes = [str(i) for i in codes]
formula_string = (
pop_string + " ~ -1 + " + " + ".join(["Type_" + s for s in str_codes])
)
if likelihood == "poisson":
results = smf.glm(formula_string, data=profiled_df, family=Poisson()).fit()
if likelihood == "gaussian":
results = smf.ols(formula_string, data=profiled_df).fit()
weights = np.zeros(n_pixels_option_values)
weights[codes] = results.params
if ReLU:
weights = np.where(weights < 0, 0, weights)
return weights
def return_weights_from_xgboost(
geodataframe,
raster_path,
pop_string,
codes=[21, 22, 23, 24],
n_pixels_option_values=256,
tuned_xgb=False,
gbm_hyperparam_grid={
"learning_rate": [0.001, 0.01, 0.1],
"n_estimators": [200],
"subsample": [0.3, 0.5],
"max_depth": [4, 5, 6],
"num_boosting_rounds": [10, 20],
},
force_crs_match=True,
na_value=255,
ReLU=True,
):
"""Function that returns the weights of each land type according to NLCD
types/codes given by Extreme Gradient Boost model (XGBoost)
Parameters
----------
geodataframe : a geopandas geoDataFrame used to build regression
raster_path : the path to the associated raster image.
pop_string : the name of the variable on geodataframe that the regression shall be conducted
codes : an integer list of codes values that should be considered as 'populated' from the National Land Cover Database (NLCD).
The description of each code can be found here: https://www.mrlc.gov/sites/default/files/metadata/landcover.html
The default is 21 (Developed, Open Space), 22 (Developed, Low Intensity), 23 (Developed, Medium Intensity) and 24 (Developed, High Intensity).
n_pixels_option_values : number of options of the pixel values of rasterior. Default is 256.
tuned_xgb : bool. Default is False.
If True the XGBoost model will be tuned making a grid search using gbm_hyperparam_grid dictionary a picking the best model in terms of mean squared error with some pre-defined number of cross-validation.
Otherwise, the XGBoost model is fitted with default values of xgboost.train function from xgboost Python library.
gbm_hyperparam_grid : a dictionary that represent the grid for the grid search of XGBoost.
force_crs_match : bool. Default is True.
Wheter the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
It is recommended to let this argument as True.
na_value : int. Default is 255.
The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.
ReLU : bool. Default is True.
Wheter the Rectified Linear Units (ReLU) transformation will be used to avoid negative weights for the land types.
Notes
-----
1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
1) The formula uses a substring called 'Type_' before the code number due to the 'append_profile_in_gdf' function.
2) The pixel value, usually, ranges from 0 to 255. That is why the default of 'n_pixels_option_values' is 256.
3) The returning weights represent the average of the Shapley's values from each feature.
"""
try:
import xgboost as xgb
import shap
except ImportError as e:
raise ImportError("xgboost and shap are required to perform this.")
_check_presence_of_crs(geodataframe)
if na_value in codes:
raise ValueError("codes should not assume the na_value value.")
profiled_df = fast_append_profile_in_gdf(
geodataframe[["geometry", pop_string]], raster_path, force_crs_match
) # Use only two columns to build the weights (this avoids error, if the original dataset has already types appended on it).
# If the list is unsorted, the codes will be sorted to guarantee that the position of the weights will match
codes.sort()
str_codes = [str(i) for i in codes]
feature_names = ["Type_" + s for s in str_codes]
y = profiled_df[pop_string]
X = profiled_df[feature_names]
if tuned_xgb == False:
# Create the DMatrix
xgb_dmatrix = xgb.DMatrix(X, y)
# Create the parameter dictionary
params = {"objective": "reg:linear"}
# Train the model
xg_reg = xgb.train(params=params, dtrain=xgb_dmatrix)
if tuned_xgb == True:
try:
from sklearn.model_selection import GridSearchCV
except ImportError as e:
raise ImportError("sklearn is required to perform this.")
gbm = xgb.XGBRegressor()
grid_mse = GridSearchCV(
estimator=gbm,
param_grid=gbm_hyperparam_grid,
scoring="neg_mean_squared_error",
cv=4, # 4-fold crossvalidation
verbose=3, # Prints the grid search profile
n_jobs=-1,
) # Process the GridSearch in parallel all cores availables
# Fit the grid to the data
grid_mse.fit(X, y)
best_params = grid_mse.best_params_
best_params["objective"] = "reg:linear"
# Create the DMatrix
xgb_dmatrix = xgb.DMatrix(X, y)
# Train the model from the best parameters of the grid search
xg_reg = xgb.train(params=best_params, dtrain=xgb_dmatrix)
# Build explainer and fit Shapley's values (https://github.com/slundberg/shap)
explainer = shap.TreeExplainer(xg_reg, feature_dependence="independent")
shap_values = explainer.shap_values(X)
weights_from_xgb = shap_values.mean(axis=0) # This is already sorted by pixel Type
weights = np.zeros(n_pixels_option_values)
weights[codes] = list(weights_from_xgb) # Convert to list a dict_values
if ReLU:
weights = np.where(weights < 0, 0, weights)
return weights
def create_lon_lat(out_img, out_transform):
"""Function that returns a tuple of longitudes and latitudes from numpy
array and Affline.
Parameters
----------
out_img : numpy array generated by the mask function (first tuple element)
out_transform : Affline transformation generated by the mask function (second tuple element)
Notes
-----
Inside the inner loop there is an important thing to consider which is that the dimensions
out_img is (1, lat, lon), whereas the Affline transformation gives (lon, lat) (or, accordingly
to their documentation is (x,y)) that is why it is swapped the indexes.
Also, note the repetition in the indexes of (j, j) and (i, i) inside the inner loop.
"""
lons = np.empty(shape=(1, out_img.shape[1], out_img.shape[2]))
lats = np.empty(shape=(1, out_img.shape[1], out_img.shape[2]))
for i in range(out_img.shape[1]):
for j in range(out_img.shape[2]):
lons[0][i][j] = (out_transform * (j, j))[0] # Only index j.
lats[0][i][j] = (out_transform * (i, i))[1] # Only indexes of longitudes!
# Return two arrays: one is the longitudes of each pixel and one is the latitudes of each pixel
return lons, lats
def create_non_zero_population_by_pixels_locations(
geodataframe, raster, pop_string, weights=None, force_crs_match=True
):
"""Function that returns the actual population of each pixel from a given
geodataframe and variable.
geodataframe : a geopandas dataframe that it is contained in the raster
raster : the raster used from rasterio
pop_string : a string of the column name of the geodataframe that the estimation will be made
weights : vector of weights in each position of the pixel values according 'return_weights_from_regression' function. This must be provided by the user.
force_crs_match : bool. Default is True.
Wheter the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
It is recommended to let this argument as True.
"""
_check_presence_of_crs(geodataframe)
if not force_crs_match:
warnings.warn(
"The polygon is not being reprojected. The clipping might be being performing on unmatching polygon to the raster."
)
else:
with rasterio.open(raster) as raster:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
geodataframe_projected = geodataframe.to_crs(crs=raster.crs.data)
result_pops_array = np.array([])
result_lons_array = np.array([])
result_lats_array = np.array([])
pbar = tqdm(
total=len(geodataframe_projected),
desc="Estimating population per pixel",
)
for line_index in range(len(geodataframe_projected)):
polygon_projected = geodataframe_projected.iloc[[line_index]]
coords = getFeatures(polygon_projected)
out_img, out_transform = mask(dataset=raster, shapes=coords, crop=True)
"""Calculating the population for each pixel"""
trans_numpy = weights[out_img] # Pixel population from regression
orig_estimate = polygon_projected[
pop_string
] # Original Population Value of The polygon
correction_term = orig_estimate / trans_numpy.sum()
final_pop_numpy_pre = trans_numpy * np.array(correction_term)
flatten_final_pop_numpy_pre = np.ndarray.flatten(final_pop_numpy_pre)
non_zero_pop_index = np.where(flatten_final_pop_numpy_pre != 0)
final_pop_numpy = flatten_final_pop_numpy_pre[non_zero_pop_index]
"""Retrieving location of each pixel"""
lons, lats = create_lon_lat(out_img, out_transform)
final_lons = np.ndarray.flatten(lons)[non_zero_pop_index]
final_lats = np.ndarray.flatten(lats)[non_zero_pop_index]
"""Append all flattens numpy arrays"""
result_pops_array = np.append(result_pops_array, final_pop_numpy)
result_lons_array = np.append(result_lons_array, final_lons)
result_lats_array = np.append(result_lats_array, final_lats)
pbar.update(1)
data = {
"pop_value": result_pops_array,
"lons": result_lons_array.round().astype(int).tolist(),
"lats": result_lats_array.round().astype(int).tolist(),
}
corresp = pd.DataFrame.from_dict(data)
pbar.close()
return corresp
def calculate_interpolated_polygon_population_from_correspondence_table(
polygon, raster, corresp_table, force_crs_match=True, na_value=255
):
"""Function that returns the interpolated population of a given polygon
according to a correspondence table previous built.
Parameters
----------
polygon : polygon for the profile (it can be a row of a geopandas)
raster : the associated raster (from rasterio.open)
corresp_table : correspondence table that has the interpolated population for each pixel. This object is created with the function 'create_non_zero_population_by_pixels_locations'.
force_crs_match : bool. Default is True.
Wheter the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
It is recommended to let this argument as True.
na_value : int. Default is 255.
The number which is considered to be 'Not a Number' (NaN) in the raster pixel values.
Notes
-----
When you clip a specific polygon, there are pixels that lie beyond the polygon extent, because the clipping is rectangular.
Therefore, the population could be wrongly summed from another spatial unit.
The solution is to build a pandas and filter the pixels different than 255. This is done during the construction of the polygon summary for the resulting population of this function.
"""
_check_presence_of_crs(polygon)
if force_crs_match:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
polygon_projected = polygon.to_crs(crs=raster.crs.data)
else:
warnings.warn(
"The polygon is not being reprojected. The clipping might be being performing on unmatching polygon to the raster."
)
coords = getFeatures(polygon_projected)
out_img, out_transform = mask(dataset=raster, shapes=coords, crop=True)
lons, lats = create_lon_lat(out_img, out_transform)
data = {
"lons": np.ndarray.flatten(lons).round().astype(int).tolist(),
"lats": np.ndarray.flatten(lats).round().astype(int).tolist(),
"pixel_value": np.ndarray.flatten(out_img),
}
polygon_summary_full = pd.DataFrame.from_dict(data)
# Remove pixels of the polygon that do not belong to the spatial unit, but might be from another one
polygon_summary = polygon_summary_full[polygon_summary_full.pixel_value != na_value]
merged_polygon = corresp_table.merge(polygon_summary, on=["lons", "lats"])
pop = merged_polygon["pop_value"].sum()
return pop
def calculate_interpolated_population_from_correspondence_table(
geodataframe, raster, corresp_table, variable_name=None, force_crs_match=True
):
"""Function that returns the interpolated population of an entire geopandas
according to a correspondence table previous built.
Parameters
----------
geodataframe : a GeoDataFrame from geopandas
raster : the associated raster (from rasterio.open)
corresp_table : correspondence table that has the interpolated population for each pixel. This object is created with the function 'create_non_zero_population_by_pixels_locations'.
force_crs_match : bool. Default is True.
Wheter the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
It is recommended to let this argument as True.
Notes
-----
This function returns the same GeoDataFrame used as input with the addition of a new variable called 'interpolated_population', which is the resulting population.
"""
_check_presence_of_crs(geodataframe)
final_geodataframe = geodataframe.copy()[["geometry"]]
pop_final = np.empty(len(geodataframe))
with rasterio.open(raster) as raster:
pbar = tqdm(total=len(geodataframe), desc="Estimating target polygon values")
for line_index in range(len(geodataframe)):
polygon = geodataframe.iloc[[line_index]]
pop_aux = calculate_interpolated_polygon_population_from_correspondence_table(
polygon, raster, corresp_table, force_crs_match
)
pop_final[line_index] = pop_aux
pbar.update(1)
pbar.close()
final_geodataframe[variable_name] = pop_final
return final_geodataframe
def subset_gdf_polygons_from_raster(geodataframe, raster, force_crs_match=True):
"""Function that returns only the polygons that actually have some
intersection with a given raster.
Parameters
----------
geodataframe : a GeoDataFrame from geopandas
raster : the associated raster (from rasterio.open)
force_crs_match : bool. Default is True.
Wheter the Coordinate Reference System (CRS) of the polygon will be reprojected to the CRS of the raster file.
It is recommended to let this argument as True.
"""
_check_presence_of_crs(geodataframe)
if force_crs_match:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
reprojected_gdf = geodataframe.to_crs(crs=raster.crs.data)
else:
warnings.warn(
"The geodataframe is not being reprojected. The clipping might be being performing on unmatching polygon to the raster."
)
# has_intersection is a boolean vector: True if the polygon has some overlay with raster, False otherwise
has_intersection = []
pbar = tqdm(total=len(reprojected_gdf), desc="Subsetting polygons")
for i in list(range(len(reprojected_gdf))):
pbar.update(1)
coords = getFeatures(reprojected_gdf.iloc[[i]])
try:
out_img = mask(dataset=raster, shapes=coords, crop=True)[0]
has_intersection.append(True)
except:
has_intersection.append(False)
pbar.close()
overlayed_subset_gdf = reprojected_gdf.iloc[has_intersection]
overlayed_subset_gdf = overlayed_subset_gdf.set_geometry("geometry")
return overlayed_subset_gdf