-
Notifications
You must be signed in to change notification settings - Fork 122
/
sel_bw.py
executable file
·442 lines (394 loc) · 18.4 KB
/
sel_bw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
# GWR Bandwidth selection class
#x_glob parameter does not yet do anything; it is for semiparametric
__author__ = "Taylor Oshan Tayoshan@gmail.com"
import spreg.user_output as USER
import warnings
import numpy as np
from scipy.spatial.distance import pdist
from scipy.optimize import minimize_scalar
from spglm.family import Gaussian, Poisson, Binomial
from .kernels import Kernel,local_cdist
from .gwr import GWR
from .search import golden_section, equal_interval, multi_bw
from .diagnostics import get_AICc, get_AIC, get_BIC, get_CV
getDiag = {'AICc': get_AICc, 'AIC': get_AIC, 'BIC': get_BIC, 'CV': get_CV}
class Sel_BW(object):
"""
Select bandwidth for kernel
Methods: p211 - p213, bandwidth selection
:cite:`fotheringham_geographically_2002`: Fotheringham, A. S., Brunsdon, C., & Charlton, M. (2002).
Geographically weighted regression: the analysis of spatially varying relationships.
Parameters
----------
y : array
n*1, dependent variable.
X_glob : array
n*k1, fixed independent variable.
X_loc : array
n*k2, local independent variable, including constant.
coords : list of tuples
(x,y) of points used in bandwidth selection
family : family object/instance, optional
underlying probability model: Gaussian(), Poisson(),
Binomial(). Default is Gaussian().
offset : array
n*1, the offset variable at the ith location. For Poisson model
this term is often the size of the population at risk or
the expected size of the outcome in spatial epidemiology
Default is None where Ni becomes 1.0 for all locations
kernel : string, optional
kernel function: 'gaussian', 'bisquare', 'exponential'.
Default is 'bisquare'.
fixed : boolean
True for fixed bandwidth and False for adaptive (NN)
multi : True for multiple (covaraite-specific) bandwidths
False for a traditional (same for all covariates)
bandwdith; defualt is False.
constant : boolean
True to include intercept (default) in model and False to exclude
intercept.
spherical : boolean
True for shperical coordinates (long-lat),
False for projected coordinates (defalut).
n_jobs : integer
The number of jobs (default -1) to run in parallel. -1 means using all processors.
Attributes
----------
y : array
n*1, dependent variable.
X_glob : array
n*k1, fixed independent variable.
X_loc : array
n*k2, local independent variable, including constant.
coords : list of tuples
(x,y) of points used in bandwidth selection
family : string
GWR model type: 'Gaussian', 'logistic, 'Poisson''
kernel : string
type of kernel used and wether fixed or adaptive
fixed : boolean
True for fixed bandwidth and False for adaptive (NN)
criterion : string
bw selection criterion: 'AICc', 'AIC', 'BIC', 'CV'
search_method : string
bw search method: 'golden', 'interval'
bw_min : float
min value used in bandwidth search
bw_max : float
max value used in bandwidth search
interval : float
interval increment used in interval search
tol : float
tolerance used to determine convergence
max_iter : integer
max interations if no convergence to tol
multi : True for multiple (covaraite-specific) bandwidths
False for a traditional (same for all covariates)
bandwdith; defualt is False.
constant : boolean
True to include intercept (default) in model and False to exclude
intercept.
offset : array
n*1, the offset variable at the ith location. For Poisson model
this term is often the size of the population at risk or
the expected size of the outcome in spatial epidemiology
Default is None where Ni becomes 1.0 for all locations
spherical : boolean
True for shperical coordinates (long-lat),
False for projected coordinates (defalut).
search_params : dict
stores search arguments
int_score : boolan
True if adaptive bandwidth is being used and bandwdith
selection should be discrete. False
if fixed bandwidth is being used and bandwidth does not have
to be discrete.
bw : scalar or array-like
Derived optimal bandwidth(s). Will be a scalar for GWR
(multi=False) and a list of scalars for MGWR (multi=True)
with one bandwidth for each covariate.
S : array
n*n, hat matrix derived from the iterative backfitting
algorthim for MGWR during bandwidth selection
R : array
n*n*k, partial hat matrices derived from the iterative
backfitting algoruthm for MGWR during bandwidth selection.
There is one n*n matrix for each of the k covariates.
params : array
n*k, calibrated parameter estimates for MGWR based on the
iterative backfitting algorithm - computed and saved here to
avoid having to do it again in the MGWR object.
Examples
--------
>>> import libpysal as ps
>>> from mgwr.sel_bw import Sel_BW
>>> data = ps.io.open(ps.examples.get_path('GData_utm.csv'))
>>> coords = list(zip(data.by_col('X'), data.by_col('Y')))
>>> y = np.array(data.by_col('PctBach')).reshape((-1,1))
>>> rural = np.array(data.by_col('PctRural')).reshape((-1,1))
>>> pov = np.array(data.by_col('PctPov')).reshape((-1,1))
>>> african_amer = np.array(data.by_col('PctBlack')).reshape((-1,1))
>>> X = np.hstack([rural, pov, african_amer])
Golden section search AICc - adaptive bisquare
>>> bw = Sel_BW(coords, y, X).search(criterion='AICc')
>>> print(bw)
93.0
Golden section search AIC - adaptive Gaussian
>>> bw = Sel_BW(coords, y, X, kernel='gaussian').search(criterion='AIC')
>>> print(bw)
50.0
Golden section search BIC - adaptive Gaussian
>>> bw = Sel_BW(coords, y, X, kernel='gaussian').search(criterion='BIC')
>>> print(bw)
62.0
Golden section search CV - adaptive Gaussian
>>> bw = Sel_BW(coords, y, X, kernel='gaussian').search(criterion='CV')
>>> print(bw)
68.0
Interval AICc - fixed bisquare
>>> sel = Sel_BW(coords, y, X, fixed=True)
>>> bw = sel.search(search_method='interval', bw_min=211001.0, bw_max=211035.0, interval=2)
>>> print(bw)
211025.0
"""
def __init__(self, coords, y, X_loc, X_glob=None, family=Gaussian(),
offset=None, kernel='bisquare', fixed=False, multi=False,
constant=True, spherical=False,n_jobs=-1):
self.coords = np.array(coords)
self.y = y
self.X_loc = X_loc
if X_glob is not None:
self.X_glob = X_glob
else:
self.X_glob = []
self.family = family
self.fixed = fixed
self.kernel = kernel
if offset is None:
self.offset = np.ones((len(y), 1))
else:
self.offset = offset * 1.0
self.multi = multi
self._functions = []
self.constant = constant
self.spherical = spherical
self.n_jobs = n_jobs
self.search_params = {}
def search(self, search_method='golden_section', criterion='AICc',
bw_min=None, bw_max=None, interval=0.0, tol=1.0e-6,
max_iter=200, init_multi=None, tol_multi=1.0e-5,
rss_score=False, max_iter_multi=200, multi_bw_min=[None],
multi_bw_max=[None
], bws_same_times=5, verbose=False,pool=None):
"""
Method to select one unique bandwidth for a gwr model or a
bandwidth vector for a mgwr model.
Parameters
----------
criterion : string
bw selection criterion: 'AICc', 'AIC', 'BIC', 'CV'
search_method : string
bw search method: 'golden', 'interval'
bw_min : float
min value used in bandwidth search
bw_max : float
max value used in bandwidth search
multi_bw_min : list
min values used for each covariate in mgwr bandwidth search.
Must be either a single value or have one value for
each covariate including the intercept
multi_bw_max : list
max values used for each covariate in mgwr bandwidth
search. Must be either a single value or have one value
for each covariate including the intercept
interval : float
interval increment used in interval search
tol : float
tolerance used to determine convergence
max_iter : integer
max iterations if no convergence to tol
init_multi : float
None (default) to initialize MGWR with a bandwidth
derived from GWR. Otherwise this option will choose the
bandwidth to initialize MGWR with.
tol_multi : convergence tolerence for the multiple bandwidth
backfitting algorithm; a larger tolerance may stop the
algorith faster though it may result in a less optimal
model
max_iter_multi : max iterations if no convergence to tol for multiple
bandwidth backfitting algorithm
rss_score : True to use the residual sum of sqaures to evaluate
each iteration of the multiple bandwidth backfitting
routine and False to use a smooth function; default is
False
bws_same_times : If bandwidths keep the same between iterations for
bws_same_times (default 5) in backfitting, then use the
current set of bandwidths as final bandwidths.
verbose : Boolean
If true, bandwidth searching history is printed out; default is False.
pool : None, deprecated and not used.
Returns
-------
bw : scalar or array
optimal bandwidth value or values; returns scalar for
multi=False and array for multi=True; ordering of bandwidths
matches the ordering of the covariates (columns) of the
designs matrix, X
"""
k = self.X_loc.shape[1]
if self.constant: #k is the number of covariates
k += 1
self.search_method = search_method
self.criterion = criterion
self.bw_min = bw_min
self.bw_max = bw_max
self.bws_same_times = bws_same_times
self.verbose = verbose
if len(multi_bw_min) == k:
self.multi_bw_min = multi_bw_min
elif len(multi_bw_min) == 1:
self.multi_bw_min = multi_bw_min * k
else:
raise AttributeError(
"multi_bw_min must be either a list containing"
" a single entry or a list containing an entry for each of k"
" covariates including the intercept")
if len(multi_bw_max) == k:
self.multi_bw_max = multi_bw_max
elif len(multi_bw_max) == 1:
self.multi_bw_max = multi_bw_max * k
else:
raise AttributeError(
"multi_bw_max must be either a list containing"
" a single entry or a list containing an entry for each of k"
" covariates including the intercept")
if pool:
warnings.warn("The pool parameter is no longer used and will have no effect; parallelization is default and implemented using joblib instead.", RuntimeWarning, stacklevel=2)
self.interval = interval
self.tol = tol
self.max_iter = max_iter
self.init_multi = init_multi
self.tol_multi = tol_multi
self.rss_score = rss_score
self.max_iter_multi = max_iter_multi
self.search_params['search_method'] = search_method
self.search_params['criterion'] = criterion
self.search_params['bw_min'] = bw_min
self.search_params['bw_max'] = bw_max
self.search_params['interval'] = interval
self.search_params['tol'] = tol
self.search_params['max_iter'] = max_iter
#self._check_min_max()
self.int_score = not self.fixed
if self.multi:
self._mbw()
self.params = self.bw[3] #params n by k
self.sel_hist = self.bw[-2] #bw searching history
self.bw_init = self.bw[
-1] #scalar, optimal bw from initial gwr model
else:
self._bw()
self.sel_hist = self.bw[-1]
return self.bw[0]
def _bw(self):
gwr_func = lambda bw: getDiag[self.criterion](GWR(
self.coords, self.y, self.X_loc, bw, family=self.family, kernel=
self.kernel, fixed=self.fixed, constant=self.constant, offset=self.
offset, spherical=self.spherical, n_jobs=self.n_jobs).fit(lite=True))
self._optimized_function = gwr_func
if self.search_method == 'golden_section':
a, c = self._init_section(self.X_glob, self.X_loc, self.coords,
self.constant)
delta = 0.38197 #1 - (np.sqrt(5.0)-1.0)/2.0
self.bw = golden_section(a, c, delta, gwr_func, self.tol,
self.max_iter, self.bw_max, self.int_score,
self.verbose)
elif self.search_method == 'interval':
self.bw = equal_interval(self.bw_min, self.bw_max, self.interval,
gwr_func, self.int_score, self.verbose)
elif self.search_method == 'scipy':
self.bw_min, self.bw_max = self._init_section(
self.X_glob, self.X_loc, self.coords, self.constant)
if self.bw_min == self.bw_max:
raise Exception(
'Maximum bandwidth and minimum bandwidth must be distinct for scipy optimizer.'
)
self._optimize_result = minimize_scalar(
gwr_func, bounds=(self.bw_min, self.bw_max), method='bounded')
self.bw = [self._optimize_result.x, self._optimize_result.fun, []]
else:
raise TypeError('Unsupported computational search method ',
self.search_method)
def _mbw(self):
y = self.y
if self.constant:
X,keep_x,warn = USER.check_constant(self.X_loc)
else:
X = self.X_loc
n, k = X.shape
family = self.family
offset = self.offset
kernel = self.kernel
fixed = self.fixed
spherical = self.spherical
coords = self.coords
search_method = self.search_method
criterion = self.criterion
bw_min = self.bw_min
bw_max = self.bw_max
multi_bw_min = self.multi_bw_min
multi_bw_max = self.multi_bw_max
interval = self.interval
tol = self.tol
max_iter = self.max_iter
bws_same_times = self.bws_same_times
def gwr_func(y, X, bw):
return GWR(coords, y, X, bw, family=family, kernel=kernel,
fixed=fixed, offset=offset, constant=False,
spherical=self.spherical, hat_matrix=False,n_jobs=self.n_jobs).fit(
lite=True)
def bw_func(y, X):
selector = Sel_BW(coords, y, X, X_glob=[], family=family,
kernel=kernel, fixed=fixed, offset=offset,
constant=False, spherical=self.spherical,n_jobs=self.n_jobs)
return selector
def sel_func(bw_func, bw_min=None, bw_max=None):
return bw_func.search(
search_method=search_method, criterion=criterion,
bw_min=bw_min, bw_max=bw_max, interval=interval, tol=tol,
max_iter=max_iter, verbose=False)
self.bw = multi_bw(self.init_multi, y, X, n, k, family, self.tol_multi,
self.max_iter_multi, self.rss_score, gwr_func,
bw_func, sel_func, multi_bw_min, multi_bw_max,
bws_same_times, verbose=self.verbose)
def _init_section(self, X_glob, X_loc, coords, constant):
if len(X_glob) > 0:
n_glob = X_glob.shape[1]
else:
n_glob = 0
if len(X_loc) > 0:
n_loc = X_loc.shape[1]
else:
n_loc = 0
if constant:
n_vars = n_glob + n_loc + 1
else:
n_vars = n_glob + n_loc
n = np.array(coords).shape[0]
if self.int_score:
a = 40 + 2 * n_vars
c = n
else:
min_dist = np.min(np.array([np.min(np.delete(
local_cdist(coords[i],coords,spherical=self.spherical),i))
for i in range(n)]))
max_dist = np.max(np.array([np.max(
local_cdist(coords[i],coords,spherical=self.spherical))
for i in range(n)]))
a = min_dist / 2.0
c = max_dist * 2.0
if self.bw_min is not None:
a = self.bw_min
if self.bw_max is not None and self.bw_max is not np.inf:
c = self.bw_max
return a, c