wtte.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import warnings
import numpy as np
from keras import backend as K
from keras.callbacks import Callback


def _keras_unstack_hack(ab):
    """Implements tf.unstack(y_true_keras, num=2, axis=-1).

    Keras-hack adopted to be compatible with the Theano backend.

    :param ab: stacked variables
    :return a, b: unstacked variables
    """
    ndim = len(K.int_shape(ab))
    if ndim == 0:
        raise ValueError('can not unstack with ndim=0')

    a = ab[..., 0]
    b = ab[..., 1]

    return a, b

def output_lambda(x, init_alpha=1.0, max_beta_value=5.0,
                  alpha_kernel_scalefactor=None):
    """Elementwise (Lambda) computation of alpha and regularized beta.

    - Alpha:

        (activation)
        Exponential units seem to give faster training than the original
        paper's softplus units. This makes sense due to the logarithmic
        effect of changes in alpha.

        (initialization)
        To get faster training and fewer exploding gradients,
        initialize alpha to be around its scale when beta is around 1.0,
        approximately the expected value/mean of the training tte.
        Because we're lazy we want the correct scale of output built
        into the model, so we initialize implicitly:
        multiply the assumed exp(0)=1 by the scale factor `init_alpha`.

    - Beta:

        (activation)
        We want slow changes when beta -> 0, so softplus made sense in the
        original paper, but we get a similar effect with sigmoid. It also
        has nice features.

        (regularization) Use max_beta_value to implicitly regularize the model.

        (initialization) Fixed to begin moving slowly around 1.0.

    - Usage
        .. code-block:: python

            model.add(TimeDistributed(Dense(2)))
            model.add(Lambda(wtte.output_lambda,
                             arguments={"init_alpha": init_alpha,
                                        "max_beta_value": 2.0}))

    :param x: tensor with last dimension having length 2
        with x[..., 0] = alpha, x[..., 1] = beta
    :param init_alpha: initial value of `alpha`. Default value is 1.0.
    :param max_beta_value: maximum beta value. Default value is 5.0.
    :param alpha_kernel_scalefactor: scale factor applied to the alpha input
        before exponentiation. Default is `None` (no scaling).
    :type x: Array
    :type init_alpha: Float
    :type max_beta_value: Float
    :type alpha_kernel_scalefactor: Float
    :return x: A positive `Tensor` of same shape as input
    :rtype: Array
    """
    if max_beta_value is None or max_beta_value > 3:
        if K.epsilon() >= 1e-07 and K.backend() == 'tensorflow':
            message = (
                "Using tensorflow backend and allowing high `max_beta_value` may lead to\n"
                "gradient NaN during training unless `K.epsilon()` is small.\n"
                "Call `keras.backend.set_epsilon(1e-08)` to lower epsilon."
            )
            warnings.warn(message)

    a, b = _keras_unstack_hack(x)

    # Implicitly initialize alpha:
    if alpha_kernel_scalefactor is None:
        a = init_alpha * K.exp(a)
    else:
        a = init_alpha * K.exp(alpha_kernel_scalefactor * a)

    m = max_beta_value
    if m > 1.05:  # some value >> 1.0
        # shift to start around 1.0
        # assuming input is around 0.0
        _shift = np.log(m - 1.0)

        b = K.sigmoid(b - _shift)
    else:
        b = K.sigmoid(b)

    # Clipped sigmoid: has zero gradient at 0 and 1.
    # Reduces the small tendency of instability after long training
    # by zeroing the gradient.
    b = m * K.clip(x=b, min_value=K.epsilon(), max_value=1. - K.epsilon())

    x = K.stack([a, b], axis=-1)

    return x
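

# For intuition, the sketch below re-implements the elementwise transform of
# `output_lambda` in plain NumPy. It is illustrative only and not used by the
# library; it assumes `max_beta_value > 1.05` and no `alpha_kernel_scalefactor`.
def _output_lambda_numpy_sketch(x, init_alpha=1.0, max_beta_value=5.0, epsilon=1e-07):
    a, b = x[..., 0], x[..., 1]
    # alpha: positive, implicitly initialized around `init_alpha` since exp(0) = 1
    a = init_alpha * np.exp(a)
    # beta: shifted, clipped sigmoid scaled to (0, max_beta_value), ~1.0 at input 0
    b = 1.0 / (1.0 + np.exp(-(b - np.log(max_beta_value - 1.0))))
    b = max_beta_value * np.clip(b, epsilon, 1.0 - epsilon)
    return np.stack([a, b], axis=-1)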


class output_activation(object):
    """Elementwise computation of alpha and regularized beta.

    Object-oriented wrapper of `output_lambda` using keras.layers.Activation.

    - Usage
        .. code-block:: python

            wtte_activation = wtte.output_activation(init_alpha=1.,
                                                      max_beta_value=4.0).activation

            model.add(Dense(2))
            model.add(Activation(wtte_activation))
    """

    def __init__(self, init_alpha=1.0, max_beta_value=5.0):
        self.init_alpha = init_alpha
        self.max_beta_value = max_beta_value

    def activation(self, ab):
        """(Internal function) Activation wrapper.

        :param ab: original tensor with alpha and beta.
        :return ab: return of `output_lambda` with `init_alpha` and `max_beta_value`.
        """
        ab = output_lambda(ab, init_alpha=self.init_alpha,
                           max_beta_value=self.max_beta_value)

        return ab


class loss(object):
    """Creates a keras WTTE-loss function.

    If `regularize` is set, a penalty is added that creates a 'wall' which beta
    is discouraged from passing. This is not necessary with the sigmoid-beta
    activation.

    - Usage
        :Example:

        .. code-block:: python

            loss = wtte.loss(kind='discrete').loss_function
            model.compile(loss=loss, optimizer=RMSprop(lr=0.01))

            # And with masking:
            loss = wtte.loss(kind='discrete', reduce_loss=False).loss_function
            model.compile(loss=loss, optimizer=RMSprop(lr=0.01),
                          sample_weight_mode='temporal')

    .. note::
        With masking, keras needs to access each loss contribution individually.
        Therefore we do not sum/reduce down to a scalar (dim 1); instead we
        return a tensor (with reduce_loss=False).
    """

    def __init__(self,
                 kind,
                 reduce_loss=True,
                 regularize=False,
                 location=10.0,
                 growth=20.0):

        self.kind = kind
        self.reduce_loss = reduce_loss

        self.regularize = regularize
        if regularize:
            self.location = location
            self.growth = growth

    def loss_function(self, y_true, y_pred):

        def keras_split(y_true, y_pred):
            """
            Everything is a hack around the y_true, y_pred paradigm.
            """
            y, u = _keras_unstack_hack(y_true)
            a, b = _keras_unstack_hack(y_pred)

            return y, u, a, b

        def loglik_discrete(y, u, a, b, epsilon=1e-35):
            # Discrete Weibull log-likelihood:
            #   loglik = u * log(exp(H(y + 1) - H(y)) - 1) - H(y + 1)
            # with cumulative hazard H(t) = (t / a)^b.
            hazard0 = K.pow((y + epsilon) / a, b)
            hazard1 = K.pow((y + 1.0) / a, b)

            loglikelihoods = u * \
                K.log(K.exp(hazard1 - hazard0) - 1.0) - hazard1
            return loglikelihoods

        def loglik_continuous(y, u, a, b, epsilon=1e-35):
            # Continuous Weibull log-likelihood (dropping terms constant in a, b):
            #   loglik = u * (log(b) + b * log(y / a)) - (y / a)^b
            ya = (y + epsilon) / a
            loglikelihoods = u * (K.log(b) + b * K.log(ya)) - K.pow(ya, b)
            return loglikelihoods

        def loglik_continuous_conditional_correction(y, u, a, b, epsilon=1e-35):
            """Integrated conditional excess loss.
            Explanation TODO
            """
            ya = (y + epsilon) / a
            loglikelihoods = y * \
                (u * (K.log(b) + b * K.log(ya)) - (b / (b + 1.)) * K.pow(ya, b))
            return loglikelihoods

        def penalty_term(b, location, growth):
            scale = growth / location
            penalty = K.exp(scale * (b - location))
            return penalty

        def accumulate_loss(loglikelihoods):
            loss = -1.0 * K.mean(loglikelihoods, axis=-1)
            return loss

        y, u, a, b = keras_split(y_true, y_pred)

        if self.kind == 'discrete':
            loglikelihoods = loglik_discrete(y, u, a, b)
        elif self.kind == 'continuous':
            loglikelihoods = loglik_continuous(y, u, a, b)
        else:
            raise ValueError("Unknown loss kind: " + str(self.kind) +
                             " (expected 'discrete' or 'continuous')")

        if self.regularize:
            loglikelihoods = loglikelihoods + \
                penalty_term(b, self.location, self.growth)

        if self.reduce_loss:
            loss = accumulate_loss(loglikelihoods)
        else:
            loss = -loglikelihoods

        return loss
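

# A minimal sketch (illustrative only) of how targets are expected to be packed
# for the loss above: the last dimension of y_true stacks the time-to-event and
# the event indicator u (1 = event observed, 0 = censored), mirroring how
# `keras_split` unstacks it. The names `tte` and `u` are placeholders.
def _pack_y_true_sketch(tte, u):
    return np.stack([tte, u], axis=-1)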


class WeightWatcher(Callback):
    """Keras Callback to keep an eye on output layer weights.
    (under development)

    - Usage
        .. code-block:: python

            weightwatcher = WeightWatcher(per_batch=True, per_epoch=False)
            model.fit(..., callbacks=[weightwatcher])
            weightwatcher.plot()
    """

    def __init__(self,
                 per_batch=False,
                 per_epoch=True
                 ):
        self.per_batch = per_batch
        self.per_epoch = per_epoch

    def on_train_begin(self, logs={}):
        self.a_weights_mean = []
        self.b_weights_mean = []
        self.a_weights_min = []
        self.b_weights_min = []
        self.a_weights_max = []
        self.b_weights_max = []
        self.a_bias = []
        self.b_bias = []

    def append_metrics(self):
        # Last two weight layers in the model
        output_weights, output_biases = self.model.get_weights()[-2:]

        a_weights_mean, b_weights_mean = output_weights.mean(0)
        a_weights_min, b_weights_min = output_weights.min(0)
        a_weights_max, b_weights_max = output_weights.max(0)
        a_bias, b_bias = output_biases

        self.a_weights_mean.append(a_weights_mean)
        self.b_weights_mean.append(b_weights_mean)
        self.a_weights_min.append(a_weights_min)
        self.b_weights_min.append(b_weights_min)
        self.a_weights_max.append(a_weights_max)
        self.b_weights_max.append(b_weights_max)
        self.a_bias.append(a_bias)
        self.b_bias.append(b_bias)

    def on_train_end(self, logs={}):
        if self.per_epoch:
            self.append_metrics()
        return

    def on_epoch_begin(self, epoch, logs={}):
        if self.per_epoch:
            self.append_metrics()
        return

    def on_epoch_end(self, epoch, logs={}):
        return

    def on_batch_begin(self, batch, logs={}):
        if self.per_batch:
            self.append_metrics()
        return

    def on_batch_end(self, batch, logs={}):
        if self.per_batch:
            self.append_metrics()
        return

    def plot(self):
        import matplotlib.pyplot as plt

        def color_y_axis(ax, color):
            """Color your axes."""
            for t in ax.get_yticklabels():
                t.set_color(color)
            return None

        # Biases of the output layer
        fig, ax1 = plt.subplots()
        ax2 = ax1.twinx()

        ax1.plot(self.a_bias, color='b')
        ax1.set_xlabel('step')
        ax1.set_ylabel('alpha')

        ax2.plot(self.b_bias, color='r')
        ax2.set_ylabel('beta')

        plt.title('biases')
        color_y_axis(ax1, 'b')
        color_y_axis(ax2, 'r')
        plt.show()

        ###############
        # Weights of the output layer (min, mean, max over incoming units)
        fig, ax1 = plt.subplots()
        ax2 = ax1.twinx()

        ax1.plot(self.a_weights_min, color='blue',
                 linestyle='dotted', label='min', linewidth=2)
        ax1.plot(self.a_weights_mean, color='blue',
                 linestyle='solid', label='mean', linewidth=1)
        ax1.plot(self.a_weights_max, color='blue',
                 linestyle='dotted', label='max', linewidth=2)
        ax1.set_xlabel('step')
        ax1.set_ylabel('alpha')

        ax2.plot(self.b_weights_min, color='red',
                 linestyle='dotted', linewidth=2)
        ax2.plot(self.b_weights_mean, color='red',
                 linestyle='solid', linewidth=1)
        ax2.plot(self.b_weights_max, color='red',
                 linestyle='dotted', linewidth=2)
        ax2.set_ylabel('beta')

        plt.title('weights (min,mean,max)')
        color_y_axis(ax1, 'b')
        color_y_axis(ax2, 'r')
        plt.show()
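

# End-to-end sketch tying the pieces above together (illustrative only; the
# GRU layer, its size, and `init_alpha` are placeholder choices, not part of
# this module).
def _example_model_sketch(n_features, init_alpha=1.0):
    from keras.models import Sequential
    from keras.layers import Dense, GRU, Lambda, TimeDistributed
    from keras.optimizers import RMSprop

    model = Sequential()
    # Any recurrent body works; GRU(5) is just a placeholder.
    model.add(GRU(5, input_shape=(None, n_features), return_sequences=True))
    # 2 outputs per timestep, squashed into (alpha, beta) by output_lambda.
    model.add(TimeDistributed(Dense(2)))
    model.add(Lambda(output_lambda, arguments={'init_alpha': init_alpha,
                                               'max_beta_value': 4.0}))
    # Discrete WTTE loss; use reduce_loss=False + sample_weight_mode='temporal'
    # if masking is needed (see the `loss` docstring above).
    model.compile(loss=loss(kind='discrete').loss_function,
                  optimizer=RMSprop(lr=0.01))
    return model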