"""
# Deep Q Network
# Battery Model
created by: Qiong
09/28/2021
"""
import numpy as np
import matplotlib.pyplot as plt
# import tensorflow as tf
import tensorflow.compat.v1 as tf
def set_seed(seed):
np.random.seed(seed)
tf.set_random_seed(seed)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.optimizers import Adam
# Keras-based Deep Q-Network model (rewritten in graph form inside DQNPrioritizedReplay)
class DQNNet():
    def __init__(self, state_size, action_size, learning_rate):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.model = self.create_model()

    def create_model(self):
        # state_size = (3, )
        inputs = Input(shape=self.state_size)
        x = Dense(50, activation="relu", kernel_initializer=glorot_uniform(seed=42))(inputs)
        x = Dense(200, activation="relu", kernel_initializer=glorot_uniform(seed=42))(x)
        output = Dense(
            self.action_size,
            activation="linear",
            kernel_initializer=glorot_uniform(seed=42),
        )(x)
        model = Model(inputs=[inputs], outputs=[output])
        # `learning_rate` is the current Keras keyword; the old `lr` alias is deprecated
        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
        model.summary()
        return model
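
# A minimal usage sketch for DQNNet (illustrative only; the sizes below are
# assumptions matching the comments above, not values fixed by this file):
#
#   net = DQNNet(state_size=(3,), action_size=10, learning_rate=0.005)
#   q_values = net.model.predict(np.zeros((1, 3)))   # -> shape (1, 10)
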
# Memory model
# A tree-based array containing the priority of each experience, for fast sampling
class SumTree():
    """
    __init__ - create a data array storing experiences and a tree-based array storing priorities
    add      - store a new experience in the data array and update the tree with its priority
    update   - set a leaf's priority and propagate the change up through the tree
    get_leaf - find the leaf node corresponding to a given priority value
    """
    data_pointer = 0

    def __init__(self, capacity):
        """
        capacity - number of leaf nodes (experiences) holding priority values
        data     - array of experiences (pointers to Python objects), one per transition
        tree     - tree-shaped array containing the priority of each experience

        tree index:
                 0        -> storing priority sum
                / \
               1   2
              / \ / \
             3  4 5  6   -> storing priority for transitions

        Array layout: [0, 1, 2, 3, 4, 5, 6]
        """
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)

    def add(self, priority, data):
        # Start from the first leaf node of the bottom layer
        tree_index = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data  # update data frame
        self.update(tree_index, priority)    # update priority
        # Overwrite the oldest entries once memory capacity is exceeded
        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0

    def update(self, tree_index, priority):
        # change = new priority score - former priority score
        change = priority - self.tree[tree_index]
        self.tree[tree_index] = priority
        # Propagate the change up through the tree
        while tree_index != 0:  # iteration is faster than the recursion in the reference code
            tree_index = (tree_index - 1) // 2
            self.tree[tree_index] += change

    def get_leaf(self, v):
        parent_index = 0
        while True:  # a while loop is faster than the method in the reference code
            left_child_index = 2 * parent_index + 1  # this node's left and right children
            right_child_index = left_child_index + 1
            # Downward search: descend toward the leaf whose cumulative priority covers v
            if left_child_index >= len(self.tree):  # reached the bottom, end search
                leaf_index = parent_index
                break
            else:
                if v <= self.tree[left_child_index]:
                    parent_index = left_child_index
                else:
                    v -= self.tree[left_child_index]
                    parent_index = right_child_index
        data_index = leaf_index - self.capacity + 1
        # tree leaf index, priority, experience
        return leaf_index, self.tree[leaf_index], self.data[data_index]
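
# A small sketch of how the SumTree behaves (illustrative values, not from this file):
# with capacity 4 and priorities [3, 1, 4, 2], tree[0] == 10, and get_leaf(v) maps
# v in [0, 3] to the first leaf, (3, 4] to the second, (4, 8] to the third, and
# (8, 10] to the fourth, so leaves are drawn in proportion to their priority:
#
#   tree = SumTree(capacity=4)
#   for p in [3, 1, 4, 2]:
#       tree.add(p, {"priority": p})           # any Python object can be stored
#   leaf, priority, data = tree.get_leaf(5.0)  # lands in the third leaf (priority 4)
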
# Memory model
class Memory():  # stored as (s, a, r, s_) in the SumTree
    """
    __init__     - create the SumTree memory
    store        - assign a priority to a new experience and store it via SumTree.add
    sample       - sample one value uniformly from each of n equal slices of the total
                   priority and retrieve leaf index, priority and experience via
                   SumTree.get_leaf
    batch_update - update the priorities of experiences after training via SumTree.update

    PER_e - hyperparameter that keeps any experience from having zero probability
            of being sampled
    PER_a - hyperparameter trading off priority sampling against uniform sampling
            (0 - pure uniform randomness, 1 - always select the highest-priority
            experiences)
    PER_b - importance-sampling exponent, annealed from its initial value to 1;
            controls how strongly the IS weights correct the updates
    """
    PER_e = 0.01  # small amount to avoid zero priority
    PER_a = 0.6   # [0~1] converts the magnitude of the TD error to a priority
    PER_b = 0.4   # importance sampling, increased from this initial value to 1
    PER_b_increment_per_sampling = 0.001
    absolute_error_upper = 1.0  # clipped abs error

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, experience):
        # Find the max priority among the current leaves
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])
        # If the max priority is 0, this experience would never be selected,
        # so a minimum priority is assigned
        if max_priority == 0:
            max_priority = self.absolute_error_upper
        self.tree.add(max_priority, experience)

    def sample(self, n):
        """
        To sample a minibatch of size n, the range [0, priority_total] is divided
        into n equal segments and a value is uniformly sampled from each. Searching
        the sum tree with each value retrieves the experience whose cumulative
        priority covers it. An IS weight is then computed for each minibatch element.
        """
        b_memory = np.empty((n, self.tree.data[0].size))
        b_idx = np.empty((n,), dtype=np.int32)
        b_ISWeights = np.empty((n, 1))
        priority_segment = self.tree.tree[0] / n
        self.PER_b = np.min([1.0, self.PER_b + self.PER_b_increment_per_sampling])  # max = 1
        # Smallest sampling probability, taken over non-zero leaves so that an
        # unfilled memory cannot cause a division by zero below
        leaf_priorities = self.tree.tree[-self.tree.capacity:]
        prob_min = np.min(leaf_priorities[leaf_priorities > 0]) / self.tree.tree[0]
        for i in range(n):
            a = priority_segment * i
            b = priority_segment * (i + 1)
            value = np.random.uniform(a, b)
            index, priority, data = self.tree.get_leaf(value)
            prob = priority / self.tree.tree[0]
            # Normalized IS weight: (prob / prob_min)^-beta == (N * prob)^-beta / max_weight
            b_ISWeights[i, 0] = np.power(prob / prob_min, -self.PER_b)
            b_idx[i] = index
            b_memory[i, :] = data
        return b_idx, b_memory, b_ISWeights

    def batch_update(self, tree_idx, abs_errors):
        # Offset by PER_e to avoid zero sampling probability, then clip and exponentiate
        abs_errors += self.PER_e
        clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
        ps = np.power(clipped_errors, self.PER_a)
        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)
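
# A minimal usage sketch for Memory (illustrative only; the 10-element transition
# below assumes 3 state features, 3 actions and 1 reward, matching the layout
# used by store_transition further down):
#
#   memory = Memory(capacity=10000)
#   memory.store(np.zeros(10))                    # new experiences get max priority
#   b_idx, b_memory, b_ISWeights = memory.sample(32)
#   memory.batch_update(b_idx, np.abs(np.random.rand(32)))  # refresh priorities after a step
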
# DQN, with or without prioritized replay
class DQNPrioritizedReplay:
    def __init__(self,
                 n_actions,
                 n_features,
                 learning_rate=0.005,
                 reward_decay=0.9,
                 e_greedy=0.95,
                 replace_target_iter=100,  # 500,
                 memory_size=10000,
                 batch_size=256,
                 seed=1,
                 e_greedy_increment=None,
                 output_graph=True,  # False,
                 prioritized=True,
                 test=False,
                 sess=None):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.seed = seed
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
        self.prioritized = prioritized  # whether to use prioritized experience replay
        self.test = test
        self.learn_step_counter = 0
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
        if self.prioritized:
            self.memory = Memory(capacity=memory_size)
        else:
            # Flat replay buffer: [s, a0, a1, a2, r, s_] per row (3 actions + 1 reward)
            self.memory = np.zeros((self.memory_size, n_features * 2 + 4))
        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess
        if output_graph:
            # $ tensorboard --logdir=logs
            # http://localhost:6006/
            tf.summary.FileWriter("logs/", self.sess.graph)
        self.cost_his = []

    def _build_net(self):
        def build_layers(s, c_names, n_l1, w_initializer, b_initializer, trainable):
            set_seed(self.seed)
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer,
                                     collections=c_names, trainable=trainable)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer,
                                     collections=c_names, trainable=trainable)
                l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer,
                                     collections=c_names, trainable=trainable)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer,
                                     collections=c_names, trainable=trainable)
                out = tf.matmul(l1, w2) + b2
            return out

        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        if self.prioritized:
            self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
        with tf.variable_scope('eval_net'):
            # config of layers
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 20, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
            self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer, True)
        with tf.variable_scope('loss'):
            if self.prioritized:
                # Per-sample absolute TD error, used to update the SumTree priorities
                self.abs_errors = tf.math.reduce_sum(tf.math.abs(self.q_target - self.q_eval), axis=1)
                # Importance-sampling weights correct the bias of prioritized sampling
                self.loss = tf.math.reduce_mean(self.ISWeights * tf.math.squared_difference(self.q_target, self.q_eval))
            else:
                self.loss = tf.math.reduce_mean(tf.math.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer, False)

    def store_transition(self, s, a, r, s_):  # a: list of 3 actions
        transition = np.hstack((s, [a[0], a[1], a[2], r], s_))
        if self.prioritized:  # prioritized replay
            self.memory.store(transition)  # newly arrived transitions get the max priority
        else:  # random replay
            if not hasattr(self, 'memory_counter'):
                self.memory_counter = 0
            index = self.memory_counter % self.memory_size
            self.memory[index, :] = transition
            self.memory_counter += 1

    def choose_action_single(self, observation):
        observation = observation[np.newaxis, :]
        # action selection
        if np.random.uniform() < self.epsilon:
            # choose the best action
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            # choose a random action
            action = np.random.choice(self.n_actions)
        return action

    def choose_actions(self, observation):
        # TODO: fix the first action, select the second via softmax, repeat for the 3rd
        # add a batch dimension before feeding into the tf placeholder
        observation = observation[np.newaxis, :]
        # during testing, always act greedily
        if self.test:
            self.epsilon = self.epsilon_max
        print("epsilon value is", self.epsilon)
        if np.random.uniform() < self.epsilon:  # act greedy
            # forward-feed the observation to get a q value for every action
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            print("act greedy")
            # indices of the 3 largest q values, returned in descending index order
            actions = sorted(np.argsort(np.squeeze(-actions_value))[:3], reverse=True)
        else:  # act non-greedy: 3 distinct random actions
            print("random actions")
            actions = sorted(np.random.choice(self.n_actions, 3, replace=False), reverse=True)
        return actions
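
    # Sketch of the top-3 selection above (illustrative numbers, not from this file):
    # for q values [0.1, 0.9, 0.4, 0.7], np.argsort on the negated values gives
    # [1, 3, 2, 0]; the first three entries [1, 3, 2] are the top-3 action indices,
    # and sorted(..., reverse=True) returns them as [3, 2, 1].
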
    def learn(self):
        # check whether to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')
        # sample a batch from memory
        if self.prioritized:
            tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)
        else:
            # only sample from slots that have actually been filled
            high = min(self.memory_counter, self.memory_size)
            sample_index = np.random.choice(high, size=self.batch_size)
            batch_memory = self.memory[sample_index, :]
        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={self.s_: batch_memory[:, -self.n_features:],  # fixed params
                       self.s: batch_memory[:, :self.n_features]})   # newest params
        # change q_target w.r.t. q_eval's actions
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        # each row stores [s, a0, a1, a2, r, s_]: three action columns, then the reward
        eval_act_index = batch_memory[:, self.n_features:self.n_features + 3].astype(int)
        reward = batch_memory[:, self.n_features + 3]
        target_value = reward + self.gamma * np.max(q_next, axis=1)
        for k in range(3):
            q_target[batch_index, eval_act_index[:, k]] = target_value
        # train the eval network
        if self.prioritized:
            _, abs_errors, self.cost = self.sess.run(
                [self._train_op, self.abs_errors, self.loss],
                feed_dict={self.s: batch_memory[:, :self.n_features],
                           self.q_target: q_target,
                           self.ISWeights: ISWeights})
            self.memory.batch_update(tree_idx, abs_errors)  # update priorities
        else:
            _, self.cost = self.sess.run(
                [self._train_op, self.loss],
                feed_dict={self.s: batch_memory[:, :self.n_features],
                           self.q_target: q_target})
        self.cost_his.append(self.cost)
        # anneal epsilon toward epsilon_max
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()
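

# A minimal, hypothetical driver showing how the pieces fit together. The
# environment (`env`), its `reset`/`step` interface, and all sizes below are
# assumptions for illustration; they are not defined in this file.
#
# if __name__ == "__main__":
#     agent = DQNPrioritizedReplay(n_actions=10, n_features=3,
#                                  e_greedy_increment=0.001, output_graph=False)
#     total_steps = 0
#     for episode in range(100):
#         s = env.reset()                         # hypothetical environment
#         done = False
#         while not done:
#             a = agent.choose_actions(s)         # three action indices
#             s_, r, done = env.step(a)           # hypothetical step signature
#             agent.store_transition(s, a, r, s_)
#             if total_steps > agent.batch_size:  # warm up the replay buffer first
#                 agent.learn()
#             total_steps += 1
#             s = s_
#     agent.plot_cost()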