import numpy as np
import tensorflow as tf
from grid2op.Parameters import Parameters
from grid2op.Agent import AgentWithConverter
from grid2op.Converter import IdToAct
from ExperienceBuffer import ExperienceBuffer
from DoubleDuelingRDQN import DoubleDuelingRDQN
INITIAL_EPSILON = 0.95
FINAL_EPSILON = 0.0
DECAY_EPSILON = 1024*32
STEP_EPSILON = (INITIAL_EPSILON-FINAL_EPSILON)/DECAY_EPSILON
DISCOUNT_FACTOR = 0.99
REPLAY_BUFFER_SIZE = 2048
UPDATE_FREQ = 8
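# Worked example of the schedule above (derived from the constants, no new behavior):
# epsilon decays linearly from 0.95 to 0.0 over DECAY_EPSILON = 1024*32 = 32768
# training steps, i.e. STEP_EPSILON = 0.95 / 32768 ~= 2.9e-5 is subtracted once
# per step after pre-training ends.
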
class DoubleDuelingRDQNAgent(AgentWithConverter):
    def __init__(self,
                 env,
                 action_space,
                 name=__name__,
                 trace_length=1,
                 batch_size=1,
                 is_training=False,
                 lr=1e-5):
        # Call parent constructor
        AgentWithConverter.__init__(self, action_space,
                                    action_space_converter=IdToAct)

        # Store constructor params
        self.env = env
        self.name = name
        self.trace_length = trace_length
        self.batch_size = batch_size
        self.is_training = is_training
        self.lr = lr

        # Declare required vars
        self.Qmain = None
        self.obs = None
        self.state = []
        self.mem_state = None
        self.carry_state = None

        # Declare training vars
        self.exp_buffer = None
        self.done = False
        self.epoch_rewards = None
        self.epoch_alive = None
        self.Qtarget = None

        # Compute dimensions from initial state
        self.obs = self.env.reset()
        self.state = self.convert_obs(self.obs)
        self.observation_size = self.state.shape[0]
        self.action_size = self.action_space.size()

        # Load network graph
        self.Qmain = DoubleDuelingRDQN(self.action_size,
                                       self.observation_size,
                                       learning_rate=self.lr)
        # Setup initial state
        self._reset_state()
        # Setup training vars if needed
        if self.is_training:
            self._init_training()

    def _init_training(self):
        self.exp_buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE,
                                           self.batch_size,
                                           self.trace_length)
        self.done = True
        self.epoch_rewards = []
        self.epoch_alive = []
        self.Qtarget = DoubleDuelingRDQN(self.action_size,
                                         self.observation_size,
                                         learning_rate=self.lr)

    def _reset_state(self):
        # Initial state
        self.obs = self.env.current_obs
        self.state = self.convert_obs(self.obs)
        self.done = False
        self.mem_state = np.zeros(self.Qmain.h_size)
        self.carry_state = np.zeros(self.Qmain.h_size)

    def _register_experience(self, episode_exp, episode):
        missing_obs = self.trace_length - len(episode_exp)
        if missing_obs > 0:  # We are missing exp to make a full trace
            exp = episode_exp[0]  # Use initial state to pad out the trace
            for missing in range(missing_obs):
                # Use do_nothing action at index 0
                self.exp_buffer.add(exp[0], 0, exp[2], exp[3], exp[4], episode)
        # Register the actual experience
        for exp in episode_exp:
            self.exp_buffer.add(exp[0], exp[1], exp[2], exp[3], exp[4], episode)
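    # Padding example (illustration only, derived from the logic above):
    # with trace_length=4 and an episode that ended after 2 steps, the two
    # recorded experiences are preceded by 2 copies of the first experience,
    # re-labeled with the do-nothing action (index 0), so every stored trace
    # has exactly trace_length entries.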

    ## Agent Interface
    def convert_obs(self, observation):
        return observation.to_vect()

    def convert_act(self, action):
        return super().convert_act(action)

    def reset(self):
        self._reset_state()

    def my_act(self, state, reward, done=False):
        data_input = np.array(state)
        # np.reshape returns a new array; the result must be re-assigned
        data_input = data_input.reshape(1, 1, self.observation_size)
        a, _, m, c = self.Qmain.predict_move(data_input,
                                             self.mem_state,
                                             self.carry_state)
        # Carry the LSTM state over to the next call
        self.mem_state = m
        self.carry_state = c
        return a

    def load_network(self, path):
        self.Qmain.load_network(path)
        if self.is_training:
            self.Qmain.update_target_weights(self.Qtarget.model)

    def save_network(self, path):
        self.Qmain.save_network(path)

    ## Training Procedure
    def train(self, num_pre_training_steps, num_training_steps):
        # Loop vars
        num_steps = num_pre_training_steps + num_training_steps
        step = 0
        epsilon = INITIAL_EPSILON
        alive_steps = 0
        total_reward = 0
        episode = 0
        episode_exp = []
        self.tf_writer = tf.summary.create_file_writer("./logs/{}".format(self.name), name=self.name)
        self._reset_state()

        # Training loop
        while step < num_steps:
            # New episode
            if self.done:
                self.env.reset() # This shouldn't raise
                self._reset_state()
                # Push current episode experience to experience buffer
                self._register_experience(episode_exp, episode)
                # Reset current episode experience
                episode += 1
                episode_exp = []

            if step % 1000 == 0:
                print("Step [{}] -- Random [{}]".format(step, epsilon))

            # Choose an action
            if step <= num_pre_training_steps:
                a, m, c = self.Qmain.random_move(self.state, self.mem_state, self.carry_state)
            else:
                a, _, m, c = self.Qmain.bayesian_move(self.state, self.mem_state, self.carry_state, epsilon)

            # Update LSTM state
            self.mem_state = m
            self.carry_state = c

            # Convert it to a valid action
            act = self.convert_act(a)
            # Execute action
            new_obs, reward, self.done, info = self.env.step(act)
            new_state = self.convert_obs(new_obs)
            # Save to current episode experience
            episode_exp.append((self.state, a, reward, self.done, new_state))

            # Train when pre-training is over
            if step > num_pre_training_steps:
                # Slowly decay epsilon (applied as the dropout rate in bayesian_move)
                if epsilon > FINAL_EPSILON:
                    epsilon -= STEP_EPSILON
                else:
                    epsilon = FINAL_EPSILON

                # Perform training at given frequency
                if step % UPDATE_FREQ == 0 and self.exp_buffer.can_sample():
                    # Sample from experience buffer
                    batch = self.exp_buffer.sample()
                    # Perform training
                    training_step = step - num_pre_training_steps
                    self._batch_train(batch, training_step)
                    # Update target network towards primary network
                    self.Qmain.update_target(self.Qtarget.model)

            total_reward += reward
            if self.done:
                self.epoch_rewards.append(total_reward)
                self.epoch_alive.append(alive_steps)
                print("Survived [{}] steps".format(alive_steps))
                print("Total reward [{}]".format(total_reward))
                alive_steps = 0
                total_reward = 0
            else:
                alive_steps += 1

            # Save the network every 1000 iterations
            if step > 0 and step % 1000 == 0:
                self.Qmain.save_network(self.name + ".h5")

            # Iterate to next loop
            step += 1
            self.obs = new_obs
            self.state = new_state

        # Save model after all steps
        self.Qmain.save_network(self.name + ".h5")

    def _batch_train(self, batch, step):
        """Trains network to fit given parameters"""
        batch_mem = np.zeros((self.batch_size, self.Qmain.h_size))
        batch_carry = np.zeros((self.batch_size, self.Qmain.h_size))
        input_size = self.observation_size
        m_data = np.vstack(batch[:, 0])
        m_data = m_data.reshape(self.batch_size, self.trace_length, input_size)
        t_data = np.vstack(batch[:, 4])
        t_data = t_data.reshape(self.batch_size, self.trace_length, input_size)
        m_input = [batch_mem, batch_carry, m_data]
        t_input = [batch_mem, batch_carry, t_data]

        # Batch predict
        self.Qmain.trace_length.assign(self.trace_length)
        self.Qmain.dropout_rate.assign(0.0)
        self.Qtarget.trace_length.assign(self.trace_length)
        self.Qtarget.dropout_rate.assign(0.0)

        # Q values of the current states, to be used as training targets
        Q, _, _ = self.Qmain.model.predict(m_input, batch_size=self.batch_size)
        # Double Q: select the next action with the primary network...
        Q1, _, _ = self.Qmain.model.predict(t_input, batch_size=self.batch_size)
        # ...and evaluate it with the target network
        Q2, _, _ = self.Qtarget.model.predict(t_input, batch_size=self.batch_size)

        # Compute batch Double Q update to the targets:
        # Q[s, a] = r + DISCOUNT_FACTOR * Qtarget(s', argmax_a' Qmain(s', a'))
        for i in range(self.batch_size):
            # Index of the last experience in trace i
            idx = i * self.trace_length + (self.trace_length - 1)
            doubleQ = Q2[i, np.argmax(Q1[i])]
            a = batch[idx][1]
            r = batch[idx][2]
            d = batch[idx][3]
            Q[i, a] = r
            if not d:
                Q[i, a] += DISCOUNT_FACTOR * doubleQ

        # Batch train
        batch_x = [batch_mem, batch_carry, m_data]
        batch_y = [Q, batch_mem, batch_carry]
        loss = self.Qmain.model.train_on_batch(batch_x, batch_y)
        loss = loss[0]

        # Log some useful metrics
        print("loss =", loss)
        with self.tf_writer.as_default():
            mean_reward = np.mean(self.epoch_rewards)
            mean_alive = np.mean(self.epoch_alive)
            if len(self.epoch_rewards) >= 100:
                mean_reward_100 = np.mean(self.epoch_rewards[-100:])
                mean_alive_100 = np.mean(self.epoch_alive[-100:])
            else:
                mean_reward_100 = mean_reward
                mean_alive_100 = mean_alive
            tf.summary.scalar("mean_reward", mean_reward, step)
            tf.summary.scalar("mean_alive", mean_alive, step)
            tf.summary.scalar("mean_reward_100", mean_reward_100, step)
            tf.summary.scalar("mean_alive_100", mean_alive_100, step)
            tf.summary.scalar("loss", loss, step)