From 01f44482ffb5acfd090d9cba219c0e20990534ba Mon Sep 17 00:00:00 2001
From: Quan Vuong
Date: Sat, 29 Apr 2017 00:11:41 +0400
Subject: [PATCH] replace model by policy

---
 reinforcement_learning/reinforce.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/reinforcement_learning/reinforce.py b/reinforcement_learning/reinforce.py
index 77a733c702..cf356399aa 100644
--- a/reinforcement_learning/reinforce.py
+++ b/reinforcement_learning/reinforce.py
@@ -43,33 +43,33 @@ def forward(self, x):
         return F.softmax(action_scores)
 
 
-model = Policy()
-optimizer = optim.Adam(model.parameters(), lr=1e-2)
+policy = Policy()
+optimizer = optim.Adam(policy.parameters(), lr=1e-2)
 
 
 def select_action(state):
     state = torch.from_numpy(state).float().unsqueeze(0)
-    probs = model(Variable(state))
+    probs = policy(Variable(state))
     action = probs.multinomial()
-    model.saved_actions.append(action)
+    policy.saved_actions.append(action)
     return action.data
 
 
 def finish_episode():
     R = 0
     rewards = []
-    for r in model.rewards[::-1]:
+    for r in policy.rewards[::-1]:
         R = r + args.gamma * R
         rewards.insert(0, R)
     rewards = torch.Tensor(rewards)
     rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
-    for action, r in zip(model.saved_actions, rewards):
+    for action, r in zip(policy.saved_actions, rewards):
         action.reinforce(r)
     optimizer.zero_grad()
-    autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
+    autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
     optimizer.step()
-    del model.rewards[:]
-    del model.saved_actions[:]
+    del policy.rewards[:]
+    del policy.saved_actions[:]
 
 
 running_reward = 10
@@ -80,7 +80,7 @@ def finish_episode():
         state, reward, done, _ = env.step(action[0,0])
         if args.render:
             env.render()
-        model.rewards.append(reward)
+        policy.rewards.append(reward)
         if done:
             break
 
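Note (appended for context; not part of the patch): the script above targets the
PyTorch 0.1.x stochastic-Variable API, in which a sampled action exposes
action.reinforce(r) and the update is driven through autograd.backward on the
saved actions. That mechanism was removed in later PyTorch releases. Below is a
minimal sketch of the same REINFORCE update written against torch.distributions;
the names (policy, saved_log_probs, rewards, gamma) are assumptions that mirror
the patched script, not code from this PR.

    import torch
    from torch.distributions import Categorical

    gamma = 0.99  # discount factor; the patched script reads this from args.gamma

    def select_action(policy, state, saved_log_probs):
        # Sample from the categorical distribution over action probabilities
        # and keep the log-probability for the policy-gradient update.
        state = torch.from_numpy(state).float().unsqueeze(0)
        dist = Categorical(policy(state))
        action = dist.sample()
        saved_log_probs.append(dist.log_prob(action))
        return action.item()

    def finish_episode(optimizer, saved_log_probs, rewards):
        # Discounted returns, accumulated back-to-front as in the patch.
        R, returns = 0.0, []
        for r in reversed(rewards):
            R = r + gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns)
        # Normalize returns for variance reduction, matching the patch.
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        # REINFORCE loss -sum(log pi(a|s) * R); backprop through this loss
        # replaces the removed action.reinforce(r) / autograd.backward(...) calls.
        loss = torch.stack([-lp * R for lp, R in zip(saved_log_probs, returns)]).sum()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        del rewards[:]
        del saved_log_probs[:]

Both versions compute the same gradient estimator; the rename in this patch
(model -> policy) is purely cosmetic and carries over unchanged.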