# Deep Deterministic Policy Gradient

# Import

In [1]:
import os
import random
import gym
import numpy as np
from collections import deque
from itertools import count
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Environment

In [2]:
# Quiet gym's logger, then create the Pendulum task used throughout the notebook.
gym.logger.set_level(40) # suppress warnings
env = gym.make('Pendulum-v0')

In [3]:
# Inspect the observation space of the environment.
env.observation_space

Box(-8.0, 8.0, (3,), float32)

In [4]:
# Per-dimension lower and upper bounds of the observations.
env.observation_space.low, env.observation_space.high

(array([-1., -1., -8.], dtype=float32), array([1., 1., 8.], dtype=float32))

In [5]:
# Inspect the (continuous) action space of the environment.
env.action_space

Box(-2.0, 2.0, (1,), float32)

In [6]:
# Lower and upper bounds of the action; `high` is reused later to scale exploration noise.
env.action_space.low, env.action_space.high

(array([-2.], dtype=float32), array([2.], dtype=float32))

# Networks

In [7]:
class CriticNet(nn.Module):
    """Critic network estimating the scalar action value Q(state, action).

    State and action are embedded by two separate two-layer branches, the
    embeddings are summed, passed through a ReLU and projected to one value.

    Args:
        state_shape: shape tuple of an observation; only ``state_shape[0]`` is used.
        action_shape: shape tuple of an action; only ``action_shape[0]`` is used.
        n_hidden: width of every hidden layer.
        lr: learning rate of the Adam optimizer stored in ``self.optim``.
    """

    def __init__(self, state_shape, action_shape, *, n_hidden=128, lr=.0025):
        super().__init__()
        # state branch
        self._in_s_0 = nn.Linear(state_shape[0], n_hidden)
        self._hd_s_1 = nn.Linear(n_hidden, n_hidden)
        # action branch
        self._in_a_0 = nn.Linear(action_shape[0], n_hidden)
        self._hd_a_1 = nn.Linear(n_hidden, n_hidden)
        # Q(s, a) is a single scalar per sample regardless of the action
        # dimensionality, so the head always has exactly one output unit.
        self._q = nn.Linear(n_hidden, 1)
        # optimizer
        self.optim = optim.Adam(self.parameters(), lr=lr)

    def forward(self, state, action):
        """Return Q(state, action) with shape ``(..., 1)``."""
        # The second layer of each branch is deliberately left linear: a ReLU
        # here would discard negative values before the two branches are added.
        sv = self._hd_s_1(T.relu(self._in_s_0(state)))
        av = self._hd_a_1(T.relu(self._in_a_0(action)))
        # merge both branches into the state-action value
        sav = T.relu(T.add(sv, av))
        return self._q(sav)

In [8]:
class ActorNet(nn.Module):
    """Deterministic policy network mapping a state to an action mean.

    Two ReLU hidden layers are followed by a linear output layer; the output
    is not squashed or clipped. ``self.optim`` holds the Adam optimizer for
    this network's parameters.
    """

    def __init__(self, state_shape, action_shape, *, n_hidden=128, lr=.0025):
        super().__init__()
        n_state, n_action = state_shape[0], action_shape[0]
        # layers (input -> hidden -> action mean)
        self._in_0 = nn.Linear(n_state, n_hidden)
        self._hd_1 = nn.Linear(n_hidden, n_hidden)
        self._mu = nn.Linear(n_hidden, n_action)
        # optimizer
        self.optim = optim.Adam(self.parameters(), lr=lr)

    def forward(self, state):
        """Return the unbounded action mean for ``state``."""
        first = self._in_0(state).relu()
        second = self._hd_1(first).relu()
        return self._mu(second)

# Replay Buffer

In [13]:
class Memory:
    """Fixed-capacity replay buffer of (state, action, reward, next_state, done) transitions."""

    def __init__(self, maxlen):
        # a bounded deque silently drops the oldest transition when full
        self._memory = deque(maxlen=maxlen)

    def __len__(self):
        return len(self._memory)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self._memory.append((state, action, reward, next_state, done))

    @staticmethod
    def _as_columns(values):
        """Stack per-transition scalars or small arrays into a ``(batch, k)`` array."""
        return np.stack([np.ravel(v) for v in values])

    def get_batch(self, size):
        """Sample up to ``size`` transitions uniformly without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors.
            All are float32 except ``dones``, which is int32. ``actions``,
            ``rewards`` and ``dones`` are 2-D with the batch on the first axis.

        Raises:
            ValueError: if no transition can be sampled (empty buffer or ``size`` of 0).
        """
        # sampling
        samples = random.sample(self._memory, min(len(self), size))
        if not samples:
            raise ValueError('cannot sample a batch from an empty memory')
        # Split the transitions field by field. Building one array from the
        # ragged tuples (np.array(samples)) is rejected by recent NumPy versions.
        states, actions, rewards, next_states, dones = zip(*samples)
        # convert to tensors
        states = T.as_tensor(np.stack(states), dtype=T.float32)
        next_states = T.as_tensor(np.stack(next_states), dtype=T.float32)
        actions = T.as_tensor(self._as_columns(actions), dtype=T.float32)
        rewards = T.as_tensor(self._as_columns(rewards), dtype=T.float32)
        dones = T.as_tensor(self._as_columns(dones).astype(np.int32), dtype=T.int32)
        return states, actions, rewards, next_states, dones

# Agent

In [14]:
class Agent:
    """DDPG agent: actor/critic networks, their target copies and a replay buffer.

    Further methods (``play``, ``learn``, ``update_targets``, ``run``) are added
    by later cells that re-declare ``Agent`` as a subclass of itself. The
    constructor calls ``update_targets``, so the class can only be instantiated
    once those cells have run.
    """

    def __init__(self, env):
        self._env = env
        self._memory = Memory(100000)
        # create networks
        state_shape, action_shape = env.observation_space.shape, env.action_space.shape
        self._actor = ActorNet(state_shape, action_shape)
        self._actor_target = ActorNet(state_shape, action_shape)
        self._critic = CriticNet(state_shape, action_shape)
        self._critic_target = CriticNet(state_shape, action_shape)
        # clone weights to target (polyak=0.0 copies the online weights verbatim)
        self.update_targets(0.0)

    def choose_action(self, state, i_step=None, n_step=None, *, noise=True):
        """Return the actor's action for ``state``, optionally with Gaussian noise.

        Args:
            state: observation accepted by ``torch.as_tensor``.
            i_step: index of the current step; required when ``noise`` is true.
            n_step: number of steps over which the noise scale decays linearly;
                required when ``noise`` is true.
            noise: when true, add zero-mean Gaussian exploration noise.

        Returns:
            NumPy array holding the (possibly perturbed) action.

        Raises:
            ValueError: if ``noise`` is true but ``i_step`` or ``n_step`` is missing.
        """
        with T.no_grad():
            state = T.as_tensor(state, dtype=T.float32)
            mu = self._actor(state).numpy()
        if not noise:
            return mu
        if i_step is None or n_step is None:
            raise ValueError('i_step and n_step are required when noise=True')
        # The noise scale decays linearly from the action bound to a floor of
        # 0.02. np.maximum works element-wise, so this also holds for action
        # spaces with more than one dimension (the builtin max() would raise).
        high = np.asarray(self._env.action_space.high)
        sd = np.maximum(0.02, (1 - i_step / n_step) * high)
        return mu + np.random.normal(0, sd, mu.shape)

In [15]:
class Agent(Agent):
    """Extends ``Agent`` with a noise-free evaluation rollout."""

    def play(self, *, render=False, summary=True):
        """Run one episode with the deterministic policy.

        Args:
            render: when true, render every step and close the environment at the end.
            summary: when true (and ``render`` is true), print the index of the
                last step and the total reward.

        Returns:
            The accumulated reward when ``render`` is false, otherwise ``None``.
        """
        # Use the agent's own environment rather than a notebook-level global,
        # so the method also works for agents built on a different env object.
        env = self._env
        state = env.reset()
        rewards = 0
        for i_step in count():
            action = self.choose_action(state, noise=False)
            next_state, reward, done, info = env.step(action)
            rewards += reward
            state = next_state
            if render:
                env.render()
            if done:
                break
        if not render:
            return rewards
        if summary:
            print(f'Steps taken: {i_step}, rewards earned: {rewards:.4f}')
        env.close()

In [16]:
class Agent(Agent):
    """Extends ``Agent`` with the DDPG update step and the target-network update."""

    def learn(self, batch_size, gamma=0.99):
        """Perform one critic update, one actor update and one target update.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions.

        Args:
            batch_size: number of transitions sampled from the replay buffer.
            gamma: discount factor applied to the bootstrapped next-state value.
        """
        if len(self._memory) < batch_size:
            return
        # get batch
        states, actions, rewards, next_states, dones = self._memory.get_batch(batch_size)
        # The TD target is a constant for the critic update. Building it under
        # no_grad avoids back-propagating into the target networks, whose
        # gradients would otherwise accumulate without ever being used.
        with T.no_grad():
            next_actions = self._actor_target(next_states)
            next_q = self._critic_target(next_states, next_actions)
            q_target = rewards + gamma * (1 - dones) * next_q
        # critic loss
        self._critic.optim.zero_grad()
        critic_loss = F.mse_loss(self._critic(states, actions), q_target)
        critic_loss.backward()
        self._critic.optim.step()
        # Actor loss: maximise Q(s, actor(s)). This backward pass also leaves
        # gradients on the critic; they are cleared by the critic's
        # zero_grad() at the start of the next call.
        self._actor.optim.zero_grad()
        actor_loss = T.mean(-self._critic(states, self._actor(states)))
        actor_loss.backward()
        self._actor.optim.step()
        # update targets
        self.update_targets()

    def update_targets(self, polyak=0.99):
        """Blend online weights into the targets: ``target = polyak*target + (1-polyak)*online``.

        ``polyak=0.0`` copies the online weights into the targets.
        """
        pairs = ((self._actor, self._actor_target), (self._critic, self._critic_target))
        # In-place update under no_grad: no autograd graph is built for the blend.
        with T.no_grad():
            for online, target in pairs:
                online_params = dict(online.named_parameters())
                for name, target_param in target.named_parameters():
                    blended = polyak * target_param + (1 - polyak) * online_params[name]
                    target_param.copy_(blended)

In [17]:
class Agent(Agent):
    """Extends ``Agent`` with the training loop."""

    def run(self, n_eps=200, batch_size=64):
        """Train for ``n_eps`` episodes, learning after every environment step.

        After each episode a progress line is printed with the episode score
        and the mean score over the last 10 episodes.
        """
        recent_scores = deque(maxlen=10)
        for i_eps in range(1, n_eps + 1):
            state = self._env.reset()
            score, i_step, done = 0, 0, False
            while not done:
                # exploration noise decays over the episode's step budget
                action = self.choose_action(state, i_step, self._env._max_episode_steps)
                next_state, reward, done, info = self._env.step(action)
                self._memory.remember(state, action, reward, next_state, done)
                self.learn(batch_size)
                score += reward
                state = next_state
                i_step += 1
            recent_scores.append(score)
            # progress marker followed by the per-episode summary
            print('#', end='')
            score_mean = np.mean(recent_scores)
            print(f' | Episode {i_eps:>4d} | score: {score:+7.1f} | rolling mean: {score_mean:+7.1f}')

# Run

In [18]:
# Build the agent on the Pendulum environment created above.
agent = Agent(env)

In [19]:
# Train for 50 episodes; one progress line is printed per episode.
agent.run(50)

# | Episode    1 | score: -1379.0 | rolling mean: -1379.0
# | Episode    2 | score: -1572.9 | rolling mean: -1475.9
# | Episode    3 | score: -1461.1 | rolling mean: -1471.0
# | Episode    4 | score: -1383.2 | rolling mean: -1449.0
# | Episode    5 | score: -1470.4 | rolling mean: -1453.3
# | Episode    6 | score: -1265.1 | rolling mean: -1422.0
# | Episode    7 | score: -1357.7 | rolling mean: -1412.8
# | Episode    8 | score: -1500.7 | rolling mean: -1423.8
# | Episode    9 | score: -1512.8 | rolling mean: -1433.7
# | Episode   10 | score: -1392.6 | rolling mean: -1429.6
# | Episode   11 | score: -1367.4 | rolling mean: -1428.4
# | Episode   12 | score: -1511.4 | rolling mean: -1422.2
# | Episode   13 | score: -1365.9 | rolling mean: -1412.7
# | Episode   14 | score:  -870.1 | rolling mean: -1361.4
# | Episode   15 | score: -1418.9 | rolling mean: -1356.3
# | Episode   16 | score: -1525.4 | rolling mean: -1382.3
# | Episode   17 | score:  -963.7 | rolling mean: -1342.9
# | Episode   

# Evaluation

In [20]:
# Roll out one rendered episode with the noise-free policy and print a summary.
agent.play(render=True)

Steps taken: 199, rewards earned: -125.8101


# Comment

* off-policy model for continuous action space.
* The choice of activation function (e.g. relu vs. tanh vs. linear) is crucial. For example, relu must not be used on any output layer (including the intermediate state-value and action-value outputs that feed T.add), because it discards all negative-value information and the model fails to learn.
* Never put unused layers in the nn.Module constructor: they pollute the pool of trainable parameters and caused the model to fail.