# Introduction
In homework assignment 3, we will implement the deep deterministic policy gradient (DDPG) algorithm to solve a classic rocket trajectory optimization problem: Lunar Lander v2.

# Enabling and testing the GPU

First, you may need to enable GPUs for this notebook:

- Navigate to Edit→Notebook Settings
- Select GPU from the Hardware Accelerator drop-down

Next, we'll confirm that we can connect to the GPU with PyTorch:

In [None]:
import torch
import os
# Choose the compute device: the first CUDA GPU when PyTorch reports one,
# otherwise the CPU.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    # Set CUDA_VISIBLE_DEVICES to GPU index 0 for this process.
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda:0') if cuda_ready else torch.device('cpu')
print('Found device at: {}'.format(device))

# Install the Gymnasium environment

In [None]:
# Install SWIG first (presumably required to build the Box2D bindings -- confirm).
!pip install swig
# Install Gymnasium together with its optional Box2D extra.
!pip install gymnasium[box2d]

# Load tensorboard for visualizing

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic can be used later.
%load_ext tensorboard

# Import required packages

In [None]:
import torch.nn as nn
from typing import Tuple
from collections import namedtuple
from collections import deque
import numpy.random as nr
import numpy as np

import gymnasium as gym
from torch.utils.tensorboard import SummaryWriter

import datetime
import copy

# Run the whole notebook in double precision. `Tensor` is the concrete tensor
# type used for explicit `.type(Tensor)` casts further down.
Tensor = torch.DoubleTensor
# `torch.set_default_tensor_type` is deprecated in recent PyTorch releases;
# `torch.set_default_dtype` is the supported way to make float64 the default
# dtype for newly created floating-point tensors and module parameters.
torch.set_default_dtype(torch.float64)
# Field layout of one sampled mini-batch of transitions.
Transitions = namedtuple('Transitions', ['obs', 'action', 'reward', 'next_obs', 'done'])

# Replay buffer

In [None]:
class ReplayBuffer(nn.Module):
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Each transition component (obs, action, reward, next_obs, done) lives in
    its own bounded ``deque``; once ``replay_buffer_size`` entries are stored,
    appending a new transition discards the oldest one.

    Args:
        config: mapping that provides
            ``'replay_buffer_size'`` -- maximum number of stored transitions,
            ``'seed'`` -- seed applied to NumPy's global random generator,
            ``'device'`` -- device sampled batches are moved to.
    """

    def __init__(self, config):
        super().__init__()
        self.device = config['device']
        self.replay_buffer_size = config['replay_buffer_size']
        # Sampling uses numpy.random's global generator, so seed it here.
        nr.seed(config['seed'])
        self.clear()

    def _new_store(self):
        """Return an empty deque bounded by the buffer capacity."""
        return deque([], maxlen=self.replay_buffer_size)

    def append_memory(self,
                      obs,
                      action,
                      reward,
                      next_obs,
                      done: bool):
        """Store one transition, evicting the oldest one when the buffer is full."""
        self.obs.append(obs)
        self.action.append(action)
        self.reward.append(reward)
        self.next_obs.append(next_obs)
        self.done.append(done)

    def sample(self, batch_size):
        """Draw a random mini-batch without replacement.

        Args:
            batch_size: requested number of transitions; when fewer are
                stored, every stored transition is returned.

        Returns:
            A ``Transitions`` instance whose ``obs``, ``action``, ``reward`` and
            ``next_obs`` fields are the selected entries stacked along a new
            leading dimension, and whose ``done`` field is a column tensor of
            shape ``(k, 1)``. All fields are moved to ``self.device``.
        """
        buffer_size = len(self.obs)
        idx = nr.choice(buffer_size,
                        size=min(buffer_size, batch_size),
                        replace=False)

        def gather(store):
            # Pick the sampled entries from one component store, in idx order.
            return [store[i] for i in idx]

        # Build a fresh instance: assigning attributes on the Transitions class
        # itself would overwrite its field descriptors and leak state between
        # calls.
        return Transitions(
            obs=torch.stack(gather(self.obs)).to(self.device),
            action=torch.stack(gather(self.action)).to(self.device),
            reward=torch.stack(gather(self.reward)).to(self.device),
            next_obs=torch.stack(gather(self.next_obs)).to(self.device),
            done=torch.tensor(gather(self.done))[:, None].to(self.device),
        )

    def clear(self):
        """Drop every stored transition, keeping the configured capacity."""
        self.obs = self._new_store()
        self.action = self._new_store()
        self.reward = self._new_store()
        self.next_obs = self._new_store()
        self.done = self._new_store()

# Actor network

In [None]:
class ActorNet(nn.Module):
    """Deterministic policy network: observation in, tanh-bounded action out.

    The hidden layers are registered as attributes ``layer1`` .. ``layerN`` and
    the final linear layer as ``output``. All layers are double precision, use
    Xavier-uniform weights and zero biases.

    Args:
        dim_obs: size of one observation vector.
        dim_action: size of one action vector.
        dims_hidden_neurons: width of each hidden layer, in order; any
            sequence of ints is accepted.
    """

    def __init__(self,
                 dim_obs: int,
                 dim_action: int,
                 dims_hidden_neurons: Tuple[int, ...] = (64, 64)):
        super(ActorNet, self).__init__()
        # Normalise to a tuple so lists can be concatenated below as well.
        dims_hidden_neurons = tuple(dims_hidden_neurons)
        self.n_layers = len(dims_hidden_neurons)
        self.dim_action = dim_action

        n_neurons = (dim_obs,) + dims_hidden_neurons + (dim_action,)
        for i, (dim_in, dim_out) in enumerate(zip(n_neurons[:-2], n_neurons[1:-1])):
            # setattr registers the submodule under the same name the former
            # exec-based code used, so state_dict keys are unchanged.
            setattr(self, 'layer{}'.format(i + 1), self._make_linear(dim_in, dim_out))

        self.output = self._make_linear(n_neurons[-2], n_neurons[-1])

    @staticmethod
    def _make_linear(dim_in: int, dim_out: int) -> nn.Linear:
        """Build a float64 linear layer with Xavier-uniform weights and zero bias."""
        # nn.Linear: input: (batch_size, n_feature)
        #            output: (batch_size, n_output)
        layer = nn.Linear(dim_in, dim_out).double()
        torch.nn.init.xavier_uniform_(layer.weight)
        torch.nn.init.zeros_(layer.bias)
        return layer

    def forward(self, obs: torch.Tensor):
        """Return the action for a batch of observations.

        Args:
            obs: tensor of shape ``(batch_size, dim_obs)``.

        Returns:
            Tensor of shape ``(batch_size, dim_action)`` with entries in [-1, 1].
        """
        x = obs
        for i in range(self.n_layers):
            x = torch.relu(getattr(self, 'layer{}'.format(i + 1))(x))
        return torch.tanh(self.output(x))


# Critic network

In [None]:
class QCriticNet(nn.Module):
    """Action-value network: (observation, action) in, scalar Q-value out.

    The observation and action are concatenated and passed through hidden
    layers registered as ``layer1`` .. ``layerN`` and a final linear layer
    ``output`` with a single unit. All layers are double precision, use
    Xavier-uniform weights and zero biases.

    Args:
        dim_obs: size of one observation vector.
        dim_action: size of one action vector.
        dims_hidden_neurons: width of each hidden layer, in order; any
            sequence of ints is accepted.
    """

    def __init__(self,
                 dim_obs: int,
                 dim_action: int,
                 dims_hidden_neurons: Tuple[int, ...] = (64, 64)):
        super(QCriticNet, self).__init__()
        # Normalise to a tuple so lists can be concatenated below as well.
        dims_hidden_neurons = tuple(dims_hidden_neurons)
        self.n_layers = len(dims_hidden_neurons)
        self.dim_action = dim_action

        n_neurons = (dim_obs + dim_action,) + dims_hidden_neurons + (1,)
        for i, (dim_in, dim_out) in enumerate(zip(n_neurons[:-2], n_neurons[1:-1])):
            # setattr registers the submodule under the same name the former
            # exec-based code used, so state_dict keys are unchanged.
            setattr(self, 'layer{}'.format(i + 1), self._make_linear(dim_in, dim_out))

        self.output = self._make_linear(n_neurons[-2], n_neurons[-1])

    @staticmethod
    def _make_linear(dim_in: int, dim_out: int) -> nn.Linear:
        """Build a float64 linear layer with Xavier-uniform weights and zero bias."""
        # nn.Linear: input: (batch_size, n_feature)
        #            output: (batch_size, n_output)
        layer = nn.Linear(dim_in, dim_out).double()
        torch.nn.init.xavier_uniform_(layer.weight)
        torch.nn.init.zeros_(layer.bias)
        return layer

    def forward(self, obs: torch.Tensor, action: torch.Tensor):
        """Return Q(obs, action) for a batch.

        Args:
            obs: tensor of shape ``(batch_size, dim_obs)``.
            action: tensor of shape ``(batch_size, dim_action)``.

        Returns:
            Tensor of shape ``(batch_size, 1)``; no output activation is applied.
        """
        x = torch.cat((obs, action), dim=1)
        for i in range(self.n_layers):
            x = torch.relu(getattr(self, 'layer{}'.format(i + 1))(x))
        return self.output(x)


# DDPG agent
The base code is given in this section. The updates of the actor and critic networks are missing and are left for you to fill in. You may refer to the DDPG paper (https://arxiv.org/pdf/1509.02971.pdf) or the Spinning Up tutorial for DDPG (https://spinningup.openai.com/en/latest/algorithms/ddpg.html).

In [None]:
class DDPG(nn.Module):
    """Deep deterministic policy gradient agent (actor, critic and their targets).

    Args:
        config: mapping that provides
            ``'seed'`` -- seed passed to ``torch.manual_seed``,
            ``'lr_actor'`` / ``'lr_critic'`` -- Adam learning rates,
            ``'smooth'`` -- smoothing coefficient for the target networks,
            ``'discount'`` -- discount factor,
            ``'batch_size'`` -- mini-batch size drawn from the replay buffer,
            ``'sig'`` -- standard deviation of the Gaussian exploration noise,
            ``'dims_hidden_neurons'`` -- hidden layer widths of every network,
            ``'dim_obs'`` / ``'dim_action'`` -- observation and action sizes,
            ``'device'`` -- device the networks are placed on.
    """

    def __init__(self, config):
        super(DDPG,self).__init__()
        torch.manual_seed(config['seed'])

        self.lr_actor = config['lr_actor']  # learning rate
        self.lr_critic = config['lr_critic']
        self.smooth = config['smooth']  # smoothing coefficient for target net
        self.discount = config['discount']  # discount factor
        self.batch_size = config['batch_size']  # mini batch size
        self.sig = config['sig']  # exploration noise

        self.dims_hidden_neurons = config['dims_hidden_neurons']
        self.dim_obs = config['dim_obs']
        self.dim_action = config['dim_action']

        self.device = config['device']

        self.actor = ActorNet(dim_obs=self.dim_obs,
                              dim_action=self.dim_action,
                              dims_hidden_neurons=self.dims_hidden_neurons).to(self.device)
        self.Q = QCriticNet(dim_obs=self.dim_obs,
                            dim_action=self.dim_action,
                            dims_hidden_neurons=self.dims_hidden_neurons).to(self.device)
        self.actor_tar = ActorNet(dim_obs=self.dim_obs,
                                  dim_action=self.dim_action,
                                  dims_hidden_neurons=self.dims_hidden_neurons).to(self.device)
        self.Q_tar = QCriticNet(dim_obs=self.dim_obs,
                                dim_action=self.dim_action,
                                dims_hidden_neurons=self.dims_hidden_neurons).to(self.device)

        # DDPG starts the target networks as exact copies of the online
        # networks; without this they keep their own unrelated random weights.
        self.actor_tar.load_state_dict(self.actor.state_dict())
        self.Q_tar.load_state_dict(self.Q.state_dict())

        # Only the online networks are optimised directly.
        self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=self.lr_actor)
        self.optimizer_Q = torch.optim.Adam(self.Q.parameters(), lr=self.lr_critic)

    def update(self, buffer):
        """Run one learning step from a replay mini-batch.

        As given, this only draws the mini-batch; the critic and actor
        updates are intentionally left to be implemented.

        Args:
            buffer: replay buffer exposing ``sample(batch_size)``.
        """
        # sample from replay memory
        t = buffer.sample(self.batch_size)

        # TO DO: Perform the updates for the actor and critic networks


    def act_probabilistic(self, obs: torch.Tensor):
        """Return the actor's action plus Gaussian exploration noise.

        One noise vector of length ``dim_action`` (zero mean, standard
        deviation ``self.sig``) is drawn and added to the actor output.

        Args:
            obs: observation tensor fed to the actor.
        """
        self.actor.eval()
        exploration_noise = torch.normal(torch.zeros(size=(self.dim_action,)), self.sig).to(self.device)
        a = self.actor(obs) + exploration_noise
        self.actor.train()
        return a

    def act_deterministic(self, obs: torch.Tensor):
        """Return the actor's action for ``obs`` without exploration noise."""
        self.actor.eval()
        a = self.actor(obs)
        self.actor.train()
        return a


# Create environment

In [None]:
# Build the continuous-control Lunar Lander environment. The assignment
# targets the v2 id; Gymnasium releases that no longer ship v2 raise
# DeprecatedEnv for it, in which case the v3 id is used instead.
try:
    env = gym.make('LunarLanderContinuous-v2')
except gym.error.DeprecatedEnv:
    env = gym.make('LunarLanderContinuous-v3')

# Hyperparameters shared by the agent, the replay buffer and the training loop.
config = {
    # Network input/output sizes are read from the environment rather than
    # hard-coded (8 observations and 2 actions for this task).
    'dim_obs': env.observation_space.shape[0],
    'dim_action': env.action_space.shape[0],
    'dims_hidden_neurons': (400, 200),
    'lr_actor': 0.001,
    'lr_critic': 0.005,
    'smooth': 0.99,
    'discount': 0.99,
    'sig': 0.01,
    'batch_size': 32,
    'replay_buffer_size': 20000,
    'seed': 1,
    'max_episode': 500,
    'device': device,
}


# Create agent

In [None]:
# Build the three training-time objects: the agent (moved onto the selected
# device), the replay memory, and the TensorBoard event writer.
ddpg = DDPG(config)
ddpg = ddpg.to(device)
buffer = ReplayBuffer(config)
train_writer = SummaryWriter(log_dir='tensorboard/ddpg')

# Start training

In [None]:
# Main training loop: run `max_episode` episodes, storing every transition in
# the replay buffer and calling the agent's update once per environment step.
steps = 0  # total environment steps across all episodes
for i_episode in range(config['max_episode']):
    obs = env.reset()[0]  # reset() returns (observation, info)
    done = False
    truncated = False
    t = 0  # steps taken in the current episode
    ret = 0.  # undiscounted return of the current episode
    # Truthiness instead of `is False`: identity checks would end the episode
    # immediately if the environment returned a non-`bool` flag such as
    # numpy.bool_.
    while not (done or truncated):
        # env.render()

        obs_tensor = torch.tensor(obs).type(Tensor).to(device)

        # Add a batch dimension for the actor, then strip it from the result.
        action = ddpg.act_probabilistic(obs_tensor[None, :]).detach().cpu().numpy()[0, :]

        next_obs, reward, done, truncated, _ = env.step(action)

        # The stored reward is scaled down by 10; only `done` (not
        # `truncated`) is recorded as the terminal flag.
        buffer.append_memory(obs=obs_tensor,
                             action=torch.from_numpy(action),
                             reward=torch.from_numpy(np.array([reward/10.0])),
                             next_obs=torch.from_numpy(next_obs).type(Tensor),
                             done=done)

        ddpg.update(buffer)

        t += 1
        steps += 1
        ret += reward

        obs = copy.deepcopy(next_obs)

    # Report once per episode. Logging inside the step loop wrote every
    # partial return under the same episode index.
    print("Episode {} return {}".format(i_episode, ret))
    train_writer.add_scalar('Performance/episodic_return', ret, i_episode)

env.close()
train_writer.close()


# Visualizing

In [None]:
# Open TensorBoard in the notebook, pointed at the 'tensorboard/ddpg' log directory.
%tensorboard --logdir='tensorboard/ddpg'