**Dependencies and setup**

Installing the dependencies and starting the virtual display can take a minute or so.

In [None]:
# CITATIONS: https://github.com/soumik12345/Twin-Delayed-DDPG

%%capture
!apt update
!pip install 'gym[box2d]'
!python --version
!apt install xvfb -y
!pip install pyvirtualdisplay

import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import sys
from pyvirtualdisplay import Display
from IPython import display as disp
from tqdm import tqdm
from copy import deepcopy
%matplotlib inline

# Start a non-visible 600x600 virtual display (pyvirtualdisplay).
display = Display(visible=0, size=(600, 600))
display.start()

# Prefer CUDA when torch reports it as available, otherwise use the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Update the reward plot every N episodes.
plot_interval = 10
# Videos can take a very long time to render, so only record every N episodes.
video_every = 25

**Reinforcement learning agent**


In [None]:
class Agent(object):
    """Actor-critic agent with two Q-estimates and delayed policy updates.

    The agent owns an actor and a critic (the critic returns two Q-values),
    a frozen-copy target network for each, and one Adam optimiser per
    online network (learning rate 0.0003).

    Args:
        state_dim: size of the observation vector fed to both networks.
        action_dim: size of the action vector produced by the actor.
        max_action: symmetric bound used to clamp target actions.
        device: torch device the networks are moved to.
        discount: factor applied to the bootstrapped next-state value.
        rho: weight kept from the target parameters in each soft update.
        policy_noise: scale of the Gaussian noise added to target actions.
        noise_clip: symmetric bound applied to that noise.
        update_policy_delay: the actor and both targets are updated once
            every this many calls to ``train``.
    """

    def __init__(
            self, state_dim, action_dim, max_action, device,
            discount=0.99, rho=0.995, policy_noise=0.2, noise_clip=0.5, update_policy_delay=2):
        # Plain hyper-parameters first; none of these touch the RNG.
        self.device = device
        self.max_action = max_action
        self.discount = discount
        self.rho = rho
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.update_policy_delay = update_policy_delay
        self.total_it = 0

        # The actor is built before the critic so that seeded weight
        # initialisation happens in the same order as before.
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=0.0003)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=0.0003)

    def sample_action(self, s):
        """Return the actor's action for a single observation.

        Args:
            s: NumPy array holding one observation; it is reshaped to a
                batch of size one before being passed to the actor.

        Returns:
            A flat NumPy array containing the actor's output.
        """
        batch = torch.FloatTensor(s.reshape(1, -1)).to(self.device)
        prediction = self.actor(batch)
        return prediction.detach().cpu().numpy().flatten()

    @staticmethod
    def soft_update(local_model, target_model, rho):
        """Blend ``local_model`` parameters into ``target_model`` in place.

        Each target parameter becomes
        ``(1 - rho) * local + rho * target``; ``local_model`` is not modified.
        """
        paired = zip(local_model.parameters(), target_model.parameters())
        for source, target in paired:
            incoming = (1 - rho) * source.data
            retained = rho * target.data
            target.data.copy_(incoming + retained)

    def train(self, replay_buffer, batch_size=100):
        """Run one optimisation step on a batch drawn from ``replay_buffer``.

        The critic is updated on every call. The actor and both target
        networks are only updated when the call counter is a multiple of
        ``update_policy_delay``.

        Args:
            replay_buffer: object whose ``sample(batch_size)`` returns
                ``(state, action, next_state, reward, not_done)`` tensors.
            batch_size: number of transitions requested from the buffer.
        """
        self.total_it += 1
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        # Build the bootstrapped target without tracking gradients.
        with torch.no_grad():
            smoothing = torch.randn_like(action) * self.policy_noise
            smoothing = smoothing.clamp(-self.noise_clip, self.noise_clip)
            next_action = self.actor_target(next_state) + smoothing
            next_action = next_action.clamp(-self.max_action, self.max_action)
            next_q1, next_q2 = self.critic_target(next_state, next_action)
            # Use the smaller of the two target estimates.
            next_q = torch.min(next_q1, next_q2)
            target_q = reward + not_done * self.discount * next_q

        # Critic step: both Q-estimates regress onto the same target.
        q1, q2 = self.critic(state, action)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed part: skip unless this is an update iteration.
        if self.total_it % self.update_policy_delay != 0:
            return

        # Actor step: maximise the first Q-estimate of the actor's actions.
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Move the targets towards the online networks (critic first).
        Agent.soft_update(self.critic, self.critic_target, self.rho)
        Agent.soft_update(self.actor, self.actor_target, self.rho)

        
class Actor(torch.nn.Module):
    """Policy network mapping a state batch to bounded actions.

    Three linear layers (hidden width 256) with ReLU between them; the
    final output is passed through ``tanh`` and scaled by ``max_action``.

    Args:
        state_dim: number of input features.
        action_dim: number of output features.
        max_action: multiplier applied to the ``tanh`` output.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        width = 256
        # Layer attribute names double as state_dict keys, so they stay l1..l3.
        self.l1 = torch.nn.Linear(state_dim, width)
        self.l2 = torch.nn.Linear(width, width)
        self.l3 = torch.nn.Linear(width, action_dim)
        self.max_action = max_action

    def forward(self, state):
        """Return ``max_action * tanh(l3(relu(l2(relu(l1(state))))))``."""
        hidden = F.relu(self.l1(state))
        hidden = F.relu(self.l2(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed



class Critic(torch.nn.Module):
    """Pair of independent Q-networks sharing one input.

    Layers ``l1``-``l3`` form the first estimator and ``l4``-``l6`` the
    second. Each takes the concatenated ``[state, action]`` batch and
    returns one value per row.

    Args:
        state_dim: number of state features.
        action_dim: number of action features.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        in_features = state_dim + action_dim
        width = 256
        # First estimator (creation order is kept: l1, l2, l3, then l4, l5, l6).
        self.l1 = torch.nn.Linear(in_features, width)
        self.l2 = torch.nn.Linear(width, width)
        self.l3 = torch.nn.Linear(width, 1)
        # Second estimator.
        self.l4 = torch.nn.Linear(in_features, width)
        self.l5 = torch.nn.Linear(width, width)
        self.l6 = torch.nn.Linear(width, 1)

    @staticmethod
    def _run_head(sa, first, second, last):
        """Apply ``first`` and ``second`` with ReLU, then ``last`` without one."""
        hidden = F.relu(first(sa))
        hidden = F.relu(second(hidden))
        return last(hidden)

    def forward(self, state, action):
        """Return both Q-estimates ``(q1, q2)`` for the given batch."""
        sa = torch.cat([state, action], 1)
        q1 = self._run_head(sa, self.l1, self.l2, self.l3)
        q2 = self._run_head(sa, self.l4, self.l5, self.l6)
        return q1, q2

    def Q1(self, state, action):
        """Return only the first Q-estimate (layers ``l1``-``l3``)."""
        sa = torch.cat([state, action], 1)
        return self._run_head(sa, self.l1, self.l2, self.l3)


class ReplayBuffer(object):
    """Fixed-capacity ring buffer of transitions backed by NumPy arrays.

    Once ``max_size`` transitions have been added, new ones overwrite the
    oldest. Storage uses float32 because ``sample`` always returns float32
    tensors; keeping float64 copies would double the memory for no change
    in the sampled values.

    Args:
        obs_dim: length of each stored state / next-state vector.
        act_dim: length of each stored action vector.
        device: torch device that sampled tensors are moved to.
        max_size: maximum number of transitions retained.
    """

    def __init__(self, obs_dim, act_dim, device, max_size=int(1500000)):
        self.max_size = max_size
        self.ptr = 0   # index the next transition is written to
        self.size = 0  # number of valid transitions currently stored
        self.state = np.zeros((max_size, obs_dim), dtype=np.float32)
        self.action = np.zeros((max_size, act_dim), dtype=np.float32)
        self.next_state = np.zeros((max_size, obs_dim), dtype=np.float32)
        self.reward = np.zeros((max_size, 1), dtype=np.float32)
        self.not_done = np.zeros((max_size, 1), dtype=np.float32)
        self.device = device

    def add(self, state, action, next_state, reward, done):
        """Store one transition, overwriting the oldest when full.

        ``done`` is stored inverted (``1 - done``) so that it can be used
        directly as a multiplicative mask.
        """
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            Tuple ``(state, action, next_state, reward, not_done)`` of
            float32 tensors on ``self.device``.

        Raises:
            ValueError: if the buffer is empty.
        """
        if self.size == 0:
            raise ValueError("cannot sample from an empty ReplayBuffer")
        ind = np.random.randint(0, self.size, size=batch_size)
        fields = (self.state, self.action, self.next_state, self.reward, self.not_done)
        return tuple(torch.FloatTensor(field[ind]).to(self.device) for field in fields)

**Prepare the environment and wrap it to capture videos**

In [None]:
%%capture
# Build the environment; the commented line is a lighter alternative.
env = gym.make("BipedalWalker-v3")
# env = gym.make("Pendulum-v0") # useful continuous environment for quick experiments

# Wrap with Monitor, writing to ./video and recording only episodes whose id
# is a multiple of `video_every`.
env = gym.wrappers.Monitor(
    env,
    "./video",
    video_callable=lambda ep_id: ep_id % video_every == 0,
    force=True,
)

# First dimension of the observation and action space shapes.
obs_dim, act_dim = (
    space.shape[0] for space in (env.observation_space, env.action_space)
)

In [None]:
# Report the problem size and the device chosen during setup.
print(f'The environment has {obs_dim} observations and the agent can take {act_dim} actions')
print(f'The device is: {device}')

# Nudge the user towards the CPU when an accelerator was selected.
if device.type != 'cpu':
    print("It's recommended to train on the cpu for this")

In [None]:
# in the submission please use seed 42 for verification
seed = 42
torch.manual_seed(seed)
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)

# logging variables
ep_reward = 0
reward_list = []
plot_data = []

# initialise agent
max_action = float(env.action_space.high[0])  # symmetric action bound
agent = Agent(
    state_dim=obs_dim, action_dim=act_dim,
    max_action=max_action, device=device,
    discount=0.99, rho=0.995
)
max_episodes = 1250
max_timesteps = 2000
start_timesteps = 18000   # environment steps taken with random actions before learning starts
exploration_noise = 0.1   # std of the Gaussian action noise, as a fraction of max_action
batch_size = 256          # transitions sampled per training step
memory = ReplayBuffer(obs_dim, act_dim, device=device)

# training procedure
ts = 0  # total environment steps across all episodes
# The log file is opened in a `with` block so it is closed even if training
# is interrupted or raises.
with open("agent-log.txt", "w+") as log_f:
    for episode in range(1, max_episodes+1):
        state = env.reset()
        for t in range(max_timesteps):
            ts += 1

            # select the agent action: uniform random during warm-up,
            # otherwise the policy action plus clipped Gaussian noise
            if ts < start_timesteps:
                action = env.action_space.sample()
            else:
                greedy_action = agent.sample_action(np.array(state))
                noise = np.random.normal(0, max_action * exploration_noise, size=act_dim)
                action = (greedy_action + noise).clip(-max_action, max_action)

            # take action in environment and get r and s'
            next_state, reward, done, info = env.step(action)

            # An episode cut off by the environment's step limit is not a
            # real terminal state, so it must not zero the bootstrap term.
            # The previous check (`t < 2000`) was always true and never
            # masked anything. Gym's TimeLimit wrapper is expected to set
            # info['TimeLimit.truncated'] in that case (verify against the
            # installed gym version).
            timed_out = info.get('TimeLimit.truncated', False)
            memory.add(
                state, action, next_state, reward,
                0.0 if timed_out else float(done))
            state = next_state
            ep_reward += reward

            if ts >= start_timesteps:
                agent.train(memory, batch_size)

            # stop iterating when the episode finished (the range() bound
            # already ends the loop after max_timesteps steps)
            if done:
                break

        # append the episode reward to the reward list
        reward_list.append(ep_reward)

        # do NOT change this logging code - it is used for automated marking!
        log_f.write('episode: {}, reward: {}\n'.format(episode, ep_reward))
        log_f.flush()
        ep_reward = 0

        # print reward data every so often - add a graph like this in your report
        if episode % plot_interval == 0:
            recent = np.array(reward_list)
            plot_data.append([episode, recent.mean(), recent.std()])
            reward_list = []

            episodes = [row[0] for row in plot_data]
            means = [row[1] for row in plot_data]
            lower = [row[1] - row[2] for row in plot_data]
            upper = [row[1] + row[2] for row in plot_data]
            # plt.rcParams['figure.dpi'] = 100
            plt.plot(episodes, means, '-', color='tab:grey')
            plt.fill_between(episodes, lower, upper, alpha=0.2, color='tab:grey')
            plt.xlabel('Episode number')
            plt.ylabel('Episode reward')
            plt.show()
            disp.clear_output(wait=True)