**Proximal Policy Optimization (PPO) + Noisy Layers for additional exploration**

Reference code used:


1.   https://github.com/higgsfield/RL-Adventure-2/blob/master/3.ppo.ipynb
2.   https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter21/atari_ppo.py
3. https://github.com/Shmuma/ptan
4. https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py 

Side note: I followed "Deep-Reinforcement-Learning-Hands-On-Second-Edition" by Maxim Lapan. I constructed a few different algorithms proposed in the book (such as Rainbow DQN and A2C). This algorithm, while not particularly sample efficient and quite slow to train, achieved the best results for me.

Training dynamics: the training was done over several days (approximately 100 hours) in multiple iterations to achieve a best mean score of 1447.00; this is, of course, reflected in the log file. The best score recorded on video was 2850.

In [None]:
'''The training process is very slow so mount drive to save training checkpoints'''
# Colab-only dependency: this import is only available inside a Google Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive. The training script later reads and
# writes its checkpoint under 'drive/My Drive/training/'
# (presumably resolved relative to /content -- confirm the working directory).
drive.mount('/content/drive')

Mounted at /content/drive


**Main Code**

In [None]:
'''Imports'''

import gym
from gym import spaces
import numpy as np
import random
import collections
from collections import deque
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.optim as optim

'''Wrappers from openai atari baselines
https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
'''       
class FireResetEnv(gym.Wrapper):
  """Wrapper that issues actions 1 (FIRE) and 2 immediately after each reset.

  Construction asserts that action 1 of the wrapped environment is named
  'FIRE' and that the environment has at least three actions.
  """

  def __init__(self, env):
    super().__init__(env)
    action_names = env.unwrapped.get_action_meanings()
    assert action_names[1] == 'FIRE'
    assert len(action_names) >= 3

  def reset(self, **kwargs):
    """Reset the wrapped env, then step with action 1 followed by action 2.

    If either of those steps ends the episode the environment is reset
    again. The returned observation is always the one produced by the
    action-2 step.
    """
    self.env.reset(**kwargs)
    for startup_action in (1, 2):
      obs, _, done, _ = self.env.step(startup_action)
      if done:
        self.env.reset(**kwargs)
    return obs

  def step(self, ac):
    """Forward the action unchanged to the wrapped environment."""
    return self.env.step(ac)

class SkipEnv(gym.Wrapper):
  """Repeat each action for `skip` consecutive steps of the wrapped env."""

  def __init__(self, env, skip=4):
    super().__init__(env)
    # number of times every action is repeated
    self._skip = skip

  def step(self, action):
    """Apply `action` up to `skip` times and return the summed reward.

    Stops early as soon as the wrapped env reports `done`. The observation,
    done flag and info returned are those of the last step actually taken.
    """
    done = None
    reward_sum = 0.0
    for _ in range(self._skip):
      obs, step_reward, done, info = self.env.step(action)
      reward_sum += step_reward
      if done:
        break
    return obs, reward_sum, done, info

  def reset(self, **kwargs):
    """Reset the wrapped environment and return its first observation."""
    return self.env.reset(**kwargs)

class EpisodicLifeEnv(gym.Wrapper):
  """Report the loss of a life as the end of an episode.

  The wrapped game is only really reset when it reported `done` itself;
  after a mere life loss, `reset` continues the running game.
  """

  def __init__(self, env):
    super().__init__(env)
    # True when the wrapped env itself signalled `done` on the last step
    self.was_real_done = True
    # life count observed after the most recent step/reset
    self.lives = 0

  def step(self, action):
    """Step the wrapped env; flag `done` when a life was lost but some remain."""
    obs, reward, done, info = self.env.step(action)
    self.was_real_done = done
    remaining = self.env.unwrapped.ale.lives()
    life_lost = 0 < remaining < self.lives
    self.lives = remaining
    if life_lost:
      done = True
    return obs, reward, done, info

  def reset(self, **kwargs):
    """Reset the game only after a real game over; otherwise take action 0."""
    if not self.was_real_done:
      # the game is still running: advance one step with action 0 instead
      obs, _, _, _ = self.env.step(0)
    else:
      obs = self.env.reset(**kwargs)
    self.lives = self.env.unwrapped.ale.lives()
    return obs

class ClipRewardEnv(gym.RewardWrapper):
  """Reward wrapper that replaces every reward by its sign (-1, 0 or +1)."""

  def __init__(self, env):
    super().__init__(env)

  def reward(self, reward):
    """Return `np.sign(reward)`."""
    clipped = np.sign(reward)
    return clipped

'''Wrapper for marking output'''
class MarkingWrapper(gym.Wrapper):
  """Accumulate the reward of each episode and log it to the global `marking`.

  `marking` is looked up at module level every time an episode ends, so it
  must exist before the first episode finishes.
  """

  def __init__(self, env):
    gym.Wrapper.__init__(self, env)
    # running reward total of the episode currently being played
    self.total_reward = 0.0

  def step(self, action):
    """Step the wrapped env, adding the reward to the running episode total."""
    obs, reward, done, info = self.env.step(action)
    self.total_reward += reward
    if not done:
      return obs, reward, done, info
    # episode finished: record its score and start a new total
    marking.append(self.total_reward)
    self.total_reward = 0
    return obs, reward, done, info

def make_env(env_name, episodic_life=True, reward_clipping=True, monitor=False):
  """Build the wrapped environment used for training or validation.

  Wrappers are applied innermost-first in this order: MarkingWrapper,
  optional gym Monitor (video recording into ./video), SkipEnv(skip=4),
  optional EpisodicLifeEnv, FireResetEnv, optional ClipRewardEnv.
  MarkingWrapper sits directly on the raw env, so it sees the rewards
  before frame skipping, life-splitting and clipping are applied.

  Args:
    env_name: gym environment id passed to `gym.make`.
    episodic_life: wrap with EpisodicLifeEnv when True.
    reward_clipping: wrap with ClipRewardEnv when True.
    monitor: wrap with `gym.wrappers.Monitor` when True.

  Returns:
    The fully wrapped environment.
  """
  env = gym.make(env_name)
  env = MarkingWrapper(env)
  if monitor:
    # Record every episode. The callable has to return a truthy value for
    # an episode to be recorded; returning the episode id itself silently
    # skipped episode 0, because 0 is falsy.
    env = gym.wrappers.Monitor(env, "./video",
                               video_callable=lambda episode_id: True,
                               force=True)
  env = SkipEnv(env, skip=4)
  if episodic_life:
    env = EpisodicLifeEnv(env)
  env = FireResetEnv(env)
  if reward_clipping:
    env = ClipRewardEnv(env)
  return env

'''
Give the weights nice starting values
https://github.com/higgsfield/RL-Adventure-2/blob/master/3.ppo.ipynb
'''
def init_weights(m):
  """Re-initialise `m` in place if it is an `nn.Linear`; ignore anything else.

  Weights are drawn from a normal distribution (mean 0, std 0.1) and every
  bias entry is set to 0.1. Intended to be passed to `nn.Module.apply`.
  """
  if not isinstance(m, nn.Linear):
    return
  nn.init.normal_(m.weight, mean=0., std=0.1)
  nn.init.constant_(m.bias, 0.1)

'''
Noisy layer
https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter21/lib/dqn_extra.py
'''
class NoisyLinear(nn.Linear):
  """Linear layer whose weights and bias are perturbed by Gaussian noise.

  In training mode the effective parameters are
  `weight + sigma_weight * epsilon_weight` and
  `bias + sigma_bias * epsilon_bias`. The `sigma_*` tensors are learnable
  parameters; the `epsilon_*` tensors are buffers that start at zero and
  only change when `sample_noise()` is called, so until the first call the
  layer behaves like a plain linear layer. In eval mode no noise is applied.

  Args:
    in_features: size of each input sample.
    out_features: size of each output sample.
    sigma_init: initial value of every entry of the sigma parameters.
    bias: when False the layer has neither a bias nor bias noise.
  """

  def __init__(self, in_features, out_features,
               sigma_init=0.017, bias=True):
    super(NoisyLinear, self).__init__(
            in_features, out_features, bias=bias)
    sigma_w = torch.full((out_features, in_features), sigma_init)
    self.sigma_weight = nn.Parameter(sigma_w)
    self.register_buffer("epsilon_weight",
                         torch.zeros(out_features, in_features))
    if bias:
      sigma_b = torch.full((out_features,), sigma_init)
      self.sigma_bias = nn.Parameter(sigma_b)
      self.register_buffer("epsilon_bias", torch.zeros(out_features))
    self.reset_parameters()

  def reset_parameters(self):
    """Draw weight (and bias, if present) uniformly from +-sqrt(3 / in_features)."""
    std = math.sqrt(3 / self.in_features)
    self.weight.data.uniform_(-std, std)
    # The bias is None when the layer was built with bias=False; touching it
    # unconditionally made such a layer crash during construction.
    if self.bias is not None:
      self.bias.data.uniform_(-std, std)

  def forward(self, input):
    """Apply the layer; noise is only added while the module is in training mode."""
    if not self.training:
      return super(NoisyLinear, self).forward(input)
    bias = self.bias
    if bias is not None:
      bias = bias + self.sigma_bias * self.epsilon_bias.data
    noisy_weight = self.sigma_weight * self.epsilon_weight.data + self.weight
    return F.linear(input, noisy_weight, bias)

  def sample_noise(self):
    """Redraw the noise buffers in place from a standard normal distribution."""
    self.epsilon_weight.normal_()
    if self.bias is not None:
      self.epsilon_bias.normal_()

'''
trainable network
https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter21/lib/ppo.py
'''
class NoisyPPO(nn.Module):
  """Actor-critic network with a noisy policy head and a plain value head.

  Both heads are separate three-layer MLPs (hidden sizes 256 and 84) fed
  with the flattened input. The actor is built from NoisyLinear layers and
  outputs `n_actions` logits; the critic uses ordinary nn.Linear layers
  and outputs a single value.

  Args:
    input_shape: shape of one observation; its product is the input width.
    n_actions: number of discrete actions.
  """

  def __init__(self, input_shape, n_actions):
    super().__init__()

    n_inputs = np.array(input_shape).prod()

    # keep direct references to the noisy layers so their noise can be resampled
    first = NoisyLinear(n_inputs, 256)
    second = NoisyLinear(256, 84)
    head = NoisyLinear(84, n_actions)
    self.noisy_layers = [first, second, head]

    # policy network (noisy layers)
    self.actor = nn.Sequential(first, nn.ReLU(), second, nn.ReLU(), head)

    # value network (plain linear layers)
    self.critic = nn.Sequential(
      nn.Linear(n_inputs, 256),
      nn.ReLU(),
      nn.Linear(256, 84),
      nn.ReLU(),
      nn.Linear(84, 1),
    )

    # init_weights acts on every nn.Linear instance, which also covers the
    # NoisyLinear layers because they subclass nn.Linear
    self.apply(init_weights)

  def forward(self, x):
    """Return `(action_logits, state_value)` for a batch of observations."""
    flat = x.view(x.size(0), -1)
    scaled = flat.float() / 256
    logits = self.actor(scaled)
    value = self.critic(scaled)
    return logits, value

  def sample_noise(self):
    """Resample the noise of every noisy layer of the actor."""
    for layer in self.noisy_layers:
      layer.sample_noise()

In [None]:
'''params'''
# Gym environment id. The RAM-observation variant is used because it trains quicker.
ENV_NAME = "Gravitar-ram-v0"

# discount factor handed to calc_adv_ref
GAMMA = 0.99
# learning rate of the Adam optimizer
LEARNING_RATE = 1e-5
# weight of the entropy term in the total loss
ENTROPY_BETA = 0.01
# number of transitions per optimisation mini-batch
BATCH_SIZE = 64

# steps played per environment per rollout; also the minimum number of
# buffered transitions before a training pass is attempted
PPO_TRAJ = 1025
# number of passes over the collected trajectory per training round
PPO_EPOCHS = 2

# lambda handed to calc_adv_ref for generalised advantage estimation
GAE_LAMBDA = 0.95
# NOTE: despite the name this is not gradient clipping; train() uses it as the
# clamp range of the probability ratio (1 - CLIP_GRAD .. 1 + CLIP_GRAD)
CLIP_GRAD = 0.2

# number of environments created for the round-robin rollout
NUM_ENVS = 8

# base random seed; environment i is seeded with seed + i
seed = 742

# change args to --val to record the trained model
args = '--train'

'''
calculate gae
https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter21/lib/ppo.py
'''
def calc_adv_ref(values, dones, rewards, gamma, gae_lambda):
  """Compute GAE advantages and reference values for a trajectory.

  The sequences are walked backwards. Step t uses `values[t + 1]` as its
  bootstrap value, so the last entry of every sequence only serves as the
  successor of the one before it and yields no output of its own. A step
  flagged `done` does not bootstrap and restarts the GAE accumulation.

  Args:
    values: per-step state-value estimates.
    dones: per-step episode-termination flags.
    rewards: per-step rewards.
    gamma: discount factor.
    gae_lambda: GAE smoothing factor.

  Returns:
    Tuple `(advantages, references)` of FloatTensors, each one element
    shorter than `values`; a reference is the advantage plus the value.
  """
  backward_steps = zip(
    reversed(values[:-1]),
    reversed(values[1:]),
    reversed(dones[:-1]),
    reversed(rewards[:-1]),
  )

  advantages = []
  references = []
  running_gae = 0.0
  for value, value_next, is_terminal, step_reward in backward_steps:
    if is_terminal:
      # no bootstrapping across an episode boundary
      running_gae = step_reward - value
    else:
      td_error = step_reward + gamma * value_next - value
      running_gae = td_error + gamma * gae_lambda * running_gae
    advantages.append(running_gae)
    references.append(running_gae + value)

  # results were collected last-to-first; restore chronological order
  advantages.reverse()
  references.reverse()
  return torch.FloatTensor(advantages), torch.FloatTensor(references)

'''
train on ppo batch
https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter21/atari_ppo.py
'''
def train(batch):
    """Run one optimisation step of the clipped PPO objective on a mini-batch.

    Uses the module-level `net`, `optimizer`, `CLIP_GRAD` and `ENTROPY_BETA`.

    Args:
        batch: tuple `(states, actions, advantages, reference_values,
            old_log_probs)` of tensors describing the same transitions.
    """
    optimizer.zero_grad()

    states_t, actions_t, advantage, ref, old_logprob = batch
    logits, state_values = net(states_t)

    # critic: regress the predicted values onto the reference values
    value_loss = F.mse_loss(state_values.squeeze(-1), ref)

    log_probs = F.log_softmax(logits, dim=1)
    action_probs = F.softmax(logits, dim=1)
    # sum(p * log p) is the negative entropy, so minimising it raises entropy
    negative_entropy = (action_probs * log_probs).sum(dim=1).mean()

    # actor: probability ratio between the current policy and the rollout policy
    taken_log_probs = log_probs.gather(1, actions_t.unsqueeze(-1)).squeeze(-1)
    prob_ratio = torch.exp(taken_log_probs - old_logprob)
    unclipped = advantage * prob_ratio
    clipped = advantage * torch.clamp(prob_ratio, 1.0 - CLIP_GRAD, 1.0 + CLIP_GRAD)
    policy_loss = -torch.min(unclipped, clipped).mean()

    total_loss = ENTROPY_BETA * negative_entropy + policy_loss + value_loss
    total_loss.backward()
    optimizer.step()

# setup CUDA (fall back to the CPU when no GPU is available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# setup multiple environments, each with its own seed derived from the base seed
envs = [make_env(ENV_NAME) for i in range(NUM_ENVS)]
for idx, env in enumerate(envs):
  env.seed(seed + idx)
  env.action_space.seed(seed + idx)

# seed every random number generator used by the script
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# init network and optimizer; all environments share the same spaces, so the
# first one is used to size the network
net = NoisyPPO(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

# additional variables for monitoring progress
n_episode = 0
# scores of finished episodes, appended by MarkingWrapper
marking = collections.deque(maxlen=100)

frame_idx = 0
best_m_reward = None

# Try to resume from a saved checkpoint, otherwise start from the beginning.
try:
  # map_location lets a checkpoint written on a GPU runtime load on a CPU one
  params = torch.load('drive/My Drive/training/gravitar-save.chkpt',
                      map_location=device)
  net.load_state_dict(params['net'])
  optimizer.load_state_dict(params['optimizer'])
  n_episode = params['n_episode']
  best_m_reward = params['best_m_reward']
  print("Resuming training session from gravitar-save.chkpt")
except FileNotFoundError:
  # expected on the very first run or when the drive is not mounted
  print("Starting new training session")
except Exception as err:
  # Loading stays best-effort, but the reason is reported: training from
  # scratch will eventually overwrite the existing checkpoint file.
  print("Starting new training session")
  print("WARNING: existing checkpoint could not be loaded: {!r}".format(err))

if args == '--val':

  # Validation: play the loaded policy on the full game (no life-splitting,
  # no reward clipping) with the video Monitor enabled.
  env = make_env(ENV_NAME, episodic_life=False, reward_clipping=False, monitor=True)
  env.seed(seed)
  env.action_space.seed(seed)

  # run 100 episodes; MarkingWrapper appends each finished episode's score
  # to `marking`, which is what gets printed below
  for i in range(100):
    state = env.reset()
    done = False
    while not done:
      with torch.no_grad():
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        logits, _ = net(state)

        # actions are sampled from the policy, not chosen greedily
        probs = F.softmax(logits, dim=1)
        dist = Categorical(probs)

        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state

    print("episode {}, score {:.1f}".format(i, marking[-1]))

  print("validation episodes: {}, mean_score: {:.2f}, std_score: {:.2f}".format(
                100,  np.array(marking).mean(), np.array(marking).std()))

elif args == '--train':

  # training loop (runs until interrupted)
  while True:

    # rollout buffers shared by all environments of this round
    states = []
    actions = []
    rewards = []
    dones = []
    # buffer index of the most recent transition that ended an episode
    last_done_index = None

    # interact with the environments in a round robin fashion
    # NOTE(review): the buffers are shared across environments, so a rollout
    # that is not consumed by a training pass is followed directly by the
    # next environment's transitions without a `done` marker in between.
    for e in envs:
      state = e.reset()

      # play an entire PPO trajectory in the current environment
      for _ in range(PPO_TRAJ):
        # batch generation
        # https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter21/lib/ppo.py
        with torch.no_grad():
          state = torch.from_numpy(state).float().unsqueeze(0).to(device)
          logits, _, = net(state)

          probs = F.softmax(logits, dim=1)
          dist = Categorical(probs)

          action = dist.sample()

          next_state, reward, done, _ = e.step(action.cpu().numpy())

          states.append(state)
          actions.append(action)
          rewards.append(reward)
          dones.append(done)

          state = next_state

          if done:
            last_done_index = len(states)-1
            state = e.reset()

        frame_idx += 1
        # training is very slow so we only print every 1 million frames
        # change to len(marking) == 100 to print every 100 episodes instead
        if frame_idx%1000000==0:
          # get the mean reward for the last 100 episodes; stored as a plain
          # float so the checkpoint does not contain NumPy objects
          m_reward = float(np.array(marking).mean())
          n_episode += 100

          if best_m_reward is None or best_m_reward < m_reward:
            print("NEW BEST MEAN REWARD --> {:.2f}".format(m_reward))
            best_m_reward = m_reward

          print("marking, episode: {}, score: {:.1f}, mean_score: {:.2f}, std_score: {:.2f}".format(
                n_episode, marking[-1], m_reward, np.array(marking).std()))

          # save the training for later - comment out if drive is not mounted
          torch.save({'net':net.state_dict(), 'optimizer':optimizer.state_dict(), 'n_episode':n_episode,
                      'best_m_reward':best_m_reward}, 'drive/My Drive/training/gravitar-save.chkpt')

          # start a fresh score window; MarkingWrapper looks the name up on
          # every use, so rebinding it here is picked up
          marking = collections.deque(maxlen=100)

        # make sure we have a full trajectory
        if len(states) < PPO_TRAJ:
          continue

        # make sure there is atleast one full episode in our trajectory
        # improves training stability
        if last_done_index is None or last_done_index == len(states)-1:
          continue

        # The cropped trajectory below yields last_done_index + 1 usable
        # transitions. With fewer than one mini-batch, np.split would be
        # asked for zero sections and raise, so keep collecting instead.
        if last_done_index + 1 < BATCH_SIZE:
          continue

        net.sample_noise()

        # crop the trajectory removing transitions in incomplete episodes
        # improves training stability; one transition after the last `done`
        # is kept because calc_adv_ref needs a successor value for each step
        states = states[:last_done_index + 2]
        actions = actions[:last_done_index + 2]
        rewards = rewards[:last_done_index + 2]
        dones = dones[:last_done_index + 2]

        # prepare the rest of the training batch

        states_t = torch.cat(states).to(device)
        actions_t = torch.tensor(actions).to(device)
        # these outputs are only used as constants (values for GAE, old
        # log-probs), so no autograd graph is needed for this pass
        with torch.no_grad():
          policy, values = net(states_t)
        values = values.squeeze()

        advantage, ref = calc_adv_ref(values.data.cpu().numpy(),
                                      dones, rewards, GAMMA, GAE_LAMBDA)
        advantage = advantage.to(device)
        ref = ref.to(device)

        logpolicy = F.log_softmax(policy, dim=1)
        old_logprob = logpolicy.gather(1, actions_t.unsqueeze(-1)).squeeze(-1)
        advantage = (advantage - torch.mean(advantage)) / torch.std(advantage)
        old_logprob = old_logprob.detach()

        # make our trajectory splittable on even batch chunks: use the largest
        # multiple of BATCH_SIZE that fits into the usable transitions
        n_batches = (len(states_t) - 1) // BATCH_SIZE
        indices = np.arange(n_batches * BATCH_SIZE)

        # train for ppo epoches
        for _ in range(PPO_EPOCHS):
          np.random.shuffle(indices)
          for batch_indices in np.split(indices, n_batches):
            train((
                          states_t[batch_indices],
                          actions_t[batch_indices],
                          advantage[batch_indices],
                          ref[batch_indices],
                          old_logprob[batch_indices],
                      ))

        states.clear()
        actions.clear()
        rewards.clear()
        dones.clear()
        # the index referred to the buffers that were just emptied; without
        # this reset a stale value would be used to crop the next trajectory
        last_done_index = None

Resuming training session from gravitar-save.chkpt
episode 0, score 1850.0
episode 1, score 1300.0
episode 2, score 550.0
episode 3, score 350.0
episode 4, score 950.0
episode 5, score 2050.0
episode 6, score 1000.0
episode 7, score 1850.0
episode 8, score 600.0
episode 9, score 250.0
episode 10, score 800.0
episode 11, score 1750.0
episode 12, score 500.0
episode 13, score 2450.0
episode 14, score 1250.0
episode 15, score 1150.0
episode 16, score 800.0
episode 17, score 1950.0
episode 18, score 900.0
episode 19, score 450.0
episode 20, score 1750.0
episode 21, score 900.0
episode 22, score 2050.0
episode 23, score 1250.0
episode 24, score 1500.0
episode 25, score 900.0
episode 26, score 700.0
episode 27, score 1850.0
episode 28, score 2250.0
episode 29, score 1300.0
episode 30, score 2750.0
episode 31, score 900.0
episode 32, score 1750.0
episode 33, score 1550.0
episode 34, score 1300.0
episode 35, score 800.0
episode 36, score 2100.0
episode 37, score 1450.0
episode 38, score 1000.0