Empty video folder

In [None]:
import os
import glob
# Remove recordings left over from a previous run so the video folder starts empty.
# Entries that are not regular files (e.g. sub-directories) are skipped because
# os.remove() cannot delete them.
files = glob.glob('./video/*')
for f in files:
    if os.path.isfile(f):
        os.remove(f)

**Initialise**

In [None]:
# you can write a brief 8-10 line abstract detailing your submission and experiments here

# Using RAM version of gravitar as the frame buffer version used too much memory when using the replay buffer
# to get any meaningful results
# DQN using a target network, dueling, prioritised replay and noisy networks, combined with epsilon-greedy decay, as this combination gives seemingly better results in practice
# Using noisy networks and dueling despite the Rainbow DQN paper (arXiv:1710.02298) reporting worse performance
# on Gravitar, because my tests showed better convergence on the RAM version

# Results show that the model converges to a little under a score of 2000, likely because this corresponds to a full clear of a planet and anything it learns that helps on planet 2 harms performance on planet 1

# The code is also written to run on a GPU using CUDA so please select a GPU runtime on google colab when running or ensure CUDA is available on NCC

# the code is based on https://github.com/seungeunrho/minimalRL/blob/master/dqn.py, which is released under the MIT license
# make sure you reference any code you have studied as above, with one comment line per reference

# imports
import gym
import collections
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# hyperparameters
learning_rate = 0.0001    # step size handed to the Adam optimiser
gamma = 0.98              # discount factor applied to the bootstrapped target
buffer_limit = 100000     # capacity of the replay buffer, in transitions
batch_size = 32           # transitions sampled per training step
video_every = 25          # record a video whenever episode_id % video_every == 0
print_every = 5           # print a progress line every this many episodes
ep_limit = 1000000.0      # upper bound on episodes; converted with int() by the training loop
update_freq = 1000        # frames between target-network synchronisations


# Class is adapted from https://github.com/Shmuma/ptan/blob/master/samples/rainbow/lib/dqn_model.py which is released under the MIT license
class Noisy(nn.Linear):
    """Linear layer whose weight and bias are perturbed by learnable Gaussian noise.

    Every forward pass draws fresh standard-normal noise and evaluates
    ``F.linear(x, weight + sig_weight * weight_var, bias + sig_bias * bias_var)``.
    The noise tensors are registered buffers, so they follow the module through
    ``.to(device)`` and the layer works on whichever device the module lives on.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        sig_init: initial noise scale; it is divided by ``sqrt(in_features)`` for
            the weight sigmas and by ``sqrt(out_features)`` for the bias sigmas.
        bias: whether the layer has a (noisy) bias term.
    """

    def __init__(self, in_features, out_features, sig_init=0.4, bias=True):
        super(Noisy, self).__init__(in_features, out_features, bias)
        self.sig_weight = nn.Parameter(
            torch.full((out_features, in_features), sig_init / math.sqrt(in_features)))
        self.register_buffer('weight_var', torch.zeros(out_features, in_features))
        if bias:
            self.sig_bias = nn.Parameter(
                torch.full((out_features,), sig_init / math.sqrt(out_features)))
            self.register_buffer('bias_var', torch.zeros(out_features))
        self.reset_params()

    def reset_params(self):
        """Re-initialise weight (and bias, if present) uniformly in +-1/sqrt(in_features)."""
        dev = 1 / math.sqrt(self.in_features)
        self.weight.data.uniform_(-dev, dev)
        # a layer built with bias=False has self.bias set to None
        if self.bias is not None:
            self.bias.data.uniform_(-dev, dev)

    def forward(self, ins):
        """Apply the layer to ``ins`` using freshly sampled noise.

        The noise is written in place into the registered buffers, which already
        live on the module's device, so no explicit device transfer is needed.
        """
        self.weight_var.normal_()
        noisy_weight = self.weight + self.sig_weight * self.weight_var
        noisy_bias = self.bias
        if noisy_bias is not None:
            self.bias_var.normal_()
            noisy_bias = noisy_bias + self.sig_bias * self.bias_var
        return F.linear(ins, noisy_weight, noisy_bias)



# Class adapted from https://github.com/higgsfield/RL-Adventure/blob/master/4.prioritized%20dqn.ipynb there was no license in the repo
class ReplayBuffer():
    """Prioritised experience replay stored in a fixed-size ring buffer.

    Transitions are ``(s, a, r, s_prime, done_mask)`` tuples. Each slot has a
    priority; sampling probability is proportional to ``priority ** alpha``.
    The capacity is read from the module-level ``buffer_limit`` and sampled
    batches are moved to the module-level ``device``.
    """

    def __init__(self):
        self.buffer = []
        self.alpha = 0.5
        self.cap = buffer_limit
        self.pos = 0
        self.prios = np.zeros((self.cap,), dtype=np.float32)

    def put(self, transition):
        """Store ``transition``, overwriting the oldest slot once the buffer is full."""
        # a new transition gets the current maximum priority (1.0 for the very first one)
        max_prio = self.prios.max() if self.buffer else 1.0
        if len(self.buffer) < self.cap:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition
        self.prios[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.cap

    def sample(self, n):
        """Draw ``n`` transitions (with replacement) according to their priorities.

        Returns:
            ``(s, a, r, s_prime, done_mask, idx, weights)`` where the first five
            are tensors on ``device`` (states are ``(n, ...)`` float, the others
            are ``(n, 1)`` columns), ``idx`` is the numpy array of sampled buffer
            positions and ``weights`` is an ``(n,)`` float tensor of importance
            weights normalised so that the largest one is 1.
        """
        filled = len(self.buffer)
        # Scale priorities into probabilities (only the filled part of the ring counts)
        probs = self.prios[:filled] ** self.alpha
        probs /= probs.sum()
        # sample batch from buffer using priorities
        idx = np.random.choice(filled, n, p=probs)
        # using fixed exponent of 0.5 rather than variable as performance is better
        weights = 1 / np.sqrt(filled * probs[idx])
        weights /= weights.max()
        weights = np.asarray(weights, dtype=np.float32)

        # s = state, a = action, r = reward, s_prime = new state
        states, actions, rewards, next_states, done_masks = zip(
            *(self.buffer[slot] for slot in idx))

        # Stack into contiguous numpy arrays first: building a tensor straight from a
        # Python list of ndarrays converts element by element and is very slow.
        s = torch.from_numpy(np.asarray(states, dtype=np.float32))
        a = torch.from_numpy(np.asarray(actions, dtype=np.int64).reshape(-1, 1))
        r = torch.from_numpy(np.asarray(rewards, dtype=np.float32).reshape(-1, 1))
        s_prime = torch.from_numpy(np.asarray(next_states, dtype=np.float32))
        done_mask = torch.from_numpy(np.asarray(done_masks, dtype=np.float32).reshape(-1, 1))

        # return samples on device
        return s.to(device), a.to(device), r.to(device), s_prime.to(device), \
               done_mask.to(device), idx, torch.from_numpy(weights).to(device)

    def update(self, indices, priorities):
        """Overwrite the priorities stored at ``indices``.

        ``priorities`` may have any shape (e.g. ``(n,)`` or ``(n, 1)``); it is
        flattened and paired with ``indices`` in order.
        """
        flat = np.asarray(priorities, dtype=np.float32).reshape(-1)
        for slot, prio in zip(indices, flat):
            self.prios[slot] = prio

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

class QNetwork(nn.Module):
    """Dueling Q-network with noisy value and advantage heads.

    A shared 256-unit ReLU layer feeds two ``Noisy`` heads: ``adv`` outputs one
    advantage per action and ``val`` outputs a single state value. The input and
    output sizes are read from the module-level ``env``.
    """

    def __init__(self):
        super(QNetwork, self).__init__()

        n_inputs = int(np.prod(env.observation_space.shape))
        self.pre = nn.Sequential(
            nn.Linear(n_inputs, 256),
            nn.ReLU()
        )
        self.adv = nn.Sequential(
            Noisy(256, 84),
            nn.ReLU(),
            Noisy(84, env.action_space.n)
        )
        self.val = nn.Sequential(
            Noisy(256, 84),
            nn.ReLU(),
            Noisy(84, 1)
        )

    def forward(self, x):
        """Return Q-values ``val + (adv - mean(adv))`` for each input state.

        The advantage mean is taken over the action dimension only, so the
        Q-values of one sample never depend on the other samples in the batch.
        """
        x = self.pre(x)
        val = self.val(x)
        adv = self.adv(x)
        return val + (adv - adv.mean(dim=-1, keepdim=True))

    def sample_action(self, obs, epsilon):
        """Pick an action epsilon-greedily for a single observation.

        With probability ``epsilon`` a random action is drawn from
        ``env.action_space``; otherwise the index of the largest Q-value is
        returned. The network is only evaluated on the greedy branch, and
        without gradient tracking since the result is never back-propagated.
        """
        if random.random() < epsilon:
            return env.action_space.sample()
        with torch.no_grad():
            return self.forward(obs).argmax().item()
            
def train(q, q_target, memory, optimizer):
    """Run one prioritised-replay DQN update on a batch drawn from ``memory``.

    Args:
        q: online network being optimised.
        q_target: target network used for the bootstrapped value.
        memory: replay buffer providing ``sample(n)`` and ``update(idx, prios)``.
        optimizer: optimiser holding the parameters of ``q``.

    The batch size and discount factor come from the module-level
    ``batch_size`` and ``gamma``.
    """
    s, a, r, s_prime, done_mask, idx, weights = memory.sample(batch_size)
    q_a = q(s).gather(1, a)
    # the target is a constant for the loss, so skip building its autograd graph
    with torch.no_grad():
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + gamma * max_q_prime * done_mask
    # Squared error per sample, weighted by the buffer's importance weights.
    # weights arrives as (batch,) while q_a is (batch, 1); reshape it to a column so
    # the product stays (batch, 1) instead of broadcasting to (batch, batch).
    per_sample_loss = (q_a - target).pow(2) * weights.reshape(-1, 1)
    # new priority of each transition is based on the loss that transition exhibited
    prios = per_sample_loss.detach() + 1e-5
    # mean over all losses and back-propagate
    loss = per_sample_loss.mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # update buffer priorities, one value per sampled index
    memory.update(idx, prios.reshape(-1).cpu().numpy())


Wrappers CITE THIS BEFORE SUBMISSION

In [None]:
# Wrappers from https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter06/lib/wrappers.py under the MIT license


class FireResetEnv(gym.Wrapper):
    """Wrapper that takes actions 1 (FIRE) and 2 right after every reset.

    The constructor asserts that action 1 of the wrapped environment is named
    'FIRE' and that at least three actions exist.
    """

    def __init__(self, env=None):
        super(FireResetEnv, self).__init__(env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def step(self, action):
        """Forward the action to the wrapped environment unchanged."""
        return self.env.step(action)

    def reset(self):
        """Reset the environment, take actions 1 and 2, and return the resulting observation."""
        self.env.reset()
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset()
        obs, _, done, _ = self.env.step(2)
        if done:
            # the episode ended during warm-up: hand back the observation of the fresh
            # episode rather than the last frame of the finished one
            obs = self.env.reset()
        return obs


class MaxAndSkipEnv(gym.Wrapper):
    """Repeat each action ``skip`` times and return the element-wise max of the last two observations.

    Rewards collected over the repeated steps are summed; repetition stops
    early as soon as the wrapped environment reports ``done``.
    """

    def __init__(self, env=None, skip=4):
        super(MaxAndSkipEnv, self).__init__(env)
        self._skip = skip
        # only the two most recent observations are kept for the max
        self._obs_buffer = collections.deque(maxlen=2)

    def step(self, action):
        """Apply ``action`` up to ``skip`` times; return (max observation, summed reward, done, last info)."""
        done = None
        reward_sum = 0.0
        for _repeat in range(self._skip):
            frame, step_reward, done, info = self.env.step(action)
            self._obs_buffer.append(frame)
            reward_sum += step_reward
            if done:
                break
        recent_frames = np.stack(self._obs_buffer)
        return np.max(recent_frames, axis=0), reward_sum, done, info

    def reset(self):
        """Reset the wrapped environment and seed the observation buffer with its first observation."""
        self._obs_buffer.clear()
        first_frame = self.env.reset()
        self._obs_buffer.append(first_frame)
        return first_frame


**Train**

← You can download the videos from the videos folder in the files on the left

In [None]:
# set up the Gravitar RAM environment and record a video every `video_every` episodes. You can use the non-ram version here if you prefer
env = gym.make('Gravitar-ram-v0')
env = gym.wrappers.Monitor(env, "./video", video_callable=lambda episode_id: (episode_id%video_every)==0,force=True)
env = MaxAndSkipEnv(env)
env = FireResetEnv(env)
# reproducible environment and action spaces, do not change lines 6-11 here (tools > settings > editor > show line numbers)
seed = 742
torch.manual_seed(seed)
env.seed(seed)
random.seed(seed)
np.random.seed(seed)
env.action_space.seed(seed)
# online network (optimised) and target network (used for the bootstrapped value in train())
q = QNetwork().to(device)
q_target = QNetwork().to(device)

memory = ReplayBuffer()

score    = 0.0
marking  = []
optimizer = optim.Adam(q.parameters(), lr=learning_rate)
# frame_idx counts environment steps and drives the epsilon decay and target-network sync;
# episode is the first episode index of the loop below (load() overwrites both when resuming)
frame_idx = 0
episode = 0


# Uncomment below line to resume training from saved model
# load()


# start the target network from the online network's weights (including any restored by load())
q_target.load_state_dict(q.state_dict())
for n_episode in range(episode, int(ep_limit)):
    s = env.reset()
    done = False
    score = 0.0
    while True:
        # model epsilon as an exponential decay with fixed min of 0.01 and max of 1.0, starts converging faster than linear decay in practice
        # based on re-arranged equation found at https://math.stackexchange.com/questions/2362737/how-to-find-a-differential-equation-with-exponential-decay-between-two-values
        epsilon =  0.01 + (1-0.01) * math.exp(-frame_idx/30000)
        # do action based on current state
        a = q.sample_action(torch.from_numpy(np.array(s)).float().unsqueeze(0).to(device), epsilon)
        # done means episode is over
        s_prime, r, done, info = env.step(a)
        # done_mask is 0.0 on the terminal step so that train() drops the bootstrapped term for it
        done_mask = 0.0 if done else 1.0
        memory.put((s,a,r,s_prime, done_mask))
        s = s_prime
        score += r
        if done:
            break
        # copy the online weights into the target network every update_freq frames
        if frame_idx % update_freq == 0:
            q_target.load_state_dict(q.state_dict())
        frame_idx += 1
        # once the buffer holds more than 10000 transitions, run one training step per environment step
        if memory.size()>10000:
            train(q, q_target, memory, optimizer)

    # do not change lines 44-48 here, they are for marking the submission log
    marking.append(score)
    if n_episode%100 == 0:
        print("marking, episode: {}, score: {:.1f}, mean_score: {:.2f}, std_score: {:.2f}".format(
            n_episode, score, np.array(marking).mean(), np.array(marking).std()))
        marking = []

    # you can change this part, and print any data you like (so long as it doesn't start with "marking")
    if n_episode%print_every==0 and n_episode!=0:
        print("episode: {}, score: {:.1f}, epsilon: {:.2f}".format(n_episode, score, epsilon))
    # NOTE(review): checkpoint() is defined further down in this file; that definition must have been run before training starts
    if n_episode%100==0 and n_episode!=0:
        checkpoint(q, optimizer, frame_idx, n_episode)


In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
# NOTE(review): checkpoint() and load() use the relative path 'drive/My Drive/...',
# which presumably relies on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# save progress
import pickle
def checkpoint(q,opt,frame, episode):
    """Save the training state so that a later run can resume from it.

    Args:
        q: network whose ``state_dict()`` is stored under key ``'Q'``.
        opt: optimiser whose ``state_dict()`` is stored under key ``'optimiser'``.
        frame: frame counter, stored under key ``'f'``.
        episode: episode counter, stored under key ``'e'``.

    The module-level replay buffer ``memory`` is pickled to a separate file.
    """
    torch.save(
        {'Q': q.state_dict(), 'optimiser': opt.state_dict(), 'e': episode, 'f': frame},
        'drive/My Drive/training/rl-cit.chkpt')
    # use a context manager so the file is flushed and closed even if pickling fails
    with open('drive/My Drive/training/buffer-cit.p', 'wb') as buffer_file:
        pickle.dump(memory, buffer_file)

In [None]:
# load saved model
def load():
    """Restore the training state written by ``checkpoint()``.

    Overwrites the module-level ``frame_idx``, ``episode`` and ``memory`` and
    loads the saved weights / optimiser state into the existing ``q`` and
    ``optimizer`` objects.
    """
    global frame_idx, episode, q, optimizer, memory
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine
    params = torch.load('drive/My Drive/training/rl-cit.chkpt', map_location=device)
    frame_idx = params['f']
    episode = params['e']
    q.load_state_dict(params['Q'])
    optimizer.load_state_dict(params['optimiser'])
    # SECURITY: unpickling executes arbitrary code; only load buffer files you created yourself
    with open('drive/My Drive/training/buffer-cit.p', 'rb') as buffer_file:
        memory = pickle.load(buffer_file)