In [41]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from atari_wrappers import NoopResetEnv
from atari_wrappers import MaxAndSkipEnv
from atari_wrappers import EpisodicLifeEnv
from atari_wrappers import FireResetEnv
from atari_wrappers import WarpFrame
from atari_wrappers import ScaledFloatFrame
from atari_wrappers import ClipRewardEnv
from atari_wrappers import FrameStack

from model import DQN

from tqdm import tqdm
from collections import deque

import os
import imageio
import numpy as np
from PIL import Image
import PIL.ImageDraw as ImageDraw
import matplotlib.pyplot as plt

In [42]:

def makeAtari(env_id, render_mode=None,max_episode_steps=400000):
    if render_mode == None:
        env = gym.make(env_id,render_mode='rgb_array')
    elif render_mode == 'human':
        env = gym.make(env_id,render_mode='human')
    env._max_episode_steps = max_episode_steps

    env = NoopResetEnv(env, noop_max=1)
    env = MaxAndSkipEnv(env, skip=4)
    return env


def wrapDeepmind(env):
    env = EpisodicLifeEnv(env)
    env = ClipRewardEnv(env)
    env = WarpFrame(env)
    return env


#Take rgb array and greyscale it 
def frameProcessor(n_frame):
    n_frame = torch.from_numpy(n_frame)
    h = n_frame.shape[-2]
    return n_frame.view(1,h,h)


### Environmnet setup 
When setting up the environemnt there are a few things we need to define
* The device, if we are using the CPU or the GPU
* The environemtn id, as we are using the gym library we need to make sure the environemnt id matches what is written in gym

Using the frame processor function from above we also get the chanell height and width which is later used to define the shape of our input for our states 

We alos define the policy and target net. Applying initial wieghts (zero values)

In [36]:
#Use GPU if it is available if not use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
print(device)

#our atari environemnt and the verision we are using
envId = 'Breakout-v4'
env = makeAtari(envId)
env = wrapDeepmind(env)

c,h,w = frameProcessor(env.reset()).shape
actions = env.action_space.n

policyNet = DQN(actions, device).to(device)
policyNet.apply(policyNet.init_weights)

targetNet = DQN(actions, device).to(device)
targetNet.load_state_dict(policyNet.state_dict())



<All keys matched successfully>

### Hyperparameters
The hyperparameters difine our decay rate, memory size etc. They are tweaked to fined optimum values for our agent

In [37]:
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.
EPS_END = 0.1
EPS_DECAY = 1000000
TARGET_UPDATE = 10000
NUM_STEPS = 20000000
M_SIZE = 10000
POLICY_UPDATE = 4
EVALUATE_FREQ = 200000
NUM_EPISODE = 100000
optimizer = optim.Adam(policyNet.parameters(), lr=0.0000625, eps=1.5e-4)

## Replay buffer
The replay buffer is where all our states,actions,rewards,done(if the episode is ended or not) and priority. The replay buffer is later used in evaluation and optimize methods.

In [38]:

#REPLAY BUFFER 
statesM = torch.zeros((M_SIZE, 5, h, w), dtype=torch.uint8) #state is a tensor of zeros with length capacity and in the shae c,h,w with type uint8
actionsM = torch.zeros((M_SIZE, 1), dtype=torch.long) #action is a tensor of zeros with length capacity with only 1 item in its shape with type long  
rewardsM = torch.zeros((M_SIZE, 1), dtype=torch.int8) #rewardsM is a tensor of zeros with length capacity with only 1 item in its shape with type unit8

doneM = torch.zeros((M_SIZE, 1), dtype=torch.bool) #doneM< is a tensor of zeros with length capacity with only 1 item in its shape with type bool
prioritiesM = np.zeros(M_SIZE) #priorities is a numpy array with length capacity, it stores floats that define the proactionBatchbilities for the corresponding index

#initial size and position is both zero
position = 0

def push(state, action, reward, done, priority,position):
    statesM[position] = state 
    actionsM[position,0] = action
    rewardsM[position,0] = reward
    doneM[position,0] = done
    prioritiesM[position] = priority

    #iterate position till it reaches Memory size
    #if capacity is reached then restart buffer at zero
    if position < M_SIZE:
        position +=1
    else:
        position = 0


def getPriorities(actionBatchtchSize):
        #Temporary store for priorities
        arr = prioritiesM

        # Create a boolean array indicating which elements are not zero
        nonZeroMask = arr != 0

        # Use the boolean array to get a subset of the original array
        
        tempPrio = arr[nonZeroMask]
        prioritySum = np.sum(tempPrio)
        samplingProactionBatchbility = tempPrio / prioritySum
        indexValues = []

        #Sample random integers given proactionBatchbility
        for i in range(actionBatchtchSize):
            indexValues.append(np.random.choice(len(samplingProactionBatchbility), p=samplingProactionBatchbility))
        return indexValues

def sample(actionBatchtchSize):
    #get index values actionBatchsed on their priorities
    i = getPriorities(actionBatchtchSize)
    
    frameStack = statesM[i, :4] #for each index in i get its frame stack
    stateBatch = frameStack[1]  #the state is the current state
    actionBatch = actionsM[i].to(device) #get the action at the index, convert it to the device type
    rewardBatch = rewardsM[i].to(device).float() #get the reward at the index, convert it to the device type and then to a float
    doneBatch = doneM<[i].to(device).int()  #get the reward at the index, convert it to the device type and then to a int 
    return frameStack, actionBatch, rewardBatch, stateBatch, doneBatch


The cell below is only used for the video 

In [39]:
def _label_with_episode_number(frame, episode_num):
    im = Image.fromarray(frame)

    drawer = ImageDraw.Draw(im)

    if np.mean(im) < 128:
        text_color = (255,255,255)
    else:
        text_color = (0,0,0)
    drawer.text((im.size[0]/20,im.size[1]/12), f'Num steps: {episode_num+1}', fill=text_color)

    return im

### Action selector
Depending on the current epsilon value we sample a random value and depnding on if it is greater or less than the epsilon we either pick the best action or a random value.

There is another function in this cell ***optimumAction*** this is used only in evaluation. It has an epsilon of 0.05 so 95% of the time we are picking the optimum value.

In [25]:
epsCounter = EPS_START
epsDecay = (EPS_START - EPS_END)/EPS_DECAY

def selectAction(epsCounter, state, training=False):
        sample = random.random() #Sample a random value
        # if trainning is not false
        if training:
            #Decay epsilon by decay factor
            epsCounter -= epsDecay
            #Chose max between the current epsilon and the final epsilon. 
            #if current eps falls bellow 0.1 then it will pick 0.1
            epsCounter = max(epsCounter, EPS_END)

        #If the sample is greater than current eps    
        if sample > epsCounter:
            with torch.no_grad():
                #using the policy net pick the action with the highest state action value
                max_index = policyNet(state).argmax(dim=1)
                a = max_index.cpu().view(1,1)
        else:
            #If sample is not greater than eps then randomly chose an action 
            a = torch.tensor([[random.randrange(4)]], device=device, dtype=torch.long)
        #Return action as a numpy array and the current eps
        return a.numpy()[0,0].item(), epsCounter

def optimumAction(state):
    sample = random.random() #Sample a random value
    eps = 0.05

    #If the sample is greater than current eps    
    if sample > eps:
        with torch.no_grad():
            #using the policy net pick the action with the highest state action value
            maxIndex = policyNet(state).argmax(dim=1)
            a = maxIndex.cpu().view(1,1)
    else:
        #If sample is not greater than eps then randomly chose an action 
        a = torch.tensor([[random.randrange(4)]], device=device, dtype=torch.long)
    #Return action as a numpy array and the current eps
    return a.numpy()[0,0].item()



### Optimize model 

In [26]:
#Double DQN
def optimize_model(train):
    if not train:#
        return
    state_batch, action_batch, reward_batch, n_state_batch, done_batch = sample(BATCH_SIZE)

    q = policyNet(state_batch).gather(1, action_batch)
    q_next_values=policyNet(n_state_batch).gather(1 , action_batch)
    a_prime = policyNet(n_state_batch).max(1)[1]
    q_target_next_values = targetNet(n_state_batch).detach()
    q_target_sa_prime = q_target_next_values.gather(1, a_prime.unsqueeze(1))
    q_target_sa_prime = q_target_sa_prime.squeeze()


    # Compute the expected Q values
    expected_state_action_values = (q_target_sa_prime * GAMMA)*(1.-done_batch[:,0]) + reward_batch[:,0]

    # Compute Huber loss
    loss = F.smooth_l1_loss(q, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policyNet.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

### Calculate loss/priority
to calculate the loss we calculate the difference between the predicted reward and the actual reward multiplied by the discount factor

In [None]:
def getPriority(state,reward,done):
    tState = torch.tensor(state)

    qV = policyNet(tState).max(1)[0].cpu().detach() 
    nq = targetNet(tState).max(1)[0].cpu().detach()
    
    doneBatch = torch.tensor(np.array([int(done)]))

    expected_state_action_values = (nq * GAMMA)*(1.-doneBatch) + reward

    loss = F.smooth_l1_loss(qV, expected_state_action_values)

    return abs(loss)

### Evaluation 
The local network is evaluated every n steps (based on the hyperparams). It runs through 5 episodes using the ***OptimumAction*** selector. This makes sure we are using the best actions. We sum up the rewards and this is printed into a text file along with the number of steps we have done and the current eps. We also have a counter for the ***maxVal*** whhch is the maximum reward gained during the current evaluation.

In [27]:
frameStore=[]
filenames = []

def evaluate(step, policy_net, device, env, n_actions, curr_eps,train):
    
    #Numver of episodes to average over
    numEpisode=5
    
    eRewards = []
    eQ=[]
    q = deque(maxlen=5)
    countingQ=0
    countingSteps=0
    maxval=0

    for i in range(numEpisode):
        env.reset()
        eReward = 0
        for _ in range(15): # no-op
            frame, _, done, _ = env.step(0)
            frame = frameProcessor(frame)
            q.append(frame)

        while not done:
            
            state = torch.cat(list(q))[1:].unsqueeze(0)
            #Get optimum action
            action = optimumAction(state)
            frame, reward, done, info = env.step(action)
            frame = frameProcessor(frame)
            q.append(frame)
               
            eReward += reward
            countingQ+=max(policy_net(state)[0])
            countingSteps+=1
        maxval=max(maxval,eReward)
        eQ.append(countingQ/countingSteps)
        eRewards.append(eReward)

    f = open("file.txt",'a') 
    f.write("Average reward: %f, Steps: %d, number of eps: %d, current eps: %f, Maxval: %f Average predicted Q: %f \n" % (float(sum(eRewards))/float(numEpisode), step, numEpisode,float(curr_eps),maxval,(float(sum(eQ)))/(float(numEpisode))))
    f.close()
    for i in range(2):
        env.reset()
        for _ in range(10): # no-op
            frame, _, done, _ = env.step(0)
            frame = frameProcessor(frame)
            q.append(frame)
        while not done:
            frameTest = env.render("rgb_array")
            state = torch.cat(list(q))[1:].unsqueeze(0)
            action = optimumAction(state)
            frame, reward, done, info = env.step(action)
            frame = frameProcessor(frame)
            q.append(frame)
            frameStore.append(_label_with_episode_number(frameTest, episode_num=step))
            
    imageio.mimwrite(os.path.join('./videos/', 'random_agent.gif'), frameStore, frameProcessors=60)

### Main section (trainning)
In this main section we train our agent. Initially we do 15 steps of a no-op. This makes sure our starting state is random. we append this frames to the queue. After having our initial state we then check if the length of our replay buffer is greater than 5000. This is just to make sure we have enough data to train our agent on. We then select an action based on the current epsilon. Storing the State,reward,action,done in the replay buffer.

This then loops for the number of steps

In [30]:
steps_done = 0
q = deque(maxlen=5)
done = True
eps = 0
episode_len = 0


for step in tqdm(range(NUM_STEPS)):
    if done: # life reset !!!
        env.reset()
        sum_reward = 0
        episode_len = 0
        img, _, _, _ = env.step(1) # BREAKOUT specific !!!
        for i in range(15): # no-op
            frame, _, _, _ = env.step(0)
            frame = frameProcessor(frame)
            q.append(frame)
        
    train = len(prioritiesM) > 50000
    # Select and perform an action
    state = torch.cat(list(q))[1:].unsqueeze(0)
    action, epsCounter = selectAction(epsCounter,state,train)

    #get value from the environemnt based on the action taken
    frame, reward, done, info = env.step(action)
    #Process to image to grey scale the image
    frame = frameProcessor(frame)

    #Get priority
    priority = getPriority(state,reward,done)

    # 5 frame as memory
    q.append(frame)
    push(torch.cat(list(q)).unsqueeze(0), action, reward, done,priority,position) # here the frame means next frame from the previous time step
    episode_len += 1

    # Perform one step of the optimization (on the target network)
    if step % POLICY_UPDATE == 0:
        (train)

    # Update the target network, copying all weights and biases in DQN
    if step % TARGET_UPDATE == 0:
        targetNet.load_state_dict(policyNet.state_dict())
    
    if step % EVALUATE_FREQ == 0:
        evaluate(step, policyNet, device, env, actions, cuur_eps, train)

  tState = torch.tensor(state)
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  logger.warn(
  0%|          | 0/20000000 [00:03<?, ?it/s]


FileNotFoundError: The directory '/Users/blank/Desktop/Reinforcement Learning/Breakout FINAL/videos' does not exist