In [0]:
!pip install gym 



In [0]:
!pip install atari-py



In [0]:
!pip install gym[atari]



# Implementation details

There are 5 modules of Deep Q learning-
1. CNN for interacting with the environment
2. Experience replay so that gradient descent converges rather than diverges. It basically provides a sense of direction to where to move for reducing the loss
3. A module that integrates the two modules above, acting as the linking piece that completes the algorithm
4. testing with the OpenAI gym environment
5. Preprocessing of image

# import statements

In [0]:
import gym
import numpy as np
from skimage.transform import resize
import tensorflow as tf
from skimage.color import rgb2gray
from collections import deque
import random

# Initializing Gym environment

In [0]:
# Create the Breakout environment by its registered gym id.
env = gym.make("Breakout-v4")
# Rebind `env` to the environment's `unwrapped` attribute.
# NOTE(review): presumably this drops gym wrappers (e.g. an episode step limit) — confirm for the installed gym version.
env = env.unwrapped
# Seed the environment with a fixed value.
env.seed(1)
# Reset once to obtain an initial raw observation.
state = env.reset()

# Hyper parameters

In [0]:
# Number of discrete actions; sizes the network's output layer and the one-hot action placeholder.
action_number=env.action_space.n
# Step size handed to the Adam optimizer.
learning_rate=0.01
# Number of transitions drawn from replay memory for each training step.
batch_size=64
# Number of preprocessed frames kept in the frame deque that forms one state.
stack_size = 4
# Number of random-action steps used to pre-fill the replay memory before training.
pretrain_length = batch_size
# Maximum number of transitions the replay memory keeps.
memory_size = 1000000
# Number of episodes the training loop runs.
total_episodes = 50
# Discount factor applied to the max next-state Q value when building targets.
gamma=0.9
# Shape of a single network input (the input placeholder is [None, *state_size]).
state_size = [84, 84, 4]
# Exploration probability schedule: starts at explore_start and decays
# exponentially (rate decay_rate per step) towards explore_stop.
explore_start = 1.0
explore_stop = 0.01
decay_rate = 0.00001

# Pre processing of image

In [0]:
def frame_preprocessing(image_frame):
  """Convert a raw RGB frame into an 84x84 grayscale image.

  Args:
    image_frame: RGB image array as returned by the environment.

  Returns:
    A 2-D array of shape (84, 84) holding the cropped, resized grayscale frame.
  """
  # Per the scikit-image docs, rgb2gray returns a floating-point luminance
  # image already scaled to [0, 1] for integer input, so no extra division by
  # 255 is applied (doing so would squash every pixel to roughly [0, 0.004]).
  gray = rgb2gray(image_frame)
  # Trim 8 rows from the top, 12 from the bottom, 4 columns from the left and
  # 12 from the right before resizing.
  cropped_frame = gray[8:-12,4:-12]
  preprocessed_frame = resize(cropped_frame, [84,84])
  return preprocessed_frame

# Experience replay

## Stacking frames

In [0]:
# Module-level frame history, filled with blank frames until the first episode starts.
# `np.int` was removed from NumPy (1.24+); a float dtype is used since the
# preprocessed frames stored here are floating-point images.
stacked_frames = deque([np.zeros((84, 84), dtype=np.float64) for _ in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess a raw frame and combine it with recent frames into one state.

    Args:
        stacked_frames: deque holding the most recent preprocessed frames.
        state: raw frame to preprocess and push onto the stack.
        is_new_episode: when True, the history is discarded and the stack is
            filled with `stack_size` copies of the new frame.

    Returns:
        (stacked_state, stacked_frames): the frames stacked along axis 2, and
        the deque to pass to the next call (a new deque when
        `is_new_episode` is True, otherwise the same deque, updated in place).
    """
    frame = frame_preprocessing(state)
    if is_new_episode:
        # No history exists yet, so repeat the first frame to fill the stack.
        stacked_frames = deque([frame for _ in range(stack_size)], maxlen=stack_size)
    else:
        # maxlen makes the deque drop its oldest frame automatically.
        stacked_frames.append(frame)
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames

## Memory class

In [0]:
class Memory():
    """Bounded FIFO store of experiences with uniform random sampling."""

    def __init__(self, max_size):
        # A deque with maxlen evicts its oldest entry once it is full.
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return `batch_size` distinct buffered experiences chosen at random.

        Raises ValueError (from numpy) when `batch_size` exceeds the number
        of stored experiences, because sampling is without replacement.
        """
        picks = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        return [self.buffer[pos] for pos in picks]

## Dealing with empty memory problem

In [0]:
# Pre-fill the replay memory with transitions from random actions so that the
# first training step can already sample a full batch.
memory = Memory(max_size = memory_size)

# Row k of the identity matrix is the one-hot encoding of action k.
possible_actions = np.identity(env.action_space.n, dtype=int)

state = env.reset()
state, stacked_frames = stack_frames(stacked_frames, state, True)

for step in range(pretrain_length):
    # Uniformly random action index in [0, n).
    action = random.randrange(env.action_space.n)
    next_state, reward, done, _ = env.step(action)
    action_one_hot = possible_actions[action]
    if done:
        # Terminal transitions store an all-zero next state.
        next_state = np.zeros(state.shape)
        memory.add((state, action_one_hot, reward, next_state, done))
        # Start a fresh episode and rebuild the frame stack from its first frame.
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    else:
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        memory.add((state, action_one_hot, reward, next_state, done))
        state = next_state

# Q Network

The Q network architecture-
1. Convolutional layer: 16 filters, 8 × 8 kernel, stride 4, ReLU activation
2. Convolutional layer: 32 filters, 4 × 4 kernel, stride 2, ReLU activation
3. Fully connected layer with 256 units and ReLU activation
4. Linear output layer with one unit per action

In [8]:
# The graph is built TF1-style (placeholders + sessions), so eager execution is switched off first.
tf.compat.v1.disable_eager_execution()

# Graph inputs: a batch of stacked-frame states, the target value for each
# sample, and the one-hot encoding of the action taken in each sample.
input_state=tf.compat.v1.placeholder(tf.float32, shape=[None, *state_size],name="input_state")
y_j=tf.compat.v1.placeholder(tf.float32, shape=[None],name="y_j")
action_space=tf.compat.v1.placeholder(tf.float32, shape=[None, action_number],name="action_space")

# Two convolutional layers followed by a dense layer and a linear head that
# emits one value per action.
cnn_layer_1=tf.keras.layers.Conv2D(filters=16,kernel_size=(8,8),strides=(4,4),activation="relu")(input_state)
cnn_layer_2=tf.keras.layers.Conv2D(filters=32,kernel_size=(4,4),strides=(2,2),activation="relu")(cnn_layer_1)
flatten_layer=tf.keras.layers.Flatten()(cnn_layer_2)
fully_connected_layer_1=tf.keras.layers.Dense(256,activation='relu',name="fully_connected_layer_1")(flatten_layer)
output_layer=tf.keras.layers.Dense(action_number,name="output_layer")(fully_connected_layer_1)

# Softmax over the per-action outputs; other cells sample actions from this distribution.
action_output=tf.keras.layers.Softmax(name="action_output")(output_layer)

# Value of the action actually taken, per sample: mask the outputs with the
# one-hot action and sum over the action axis only. Without `axis=1` the sum
# would collapse the whole batch into a single scalar, and the loss would then
# compare every target with that batch-wide sum.
Q_value=tf.math.reduce_sum(tf.math.multiply(output_layer, action_space), axis=1)
loss=tf.math.reduce_mean(tf.math.square(y_j-Q_value))
training=tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(loss)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


# Training Algorithm

In [0]:
# Train the Q network: act with a decaying random-exploration probability,
# store every transition in replay memory, and fit the network on a random
# minibatch after every non-terminal step.
saver = tf.compat.v1.train.Saver()
with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    decay_step = 0
    # Row k of the identity matrix is the one-hot encoding of action k.
    possible_actions = np.identity(env.action_space.n, dtype=int)
    for i in range(total_episodes):
        episode_rewards = []
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        while True:
            # `epsilon` is the random draw compared against the current
            # exploration probability, which decays with the global step count.
            epsilon = random.random()
            decay_step += 1
            explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
            if explore_probability > epsilon:
                # Explore: uniformly random action index.
                action = random.randint(1, env.action_space.n) - 1
            else:
                # Exploit: sample an action from the softmax over the network outputs.
                action_probability = sess.run(action_output, feed_dict={input_state: state.reshape((1, *state.shape))})
                action = np.random.choice(range(action_probability.shape[1]), p=action_probability.ravel())
            next_state, reward, done, _ = env.step(action)
            episode_rewards.append(reward)
            action_one_hot = possible_actions[action]

            if done:
                # Terminal transitions store an all-zero next state; the
                # episode ends here without a training step.
                next_state = np.zeros(state.shape)
                memory.add((state, action_one_hot, reward, next_state, done))
                total_reward = np.sum(episode_rewards)
                print('Episode: {}'.format(i),
                      'Total reward: {}'.format(total_reward),
                      'Explore P: {:.4f}'.format(explore_probability))
                break

            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            memory.add((state, action_one_hot, reward, next_state, done))
            state = next_state

            # Learning step on a random minibatch of stored transitions.
            batch = memory.sample(batch_size)
            states_mb = np.array([each[0] for each in batch], ndmin=3)
            actions_mb = np.array([each[1] for each in batch])
            rewards_mb = np.array([each[2] for each in batch])
            next_states_mb = np.array([each[3] for each in batch], ndmin=3)
            dones_mb = np.array([each[4] for each in batch])

            Qs_next_state = sess.run(output_layer, feed_dict={input_state: next_states_mb})
            # Per-sample target: the reward alone for terminal transitions,
            # otherwise reward + gamma * max over next-state action values.
            # (The earlier loop indexed every sample with the episode counter,
            # so all targets came from one transition.)
            targets_mb = np.where(dones_mb,
                                  rewards_mb,
                                  rewards_mb + gamma * np.max(Qs_next_state, axis=1))
            loss_value, _ = sess.run([loss, training],
                                     feed_dict={input_state: states_mb,
                                                y_j: targets_mb,
                                                action_space: actions_mb})
        # Checkpoint every fifth episode.
        if i % 5 == 0:
            save_path = saver.save(sess, "./models/model.ckpt")
            print("Model Saved")

Episode: 0 Total reward: 3.0 Explore P: 0.9964
Model Saved
Episode: 1 Total reward: 1.0 Explore P: 0.9939
Episode: 2 Total reward: 2.0 Explore P: 0.9909
Episode: 3 Total reward: 1.0 Explore P: 0.9885
Episode: 4 Total reward: 0.0 Explore P: 0.9868


# Testing of model

In [0]:
# Play one episode with the saved model and report its score.
# Uses tf.compat.v1.Session, matching the rest of the notebook; plain
# `tf.Session` does not exist under TensorFlow 2.
with tf.compat.v1.Session() as sess:
    total_test_rewards = []

    # Load the model
    saver.restore(sess, "./models/model.ckpt")

    for episode in range(1):
        total_rewards = 0

        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)

        print("****************************************************")
        print("EPISODE ", episode)

        while True:
            # `Qs` holds the softmax action distribution for the current
            # state; the action is sampled from it rather than taken greedily.
            Qs = sess.run(action_output,feed_dict={input_state:state.reshape((1, *state.shape))})
            action = np.random.choice(range(Qs.shape[1]), p=Qs.ravel())
            next_state, reward, done, _ = env.step(action)
            env.render()
            total_rewards += reward
            if done:
                print ("Score", total_rewards)
                total_test_rewards.append(total_rewards)
                break
            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            state = next_state

    env.close()