In [0]:
## Google Colab Setup
## Get Google SDK Key
# Installs google-drive-ocamlfuse from the alessandro-strada PPA and authorises
# it with the credentials of the signed-in Colab user.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE(review): `2>&1 > /dev/null` discards stdout only; stderr is first
# duplicated onto the original stdout and therefore still shows up in the cell
# output. If both streams were meant to be silenced the order would have to be
# `> /dev/null 2>&1` -- confirm the intent.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run headless with stdin closed and show only the output lines
# that contain "URL" (the link to open in a browser).
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it into the notebook.
vcode = getpass.getpass()
# Second pass: pipe the verification code into google-drive-ocamlfuse.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
## Mount Google Drive
# Create the mount point (-p: no error if it already exists), then mount onto it.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [4]:
## Set up Current DIR, SYS PATH
import os
os.chdir('drive/Colab_Notebooks/')
!pwd
import sys
sys.path.append('/content/drive/Colab_Notebooks/')
print sys.path

## Colab setup done

/content/drive/Colab_Notebooks
['', '/env/python', '/usr/lib/python2.7', '/usr/lib/python2.7/plat-x86_64-linux-gnu', '/usr/lib/python2.7/lib-tk', '/usr/lib/python2.7/lib-old', '/usr/lib/python2.7/lib-dynload', '/usr/local/lib/python2.7/dist-packages', '/usr/lib/python2.7/dist-packages', '/usr/local/lib/python2.7/dist-packages/IPython/extensions', '/content/.ipython', '/content/drive/Colab_Notebooks/']


In [6]:
## Here we use Q-learning to play the Gym Atari Space Invaders game.
## At each time step we take one frame (1 out of 4 consecutive frames) as input.
## The Q-learning algorithm is implemented with a deep convolutional network: 3 convolutional layers (no pooling)
## followed by a fully connected layer.
## Set training=False to test the learnt model

# import dependencies
import numpy as np
import random
import time
import tensorflow as tf
from skimage import color
from skimage.transform import resize
import matplotlib.pyplot as plt
from collections import deque
import warnings
try:
    import gym
except:
    !pip install gym
    import gym
try:
    import atari_py
except:
    !pip install cmake
    !pip install gym[atari]
    import atari_py

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

## Initialize environment
env = gym.make('SpaceInvaders-v0')
env.reset()

############ Tuning parameters ############

training = True       # when True the training section below is executed
num_episodes = 6      # number of training episodes
num_steps = 50000     # upper bound on steps within one training episode
batch_size = 64       # experiences sampled from memory per learning step

# large epsilon->exploration, small epsilon->exploitation
# During training epsilon is recomputed every step as
#   epsilon_min + (epsilon_max - epsilon_min) * exp(-decay_rate * decay_step)
epsilon = 1.0
epsilon_max = 1.0 # starting value of the decay
epsilon_min = 0.01    # value epsilon decays towards
decay_rate = 0.00001  # exponential decay rate per training step

stack_len = 4         # number of preprocessed frames stacked into one state
state_size = [110, 84, stack_len] # downsized for lower computation time
action_size = env.action_space.n  # number of discrete actions of the environment
learning_rate = 0.00025           # passed to the Adam optimizer
gamma = 0.9                       # discount applied to the next state's max Q-value

# NOTE(review): every stored experience holds two 110x84xstack_len arrays, so a
# completely filled buffer of this size would be very large -- confirm it fits
# in RAM before training for many steps.
memory_size = 1000000
# Number of random transitions stored before training starts; equal to
# batch_size so that the first mini-batch can be sampled without replacement.
pretrain_len = batch_size
possible_actions = np.identity(action_size) # convert actions to one-hot vectors

############ Frame preprocessing ############

def preprocess_frame(frame, output_shape=(110, 84)):
    """Convert a raw RGB frame into a downsized grayscale image in [0, 1].

    Args:
        frame: RGB frame as returned by the environment (H x W x 3).
        output_shape: (height, width) of the returned image; defaults to the
            110 x 84 size used for the network input.

    Returns:
        2-D float array of shape `output_shape`.
    """
    # rgb2gray converts integer images to float and already scales them to
    # [0, 1]; dividing by 255 a second time squeezed every pixel into
    # [0, 1/255], so no further normalisation is applied here.
    frame_gray = color.rgb2gray(frame)
    return resize(frame_gray, output_shape)

############ Stack frames in a deque of length 4############

# Rolling window with the most recent `stack_len` preprocessed frames.
# The built-in `int` replaces the `np.int` alias (same dtype; the alias was
# removed in NumPy 1.24) and the window length follows `stack_len` instead of
# a hard-coded 4.
stacked_frames = deque([np.zeros((110,84), dtype=int) for i in range(stack_len)], maxlen=stack_len)

def update_stack(frame, stacked_frames, episode_start):
    """Preprocess `frame`, push it onto the frame window and build the state.

    Args:
        frame: raw frame from the environment.
        stacked_frames: deque with the most recent preprocessed frames.
        episode_start: when true, a fresh window filled with copies of this
            frame is created instead of appending to `stacked_frames`.

    Returns:
        (stack, stacked_frames): `stack` is the window stacked along the last
        axis (shape [110, 84, stack_len], the input to the network) and
        `stacked_frames` is the window to pass to the next call.
    """
    frame = preprocess_frame(frame)

    if episode_start:
        # No history yet: fill the whole window with the first frame. Sizing
        # by stack_len keeps this correct if the stack length is changed.
        stacked_frames = deque([frame] * stack_len, maxlen=stack_len)
    else:
        stacked_frames.append(frame)  # add frame t; the oldest frame drops out

    # Stack: input to network (shape: [110, 84, stack_len])
    return np.stack(stacked_frames, axis=2), stacked_frames

############  Memory class ############

class Memory():
    """Bounded experience buffer; once full, the oldest entry is dropped."""

    def __init__(self, max_size):
        # deque(maxlen=...) discards from the left when capacity is exceeded
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append a single experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return `batch_size` distinct experiences chosen at random."""
        candidates = np.arange(len(self.buffer))
        picked = np.random.choice(candidates, size=batch_size, replace=False)
        return [self.buffer[position] for position in picked]

memory = Memory(memory_size)

############ Initialize memory ############

# Pre-fill the replay buffer with `pretrain_len` random-policy transitions so
# that the first learning step can sample a full mini-batch.
frame = env.reset()
stack, stacked_frames = update_stack(frame, stacked_frames, True)

for i in range(pretrain_len):
    action_index = env.action_space.sample()
    next_frame, reward, done, info = env.step(action_index)
    action = possible_actions[action_index]  # one-hot encoding of the action

    if done: # if game is finished
        # Terminal transition: there is no next state, so zeros are stored and
        # the terminal flag is kept at 1. (It used to be reset to False before
        # storing, so the learner never saw a terminal transition.)
        next_stack = np.zeros(stack.shape)
        memory.add((stack, action, reward, next_stack, 1))
        frame = env.reset()
        stack, stacked_frames = update_stack(frame, stacked_frames, True)
    else:
        next_stack, stacked_frames = update_stack(next_frame, stacked_frames, False)
        memory.add((stack, action, reward, next_stack, 0))
        stack = next_stack

############ Build our Network ############

class DeepQNetwork:
    """Convolutional Q-network.

    Architecture: three stride-2 convolutions with ELU activation (no pooling),
    a 512-unit dense layer with ELU, and a linear output layer with one
    Q-value per action.

    Attributes used by the training / testing code:
        x:              state placeholder  [batch, height, width, channels]
        actions:        one-hot action placeholder [batch, action_size]
        target_Q:       target placeholder [batch]
        y_:             predicted Q-values for every action [batch, action_size]
        pred_Q:         predicted Q-value of the action taken [batch]
        squared_loss:   mean squared error between target_Q and pred_Q
        adam_optimizer: training op minimising squared_loss
    """

    def __init__(self, state_size, action_size, learning_rate, name='DQN'):
        """Build the graph inside variable scope `name`.

        Args:
            state_size: [height, width, channels] of one input state.
            action_size: number of discrete actions.
            learning_rate: learning rate of the Adam optimizer.
            name: variable scope that holds all variables of the network.
        """
        self.state_size  = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # Input/Output placeholders
            self.x  = tf.placeholder(tf.float32, [None, state_size[0], state_size[1], state_size[2]], name='input') # state input: batches x h x w x 4
            self.actions = tf.placeholder(tf.float32, [None, self.action_size], name='actions') # action input: batches x action_size
            self.target_Q = tf.placeholder(tf.float32, [None], name='target_Q')

            # 3 convolution layers
            self.layer1 = self.conv_layer(self.x, self.state_size[2], 32, [8,8], name='layer1')
            self.layer2 = self.conv_layer(self.layer1, 32, 64, [4,4], name='layer2')
            self.layer3 = self.conv_layer(self.layer2, 64, 64, [3,3], name='layer3')

            # Flattening
            self.flattened = tf.contrib.layers.flatten(self.layer3)

            self.layer4 = tf.layers.dense(inputs = self.flattened, units=512, activation=tf.nn.elu, kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                    name="layer4")

            # Linear output: one Q-value per action
            self.y_ = tf.layers.dense(inputs = self.layer4, kernel_initializer=tf.contrib.layers.xavier_initializer(), units=self.action_size,
                                           activation=None)

            # Q-value of the action actually taken, one value per sample.
            # axis=1 is required: without it the sum also runs over the batch
            # axis and yields a single scalar, so every target would be
            # compared against the sum of all Q-values in the mini-batch.
            self.pred_Q = tf.reduce_sum(tf.multiply(self.y_, self.actions), axis=1)

            # Loss function: mean squared error over the mini-batch
            self.squared_loss = tf.reduce_mean(tf.square(self.target_Q - self.pred_Q))

            # Optimizer
            self.adam_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.squared_loss)

    ## Template for a convolution layer (stride 2, VALID padding, ELU; no pooling)
    def conv_layer(self, input_data, num_input_channels, num_filters, conv_filter_shape, name):
        """Apply one convolution followed by ELU.

        Args:
            input_data: input tensor [batch, height, width, num_input_channels].
            num_input_channels: number of channels of `input_data`.
            num_filters: number of output channels.
            conv_filter_shape: [filter_height, filter_width].
            name: prefix of the kernel variable (`<name>_kernel`).

        Returns:
            The activated convolution output tensor.
        """
        conv_filter_size = [conv_filter_shape[0], conv_filter_shape[1], num_input_channels, num_filters]

        # Conv layer
        kernel = tf.get_variable(
               initializer=None,#tf.contrib.layers.xavier_initializer_conv2d(),
               shape=conv_filter_size,
               name=name+'_kernel')
        conv_output  = tf.nn.conv2d(input_data, kernel, [1,2,2,1], padding='VALID')
        conv_output  = tf.nn.elu(conv_output)

        return conv_output

# Start from an empty default graph so re-running the cell does not collide
# with variables created by a previous run.
tf.reset_default_graph()
# NOTE: this rebinds the name `DeepQNetwork` from the class to its instance;
# the training and testing code access `DeepQNetwork.x`, `DeepQNetwork.y_`,
# etc. on this instance.
DeepQNetwork = DeepQNetwork(state_size, action_size, learning_rate) # instantiate class

# TensorBoard Setup
# Event files are written to ./tensorboard/dqn/2
writer = tf.summary.FileWriter("./tensorboard/dqn/2")
#writer.add_graph(sess.graph)
tf.summary.scalar("Loss", DeepQNetwork.squared_loss)
# Single op that evaluates every registered summary (here: the loss scalar)
write_op = tf.summary.merge_all()  

############ Training ############

saver = tf.train.Saver()
if training:

    with tf.Session() as sess:
        # Initialize variables
        sess.run(tf.global_variables_initializer())
        decay_step = 0      # total number of training steps across all episodes
        reward_list = []    # [episode, episode_reward] for every finished episode

        for episode in range(num_episodes):

            # Get initial state (->rgb frame) and start a fresh frame stack
            frame = env.reset()
            stack, stacked_frames = update_stack(frame, stacked_frames, True)
            episode_reward = 0

            for step in range(1, num_steps + 1):

                ## Choose action using epsilon greedy strategy
                random_number = random.uniform(0,1)
                decay_step += 1
                epsilon = epsilon_min + (epsilon_max - epsilon_min) * np.exp(-decay_rate * decay_step)
                if random_number < epsilon:
                    action_index = env.action_space.sample()
                else:
                    # add a leading batch axis: [1, h, w, stack_len]
                    q_values = sess.run(DeepQNetwork.y_, feed_dict={DeepQNetwork.x: stack[np.newaxis, ...]})
                    action_index = np.argmax(q_values)

                ## Perform the action
                next_frame, reward, done, info = env.step(action_index) # next_frame: HxWx3
                action = possible_actions[action_index]  # one-hot encoding

                # Rewards
                episode_reward += reward

                # Append to memory
                if done:
                    # Terminal transition: no next state exists, so zeros are
                    # stored together with terminal flag 1. The flag must stay
                    # set, otherwise the target below bootstraps from a state
                    # that does not exist.
                    next_stack = np.zeros(stack.shape)
                    memory.add((stack, action, reward, next_stack, 1))
                else:
                    next_stack, stacked_frames = update_stack(next_frame, stacked_frames, False) # stack: HxWx4
                    memory.add((stack, action, reward, next_stack, 0))
                    stack = next_stack

                ## Learning the network weights
                mini_batch       = memory.sample(batch_size)
                stack_batch      = np.array([experience[0] for experience in mini_batch])
                action_batch     = np.array([experience[1] for experience in mini_batch])
                reward_batch     = np.array([experience[2] for experience in mini_batch], dtype=np.float64)
                next_stack_batch = np.array([experience[3] for experience in mini_batch])
                done_batch       = np.array([experience[4] for experience in mini_batch], dtype=np.float64)

                # Find max Q values from all actions for next state in a mini batch
                next_Q_batch = sess.run(DeepQNetwork.y_, feed_dict={DeepQNetwork.x: next_stack_batch})

                # target = r                       for terminal transitions
                #        = r + gamma * max_a Q(s') otherwise
                target_Q = reward_batch + gamma * np.max(next_Q_batch, axis=1) * (1.0 - done_batch)

                # One run yields the loss, its summary and the weight update,
                # instead of a second forward pass just for the summary.
                squared_loss, summary, _ = sess.run(
                    [DeepQNetwork.squared_loss, write_op, DeepQNetwork.adam_optimizer],
                    feed_dict={DeepQNetwork.x: stack_batch,
                               DeepQNetwork.target_Q: target_Q,
                               DeepQNetwork.actions: action_batch})

                # Write TF Summaries against the global step so points within
                # one episode do not all share the same x value
                writer.add_summary(summary, decay_step)

                if done:
                    reward_list.append([episode, episode_reward])
                    # Reported after the update so the loss belongs to this
                    # step and is always defined.
                    print('Episode: {}'.format(episode),
                          'Total reward: {}'.format(episode_reward),
                          'epsilon: {:.4f}'.format(epsilon),
                          'Training Loss {:.4f}'.format(squared_loss))
                    break

            writer.flush()

            # Save model every 5 episodes (inside the episode loop; it used to
            # run only once after the last episode)
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")


('Episode: 0', 'Total reward: 30.0', 'epsilon: 0.9959', 'Training Loss 1.5570')
Model Saved
('Episode: 1', 'Total reward: 155.0', 'epsilon: 0.9896', 'Training Loss 3.8456')
('Episode: 2', 'Total reward: 155.0', 'epsilon: 0.9832', 'Training Loss 0.3932')
('Episode: 3', 'Total reward: 180.0', 'epsilon: 0.9770', 'Training Loss 15.8457')
('Episode: 4', 'Total reward: 105.0', 'epsilon: 0.9707', 'Training Loss 3.4705')
('Episode: 5', 'Total reward: 75.0', 'epsilon: 0.9669', 'Training Loss 0.0494')
Model Saved


In [5]:
## Test the model
## After testing for 10 episodes our model can play game quite reasonably achieving average score of 203

############ Testing ############

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    with tf.Session() as sess:
        total_test_rewards = []  # final score of every test episode

        # Load the model
        saver.restore(sess, "./models/model.ckpt")

        for episode in range(10):
            total_rewards = 0

            frame = env.reset()
            stack, stacked_frames = update_stack(frame, stacked_frames, True)

            print("EPISODE ", episode)
            while True:

                # Add the batch axis and estimate the Q-values of this state
                state = stack.reshape((1, state_size[0], state_size[1], state_size[2]))
                Q_value = sess.run(DeepQNetwork.y_, feed_dict = {DeepQNetwork.x: state})

                # Take the biggest Q value (= the best action); no exploration
                action = np.argmax(Q_value)

                #Perform the action and get the next_state, reward, and done information
                next_frame, reward, done, info = env.step(action)
                #env.render()

                total_rewards += reward

                if done:
                    print ("Score", total_rewards)
                    total_test_rewards.append(total_rewards)
                    break

                stack, stacked_frames = update_stack(next_frame, stacked_frames, False)

        # Average score over all test episodes (parenthesised print works on
        # both Python 2 and Python 3)
        print(np.mean(total_test_rewards))
        env.close()

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
('EPISODE ', 0)
('Score', 130.0)
('EPISODE ', 1)
('Score', 65.0)
('EPISODE ', 2)
('Score', 155.0)
('EPISODE ', 3)
('Score', 70.0)
('EPISODE ', 4)
('Score', 275.0)
('EPISODE ', 5)
('Score', 270.0)
('EPISODE ', 6)
('Score', 400.0)
('EPISODE ', 7)
('Score', 35.0)
('EPISODE ', 8)
('Score', 585.0)
('EPISODE ', 9)
('Score', 45.0)
203.0


In [0]:
## Launch Tensorboard
!tensorboard --logdir=./tensorboard/dqn/1

In [0]:
# Check GPU RAM allocation
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isnâ€™t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()