In [54]:
!pip install ray



In [55]:
import gym
import random
import os
import numpy as np
import tensorflow as tf
import ray
import time

from collections      import deque
from keras import models
from keras import optimizers
from keras import layers

In [56]:
@ray.remote
class ParameterServer(object):
    def __init__(self,state_size,action_size):
      self.opt = optimizers.Adam(lr=0.001)
      self.state_size = state_size 
      self.action_size = action_size
      self.model = self.build_model()
      #self.model.set_weights(models.load_model("model.h5").get_weights())

    def get_weights(self):
      return self.model.get_weights()
    
    def build_model(self):
        model = models.Sequential()
      #print("Input shape {}".format(self.state_size))
        model.add(layers.Input(shape=self.state_size))
        model.add(layers.MaxPooling2D(pool_size=(2,2),strides=2))
        model.add(layers.Conv2D(32, 8, strides=4, activation="relu"))
        model.add(layers.Conv2D(64,4,strides=2,activation="relu"))
        model.add(layers.Conv2D(64, 3, strides=1, activation="relu"))
        model.add(layers.Flatten())
        model.add(layers.Dense(512, activation="relu"))
        model.add(layers.Dense(self.action_size, activation="linear"))
        model.compile(loss='mse', optimizer=self.opt)
        return model
    
    def apply_gradient(self,grad):
      var_list = self.model.trainable_variables
      self.opt.apply_gradients(zip(grad, var_list))
    
    def save_weights(self, version):
      if (version > 0):
        self.model.save("savedModels/model{:.0f}.h5".format(version))
      else:
        self.model.save("model.h5")

In [57]:
#handle adding to and sampling from a shared replay memory
#only one of these should be created
@ray.remote
class ReplayMemory(object):
    def __init__(self,state_size,action_size):
        #deque is an efficient collection for the necessary operations
        self.replays = deque(maxlen=100000)
        self.state_size = state_size
        self.action_size = action_size
        
    def addMemory(self, state, action, reward, new_state, done):
        self.replays.append([state, action, reward, new_state, done])

    def sampleReplays(self,batch_size):
        #return whatever replays are there if there aren't enough for a full batch
        #in practice this shouldn't ever happen
        if(len(self.replays) < batch_size):
            return np.array(self.replays)
        #return a randomly sampled batch from the replay memory
        return np.array(random.sample(self.replays, batch_size))

In [58]:
@ray.remote(num_gpus=1)
class Learner(object):
    def __init__(self,param_server,replay_memory,state_size,action_size,batch_size=32,gamma=0.95):
        self.param_server = param_server
        self.replay_memory = replay_memory
        self.state_size = state_size
        self.action_size = action_size 
        self.sample_batch_size = batch_size
        self.gamma = gamma
        self.opt = optimizers.Adam(lr=0.001)
        #initialize a local copy of Q network
        self.Q_network = self.build_model()
        #get the weights from parameter server and copy them into local network
        self.Q_network.set_weights(ray.get(self.param_server.get_weights.remote()))
        #initialize target network from Q network
        self.target_network = models.clone_model(self.Q_network)
        self.target_network.set_weights(self.Q_network.get_weights())

    def build_model(self):
        model = models.Sequential()
        model.add(layers.Input(shape=self.state_size))
        model.add(layers.MaxPooling2D(pool_size=(2,2),strides=2))
        model.add(layers.Conv2D(32, 8, strides=4, activation="relu"))
        model.add(layers.Conv2D(64,4,strides=2,activation="relu"))
        model.add(layers.Conv2D(64, 3, strides=1, activation="relu"))
        model.add(layers.Flatten())
        model.add(layers.Dense(512, activation="relu"))
        model.add(layers.Dense(self.action_size, activation="linear"))
        model.compile(loss='mse', optimizer=self.opt)
        return model

    def learn(self, step,num_batches=1):
      for _ in range(num_batches):
        #retrieve a batch of replays from the shared memory 
        mini_batch = ray.get(self.replay_memory.sampleReplays.remote(self.sample_batch_size))
        #on the first iteration we'll need to wait until some replays are generated
        if (len(mini_batch) < self.sample_batch_size):
          return 
        states = np.zeros((self.sample_batch_size,) + self.state_size)
        #print(states.shape)
        next_states = np.zeros((self.sample_batch_size,) + self.state_size)
        actions, rewards, dones = [], [], []
        for i in range(self.sample_batch_size):
                states[i] = mini_batch[i][0]
                actions.append(mini_batch[i][1])
                rewards.append(mini_batch[i][2])
                next_states[i] = mini_batch[i][3]
                dones.append(mini_batch[i][4])
        target =  self.Q_network.predict(states)
        target_val = self.target_network.predict(next_states)
        for i in range(self.sample_batch_size):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][actions[i]] = rewards[i] + 0.99 * (
                        np.amax(target_val[i]))
        
        dqn_variable = self.Q_network.trainable_variables
        with tf.GradientTape() as tape:  
            tape.watch(dqn_variable)
            rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
            actions = tf.convert_to_tensor(actions, dtype=tf.int32)
            dones = tf.convert_to_tensor(dones, dtype=tf.float32)

            target_q = self.target_network(tf.convert_to_tensor(next_states, dtype=tf.float32))
            next_action = tf.argmax(target_q, axis=1)
            target_value = tf.reduce_sum(tf.one_hot(next_action, self.action_size) * target_q, axis=1)
            target_value = (1-dones) * self.gamma * target_value + rewards

            main_q = self.Q_network(tf.convert_to_tensor(states, dtype=tf.float32))
            main_value = tf.reduce_sum(tf.one_hot(actions, self.action_size) * main_q, axis=1)

            loss = tf.math.reduce_mean(tf.square(main_value - target_value))
        grad = tape.gradient(loss, dqn_variable)
        ray.get(self.param_server.apply_gradient.remote(grad))
        #sync weights from parameter server 
        self.Q_network.set_weights(ray.get(self.param_server.get_weights.remote()))
        #sync target network periodically 
        if (step % 10000) == 0:
          self.target_network.set_weights(self.Q_network.get_weights())
          print("Synced Target Network")
        #print("Learning {}".format(step))

In [59]:
@ray.remote 
class Actor(object):
    def __init__(self,param_server,replay_memory,environment):
        self.param_server = param_server 
        self.replay_memory = replay_memory
        self.env = environment 
        self.done = True
        self.total_reward = 0
        self.episodes = 0
        self.state_size = self.env.observation_space.shape
        self.action_size = self.env.action_space.n 
        self.exploration_rate = 1.0
        self.exploration_min = 0.1
        self.exploration_decay = 0.99995
        #make a local copy of the Q-network
        self.Q_network = self.build_model()
        self.Q_network.set_weights(ray.get(self.param_server.get_weights.remote()))

    def build_model(self):
        model = models.Sequential()
        model.add(layers.Input(shape=self.state_size))
        model.add(layers.MaxPooling2D(pool_size=(2,2),strides=2))
        model.add(layers.Conv2D(32, 8, strides=4, activation="relu"))
        model.add(layers.Conv2D(64,4,strides=2,activation="relu"))
        model.add(layers.Conv2D(64, 3, strides=1, activation="relu"))
        model.add(layers.Flatten())
        model.add(layers.Dense(512, activation="relu"))
        model.add(layers.Dense(self.action_size, activation="linear"))
        model.compile()
        return model
  
    def sync_network(self):
        self.Q_network.set_weights(ray.get(self.param_server.get_weights.remote()))
  
    def run_episode(self):
        state = self.env.reset()
        state = np.expand_dims(state, 0)
        total_reward = 0
        done = False 
        while not done:
          if np.random.rand() <= self.exploration_rate:
            action = random.randrange(self.action_size)
          else:
            expected_values = self.Q_network.predict(state)
            action = np.argmax(expected_values[0])

          next_state, reward, done, _ = self.env.step(action)
          next_state = np.expand_dims(next_state, 0)
          total_reward += reward 
          ray.get(self.replay_memory.addMemory.remote(state,action,reward,next_state,done))
          state = next_state 
          if self.exploration_rate > self.exploration_min:
            self.exploration_rate *= self.exploration_decay

        #print("Score: {}".format(total_reward))
        return total_reward 
    def run_step(self):
      if self.done:
        self.episodes += 1
        self.env.close()
        self.state = self.env.reset()
        self.done = False 
        episode_reward = self.total_reward 
        self.total_reward = 0
        return episode_reward 

      state = np.expand_dims(self.state, 0)
      if np.random.rand() <= self.exploration_rate:
        action = random.randrange(self.action_size)
      else:
        expected_values = self.Q_network.predict(state)
        action = np.argmax(expected_values[0])

      next_state, reward, self.done, _ = self.env.step(action)
      #next_state = np.expand_dims(next_state, 0)
      self.total_reward += reward 
      ray.get(self.replay_memory.addMemory.remote(state,action,reward,next_state,self.done))
      self.state = next_state 

      if self.exploration_rate > self.exploration_min:
        self.exploration_rate *= self.exploration_decay
      # wait until episode ends to return total score 
      #negative result is ignored by caller 
      return -1
    
    def get_average(self):
      if self.episodes == 0:
        return 0
      return (self.total_reward / self.episodes)

In [60]:
class Agent(object):
    def __init__(self,env_name,num_actors,num_learners):
        self.start_time = time.time()
        self.env = gym.make(env_name)
        self.state_size = self.env.observation_space.shape
        self.action_size = self.env.action_space.n 
        self.parameter_server = ParameterServer.remote(self.state_size, self.action_size)
        self.replay_memory = ReplayMemory.remote(self.state_size, self.action_size)
        self.actors = [Actor.remote(self.parameter_server, self.replay_memory, gym.make(env_name)) for _ in range(num_actors)]
        self.learners = [Learner.remote(self.parameter_server, self.replay_memory, self.state_size, self.action_size) for _ in range(num_learners)]

    def run(self):
        counter = 0
        high_scores = np.zeros(10)
        last_ten = deque(maxlen=10)

        while counter < 1000000:
          if (counter % 100 == 0):
            print("Global step {}".format(counter))
            #print("Average score {}".format(np.average(ray.get([actor.get_average.remote() for actor in self.actors]))))
            print("High scores {}".format(high_scores))
            print("Last ten {}".format(last_ten))
            if (counter % 1000 == 0):
              print("Running for {:.1f} minutes".format((time.time() - self.start_time) / 60 ))
          ray.get([actor.sync_network.remote() for actor in self.actors])
          scores = ray.get([actor.run_step.remote() for actor in self.actors])
          ray.get([learner.learn.remote(counter) for learner in self.learners])
          for score in scores:
            # negative score is used as special value 
            if score >= 0:
              last_ten.append(score)
            # add high score to the bottom of the list 
            if score > high_scores[0]: 
              high_scores[0] = score 
            # bubble up if necessary 
            i = 0
            j = 1
            while (j < len(high_scores)) and (high_scores[i] > high_scores[j]):
              temp = high_scores[j]
              high_scores[j] = high_scores[i] 
              high_scores[i] = temp 
              i += 1
              j += 1
  
          #print("Average reward {}".format(avg_score))
          #if avg_score > max_score:
            #max_score = avg_score
          if (counter % 5000) == 0:
            self.save_weights(counter / 5000)
          counter += 1

    def save_weights(self, suffix):
      ray.get(self.parameter_server.save_weights.remote(suffix))

In [61]:
ray.shutdown()
ray.init(ignore_reinit_error=True)

2021-04-14 22:03:55,119	INFO services.py:1174 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'metrics_export_port': 60776,
 'node_id': '911885bb3e3fe8b6e01a6149e241ea5b4d4f78dc76c18c2b2d7c07a9',
 'node_ip_address': '172.28.0.2',
 'object_store_address': '/tmp/ray/session_2021-04-14_22-03-54_720633_74/sockets/plasma_store',
 'raylet_ip_address': '172.28.0.2',
 'raylet_socket_name': '/tmp/ray/session_2021-04-14_22-03-54_720633_74/sockets/raylet',
 'redis_address': '172.28.0.2:6379',
 'session_dir': '/tmp/ray/session_2021-04-14_22-03-54_720633_74',
 'webui_url': '127.0.0.1:8265'}

In [None]:
agent = Agent("Breakout-v0",2,1)
agent.run()

[2m[36m(pid=2110)[0m 2021-04-14 22:03:57.245362: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2110)[0m 2021-04-14 22:03:57.245362: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2110)[0m 2021-04-14 22:03:57.245362: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2110)[0m 2021-04-14 22:03:57.245362: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2110)[0m 2021-04-14 22:03:57.245362: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2110)[0m 2021-04-14 22:03:57.245362: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic librar

Global step 0
High scores [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Last ten deque([], maxlen=10)
Running for 0.0 minutes


[2m[36m(pid=2108)[0m 2021-04-14 22:03:57.730441: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2108)[0m 2021-04-14 22:03:57.730441: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2108)[0m 2021-04-14 22:03:57.730441: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2108)[0m 2021-04-14 22:03:57.730441: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2108)[0m 2021-04-14 22:03:57.730441: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=2108)[0m 2021-04-14 22:03:57.730441: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic librar

Global step 100
High scores [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Last ten deque([0, 0], maxlen=10)


[2m[36m(pid=2111)[0m 2021-04-14 22:04:39.587870: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[2m[36m(pid=2111)[0m 2021-04-14 22:04:39.588247: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2199995000 Hz
[2m[36m(pid=2111)[0m 2021-04-14 22:04:39.587870: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[2m[36m(pid=2111)[0m 2021-04-14 22:04:39.588247: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2199995000 Hz
[2m[36m(pid=2111)[0m 2021-04-14 22:04:39.587870: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[2m[36m(pid=2111)[0m 2021-04-14 22:04:39.588247: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2199995000 Hz
[2m[36m(pid=2111)[0m 2021-04-14 22:04

Global step 200
High scores [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Last ten deque([0, 0], maxlen=10)
Global step 300
High scores [0. 0. 0. 0. 0. 0. 0. 0. 2. 2.]
Last ten deque([0, 0, 2.0, 2.0], maxlen=10)
Global step 400
High scores [0. 0. 0. 0. 0. 0. 0. 0. 2. 2.]
Last ten deque([0, 0, 2.0, 2.0], maxlen=10)
Global step 500
High scores [0. 0. 0. 0. 0. 0. 0. 0. 2. 2.]
Last ten deque([0, 0, 2.0, 2.0, 0.0, 0.0], maxlen=10)
Global step 600
High scores [0. 0. 0. 0. 0. 0. 0. 0. 2. 2.]
Last ten deque([0, 0, 2.0, 2.0, 0.0, 0.0], maxlen=10)
Global step 700
High scores [0. 0. 0. 0. 0. 0. 1. 1. 2. 2.]
Last ten deque([0, 0, 2.0, 2.0, 0.0, 0.0, 1.0, 1.0], maxlen=10)
Global step 800
High scores [0. 0. 0. 0. 0. 0. 1. 1. 2. 2.]
Last ten deque([0, 0, 2.0, 2.0, 0.0, 0.0, 1.0, 1.0], maxlen=10)
Global step 900
High scores [0. 0. 0. 0. 0. 0. 1. 1. 2. 2.]
Last ten deque([0, 0, 2.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0], maxlen=10)
Global step 1000
High scores [0. 0. 0. 0. 0. 1. 1. 2. 2. 3.]
Last ten deque([0, 0, 2.0, 2.0, 

In [None]:
agent.save_weights(-1)