In [None]:
!pip install ray
!pip install baselines

In [None]:
import gym
import random
import os
import numpy as np
import tensorflow as tf
import ray
import time

from collections      import deque
from keras import models
from keras import optimizers
from keras import layers
from baselines.common.atari_wrappers import make_atari, wrap_deepmind 

In [None]:
class ParameterServer(object):
    """Owns the master Q-network, its target copy and the optimizer.

    Gradients handed to ``apply_gradients`` are applied to ``self.model``
    with ``self.opt``; ``self.target_model`` only changes when
    ``sync_target`` is called.
    """

    def __init__(self, state_size, action_size):
        """Build the online and target networks.

        Args:
            state_size: input shape passed to ``build_model``.
            action_size: number of outputs of the final dense layer.
        """
        self.opt = optimizers.Adam(learning_rate=0.01)
        self.state_size = state_size
        self.action_size = action_size
        self.model = ParameterServer.build_model(state_size, action_size)
        # clone_model copies the architecture only; copy the weights as well
        # so the target network starts identical to the online network.
        self.target_model = models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

    @staticmethod
    def build_model(state_size, action_size):
        """Create and compile the convolutional Q-network.

        The network is three Conv2D layers, a Flatten, a 512-unit dense layer
        and a linear output layer with ``action_size`` units.

        Note that the optimizer passed to ``compile`` here is separate from
        ``self.opt``, which is the one used by ``apply_gradients``.

        Args:
            state_size: input shape for the ``InputLayer``.
            action_size: number of units in the output layer.

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = models.Sequential()
        model.add(layers.InputLayer(input_shape=state_size))
        model.add(layers.Conv2D(32, 8, strides=4, activation="relu"))
        model.add(layers.Conv2D(64, 4, strides=2, activation="relu"))
        model.add(layers.Conv2D(64, 3, strides=1, activation="relu"))
        model.add(layers.Flatten())
        model.add(layers.Dense(512, activation="relu"))
        model.add(layers.Dense(action_size, activation="linear"))
        model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=0.001))
        return model

    def apply_gradients(self, gradients):
        """Apply one gradient per trainable variable of the online network.

        Args:
            gradients: iterable of gradients, ordered like
                ``self.model.trainable_variables``.
        """
        var_list = self.model.trainable_variables
        self.opt.apply_gradients(zip(gradients, var_list))

    def apply_gradient(self, gradient):
        """Alias of ``apply_gradients`` kept for existing callers."""
        self.apply_gradients(gradient)

    def sync_target(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_weights(self):
        """Return ``(online_weights, target_weights)`` as a tuple."""
        return (self.model.get_weights(), self.target_model.get_weights())

    def save_weights(self, version):
        """Save the online model under ``breakout/``.

        Despite the name this calls ``model.save`` (not ``save_weights``).

        Args:
            version: number used in the file name. Values greater than zero
                are saved as ``model<version>.h5`` (formatted without
                decimals); anything else is saved as ``model.h5``.
        """
        # The output directory is not guaranteed to exist on a fresh run.
        os.makedirs("breakout", exist_ok=True)
        if version > 0:
            self.model.save("breakout/model{:.0f}.h5".format(version))
        else:
            self.model.save("breakout/model.h5")

In [None]:
# Shared replay memory: handles adding transitions and sampling batches.
# A single instance is created by the driver (see Agent.__init__).
class ReplayMemory(object):
    """Bounded FIFO buffer of transitions with uniform random sampling."""

    def __init__(self, capacity=100000):
        """Create an empty buffer.

        Args:
            capacity: maximum number of stored transitions; once full, the
                oldest entry is dropped for every new one appended.
        """
        # deque(maxlen=...) gives cheap appends and automatic eviction.
        self.replays = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.replays)

    def add_memory(self, replay):
        """Append one transition to the buffer."""
        self.replays.append(replay)

    def sample_replays(self, batch_size):
        """Return ``batch_size`` distinct stored transitions as a NumPy array.

        Raises:
            ValueError: from ``random.sample`` when ``batch_size`` exceeds the
                number of stored transitions.
        """
        return np.array(random.sample(self.replays, batch_size))

In [None]:
@ray.remote
class Learner(object):
    """Ray actor that turns a mini-batch of transitions into DQN gradients.

    It keeps local copies of the online and target networks, refreshed via
    ``sync_weights``. The gradients are returned to the caller rather than
    applied here.
    """

    def __init__(self, state_size, action_size, batch_size=32, gamma=0.99):
        """Build local copies of the online and target networks.

        Args:
            state_size: shape of a single state (tuple).
            action_size: number of discrete actions.
            batch_size: number of transitions consumed per ``learn`` call.
            gamma: discount factor applied to the bootstrapped target value.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.sample_batch_size = batch_size
        self.gamma = gamma
        self.Q_network = ParameterServer.build_model(state_size, action_size)
        self.target_network = ParameterServer.build_model(state_size, action_size)

    def sync_weights(self, Q_weights, target_weights):
        """Overwrite the local online and target weights."""
        self.Q_network.set_weights(Q_weights)
        self.target_network.set_weights(target_weights)

    def learn(self, mini_batch):
        """Compute gradients of the mean squared TD error for one batch.

        Each entry of ``mini_batch`` is indexed as
        ``(state, action, reward, next_state, done)``. Only the first
        ``self.sample_batch_size`` entries are used.

        The regression target is
        ``reward + gamma * (1 - done) * max_a Q_target(next_state, a)``.

        Args:
            mini_batch: indexable collection of transitions.

        Returns:
            Gradients of the loss with respect to
            ``self.Q_network.trainable_variables``, in that order.
        """
        batch = self.sample_batch_size
        states = np.zeros((batch,) + self.state_size)
        next_states = np.zeros((batch,) + self.state_size)
        actions, rewards, dones = [], [], []
        for i in range(batch):
            transition = mini_batch[i]
            states[i] = transition[0]
            actions.append(transition[1])
            rewards.append(transition[2])
            next_states[i] = transition[3]
            dones.append(transition[4])

        # Cast through NumPy first so boolean done flags become 0.0 / 1.0
        # without relying on TensorFlow's implicit conversion rules.
        rewards = tf.convert_to_tensor(np.asarray(rewards, dtype=np.float32))
        actions = tf.convert_to_tensor(np.asarray(actions, dtype=np.int32))
        dones = tf.convert_to_tensor(np.asarray(dones, dtype=np.float32))

        # The target does not depend on the online weights, so compute it
        # outside the tape to avoid recording the target network's forward pass.
        target_q = self.target_network(tf.convert_to_tensor(next_states, dtype=tf.float32))
        next_action = tf.argmax(target_q, axis=1)
        target_value = tf.reduce_sum(tf.one_hot(next_action, self.action_size) * target_q, axis=1)
        target_value = (1 - dones) * self.gamma * target_value + rewards

        dqn_variable = self.Q_network.trainable_variables
        with tf.GradientTape() as tape:
            tape.watch(dqn_variable)
            main_q = self.Q_network(tf.convert_to_tensor(states, dtype=tf.float32))
            # Select the Q-value of the action that was actually taken.
            main_value = tf.reduce_sum(tf.one_hot(actions, self.action_size) * main_q, axis=1)
            loss = tf.math.reduce_mean(tf.square(main_value - target_value))
        return tape.gradient(loss, dqn_variable)

In [None]:
@ray.remote
class Actor(object):
    """Ray actor that steps one environment with an epsilon-greedy policy."""

    def __init__(self, env_name, state_size, action_size):
        """Create the environment and a local copy of the Q-network.

        Args:
            env_name: environment id passed to ``Agent.make_env``.
            state_size: shape of a single state, used to build the network.
            action_size: number of discrete actions.
        """
        self.env = Agent.make_env(env_name)
        self.state = self.env.reset()
        self.Q_network = ParameterServer.build_model(state_size, action_size)
        self.done = False
        self.total_reward = 0
        self.state_size = state_size
        self.action_size = action_size
        # Epsilon starts at 1.0 and is multiplied by the decay factor after
        # every step while it is still above the minimum.
        self.exploration_rate = 1.0
        self.exploration_min = 0.1
        self.exploration_decay = 0.999

    def sync_weights(self, Q_weights):
        """Overwrite the local Q-network weights."""
        self.Q_network.set_weights(Q_weights)

    def run_step(self):
        """Advance the environment by one step.

        If the previous step ended the episode, the environment is reset and
        the episode reward restarted before acting.

        Returns:
            A length-6 object array
            ``[state, action, reward, next_state, done, total_reward]``.
            ``state`` carries a leading batch axis of size 1 (it is the array
            fed to the network); ``next_state`` is stored as returned by
            ``env.step``.
        """
        if self.done:
            # Reset only: the environment is reused for the next episode, so
            # it must not be closed here.
            self.state = self.env.reset()
            self.done = False
            self.total_reward = 0

        state = np.expand_dims(self.state, 0)

        if np.random.rand() <= self.exploration_rate:
            action = random.randrange(self.action_size)
        else:
            expected_values = self.Q_network.predict(state)
            action = np.argmax(expected_values[0])

        next_state, reward, self.done, _ = self.env.step(action)
        self.total_reward += reward
        self.state = next_state

        if self.exploration_rate > self.exploration_min:
            self.exploration_rate *= self.exploration_decay

        # Fill an object array element by element: np.array() on this ragged
        # mix of arrays and scalars raises ValueError on recent NumPy versions.
        fields = (state, action, reward, next_state, self.done, self.total_reward)
        transition = np.empty(len(fields), dtype=object)
        for index, value in enumerate(fields):
            transition[index] = value
        return transition

In [None]:
class Agent(object):
    """Driver that wires actors, learners, replay memory and parameter server."""

    def __init__(self, env_name, num_actors, num_learners):
        """Create the shared components and the remote workers.

        Args:
            env_name: environment id passed to ``make_env``.
            num_actors: number of remote ``Actor`` instances to start.
            num_learners: number of remote ``Learner`` instances to start.
        """
        self.env = Agent.make_env(env_name)
        # NOTE(review): assumed to match the observation shape produced by
        # make_env (frame_stack=True) - confirm if the wrappers change.
        self.state_size = (84, 84, 4,)
        self.action_size = self.env.action_space.n

        self.parameter_server = ParameterServer(self.state_size, self.action_size)

        self.replay_memory = ReplayMemory()
        self.actors = [Actor.remote(env_name, self.state_size, self.action_size)
                       for _ in range(num_actors)]
        self.learners = [Learner.remote(self.state_size, self.action_size)
                         for _ in range(num_learners)]

    @staticmethod
    def make_env(env_name):
        """Build the Atari environment with frame stacking and scaling enabled."""
        return wrap_deepmind(make_atari(env_name), frame_stack=True, scale=True)

    def _store_transitions(self, pending, recent_scores):
        """Wait for actor results, store them and record finished episodes."""
        for ref in pending:
            vals = ray.get(ref)
            # Fields 0-4 form the transition; field 5 is the episode reward.
            self.replay_memory.add_memory(np.copy(vals[:5]))
            if vals[4]:
                recent_scores.append(np.copy(vals[5]))

    def run(self, num_steps=1000000, warmup_steps=100):
        """Run the warm-up phase followed by the training loop.

        Args:
            num_steps: number of training iterations.
            warmup_steps: actor-only iterations used to fill the replay
                memory before any learning happens.
        """
        start_time = time.time()
        recent_scores = deque(maxlen=100)

        # Run a little bit to create initial replays before learning.
        for _ in range(warmup_steps):
            self._store_transitions(
                [actor.run_step.remote() for actor in self.actors], recent_scores)

        for counter in range(num_steps):
            # Launch actors and learners together so they overlap.
            replays = [actor.run_step.remote() for actor in self.actors]
            gradients = [learner.learn.remote(self.replay_memory.sample_replays(32))
                         for learner in self.learners]

            self._store_transitions(replays, recent_scores)

            for g in gradients:
                self.parameter_server.apply_gradient(ray.get(g))

            # Refresh the target before pushing weights so workers never
            # receive a target that is about to be replaced.
            if counter % 5000 == 0:
                self.parameter_server.sync_target()

            if counter % 100 == 0:
                q_weights, target_weights = self.parameter_server.get_weights()
                ray.get([actor.sync_weights.remote(q_weights)
                         for actor in self.actors])
                ray.get([learner.sync_weights.remote(q_weights, target_weights)
                         for learner in self.learners])

            if counter % 500 == 0:
                # No finished episode yet: report nan without the NumPy
                # empty-mean warning.
                running_average = np.mean(recent_scores) if recent_scores else float("nan")
                print("Step {}: \tRunning Average: {:.4f}\tTime: {:.1f}".format(
                    counter, running_average, (time.time() - start_time) / 60))

            if counter % 10000 == 0:
                self.parameter_server.save_weights(counter / 10000)

In [None]:
# Tear down any Ray session left over from an earlier run of this cell, then
# start a new one.
ray.shutdown()
ray.init(ignore_reinit_error=True)

# Train on Breakout with three actors feeding a single learner.
agent = Agent(env_name="BreakoutNoFrameskip-v4", num_actors=3, num_learners=1)
agent.run()