In [1]:
import gym
import random
import os
import numpy as np
from collections      import deque
from keras.models     import Sequential
from keras.layers     import Dense
from keras.optimizers import Adam, SGD
from keras import backend as k
import tensorflow as tf
from statistics import mean
from multiprocessing import Process, Queue, Lock

In [2]:
!pip install ray
import ray



In [3]:
class Agent():
    def __init__(self, state_size, action_size):
        self.weight_backup      = "cartpole_weight.h5"
        self.state_size         = state_size
        self.action_size        = action_size
        self.memory             = deque(maxlen=2000)
        self.learning_rate      = 0.001
        self.gamma              = 0.95
        self.exploration_rate   = 1.0
        self.exploration_min    = 0.01
        self.exploration_decay  = 0.995
        self.brain              = self._build_model()
        self.target_model       = self._build_model()
        self.discount_factor    = 0.99

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=SGD(lr=self.learning_rate))

        if os.path.isfile(self.weight_backup):
            model.load_weights(self.weight_backup)
            self.exploration_rate = self.exploration_min
        return model

    def save_model(self):
            self.brain.save(self.weight_backup)
    def setModelweight(self, weights):
        self.brain.set_weights(weights)
    def act(self, state):
        if np.random.rand() <= self.exploration_rate:
            return random.randrange(self.action_size)
        act_values = self.brain.predict(state)
        return np.argmax(act_values[0])

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    def replay(self, sample_batch_size):
        if len(self.memory) < sample_batch_size:
            return
        mini_batch = random.sample(self.memory, sample_batch_size)
        states = np.zeros((sample_batch_size, self.state_size))
        next_states = np.zeros((sample_batch_size, self.state_size))
        actions, rewards, dones = [], [], []
        for i in range(sample_batch_size):
                states[i] = mini_batch[i][0]
                actions.append(mini_batch[i][1])
                rewards.append(mini_batch[i][2])
                next_states[i] = mini_batch[i][3]
                dones.append(mini_batch[i][4])
        
        target = self.brain.predict(states)
        target_val = self.target_model.predict(next_states)
        for i in range(sample_batch_size):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][actions[i]] = rewards[i] + self.discount_factor * (
                        np.amax(target_val[i]))
        
        dqn_variable = self.brain.trainable_variables
        with tf.GradientTape() as tape:  
            tape.watch(dqn_variable)
            rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
            actions = tf.convert_to_tensor(actions, dtype=tf.int32)
            dones = tf.convert_to_tensor(dones, dtype=tf.float32)

            target_q = self.target_model(tf.convert_to_tensor(np.vstack(next_states), dtype=tf.float32))
            next_action = tf.argmax(target_q, axis=1)
            target_value = tf.reduce_sum(tf.one_hot(next_action, self.action_size) * target_q, axis=1)
            target_value = (1-dones) * self.discount_factor * target_value + rewards

            main_q = self.brain(tf.convert_to_tensor(np.vstack(states), dtype=tf.float32))
            main_value = tf.reduce_sum(tf.one_hot(actions, self.action_size) * main_q, axis=1)

            loss = tf.math.reduce_mean(tf.square(main_value - target_value))
        gradient = tape.gradient(loss, dqn_variable)
        self.brain.fit(states, target, batch_size=sample_batch_size,epochs=1, verbose=0)
        if self.exploration_rate > self.exploration_min:
            self.exploration_rate *= self.exploration_decay

In [4]:
@ray.remote
class CartPole:
    def __init__(self,parameter_servers):
        self.sample_batch_size = 32
        self.episodes          = 100
        self.env               = gym.make('CartPole-v1')

        self.state_size        = self.env.observation_space.shape[0]
        self.action_size       = self.env.action_space.n
        self.agent             = Agent(self.state_size, self.action_size)
        self.parameter_servers = parameter_servers
        self.memory            = deque(maxlen=2000)

    def run(self):
        try:
            for index_episode in range(self.episodes):
                state = self.env.reset()
                state = np.reshape(state, [1, self.state_size])
                weight  = ray.get(self.parameter_servers.get_weight.remote())
                self.agent.brain.set_weights(weight)
                done = False
                index = 0
                while not done:
#                    self.env.render()

                    action = self.agent.act(state)

                    next_state, reward, done, _ = self.env.step(action)
                    next_state = np.reshape(next_state, [1, self.state_size])
                    self.memory.append((state, action, reward, next_state, done))
                    weight  = ray.get(self.parameter_servers.get_weight.remote())
                    self.agent.brain.set_weights(weight)
                    if len(self.memory) >= self.sample_batch_size:
                        mini_batch = random.sample(self.memory, self.sample_batch_size)
                        states = np.zeros((self.sample_batch_size, self.state_size))
                        next_states = np.zeros((self.sample_batch_size, self.state_size))
                        actions, rewards, dones = [], [], []
                        for i in range(self.sample_batch_size):
                                states[i] = mini_batch[i][0]
                                actions.append(mini_batch[i][1])
                                rewards.append(mini_batch[i][2])
                                next_states[i] = mini_batch[i][3]
                                dones.append(mini_batch[i][4])
                        target =  self.agent.brain.predict(states)
                        target_val = self.agent.target_model.predict(next_states)
                        for i in range(self.sample_batch_size):
                                if dones[i]:
                                    target[i][actions[i]] = rewards[i]
                                else:
                                    target[i][actions[i]] = rewards[i] + 0.99 * (
                                        np.amax(target_val[i]))
                        
                        dqn_variable = self.agent.brain.trainable_variables
                        with tf.GradientTape() as tape:  
                            tape.watch(dqn_variable)
                            rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
                            actions = tf.convert_to_tensor(actions, dtype=tf.int32)
                            dones = tf.convert_to_tensor(dones, dtype=tf.float32)

                            target_q = self.agent.target_model(tf.convert_to_tensor(np.vstack(next_states), dtype=tf.float32))
                            next_action = tf.argmax(target_q, axis=1)
                            target_value = tf.reduce_sum(tf.one_hot(next_action, self.action_size) * target_q, axis=1)
                            target_value = (1-dones) * self.agent.discount_factor * target_value + rewards

                            main_q = self.agent.brain(tf.convert_to_tensor(np.vstack(states), dtype=tf.float32))
                            main_value = tf.reduce_sum(tf.one_hot(actions, self.action_size) * main_q, axis=1)

                            loss = tf.math.reduce_mean(tf.square(main_value - target_value))
                        gradient = tape.gradient(loss, dqn_variable)
                        self.parameter_servers.update_gradient.remote(gradient)
                        x = ray.get(self.parameter_servers.getGlobalstep.remote())
                        if (x % 100) == 0:
                            print('update target model')
                            w  = ray.get(self.parameter_servers.get_weight.remote())
                            self.agent.target_model.set_weights(w)
                    state = next_state
                    index += 1
                    
                print("Episode {}# Score: {}".format(index_episode, index + 1))
        finally:
            self.agent.save_model()


In [5]:
import numpy as np
@ray.remote
class ParameterServer(object):
    def __init__(self):
        # Alternatively, params could be a dictionary mapping keys to arrays.
        self.model = self.build_model()
        self.params = np.zeros(100)
        self.opt = SGD(lr=0.001, )
        self.step = 0;
    def get_params(self):
        return self.params

    def update_params(self, grad):
        self.params += grad
    def get_weight(self):
        return self.model.get_weights()
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=4, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(2, activation='linear'))
        model.compile(loss='mse', optimizer=SGD(lr=0.001))
        return model
    def getGlobalstep(self):
        return self.step
    def update_gradient(self,grad):
        self.step = self.step + 1
        x = self.model.trainable_variables
        self.opt.apply_gradients(zip(grad, x))

In [6]:
ray.init(ignore_reinit_error=True)

2021-04-04 06:16:32,276	INFO services.py:1174 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'metrics_export_port': 52707,
 'node_id': '299e535b126dbed569889e14fe1c7d011ffe418e9862062edd03fe4a',
 'node_ip_address': '172.28.0.2',
 'object_store_address': '/tmp/ray/session_2021-04-04_06-16-31_647142_205436/sockets/plasma_store',
 'raylet_ip_address': '172.28.0.2',
 'raylet_socket_name': '/tmp/ray/session_2021-04-04_06-16-31_647142_205436/sockets/raylet',
 'redis_address': '172.28.0.2:6379',
 'session_dir': '/tmp/ray/session_2021-04-04_06-16-31_647142_205436',
 'webui_url': '127.0.0.1:8265'}

In [7]:
ps = ParameterServer.remote()

# Start 2 workers.
workers = [CartPole.remote(ps) for _ in range(1)]

# Inspect the parameters at regular intervals.
for worker in workers:
    ray.get(worker.run.remote())

[2m[36m(pid=205572)[0m 2021-04-04 06:16:41.163770: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=205571)[0m 2021-04-04 06:16:41.343811: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=205572)[0m 2021-04-04 06:16:43.967725: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
[2m[36m(pid=205572)[0m 2021-04-04 06:16:43.969064: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
[2m[36m(pid=205572)[0m 2021-04-04 06:16:43.986613: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[2m[36m(pid=205572)[0m 2021-04-04 06:16:43.986697: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not app

[2m[36m(pid=205571)[0m Episode 0# Score: 17
[2m[36m(pid=205571)[0m Episode 1# Score: 11
[2m[36m(pid=205571)[0m Episode 2# Score: 11
[2m[36m(pid=205571)[0m Episode 3# Score: 11
[2m[36m(pid=205571)[0m Episode 4# Score: 11
[2m[36m(pid=205571)[0m Episode 5# Score: 11
[2m[36m(pid=205571)[0m Episode 6# Score: 10
[2m[36m(pid=205571)[0m Episode 7# Score: 9
[2m[36m(pid=205571)[0m Episode 8# Score: 11
[2m[36m(pid=205571)[0m Episode 9# Score: 10
[2m[36m(pid=205571)[0m Episode 10# Score: 10
[2m[36m(pid=205571)[0m Episode 11# Score: 11
[2m[36m(pid=205571)[0m Episode 12# Score: 10
[2m[36m(pid=205571)[0m update target model
[2m[36m(pid=205571)[0m Episode 13# Score: 10
[2m[36m(pid=205571)[0m Episode 14# Score: 10
[2m[36m(pid=205571)[0m Episode 15# Score: 9
[2m[36m(pid=205571)[0m Episode 16# Score: 11
[2m[36m(pid=205571)[0m Episode 17# Score: 10
[2m[36m(pid=205571)[0m Episode 18# Score: 10
[2m[36m(pid=205571)[0m Episode 19# Score: 10
[2m[

In [12]:
@ray.remote
def actor():
    print('start process 1')
    env = gym.make('CartPole-v1')
    sample_batch_size = 32
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # initializing agent
    agent = Agent(state_size, action_size)
    scores = []
    print('start process 2')
    for e in range(200):
        done = False
        score = 0
        # reset enviroment
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            action = agent.act(state)
            # Advance one time step in the environment with the selected action
            next_state, reward, done, info = env.step(action)
            print('start process 3')
            next_state = np.reshape(next_state, [1, state_size])
            # -100 reward at the end of the episode in the middle
            print('start process 3')
            agent.remember(state, action, reward, next_state, done)
            print('start process 3')
            # Save sample <s, a, r, s'> to replay memory
            memSample.put([state, action, reward, next_state, done])

            score += reward
            state = next_state

        print("Episode {}# Score: {}".format(index_episode, index + 1))
        agent.replay(sample_batch_size)
         # Stop learning if the average score of the previous 10 episodes is greater than 490
    agent.save_model()