In [25]:
import sys
import gym
import random
import numpy as np
import os
from time import sleep
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from multiprocessing import Process, Queue

In [26]:
class DQNAgent:
    """DQN agent that owns an online Q-network and a target Q-network.

    Both networks are built by `build_model`. The target network is
    synchronised with the online network once at construction time and
    afterwards whenever `update_target_model` is called.
    """

    def __init__(self, state_size, action_size):
        """Store the hyper parameters and build both networks.

        Args:
            state_size: number of input features of the network.
            action_size: number of discrete actions (network outputs).
        """
        self.render = False
        self.weight_backup = "cartpole_weight.h5"
        # Defining the size of states and actions
        self.state_size = state_size
        self.action_size = action_size

        # DQN hyper parameters
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.9995
        # Lower bound for epsilon; must exist before build_model() runs
        # because build_model() reads it when a weight file is found.
        self.epsilon_min = 0.01
        self.batch_size = 64

        # Create model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # Initialize the target model
        self.update_target_model()

    def build_model(self):
        """Build and compile the Q-network.

        If the file named by `self.weight_backup` exists, its weights are
        loaded into the new network and epsilon is dropped to `epsilon_min`.

        Returns:
            The compiled Keras `Sequential` model.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        if os.path.isfile(self.weight_backup):
            model.load_weights(self.weight_backup)
            # Previously this touched `exploration_rate` / `exploration_min`,
            # attributes that are never defined on this class, and crashed
            # with AttributeError as soon as a weight file was present.
            self.epsilon = self.epsilon_min
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Pick an action with an epsilon-greedy policy.

        Epsilon is decayed on every call, but never below `epsilon_min`.

        Args:
            state: input passed unchanged to `self.model.predict`.

        Returns:
            A random action index with probability epsilon, otherwise the
            index of the largest predicted Q-value.
        """
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])


In [30]:
def actor(memSample, modelq, q3):
    """Play CartPole episodes and stream the transitions to the learner.

    Args:
        memSample: queue that receives one
            `[state, action, reward, next_state, done]` list per step.
        modelq: queue of weight lists published by the learner; all pending
            entries are drained and only the newest one is applied.
        q3: queue on which `True` is put when this function stops, whether
            it solved the task, ran out of episodes, or failed.

    Raises:
        SystemExit: when the mean score of the last 10 episodes exceeds 490.
    """
    print('start process 1')
    env = None
    try:
        env = gym.make('CartPole-v1')
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        # initializing agent
        agent = DQNAgent(state_size, action_size)
        scores = []

        for e in range(200):
            done = False
            score = 0
            # reset environment
            state = env.reset()
            state = np.reshape(state, [1, state_size])

            while not done:
                if agent.render:
                    env.render()

                # Apply the newest weights published by the learner, if any.
                # empty() is used instead of qsize() because qsize() may
                # raise NotImplementedError on some platforms (e.g. macOS).
                latest_weights = None
                while not modelq.empty():
                    latest_weights = modelq.get()
                if latest_weights is not None:
                    agent.model.set_weights(latest_weights)

                # Choose an action with the epsilon-greedy policy
                action = agent.get_action(state)
                # Advance one time step in the environment with the selected action
                next_state, reward, done, info = env.step(action)
                next_state = np.reshape(next_state, [1, state_size])
                # Penalise with -100 when the episode ends before step 500
                reward = reward if not done or score == 499 else -100

                # Send sample <s, a, r, s', done> to the learner's replay memory
                memSample.put([state, action, reward, next_state, done])

                score += reward
                state = next_state

                if done:
                    sleep(0.05)
                    # Undo the -100 penalty so the reported score is the
                    # sum of the environment's own rewards.
                    score = score if score == 500 else score + 100
                    print("episode:", e, "  score:", score, "  epsilon:", agent.epsilon)
                    scores.append(score)

                    # Stop if the average score of the previous 10 episodes is greater than 490
                    if np.mean(scores[-min(10, len(scores)):]) > 490:
                        #agent.model.save_weights("./save_model/cartpole_mp.h5")
                        sys.exit()
    finally:
        # Signal the learner on every exit path. Previously the signal was
        # only sent on the "solved" path, so the learner looped forever when
        # the 200 episodes ran out or when this function raised.
        q3.put(True)
        if env is not None:
            env.close()

In [28]:
def learner(memory, modelq, q3):
    """Train the DQN from transitions produced by the actor.

    Loops until something is put on `q3`. Each pass moves every pending
    sample from `memory` into a bounded replay buffer and, once the buffer
    holds more than 1000 samples, fits the online network on one random
    mini-batch and publishes the new weights on `modelq`.

    Args:
        memory: queue of `[state, action, reward, next_state, done]` samples.
        modelq: queue that receives the online network's weights after
            every training step.
        q3: stop-signal queue; any pending item terminates the learner.

    Raises:
        SystemExit: when `q3` is no longer empty.
    """
    print('start process 2')
    replay_memory = deque(maxlen=5000)
    agent = DQNAgent(4, 2)
    # Training steps since the last target-network sync. The counter used to
    # advance on every loop pass, including passes that only waited for the
    # buffer to fill, so it did not measure training steps.
    train_steps = 0

    while True:
        # Move every pending sample into the replay buffer. empty() is used
        # instead of qsize() because qsize() may raise NotImplementedError
        # on some platforms (e.g. macOS).
        while not memory.empty():
            replay_memory.append(memory.get())

        # if the actor finished, exit as well
        if not q3.empty():
            sys.exit()

        if len(replay_memory) <= 1000:
            # Not enough experience yet: yield the CPU instead of busy-spinning.
            sleep(0.01)
            continue

        mini_batch = random.sample(replay_memory, agent.batch_size)

        states = np.zeros((agent.batch_size, agent.state_size))
        next_states = np.zeros((agent.batch_size, agent.state_size))
        actions, rewards, dones = [], [], []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            states[i] = state
            actions.append(action)
            rewards.append(reward)
            next_states[i] = next_state
            dones.append(done)

        # Online model's Q-values for the current states,
        # target model's Q-values for the next states
        target = agent.model.predict(states)
        target_val = agent.target_model.predict(next_states)

        # Update the taken action's target using the Bellman optimality equation
        for i in range(agent.batch_size):
            if dones[i]:
                target[i][actions[i]] = rewards[i]
            else:
                target[i][actions[i]] = rewards[i] + agent.discount_factor * (
                    np.amax(target_val[i]))

        agent.model.fit(states, target, batch_size=agent.batch_size,
                        epochs=1, verbose=0)
        modelq.put(agent.model.get_weights())

        # update target model every 100 training steps
        train_steps += 1
        if train_steps >= 100:
            print('update target model')
            agent.update_target_model()
            train_steps = 0

In [31]:
# Queues shared by the two worker processes:
#   memory - transitions sent from the actor to the learner
#   model  - network weights sent from the learner to the actor
#   end    - stop signal sent from the actor to the learner
memory = Queue()
model = Queue()
end = Queue()
process1 = Process(target=actor, args=(memory, model, end))
process2 = Process(target=learner, args=(memory, model, end))
process1.start()
process2.start()

# This process puts nothing on the queues itself, so close its side of each.
memory.close()
model.close()
end.close()
memory.join_thread()
model.join_thread()
# Was a second `end.close()`; every queue gets one close() and one join_thread().
end.join_thread()

# Wait for both workers to finish.
process1.join()
process2.join()

start process 1
start process 2
episode: 0   score: 17.0   epsilon: 0.9910381481909833
episode: 1   score: 9.0   epsilon: 0.9860940917766235
episode: 2   score: 33.0   epsilon: 0.9694680571640535
episode: 3   score: 22.0   epsilon: 0.9583802792808146
episode: 4   score: 12.0   epsilon: 0.9521694616616301
episode: 5   score: 17.0   epsilon: 0.9436362600491477
episode: 6   score: 9.0   epsilon: 0.938928680514662
episode: 7   score: 37.0   epsilon: 0.9212530665171942
episode: 8   score: 17.0   epsilon: 0.9129969330564653
episode: 9   score: 16.0   epsilon: 0.9052674235521029
episode: 10   score: 30.0   epsilon: 0.8913405089533699
episode: 11   score: 11.0   epsilon: 0.8860071485337379
episode: 12   score: 14.0   epsilon: 0.8793853022912325
episode: 13   score: 11.0   epsilon: 0.8741234761790619
episode: 14   score: 7.0   epsilon: 0.8706330950236377
episode: 15   score: 10.0   epsilon: 0.8658565662672015
episode: 16   score: 21.0   epsilon: 0.8563819809727236
episode: 17   score: 9.0   eps