In [None]:
import gym
import tensorflow as tf
import numpy as np
from tensorflow import keras

from collections import deque
import time
import random
from multiprocessing import Process, Lock


In [None]:

# An episode is one full game, played until the environment reports `done`.
train_episodes = 300  # number of episodes each call to runit() trains for
test_episodes = 100  # not referenced in the visible cells; presumably reserved for evaluation -- TODO confirm

def agent(state_shape, action_shape):
    """Build and compile the Q-network that maps a state to one Q-value per action.

    The network is a small MLP (24 -> 12 -> ``action_shape`` units) with a
    linear output layer, so every output is an unbounded Q-value estimate.
    e.g. for the output [.1, .7, .05, .05, .05, .05] the highest Q-value is
    0.7 and its index, action #1, is the greedy action.

    Args:
        state_shape: Shape of a single observation; passed as ``input_shape``
            to the first layer (callers pass ``env.observation_space.shape``).
        action_shape: Number of discrete actions, i.e. the output width
            (callers pass ``env.action_space.n``).

    Returns:
        A compiled ``keras.Sequential`` model using Huber loss and Adam.
    """
    learning_rate = 0.001
    model = keras.Sequential()

    # Every layer gets its own initializer instance; sharing a single unseeded
    # initializer object across layers is discouraged by Keras.
    model.add(keras.layers.Dense(
        24, input_shape=state_shape, activation='relu',
        kernel_initializer=tf.keras.initializers.HeUniform()))
    model.add(keras.layers.Dense(
        12, activation='relu',
        kernel_initializer=tf.keras.initializers.HeUniform()))
    model.add(keras.layers.Dense(
        action_shape, activation='linear',
        kernel_initializer=tf.keras.initializers.HeUniform()))

    model.compile(
        loss=tf.keras.losses.Huber(),
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias
        # that newer Keras releases reject.
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        # NOTE(review): 'accuracy' carries no meaning for Q-value regression;
        # kept only so existing metric names do not change.
        metrics=['accuracy'],
    )
    return model

def get_qs(model, state, step):
    """Return the model's predicted Q-values for a single state.

    The 1-D ``state`` is reshaped into a batch of one before prediction and
    the first (only) row of the prediction is returned. ``step`` is accepted
    but not used.
    """
    n_features = state.shape[0]
    single_batch = state.reshape([1, n_features])
    predictions = model.predict(single_batch)
    return predictions[0]

In [None]:
def train(env, replay_memory, model, target_model, done):
    """Run one Q-learning update of ``model`` on a random replay mini-batch.

    Does nothing until ``replay_memory`` holds at least 1000 transitions.
    Otherwise 128 transitions are sampled, the target network supplies the
    bootstrap value for each next state, and ``model`` is fitted for one pass
    towards the blended targets.

    Args:
        env: Environment; only ``env.observation_space.shape`` is read.
        replay_memory: Sequence of
            ``[observation, action, reward, new_observation, done]`` entries.
        model: Main network; it is both queried and fitted.
        target_model: Target network; only queried for next-state Q-values.
        done: Unused. Each sampled transition carries its own terminal flag;
            the parameter is kept so existing callers keep working.

    Returns:
        None.
    """
    learning_rate = 0.7  # blend factor between the old Q-value and the new target
    discount_factor = 0.618

    MIN_REPLAY_SIZE = 1000
    if len(replay_memory) < MIN_REPLAY_SIZE:
        return

    batch_size = 64 * 2
    mini_batch = random.sample(replay_memory, batch_size)
    obs_shape = env.observation_space.shape

    current_states = np.array(
        [encode_observation(transition[0], obs_shape) for transition in mini_batch])
    current_qs_list = model.predict(current_states)
    new_current_states = np.array(
        [encode_observation(transition[3], obs_shape) for transition in mini_batch])
    future_qs_list = target_model.predict(new_current_states)

    # Start from the current predictions so only the taken action's Q-value
    # moves; the other outputs contribute zero error to the fit.
    target_qs = np.array(current_qs_list)
    for index, (_, action, reward, _, terminal) in enumerate(mini_batch):
        if terminal:
            # No future return after a terminal transition.
            bellman_target = reward
        else:
            bellman_target = reward + discount_factor * np.max(future_qs_list[index])

        target_qs[index][action] = (
            (1 - learning_rate) * target_qs[index][action]
            + learning_rate * bellman_target
        )

    # `current_states` already holds the encoded observations in batch order,
    # so it is reused as the fit input instead of being rebuilt.
    model.fit(current_states, target_qs, batch_size=batch_size, verbose=0, shuffle=True)

def encode_observation(observation, n_dims):
    """Return ``observation`` unchanged.

    Identity placeholder for an observation-encoding step; ``n_dims`` is
    accepted but not used.
    """
    return observation

In [None]:
# Module-level environment; used in this cell to read the observation/action space sizes.
env = gym.make('CartPole-v1')
# Module-level target network; runit() reads and overwrites this model as a global.
target_model = agent(env.observation_space.shape, env.action_space.n)

In [None]:
# Printing a Keras model shows only its default object repr; call
# target_model.summary() to inspect the layer structure instead.
print(target_model)

<tensorflow.python.keras.engine.sequential.Sequential object at 0x7f55b64afed0>


In [None]:
def runit(Lock, lock2):
    """Train one DQN worker on CartPole-v1 with epsilon-greedy exploration.

    The worker builds its own environment and main network, fills a private
    replay buffer, and trains the main network every 4 steps (and at the end
    of each episode). The module-level ``target_model`` is shared by every
    worker and is refreshed from this worker's main network at the end of an
    episode once at least 100 steps have passed since the last refresh.

    Args:
        Lock: Lock (context manager) held around each ``train()`` call and
            around target-network refreshes.
        lock2: Lock (context manager) held whenever ``target_model`` is
            overwritten.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Bind the first argument to a local name. Previously the body read a
    # module-level ``lock`` and silently ignored the lock that was passed in.
    train_lock = Lock

    epsilon = 1  # Epsilon-greedy starts at 1, so every step is random at first
    max_epsilon = 1  # You can't explore more than 100% of the time
    min_epsilon = 0.01  # At a minimum, we'll always explore 1% of the time
    decay = 0.01

    env = gym.make('CartPole-v1')
    try:
        # 1. Initialize the Main model (trained every 4 steps) and sync the
        #    shared Target model with it.
        model = agent(env.observation_space.shape, env.action_space.n)
        # NOTE(review): every worker overwrites the same global target_model
        # with its own weights -- confirm that sharing is intended.
        with lock2:
            target_model.set_weights(model.get_weights())

        replay_memory = deque(maxlen=50_000)
        steps_to_update_target_model = 0

        for episode in range(train_episodes):
            total_training_rewards = 0
            observation = env.reset()
            done = False
            while not done:
                steps_to_update_target_model += 1

                # 2. Explore using the Epsilon Greedy Exploration Strategy
                if np.random.rand() <= epsilon:
                    # Explore
                    action = env.action_space.sample()
                else:
                    # Exploit the best known action; the model expects a
                    # (batch, features) input, hence the reshape.
                    encoded = encode_observation(observation, env.observation_space.shape)
                    encoded_reshaped = encoded.reshape([1, encoded.shape[0]])
                    predicted = model.predict(encoded_reshaped).flatten()
                    action = np.argmax(predicted)
                new_observation, reward, done, info = env.step(action)
                replay_memory.append([observation, action, reward, new_observation, done])

                # 3. Update the Main Network using the Bellman Equation
                if steps_to_update_target_model % 4 == 0 or done:
                    with train_lock:
                        train(env, replay_memory, model, target_model, done)
                observation = new_observation
                total_training_rewards += reward

                if done:
                    # The value printed after "n steps" is the episode index.
                    print('Total training rewards: {} after n steps = {} with final reward = {}'.format(total_training_rewards, episode, reward))

                    if steps_to_update_target_model >= 100:
                        print('Copying main network weights to the target network weights')
                        # train_lock keeps the write from racing with another
                        # worker's train(), which reads target_model; lock2
                        # matches the initial sync. The order is always
                        # train_lock -> lock2, so it cannot deadlock.
                        with train_lock, lock2:
                            target_model.set_weights(model.get_weights())
                        steps_to_update_target_model = 0

            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)
    finally:
        # Release the environment even if training raises.
        env.close()

In [None]:
import threading

In [None]:
class myThread(threading.Thread):
    """Thread that runs one ``runit`` training worker with two shared locks."""

    def __init__(self, threadID, name, lock, lock2):
        super().__init__()
        self.threadID, self.name = threadID, name
        self.lock, self.lock2 = lock, lock2

    def run(self):
        """Announce start, run the worker to completion, then announce exit."""
        print(f"Starting {self.name}")
        runit(self.lock, self.lock2)
        print(f"Exiting {self.name}")

In [None]:
# Two locks shared by both workers: the first serialises calls to train(),
# the second is held while runit() copies its initial weights into target_model.
lock = threading.Lock()
lock2 = threading.Lock()
# Both threads receive the same lock objects so they synchronise with each other.
thread1 = myThread(1, "Thread-1", lock,lock2)
thread2 = myThread(2, "Thread-2", lock,lock2)

In [None]:
# Launch both workers, then block until each one has finished.
thread1.start()
thread2.start()
thread1.join()
thread2.join()

Starting Thread-1
Starting Thread-2
Total training rewards: 16.0 after n steps = 0 with final reward = 1.0
Total training rewards: 28.0 after n steps = 1 with final reward = 1.0
Total training rewards: 54.0 after n steps = 2 with final reward = 1.0
Total training rewards: 21.0 after n steps = 3 with final reward = 1.0
Copying main network weights to the target network weights
Total training rewards: 18.0 after n steps = 4 with final reward = 1.0
Total training rewards: 12.0 after n steps = 0 with final reward = 1.0
Total training rewards: 15.0 after n steps = 1 with final reward = 1.0
Total training rewards: 22.0 after n steps = 2 with final reward = 1.0
Total training rewards: 16.0 after n steps = 3 with final reward = 1.0
Total training rewards: 13.0 after n steps = 5 with final reward = 1.0
Total training rewards: 18.0 after n steps = 6 with final reward = 1.0
Total training rewards: 20.0 after n steps = 7 with final reward = 1.0Total training rewards: 29.0 after n steps = 4 with fi