# From Scratch Deep Q Reinforcement Learning Models

There is a simpler library called keras-rl that adds some easy plug & play setup for rl models, however, this sample sets up one manually for the mountain car openai gym example.

Summary of DQN - Reinforcement learning is an example of deep learning applied - not a different type of model.  Refinforcement learning can be applied with/without neural networks.  Refinforcement learning is defined by two entities - an environment and an agent.  The agent is the bot or automation that takes action upon the environment to reach an accept state within the environment.  Essentially a pattern is followed - for each timestep in a given trial/session, take an action.  This action will either be random, or can come from memory (epsilon - exploration vs. exploitation).  After the action, record and memorize the state of the environment.  Then using samples from memory, train on how to make better future predicitions.  The prediction network uses a multi-layer perceptron model to make better predictions.

In [1]:
import gym
import numpy as np
import random
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from collections import deque

ModuleNotFoundError: No module named 'gym'

In [None]:
class DQN:
    def __init__(self, env):
        self.env     = env
        self.memory  = deque(maxlen=2000)
        
        self.gamma = 0.85
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.005
        self.tau = .125

        self.model        = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        model   = Sequential()
        state_shape  = self.env.observation_space.shape
        model.add(Dense(24, input_dim=state_shape[0], activation="relu"))
        model.add(Dense(48, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(self.env.action_space.n))
        model.compile(loss="mean_squared_error",
            optimizer=Adam(lr=self.learning_rate))
        return model

    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        batch_size = 32
        if len(self.memory) < batch_size: 
            return

        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.target_model.predict(new_state)[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state, target, epochs=1, verbose=0)

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self, fn):
        self.model.save(fn)

def main():
    env     = gym.make("MountainCar-v0")
    gamma   = 0.9
    epsilon = .95

    trials  = 1000
    trial_len = 500

    # updateTargetNetwork = 1000
    dqn_agent = DQN(env=env)
    steps = []
    for trial in range(trials):
        cur_state = env.reset().reshape(1,2)
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            new_state, reward, done, _ = env.step(action)

            # reward = reward if not done else -20
            new_state = new_state.reshape(1,2)
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            
            dqn_agent.replay()       # internally iterates default (prediction) model
            dqn_agent.target_train() # iterates target model

            cur_state = new_state
            if done:
                break
        if step >= 199:
            print("Failed to complete in trial {}".format(trial))
            if step % 10 == 0:
                dqn_agent.save_model("trial-{}.model".format(trial))
        else:
            print("Completed in {} trials".format(trial))
            dqn_agent.save_model("success.model")
            break

In [None]:
main()

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Failed to complete in trial 0
Failed to complete in trial 1
Failed to complete in trial 2
Failed to complete in trial 3
Failed to complete in trial 4
Failed to complete in trial 5
Failed to complete in trial 6
Failed to complete in trial 7


KeyboardInterrupt: ignored

In [None]:
env     = gym.make("MountainCar-v0")
cur_state = env.reset().reshape(1,2)
steps = []
gamma   = 0.9
epsilon = .95
trials  = 1000
trial_len = 500
dqn_agent = DQN(env=env)
for step in range(trial_len):
    action = dqn_agent.act(cur_state)
    new_state, reward, done, _ = env.step(action)
    new_state = new_state.reshape(1,2)
    cur_state = new_state
    if done:
        break
if step >= 199:
    print('Failed to complete trial')
    if step % 10 == 0:
        dqn_agent.save_model("trial-{}.model".format(trial))
else:
    print("Completed in Successfully!")

In [None]:
"""
MountainCar-v0 -- Deep Q-learning
"""
import os
import random
from collections import deque

import gym
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


class Agent:
    def __init__(self, state_size, action_size, batch_size=32, memory_size=50000):
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.memory = deque(maxlen=memory_size)
        self.training = 10000  # training after 10000 env steps
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.randrange(self.action_size)
        else:
            return np.argmax(self.model.predict(state)[0])

    def experience_replay(self):
        # Updates the online network weights after enough data is collected
        if self.training >= len(self.memory):
            return

        # Samples a batch from the memory
        random_batch = random.sample(self.memory, self.batch_size)

        state = np.zeros((self.batch_size, self.state_size))
        next_state = np.zeros((self.batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(self.batch_size):
            state[i] = random_batch[i][0]
            action.append(random_batch[i][1])
            reward.append(random_batch[i][2])
            next_state[i] = random_batch[i][3]
            done.append(random_batch[i][4])

        # Batch prediction to save compute costs
        target = self.model.predict(state)
        target_next = self.model(next_state)

        for i in range(len(random_batch)):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_next[i]))

        self.model.fit(
            np.array(state),
            np.array(target),
            batch_size=self.batch_size,
            verbose=0
        )

        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load_weights(self, weights_file):
        self.epsilon = self.epsilon_min
        self.model.load_weights(weights_file)

    def save_weights(self, weights_file):
        self.model.save_weights(weights_file)


if __name__ == "__main__":
    # Flag used to enable or disable screen recording
    recording_is_enabled = False

    # Initializes the environment
    env = gym.make('MountainCar-v0')

    # Records the environment
    if recording_is_enabled:
        env = gym.wrappers.Monitor(env, "recording", video_callable=lambda episode_id: True, force=True)

    # Defines training related constants
    num_episodes = 1000
    num_episode_steps = env.spec.max_episode_steps  # constant value
    action_size = env.action_space.n
    state_size = env.observation_space.shape[0]
    max_reward = 0

    # Creates the agent
    agent = Agent(state_size=state_size, action_size=action_size)

    # Loads the weights
    if os.path.isfile("mountain-car-v0.h5"):
        agent.load_weights("mountain-car-v0.h5")

    for episode in range(num_episodes):
        # Defines the total reward per episode
        total_reward = 0

        # Resets the environment
        observation = env.reset()

        # Gets the state
        state = np.reshape(observation, [1, state_size])

        for episode_step in range(num_episode_steps):
            # Renders the screen after new environment observation
            #env.render(mode="human")

            # Gets a new action
            action = agent.act(state)

            # Takes action and calculates the total reward
            observation, reward, done, _ = env.step(action)

            # Recalculates the reward
            if observation[1] > state[0][1] >= 0 and observation[1] >= 0:
                reward = 20
            if observation[1] < state[0][1] <= 0 and observation[1] <= 0:
                reward = 20
            if done and episode_step < num_episode_steps - 1:
                reward += 10000
            else:
                reward -= 25

            total_reward += reward

            # Gets the next state
            next_state = np.reshape(observation, [1, state_size])

            # Memorizes the experience
            agent.memorize(state, action, reward, next_state, done)

            # Updates the state
            state = next_state

            # Updates the network weights
            agent.experience_replay()

            if done:
                print("Episode %d/%d finished after %d episode steps with total reward = %f."
                      % (episode + 1, num_episodes, episode_step + 1, total_reward))
                break

            elif episode_step >= num_episode_steps - 1:
                print("Episode %d/%d timed out at %d with total reward = %f."
                      % (episode + 1, num_episodes, episode_step + 1, total_reward))

        # Saves the weights
        if total_reward >= max_reward:
            agent.save_weights("mountain-car-v0.h5")
            max_reward = total_reward

    # Closes the environment
    env.close()

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Episode 1/1000 finished after 200 episode steps with total reward = -3037.000000.
Episode 2/1000 finished after 200 episode steps with total reward = -3016.000000.
Episode 3/1000 finished after 200 episode steps with total reward = -3100.000000.
Episode 4/1000 finished after 200 episode steps with total reward = -3163.000000.
Episode 5/1000 finished after 200 episode steps with total reward = -2932.000000.
Episode 6/1000 finished after 200 episode steps with total reward = -3058.000000.
Episode 7/1000 finished after 200 episode steps with total reward = -3142.000000.
Episode 8/1000 finished after 200 episode steps with total reward = -3184.000000.
Episode 9/1000 finished after 200 episode steps with total reward = -3163.000000.
Episode 10/1000 finished after 200 episode steps with total reward = -3058.000000.
Episode 11/1000 finished after 200 episode steps with total reward = -3037.000000.
Episode 12/1000 finished after 200 episode steps with total reward = -2953.000000.
Episode 13/10

KeyboardInterrupt: ignored