In [1]:
import gym
import numpy as np
import tensorflow as tf
import random
import matplotlib.pyplot as plt
from collections import namedtuple
from collections import deque

In [2]:
# Install the Box2D package into the current environment (shell escape via `!`).
# NOTE(review): presumably needed by the 'LunarLander-v2' gym environment created
# later in this notebook - confirm. The install is unpinned, so the resolved
# version can change between runs.
!pip install Box2D

Collecting Box2D
[?25l  Downloading https://files.pythonhosted.org/packages/a9/0b/d48d42dd9e19ce83a3fb4eee074e785b6c6ea612a2244dc2ef69427d338b/Box2D-2.3.10-cp36-cp36m-manylinux1_x86_64.whl (1.3MB)
[K     |▎                               | 10kB 18.8MB/s eta 0:00:01[K     |▌                               | 20kB 1.7MB/s eta 0:00:01[K     |▊                               | 30kB 2.3MB/s eta 0:00:01[K     |█                               | 40kB 2.6MB/s eta 0:00:01[K     |█▎                              | 51kB 2.0MB/s eta 0:00:01[K     |█▌                              | 61kB 2.3MB/s eta 0:00:01[K     |█▊                              | 71kB 2.5MB/s eta 0:00:01[K     |██                              | 81kB 2.8MB/s eta 0:00:01[K     |██▎                             | 92kB 2.9MB/s eta 0:00:01[K     |██▌                             | 102kB 2.8MB/s eta 0:00:01[K     |██▊                             | 112kB 2.8MB/s eta 0:00:01[K     |███                             | 122kB 2

In [3]:
# Seed every random number generator this notebook draws from:
#   * `random`    - used by DQNAgent.replay (random.sample) to pick minibatches
#   * `np.random` - used by DQNAgent.choose_action for epsilon-greedy exploration
#   * TensorFlow  - used for network weight initialisation
# NOTE(review): the gym environment is created without an explicit seed in this
# notebook, so episodes may still differ between runs - confirm whether it
# should be seeded as well.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [4]:
# One environment step as stored in the agent's replay memory:
# (state, action, reward, next_state, done).
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'reward', 'next_state', 'done'],
)

In [5]:
class DQNAgent:
    """Deep Q-learning agent with an online network and a target network.

    ``self.model`` is the online Q-network that is trained and used for action
    selection; ``self.model_two`` is the target network used to bootstrap the
    Q-learning targets. Experience is kept in a bounded replay memory.

    Parameters
    ----------
    env : environment exposing ``observation_space.shape`` and ``action_space.n``.
    discount_factor : discount applied to the bootstrapped next-state value.
    epsilon_greedy : initial exploration probability.
    epsilon_min : epsilon stops decaying once it is no longer above this value.
    epsilon_decay : multiplicative decay applied to epsilon.
    learning_rate : learning rate of the Adam optimizer of both networks.
    max_memory_size : maximum number of transitions kept in the replay memory.
    """

    def __init__(self, env, discount_factor=0.95, epsilon_greedy=1.0, epsilon_min=0.01, epsilon_decay=0.995, learning_rate=1e-3, max_memory_size=2000):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        # Bounded FIFO: the oldest transitions are dropped once the limit is hit.
        self.memory = deque(maxlen=max_memory_size)
        self.gamma = discount_factor
        self.epsilon = epsilon_greedy
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.lr = learning_rate
        # Build the online network first, then the target network, and start
        # both from identical weights.
        self._build_nn_model()
        self._build_second_nn_model()
        self.model_two.set_weights(self.model.get_weights())
        # Counter attribute kept for backward compatibility; no method in this
        # class reads or updates it.
        self.c = 0

    def _make_q_network(self, n_layers):
        """Create and compile one Q-network (shared by both builders).

        Every one of the ``n_layers - 1`` iterations adds *two* 32-unit ReLU
        layers, so the default ``n_layers=3`` yields four hidden layers. The
        final linear layer has one output per action.
        """
        network = tf.keras.Sequential()
        for _ in range(n_layers - 1):
            network.add(tf.keras.layers.Dense(units=32, activation='relu'))
            network.add(tf.keras.layers.Dense(units=32, activation='relu'))
        network.add(tf.keras.layers.Dense(units=self.action_size))

        network.build(input_shape=(None, self.state_size))
        # `learning_rate` replaces the deprecated `lr` keyword of Adam.
        network.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr))
        return network

    def _build_second_nn_model(self, n_layers=3):
        """Build the target network and store it in ``self.model_two``."""
        self.model_two = self._make_q_network(n_layers)

    def _build_nn_model(self, n_layers=3):
        """Build the online network and store it in ``self.model``."""
        self.model = self._make_q_network(n_layers)

    def remember(self, transition):
        """Append one transition to the replay memory."""
        self.memory.append(transition)

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``self.epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest Q-value predicted by the
        online network for ``state`` (passed to ``predict`` as a batch of one).
        """
        if np.random.random() <= self.epsilon:
            return np.random.randint(self.action_size)
        q_values = self.model.predict(state, verbose=0)[0]
        return int(np.argmax(q_values))

    def _learn(self, batch_samples, done):
        """Fit the online network on a minibatch of transitions.

        Parameters
        ----------
        batch_samples : sequence of ``(state, action, reward, next_state, done)``
            tuples; states are flattened to one row each.
        done : episode-finished flag supplied by the caller. When true, the
            target network is synchronised after this update.

        Returns
        -------
        The object returned by ``self.model.fit``.
        """
        states, actions, rewards, next_states, terminals = zip(*batch_samples)
        states = np.vstack([np.reshape(s, (1, -1)) for s in states])
        next_states = np.vstack([np.reshape(s, (1, -1)) for s in next_states])
        actions = np.asarray(actions, dtype=int)
        rewards = np.asarray(rewards, dtype=np.float64)
        terminals = np.asarray(terminals, dtype=bool)

        # One batched forward pass per network instead of two predict() calls
        # per transition.
        next_q_max = np.max(self.model_two.predict(next_states, verbose=0), axis=1)
        # Terminal transitions receive the plain reward (no bootstrapping).
        targets = np.where(terminals, rewards, rewards + self.gamma * next_q_max)

        # Start from the current predictions so only the taken action's output
        # contributes to the loss.
        q_targets = np.array(self.model.predict(states, verbose=0))
        q_targets[np.arange(len(actions)), actions] = targets

        # Epsilon is deliberately decayed once per sampled transition (not once
        # per batch) to keep the existing exploration schedule.
        for _ in range(len(actions)):
            self._adjust_epsilon()

        # Synchronise the target network once, after all targets were computed,
        # so the whole batch is bootstrapped from the same target weights. The
        # trigger is the caller's `done` flag or a terminal transition in the
        # sampled batch.
        if done or terminals.any():
            self.model_two.set_weights(self.model.get_weights())

        return self.model.fit(x=states, y=q_targets, verbose=0)

    def _adjust_epsilon(self):
        """Multiply epsilon by the decay factor while it is above the minimum."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def replay(self, batch_size, done):
        """Sample a minibatch from the replay memory and train on it.

        At most ``batch_size`` transitions are drawn; if the memory holds fewer,
        all of them are used.

        Returns
        -------
        The first entry of the ``'loss'`` history reported by the fit.

        Raises
        ------
        ValueError
            If the replay memory is empty.
        """
        available = len(self.memory)
        if available == 0:
            raise ValueError('Cannot replay: the replay memory is empty')
        samples = random.sample(self.memory, min(batch_size, available))
        history = self._learn(samples, done)
        return history.history['loss'][0]

In [6]:
def plot_learning_history(history):
    """Plot one value per episode as a line chart with circular markers.

    Parameters
    ----------
    history : sequence of numbers, one entry per episode. Entries are plotted
        against episode numbers starting at 1.
    """
    figure = plt.figure(1, figsize=(14, 5))
    axes = figure.add_subplot(1, 1, 1)

    # Episode numbers are 1-based on the x-axis.
    episode_numbers = np.arange(1, len(history) + 1)
    axes.plot(episode_numbers, history, lw=4, marker='o', markersize=10)

    axes.tick_params(axis='both', which='major', labelsize=15)
    axes.set_xlabel('Episodes', size=20)
    axes.set_ylabel('# Total Rewards', size=20)
    plt.show()

In [7]:
# --- Training configuration -------------------------------------------------
EPISODES = 60                   # number of training episodes
batch_size = 32                 # transitions sampled per learning step
init_replay_memory_size = 500   # environment steps collected before training
# Per-episode step budget: it grows by STEP_BUDGET_INCREMENT at the start of
# every episode for as long as it is still below MAX_STEP_BUDGET.
STEP_BUDGET_INCREMENT = 5
MAX_STEP_BUDGET = 600


def _reset_env(env, state_size):
    """Reset ``env`` and return the first observation reshaped to (1, state_size)."""
    return np.reshape(env.reset(), [1, state_size])


if __name__ == '__main__':
    # This script uses the classic gym API: reset() returns only the
    # observation and step() returns (observation, reward, done, info).
    env = gym.make('LunarLander-v2')
    # NOTE(review): the effect of this attribute is not evident from this
    # notebook - confirm it is still needed.
    env.mode = 'fast'
    agent = DQNAgent(env)

    # Warm-up: fill the replay memory before any learning step is taken.
    state = _reset_env(env, agent.state_size)
    for _step in range(init_replay_memory_size):
        action = agent.choose_action(state)
        next_state, reward, done, _info = env.step(action)
        next_state = np.reshape(next_state, [1, agent.state_size])
        agent.remember(Transition(state, action, reward, next_state, done))
        state = _reset_env(env, agent.state_size) if done else next_state

    total_rewards, losses = [], []
    step_budget = 0
    for e in range(EPISODES):
        print("In Episode: ", e)
        state = _reset_env(env, agent.state_size)
        episode_reward = 0
        done = False
        if step_budget < MAX_STEP_BUDGET:
            step_budget += STEP_BUDGET_INCREMENT

        for _step in range(step_budget):
            action = agent.choose_action(state)
            next_state, reward, done, _info = env.step(action)
            episode_reward += reward
            next_state = np.reshape(next_state, [1, agent.state_size])
            agent.remember(Transition(state, action, reward, next_state, done))
            state = next_state
            if done:
                # The terminal step is stored but no learning step follows it.
                break
            losses.append(agent.replay(batch_size, done))

        # The reward is recorded whether the episode reached a terminal state
        # or ran out of its step budget.
        total_rewards.append(episode_reward)
        if done:
            print('Episode: %d/%d, Total reward: %d' % (e, EPISODES, episode_reward))
        else:
            print("Terminated before terminal state")

    plot_learning_history(total_rewards)

In Episode:  0
Terminated before terminal state
In Episode:  1
Terminated before terminal state
In Episode:  2
Terminated before terminal state
In Episode:  3
Terminated before terminal state
In Episode:  4
Terminated before terminal state
In Episode:  5
Terminated before terminal state
In Episode:  6
Episode: 6/60, Total reward: -358
In Episode:  7
Terminated before terminal state
In Episode:  8
Terminated before terminal state
In Episode:  9
Terminated before terminal state
In Episode:  10
Episode: 10/60, Total reward: -64
In Episode:  11
Terminated before terminal state
In Episode:  12
Terminated before terminal state
In Episode:  13
Terminated before terminal state
In Episode:  14
Terminated before terminal state
In Episode:  15
Terminated before terminal state
In Episode:  16
Terminated before terminal state
In Episode:  17
Terminated before terminal state
In Episode:  18


KeyboardInterrupt: ignored

In [8]:
total_rewards

[-358.41096603996715, -64.20761427578776]