# Prepare environment

In [None]:
# Install the Python packages (gym, pyvirtualdisplay) and the system packages
# (xvfb, python-opengl, ffmpeg) used by the rest of the notebook.
# Both stdout and stderr are discarded, so a failed install is silent here.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [None]:
# Refresh the apt index, install cmake, then upgrade setuptools and install
# ez_setup and gym[atari] with pip.
# NOTE(review): the setuptools line only merges stderr into stdout and does not
# redirect to /dev/null, so it is the one command here whose output is shown —
# confirm whether that is intentional.
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1



In [None]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import random
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [None]:
# Create and start a 1400x900 pyvirtualdisplay Display with visible=0.
# Presumably this gives env.render() a display to draw on in a headless
# runtime — TODO confirm for the target environment.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f0481cceed0>

In [None]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)".
"""

def show_video():
  """Embed a recorded episode video in the notebook output.

  Looks for ``*.mp4`` files in the ``video/`` directory (relative to the
  current working directory). If at least one exists, the first path returned
  by ``glob`` is read, base64-encoded and displayed inline as an HTML5
  ``<video>`` element. If none exists, prints ``"Could not find video"``.

  Returns:
    None.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode inside a context manager: the handle is closed
  # deterministically and no write permission on the file is required.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Wrap `env` in a gym `Monitor` targeting './video' (force=True) and return the wrapper."""
  return Monitor(env, './video', force=True)

# DQN agent

In [None]:
import random
import gym
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
class DQNAgent:
    """Deep Q-learning agent with a replay buffer and epsilon-greedy exploration.

    Attributes:
        state_size: Number of features in one observation.
        action_size: Number of discrete actions.
        memory: Bounded deque (maxlen=2000) of
            ``(state, action, reward, next_state, done)`` transitions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Current probability of picking a random action in ``act``.
        epsilon_min: Value below which ``epsilon`` is no longer decayed.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` after every
            ``replay`` call.
        learning_rate: Learning rate handed to the Adam optimizer.
        model: Compiled Keras network mapping states to per-action Q-values.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (256 -> 128 -> action_size, MSE loss)."""
        model = Sequential()
        # Inputs are plain (batch, state_size) matrices; `act` and `replay`
        # normalise whatever shape the caller stored to this layout.
        model.add(Dense(256, input_shape=(self.state_size,), activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by recent Keras releases.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def memorize(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: A single observation, shaped ``(1, state_size)`` or
                ``(state_size,)``.

        Returns:
            int: A random action with probability ``epsilon``, otherwise the
            action with the highest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        batch = np.reshape(state, (-1, self.state_size))
        act_values = self.model.predict(batch, verbose=0)
        return int(np.argmax(act_values[0]))  # returns action

    def replay(self, batch_size):
        """Fit the network on a random minibatch drawn from the replay buffer.

        For each sampled transition the target of the taken action is
        ``reward`` when ``done`` is true, otherwise
        ``reward + gamma * max_a Q(next_state, a)``; targets of the other
        actions are left at the current predictions. ``epsilon`` is decayed
        once per call while it is above ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            ValueError: From ``random.sample`` if the buffer holds fewer than
                ``batch_size`` transitions.
        """
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch) if minibatch else ((), (), (), (), ())

        # Stored states may carry a leading batch axis of 1; flatten to 2-D.
        states = np.reshape(np.array(states), (-1, self.state_size))
        next_states = np.reshape(np.array(next_states), (-1, self.state_size))
        actions = np.array(actions, dtype=int)
        rewards = np.array(rewards, dtype=float)
        dones = np.array(dones, dtype=bool)

        q_current = self.model.predict(states, verbose=0)
        q_next = self.model.predict(next_states, verbose=0)

        # Terminal transitions do not bootstrap from the next state.
        bootstrapped = rewards + self.gamma * np.max(q_next, axis=1)
        q_current[np.arange(len(minibatch)), actions] = np.where(dones, rewards, bootstrapped)

        self.model.fit(states, q_current, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load network weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file `name`."""
        self.model.save_weights(name)

In [None]:
# Build the recorded MountainCar-v0 environment and read the problem
# dimensions that size the agent's network.
env = gym.make('MountainCar-v0')
env = wrap_env(env)
observation_shape = env.observation_space.shape
state_size = observation_shape[0]
action_size = env.action_space.n

# Load Pretrained Model

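**Do not forget to reset epsilon to 0 after loading, so that the agent acts greedily instead of exploring.** The `agent` object must already exist (it is created in the *Train model* section) before `agent.load(...)` can be called.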

In [None]:
#agent.load('dqn_moutaincar.h5')
#agent.epsilon = 0

# Train model

In [None]:
# Training configuration.
EPISODES = 50      # number of episodes to run
batch_size = 256   # transitions sampled per replay call

# Running statistics gathered over all episodes.
scores = list()
global_max_score = global_max_height = -1e10

# Fresh agent sized for the environment created earlier.
agent = DQNAgent(state_size, action_size)

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Run EPISODES episodes, storing every transition and training on a sampled
# minibatch after each step once the buffer holds more than batch_size items.
for e in range(1, EPISODES + 1):
    # Per-episode bookkeeping.
    total_score = 0
    step = 0
    max_height = -1e10
    state = np.reshape(env.reset(), [1, state_size])
    done = False

    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)

        # Shaped reward: environment reward plus 0.8 * |next_state[1]| minus
        # |state[1]|. Index 1 is presumably the velocity component of the
        # MountainCar-v0 observation — TODO confirm against the gym docs.
        next_component = abs(next_state[1])
        prev_component = abs(state[0][1])
        modified_reward = reward + 0.8 * next_component - prev_component

        # The reported score uses the unshaped environment reward.
        total_score += reward
        # "Height" here is simply the largest next_state[0] seen this episode.
        max_height = max(max_height, next_state[0])

        next_state = np.reshape(next_state, [1, state_size])
        agent.memorize(state, action, modified_reward, next_state, done)
        state = next_state

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    scores.append(total_score)
    global_max_score = max(global_max_score, total_score)
    global_max_height = max(global_max_height, max_height)

    # Report every episode (e % 1 is always 0).
    if e % 1 == 0:
        print("Episode: {}/{}".format(e, EPISODES))
        print(" Total score for episode {} : {}, Max height : {}".format(e, total_score, max_height))
        print(" GLOBAL MAXIMUMS: Max score : {}, Max height  : {}".format(global_max_score, global_max_height))
        print('-' * 150)

env.close()

Episode: 1/50
 Total score for episode 1 : -200.0, Max height : -0.41177092627397005
 GLOBAL MAXIMUMS: Max score : -200.0, Max height  : -0.41177092627397005
------------------------------------------------------------------------------------------------------------------------------------------------------
Episode: 2/50
 Total score for episode 2 : -200.0, Max height : -0.3430364910627893
 GLOBAL MAXIMUMS: Max score : -200.0, Max height  : -0.3430364910627893
------------------------------------------------------------------------------------------------------------------------------------------------------
Episode: 3/50
 Total score for episode 3 : -200.0, Max height : -0.33022654712485994
 GLOBAL MAXIMUMS: Max score : -200.0, Max height  : -0.33022654712485994
------------------------------------------------------------------------------------------------------------------------------------------------------
Episode: 4/50
 Total score for episode 4 : -200.0, Max height : -0.28336924

# Test model

In [None]:
# Play one rendered episode with the current agent, then show the recording.
env = gym.make('MountainCar-v0')
env = wrap_env(env)
state = np.reshape(env.reset(), [1, state_size])

while True:
  env.render()
  action = agent.act(state)
  next_state, reward, done, _ = env.step(action)
  next_state = np.reshape(next_state, [1, state_size])
  state = next_state
  if done:
    break

env.close()
show_video()

In [None]:
# Write the agent's network weights to 'dqn_moutaincar.h5' (DQNAgent.save
# forwards to model.save_weights); this is the same file name used by the
# commented-out load cell above.
agent.save('dqn_moutaincar.h5')