# Simple DQN for Mountain Car

Here is an extremely simple deep-q-network implementation for mountain car.

It is far from optimal in a computation sense, and it doesn't make use of the tricks necessary to ensure consistently good performance. 
In fact, the network can diverge if the exploration causes it to get too far off track.
However, it works surprisingly well for a first attempt.
In particular, it's interesting to note that adding a bit of depth can cause immediate improvements in the network's ability to solve the task.

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import pinv
import pandas as pd

np.set_printoptions(precision=4, suppress=True)

import mdpy

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import gym
import tensorflow as tf
import tensorflow.contrib

In [3]:
from functools import reduce

In [4]:
env = gym.make('MountainCar-v0')

[2016-11-21 00:51:49,021] Making new env: MountainCar-v0


In [5]:
INPUT_SHAPE = env.observation_space.shape
n_inputs = reduce(lambda x, y: x*y, INPUT_SHAPE, 1)
n_outputs = env.action_space.n

In [126]:
# ses = tf.Session()
ses = tf.InteractiveSession()


X  = tf.placeholder(tf.float32, shape=[None, n_inputs])
# R  = tf.placeholder(tf.float32, shape=[None])
A  = tf.placeholder(tf.int32, shape=[None])
yq = tf.placeholder(tf.float32, shape=[None]) 

# Parameters
gamma = tf.constant(0.9999)


# Define network
# w_fc1 = tf.truncated_normal()
fc1 = tf.contrib.layers.fully_connected(X, 128)
fc2 = tf.contrib.layers.fully_connected(fc1, 32)

# Network outputs are action-values given input observations
out = tf.contrib.layers.fully_connected(fc2, n_outputs, activation_fn=None)

# Get q-values
action_mask = tf.cast(tf.one_hot(A, n_outputs), tf.bool)
q_value = tf.boolean_mask(out, action_mask)

# Next q-value with discounting
q_greedy = tf.reduce_max(out, 1)
q_next = tf.mul(gamma, q_greedy)

greedy_action = tf.argmax(out, 1)

# Define loss
losses = tf.squared_difference(q_value, yq)
loss = tf.reduce_mean(losses,)

# Define optimizer
# optimizer = tf.train.GradientDescentOptimizer(1e-2)
# optimizer = tf.train.RMSPropOptimizer(0.001)
optimizer = tf.train.AdamOptimizer(0.001)
train_op = optimizer.minimize(loss, global_step=tf.contrib.framework.get_global_step())

tf.initialize_all_variables().run()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [56]:
_x_ = [env.observation_space.sample()]

In [80]:
out.eval(feed_dict={X: _x_, A: [2]})

array([[-0.1149, -0.0782,  0.0184]], dtype=float32)

In [49]:
q_value.eval(feed_dict={X: _x_, A: [0]})

array([ 0.], dtype=float32)

In [36]:
ses.run(q_greedy, feed_dict={X: [env.observation_space.sample()]})

array([ 0.0556], dtype=float32)

In [27]:
q_next.get_shape()

TensorShape([])

In [37]:
ses.run(q_next, feed_dict={X: [env.observation_space.sample()]*3})

array([ 0.1707,  0.1707,  0.1707], dtype=float32)

In [81]:
init_obs = env.reset()

In [97]:
out.eval(feed_dict={X: [init_obs]})

array([[ 0.0343, -0.0097, -0.0294]], dtype=float32)

In [116]:
ses.run(train_op, feed_dict={X: [obs], A: [action], yq: q_pred})

In [117]:
ses.run(q_value, feed_dict={X: [obs], A: [action], yq: q_pred})

array([-3501.7778], dtype=float32)

In [118]:
ses.run(out, feed_dict={X: [obs], A: [action], yq: q_pred})

array([[-3511.4417, -3508.9768, -3501.7778]], dtype=float32)

In [115]:
ses.run(out, feed_dict={X: [obs], A: [action], yq: q_pred})

array([[-3511.4917, -3508.9631, -3501.9343]], dtype=float32)

In [106]:
ses.run(greedy_action, feed_dict={X: [obs], A: [action], yq: q_pred})

array([2])

In [113]:
ses.run(q_next, feed_dict={X: [obs], A: [action], yq: q_pred})

array([-3501.584], dtype=float32)

In [127]:
num_episodes = 100
max_steps = 2000


epsilon = 2e-2

xlst = []
dlst = []
for i in range(num_episodes):
    obs = env.reset()
    print("Episode: %d"%i)
    print(out.eval(feed_dict={X: [init_obs]}))
    for j in range(max_steps):
        if np.random.random() <= epsilon:
            action = np.random.randint(n_outputs)
        else:
            action = int(ses.run(greedy_action, feed_dict={X: [obs]}))
            
        # Take action
        obs_p, reward, done, info = env.step(action)
        
        # Q-value update
        q_curr = ses.run(q_value, feed_dict={X: [obs], A: [action]})
        q_pred = reward + ses.run(q_next, feed_dict={X: [obs_p]})
        
        # Record information
        delta = float(q_pred - q_curr)
        dlst.append(delta)
        xlst.append(obs)
        
        ses.run(train_op, feed_dict={X: [obs], A: [action], yq: q_pred})
        # Break if done
        if done:
            print("Reached the end in: %d steps"%j)
            break
        
        # Set up for next iteration
        obs = obs_p
    else:
        print("Failed to reach the end within the time limit!")

Episode: 0
[[-0.0064  0.054  -0.0544]]
Failed to reach the end within the time limit!
Episode: 1
[[-551.5478 -550.9449 -531.4397]]
Failed to reach the end within the time limit!
Episode: 2
[[-657.6057 -645.6402 -628.7264]]
Reached the end in: 1578 steps
Episode: 3
[[-740.5845 -748.1631 -678.0052]]
Reached the end in: 1188 steps
Episode: 4
[[-754.8339 -767.7115 -699.524 ]]
Reached the end in: 978 steps
Episode: 5
[[-754.6192 -763.8103 -704.7495]]
Reached the end in: 851 steps
Episode: 6
[[-793.7819 -823.4808 -765.5874]]
Failed to reach the end within the time limit!
Episode: 7
[[-821.1561 -836.3354 -809.0098]]
Failed to reach the end within the time limit!
Episode: 8
[[-852.5285 -846.9985 -855.2771]]
Failed to reach the end within the time limit!
Episode: 9
[[-797.4323 -831.5331 -843.3887]]
Failed to reach the end within the time limit!
Episode: 10
[[ -981.5047 -1015.6758 -1009.6324]]
Failed to reach the end within the time limit!
Episode: 11
[[-1085.151  -1024.1689 -1093.1429]]
Failed 

In [125]:
out.eval(feed_dict={X: [init_obs]})

array([[-8926.7139, -8911.9717, -8893.293 ]], dtype=float32)

In [59]:
n_outputs

3

In [58]:
np.random.randint(n_outputs)

2