In [1]:
import numpy as np
import tensorflow as tf
import gym

In [56]:
# Hyperparameters
D = 80 * 80 # Dimision of input image
H = 200 # Number of hidden layer neurons
batch_size = 10 # Every how many episodes to do a param update
learning_rate = 1e-4
gamma = 0.99 # Discount factor for reward
decay_rate = 0.99 # Decay factor for RMSProp leaky sum of grad^2
render = True
save_path = 'models/pong.ckpt'
MAX_EPISODE_NUMBER = 200

In [31]:
def discount_rewards(r):
    """Take 1D float array of rewards and compute discounted reward"""
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        if r[t] != 0: 
            running_add = 0 # reset the sum, since this was a game boundary
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

def prepro(I):
    """Prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector"""
    I = I[35:195] # Crop
    I = I[::2, ::2, 0] # Downsample by factor of 2
    I[I == 144] = 0 # Erase background
    I[I == 109] = 0 # Erase background
    I[I != 0] = 1 # everything else just set to 1
    return I.astype(np.float).ravel()

In [51]:
# Model initialization
W1 = tf.Variable(tf.truncated_normal([D, H], mean=0, stddev=1./np.sqrt(D), dtype=tf.float32))
W2 = tf.Variable(tf.truncated_normal([H, 1], mean=0, stddev=1./np.sqrt(H), dtype=tf.float32))
x = tf.placeholder(dtype=tf.float32, shape=[None, D])
y = tf.placeholder(dtype=tf.float32, shape=[None, 1])
discounted_rewards = tf.placeholder(dtype=tf.float32, shape=[None, 1])

fc1 = tf.matmul(x, W1)
relu = tf.nn.relu(fc1)
fc2 = tf.matmul(relu, W2)
# Calculate probability which is used for sample action
sig = tf.nn.sigmoid(fc2)
# Train the policy network according the reward we get in final
loss = tf.nn.l2_loss(y - sig)
optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay_rate)
grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=discounted_rewards)
train = optimizer.apply_gradients(grads)

In [57]:
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None
xs, ys, drs = [], [], []
reward_sum = 0
running_reward = 0
episode_number = 0

init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init)

saver = tf.train.Saver(tf.all_variables())
load_was_success = True
try:
    save_dir = '/'.join(save_path.split('/')[:-1])
    ckpt = tf.train.get_checkpoint_state(save_dir)
    load_path = ckpt.model_checkpoint_path
    saver.restore(sess, load_path)
except Exception:
    print("No saved model to load, starting new session")
    load_was_success = False
else:
    print("Load model: {}".format(load_path))
    saver = tf.train.Saver(tf.all_variables())
    episode_number = int(load_path.split('-')[-1])


while True:
    if episode_number >= MAX_EPISODE_NUMBER:
        break

    if render:
        env.render()
    
    # process the observation, set input to network to be difference image 
    cur_x = prepro(observation)
    tf_x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    tf_x = np.reshape(tf_x, (1, -1))
    prev_x = cur_x
    
    feed_dict={x: tf_x}
    aprob = sess.run(sig, feed_dict=feed_dict)
    action = 2 if np.random.uniform() < aprob else 3
    
    # record various intermediates
    xs.append(tf_x)
    tf_y = 1 if action == 2 else 0
    ys.append(tf_y)
    
    # step the enviornment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    
    drs.append(reward)
    
    if done:
        episode_number += 1
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        
        # stack everything together
        epx = np.vstack(xs)
        epy = np.vstack(ys)
        epr = np.vstack(drs)
        xs, ys, drs = [], [], []
        
        discounted_epr = discount_rewards(epr)
        discounted_epr -= np.mean(discounted_epr)
        discounted_epr /= np.std(discounted_epr)
        
        feed_dict = {x: epx, y: epy, discounted_rewards: discounted_epr}
        sess.run(train, feed_dict=feed_dict)
        
        observation  = env.reset()
        prev_x = None
        
        if episode_number % 10 == 0:
            print('ep {}: reward: {}, mean reward: {:3f}'.format(episode_number, reward_sum, running_reward)) 
        
        if episode_number % 50 == 0:
            saver.save(sess, save_path, global_step=episode_number)
            print("SAVE MODEL #{}".format(episode_number))


[2017-06-03 18:01:17,872] Making new env: Pong-v0


Instructions for updating:
Please use tf.global_variables instead.


[2017-06-03 18:01:19,015] From <ipython-input-57-79a763210b8f>:13: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.


INFO:tensorflow:Restoring parameters from models/pong.ckpt-100


[2017-06-03 18:01:19,200] Restoring parameters from models/pong.ckpt-100


Load model: models/pong.ckpt-100
Instructions for updating:
Please use tf.global_variables instead.


[2017-06-03 18:01:19,666] From <ipython-input-57-79a763210b8f>:25: all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Please use tf.global_variables instead.


ep 101: reward: -21.0, mean reward: -0.210000
ep 102: reward: -42.0, mean reward: -0.627900
ep 103: reward: -63.0, mean reward: -1.251621
ep 104: reward: -84.0, mean reward: -2.079105
ep 105: reward: -104.0, mean reward: -3.098314
ep 106: reward: -125.0, mean reward: -4.317331
ep 107: reward: -145.0, mean reward: -5.724157
ep 108: reward: -166.0, mean reward: -7.326916
ep 109: reward: -187.0, mean reward: -9.123647
ep 110: reward: -207.0, mean reward: -11.102410
ep 111: reward: -228.0, mean reward: -13.271386
ep 112: reward: -249.0, mean reward: -15.628672
ep 113: reward: -270.0, mean reward: -18.172385
ep 114: reward: -291.0, mean reward: -20.900662
ep 115: reward: -312.0, mean reward: -23.811655
ep 116: reward: -333.0, mean reward: -26.903538
ep 117: reward: -353.0, mean reward: -30.164503
ep 118: reward: -374.0, mean reward: -33.602858
ep 119: reward: -395.0, mean reward: -37.216829
ep 120: reward: -416.0, mean reward: -41.004661
ep 121: reward: -436.0, mean reward: -44.954615
ep 12