In [7]:
import gym
import random
import tensorflow as tf
import numpy as np

In [2]:
STATE_FEATURE_DIM = 4
ACTION_DIM = 2
HIDDEN_DIM = 32

def build_graph(x, x_):
    # Build eval graph
    with tf.variable_scope('eval_graph'):
        collections = ['eval_graph_param', tf.GraphKeys.GLOBAL_VARIABLES]
        with tf.variable_scope("input_layer"):
            W1 = tf.get_variable(
                'W1', [STATE_FEATURE_DIM, HIDDEN_DIM], initializer=tf.truncated_normal_initializer, 
                collections=collections)
            b1 = tf.get_variable('b1', [HIDDEN_DIM], initializer=tf.constant_initializer(0.1), collections=collections)
            relu1 = tf.nn.relu(tf.matmul(x, W1) + b1)

        with tf.variable_scope("hidden_layer"):
            W2 = tf.get_variable(
                'W2', [HIDDEN_DIM, HIDDEN_DIM], initializer=tf.truncated_normal_initializer,
                collections=collections)
            b2 = tf.get_variable('b2', [HIDDEN_DIM], initializer=tf.constant_initializer(0.1), collections=collections)
            relu2 = tf.nn.relu(tf.matmul(relu1, W2) + b2)

        with tf.variable_scope("output_layer"):
            W3 = tf.get_variable(
                'W3', [HIDDEN_DIM, ACTION_DIM], initializer=tf.truncated_normal_initializer,
                collections=collections)
            b3 = tf.get_variable('b3', [ACTION_DIM], initializer=tf.constant_initializer(0.1), collections=collections)
            out = tf.matmul(relu2, W3) + b3
    
    # Build target graph
    with tf.variable_scope('target_graph'):
        collections = ['target_graph_param', tf.GraphKeys.GLOBAL_VARIABLES]
        with tf.variable_scope("input_layer"):
            W1 = tf.get_variable(
                'W1', [STATE_FEATURE_DIM, HIDDEN_DIM], initializer=tf.truncated_normal_initializer, 
                collections=collections)
            b1 = tf.get_variable('b1', [HIDDEN_DIM], initializer=tf.constant_initializer(0.1), collections=collections)
            relu1 = tf.nn.relu(tf.matmul(x_, W1) + b1)

        with tf.variable_scope("hidden_layer"):
            W2 = tf.get_variable(
                'W2', [HIDDEN_DIM, HIDDEN_DIM], initializer=tf.truncated_normal_initializer,
                collections=collections)
            b2 = tf.get_variable('b2', [HIDDEN_DIM], initializer=tf.constant_initializer(0.1), collections=collections)
            relu2 = tf.nn.relu(tf.matmul(relu1, W2) + b2)

        with tf.variable_scope("output_layer"):
            W3 = tf.get_variable(
                'W3', [HIDDEN_DIM, ACTION_DIM], initializer=tf.truncated_normal_initializer,
                collections=collections)
            b3 = tf.get_variable('b3', [ACTION_DIM], initializer=tf.constant_initializer(0.1), collections=collections)
            out_ = tf.matmul(relu2, W3) + b3
    
    return out, out_

In [3]:
def copy_graph_params():
    target_graph_param = tf.get_collection("target_graph_param")
    eval_graph_param = tf.get_collection("eval_graph_param")
    
    [tf.assign(t, s) for t, s in zip(target_graph_param, eval_graph_param)]

In [50]:
EPSILON = 0.5
EPSILON_DECAY = 0.99
EPSILON_MIN = 0.01

def action(state, out, x):
    if np.random.uniform() < EPSILON:
        return np.random.randint(0, 1)
    return np.argmax(out.eval(feed_dict={x: state}))

def test_action(state, out, x):
    return np.argmax(out.eval(feed_dict={x: state}))

tf.reset_default_graph()

x = tf.placeholder(tf.float32, shape=[None, STATE_FEATURE_DIM], name="training_sample")
x_ = tf.placeholder(tf.float32, shape=[None, STATE_FEATURE_DIM], name="training_sample_target")
y_ = tf.placeholder(tf.float32, shape=[None, ACTION_DIM], name="training_label")

out, out_ = build_graph(x, x_)
loss = tf.reduce_mean(tf.squared_difference(out, y_))

train_step = tf.train.RMSPropOptimizer(1e-4).minimize(loss)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

env = gym.make('CartPole-v0')
gamma = 0.9
running_loss = 0.0
running_score = 0.0
num_epsiode = 0
MEMORY_SIZE = 10000
memory = np.zeros((MEMORY_SIZE, 11))
counter = 0
scores = []

for _ in range(1000):
    state = env.reset()
    state = np.reshape(state, [1, 4])

    for t in range(500):
        act = action(state, out, x)

        next_state, reward, done, _ = env.step(act)
        next_state = np.reshape(next_state, [1, 4])
        
        ind = counter % MEMORY_SIZE
        counter += 1
        memory[ind, :] = np.concatenate([state[0], [reward, act, done], next_state[0]])
        state = next_state

        if done:
            running_score += t
            scores.append(t)
            break

        samples = memory[np.random.choice(min(counter, 10000), 32), :]
        eval_samples = samples[:, : STATE_FEATURE_DIM]
        target_samples = samples[:, -STATE_FEATURE_DIM : ]
        eval_labels, target_labels = sess.run([out, out_], feed_dict={x: eval_samples, x_: target_samples})

        reward = samples[:, STATE_FEATURE_DIM]
        eval_act = samples[:, STATE_FEATURE_DIM+1].astype(int)
        done = samples[:, STATE_FEATURE_DIM+2]
        q_target = eval_labels.copy()
        q_target[np.arange(32), eval_act] = reward + gamma * np.max(target_labels, axis=1) * (1 - done) + done * (-100)

        sess.run(train_step, feed_dict={x: eval_samples, y_: q_target})
    
    num_epsiode += 1
    if num_epsiode % 100 == 0:
        print("Current running score is: %.2f" % (running_score / 100))
        running_score = 0.0
    
    if EPSILON > EPSILON_MIN:
        EPSILON *= EPSILON_DECAY

    if num_epsiode % 300 == 0:
        copy_graph_params()

[2017-09-26 23:33:25,160] Making new env: CartPole-v0


Current running score is: 12.21
Current running score is: 8.24
Current running score is: 8.32
Current running score is: 8.34
Current running score is: 8.30
Current running score is: 8.32
Current running score is: 8.35
Current running score is: 8.26
Current running score is: 8.39
Current running score is: 8.40


In [41]:
total_score = 0.0
for _ in range(10):
    score = 0.0
    state = env.reset()
    while True:
        env.render()
        state = np.reshape(state, [1, 4])
        
        act = test_action(state, out, x)

        next_state, reward, done, _ = env.step(act)
        next_state = np.reshape(next_state, [1, 4])
        state = next_state
        if done:
            break
        score += reward
    total_score += score
print(total_score / 10)

8.2


In [53]:
x = tf.Variable(100)
y = tf.Variable(200)
sess.run(tf.global_variables_initializer())
sess.run(tf.assign(x, y))
print(x.eval())

200
