### Import libraries

In [1]:
import gym
import gym.spaces
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

### Parameters

In [2]:
episodes = 40 # will be a lot
max_steps = 200
gamma = 0.95 # specified by homework

# network parameters
hidden_size = 128
learning_rate = 0.01

# epochs 
epochs = 40

### V-Estimation

In [3]:
class VNetwork():
    def __init__(self, learning_rate=0.01, state_size=4, action_size=2, hidden_size=10, name='VNetwork'):
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')
            
            ## this could be changed depending on future code structure (loss function)
            one_hot_actions = tf.one_hot(self.actions_, action_size)
            
            # placeholder for Vs
            self.targetVs_ = tf.placeholder(tf.float32, [None], name='target')
            
            # Hidden Layers
            self.fc1 = tf.contrib.layers.fully_connected(self.inputs_, hidden_size, weights_initializer=tf.contrib.layers.xavier_initializer())
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, hidden_size, weights_initializer=tf.contrib.layers.xavier_initializer())
            self.fc3 = tf.contrib.layers.fully_connected(self.fc2, hidden_size, weights_initializer=tf.contrib.layers.xavier_initializer())
            
            # Output Layer
            self.output = tf.contrib.layers.fully_connected(self.fc3, action_size, activation_fn=tf.nn.softmax)
            
            # calculate value function
            self.V = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)
            
            # minimize SSE
            self.loss = tf.reduce_mean(tf.square(self.targetVs_ - self.V))
            
            # adjust network
            self.learn = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

### Setup

In [4]:
tf.reset_default_graph()
network = VNetwork(name = 'pray4us', hidden_size=hidden_size, learning_rate=learning_rate)

In [5]:
# set up environment
env = gym.make('CartPole-v0')

# Initialize the simulation
env.reset()

# Take one random step to get the pole and cart moving
state, reward, done, _ = env.step(env.action_space.sample())

### Training

In [6]:
saver = tf.train.Saver()
all_loss = []
all_rewards = []


with tf.Session() as sess: 
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        epoch_loss = []
        epoch_reward = []
        
        for i_episode in range(episodes): # ideally 1000
            state = env.reset()
    
            # for each sample
            reward = []
            current_state = []
            next_state = []
            action = []
    
            for t in range(max_steps):
                
                # determine action based on policy
                feed = {network.inputs_: state.reshape((1, *state.shape))}
                Vs = sess.run(network.output, feed_dict=feed)
                policy_action = np.argmax(Vs)
                
                policy_state, policy_reward, done, info = env.step(policy_action)
    
                reward.append(policy_reward)
                next_state.append(policy_state)
                action.append(policy_action)
                current_state.append(state)
                
                state = policy_state
                
                if done:
                    break
                    
            # learn things, network
            feed = {network.inputs_ : next_state}
            target = sess.run(network.output, feed_dict = feed)
            
            # check if next state = game over (all zeros)
            final_state = (next_state == np.zeros(current_state[0].shape)).all(axis=1)
            
            # reward at end of the game
            target[final_state] = (0, 0)
            
            # policy
            targets = sum(reward) + gamma * np.max(target, axis=1)
            
            loss, _ = sess.run([network.loss, network.learn],
                                  feed_dict={network.inputs_: current_state,
                                            network.targetVs_: targets,
                                            network.actions_: action})
            epoch_loss.append(loss)
            epoch_reward.append(sum(reward))
        all_loss.append(np.mean(epoch_loss))
        all_rewards.append(np.mean(epoch_reward))
            
        saver.save(sess, "checkpoints/cartpole{0}.ckpt".format(epoch))

In [7]:
all_loss

[84.86538,
 90.475,
 89.42999,
 88.384995,
 91.21999,
 88.8575,
 85.97249,
 86.81749,
 91.792496,
 87.53999,
 85.0275,
 87.017494,
 86.917496,
 86.69499,
 85.749985,
 85.649994,
 88.585,
 85.799995,
 84.854996,
 83.85999,
 84.38249,
 88.284996,
 85.17749,
 83.86,
 84.704994,
 90.22499,
 88.435,
 89.52999,
 87.017494,
 83.237495,
 89.80249,
 86.545,
 87.017494,
 89.37999,
 88.06249,
 88.33499,
 86.222496,
 88.112495,
 85.97249,
 83.909996]

In [8]:
all_rewards

[9.225,
 9.525,
 9.475,
 9.425,
 9.575,
 9.45,
 9.3,
 9.35,
 9.6,
 9.375,
 9.25,
 9.35,
 9.35,
 9.325,
 9.275,
 9.275,
 9.425,
 9.275,
 9.225,
 9.175,
 9.2,
 9.425,
 9.25,
 9.175,
 9.225,
 9.525,
 9.425,
 9.475,
 9.35,
 9.15,
 9.5,
 9.325,
 9.35,
 9.475,
 9.4,
 9.425,
 9.3,
 9.4,
 9.3,
 9.175]