### Import libraries

In [1]:
import gym
import gym.spaces
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os

In [23]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13975096947971138428
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7911899136
locality {
  bus_id: 1
  links {
  }
}
incarnation: 13461418281855118340
physical_device_desc: "device: 0, name: GeForce GTX 1080, pci bus id: 0000:06:00.0, compute capability: 6.1"
]


### Set Up GPUs

In [16]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

### Parameters

In [17]:
episodes = 1000 # ideally > 1,000
gamma = 0.95 # specified by homework

# network parameters
learning_rate = 0.01
hidden_size = 10

# epochs 
epochs = 1000 # will be a lot on GPUs

### V-Estimation

In [18]:
class PolicyGradient():
    def __init__(self, learning_rate=0.01, state_size=4, action_size=2, hidden_size=10, name='PolicyGradient'):
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            self.actions_ = tf.placeholder(tf.int32, [None, action_size], name='actions')
            self.expected_future_rewards_ = tf.placeholder(tf.float32, [None,], name="expected_future_rewards")
            
            # Hidden Layers
            self.fc1 = tf.contrib.layers.fully_connected(self.inputs_, hidden_size, 
                                                         weights_initializer=tf.contrib.layers.xavier_initializer())
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, action_size, 
                                                         weights_initializer=tf.contrib.layers.xavier_initializer())
            self.fc3 = tf.contrib.layers.fully_connected(self.fc2, action_size, 
                                                         weights_initializer=tf.contrib.layers.xavier_initializer())
            
            # Output Layer
            self.action_distribution = tf.nn.softmax(self.fc3)
            
            self.log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.action_distribution, labels = self.actions_)
            self.loss = tf.reduce_mean(-1 * self.log_prob * self.expected_future_rewards_) 
            
            # adjust network
            self.learn = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

### Setup

In [19]:
tf.reset_default_graph()
network = PolicyGradient(name = 'pray4us', hidden_size=hidden_size, learning_rate=learning_rate)

In [20]:
# set up environment
env = gym.make('CartPole-v0')

# Initialize the simulation
env.reset()

# Take one random step to get the pole and cart moving
state, reward, done, _ = env.step(env.action_space.sample())

### Predict Future Reward Function

In [21]:
def expected_rewards(episode_rewards):
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    future = 0.0
    for i in reversed(range(len(episode_rewards))):
        future = episode_rewards[i] + (gamma * future)
        discounted_episode_rewards[i] = future
        
    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    discounted_episode_rewards = (discounted_episode_rewards - mean) / (std)
    
    return discounted_episode_rewards

### Training

In [22]:
saver = tf.train.Saver()
all_loss = []
all_rewards = []


with tf.Session() as sess: 
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        epoch_loss = []
        epoch_reward = []
        
        for i_episode in range(episodes): # ideally 1000
            state = env.reset()
    
            # for each episode
            ep_reward, ep_current_state, ep_next_state, ep_action = [], [], [], []
    
            while True: # max_steps not needed, stopping included in done
            
                # get action prob distribution w.r.t. policy
                feed = {network.inputs_: state.reshape((1,*state.shape))}
                action_prob_dist = sess.run(network.action_distribution, feed_dict=feed)
                
                # select action w.r.t. distribution
                action = np.random.choice(range(action_prob_dist.shape[1]), p=action_prob_dist.ravel())
                new_state, reward, done, info = env.step(action)
                
                # keep track of all rewards, states, and actions
                ep_reward.append(reward)
                ep_next_state.append(new_state)
                ep_current_state.append(state)
                
                # reformat for softmax
                action_ = np.zeros(action_prob_dist.shape[1])
                action_[action] = 1
                ep_action.append(action_)
                
                state = new_state
                
                if done:
                    ep_total_reward = np.sum(ep_reward)
                    epoch_reward.append(ep_total_reward)
                    
                    # calc future reward
                    exp_future_reward = expected_rewards(ep_reward)
                    
                    # train
                    loss, _ = sess.run([network.loss, network.learn],
                                       feed_dict={network.inputs_: np.vstack(np.array(ep_current_state)),
                                                  network.actions_: np.vstack(np.array(ep_action)), 
                                                  network.expected_future_rewards_: exp_future_reward})
                    break
            epoch_loss.append(loss)
            epoch_reward.append(sum(ep_reward))
        all_loss.append(np.mean(epoch_loss))
        all_rewards.append(np.mean(epoch_reward))
        
        if (epoch % 2 == 0):
            print("Epoch: ", epoch, ", Average Reward: ", np.mean(epoch_reward))
            
        saver.save(sess, "checkpoints/cartpole{0}.ckpt".format(epoch))

Epoch:  0 , Average Reward:  22.151
Epoch:  2 , Average Reward:  21.868
Epoch:  4 , Average Reward:  21.875
Epoch:  6 , Average Reward:  22.722
Epoch:  8 , Average Reward:  21.742
Epoch:  10 , Average Reward:  21.731
Epoch:  12 , Average Reward:  21.932
Epoch:  14 , Average Reward:  21.496


KeyboardInterrupt: 