### Import libraries

In [1]:
import gym
import gym.spaces
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os

### Set Up GPUs

In [4]:
# os.environ["CUDA_VISIBLE_DEVICES"]="1"

### Parameters

In [5]:
# rollout batch size: number of episodes collected before each gradient update
episodes = 300 # ideally > 1,000
gamma = 0.95 # discount factor applied to future rewards (value specified by homework)

# network parameters
learning_rate = 0.01 # step size handed to the optimizer
hidden_size = 10 # width of the first hidden layer

# number of training iterations; each one collects `episodes` rollouts and runs one update
epochs = 1000 # will be a lot on GPUs

### Network Design

In [6]:
class PolicyGradient():
    """Softmax policy network with a policy-gradient loss (TF1 graph mode).

    Building an instance adds the following to the current default graph,
    all inside ``tf.variable_scope(name)``:

    * ``inputs_``: float32 placeholder ``[None, state_size]`` for states.
    * ``actions_``: int32 placeholder ``[None, action_size]`` for one-hot
      encoded actions that were taken.
    * ``expected_future_rewards_``: float32 placeholder ``[None]`` holding the
      return used to weight each step.
    * ``action_distribution``: softmax over the logits (``fc3``).
    * ``loss``: mean of per-step cross-entropy weighted by the fed returns.
    * ``learn``: Adam op minimising ``loss``.

    Args:
        learning_rate: learning rate passed to ``tf.train.AdamOptimizer``.
        state_size: size of one state vector.
        action_size: number of discrete actions.
        hidden_size: number of units in the first hidden layer.
        name: variable scope under which all variables are created.
    """

    def __init__(self, learning_rate=0.01, state_size=4, action_size=2, hidden_size=10, name='PolicyGradient'):
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            self.actions_ = tf.placeholder(tf.int32, [None, action_size], name='actions')
            self.expected_future_rewards_ = tf.placeholder(tf.float32, [None,], name="expected_future_rewards")

            # Hidden Layers (fully_connected applies ReLU unless activation_fn is overridden)
            self.fc1 = tf.contrib.layers.fully_connected(self.inputs_, hidden_size,
                                                         weights_initializer=tf.contrib.layers.xavier_initializer())
            # NOTE(review): this layer is only `action_size` wide, not `hidden_size`;
            # kept as-is so existing checkpoints keep their variable shapes -- confirm intent.
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, action_size,
                                                         weights_initializer=tf.contrib.layers.xavier_initializer())
            # Logits layer: must be linear. With the default ReLU the logits are
            # clamped to >= 0 and a dead unit passes no gradient, which cripples
            # the softmax policy.
            self.fc3 = tf.contrib.layers.fully_connected(self.fc2, action_size,
                                                         activation_fn=None,
                                                         weights_initializer=tf.contrib.layers.xavier_initializer())

            # Output Layer
            self.action_distribution = tf.nn.softmax(self.fc3)

            # Cross-entropy against the one-hot action is -log pi(a|s); the attribute
            # keeps its historical name `log_prob` even though it is the negative.
            self.log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.fc3, labels = self.actions_)
            self.loss = tf.reduce_mean(self.log_prob * self.expected_future_rewards_)

            # adjust network
            self.learn = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

### Setup

In [7]:
# Clear the default graph, then build the policy network inside it.
tf.reset_default_graph()
network = PolicyGradient(
    learning_rate=learning_rate,
    hidden_size=hidden_size,
    name='pray4us',
)

In [8]:
# set up environment
env = gym.make('CartPole-v0')

# Initialize the simulation
env.reset()

# Take one random step to get the pole and cart moving
state, reward, done, _ = env.step(env.action_space.sample())

### Predict Future Reward Function

In [9]:
def expected_rewards(episode_rewards, discount=None):
    """Return normalised discounted returns for one sequence of rewards.

    For every step ``i`` the discounted return
    ``G_i = r_i + discount * G_{i+1}`` is computed by walking the sequence
    backwards; the returns are then shifted to zero mean and, when they are
    not all identical, scaled to unit standard deviation.

    Args:
        episode_rewards: sequence of per-step rewards (ints or floats).
        discount: discount factor; when ``None`` the notebook-level ``gamma``
            is used.

    Returns:
        1-D float64 ``numpy`` array with the same length as
        ``episode_rewards``. Empty input yields an empty array; a sequence
        whose returns have zero spread yields all zeros rather than NaN.
    """
    if discount is None:
        discount = gamma

    # Force a float buffer: zeros_like on integer rewards would build an int
    # array and silently truncate every discounted return.
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    discounted_episode_rewards = np.zeros_like(rewards)

    future = 0.0
    for i in reversed(range(len(rewards))):
        future = rewards[i] + (discount * future)
        discounted_episode_rewards[i] = future

    if discounted_episode_rewards.size == 0:
        return discounted_episode_rewards

    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    centered = discounted_episode_rewards - mean

    # Skip the scaling when there is no spread, avoiding a 0/0 -> NaN result.
    if std > 0:
        return centered / std
    return centered

### Training

In [19]:
# Training: each epoch rolls out `episodes` episodes with the current policy,
# then performs a single policy-gradient update on the collected batch.
saver = tf.train.Saver()
all_loss = []      # loss of every update, one entry per epoch
all_rewards = []   # average episode reward, one entry per epoch

# Saver.save can fail when the target directory is missing, so create it up front.
os.makedirs("checkpoints", exist_ok=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(epochs):
        ep_reward, ep_current_state, ep_next_state, ep_action = [], [], [], []
        ep_returns = []  # one array of normalised returns per episode
        for i_episode in range(episodes): # ideally 1000
            state = env.reset()
            episode_start = len(ep_reward)  # index of this episode's first reward
            while True:

                # get action prob distribution w.r.t. policy
                feed = {network.inputs_: state.reshape((1,*state.shape))}
                action_prob_dist = sess.run(network.action_distribution, feed_dict=feed)

                # select action w.r.t. distribution
                action = np.random.choice(range(action_prob_dist.shape[1]), p=action_prob_dist.ravel())
                new_state, reward, done, info = env.step(action)

                # keep track of all rewards, states, and actions
                ep_reward.append(reward)
                ep_next_state.append(new_state)
                ep_current_state.append(state)

                # reformat for softmax
                action_ = np.zeros(action_prob_dist.shape[1])
                action_[action] = 1
                ep_action.append(action_)

                # reset current state to be new state
                state = new_state

                if done:
                    break

            # Discount within this episode only. Discounting the concatenated
            # rewards of the whole batch lets reward from later episodes leak
            # into earlier ones, which makes almost every return identical.
            ep_returns.append(expected_rewards(ep_reward[episode_start:]))

        exp_future_reward = np.concatenate(ep_returns)
        loss, _ = sess.run([network.loss, network.learn],
                    feed_dict={network.inputs_: np.vstack(np.array(ep_current_state)),
                               network.actions_: np.vstack(np.array(ep_action)),
                               network.expected_future_rewards_: exp_future_reward})

        # record training curves
        avg_reward = np.sum(ep_reward)/episodes
        all_loss.append(loss)
        all_rewards.append(avg_reward)

        print("==========================================")
        print("Epoch:", epoch)
        print("Avg Reward:", avg_reward)

        if (epoch % 100 == 0):
            saver.save(sess, "checkpoints/cartpole{0}.ckpt".format(epoch))

Epoch: 0
Avg Reward: 22.58
Epoch: 1
Avg Reward: 22.63
Epoch: 2
Avg Reward: 21.84
Epoch: 3
Avg Reward: 22.883333333333333
Epoch: 4
Avg Reward: 21.436666666666667
Epoch: 5
Avg Reward: 21.323333333333334


KeyboardInterrupt: 