### Import libraries

In [1]:
import gym
import gym.spaces
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os

### Set Up GPUs

In [2]:
# os.environ["CUDA_VISIBLE_DEVICES"]="1"

### Parameters

In [3]:
episodes = 300 # ideally > 1,000
gamma = 0.95 # specified by homework

# network parameters
learning_rate = 0.01
hidden_size = 10

# epochs 
epochs = 1000 # will be a lot on GPUs

### Network Design

In [4]:
class PolicyGradient():
    def __init__(self, learning_rate=0.01, state_size=4, action_size=2, hidden_size=10, name='PolicyGradient'):
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            self.actions_ = tf.placeholder(tf.int32, [None, action_size], name='actions')
            self.expected_future_rewards_ = tf.placeholder(tf.float32, [None,], name="expected_future_rewards")
            
            # Hidden Layers
            self.fc1 = tf.contrib.layers.fully_connected(self.inputs_, hidden_size, 
                                                         weights_initializer=tf.contrib.layers.xavier_initializer())
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, action_size, 
                                                         weights_initializer=tf.contrib.layers.xavier_initializer())
            self.fc3 = tf.contrib.layers.fully_connected(self.fc2, action_size,
                                                         weights_initializer=tf.contrib.layers.xavier_initializer())
            
            # Output Layer
            self.action_distribution = tf.nn.softmax(self.fc3)
            
            self.log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits = self.fc3, labels = self.actions_)
            self.loss = tf.reduce_mean(self.log_prob * self.expected_future_rewards_) 
            
            # adjust network
            self.learn = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

### Setup

In [5]:
tf.reset_default_graph()
network = PolicyGradient(name = 'pray4us', hidden_size=hidden_size, learning_rate=learning_rate)

In [6]:
# set up environment
env = gym.make('CartPole-v0')

# Initialize the simulation
env.reset()

# Take one random step to get the pole and cart moving
state, reward, done, _ = env.step(env.action_space.sample())

### Predict Future Reward Function

In [7]:
def expected_rewards(episode_rewards):
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    future = 0.0
    for i in reversed(range(len(episode_rewards))):
        future = episode_rewards[i] + (gamma * future)
        discounted_episode_rewards[i] = future
        
    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    discounted_episode_rewards = (discounted_episode_rewards - mean) / (std)
    
    return discounted_episode_rewards.tolist()

### Training

In [8]:
saver = tf.train.Saver()
all_loss = []
all_rewards = []


with tf.Session() as sess: 
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(epochs):
        
        # save all states, actions, and rewards that happen 
        episode_states, episode_actions, episode_rewards = [], [], []
        
        for i_episode in range(episodes):
            state = env.reset()
            
            while True: 
            
                # get action prob distribution w.r.t. policy
                feed = {network.inputs_: state.reshape((1,*state.shape))}
                action_prob_dist = sess.run(network.action_distribution, feed_dict=feed)
                
                # select action w.r.t. distribution
                action = np.random.choice(range(action_prob_dist.shape[1]), p=action_prob_dist.ravel())
                new_state, reward, done, info = env.step(action)
                
                # keep track of all states, actions, and rewards
                episode_states.append(state)
                episode_rewards.append(reward)
                
                
                # reformat action for softmax
                action_ = np.zeros(action_prob_dist.shape[1])
                action_[action] = 1
                episode_actions.append(action_)
                
                # reset current state to be new state
                state = new_state
                
                if done:
                    # Calculate sum reward
                    episode_rewards_sum = np.sum(episode_rewards)
                    all_rewards.append(episode_rewards_sum)
                    total_rewards = np.sum(all_rewards)

                    # Mean reward
                    mean_reward = np.divide(total_rewards, i_episode+1)
                    maximumRewardRecorded = np.amax(all_rewards)
                    
                    if (epoch % 10 == 0):
                        print("==========================================")
                        print("Episode: ", i_episode)
                        print("Mean Reward", mean_reward)

                    # Calculate discounted reward
                    exp_rewards = expected_rewards(episode_rewards)

                    # update the network
                    loss_, _ = sess.run([network.loss, network.learn], feed_dict={network.inputs_: np.vstack(np.array(episode_states)),
                                                                     network.actions_: np.vstack(np.array(episode_actions)),
                                                                     network.expected_future_rewards_: exp_rewards 
                                                                    })
                    break
        
        if (epoch % 100 == 0):
            saver.save(sess, "checkpoints/cartpole{0}.ckpt".format(epoch))
            

Episode:  0
Reward:  17.0
Mean Reward 17.0
Episode:  1
Reward:  30.0
Mean Reward 23.5
Episode:  2
Reward:  45.0
Mean Reward 30.666666666666668
Episode:  3
Reward:  57.0
Mean Reward 37.25
Episode:  4
Reward:  76.0
Mean Reward 45.0
Episode:  5
Reward:  107.0
Mean Reward 55.333333333333336
Episode:  6
Reward:  127.0
Mean Reward 65.57142857142857
Episode:  7
Reward:  177.0
Mean Reward 79.5
Episode:  8
Reward:  208.0
Mean Reward 93.77777777777777
Episode:  9
Reward:  234.0
Mean Reward 107.8
Episode:  10
Reward:  255.0
Mean Reward 121.18181818181819
Episode:  11
Reward:  278.0
Mean Reward 134.25
Episode:  12
Reward:  298.0
Mean Reward 146.84615384615384
Episode:  13
Reward:  308.0
Mean Reward 158.35714285714286
Episode:  14
Reward:  320.0
Mean Reward 169.13333333333333
Episode:  15
Reward:  332.0
Mean Reward 179.3125
Episode:  16
Reward:  365.0
Mean Reward 190.23529411764707
Episode:  17
Reward:  380.0
Mean Reward 200.77777777777777
Episode:  18
Reward:  413.0
Mean Reward 211.94736842105263


Episode:  88
Reward:  2306.0
Mean Reward 1158.2921348314608
Episode:  89
Reward:  2322.0
Mean Reward 1171.2222222222222
Episode:  90
Reward:  2340.0
Mean Reward 1184.065934065934
Episode:  91
Reward:  2369.0
Mean Reward 1196.945652173913
Episode:  92
Reward:  2391.0
Mean Reward 1209.784946236559
Episode:  93
Reward:  2405.0
Mean Reward 1222.5
Episode:  94
Reward:  2423.0
Mean Reward 1235.1368421052632
Episode:  95
Reward:  2435.0
Mean Reward 1247.6354166666667
Episode:  96
Reward:  2444.0
Mean Reward 1259.9690721649486
Episode:  97
Reward:  2578.0
Mean Reward 1273.4183673469388
Episode:  98
Reward:  2602.0
Mean Reward 1286.8383838383838
Episode:  99
Reward:  2626.0
Mean Reward 1300.23
Episode:  100
Reward:  2642.0
Mean Reward 1313.5148514851485
Episode:  101
Reward:  2652.0
Mean Reward 1326.637254901961
Episode:  102
Reward:  2679.0
Mean Reward 1339.7669902912621
Episode:  103
Reward:  2724.0
Mean Reward 1353.076923076923
Episode:  104
Reward:  2773.0
Mean Reward 1366.6
Episode:  105
R

Episode:  169
Reward:  4392.0
Mean Reward 2235.858823529412
Episode:  170
Reward:  4413.0
Mean Reward 2248.590643274854
Episode:  171
Reward:  4443.0
Mean Reward 2261.3488372093025
Episode:  172
Reward:  4473.0
Mean Reward 2274.1329479768788
Episode:  173
Reward:  4485.0
Mean Reward 2286.8390804597702
Episode:  174
Reward:  4512.0
Mean Reward 2299.554285714286
Episode:  175
Reward:  4535.0
Mean Reward 2312.255681818182
Episode:  176
Reward:  4548.0
Mean Reward 2324.8870056497176
Episode:  177
Reward:  4565.0
Mean Reward 2337.4719101123596
Episode:  178
Reward:  4582.0
Mean Reward 2350.0111731843576
Episode:  179
Reward:  4604.0
Mean Reward 2362.5333333333333
Episode:  180
Reward:  4639.0
Mean Reward 2375.1104972375692
Episode:  181
Reward:  4659.0
Mean Reward 2387.6593406593406
Episode:  182
Reward:  4672.0
Mean Reward 2400.1420765027324
Episode:  183
Reward:  4710.0
Mean Reward 2412.695652173913
Episode:  184
Reward:  4721.0
Mean Reward 2425.172972972973
Episode:  185
Reward:  4739.0


Episode:  253
Reward:  6538.0
Mean Reward 3296.4212598425197
Episode:  254
Reward:  6607.0
Mean Reward 3309.4039215686275
Episode:  255
Reward:  6621.0
Mean Reward 3322.33984375
Episode:  256
Reward:  6638.0
Mean Reward 3335.241245136187
Episode:  257
Reward:  6648.0
Mean Reward 3348.0813953488373
Episode:  258
Reward:  6709.0
Mean Reward 3361.057915057915
Episode:  259
Reward:  6729.0
Mean Reward 3374.0115384615383
Episode:  260
Reward:  6788.0
Mean Reward 3387.0919540229884
Episode:  261
Reward:  6830.0
Mean Reward 3400.232824427481
Episode:  262
Reward:  6858.0
Mean Reward 3413.380228136882
Episode:  263
Reward:  6874.0
Mean Reward 3426.4886363636365
Episode:  264
Reward:  6894.0
Mean Reward 3439.5735849056605
Episode:  265
Reward:  6909.0
Mean Reward 3452.6165413533836
Episode:  266
Reward:  6934.0
Mean Reward 3465.6554307116103
Episode:  267
Reward:  6960.0
Mean Reward 3478.694029850746
Episode:  268
Reward:  6969.0
Mean Reward 3491.6691449814125
Episode:  269
Reward:  6983.0
Mean

KeyboardInterrupt: 