# Cartpole Agent

Notes:
`state_prime` denotes the next state, i.e. the observation returned by `env.step(action)`.

In [1]:
# !pip install tqdm
# ! pip install gym

In [2]:
import tensorflow as tf
import numpy as np
import random
import gym
import tqdm
slim = tf.contrib.slim

#%matplotlib inline 
%matplotlib notebook 

In [3]:
def calculate_naive_returns(rewards):
    """Calculate the undiscounted reward-to-go for each step of an episode.

    Entry ``t`` of the result is the sum of ``rewards[t:]``.

    Args:
        rewards: sequence of per-step rewards.

    Returns:
        1-D float numpy array with the same length as ``rewards``
        (empty when ``rewards`` is empty).
    """
    total_returns = np.zeros(len(rewards))
    total_return = 0.0
    # Walk backwards starting at the last VALID index (len - 1); starting at
    # len(rewards) would index one past the end of the sequence.
    for t in reversed(range(len(rewards))):
        total_return += rewards[t]
        total_returns[t] = total_return
    return total_returns

In [4]:
def discount_rewards(rewards, gamma=0.99):
    """Compute the discounted return for every step of an episode.

    Uses the backward recursion ``G[t] = rewards[t] + gamma * G[t + 1]``
    with ``G[last] = rewards[last]``.

    Args:
        rewards: sequence of per-step rewards.
        gamma: discount factor applied to the following step's return.

    Returns:
        A list the same length as ``rewards`` holding the discounted
        return from each step onward; ``[]`` when ``rewards`` is empty.
    """
    # An empty episode has no returns; without this guard the [-1]
    # assignment below raises IndexError.
    if len(rewards) == 0:
        return []

    discounted_returns = [0] * len(rewards)
    discounted_returns[-1] = rewards[-1]
    # Iterate backwards so each step can reuse the already-computed
    # return of the step that follows it.
    for t in range(len(rewards) - 2, -1, -1):
        discounted_returns[t] = rewards[t] + discounted_returns[t + 1] * gamma
    return discounted_returns

In [5]:
def epsilon_greedy_action(action_distribution, epsilon=1e-1):
    """Pick an action index epsilon-greedily.

    Args:
        action_distribution: numpy array of per-action scores/probabilities.
        epsilon: probability of exploring instead of exploiting.

    Returns:
        With probability ``epsilon`` the argmax of fresh uniform noise of the
        same shape (i.e. a random index); otherwise the argmax of
        ``action_distribution``.
    """
    explore = random.random() < epsilon
    if not explore:
        return np.argmax(action_distribution)
    # Argmax over i.i.d. uniform noise selects a random position.
    noise = np.random.random(action_distribution.shape)
    return np.argmax(noise)

In [6]:
def epsilon_greedy_action_annealed(action_distribution,
                                   percentage,
                                   epsilon_start=1.0,
                                   epsilon_end=1e-2):
    """Pick an action epsilon-greedily with a linearly annealed epsilon.

    Args:
        action_distribution: numpy array of per-action scores/probabilities.
        percentage: annealing progress; 0.0 yields ``epsilon_start`` and
            1.0 yields ``epsilon_end``.
        epsilon_start: exploration probability at the start of annealing.
        epsilon_end: exploration probability at the end of annealing.

    Returns:
        A random index (argmax of uniform noise) with the annealed
        probability, otherwise the argmax of ``action_distribution``.
    """
    # Linear interpolation between the two epsilon endpoints.
    start_weight = 1.0 - percentage
    current_epsilon = epsilon_start * start_weight + epsilon_end * percentage

    if random.random() >= current_epsilon:
        return np.argmax(action_distribution)
    noise = np.random.random(action_distribution.shape)
    return np.argmax(noise)


In [7]:
# Trajectory container for a single episode.
class EpisodeHistory(object):
    """Parallel lists describing the transitions of one episode.

    ``states``, ``actions``, ``rewards`` and ``state_primes`` grow by one
    entry per call to ``add_to_history``. ``discounted_returns`` starts
    empty and is not modified by this class.
    """

    def __init__(self):
        self.states, self.actions, self.rewards = [], [], []
        self.state_primes = []
        self.discounted_returns = []

    def add_to_history(self, state, action, reward,
                       state_prime):
        """Record one transition (state, action, reward, next state)."""
        columns = (
            (self.states, state),
            (self.actions, action),
            (self.rewards, reward),
            (self.state_primes, state_prime),
        )
        for column, value in columns:
            column.append(value)


In [8]:
# Accumulates the transitions of multiple episodes.
class Memory(object):
    """Flat buffer of transitions gathered from several episodes.

    Every attribute is a list; ``add_episode`` concatenates an episode's
    lists onto them and ``reset_memory`` empties them again.
    """

    def __init__(self):
        # Single source of truth for the attribute set.
        self.reset_memory()

    def reset_memory(self):
        """Drop everything stored so far."""
        self.states = []
        self.actions = []
        self.rewards = []
        self.state_primes = []
        self.discounted_returns = []

    def add_episode(self, episode):
        """Append all per-step data of ``episode`` to the buffer.

        Args:
            episode: object exposing ``states``, ``actions``, ``rewards``,
                ``state_primes`` and ``discounted_returns`` lists
                (e.g. an ``EpisodeHistory``).
        """
        self.states += episode.states
        self.actions += episode.actions
        self.rewards += episode.rewards
        # Keep state_primes aligned with the other lists; it was previously
        # left out, so it stayed empty while the others grew.
        self.state_primes += episode.state_primes
        self.discounted_returns += episode.discounted_returns

In [9]:
class PGAgent(object):
    """Policy-gradient agent with a small fully connected softmax policy.

    The constructor builds the TensorFlow graph (policy network plus
    training ops); variable initialisation is left to the caller.

    Args:
        session: TensorFlow session used to evaluate the policy.
        state_size: number of features in a state vector.
        num_actions: number of discrete actions (size of the softmax output).
        hidden_size: width of each of the two hidden layers.
        learning_rate: learning rate handed to the Adam optimizer.
        explore_exploit_setting: name of the exploration strategy; one of
            ``'greedy'``, a key of ``_FIXED_EPSILONS`` or a key of
            ``_ANNEALED_EPSILONS``.
    """

    # Setting name -> constant epsilon.
    _FIXED_EPSILONS = {
        'epsilon_greedy_0.05': 0.05,
        'epsilon_greedy_0.25': 0.25,
        'epsilon_greedy_0.50': 0.50,
        'epsilon_greedy_0.90': 0.90,
    }
    # Setting name -> (epsilon_start, epsilon_end) for linear annealing.
    _ANNEALED_EPSILONS = {
        'epsilon_greedy_annealed_1.0->0.001': (1.0, 0.001),
        'epsilon_greedy_annealed_0.5->0.001': (0.5, 0.001),
        'epsilon_greedy_annealed_0.25->0.001': (0.25, 0.001),
    }

    def __init__(self, session, state_size, num_actions,
                 hidden_size, learning_rate=1e-3,
                 explore_exploit_setting=
                 'epsilon_greedy_annealed_1.0->0.001'):
        self.session = session
        self.state_size = state_size
        self.num_actions = num_actions
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.explore_exploit_setting = explore_exploit_setting

        self.build_model()
        self.build_training()

    def build_model(self):
        """Build the policy network: state -> two hidden layers -> softmax."""
        with tf.variable_scope('pg-model'):
            self.state = tf.placeholder(
                shape=[None, self.state_size],
                dtype=tf.float32)
            self.h0 = slim.fully_connected(self.state,
                                           self.hidden_size)
            self.h1 = slim.fully_connected(self.h0,
                                           self.hidden_size)
            # Softmax activation: each output row is a probability
            # distribution over the actions.
            self.output = slim.fully_connected(
                self.h1, self.num_actions,
                activation_fn=tf.nn.softmax)

    def build_training(self):
        """Build the loss and the Adam training op."""
        self.action_input = tf.placeholder(tf.int32,
                                           shape=[None])
        self.reward_input = tf.placeholder(tf.float32,
                                           shape=[None])

        # Select, for every sample, the network output that belongs to the
        # action actually taken. Row r of the [batch, num_actions] output
        # starts at flat offset r * num_actions; adding the action id gives
        # the flat index of that action's output.
        self.output_index_for_actions = (tf.range(
            0, tf.shape(self.output)[0]) *
            tf.shape(self.output)[1]) + self.action_input
        # Flatten the output to 1-D and gather those indices. Despite the
        # attribute name these are softmax outputs (probabilities), because
        # self.output already has softmax applied.
        self.logits_for_actions = tf.gather(
            tf.reshape(self.output, [-1]),
            self.output_index_for_actions)

        # Policy-gradient surrogate loss:
        # -mean(log pi(a|s) * fed-in reward signal).
        self.loss = - \
            tf.reduce_mean(tf.log(self.logits_for_actions) *
                           self.reward_input)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.learning_rate)
        self.train_step = self.optimizer.minimize(self.loss)

    def sample_action_from_distribution(
            self, action_distribution,
            epsilon_percentage):
        """Choose an action according to ``explore_exploit_setting``.

        Args:
            action_distribution: numpy array of action probabilities.
            epsilon_percentage: annealing progress in [0, 1]; only used by
                the annealed settings.

        Returns:
            Index of the chosen action.

        Raises:
            ValueError: if ``explore_exploit_setting`` is not recognised.
        """
        setting = self.explore_exploit_setting
        if setting == 'greedy':
            # Pure exploitation. (The original code called an undefined
            # greedy_action helper here.)
            return np.argmax(action_distribution)
        if setting in self._FIXED_EPSILONS:
            return epsilon_greedy_action(
                action_distribution, self._FIXED_EPSILONS[setting])
        if setting in self._ANNEALED_EPSILONS:
            epsilon_start, epsilon_end = self._ANNEALED_EPSILONS[setting]
            return epsilon_greedy_action_annealed(
                action_distribution, epsilon_percentage,
                epsilon_start, epsilon_end)
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError(
            'Unknown explore_exploit_setting: %r' % (setting,))

    def predict_action(self, state, epsilon_percentage):
        """Run the policy on one state and pick an action.

        Args:
            state: a single state vector of length ``state_size``.
            epsilon_percentage: annealing progress forwarded to
                ``sample_action_from_distribution``.

        Returns:
            Index of the chosen action.
        """
        # Wrap the state in a batch of one, then unwrap the single result.
        action_distribution = self.session.run(
            self.output, feed_dict={self.state: [state]})[0]
        return self.sample_action_from_distribution(
            action_distribution, epsilon_percentage)

    def show_current_policy(self, environment=None, num_steps=20):
        """Roll out the current policy briefly and print the actions taken.

        Uses ``epsilon_percentage = 1.0``, so annealed settings act with
        their end epsilon; fixed-epsilon settings still explore with their
        configured epsilon.

        Args:
            environment: Gym-style environment providing ``reset()`` and
                ``step(action)``. When omitted, the module-level ``env`` is
                used, as the original implementation did.
            num_steps: maximum number of steps to roll out.
        """
        if environment is None:
            environment = env  # notebook-global fallback
        pol_ep_hist = EpisodeHistory()
        state = environment.reset()
        print('state: ', state)
        e_p = 1.0  # results in using the end epsilon
        for _step in range(num_steps):
            # Act with this agent (not a global `agent` variable).
            action = self.predict_action(state, e_p)
            state_prime, reward, terminal, _ = environment.step(action)
            # Record the state the action was taken FROM, then advance.
            pol_ep_hist.add_to_history(state, action, reward, state_prime)
            if terminal:
                # Do not keep stepping an environment whose episode ended.
                break
            state = state_prime
        print('Policy Actions - No Epsilon')
        print(pol_ep_hist.actions)


In [10]:
# def main():
# Configure Settings
total_episodes = 2000
total_steps_max = 1000  # not referenced anywhere in this cell
epsilon_stop = 0.6*total_episodes  # episode index at which annealing hits its end value
BATCH_SIZE = 10  # number of episodes generated before each training step
MAX_EP_LENGTH = 199  # step cap applied by the loop below (original note: believed fixed by the Gym env)
render_start = False  # set to a positive episode index to render after that episode
should_render = False
LEARNING_RATE = 1e-2

explore_exploit_setting = 'epsilon_greedy_annealed_1.0->0.001'

env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]  # 4 for CartPole-v0
num_actions = env.action_space.n  # 2 for CartPole-v0

solved = False  # never set to True in this cell; kept for the render condition
with tf.Session() as session:
    agent = PGAgent(session=session, state_size=state_size,
                    num_actions=num_actions,
                    hidden_size=16, learning_rate=LEARNING_RATE,
                    explore_exploit_setting=explore_exploit_setting)
    session.run(tf.global_variables_initializer())

    episode_rewards = []   # total reward per episode
    episode_terminal = []  # `terminal` flag returned by the last step of each episode
    episode_steps = []     # number of steps per episode
    episode_maxs = []      # largest single-step reward per episode
    batch_losses = []      # loss of every training step

    global_memory = Memory()
    steps = 0  # total number of steps in all episodes
    for i in tqdm.tqdm(range(total_episodes)):  # multiple episode loop
        state = env.reset()
        episode_reward = 0.0  # cumulative sum of all rewards in episode
        episode_history = EpisodeHistory()
        # Annealing progress: 0.0 at the first episode, 1.0 from epsilon_stop on.
        epsilon_percentage = min(i / float(epsilon_stop), 1.0)
        steps_this_ep = 0
        terminal = False
        j = 0
        while (j < MAX_EP_LENGTH) and (not terminal):  # single episode
            j += 1
            # figure out next action
            action = agent.predict_action(state, epsilon_percentage)
            # take a step
            state_prime, reward, terminal, _ = env.step(action)

            if should_render and (
                    (render_start > 0 and i > render_start) or solved):
                env.render()
            episode_history.add_to_history(
                state, action, reward, state_prime)
            episode_reward += reward
            steps += 1
            steps_this_ep += 1
            # advance the state
            state = state_prime

        episode_history.discounted_returns = discount_rewards(
            episode_history.rewards)

        # Modification that attempts to enhance the rewards for "good"
        # (long) episodes: scale every return by a factor between 1 and 4
        # that grows with episode length. It doesn't seem to hurt - more
        # experimentation is needed to know if it helps.
        length_bonus = (1 + (steps_this_ep / float(MAX_EP_LENGTH))) ** 2
        episode_history.discounted_returns = [
            ret * length_bonus
            for ret in episode_history.discounted_returns]
        global_memory.add_episode(episode_history)

        # Train once a full minibatch of BATCH_SIZE episodes was collected.
        # (i + 1) is the number of episodes finished so far; testing `i`
        # itself would train right after the very first episode.
        if (i + 1) % BATCH_SIZE == 0:
            feed_dict = {
                agent.reward_input: np.array(
                    global_memory.discounted_returns),
                agent.action_input: np.array(
                    global_memory.actions),
                agent.state: np.array(
                    global_memory.states)}
            _, batch_loss = session.run(
                [agent.train_step, agent.loss],
                feed_dict=feed_dict)
            batch_losses.append(batch_loss)
            global_memory.reset_memory()

        # save sum of rewards, num_steps etc. of this episode
        episode_rewards.append(episode_reward)
        episode_steps.append(steps_this_ep)
        episode_maxs.append(max(episode_history.rewards))
        episode_terminal.append(terminal)

    # [-100:] covers the final 100 episodes; [-100:-1] dropped the last one.
    print('Final Mean Reward - last 100', np.mean(episode_rewards[-100:]))

    # Actions taken during the final episode.
    print(episode_history.actions)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
100%|██████████| 2000/2000 [01:24<00:00, 23.68it/s]

Final Mean Reward - last 100 199.0
[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0]





In [11]:
# main()

In [12]:
import matplotlib.pyplot as plt

In [13]:
# Plot the total reward of every training episode on a new figure
# ('bd' = blue diamond markers, no connecting line).
plt.figure()
plt.plot(episode_rewards,'bd')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7ffaa4599be0>]

In [14]:
    # Mean over the final 100 episodes; the previous [-100:-1] slice left
    # out the very last episode and averaged only 99 values.
    print('Final Mean Reward - last 100', np.mean(episode_rewards[-100:]))

Final Mean Reward - last 100 199.0


In [15]:
# Plot, on a new figure, the `terminal` flag recorded at the end of every
# training episode.
plt.figure()
plt.plot(episode_terminal)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7ffaa454f5c0>]