In [1]:
import time

import gym
import numpy as np

In [25]:
from IPython.display import Image

### Environment

In [2]:
env = gym.make("CartPole-v0")

### Observations

In [3]:
obs = env.reset()
obs

array([ 0.02342458,  0.01988509,  0.01149062, -0.04738753])

In [4]:
# show popup image
env.render()

True

### Actions

In [5]:
env.action_space

Discrete(2)

In [6]:
# possible actions
[env.action_space.sample() for _ in range(10)]

[0, 1, 1, 0, 1, 1, 1, 1, 1, 1]

__take a step__

In [7]:
# The pole is leaning right, so let's accelerate the cart to the right (action=1).
# env.step returns the new observation, the reward, a done flag and an info dict.
obs, reward, done, info = env.step(action=1)
env.render()

True

In [8]:
print(reward, done, info)

1.0 False {}


### Basic policy

In [9]:
def basic_policy(obs, a):
    """
    Hard-coded policy: given an observation, return an action.

    obs (sequence): observation; only obs[2] and obs[3] are read
        (presumably pole angle and pole angular velocity -- confirm against the CartPole docs)
    a (float): weight applied to obs[3] before it is added to obs[2]

    Returns 0 when obs[2] + a * obs[3] is negative, otherwise 1.
    """
    projected_angle = obs[2] + a * obs[3]
    if projected_angle < 0:
        return 0
    return 1

In [10]:
def simulate(policy_function, episodes, max_steps, pf_kws=None, render=False, freeze_time_steps=0, freeze_time_episodes=0.5):
    """
    Take steps on the module-level ``env`` according to a policy_function.

    policy_function (fun): given obs (plus any pf_kws), return action
    episodes (int): number of times to play
    max_steps (int): max steps per episode
    pf_kws (dict): extra keyword params for the policy_function (None means no extras)
    render (bool): if true, call env.render() after every step
    freeze_time_steps (float): the sleep time between steps in one episode
    freeze_time_episodes (float): the sleep time between episodes

    Returns a list holding the total reward of each episode, in play order.
    Each episode total is also printed as soon as the episode finishes.
    """
    # Use None as the default so that no dict object is shared between calls.
    if pf_kws is None:
        pf_kws = {}

    totals = []
    for _ in range(episodes):
        episode_rewards = 0
        obs = env.reset()
        # Honor the caller's step budget; an episode also ends early when the
        # environment reports done.
        for _ in range(max_steps):
            action = policy_function(obs, **pf_kws)
            time.sleep(freeze_time_steps)
            obs, reward, done, info = env.step(action)
            episode_rewards += reward
            if render:
                env.render()
            if done:
                break
        time.sleep(freeze_time_episodes)
        print(episode_rewards)
        totals.append(episode_rewards)

    return totals

In [11]:
env = gym.make("CartPole-v0")

In [12]:
t = simulate(
    policy_function=basic_policy,
    episodes=5, 
    max_steps=200,
    pf_kws={'a':0.02},
    render=True
)

200.0
200.0
200.0
200.0
200.0


In [13]:
t

[200.0, 200.0, 200.0, 200.0, 200.0]

### NN Policy

In [2]:
env = gym.make("CartPole-v0")

In [2]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

In [3]:
# Layer sizes of the policy network.
n_inputs = 4  # == env.observation_space.shape[0]
n_hidden = 4  # it's a simple task, we don't need more hidden neurons
n_outputs = 1  # only outputs the probability of accelerating left

In [31]:
initializer = tf.contrib.layers.variance_scaling_initializer()

In [30]:
Image(url='https://blog.paperspace.com/content/images/2018/06/ELU.png', width=200)

In [35]:
# Input placeholder: a batch of any size (None) of float32 rows with n_inputs values each.
X = tf.placeholder(
    dtype=tf.float32, 
    shape=[None, n_inputs],
    name=None
)

In [36]:
# Hidden layer: n_hidden units fed by X, ELU activation, weights drawn from `initializer`.
hidden = fully_connected(
    inputs=X, 
    num_outputs=n_hidden, 
    activation_fn=tf.nn.elu, 
    weights_initializer=initializer
)

In [37]:
# Output layer with no activation (activation_fn=None), so it produces raw logits.
# The raw logits are kept because the cross-entropy loss consumes them directly.
logits = fully_connected(
    inputs=hidden, 
    num_outputs=n_outputs, 
    activation_fn=None,
    weights_initializer=initializer
)

In [39]:
# Squash the logits into (0, 1); this is the estimated probability of accelerating left.
outputs = tf.nn.sigmoid(
    x=logits
)

In [48]:
# Select a random action based on the estimated probabilities.
# Column 0 holds p(left) = outputs and column 1 holds p(right) = 1 - outputs,
# so the sampled column index is the action itself (0 = left, 1 = right).
p_left_and_right = tf.concat(axis=1, values=[outputs, 1 - outputs])

# tf.multinomial expects log-probabilities (logits), hence the tf.log;
# num_samples=1 draws a single action per input row.
action = tf.multinomial(tf.log(p_left_and_right), num_samples=1)

In [49]:
init = tf.global_variables_initializer()

In [54]:
# Target probability of going left: 1.0 if the sampled action was left (0)
# and 0.0 if it was right (1), i.e. the sampled action is treated as the correct one.
y = 1. - tf.to_float(action)

In [55]:
# Step size handed to the Adam optimizer below.
learning_rate = 0.01

# Sigmoid cross-entropy between the target y and the network's raw logits
# (the op applies the sigmoid itself, so `logits` is passed rather than `outputs`).
cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
    labels=y, 
    logits=logits
)

# The optimizer is not asked to minimize the loss directly: gradients are
# computed and applied in two separate steps in the following cells.
optimizer = tf.train.AdamOptimizer(learning_rate)

In [56]:
# (gradient, variable) pairs for the cross-entropy loss; computed here, not applied yet.
grads_and_vars = optimizer.compute_gradients(cross_entropy)

In [57]:
# Keep only the gradient tensor from each (gradient, variable) pair.
gradients = [grad for grad, variable in grads_and_vars]

In [62]:
# Create one placeholder per gradient, with the same shape as that gradient,
# and pair it with the variable the gradient belongs to. Gradient values can
# then be fed in from outside the graph and applied by the optimizer.
gradient_placeholders = []
grads_and_vars_feed = []
for grad, variable in grads_and_vars:
    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
    gradient_placeholders.append(gradient_placeholder)
    grads_and_vars_feed.append((gradient_placeholder, variable))

# Build the update op once, after every (placeholder, variable) pair has been
# collected. Building it inside the loop added a redundant partial update op
# to the graph on every iteration.
training_op = optimizer.apply_gradients(grads_and_vars_feed)

# apply_gradients creates the Adam optimizer's own variables. An initializer
# built earlier does not cover them, so rebuild `init` now that every variable
# in the graph exists.
init = tf.global_variables_initializer()

In [64]:
def discount_rewards(rewards, discount_rate):
    """
    Compute the discounted sum of future rewards for every step of one episode.

    rewards (sequence of numbers): reward received at each step, in order
    discount_rate (float): factor applied to the running total at each step back in time

    Returns a numpy array of the same length as rewards, where entry i equals
    rewards[i] + discount_rate * (entry i + 1), and the last entry equals the last reward.
    """
    n_steps = len(rewards)
    discounted = np.empty(n_steps)
    running_total = 0
    # Walk backwards so each step can reuse the total already accumulated after it.
    for idx in range(n_steps - 1, -1, -1):
        running_total = rewards[idx] + running_total * discount_rate
        discounted[idx] = running_total
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """
    Discount the rewards of several episodes and normalize them jointly.

    all_rewards (list of sequences): one sequence of per-step rewards per episode
    discount_rate (float): discount factor forwarded to discount_rewards

    Returns a list with one numpy array per episode. Each array holds that
    episode's discounted rewards, shifted by the mean and divided by the
    standard deviation taken over all steps of all episodes together.
    """
    # discount_rewards needs the discount rate; calling it with rewards alone
    # raised a TypeError.
    all_discounted_rewards = [
        discount_rewards(rewards, discount_rate) for rewards in all_rewards
    ]
    # The statistics are taken over every step of every episode at once.
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()

    return [
        (discounted_rewards - reward_mean) / reward_std
        for discounted_rewards in all_discounted_rewards
    ]