In [1]:
%matplotlib inline

import gym
import itertools
import matplotlib
import numpy as np
import sys
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tqdm import tqdm, trange
import collections

import sklearn.pipeline
import sklearn.preprocessing

from sklearn.kernel_approximation import RBFSampler

matplotlib.style.use('ggplot')

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [2]:
tf.enable_eager_execution()

In [3]:
env = gym.envs.make("MountainCarContinuous-v0")
env.observation_space.sample()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


array([-0.2121357 ,  0.03012651], dtype=float32)

In [4]:
env.observation_space, env.action_space, env.reward_range,


(Box(2,), Box(1,), (-inf, inf))

In [5]:
# Feature Preprocessing: Normalize to zero mean and unit variance
# We use a few samples from the observation space to do this
observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(observation_examples)

# Used to converte a state to a featurizes represenation.
# We use RBF kernels with different variances to cover different parts of the space
featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
        ])
featurizer.fit(scaler.transform(observation_examples))

FeatureUnion(n_jobs=None,
       transformer_list=[('rbf1', RBFSampler(gamma=5.0, n_components=100, random_state=None)), ('rbf2', RBFSampler(gamma=2.0, n_components=100, random_state=None)), ('rbf3', RBFSampler(gamma=1.0, n_components=100, random_state=None)), ('rbf4', RBFSampler(gamma=0.5, n_components=100, random_state=None))],
       transformer_weights=None)

In [6]:
def featurize_state(state):
    """
    Returns the featurized representation for a state.
    """
    scaled = scaler.transform([state])
    featurized = featurizer.transform(scaled)
    return featurized[0]

In [7]:
featurize_state(env.observation_space.sample()).shape

(400,)

In [29]:
class ActorCritic(models.Model):
    def __init__(self, 
                 env,
                 num_eps=50):
        super(ActorCritic, self).__init__()
        self.env = env
        self.num_eps = num_eps
        self.mu_layer = layers.Dense(1)
        self.sigma_layer = layers.Dense(1)
        self.value_layer = layers.Dense(1, name='value')
        self.call(env.observation_space.sample())
        
        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.01)

        self.actor_ws = [self.mu_layer.weights, self.sigma_layer.weights]
        self.critic_ws = self.value_layer.weights
        self.all_ws = self.actor_ws + [self.critic_ws]
    
    def call(self, inputs):
        x = featurize_state(inputs)
        x = tf.expand_dims(x, 0)
        
        self.value = self.value_layer(x)
        self.value = tf.squeeze(self.value)
        
        self.mu = self.mu_layer(x)
        self.sigma = tf.nn.softplus(self.sigma_layer(x))

        self.mu = tf.squeeze(self.mu)
        self.sigma = tf.squeeze(self.sigma)
        self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)
        self.action = self.normal_dist._sample_n(1)
        self.action = tf.clip_by_value(self.action, env.action_space.low[0], env.action_space.high[0])
        return self.action, self.value

    def get_grads(self, t, policy_target, value_target):
        self.a_loss = -self.normal_dist.log_prob(self.action) * policy_target
        # Add cross entropy cost to encourage exploration
        self.a_loss -= 1e-1 * self.normal_dist.entropy()
        
        self.v_loss = tf.math.squared_difference(self.value, value_target)
        
        self.loss = self.a_loss + 0.01 * self.v_loss
        return t.gradient(self.loss, self.weights)
#         actor_grads = t.gradient(self.a_loss, self.actor_ws)
#         value_grads = t.gradient(self.v_loss, self.critic_ws)
#         return actor_grads, value_grads
    
    def train(self):
        for ep in range(self.num_eps):
            tr = tqdm(itertools.count())
            state = self.env.reset()
            total_r = 0
            for t in tr:
                with tf.GradientTape(persistent=True) as tape:
                    a, v = self.call(state)

                    next_state, r, done, info = self.env.step(a)
                    total_r += r
                    td_target = r + 0.99 * self.call(next_state)[1]
                    td_error = td_target - v
                
                    grads = self.get_grads(tape, td_error, td_target)
                self.optimizer.apply_gradients(zip(grads, self.weights))
                if done: break
                
                tr.set_description(f"Ep {ep}/{self.num_eps} | Reward {total_r} | step {t}")
                state = next_state


In [30]:
a = ActorCritic(env)
a.train()





0it [00:00, ?it/s][A[A[A[A



Ep 0/50 | Reward -0.47605725464339943 | step 6: 7it [00:00, 62.88it/s][A[A[A[A



Ep 0/50 | Reward -1.286872712108213 | step 15: 16it [00:00, 67.43it/s][A[A[A[A



Ep 0/50 | Reward -2.0953437037638483 | step 24: 25it [00:00, 72.53it/s][A[A[A[A



Ep 0/50 | Reward -2.9473616754572625 | step 33: 34it [00:00, 76.05it/s][A[A[A[A



Ep 0/50 | Reward -3.8473616754572633 | step 42: 43it [00:00, 79.34it/s][A[A[A[A



Ep 0/50 | Reward -4.7473616754572605 | step 51: 52it [00:00, 82.08it/s][A[A[A[A



Ep 0/50 | Reward -5.647361675457257 | step 60: 61it [00:00, 82.66it/s] [A[A[A[A



Ep 0/50 | Reward -6.4473616754572545 | step 68: 69it [00:00, 80.46it/s][A[A[A[A



Ep 0/50 | Reward -7.347361675457251 | step 77: 78it [00:00, 81.34it/s] [A[A[A[A



Ep 0/50 | Reward -8.247361675457249 | step 86: 87it [00:01, 82.89it/s][A[A[A[A



Ep 0/50 | Reward -9.147361675457246 | step 95: 96it [00:01, 84.28it/s][A[A[A[A



Ep 0/50 |

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').





          Ep 0/50 | Reward -41.04736167545757 | step 414: 408it [00:15, 26.08it/s][A[A[A[A

In [25]:
len(a.weights)

6

In [28]:
a.grads

ListWrapper([<tf.Tensor: id=1646, shape=(400, 1), dtype=float64, numpy=
array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
     

In [52]:
class Critic:
    def __init__(self):
        self.dense1 = layers.Dense(1)
        
    def call(self, inputs):
        x = featurize_state(inputs)
        x = tf.expand_dims(x, 0)
        x = self.dense1(x)
        self.value = tf.squeeze(x)
        return self.value
    
    def get_grads(self, t, target):
        self.loss = tf.math.squared_difference(self.value, target)
        return t.gradient(self.loss, self.weights)

In [None]:
class ActorCritic:
    def __init__(self):
        self.actor = Actor()
        self.critic = Critic()
        

In [7]:
class ValueEstimator():
    """
    Value Function approximator. 
    """
    
    def __init__(self, learning_rate=0.1, scope="value_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.float32, [400], "state")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # This is just linear classifier
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(self.state, 0),
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)

            self.value_estimate = tf.squeeze(self.output_layer)
            self.loss = tf.squared_difference(self.value_estimate, self.target)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.train_op = self.optimizer.minimize(
                self.loss, global_step=tf.contrib.framework.get_global_step())        
    
    def predict(self, state, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        return sess.run(self.value_estimate, { self.state: state })

    def update(self, state, target, sess=None):
        sess = sess or tf.get_default_session()
        state = featurize_state(state)
        feed_dict = { self.state: state, self.target: target }
        _, loss = sess.run([self.train_op, self.loss], feed_dict)
        return loss

In [15]:
def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):
    """
    Actor Critic Algorithm. Optimizes the policy 
    function approximator using policy gradient.
    
    Args:
        env: OpenAI environment.
        estimator_policy: Policy Function to be optimized 
        estimator_value: Value function approximator, used as a critic
        num_episodes: Number of episodes to run for
        discount_factor: Time-discount factor
    
    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """  
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    
    for i_episode in range(num_episodes):
        # Reset the environment and pick the fisrst action
        state = env.reset()
        
        episode = []
        
        # One step in the environment
        for t in itertools.count():
            
            # env.render()
            
            # Take a step
            action = estimator_policy.predict(state)
            next_state, reward, done, _ = env.step(action)
            
            # Keep track of the transition
            episode.append(Transition(
              state=state, action=action, reward=reward, next_state=next_state, done=done))
            
            # Update statistics
#             stats.episode_rewards[i_episode] += reward
#             stats.episode_lengths[i_episode] = t
            
            # Calculate TD Target
            value_next = estimator_value.predict(next_state)
            td_target = reward + discount_factor * value_next
            td_error = td_target - estimator_value.predict(state)
            
            # Update the value estimator
            estimator_value.update(state, td_target)
            
            # Update the policy estimator
            # using the td error as our advantage estimate
            estimator_policy.update(state, td_error, action)
            
            # Print out which step we're on, useful for debugging.
            print("\rStep {} @ Episode {}/{} ({})".format(
                    t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end="")

            if done:
                break
                
            state = next_state
    
    return stats

In [19]:
tf.reset_default_graph()

global_step = tf.Variable(0, name="global_step", trainable=False)
policy_estimator = PolicyEstimator(learning_rate=0.001)
value_estimator = ValueEstimator(learning_rate=0.1)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    # Note, due to randomness in the policy the number of episodes you need varies
    # TODO: Sometimes the algorithm gets stuck, I'm not sure what exactly is happening there.
    stats = actor_critic(env, policy_estimator, value_estimator, 50, discount_factor=0.95)

Instructions for updating:
Use `tf.global_variables_initializer` instead.


[2017-06-16 13:31:05,772] From /Users/dennybritz/venv/py3/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:170: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


Step 662 @ Episode 50/50 (65.13252566564918))

In [None]:
plotting.plot_episode_stats(stats, smoothing_window=10)