In [1]:
import numpy as np
import tensorflow as tf
import gym
import logz
import scipy.signal
import os
import time
import inspect
from multiprocessing import Process

%load_ext autoreload
%autoreload 2

  return f(*args, **kwds)


In [2]:
#============================================================================================#
# Utilities
#============================================================================================#

def build_mlp(
        input_placeholder, 
        output_size,
        scope, 
        n_layers=2, 
        size=64, 
        activation=tf.tanh,
        output_activation=None
        ):
    #========================================================================================#
    #                           ----------SECTION 3----------
    # Network building
    #
    # Your code should make a feedforward neural network (also called a multilayer perceptron)
    # with 'n_layers' hidden layers of size 'size' units. 
    # 
    # The output layer should have size 'output_size' and activation 'output_activation'.
    #
    # Hint: use tf.layers.dense
    #========================================================================================#

    with tf.variable_scope(scope):
        # YOUR_CODE_HERE
        input_layers = input_placeholder
        for i in range(n_layers):
            input_layers = tf.layers.dense(inputs = input_layers, 
                                           units = size, 
                                           activation = activation, 
                                           name = scope + str(i+1) + "_layer")
            
        output_layer = tf.layers.dense(inputs = input_layers, 
                                 units = output_size, 
                                 activation = output_activation, 
                                 name = scope + "output_layer")
        
        return output_layer

def pathlength(path):
    return len(path["reward"])



In [3]:
#============================================================================================#
# Policy Gradient
#============================================================================================#

def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps
    
    loss_before = 1000

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name = "adv", dtype = tf.float32) #TODO


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, 'discrete_mlp', n_layers, size) #TODO
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1]) #TODO # Hint: Use the tf.multinomial op
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na) #TODO

    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(sy_ob_no, ac_dim, "continuous_mlp", n_layers, size) #TODO
        sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name='logstd',
                dtype=tf.float32) #TODO # logstd should just be a trainable variable, not a network output.
        sy_std = tf.exp(sy_logstd)
        sy_sampled_ac = tf.random_normal(shape= tf.shape(sy_mean), mean = sy_mean, stddev = sy_std) #sy_mean + sy_std * tf.random_normal(tf.shape(sy_mean)) #TODO
        sy_logprob_n = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, scale_diag=sy_std).log_prob(sy_ac_na) #TODO  # Hint: Use the log probability under a multivariate gaussian. 



    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = - tf.reduce_mean(sy_logprob_n * sy_adv_n) #TODO # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        baseline_targets = tf.placeholder(shape = [None,], name = "baseline_targets", dtype = tf.float32)
        baseline_loss = tf.nn.l2_loss(baseline_prediction - baseline_targets)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss) #TODO


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101


    #print("reach Training loop")
    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)

                #print("current ac:", ac)
                #print("before ob:", ob)
                ob, rew, done, _ = env.step(ac)
                #print("After ob:", ob)
                #print("rew:", rew)
                #print("done:", done)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#
        
        # YOUR_CODE_HERE
        #q_n = TODO
        q_n = []
        
        for path in paths:
            r_nr = path["reward"]
            step_n = len(r_nr)
            
            if reward_to_go:
                q = [np.sum(np.power(gamma, np.arange(step_n-t))* r_nr[t:]) for t in range(step_n)]
            else:
                q = [np.sum(np.power(gamma, np.arange(step_n) * r_nr)) for t in range(step_n)]
                
            q_n.extend(q)
        #if reward_to_go:
            

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict = {sy_ob_no: ob_no}) #TODO
            b_n_norm = (b_n - np.mean(b_n))/ (np.std(b_n,axis = 0) + 1e-7)
            b_n = b_n_norm * np.std(q_n, axis = 0) + np.mean(q_n, axis = 0)
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()
        
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            #pass
            adv_n = (adv_n - np.mean(adv_n, axis = 0)) / (np.std(adv_n, axis = 0) + 1e-7)


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            #pass
            q_n_norm = (q_n - np.mean(q_n, axis = 0)) / (np.std(q_n, axis = 0) + 1e-7)
            sess.run(baseline_update_op, feed_dict = {sy_ob_no: ob_no, baseline_targets: q_n_norm})

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 
        
        # YOUR_CODE_HERE
        feed_dict = {sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n}
        #loss_before = sess.run(loss, feed_dict = feed_dict)
        _, loss_after = sess.run([update_op, loss], feed_dict = feed_dict)

        # Log diagnostics
        
        #print("log part is here?")
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Loss Delta", loss_before - loss_after)
        logz.log_tabular("Loss After", loss_after)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
        
        loss_before = loss_after

In [4]:
def runmain(env_name, 
           exp_name = 'vpg',
           render = False,
           discount = 1.0, 
           n_iter = 100, 
           batch_size = 1000, 
           ep_len = -1., 
           learning_rate = 5e-3,
           reward_to_go = False,
           dont_normalize_advantages = False,
           nn_baseline = False,
           seed = 1, 
           n_experiments = 1,
           n_layers = 1,
           size = 32):
    
    if not(os.path.exists('data')):
        os.makedirs('data')
    logdir = exp_name + '_' + env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    if not(os.path.exists(logdir)):
        os.makedirs(logdir)

    max_path_length = ep_len if ep_len > 0 else None

    for e in range(n_experiments):
        #print('n_experiments:', n_experiments)
        seed = seed + 10*e
        print('Running experiment with seed %d'%seed)
        def train_func():
            train_PG(
                exp_name=exp_name,
                env_name=env_name,
                n_iter=n_iter,
                gamma=discount,
                min_timesteps_per_batch=batch_size,
                max_path_length=max_path_length,
                learning_rate=learning_rate,
                reward_to_go=reward_to_go,
                animate=render,
                logdir=os.path.join(logdir,'%d'%seed),
                normalize_advantages=not(dont_normalize_advantages),
                nn_baseline=nn_baseline, 
                seed=seed,
                n_layers=n_layers,
                size=size
                )
        # Awkward hacky process runs, because Tensorflow does not like
        # repeatedly calling train_PG in the same thread.
        p = Process(target=train_func, args=tuple())
        p.start()
        p.join()
        


In [18]:

runmain('CartPole-v0', 
        n_iter = 100, 
        batch_size = 1000 , 
        n_experiments = 5,  
        dont_normalize_advantages = True,
        reward_to_go = True,
        nn_baseline = True,
        exp_name = 'sb_bl_rtg_dna')



n_experiments: 5
Running experiment with seed 1
[32;1mLogging data to data/sb_bl_rtg_dna_CartPole-v0_01-05-2018_19-58-13/1/log.txt[0m


[2018-05-01 19:58:13,471] Making new env: CartPole-v0


********** Iteration 0 ************
----------------------------------------
|               Time |            1.93 |
|         Loss Delta |           1e+03 |
|         Loss After |         -0.0917 |
|          Iteration |               0 |
|      AverageReturn |            25.1 |
|          StdReturn |            12.7 |
|          MaxReturn |              81 |
|          MinReturn |              10 |
|          EpLenMean |            25.1 |
|           EpLenStd |            12.7 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |           1e+03 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |            2.84 |
|         Loss Delta |           0.147 |
|         Loss After |          -0.239 |
|          Iteration |               1 |
|      AverageReturn |            25.4 |
|          StdReturn |            11.6 |
|          MaxReturn |              65 |
|          MinReturn |    

|          StdReturn |            20.8 |
|          MaxReturn |             109 |
|          MinReturn |              23 |
|          EpLenMean |            48.1 |
|           EpLenStd |            20.8 |
| TimestepsThisBatch |        1.01e+03 |
|     TimestepsSoFar |        1.43e+04 |
----------------------------------------
********** Iteration 14 ************
----------------------------------------
|               Time |            13.5 |
|         Loss Delta |          -0.307 |
|         Loss After |           -0.16 |
|          Iteration |              14 |
|      AverageReturn |            51.9 |
|          StdReturn |            23.9 |
|          MaxReturn |              94 |
|          MinReturn |              19 |
|          EpLenMean |            51.9 |
|           EpLenStd |            23.9 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        1.53e+04 |
----------------------------------------
********** Iteration 15 ************
------------------------

----------------------------------------
********** Iteration 27 ************
----------------------------------------
|               Time |              24 |
|         Loss Delta |            0.25 |
|         Loss After |          -0.677 |
|          Iteration |              27 |
|      AverageReturn |            70.5 |
|          StdReturn |            32.6 |
|          MaxReturn |             158 |
|          MinReturn |              28 |
|          EpLenMean |            70.5 |
|           EpLenStd |            32.6 |
| TimestepsThisBatch |        1.06e+03 |
|     TimestepsSoFar |        2.88e+04 |
----------------------------------------
********** Iteration 28 ************
----------------------------------------
|               Time |            24.8 |
|         Loss Delta |          -0.363 |
|         Loss After |          -0.314 |
|          Iteration |              28 |
|      AverageReturn |            78.8 |
|          StdReturn |            34.1 |
|          MaxReturn |  

|      AverageReturn |             139 |
|          StdReturn |            43.3 |
|          MaxReturn |             200 |
|          MinReturn |              68 |
|          EpLenMean |             139 |
|           EpLenStd |            43.3 |
| TimestepsThisBatch |        1.11e+03 |
|     TimestepsSoFar |        4.25e+04 |
----------------------------------------
********** Iteration 41 ************
----------------------------------------
|               Time |            35.2 |
|         Loss Delta |           -1.31 |
|         Loss After |          -0.249 |
|          Iteration |              41 |
|      AverageReturn |             100 |
|          StdReturn |            17.8 |
|          MaxReturn |             121 |
|          MinReturn |              70 |
|          EpLenMean |             100 |
|           EpLenStd |            17.8 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |        4.35e+04 |
----------------------------------------
********** Iteration

|     TimestepsSoFar |        5.64e+04 |
----------------------------------------
********** Iteration 54 ************
----------------------------------------
|               Time |              45 |
|         Loss Delta |           -1.35 |
|         Loss After |          -0.262 |
|          Iteration |              54 |
|      AverageReturn |             167 |
|          StdReturn |            42.2 |
|          MaxReturn |             200 |
|          MinReturn |              94 |
|          EpLenMean |             167 |
|           EpLenStd |            42.2 |
| TimestepsThisBatch |        1.17e+03 |
|     TimestepsSoFar |        5.75e+04 |
----------------------------------------
********** Iteration 55 ************
----------------------------------------
|               Time |            45.8 |
|         Loss Delta |          -0.204 |
|         Loss After |         -0.0581 |
|          Iteration |              55 |
|      AverageReturn |             148 |
|          StdReturn |  

|          Iteration |              67 |
|      AverageReturn |             197 |
|          StdReturn |            6.23 |
|          MaxReturn |             200 |
|          MinReturn |             183 |
|          EpLenMean |             197 |
|           EpLenStd |            6.23 |
| TimestepsThisBatch |        1.18e+03 |
|     TimestepsSoFar |        7.18e+04 |
----------------------------------------
********** Iteration 68 ************
----------------------------------------
|               Time |            57.3 |
|         Loss Delta |            1.74 |
|         Loss After |          -0.652 |
|          Iteration |              68 |
|      AverageReturn |             186 |
|          StdReturn |            32.4 |
|          MaxReturn |             200 |
|          MinReturn |             113 |
|          EpLenMean |             186 |
|           EpLenStd |            32.4 |
| TimestepsThisBatch |        1.11e+03 |
|     TimestepsSoFar |        7.29e+04 |
--------------------

| TimestepsThisBatch |        1.07e+03 |
|     TimestepsSoFar |         8.6e+04 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |            68.1 |
|         Loss Delta |             1.6 |
|         Loss After |           -1.15 |
|          Iteration |              81 |
|      AverageReturn |             147 |
|          StdReturn |            48.2 |
|          MaxReturn |             200 |
|          MinReturn |              66 |
|          EpLenMean |             147 |
|           EpLenStd |            48.2 |
| TimestepsThisBatch |        1.18e+03 |
|     TimestepsSoFar |        8.72e+04 |
----------------------------------------
********** Iteration 82 ************
----------------------------------------
|               Time |            69.1 |
|         Loss Delta |          -0.926 |
|         Loss After |          -0.228 |
|          Iteration |              82 |
|      AverageReturn |  

|         Loss After |          -0.843 |
|          Iteration |              94 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        1.02e+05 |
----------------------------------------
********** Iteration 95 ************
----------------------------------------
|               Time |            80.3 |
|         Loss Delta |          -0.677 |
|         Loss After |          -0.166 |
|          Iteration |              95 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar

[2018-05-01 19:59:37,595] Making new env: CartPole-v0


********** Iteration 0 ************
----------------------------------------
|               Time |             1.9 |
|         Loss Delta |           1e+03 |
|         Loss After |           0.302 |
|          Iteration |               0 |
|      AverageReturn |            18.7 |
|          StdReturn |             8.5 |
|          MaxReturn |              60 |
|          MinReturn |              10 |
|          EpLenMean |            18.7 |
|           EpLenStd |             8.5 |
| TimestepsThisBatch |        1.01e+03 |
|     TimestepsSoFar |        1.01e+03 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |            2.64 |
|         Loss Delta |           0.168 |
|         Loss After |           0.134 |
|          Iteration |               1 |
|      AverageReturn |            23.6 |
|          StdReturn |            14.8 |
|          MaxReturn |              81 |
|          MinReturn |    

|          StdReturn |            20.3 |
|          MaxReturn |             116 |
|          MinReturn |              13 |
|          EpLenMean |            38.7 |
|           EpLenStd |            20.3 |
| TimestepsThisBatch |        1.00e+03 |
|     TimestepsSoFar |        1.43e+04 |
----------------------------------------
********** Iteration 14 ************
----------------------------------------
|               Time |            13.4 |
|         Loss Delta |          -0.141 |
|         Loss After |          -0.139 |
|          Iteration |              14 |
|      AverageReturn |              40 |
|          StdReturn |            15.8 |
|          MaxReturn |              71 |
|          MinReturn |              17 |
|          EpLenMean |              40 |
|           EpLenStd |            15.8 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        1.53e+04 |
----------------------------------------
********** Iteration 15 ************
------------------------

----------------------------------------
********** Iteration 27 ************
----------------------------------------
|               Time |            24.7 |
|         Loss Delta |           0.723 |
|         Loss After |           -1.25 |
|          Iteration |              27 |
|      AverageReturn |            66.9 |
|          StdReturn |            27.4 |
|          MaxReturn |             114 |
|          MinReturn |              18 |
|          EpLenMean |            66.9 |
|           EpLenStd |            27.4 |
| TimestepsThisBatch |        1.07e+03 |
|     TimestepsSoFar |        2.88e+04 |
----------------------------------------
********** Iteration 28 ************
----------------------------------------
|               Time |            25.5 |
|         Loss Delta |          -0.852 |
|         Loss After |            -0.4 |
|          Iteration |              28 |
|      AverageReturn |            57.4 |
|          StdReturn |            21.3 |
|          MaxReturn |  

|      AverageReturn |              90 |
|          StdReturn |            35.4 |
|          MaxReturn |             180 |
|          MinReturn |              32 |
|          EpLenMean |              90 |
|           EpLenStd |            35.4 |
| TimestepsThisBatch |        1.08e+03 |
|     TimestepsSoFar |        4.24e+04 |
----------------------------------------
********** Iteration 41 ************
----------------------------------------
|               Time |            36.4 |
|         Loss Delta |          -0.477 |
|         Loss After |           0.178 |
|          Iteration |              41 |
|      AverageReturn |             108 |
|          StdReturn |            45.8 |
|          MaxReturn |             200 |
|          MinReturn |              35 |
|          EpLenMean |             108 |
|           EpLenStd |            45.8 |
| TimestepsThisBatch |        1.08e+03 |
|     TimestepsSoFar |        4.35e+04 |
----------------------------------------
********** Iteration

|     TimestepsSoFar |        5.65e+04 |
----------------------------------------
********** Iteration 54 ************
----------------------------------------
|               Time |            46.7 |
|         Loss Delta |           -1.09 |
|         Loss After |           0.358 |
|          Iteration |              54 |
|      AverageReturn |             183 |
|          StdReturn |            37.3 |
|          MaxReturn |             200 |
|          MinReturn |             100 |
|          EpLenMean |             183 |
|           EpLenStd |            37.3 |
| TimestepsThisBatch |         1.1e+03 |
|     TimestepsSoFar |        5.76e+04 |
----------------------------------------
********** Iteration 55 ************
----------------------------------------
|               Time |            47.6 |
|         Loss Delta |           0.815 |
|         Loss After |          -0.458 |
|          Iteration |              55 |
|      AverageReturn |             187 |
|          StdReturn |  

|          Iteration |              67 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        7.23e+04 |
----------------------------------------
********** Iteration 68 ************
----------------------------------------
|               Time |            59.3 |
|         Loss Delta |          -0.774 |
|         Loss After |           -0.25 |
|          Iteration |              68 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        7.35e+04 |
--------------------

| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        8.76e+04 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |            70.9 |
|         Loss Delta |         -0.0353 |
|         Loss After |           0.358 |
|          Iteration |              81 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        8.88e+04 |
----------------------------------------
********** Iteration 82 ************
----------------------------------------
|               Time |            71.9 |
|         Loss Delta |          0.0891 |
|         Loss After |           0.269 |
|          Iteration |              82 |
|      AverageReturn |  

|         Loss After |          0.0663 |
|          Iteration |              94 |
|      AverageReturn |             184 |
|          StdReturn |            23.5 |
|          MaxReturn |             200 |
|          MinReturn |             142 |
|          EpLenMean |             184 |
|           EpLenStd |            23.5 |
| TimestepsThisBatch |         1.1e+03 |
|     TimestepsSoFar |        1.04e+05 |
----------------------------------------
********** Iteration 95 ************
----------------------------------------
|               Time |            84.7 |
|         Loss Delta |          -0.708 |
|         Loss After |           0.775 |
|          Iteration |              95 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar

[2018-05-01 20:01:05,908] Making new env: CartPole-v0


********** Iteration 0 ************
----------------------------------------
|               Time |            1.81 |
|         Loss Delta |           1e+03 |
|         Loss After |           0.112 |
|          Iteration |               0 |
|      AverageReturn |            22.8 |
|          StdReturn |              12 |
|          MaxReturn |              61 |
|          MinReturn |               9 |
|          EpLenMean |            22.8 |
|           EpLenStd |              12 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |           1e+03 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |            2.78 |
|         Loss Delta |           0.156 |
|         Loss After |         -0.0438 |
|          Iteration |               1 |
|      AverageReturn |            22.8 |
|          StdReturn |            13.8 |
|          MaxReturn |              72 |
|          MinReturn |    

|          StdReturn |            20.4 |
|          MaxReturn |              99 |
|          MinReturn |              15 |
|          EpLenMean |            45.3 |
|           EpLenStd |            20.4 |
| TimestepsThisBatch |        1.09e+03 |
|     TimestepsSoFar |        1.44e+04 |
----------------------------------------
********** Iteration 14 ************
----------------------------------------
|               Time |            13.5 |
|         Loss Delta |          -0.209 |
|         Loss After |           0.227 |
|          Iteration |              14 |
|      AverageReturn |            42.6 |
|          StdReturn |            24.1 |
|          MaxReturn |             115 |
|          MinReturn |              20 |
|          EpLenMean |            42.6 |
|           EpLenStd |            24.1 |
| TimestepsThisBatch |        1.02e+03 |
|     TimestepsSoFar |        1.54e+04 |
----------------------------------------
********** Iteration 15 ************
------------------------

----------------------------------------
********** Iteration 27 ************
----------------------------------------
|               Time |            23.9 |
|         Loss Delta |          -0.232 |
|         Loss After |         -0.0996 |
|          Iteration |              27 |
|      AverageReturn |             134 |
|          StdReturn |            40.7 |
|          MaxReturn |             200 |
|          MinReturn |              72 |
|          EpLenMean |             134 |
|           EpLenStd |            40.7 |
| TimestepsThisBatch |        1.07e+03 |
|     TimestepsSoFar |        2.89e+04 |
----------------------------------------
********** Iteration 28 ************
----------------------------------------
|               Time |            24.7 |
|         Loss Delta |            1.26 |
|         Loss After |           -1.36 |
|          Iteration |              28 |
|      AverageReturn |             139 |
|          StdReturn |            47.5 |
|          MaxReturn |  

|      AverageReturn |             186 |
|          StdReturn |            21.9 |
|          MaxReturn |             200 |
|          MinReturn |             143 |
|          EpLenMean |             186 |
|           EpLenStd |            21.9 |
| TimestepsThisBatch |        1.11e+03 |
|     TimestepsSoFar |        4.29e+04 |
----------------------------------------
********** Iteration 41 ************
----------------------------------------
|               Time |            34.6 |
|         Loss Delta |           0.399 |
|         Loss After |          -0.794 |
|          Iteration |              41 |
|      AverageReturn |             161 |
|          StdReturn |            38.9 |
|          MaxReturn |             200 |
|          MinReturn |             108 |
|          EpLenMean |             161 |
|           EpLenStd |            38.9 |
| TimestepsThisBatch |        1.13e+03 |
|     TimestepsSoFar |        4.41e+04 |
----------------------------------------
********** Iteration

|     TimestepsSoFar |        5.72e+04 |
----------------------------------------
********** Iteration 54 ************
----------------------------------------
|               Time |            45.4 |
|         Loss Delta |          -0.997 |
|         Loss After |           0.296 |
|          Iteration |              54 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        5.84e+04 |
----------------------------------------
********** Iteration 55 ************
----------------------------------------
|               Time |            46.2 |
|         Loss Delta |            1.02 |
|         Loss After |          -0.725 |
|          Iteration |              55 |
|      AverageReturn |             168 |
|          StdReturn |  

|          Iteration |              67 |
|      AverageReturn |             173 |
|          StdReturn |            39.3 |
|          MaxReturn |             200 |
|          MinReturn |             106 |
|          EpLenMean |             173 |
|           EpLenStd |            39.3 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        7.28e+04 |
----------------------------------------
********** Iteration 68 ************
----------------------------------------
|               Time |              57 |
|         Loss Delta |           0.615 |
|         Loss After |          -0.845 |
|          Iteration |              68 |
|      AverageReturn |             188 |
|          StdReturn |            24.2 |
|          MaxReturn |             200 |
|          MinReturn |             134 |
|          EpLenMean |             188 |
|           EpLenStd |            24.2 |
| TimestepsThisBatch |        1.13e+03 |
|     TimestepsSoFar |        7.39e+04 |
--------------------

| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        8.76e+04 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |            68.1 |
|         Loss Delta |          -0.267 |
|         Loss After |          0.0968 |
|          Iteration |              81 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        8.88e+04 |
----------------------------------------
********** Iteration 82 ************
----------------------------------------
|               Time |              69 |
|         Loss Delta |         -0.0584 |
|         Loss After |           0.155 |
|          Iteration |              82 |
|      AverageReturn |  

|         Loss After |          -0.123 |
|          Iteration |              94 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        1.04e+05 |
----------------------------------------
********** Iteration 95 ************
----------------------------------------
|               Time |            81.1 |
|         Loss Delta |         -0.0715 |
|         Loss After |         -0.0515 |
|          Iteration |              95 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar

[2018-05-01 20:02:30,800] Making new env: CartPole-v0


********** Iteration 0 ************
----------------------------------------
|               Time |            1.69 |
|         Loss Delta |           1e+03 |
|         Loss After |          -0.128 |
|          Iteration |               0 |
|      AverageReturn |            33.4 |
|          StdReturn |            13.8 |
|          MaxReturn |              61 |
|          MinReturn |              11 |
|          EpLenMean |            33.4 |
|           EpLenStd |            13.8 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |           1e+03 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |            2.73 |
|         Loss Delta |         -0.0207 |
|         Loss After |          -0.108 |
|          Iteration |               1 |
|      AverageReturn |              37 |
|          StdReturn |            18.3 |
|          MaxReturn |              81 |
|          MinReturn |    

|          StdReturn |            26.9 |
|          MaxReturn |             114 |
|          MinReturn |              25 |
|          EpLenMean |              62 |
|           EpLenStd |            26.9 |
| TimestepsThisBatch |        1.05e+03 |
|     TimestepsSoFar |        1.45e+04 |
----------------------------------------
********** Iteration 14 ************
----------------------------------------
|               Time |            12.9 |
|         Loss Delta |          -0.119 |
|         Loss After |           0.158 |
|          Iteration |              14 |
|      AverageReturn |            67.7 |
|          StdReturn |            32.7 |
|          MaxReturn |             162 |
|          MinReturn |              26 |
|          EpLenMean |            67.7 |
|           EpLenStd |            32.7 |
| TimestepsThisBatch |        1.02e+03 |
|     TimestepsSoFar |        1.55e+04 |
----------------------------------------
********** Iteration 15 ************
------------------------

----------------------------------------
********** Iteration 27 ************
----------------------------------------
|               Time |              23 |
|         Loss Delta |          -0.452 |
|         Loss After |          -0.256 |
|          Iteration |              27 |
|      AverageReturn |            87.1 |
|          StdReturn |              30 |
|          MaxReturn |             133 |
|          MinReturn |              25 |
|          EpLenMean |            87.1 |
|           EpLenStd |              30 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        2.88e+04 |
----------------------------------------
********** Iteration 28 ************
----------------------------------------
|               Time |            23.9 |
|         Loss Delta |            1.83 |
|         Loss After |           -2.08 |
|          Iteration |              28 |
|      AverageReturn |             106 |
|          StdReturn |            58.1 |
|          MaxReturn |  

|      AverageReturn |             171 |
|          StdReturn |            26.4 |
|          MaxReturn |             200 |
|          MinReturn |             132 |
|          EpLenMean |             171 |
|           EpLenStd |            26.4 |
| TimestepsThisBatch |        1.02e+03 |
|     TimestepsSoFar |        4.29e+04 |
----------------------------------------
********** Iteration 41 ************
----------------------------------------
|               Time |            35.9 |
|         Loss Delta |           -1.02 |
|         Loss After |            0.61 |
|          Iteration |              41 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        4.41e+04 |
----------------------------------------
********** Iteration

|     TimestepsSoFar |        5.77e+04 |
----------------------------------------
********** Iteration 54 ************
----------------------------------------
|               Time |            45.7 |
|         Loss Delta |           0.566 |
|         Loss After |          -0.402 |
|          Iteration |              54 |
|      AverageReturn |             174 |
|          StdReturn |            41.7 |
|          MaxReturn |             200 |
|          MinReturn |              89 |
|          EpLenMean |             174 |
|           EpLenStd |            41.7 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        5.88e+04 |
----------------------------------------
********** Iteration 55 ************
----------------------------------------
|               Time |            46.7 |
|         Loss Delta |           0.718 |
|         Loss After |           -1.12 |
|          Iteration |              55 |
|      AverageReturn |             176 |
|          StdReturn |  

|          Iteration |              67 |
|      AverageReturn |             170 |
|          StdReturn |            43.8 |
|          MaxReturn |             200 |
|          MinReturn |              94 |
|          EpLenMean |             170 |
|           EpLenStd |            43.8 |
| TimestepsThisBatch |        1.02e+03 |
|     TimestepsSoFar |        7.28e+04 |
----------------------------------------
********** Iteration 68 ************
----------------------------------------
|               Time |            57.4 |
|         Loss Delta |           0.116 |
|         Loss After |           -0.99 |
|          Iteration |              68 |
|      AverageReturn |             196 |
|          StdReturn |             8.2 |
|          MaxReturn |             200 |
|          MinReturn |             178 |
|          EpLenMean |             196 |
|           EpLenStd |             8.2 |
| TimestepsThisBatch |        1.18e+03 |
|     TimestepsSoFar |        7.39e+04 |
--------------------

| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        8.78e+04 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |              69 |
|         Loss Delta |           -1.91 |
|         Loss After |            1.85 |
|          Iteration |              81 |
|      AverageReturn |             181 |
|          StdReturn |            42.1 |
|          MaxReturn |             200 |
|          MinReturn |              87 |
|          EpLenMean |             181 |
|           EpLenStd |            42.1 |
| TimestepsThisBatch |        1.09e+03 |
|     TimestepsSoFar |        8.89e+04 |
----------------------------------------
********** Iteration 82 ************
----------------------------------------
|               Time |            69.7 |
|         Loss Delta |            2.11 |
|         Loss After |          -0.261 |
|          Iteration |              82 |
|      AverageReturn |  

|         Loss After |          -0.671 |
|          Iteration |              94 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |        1.04e+05 |
----------------------------------------
********** Iteration 95 ************
----------------------------------------
|               Time |            81.2 |
|         Loss Delta |           0.702 |
|         Loss After |           -1.37 |
|          Iteration |              95 |
|      AverageReturn |             191 |
|          StdReturn |            20.9 |
|          MaxReturn |             200 |
|          MinReturn |             144 |
|          EpLenMean |             191 |
|           EpLenStd |            20.9 |
| TimestepsThisBatch |        1.14e+03 |
|     TimestepsSoFar

[2018-05-01 20:03:55,955] Making new env: CartPole-v0


********** Iteration 0 ************
----------------------------------------
|               Time |            1.83 |
|         Loss Delta |           1e+03 |
|         Loss After |           0.159 |
|          Iteration |               0 |
|      AverageReturn |            22.9 |
|          StdReturn |              13 |
|          MaxReturn |              54 |
|          MinReturn |               9 |
|          EpLenMean |            22.9 |
|           EpLenStd |              13 |
| TimestepsThisBatch |        1.03e+03 |
|     TimestepsSoFar |        1.03e+03 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |            2.69 |
|         Loss Delta |           0.116 |
|         Loss After |           0.043 |
|          Iteration |               1 |
|      AverageReturn |            21.6 |
|          StdReturn |            12.4 |
|          MaxReturn |              80 |
|          MinReturn |    

|          StdReturn |            34.4 |
|          MaxReturn |             124 |
|          MinReturn |              14 |
|          EpLenMean |            60.2 |
|           EpLenStd |            34.4 |
| TimestepsThisBatch |        1.08e+03 |
|     TimestepsSoFar |        1.44e+04 |
----------------------------------------
********** Iteration 14 ************
----------------------------------------
|               Time |            12.7 |
|         Loss Delta |          0.0914 |
|         Loss After |          -0.545 |
|          Iteration |              14 |
|      AverageReturn |            43.7 |
|          StdReturn |            19.5 |
|          MaxReturn |              89 |
|          MinReturn |              22 |
|          EpLenMean |            43.7 |
|           EpLenStd |            19.5 |
| TimestepsThisBatch |           1e+03 |
|     TimestepsSoFar |        1.54e+04 |
----------------------------------------
********** Iteration 15 ************
------------------------

----------------------------------------
********** Iteration 27 ************
----------------------------------------
|               Time |            24.2 |
|         Loss Delta |          0.0688 |
|         Loss After |          -0.568 |
|          Iteration |              27 |
|      AverageReturn |            71.9 |
|          StdReturn |            41.3 |
|          MaxReturn |             151 |
|          MinReturn |              19 |
|          EpLenMean |            71.9 |
|           EpLenStd |            41.3 |
| TimestepsThisBatch |        1.08e+03 |
|     TimestepsSoFar |         2.9e+04 |
----------------------------------------
********** Iteration 28 ************
----------------------------------------
|               Time |            25.1 |
|         Loss Delta |          -0.427 |
|         Loss After |          -0.141 |
|          Iteration |              28 |
|      AverageReturn |             130 |
|          StdReturn |            43.2 |
|          MaxReturn |  

|      AverageReturn |             138 |
|          StdReturn |            55.4 |
|          MaxReturn |             200 |
|          MinReturn |              39 |
|          EpLenMean |             138 |
|           EpLenStd |            55.4 |
| TimestepsThisBatch |         1.1e+03 |
|     TimestepsSoFar |        4.29e+04 |
----------------------------------------
********** Iteration 41 ************
----------------------------------------
|               Time |            36.2 |
|         Loss Delta |           -1.19 |
|         Loss After |           0.805 |
|          Iteration |              41 |
|      AverageReturn |             174 |
|          StdReturn |            30.3 |
|          MaxReturn |             200 |
|          MinReturn |             130 |
|          EpLenMean |             174 |
|           EpLenStd |            30.3 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |        4.39e+04 |
----------------------------------------
********** Iteration

|     TimestepsSoFar |        5.74e+04 |
----------------------------------------
********** Iteration 54 ************
----------------------------------------
|               Time |            47.2 |
|         Loss Delta |          -0.389 |
|         Loss After |            -0.4 |
|          Iteration |              54 |
|      AverageReturn |             192 |
|          StdReturn |             6.9 |
|          MaxReturn |             200 |
|          MinReturn |             183 |
|          EpLenMean |             192 |
|           EpLenStd |             6.9 |
| TimestepsThisBatch |        1.15e+03 |
|     TimestepsSoFar |        5.86e+04 |
----------------------------------------
********** Iteration 55 ************
----------------------------------------
|               Time |            48.1 |
|         Loss Delta |          -0.177 |
|         Loss After |          -0.223 |
|          Iteration |              55 |
|      AverageReturn |             186 |
|          StdReturn |  

|          Iteration |              67 |
|      AverageReturn |             200 |
|          StdReturn |               0 |
|          MaxReturn |             200 |
|          MinReturn |             200 |
|          EpLenMean |             200 |
|           EpLenStd |               0 |
| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |         7.3e+04 |
----------------------------------------
********** Iteration 68 ************
----------------------------------------
|               Time |            58.5 |
|         Loss Delta |           0.201 |
|         Loss After |           0.338 |
|          Iteration |              68 |
|      AverageReturn |             179 |
|          StdReturn |            30.2 |
|          MaxReturn |             200 |
|          MinReturn |             125 |
|          EpLenMean |             179 |
|           EpLenStd |            30.2 |
| TimestepsThisBatch |        1.08e+03 |
|     TimestepsSoFar |         7.4e+04 |
--------------------

| TimestepsThisBatch |         1.2e+03 |
|     TimestepsSoFar |         8.8e+04 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |            69.9 |
|         Loss Delta |             1.1 |
|         Loss After |          -0.315 |
|          Iteration |              81 |
|      AverageReturn |             173 |
|          StdReturn |            38.5 |
|          MaxReturn |             200 |
|          MinReturn |             107 |
|          EpLenMean |             173 |
|           EpLenStd |            38.5 |
| TimestepsThisBatch |        1.04e+03 |
|     TimestepsSoFar |         8.9e+04 |
----------------------------------------
********** Iteration 82 ************
----------------------------------------
|               Time |            70.7 |
|         Loss Delta |           0.241 |
|         Loss After |          -0.555 |
|          Iteration |              82 |
|      AverageReturn |  

|         Loss After |          -0.214 |
|          Iteration |              94 |
|      AverageReturn |             183 |
|          StdReturn |            37.3 |
|          MaxReturn |             200 |
|          MinReturn |             100 |
|          EpLenMean |             183 |
|           EpLenStd |            37.3 |
| TimestepsThisBatch |         1.1e+03 |
|     TimestepsSoFar |        1.04e+05 |
----------------------------------------
********** Iteration 95 ************
----------------------------------------
|               Time |            81.7 |
|         Loss Delta |           0.425 |
|         Loss After |          -0.639 |
|          Iteration |              95 |
|      AverageReturn |             180 |
|          StdReturn |            42.6 |
|          MaxReturn |             200 |
|          MinReturn |              85 |
|          EpLenMean |             180 |
|           EpLenStd |            42.6 |
| TimestepsThisBatch |        1.08e+03 |
|     TimestepsSoFar

In [16]:
runmain('HalfCheetah-v1', 
        n_iter = 100, 
        batch_size = 50000, 
        discount = 0.9,
        learning_rate = 0.025,
        n_experiments = 1,  
        dont_normalize_advantages = False,
        reward_to_go = True,
        nn_baseline = True,
        exp_name = 'lb_bl_rtg_dna',
        ep_len = 150,
        n_layers = 1)

n_experiments: 1
Running experiment with seed 1
[32;1mLogging data to data/lb_bl_rtg_dna_HalfCheetah-v1_01-05-2018_16-30-42/1/log.txt[0m


[2018-05-01 16:30:42,281] Making new env: HalfCheetah-v1


********** Iteration 0 ************
----------------------------------------
|               Time |            66.4 |
|         Loss Delta |           1e+03 |
|         Loss After |         -0.0769 |
|          Iteration |               0 |
|      AverageReturn |            -172 |
|          StdReturn |            41.2 |
|          MaxReturn |           -79.4 |
|          MinReturn |            -377 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        5.01e+04 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |             129 |
|         Loss Delta |         0.00902 |
|         Loss After |         -0.0859 |
|          Iteration |               1 |
|      AverageReturn |            -125 |
|          StdReturn |            40.2 |
|          MaxReturn |           -12.5 |
|          MinReturn |    

|          StdReturn |            24.7 |
|          MaxReturn |            20.7 |
|          MinReturn |            -127 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        7.02e+05 |
----------------------------------------
********** Iteration 14 ************
----------------------------------------
|               Time |             935 |
|         Loss Delta |         -0.0143 |
|         Loss After |         -0.0719 |
|          Iteration |              14 |
|      AverageReturn |           -37.7 |
|          StdReturn |              23 |
|          MaxReturn |            23.1 |
|          MinReturn |            -135 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        7.52e+05 |
----------------------------------------
********** Iteration 15 ************
------------------------

----------------------------------------
********** Iteration 27 ************
----------------------------------------
|               Time |        1.75e+03 |
|         Loss Delta |          0.0197 |
|         Loss After |          -0.081 |
|          Iteration |              27 |
|      AverageReturn |           -6.43 |
|          StdReturn |            12.9 |
|          MaxReturn |            31.4 |
|          MinReturn |           -55.7 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |         1.4e+06 |
----------------------------------------
********** Iteration 28 ************
----------------------------------------
|               Time |        1.82e+03 |
|         Loss Delta |          -0.021 |
|         Loss After |         -0.0599 |
|          Iteration |              28 |
|      AverageReturn |           -2.93 |
|          StdReturn |              13 |
|          MaxReturn |  

|      AverageReturn |            42.9 |
|          StdReturn |            18.5 |
|          MaxReturn |            84.5 |
|          MinReturn |           -16.6 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        2.06e+06 |
----------------------------------------
********** Iteration 41 ************
----------------------------------------
|               Time |        2.64e+03 |
|         Loss Delta |         0.00425 |
|         Loss After |         -0.0514 |
|          Iteration |              41 |
|      AverageReturn |            49.6 |
|          StdReturn |              25 |
|          MaxReturn |             103 |
|          MinReturn |           -63.1 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        2.11e+06 |
----------------------------------------
********** Iteration

|     TimestepsSoFar |        2.71e+06 |
----------------------------------------
********** Iteration 54 ************
----------------------------------------
|               Time |        3.44e+03 |
|         Loss Delta |        -0.00441 |
|         Loss After |         -0.0502 |
|          Iteration |              54 |
|      AverageReturn |            79.8 |
|          StdReturn |            24.8 |
|          MaxReturn |             129 |
|          MinReturn |             -60 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        2.76e+06 |
----------------------------------------
********** Iteration 55 ************
----------------------------------------
|               Time |         3.5e+03 |
|         Loss Delta |          0.0133 |
|         Loss After |         -0.0635 |
|          Iteration |              55 |
|      AverageReturn |            90.8 |
|          StdReturn |  

|          Iteration |              67 |
|      AverageReturn |             128 |
|          StdReturn |            37.8 |
|          MaxReturn |             186 |
|          MinReturn |           -36.7 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        3.41e+06 |
----------------------------------------
********** Iteration 68 ************
----------------------------------------
|               Time |        4.28e+03 |
|         Loss Delta |         -0.0034 |
|         Loss After |         -0.0259 |
|          Iteration |              68 |
|      AverageReturn |             131 |
|          StdReturn |            33.9 |
|          MaxReturn |             189 |
|          MinReturn |           -34.6 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        3.46e+06 |
--------------------

| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        4.06e+06 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |        5.09e+03 |
|         Loss Delta |         -0.0144 |
|         Loss After |        -0.00823 |
|          Iteration |              81 |
|      AverageReturn |             147 |
|          StdReturn |            28.6 |
|          MaxReturn |             217 |
|          MinReturn |           -22.2 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        4.11e+06 |
----------------------------------------
********** Iteration 82 ************
----------------------------------------
|               Time |        5.14e+03 |
|         Loss Delta |          0.0163 |
|         Loss After |         -0.0245 |
|          Iteration |              82 |
|      AverageReturn |  

|         Loss After |         -0.0078 |
|          Iteration |              94 |
|      AverageReturn |             150 |
|          StdReturn |            26.2 |
|          MaxReturn |             194 |
|          MinReturn |           -2.31 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar |        4.76e+06 |
----------------------------------------
********** Iteration 95 ************
----------------------------------------
|               Time |        5.95e+03 |
|         Loss Delta |         0.00974 |
|         Loss After |         -0.0175 |
|          Iteration |              95 |
|      AverageReturn |             151 |
|          StdReturn |            26.9 |
|          MaxReturn |             196 |
|          MinReturn |           -13.3 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.01e+04 |
|     TimestepsSoFar

In [17]:
runmain('InvertedPendulum-v1', 
        n_iter = 100, 
        batch_size = 5000 , 
        n_experiments = 1,  
        dont_normalize_advantages = False,
        reward_to_go = True,
        nn_baseline = True,
        exp_name = 'sb_bl_rtg_dna',
        n_layers = 2)

n_experiments: 1
Running experiment with seed 1
[32;1mLogging data to data/sb_bl_rtg_dna_InvertedPendulum-v1_01-05-2018_19-10-15/1/log.txt[0m


[2018-05-01 19:10:15,638] Making new env: InvertedPendulum-v1


********** Iteration 0 ************
----------------------------------------
|               Time |            7.81 |
|         Loss Delta |           1e+03 |
|         Loss After |           -0.03 |
|          Iteration |               0 |
|      AverageReturn |            8.62 |
|          StdReturn |            5.16 |
|          MaxReturn |              44 |
|          MinReturn |               3 |
|          EpLenMean |            8.62 |
|           EpLenStd |            5.16 |
| TimestepsThisBatch |           5e+03 |
|     TimestepsSoFar |           5e+03 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |            13.7 |
|         Loss Delta |          0.0022 |
|         Loss After |         -0.0322 |
|          Iteration |               1 |
|      AverageReturn |            12.1 |
|          StdReturn |            7.83 |
|          MaxReturn |              59 |
|          MinReturn |    

|          StdReturn |            25.3 |
|          MaxReturn |             125 |
|          MinReturn |               8 |
|          EpLenMean |            47.7 |
|           EpLenStd |            25.3 |
| TimestepsThisBatch |        5.01e+03 |
|     TimestepsSoFar |        7.02e+04 |
----------------------------------------
********** Iteration 14 ************
----------------------------------------
|               Time |            83.5 |
|         Loss Delta |         0.00679 |
|         Loss After |         -0.0106 |
|          Iteration |              14 |
|      AverageReturn |            46.1 |
|          StdReturn |              25 |
|          MaxReturn |             146 |
|          MinReturn |              10 |
|          EpLenMean |            46.1 |
|           EpLenStd |              25 |
| TimestepsThisBatch |        5.02e+03 |
|     TimestepsSoFar |        7.52e+04 |
----------------------------------------
********** Iteration 15 ************
------------------------

----------------------------------------
********** Iteration 27 ************
----------------------------------------
|               Time |             152 |
|         Loss Delta |        -0.00732 |
|         Loss After |          -0.012 |
|          Iteration |              27 |
|      AverageReturn |             111 |
|          StdReturn |              44 |
|          MaxReturn |             213 |
|          MinReturn |              29 |
|          EpLenMean |             111 |
|           EpLenStd |              44 |
| TimestepsThisBatch |        5.12e+03 |
|     TimestepsSoFar |        1.41e+05 |
----------------------------------------
********** Iteration 28 ************
----------------------------------------
|               Time |             156 |
|         Loss Delta |           0.005 |
|         Loss After |          -0.017 |
|          Iteration |              28 |
|      AverageReturn |             106 |
|          StdReturn |            45.1 |
|          MaxReturn |  

|      AverageReturn |             141 |
|          StdReturn |            62.7 |
|          MaxReturn |             407 |
|          MinReturn |              54 |
|          EpLenMean |             141 |
|           EpLenStd |            62.7 |
| TimestepsThisBatch |        5.07e+03 |
|     TimestepsSoFar |        2.07e+05 |
----------------------------------------
********** Iteration 41 ************
----------------------------------------
|               Time |             226 |
|         Loss Delta |         0.00657 |
|         Loss After |        -0.00992 |
|          Iteration |              41 |
|      AverageReturn |             154 |
|          StdReturn |            62.9 |
|          MaxReturn |             338 |
|          MinReturn |              55 |
|          EpLenMean |             154 |
|           EpLenStd |            62.9 |
| TimestepsThisBatch |        5.09e+03 |
|     TimestepsSoFar |        2.12e+05 |
----------------------------------------
********** Iteration

|     TimestepsSoFar |        2.73e+05 |
----------------------------------------
********** Iteration 54 ************
----------------------------------------
|               Time |             299 |
|         Loss Delta |          -0.013 |
|         Loss After |         -0.0033 |
|          Iteration |              54 |
|      AverageReturn |             238 |
|          StdReturn |            77.5 |
|          MaxReturn |             450 |
|          MinReturn |              57 |
|          EpLenMean |             238 |
|           EpLenStd |            77.5 |
| TimestepsThisBatch |        5.23e+03 |
|     TimestepsSoFar |        2.78e+05 |
----------------------------------------
********** Iteration 55 ************
----------------------------------------
|               Time |             304 |
|         Loss Delta |         -0.0103 |
|         Loss After |         0.00704 |
|          Iteration |              55 |
|      AverageReturn |             229 |
|          StdReturn |  

|          Iteration |              67 |
|      AverageReturn |             572 |
|          StdReturn |             264 |
|          MaxReturn |           1e+03 |
|          MinReturn |             155 |
|          EpLenMean |             572 |
|           EpLenStd |             264 |
| TimestepsThisBatch |        5.14e+03 |
|     TimestepsSoFar |        3.45e+05 |
----------------------------------------
********** Iteration 68 ************
----------------------------------------
|               Time |             379 |
|         Loss Delta |         -0.0155 |
|         Loss After |         0.00647 |
|          Iteration |              68 |
|      AverageReturn |             532 |
|          StdReturn |             189 |
|          MaxReturn |             926 |
|          MinReturn |             233 |
|          EpLenMean |             532 |
|           EpLenStd |             189 |
| TimestepsThisBatch |        5.32e+03 |
|     TimestepsSoFar |         3.5e+05 |
--------------------

| TimestepsThisBatch |        5.02e+03 |
|     TimestepsSoFar |        4.15e+05 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |             452 |
|         Loss Delta |          0.0104 |
|         Loss After |         -0.0117 |
|          Iteration |              81 |
|      AverageReturn |           1e+03 |
|          StdReturn |               0 |
|          MaxReturn |           1e+03 |
|          MinReturn |           1e+03 |
|          EpLenMean |           1e+03 |
|           EpLenStd |               0 |
| TimestepsThisBatch |           6e+03 |
|     TimestepsSoFar |        4.21e+05 |
----------------------------------------
********** Iteration 82 ************
----------------------------------------
|               Time |             457 |
|         Loss Delta |         0.00202 |
|         Loss After |         -0.0137 |
|          Iteration |              82 |
|      AverageReturn |  

|         Loss After |        -0.00106 |
|          Iteration |              94 |
|      AverageReturn |             715 |
|          StdReturn |             370 |
|          MaxReturn |           1e+03 |
|          MinReturn |             145 |
|          EpLenMean |             715 |
|           EpLenStd |             370 |
| TimestepsThisBatch |        5.72e+03 |
|     TimestepsSoFar |        4.92e+05 |
----------------------------------------
********** Iteration 95 ************
----------------------------------------
|               Time |             535 |
|         Loss Delta |         -0.0249 |
|         Loss After |          0.0238 |
|          Iteration |              95 |
|      AverageReturn |             982 |
|          StdReturn |            41.4 |
|          MaxReturn |           1e+03 |
|          MinReturn |             889 |
|          EpLenMean |             982 |
|           EpLenStd |            41.4 |
| TimestepsThisBatch |        5.89e+03 |
|     TimestepsSoFar