In [1]:
import numpy as np
import tensorflow as tf
import gym
import logz
import scipy.signal
import os
import time
import inspect
from multiprocessing import Process

%load_ext autoreload
%autoreload 2

  return f(*args, **kwds)


In [2]:
#============================================================================================#
# Utilities
#============================================================================================#

def build_mlp(
        input_placeholder, 
        output_size,
        scope, 
        n_layers=2, 
        size=64, 
        activation=tf.tanh,
        output_activation=None
        ):
    #========================================================================================#
    #                           ----------SECTION 3----------
    # Network building
    #
    # Your code should make a feedforward neural network (also called a multilayer perceptron)
    # with 'n_layers' hidden layers of size 'size' units. 
    # 
    # The output layer should have size 'output_size' and activation 'output_activation'.
    #
    # Hint: use tf.layers.dense
    #========================================================================================#

    with tf.variable_scope(scope):
        # YOUR_CODE_HERE
        input_layers = input_placeholder
        for i in range(n_layers):
            input_layers = tf.layers.dense(inputs = input_layers, 
                                           units = size, 
                                           activation = activation, 
                                           name = scope + str(i+1) + "_layer")
            
        output_layer = tf.layers.dense(inputs = input_layers, 
                                 units = output_size, 
                                 activation = output_activation, 
                                 name = scope + "output_layer")
        
        return output_layer

def pathlength(path):
    return len(path["reward"])



In [3]:
#============================================================================================#
# Policy Gradient
#============================================================================================#

def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps
    
    loss_before = 1000

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name = "adv", dtype = tf.float32) #TODO


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, 'discrete_mlp', n_layers, size) #TODO
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1]) #TODO # Hint: Use the tf.multinomial op
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na) #TODO

    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(sy_ob_no, ac_dim, "continuous_mlp", n_layers, size) #TODO
        sy_logstd = tf.Variable(tf.zeros([1, ac_dim]), name='logstd',
                dtype=tf.float32) #TODO # logstd should just be a trainable variable, not a network output.
        sy_std = tf.exp(sy_logstd)
        sy_sampled_ac = tf.random_normal(shape= tf.shape(sy_mean), mean = sy_mean, stddev = sy_std) #sy_mean + sy_std * tf.random_normal(tf.shape(sy_mean)) #TODO
        sy_logprob_n = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, scale_diag=sy_std).log_prob(sy_ac_na) #TODO  # Hint: Use the log probability under a multivariate gaussian. 



    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = - tf.reduce_mean(sy_logprob_n * sy_adv_n) #TODO # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        baseline_targets = tf.placeholder(shape = [None,], name = "baseline_targets", dtype = tf.float32)
        baseline_loss = tf.nn.l2_loss(baseline_prediction - baseline_targets)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss) #TODO


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101


    #print("reach Training loop")
    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)

                #print("current ac:", ac)
                #print("before ob:", ob)
                ob, rew, done, _ = env.step(ac)
                #print("After ob:", ob)
                #print("rew:", rew)
                #print("done:", done)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#
        
        # YOUR_CODE_HERE
        #q_n = TODO
        q_n = []
        
        for path in paths:
            r_nr = path["reward"]
            step_n = len(r_nr)
            
            if reward_to_go:
                q = [np.sum(np.power(gamma, np.arange(step_n-t))* r_nr[t:]) for t in range(step_n)]
            else:
                q = [np.sum(np.power(gamma, np.arange(step_n) * r_nr)) for t in range(step_n)]
                
            q_n.extend(q)
        #if reward_to_go:
            

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict = {sy_ob_no: ob_no}) #TODO
            b_n_norm = (b_n - np.mean(b_n))/ (np.std(b_n,axis = 0) + 1e-7)
            b_n = b_n_norm * np.std(q_n, axis = 0) + np.mean(q_n, axis = 0)
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()
        
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            #pass
            adv_n = (adv_n - np.mean(adv_n, axis = 0)) / (np.std(adv_n, axis = 0) + 1e-7)


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            #pass
            q_n_norm = (q_n - np.mean(q_n, axis = 0)) / (np.std(q_n, axis = 0) + 1e-7)
            sess.run(baseline_update_op, feed_dict = {sy_ob_no: ob_no, baseline_targets: q_n_norm})

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 
        
        # YOUR_CODE_HERE
        feed_dict = {sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n}
        #loss_before = sess.run(loss, feed_dict = feed_dict)
        _, loss_after = sess.run([update_op, loss], feed_dict = feed_dict)

        # Log diagnostics
        
        #print("log part is here?")
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Loss Delta", loss_before - loss_after)
        logz.log_tabular("Loss After", loss_after)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
        
        loss_before = loss_after

In [4]:
def runmain(env_name, 
           exp_name = 'vpg',
           render = False,
           discount = 1.0, 
           n_iter = 100, 
           batch_size = 1000, 
           ep_len = -1., 
           learning_rate = 5e-3,
           reward_to_go = False,
           dont_normalize_advantages = False,
           nn_baseline = False,
           seed = 1, 
           n_experiments = 1,
           n_layers = 1,
           size = 32):
    
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vpg')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--reward_to_go', '-rtg', action='store_true')
    parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
    parser.add_argument('--nn_baseline', '-bl', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--n_layers', '-l', type=int, default=1)
    parser.add_argument('--size', '-s', type=int, default=32)
    args = parser.parse_args()
    """
    
    if not(os.path.exists('data')):
        os.makedirs('data')
    logdir = exp_name + '_' + env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    if not(os.path.exists(logdir)):
        os.makedirs(logdir)

    max_path_length = ep_len if ep_len > 0 else None

    for e in range(n_experiments):
        #print('n_experiments:', n_experiments)
        seed = seed + 10*e
        print('Running experiment with seed %d'%seed)
        def train_func():
            train_PG(
                exp_name=exp_name,
                env_name=env_name,
                n_iter=n_iter,
                gamma=discount,
                min_timesteps_per_batch=batch_size,
                max_path_length=max_path_length,
                learning_rate=learning_rate,
                reward_to_go=reward_to_go,
                animate=render,
                logdir=os.path.join(logdir,'%d'%seed),
                normalize_advantages=not(dont_normalize_advantages),
                nn_baseline=nn_baseline, 
                seed=seed,
                n_layers=n_layers,
                size=size
                )
        # Awkward hacky process runs, because Tensorflow does not like
        # repeatedly calling train_PG in the same thread.
        p = Process(target=train_func, args=tuple())
        p.start()
        p.join()
        


In [5]:
"""
runmain('CartPole-v0', 
        n_iter = 100, 
        batch_size = 1000 , 
        n_experiments = 5,  
        dont_normalize_advantages = True,
        reward_to_go = True,
        nn_baseline = True,
        exp_name = 'sb_bl_rtg_dna')

"""

"\nrunmain('CartPole-v0', \n        n_iter = 100, \n        batch_size = 1000 , \n        n_experiments = 5,  \n        dont_normalize_advantages = True,\n        reward_to_go = True,\n        nn_baseline = True,\n        exp_name = 'sb_bl_rtg_dna',\n        n_layers = 2)\n\n"

In [13]:
runmain('HalfCheetah-v1', 
        n_iter = 100, 
        batch_size = 5000 , 
        discount = 0.9,
        learning_rate = 1e-2,
        n_experiments = 1,  
        dont_normalize_advantages = False,
        reward_to_go = True,
        nn_baseline = True,
        exp_name = 'lb_bl_rtg_dna',
        ep_len = 150,
        n_layers = 3)

n_experiments: 1
Running experiment with seed 1
[32;1mLogging data to data/lb_bl_rtg_dna_HalfCheetah-v1_01-05-2018_15-40-59/1/log.txt[0m


[2018-05-01 15:40:59,445] Making new env: HalfCheetah-v1


********** Iteration 0 ************
----------------------------------------
|               Time |            8.66 |
|         Loss Delta |           1e+03 |
|         Loss After |         -0.0849 |
|          Iteration |               0 |
|      AverageReturn |            -110 |
|          StdReturn |            37.7 |
|          MaxReturn |           -34.1 |
|          MinReturn |            -200 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        5.13e+03 |
----------------------------------------
********** Iteration 1 ************
----------------------------------------
|               Time |            16.5 |
|         Loss Delta |        -0.00433 |
|         Loss After |         -0.0806 |
|          Iteration |               1 |
|      AverageReturn |            -111 |
|          StdReturn |            46.4 |
|          MaxReturn |             -49 |
|          MinReturn |    

|          StdReturn |            28.8 |
|          MaxReturn |           -19.2 |
|          MinReturn |            -135 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        7.19e+04 |
----------------------------------------
********** Iteration 14 ************
----------------------------------------
|               Time |             104 |
|         Loss Delta |         0.00428 |
|         Loss After |         -0.0943 |
|          Iteration |              14 |
|      AverageReturn |             -65 |
|          StdReturn |            32.7 |
|          MaxReturn |             4.2 |
|          MinReturn |            -154 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |         7.7e+04 |
----------------------------------------
********** Iteration 15 ************
------------------------

----------------------------------------
********** Iteration 27 ************
----------------------------------------
|               Time |             194 |
|         Loss Delta |        -0.00571 |
|         Loss After |         -0.0931 |
|          Iteration |              27 |
|      AverageReturn |           -44.9 |
|          StdReturn |            32.7 |
|          MaxReturn |            1.82 |
|          MinReturn |            -124 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        1.44e+05 |
----------------------------------------
********** Iteration 28 ************
----------------------------------------
|               Time |             201 |
|         Loss Delta |         0.00241 |
|         Loss After |         -0.0955 |
|          Iteration |              28 |
|      AverageReturn |           -47.3 |
|          StdReturn |            31.7 |
|          MaxReturn |  

|      AverageReturn |             -38 |
|          StdReturn |            17.6 |
|          MaxReturn |           -6.27 |
|          MinReturn |           -76.9 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |         2.1e+05 |
----------------------------------------
********** Iteration 41 ************
----------------------------------------
|               Time |             286 |
|         Loss Delta |         -0.0456 |
|         Loss After |         -0.0578 |
|          Iteration |              41 |
|      AverageReturn |           -26.5 |
|          StdReturn |            25.5 |
|          MaxReturn |            32.7 |
|          MinReturn |           -99.8 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        2.16e+05 |
----------------------------------------
********** Iteration

|     TimestepsSoFar |        2.77e+05 |
----------------------------------------
********** Iteration 54 ************
----------------------------------------
|               Time |             370 |
|         Loss Delta |          0.0178 |
|         Loss After |         -0.0826 |
|          Iteration |              54 |
|      AverageReturn |           -23.5 |
|          StdReturn |            24.1 |
|          MaxReturn |            10.1 |
|          MinReturn |           -66.4 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        2.82e+05 |
----------------------------------------
********** Iteration 55 ************
----------------------------------------
|               Time |             375 |
|         Loss Delta |          0.0352 |
|         Loss After |          -0.118 |
|          Iteration |              55 |
|      AverageReturn |           -15.7 |
|          StdReturn |  

|          Iteration |              67 |
|      AverageReturn |           -17.9 |
|          StdReturn |            22.1 |
|          MaxReturn |            31.1 |
|          MinReturn |           -64.3 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        3.49e+05 |
----------------------------------------
********** Iteration 68 ************
----------------------------------------
|               Time |             461 |
|         Loss Delta |        -0.00787 |
|         Loss After |         -0.0883 |
|          Iteration |              68 |
|      AverageReturn |           -11.5 |
|          StdReturn |              27 |
|          MaxReturn |            32.9 |
|          MinReturn |             -79 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        3.54e+05 |
--------------------

| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        4.16e+05 |
----------------------------------------
********** Iteration 81 ************
----------------------------------------
|               Time |             548 |
|         Loss Delta |          -0.021 |
|         Loss After |         -0.0557 |
|          Iteration |              81 |
|      AverageReturn |            12.9 |
|          StdReturn |              22 |
|          MaxReturn |              56 |
|          MinReturn |           -40.1 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        4.21e+05 |
----------------------------------------
********** Iteration 82 ************
----------------------------------------
|               Time |             556 |
|         Loss Delta |         0.00796 |
|         Loss After |         -0.0636 |
|          Iteration |              82 |
|      AverageReturn |  

|         Loss After |         -0.0523 |
|          Iteration |              94 |
|      AverageReturn |            12.5 |
|          StdReturn |            23.9 |
|          MaxReturn |            47.7 |
|          MinReturn |           -52.6 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar |        4.88e+05 |
----------------------------------------
********** Iteration 95 ************
----------------------------------------
|               Time |             641 |
|         Loss Delta |         -0.0138 |
|         Loss After |         -0.0384 |
|          Iteration |              95 |
|      AverageReturn |           -4.66 |
|          StdReturn |            36.3 |
|          MaxReturn |            58.6 |
|          MinReturn |           -83.7 |
|          EpLenMean |             151 |
|           EpLenStd |               0 |
| TimestepsThisBatch |        5.13e+03 |
|     TimestepsSoFar

In [None]:
runmain('InvertedPendulum-v1', 
        n_iter = 100, 
        batch_size = 5000 , 
        n_experiments = 1,  
        dont_normalize_advantages = False,
        reward_to_go = True,
        nn_baseline = True,
        exp_name = 'sb_bl_rtg_dna',
        n_layers = 2)

In [None]:
"""
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vpg')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--reward_to_go', '-rtg', action='store_true')
    parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
    parser.add_argument('--nn_baseline', '-bl', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--n_layers', '-l', type=int, default=1)
    parser.add_argument('--size', '-s', type=int, default=32)
    args = parser.parse_args()

    if not(os.path.exists('data')):
        os.makedirs('data')
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    if not(os.path.exists(logdir)):
        os.makedirs(logdir)

    max_path_length = args.ep_len if args.ep_len > 0 else None

    for e in range(args.n_experiments):
        seed = args.seed + 10*e
        print('Running experiment with seed %d'%seed)
        def train_func():
            train_PG(
                exp_name=args.exp_name,
                env_name=args.env_name,
                n_iter=args.n_iter,
                gamma=args.discount,
                min_timesteps_per_batch=args.batch_size,
                max_path_length=max_path_length,
                learning_rate=args.learning_rate,
                reward_to_go=args.reward_to_go,
                animate=args.render,
                logdir=os.path.join(logdir,'%d'%seed),
                normalize_advantages=not(args.dont_normalize_advantages),
                nn_baseline=args.nn_baseline, 
                seed=seed,
                n_layers=args.n_layers,
                size=args.size
                )
        # Awkward hacky process runs, because Tensorflow does not like
        # repeatedly calling train_PG in the same thread.
        p = Process(target=train_func, args=tuple())
        p.start()
        p.join()
        

if __name__ == "__main__":
    main()
"""