In [1]:
"""
Main script to run for training the RL models
"""
import os
import time
import random
from logger import create_logger
from rewards.reward_func import StepReward
import sympy as sp
import numpy as np
from observations.spaces import StringObs, ActionSpace
from envs.gym_env import PolyLogLinExpr
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.logger import configure
from model.rl_algorithms import TRPOModel
from model.monitor_utils import SaveOnBestTrainingRewardCallback, plot_results, plot_all_results, import_curriculum

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

if __name__ == '__main__':
    # Train one TRPO agent per seed on the polylogarithm simplification environment,
    # checkpoint the best model seen during training, and plot the reward curves.

    logger = create_logger('run.log')
    logger.info("============ Initialized logger ============")

    # Episode length, total training budget and how often (in timesteps) the
    # best-model callback inspects the training reward
    max_step = 50
    total_t_steps = 3000000
    check_freq = 30000

    # Smoothing window used for every reward plot
    plot_window = check_freq // 20

    # Reward function to use
    reward_func = StepReward(len_pen=0.00, num_terms_pen=0.0, active_pen=False, simple_reward=1, pen_cycle=False)

    # Use a sympy observation environment (observation size 512, seeded with a sample expression)
    x = sp.Symbol('x')
    expr0 = 2 * sp.polylog(2, x)
    obs_space = StringObs(expr0, 512, one_hot_encode=True, numeral_decomp=True, reduced_graph=True, prev_st_info=True)

    # Use a discrete action space with 4 actions
    act_space = ActionSpace(['inversion', 'reflection', 'cyclic', 'duplication'])

    # At each new episode we get a new equation
    random_start = True

    # If we want to do curriculum learning
    curriculum_dir = None

    # Define the parameters for the data generation if we desire to do it on the fly
    start_size = 13500  # Training set size
    len_simple = 0
    len_scr = 4
    num_scr = 7

    # Otherwise we can load a previously generated training set.
    # The location is machine specific, so it can be overridden with the
    # POLYLOG_TRAIN_DATA environment variable; the default is the original path.
    file_starts = os.environ.get('POLYLOG_TRAIN_DATA', '/home/stringer/下载/data_and_model/train_data.txt')

    random_start_args = {'random_start': random_start, 'start_size': start_size, 'len_simple': len_simple,
                         'len_scr': len_scr, 'num_scr': num_scr, 'file_starts': file_starts,
                         'curriculum': import_curriculum(curriculum_dir)}

    # Directory to store result
    log_dir = "log_dir"
    os.makedirs(log_dir, exist_ok=True)

    # RL Algorithm

    # Network architecture
    policy = 'MultiInputPolicy'
    shared_layers = [256]
    vf_layers = [128, 128, 64]
    pi_layers = [128, 128, 64]
    act_func = 'relu'

    # Agent parameters
    model_name = 'trpo'
    trpo_kwargs = {'gae_lambda': 0.9, 'gamma': 0.9, 'n_steps': 2048}
    ft_extrac = {'name': 'sage', 'params': {'embed_dim': 64, 'num_layers': 2, 'obs_space': obs_space,
                                            'bidirectional': True}}
    # ft_extrac = 'Flatten'

    num_experiments = 1
    times = []
    log_dirs = []

    # Can repeat experiment with different seeds
    for experiment in range(num_experiments):
        # The experiment index doubles as the seed for every random source used here
        np.random.seed(experiment)
        random.seed(experiment)
        rng = np.random.RandomState(experiment)

        # Path to store results: <log_dir>/<model_name>_exp_<seed>/
        # The empty last component keeps the trailing separator, in case helpers
        # build file names by plain string concatenation.
        log_link = os.path.join(log_dir, model_name + '_exp_{}'.format(experiment), '')
        log_dirs.append(log_link)
        os.makedirs(log_link, exist_ok=True)

        # Define the environment and check it
        env = Monitor(PolyLogLinExpr(max_step, reward_func, obs_space, act_space, rng,
                                     random_start_args=random_start_args, log_dir=log_link, gen='V2'), log_link)
        env.reset()

        # Define the agent
        base_model = TRPOModel(env, policy, shared_layers, vf_layers, pi_layers, act_func, feature_extractor=ft_extrac,
                               seed=experiment, one_hot=obs_space.one_hot_encode, **trpo_kwargs)

        # Configure the logger
        assert base_model.get_name() == model_name
        rl_model = base_model.get_model()
        logger.info(str(rl_model.policy))
        new_logger = configure(log_link, ["stdout", "csv"])
        rl_model.set_logger(new_logger)

        # Start the timer
        start = time.time()

        # Every check_freq timesteps, compare the recent mean training reward to the
        # best one so far and save the model when it improves
        eval_callback = SaveOnBestTrainingRewardCallback(check_freq=check_freq, log_dir=log_link)

        # Now start the training
        rl_model.learn(total_timesteps=total_t_steps, callback=eval_callback)

        elapsed = time.time() - start
        times.append(elapsed)
        logger.info("Training time for experiment {}: {:.1f}s".format(experiment, elapsed))

        # Save the final model
        rl_model.save(os.path.join(log_link, 'final_model'))

        # Release the monitor file handle before reading the results back
        env.close()

        # Plot the rewards as a function of time steps
        plot_results(log_link, window_size=plot_window)

    plot_all_results(log_dirs, os.path.join(log_dir, model_name + '_rewards.pdf'), window_size=plot_window)


INFO - 01/25/24 20:43:09 - 0:00:00 - We create a reward/penalty at each step
INFO - 01/25/24 20:43:09 - 0:00:00 - Reward function has a penalty each step. Number of terms times -0.0. We also penalize lengthy expressions with -0.0 per word/arg
INFO - 01/25/24 20:43:09 - 0:00:00 - Reward function also rewards each simplification with +1
INFO - 01/25/24 20:43:09 - 0:00:00 - Final expression not simplified are penalized with 0
INFO - 01/25/24 20:43:09 - 0:00:00 - Create an Observation space of size 512
INFO - 01/25/24 20:43:09 - 0:00:00 - We use a numeral decomposition for the integers
INFO - 01/25/24 20:43:09 - 0:00:00 - We use ['x'] variables
INFO - 01/25/24 20:43:09 - 0:00:00 - We use dict_keys(['add', 'mul', 'div', 'pow', 'polylog']) operators
INFO - 01/25/24 20:43:09 - 0:00:00 - We use ['INT+', 'INT-'] symbols
INFO - 01/25/24 20:43:09 - 0:00:00 - We use ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'] elements
INFO - 01/25/24 20:43:09 - 0:00:00 - The full list of words is ['pa

Using cuda device
Wrapping the env in a DummyVecEnv.


INFO - 01/25/24 20:55:22 - 0:12:12 - MultiInputActorCriticPolicy(
                                       (features_extractor): CombinedFeatureExtractor(
                                         (word_encoder): GSageEncoder(
                                           (embeddings): Identity()
                                           (gcn_forward): GraphSAGE(20, 64, num_layers=2)
                                           (gcn_fwd_state): GraphSAGE(20, 64, num_layers=2)
                                         )
                                       )
                                       (mlp_extractor): MlpExtractor(
                                         (shared_net): Sequential(
                                           (0): Linear(in_features=131, out_features=256, bias=True)
                                           (1): ReLU()
                                         )
                                         (policy_net): Sequential(
                                       

Logging to log_dirtrpo_exp_0/
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 45.8     |
|    ep_rew_mean     | 0.75     |
| time/              |          |
|    fps             | 23       |
|    iterations      | 1        |
|    time_elapsed    | 88       |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 43.2     |
|    ep_rew_mean            | 0.798    |
| time/                     |          |
|    fps                    | 17       |
|    iterations             | 2        |
|    time_elapsed           | 237      |
|    total_timesteps        | 4096     |
| train/                    |          |
|    explained_variance     | -2.21    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00603  |
|    learning_rate          | 0.001    |
|    n_updates              | 1        |
|    policy_obj

INFO - 01/25/24 21:33:00 - 0:49:50 - Num timesteps: 30000
INFO - 01/25/24 21:33:00 - 0:49:50 - Best mean reward: -inf - Last mean reward per episode: 0.93
INFO - 01/25/24 21:33:00 - 0:49:50 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 41.6     |
|    ep_rew_mean            | 0.99     |
| time/                     |          |
|    fps                    | 13       |
|    iterations             | 15       |
|    time_elapsed           | 2284     |
|    total_timesteps        | 30720    |
| train/                    |          |
|    explained_variance     | 0.0711   |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00552  |
|    learning_rate          | 0.001    |
|    n_updates              | 14       |
|    policy_objective       | 0.00423  |
|    value_loss             | 0.0771   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 40.7     |
|    ep_rew_mean            | 0.9      |
| time/                     |          |
|    fps                    | 13       |
|    iterations 

INFO - 01/25/24 22:03:43 - 1:20:34 - Num timesteps: 60000
INFO - 01/25/24 22:03:43 - 1:20:34 - Best mean reward: 0.93 - Last mean reward per episode: 1.06
INFO - 01/25/24 22:03:43 - 1:20:34 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 40.5     |
|    ep_rew_mean            | 0.96     |
| time/                     |          |
|    fps                    | 14       |
|    iterations             | 30       |
|    time_elapsed           | 4151     |
|    total_timesteps        | 61440    |
| train/                    |          |
|    explained_variance     | 0.0772   |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00688  |
|    learning_rate          | 0.001    |
|    n_updates              | 29       |
|    policy_objective       | 0.00711  |
|    value_loss             | 0.072    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 43       |
|    ep_rew_mean            | 0.94     |
| time/                     |          |
|    fps                    | 14       |
|    iterations 

INFO - 01/25/24 22:28:58 - 1:45:48 - Num timesteps: 90000
INFO - 01/25/24 22:28:58 - 1:45:48 - Best mean reward: 1.06 - Last mean reward per episode: 1.18
INFO - 01/25/24 22:28:58 - 1:45:48 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 39       |
|    ep_rew_mean            | 1.19     |
| time/                     |          |
|    fps                    | 16       |
|    iterations             | 44       |
|    time_elapsed           | 5617     |
|    total_timesteps        | 90112    |
| train/                    |          |
|    explained_variance     | 0.14     |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00868  |
|    learning_rate          | 0.001    |
|    n_updates              | 43       |
|    policy_objective       | 0.00476  |
|    value_loss             | 0.0616   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 38.5     |
|    ep_rew_mean            | 1.36     |
| time/                     |          |
|    fps                    | 16       |
|    iterations 

INFO - 01/25/24 22:46:57 - 2:03:48 - Num timesteps: 120000
INFO - 01/25/24 22:46:57 - 2:03:48 - Best mean reward: 1.18 - Last mean reward per episode: 1.08


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 41.4     |
|    ep_rew_mean            | 1.14     |
| time/                     |          |
|    fps                    | 18       |
|    iterations             | 59       |
|    time_elapsed           | 6709     |
|    total_timesteps        | 120832   |
| train/                    |          |
|    explained_variance     | 0.132    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00766  |
|    learning_rate          | 0.001    |
|    n_updates              | 58       |
|    policy_objective       | 0.00357  |
|    value_loss             | 0.0635   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 38.4     |
|    ep_rew_mean            | 1.05     |
| time/                     |          |
|    fps                    | 18       |
|    iterations 

INFO - 01/25/24 23:04:15 - 2:21:06 - Num timesteps: 150000
INFO - 01/25/24 23:04:15 - 2:21:06 - Best mean reward: 1.18 - Last mean reward per episode: 1.01


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 41.9     |
|    ep_rew_mean            | 0.89     |
| time/                     |          |
|    fps                    | 19       |
|    iterations             | 74       |
|    time_elapsed           | 7764     |
|    total_timesteps        | 151552   |
| train/                    |          |
|    explained_variance     | 0.301    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00937  |
|    learning_rate          | 0.001    |
|    n_updates              | 73       |
|    policy_objective       | 0.01     |
|    value_loss             | 0.0704   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 41       |
|    ep_rew_mean            | 0.96     |
| time/                     |          |
|    fps                    | 19       |
|    iterations 

INFO - 01/25/24 23:22:51 - 2:39:41 - Num timesteps: 180000
INFO - 01/25/24 23:22:51 - 2:39:41 - Best mean reward: 1.18 - Last mean reward per episode: 1.00


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 42.8     |
|    ep_rew_mean            | 0.96     |
| time/                     |          |
|    fps                    | 20       |
|    iterations             | 88       |
|    time_elapsed           | 8854     |
|    total_timesteps        | 180224   |
| train/                    |          |
|    explained_variance     | 0.176    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00621  |
|    learning_rate          | 0.001    |
|    n_updates              | 87       |
|    policy_objective       | 0.0164   |
|    value_loss             | 0.0545   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 44.1     |
|    ep_rew_mean            | 0.97     |
| time/                     |          |
|    fps                    | 20       |
|    iterations 

INFO - 01/25/24 23:40:32 - 2:57:23 - Num timesteps: 210000
INFO - 01/25/24 23:40:32 - 2:57:23 - Best mean reward: 1.18 - Last mean reward per episode: 1.19
INFO - 01/25/24 23:40:32 - 2:57:23 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 40.5     |
|    ep_rew_mean            | 1.1      |
| time/                     |          |
|    fps                    | 21       |
|    iterations             | 103      |
|    time_elapsed           | 9924     |
|    total_timesteps        | 210944   |
| train/                    |          |
|    explained_variance     | 0.139    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00578  |
|    learning_rate          | 0.001    |
|    n_updates              | 102      |
|    policy_objective       | 0.00245  |
|    value_loss             | 0.073    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 40.5     |
|    ep_rew_mean            | 1.06     |
| time/                     |          |
|    fps                    | 21       |
|    iterations 

INFO - 01/25/24 23:57:09 - 3:14:00 - Num timesteps: 240000
INFO - 01/25/24 23:57:09 - 3:14:00 - Best mean reward: 1.19 - Last mean reward per episode: 1.32
INFO - 01/25/24 23:57:09 - 3:14:00 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 40.3     |
|    ep_rew_mean            | 1.21     |
| time/                     |          |
|    fps                    | 22       |
|    iterations             | 118      |
|    time_elapsed           | 10934    |
|    total_timesteps        | 241664   |
| train/                    |          |
|    explained_variance     | 0.102    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00776  |
|    learning_rate          | 0.001    |
|    n_updates              | 117      |
|    policy_objective       | 0.0121   |
|    value_loss             | 0.107    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 39.4     |
|    ep_rew_mean            | 1.15     |
| time/                     |          |
|    fps                    | 22       |
|    iterations 

INFO - 01/26/24 00:14:19 - 3:31:10 - Num timesteps: 270000
INFO - 01/26/24 00:14:19 - 3:31:10 - Best mean reward: 1.32 - Last mean reward per episode: 1.22


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 37.4     |
|    ep_rew_mean            | 1.28     |
| time/                     |          |
|    fps                    | 22       |
|    iterations             | 132      |
|    time_elapsed           | 11942    |
|    total_timesteps        | 270336   |
| train/                    |          |
|    explained_variance     | 0.168    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00543  |
|    learning_rate          | 0.001    |
|    n_updates              | 131      |
|    policy_objective       | 0.019    |
|    value_loss             | 0.0744   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 36.3     |
|    ep_rew_mean            | 1.21     |
| time/                     |          |
|    fps                    | 22       |
|    iterations 

INFO - 01/26/24 00:31:43 - 3:48:34 - Num timesteps: 300000
INFO - 01/26/24 00:31:43 - 3:48:34 - Best mean reward: 1.32 - Last mean reward per episode: 1.16


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 37.6     |
|    ep_rew_mean            | 1.04     |
| time/                     |          |
|    fps                    | 23       |
|    iterations             | 147      |
|    time_elapsed           | 12999    |
|    total_timesteps        | 301056   |
| train/                    |          |
|    explained_variance     | 0.0904   |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00469  |
|    learning_rate          | 0.001    |
|    n_updates              | 146      |
|    policy_objective       | 0.0113   |
|    value_loss             | 0.0668   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 40.3     |
|    ep_rew_mean            | 1.02     |
| time/                     |          |
|    fps                    | 23       |
|    iterations 

INFO - 01/26/24 00:50:58 - 4:07:49 - Num timesteps: 330000
INFO - 01/26/24 00:50:58 - 4:07:49 - Best mean reward: 1.32 - Last mean reward per episode: 1.33
INFO - 01/26/24 00:50:58 - 4:07:49 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 35.9     |
|    ep_rew_mean            | 1.31     |
| time/                     |          |
|    fps                    | 23       |
|    iterations             | 162      |
|    time_elapsed           | 14178    |
|    total_timesteps        | 331776   |
| train/                    |          |
|    explained_variance     | 0.264    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.0096   |
|    learning_rate          | 0.001    |
|    n_updates              | 161      |
|    policy_objective       | 0.0114   |
|    value_loss             | 0.081    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 33.4     |
|    ep_rew_mean            | 1.31     |
| time/                     |          |
|    fps                    | 23       |
|    iterations 

INFO - 01/26/24 01:09:27 - 4:26:18 - Num timesteps: 360000
INFO - 01/26/24 01:09:27 - 4:26:18 - Best mean reward: 1.33 - Last mean reward per episode: 1.48
INFO - 01/26/24 01:09:27 - 4:26:18 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 34.7     |
|    ep_rew_mean            | 1.45     |
| time/                     |          |
|    fps                    | 23       |
|    iterations             | 176      |
|    time_elapsed           | 15254    |
|    total_timesteps        | 360448   |
| train/                    |          |
|    explained_variance     | 0.212    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00739  |
|    learning_rate          | 0.001    |
|    n_updates              | 175      |
|    policy_objective       | 0.0105   |
|    value_loss             | 0.107    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 36.5     |
|    ep_rew_mean            | 1.38     |
| time/                     |          |
|    fps                    | 23       |
|    iterations 

INFO - 01/26/24 01:29:11 - 4:46:02 - Num timesteps: 390000
INFO - 01/26/24 01:29:11 - 4:46:02 - Best mean reward: 1.48 - Last mean reward per episode: 1.02


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 35.5     |
|    ep_rew_mean            | 1.07     |
| time/                     |          |
|    fps                    | 23       |
|    iterations             | 191      |
|    time_elapsed           | 16457    |
|    total_timesteps        | 391168   |
| train/                    |          |
|    explained_variance     | 0.258    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00676  |
|    learning_rate          | 0.001    |
|    n_updates              | 190      |
|    policy_objective       | 0.00869  |
|    value_loss             | 0.0653   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 33.4     |
|    ep_rew_mean            | 1.41     |
| time/                     |          |
|    fps                    | 23       |
|    iterations 

INFO - 01/26/24 01:48:00 - 5:04:50 - Num timesteps: 420000
INFO - 01/26/24 01:48:00 - 5:04:50 - Best mean reward: 1.48 - Last mean reward per episode: 1.13


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 36.3     |
|    ep_rew_mean            | 1.47     |
| time/                     |          |
|    fps                    | 23       |
|    iterations             | 206      |
|    time_elapsed           | 17590    |
|    total_timesteps        | 421888   |
| train/                    |          |
|    explained_variance     | 0.095    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00945  |
|    learning_rate          | 0.001    |
|    n_updates              | 205      |
|    policy_objective       | 0.0162   |
|    value_loss             | 0.0799   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 35.3     |
|    ep_rew_mean            | 1.39     |
| time/                     |          |
|    fps                    | 23       |
|    iterations 

INFO - 01/26/24 02:05:39 - 5:22:30 - Num timesteps: 450000
INFO - 01/26/24 02:05:39 - 5:22:30 - Best mean reward: 1.48 - Last mean reward per episode: 1.44


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 38       |
|    ep_rew_mean            | 1.34     |
| time/                     |          |
|    fps                    | 24       |
|    iterations             | 220      |
|    time_elapsed           | 18631    |
|    total_timesteps        | 450560   |
| train/                    |          |
|    explained_variance     | 0.0575   |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00735  |
|    learning_rate          | 0.001    |
|    n_updates              | 219      |
|    policy_objective       | 0.00777  |
|    value_loss             | 0.107    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 32.4     |
|    ep_rew_mean            | 1.37     |
| time/                     |          |
|    fps                    | 24       |
|    iterations 

INFO - 01/26/24 02:24:29 - 5:41:20 - Num timesteps: 480000
INFO - 01/26/24 02:24:29 - 5:41:20 - Best mean reward: 1.48 - Last mean reward per episode: 1.65
INFO - 01/26/24 02:24:29 - 5:41:20 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 34.6     |
|    ep_rew_mean            | 1.5      |
| time/                     |          |
|    fps                    | 24       |
|    iterations             | 235      |
|    time_elapsed           | 19777    |
|    total_timesteps        | 481280   |
| train/                    |          |
|    explained_variance     | 0.299    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00825  |
|    learning_rate          | 0.001    |
|    n_updates              | 234      |
|    policy_objective       | 0.00972  |
|    value_loss             | 0.1      |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 35.5     |
|    ep_rew_mean            | 1.33     |
| time/                     |          |
|    fps                    | 24       |
|    iterations 

INFO - 01/26/24 02:44:03 - 6:00:53 - Num timesteps: 510000
INFO - 01/26/24 02:44:03 - 6:00:53 - Best mean reward: 1.65 - Last mean reward per episode: 1.64


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 31       |
|    ep_rew_mean            | 1.67     |
| time/                     |          |
|    fps                    | 24       |
|    iterations             | 250      |
|    time_elapsed           | 20957    |
|    total_timesteps        | 512000   |
| train/                    |          |
|    explained_variance     | 0.233    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00334  |
|    learning_rate          | 0.001    |
|    n_updates              | 249      |
|    policy_objective       | 0.00724  |
|    value_loss             | 0.131    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 34       |
|    ep_rew_mean            | 1.5      |
| time/                     |          |
|    fps                    | 24       |
|    iterations 

INFO - 01/26/24 03:02:46 - 6:19:36 - Num timesteps: 540000
INFO - 01/26/24 03:02:46 - 6:19:36 - Best mean reward: 1.65 - Last mean reward per episode: 1.50


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 33.1     |
|    ep_rew_mean            | 1.35     |
| time/                     |          |
|    fps                    | 24       |
|    iterations             | 264      |
|    time_elapsed           | 22058    |
|    total_timesteps        | 540672   |
| train/                    |          |
|    explained_variance     | 0.244    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00703  |
|    learning_rate          | 0.001    |
|    n_updates              | 263      |
|    policy_objective       | 0.0149   |
|    value_loss             | 0.14     |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 34       |
|    ep_rew_mean            | 1.28     |
| time/                     |          |
|    fps                    | 24       |
|    iterations 

INFO - 01/26/24 03:22:09 - 6:39:00 - Num timesteps: 570000
INFO - 01/26/24 03:22:09 - 6:39:00 - Best mean reward: 1.65 - Last mean reward per episode: 1.61


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 29.2     |
|    ep_rew_mean            | 1.65     |
| time/                     |          |
|    fps                    | 24       |
|    iterations             | 279      |
|    time_elapsed           | 23232    |
|    total_timesteps        | 571392   |
| train/                    |          |
|    explained_variance     | 0.224    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00858  |
|    learning_rate          | 0.001    |
|    n_updates              | 278      |
|    policy_objective       | 0.0132   |
|    value_loss             | 0.13     |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 33.5     |
|    ep_rew_mean            | 1.42     |
| time/                     |          |
|    fps                    | 24       |
|    iterations 

INFO - 01/26/24 03:39:31 - 6:56:22 - Num timesteps: 600000
INFO - 01/26/24 03:39:31 - 6:56:22 - Best mean reward: 1.65 - Last mean reward per episode: 1.37


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 36.7     |
|    ep_rew_mean            | 1.36     |
| time/                     |          |
|    fps                    | 24       |
|    iterations             | 293      |
|    time_elapsed           | 24250    |
|    total_timesteps        | 600064   |
| train/                    |          |
|    explained_variance     | 0.107    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00698  |
|    learning_rate          | 0.001    |
|    n_updates              | 292      |
|    policy_objective       | 0.0124   |
|    value_loss             | 0.0979   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 38.1     |
|    ep_rew_mean            | 1.38     |
| time/                     |          |
|    fps                    | 24       |
|    iterations 

INFO - 01/26/24 03:57:38 - 7:14:28 - Num timesteps: 630000
INFO - 01/26/24 03:57:38 - 7:14:28 - Best mean reward: 1.65 - Last mean reward per episode: 1.35


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 35.1     |
|    ep_rew_mean            | 1.4      |
| time/                     |          |
|    fps                    | 24       |
|    iterations             | 308      |
|    time_elapsed           | 25351    |
|    total_timesteps        | 630784   |
| train/                    |          |
|    explained_variance     | 0.154    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.0065   |
|    learning_rate          | 0.001    |
|    n_updates              | 307      |
|    policy_objective       | 0.00867  |
|    value_loss             | 0.111    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 34.3     |
|    ep_rew_mean            | 1.46     |
| time/                     |          |
|    fps                    | 24       |
|    iterations 

INFO - 01/26/24 04:16:30 - 7:33:20 - Num timesteps: 660000
INFO - 01/26/24 04:16:30 - 7:33:20 - Best mean reward: 1.65 - Last mean reward per episode: 1.66
INFO - 01/26/24 04:16:30 - 7:33:20 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 33       |
|    ep_rew_mean            | 1.66     |
| time/                     |          |
|    fps                    | 24       |
|    iterations             | 323      |
|    time_elapsed           | 26505    |
|    total_timesteps        | 661504   |
| train/                    |          |
|    explained_variance     | 0.202    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00102  |
|    learning_rate          | 0.001    |
|    n_updates              | 322      |
|    policy_objective       | 0.00671  |
|    value_loss             | 0.141    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 34.7     |
|    ep_rew_mean            | 1.39     |
| time/                     |          |
|    fps                    | 24       |
|    iterations 

INFO - 01/26/24 04:35:17 - 7:52:08 - Num timesteps: 690000
INFO - 01/26/24 04:35:17 - 7:52:08 - Best mean reward: 1.66 - Last mean reward per episode: 1.73
INFO - 01/26/24 04:35:17 - 7:52:08 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 34       |
|    ep_rew_mean            | 1.67     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 337      |
|    time_elapsed           | 27599    |
|    total_timesteps        | 690176   |
| train/                    |          |
|    explained_variance     | 0.11     |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00768  |
|    learning_rate          | 0.001    |
|    n_updates              | 336      |
|    policy_objective       | 0.0133   |
|    value_loss             | 0.152    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 34.1     |
|    ep_rew_mean            | 1.64     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 04:54:15 - 8:11:05 - Num timesteps: 720000
INFO - 01/26/24 04:54:15 - 8:11:05 - Best mean reward: 1.73 - Last mean reward per episode: 1.58


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 32.7     |
|    ep_rew_mean            | 1.64     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 352      |
|    time_elapsed           | 28753    |
|    total_timesteps        | 720896   |
| train/                    |          |
|    explained_variance     | 0.12     |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00698  |
|    learning_rate          | 0.001    |
|    n_updates              | 351      |
|    policy_objective       | 0.0111   |
|    value_loss             | 0.152    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 34.5     |
|    ep_rew_mean            | 1.65     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 05:12:21 - 8:29:12 - Num timesteps: 750000
INFO - 01/26/24 05:12:21 - 8:29:12 - Best mean reward: 1.73 - Last mean reward per episode: 1.83
INFO - 01/26/24 05:12:21 - 8:29:12 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 33       |
|    ep_rew_mean            | 1.62     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 367      |
|    time_elapsed           | 29854    |
|    total_timesteps        | 751616   |
| train/                    |          |
|    explained_variance     | 0.121    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00789  |
|    learning_rate          | 0.001    |
|    n_updates              | 366      |
|    policy_objective       | 0.0159   |
|    value_loss             | 0.163    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 29.1     |
|    ep_rew_mean            | 1.73     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 05:31:03 - 8:47:53 - Num timesteps: 780000
INFO - 01/26/24 05:31:03 - 8:47:53 - Best mean reward: 1.83 - Last mean reward per episode: 1.50


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 31.7     |
|    ep_rew_mean            | 1.56     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 381      |
|    time_elapsed           | 30946    |
|    total_timesteps        | 780288   |
| train/                    |          |
|    explained_variance     | 0.19     |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00887  |
|    learning_rate          | 0.001    |
|    n_updates              | 380      |
|    policy_objective       | 0.0137   |
|    value_loss             | 0.116    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 32.4     |
|    ep_rew_mean            | 1.59     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 05:50:09 - 9:07:00 - Num timesteps: 810000
INFO - 01/26/24 05:50:09 - 9:07:00 - Best mean reward: 1.83 - Last mean reward per episode: 1.39


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30       |
|    ep_rew_mean            | 1.6      |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 396      |
|    time_elapsed           | 32111    |
|    total_timesteps        | 811008   |
| train/                    |          |
|    explained_variance     | 0.231    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00804  |
|    learning_rate          | 0.001    |
|    n_updates              | 395      |
|    policy_objective       | 0.0184   |
|    value_loss             | 0.117    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30.1     |
|    ep_rew_mean            | 1.8      |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 06:09:11 - 9:26:01 - Num timesteps: 840000
INFO - 01/26/24 06:09:11 - 9:26:01 - Best mean reward: 1.83 - Last mean reward per episode: 1.86
INFO - 01/26/24 06:09:11 - 9:26:01 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.4     |
|    ep_rew_mean            | 1.78     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 411      |
|    time_elapsed           | 33265    |
|    total_timesteps        | 841728   |
| train/                    |          |
|    explained_variance     | 0.211    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00686  |
|    learning_rate          | 0.001    |
|    n_updates              | 410      |
|    policy_objective       | 0.0149   |
|    value_loss             | 0.15     |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 26.7     |
|    ep_rew_mean            | 1.83     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 06:27:11 - 9:44:01 - Num timesteps: 870000
INFO - 01/26/24 06:27:11 - 9:44:01 - Best mean reward: 1.86 - Last mean reward per episode: 1.71


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 31.5     |
|    ep_rew_mean            | 1.77     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 425      |
|    time_elapsed           | 34314    |
|    total_timesteps        | 870400   |
| train/                    |          |
|    explained_variance     | 0.206    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00966  |
|    learning_rate          | 0.001    |
|    n_updates              | 424      |
|    policy_objective       | 0.0142   |
|    value_loss             | 0.0897   |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30.1     |
|    ep_rew_mean            | 1.65     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 06:46:01 - 10:02:52 - Num timesteps: 900000
INFO - 01/26/24 06:46:01 - 10:02:52 - Best mean reward: 1.86 - Last mean reward per episode: 1.94
INFO - 01/26/24 06:46:01 - 10:02:52 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30.6     |
|    ep_rew_mean            | 1.88     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 440      |
|    time_elapsed           | 35467    |
|    total_timesteps        | 901120   |
| train/                    |          |
|    explained_variance     | 0.0737   |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00866  |
|    learning_rate          | 0.001    |
|    n_updates              | 439      |
|    policy_objective       | 0.0188   |
|    value_loss             | 0.165    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 29.7     |
|    ep_rew_mean            | 1.93     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 07:05:43 - 10:22:34 - Num timesteps: 930000
INFO - 01/26/24 07:05:43 - 10:22:34 - Best mean reward: 1.94 - Last mean reward per episode: 1.72


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 28.1     |
|    ep_rew_mean            | 1.81     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 455      |
|    time_elapsed           | 36659    |
|    total_timesteps        | 931840   |
| train/                    |          |
|    explained_variance     | 0.333    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.0084   |
|    learning_rate          | 0.001    |
|    n_updates              | 454      |
|    policy_objective       | 0.0145   |
|    value_loss             | 0.13     |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30.6     |
|    ep_rew_mean            | 1.84     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 07:24:48 - 10:41:39 - Num timesteps: 960000
INFO - 01/26/24 07:24:48 - 10:41:39 - Best mean reward: 1.94 - Last mean reward per episode: 1.65


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30.6     |
|    ep_rew_mean            | 1.58     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 469      |
|    time_elapsed           | 37775    |
|    total_timesteps        | 960512   |
| train/                    |          |
|    explained_variance     | 0.319    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00955  |
|    learning_rate          | 0.001    |
|    n_updates              | 468      |
|    policy_objective       | 0.0131   |
|    value_loss             | 0.139    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30.7     |
|    ep_rew_mean            | 1.74     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 07:44:19 - 11:01:10 - Num timesteps: 990000
INFO - 01/26/24 07:44:19 - 11:01:10 - Best mean reward: 1.94 - Last mean reward per episode: 1.65


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 29.4     |
|    ep_rew_mean            | 1.69     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 484      |
|    time_elapsed           | 38963    |
|    total_timesteps        | 991232   |
| train/                    |          |
|    explained_variance     | 0.341    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00834  |
|    learning_rate          | 0.001    |
|    n_updates              | 483      |
|    policy_objective       | 0.019    |
|    value_loss             | 0.121    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.1     |
|    ep_rew_mean            | 1.72     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 08:04:26 - 11:21:17 - Num timesteps: 1020000
INFO - 01/26/24 08:04:26 - 11:21:17 - Best mean reward: 1.94 - Last mean reward per episode: 1.74


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 26       |
|    ep_rew_mean            | 1.86     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 499      |
|    time_elapsed           | 40182    |
|    total_timesteps        | 1021952  |
| train/                    |          |
|    explained_variance     | 0.264    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.0085   |
|    learning_rate          | 0.001    |
|    n_updates              | 498      |
|    policy_objective       | 0.0128   |
|    value_loss             | 0.156    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30.1     |
|    ep_rew_mean            | 1.88     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 08:22:34 - 11:39:25 - Num timesteps: 1050000
INFO - 01/26/24 08:22:34 - 11:39:25 - Best mean reward: 1.94 - Last mean reward per episode: 1.87


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30.3     |
|    ep_rew_mean            | 1.86     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 513      |
|    time_elapsed           | 41243    |
|    total_timesteps        | 1050624  |
| train/                    |          |
|    explained_variance     | 0.142    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00755  |
|    learning_rate          | 0.001    |
|    n_updates              | 512      |
|    policy_objective       | 0.0263   |
|    value_loss             | 0.195    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 28.1     |
|    ep_rew_mean            | 1.82     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 08:41:01 - 11:57:52 - Num timesteps: 1080000
INFO - 01/26/24 08:41:01 - 11:57:52 - Best mean reward: 1.94 - Last mean reward per episode: 1.89


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 31.1     |
|    ep_rew_mean            | 1.94     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 528      |
|    time_elapsed           | 42367    |
|    total_timesteps        | 1081344  |
| train/                    |          |
|    explained_variance     | 0.25     |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.0089   |
|    learning_rate          | 0.001    |
|    n_updates              | 527      |
|    policy_objective       | 0.00852  |
|    value_loss             | 0.145    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 33       |
|    ep_rew_mean            | 1.78     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 08:59:45 - 12:16:35 - Num timesteps: 1110000
INFO - 01/26/24 08:59:45 - 12:16:35 - Best mean reward: 1.94 - Last mean reward per episode: 1.88


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27       |
|    ep_rew_mean            | 1.88     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 542      |
|    time_elapsed           | 43463    |
|    total_timesteps        | 1110016  |
| train/                    |          |
|    explained_variance     | 0.221    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00653  |
|    learning_rate          | 0.001    |
|    n_updates              | 541      |
|    policy_objective       | 0.0163   |
|    value_loss             | 0.17     |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.4     |
|    ep_rew_mean            | 1.75     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 09:19:05 - 12:35:56 - Num timesteps: 1140000
INFO - 01/26/24 09:19:05 - 12:35:56 - Best mean reward: 1.94 - Last mean reward per episode: 1.89


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 29       |
|    ep_rew_mean            | 1.84     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 557      |
|    time_elapsed           | 44642    |
|    total_timesteps        | 1140736  |
| train/                    |          |
|    explained_variance     | 0.255    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00946  |
|    learning_rate          | 0.001    |
|    n_updates              | 556      |
|    policy_objective       | 0.0163   |
|    value_loss             | 0.178    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 26.1     |
|    ep_rew_mean            | 1.9      |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 09:38:31 - 12:55:22 - Num timesteps: 1170000
INFO - 01/26/24 09:38:31 - 12:55:22 - Best mean reward: 1.94 - Last mean reward per episode: 1.91


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 30.6     |
|    ep_rew_mean            | 1.76     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 572      |
|    time_elapsed           | 45824    |
|    total_timesteps        | 1171456  |
| train/                    |          |
|    explained_variance     | 0.158    |
|    is_line_search_success | 0        |
|    kl_divergence_loss     | 0        |
|    learning_rate          | 0.001    |
|    n_updates              | 571      |
|    policy_objective       | 6.07e-05 |
|    value_loss             | 0.238    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 33.2     |
|    ep_rew_mean            | 1.75     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 09:56:37 - 13:13:28 - Num timesteps: 1200000
INFO - 01/26/24 09:56:37 - 13:13:28 - Best mean reward: 1.94 - Last mean reward per episode: 1.70


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 29.2     |
|    ep_rew_mean            | 1.68     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 586      |
|    time_elapsed           | 46876    |
|    total_timesteps        | 1200128  |
| train/                    |          |
|    explained_variance     | 0.373    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00989  |
|    learning_rate          | 0.001    |
|    n_updates              | 585      |
|    policy_objective       | 0.015    |
|    value_loss             | 0.145    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 28.2     |
|    ep_rew_mean            | 1.86     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 10:15:56 - 13:32:47 - Num timesteps: 1230000
INFO - 01/26/24 10:15:56 - 13:32:47 - Best mean reward: 1.94 - Last mean reward per episode: 1.59


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 31.9     |
|    ep_rew_mean            | 1.53     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 601      |
|    time_elapsed           | 48051    |
|    total_timesteps        | 1230848  |
| train/                    |          |
|    explained_variance     | 0.369    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00666  |
|    learning_rate          | 0.001    |
|    n_updates              | 600      |
|    policy_objective       | 0.0151   |
|    value_loss             | 0.115    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 28.6     |
|    ep_rew_mean            | 1.77     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 10:35:24 - 13:52:15 - Num timesteps: 1260000
INFO - 01/26/24 10:35:24 - 13:52:15 - Best mean reward: 1.94 - Last mean reward per episode: 1.60


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 28.5     |
|    ep_rew_mean            | 1.7      |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 616      |
|    time_elapsed           | 49234    |
|    total_timesteps        | 1261568  |
| train/                    |          |
|    explained_variance     | 0.275    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.0068   |
|    learning_rate          | 0.001    |
|    n_updates              | 615      |
|    policy_objective       | 0.0265   |
|    value_loss             | 0.153    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 29       |
|    ep_rew_mean            | 1.73     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 10:54:37 - 14:11:28 - Num timesteps: 1290000
INFO - 01/26/24 10:54:37 - 14:11:28 - Best mean reward: 1.94 - Last mean reward per episode: 1.69


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 31.1     |
|    ep_rew_mean            | 1.64     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 630      |
|    time_elapsed           | 50360    |
|    total_timesteps        | 1290240  |
| train/                    |          |
|    explained_variance     | 0.292    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00858  |
|    learning_rate          | 0.001    |
|    n_updates              | 629      |
|    policy_objective       | 0.0122   |
|    value_loss             | 0.151    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 31.9     |
|    ep_rew_mean            | 1.61     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 11:14:40 - 14:31:31 - Num timesteps: 1320000
INFO - 01/26/24 11:14:40 - 14:31:31 - Best mean reward: 1.94 - Last mean reward per episode: 1.89


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 25       |
|    ep_rew_mean            | 1.81     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 645      |
|    time_elapsed           | 51576    |
|    total_timesteps        | 1320960  |
| train/                    |          |
|    explained_variance     | 0.211    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00766  |
|    learning_rate          | 0.001    |
|    n_updates              | 644      |
|    policy_objective       | 0.0138   |
|    value_loss             | 0.191    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 26.9     |
|    ep_rew_mean            | 1.92     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 11:34:18 - 14:51:09 - Num timesteps: 1350000
INFO - 01/26/24 11:34:18 - 14:51:09 - Best mean reward: 1.94 - Last mean reward per episode: 1.82


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 25.4     |
|    ep_rew_mean            | 1.85     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 660      |
|    time_elapsed           | 52766    |
|    total_timesteps        | 1351680  |
| train/                    |          |
|    explained_variance     | 0.317    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00989  |
|    learning_rate          | 0.001    |
|    n_updates              | 659      |
|    policy_objective       | 0.0164   |
|    value_loss             | 0.183    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.9     |
|    ep_rew_mean            | 1.59     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 11:53:13 - 15:10:04 - Num timesteps: 1380000
INFO - 01/26/24 11:53:13 - 15:10:04 - Best mean reward: 1.94 - Last mean reward per episode: 1.81


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27       |
|    ep_rew_mean            | 1.92     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 674      |
|    time_elapsed           | 53876    |
|    total_timesteps        | 1380352  |
| train/                    |          |
|    explained_variance     | 0.407    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00715  |
|    learning_rate          | 0.001    |
|    n_updates              | 673      |
|    policy_objective       | 0.0158   |
|    value_loss             | 0.101    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 25.8     |
|    ep_rew_mean            | 1.89     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 12:13:40 - 15:30:31 - Num timesteps: 1410000
INFO - 01/26/24 12:13:40 - 15:30:31 - Best mean reward: 1.94 - Last mean reward per episode: 1.71


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 24.9     |
|    ep_rew_mean            | 1.79     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 689      |
|    time_elapsed           | 55121    |
|    total_timesteps        | 1411072  |
| train/                    |          |
|    explained_variance     | 0.297    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00848  |
|    learning_rate          | 0.001    |
|    n_updates              | 688      |
|    policy_objective       | 0.0172   |
|    value_loss             | 0.158    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 25.3     |
|    ep_rew_mean            | 1.69     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 12:33:58 - 15:50:48 - Num timesteps: 1440000
INFO - 01/26/24 12:33:58 - 15:50:48 - Best mean reward: 1.94 - Last mean reward per episode: 1.77


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 26.9     |
|    ep_rew_mean            | 1.69     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 704      |
|    time_elapsed           | 56361    |
|    total_timesteps        | 1441792  |
| train/                    |          |
|    explained_variance     | 0.457    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00858  |
|    learning_rate          | 0.001    |
|    n_updates              | 703      |
|    policy_objective       | 0.012    |
|    value_loss             | 0.135    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.9     |
|    ep_rew_mean            | 1.83     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 12:54:01 - 16:10:52 - Num timesteps: 1470000
INFO - 01/26/24 12:54:01 - 16:10:52 - Best mean reward: 1.94 - Last mean reward per episode: 1.59


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.4     |
|    ep_rew_mean            | 1.71     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 718      |
|    time_elapsed           | 57531    |
|    total_timesteps        | 1470464  |
| train/                    |          |
|    explained_variance     | 0.282    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00919  |
|    learning_rate          | 0.001    |
|    n_updates              | 717      |
|    policy_objective       | 0.0232   |
|    value_loss             | 0.162    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.7     |
|    ep_rew_mean            | 1.78     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 13:14:55 - 16:31:46 - Num timesteps: 1500000
INFO - 01/26/24 13:14:55 - 16:31:46 - Best mean reward: 1.94 - Last mean reward per episode: 2.00
INFO - 01/26/24 13:14:55 - 16:31:46 - Saving new best model to log_dirtrpo_exp_0/best_model.zip


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 24.3     |
|    ep_rew_mean            | 1.75     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 733      |
|    time_elapsed           | 58800    |
|    total_timesteps        | 1501184  |
| train/                    |          |
|    explained_variance     | 0.336    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.0078   |
|    learning_rate          | 0.001    |
|    n_updates              | 732      |
|    policy_objective       | 0.0172   |
|    value_loss             | 0.208    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 26.1     |
|    ep_rew_mean            | 1.75     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 13:36:08 - 16:52:59 - Num timesteps: 1530000
INFO - 01/26/24 13:36:08 - 16:52:59 - Best mean reward: 2.00 - Last mean reward per episode: 1.83


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 21.2     |
|    ep_rew_mean            | 1.71     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 748      |
|    time_elapsed           | 60084    |
|    total_timesteps        | 1531904  |
| train/                    |          |
|    explained_variance     | 0.368    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.0071   |
|    learning_rate          | 0.001    |
|    n_updates              | 747      |
|    policy_objective       | 0.011    |
|    value_loss             | 0.127    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 26.9     |
|    ep_rew_mean            | 1.46     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 13:56:40 - 17:13:30 - Num timesteps: 1560000
INFO - 01/26/24 13:56:40 - 17:13:30 - Best mean reward: 2.00 - Last mean reward per episode: 1.79


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 24.4     |
|    ep_rew_mean            | 1.62     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 762      |
|    time_elapsed           | 61289    |
|    total_timesteps        | 1560576  |
| train/                    |          |
|    explained_variance     | 0.317    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00908  |
|    learning_rate          | 0.001    |
|    n_updates              | 761      |
|    policy_objective       | 0.0155   |
|    value_loss             | 0.163    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.3     |
|    ep_rew_mean            | 1.68     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 14:18:06 - 17:34:57 - Num timesteps: 1590000
INFO - 01/26/24 14:18:06 - 17:34:57 - Best mean reward: 2.00 - Last mean reward per episode: 1.73


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.6     |
|    ep_rew_mean            | 1.77     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 777      |
|    time_elapsed           | 62597    |
|    total_timesteps        | 1591296  |
| train/                    |          |
|    explained_variance     | 0.264    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00893  |
|    learning_rate          | 0.001    |
|    n_updates              | 776      |
|    policy_objective       | 0.0166   |
|    value_loss             | 0.136    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 26.8     |
|    ep_rew_mean            | 1.73     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 14:39:00 - 17:55:50 - Num timesteps: 1620000
INFO - 01/26/24 14:39:00 - 17:55:50 - Best mean reward: 2.00 - Last mean reward per episode: 1.65


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 24.3     |
|    ep_rew_mean            | 1.91     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 792      |
|    time_elapsed           | 63852    |
|    total_timesteps        | 1622016  |
| train/                    |          |
|    explained_variance     | 0.348    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00776  |
|    learning_rate          | 0.001    |
|    n_updates              | 791      |
|    policy_objective       | 0.0128   |
|    value_loss             | 0.13     |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 28.1     |
|    ep_rew_mean            | 1.77     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 14:58:18 - 18:15:09 - Num timesteps: 1650000
INFO - 01/26/24 14:58:18 - 18:15:09 - Best mean reward: 2.00 - Last mean reward per episode: 1.78


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.5     |
|    ep_rew_mean            | 1.77     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 806      |
|    time_elapsed           | 64989    |
|    total_timesteps        | 1650688  |
| train/                    |          |
|    explained_variance     | 0.381    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00767  |
|    learning_rate          | 0.001    |
|    n_updates              | 805      |
|    policy_objective       | 0.0202   |
|    value_loss             | 0.158    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 29.4     |
|    ep_rew_mean            | 1.68     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 15:19:26 - 18:36:17 - Num timesteps: 1680000
INFO - 01/26/24 15:19:26 - 18:36:17 - Best mean reward: 2.00 - Last mean reward per episode: 1.66


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 25.8     |
|    ep_rew_mean            | 1.64     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 821      |
|    time_elapsed           | 66273    |
|    total_timesteps        | 1681408  |
| train/                    |          |
|    explained_variance     | 0.283    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00782  |
|    learning_rate          | 0.001    |
|    n_updates              | 820      |
|    policy_objective       | 0.0167   |
|    value_loss             | 0.154    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 28.7     |
|    ep_rew_mean            | 1.76     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 15:39:40 - 18:56:31 - Num timesteps: 1710000
INFO - 01/26/24 15:39:40 - 18:56:31 - Best mean reward: 2.00 - Last mean reward per episode: 1.72


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 26.6     |
|    ep_rew_mean            | 1.75     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 835      |
|    time_elapsed           | 67460    |
|    total_timesteps        | 1710080  |
| train/                    |          |
|    explained_variance     | 0.433    |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00956  |
|    learning_rate          | 0.001    |
|    n_updates              | 834      |
|    policy_objective       | 0.0161   |
|    value_loss             | 0.145    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 28       |
|    ep_rew_mean            | 1.8      |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

INFO - 01/26/24 16:00:46 - 19:17:36 - Num timesteps: 1740000
INFO - 01/26/24 16:00:46 - 19:17:36 - Best mean reward: 2.00 - Last mean reward per episode: 1.66


----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 23.1     |
|    ep_rew_mean            | 1.85     |
| time/                     |          |
|    fps                    | 25       |
|    iterations             | 850      |
|    time_elapsed           | 68739    |
|    total_timesteps        | 1740800  |
| train/                    |          |
|    explained_variance     | 0.34     |
|    is_line_search_success | 1        |
|    kl_divergence_loss     | 0.00827  |
|    learning_rate          | 0.001    |
|    n_updates              | 849      |
|    policy_objective       | 0.0129   |
|    value_loss             | 0.149    |
----------------------------------------
----------------------------------------
| rollout/                  |          |
|    ep_len_mean            | 27.1     |
|    ep_rew_mean            | 1.98     |
| time/                     |          |
|    fps                    | 25       |
|    iterations 

KeyboardInterrupt: 