In [1]:
import numpy as np
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import MlpPolicy

from imitation.algorithms.adversarial.gail import GAIL
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.util.util import make_vec_env

# Single seed reused throughout the notebook: environment RNGs, expert rollout
# sampling, learner construction and the env.seed() calls before evaluation.
SEED = 42

2024-06-28 05:58:17.194209: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
def _with_rollout_info(single_env, _env_index):
    """Wrap one environment in ``RolloutInfoWrapper`` (used to compute rollouts)."""
    return RolloutInfoWrapper(single_env)


# Vectorised seals CartPole environment shared by the expert and the learner.
env = make_vec_env(
    "seals:seals/CartPole-v0",
    n_envs=8,
    rng=np.random.default_rng(SEED),
    post_wrappers=[_with_rollout_info],
)

# Expert policy fetched through the "ppo-huggingface" loader.
expert = load_policy(
    "ppo-huggingface",
    venv=env,
    organization="HumanCompatibleAI",
    env_name="seals-CartPole-v0",
)

# Collect expert demonstrations; sampling stops on an episode count
# (min_episodes=60) rather than on a timestep count.
sample_until = rollout.make_sample_until(min_timesteps=None, min_episodes=60)
rollouts = rollout.rollout(
    expert,
    env,
    sample_until,
    rng=np.random.default_rng(SEED),
)

# PPO model that GAIL uses as its generator (passed below as `gen_algo`).
learner = PPO(
    policy=MlpPolicy,
    env=env,
    seed=SEED,
    learning_rate=0.0004,
    gamma=0.95,
    ent_coef=0.0,
    batch_size=64,
    n_epochs=5,
)

# Reward network built from the environment's spaces, with RunningNorm as the
# input normalisation layer.
reward_net = BasicRewardNet(
    env.observation_space,
    env.action_space,
    normalize_input_layer=RunningNorm,
)

# GAIL trainer tying together demonstrations, generator and reward network.
gail_trainer = GAIL(
    venv=env,
    gen_algo=learner,
    reward_net=reward_net,
    demonstrations=rollouts,
    demo_batch_size=180,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=8,
)


In [6]:
# Baseline: score the untrained learner over 100 evaluation episodes.
env.seed(SEED)
learner_rewards_before_training, _ = evaluate_policy(
    model=learner,
    env=env,
    n_eval_episodes=100,
    return_episode_rewards=True,
)

# Run GAIL training, then score the learner again under the same seed.
gail_trainer.train(200_000)  # Train for 800_000 steps to match expert.
env.seed(SEED)
learner_rewards_after_training, _ = evaluate_policy(
    model=learner,
    env=env,
    n_eval_episodes=100,
    return_episode_rewards=True,
)

# Report the mean episode reward for both evaluations.
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

round:   0%|          | 0/12 [00:00<?, ?it/s]

------------------------------------------
| raw/                        |          |
|    gen/rollout/ep_len_mean  | 500      |
|    gen/rollout/ep_rew_mean  | 34.4     |
|    gen/time/fps             | 5466     |
|    gen/time/iterations      | 1        |
|    gen/time/time_elapsed    | 2        |
|    gen/time/total_timesteps | 16384    |
------------------------------------------
--------------------------------------------------
| raw/                                |          |
|    disc/disc_acc                    | 0.5      |
|    disc/disc_acc_expert             | 0        |
|    disc/disc_acc_gen                | 1        |
|    disc/disc_entropy                | 0.69     |
|    disc/disc_loss                   | 0.685    |
|    disc/disc_proportion_expert_pred | 0        |
|    disc/disc_proportion_expert_true | 0.5      |
|    disc/global_step                 | 1        |
|    disc/n_expert                    | 1.02e+03 |
|    disc/n_generated                 | 1.02e+03 |
-

round:   8%|▊         | 1/12 [00:06<01:07,  6.14s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 35.3        |
|    gen/rollout/ep_rew_wrapped_mean | 270         |
|    gen/time/fps                    | 5444        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 3           |
|    gen/time/total_timesteps        | 32768       |
|    gen/train/approx_kl             | 0.006985818 |
|    gen/train/clip_fraction         | 0.0338      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.687      |
|    gen/train/explained_variance    | 0.0555      |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0315      |
|    gen/train/n_updates             | 5           |
|    gen/train/policy_gradient_loss  | -0.00158    |
|    gen/train/value_loss            | 4.7    

round:  17%|█▋        | 2/12 [00:12<01:01,  6.14s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 33.2        |
|    gen/rollout/ep_rew_wrapped_mean | 284         |
|    gen/time/fps                    | 5634        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 49152       |
|    gen/train/approx_kl             | 0.008708343 |
|    gen/train/clip_fraction         | 0.0647      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.677      |
|    gen/train/explained_variance    | 0.728       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0786      |
|    gen/train/n_updates             | 10          |
|    gen/train/policy_gradient_loss  | -0.00238    |
|    gen/train/value_loss            | 0.26   

round:  25%|██▌       | 3/12 [00:18<00:54,  6.09s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 35.9        |
|    gen/rollout/ep_rew_wrapped_mean | 278         |
|    gen/time/fps                    | 5577        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 65536       |
|    gen/train/approx_kl             | 0.008885191 |
|    gen/train/clip_fraction         | 0.0534      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.677      |
|    gen/train/explained_variance    | 0.883       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0317      |
|    gen/train/n_updates             | 15          |
|    gen/train/policy_gradient_loss  | -0.00414    |
|    gen/train/value_loss            | 0.0468 

round:  33%|███▎      | 4/12 [00:24<00:48,  6.03s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 500        |
|    gen/rollout/ep_rew_mean         | 40         |
|    gen/rollout/ep_rew_wrapped_mean | 276        |
|    gen/time/fps                    | 5595       |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 2          |
|    gen/time/total_timesteps        | 81920      |
|    gen/train/approx_kl             | 0.01323979 |
|    gen/train/clip_fraction         | 0.131      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -0.662     |
|    gen/train/explained_variance    | 0.936      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | -0.00448   |
|    gen/train/n_updates             | 20         |
|    gen/train/policy_gradient_loss  | -0.0133    |
|    gen/train/value_loss            | 0.0171     |
------------

round:  42%|████▏     | 5/12 [00:30<00:42,  6.05s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 47.5        |
|    gen/rollout/ep_rew_wrapped_mean | 273         |
|    gen/time/fps                    | 5560        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 98304       |
|    gen/train/approx_kl             | 0.013039661 |
|    gen/train/clip_fraction         | 0.158       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.647      |
|    gen/train/explained_variance    | 0.912       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.0258     |
|    gen/train/n_updates             | 25          |
|    gen/train/policy_gradient_loss  | -0.0157     |
|    gen/train/value_loss            | 0.0174 

round:  50%|█████     | 6/12 [00:36<00:36,  6.07s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 54.3        |
|    gen/rollout/ep_rew_wrapped_mean | 278         |
|    gen/time/fps                    | 5260        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 3           |
|    gen/time/total_timesteps        | 114688      |
|    gen/train/approx_kl             | 0.015362133 |
|    gen/train/clip_fraction         | 0.15        |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.597      |
|    gen/train/explained_variance    | 0.919       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.00624     |
|    gen/train/n_updates             | 30          |
|    gen/train/policy_gradient_loss  | -0.0108     |
|    gen/train/value_loss            | 0.0179 

round:  58%|█████▊    | 7/12 [00:42<00:30,  6.09s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 58.9        |
|    gen/rollout/ep_rew_wrapped_mean | 279         |
|    gen/time/fps                    | 5581        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 131072      |
|    gen/train/approx_kl             | 0.010625873 |
|    gen/train/clip_fraction         | 0.0907      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.574      |
|    gen/train/explained_variance    | 0.933       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.0143     |
|    gen/train/n_updates             | 35          |
|    gen/train/policy_gradient_loss  | -0.00618    |
|    gen/train/value_loss            | 0.0219 

round:  67%|██████▋   | 8/12 [00:48<00:24,  6.07s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 66.3        |
|    gen/rollout/ep_rew_wrapped_mean | 275         |
|    gen/time/fps                    | 5572        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 147456      |
|    gen/train/approx_kl             | 0.008240618 |
|    gen/train/clip_fraction         | 0.0858      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.57       |
|    gen/train/explained_variance    | 0.952       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.000247   |
|    gen/train/n_updates             | 40          |
|    gen/train/policy_gradient_loss  | -0.00468    |
|    gen/train/value_loss            | 0.0248 

round:  75%|███████▌  | 9/12 [00:54<00:18,  6.07s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 74.2        |
|    gen/rollout/ep_rew_wrapped_mean | 272         |
|    gen/time/fps                    | 5516        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 163840      |
|    gen/train/approx_kl             | 0.008478992 |
|    gen/train/clip_fraction         | 0.0938      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.589      |
|    gen/train/explained_variance    | 0.957       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.00986     |
|    gen/train/n_updates             | 45          |
|    gen/train/policy_gradient_loss  | -0.00452    |
|    gen/train/value_loss            | 0.0287 

round:  83%|████████▎ | 10/12 [01:00<00:12,  6.05s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 500          |
|    gen/rollout/ep_rew_mean         | 85.2         |
|    gen/rollout/ep_rew_wrapped_mean | 274          |
|    gen/time/fps                    | 5557         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 180224       |
|    gen/train/approx_kl             | 0.0068827113 |
|    gen/train/clip_fraction         | 0.0802       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -0.599       |
|    gen/train/explained_variance    | 0.96         |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.0102       |
|    gen/train/n_updates             | 50           |
|    gen/train/policy_gradient_loss  | -0.00378     |
|    gen/train/value_loss   

round:  92%|█████████▏| 11/12 [01:06<00:06,  6.03s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 500         |
|    gen/rollout/ep_rew_mean         | 91.3        |
|    gen/rollout/ep_rew_wrapped_mean | 272         |
|    gen/time/fps                    | 5564        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 196608      |
|    gen/train/approx_kl             | 0.008205008 |
|    gen/train/clip_fraction         | 0.0982      |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -0.601      |
|    gen/train/explained_variance    | 0.935       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0091      |
|    gen/train/n_updates             | 55          |
|    gen/train/policy_gradient_loss  | -0.00593    |
|    gen/train/value_loss            | 0.0442 

round: 100%|██████████| 12/12 [01:12<00:00,  6.06s/it]


mean reward after training: 321.46
mean reward before training: 102.6


In [7]:
import datasets
from imitation.data import huggingface_utils

# Download expert trajectories from the HuggingFace Datasets Hub
# (repository "HumanCompatibleAI/ppo-Pendulum-v1"; presumably rollouts of a PPO
# expert on Pendulum-v1 -- confirm against the dataset card).
dataset = datasets.load_dataset("HumanCompatibleAI/ppo-Pendulum-v1")

# Wrap the "train" split so the imitation library can consume it as a sequence
# of trajectories (it is indexed and passed to rollout_stats / GAIL later on).
expert_trajectories = huggingface_utils.TrajectoryDatasetSequence(dataset["train"])

Downloading readme:   0%|          | 0.00/536 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/940k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
from imitation.data import rollout

# Summary statistics of the expert demonstrations (count, mean length, mean return).
trajectory_stats = rollout.rollout_stats(expert_trajectories)

# The placeholders are filled straight from the stats mapping by key.
print(
    (
        "We have {n_traj} trajectories. "
        "The average length of each trajectory is {len_mean}. "
        "The average return of each trajectory is {return_mean}."
    ).format_map(trajectory_stats)
)

We have 200 trajectories. The average length of each trajectory is 200.0. The average return of each trajectory is -205.22814517737746.


In [11]:
# Inspect the shape of the action array of the first expert trajectory.
expert_trajectories[0].acts.shape

(200, 1)

In [12]:
# Vectorised Pendulum-v1 environment for the second experiment, seeded from SEED.
# NOTE(review): unlike the CartPole environment, n_envs and post_wrappers are
# not passed here, so the library defaults apply -- confirm that is intended.
venv = make_vec_env(
    "Pendulum-v1",
    rng=np.random.default_rng(seed=SEED),
)

In [15]:
from stable_baselines3 import sac


# Generator for the Pendulum experiment: SAC built with the "MlpPolicy" string.
# NOTE(review): the saved output of this cell is
# "AttributeError: 'ActorCriticPolicy' object has no attribute 'actor'".
# Presumably it comes from a run that passed the PPO `MlpPolicy` class imported
# at the top of the notebook rather than the string -- re-run on a fresh kernel
# to confirm.
# NOTE(review): this cell rebinds `learner`, `reward_net` and `gail_trainer`,
# replacing the objects created for the CartPole experiment.
learner = sac.SAC(
    env=venv,
    policy="MlpPolicy",
    batch_size=64,
    gamma=0.95,
    seed=SEED,
)
# Reward network built from the Pendulum environment's spaces, with RunningNorm
# as the input normalisation layer.
reward_net = BasicRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# GAIL trainer using the downloaded expert trajectories as demonstrations.
gail_trainer = GAIL(
    demonstrations=expert_trajectories,
    demo_batch_size=1024,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=8,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)


AttributeError: 'ActorCriticPolicy' object has no attribute 'actor'

In [14]:

# Evaluate the learner before training. Seed the environment that is actually
# being evaluated (`venv`) so both evaluations start from the same seed.
venv.seed(SEED)
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 100, return_episode_rewards=True,
)

# train the learner and evaluate again
gail_trainer.train(200_000)  # Train for 800_000 steps to match expert.
# Re-seed `venv`, not the CartPole `env`: seeding `env` has no effect on this
# evaluation and leaves the Pendulum episodes unseeded.
venv.seed(SEED)
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 100, return_episode_rewards=True,
)

print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

round:   0%|          | 0/12 [00:00<?, ?it/s]

-------------------------------------------
| raw/                        |           |
|    gen/rollout/ep_len_mean  | 200       |
|    gen/rollout/ep_rew_mean  | -1.19e+03 |
|    gen/time/fps             | 6161      |
|    gen/time/iterations      | 1         |
|    gen/time/time_elapsed    | 2         |
|    gen/time/total_timesteps | 16384     |
-------------------------------------------
--------------------------------------------------
| raw/                                |          |
|    disc/disc_acc                    | 0.499    |
|    disc/disc_acc_expert             | 0        |
|    disc/disc_acc_gen                | 0.997    |
|    disc/disc_entropy                | 0.691    |
|    disc/disc_loss                   | 0.708    |
|    disc/disc_proportion_expert_pred | 0.00146  |
|    disc/disc_proportion_expert_true | 0.5      |
|    disc/global_step                 | 1        |
|    disc/n_expert                    | 1.02e+03 |
|    disc/n_generated                 | 1.0

round:   8%|▊         | 1/12 [00:05<01:02,  5.72s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 200          |
|    gen/rollout/ep_rew_mean         | -1.23e+03    |
|    gen/rollout/ep_rew_wrapped_mean | 131          |
|    gen/time/fps                    | 6182         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 32768        |
|    gen/train/approx_kl             | 0.0021170615 |
|    gen/train/clip_fraction         | 0.0177       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -1.42        |
|    gen/train/explained_variance    | -0.012       |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.139        |
|    gen/train/n_updates             | 5            |
|    gen/train/policy_gradient_loss  | -0.000286    |
|    gen/train/std          

round:  17%|█▋        | 2/12 [00:11<00:59,  5.97s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 200          |
|    gen/rollout/ep_rew_mean         | -1.25e+03    |
|    gen/rollout/ep_rew_wrapped_mean | 128          |
|    gen/time/fps                    | 6196         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 49152        |
|    gen/train/approx_kl             | 0.0027575726 |
|    gen/train/clip_fraction         | 0.0178       |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -1.43        |
|    gen/train/explained_variance    | -0.00838     |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.0869       |
|    gen/train/n_updates             | 10           |
|    gen/train/policy_gradient_loss  | -0.000509    |
|    gen/train/std          

round:  25%|██▌       | 3/12 [00:17<00:52,  5.87s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 200          |
|    gen/rollout/ep_rew_mean         | -1.19e+03    |
|    gen/rollout/ep_rew_wrapped_mean | 125          |
|    gen/time/fps                    | 6192         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 65536        |
|    gen/train/approx_kl             | 0.0104048215 |
|    gen/train/clip_fraction         | 0.116        |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -1.4         |
|    gen/train/explained_variance    | 0.195        |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.0212       |
|    gen/train/n_updates             | 15           |
|    gen/train/policy_gradient_loss  | -0.00974     |
|    gen/train/std          

round:  33%|███▎      | 4/12 [00:23<00:46,  5.80s/it]

---------------------------------------------------
| raw/                               |            |
|    gen/rollout/ep_len_mean         | 200        |
|    gen/rollout/ep_rew_mean         | -1.16e+03  |
|    gen/rollout/ep_rew_wrapped_mean | 122        |
|    gen/time/fps                    | 6130       |
|    gen/time/iterations             | 1          |
|    gen/time/time_elapsed           | 2          |
|    gen/time/total_timesteps        | 81920      |
|    gen/train/approx_kl             | 0.01466484 |
|    gen/train/clip_fraction         | 0.175      |
|    gen/train/clip_range            | 0.2        |
|    gen/train/entropy_loss          | -1.35      |
|    gen/train/explained_variance    | 0.591      |
|    gen/train/learning_rate         | 0.0004     |
|    gen/train/loss                  | -0.0322    |
|    gen/train/n_updates             | 20         |
|    gen/train/policy_gradient_loss  | -0.0198    |
|    gen/train/std                   | 0.929      |
|    gen/tra

round:  42%|████▏     | 5/12 [00:29<00:40,  5.77s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 200         |
|    gen/rollout/ep_rew_mean         | -1.22e+03   |
|    gen/rollout/ep_rew_wrapped_mean | 118         |
|    gen/time/fps                    | 6170        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 98304       |
|    gen/train/approx_kl             | 0.013843885 |
|    gen/train/clip_fraction         | 0.168       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -1.31       |
|    gen/train/explained_variance    | 0.736       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.00124    |
|    gen/train/n_updates             | 25          |
|    gen/train/policy_gradient_loss  | -0.019      |
|    gen/train/std                   | 0.891  

round:  50%|█████     | 6/12 [00:34<00:34,  5.75s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 200         |
|    gen/rollout/ep_rew_mean         | -1.2e+03    |
|    gen/rollout/ep_rew_wrapped_mean | 113         |
|    gen/time/fps                    | 6255        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 114688      |
|    gen/train/approx_kl             | 0.016044566 |
|    gen/train/clip_fraction         | 0.176       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -1.26       |
|    gen/train/explained_variance    | 0.793       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | -0.022      |
|    gen/train/n_updates             | 30          |
|    gen/train/policy_gradient_loss  | -0.0179     |
|    gen/train/std                   | 0.843  

round:  58%|█████▊    | 7/12 [00:40<00:29,  5.87s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 200         |
|    gen/rollout/ep_rew_mean         | -1.22e+03   |
|    gen/rollout/ep_rew_wrapped_mean | 107         |
|    gen/time/fps                    | 6293        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 131072      |
|    gen/train/approx_kl             | 0.011361226 |
|    gen/train/clip_fraction         | 0.126       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -1.22       |
|    gen/train/explained_variance    | 0.825       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.00492     |
|    gen/train/n_updates             | 35          |
|    gen/train/policy_gradient_loss  | -0.012      |
|    gen/train/std                   | 0.804  

round:  67%|██████▋   | 8/12 [00:47<00:23,  5.99s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 200         |
|    gen/rollout/ep_rew_mean         | -1.23e+03   |
|    gen/rollout/ep_rew_wrapped_mean | 97.8        |
|    gen/time/fps                    | 5884        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 147456      |
|    gen/train/approx_kl             | 0.011657476 |
|    gen/train/clip_fraction         | 0.126       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -1.16       |
|    gen/train/explained_variance    | 0.865       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.00553     |
|    gen/train/n_updates             | 40          |
|    gen/train/policy_gradient_loss  | -0.0112     |
|    gen/train/std                   | 0.765  

round:  75%|███████▌  | 9/12 [00:53<00:18,  6.12s/it]

-----------------------------------------------------
| raw/                               |              |
|    gen/rollout/ep_len_mean         | 200          |
|    gen/rollout/ep_rew_mean         | -1.19e+03    |
|    gen/rollout/ep_rew_wrapped_mean | 89.9         |
|    gen/time/fps                    | 5886         |
|    gen/time/iterations             | 1            |
|    gen/time/time_elapsed           | 2            |
|    gen/time/total_timesteps        | 163840       |
|    gen/train/approx_kl             | 0.0101738535 |
|    gen/train/clip_fraction         | 0.123        |
|    gen/train/clip_range            | 0.2          |
|    gen/train/entropy_loss          | -1.13        |
|    gen/train/explained_variance    | 0.884        |
|    gen/train/learning_rate         | 0.0004       |
|    gen/train/loss                  | 0.0268       |
|    gen/train/n_updates             | 45           |
|    gen/train/policy_gradient_loss  | -0.00924     |
|    gen/train/std          

round:  83%|████████▎ | 10/12 [00:59<00:12,  6.19s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 200         |
|    gen/rollout/ep_rew_mean         | -1.16e+03   |
|    gen/rollout/ep_rew_wrapped_mean | 82.6        |
|    gen/time/fps                    | 6232        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 180224      |
|    gen/train/approx_kl             | 0.009474072 |
|    gen/train/clip_fraction         | 0.117       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -1.11       |
|    gen/train/explained_variance    | 0.902       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.0945      |
|    gen/train/n_updates             | 50          |
|    gen/train/policy_gradient_loss  | -0.00852    |
|    gen/train/std                   | 0.73   

round:  92%|█████████▏| 11/12 [01:06<00:06,  6.18s/it]

----------------------------------------------------
| raw/                               |             |
|    gen/rollout/ep_len_mean         | 200         |
|    gen/rollout/ep_rew_mean         | -1.1e+03    |
|    gen/rollout/ep_rew_wrapped_mean | 78.4        |
|    gen/time/fps                    | 6266        |
|    gen/time/iterations             | 1           |
|    gen/time/time_elapsed           | 2           |
|    gen/time/total_timesteps        | 196608      |
|    gen/train/approx_kl             | 0.009692896 |
|    gen/train/clip_fraction         | 0.113       |
|    gen/train/clip_range            | 0.2         |
|    gen/train/entropy_loss          | -1.08       |
|    gen/train/explained_variance    | 0.934       |
|    gen/train/learning_rate         | 0.0004      |
|    gen/train/loss                  | 0.137       |
|    gen/train/n_updates             | 55          |
|    gen/train/policy_gradient_loss  | -0.00927    |
|    gen/train/std                   | 0.71   

round: 100%|██████████| 12/12 [01:12<00:00,  6.01s/it]


mean reward after training: -1027.2936809
mean reward before training: -1208.5587643000001
