In [None]:
#########################################################################
## COMPROBAR GPU ASIGNADA EN COLABORATORY
#########################################################################
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
#########################################################################
## LIBRERIAS NECESARIAS
#########################################################################
import tensorflow as tf
import gymnasium as gym
import sinergym 
from sinergym.utils.wrappers import (LoggerWrapper, NormalizeAction,
                                     NormalizeObservation) 
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import keras


# Librerias necesarias para BC
from stable_baselines3.common.evaluation import evaluate_policy

from imitation.algorithms import bc
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env
# Problema en rollout: es la función que define las transiciones expertas
import imitation.data.rollout as rollout 
from stable_baselines3.common.vec_env import DummyVecEnv

# 5ZONE

## Creamos un vector de entornos

In [None]:
def _make_env():
    """Build the wrapped ``Eplus-5zone-hot-discrete-v1`` environment.

    The environment created by ``gym.make`` is wrapped first with
    ``NormalizeObservation`` and then with ``LoggerWrapper``.

    Returns:
        The ``LoggerWrapper``-wrapped environment.
    """
    return LoggerWrapper(
        NormalizeObservation(gym.make("Eplus-5zone-hot-discrete-v1"))
    )


# Vectorized environment made of a single environment factory.
venv = DummyVecEnv([_make_env])



## Creación de demostraciones expertas 

In [None]:
# Stand-alone (non-vectorized) environment with the same wrapper chain:
# gym.make -> NormalizeObservation -> LoggerWrapper.
env = LoggerWrapper(
    NormalizeObservation(gym.make("Eplus-5zone-hot-discrete-v1"))
)



In [None]:
# Returns the expert policy
def download_expert():
    """Load the pretrained PPO expert stored in ``model5zone.zip``.

    The checkpoint is read from the relative path ``model5zone.zip``. The
    policy is bound to the module-level ``venv`` (a ``DummyVecEnv``), which is
    the same vectorized environment the expert is later rolled out in.
    ``load_policy`` takes a vectorized environment for its ``venv`` argument,
    so the bare gymnasium ``env`` is not passed here.

    Returns:
        The expert policy object returned by ``load_policy``.
    """
    print("Downloading a pretrained expert.")
    expert = load_policy(
        "ppo",
        path="model5zone.zip",
        venv=venv,
    )
    return expert

# Returns flattened transitions sampled from the expert policy
def sample_expert_transitions(min_episodes=1, rng=None):
    """Roll the expert policy out in ``venv`` and return its transitions.

    Args:
        min_episodes: Minimum number of complete episodes to collect.
            Defaults to 1, the value previously hard-coded.
        rng: Optional ``numpy.random.Generator`` handed to the rollout. When
            ``None`` (default) a fresh, unseeded generator is created, as
            before. Pass ``np.random.default_rng(seed)`` for reproducible
            demonstrations.

    Returns:
        The collected trajectories flattened by
        ``rollout.flatten_trajectories``.
    """
    # Load the expert policy
    expert = download_expert()

    print("Sampling expert transitions.")

    if rng is None:
        rng = np.random.default_rng()

    # Collect trajectories by running the given policy in the environment
    rollouts = rollout.rollout(
        expert,  # policy
        venv,    # environment
        sample_until=rollout.make_sample_until(
            min_timesteps=None, min_episodes=min_episodes
        ),
        rng=rng,
        # NOTE(review): `_make_env` does not apply RolloutInfoWrapper, so
        # unwrapping is presumably not available here; confirm.
        unwrap=False,
    )

    return rollout.flatten_trajectories(rollouts)


In [None]:
# Sample expert demonstrations; `transitions` is what the AIRL trainers below
# receive as `demonstrations`.
transitions = sample_expert_transitions()

Downloading a pretrained expert.
Sampling expert transitions.
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


## 3. **Adversarial Inverse Reinforcement Learning (AIRL)**

* Similar a GAIL
* Se diferencia en que cubre casos donde la función de recompensa es más generalizable a cambios dinámicos del entorno

In [None]:
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.util.networks import RunningNorm
from imitation.algorithms.adversarial.airl import AIRL
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

## Experimento 1

In [None]:
# Experiment 1 setup: fresh PPO learner, reward network and AIRL trainer.
# SEED is given to PPO here and to venv.seed() in the next cell.
SEED = 42

 
# Generator policy: a PPO agent on `venv` (passed to AIRL below as `gen_algo`).
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

# Reward model built from venv's observation/action spaces; its input layer
# is normalized with RunningNorm.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer wiring together the expert `transitions`, the PPO learner and
# the reward network on `venv`.
airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

In [None]:
# Seed the vectorized env before the baseline evaluation.
venv.seed(SEED)

# Baseline: per-episode rewards of the still-untrained learner (5 episodes).
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

# Train AIRL with a budget of 3504 timesteps.
# NOTE(review): train() presumably runs only whole generator rounds, so a
# budget that is not a multiple of the round size would be rounded down;
# confirm against imitation's AdversarialTrainer.train documentation.
airl_trainer.train(3504)  
# Re-seed with the same value so the second evaluation mirrors the baseline.
venv.seed(SEED)

# Same 5-episode evaluation, now with the trained learner.
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

In [None]:
# Mean of the per-episode rewards from the previous cell
# (post-training value is printed first, then the baseline).
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

## Experimento 2

In [None]:
# Experiment 2 setup: fresh PPO learner, reward network and AIRL trainer.
# SEED is given to PPO here and to venv.seed() in the next cell.
SEED = 42

 
# Generator policy: a PPO agent on `venv` (passed to AIRL below as `gen_algo`).
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

# Reward model built from venv's observation/action spaces; its input layer
# is normalized with RunningNorm.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer wiring together the expert `transitions`, the PPO learner and
# the reward network on `venv`.
airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

In [None]:
# Seed the vectorized env before the baseline evaluation.
venv.seed(SEED)

# Baseline: per-episode rewards of the still-untrained learner (5 episodes).
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

# Train AIRL with a budget of 7008 timesteps.
# NOTE(review): train() presumably runs only whole generator rounds, so a
# budget that is not a multiple of the round size would be rounded down;
# confirm against imitation's AdversarialTrainer.train documentation.
airl_trainer.train(7008)  
# Re-seed with the same value so the second evaluation mirrors the baseline.
venv.seed(SEED)

# Same 5-episode evaluation, now with the trained learner.
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

In [None]:
# Mean of the per-episode rewards from the previous cell
# (post-training value is printed first, then the baseline).
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

## Experimento 3

In [None]:
# Experiment 3 setup: fresh PPO learner, reward network and AIRL trainer.
# SEED is given to PPO here and to venv.seed() in the next cell.
SEED = 42

 
# Generator policy: a PPO agent on `venv` (passed to AIRL below as `gen_algo`).
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

# Reward model built from venv's observation/action spaces; its input layer
# is normalized with RunningNorm.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer wiring together the expert `transitions`, the PPO learner and
# the reward network on `venv`.
airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

In [None]:
# Seed the vectorized env before the baseline evaluation.
venv.seed(SEED)

# Baseline: per-episode rewards of the still-untrained learner (5 episodes).
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

# Train AIRL with a budget of 10512 timesteps.
# NOTE(review): train() presumably runs only whole generator rounds, so a
# budget that is not a multiple of the round size would be rounded down;
# confirm against imitation's AdversarialTrainer.train documentation.
airl_trainer.train(10512)  
# Re-seed with the same value so the second evaluation mirrors the baseline.
venv.seed(SEED)

# Same 5-episode evaluation, now with the trained learner.
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

In [None]:
# Mean of the per-episode rewards from the previous cell
# (post-training value is printed first, then the baseline).
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

## Experimento 4

In [None]:
# Experiment 4 setup: fresh PPO learner, reward network and AIRL trainer.
# SEED is given to PPO here and to venv.seed() in the next cell.
SEED = 42

 
# Generator policy: a PPO agent on `venv` (passed to AIRL below as `gen_algo`).
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

# Reward model built from venv's observation/action spaces; its input layer
# is normalized with RunningNorm.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer wiring together the expert `transitions`, the PPO learner and
# the reward network on `venv`.
airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

In [None]:
# Seed the vectorized env before the baseline evaluation.
venv.seed(SEED)

# Baseline: per-episode rewards of the still-untrained learner (5 episodes).
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

# Train AIRL with a budget of 14016 timesteps.
# NOTE(review): train() presumably runs only whole generator rounds, so a
# budget that is not a multiple of the round size would be rounded down;
# confirm against imitation's AdversarialTrainer.train documentation.
airl_trainer.train(14016)  
# Re-seed with the same value so the second evaluation mirrors the baseline.
venv.seed(SEED)

# Same 5-episode evaluation, now with the trained learner.
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

In [None]:
# Mean of the per-episode rewards from the previous cell
# (post-training value is printed first, then the baseline).
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

## Experimento 5

In [None]:
# Experiment 5 setup: fresh PPO learner, reward network and AIRL trainer.
# SEED is given to PPO here and to venv.seed() in the next cell.
SEED = 42

 
# Generator policy: a PPO agent on `venv` (passed to AIRL below as `gen_algo`).
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

# Reward model built from venv's observation/action spaces; its input layer
# is normalized with RunningNorm.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer wiring together the expert `transitions`, the PPO learner and
# the reward network on `venv`.
airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

In [None]:
# Seed the vectorized env before the baseline evaluation.
venv.seed(SEED)

# Baseline: per-episode rewards of the still-untrained learner (5 episodes).
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

# Train AIRL with a budget of 17520 timesteps.
# NOTE(review): train() presumably runs only whole generator rounds, so a
# budget that is not a multiple of the round size would be rounded down;
# confirm against imitation's AdversarialTrainer.train documentation.
airl_trainer.train(17520)  
# Re-seed with the same value so the second evaluation mirrors the baseline.
venv.seed(SEED)

# Same 5-episode evaluation, now with the trained learner.
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

In [None]:
# Mean of the per-episode rewards from the previous cell
# (post-training value is printed first, then the baseline).
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

## Experimento 6

In [None]:
# Experiment 6 setup: fresh PPO learner, reward network and AIRL trainer.
# SEED is given to PPO here and to venv.seed() in the next cell.
SEED = 42

 
# Generator policy: a PPO agent on `venv` (passed to AIRL below as `gen_algo`).
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

# Reward model built from venv's observation/action spaces; its input layer
# is normalized with RunningNorm.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer wiring together the expert `transitions`, the PPO learner and
# the reward network on `venv`.
airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

In [None]:
# Seed the vectorized env before the baseline evaluation.
venv.seed(SEED)

# Baseline: per-episode rewards of the still-untrained learner (5 episodes).
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

# Train AIRL with a budget of 21024 timesteps.
# NOTE(review): train() presumably runs only whole generator rounds, so a
# budget that is not a multiple of the round size would be rounded down;
# confirm against imitation's AdversarialTrainer.train documentation.
airl_trainer.train(21024)  
# Re-seed with the same value so the second evaluation mirrors the baseline.
venv.seed(SEED)

# Same 5-episode evaluation, now with the trained learner.
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

In [None]:
# Mean of the per-episode rewards from the previous cell
# (post-training value is printed first, then the baseline).
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

## Experimento 7

In [None]:
# Experiment 7 setup: fresh PPO learner, reward network and AIRL trainer.
# SEED is given to PPO here and to venv.seed() in the next cell.
SEED = 42

 
# Generator policy: a PPO agent on `venv` (passed to AIRL below as `gen_algo`).
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

# Reward model built from venv's observation/action spaces; its input layer
# is normalized with RunningNorm.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer wiring together the expert `transitions`, the PPO learner and
# the reward network on `venv`.
airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

In [None]:
# Seed the vectorized env before the baseline evaluation.
venv.seed(SEED)

# Baseline: per-episode rewards of the still-untrained learner (5 episodes).
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

# Train AIRL with a budget of 24528 timesteps.
# NOTE(review): train() presumably runs only whole generator rounds, so a
# budget that is not a multiple of the round size would be rounded down;
# confirm against imitation's AdversarialTrainer.train documentation.
airl_trainer.train(24528)  
# Re-seed with the same value so the second evaluation mirrors the baseline.
venv.seed(SEED)

# Same 5-episode evaluation, now with the trained learner.
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

In [None]:
# Mean of the per-episode rewards from the previous cell
# (post-training value is printed first, then the baseline).
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

## Experimento 8

In [None]:
# Experiment 8 setup: fresh PPO learner, reward network and AIRL trainer.
# SEED is given to PPO here and to venv.seed() in the next cell.
SEED = 42

 
# Generator policy: a PPO agent on `venv` (passed to AIRL below as `gen_algo`).
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

# Reward model built from venv's observation/action spaces; its input layer
# is normalized with RunningNorm.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer wiring together the expert `transitions`, the PPO learner and
# the reward network on `venv`.
airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

In [None]:
# Seed the vectorized env before the baseline evaluation.
venv.seed(SEED)

# Baseline: per-episode rewards of the still-untrained learner (5 episodes).
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

# Train AIRL with a budget of 28032 timesteps.
# NOTE(review): train() presumably runs only whole generator rounds, so a
# budget that is not a multiple of the round size would be rounded down;
# confirm against imitation's AdversarialTrainer.train documentation.
airl_trainer.train(28032)  
# Re-seed with the same value so the second evaluation mirrors the baseline.
venv.seed(SEED)

# Same 5-episode evaluation, now with the trained learner.
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

In [None]:
# Mean of the per-episode rewards from the previous cell
# (post-training value is printed first, then the baseline).
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

## Experimento 9

In [None]:
# Experiment 9 setup: fresh PPO learner, reward network and AIRL trainer.
# SEED is given to PPO here and to venv.seed() in the next cell.
SEED = 42

 
# Generator policy: a PPO agent on `venv` (passed to AIRL below as `gen_algo`).
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

# Reward model built from venv's observation/action spaces; its input layer
# is normalized with RunningNorm.
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# AIRL trainer wiring together the expert `transitions`, the PPO learner and
# the reward network on `venv`.
airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
)

In [None]:
# Seed the vectorized env before the baseline evaluation.
venv.seed(SEED)

# Baseline: per-episode rewards of the still-untrained learner (5 episodes).
learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

# Train AIRL with a budget of 31536 timesteps.
# NOTE(review): train() presumably runs only whole generator rounds, so a
# budget that is not a multiple of the round size would be rounded down;
# confirm against imitation's AdversarialTrainer.train documentation.
airl_trainer.train(31536)  
# Re-seed with the same value so the second evaluation mirrors the baseline.
venv.seed(SEED)

# Same 5-episode evaluation, now with the trained learner.
learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

In [None]:
# Mean of the per-episode rewards from the previous cell
# (post-training value is printed first, then the baseline).
print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))

## Experimento 10

In [None]:

"""SEED = 42

 
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

airl_trainer = AIRL(
    demonstrations=transitions,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
) """


Progress: |*--------------------------------------------------------------------------------------------------| 1%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


In [None]:
""" venv.seed(SEED)

learner_rewards_before_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
)

airl_trainer.train(35038)  
venv.seed(SEED)

learner_rewards_after_training, _ = evaluate_policy(
    learner, venv, 5, return_episode_rewards=True,
) """

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


--------------------------------------------------------------------------------------------------------------| 6%
| raw/                        |      |
|    gen/time/fps             | 349  |
|    gen/time/iterations      | 1    |
|    gen/time/time_elapsed    | 5    |
|    gen/time/total_timesteps | 2048 |
--------------------------------------
--------------------------------------------------
| raw/                                |          |
|    disc/disc_acc                    | 0.5      |
|    disc/disc_acc_expert             | 1        |
|    disc/disc_acc_gen                | 0        |
|    disc/disc_entropy                | 0.354    |
|    disc/disc_loss                   | 1.15     |
|    disc/disc_proportion_expert_pred | 1        |
|    disc/disc_proportion_expert_true | 0.5      |
|    disc/global_step                 | 1        |
|    disc/n_expert                    | 2.05e+03 |
|    disc/n_generated                 | 2.05e+03 |
---------------------------------------

round:   6%|▌         | 1/17 [00:06<01:44,  6.50s/it]

--------------------------------------------------------------------------------------------------------------| 12%
| raw/                              |              |
|    gen/time/fps                   | 729          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 4096         |
|    gen/train/approx_kl            | 0.0039499034 |
|    gen/train/clip_fraction        | 0.164        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.3         |
|    gen/train/explained_variance   | -0.178       |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.0136       |
|    gen/train/n_updates            | 5            |
|    gen/train/policy_gradient_loss | -0.00907     |
|    gen/train/value_loss           | 1.6          |
----------------------------------------------------
------------------------------------

round:  12%|█▏        | 2/17 [00:09<01:10,  4.68s/it]

--------------------------------------------------------------------------------------------------------------| 18%
| raw/                              |              |
|    gen/time/fps                   | 727          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 6144         |
|    gen/train/approx_kl            | 0.0037344352 |
|    gen/train/clip_fraction        | 0.163        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.3         |
|    gen/train/explained_variance   | 0.545        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.0291       |
|    gen/train/n_updates            | 10           |
|    gen/train/policy_gradient_loss | -0.00983     |
|    gen/train/value_loss           | 3.45         |
----------------------------------------------------
------------------------------------

round:  18%|█▊        | 3/17 [00:13<00:57,  4.12s/it]

--------------------------------------------------------------------------------------------------------------| 23%
| raw/                              |              |
|    gen/time/fps                   | 725          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 8192         |
|    gen/train/approx_kl            | 0.0036309871 |
|    gen/train/clip_fraction        | 0.172        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.29        |
|    gen/train/explained_variance   | 0.434        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.159        |
|    gen/train/n_updates            | 15           |
|    gen/train/policy_gradient_loss | -0.0101      |
|    gen/train/value_loss           | 4.99         |
----------------------------------------------------
------------------------------------

round:  24%|██▎       | 4/17 [00:16<00:50,  3.85s/it]

--------------------------------------------------------------------------------------------------------------| 29%
| raw/                              |              |
|    gen/time/fps                   | 667          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 3            |
|    gen/time/total_timesteps       | 10240        |
|    gen/train/approx_kl            | 0.0029962873 |
|    gen/train/clip_fraction        | 0.112        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.29        |
|    gen/train/explained_variance   | 0.652        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.202        |
|    gen/train/n_updates            | 20           |
|    gen/train/policy_gradient_loss | -0.00778     |
|    gen/train/value_loss           | 7.01         |
----------------------------------------------------
------------------------------------

round:  29%|██▉       | 5/17 [00:20<00:45,  3.80s/it]

--------------------------------------------------------------------------------------------------------------| 35%
| raw/                              |              |
|    gen/time/fps                   | 720          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 12288        |
|    gen/train/approx_kl            | 0.0036509265 |
|    gen/train/clip_fraction        | 0.132        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.28        |
|    gen/train/explained_variance   | 0.669        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.303        |
|    gen/train/n_updates            | 25           |
|    gen/train/policy_gradient_loss | -0.00777     |
|    gen/train/value_loss           | 5.6          |
----------------------------------------------------
------------------------------------

round:  35%|███▌      | 6/17 [00:23<00:40,  3.68s/it]

--------------------------------------------------------------------------------------------------------------| 41%
| raw/                              |              |
|    gen/time/fps                   | 723          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 14336        |
|    gen/train/approx_kl            | 0.0038683012 |
|    gen/train/clip_fraction        | 0.174        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.28        |
|    gen/train/explained_variance   | 0.738        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.169        |
|    gen/train/n_updates            | 30           |
|    gen/train/policy_gradient_loss | -0.00919     |
|    gen/train/value_loss           | 7.2          |
----------------------------------------------------
------------------------------------

round:  41%|████      | 7/17 [00:27<00:36,  3.61s/it]

----------------------------------------------------******----------------------------------------------------| 47%
| raw/                              |              |
|    gen/time/fps                   | 728          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 16384        |
|    gen/train/approx_kl            | 0.0039479425 |
|    gen/train/clip_fraction        | 0.139        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.27        |
|    gen/train/explained_variance   | 0.492        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.231        |
|    gen/train/n_updates            | 35           |
|    gen/train/policy_gradient_loss | -0.00674     |
|    gen/train/value_loss           | 9.79         |
----------------------------------------------------
------------------------------------

round:  47%|████▋     | 8/17 [00:30<00:32,  3.56s/it]

----------------------------------------------------************----------------------------------------------| 53%
| raw/                              |              |
|    gen/time/fps                   | 722          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 18432        |
|    gen/train/approx_kl            | 0.0023032096 |
|    gen/train/clip_fraction        | 0.0461       |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.27        |
|    gen/train/explained_variance   | 0.712        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.502        |
|    gen/train/n_updates            | 40           |
|    gen/train/policy_gradient_loss | -0.00365     |
|    gen/train/value_loss           | 20.7         |
----------------------------------------------------
------------------------------------

round:  53%|█████▎    | 9/17 [00:34<00:28,  3.52s/it]

----------------------------------------------------****************------------------------------------------| 58%
| raw/                              |              |
|    gen/time/fps                   | 720          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 20480        |
|    gen/train/approx_kl            | 0.0016755029 |
|    gen/train/clip_fraction        | 0.029        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.26        |
|    gen/train/explained_variance   | 0.739        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 1.4          |
|    gen/train/n_updates            | 45           |
|    gen/train/policy_gradient_loss | -0.00437     |
|    gen/train/value_loss           | 26.7         |
----------------------------------------------------
------------------------------------

round:  59%|█████▉    | 10/17 [00:37<00:24,  3.49s/it]

---------------------------------------------------************************-----------------------------------| 64%
| raw/                              |             |
|    gen/time/fps                   | 731         |
|    gen/time/iterations            | 1           |
|    gen/time/time_elapsed          | 2           |
|    gen/time/total_timesteps       | 22528       |
|    gen/train/approx_kl            | 0.002093975 |
|    gen/train/clip_fraction        | 0.0437      |
|    gen/train/clip_range           | 0.1         |
|    gen/train/entropy_loss         | -2.27       |
|    gen/train/explained_variance   | 0.322       |
|    gen/train/learning_rate        | 0.0005      |
|    gen/train/loss                 | 0.378       |
|    gen/train/n_updates            | 50          |
|    gen/train/policy_gradient_loss | -0.00516    |
|    gen/train/value_loss           | 25.3        |
---------------------------------------------------
--------------------------------------------------
|

round:  65%|██████▍   | 11/17 [00:41<00:20,  3.47s/it]

---------------------------------------------------******************************-----------------------------| 70%
| raw/                              |             |
|    gen/time/fps                   | 722         |
|    gen/time/iterations            | 1           |
|    gen/time/time_elapsed          | 2           |
|    gen/time/total_timesteps       | 24576       |
|    gen/train/approx_kl            | 0.004002318 |
|    gen/train/clip_fraction        | 0.15        |
|    gen/train/clip_range           | 0.1         |
|    gen/train/entropy_loss         | -2.26       |
|    gen/train/explained_variance   | 0.446       |
|    gen/train/learning_rate        | 0.0005      |
|    gen/train/loss                 | 0.361       |
|    gen/train/n_updates            | 55          |
|    gen/train/policy_gradient_loss | -0.00782    |
|    gen/train/value_loss           | 13.7        |
---------------------------------------------------
--------------------------------------------------
|

round:  71%|███████   | 12/17 [00:44<00:17,  3.55s/it]

----------------------------------------------------***********************************-----------------------| 76%
| raw/                              |              |
|    gen/time/fps                   | 722          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 26624        |
|    gen/train/approx_kl            | 0.0036876681 |
|    gen/train/clip_fraction        | 0.124        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.25        |
|    gen/train/explained_variance   | 0.477        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.553        |
|    gen/train/n_updates            | 60           |
|    gen/train/policy_gradient_loss | -0.00703     |
|    gen/train/value_loss           | 11.2         |
----------------------------------------------------
------------------------------------

round:  76%|███████▋  | 13/17 [00:48<00:14,  3.52s/it]

----------------------------------------------------****************************************------------------| 81%
| raw/                              |              |
|    gen/time/fps                   | 726          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 28672        |
|    gen/train/approx_kl            | 0.0039207237 |
|    gen/train/clip_fraction        | 0.175        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.24        |
|    gen/train/explained_variance   | 0.509        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.158        |
|    gen/train/n_updates            | 65           |
|    gen/train/policy_gradient_loss | -0.00899     |
|    gen/train/value_loss           | 6            |
----------------------------------------------------
------------------------------------

round:  82%|████████▏ | 14/17 [00:51<00:10,  3.51s/it]

----------------------------------------------------**********************************************------------| 87%
| raw/                              |              |
|    gen/time/fps                   | 730          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 30720        |
|    gen/train/approx_kl            | 0.0013609843 |
|    gen/train/clip_fraction        | 0.0244       |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.23        |
|    gen/train/explained_variance   | 0.121        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.636        |
|    gen/train/n_updates            | 70           |
|    gen/train/policy_gradient_loss | -0.00293     |
|    gen/train/value_loss           | 16.4         |
----------------------------------------------------
------------------------------------

round:  88%|████████▊ | 15/17 [00:55<00:06,  3.49s/it]

----------------------------------------------------****************************************************------| 93%
| raw/                              |              |
|    gen/time/fps                   | 724          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 32768        |
|    gen/train/approx_kl            | 0.0023393547 |
|    gen/train/clip_fraction        | 0.0518       |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.24        |
|    gen/train/explained_variance   | 0.0729       |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.377        |
|    gen/train/n_updates            | 75           |
|    gen/train/policy_gradient_loss | -0.00516     |
|    gen/train/value_loss           | 11.1         |
----------------------------------------------------
------------------------------------

round:  94%|█████████▍| 16/17 [00:58<00:03,  3.48s/it]

----------------------------------------------------**********************************************************| 99%
| raw/                              |              |
|    gen/time/fps                   | 722          |
|    gen/time/iterations            | 1            |
|    gen/time/time_elapsed          | 2            |
|    gen/time/total_timesteps       | 34816        |
|    gen/train/approx_kl            | 0.0036461586 |
|    gen/train/clip_fraction        | 0.104        |
|    gen/train/clip_range           | 0.1          |
|    gen/train/entropy_loss         | -2.23        |
|    gen/train/explained_variance   | 0.167        |
|    gen/train/learning_rate        | 0.0005       |
|    gen/train/loss                 | 0.398        |
|    gen/train/n_updates            | 80           |
|    gen/train/policy_gradient_loss | -0.00829     |
|    gen/train/value_loss           | 11.9         |
----------------------------------------------------
------------------------------------

round: 100%|██████████| 17/17 [01:02<00:00,  3.66s/it]


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


In [None]:
# NOTE(review): the two print calls below are wrapped in a triple-quoted
# string, so they are NOT executed; this cell only evaluates a string literal.
# The output saved under this cell ("mean reward after/before training: ...")
# matches what the prints would emit, so it presumably comes from an earlier
# run in which they were live -- confirm before relying on those numbers.
# To re-enable the report, remove the surrounding triple quotes. That assumes
# learner_rewards_after_training and learner_rewards_before_training are
# defined by an earlier cell (not visible from here) -- TODO confirm.
"""print("mean reward after training:", np.mean(learner_rewards_after_training))
print("mean reward before training:", np.mean(learner_rewards_before_training))""" 


mean reward after training: -21901.931846516578
mean reward before training: -22185.937215596066
