In [None]:
#########################################################################
## COMPROBAR GPU ASIGNADA EN COLABORATORY
#########################################################################
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Mar  5 11:40:14 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
| N/A   59C    P8    19W / 100W |     54MiB /  6144MiB |     41%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
#########################################################################
## REQUIRED LIBRARIES
#########################################################################
import tensorflow as tf
import gymnasium as gym
import sinergym 
from sinergym.utils.wrappers import (LoggerWrapper, NormalizeAction,
                                     NormalizeObservation) 
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import keras


# Libraries required for imitation learning (BC) and the PPO learner
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

from imitation.algorithms import bc
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env
# Issue with rollout: it is the function that defines the expert transitions
import imitation.data.rollout as rollout 
from stable_baselines3.common.vec_env import DummyVecEnv




# **5Zone**

## 1. Creamos un vector de entornos

In [None]:
def _make_env():
    """Create one 5Zone environment with the notebook's standard wrappers.

    The base ``Eplus-5zone-hot-discrete-v1`` environment is wrapped first in
    ``NormalizeObservation`` and then in ``LoggerWrapper``.

    Returns:
        The wrapped environment.
    """
    base_env = gym.make("Eplus-5zone-hot-discrete-v1")
    # Not wrapped in RolloutInfoWrapper; rollouts are collected with
    # unwrap=False further down in the notebook.
    return LoggerWrapper(NormalizeObservation(base_env))

# Vectorised-environment reference:
# https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html
# Single-environment vector built from the `_make_env` factory.
venv = DummyVecEnv([_make_env])



## 2. Creación de demostraciones expertas

In [None]:
# Standalone (non-vectorised) environment built with the same factory as the
# environments inside `venv` (gym.make -> NormalizeObservation -> LoggerWrapper).
# Its observation and action spaces are used to build the reward network.
env = _make_env()



In [None]:
# Function that returns the expert policy
def download_expert():
    """Load the pretrained PPO expert policy stored in ``model5zone.zip``.

    The policy is loaded with imitation's ``load_policy`` and bound to the
    vectorised environment ``venv``. ``load_policy`` documents its ``venv``
    argument as a vectorised environment, and ``venv`` is also the environment
    the expert is rolled out on in ``sample_expert_transitions``.

    Returns:
        The expert policy object returned by ``load_policy``.
    """
    print("Downloading a pretrained expert.")
    # https://imitation.readthedocs.io/en/latest/main-concepts/experts.html
    expert = load_policy(
        "ppo",
        path="model5zone.zip",
        venv=venv,
    )
    return expert

# Function that returns trajectories sampled from the expert policy
def sample_expert_transitions():
    """Roll out the expert policy on ``venv`` and return flattened transitions.

    Returns:
        The result of ``rollout.flatten_trajectories`` applied to the sampled
        expert trajectories.
    """
    # Load the expert policy.
    expert_policy = download_expert()

    print("Sampling expert transitions.")

    # Stop sampling once at least one complete episode has been collected.
    stop_condition = rollout.make_sample_until(min_timesteps=None, min_episodes=1)

    # Generate trajectories by running the expert policy in the environment.
    # unwrap=False: presumably needed because the environments are not wrapped
    # in RolloutInfoWrapper — TODO confirm.
    expert_trajectories = rollout.rollout(
        expert_policy,
        venv,
        sample_until=stop_condition,
        rng=np.random.default_rng(),
        unwrap=False,
    )

    return rollout.flatten_trajectories(expert_trajectories)


In [None]:
# Sample a set of transitions from the expert policy's trajectories.
# NOTE(review): `transitions` is not referenced again in the visible part of
# this notebook — confirm it is still needed, since producing it rolls out the
# expert in the environment.
transitions = sample_expert_transitions()

Downloading a pretrained expert.
Sampling expert transitions.
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


## **Reward Learning through Preference Comparisons**
* El algoritmo de comparación de preferencias aprende una función de recompensa a partir de las preferencias entre pares de trayectorias. Las comparaciones se modelan como generadas por un modelo de Bradley-Terry (o racional de Boltzmann), en el que la probabilidad de preferir la trayectoria A sobre la B es proporcional a la exponencial de la diferencia entre los retornos de las trayectorias A y B.
* En otras palabras, la diferencia en los retornos forma un logit para un problema de clasificación binaria, y en consecuencia la función de recompensa se entrena utilizando una pérdida de entropía cruzada para predecir la comparación de preferencias.






In [None]:
from imitation.rewards.reward_nets import BasicRewardNet 
from imitation.algorithms import preference_comparisons
from imitation.policies.base import FeedForward32Policy, NormalizeFeaturesExtractor
from imitation.util.networks import RunningNorm

In [None]:
# Number of episodes passed to evaluate_policy in every experiment's
# evaluation cell; also used there to convert the reward standard deviation
# into a standard error.
N_EVAL_EPISODES = 5

In [None]:
# Reward network defined over the observation and action spaces of `env`,
# using RunningNorm as its input-normalisation layer.
reward_net = BasicRewardNet(
    env.observation_space,
    env.action_space,
    normalize_input_layer=RunningNorm,
)

In [None]:
# Components of the preference-comparison pipeline. Each one that needs
# randomness receives its own freshly created, unseeded NumPy generator.

# Fragmenter used to pick the trajectory fragments that get compared.
fragmenter = preference_comparisons.RandomFragmenter(
    rng=np.random.default_rng(),
    warning_threshold=0,
)

# Gatherer that supplies the preferences between fragment pairs.
gatherer = preference_comparisons.SyntheticGatherer(rng=np.random.default_rng())

# Preference model built on top of the reward network.
preference_model = preference_comparisons.PreferenceModel(reward_net)

# Trainer that fits the reward network with a cross-entropy loss,
# running 10 epochs per training call.
reward_trainer = preference_comparisons.BasicRewardTrainer(
    preference_model=preference_model,
    loss=preference_comparisons.CrossEntropyRewardLoss(),
    rng=np.random.default_rng(),
    epochs=10,
)

In [None]:
# PPO learner that is trained on the vectorised environment `venv`.
agent = PPO(
    policy=FeedForward32Policy,
    env=venv,
    # Normalise the policy's input features with RunningNorm.
    policy_kwargs={
        "features_extractor_class": NormalizeFeaturesExtractor,
        "features_extractor_kwargs": {"normalize_class": RunningNorm},
    },
    # 2048 rollout steps per update in total, split across the environments.
    n_steps=2048 // venv.num_envs,
    n_epochs=10,
    learning_rate=2e-3,
    gamma=0.97,
    gae_lambda=0.95,
    clip_range=0.1,
    ent_coef=0.01,
)

In [None]:
# Trajectory generator that trains `agent` on `venv`, using `reward_net` as
# its reward function, with exploration_frac=0.05.
trajectory_generator = preference_comparisons.AgentTrainer(
    algorithm=agent,
    venv=venv,
    reward_fn=reward_net,
    rng=np.random.default_rng(),
    exploration_frac=0.05,
)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


### Experimento 1

In [None]:
# Experiment 1: preference-comparison driver wired to the shared trajectory
# generator, reward network, fragmenter, gatherer and reward trainer.
# NOTE(review): the later experiment cells reuse these same objects without
# re-creating them, so training presumably accumulates from one experiment to
# the next — confirm that this is intended.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
)

In [None]:
# Experiment 1: train with a budget of 3504 environment timesteps and 200
# preference comparisons. As the cell's last expression, the value returned
# by train() is shown as the cell output.
pref_comparisons.train(total_timesteps=3504, total_comparisons=200)

In [None]:
# Experiment 1: evaluate the current agent policy on `venv` and report the
# mean episode reward together with its standard error.
reward_mean, reward_std = evaluate_policy(
    agent.policy, venv, n_eval_episodes=N_EVAL_EPISODES
)
# Standard error of the mean over the evaluation episodes.
reward_stderr = reward_std / np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

### Experimento 2

In [None]:
# Experiment 2: new preference-comparison driver wired to the shared
# trajectory generator, reward network, fragmenter, gatherer and reward trainer.
# NOTE(review): those shared objects are not re-created between experiments,
# so training done by earlier experiment cells presumably carries over into
# this one — confirm that cumulative training is intended.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
)

In [None]:
# Experiment 2: train with a budget of 7008 environment timesteps and 200
# preference comparisons. As the cell's last expression, the value returned
# by train() is shown as the cell output.
pref_comparisons.train(total_timesteps=7008, total_comparisons=200)

In [None]:
# Experiment 2: evaluate the current agent policy on `venv` and report the
# mean episode reward together with its standard error.
reward_mean, reward_std = evaluate_policy(
    agent.policy, venv, n_eval_episodes=N_EVAL_EPISODES
)
# Standard error of the mean over the evaluation episodes.
reward_stderr = reward_std / np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

### Experimento 3

In [None]:
# Experiment 3: new preference-comparison driver wired to the shared
# trajectory generator, reward network, fragmenter, gatherer and reward trainer.
# NOTE(review): those shared objects are not re-created between experiments,
# so training done by earlier experiment cells presumably carries over into
# this one — confirm that cumulative training is intended.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
)

In [None]:
# Experiment 3: train with a budget of 10512 environment timesteps and 200
# preference comparisons. As the cell's last expression, the value returned
# by train() is shown as the cell output.
pref_comparisons.train(total_timesteps=10512, total_comparisons=200)

In [None]:
# Experiment 3: evaluate the current agent policy on `venv` and report the
# mean episode reward together with its standard error.
reward_mean, reward_std = evaluate_policy(
    agent.policy, venv, n_eval_episodes=N_EVAL_EPISODES
)
# Standard error of the mean over the evaluation episodes.
reward_stderr = reward_std / np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

### Experimento 4

In [None]:
# Experiment 4: new preference-comparison driver wired to the shared
# trajectory generator, reward network, fragmenter, gatherer and reward trainer.
# NOTE(review): those shared objects are not re-created between experiments,
# so training done by earlier experiment cells presumably carries over into
# this one — confirm that cumulative training is intended.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
)

In [None]:
# Experiment 4: train with a budget of 14016 environment timesteps and 200
# preference comparisons. As the cell's last expression, the value returned
# by train() is shown as the cell output.
pref_comparisons.train(total_timesteps=14016, total_comparisons=200)

In [None]:
# Experiment 4: evaluate the current agent policy on `venv` and report the
# mean episode reward together with its standard error.
reward_mean, reward_std = evaluate_policy(
    agent.policy, venv, n_eval_episodes=N_EVAL_EPISODES
)
# Standard error of the mean over the evaluation episodes.
reward_stderr = reward_std / np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

### Experimento 5

In [None]:
# Experiment 5: new preference-comparison driver wired to the shared
# trajectory generator, reward network, fragmenter, gatherer and reward trainer.
# NOTE(review): those shared objects are not re-created between experiments,
# so training done by earlier experiment cells presumably carries over into
# this one — confirm that cumulative training is intended.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
)

In [None]:
# Experiment 5: train with a budget of 17520 environment timesteps and 200
# preference comparisons. As the cell's last expression, the value returned
# by train() is shown as the cell output.
pref_comparisons.train(total_timesteps=17520, total_comparisons=200)

In [None]:
# Experiment 5: evaluate the current agent policy on `venv` and report the
# mean episode reward together with its standard error.
reward_mean, reward_std = evaluate_policy(
    agent.policy, venv, n_eval_episodes=N_EVAL_EPISODES
)
# Standard error of the mean over the evaluation episodes.
reward_stderr = reward_std / np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

### Experimento 6

In [None]:
# Experiment 6: new preference-comparison driver wired to the shared
# trajectory generator, reward network, fragmenter, gatherer and reward trainer.
# NOTE(review): those shared objects are not re-created between experiments,
# so training done by earlier experiment cells presumably carries over into
# this one — confirm that cumulative training is intended.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
)

In [None]:
# Experiment 6: train with a budget of 21024 environment timesteps and 200
# preference comparisons. As the cell's last expression, the value returned
# by train() is shown as the cell output.
pref_comparisons.train(total_timesteps=21024, total_comparisons=200)

In [None]:
# Experiment 6: evaluate the current agent policy on `venv` and report the
# mean episode reward together with its standard error.
reward_mean, reward_std = evaluate_policy(
    agent.policy, venv, n_eval_episodes=N_EVAL_EPISODES
)
# Standard error of the mean over the evaluation episodes.
reward_stderr = reward_std / np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

### Experimento 7

In [None]:
# Experiment 7: new preference-comparison driver wired to the shared
# trajectory generator, reward network, fragmenter, gatherer and reward trainer.
# NOTE(review): those shared objects are not re-created between experiments,
# so training done by earlier experiment cells presumably carries over into
# this one — confirm that cumulative training is intended.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
)

In [None]:
# Experiment 7: train with a budget of 24528 environment timesteps and 200
# preference comparisons. As the cell's last expression, the value returned
# by train() is shown as the cell output.
pref_comparisons.train(total_timesteps=24528, total_comparisons=200)

In [None]:
# Experiment 7: evaluate the current agent policy on `venv` and report the
# mean episode reward together with its standard error.
reward_mean, reward_std = evaluate_policy(
    agent.policy, venv, n_eval_episodes=N_EVAL_EPISODES
)
# Standard error of the mean over the evaluation episodes.
reward_stderr = reward_std / np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

### Experimento 8

In [None]:
# Experiment 8: new preference-comparison driver wired to the shared
# trajectory generator, reward network, fragmenter, gatherer and reward trainer.
# NOTE(review): those shared objects are not re-created between experiments,
# so training done by earlier experiment cells presumably carries over into
# this one — confirm that cumulative training is intended.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
)

In [None]:
# Experiment 8: train with a budget of 28032 environment timesteps and 200
# preference comparisons. As the cell's last expression, the value returned
# by train() is shown as the cell output.
pref_comparisons.train(total_timesteps=28032, total_comparisons=200)

In [None]:
# Experiment 8: evaluate the current agent policy on `venv` and report the
# mean episode reward together with its standard error.
reward_mean, reward_std = evaluate_policy(
    agent.policy, venv, n_eval_episodes=N_EVAL_EPISODES
)
# Standard error of the mean over the evaluation episodes.
reward_stderr = reward_std / np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

### Experimento 9

In [None]:
# Experiment 9: new preference-comparison driver wired to the shared
# trajectory generator, reward network, fragmenter, gatherer and reward trainer.
# NOTE(review): those shared objects are not re-created between experiments,
# so training done by earlier experiment cells presumably carries over into
# this one — confirm that cumulative training is intended.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    num_iterations=5,  # Set to 60 for better performance
    query_schedule="hyperbolic",
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=4,
)

In [None]:
# Experiment 9: train with a budget of 31536 environment timesteps and 200
# preference comparisons. As the cell's last expression, the value returned
# by train() is shown as the cell output.
pref_comparisons.train(total_timesteps=31536, total_comparisons=200)

In [None]:
# Experiment 9: evaluate the current agent policy on `venv` and report the
# mean episode reward together with its standard error.
reward_mean, reward_std = evaluate_policy(
    agent.policy, venv, n_eval_episodes=N_EVAL_EPISODES
)
# Standard error of the mean over the evaluation episodes.
reward_stderr = reward_std / np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")

### Experimento 10

In [None]:
# NOTE(review): Experiment 10 is disabled — the whole cell is wrapped in a
# triple-quoted string, so no PreferenceComparisons object is created when it
# runs. Remove the surrounding quotes to re-enable it.
"""pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    num_iterations=5, # Set to 60 for better performance
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    initial_epoch_multiplier=4,
    initial_comparison_frac=0.1,
    query_schedule="hyperbolic",
)"""

In [None]:
# NOTE(review): disabled — the call is wrapped in a triple-quoted string, so
# the training log stored under this cell presumably comes from an earlier run
# made while the code was still active.
# NOTE(review): 35038 breaks the 3504-step progression used by experiments 1-9
# (10 x 3504 = 35040) — confirm the intended value before re-enabling.
"""pref_comparisons.train(total_timesteps=35038, total_comparisons=200)"""

Query schedule: [20, 51, 41, 34, 29, 25]
Collecting 40 fragments (4000 transitions)
Requested 3800 transitions but only 0 in buffer. Sampling 3800 additional transitions.
Progress: |*******--------------------------------------------------------------------------------------------| 7%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Sampling 200 exploratory transitions.
Progress: |*******--------------------------------------------------------------------------------------------| 7%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Creating fragment pairs
Gathering preferences
Dataset now contains 20 comparisons


Training reward model:   0%|          | 0/40 [00:00<?, ?it/s]

Training agent for 2000 timesteps
Progress: |*******--------------------------------------------------------------------------------------------| 7%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


--------------------------------------------------------------------------------------------------------------| 6%
| raw/                                 |           |
|    agent/rollout/ep_rew_wrapped_mean | -3.88e+03 |
|    agent/time/fps                    | 378       |
|    agent/time/iterations             | 1         |
|    agent/time/time_elapsed           | 5         |
|    agent/time/total_timesteps        | 2048      |
----------------------------------------------------
-------------------------------------------------------
| mean/                                   |           |
|    agent/rollout/ep_rew_wrapped_mean    | -3.88e+03 |
|    agent/time/fps                       | 378       |
|    agent/time/iterations                | 1         |
|    agent/time/time_elapsed              | 5         |
|    agent/time/total_timesteps           | 2.05e+03  |
|    agent/train/approx_kl                | 0.00616   |
|    agent/train/clip_fraction            | 0.306     |
|    agent

  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Sampling 510 exploratory transitions.
Progress: |*******--------------------------------------------------------------------------------------------| 7%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Creating fragment pairs
Gathering preferences
Dataset now contains 71 comparisons


Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 2000 timesteps
Progress: |*--------------------------------------------------------------------------------------------------| 1%

  gym.logger.warn("Casting input x to numpy array.")


--------------------------------------------------------------------------------------------------------------| 6%
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | -3.75e+03    |
|    agent/time/fps                    | 867          |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 2            |
|    agent/time/total_timesteps        | 4096         |
|    agent/train/approx_kl             | 0.0061630034 |
|    agent/train/clip_fraction         | 0.306        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.3         |
|    agent/train/explained_variance    | -0.216       |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0145      |
|    agent/train/n_updates             | 10           |
|    agent/train/policy_gradient_loss  | -0.0175      |
|    agent/train/value_loss            | 0.25

  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Sampling 410 exploratory transitions.
Progress: |*******--------------------------------------------------------------------------------------------| 7%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Creating fragment pairs
Gathering preferences
Dataset now contains 112 comparisons


Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 2000 timesteps
Progress: |*--------------------------------------------------------------------------------------------------| 1%

  gym.logger.warn("Casting input x to numpy array.")


--------------------------------------------------------------------------------------------------------------| 6%
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | -4.02e+03    |
|    agent/time/fps                    | 880          |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 2            |
|    agent/time/total_timesteps        | 6144         |
|    agent/train/approx_kl             | 0.0054165376 |
|    agent/train/clip_fraction         | 0.299        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.29        |
|    agent/train/explained_variance    | 0.46         |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.0422      |
|    agent/train/n_updates             | 20           |
|    agent/train/policy_gradient_loss  | -0.0166      |
|    agent/train/value_loss            | 0.09

  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Sampling 340 exploratory transitions.
Progress: |*******--------------------------------------------------------------------------------------------| 7%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Creating fragment pairs
Gathering preferences
Dataset now contains 146 comparisons


Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 2000 timesteps
Progress: |*--------------------------------------------------------------------------------------------------| 1%

  gym.logger.warn("Casting input x to numpy array.")


--------------------------------------------------------------------------------------------------------------| 6%
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | -4.2e+03     |
|    agent/time/fps                    | 832          |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 2            |
|    agent/time/total_timesteps        | 8192         |
|    agent/train/approx_kl             | 0.0051789386 |
|    agent/train/clip_fraction         | 0.285        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.28        |
|    agent/train/explained_variance    | 0.643        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | -0.00777     |
|    agent/train/n_updates             | 30           |
|    agent/train/policy_gradient_loss  | -0.017       |
|    agent/train/value_loss            | 0.12

  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Sampling 290 exploratory transitions.
Progress: |*******--------------------------------------------------------------------------------------------| 7%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Creating fragment pairs
Gathering preferences
Dataset now contains 175 comparisons


Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 2000 timesteps
Progress: |*--------------------------------------------------------------------------------------------------| 1%

  gym.logger.warn("Casting input x to numpy array.")


--------------------------------------------------------------------------------------------------------------| 6%
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | -4.27e+03   |
|    agent/time/fps                    | 841         |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 2           |
|    agent/time/total_timesteps        | 10240       |
|    agent/train/approx_kl             | 0.006289848 |
|    agent/train/clip_fraction         | 0.322       |
|    agent/train/clip_range            | 0.1         |
|    agent/train/entropy_loss          | -2.27       |
|    agent/train/explained_variance    | 0.727       |
|    agent/train/learning_rate         | 0.002       |
|    agent/train/loss                  | 0.0277      |
|    agent/train/n_updates             | 40          |
|    agent/train/policy_gradient_loss  | -0.0196     |
|    agent/train/value_loss            | 0.191       |
-----

  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Sampling 250 exploratory transitions.
Progress: |*******--------------------------------------------------------------------------------------------| 7%

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Creating fragment pairs
Gathering preferences
Dataset now contains 200 comparisons


Training reward model:   0%|          | 0/10 [00:00<?, ?it/s]

Training agent for 2000 timesteps
Progress: |*--------------------------------------------------------------------------------------------------| 1%

  gym.logger.warn("Casting input x to numpy array.")


--------------------------------------------------------------------------------------------------------------| 6%
| raw/                                 |              |
|    agent/rollout/ep_rew_wrapped_mean | -4.34e+03    |
|    agent/time/fps                    | 873          |
|    agent/time/iterations             | 1            |
|    agent/time/time_elapsed           | 2            |
|    agent/time/total_timesteps        | 12288        |
|    agent/train/approx_kl             | 0.0050376505 |
|    agent/train/clip_fraction         | 0.293        |
|    agent/train/clip_range            | 0.1          |
|    agent/train/entropy_loss          | -2.26        |
|    agent/train/explained_variance    | 0.868        |
|    agent/train/learning_rate         | 0.002        |
|    agent/train/loss                  | 0.0182       |
|    agent/train/n_updates             | 50           |
|    agent/train/policy_gradient_loss  | -0.0185      |
|    agent/train/value_loss            | 0.21

{'reward_loss': 0.034957232020263164, 'reward_accuracy': 0.9955357142857142}

In [None]:
# NOTE(review): disabled — the evaluation code is wrapped in a triple-quoted
# string, so the "Reward: ..." line stored under this cell presumably comes
# from an earlier run made while the code was still active.
"""reward_mean, reward_std = evaluate_policy(agent.policy, venv, N_EVAL_EPISODES)
reward_stderr = reward_std/np.sqrt(N_EVAL_EPISODES)

print(f"Reward: {reward_mean:.0f} +/- {reward_stderr:.0f}")"""

Progress: |********-------------------------------------------------------------------------------------------| 8%



Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#


  gym.logger.warn("Casting input x to numpy array.")


Progress: |****************************************************************************************************| 100%
#----------------------------------------------------------------------------------------------#
#----------------------------------------------------------------------------------------------#
Reward: -25131 +/- 5
