In [3]:
from torch import nn
import torch.nn.functional as F
import torch
from torch.distributions import Categorical, Normal
import gymnasium as gym
from tqdm.notebook import tnrange
import numpy as np
import scipy
import wandb
from gymnasium.spaces import Box, Discrete
import os

# wandb.login()

In [2]:
def discount_cumsum(x, discount):
    """
    magic from rllab for computing discounted cumulative sums of vectors.

    input: 
        vector x, 
        [x0, 
         x1, 
         x2]

    output:
        [x0 + discount * x1 + discount^2 * x2,  
         x1 + discount * x2,
         x2]
    """
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


def combined_shape(length, shape=None):
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)

class PPOBuffer():
    def __init__(self, observation_dim, action_dim, capacity, gamma, lam):
        self.obs_buf = np.zeros(combined_shape(capacity, observation_dim), dtype=np.float32)
        self.act_buf = np.zeros(combined_shape(capacity, action_dim), dtype=np.float32)
        self.adv_buf = np.zeros(capacity, dtype=np.float32)
        self.rew_buf = np.zeros(capacity, dtype=np.float32)
        self.rtg_buf = np.zeros(capacity, dtype=np.float32)
        self.val_buf = np.zeros(capacity, dtype=np.float32)
        self.logp_buf = np.zeros(capacity, dtype=np.float32)
        self.capacity = capacity
        self.idx = 0
        self.path_idx = 0
        self.gamma = gamma
        self.lam = lam

    def push(self, obs, act, rew, val, logp):
        assert self.idx < self.capacity
        self.obs_buf[self.idx] = obs
        self.act_buf[self.idx] = act
        self.rew_buf[self.idx] = rew
        self.val_buf[self.idx] = val
        self.logp_buf[self.idx] = logp

        self.idx += 1

    def GAE_cal(self, last_val):
        path_slice = slice(self.path_idx, self.idx)
        # to make the deltas the same dim
        rewards = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)

        deltas = rewards[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)

        self.rtg_buf[path_slice] = discount_cumsum(rewards, self.gamma)[:-1]
        self.path_idx = self.idx

                
    def sample(self, minibatch_size, device):
        """This method sample a list of minibatches from the memory

        Args:
            minibatch_size (int): size of minibatch, usually 2^n
            device (object): CPU or GPU

        Returns:
            list: a list of minibatches
        """
        assert self.idx == self.capacity
        # reset the index
        self.idx, self.path_idx = 0, 0
        # normalise advantage
        self.adv_buf = (self.adv_buf - np.mean(self.adv_buf)) / (np.std(self.adv_buf) + 1e-7)
        
        inds = np.arange(self.capacity)
        
        np.random.shuffle(inds)
        
        data = []
        for start in range(0, self.capacity, minibatch_size):
            end = start + minibatch_size
            minibatch_inds = inds[start:end]
            minibatch = dict(obs=self.obs_buf[minibatch_inds], act=self.act_buf[minibatch_inds], \
                             rtg=self.rtg_buf[minibatch_inds], adv=self.adv_buf[minibatch_inds], \
                             logp=self.logp_buf[minibatch_inds])
            data.append({k: torch.as_tensor(v, dtype=torch.float32, device=device) for k,v in minibatch.items()})
        
        return data

In [3]:
class Actor_Net(nn.Module):
    def __init__(self, n_observations, n_actions, num_cells, continous_action):
        super(Actor_Net,self).__init__()
        
        self.layer1 = nn.Linear(n_observations, num_cells)
        self.layer2 = nn.Linear(num_cells, num_cells)
        self.layer3 = nn.Linear(num_cells, num_cells)
        self.layer4 = nn.Linear(num_cells, n_actions)

        self.continous_action = continous_action
        self.action_dim = n_actions
        
        if self.continous_action:
            log_std = -0.5 * np.ones(self.action_dim, dtype=np.float32)
            # Add it to the list of parameters
            self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))            

        

    def forward(self, x):
        activation1 = F.tanh(self.layer1(x))
        activation2 = F.tanh(self.layer2(activation1))
        activation3 = F.tanh(self.layer3(activation2))
        activation4 = self.layer4(activation3)

        return activation4
    
    def act(self, x):
        if self.continous_action:
            mu = self.forward(x)
            std = torch.exp(self.log_std)
            dist = Normal(mu, std)
        else:
            log_probs = F.log_softmax(self.forward(x), dim=1)
            dist = Categorical(log_probs)
        action = dist.sample()
        action_logprob = dist.log_prob(action)
        

        return action.detach().cpu().numpy(), action_logprob.detach().cpu().numpy()
    
    def logprob_ent_from_state_acton(self, x, act):
        if self.continous_action:
            mu = self.forward(x)
            std = torch.exp(self.log_std)
            dist = Normal(mu, std)
            # sum term is crucial to reduce dimension, otherwise the ratio = torch.exp(logp - logp_old) will have wrong result with boardcasting
            act_logp = dist.log_prob(act).sum(axis=-1) 
        else:
            dist = Categorical(F.softmax(self.forward(x)))
            act_logp = dist.log_prob(act)
        entropy = dist.entropy()
        
        return entropy, act_logp
    
   
class Critic_Net(nn.Module):
    def __init__(self, n_observations, num_cells):
        super(Critic_Net,self).__init__()
        self.layer1 = nn.Linear(n_observations, num_cells)
        self.layer2 = nn.Linear(num_cells, num_cells)
        self.layer3 = nn.Linear(num_cells, num_cells)
        self.layer4 = nn.Linear(num_cells, 1)

    def forward(self, x):
        activation1 = F.tanh(self.layer1(x))
        activation2 = F.tanh(self.layer2(activation1))
        activation3 = F.tanh(self.layer3(activation2))
        activation4 = self.layer4(activation3)

        return activation4

class Actor_Critic_net(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_dim, continous_action, parameters_hardshare):

        super(Actor_Critic_net, self).__init__()

        self.parameters_hardshare = parameters_hardshare
        self.continous_action = continous_action
        self.act_dim = act_dim
        if self.continous_action:
            log_std = -0.5 * np.ones(self.act_dim, dtype=np.float32)
            # Add it to the list of parameters
            self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))

        if self.parameters_hardshare:
            self.layer1 = nn.Linear(obs_dim, hidden_dim)
            self.layer2 = nn.Linear(hidden_dim, hidden_dim)
            self.layer3 = nn.Linear(hidden_dim, hidden_dim)

            self.actor_head = nn.Linear(hidden_dim, act_dim)
            self.critic_head = nn.Linear(hidden_dim, 1)

        else:
            self.actor = Actor_Net(obs_dim, act_dim, hidden_dim, continous_action)
            self.critic = Critic_Net(obs_dim, hidden_dim)


    def forward(self, x):

        if self.parameters_hardshare:
            activation1 = F.tanh(self.layer1(x))
            activation2 = F.tanh(self.layer2(activation1))
            activation3 = F.tanh(self.layer3(activation2))
            actor_logits = self.actor_head(activation3)
            value = self.critic_head(activation3)
        else:
            actor_logits = self.actor.forward(x)
            value = self.critic.forward(x)

        return actor_logits, value

    
    def act(self, x):
        if self.continous_action:
            mu, value = self.forward(x)
            std = torch.exp(self.log_std)
            dist = Normal(mu, std)
        else:
            actor_logits, value = self.forward(x)
            log_probs = F.log_softmax(actor_logits, dim=1)
            dist = Categorical(log_probs)
        action = dist.sample()
        action_logprob = dist.log_prob(action)
        

        return action.detach().cpu().numpy(), action_logprob.detach().cpu().numpy(), value.detach().item()     

    def logprob_ent_from_state_acton(self, x, act):

        if self.continous_action:
            mu, value = self.forward(x)
            std = torch.exp(self.log_std)
            dist = Normal(mu, std)
            # sum term is crucial to reduce dimension, otherwise the ratio = torch.exp(logp - logp_old) will have wrong result with boardcasting
            act_logp = dist.log_prob(act).sum(axis=-1) 
        else:
            actor_logits, value = self.forward(x)
            dist = Categorical(F.softmax(actor_logits))
            act_logp = dist.log_prob(act)
        entropy = dist.entropy()
        
        return entropy, act_logp, value

In [4]:
class PPO():
    def __init__(self, gamma, lamb, eps_clip, K_epochs, \
                 observation_space, action_space, num_cells, \
                 actor_lr, critic_lr, memory_size , minibatch_size,\
                 max_training_iter, cal_total_loss, c1, c2, \
                    early_stop, kl_threshold, parameters_hardshare, device
                 ):
        self.gamma = gamma
        self.lamb = lamb
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.max_training_iter = max_training_iter

        self.n_observations = observation_space
        self.n_actions = action_space
        self.memory_size = memory_size
        self.minibatch_size = minibatch_size
        
        self.cal_total_loss = cal_total_loss
        self.c1 = c1
        self.c2 = c2
        self.early_stop = early_stop
        self.kl_threshold = kl_threshold

        self.parameters_hardshare = parameters_hardshare


        if isinstance(action_space, Box):
            self.continous_action = True
        elif isinstance(action_space, Discrete):
            self.continous_action = False
        else:
            raise AssertionError(f"action space is not valid {action_space}")


        self.observtion_dim = observation_space.shape[0]

        # self.actor = Actor_Net(self.observtion_dim, \
        #                        action_space.shape[0] if self.continous_action else action_space.n, \
        #                           num_cells, self.continous_action).to(device)
          
        # self.critic = Critic_Net(self.observtion_dim, num_cells).to(device)
        self.actor_critic = Actor_Critic_net(self.observtion_dim, \
                               action_space.shape[0] if self.continous_action else action_space.n, \
                                  num_cells, self.continous_action, parameters_hardshare).to(device)

        # self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        # self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)
        if parameters_hardshare:
            self.actor_critic_opt = torch.optim.Adam(self.actor_critic.parameters(), lr=actor_lr)
            
        else:
            self.actor_critic_opt = torch.optim.Adam([ 
                {'params': self.actor_critic.actor.parameters(), 'lr': actor_lr},
                {'params': self.actor_critic.critic.parameters(), 'lr': critic_lr} 
            ])


        self.memory = PPOBuffer(observation_space.shape, action_space.shape, memory_size, gamma, lamb)

        self.device = device
        
        # wandb.watch(self.actor, log='all', log_freq=100, idx=1)
        # wandb.watch(self.critic, log='all', log_freq=100, idx=2)
        wandb.watch(self.actor_critic, log='all', log_freq=100)

    def roll_out(self, env):
        # TODO: implement multiple thread
        # make a new environment instance
        

        # Maybe a deep copy is necessary for multi-thread processing
        obs, _ = env.reset()

        ep_reward = 0
        ep_count = 0

        action_shape = env.action_space.shape
        # Run the policy for T timestep
        for i in tnrange(self.memory_size, desc="roll_out", leave=False):

            obs_tensor = torch.tensor(obs, \
                                    dtype=torch.float32, device=self.device).unsqueeze(0)
            
            # action, action_logprob = self.actor.act(obs_tensor)

            # action = action.reshape(action_shape)
             
            # value = self.critic.forward(obs_tensor).item()
            

            action, action_logprob, value = self.actor_critic.act(obs_tensor)
            
            action = action.reshape(action_shape)

            next_obs, reward, terminated, truncated, _ = env.step(action)

            self.memory.push(obs, action, reward, value, action_logprob)

            obs = next_obs

            ep_reward += reward

            if terminated or truncated:
                if truncated:
                    # last_value = self.critic.forward(torch.tensor(next_obs, dtype=torch.float32, device=self.device)).item()
                    _, last_value = self.actor_critic.forward(torch.tensor(next_obs, dtype=torch.float32, device=self.device))
                    last_value = last_value.item()
                else:
                    last_value = 0

                
                self.memory.GAE_cal(last_value)
          
                obs,_ = env.reset()
                ep_count += 1
                wandb.log({'episode_reward':ep_reward})
                ep_reward = 0


    # def compute_loss(self, data):
    #     observations, actions, logp_old = data['obs'], data['act'], data['logp']
    #     advs, rtgs = data['adv'], data['rtg']

    #     # Calculate the pi_theta (a_t|s_t)
    #     entropy, logp = self.actor.logprob_ent_from_state_acton(observations, actions)
    #     ratio = torch.exp(logp - logp_old)
    #     # Kl approx according to http://joschu.net/blog/kl-approx.html
    #     kl_apx = ((ratio - 1) - (logp - logp_old)).mean()
    #     wandb.log({'KL approx': kl_apx})
    #     clip_advs = torch.clamp(ratio, 1-self.eps_clip, 1+self.eps_clip) * advs
    #     # Torch Adam implement tation mius the gradient, to plus the gradient, we need make the loss negative
    #     actor_loss = -(torch.min(ratio*advs, clip_advs)).mean()

    #     values = self.critic(observations).squeeze()
    #     critic_loss = nn.MSELoss()(values, rtgs)

    #     entropy_loss = entropy.mean()

    #     return actor_loss, critic_loss, entropy_loss, kl_apx

    def compute_loss(self, data):
        observations, actions, logp_old = data['obs'], data['act'], data['logp']
        advs, rtgs = data['adv'], data['rtg']

        # Calculate the pi_theta (a_t|s_t)
        entropy, logp, values = self.actor_critic.logprob_ent_from_state_acton(observations, actions)
        ratio = torch.exp(logp - logp_old)
        # Kl approx according to http://joschu.net/blog/kl-approx.html
        kl_apx = ((ratio - 1) - (logp - logp_old)).mean()
        wandb.log({'KL_approx': kl_apx})
        clip_advs = torch.clamp(ratio, 1-self.eps_clip, 1+self.eps_clip) * advs
        # Torch Adam implement tation mius the gradient, to plus the gradient, we need make the loss negative
        actor_loss = -(torch.min(ratio*advs, clip_advs)).mean()

        values = values.squeeze()
        critic_loss = nn.MSELoss()(values, rtgs)

        entropy_loss = entropy.mean()

        return actor_loss, critic_loss, entropy_loss, kl_apx        

    def optimise(self):

        data = self.memory.sample(self.minibatch_size, self.device)

        early_stop_count = 0
        
        # for _ in tnrange(self.K_epochs, desc=f"epochs", position=1, leave=False):
        for _ in range(self.K_epochs):
            
            for minibatch in data:
            
                actor_loss, critic_loss, entropy_loss, kl_apx = self.compute_loss(minibatch)

                if self.cal_total_loss:
                    total_loss = actor_loss + self.c1 * critic_loss - self.c2 * entropy_loss

                # If this update is too big, early stop and try next minibatch
                if self.early_stop and kl_apx > self.kl_threshold:
                    early_stop_count += 1
                    continue

                # self.actor_opt.zero_grad()
                # self.critic_opt.zero_grad()
                # if self.cal_total_loss:
                #     wandb.log({'total_loss': total_loss})
                #     total_loss.backward()
                #     self.actor_opt.step()
                #     self.critic_opt.step()
                # else:
                #     wandb.log({'actor_loss': actor_loss, 'critic_loss': critic_loss})
                #     actor_loss.backward()
                #     self.actor_opt.step()
                #     critic_loss.backward()
                #     self.critic_opt.step()
                
                self.actor_critic_opt.zero_grad()
                if self.cal_total_loss:
                    wandb.log({'total_loss': total_loss})
                    total_loss.backward()
                    self.actor_critic_opt.step()

                else:
                    wandb.log({'actor_loss': actor_loss, 'critic_loss': critic_loss})
                    actor_loss.backward()
                    critic_loss.backward()
                    self.actor_critic_opt.step()
            
        wandb.run.summary['early_stop_count'] = early_stop_count    

                
    def train(self, env):

        for i in tnrange(self.max_training_iter // self.memory_size):

            self.roll_out(env)

            self.optimise()

        # save the model to the wandb run folder
        PATH = os.path.join(wandb.run.dir, "actor_critic.pt")
        torch.save(self.actor_critic.state_dict(), PATH)

        
    
            

        


In [5]:
def main():



    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    run = wandb.init(
            # project='PPO'
        )


    gamma = 0.95
    # gamma = wandb.config.gamma
    lamb = 0.99
    # lamb = wandb.config.lam
    eps_clip = 0.2
    max_training_iter = 200000
    K_epochs = 40
    num_cells = 64
    actor_lr = 4e-4 
    # actor_lr = wandb.config.actor_lr
    critic_lr = 5e-3
    # critic_lr = wandb.config.critic_lr
    memory_size = 1024
    # minibatch_size = 256
    minibatch_size = wandb.config.minibatch_size
    
    # c1 = 0.5
    # c2 = 0.9
    kl_threshold = 0.013
    c1 = wandb.config.c1
    c2 = wandb.config.c2
    # kl_threshold = wandb.config.kl_threshold
    
    env_name = "Pendulum-v1" # CartPole-v1
    # parameters_hardshare = False
    # early_stop = True
    # cal_total_loss = True
    parameters_hardshare = wandb.config.parameters_hardshare
    early_stop = wandb.config.early_stop
    cal_total_loss = wandb.config.cal_total_loss

    wandb.config.update(
        {
            'actor_lr' : actor_lr,
            'critic_lr' : critic_lr,
            'gamma' : gamma,
            'lambda' : lamb,
            'eps_clip' : eps_clip,
            'max_training_iter' : max_training_iter,
            'k_epochs' : K_epochs,
            'hidden_cell_dim' : num_cells,
            'memory_size' : memory_size,
            'minibatch_size' : minibatch_size,
            'cal_total_loss' : cal_total_loss,
            'c1' : c1,
            'c2' : c2,
            'early_stop' : early_stop,
            'env_name': env_name,
            'kl_threshold' : kl_threshold

        }
    )

    # wandb.define_metric("episode_reward", summary="mean")
    wandb.define_metric("KL_approx", summary="mean")
        
           
    env = gym.make(env_name)

    my_ppo = PPO(gamma, lamb, eps_clip, K_epochs, env.observation_space, env.action_space, num_cells,\
                 actor_lr, critic_lr, memory_size, minibatch_size, max_training_iter, \
                 cal_total_loss, c1, c2, early_stop, kl_threshold, parameters_hardshare, device)
    
    my_ppo.train(env)

    env.close()
    run.finish()

In [6]:
# %%wandb
# main()

### Sweep for HalfCheetah
#### Continous action space

In [None]:
sweep_configuration = {
    'method': 'random',
    'metric':{'goal':'maximize', 'name':'episode_reward'},
    'parameters':
    {
        'early_stop': {'value': False},
        'cal_total_loss' : {'value' : True},
        'parameters_hardshare' : {'value' : False},
        'c1' : {'value': 0.5020639303776493},
        'c2' : {'value' : 0.910077248529638},
        # 'kl_threshold' : {'min': 0.01, 'max': 0.04},
        'minibatch_size' : {'values' : [128, 256, 512, 1024]}
    }
}
%env "WANDB_NOTEBOOK_NAME" "PPO_GYM"
sweep_id = wandb.sweep(sweep=sweep_configuration, project='PPO-Pendulum-2')
wandb.agent(sweep_id, function=main, count=4)

### Sweep configuration for Pendulum
#### Continous action space

In [7]:
sweep_configuration = {
    'method': 'grid',
    'metric':{'goal':'maximize', 'name':'episode_reward'},
    'parameters':
    {
        'early_stop': {'value': False},
        'cal_total_loss' : {'value' : True},
        'parameters_hardshare' : {'value' : False},
        'c1' : {'value': 0.5020639303776493},
        'c2' : {'value' : 0.910077248529638},
        # 'kl_threshold' : {'min': 0.01, 'max': 0.04},
        'minibatch_size' : {'values' : [128, 256, 512, 1024]}
    }
}
%env "WANDB_NOTEBOOK_NAME" "PPO_GYM"
sweep_id = wandb.sweep(sweep=sweep_configuration, project='PPO-Pendulum-2')
wandb.agent(sweep_id, function=main, count=4)

env: "WANDB_NOTEBOOK_NAME"="PPO_GYM"


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: 5au3cuvc
Sweep URL: https://wandb.ai/tcd-clearway/PPO-Pendulum-2/sweeps/5au3cuvc


[34m[1mwandb[0m: Agent Starting Run: zktfg548 with config:
[34m[1mwandb[0m: 	c1: 0.5020639303776493
[34m[1mwandb[0m: 	c2: 0.910077248529638
[34m[1mwandb[0m: 	cal_total_loss: True
[34m[1mwandb[0m: 	early_stop: False
[34m[1mwandb[0m: 	minibatch_size: 128
[34m[1mwandb[0m: 	parameters_hardshare: False
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mstan-wenlong-wang[0m ([33mtcd-clearway[0m). Use [1m`wandb login --relogin`[0m to force relogin




  0%|          | 0/195 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
KL_approx,▁▁▁▁▁▂▁▁▁▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▂█▂▁▁▁▁▂▁▁▁
episode_reward,▃▄▄▄▄▆▇▇▇▇▇▇█▇▇▆▇▇▆▇▇▇▁▇▇▇▆▇█▇▇▆▄▇▇▆▇▇▇█
total_loss,█▃▆▃▃▂▃▄▂▁▃▁▁▁▁▂▂▁▂▁▂▄▁▂▁▁▁▁▂▁▂▂▂▁▁▂▁▂▅▁

0,1
early_stop_count,0.0
episode_reward,-130.87765
total_loss,2.7076


[34m[1mwandb[0m: Agent Starting Run: 2i1pxj2d with config:
[34m[1mwandb[0m: 	c1: 0.5020639303776493
[34m[1mwandb[0m: 	c2: 0.910077248529638
[34m[1mwandb[0m: 	cal_total_loss: True
[34m[1mwandb[0m: 	early_stop: False
[34m[1mwandb[0m: 	minibatch_size: 256
[34m[1mwandb[0m: 	parameters_hardshare: False
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




  0%|          | 0/195 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

0,1
KL_approx,▁▁▁▂▂▂▂▃▂▁▂▁▂▁▂▁▂▄▄▁▁▁▂▁▂▂▂▂▂▂▂▅▂▁▃▄▂▂▄█
episode_reward,▂▁▃▂▄▅▅▅▇▇▇█▃▇█▇██▇▇▇▇▇███▇▇▇▇▇▇▇▇▇▇▇▇▆▇
total_loss,█▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
early_stop_count,0.0
episode_reward,-0.62468
total_loss,14.18579


[34m[1mwandb[0m: Agent Starting Run: qjkvcmdc with config:
[34m[1mwandb[0m: 	c1: 0.5020639303776493
[34m[1mwandb[0m: 	c2: 0.910077248529638
[34m[1mwandb[0m: 	cal_total_loss: True
[34m[1mwandb[0m: 	early_stop: False
[34m[1mwandb[0m: 	minibatch_size: 512
[34m[1mwandb[0m: 	parameters_hardshare: False
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666899498, max=1.0…



  0%|          | 0/195 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
KL_approx,▂▁▂▂▃▃▁▂▂▃▂▂▄▄▃▂▂▂▃▁▂▂▄▂▄▃▂▁▁▅▂▂▅▂▂▆▂█▂▃
episode_reward,▂▄▄▄▂▄▄▄▃▃▃▁▄▄▆▇▇▇▇▇██▇▃▆▆▇▇▇█▄▇▇▇▆█▇▇▇▇
total_loss,█▃▂▂▂▂▂▄▂▂▂▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
early_stop_count,0.0
episode_reward,-129.76039
total_loss,3.01893


[34m[1mwandb[0m: Agent Starting Run: 2mggyfu5 with config:
[34m[1mwandb[0m: 	c1: 0.5020639303776493
[34m[1mwandb[0m: 	c2: 0.910077248529638
[34m[1mwandb[0m: 	cal_total_loss: True
[34m[1mwandb[0m: 	early_stop: False
[34m[1mwandb[0m: 	minibatch_size: 1024
[34m[1mwandb[0m: 	parameters_hardshare: False
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




  0%|          | 0/195 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

roll_out:   0%|          | 0/1024 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.077 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.016737…

0,1
KL_approx,▂▂▁▃▃▃█▅▃▄▆▄▆▄▂▄▃▅▅▂▄█▃▂▇▃▅▂▃▂▃▂▃▁▅▅▇▄▄▂
episode_reward,▄▁▃▄▂▃▃▄▅▄▅▅▇▇▇▇▇▇▇▇▇▇▄█▇▇▇▇▅▇▆▇▂▇▇▇▇██▄
total_loss,█▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
early_stop_count,0.0
episode_reward,-120.12987
total_loss,25.297
