In [30]:
import mujoco
import gym
import tianshou as ts
import numpy as np
import torch as tc
import numpy as np
from torch import nn

from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger


In [37]:

if tc.cuda.is_available():
    # gpu_num = input('Enter GPU number: ')
    gpu_num = 0
    if gpu_num == 'all' or gpu_num == '':
        device = tc.device('cuda')
    else:
        device = tc.device(f'cuda:{gpu_num}')
    print('CUDA is available!')
    # SET CUDA_LAUNCH_BLOCKING enable
    # clean cuda memory
    tc.cuda.empty_cache()

    # clear cuda memory
    # tc.cuda.reset_max_memory_allocated()
elif tc.backends.mps.is_available():
    device = tc.device('mps')
    print('MPS is available!')
else:
    device = tc.device('cpu')
    print('CPU...')

MPS is available!


In [2]:
env = gym.make('custom_env:XiaoTian-v0')
obs = env.reset()

  logger.warn("A Box action space maximum and minimum values are equal.")


custom_env:XiaoTian-v0
custom_env:XiaoTian-v0

In [43]:
train_envs = ts.env.DummyVectorEnv(
    [lambda: gym.make('custom_env:XiaoTian-v0') for _ in range(100)])
test_envs = ts.env.DummyVectorEnv(
    [lambda: gym.make('custom_env:XiaoTian-v0') for _ in range(100)])


In [126]:
state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n

class BasePolicyNet(nn.Module):
    def __init__(self, state_shape, action_shape):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(np.prod(state_shape), 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, np.prod(action_shape)),
        )

    def forward(self, obs, state=None, info={}):
        if not isinstance(obs, tc.Tensor):
            obs = tc.tensor(obs, dtype=tc.float)
        batch = obs.shape[0]
        logits = self.model(obs.view(batch, -1))
        return logits, state

class saQsaNet(nn.Module):
    def __init__(self, state_shape, action_shape):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(np.prod(state_shape) + np.prod(action_shape), 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 1),
        )

    def forward(self, obs, act, state=None, info={}):#obs is observation, state is hidden state
        if not isinstance(obs, tc.Tensor):
            obs = tc.tensor(obs, dtype=tc.float)
        if not isinstance(act, tc.Tensor):
            act = tc.tensor(act, dtype=tc.float)
        batch = obs.shape[0]
        x = tc.cat([obs.view(batch, -1), act.view(batch, -1)], dim=1)
        q_value = self.model(x)
        return q_value

In [114]:

class SACNet(nn.Module):
    def __init__(self, state_shape, action_shape):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(np.prod(state_shape), 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, np.prod(action_shape)),
        )
        self.model2 = nn.Sequential(
            nn.Linear(np.prod(state_shape), 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, np.prod(action_shape)), nn.Sigmoid(),
        )

    def forward(self, obs, state=None, info={}):
        if not isinstance(obs, tc.Tensor):
            obs = tc.tensor(obs, dtype=tc.float)
        batch = obs.shape[0]
        logits = (self.model(obs.view(batch, -1)),
                  self.model2(obs.view(batch, -1)))

        return logits, state
SACNet


DDPG parameters
Parameters
actor (torch.nn.Module) – the actor network following the rules in BasePolicy. (s -> logits)

actor_optim (torch.optim.Optimizer) – the optimizer for actor network.

critic (torch.nn.Module) – the critic network. (s, a -> Q(s, a))

critic_optim (torch.optim.Optimizer) – the optimizer for critic network.

tau (float) – param for soft update of the target network. Default to 0.005.

gamma (float) – discount factor, in [0, 1]. Default to 0.99.

exploration_noise (BaseNoise) – the exploration noise, add to the action. Default to GaussianNoise(sigma=0.1).

reward_normalization (bool) – normalize the reward to Normal(0, 1), Default to False.

estimation_step (int) – the number of steps to look ahead. Default to 1.

action_scaling (bool) – whether to map actions from range [-1, 1] to range [action_spaces.low, action_spaces.high]. Default to True.

action_bound_method (str) – method to bound action to range [-1, 1], can be either “clip” (for simply clipping the action) or empty string for no bounding. Default to “clip”.

action_space (Optional[gym.Space]) – env’s action space, mandatory if you want to use option “action_scaling” or “action_bound_method”. Default to None.

lr_scheduler – a learning rate scheduler that adjusts the learning rate in optimizer in each policy.update(). Default to None (no lr_scheduler).

In [127]:
DDPGaNet = BasePolicyNet(state_shape, action_shape)
DDPGcNet = saQsaNet(state_shape, action_shape)
DDPGAoptim = tc.optim.Adam(DDPGaNet.parameters(), lr=1e-3)
DDPGCoptim = tc.optim.Adam(DDPGcNet.parameters(), lr=1e-3)
DDPGpolicy = ts.policy.DDPGPolicy(actor=DDPGaNet, actor_optim=DDPGAoptim, critic=DDPGcNet, critic_optim=DDPGCoptim, estimation_step=3)
DDPGtrain_collector = ts.data.Collector(
    DDPGpolicy, train_envs, ts.data.VectorReplayBuffer(20000, 100), exploration_noise=True)
DDPGtest_collector = ts.data.Collector(
    DDPGpolicy, test_envs, exploration_noise=True)


SAVPolicy
Parameters
actor (torch.nn.Module) – the actor network following the rules in BasePolicy. (s -> logits)

actor_optim (torch.optim.Optimizer) – the optimizer for actor network.

critic1 (torch.nn.Module) – the first critic network. (s, a -> Q(s, a))

critic1_optim (torch.optim.Optimizer) – the optimizer for the first critic network.

critic2 (torch.nn.Module) – the second critic network. (s, a -> Q(s, a))

critic2_optim (torch.optim.Optimizer) – the optimizer for the second critic network.

tau (float) – param for soft update of the target network. Default to 0.005.

gamma (float) – discount factor, in [0, 1]. Default to 0.99.

torch.Tensor, torch.optim.Optimizer) or float alpha ((float,) – entropy regularization coefficient. Default to 0.2. If a tuple (target_entropy, log_alpha, alpha_optim) is provided, then alpha is automatically tuned.

reward_normalization (bool) – normalize the reward to Normal(0, 1). Default to False.

exploration_noise (BaseNoise) – add a noise to action for exploration. Default to None. This is useful when solving hard-exploration problem.

deterministic_eval (bool) – whether to use deterministic action (mean of Gaussian policy) instead of stochastic action sampled by the policy. Default to True.

action_scaling (bool) – whether to map actions from range [-1, 1] to range [action_spaces.low, action_spaces.high]. Default to True.

action_bound_method (str) – method to bound action to range [-1, 1], can be either “clip” (for simply clipping the action) or empty string for no bounding. Default to “clip”.

action_space (Optional[gym.Space]) – env’s action space, mandatory if you want to use option “action_scaling” or “action_bound_method”. Default to None.

lr_scheduler – a learning rate scheduler that adjusts the learning rate in optimizer in each policy.update(). Default to None (no lr_scheduler).

In [115]:
SACaNet = SACNet(state_shape, action_shape)
SACc1Net = saQsaNet(state_shape, action_shape)
SACc2Net = saQsaNet(state_shape, action_shape)
SACaOptim = tc.optim.Adam(SACaNet.parameters(), lr=1e-3)
SACc1Optim = tc.optim.Adam(SACc1Net.parameters(), lr=1e-3)
SACc2Optim = tc.optim.Adam(SACc2Net.parameters(), lr=1e-3)
SACppolicy = ts.policy.SACPolicy(actor=SACaNet, actor_optim=SACaOptim, critic1=SACc1Net,
                                 critic1_optim=SACc1Optim, critic2=SACc2Net, critic2_optim=SACc2Optim, estimation_step=3)


SACtrain_collector = ts.data.Collector(
    SACppolicy, train_envs, ts.data.VectorReplayBuffer(20000, 100), exploration_noise=True)
SACtest_collector = ts.data.Collector(
    SACppolicy, test_envs, exploration_noise=True)


In [None]:
DDPGwriter = SummaryWriter('log/DDPGdqn')
DDPGlogger = TensorboardLogger(DDPGwriter)
result = ts.trainer.offpolicy_trainer(
    DDPGpolicy, DDPGtrain_collector, DDPGtest_collector,
    max_epoch=2000, step_per_epoch=100, step_per_collect=10,
    update_per_step=0.1, episode_per_test=15, batch_size=64,
    stop_fn=lambda mean_rewards: mean_rewards >= env.spec.reward_threshold,
    logger = DDPGlogger)
print(f'Finished training! Use {result["duration"]}')
tc.save(DDPGpolicy.state_dict(), 'tianshou_results/log/DDPGdqn.pth')
tc.load(DDPGpolicy.state_dict(), 'tianshou_results/log/DDPGdqn.pth')

Epoch #1: 101it [00:00, 988.38it/s, env_step=100, len=1, loss/actor=-1.434, loss/critic=4.410, n/ep=100, n/st=100, rew=0.45]                         


Epoch #1: test_reward: -0.002026 ± 0.206586, best_reward: 0.460380 ± 0.196909 in #0


Epoch #2: 101it [00:00, 954.51it/s, env_step=200, len=1, loss/actor=-1.933, loss/critic=3.947, n/ep=100, n/st=100, rew=-0.00]                         


Epoch #2: test_reward: 0.026730 ± 0.182932, best_reward: 0.460380 ± 0.196909 in #0


Epoch #3: 101it [00:00, 976.20it/s, env_step=300, len=1, loss/actor=-2.184, loss/critic=3.925, n/ep=100, n/st=100, rew=0.06]                         


Epoch #3: test_reward: -0.535785 ± 0.277432, best_reward: 0.460380 ± 0.196909 in #0


Epoch #4: 101it [00:00, 987.94it/s, env_step=400, len=1, loss/actor=-2.082, loss/critic=4.060, n/ep=100, n/st=100, rew=-0.51]                         


Epoch #4: test_reward: -0.366431 ± 0.256317, best_reward: 0.460380 ± 0.196909 in #0


Epoch #5: 101it [00:00, 964.97it/s, env_step=500, len=1, loss/actor=-2.151, loss/critic=3.544, n/ep=100, n/st=100, rew=-0.35]                         


Epoch #5: test_reward: -0.908908 ± 0.334102, best_reward: 0.460380 ± 0.196909 in #0


Epoch #6: 101it [00:00, 944.01it/s, env_step=600, len=1, loss/actor=-2.176, loss/critic=3.350, n/ep=100, n/st=100, rew=-0.89]                         


Epoch #6: test_reward: -0.685961 ± 0.354213, best_reward: 0.460380 ± 0.196909 in #0


Epoch #7: 101it [00:00, 939.40it/s, env_step=700, len=1, loss/actor=-2.137, loss/critic=3.007, n/ep=100, n/st=100, rew=-0.70]                         


Epoch #7: test_reward: -0.947898 ± 0.221752, best_reward: 0.460380 ± 0.196909 in #0


Epoch #8: 101it [00:00, 966.32it/s, env_step=800, len=1, loss/actor=-2.071, loss/critic=2.869, n/ep=100, n/st=100, rew=-1.09]                         


Epoch #8: test_reward: -1.030960 ± 0.290892, best_reward: 0.460380 ± 0.196909 in #0


Epoch #9: 101it [00:00, 945.47it/s, env_step=900, len=1, loss/actor=-2.041, loss/critic=2.674, n/ep=100, n/st=100, rew=-1.18]                         


Epoch #9: test_reward: -1.213694 ± 0.467378, best_reward: 0.460380 ± 0.196909 in #0


Epoch #10: 101it [00:00, 957.52it/s, env_step=1000, len=1, loss/actor=-1.949, loss/critic=2.508, n/ep=100, n/st=100, rew=-1.29]                         


Epoch #10: test_reward: -1.334788 ± 0.295295, best_reward: 0.460380 ± 0.196909 in #0


Epoch #11: 101it [00:00, 973.19it/s, env_step=1100, len=1, loss/actor=-1.944, loss/critic=2.239, n/ep=100, n/st=100, rew=-1.21]                         


Epoch #11: test_reward: -1.342734 ± 0.431586, best_reward: 0.460380 ± 0.196909 in #0


Epoch #12: 101it [00:00, 983.56it/s, env_step=1200, len=1, loss/actor=-1.842, loss/critic=1.988, n/ep=100, n/st=100, rew=-1.26]                         


Epoch #12: test_reward: -1.359042 ± 0.450050, best_reward: 0.460380 ± 0.196909 in #0


Epoch #13: 101it [00:00, 1002.13it/s, env_step=1300, len=1, loss/actor=-1.655, loss/critic=1.663, n/ep=100, n/st=100, rew=-1.30]                        


Epoch #13: test_reward: -1.396734 ± 0.361152, best_reward: 0.460380 ± 0.196909 in #0


Epoch #14: 101it [00:00, 995.83it/s, env_step=1400, len=1, loss/actor=-1.575, loss/critic=1.273, n/ep=100, n/st=100, rew=-1.37]                         


Epoch #14: test_reward: -1.476754 ± 0.456231, best_reward: 0.460380 ± 0.196909 in #0


Epoch #15: 101it [00:00, 719.73it/s, env_step=1500, len=1, loss/actor=-1.418, loss/critic=1.194, n/ep=100, n/st=100, rew=-1.44]                         


Epoch #15: test_reward: -1.191282 ± 0.377401, best_reward: 0.460380 ± 0.196909 in #0


Epoch #16: 101it [00:00, 988.96it/s, env_step=1600, len=1, loss/actor=-1.273, loss/critic=1.014, n/ep=100, n/st=100, rew=-1.40]                         


Epoch #16: test_reward: -1.294166 ± 0.444312, best_reward: 0.460380 ± 0.196909 in #0


Epoch #17: 101it [00:00, 1010.98it/s, env_step=1700, len=1, loss/actor=-1.175, loss/critic=0.996, n/ep=100, n/st=100, rew=-1.29]                         


Epoch #17: test_reward: -1.453735 ± 0.467283, best_reward: 0.460380 ± 0.196909 in #0


Epoch #18: 101it [00:00, 1004.93it/s, env_step=1800, len=1, loss/actor=-1.073, loss/critic=0.868, n/ep=100, n/st=100, rew=-1.20]                         


Epoch #18: test_reward: -1.188260 ± 0.457969, best_reward: 0.460380 ± 0.196909 in #0


Epoch #19: 101it [00:00, 1001.14it/s, env_step=1900, len=1, loss/actor=-0.951, loss/critic=0.828, n/ep=100, n/st=100, rew=-1.03]                        


Epoch #19: test_reward: -1.024670 ± 0.289765, best_reward: 0.460380 ± 0.196909 in #0


Epoch #20: 101it [00:00, 996.99it/s, env_step=2000, len=1, loss/actor=-0.891, loss/critic=0.768, n/ep=100, n/st=100, rew=-0.91]                         


Epoch #20: test_reward: -0.799134 ± 0.300218, best_reward: 0.460380 ± 0.196909 in #0


Epoch #21: 101it [00:00, 984.14it/s, env_step=2100, len=1, loss/actor=-0.797, loss/critic=0.635, n/ep=100, n/st=100, rew=-0.75]                         


Epoch #21: test_reward: -0.523631 ± 0.340103, best_reward: 0.460380 ± 0.196909 in #0


Epoch #22: 101it [00:00, 993.88it/s, env_step=2200, len=1, loss/actor=-0.704, loss/critic=0.597, n/ep=100, n/st=100, rew=-0.55]                         


Epoch #22: test_reward: -0.364668 ± 0.317003, best_reward: 0.460380 ± 0.196909 in #0


Epoch #23: 101it [00:00, 989.26it/s, env_step=2300, len=1, loss/actor=-0.664, loss/critic=0.595, n/ep=100, n/st=100, rew=-0.44]                         


Epoch #23: test_reward: -0.313744 ± 0.259535, best_reward: 0.460380 ± 0.196909 in #0


Epoch #24: 101it [00:00, 1010.21it/s, env_step=2400, len=1, loss/actor=-0.610, loss/critic=0.599, n/ep=100, n/st=100, rew=-0.31]                         


Epoch #24: test_reward: -0.269505 ± 0.316681, best_reward: 0.460380 ± 0.196909 in #0


Epoch #25: 101it [00:00, 987.75it/s, env_step=2500, len=1, loss/actor=-0.560, loss/critic=0.584, n/ep=100, n/st=100, rew=-0.18]                         


Epoch #25: test_reward: -0.106823 ± 0.197442, best_reward: 0.460380 ± 0.196909 in #0


Epoch #26: 101it [00:00, 990.21it/s, env_step=2600, len=1, loss/actor=-0.517, loss/critic=0.557, n/ep=100, n/st=100, rew=-0.06]                         


Epoch #26: test_reward: -0.032576 ± 0.270639, best_reward: 0.460380 ± 0.196909 in #0


Epoch #27: 101it [00:00, 982.75it/s, env_step=2700, len=1, loss/actor=-0.458, loss/critic=0.531, n/ep=100, n/st=100, rew=0.01]                         


Epoch #27: test_reward: 0.193841 ± 0.163697, best_reward: 0.460380 ± 0.196909 in #0


Epoch #28: 101it [00:00, 995.99it/s, env_step=2800, len=1, loss/actor=-0.427, loss/critic=0.514, n/ep=100, n/st=100, rew=0.17]                         


Epoch #28: test_reward: 0.215364 ± 0.189003, best_reward: 0.460380 ± 0.196909 in #0


Epoch #29: 101it [00:00, 988.58it/s, env_step=2900, len=1, loss/actor=-0.410, loss/critic=0.479, n/ep=100, n/st=100, rew=0.22]                         


Epoch #29: test_reward: 0.297664 ± 0.127638, best_reward: 0.460380 ± 0.196909 in #0


Epoch #30: 101it [00:00, 994.86it/s, env_step=3000, len=1, loss/actor=-0.395, loss/critic=0.463, n/ep=100, n/st=100, rew=0.25]                         


Epoch #30: test_reward: 0.415504 ± 0.179686, best_reward: 0.460380 ± 0.196909 in #0


Epoch #31: 101it [00:00, 987.95it/s, env_step=3100, len=1, loss/actor=-0.400, loss/critic=0.448, n/ep=100, n/st=100, rew=0.33]                         


Epoch #31: test_reward: 0.341001 ± 0.148430, best_reward: 0.460380 ± 0.196909 in #0


Epoch #32: 101it [00:00, 991.52it/s, env_step=3200, len=1, loss/actor=-0.391, loss/critic=0.446, n/ep=100, n/st=100, rew=0.33]                         


Epoch #32: test_reward: 0.210035 ± 0.242153, best_reward: 0.460380 ± 0.196909 in #0


Epoch #33: 101it [00:00, 977.54it/s, env_step=3300, len=1, loss/actor=-0.422, loss/critic=0.442, n/ep=100, n/st=100, rew=0.29]                         


Epoch #33: test_reward: 0.384298 ± 0.150891, best_reward: 0.460380 ± 0.196909 in #0


Epoch #34: 101it [00:00, 1000.60it/s, env_step=3400, len=1, loss/actor=-0.418, loss/critic=0.435, n/ep=100, n/st=100, rew=0.25]                        


Epoch #34: test_reward: -0.000754 ± 0.152615, best_reward: 0.460380 ± 0.196909 in #0


Epoch #35: 101it [00:00, 1005.12it/s, env_step=3500, len=1, loss/actor=-0.461, loss/critic=0.448, n/ep=100, n/st=100, rew=0.03]                         


Epoch #35: test_reward: -0.147510 ± 0.293751, best_reward: 0.460380 ± 0.196909 in #0


Epoch #36: 101it [00:00, 1002.45it/s, env_step=3600, len=1, loss/actor=-0.500, loss/critic=0.459, n/ep=100, n/st=100, rew=-0.16]                        


Epoch #36: test_reward: -0.552151 ± 0.291328, best_reward: 0.460380 ± 0.196909 in #0


Epoch #37: 101it [00:00, 1004.83it/s, env_step=3700, len=1, loss/actor=-0.551, loss/critic=0.444, n/ep=100, n/st=100, rew=-0.56]                         


Epoch #37: test_reward: -0.790597 ± 0.233262, best_reward: 0.460380 ± 0.196909 in #0


Epoch #38: 101it [00:00, 963.76it/s, env_step=3800, len=1, loss/actor=-0.605, loss/critic=0.424, n/ep=100, n/st=100, rew=-0.77]                         


Epoch #38: test_reward: -1.105157 ± 0.317170, best_reward: 0.460380 ± 0.196909 in #0


Epoch #39: 101it [00:00, 976.72it/s, env_step=3900, len=1, loss/actor=-0.646, loss/critic=0.442, n/ep=100, n/st=100, rew=-1.08]                         


Epoch #39: test_reward: -1.098142 ± 0.207656, best_reward: 0.460380 ± 0.196909 in #0


Epoch #40: 101it [00:00, 992.84it/s, env_step=4000, len=1, loss/actor=-0.700, loss/critic=0.463, n/ep=100, n/st=100, rew=-1.01]                         


Epoch #40: test_reward: -1.278566 ± 0.359806, best_reward: 0.460380 ± 0.196909 in #0


Epoch #41: 101it [00:00, 1004.55it/s, env_step=4100, len=1, loss/actor=-0.718, loss/critic=0.487, n/ep=100, n/st=100, rew=-1.18]                         


Epoch #41: test_reward: -1.244178 ± 0.401917, best_reward: 0.460380 ± 0.196909 in #0


Epoch #42: 101it [00:00, 993.96it/s, env_step=4200, len=1, loss/actor=-0.767, loss/critic=0.466, n/ep=100, n/st=100, rew=-1.24]                         


Epoch #42: test_reward: -1.711577 ± 0.388460, best_reward: 0.460380 ± 0.196909 in #0


Epoch #43: 101it [00:00, 965.63it/s, env_step=4300, len=1, loss/actor=-0.760, loss/critic=0.452, n/ep=100, n/st=100, rew=-1.59]                         


Epoch #43: test_reward: -1.613049 ± 0.379959, best_reward: 0.460380 ± 0.196909 in #0


Epoch #44: 101it [00:00, 989.89it/s, env_step=4400, len=1, loss/actor=-0.774, loss/critic=0.428, n/ep=100, n/st=100, rew=-1.65]                         


Epoch #44: test_reward: -1.971947 ± 0.398427, best_reward: 0.460380 ± 0.196909 in #0


Epoch #45: 101it [00:00, 957.62it/s, env_step=4500, len=1, loss/actor=-0.738, loss/critic=0.399, n/ep=100, n/st=100, rew=-1.89]                         


Epoch #45: test_reward: -1.652075 ± 0.362804, best_reward: 0.460380 ± 0.196909 in #0


Epoch #46: 101it [00:00, 997.48it/s, env_step=4600, len=1, loss/actor=-0.682, loss/critic=0.401, n/ep=100, n/st=100, rew=-1.58]                         


Epoch #46: test_reward: -1.296587 ± 0.271728, best_reward: 0.460380 ± 0.196909 in #0


Epoch #47: 101it [00:00, 985.66it/s, env_step=4700, len=1, loss/actor=-0.609, loss/critic=0.398, n/ep=100, n/st=100, rew=-1.46]                         


Epoch #47: test_reward: -1.194868 ± 0.499924, best_reward: 0.460380 ± 0.196909 in #0


Epoch #48: 101it [00:00, 1000.26it/s, env_step=4800, len=1, loss/actor=-0.520, loss/critic=0.403, n/ep=100, n/st=100, rew=-1.28]                        


Epoch #48: test_reward: -1.008909 ± 0.271828, best_reward: 0.460380 ± 0.196909 in #0


Epoch #49: 101it [00:00, 989.73it/s, env_step=4900, len=1, loss/actor=-0.411, loss/critic=0.397, n/ep=100, n/st=100, rew=-1.06]                         


Epoch #49: test_reward: -0.721998 ± 0.240332, best_reward: 0.460380 ± 0.196909 in #0


Epoch #50: 101it [00:00, 988.06it/s, env_step=5000, len=1, loss/actor=-0.299, loss/critic=0.378, n/ep=100, n/st=100, rew=-0.70]                         


Epoch #50: test_reward: -0.355660 ± 0.289518, best_reward: 0.460380 ± 0.196909 in #0


Epoch #51: 101it [00:00, 999.34it/s, env_step=5100, len=1, loss/actor=-0.213, loss/critic=0.367, n/ep=100, n/st=100, rew=-0.37]                         


Epoch #51: test_reward: -0.029411 ± 0.199411, best_reward: 0.460380 ± 0.196909 in #0


Epoch #52: 101it [00:00, 987.80it/s, env_step=5200, len=1, loss/actor=-0.096, loss/critic=0.351, n/ep=100, n/st=100, rew=-0.11]                         


Epoch #52: test_reward: 0.116083 ± 0.174099, best_reward: 0.460380 ± 0.196909 in #0


Epoch #53: 101it [00:00, 985.18it/s, env_step=5300, len=1, loss/actor=-0.019, loss/critic=0.330, n/ep=100, n/st=100, rew=0.12]                         


Epoch #53: test_reward: 0.321554 ± 0.187620, best_reward: 0.460380 ± 0.196909 in #0


Epoch #54: 101it [00:00, 1000.50it/s, env_step=5400, len=1, loss/actor=0.042, loss/critic=0.334, n/ep=100, n/st=100, rew=0.26]                        


Epoch #54: test_reward: 0.346989 ± 0.244997, best_reward: 0.460380 ± 0.196909 in #0


Epoch #55: 101it [00:00, 988.18it/s, env_step=5500, len=1, loss/actor=0.087, loss/critic=0.338, n/ep=100, n/st=100, rew=0.40]                         


Epoch #55: test_reward: 0.491057 ± 0.151428, best_reward: 0.491057 ± 0.151428 in #55


Epoch #56: 101it [00:00, 994.87it/s, env_step=5600, len=1, loss/actor=0.113, loss/critic=0.335, n/ep=100, n/st=100, rew=0.47]                         


Epoch #56: test_reward: 0.501086 ± 0.154794, best_reward: 0.501086 ± 0.154794 in #56


Epoch #57: 101it [00:00, 990.93it/s, env_step=5700, len=1, loss/actor=0.125, loss/critic=0.329, n/ep=100, n/st=100, rew=0.51]                         


Epoch #57: test_reward: 0.501986 ± 0.128752, best_reward: 0.501986 ± 0.128752 in #57


Epoch #58: 101it [00:00, 979.26it/s, env_step=5800, len=1, loss/actor=0.100, loss/critic=0.342, n/ep=100, n/st=100, rew=0.54]                         


Epoch #58: test_reward: 0.577484 ± 0.203080, best_reward: 0.577484 ± 0.203080 in #58


Epoch #59: 101it [00:00, 972.03it/s, env_step=5900, len=1, loss/actor=0.071, loss/critic=0.324, n/ep=100, n/st=100, rew=0.54]                         


Epoch #59: test_reward: 0.603214 ± 0.186733, best_reward: 0.603214 ± 0.186733 in #59


Epoch #60: 101it [00:00, 963.78it/s, env_step=6000, len=1, loss/actor=0.008, loss/critic=0.325, n/ep=100, n/st=100, rew=0.55]                         


Epoch #60: test_reward: 0.441345 ± 0.167256, best_reward: 0.603214 ± 0.186733 in #59


Epoch #61: 101it [00:00, 988.63it/s, env_step=6100, len=1, loss/actor=-0.051, loss/critic=0.323, n/ep=100, n/st=100, rew=0.50]                         


Epoch #61: test_reward: 0.381107 ± 0.205986, best_reward: 0.603214 ± 0.186733 in #59


Epoch #62: 101it [00:00, 992.24it/s, env_step=6200, len=1, loss/actor=-0.125, loss/critic=0.321, n/ep=100, n/st=100, rew=0.37]                         


Epoch #62: test_reward: 0.223596 ± 0.202905, best_reward: 0.603214 ± 0.186733 in #59


Epoch #63: 101it [00:00, 998.90it/s, env_step=6300, len=1, loss/actor=-0.191, loss/critic=0.319, n/ep=100, n/st=100, rew=0.29]                         


Epoch #63: test_reward: 0.038947 ± 0.237666, best_reward: 0.603214 ± 0.186733 in #59


Epoch #64: 101it [00:00, 1005.54it/s, env_step=6400, len=1, loss/actor=-0.253, loss/critic=0.306, n/ep=100, n/st=100, rew=0.03]                         


Epoch #64: test_reward: -0.211542 ± 0.255583, best_reward: 0.603214 ± 0.186733 in #59


Epoch #65: 101it [00:00, 993.63it/s, env_step=6500, len=1, loss/actor=-0.310, loss/critic=0.303, n/ep=100, n/st=100, rew=-0.17]                         


Epoch #65: test_reward: -0.468487 ± 0.292287, best_reward: 0.603214 ± 0.186733 in #59


Epoch #66: 101it [00:00, 983.28it/s, env_step=6600, len=1, loss/actor=-0.396, loss/critic=0.305, n/ep=100, n/st=100, rew=-0.42]                         


Epoch #66: test_reward: -0.740989 ± 0.369223, best_reward: 0.603214 ± 0.186733 in #59


Epoch #67: 101it [00:00, 982.41it/s, env_step=6700, len=1, loss/actor=-0.462, loss/critic=0.310, n/ep=100, n/st=100, rew=-0.76]                         


Epoch #67: test_reward: -1.011259 ± 0.298510, best_reward: 0.603214 ± 0.186733 in #59


Epoch #68: 101it [00:00, 959.62it/s, env_step=6800, len=1, loss/actor=-0.497, loss/critic=0.321, n/ep=100, n/st=100, rew=-1.07]                         


Epoch #68: test_reward: -1.333582 ± 0.363650, best_reward: 0.603214 ± 0.186733 in #59


Epoch #69: 101it [00:00, 953.79it/s, env_step=6900, len=1, loss/actor=-0.563, loss/critic=0.320, n/ep=100, n/st=100, rew=-1.21]                         


Epoch #69: test_reward: -1.131816 ± 0.340914, best_reward: 0.603214 ± 0.186733 in #59


Epoch #70: 101it [00:00, 959.42it/s, env_step=7000, len=1, loss/actor=-0.569, loss/critic=0.326, n/ep=100, n/st=100, rew=-1.29]                         


Epoch #70: test_reward: -1.320975 ± 0.373972, best_reward: 0.603214 ± 0.186733 in #59


Epoch #71: 101it [00:00, 999.49it/s, env_step=7100, len=1, loss/actor=-0.580, loss/critic=0.330, n/ep=100, n/st=100, rew=-1.24]                         


Epoch #71: test_reward: -1.238217 ± 0.216016, best_reward: 0.603214 ± 0.186733 in #59


Epoch #72: 101it [00:00, 995.09it/s, env_step=7200, len=1, loss/actor=-0.575, loss/critic=0.340, n/ep=100, n/st=100, rew=-1.17]                         


Epoch #72: test_reward: -1.012711 ± 0.278828, best_reward: 0.603214 ± 0.186733 in #59


Epoch #73: 101it [00:00, 1001.23it/s, env_step=7300, len=1, loss/actor=-0.560, loss/critic=0.352, n/ep=100, n/st=100, rew=-0.99]                        


Epoch #73: test_reward: -1.106085 ± 0.420863, best_reward: 0.603214 ± 0.186733 in #59


Epoch #74: 101it [00:00, 978.04it/s, env_step=7400, len=1, loss/actor=-0.535, loss/critic=0.366, n/ep=100, n/st=100, rew=-0.94]                         


Epoch #74: test_reward: -0.472354 ± 0.315668, best_reward: 0.603214 ± 0.186733 in #59


Epoch #75: 101it [00:00, 1001.42it/s, env_step=7500, len=1, loss/actor=-0.575, loss/critic=0.363, n/ep=100, n/st=100, rew=-0.49]                        


Epoch #75: test_reward: -0.249720 ± 0.290166, best_reward: 0.603214 ± 0.186733 in #59


Epoch #76: 101it [00:00, 982.26it/s, env_step=7600, len=1, loss/actor=-0.606, loss/critic=0.366, n/ep=100, n/st=100, rew=-0.30]                         


Epoch #76: test_reward: -0.031392 ± 0.189023, best_reward: 0.603214 ± 0.186733 in #59


Epoch #77: 101it [00:00, 977.96it/s, env_step=7700, len=1, loss/actor=-0.630, loss/critic=0.368, n/ep=100, n/st=100, rew=-0.02]                         


Epoch #77: test_reward: 0.152202 ± 0.186818, best_reward: 0.603214 ± 0.186733 in #59


Epoch #78: 101it [00:00, 959.88it/s, env_step=7800, len=1, loss/actor=-0.690, loss/critic=0.343, n/ep=100, n/st=100, rew=0.23]                         


Epoch #78: test_reward: 0.303790 ± 0.226558, best_reward: 0.603214 ± 0.186733 in #59


Epoch #79: 101it [00:00, 951.51it/s, env_step=7900, len=1, loss/actor=-0.719, loss/critic=0.344, n/ep=100, n/st=100, rew=0.25]                         


Epoch #79: test_reward: 0.306361 ± 0.195661, best_reward: 0.603214 ± 0.186733 in #59


Epoch #80: 101it [00:00, 963.62it/s, env_step=8000, len=1, loss/actor=-0.775, loss/critic=0.339, n/ep=100, n/st=100, rew=0.30]                         


Epoch #80: test_reward: 0.391458 ± 0.179108, best_reward: 0.603214 ± 0.186733 in #59


Epoch #81: 101it [00:00, 960.71it/s, env_step=8100, len=1, loss/actor=-0.838, loss/critic=0.326, n/ep=100, n/st=100, rew=0.37]                         


Epoch #81: test_reward: 0.387533 ± 0.162753, best_reward: 0.603214 ± 0.186733 in #59


Epoch #82: 101it [00:00, 969.33it/s, env_step=8200, len=1, loss/actor=-0.886, loss/critic=0.387, n/ep=100, n/st=100, rew=0.39]                         


Epoch #82: test_reward: 0.285686 ± 0.199131, best_reward: 0.603214 ± 0.186733 in #59


Epoch #83: 101it [00:00, 970.79it/s, env_step=8300, len=1, loss/actor=-0.954, loss/critic=0.494, n/ep=100, n/st=100, rew=0.43]                         


Epoch #83: test_reward: 0.505745 ± 0.172306, best_reward: 0.603214 ± 0.186733 in #59


Epoch #84: 101it [00:00, 990.91it/s, env_step=8400, len=1, loss/actor=-1.024, loss/critic=0.573, n/ep=100, n/st=100, rew=0.48]                         


Epoch #84: test_reward: 0.503369 ± 0.208150, best_reward: 0.603214 ± 0.186733 in #59


Epoch #85: 101it [00:00, 986.24it/s, env_step=8500, len=1, loss/actor=-1.019, loss/critic=0.655, n/ep=100, n/st=100, rew=0.54]                         


Epoch #85: test_reward: 0.608372 ± 0.183735, best_reward: 0.608372 ± 0.183735 in #85


Epoch #86: 101it [00:00, 991.12it/s, env_step=8600, len=1, loss/actor=-1.011, loss/critic=0.676, n/ep=100, n/st=100, rew=0.60]                         


Epoch #86: test_reward: 0.617226 ± 0.171614, best_reward: 0.617226 ± 0.171614 in #86


Epoch #87: 101it [00:00, 963.88it/s, env_step=8700, len=1, loss/actor=-1.018, loss/critic=0.692, n/ep=100, n/st=100, rew=0.60]                         


Epoch #87: test_reward: 0.689096 ± 0.145361, best_reward: 0.689096 ± 0.145361 in #87


Epoch #88: 101it [00:00, 960.82it/s, env_step=8800, len=1, loss/actor=-1.005, loss/critic=0.682, n/ep=100, n/st=100, rew=0.62]                         


Epoch #88: test_reward: 0.632051 ± 0.163816, best_reward: 0.689096 ± 0.145361 in #87


Epoch #89: 101it [00:00, 971.96it/s, env_step=8900, len=1, loss/actor=-0.993, loss/critic=0.670, n/ep=100, n/st=100, rew=0.64]                         


Epoch #89: test_reward: 0.599081 ± 0.096621, best_reward: 0.689096 ± 0.145361 in #87


Epoch #90: 101it [00:00, 953.28it/s, env_step=9000, len=1, loss/actor=-0.988, loss/critic=0.662, n/ep=100, n/st=100, rew=0.63]                         


Epoch #90: test_reward: 0.648117 ± 0.129894, best_reward: 0.689096 ± 0.145361 in #87


Epoch #91: 101it [00:00, 975.10it/s, env_step=9100, len=1, loss/actor=-0.966, loss/critic=0.678, n/ep=100, n/st=100, rew=0.69]                         


Epoch #91: test_reward: 0.714706 ± 0.181754, best_reward: 0.714706 ± 0.181754 in #91


Epoch #92: 101it [00:00, 996.50it/s, env_step=9200, len=1, loss/actor=-0.985, loss/critic=0.637, n/ep=100, n/st=100, rew=0.66]                         


Epoch #92: test_reward: 0.652926 ± 0.165330, best_reward: 0.714706 ± 0.181754 in #91


Epoch #93: 101it [00:00, 991.28it/s, env_step=9300, len=1, loss/actor=-0.966, loss/critic=0.521, n/ep=100, n/st=100, rew=0.66]                         


Epoch #93: test_reward: 0.636230 ± 0.192361, best_reward: 0.714706 ± 0.181754 in #91


Epoch #94: 101it [00:00, 996.53it/s, env_step=9400, len=1, loss/actor=-0.978, loss/critic=0.431, n/ep=100, n/st=100, rew=0.66]                         


Epoch #94: test_reward: 0.621566 ± 0.188633, best_reward: 0.714706 ± 0.181754 in #91


Epoch #95: 101it [00:00, 971.43it/s, env_step=9500, len=1, loss/actor=-0.984, loss/critic=0.328, n/ep=100, n/st=100, rew=0.64]                         


Epoch #95: test_reward: 0.599107 ± 0.141747, best_reward: 0.714706 ± 0.181754 in #91


Epoch #96: 101it [00:00, 954.45it/s, env_step=9600, len=1, loss/actor=-0.980, loss/critic=0.288, n/ep=100, n/st=100, rew=0.60]                         


Epoch #96: test_reward: 0.650068 ± 0.143132, best_reward: 0.714706 ± 0.181754 in #91


Epoch #97: 101it [00:00, 962.32it/s, env_step=9700, len=1, loss/actor=-0.993, loss/critic=0.261, n/ep=100, n/st=100, rew=0.60]                         


Epoch #97: test_reward: 0.602095 ± 0.168527, best_reward: 0.714706 ± 0.181754 in #91


Epoch #98: 101it [00:00, 924.02it/s, env_step=9800, len=1, loss/actor=-0.990, loss/critic=0.268, n/ep=100, n/st=100, rew=0.56]                         


Epoch #98: test_reward: 0.590302 ± 0.111938, best_reward: 0.714706 ± 0.181754 in #91


Epoch #99: 101it [00:00, 949.31it/s, env_step=9900, len=1, loss/actor=-1.021, loss/critic=0.272, n/ep=100, n/st=100, rew=0.49]                         


Epoch #99: test_reward: 0.517120 ± 0.131176, best_reward: 0.714706 ± 0.181754 in #91


Epoch #100: 101it [00:00, 936.34it/s, env_step=10000, len=1, loss/actor=-1.020, loss/critic=0.265, n/ep=100, n/st=100, rew=0.48]                         


Epoch #100: test_reward: 0.412826 ± 0.155879, best_reward: 0.714706 ± 0.181754 in #91


Epoch #101: 101it [00:00, 931.18it/s, env_step=10100, len=1, loss/actor=-1.039, loss/critic=0.250, n/ep=100, n/st=100, rew=0.44]                         


Epoch #101: test_reward: 0.369665 ± 0.161102, best_reward: 0.714706 ± 0.181754 in #91


Epoch #102: 101it [00:00, 990.45it/s, env_step=10200, len=1, loss/actor=-1.039, loss/critic=0.221, n/ep=100, n/st=100, rew=0.38]                         


Epoch #102: test_reward: 0.316748 ± 0.157905, best_reward: 0.714706 ± 0.181754 in #91


Epoch #103: 101it [00:00, 980.63it/s, env_step=10300, len=1, loss/actor=-1.053, loss/critic=0.217, n/ep=100, n/st=100, rew=0.28]                         


Epoch #103: test_reward: 0.191405 ± 0.251985, best_reward: 0.714706 ± 0.181754 in #91


Epoch #104: 101it [00:00, 978.14it/s, env_step=10400, len=1, loss/actor=-1.054, loss/critic=0.217, n/ep=100, n/st=100, rew=0.22]                         


Epoch #104: test_reward: 0.139054 ± 0.228153, best_reward: 0.714706 ± 0.181754 in #91


Epoch #105: 101it [00:00, 975.35it/s, env_step=10500, len=1, loss/actor=-1.069, loss/critic=0.223, n/ep=100, n/st=100, rew=0.11]                         


Epoch #105: test_reward: -0.078585 ± 0.180972, best_reward: 0.714706 ± 0.181754 in #91


Epoch #106: 101it [00:00, 960.85it/s, env_step=10600, len=1, loss/actor=-1.068, loss/critic=0.216, n/ep=100, n/st=100, rew=-0.04]                         


Epoch #106: test_reward: -0.094019 ± 0.180414, best_reward: 0.714706 ± 0.181754 in #91


Epoch #107: 101it [00:00, 963.24it/s, env_step=10700, len=1, loss/actor=-1.059, loss/critic=0.224, n/ep=100, n/st=100, rew=-0.20]                         


Epoch #107: test_reward: -0.300244 ± 0.217241, best_reward: 0.714706 ± 0.181754 in #91


Epoch #108: 101it [00:00, 951.83it/s, env_step=10800, len=1, loss/actor=-1.072, loss/critic=0.221, n/ep=100, n/st=100, rew=-0.34]                         


Epoch #108: test_reward: -0.418596 ± 0.314055, best_reward: 0.714706 ± 0.181754 in #91


Epoch #109: 101it [00:00, 963.46it/s, env_step=10900, len=1, loss/actor=-1.048, loss/critic=0.247, n/ep=100, n/st=100, rew=-0.48]                         


Epoch #109: test_reward: -0.597215 ± 0.233933, best_reward: 0.714706 ± 0.181754 in #91


Epoch #110: 101it [00:00, 969.39it/s, env_step=11000, len=1, loss/actor=-1.054, loss/critic=0.258, n/ep=100, n/st=100, rew=-0.56]                         


Epoch #110: test_reward: -0.545856 ± 0.258383, best_reward: 0.714706 ± 0.181754 in #91


Epoch #111: 101it [00:00, 939.15it/s, env_step=11100, len=1, loss/actor=-1.021, loss/critic=0.253, n/ep=100, n/st=100, rew=-0.59]                         


Epoch #111: test_reward: -0.592571 ± 0.282938, best_reward: 0.714706 ± 0.181754 in #91


Epoch #112: 101it [00:00, 1015.44it/s, env_step=11200, len=1, loss/actor=-1.008, loss/critic=0.255, n/ep=100, n/st=100, rew=-0.53]                         


Epoch #112: test_reward: -0.310445 ± 0.256278, best_reward: 0.714706 ± 0.181754 in #91


Epoch #113: 101it [00:00, 995.33it/s, env_step=11300, len=1, loss/actor=-0.993, loss/critic=0.262, n/ep=100, n/st=100, rew=-0.54]                         


Epoch #113: test_reward: -0.354733 ± 0.229628, best_reward: 0.714706 ± 0.181754 in #91


Epoch #114: 101it [00:00, 1003.28it/s, env_step=11400, len=1, loss/actor=-0.962, loss/critic=0.259, n/ep=100, n/st=100, rew=-0.41]                        


Epoch #114: test_reward: -0.178499 ± 0.293491, best_reward: 0.714706 ± 0.181754 in #91


Epoch #115: 101it [00:00, 1006.51it/s, env_step=11500, len=1, loss/actor=-0.935, loss/critic=0.255, n/ep=100, n/st=100, rew=-0.20]                         


Epoch #115: test_reward: -0.024489 ± 0.249632, best_reward: 0.714706 ± 0.181754 in #91


Epoch #116: 101it [00:00, 1015.03it/s, env_step=11600, len=1, loss/actor=-0.918, loss/critic=0.259, n/ep=100, n/st=100, rew=-0.02]                         


Epoch #116: test_reward: 0.062729 ± 0.265335, best_reward: 0.714706 ± 0.181754 in #91


Epoch #117: 101it [00:00, 997.48it/s, env_step=11700, len=1, loss/actor=-0.891, loss/critic=0.255, n/ep=100, n/st=100, rew=0.08]                         


Epoch #117: test_reward: 0.186434 ± 0.159862, best_reward: 0.714706 ± 0.181754 in #91


Epoch #118: 101it [00:00, 998.38it/s, env_step=11800, len=1, loss/actor=-0.863, loss/critic=0.248, n/ep=100, n/st=100, rew=0.25]                         


Epoch #118: test_reward: 0.360879 ± 0.159865, best_reward: 0.714706 ± 0.181754 in #91


Epoch #119: 101it [00:00, 993.83it/s, env_step=11900, len=1, loss/actor=-0.846, loss/critic=0.220, n/ep=100, n/st=100, rew=0.35]                         


Epoch #119: test_reward: 0.471103 ± 0.149238, best_reward: 0.714706 ± 0.181754 in #91


Epoch #120: 101it [00:00, 980.35it/s, env_step=12000, len=1, loss/actor=-0.821, loss/critic=0.225, n/ep=100, n/st=100, rew=0.48]                         


Epoch #120: test_reward: 0.581712 ± 0.209635, best_reward: 0.714706 ± 0.181754 in #91


Epoch #121: 101it [00:00, 1004.33it/s, env_step=12100, len=1, loss/actor=-0.823, loss/critic=0.223, n/ep=100, n/st=100, rew=0.59]                         


Epoch #121: test_reward: 0.625765 ± 0.178967, best_reward: 0.714706 ± 0.181754 in #91


Epoch #122: 101it [00:00, 976.01it/s, env_step=12200, len=1, loss/actor=-0.806, loss/critic=0.218, n/ep=100, n/st=100, rew=0.63]                         


Epoch #122: test_reward: 0.652673 ± 0.154142, best_reward: 0.714706 ± 0.181754 in #91


Epoch #123: 101it [00:00, 987.79it/s, env_step=12300, len=1, loss/actor=-0.800, loss/critic=0.213, n/ep=100, n/st=100, rew=0.66]                         


Epoch #123: test_reward: 0.724343 ± 0.165257, best_reward: 0.724343 ± 0.165257 in #123


Epoch #124: 101it [00:00, 975.99it/s, env_step=12400, len=1, loss/actor=-0.800, loss/critic=0.204, n/ep=100, n/st=100, rew=0.68]                         


Epoch #124: test_reward: 0.662827 ± 0.173601, best_reward: 0.724343 ± 0.165257 in #123


Epoch #125: 101it [00:00, 1000.62it/s, env_step=12500, len=1, loss/actor=-0.806, loss/critic=0.207, n/ep=100, n/st=100, rew=0.73]                        


Epoch #125: test_reward: 0.764704 ± 0.095331, best_reward: 0.764704 ± 0.095331 in #125


Epoch #126: 101it [00:00, 991.44it/s, env_step=12600, len=1, loss/actor=-0.800, loss/critic=0.211, n/ep=100, n/st=100, rew=0.69]                         


Epoch #126: test_reward: 0.708365 ± 0.153408, best_reward: 0.764704 ± 0.095331 in #125


Epoch #127: 101it [00:00, 992.38it/s, env_step=12700, len=1, loss/actor=-0.818, loss/critic=0.195, n/ep=100, n/st=100, rew=0.70]                         


Epoch #127: test_reward: 0.683834 ± 0.135358, best_reward: 0.764704 ± 0.095331 in #125


Epoch #128: 101it [00:00, 982.43it/s, env_step=12800, len=1, loss/actor=-0.819, loss/critic=0.201, n/ep=100, n/st=100, rew=0.69]                         


Epoch #128: test_reward: 0.709430 ± 0.147104, best_reward: 0.764704 ± 0.095331 in #125


Epoch #129: 101it [00:00, 963.28it/s, env_step=12900, len=1, loss/actor=-0.835, loss/critic=0.201, n/ep=100, n/st=100, rew=0.69]                         


Epoch #129: test_reward: 0.630836 ± 0.190494, best_reward: 0.764704 ± 0.095331 in #125


Epoch #130: 101it [00:00, 963.88it/s, env_step=13000, len=1, loss/actor=-0.849, loss/critic=0.182, n/ep=100, n/st=100, rew=0.65]                         


Epoch #130: test_reward: 0.655977 ± 0.158484, best_reward: 0.764704 ± 0.095331 in #125


Epoch #131: 101it [00:00, 971.93it/s, env_step=13100, len=1, loss/actor=-0.861, loss/critic=0.182, n/ep=100, n/st=100, rew=0.62]                         


Epoch #131: test_reward: 0.532221 ± 0.160990, best_reward: 0.764704 ± 0.095331 in #125


Epoch #132: 101it [00:00, 949.81it/s, env_step=13200, len=1, loss/actor=-0.878, loss/critic=0.222, n/ep=100, n/st=100, rew=0.60]                         


Epoch #132: test_reward: 0.547227 ± 0.172578, best_reward: 0.764704 ± 0.095331 in #125


Epoch #133: 101it [00:00, 975.91it/s, env_step=13300, len=1, loss/actor=-0.889, loss/critic=0.270, n/ep=100, n/st=100, rew=0.60]                         


Epoch #133: test_reward: 0.516806 ± 0.209930, best_reward: 0.764704 ± 0.095331 in #125


Epoch #134: 101it [00:00, 972.66it/s, env_step=13400, len=1, loss/actor=-0.897, loss/critic=0.295, n/ep=100, n/st=100, rew=0.56]                         


Epoch #134: test_reward: 0.552357 ± 0.127236, best_reward: 0.764704 ± 0.095331 in #125


Epoch #135: 101it [00:00, 962.15it/s, env_step=13500, len=1, loss/actor=-0.900, loss/critic=0.289, n/ep=100, n/st=100, rew=0.64]                         


Epoch #135: test_reward: 0.582801 ± 0.179894, best_reward: 0.764704 ± 0.095331 in #125


Epoch #136: 101it [00:00, 988.01it/s, env_step=13600, len=1, loss/actor=-0.912, loss/critic=0.290, n/ep=100, n/st=100, rew=0.55]                         


Epoch #136: test_reward: 0.629397 ± 0.118638, best_reward: 0.764704 ± 0.095331 in #125


Epoch #137: 101it [00:00, 987.87it/s, env_step=13700, len=1, loss/actor=-0.917, loss/critic=0.290, n/ep=100, n/st=100, rew=0.54]                         


Epoch #137: test_reward: 0.436809 ± 0.168061, best_reward: 0.764704 ± 0.095331 in #125


Epoch #138: 101it [00:00, 973.36it/s, env_step=13800, len=1, loss/actor=-0.931, loss/critic=0.280, n/ep=100, n/st=100, rew=0.48]                         


Epoch #138: test_reward: 0.557370 ± 0.122010, best_reward: 0.764704 ± 0.095331 in #125


Epoch #139: 101it [00:00, 998.49it/s, env_step=13900, len=1, loss/actor=-0.928, loss/critic=0.295, n/ep=100, n/st=100, rew=0.52]                         


Epoch #139: test_reward: 0.556073 ± 0.233309, best_reward: 0.764704 ± 0.095331 in #125


Epoch #140: 101it [00:00, 967.05it/s, env_step=14000, len=1, loss/actor=-0.935, loss/critic=0.293, n/ep=100, n/st=100, rew=0.55]                         


Epoch #140: test_reward: 0.560259 ± 0.180079, best_reward: 0.764704 ± 0.095331 in #125


Epoch #141: 101it [00:00, 987.20it/s, env_step=14100, len=1, loss/actor=-0.930, loss/critic=0.287, n/ep=100, n/st=100, rew=0.53]                         


Epoch #141: test_reward: 0.575407 ± 0.127238, best_reward: 0.764704 ± 0.095331 in #125


Epoch #142: 101it [00:00, 966.78it/s, env_step=14200, len=1, loss/actor=-0.932, loss/critic=0.240, n/ep=100, n/st=100, rew=0.54]                         


Epoch #142: test_reward: 0.500225 ± 0.168021, best_reward: 0.764704 ± 0.095331 in #125


Epoch #143: 101it [00:00, 991.36it/s, env_step=14300, len=1, loss/actor=-0.932, loss/critic=0.181, n/ep=100, n/st=100, rew=0.51]                         


Epoch #143: test_reward: 0.499905 ± 0.150573, best_reward: 0.764704 ± 0.095331 in #125


Epoch #144: 101it [00:00, 981.39it/s, env_step=14400, len=1, loss/actor=-0.938, loss/critic=0.159, n/ep=100, n/st=100, rew=0.55]                         


Epoch #144: test_reward: 0.538202 ± 0.125187, best_reward: 0.764704 ± 0.095331 in #125


Epoch #145: 101it [00:00, 1016.12it/s, env_step=14500, len=1, loss/actor=-0.941, loss/critic=0.154, n/ep=100, n/st=100, rew=0.51]                         


Epoch #145: test_reward: 0.533016 ± 0.088141, best_reward: 0.764704 ± 0.095331 in #125


Epoch #146: 101it [00:00, 973.01it/s, env_step=14600, len=1, loss/actor=-0.939, loss/critic=0.133, n/ep=100, n/st=100, rew=0.52]                         


Epoch #146: test_reward: 0.524381 ± 0.222424, best_reward: 0.764704 ± 0.095331 in #125


Epoch #147: 101it [00:00, 986.19it/s, env_step=14700, len=1, loss/actor=-0.940, loss/critic=0.132, n/ep=100, n/st=100, rew=0.55]                         


Epoch #147: test_reward: 0.522860 ± 0.107268, best_reward: 0.764704 ± 0.095331 in #125


Epoch #148: 101it [00:00, 963.14it/s, env_step=14800, len=1, loss/actor=-0.935, loss/critic=0.142, n/ep=100, n/st=100, rew=0.55]                         


Epoch #148: test_reward: 0.435009 ± 0.126250, best_reward: 0.764704 ± 0.095331 in #125


Epoch #149: 101it [00:00, 969.90it/s, env_step=14900, len=1, loss/actor=-0.940, loss/critic=0.115, n/ep=100, n/st=100, rew=0.56]                         


Epoch #149: test_reward: 0.541991 ± 0.245084, best_reward: 0.764704 ± 0.095331 in #125


Epoch #150: 101it [00:00, 958.16it/s, env_step=15000, len=1, loss/actor=-0.933, loss/critic=0.114, n/ep=100, n/st=100, rew=0.60]                         


Epoch #150: test_reward: 0.519345 ± 0.197355, best_reward: 0.764704 ± 0.095331 in #125


Epoch #151: 101it [00:00, 988.29it/s, env_step=15100, len=1, loss/actor=-0.935, loss/critic=0.117, n/ep=100, n/st=100, rew=0.58]                         


Epoch #151: test_reward: 0.570475 ± 0.101339, best_reward: 0.764704 ± 0.095331 in #125


Epoch #152: 101it [00:00, 992.80it/s, env_step=15200, len=1, loss/actor=-0.932, loss/critic=0.120, n/ep=100, n/st=100, rew=0.58]                         


Epoch #152: test_reward: 0.578687 ± 0.193101, best_reward: 0.764704 ± 0.095331 in #125


Epoch #153: 101it [00:00, 980.80it/s, env_step=15300, len=1, loss/actor=-0.932, loss/critic=0.125, n/ep=100, n/st=100, rew=0.59]                         


Epoch #153: test_reward: 0.408584 ± 0.192934, best_reward: 0.764704 ± 0.095331 in #125


Epoch #154: 101it [00:00, 967.84it/s, env_step=15400, len=1, loss/actor=-0.922, loss/critic=0.125, n/ep=100, n/st=100, rew=0.57]                         


Epoch #154: test_reward: 0.531526 ± 0.170880, best_reward: 0.764704 ± 0.095331 in #125


Epoch #155: 101it [00:00, 987.23it/s, env_step=15500, len=1, loss/actor=-0.918, loss/critic=0.145, n/ep=100, n/st=100, rew=0.61]                         


Epoch #155: test_reward: 0.552274 ± 0.171676, best_reward: 0.764704 ± 0.095331 in #125


Epoch #156: 101it [00:00, 983.44it/s, env_step=15600, len=1, loss/actor=-0.919, loss/critic=0.152, n/ep=100, n/st=100, rew=0.61]                         


Epoch #156: test_reward: 0.681102 ± 0.088818, best_reward: 0.764704 ± 0.095331 in #125


Epoch #157: 101it [00:00, 1002.21it/s, env_step=15700, len=1, loss/actor=-0.910, loss/critic=0.154, n/ep=100, n/st=100, rew=0.61]                        


Epoch #157: test_reward: 0.565346 ± 0.137325, best_reward: 0.764704 ± 0.095331 in #125


Epoch #158: 101it [00:00, 984.43it/s, env_step=15800, len=1, loss/actor=-0.911, loss/critic=0.149, n/ep=100, n/st=100, rew=0.63]                         


Epoch #158: test_reward: 0.678571 ± 0.136873, best_reward: 0.764704 ± 0.095331 in #125


Epoch #159: 101it [00:00, 966.19it/s, env_step=15900, len=1, loss/actor=-0.907, loss/critic=0.152, n/ep=100, n/st=100, rew=0.64]                         


Epoch #159: test_reward: 0.684030 ± 0.097034, best_reward: 0.764704 ± 0.095331 in #125


Epoch #160: 101it [00:00, 995.30it/s, env_step=16000, len=1, loss/actor=-0.904, loss/critic=0.156, n/ep=100, n/st=100, rew=0.63]                         


Epoch #160: test_reward: 0.665927 ± 0.082425, best_reward: 0.764704 ± 0.095331 in #125


Epoch #161: 101it [00:00, 964.33it/s, env_step=16100, len=1, loss/actor=-0.902, loss/critic=0.160, n/ep=100, n/st=100, rew=0.65]                         


Epoch #161: test_reward: 0.688721 ± 0.084914, best_reward: 0.764704 ± 0.095331 in #125


Epoch #162: 101it [00:00, 986.41it/s, env_step=16200, len=1, loss/actor=-0.891, loss/critic=0.167, n/ep=100, n/st=100, rew=0.67]                         


Epoch #162: test_reward: 0.614191 ± 0.138686, best_reward: 0.764704 ± 0.095331 in #125


Epoch #163: 101it [00:00, 995.43it/s, env_step=16300, len=1, loss/actor=-0.884, loss/critic=0.166, n/ep=100, n/st=100, rew=0.68]                         


Epoch #163: test_reward: 0.722018 ± 0.096762, best_reward: 0.764704 ± 0.095331 in #125


Epoch #164: 101it [00:00, 1001.75it/s, env_step=16400, len=1, loss/actor=-0.887, loss/critic=0.167, n/ep=100, n/st=100, rew=0.68]                        


Epoch #164: test_reward: 0.659564 ± 0.152709, best_reward: 0.764704 ± 0.095331 in #125


Epoch #165: 101it [00:00, 983.13it/s, env_step=16500, len=1, loss/actor=-0.879, loss/critic=0.151, n/ep=100, n/st=100, rew=0.68]                         


Epoch #165: test_reward: 0.632395 ± 0.150780, best_reward: 0.764704 ± 0.095331 in #125


Epoch #166: 101it [00:00, 981.05it/s, env_step=16600, len=1, loss/actor=-0.881, loss/critic=0.156, n/ep=100, n/st=100, rew=0.69]                         


Epoch #166: test_reward: 0.698046 ± 0.161910, best_reward: 0.764704 ± 0.095331 in #125


Epoch #167: 101it [00:00, 988.16it/s, env_step=16700, len=1, loss/actor=-0.872, loss/critic=0.163, n/ep=100, n/st=100, rew=0.68]                         


Epoch #167: test_reward: 0.739479 ± 0.139002, best_reward: 0.764704 ± 0.095331 in #125


Epoch #168: 101it [00:00, 961.75it/s, env_step=16800, len=1, loss/actor=-0.872, loss/critic=0.167, n/ep=100, n/st=100, rew=0.71]                         


Epoch #168: test_reward: 0.743908 ± 0.124262, best_reward: 0.764704 ± 0.095331 in #125


Epoch #169: 101it [00:00, 993.17it/s, env_step=16900, len=1, loss/actor=-0.862, loss/critic=0.179, n/ep=100, n/st=100, rew=0.73]                         


Epoch #169: test_reward: 0.639972 ± 0.135606, best_reward: 0.764704 ± 0.095331 in #125


Epoch #170: 101it [00:00, 962.81it/s, env_step=17000, len=1, loss/actor=-0.865, loss/critic=0.187, n/ep=100, n/st=100, rew=0.68]                         


Epoch #170: test_reward: 0.644077 ± 0.158599, best_reward: 0.764704 ± 0.095331 in #125


Epoch #171: 101it [00:00, 964.34it/s, env_step=17100, len=1, loss/actor=-0.853, loss/critic=0.178, n/ep=100, n/st=100, rew=0.69]                         


Epoch #171: test_reward: 0.747305 ± 0.164331, best_reward: 0.764704 ± 0.095331 in #125


Epoch #172: 101it [00:00, 955.94it/s, env_step=17200, len=1, loss/actor=-0.871, loss/critic=0.176, n/ep=100, n/st=100, rew=0.70]                         


Epoch #172: test_reward: 0.690006 ± 0.129376, best_reward: 0.764704 ± 0.095331 in #125


Epoch #173: 101it [00:00, 980.76it/s, env_step=17300, len=1, loss/actor=-0.857, loss/critic=0.181, n/ep=100, n/st=100, rew=0.70]                         


Epoch #173: test_reward: 0.732766 ± 0.110018, best_reward: 0.764704 ± 0.095331 in #125


Epoch #174: 101it [00:00, 988.99it/s, env_step=17400, len=1, loss/actor=-0.858, loss/critic=0.194, n/ep=100, n/st=100, rew=0.75]                         


Epoch #174: test_reward: 0.731443 ± 0.103633, best_reward: 0.764704 ± 0.095331 in #125


Epoch #175: 101it [00:00, 982.05it/s, env_step=17500, len=1, loss/actor=-0.856, loss/critic=0.192, n/ep=100, n/st=100, rew=0.69]                         


Epoch #175: test_reward: 0.724456 ± 0.149011, best_reward: 0.764704 ± 0.095331 in #125


Epoch #176: 101it [00:00, 998.48it/s, env_step=17600, len=1, loss/actor=-0.853, loss/critic=0.192, n/ep=100, n/st=100, rew=0.71]                         


Epoch #176: test_reward: 0.762961 ± 0.152742, best_reward: 0.764704 ± 0.095331 in #125


Epoch #177: 101it [00:00, 966.00it/s, env_step=17700, len=1, loss/actor=-0.862, loss/critic=0.180, n/ep=100, n/st=100, rew=0.70]                         


Epoch #177: test_reward: 0.740052 ± 0.134330, best_reward: 0.764704 ± 0.095331 in #125


Epoch #178: 101it [00:00, 965.90it/s, env_step=17800, len=1, loss/actor=-0.851, loss/critic=0.167, n/ep=100, n/st=100, rew=0.73]                         


Epoch #178: test_reward: 0.737310 ± 0.125001, best_reward: 0.764704 ± 0.095331 in #125


Epoch #179: 101it [00:00, 966.55it/s, env_step=17900, len=1, loss/actor=-0.858, loss/critic=0.154, n/ep=100, n/st=100, rew=0.71]                         


Epoch #179: test_reward: 0.667745 ± 0.142915, best_reward: 0.764704 ± 0.095331 in #125


Epoch #180: 101it [00:00, 977.63it/s, env_step=18000, len=1, loss/actor=-0.857, loss/critic=0.141, n/ep=100, n/st=100, rew=0.71]                         


Epoch #180: test_reward: 0.797995 ± 0.153606, best_reward: 0.797995 ± 0.153606 in #180


Epoch #181: 101it [00:00, 964.61it/s, env_step=18100, len=1, loss/actor=-0.863, loss/critic=0.136, n/ep=100, n/st=100, rew=0.72]                         


Epoch #181: test_reward: 0.698699 ± 0.160247, best_reward: 0.797995 ± 0.153606 in #180


Epoch #182: 101it [00:00, 981.88it/s, env_step=18200, len=1, loss/actor=-0.854, loss/critic=0.128, n/ep=100, n/st=100, rew=0.71]                         


Epoch #182: test_reward: 0.730759 ± 0.199523, best_reward: 0.797995 ± 0.153606 in #180


Epoch #183: 101it [00:00, 989.55it/s, env_step=18300, len=1, loss/actor=-0.867, loss/critic=0.122, n/ep=100, n/st=100, rew=0.70]                         


Epoch #183: test_reward: 0.713403 ± 0.183034, best_reward: 0.797995 ± 0.153606 in #180


Epoch #184: 101it [00:00, 968.09it/s, env_step=18400, len=1, loss/actor=-0.872, loss/critic=0.104, n/ep=100, n/st=100, rew=0.72]                         


Epoch #184: test_reward: 0.703442 ± 0.134825, best_reward: 0.797995 ± 0.153606 in #180


Epoch #185: 101it [00:00, 977.06it/s, env_step=18500, len=1, loss/actor=-0.872, loss/critic=0.103, n/ep=100, n/st=100, rew=0.70]                         


Epoch #185: test_reward: 0.731769 ± 0.149437, best_reward: 0.797995 ± 0.153606 in #180


Epoch #186: 101it [00:00, 967.27it/s, env_step=18600, len=1, loss/actor=-0.875, loss/critic=0.111, n/ep=100, n/st=100, rew=0.74]                         


Epoch #186: test_reward: 0.701602 ± 0.151422, best_reward: 0.797995 ± 0.153606 in #180


Epoch #187: 101it [00:00, 976.83it/s, env_step=18700, len=1, loss/actor=-0.865, loss/critic=0.112, n/ep=100, n/st=100, rew=0.72]                         


Epoch #187: test_reward: 0.671613 ± 0.145502, best_reward: 0.797995 ± 0.153606 in #180


Epoch #188: 101it [00:00, 981.22it/s, env_step=18800, len=1, loss/actor=-0.876, loss/critic=0.120, n/ep=100, n/st=100, rew=0.72]                         


Epoch #188: test_reward: 0.689633 ± 0.159070, best_reward: 0.797995 ± 0.153606 in #180


Epoch #189: 101it [00:00, 959.26it/s, env_step=18900, len=1, loss/actor=-0.876, loss/critic=0.120, n/ep=100, n/st=100, rew=0.71]                         


Epoch #189: test_reward: 0.610226 ± 0.130243, best_reward: 0.797995 ± 0.153606 in #180


Epoch #190: 101it [00:00, 974.65it/s, env_step=19000, len=1, loss/actor=-0.873, loss/critic=0.116, n/ep=100, n/st=100, rew=0.69]                         


Epoch #190: test_reward: 0.693960 ± 0.178165, best_reward: 0.797995 ± 0.153606 in #180


Epoch #191: 101it [00:00, 979.07it/s, env_step=19100, len=1, loss/actor=-0.873, loss/critic=0.119, n/ep=100, n/st=100, rew=0.66]                         


Epoch #191: test_reward: 0.585116 ± 0.174578, best_reward: 0.797995 ± 0.153606 in #180


Epoch #192: 101it [00:00, 993.14it/s, env_step=19200, len=1, loss/actor=-0.875, loss/critic=0.119, n/ep=100, n/st=100, rew=0.68]                         


Epoch #192: test_reward: 0.691285 ± 0.133699, best_reward: 0.797995 ± 0.153606 in #180


Epoch #193: 101it [00:00, 992.75it/s, env_step=19300, len=1, loss/actor=-0.876, loss/critic=0.113, n/ep=100, n/st=100, rew=0.71]                         


Epoch #193: test_reward: 0.777294 ± 0.187580, best_reward: 0.797995 ± 0.153606 in #180


Epoch #194: 101it [00:00, 984.38it/s, env_step=19400, len=1, loss/actor=-0.877, loss/critic=0.112, n/ep=100, n/st=100, rew=0.68]                         


Epoch #194: test_reward: 0.684952 ± 0.168734, best_reward: 0.797995 ± 0.153606 in #180


Epoch #195: 101it [00:00, 983.85it/s, env_step=19500, len=1, loss/actor=-0.882, loss/critic=0.105, n/ep=100, n/st=100, rew=0.68]                         


Epoch #195: test_reward: 0.672123 ± 0.142841, best_reward: 0.797995 ± 0.153606 in #180


Epoch #196: 101it [00:00, 973.00it/s, env_step=19600, len=1, loss/actor=-0.879, loss/critic=0.080, n/ep=100, n/st=100, rew=0.60]                         


Epoch #196: test_reward: 0.662922 ± 0.141057, best_reward: 0.797995 ± 0.153606 in #180


Epoch #197: 101it [00:00, 980.54it/s, env_step=19700, len=1, loss/actor=-0.888, loss/critic=0.075, n/ep=100, n/st=100, rew=0.64]                         


Epoch #197: test_reward: 0.658135 ± 0.142859, best_reward: 0.797995 ± 0.153606 in #180


Epoch #198: 101it [00:00, 991.12it/s, env_step=19800, len=1, loss/actor=-0.883, loss/critic=0.063, n/ep=100, n/st=100, rew=0.59]                         


Epoch #198: test_reward: 0.587177 ± 0.065441, best_reward: 0.797995 ± 0.153606 in #180


Epoch #199: 101it [00:00, 1004.11it/s, env_step=19900, len=1, loss/actor=-0.886, loss/critic=0.055, n/ep=100, n/st=100, rew=0.62]                         


Epoch #199: test_reward: 0.634764 ± 0.186261, best_reward: 0.797995 ± 0.153606 in #180


Epoch #200: 101it [00:00, 972.37it/s, env_step=20000, len=1, loss/actor=-0.887, loss/critic=0.052, n/ep=100, n/st=100, rew=0.60]                         

Epoch #200: test_reward: 0.635114 ± 0.187312, best_reward: 0.797995 ± 0.153606 in #180
Finished training! Use 26.10s





AttributeError: 'collections.OrderedDict' object has no attribute 'seek'. You can only torch.load from a file that is seekable. Please pre-load the data into a buffer like io.BytesIO and try to load from it instead.

In [117]:
writer = SummaryWriter('log/SAC')
logger = TensorboardLogger(writer)
result = ts.trainer.offpolicy_trainer(
    SACppolicy, SACtrain_collector, SACtest_collector,
    max_epoch=200, step_per_epoch=10000, step_per_collect=10,
    update_per_step=0.1, episode_per_test=15, batch_size=64,
    stop_fn=lambda mean_rewards: mean_rewards >= env.spec.reward_threshold,
    logger=logger)
print(f'Finished training! Use {result["duration"]}')
tc.save(SACppolicy.state_dict(), 'tianshou_results/log/SACdqn.pth')
tc.load(SACppolicy.state_dict(), 'tianshou_results/log/SACdqn.pth')

Epoch #1: 10001it [00:12, 790.81it/s, env_step=10000, len=1, loss/actor=-2.733, loss/critic1=6.084, loss/critic2=6.084, n/ep=100, n/st=100, rew=-2.33]                           


Epoch #1: test_reward: 1.122193 ± 0.138248, best_reward: 1.122193 ± 0.138248 in #1


Epoch #2: 10001it [00:12, 801.66it/s, env_step=20000, len=1, loss/actor=-2.733, loss/critic1=6.062, loss/critic2=6.062, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #2: test_reward: 1.129613 ± 0.084685, best_reward: 1.129613 ± 0.084685 in #2


Epoch #3: 10001it [00:12, 786.52it/s, env_step=30000, len=1, loss/actor=-2.734, loss/critic1=6.172, loss/critic2=6.172, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #3: test_reward: 1.029232 ± 0.115932, best_reward: 1.129613 ± 0.084685 in #2


Epoch #4: 10001it [00:12, 793.14it/s, env_step=40000, len=1, loss/actor=-2.734, loss/critic1=6.137, loss/critic2=6.137, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #4: test_reward: 1.098971 ± 0.130741, best_reward: 1.129613 ± 0.084685 in #2


Epoch #5: 10001it [00:12, 796.07it/s, env_step=50000, len=1, loss/actor=-2.732, loss/critic1=6.174, loss/critic2=6.174, n/ep=100, n/st=100, rew=-2.28]                           


Epoch #5: test_reward: 1.110298 ± 0.130538, best_reward: 1.129613 ± 0.084685 in #2


Epoch #6: 10001it [00:12, 798.05it/s, env_step=60000, len=1, loss/actor=-2.732, loss/critic1=6.158, loss/critic2=6.158, n/ep=100, n/st=100, rew=-2.27]                           


Epoch #6: test_reward: 1.047207 ± 0.085355, best_reward: 1.129613 ± 0.084685 in #2


Epoch #7: 10001it [00:12, 814.38it/s, env_step=70000, len=1, loss/actor=-2.735, loss/critic1=6.047, loss/critic2=6.047, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #7: test_reward: 1.034733 ± 0.122725, best_reward: 1.129613 ± 0.084685 in #2


Epoch #8: 10001it [00:12, 802.19it/s, env_step=80000, len=1, loss/actor=-2.733, loss/critic1=6.111, loss/critic2=6.111, n/ep=100, n/st=100, rew=-2.22]                           


Epoch #8: test_reward: 1.060327 ± 0.116201, best_reward: 1.129613 ± 0.084685 in #2


Epoch #9: 10001it [00:12, 773.03it/s, env_step=90000, len=1, loss/actor=-2.733, loss/critic1=6.152, loss/critic2=6.152, n/ep=100, n/st=100, rew=-2.43]                           


Epoch #9: test_reward: 1.057199 ± 0.142567, best_reward: 1.129613 ± 0.084685 in #2


Epoch #10: 10001it [00:12, 783.93it/s, env_step=100000, len=1, loss/actor=-2.734, loss/critic1=6.091, loss/critic2=6.091, n/ep=100, n/st=100, rew=-2.33]                           


Epoch #10: test_reward: 1.137475 ± 0.117072, best_reward: 1.137475 ± 0.117072 in #10


Epoch #11: 10001it [00:12, 797.17it/s, env_step=110000, len=1, loss/actor=-2.734, loss/critic1=6.134, loss/critic2=6.134, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #11: test_reward: 1.028033 ± 0.097969, best_reward: 1.137475 ± 0.117072 in #10


Epoch #12: 10001it [00:12, 793.86it/s, env_step=120000, len=1, loss/actor=-2.733, loss/critic1=6.148, loss/critic2=6.148, n/ep=100, n/st=100, rew=-2.48]                           


Epoch #12: test_reward: 1.104806 ± 0.130885, best_reward: 1.137475 ± 0.117072 in #10


Epoch #13: 10001it [00:12, 783.39it/s, env_step=130000, len=1, loss/actor=-2.734, loss/critic1=6.171, loss/critic2=6.171, n/ep=100, n/st=100, rew=-2.41]                           


Epoch #13: test_reward: 1.053294 ± 0.073459, best_reward: 1.137475 ± 0.117072 in #10


Epoch #14: 10001it [00:12, 774.56it/s, env_step=140000, len=1, loss/actor=-2.733, loss/critic1=6.156, loss/critic2=6.156, n/ep=100, n/st=100, rew=-2.43]                           


Epoch #14: test_reward: 1.095832 ± 0.106995, best_reward: 1.137475 ± 0.117072 in #10


Epoch #15: 10001it [00:12, 799.73it/s, env_step=150000, len=1, loss/actor=-2.737, loss/critic1=6.086, loss/critic2=6.086, n/ep=100, n/st=100, rew=-2.26]                           


Epoch #15: test_reward: 1.052327 ± 0.132107, best_reward: 1.137475 ± 0.117072 in #10


Epoch #16: 10001it [00:12, 813.39it/s, env_step=160000, len=1, loss/actor=-2.733, loss/critic1=6.143, loss/critic2=6.143, n/ep=100, n/st=100, rew=-2.49]                           


Epoch #16: test_reward: 1.087186 ± 0.121926, best_reward: 1.137475 ± 0.117072 in #10


Epoch #17: 10001it [00:12, 793.37it/s, env_step=170000, len=1, loss/actor=-2.733, loss/critic1=6.127, loss/critic2=6.127, n/ep=100, n/st=100, rew=-2.29]                           


Epoch #17: test_reward: 1.026156 ± 0.111410, best_reward: 1.137475 ± 0.117072 in #10


Epoch #18: 10001it [00:12, 799.34it/s, env_step=180000, len=1, loss/actor=-2.734, loss/critic1=6.207, loss/critic2=6.207, n/ep=100, n/st=100, rew=-2.25]                           


Epoch #18: test_reward: 1.086643 ± 0.128920, best_reward: 1.137475 ± 0.117072 in #10


Epoch #19: 10001it [00:12, 800.36it/s, env_step=190000, len=1, loss/actor=-2.733, loss/critic1=6.159, loss/critic2=6.159, n/ep=100, n/st=100, rew=-2.45]                           


Epoch #19: test_reward: 1.089522 ± 0.108431, best_reward: 1.137475 ± 0.117072 in #10


Epoch #20: 10001it [00:12, 809.72it/s, env_step=200000, len=1, loss/actor=-2.736, loss/critic1=6.043, loss/critic2=6.043, n/ep=100, n/st=100, rew=-2.44]                           


Epoch #20: test_reward: 1.030652 ± 0.152566, best_reward: 1.137475 ± 0.117072 in #10


Epoch #21: 10001it [00:13, 767.80it/s, env_step=210000, len=1, loss/actor=-2.735, loss/critic1=6.138, loss/critic2=6.138, n/ep=100, n/st=100, rew=-2.30]                           


Epoch #21: test_reward: 1.040260 ± 0.148831, best_reward: 1.137475 ± 0.117072 in #10


Epoch #22: 10001it [00:13, 760.19it/s, env_step=220000, len=1, loss/actor=-2.732, loss/critic1=6.205, loss/critic2=6.205, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #22: test_reward: 1.022968 ± 0.081255, best_reward: 1.137475 ± 0.117072 in #10


Epoch #23: 10001it [00:12, 783.48it/s, env_step=230000, len=1, loss/actor=-2.735, loss/critic1=6.101, loss/critic2=6.101, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #23: test_reward: 1.100886 ± 0.165791, best_reward: 1.137475 ± 0.117072 in #10


Epoch #24: 10001it [00:12, 805.20it/s, env_step=240000, len=1, loss/actor=-2.733, loss/critic1=6.126, loss/critic2=6.126, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #24: test_reward: 1.035651 ± 0.156733, best_reward: 1.137475 ± 0.117072 in #10


Epoch #25: 10001it [00:12, 789.83it/s, env_step=250000, len=1, loss/actor=-2.733, loss/critic1=6.074, loss/critic2=6.074, n/ep=100, n/st=100, rew=-2.28]                           


Epoch #25: test_reward: 1.107909 ± 0.122103, best_reward: 1.137475 ± 0.117072 in #10


Epoch #26: 10001it [00:12, 776.53it/s, env_step=260000, len=1, loss/actor=-2.735, loss/critic1=6.084, loss/critic2=6.084, n/ep=100, n/st=100, rew=-2.28]                           


Epoch #26: test_reward: 1.074499 ± 0.121117, best_reward: 1.137475 ± 0.117072 in #10


Epoch #27: 10001it [00:12, 775.82it/s, env_step=270000, len=1, loss/actor=-2.735, loss/critic1=6.098, loss/critic2=6.098, n/ep=100, n/st=100, rew=-2.52]                           


Epoch #27: test_reward: 1.038815 ± 0.125411, best_reward: 1.137475 ± 0.117072 in #10


Epoch #28: 10001it [00:12, 791.95it/s, env_step=280000, len=1, loss/actor=-2.735, loss/critic1=6.127, loss/critic2=6.127, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #28: test_reward: 1.068557 ± 0.102565, best_reward: 1.137475 ± 0.117072 in #10


Epoch #29: 10001it [00:12, 808.21it/s, env_step=290000, len=1, loss/actor=-2.733, loss/critic1=6.062, loss/critic2=6.062, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #29: test_reward: 1.128376 ± 0.097586, best_reward: 1.137475 ± 0.117072 in #10


Epoch #30: 10001it [00:12, 813.22it/s, env_step=300000, len=1, loss/actor=-2.732, loss/critic1=6.174, loss/critic2=6.174, n/ep=100, n/st=100, rew=-2.33]                           


Epoch #30: test_reward: 1.078366 ± 0.159646, best_reward: 1.137475 ± 0.117072 in #10


Epoch #31: 10001it [00:12, 814.73it/s, env_step=310000, len=1, loss/actor=-2.735, loss/critic1=6.059, loss/critic2=6.059, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #31: test_reward: 1.041903 ± 0.104885, best_reward: 1.137475 ± 0.117072 in #10


Epoch #32: 10001it [00:12, 813.21it/s, env_step=320000, len=1, loss/actor=-2.735, loss/critic1=6.076, loss/critic2=6.076, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #32: test_reward: 1.062328 ± 0.134809, best_reward: 1.137475 ± 0.117072 in #10


Epoch #33: 10001it [00:12, 812.18it/s, env_step=330000, len=1, loss/actor=-2.734, loss/critic1=6.199, loss/critic2=6.199, n/ep=100, n/st=100, rew=-2.54]                           


Epoch #33: test_reward: 1.072309 ± 0.145517, best_reward: 1.137475 ± 0.117072 in #10


Epoch #34: 10001it [00:12, 809.52it/s, env_step=340000, len=1, loss/actor=-2.734, loss/critic1=6.117, loss/critic2=6.117, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #34: test_reward: 1.042521 ± 0.109189, best_reward: 1.137475 ± 0.117072 in #10


Epoch #35: 10001it [00:12, 811.02it/s, env_step=350000, len=1, loss/actor=-2.737, loss/critic1=6.041, loss/critic2=6.041, n/ep=100, n/st=100, rew=-2.39]                           


Epoch #35: test_reward: 1.030940 ± 0.145000, best_reward: 1.137475 ± 0.117072 in #10


Epoch #36: 10001it [00:12, 796.63it/s, env_step=360000, len=1, loss/actor=-2.732, loss/critic1=6.161, loss/critic2=6.161, n/ep=100, n/st=100, rew=-2.32]                           


Epoch #36: test_reward: 1.045791 ± 0.115871, best_reward: 1.137475 ± 0.117072 in #10


Epoch #37: 10001it [00:12, 794.80it/s, env_step=370000, len=1, loss/actor=-2.733, loss/critic1=6.078, loss/critic2=6.078, n/ep=100, n/st=100, rew=-2.31]                           


Epoch #37: test_reward: 1.114763 ± 0.108206, best_reward: 1.137475 ± 0.117072 in #10


Epoch #38: 10001it [00:12, 786.95it/s, env_step=380000, len=1, loss/actor=-2.734, loss/critic1=6.110, loss/critic2=6.110, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #38: test_reward: 1.097779 ± 0.129143, best_reward: 1.137475 ± 0.117072 in #10


Epoch #39: 10001it [00:12, 797.50it/s, env_step=390000, len=1, loss/actor=-2.734, loss/critic1=6.096, loss/critic2=6.096, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #39: test_reward: 1.081236 ± 0.090528, best_reward: 1.137475 ± 0.117072 in #10


Epoch #40: 10001it [00:12, 787.09it/s, env_step=400000, len=1, loss/actor=-2.733, loss/critic1=6.073, loss/critic2=6.073, n/ep=100, n/st=100, rew=-2.33]                           


Epoch #40: test_reward: 1.076868 ± 0.116857, best_reward: 1.137475 ± 0.117072 in #10


Epoch #41: 10001it [00:12, 787.18it/s, env_step=410000, len=1, loss/actor=-2.734, loss/critic1=6.226, loss/critic2=6.226, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #41: test_reward: 1.040093 ± 0.128714, best_reward: 1.137475 ± 0.117072 in #10


Epoch #42: 10001it [00:12, 788.22it/s, env_step=420000, len=1, loss/actor=-2.735, loss/critic1=6.099, loss/critic2=6.099, n/ep=100, n/st=100, rew=-2.50]                           


Epoch #42: test_reward: 1.068348 ± 0.087748, best_reward: 1.137475 ± 0.117072 in #10


Epoch #43: 10001it [00:12, 807.68it/s, env_step=430000, len=1, loss/actor=-2.735, loss/critic1=6.136, loss/critic2=6.136, n/ep=100, n/st=100, rew=-2.45]                           


Epoch #43: test_reward: 1.042910 ± 0.125658, best_reward: 1.137475 ± 0.117072 in #10


Epoch #44: 10001it [00:12, 776.93it/s, env_step=440000, len=1, loss/actor=-2.734, loss/critic1=6.159, loss/critic2=6.159, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #44: test_reward: 1.058466 ± 0.113895, best_reward: 1.137475 ± 0.117072 in #10


Epoch #45: 10001it [00:12, 769.99it/s, env_step=450000, len=1, loss/actor=-2.735, loss/critic1=6.160, loss/critic2=6.160, n/ep=100, n/st=100, rew=-2.39]                           


Epoch #45: test_reward: 1.113842 ± 0.129089, best_reward: 1.137475 ± 0.117072 in #10


Epoch #46: 10001it [00:12, 779.49it/s, env_step=460000, len=1, loss/actor=-2.738, loss/critic1=6.142, loss/critic2=6.142, n/ep=100, n/st=100, rew=-2.24]                           


Epoch #46: test_reward: 1.048378 ± 0.117597, best_reward: 1.137475 ± 0.117072 in #10


Epoch #47: 10001it [00:12, 801.79it/s, env_step=470000, len=1, loss/actor=-2.733, loss/critic1=6.163, loss/critic2=6.163, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #47: test_reward: 1.078733 ± 0.154216, best_reward: 1.137475 ± 0.117072 in #10


Epoch #48: 10001it [00:12, 799.82it/s, env_step=480000, len=1, loss/actor=-2.733, loss/critic1=6.151, loss/critic2=6.151, n/ep=100, n/st=100, rew=-2.44]                           


Epoch #48: test_reward: 1.078656 ± 0.097186, best_reward: 1.137475 ± 0.117072 in #10


Epoch #49: 10001it [00:12, 798.54it/s, env_step=490000, len=1, loss/actor=-2.734, loss/critic1=6.175, loss/critic2=6.175, n/ep=100, n/st=100, rew=-2.25]                           


Epoch #49: test_reward: 1.056947 ± 0.138093, best_reward: 1.137475 ± 0.117072 in #10


Epoch #50: 10001it [00:12, 801.05it/s, env_step=500000, len=1, loss/actor=-2.733, loss/critic1=6.117, loss/critic2=6.117, n/ep=100, n/st=100, rew=-2.32]                           


Epoch #50: test_reward: 1.092005 ± 0.106779, best_reward: 1.137475 ± 0.117072 in #10


Epoch #51: 10001it [00:12, 799.49it/s, env_step=510000, len=1, loss/actor=-2.735, loss/critic1=6.073, loss/critic2=6.073, n/ep=100, n/st=100, rew=-2.43]                           


Epoch #51: test_reward: 1.045700 ± 0.136078, best_reward: 1.137475 ± 0.117072 in #10


Epoch #52: 10001it [00:12, 799.90it/s, env_step=520000, len=1, loss/actor=-2.732, loss/critic1=6.098, loss/critic2=6.098, n/ep=100, n/st=100, rew=-2.33]                           


Epoch #52: test_reward: 1.133201 ± 0.094932, best_reward: 1.137475 ± 0.117072 in #10


Epoch #53: 10001it [00:12, 803.79it/s, env_step=530000, len=1, loss/actor=-2.734, loss/critic1=6.057, loss/critic2=6.057, n/ep=100, n/st=100, rew=-2.44]                           


Epoch #53: test_reward: 1.065431 ± 0.137863, best_reward: 1.137475 ± 0.117072 in #10


Epoch #54: 10001it [00:12, 800.45it/s, env_step=540000, len=1, loss/actor=-2.734, loss/critic1=6.168, loss/critic2=6.168, n/ep=100, n/st=100, rew=-2.30]                           


Epoch #54: test_reward: 1.099296 ± 0.136771, best_reward: 1.137475 ± 0.117072 in #10


Epoch #55: 10001it [00:12, 803.07it/s, env_step=550000, len=1, loss/actor=-2.732, loss/critic1=6.120, loss/critic2=6.120, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #55: test_reward: 1.080535 ± 0.097029, best_reward: 1.137475 ± 0.117072 in #10


Epoch #56: 10001it [00:12, 797.93it/s, env_step=560000, len=1, loss/actor=-2.734, loss/critic1=6.060, loss/critic2=6.060, n/ep=100, n/st=100, rew=-2.30]                           


Epoch #56: test_reward: 1.060563 ± 0.092876, best_reward: 1.137475 ± 0.117072 in #10


Epoch #57: 10001it [00:12, 798.66it/s, env_step=570000, len=1, loss/actor=-2.734, loss/critic1=6.124, loss/critic2=6.124, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #57: test_reward: 1.088454 ± 0.109265, best_reward: 1.137475 ± 0.117072 in #10


Epoch #58: 10001it [00:12, 804.58it/s, env_step=580000, len=1, loss/actor=-2.733, loss/critic1=6.101, loss/critic2=6.101, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #58: test_reward: 1.057829 ± 0.124132, best_reward: 1.137475 ± 0.117072 in #10


Epoch #59: 10001it [00:12, 802.24it/s, env_step=590000, len=1, loss/actor=-2.733, loss/critic1=6.167, loss/critic2=6.167, n/ep=100, n/st=100, rew=-2.52]                           


Epoch #59: test_reward: 1.078416 ± 0.107376, best_reward: 1.137475 ± 0.117072 in #10


Epoch #60: 10001it [00:12, 800.03it/s, env_step=600000, len=1, loss/actor=-2.736, loss/critic1=6.060, loss/critic2=6.060, n/ep=100, n/st=100, rew=-2.42]                           


Epoch #60: test_reward: 1.088950 ± 0.134303, best_reward: 1.137475 ± 0.117072 in #10


Epoch #61: 10001it [00:12, 800.91it/s, env_step=610000, len=1, loss/actor=-2.731, loss/critic1=6.155, loss/critic2=6.155, n/ep=100, n/st=100, rew=-2.30]                           


Epoch #61: test_reward: 1.069790 ± 0.115505, best_reward: 1.137475 ± 0.117072 in #10


Epoch #62: 10001it [00:12, 802.97it/s, env_step=620000, len=1, loss/actor=-2.735, loss/critic1=6.114, loss/critic2=6.114, n/ep=100, n/st=100, rew=-2.44]                           


Epoch #62: test_reward: 1.104739 ± 0.129185, best_reward: 1.137475 ± 0.117072 in #10


Epoch #63: 10001it [00:12, 801.48it/s, env_step=630000, len=1, loss/actor=-2.734, loss/critic1=6.036, loss/critic2=6.036, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #63: test_reward: 1.088720 ± 0.104506, best_reward: 1.137475 ± 0.117072 in #10


Epoch #64: 10001it [00:12, 798.29it/s, env_step=640000, len=1, loss/actor=-2.735, loss/critic1=6.071, loss/critic2=6.071, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #64: test_reward: 1.075358 ± 0.119019, best_reward: 1.137475 ± 0.117072 in #10


Epoch #65: 10001it [00:12, 789.98it/s, env_step=650000, len=1, loss/actor=-2.735, loss/critic1=6.110, loss/critic2=6.110, n/ep=100, n/st=100, rew=-2.21]                           


Epoch #65: test_reward: 1.065423 ± 0.073663, best_reward: 1.137475 ± 0.117072 in #10


Epoch #66: 10001it [00:13, 765.15it/s, env_step=660000, len=1, loss/actor=-2.733, loss/critic1=6.059, loss/critic2=6.059, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #66: test_reward: 1.054993 ± 0.111000, best_reward: 1.137475 ± 0.117072 in #10


Epoch #67: 10001it [00:12, 798.52it/s, env_step=670000, len=1, loss/actor=-2.735, loss/critic1=6.106, loss/critic2=6.106, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #67: test_reward: 1.042506 ± 0.087200, best_reward: 1.137475 ± 0.117072 in #10


Epoch #68: 10001it [00:12, 795.67it/s, env_step=680000, len=1, loss/actor=-2.733, loss/critic1=6.064, loss/critic2=6.064, n/ep=100, n/st=100, rew=-2.54]                           


Epoch #68: test_reward: 1.042268 ± 0.113160, best_reward: 1.137475 ± 0.117072 in #10


Epoch #69: 10001it [00:12, 795.54it/s, env_step=690000, len=1, loss/actor=-2.733, loss/critic1=6.102, loss/critic2=6.102, n/ep=100, n/st=100, rew=-2.43]                           


Epoch #69: test_reward: 1.101764 ± 0.095358, best_reward: 1.137475 ± 0.117072 in #10


Epoch #70: 10001it [00:12, 797.02it/s, env_step=700000, len=1, loss/actor=-2.734, loss/critic1=6.182, loss/critic2=6.182, n/ep=100, n/st=100, rew=-2.26]                           


Epoch #70: test_reward: 1.080181 ± 0.139107, best_reward: 1.137475 ± 0.117072 in #10


Epoch #71: 10001it [00:12, 789.25it/s, env_step=710000, len=1, loss/actor=-2.736, loss/critic1=6.167, loss/critic2=6.167, n/ep=100, n/st=100, rew=-2.50]                           


Epoch #71: test_reward: 1.029124 ± 0.120074, best_reward: 1.137475 ± 0.117072 in #10


Epoch #72: 10001it [00:13, 767.67it/s, env_step=720000, len=1, loss/actor=-2.734, loss/critic1=6.049, loss/critic2=6.049, n/ep=100, n/st=100, rew=-2.42]                           


Epoch #72: test_reward: 1.042812 ± 0.126332, best_reward: 1.137475 ± 0.117072 in #10


Epoch #73: 10001it [00:12, 783.91it/s, env_step=730000, len=1, loss/actor=-2.735, loss/critic1=6.096, loss/critic2=6.096, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #73: test_reward: 1.097533 ± 0.147352, best_reward: 1.137475 ± 0.117072 in #10


Epoch #74: 10001it [00:12, 801.57it/s, env_step=740000, len=1, loss/actor=-2.735, loss/critic1=6.017, loss/critic2=6.017, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #74: test_reward: 1.081114 ± 0.140211, best_reward: 1.137475 ± 0.117072 in #10


Epoch #75: 10001it [00:12, 808.89it/s, env_step=750000, len=1, loss/actor=-2.735, loss/critic1=6.158, loss/critic2=6.158, n/ep=100, n/st=100, rew=-2.33]                           


Epoch #75: test_reward: 1.101135 ± 0.100890, best_reward: 1.137475 ± 0.117072 in #10


Epoch #76: 10001it [00:12, 832.08it/s, env_step=760000, len=1, loss/actor=-2.732, loss/critic1=6.066, loss/critic2=6.066, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #76: test_reward: 1.034753 ± 0.129610, best_reward: 1.137475 ± 0.117072 in #10


Epoch #77: 10001it [00:12, 818.29it/s, env_step=770000, len=1, loss/actor=-2.735, loss/critic1=6.097, loss/critic2=6.097, n/ep=100, n/st=100, rew=-2.49]                           


Epoch #77: test_reward: 1.082250 ± 0.146306, best_reward: 1.137475 ± 0.117072 in #10


Epoch #78: 10001it [00:11, 840.50it/s, env_step=780000, len=1, loss/actor=-2.734, loss/critic1=6.183, loss/critic2=6.183, n/ep=100, n/st=100, rew=-2.26]                           


Epoch #78: test_reward: 1.010535 ± 0.112571, best_reward: 1.137475 ± 0.117072 in #10


Epoch #79: 10001it [00:12, 813.01it/s, env_step=790000, len=1, loss/actor=-2.733, loss/critic1=6.103, loss/critic2=6.103, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #79: test_reward: 1.025528 ± 0.144433, best_reward: 1.137475 ± 0.117072 in #10


Epoch #80: 10001it [00:12, 821.53it/s, env_step=800000, len=1, loss/actor=-2.734, loss/critic1=6.071, loss/critic2=6.071, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #80: test_reward: 1.038395 ± 0.103504, best_reward: 1.137475 ± 0.117072 in #10


Epoch #81: 10001it [00:12, 813.66it/s, env_step=810000, len=1, loss/actor=-2.732, loss/critic1=6.205, loss/critic2=6.205, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #81: test_reward: 1.071381 ± 0.125780, best_reward: 1.137475 ± 0.117072 in #10


Epoch #82: 10001it [00:12, 814.36it/s, env_step=820000, len=1, loss/actor=-2.734, loss/critic1=6.098, loss/critic2=6.098, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #82: test_reward: 1.041589 ± 0.122442, best_reward: 1.137475 ± 0.117072 in #10


Epoch #83: 10001it [00:12, 808.47it/s, env_step=830000, len=1, loss/actor=-2.734, loss/critic1=6.081, loss/critic2=6.081, n/ep=100, n/st=100, rew=-2.45]                           


Epoch #83: test_reward: 1.066814 ± 0.093414, best_reward: 1.137475 ± 0.117072 in #10


Epoch #84: 10001it [00:12, 792.27it/s, env_step=840000, len=1, loss/actor=-2.736, loss/critic1=6.096, loss/critic2=6.096, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #84: test_reward: 1.098112 ± 0.107210, best_reward: 1.137475 ± 0.117072 in #10


Epoch #85: 10001it [00:12, 783.80it/s, env_step=850000, len=1, loss/actor=-2.734, loss/critic1=6.204, loss/critic2=6.204, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #85: test_reward: 1.044346 ± 0.102190, best_reward: 1.137475 ± 0.117072 in #10


Epoch #86: 10001it [00:12, 786.84it/s, env_step=860000, len=1, loss/actor=-2.737, loss/critic1=6.175, loss/critic2=6.175, n/ep=100, n/st=100, rew=-2.39]                           


Epoch #86: test_reward: 1.029106 ± 0.099661, best_reward: 1.137475 ± 0.117072 in #10


Epoch #87: 10001it [00:12, 790.58it/s, env_step=870000, len=1, loss/actor=-2.733, loss/critic1=6.072, loss/critic2=6.072, n/ep=100, n/st=100, rew=-2.51]                           


Epoch #87: test_reward: 1.064261 ± 0.179794, best_reward: 1.137475 ± 0.117072 in #10


Epoch #88: 10001it [00:12, 786.93it/s, env_step=880000, len=1, loss/actor=-2.732, loss/critic1=6.062, loss/critic2=6.062, n/ep=100, n/st=100, rew=-2.46]                           


Epoch #88: test_reward: 1.041739 ± 0.083865, best_reward: 1.137475 ± 0.117072 in #10


Epoch #89: 10001it [00:12, 795.56it/s, env_step=890000, len=1, loss/actor=-2.735, loss/critic1=6.114, loss/critic2=6.114, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #89: test_reward: 1.022663 ± 0.108039, best_reward: 1.137475 ± 0.117072 in #10


Epoch #90: 10001it [00:12, 798.63it/s, env_step=900000, len=1, loss/actor=-2.734, loss/critic1=6.125, loss/critic2=6.125, n/ep=100, n/st=100, rew=-2.43]                           


Epoch #90: test_reward: 1.082465 ± 0.110601, best_reward: 1.137475 ± 0.117072 in #10


Epoch #91: 10001it [00:12, 793.99it/s, env_step=910000, len=1, loss/actor=-2.737, loss/critic1=6.100, loss/critic2=6.100, n/ep=100, n/st=100, rew=-2.33]                           


Epoch #91: test_reward: 1.084986 ± 0.104290, best_reward: 1.137475 ± 0.117072 in #10


Epoch #92: 10001it [00:12, 797.49it/s, env_step=920000, len=1, loss/actor=-2.733, loss/critic1=6.091, loss/critic2=6.091, n/ep=100, n/st=100, rew=-2.43]                           


Epoch #92: test_reward: 1.061028 ± 0.095228, best_reward: 1.137475 ± 0.117072 in #10


Epoch #93: 10001it [00:12, 796.02it/s, env_step=930000, len=1, loss/actor=-2.733, loss/critic1=6.138, loss/critic2=6.138, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #93: test_reward: 1.052796 ± 0.106456, best_reward: 1.137475 ± 0.117072 in #10


Epoch #94: 10001it [00:13, 747.87it/s, env_step=940000, len=1, loss/actor=-2.736, loss/critic1=6.116, loss/critic2=6.116, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #94: test_reward: 1.055123 ± 0.128306, best_reward: 1.137475 ± 0.117072 in #10


Epoch #95: 10001it [00:12, 790.70it/s, env_step=950000, len=1, loss/actor=-2.736, loss/critic1=6.108, loss/critic2=6.108, n/ep=100, n/st=100, rew=-2.30]                           


Epoch #95: test_reward: 1.061581 ± 0.152949, best_reward: 1.137475 ± 0.117072 in #10


Epoch #96: 10001it [00:12, 790.34it/s, env_step=960000, len=1, loss/actor=-2.734, loss/critic1=6.052, loss/critic2=6.052, n/ep=100, n/st=100, rew=-2.39]                           


Epoch #96: test_reward: 1.109584 ± 0.124610, best_reward: 1.137475 ± 0.117072 in #10


Epoch #97: 10001it [00:12, 782.95it/s, env_step=970000, len=1, loss/actor=-2.734, loss/critic1=6.093, loss/critic2=6.093, n/ep=100, n/st=100, rew=-2.28]                           


Epoch #97: test_reward: 1.036817 ± 0.153034, best_reward: 1.137475 ± 0.117072 in #10


Epoch #98: 10001it [00:12, 784.63it/s, env_step=980000, len=1, loss/actor=-2.735, loss/critic1=6.167, loss/critic2=6.167, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #98: test_reward: 1.062427 ± 0.178474, best_reward: 1.137475 ± 0.117072 in #10


Epoch #99: 10001it [00:12, 786.74it/s, env_step=990000, len=1, loss/actor=-2.733, loss/critic1=6.083, loss/critic2=6.083, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #99: test_reward: 1.039684 ± 0.092968, best_reward: 1.137475 ± 0.117072 in #10


Epoch #100: 10001it [00:12, 784.36it/s, env_step=1000000, len=1, loss/actor=-2.735, loss/critic1=6.054, loss/critic2=6.054, n/ep=100, n/st=100, rew=-2.48]                           


Epoch #100: test_reward: 1.098760 ± 0.118861, best_reward: 1.137475 ± 0.117072 in #10


Epoch #101: 10001it [00:12, 789.20it/s, env_step=1010000, len=1, loss/actor=-2.734, loss/critic1=6.112, loss/critic2=6.112, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #101: test_reward: 1.011578 ± 0.085939, best_reward: 1.137475 ± 0.117072 in #10


Epoch #102: 10001it [00:12, 787.17it/s, env_step=1020000, len=1, loss/actor=-2.735, loss/critic1=6.184, loss/critic2=6.184, n/ep=100, n/st=100, rew=-2.41]                           


Epoch #102: test_reward: 0.982062 ± 0.121216, best_reward: 1.137475 ± 0.117072 in #10


Epoch #103: 10001it [00:12, 781.47it/s, env_step=1030000, len=1, loss/actor=-2.732, loss/critic1=6.098, loss/critic2=6.098, n/ep=100, n/st=100, rew=-2.43]                           


Epoch #103: test_reward: 1.075898 ± 0.119698, best_reward: 1.137475 ± 0.117072 in #10


Epoch #104: 10001it [00:12, 781.58it/s, env_step=1040000, len=1, loss/actor=-2.735, loss/critic1=6.164, loss/critic2=6.164, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #104: test_reward: 1.038317 ± 0.120209, best_reward: 1.137475 ± 0.117072 in #10


Epoch #105: 10001it [00:12, 788.61it/s, env_step=1050000, len=1, loss/actor=-2.732, loss/critic1=6.127, loss/critic2=6.127, n/ep=100, n/st=100, rew=-2.29]                           


Epoch #105: test_reward: 1.081552 ± 0.076276, best_reward: 1.137475 ± 0.117072 in #10


Epoch #106: 10001it [00:12, 786.41it/s, env_step=1060000, len=1, loss/actor=-2.735, loss/critic1=6.109, loss/critic2=6.109, n/ep=100, n/st=100, rew=-2.46]                           


Epoch #106: test_reward: 1.037817 ± 0.126656, best_reward: 1.137475 ± 0.117072 in #10


Epoch #107: 10001it [00:12, 793.39it/s, env_step=1070000, len=1, loss/actor=-2.733, loss/critic1=6.131, loss/critic2=6.131, n/ep=100, n/st=100, rew=-2.52]                           


Epoch #107: test_reward: 1.062468 ± 0.109304, best_reward: 1.137475 ± 0.117072 in #10


Epoch #108: 10001it [00:12, 786.57it/s, env_step=1080000, len=1, loss/actor=-2.733, loss/critic1=6.120, loss/critic2=6.120, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #108: test_reward: 1.067537 ± 0.124456, best_reward: 1.137475 ± 0.117072 in #10


Epoch #109: 10001it [00:12, 803.88it/s, env_step=1090000, len=1, loss/actor=-2.734, loss/critic1=6.058, loss/critic2=6.058, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #109: test_reward: 1.107861 ± 0.119662, best_reward: 1.137475 ± 0.117072 in #10


Epoch #110: 10001it [00:12, 808.98it/s, env_step=1100000, len=1, loss/actor=-2.734, loss/critic1=6.138, loss/critic2=6.138, n/ep=100, n/st=100, rew=-2.44]                           


Epoch #110: test_reward: 1.042383 ± 0.121292, best_reward: 1.137475 ± 0.117072 in #10


Epoch #111: 10001it [00:12, 804.52it/s, env_step=1110000, len=1, loss/actor=-2.736, loss/critic1=6.145, loss/critic2=6.145, n/ep=100, n/st=100, rew=-2.48]                           


Epoch #111: test_reward: 1.123779 ± 0.087349, best_reward: 1.137475 ± 0.117072 in #10


Epoch #112: 10001it [00:12, 810.90it/s, env_step=1120000, len=1, loss/actor=-2.735, loss/critic1=6.111, loss/critic2=6.111, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #112: test_reward: 1.074453 ± 0.102054, best_reward: 1.137475 ± 0.117072 in #10


Epoch #113: 10001it [00:12, 810.06it/s, env_step=1130000, len=1, loss/actor=-2.735, loss/critic1=6.155, loss/critic2=6.155, n/ep=100, n/st=100, rew=-2.27]                           


Epoch #113: test_reward: 1.065342 ± 0.115404, best_reward: 1.137475 ± 0.117072 in #10


Epoch #114: 10001it [00:12, 809.75it/s, env_step=1140000, len=1, loss/actor=-2.735, loss/critic1=6.100, loss/critic2=6.100, n/ep=100, n/st=100, rew=-2.31]                           


Epoch #114: test_reward: 1.031491 ± 0.130908, best_reward: 1.137475 ± 0.117072 in #10


Epoch #115: 10001it [00:12, 793.59it/s, env_step=1150000, len=1, loss/actor=-2.736, loss/critic1=6.034, loss/critic2=6.034, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #115: test_reward: 1.022467 ± 0.114860, best_reward: 1.137475 ± 0.117072 in #10


Epoch #116: 10001it [00:12, 808.71it/s, env_step=1160000, len=1, loss/actor=-2.733, loss/critic1=6.168, loss/critic2=6.168, n/ep=100, n/st=100, rew=-2.31]                           


Epoch #116: test_reward: 1.011477 ± 0.126640, best_reward: 1.137475 ± 0.117072 in #10


Epoch #117: 10001it [00:12, 806.27it/s, env_step=1170000, len=1, loss/actor=-2.735, loss/critic1=6.143, loss/critic2=6.143, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #117: test_reward: 1.005185 ± 0.143956, best_reward: 1.137475 ± 0.117072 in #10


Epoch #118: 10001it [00:12, 802.11it/s, env_step=1180000, len=1, loss/actor=-2.735, loss/critic1=6.116, loss/critic2=6.116, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #118: test_reward: 1.088584 ± 0.094437, best_reward: 1.137475 ± 0.117072 in #10


Epoch #119: 10001it [00:12, 810.57it/s, env_step=1190000, len=1, loss/actor=-2.733, loss/critic1=6.126, loss/critic2=6.126, n/ep=100, n/st=100, rew=-2.45]                           


Epoch #119: test_reward: 1.078592 ± 0.095784, best_reward: 1.137475 ± 0.117072 in #10


Epoch #120: 10001it [00:12, 807.35it/s, env_step=1200000, len=1, loss/actor=-2.735, loss/critic1=6.117, loss/critic2=6.117, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #120: test_reward: 1.073985 ± 0.103946, best_reward: 1.137475 ± 0.117072 in #10


Epoch #121: 10001it [00:12, 808.67it/s, env_step=1210000, len=1, loss/actor=-2.734, loss/critic1=6.093, loss/critic2=6.093, n/ep=100, n/st=100, rew=-2.29]                           


Epoch #121: test_reward: 1.031114 ± 0.109029, best_reward: 1.137475 ± 0.117072 in #10


Epoch #122: 10001it [00:12, 809.54it/s, env_step=1220000, len=1, loss/actor=-2.735, loss/critic1=6.187, loss/critic2=6.187, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #122: test_reward: 1.061190 ± 0.123569, best_reward: 1.137475 ± 0.117072 in #10


Epoch #123: 10001it [00:12, 803.22it/s, env_step=1230000, len=1, loss/actor=-2.735, loss/critic1=6.108, loss/critic2=6.108, n/ep=100, n/st=100, rew=-2.42]                           


Epoch #123: test_reward: 1.054551 ± 0.136356, best_reward: 1.137475 ± 0.117072 in #10


Epoch #124: 10001it [00:12, 814.02it/s, env_step=1240000, len=1, loss/actor=-2.737, loss/critic1=6.123, loss/critic2=6.123, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #124: test_reward: 1.044853 ± 0.125721, best_reward: 1.137475 ± 0.117072 in #10


Epoch #125: 10001it [00:12, 807.30it/s, env_step=1250000, len=1, loss/actor=-2.735, loss/critic1=6.108, loss/critic2=6.108, n/ep=100, n/st=100, rew=-2.32]                           


Epoch #125: test_reward: 1.057863 ± 0.101862, best_reward: 1.137475 ± 0.117072 in #10


Epoch #126: 10001it [00:12, 809.03it/s, env_step=1260000, len=1, loss/actor=-2.734, loss/critic1=6.162, loss/critic2=6.162, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #126: test_reward: 1.153005 ± 0.132169, best_reward: 1.153005 ± 0.132169 in #126


Epoch #127: 10001it [00:12, 812.22it/s, env_step=1270000, len=1, loss/actor=-2.735, loss/critic1=6.173, loss/critic2=6.173, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #127: test_reward: 1.133200 ± 0.098027, best_reward: 1.153005 ± 0.132169 in #126


Epoch #128: 10001it [00:12, 806.97it/s, env_step=1280000, len=1, loss/actor=-2.733, loss/critic1=6.167, loss/critic2=6.167, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #128: test_reward: 1.108734 ± 0.099432, best_reward: 1.153005 ± 0.132169 in #126


Epoch #129: 10001it [00:12, 807.62it/s, env_step=1290000, len=1, loss/actor=-2.736, loss/critic1=6.070, loss/critic2=6.070, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #129: test_reward: 1.068887 ± 0.111874, best_reward: 1.153005 ± 0.132169 in #126


Epoch #130: 10001it [00:12, 809.14it/s, env_step=1300000, len=1, loss/actor=-2.733, loss/critic1=6.098, loss/critic2=6.098, n/ep=100, n/st=100, rew=-2.39]                           


Epoch #130: test_reward: 1.024906 ± 0.105211, best_reward: 1.153005 ± 0.132169 in #126


Epoch #131: 10001it [00:12, 812.81it/s, env_step=1310000, len=1, loss/actor=-2.732, loss/critic1=6.023, loss/critic2=6.023, n/ep=100, n/st=100, rew=-2.42]                           


Epoch #131: test_reward: 1.113982 ± 0.098841, best_reward: 1.153005 ± 0.132169 in #126


Epoch #132: 10001it [00:12, 813.85it/s, env_step=1320000, len=1, loss/actor=-2.734, loss/critic1=6.171, loss/critic2=6.171, n/ep=100, n/st=100, rew=-2.31]                           


Epoch #132: test_reward: 1.072450 ± 0.117088, best_reward: 1.153005 ± 0.132169 in #126


Epoch #133: 10001it [00:12, 809.64it/s, env_step=1330000, len=1, loss/actor=-2.734, loss/critic1=6.134, loss/critic2=6.134, n/ep=100, n/st=100, rew=-2.44]                           


Epoch #133: test_reward: 1.007602 ± 0.124737, best_reward: 1.153005 ± 0.132169 in #126


Epoch #134: 10001it [00:12, 805.14it/s, env_step=1340000, len=1, loss/actor=-2.732, loss/critic1=6.137, loss/critic2=6.137, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #134: test_reward: 1.086317 ± 0.141913, best_reward: 1.153005 ± 0.132169 in #126


Epoch #135: 10001it [00:12, 799.13it/s, env_step=1350000, len=1, loss/actor=-2.734, loss/critic1=6.124, loss/critic2=6.124, n/ep=100, n/st=100, rew=-2.45]                           


Epoch #135: test_reward: 1.049754 ± 0.118602, best_reward: 1.153005 ± 0.132169 in #126


Epoch #136: 10001it [00:12, 805.38it/s, env_step=1360000, len=1, loss/actor=-2.736, loss/critic1=6.128, loss/critic2=6.128, n/ep=100, n/st=100, rew=-2.42]                           


Epoch #136: test_reward: 1.069470 ± 0.117660, best_reward: 1.153005 ± 0.132169 in #126


Epoch #137: 10001it [00:12, 813.00it/s, env_step=1370000, len=1, loss/actor=-2.735, loss/critic1=6.121, loss/critic2=6.121, n/ep=100, n/st=100, rew=-2.29]                           


Epoch #137: test_reward: 1.019839 ± 0.126247, best_reward: 1.153005 ± 0.132169 in #126


Epoch #138: 10001it [00:12, 813.97it/s, env_step=1380000, len=1, loss/actor=-2.734, loss/critic1=6.134, loss/critic2=6.134, n/ep=100, n/st=100, rew=-2.53]                           


Epoch #138: test_reward: 1.128666 ± 0.155065, best_reward: 1.153005 ± 0.132169 in #126


Epoch #139: 10001it [00:12, 814.73it/s, env_step=1390000, len=1, loss/actor=-2.737, loss/critic1=6.093, loss/critic2=6.093, n/ep=100, n/st=100, rew=-2.43]                           


Epoch #139: test_reward: 1.098262 ± 0.132385, best_reward: 1.153005 ± 0.132169 in #126


Epoch #140: 10001it [00:12, 811.24it/s, env_step=1400000, len=1, loss/actor=-2.733, loss/critic1=6.103, loss/critic2=6.103, n/ep=100, n/st=100, rew=-2.41]                           


Epoch #140: test_reward: 1.047761 ± 0.126287, best_reward: 1.153005 ± 0.132169 in #126


Epoch #141: 10001it [00:12, 812.47it/s, env_step=1410000, len=1, loss/actor=-2.734, loss/critic1=6.060, loss/critic2=6.060, n/ep=100, n/st=100, rew=-2.48]                           


Epoch #141: test_reward: 1.067640 ± 0.112270, best_reward: 1.153005 ± 0.132169 in #126


Epoch #142: 10001it [00:12, 802.73it/s, env_step=1420000, len=1, loss/actor=-2.735, loss/critic1=6.114, loss/critic2=6.114, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #142: test_reward: 1.074114 ± 0.112809, best_reward: 1.153005 ± 0.132169 in #126


Epoch #143: 10001it [00:12, 811.85it/s, env_step=1430000, len=1, loss/actor=-2.733, loss/critic1=6.100, loss/critic2=6.100, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #143: test_reward: 1.066553 ± 0.107957, best_reward: 1.153005 ± 0.132169 in #126


Epoch #144: 10001it [00:12, 810.19it/s, env_step=1440000, len=1, loss/actor=-2.733, loss/critic1=6.221, loss/critic2=6.221, n/ep=100, n/st=100, rew=-2.42]                           


Epoch #144: test_reward: 1.076544 ± 0.129752, best_reward: 1.153005 ± 0.132169 in #126


Epoch #145: 10001it [00:12, 809.36it/s, env_step=1450000, len=1, loss/actor=-2.735, loss/critic1=6.228, loss/critic2=6.228, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #145: test_reward: 1.089280 ± 0.133056, best_reward: 1.153005 ± 0.132169 in #126


Epoch #146: 10001it [00:12, 804.44it/s, env_step=1460000, len=1, loss/actor=-2.735, loss/critic1=6.115, loss/critic2=6.115, n/ep=100, n/st=100, rew=-2.42]                           


Epoch #146: test_reward: 1.112241 ± 0.106828, best_reward: 1.153005 ± 0.132169 in #126


Epoch #147: 10001it [00:12, 806.10it/s, env_step=1470000, len=1, loss/actor=-2.734, loss/critic1=6.208, loss/critic2=6.208, n/ep=100, n/st=100, rew=-2.45]                           


Epoch #147: test_reward: 1.013416 ± 0.141651, best_reward: 1.153005 ± 0.132169 in #126


Epoch #148: 10001it [00:12, 807.85it/s, env_step=1480000, len=1, loss/actor=-2.734, loss/critic1=6.124, loss/critic2=6.124, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #148: test_reward: 1.048358 ± 0.124770, best_reward: 1.153005 ± 0.132169 in #126


Epoch #149: 10001it [00:12, 810.40it/s, env_step=1490000, len=1, loss/actor=-2.736, loss/critic1=6.104, loss/critic2=6.104, n/ep=100, n/st=100, rew=-2.44]                           


Epoch #149: test_reward: 1.074911 ± 0.099632, best_reward: 1.153005 ± 0.132169 in #126


Epoch #150: 10001it [00:12, 805.15it/s, env_step=1500000, len=1, loss/actor=-2.735, loss/critic1=6.105, loss/critic2=6.105, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #150: test_reward: 1.091557 ± 0.075443, best_reward: 1.153005 ± 0.132169 in #126


Epoch #151: 10001it [00:12, 806.88it/s, env_step=1510000, len=1, loss/actor=-2.733, loss/critic1=6.122, loss/critic2=6.122, n/ep=100, n/st=100, rew=-2.41]                           


Epoch #151: test_reward: 1.069695 ± 0.154287, best_reward: 1.153005 ± 0.132169 in #126


Epoch #152: 10001it [00:12, 806.99it/s, env_step=1520000, len=1, loss/actor=-2.735, loss/critic1=6.111, loss/critic2=6.111, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #152: test_reward: 1.047248 ± 0.128842, best_reward: 1.153005 ± 0.132169 in #126


Epoch #153: 10001it [00:12, 809.95it/s, env_step=1530000, len=1, loss/actor=-2.734, loss/critic1=6.055, loss/critic2=6.055, n/ep=100, n/st=100, rew=-2.45]                           


Epoch #153: test_reward: 1.010233 ± 0.124334, best_reward: 1.153005 ± 0.132169 in #126


Epoch #154: 10001it [00:12, 807.51it/s, env_step=1540000, len=1, loss/actor=-2.733, loss/critic1=6.034, loss/critic2=6.034, n/ep=100, n/st=100, rew=-2.45]                           


Epoch #154: test_reward: 1.064697 ± 0.097680, best_reward: 1.153005 ± 0.132169 in #126


Epoch #155: 10001it [00:12, 806.05it/s, env_step=1550000, len=1, loss/actor=-2.736, loss/critic1=6.083, loss/critic2=6.083, n/ep=100, n/st=100, rew=-2.32]                           


Epoch #155: test_reward: 1.073560 ± 0.143026, best_reward: 1.153005 ± 0.132169 in #126


Epoch #156: 10001it [00:12, 804.79it/s, env_step=1560000, len=1, loss/actor=-2.735, loss/critic1=6.155, loss/critic2=6.155, n/ep=100, n/st=100, rew=-2.33]                           


Epoch #156: test_reward: 1.025015 ± 0.101792, best_reward: 1.153005 ± 0.132169 in #126


Epoch #157: 10001it [00:12, 804.48it/s, env_step=1570000, len=1, loss/actor=-2.734, loss/critic1=6.101, loss/critic2=6.101, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #157: test_reward: 1.047550 ± 0.144859, best_reward: 1.153005 ± 0.132169 in #126


Epoch #158: 10001it [10:41, 15.60it/s, env_step=1580000, len=1, loss/actor=-2.736, loss/critic1=6.117, loss/critic2=6.117, n/ep=100, n/st=100, rew=-2.27]                            


Epoch #158: test_reward: 1.050973 ± 0.127664, best_reward: 1.153005 ± 0.132169 in #126


Epoch #159: 10001it [00:12, 791.28it/s, env_step=1590000, len=1, loss/actor=-2.733, loss/critic1=6.074, loss/critic2=6.074, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #159: test_reward: 1.062592 ± 0.143152, best_reward: 1.153005 ± 0.132169 in #126


Epoch #160: 10001it [00:12, 797.89it/s, env_step=1600000, len=1, loss/actor=-2.733, loss/critic1=6.174, loss/critic2=6.174, n/ep=100, n/st=100, rew=-2.26]                           


Epoch #160: test_reward: 1.066537 ± 0.117476, best_reward: 1.153005 ± 0.132169 in #126


Epoch #161: 10001it [00:12, 794.53it/s, env_step=1610000, len=1, loss/actor=-2.735, loss/critic1=6.060, loss/critic2=6.060, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #161: test_reward: 1.083448 ± 0.150501, best_reward: 1.153005 ± 0.132169 in #126


Epoch #162: 10001it [00:12, 792.44it/s, env_step=1620000, len=1, loss/actor=-2.735, loss/critic1=6.040, loss/critic2=6.040, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #162: test_reward: 1.100699 ± 0.149179, best_reward: 1.153005 ± 0.132169 in #126


Epoch #163: 10001it [00:12, 783.78it/s, env_step=1630000, len=1, loss/actor=-2.735, loss/critic1=6.094, loss/critic2=6.094, n/ep=100, n/st=100, rew=-2.41]                           


Epoch #163: test_reward: 1.154821 ± 0.125385, best_reward: 1.154821 ± 0.125385 in #163


Epoch #164: 10001it [00:12, 773.53it/s, env_step=1640000, len=1, loss/actor=-2.736, loss/critic1=6.101, loss/critic2=6.101, n/ep=100, n/st=100, rew=-2.32]                           


Epoch #164: test_reward: 1.092034 ± 0.119208, best_reward: 1.154821 ± 0.125385 in #163


Epoch #165: 10001it [00:12, 781.94it/s, env_step=1650000, len=1, loss/actor=-2.734, loss/critic1=6.090, loss/critic2=6.090, n/ep=100, n/st=100, rew=-2.48]                           


Epoch #165: test_reward: 1.066204 ± 0.103143, best_reward: 1.154821 ± 0.125385 in #163


Epoch #166: 10001it [00:12, 782.48it/s, env_step=1660000, len=1, loss/actor=-2.736, loss/critic1=6.111, loss/critic2=6.111, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #166: test_reward: 1.025087 ± 0.124468, best_reward: 1.154821 ± 0.125385 in #163


Epoch #167: 10001it [00:12, 796.68it/s, env_step=1670000, len=1, loss/actor=-2.734, loss/critic1=6.148, loss/critic2=6.148, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #167: test_reward: 1.039479 ± 0.114321, best_reward: 1.154821 ± 0.125385 in #163


Epoch #168: 10001it [00:12, 785.90it/s, env_step=1680000, len=1, loss/actor=-2.733, loss/critic1=6.145, loss/critic2=6.145, n/ep=100, n/st=100, rew=-2.51]                           


Epoch #168: test_reward: 1.047136 ± 0.103323, best_reward: 1.154821 ± 0.125385 in #163


Epoch #169: 10001it [00:12, 796.12it/s, env_step=1690000, len=1, loss/actor=-2.733, loss/critic1=6.109, loss/critic2=6.109, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #169: test_reward: 1.053245 ± 0.132858, best_reward: 1.154821 ± 0.125385 in #163


Epoch #170: 10001it [00:12, 799.07it/s, env_step=1700000, len=1, loss/actor=-2.735, loss/critic1=6.083, loss/critic2=6.083, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #170: test_reward: 1.035429 ± 0.074366, best_reward: 1.154821 ± 0.125385 in #163


Epoch #171: 10001it [00:12, 772.67it/s, env_step=1710000, len=1, loss/actor=-2.734, loss/critic1=6.149, loss/critic2=6.149, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #171: test_reward: 1.086050 ± 0.108275, best_reward: 1.154821 ± 0.125385 in #163


Epoch #172: 10001it [00:12, 788.64it/s, env_step=1720000, len=1, loss/actor=-2.734, loss/critic1=6.059, loss/critic2=6.059, n/ep=100, n/st=100, rew=-2.37]                           


Epoch #172: test_reward: 1.023511 ± 0.126454, best_reward: 1.154821 ± 0.125385 in #163


Epoch #173: 10001it [00:12, 789.91it/s, env_step=1730000, len=1, loss/actor=-2.732, loss/critic1=6.145, loss/critic2=6.145, n/ep=100, n/st=100, rew=-2.29]                           


Epoch #173: test_reward: 1.081905 ± 0.133477, best_reward: 1.154821 ± 0.125385 in #163


Epoch #174: 10001it [00:13, 766.21it/s, env_step=1740000, len=1, loss/actor=-2.733, loss/critic1=6.120, loss/critic2=6.120, n/ep=100, n/st=100, rew=-2.32]                           


Epoch #174: test_reward: 1.076526 ± 0.070710, best_reward: 1.154821 ± 0.125385 in #163


Epoch #175: 10001it [00:12, 785.18it/s, env_step=1750000, len=1, loss/actor=-2.732, loss/critic1=6.117, loss/critic2=6.117, n/ep=100, n/st=100, rew=-2.48]                           


Epoch #175: test_reward: 1.076116 ± 0.106853, best_reward: 1.154821 ± 0.125385 in #163


Epoch #176: 10001it [00:12, 797.20it/s, env_step=1760000, len=1, loss/actor=-2.733, loss/critic1=6.085, loss/critic2=6.085, n/ep=100, n/st=100, rew=-2.31]                           


Epoch #176: test_reward: 1.058070 ± 0.150326, best_reward: 1.154821 ± 0.125385 in #163


Epoch #177: 10001it [00:12, 788.24it/s, env_step=1770000, len=1, loss/actor=-2.733, loss/critic1=6.143, loss/critic2=6.143, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #177: test_reward: 1.002188 ± 0.107414, best_reward: 1.154821 ± 0.125385 in #163


Epoch #178: 10001it [00:12, 772.31it/s, env_step=1780000, len=1, loss/actor=-2.738, loss/critic1=6.156, loss/critic2=6.156, n/ep=100, n/st=100, rew=-2.49]                           


Epoch #178: test_reward: 1.052114 ± 0.122922, best_reward: 1.154821 ± 0.125385 in #163


Epoch #179: 10001it [00:12, 794.65it/s, env_step=1790000, len=1, loss/actor=-2.735, loss/critic1=6.110, loss/critic2=6.110, n/ep=100, n/st=100, rew=-2.46]                           


Epoch #179: test_reward: 1.060333 ± 0.143398, best_reward: 1.154821 ± 0.125385 in #163


Epoch #180: 10001it [00:12, 789.59it/s, env_step=1800000, len=1, loss/actor=-2.735, loss/critic1=6.155, loss/critic2=6.155, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #180: test_reward: 1.074369 ± 0.131621, best_reward: 1.154821 ± 0.125385 in #163


Epoch #181: 10001it [00:12, 772.15it/s, env_step=1810000, len=1, loss/actor=-2.733, loss/critic1=6.066, loss/critic2=6.066, n/ep=100, n/st=100, rew=-2.36]                           


Epoch #181: test_reward: 1.037565 ± 0.103357, best_reward: 1.154821 ± 0.125385 in #163


Epoch #182: 10001it [00:12, 805.44it/s, env_step=1820000, len=1, loss/actor=-2.735, loss/critic1=6.106, loss/critic2=6.106, n/ep=100, n/st=100, rew=-2.39]                           


Epoch #182: test_reward: 1.080630 ± 0.136788, best_reward: 1.154821 ± 0.125385 in #163


Epoch #183: 10001it [00:12, 811.66it/s, env_step=1830000, len=1, loss/actor=-2.734, loss/critic1=6.150, loss/critic2=6.150, n/ep=100, n/st=100, rew=-2.46]                           


Epoch #183: test_reward: 0.980871 ± 0.116003, best_reward: 1.154821 ± 0.125385 in #163


Epoch #184: 10001it [00:12, 816.23it/s, env_step=1840000, len=1, loss/actor=-2.732, loss/critic1=6.112, loss/critic2=6.112, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #184: test_reward: 1.034794 ± 0.181058, best_reward: 1.154821 ± 0.125385 in #163


Epoch #185: 10001it [00:12, 811.22it/s, env_step=1850000, len=1, loss/actor=-2.736, loss/critic1=6.073, loss/critic2=6.073, n/ep=100, n/st=100, rew=-2.49]                           


Epoch #185: test_reward: 1.044794 ± 0.078379, best_reward: 1.154821 ± 0.125385 in #163


Epoch #186: 10001it [00:12, 812.46it/s, env_step=1860000, len=1, loss/actor=-2.735, loss/critic1=6.112, loss/critic2=6.112, n/ep=100, n/st=100, rew=-2.34]                           


Epoch #186: test_reward: 1.054076 ± 0.104361, best_reward: 1.154821 ± 0.125385 in #163


Epoch #187: 10001it [00:12, 813.86it/s, env_step=1870000, len=1, loss/actor=-2.737, loss/critic1=6.121, loss/critic2=6.121, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #187: test_reward: 1.061175 ± 0.155666, best_reward: 1.154821 ± 0.125385 in #163


Epoch #188: 10001it [00:12, 814.89it/s, env_step=1880000, len=1, loss/actor=-2.734, loss/critic1=6.145, loss/critic2=6.145, n/ep=100, n/st=100, rew=-2.35]                           


Epoch #188: test_reward: 1.066628 ± 0.088714, best_reward: 1.154821 ± 0.125385 in #163


Epoch #189: 10001it [00:12, 814.10it/s, env_step=1890000, len=1, loss/actor=-2.735, loss/critic1=6.113, loss/critic2=6.113, n/ep=100, n/st=100, rew=-2.40]                           


Epoch #189: test_reward: 1.073143 ± 0.115656, best_reward: 1.154821 ± 0.125385 in #163


Epoch #190: 10001it [00:12, 790.28it/s, env_step=1900000, len=1, loss/actor=-2.733, loss/critic1=6.083, loss/critic2=6.083, n/ep=100, n/st=100, rew=-2.32]                           


Epoch #190: test_reward: 1.034295 ± 0.111901, best_reward: 1.154821 ± 0.125385 in #163


Epoch #191: 10001it [00:12, 788.29it/s, env_step=1910000, len=1, loss/actor=-2.736, loss/critic1=6.147, loss/critic2=6.147, n/ep=100, n/st=100, rew=-2.38]                           


Epoch #191: test_reward: 1.042030 ± 0.151298, best_reward: 1.154821 ± 0.125385 in #163


Epoch #192: 10001it [00:12, 818.33it/s, env_step=1920000, len=1, loss/actor=-2.734, loss/critic1=6.206, loss/critic2=6.206, n/ep=100, n/st=100, rew=-2.39]                           


Epoch #192: test_reward: 1.036747 ± 0.118222, best_reward: 1.154821 ± 0.125385 in #163


Epoch #193: 10001it [00:12, 813.80it/s, env_step=1930000, len=1, loss/actor=-2.732, loss/critic1=6.146, loss/critic2=6.146, n/ep=100, n/st=100, rew=-2.44]                           


Epoch #193: test_reward: 1.052459 ± 0.105410, best_reward: 1.154821 ± 0.125385 in #163


Epoch #194: 10001it [00:12, 803.12it/s, env_step=1940000, len=1, loss/actor=-2.734, loss/critic1=6.106, loss/critic2=6.106, n/ep=100, n/st=100, rew=-2.47]                           


Epoch #194: test_reward: 1.088526 ± 0.129423, best_reward: 1.154821 ± 0.125385 in #163


Epoch #195: 10001it [00:12, 818.38it/s, env_step=1950000, len=1, loss/actor=-2.732, loss/critic1=6.183, loss/critic2=6.183, n/ep=100, n/st=100, rew=-2.39]                           


Epoch #195: test_reward: 1.051698 ± 0.122456, best_reward: 1.154821 ± 0.125385 in #163


Epoch #196: 10001it [00:12, 824.76it/s, env_step=1960000, len=1, loss/actor=-2.734, loss/critic1=6.095, loss/critic2=6.095, n/ep=100, n/st=100, rew=-2.46]                           


Epoch #196: test_reward: 1.047201 ± 0.129189, best_reward: 1.154821 ± 0.125385 in #163


Epoch #197: 10001it [00:12, 813.54it/s, env_step=1970000, len=1, loss/actor=-2.734, loss/critic1=6.179, loss/critic2=6.179, n/ep=100, n/st=100, rew=-2.47]                           


Epoch #197: test_reward: 1.080108 ± 0.104867, best_reward: 1.154821 ± 0.125385 in #163


Epoch #198: 10001it [00:12, 790.59it/s, env_step=1980000, len=1, loss/actor=-2.733, loss/critic1=6.184, loss/critic2=6.184, n/ep=100, n/st=100, rew=-2.39]                           


Epoch #198: test_reward: 1.142869 ± 0.156013, best_reward: 1.154821 ± 0.125385 in #163


Epoch #199: 10001it [00:12, 785.58it/s, env_step=1990000, len=1, loss/actor=-2.736, loss/critic1=6.112, loss/critic2=6.112, n/ep=100, n/st=100, rew=-2.41]                           


Epoch #199: test_reward: 1.058799 ± 0.089497, best_reward: 1.154821 ± 0.125385 in #163


Epoch #200: 10001it [00:12, 797.98it/s, env_step=2000000, len=1, loss/actor=-2.731, loss/critic1=6.191, loss/critic2=6.191, n/ep=100, n/st=100, rew=-2.45]                           


Epoch #200: test_reward: 1.045275 ± 0.176816, best_reward: 1.154821 ± 0.125385 in #163
Finished training! Use 3139.84s


AttributeError: 'collections.OrderedDict' object has no attribute 'seek'. You can only torch.load from a file that is seekable. Please pre-load the data into a buffer like io.BytesIO and try to load from it instead.

In [118]:
writer.flush()

In [119]:
tensorboard --logdir=runs

SyntaxError: cannot assign to operator (3224537314.py, line 1)

In [None]:
policy.eval()
# policy.set_eps(0.05)
collector = ts.data.Collector(policy, env, exploration_noise=True)
collector.collect(n_episode=1, render=1 / 35)

  gym.logger.warn(


{'n/ep': 1,
 'n/st': 1,
 'rews': array([0.88650187]),
 'lens': array([1]),
 'idxs': array([0]),
 'rew': 0.8865018660339451,
 'len': 1.0,
 'rew_std': 0.0,
 'len_std': 0.0}