In [1]:
import numpy as np
import config.config as config
from Environment import Env
from Agent_LSTM import *
from pathlib import Path

import sys; sys.path.append('../analysis/')
from my_utils import reset_seeds

In [2]:
def training(datapath, seed_number, Actor, Critic, 
             expnoise_std=0.8, TOTAL_EPISODE=5e4, value_noise_std=0):
    # get configures
    arg = config.ConfigGain(datapath)
    arg.SEED_NUMBER = seed_number
    arg.save()
    
    # reproducibility
    reset_seeds(arg.SEED_NUMBER)

    # initialize environment and agent
    env = Env(arg)
    agent = Agent(arg, Actor, Critic)
    agent.episodic = False
    agent.actor.value_noise_std = value_noise_std
    
    # define exploration noise
    noise = ActionNoise(arg.ACTION_DIM, mean=0, std=expnoise_std)
    
    # Loop now
    tot_t = 0
    episode = agent.initial_episode
    reward_log = []
    rewarded_trial_log = []
    step_log = []
    actor_loss_log = 0
    critic_loss_log = 0
    num_update = 1e-5
    dist_log = []

    LOG_FREQ = 100
    REPLAY_PERIOD = 4
    PRE_LEARN_PERIOD = arg.BATCH_SIZE * 50
    random_stop = True

    # Start loop
    while episode < TOTAL_EPISODE:
        # initialize a trial
        cross_start_threshold = False
        x = env.reset()
        agent.bstep.reset(env.pro_gains)
        
        last_action = torch.zeros(1, 1, arg.ACTION_DIM)
        last_action_raw = last_action.clone()

        state = torch.cat([x[-arg.OBS_DIM:].view(1, 1, -1), last_action,
                           env.target_position_obs.view(1, 1, -1), 
                           torch.zeros(1, 1, 1)], dim=2).to(arg.device)

        hiddenin = None
        tend = 0

        states = []
        actions = []
        rewards = []
        dones = []

        for t in range(arg.EPISODE_LEN):
            # 1. Check start threshold.
            if not cross_start_threshold and (last_action.abs() > arg.TERMINAL_ACTION).any():
                cross_start_threshold = True

            # 2. Take an action based on current state 
            # and previous hidden & cell states of LSTM units.
            action, action_raw, hiddenout = agent.select_action(state, hiddenin, action_noise=noise)
            if random_stop and np.random.rand() > 0.95:
                action = torch.zeros_like(action)

            # 3. Track next x in the environment.
            next_x, reached_target, relative_dist = env(x, action, t - tend)

            # 4. Next observation given next x.
            next_ox = agent.bstep(next_x)
            next_state = torch.cat([next_ox.view(1, 1, -1), action, env.target_position_obs.view(1, 1, -1),
                                    torch.ones(1, 1, 1) * t - tend + 1], dim=2).to(arg.device)

            # 5. Check whether stop.
            is_stop = env.is_stop(x, action)

            # 6. Give reward if stopped.          
            if is_stop and cross_start_threshold:
                reward = env.return_reward(x, reward_mode='mixed')
                done = torch.ones(1, 1, 1)
            else:
                reward = torch.zeros(1, 1, 1)
                done = torch.zeros(1, 1, 1)

            # 7. Append data.
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)

            # 8. Update timestep.
            last_action_raw = action_raw
            last_action = action
            state = next_state
            x = next_x
            hiddenin = hiddenout
            tot_t += 1

            # 9. Update model.
            if len(agent.memory.memory) > PRE_LEARN_PERIOD and tot_t % REPLAY_PERIOD == 0:
                actor_loss, critic_loss = agent.learn()
                actor_loss_log += actor_loss
                critic_loss_log += critic_loss
                num_update += 1

            # 10. whether break.
            if is_stop and cross_start_threshold:
                step_log.append(t + 1 - tend)
                reward_log.append(reward.item())
                rewarded_trial_log.append(int(reached_target & is_stop))
                dist_log.append(relative_dist.item())
                # initialize a trial
                cross_start_threshold = False
                x = env.reset()
                agent.bstep.reset(env.pro_gains)
                state = torch.cat([x[-arg.OBS_DIM:].view(1, 1, -1), last_action,
                                   env.target_position_obs.view(1, 1, -1),
                                   torch.zeros(1, 1, 1)], dim=2).to(arg.device)
                tend = t + 1


        # store the last state
        states.append(state)
        # End of one trial, store trajectory into buffer.
        states = torch.cat(states)
        actions = torch.cat(actions).to(arg.device)
        rewards = torch.cat(rewards).to(arg.device)
        dones = torch.cat(dones).to(arg.device)
        agent.memory.push(states, actions, rewards, dones) 

        # store mirrored trajectories reflected along y-axis
        agent.memory.push(*agent.mirror_traj(states, actions), rewards, dones) 

        if episode % LOG_FREQ == LOG_FREQ - 1:
            # save
            agent.save(save_memory=False, episode=episode, pre_phase=False, full_param=True)
            
            print(f"t: {tot_t}, Ep: {episode}, action std: {noise.std:0.2f}")
            print(f"mean steps: {np.mean(step_log):0.3f}, "
                  f"mean reward: {np.mean(reward_log):0.3f}, "
                  f"reward rate: {np.sum(reward_log) / np.sum(step_log):0.3f}, "
                  f"rewarded fraction: {np.mean(rewarded_trial_log):0.3f}, "
                  f"relative distance: {np.mean(dist_log) * arg.LINEAR_SCALE:0.3f}, "
                  f"critic loss: {critic_loss_log / num_update:0.3f}, "
                  f"actor loss: {-actor_loss_log / (num_update/2):0.3f}")
            
            reward_rate = np.sum(reward_log) / np.sum(step_log)
            if reward_rate > 0.2:
                random_stop = False
                noise.reset(mean=0, std=0.5)
                
                
            #if np.mean(rewarded_trial_log) > 0.85:
            #    break

            reward_log = []
            rewarded_trial_log = []
            step_log = []
            actor_loss_log = 0
            critic_loss_log = 0
            num_update = 1e-5
            dist_log = []
            
        episode += 1

In [3]:
actors = ['Actor_novalue_learnb']
critics = ['Critic']
seeds = [[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24]]
expnoise_std = 0.8
TOTAL_EPISODE = 1e4
folder_path = Path('D:/quitting_data/agents_no_curriculum')

value_noise_std = 0

In [4]:
for actor, critic, seed_ in zip(actors, critics, seeds):
    for seed in seed_:
        datapath = folder_path / f'{actor}{critic}' / f'seed{seed}' 
        exec(f'from {actor} import *'); exec(f'from {critic} import *')
        training(datapath, seed, Actor, Critic, expnoise_std, TOTAL_EPISODE, 
                 value_noise_std)

t: 10000, Ep: 99, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  print(f"mean steps: {np.mean(step_log):0.3f}, "
  reward_rate = np.sum(reward_log) / np.sum(step_log)


t: 20000, Ep: 199, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 30000, Ep: 299, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 40000, Ep: 399, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 50000, Ep: 499, action std: 0.80
mean steps: 17.609, mean reward: 0.302, reward rate: 0.017, rewarded fraction: 0.018, relative distance: 263.212, critic loss: 0.022, actor loss: 0.005
t: 60000, Ep: 599, action std: 0.80
mean steps: 17.509, mean reward: 0.365, reward rate: 0.021, rewarded fraction: 0.026, relative distance: 253.123, critic loss: 0.052, actor loss: -0.009
t: 70000, Ep: 699, action std: 0.80
mean steps: 16.357, mean reward: 0.452, reward rate: 0.028, re

t: 460000, Ep: 4599, action std: 0.80
mean steps: 16.029, mean reward: 0.045, reward rate: 0.003, rewarded fraction: 0.002, relative distance: 303.478, critic loss: 0.004, actor loss: -0.056
t: 470000, Ep: 4699, action std: 0.80
mean steps: 17.548, mean reward: 0.021, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 308.248, critic loss: 0.003, actor loss: -0.058
t: 480000, Ep: 4799, action std: 0.80
mean steps: 17.057, mean reward: 0.073, reward rate: 0.004, rewarded fraction: 0.004, relative distance: 303.360, critic loss: 0.003, actor loss: -0.058
t: 490000, Ep: 4899, action std: 0.80
mean steps: 16.018, mean reward: 0.025, reward rate: 0.002, rewarded fraction: 0.000, relative distance: 310.639, critic loss: 0.005, actor loss: -0.057
t: 500000, Ep: 4999, action std: 0.80
mean steps: 16.446, mean reward: 0.021, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 307.887, critic loss: 0.004, actor loss: -0.057
t: 510000, Ep: 5099, action std: 0.80
mean st

t: 890000, Ep: 8899, action std: 0.80
mean steps: 5.691, mean reward: 0.740, reward rate: 0.130, rewarded fraction: 0.060, relative distance: 240.826, critic loss: 0.475, actor loss: 2.514
t: 900000, Ep: 8999, action std: 0.80
mean steps: 5.900, mean reward: 1.163, reward rate: 0.197, rewarded fraction: 0.104, relative distance: 240.325, critic loss: 0.420, actor loss: 3.354
t: 910000, Ep: 9099, action std: 0.80
mean steps: 5.978, mean reward: 1.112, reward rate: 0.186, rewarded fraction: 0.098, relative distance: 236.775, critic loss: 0.521, actor loss: 4.308
t: 920000, Ep: 9199, action std: 0.80
mean steps: 6.314, mean reward: 1.438, reward rate: 0.228, rewarded fraction: 0.129, relative distance: 226.516, critic loss: 0.552, actor loss: 5.368
t: 930000, Ep: 9299, action std: 0.50
mean steps: 7.536, mean reward: 2.789, reward rate: 0.370, rewarded fraction: 0.262, relative distance: 199.780, critic loss: 0.577, actor loss: 6.474
t: 940000, Ep: 9399, action std: 0.50
mean steps: 7.367

t: 330000, Ep: 3299, action std: 0.50
mean steps: 12.162, mean reward: 5.830, reward rate: 0.479, rewarded fraction: 0.559, relative distance: 130.551, critic loss: 1.120, actor loss: 23.658
t: 340000, Ep: 3399, action std: 0.50
mean steps: 12.407, mean reward: 6.176, reward rate: 0.498, rewarded fraction: 0.597, relative distance: 123.330, critic loss: 1.100, actor loss: 23.823
t: 350000, Ep: 3499, action std: 0.50
mean steps: 12.232, mean reward: 6.129, reward rate: 0.501, rewarded fraction: 0.595, relative distance: 126.686, critic loss: 1.084, actor loss: 23.940
t: 360000, Ep: 3599, action std: 0.50
mean steps: 12.476, mean reward: 5.908, reward rate: 0.474, rewarded fraction: 0.563, relative distance: 124.839, critic loss: 1.103, actor loss: 24.046
t: 370000, Ep: 3699, action std: 0.50
mean steps: 12.620, mean reward: 6.289, reward rate: 0.498, rewarded fraction: 0.606, relative distance: 121.684, critic loss: 1.092, actor loss: 24.165
t: 380000, Ep: 3799, action std: 0.50
mean st

t: 760000, Ep: 7599, action std: 0.50
mean steps: 13.467, mean reward: 6.874, reward rate: 0.510, rewarded fraction: 0.663, relative distance: 105.437, critic loss: 1.004, actor loss: 24.986
t: 770000, Ep: 7699, action std: 0.50
mean steps: 13.695, mean reward: 7.124, reward rate: 0.520, rewarded fraction: 0.693, relative distance: 100.444, critic loss: 0.988, actor loss: 24.990
t: 780000, Ep: 7799, action std: 0.50
mean steps: 13.438, mean reward: 6.907, reward rate: 0.514, rewarded fraction: 0.669, relative distance: 107.222, critic loss: 1.030, actor loss: 24.983
t: 790000, Ep: 7899, action std: 0.50
mean steps: 13.822, mean reward: 6.889, reward rate: 0.498, rewarded fraction: 0.667, relative distance: 107.831, critic loss: 1.003, actor loss: 24.987
t: 800000, Ep: 7999, action std: 0.50
mean steps: 13.418, mean reward: 6.809, reward rate: 0.507, rewarded fraction: 0.661, relative distance: 110.873, critic loss: 0.984, actor loss: 24.954
t: 810000, Ep: 8099, action std: 0.50
mean st

t: 200000, Ep: 1999, action std: 0.80
mean steps: 16.006, mean reward: 0.859, reward rate: 0.054, rewarded fraction: 0.071, relative distance: 223.469, critic loss: 0.066, actor loss: -0.146
t: 210000, Ep: 2099, action std: 0.80
mean steps: 7.458, mean reward: 0.066, reward rate: 0.009, rewarded fraction: 0.002, relative distance: 264.231, critic loss: 0.335, actor loss: 0.285
t: 220000, Ep: 2199, action std: 0.80
mean steps: 6.619, mean reward: 0.075, reward rate: 0.011, rewarded fraction: 0.002, relative distance: 268.051, critic loss: 0.230, actor loss: 0.256
t: 230000, Ep: 2299, action std: 0.80
mean steps: 6.404, mean reward: 0.075, reward rate: 0.012, rewarded fraction: 0.003, relative distance: 255.630, critic loss: 0.223, actor loss: 0.268
t: 240000, Ep: 2399, action std: 0.80
mean steps: 6.921, mean reward: 0.366, reward rate: 0.053, rewarded fraction: 0.021, relative distance: 230.678, critic loss: 0.271, actor loss: 0.348
t: 250000, Ep: 2499, action std: 0.80
mean steps: 7.0

t: 630000, Ep: 6299, action std: 0.50
mean steps: 13.459, mean reward: 6.864, reward rate: 0.510, rewarded fraction: 0.666, relative distance: 109.082, critic loss: 1.093, actor loss: 25.361
t: 640000, Ep: 6399, action std: 0.50
mean steps: 13.477, mean reward: 6.608, reward rate: 0.490, rewarded fraction: 0.636, relative distance: 113.028, critic loss: 1.106, actor loss: 25.413
t: 650000, Ep: 6499, action std: 0.50
mean steps: 12.421, mean reward: 6.444, reward rate: 0.519, rewarded fraction: 0.621, relative distance: 121.007, critic loss: 1.099, actor loss: 25.433
t: 660000, Ep: 6599, action std: 0.50
mean steps: 12.750, mean reward: 6.250, reward rate: 0.490, rewarded fraction: 0.600, relative distance: 122.203, critic loss: 1.082, actor loss: 25.477
t: 670000, Ep: 6699, action std: 0.50
mean steps: 13.024, mean reward: 6.758, reward rate: 0.519, rewarded fraction: 0.657, relative distance: 112.759, critic loss: 1.102, actor loss: 25.516
t: 680000, Ep: 6799, action std: 0.50
mean st

t: 70000, Ep: 699, action std: 0.80
mean steps: 14.785, mean reward: 0.462, reward rate: 0.031, rewarded fraction: 0.033, relative distance: 249.794, critic loss: 0.062, actor loss: 0.211
t: 80000, Ep: 799, action std: 0.80
mean steps: 8.687, mean reward: 0.452, reward rate: 0.052, rewarded fraction: 0.035, relative distance: 234.545, critic loss: 0.121, actor loss: 0.692
t: 90000, Ep: 899, action std: 0.80
mean steps: 13.031, mean reward: 0.923, reward rate: 0.071, rewarded fraction: 0.075, relative distance: 208.115, critic loss: 0.180, actor loss: 1.810
t: 100000, Ep: 999, action std: 0.80
mean steps: 11.242, mean reward: 0.686, reward rate: 0.061, rewarded fraction: 0.048, relative distance: 198.153, critic loss: 0.324, actor loss: 3.012
t: 110000, Ep: 1099, action std: 0.80
mean steps: 10.547, mean reward: 1.130, reward rate: 0.107, rewarded fraction: 0.093, relative distance: 190.952, critic loss: 0.851, actor loss: 4.092
t: 120000, Ep: 1199, action std: 0.80
mean steps: 10.340, 

t: 510000, Ep: 5099, action std: 0.50
mean steps: 14.184, mean reward: 6.943, reward rate: 0.490, rewarded fraction: 0.671, relative distance: 102.904, critic loss: 1.057, actor loss: 26.602
t: 520000, Ep: 5199, action std: 0.50
mean steps: 14.856, mean reward: 7.581, reward rate: 0.510, rewarded fraction: 0.739, relative distance: 85.756, critic loss: 1.069, actor loss: 26.675
t: 530000, Ep: 5299, action std: 0.50
mean steps: 14.167, mean reward: 7.045, reward rate: 0.497, rewarded fraction: 0.680, relative distance: 97.595, critic loss: 1.078, actor loss: 26.715
t: 540000, Ep: 5399, action std: 0.50
mean steps: 14.881, mean reward: 7.506, reward rate: 0.504, rewarded fraction: 0.727, relative distance: 85.823, critic loss: 1.061, actor loss: 26.761
t: 550000, Ep: 5499, action std: 0.50
mean steps: 14.073, mean reward: 7.180, reward rate: 0.510, rewarded fraction: 0.698, relative distance: 99.593, critic loss: 1.077, actor loss: 26.797
t: 560000, Ep: 5599, action std: 0.50
mean steps:

t: 950000, Ep: 9499, action std: 0.50
mean steps: 14.407, mean reward: 7.323, reward rate: 0.508, rewarded fraction: 0.711, relative distance: 92.609, critic loss: 1.036, actor loss: 26.551
t: 960000, Ep: 9599, action std: 0.50
mean steps: 15.026, mean reward: 7.619, reward rate: 0.507, rewarded fraction: 0.735, relative distance: 80.804, critic loss: 1.046, actor loss: 26.566
t: 970000, Ep: 9699, action std: 0.50
mean steps: 14.420, mean reward: 7.476, reward rate: 0.518, rewarded fraction: 0.728, relative distance: 90.137, critic loss: 1.029, actor loss: 26.555
t: 980000, Ep: 9799, action std: 0.50
mean steps: 14.448, mean reward: 7.385, reward rate: 0.511, rewarded fraction: 0.716, relative distance: 91.053, critic loss: 1.034, actor loss: 26.493
t: 990000, Ep: 9899, action std: 0.50
mean steps: 14.772, mean reward: 7.362, reward rate: 0.498, rewarded fraction: 0.712, relative distance: 91.076, critic loss: 1.048, actor loss: 26.503
t: 1000000, Ep: 9999, action std: 0.50
mean steps:

t: 390000, Ep: 3899, action std: 0.50
mean steps: 14.143, mean reward: 6.511, reward rate: 0.460, rewarded fraction: 0.622, relative distance: 101.565, critic loss: 0.801, actor loss: 11.483
t: 400000, Ep: 3999, action std: 0.50
mean steps: 14.402, mean reward: 7.033, reward rate: 0.488, rewarded fraction: 0.677, relative distance: 90.045, critic loss: 0.829, actor loss: 12.817
t: 410000, Ep: 4099, action std: 0.50
mean steps: 15.226, mean reward: 7.189, reward rate: 0.472, rewarded fraction: 0.694, relative distance: 87.645, critic loss: 0.819, actor loss: 14.051
t: 420000, Ep: 4199, action std: 0.50
mean steps: 14.851, mean reward: 6.791, reward rate: 0.457, rewarded fraction: 0.647, relative distance: 94.905, critic loss: 0.811, actor loss: 15.194
t: 430000, Ep: 4299, action std: 0.50
mean steps: 14.218, mean reward: 6.554, reward rate: 0.461, rewarded fraction: 0.620, relative distance: 92.556, critic loss: 0.822, actor loss: 16.234
t: 440000, Ep: 4399, action std: 0.50
mean steps:

t: 830000, Ep: 8299, action std: 0.50
mean steps: 15.217, mean reward: 7.399, reward rate: 0.486, rewarded fraction: 0.708, relative distance: 79.316, critic loss: 1.005, actor loss: 24.825
t: 840000, Ep: 8399, action std: 0.50
mean steps: 15.837, mean reward: 7.738, reward rate: 0.489, rewarded fraction: 0.750, relative distance: 75.149, critic loss: 0.959, actor loss: 24.875
t: 850000, Ep: 8499, action std: 0.50
mean steps: 14.857, mean reward: 7.461, reward rate: 0.502, rewarded fraction: 0.718, relative distance: 82.943, critic loss: 0.959, actor loss: 24.911
t: 860000, Ep: 8599, action std: 0.50
mean steps: 14.972, mean reward: 7.683, reward rate: 0.513, rewarded fraction: 0.739, relative distance: 76.062, critic loss: 0.974, actor loss: 24.926
t: 870000, Ep: 8699, action std: 0.50
mean steps: 14.836, mean reward: 7.718, reward rate: 0.520, rewarded fraction: 0.750, relative distance: 82.051, critic loss: 0.927, actor loss: 24.935
t: 880000, Ep: 8799, action std: 0.50
mean steps: 

t: 270000, Ep: 2699, action std: 0.80
mean steps: 16.855, mean reward: 0.159, reward rate: 0.009, rewarded fraction: 0.010, relative distance: 252.719, critic loss: 0.033, actor loss: -0.106
t: 280000, Ep: 2799, action std: 0.80
mean steps: 16.442, mean reward: 0.086, reward rate: 0.005, rewarded fraction: 0.004, relative distance: 268.955, critic loss: 0.028, actor loss: -0.098
t: 290000, Ep: 2899, action std: 0.80
mean steps: 16.683, mean reward: 0.102, reward rate: 0.006, rewarded fraction: 0.006, relative distance: 282.019, critic loss: 0.028, actor loss: -0.091
t: 300000, Ep: 2999, action std: 0.80
mean steps: 16.289, mean reward: 0.215, reward rate: 0.013, rewarded fraction: 0.016, relative distance: 278.388, critic loss: 0.024, actor loss: -0.086
t: 310000, Ep: 3099, action std: 0.80
mean steps: 15.807, mean reward: 0.179, reward rate: 0.011, rewarded fraction: 0.011, relative distance: 280.370, critic loss: 0.027, actor loss: -0.085
t: 320000, Ep: 3199, action std: 0.80
mean st

t: 710000, Ep: 7099, action std: 0.80
mean steps: 7.265, mean reward: 1.363, reward rate: 0.188, rewarded fraction: 0.120, relative distance: 214.310, critic loss: 0.943, actor loss: 8.568
t: 720000, Ep: 7199, action std: 0.80
mean steps: 7.146, mean reward: 1.374, reward rate: 0.192, rewarded fraction: 0.124, relative distance: 215.714, critic loss: 0.965, actor loss: 9.123
t: 730000, Ep: 7299, action std: 0.80
mean steps: 7.219, mean reward: 1.471, reward rate: 0.204, rewarded fraction: 0.133, relative distance: 214.716, critic loss: 1.088, actor loss: 9.682
t: 740000, Ep: 7399, action std: 0.50
mean steps: 8.843, mean reward: 2.662, reward rate: 0.301, rewarded fraction: 0.247, relative distance: 181.638, critic loss: 1.123, actor loss: 10.235
t: 750000, Ep: 7499, action std: 0.50
mean steps: 8.811, mean reward: 2.793, reward rate: 0.317, rewarded fraction: 0.265, relative distance: 183.881, critic loss: 1.144, actor loss: 10.723
t: 760000, Ep: 7599, action std: 0.50
mean steps: 9.1

t: 150000, Ep: 1499, action std: 0.80
mean steps: 18.026, mean reward: 0.161, reward rate: 0.009, rewarded fraction: 0.011, relative distance: 276.653, critic loss: 0.047, actor loss: -0.104
t: 160000, Ep: 1599, action std: 0.80
mean steps: 17.006, mean reward: 0.085, reward rate: 0.005, rewarded fraction: 0.006, relative distance: 314.762, critic loss: 0.036, actor loss: -0.126
t: 170000, Ep: 1699, action std: 0.80
mean steps: 16.650, mean reward: 0.061, reward rate: 0.004, rewarded fraction: 0.004, relative distance: 357.922, critic loss: 0.034, actor loss: -0.132
t: 180000, Ep: 1799, action std: 0.80
mean steps: 16.442, mean reward: 0.047, reward rate: 0.003, rewarded fraction: 0.002, relative distance: 351.288, critic loss: 0.028, actor loss: -0.131
t: 190000, Ep: 1899, action std: 0.80
mean steps: 16.874, mean reward: 0.056, reward rate: 0.003, rewarded fraction: 0.004, relative distance: 345.982, critic loss: 0.027, actor loss: -0.137
t: 200000, Ep: 1999, action std: 0.80
mean st

t: 590000, Ep: 5899, action std: 0.50
mean steps: 12.302, mean reward: 5.696, reward rate: 0.463, rewarded fraction: 0.546, relative distance: 128.624, critic loss: 1.042, actor loss: 19.906
t: 600000, Ep: 5999, action std: 0.50
mean steps: 12.847, mean reward: 5.981, reward rate: 0.466, rewarded fraction: 0.575, relative distance: 121.896, critic loss: 1.023, actor loss: 20.366
t: 610000, Ep: 6099, action std: 0.50
mean steps: 12.840, mean reward: 6.212, reward rate: 0.484, rewarded fraction: 0.601, relative distance: 120.618, critic loss: 1.007, actor loss: 20.804
t: 620000, Ep: 6199, action std: 0.50
mean steps: 12.819, mean reward: 5.965, reward rate: 0.465, rewarded fraction: 0.572, relative distance: 123.032, critic loss: 1.028, actor loss: 21.250
t: 630000, Ep: 6299, action std: 0.50
mean steps: 12.967, mean reward: 5.959, reward rate: 0.460, rewarded fraction: 0.566, relative distance: 117.310, critic loss: 1.014, actor loss: 21.594
t: 640000, Ep: 6399, action std: 0.50
mean st

t: 30000, Ep: 299, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 40000, Ep: 399, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 50000, Ep: 499, action std: 0.80
mean steps: 10.668, mean reward: 0.022, reward rate: 0.002, rewarded fraction: 0.000, relative distance: 313.758, critic loss: 0.000, actor loss: 0.003
t: 60000, Ep: 599, action std: 0.80
mean steps: 6.428, mean reward: 0.026, reward rate: 0.004, rewarded fraction: 0.000, relative distance: 294.204, critic loss: 0.001, actor loss: 0.004
t: 70000, Ep: 699, action std: 0.80
mean steps: 6.887, mean reward: 0.188, reward rate: 0.027, rewarded fraction: 0.011, relative distance: 246.271, critic loss: 0.019, actor loss: 0.121
t: 80000, Ep: 799, action std: 0.80
mean steps: 11.478, mean reward: 0.489, reward rate:

t: 470000, Ep: 4699, action std: 0.50
mean steps: 13.441, mean reward: 6.176, reward rate: 0.459, rewarded fraction: 0.587, relative distance: 115.133, critic loss: 0.815, actor loss: 14.659
t: 480000, Ep: 4799, action std: 0.50
mean steps: 14.036, mean reward: 6.265, reward rate: 0.446, rewarded fraction: 0.592, relative distance: 109.841, critic loss: 0.816, actor loss: 15.559
t: 490000, Ep: 4899, action std: 0.50
mean steps: 13.789, mean reward: 6.586, reward rate: 0.478, rewarded fraction: 0.633, relative distance: 106.879, critic loss: 0.808, actor loss: 16.373
t: 500000, Ep: 4999, action std: 0.50
mean steps: 13.713, mean reward: 6.370, reward rate: 0.465, rewarded fraction: 0.602, relative distance: 108.342, critic loss: 0.849, actor loss: 17.121
t: 510000, Ep: 5099, action std: 0.50
mean steps: 13.855, mean reward: 6.635, reward rate: 0.479, rewarded fraction: 0.633, relative distance: 104.077, critic loss: 0.841, actor loss: 17.814
t: 520000, Ep: 5199, action std: 0.50
mean st

t: 900000, Ep: 8999, action std: 0.50
mean steps: 13.171, mean reward: 6.625, reward rate: 0.503, rewarded fraction: 0.633, relative distance: 106.814, critic loss: 1.068, actor loss: 24.667
t: 910000, Ep: 9099, action std: 0.50
mean steps: 12.735, mean reward: 6.133, reward rate: 0.482, rewarded fraction: 0.582, relative distance: 119.123, critic loss: 1.035, actor loss: 24.680
t: 920000, Ep: 9199, action std: 0.50
mean steps: 12.340, mean reward: 6.270, reward rate: 0.508, rewarded fraction: 0.604, relative distance: 121.373, critic loss: 1.069, actor loss: 24.682
t: 930000, Ep: 9299, action std: 0.50
mean steps: 13.281, mean reward: 6.737, reward rate: 0.507, rewarded fraction: 0.645, relative distance: 103.805, critic loss: 1.034, actor loss: 24.664
t: 940000, Ep: 9399, action std: 0.50
mean steps: 12.852, mean reward: 6.341, reward rate: 0.493, rewarded fraction: 0.611, relative distance: 117.034, critic loss: 1.025, actor loss: 24.641
t: 950000, Ep: 9499, action std: 0.50
mean st

t: 340000, Ep: 3399, action std: 0.50
mean steps: 13.271, mean reward: 6.657, reward rate: 0.502, rewarded fraction: 0.643, relative distance: 109.066, critic loss: 1.099, actor loss: 25.074
t: 350000, Ep: 3499, action std: 0.50
mean steps: 13.314, mean reward: 6.481, reward rate: 0.487, rewarded fraction: 0.624, relative distance: 114.444, critic loss: 1.096, actor loss: 25.284
t: 360000, Ep: 3599, action std: 0.50
mean steps: 13.618, mean reward: 6.927, reward rate: 0.509, rewarded fraction: 0.673, relative distance: 103.757, critic loss: 1.110, actor loss: 25.427
t: 370000, Ep: 3699, action std: 0.50
mean steps: 13.886, mean reward: 6.837, reward rate: 0.492, rewarded fraction: 0.663, relative distance: 105.722, critic loss: 1.104, actor loss: 25.604
t: 380000, Ep: 3799, action std: 0.50
mean steps: 13.564, mean reward: 6.963, reward rate: 0.513, rewarded fraction: 0.673, relative distance: 101.775, critic loss: 1.103, actor loss: 25.736
t: 390000, Ep: 3899, action std: 0.50
mean st

t: 780000, Ep: 7799, action std: 0.50
mean steps: 14.049, mean reward: 7.069, reward rate: 0.503, rewarded fraction: 0.681, relative distance: 95.440, critic loss: 1.142, actor loss: 26.830
t: 790000, Ep: 7899, action std: 0.50
mean steps: 14.450, mean reward: 7.017, reward rate: 0.486, rewarded fraction: 0.673, relative distance: 96.491, critic loss: 1.121, actor loss: 26.851
t: 800000, Ep: 7999, action std: 0.50
mean steps: 14.302, mean reward: 7.018, reward rate: 0.491, rewarded fraction: 0.675, relative distance: 96.234, critic loss: 1.125, actor loss: 26.857
t: 810000, Ep: 8099, action std: 0.50
mean steps: 13.773, mean reward: 6.817, reward rate: 0.495, rewarded fraction: 0.657, relative distance: 103.380, critic loss: 1.135, actor loss: 26.818
t: 820000, Ep: 8199, action std: 0.50
mean steps: 13.805, mean reward: 6.861, reward rate: 0.497, rewarded fraction: 0.661, relative distance: 102.330, critic loss: 1.129, actor loss: 26.807
t: 830000, Ep: 8299, action std: 0.50
mean steps

t: 220000, Ep: 2199, action std: 0.50
mean steps: 9.663, mean reward: 4.437, reward rate: 0.459, rewarded fraction: 0.422, relative distance: 166.566, critic loss: 1.281, actor loss: 14.040
t: 230000, Ep: 2299, action std: 0.50
mean steps: 10.084, mean reward: 4.435, reward rate: 0.440, rewarded fraction: 0.423, relative distance: 166.857, critic loss: 1.269, actor loss: 15.046
t: 240000, Ep: 2399, action std: 0.50
mean steps: 10.295, mean reward: 4.565, reward rate: 0.443, rewarded fraction: 0.438, relative distance: 167.271, critic loss: 1.265, actor loss: 16.041
t: 250000, Ep: 2499, action std: 0.50
mean steps: 10.156, mean reward: 4.637, reward rate: 0.457, rewarded fraction: 0.446, relative distance: 166.904, critic loss: 1.258, actor loss: 17.023
t: 260000, Ep: 2599, action std: 0.50
mean steps: 11.757, mean reward: 5.750, reward rate: 0.489, rewarded fraction: 0.552, relative distance: 133.718, critic loss: 1.238, actor loss: 17.933
t: 270000, Ep: 2699, action std: 0.50
mean ste

t: 650000, Ep: 6499, action std: 0.50
mean steps: 12.168, mean reward: 6.006, reward rate: 0.494, rewarded fraction: 0.580, relative distance: 131.192, critic loss: 1.146, actor loss: 25.366
t: 660000, Ep: 6599, action std: 0.50
mean steps: 12.496, mean reward: 6.255, reward rate: 0.501, rewarded fraction: 0.602, relative distance: 122.401, critic loss: 1.138, actor loss: 25.379
t: 670000, Ep: 6699, action std: 0.50
mean steps: 12.616, mean reward: 6.470, reward rate: 0.513, rewarded fraction: 0.624, relative distance: 116.553, critic loss: 1.151, actor loss: 25.366
t: 680000, Ep: 6799, action std: 0.50
mean steps: 12.444, mean reward: 6.103, reward rate: 0.490, rewarded fraction: 0.590, relative distance: 127.458, critic loss: 1.142, actor loss: 25.363
t: 690000, Ep: 6899, action std: 0.50
mean steps: 12.659, mean reward: 6.450, reward rate: 0.509, rewarded fraction: 0.625, relative distance: 121.192, critic loss: 1.131, actor loss: 25.355
t: 700000, Ep: 6999, action std: 0.50
mean st

t: 90000, Ep: 899, action std: 0.80
mean steps: 15.931, mean reward: 0.369, reward rate: 0.023, rewarded fraction: 0.026, relative distance: 250.754, critic loss: 0.085, actor loss: 0.124
t: 100000, Ep: 999, action std: 0.80
mean steps: 16.321, mean reward: 0.696, reward rate: 0.043, rewarded fraction: 0.054, relative distance: 235.452, critic loss: 0.105, actor loss: 0.091
t: 110000, Ep: 1099, action std: 0.80
mean steps: 15.973, mean reward: 0.550, reward rate: 0.034, rewarded fraction: 0.042, relative distance: 265.826, critic loss: 0.126, actor loss: 0.051
t: 120000, Ep: 1199, action std: 0.80
mean steps: 16.279, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 370.084, critic loss: 0.094, actor loss: -0.000
t: 130000, Ep: 1299, action std: 0.80
mean steps: 16.377, mean reward: 0.015, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 398.453, critic loss: 0.066, actor loss: -0.030
t: 140000, Ep: 1399, action std: 0.80
mean steps: 1

t: 530000, Ep: 5299, action std: 0.50
mean steps: 11.494, mean reward: 5.713, reward rate: 0.497, rewarded fraction: 0.549, relative distance: 147.144, critic loss: 0.939, actor loss: 25.656
t: 540000, Ep: 5399, action std: 0.50
mean steps: 11.429, mean reward: 5.684, reward rate: 0.497, rewarded fraction: 0.546, relative distance: 143.254, critic loss: 0.932, actor loss: 25.742
t: 550000, Ep: 5499, action std: 0.50
mean steps: 11.891, mean reward: 6.168, reward rate: 0.519, rewarded fraction: 0.596, relative distance: 131.203, critic loss: 0.930, actor loss: 25.815
t: 560000, Ep: 5599, action std: 0.50
mean steps: 11.553, mean reward: 5.889, reward rate: 0.510, rewarded fraction: 0.570, relative distance: 141.506, critic loss: 0.931, actor loss: 25.840
t: 570000, Ep: 5699, action std: 0.50
mean steps: 11.212, mean reward: 5.665, reward rate: 0.505, rewarded fraction: 0.548, relative distance: 147.306, critic loss: 0.921, actor loss: 25.859
t: 580000, Ep: 5799, action std: 0.50
mean st

t: 960000, Ep: 9599, action std: 0.50
mean steps: 9.609, mean reward: 5.026, reward rate: 0.523, rewarded fraction: 0.484, relative distance: 169.523, critic loss: 0.955, actor loss: 25.436
t: 970000, Ep: 9699, action std: 0.50
mean steps: 8.930, mean reward: 4.789, reward rate: 0.536, rewarded fraction: 0.460, relative distance: 174.755, critic loss: 1.042, actor loss: 25.424
t: 980000, Ep: 9799, action std: 0.50
mean steps: 9.042, mean reward: 4.693, reward rate: 0.519, rewarded fraction: 0.449, relative distance: 177.834, critic loss: 1.010, actor loss: 25.399
t: 990000, Ep: 9899, action std: 0.50
mean steps: 8.321, mean reward: 4.470, reward rate: 0.537, rewarded fraction: 0.434, relative distance: 192.706, critic loss: 0.953, actor loss: 25.404
t: 1000000, Ep: 9999, action std: 0.50
mean steps: 8.643, mean reward: 4.542, reward rate: 0.526, rewarded fraction: 0.433, relative distance: 180.510, critic loss: 1.061, actor loss: 25.371
t: 10000, Ep: 99, action std: 0.80
mean steps: 5.

t: 400000, Ep: 3999, action std: 0.80
mean steps: 6.457, mean reward: 1.120, reward rate: 0.173, rewarded fraction: 0.099, relative distance: 225.847, critic loss: 0.610, actor loss: 2.222
t: 410000, Ep: 4099, action std: 0.80
mean steps: 6.645, mean reward: 1.265, reward rate: 0.190, rewarded fraction: 0.112, relative distance: 222.644, critic loss: 0.677, actor loss: 3.460
t: 420000, Ep: 4199, action std: 0.80
mean steps: 6.660, mean reward: 1.140, reward rate: 0.171, rewarded fraction: 0.098, relative distance: 222.775, critic loss: 0.674, actor loss: 4.769
t: 430000, Ep: 4299, action std: 0.80
mean steps: 6.614, mean reward: 1.243, reward rate: 0.188, rewarded fraction: 0.109, relative distance: 223.310, critic loss: 0.759, actor loss: 6.104
t: 440000, Ep: 4399, action std: 0.80
mean steps: 6.732, mean reward: 1.423, reward rate: 0.211, rewarded fraction: 0.126, relative distance: 219.282, critic loss: 0.899, actor loss: 7.230
t: 450000, Ep: 4499, action std: 0.50
mean steps: 8.080

t: 840000, Ep: 8399, action std: 0.50
mean steps: 14.736, mean reward: 7.478, reward rate: 0.507, rewarded fraction: 0.723, relative distance: 86.159, critic loss: 0.890, actor loss: 26.431
t: 850000, Ep: 8499, action std: 0.50
mean steps: 14.247, mean reward: 7.233, reward rate: 0.508, rewarded fraction: 0.699, relative distance: 95.749, critic loss: 0.883, actor loss: 26.448
t: 860000, Ep: 8599, action std: 0.50
mean steps: 14.371, mean reward: 7.557, reward rate: 0.526, rewarded fraction: 0.736, relative distance: 91.089, critic loss: 0.895, actor loss: 26.463
t: 870000, Ep: 8699, action std: 0.50
mean steps: 13.703, mean reward: 7.044, reward rate: 0.514, rewarded fraction: 0.684, relative distance: 105.175, critic loss: 0.904, actor loss: 26.477
t: 880000, Ep: 8799, action std: 0.50
mean steps: 14.432, mean reward: 7.303, reward rate: 0.506, rewarded fraction: 0.710, relative distance: 95.798, critic loss: 0.895, actor loss: 26.528
t: 890000, Ep: 8899, action std: 0.50
mean steps:

t: 280000, Ep: 2799, action std: 0.80
mean steps: 16.872, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 418.786, critic loss: 0.001, actor loss: -0.023
t: 290000, Ep: 2899, action std: 0.80
mean steps: 16.220, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 412.450, critic loss: 0.000, actor loss: -0.022
t: 300000, Ep: 2999, action std: 0.80
mean steps: 16.162, mean reward: 0.011, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 417.691, critic loss: 0.001, actor loss: -0.021
t: 310000, Ep: 3099, action std: 0.80
mean steps: 16.520, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 413.131, critic loss: 0.001, actor loss: -0.020
t: 320000, Ep: 3199, action std: 0.80
mean steps: 15.499, mean reward: 0.036, reward rate: 0.002, rewarded fraction: 0.002, relative distance: 403.869, critic loss: 0.001, actor loss: -0.019
t: 330000, Ep: 3299, action std: 0.80
mean st

t: 710000, Ep: 7099, action std: 0.80
mean steps: 16.731, mean reward: 0.019, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 383.405, critic loss: 0.005, actor loss: -0.011
t: 720000, Ep: 7199, action std: 0.80
mean steps: 16.025, mean reward: 0.013, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 387.335, critic loss: 0.004, actor loss: -0.011
t: 730000, Ep: 7299, action std: 0.80
mean steps: 16.379, mean reward: 0.019, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 384.424, critic loss: 0.005, actor loss: -0.011
t: 740000, Ep: 7399, action std: 0.80
mean steps: 16.642, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 394.216, critic loss: 0.004, actor loss: -0.010
t: 750000, Ep: 7499, action std: 0.80
mean steps: 16.167, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 392.842, critic loss: 0.005, actor loss: -0.009
t: 760000, Ep: 7599, action std: 0.80
mean st

t: 150000, Ep: 1499, action std: 0.80
mean steps: 11.096, mean reward: 0.954, reward rate: 0.086, rewarded fraction: 0.076, relative distance: 195.604, critic loss: 0.159, actor loss: 1.306
t: 160000, Ep: 1599, action std: 0.80
mean steps: 8.458, mean reward: 0.886, reward rate: 0.105, rewarded fraction: 0.073, relative distance: 213.057, critic loss: 0.577, actor loss: 1.918
t: 170000, Ep: 1699, action std: 0.80
mean steps: 7.160, mean reward: 0.792, reward rate: 0.111, rewarded fraction: 0.064, relative distance: 227.627, critic loss: 1.091, actor loss: 2.831
t: 180000, Ep: 1799, action std: 0.80
mean steps: 8.017, mean reward: 0.924, reward rate: 0.115, rewarded fraction: 0.077, relative distance: 224.445, critic loss: 0.907, actor loss: 3.510
t: 190000, Ep: 1899, action std: 0.80
mean steps: 7.867, mean reward: 1.146, reward rate: 0.146, rewarded fraction: 0.100, relative distance: 222.050, critic loss: 0.897, actor loss: 4.084
t: 200000, Ep: 1999, action std: 0.80
mean steps: 7.77

t: 590000, Ep: 5899, action std: 0.50
mean steps: 14.882, mean reward: 7.332, reward rate: 0.493, rewarded fraction: 0.710, relative distance: 90.296, critic loss: 1.043, actor loss: 22.888
t: 600000, Ep: 5999, action std: 0.50
mean steps: 14.709, mean reward: 7.136, reward rate: 0.485, rewarded fraction: 0.689, relative distance: 90.384, critic loss: 1.037, actor loss: 22.982
t: 610000, Ep: 6099, action std: 0.50
mean steps: 14.923, mean reward: 7.145, reward rate: 0.479, rewarded fraction: 0.685, relative distance: 88.258, critic loss: 1.029, actor loss: 23.123
t: 620000, Ep: 6199, action std: 0.50
mean steps: 14.497, mean reward: 7.021, reward rate: 0.484, rewarded fraction: 0.671, relative distance: 91.590, critic loss: 1.011, actor loss: 23.236
t: 630000, Ep: 6299, action std: 0.50
mean steps: 14.736, mean reward: 7.238, reward rate: 0.491, rewarded fraction: 0.698, relative distance: 88.804, critic loss: 1.034, actor loss: 23.346
t: 640000, Ep: 6399, action std: 0.50
mean steps: 

t: 30000, Ep: 299, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 40000, Ep: 399, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 50000, Ep: 499, action std: 0.80
mean steps: 15.107, mean reward: 0.217, reward rate: 0.014, rewarded fraction: 0.015, relative distance: 266.286, critic loss: 0.014, actor loss: 0.064
t: 60000, Ep: 599, action std: 0.80
mean steps: 15.820, mean reward: 0.101, reward rate: 0.006, rewarded fraction: 0.004, relative distance: 276.305, critic loss: 0.030, actor loss: 0.057
t: 70000, Ep: 699, action std: 0.80
mean steps: 17.767, mean reward: 0.119, reward rate: 0.007, rewarded fraction: 0.006, relative distance: 272.936, critic loss: 0.034, actor loss: 0.039
t: 80000, Ep: 799, action std: 0.80
mean steps: 17.388, mean reward: 0.283, reward rat

t: 470000, Ep: 4699, action std: 0.50
mean steps: 12.072, mean reward: 5.996, reward rate: 0.497, rewarded fraction: 0.579, relative distance: 129.319, critic loss: 1.123, actor loss: 25.818
t: 480000, Ep: 4799, action std: 0.50
mean steps: 12.041, mean reward: 6.001, reward rate: 0.498, rewarded fraction: 0.577, relative distance: 128.770, critic loss: 1.146, actor loss: 25.950
t: 490000, Ep: 4899, action std: 0.50
mean steps: 12.249, mean reward: 5.994, reward rate: 0.489, rewarded fraction: 0.574, relative distance: 126.075, critic loss: 1.150, actor loss: 26.072
t: 500000, Ep: 4999, action std: 0.50
mean steps: 12.700, mean reward: 6.092, reward rate: 0.480, rewarded fraction: 0.584, relative distance: 121.819, critic loss: 1.155, actor loss: 26.185
t: 510000, Ep: 5099, action std: 0.50
mean steps: 12.306, mean reward: 6.205, reward rate: 0.504, rewarded fraction: 0.598, relative distance: 124.653, critic loss: 1.144, actor loss: 26.265
t: 520000, Ep: 5199, action std: 0.50
mean st

t: 900000, Ep: 8999, action std: 0.50
mean steps: 12.524, mean reward: 6.263, reward rate: 0.500, rewarded fraction: 0.609, relative distance: 124.427, critic loss: 1.144, actor loss: 26.136
t: 910000, Ep: 9099, action std: 0.50
mean steps: 12.256, mean reward: 6.235, reward rate: 0.509, rewarded fraction: 0.607, relative distance: 127.406, critic loss: 1.165, actor loss: 26.080
t: 920000, Ep: 9199, action std: 0.50
mean steps: 11.994, mean reward: 6.324, reward rate: 0.527, rewarded fraction: 0.614, relative distance: 125.175, critic loss: 1.162, actor loss: 26.100
t: 930000, Ep: 9299, action std: 0.50
mean steps: 12.174, mean reward: 6.107, reward rate: 0.502, rewarded fraction: 0.590, relative distance: 130.043, critic loss: 1.163, actor loss: 26.111
t: 940000, Ep: 9399, action std: 0.50
mean steps: 12.914, mean reward: 6.640, reward rate: 0.514, rewarded fraction: 0.643, relative distance: 113.528, critic loss: 1.169, actor loss: 26.099
t: 950000, Ep: 9499, action std: 0.50
mean st

t: 340000, Ep: 3399, action std: 0.50
mean steps: 11.150, mean reward: 5.552, reward rate: 0.498, rewarded fraction: 0.535, relative distance: 144.940, critic loss: 1.231, actor loss: 23.673
t: 350000, Ep: 3499, action std: 0.50
mean steps: 11.360, mean reward: 5.584, reward rate: 0.492, rewarded fraction: 0.540, relative distance: 144.526, critic loss: 1.227, actor loss: 23.973
t: 360000, Ep: 3599, action std: 0.50
mean steps: 11.319, mean reward: 5.528, reward rate: 0.488, rewarded fraction: 0.533, relative distance: 144.223, critic loss: 1.218, actor loss: 24.230
t: 370000, Ep: 3699, action std: 0.50
mean steps: 11.583, mean reward: 5.699, reward rate: 0.492, rewarded fraction: 0.551, relative distance: 138.927, critic loss: 1.221, actor loss: 24.496
t: 380000, Ep: 3799, action std: 0.50
mean steps: 11.318, mean reward: 5.509, reward rate: 0.487, rewarded fraction: 0.530, relative distance: 144.975, critic loss: 1.224, actor loss: 24.742
t: 390000, Ep: 3899, action std: 0.50
mean st

t: 770000, Ep: 7699, action std: 0.50
mean steps: 12.264, mean reward: 6.011, reward rate: 0.490, rewarded fraction: 0.578, relative distance: 128.068, critic loss: 1.102, actor loss: 24.975
t: 780000, Ep: 7799, action std: 0.50
mean steps: 12.740, mean reward: 6.215, reward rate: 0.488, rewarded fraction: 0.602, relative distance: 121.574, critic loss: 1.099, actor loss: 24.968
t: 790000, Ep: 7899, action std: 0.50
mean steps: 12.122, mean reward: 6.056, reward rate: 0.500, rewarded fraction: 0.587, relative distance: 131.169, critic loss: 1.118, actor loss: 24.951
t: 800000, Ep: 7999, action std: 0.50
mean steps: 12.048, mean reward: 5.788, reward rate: 0.480, rewarded fraction: 0.551, relative distance: 129.600, critic loss: 1.167, actor loss: 24.913
t: 810000, Ep: 8099, action std: 0.50
mean steps: 12.411, mean reward: 6.316, reward rate: 0.509, rewarded fraction: 0.611, relative distance: 118.790, critic loss: 1.155, actor loss: 24.911
t: 820000, Ep: 8199, action std: 0.50
mean st

t: 210000, Ep: 2099, action std: 0.50
mean steps: 13.937, mean reward: 6.233, reward rate: 0.447, rewarded fraction: 0.595, relative distance: 109.729, critic loss: 1.208, actor loss: 18.766
t: 220000, Ep: 2199, action std: 0.50
mean steps: 13.848, mean reward: 6.032, reward rate: 0.436, rewarded fraction: 0.577, relative distance: 115.959, critic loss: 1.188, actor loss: 19.677
t: 230000, Ep: 2299, action std: 0.50
mean steps: 14.748, mean reward: 7.191, reward rate: 0.488, rewarded fraction: 0.696, relative distance: 90.521, critic loss: 1.146, actor loss: 20.579
t: 240000, Ep: 2399, action std: 0.50
mean steps: 14.433, mean reward: 7.047, reward rate: 0.488, rewarded fraction: 0.682, relative distance: 93.944, critic loss: 1.093, actor loss: 21.411
t: 250000, Ep: 2499, action std: 0.50
mean steps: 15.665, mean reward: 7.661, reward rate: 0.489, rewarded fraction: 0.744, relative distance: 78.354, critic loss: 1.037, actor loss: 22.160
t: 260000, Ep: 2599, action std: 0.50
mean steps

t: 650000, Ep: 6499, action std: 0.50
mean steps: 17.333, mean reward: 8.643, reward rate: 0.499, rewarded fraction: 0.834, relative distance: 49.376, critic loss: 0.990, actor loss: 27.269
t: 660000, Ep: 6599, action std: 0.50
mean steps: 17.464, mean reward: 8.472, reward rate: 0.485, rewarded fraction: 0.816, relative distance: 48.669, critic loss: 0.974, actor loss: 27.272
t: 670000, Ep: 6699, action std: 0.50
mean steps: 17.006, mean reward: 8.371, reward rate: 0.492, rewarded fraction: 0.803, relative distance: 48.725, critic loss: 1.015, actor loss: 27.247
t: 680000, Ep: 6799, action std: 0.50
mean steps: 16.854, mean reward: 8.523, reward rate: 0.506, rewarded fraction: 0.820, relative distance: 47.734, critic loss: 0.986, actor loss: 27.251
t: 690000, Ep: 6899, action std: 0.50
mean steps: 16.944, mean reward: 8.524, reward rate: 0.503, rewarded fraction: 0.825, relative distance: 52.683, critic loss: 1.004, actor loss: 27.248
t: 700000, Ep: 6999, action std: 0.50
mean steps: 

t: 90000, Ep: 899, action std: 0.80
mean steps: 16.587, mean reward: 0.068, reward rate: 0.004, rewarded fraction: 0.004, relative distance: 364.542, critic loss: 0.009, actor loss: -0.078
t: 100000, Ep: 999, action std: 0.80
mean steps: 16.389, mean reward: 0.078, reward rate: 0.005, rewarded fraction: 0.006, relative distance: 360.956, critic loss: 0.014, actor loss: -0.077
t: 110000, Ep: 1099, action std: 0.80
mean steps: 16.214, mean reward: 0.024, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 357.538, critic loss: 0.015, actor loss: -0.073
t: 120000, Ep: 1199, action std: 0.80
mean steps: 16.618, mean reward: 0.033, reward rate: 0.002, rewarded fraction: 0.002, relative distance: 362.675, critic loss: 0.016, actor loss: -0.066
t: 130000, Ep: 1299, action std: 0.80
mean steps: 16.362, mean reward: 0.036, reward rate: 0.002, rewarded fraction: 0.002, relative distance: 364.395, critic loss: 0.016, actor loss: -0.062
t: 140000, Ep: 1399, action std: 0.80
mean steps

t: 520000, Ep: 5199, action std: 0.80
mean steps: 7.951, mean reward: 0.631, reward rate: 0.079, rewarded fraction: 0.051, relative distance: 231.073, critic loss: 0.264, actor loss: 1.459
t: 530000, Ep: 5299, action std: 0.80
mean steps: 10.664, mean reward: 1.171, reward rate: 0.110, rewarded fraction: 0.097, relative distance: 200.575, critic loss: 0.355, actor loss: 2.101
t: 540000, Ep: 5399, action std: 0.80
mean steps: 9.563, mean reward: 1.313, reward rate: 0.137, rewarded fraction: 0.111, relative distance: 205.824, critic loss: 0.385, actor loss: 2.852
t: 550000, Ep: 5499, action std: 0.80
mean steps: 9.367, mean reward: 1.229, reward rate: 0.131, rewarded fraction: 0.102, relative distance: 206.399, critic loss: 0.522, actor loss: 4.206
t: 560000, Ep: 5599, action std: 0.80
mean steps: 8.746, mean reward: 1.174, reward rate: 0.134, rewarded fraction: 0.099, relative distance: 213.441, critic loss: 0.582, actor loss: 5.591
t: 570000, Ep: 5699, action std: 0.80
mean steps: 9.06

t: 950000, Ep: 9499, action std: 0.50
mean steps: 12.757, mean reward: 6.550, reward rate: 0.513, rewarded fraction: 0.630, relative distance: 113.194, critic loss: 0.888, actor loss: 25.888
t: 960000, Ep: 9599, action std: 0.50
mean steps: 12.619, mean reward: 6.160, reward rate: 0.488, rewarded fraction: 0.592, relative distance: 125.143, critic loss: 0.878, actor loss: 25.962
t: 970000, Ep: 9699, action std: 0.50
mean steps: 12.314, mean reward: 6.306, reward rate: 0.512, rewarded fraction: 0.607, relative distance: 120.563, critic loss: 0.882, actor loss: 25.993
t: 980000, Ep: 9799, action std: 0.50
mean steps: 12.999, mean reward: 6.819, reward rate: 0.525, rewarded fraction: 0.661, relative distance: 108.647, critic loss: 0.890, actor loss: 26.043
t: 990000, Ep: 9899, action std: 0.50
mean steps: 13.065, mean reward: 7.007, reward rate: 0.536, rewarded fraction: 0.684, relative distance: 108.487, critic loss: 0.894, actor loss: 26.131
t: 1000000, Ep: 9999, action std: 0.50
mean s

t: 390000, Ep: 3899, action std: 0.80
mean steps: 15.639, mean reward: 0.300, reward rate: 0.019, rewarded fraction: 0.021, relative distance: 253.920, critic loss: 0.016, actor loss: -0.039
t: 400000, Ep: 3999, action std: 0.80
mean steps: 17.103, mean reward: 0.444, reward rate: 0.026, rewarded fraction: 0.034, relative distance: 252.648, critic loss: 0.016, actor loss: -0.040
t: 410000, Ep: 4099, action std: 0.80
mean steps: 16.440, mean reward: 0.490, reward rate: 0.030, rewarded fraction: 0.040, relative distance: 254.675, critic loss: 0.016, actor loss: -0.041
t: 420000, Ep: 4199, action std: 0.80
mean steps: 17.444, mean reward: 0.332, reward rate: 0.019, rewarded fraction: 0.024, relative distance: 256.768, critic loss: 0.015, actor loss: -0.043
t: 430000, Ep: 4299, action std: 0.80
mean steps: 16.463, mean reward: 0.279, reward rate: 0.017, rewarded fraction: 0.018, relative distance: 260.825, critic loss: 0.014, actor loss: -0.046
t: 440000, Ep: 4399, action std: 0.80
mean st

t: 820000, Ep: 8199, action std: 0.80
mean steps: 17.280, mean reward: 0.465, reward rate: 0.027, rewarded fraction: 0.036, relative distance: 249.075, critic loss: 0.010, actor loss: -0.031
t: 830000, Ep: 8299, action std: 0.80
mean steps: 17.002, mean reward: 0.313, reward rate: 0.018, rewarded fraction: 0.023, relative distance: 262.044, critic loss: 0.010, actor loss: -0.031
t: 840000, Ep: 8399, action std: 0.80
mean steps: 17.976, mean reward: 0.274, reward rate: 0.015, rewarded fraction: 0.019, relative distance: 260.547, critic loss: 0.010, actor loss: -0.030
t: 850000, Ep: 8499, action std: 0.80
mean steps: 15.741, mean reward: 0.362, reward rate: 0.023, rewarded fraction: 0.026, relative distance: 258.518, critic loss: 0.010, actor loss: -0.029
t: 860000, Ep: 8599, action std: 0.80
mean steps: 16.476, mean reward: 0.436, reward rate: 0.026, rewarded fraction: 0.034, relative distance: 258.792, critic loss: 0.011, actor loss: -0.028
t: 870000, Ep: 8699, action std: 0.80
mean st

t: 260000, Ep: 2599, action std: 0.80
mean steps: 17.038, mean reward: 0.376, reward rate: 0.022, rewarded fraction: 0.023, relative distance: 252.676, critic loss: 0.019, actor loss: -0.015
t: 270000, Ep: 2699, action std: 0.80
mean steps: 17.430, mean reward: 0.318, reward rate: 0.018, rewarded fraction: 0.021, relative distance: 256.097, critic loss: 0.019, actor loss: -0.031
t: 280000, Ep: 2799, action std: 0.80
mean steps: 17.462, mean reward: 0.377, reward rate: 0.022, rewarded fraction: 0.026, relative distance: 260.119, critic loss: 0.018, actor loss: -0.046
t: 290000, Ep: 2899, action std: 0.80
mean steps: 16.888, mean reward: 0.268, reward rate: 0.016, rewarded fraction: 0.017, relative distance: 261.576, critic loss: 0.017, actor loss: -0.056
t: 300000, Ep: 2999, action std: 0.80
mean steps: 16.446, mean reward: 0.429, reward rate: 0.026, rewarded fraction: 0.032, relative distance: 254.573, critic loss: 0.017, actor loss: -0.063
t: 310000, Ep: 3099, action std: 0.80
mean st

t: 700000, Ep: 6999, action std: 0.50
mean steps: 10.888, mean reward: 5.059, reward rate: 0.465, rewarded fraction: 0.482, relative distance: 148.980, critic loss: 0.882, actor loss: 19.575
t: 710000, Ep: 7099, action std: 0.50
mean steps: 10.619, mean reward: 5.104, reward rate: 0.481, rewarded fraction: 0.487, relative distance: 152.544, critic loss: 0.900, actor loss: 20.090
t: 720000, Ep: 7199, action std: 0.50
mean steps: 10.866, mean reward: 5.220, reward rate: 0.480, rewarded fraction: 0.501, relative distance: 149.714, critic loss: 0.884, actor loss: 20.511
t: 730000, Ep: 7299, action std: 0.50
mean steps: 10.438, mean reward: 5.070, reward rate: 0.486, rewarded fraction: 0.490, relative distance: 157.380, critic loss: 0.895, actor loss: 20.945
t: 740000, Ep: 7399, action std: 0.50
mean steps: 10.820, mean reward: 5.178, reward rate: 0.479, rewarded fraction: 0.499, relative distance: 149.694, critic loss: 0.910, actor loss: 21.285
t: 750000, Ep: 7499, action std: 0.50
mean st

t: 140000, Ep: 1399, action std: 0.80
mean steps: 16.975, mean reward: 0.501, reward rate: 0.030, rewarded fraction: 0.038, relative distance: 245.726, critic loss: 0.046, actor loss: 0.683
t: 150000, Ep: 1499, action std: 0.80
mean steps: 17.101, mean reward: 0.456, reward rate: 0.027, rewarded fraction: 0.036, relative distance: 264.548, critic loss: 0.040, actor loss: 0.573
t: 160000, Ep: 1599, action std: 0.80
mean steps: 16.476, mean reward: 0.357, reward rate: 0.022, rewarded fraction: 0.026, relative distance: 259.261, critic loss: 0.032, actor loss: 0.476
t: 170000, Ep: 1699, action std: 0.80
mean steps: 17.175, mean reward: 0.399, reward rate: 0.023, rewarded fraction: 0.028, relative distance: 255.226, critic loss: 0.030, actor loss: 0.392
t: 180000, Ep: 1799, action std: 0.80
mean steps: 17.448, mean reward: 0.359, reward rate: 0.021, rewarded fraction: 0.025, relative distance: 254.167, critic loss: 0.027, actor loss: 0.320
t: 190000, Ep: 1899, action std: 0.80
mean steps: 

t: 570000, Ep: 5699, action std: 0.80
mean steps: 16.574, mean reward: 0.395, reward rate: 0.024, rewarded fraction: 0.030, relative distance: 255.189, critic loss: 0.010, actor loss: -0.080
t: 580000, Ep: 5799, action std: 0.80
mean steps: 16.341, mean reward: 0.374, reward rate: 0.023, rewarded fraction: 0.027, relative distance: 260.769, critic loss: 0.010, actor loss: -0.074
t: 590000, Ep: 5899, action std: 0.80
mean steps: 15.950, mean reward: 0.356, reward rate: 0.022, rewarded fraction: 0.026, relative distance: 254.091, critic loss: 0.009, actor loss: -0.070
t: 600000, Ep: 5999, action std: 0.80
mean steps: 16.915, mean reward: 0.425, reward rate: 0.025, rewarded fraction: 0.031, relative distance: 247.954, critic loss: 0.010, actor loss: -0.066
t: 610000, Ep: 6099, action std: 0.80
mean steps: 16.224, mean reward: 0.419, reward rate: 0.026, rewarded fraction: 0.030, relative distance: 247.328, critic loss: 0.010, actor loss: -0.064
t: 620000, Ep: 6199, action std: 0.80
mean st

t: 1000000, Ep: 9999, action std: 0.80
mean steps: 16.552, mean reward: 0.351, reward rate: 0.021, rewarded fraction: 0.022, relative distance: 258.627, critic loss: 0.009, actor loss: -0.040
t: 10000, Ep: 99, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 20000, Ep: 199, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 30000, Ep: 299, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 40000, Ep: 399, action std: 0.80
mean steps: nan, mean reward: nan, reward rate: nan, rewarded fraction: nan, relative distance: nan, critic loss: 0.000, actor loss: 0.000
t: 50000, Ep: 499, action std: 0.80
mean steps: 12.330, mean reward: 0.039, reward rate: 0.003, rewarded frac

t: 440000, Ep: 4399, action std: 0.50
mean steps: 9.582, mean reward: 3.553, reward rate: 0.371, rewarded fraction: 0.329, relative distance: 173.533, critic loss: 0.970, actor loss: 10.383
t: 450000, Ep: 4499, action std: 0.50
mean steps: 9.898, mean reward: 4.227, reward rate: 0.427, rewarded fraction: 0.402, relative distance: 164.507, critic loss: 0.990, actor loss: 11.346
t: 460000, Ep: 4599, action std: 0.50
mean steps: 10.293, mean reward: 4.448, reward rate: 0.432, rewarded fraction: 0.420, relative distance: 158.124, critic loss: 0.989, actor loss: 12.315
t: 470000, Ep: 4699, action std: 0.50
mean steps: 9.955, mean reward: 4.243, reward rate: 0.426, rewarded fraction: 0.397, relative distance: 165.353, critic loss: 0.972, actor loss: 13.265
t: 480000, Ep: 4799, action std: 0.50
mean steps: 10.530, mean reward: 5.132, reward rate: 0.487, rewarded fraction: 0.492, relative distance: 149.354, critic loss: 0.938, actor loss: 14.174
t: 490000, Ep: 4899, action std: 0.50
mean steps

t: 870000, Ep: 8699, action std: 0.50
mean steps: 13.287, mean reward: 6.503, reward rate: 0.489, rewarded fraction: 0.627, relative distance: 115.169, critic loss: 1.031, actor loss: 24.453
t: 880000, Ep: 8799, action std: 0.50
mean steps: 12.481, mean reward: 6.128, reward rate: 0.491, rewarded fraction: 0.590, relative distance: 125.673, critic loss: 1.033, actor loss: 24.509
t: 890000, Ep: 8899, action std: 0.50
mean steps: 12.419, mean reward: 6.211, reward rate: 0.500, rewarded fraction: 0.601, relative distance: 123.789, critic loss: 1.043, actor loss: 24.541
t: 900000, Ep: 8999, action std: 0.50
mean steps: 12.787, mean reward: 6.555, reward rate: 0.513, rewarded fraction: 0.626, relative distance: 110.818, critic loss: 1.062, actor loss: 24.593
t: 910000, Ep: 9099, action std: 0.50
mean steps: 13.194, mean reward: 6.457, reward rate: 0.489, rewarded fraction: 0.623, relative distance: 114.595, critic loss: 1.018, actor loss: 24.644
t: 920000, Ep: 9199, action std: 0.50
mean st

t: 310000, Ep: 3099, action std: 0.80
mean steps: 16.331, mean reward: 0.011, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 428.202, critic loss: 0.001, actor loss: -0.011
t: 320000, Ep: 3199, action std: 0.80
mean steps: 16.335, mean reward: 0.014, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 418.879, critic loss: 0.001, actor loss: -0.011
t: 330000, Ep: 3299, action std: 0.80
mean steps: 15.637, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 406.939, critic loss: 0.001, actor loss: -0.011
t: 340000, Ep: 3399, action std: 0.80
mean steps: 16.503, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 422.895, critic loss: 0.002, actor loss: -0.011
t: 350000, Ep: 3499, action std: 0.80
mean steps: 15.185, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 423.476, critic loss: 0.001, actor loss: -0.011
t: 360000, Ep: 3599, action std: 0.80
mean st

t: 740000, Ep: 7399, action std: 0.80
mean steps: 16.598, mean reward: 0.011, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 459.148, critic loss: 0.001, actor loss: -0.010
t: 750000, Ep: 7499, action std: 0.80
mean steps: 15.143, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 436.518, critic loss: 0.001, actor loss: -0.010
t: 760000, Ep: 7599, action std: 0.80
mean steps: 16.094, mean reward: 0.010, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 452.768, critic loss: 0.001, actor loss: -0.010
t: 770000, Ep: 7699, action std: 0.80
mean steps: 16.535, mean reward: 0.011, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 454.890, critic loss: 0.001, actor loss: -0.010
t: 780000, Ep: 7799, action std: 0.80
mean steps: 15.373, mean reward: 0.012, reward rate: 0.001, rewarded fraction: 0.000, relative distance: 445.270, critic loss: 0.001, actor loss: -0.010
t: 790000, Ep: 7899, action std: 0.80
mean st

t: 180000, Ep: 1799, action std: 0.80
mean steps: 16.057, mean reward: 0.533, reward rate: 0.033, rewarded fraction: 0.041, relative distance: 250.370, critic loss: 0.056, actor loss: -0.036
t: 190000, Ep: 1899, action std: 0.80
mean steps: 16.300, mean reward: 0.521, reward rate: 0.032, rewarded fraction: 0.038, relative distance: 242.686, critic loss: 0.055, actor loss: -0.054
t: 200000, Ep: 1999, action std: 0.80
mean steps: 16.124, mean reward: 0.401, reward rate: 0.025, rewarded fraction: 0.027, relative distance: 248.370, critic loss: 0.052, actor loss: -0.067
t: 210000, Ep: 2099, action std: 0.80
mean steps: 15.083, mean reward: 0.383, reward rate: 0.025, rewarded fraction: 0.024, relative distance: 227.704, critic loss: 0.052, actor loss: -0.074
t: 220000, Ep: 2199, action std: 0.80
mean steps: 16.128, mean reward: 0.180, reward rate: 0.011, rewarded fraction: 0.010, relative distance: 255.460, critic loss: 0.048, actor loss: -0.105
t: 230000, Ep: 2299, action std: 0.80
mean st

t: 620000, Ep: 6199, action std: 0.50
mean steps: 13.102, mean reward: 6.395, reward rate: 0.488, rewarded fraction: 0.612, relative distance: 115.283, critic loss: 0.982, actor loss: 24.579
t: 630000, Ep: 6299, action std: 0.50
mean steps: 12.942, mean reward: 6.243, reward rate: 0.482, rewarded fraction: 0.600, relative distance: 118.955, critic loss: 1.004, actor loss: 24.652
t: 640000, Ep: 6399, action std: 0.50
mean steps: 12.779, mean reward: 6.630, reward rate: 0.519, rewarded fraction: 0.639, relative distance: 112.054, critic loss: 1.004, actor loss: 24.696
t: 650000, Ep: 6499, action std: 0.50
mean steps: 12.482, mean reward: 6.245, reward rate: 0.500, rewarded fraction: 0.604, relative distance: 124.095, critic loss: 0.990, actor loss: 24.718
t: 660000, Ep: 6599, action std: 0.50
mean steps: 13.246, mean reward: 6.563, reward rate: 0.496, rewarded fraction: 0.634, relative distance: 112.139, critic loss: 0.985, actor loss: 24.748
t: 670000, Ep: 6699, action std: 0.50
mean st

t: 60000, Ep: 599, action std: 0.80
mean steps: 12.900, mean reward: 0.353, reward rate: 0.027, rewarded fraction: 0.025, relative distance: 247.205, critic loss: 0.086, actor loss: 0.595
t: 70000, Ep: 699, action std: 0.80
mean steps: 16.998, mean reward: 0.339, reward rate: 0.020, rewarded fraction: 0.020, relative distance: 256.300, critic loss: 0.093, actor loss: 0.823
t: 80000, Ep: 799, action std: 0.80
mean steps: 17.065, mean reward: 0.300, reward rate: 0.018, rewarded fraction: 0.020, relative distance: 264.610, critic loss: 0.091, actor loss: 0.560
t: 90000, Ep: 899, action std: 0.80
mean steps: 17.658, mean reward: 0.300, reward rate: 0.017, rewarded fraction: 0.019, relative distance: 252.168, critic loss: 0.076, actor loss: 0.369
t: 100000, Ep: 999, action std: 0.80
mean steps: 17.199, mean reward: 0.425, reward rate: 0.025, rewarded fraction: 0.030, relative distance: 255.590, critic loss: 0.065, actor loss: 0.252
t: 110000, Ep: 1099, action std: 0.80
mean steps: 16.403, m

t: 500000, Ep: 4999, action std: 0.50
mean steps: 12.757, mean reward: 6.124, reward rate: 0.480, rewarded fraction: 0.589, relative distance: 123.717, critic loss: 1.123, actor loss: 23.272
t: 510000, Ep: 5099, action std: 0.50
mean steps: 12.723, mean reward: 6.482, reward rate: 0.509, rewarded fraction: 0.630, relative distance: 119.648, critic loss: 1.143, actor loss: 23.348
t: 520000, Ep: 5199, action std: 0.50
mean steps: 12.897, mean reward: 6.609, reward rate: 0.512, rewarded fraction: 0.643, relative distance: 117.146, critic loss: 1.121, actor loss: 23.439
t: 530000, Ep: 5299, action std: 0.50
mean steps: 12.939, mean reward: 6.414, reward rate: 0.496, rewarded fraction: 0.615, relative distance: 115.582, critic loss: 1.130, actor loss: 23.491
t: 540000, Ep: 5399, action std: 0.50
mean steps: 13.106, mean reward: 6.462, reward rate: 0.493, rewarded fraction: 0.624, relative distance: 116.846, critic loss: 1.138, actor loss: 23.589
t: 550000, Ep: 5499, action std: 0.50
mean st

t: 930000, Ep: 9299, action std: 0.50
mean steps: 12.658, mean reward: 6.291, reward rate: 0.497, rewarded fraction: 0.611, relative distance: 124.347, critic loss: 1.082, actor loss: 24.668
t: 940000, Ep: 9399, action std: 0.50
mean steps: 12.540, mean reward: 6.333, reward rate: 0.505, rewarded fraction: 0.616, relative distance: 122.447, critic loss: 1.094, actor loss: 24.688
t: 950000, Ep: 9499, action std: 0.50
mean steps: 12.364, mean reward: 6.257, reward rate: 0.506, rewarded fraction: 0.609, relative distance: 125.097, critic loss: 1.116, actor loss: 24.666
t: 960000, Ep: 9599, action std: 0.50
mean steps: 12.470, mean reward: 6.326, reward rate: 0.507, rewarded fraction: 0.612, relative distance: 121.248, critic loss: 1.082, actor loss: 24.675
t: 970000, Ep: 9699, action std: 0.50
mean steps: 12.605, mean reward: 6.539, reward rate: 0.519, rewarded fraction: 0.636, relative distance: 118.994, critic loss: 1.068, actor loss: 24.703
t: 980000, Ep: 9799, action std: 0.50
mean st