# Open Sim RL Training
#### Imports

In [1]:
# Dependencies
import numpy as np
import tensorflow as tf
# Environment
from osim.env import L2RunEnv as ENV # rename environment to be used for training

  from ._conv import register_converters as _register_converters


### Agent Class
#### Imports

In [2]:
import keras
import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam

from rl.processors import WhiteningNormalizerProcessor
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

Using TensorFlow backend.


#### Class
Reference: https://github.com/keras-rl/keras-rl/blob/master/examples/ddpg_mujoco.py

In [3]:
class Agent:
    def __init__(self,env):
        nb_actions = env.action_space.shape[0]
        
        self.env = env
        self.actor = self.build_actor(env)
        self.critic, action_input = self.build_critic(env)
        self.loss = self.build_loss()

        self.memory = SequentialMemory(limit=100000, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)
        self.agent = DDPGAgent(   nb_actions=nb_actions, actor=self.actor, 
                                  critic=self.critic, critic_action_input=action_input,
                                  memory=self.memory, nb_steps_warmup_critic=1000, 
                                  nb_steps_warmup_actor=1000,
                                  random_process=self.random_process, 
                                  gamma=.99, target_model_update=1e-3,
                                  processor=WhiteningNormalizerProcessor()  )
        self.agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=self.loss)

    def build_loss(self):
        return ['mse']

    def build_actor(self,env):
        nb_actions = env.action_space.shape[0]
        actor = Sequential()
        actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
        actor.add(Dense(400))
        actor.add(Activation('relu'))
        actor.add(Dense(300))
        actor.add(Activation('relu'))
        actor.add(Dense(nb_actions,
                        activation='tanh',
                        kernel_constraint=  keras.constraints.min_max_norm(
                                            min_value=0,
                                            max_value=nb_actions,
                                            axis=1) ) )
        actor.summary()

        inD = Input(shape=(1,) + env.observation_space.shape)
        out = actor(inD)

        return Model(inD,out)

    def build_critic(self,env):
        nb_actions = env.action_space.shape[0]
        action_input = Input(shape=(nb_actions,), name='action_input')
        observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(400)(flattened_observation)
        x = Activation('relu')(x)
        x = Concatenate()([x, action_input])
        x = Dense(300)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)

        critic = Model(inputs=[action_input, observation_input], outputs=x)
        critic.summary()

        return critic, action_input
    
    def fit(self, **kwargs):
        return self.agent.fit(self.env,**kwargs)
    
    def test(self, **kwargs):
        return self.agent.test(self.env,**kwargs)
    
    def save_weights(self,filename='ddpg_{}_weights.h5f'):
        self.agent.save_weights(filename.format("opensim"), overwrite=True)
        
    def load_weights(self,filename='ddpg_{}_weights.h5f'):
        self.agent.load_weights(filename.format("opensim"))

### Environment Class

In [4]:
class TrainEnv(ENV):
    pass
# TODO: define virtual assistant forces on agent
# TODO: define search through easier environments
# TODO: make environment harder once the agent has trained for challenge

# Run Simulation
#### Environment

In [5]:
env = TrainEnv(visualize=False)
observation = env.reset( )

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


#### Agent

In [6]:
agent = Agent(env)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 41)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 400)               16800     
_________________________________________________________________
activation_1 (Activation)    (None, 400)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 300)               120300    
_________________________________________________________________
activation_2 (Activation)    (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 18)                5418      
Total params: 142,518
Trainable params: 142,518
Non-trainable params: 0
_________________________________________________________________
____

### Load previously trained weights

In [7]:
agent.load_weights( )

### Train new weights

In [None]:
for i in range(100): # Train in smaller batches to allow for interuption
    print("\n\niteration:",i)
    agent.fit(nb_steps=2000, visualize=False, verbose=2)
    ## Always save new weights
    agent.save_weights( )



iteration: 0
Training for 2000 steps ...
  195/2000: episode: 1, duration: 5.798s, episode steps: 195, steps per second: 34, episode reward: -0.801, mean reward: -0.004 [-0.020, 0.004], mean action: 0.004 [-1.023, 0.857], mean observation: 0.073 [-4.036, 5.242], loss: --, mean_squared_error: --, mean_q: --
  381/2000: episode: 2, duration: 4.646s, episode steps: 186, steps per second: 40, episode reward: -0.789, mean reward: -0.004 [-0.020, 0.005], mean action: 0.046 [-0.945, 1.067], mean observation: 0.065 [-5.320, 6.739], loss: --, mean_squared_error: --, mean_q: --
  563/2000: episode: 3, duration: 4.741s, episode steps: 182, steps per second: 38, episode reward: 0.560, mean reward: 0.003 [-0.003, 0.011], mean action: 0.050 [-0.843, 0.993], mean observation: 0.135 [-5.071, 6.784], loss: --, mean_squared_error: --, mean_q: --
  732/2000: episode: 4, duration: 4.466s, episode steps: 169, steps per second: 38, episode reward: 0.536, mean reward: 0.003 [-0.002, 0.010], mean action: 0.

  660/2000: episode: 16, duration: 4.745s, episode steps: 43, steps per second: 9, episode reward: 0.174, mean reward: 0.004 [0.001, 0.007], mean action: 0.039 [-1.119, 1.145], mean observation: -0.074 [-19.306, 10.080], loss: --, mean_squared_error: --, mean_q: --
  702/2000: episode: 17, duration: 4.391s, episode steps: 42, steps per second: 10, episode reward: 0.167, mean reward: 0.004 [0.001, 0.006], mean action: 0.031 [-1.068, 1.106], mean observation: -0.080 [-19.472, 10.284], loss: --, mean_squared_error: --, mean_q: --
  744/2000: episode: 18, duration: 4.439s, episode steps: 42, steps per second: 9, episode reward: 0.172, mean reward: 0.004 [0.001, 0.006], mean action: 0.031 [-1.102, 1.088], mean observation: -0.079 [-19.139, 10.211], loss: --, mean_squared_error: --, mean_q: --
  784/2000: episode: 19, duration: 4.227s, episode steps: 40, steps per second: 9, episode reward: 0.161, mean reward: 0.004 [0.001, 0.007], mean action: 0.046 [-1.065, 1.067], mean observation: -0.090

 1433/2000: episode: 12, duration: 6.753s, episode steps: 124, steps per second: 18, episode reward: -0.926, mean reward: -0.007 [-0.022, 0.005], mean action: 0.119 [-1.120, 1.210], mean observation: -0.021 [-15.630, 15.024], loss: 0.000746, mean_squared_error: 0.001493, mean_q: 0.407273
 1560/2000: episode: 13, duration: 9.036s, episode steps: 127, steps per second: 14, episode reward: -0.779, mean reward: -0.006 [-0.019, 0.008], mean action: 0.166 [-1.154, 1.179], mean observation: -0.019 [-12.694, 11.112], loss: 0.000885, mean_squared_error: 0.001769, mean_q: 0.411115
 1684/2000: episode: 14, duration: 6.166s, episode steps: 124, steps per second: 20, episode reward: -0.799, mean reward: -0.006 [-0.020, 0.008], mean action: 0.133 [-1.128, 1.254], mean observation: 0.010 [-29.551, 11.779], loss: 0.001544, mean_squared_error: 0.003088, mean_q: 0.418433
 1819/2000: episode: 15, duration: 7.333s, episode steps: 135, steps per second: 18, episode reward: -0.801, mean reward: -0.006 [-0.0

  822/2000: episode: 11, duration: 8.563s, episode steps: 75, steps per second: 9, episode reward: 0.110, mean reward: 0.001 [-0.004, 0.009], mean action: 0.125 [-1.098, 1.155], mean observation: -0.003 [-19.890, 17.429], loss: --, mean_squared_error: --, mean_q: --
  890/2000: episode: 12, duration: 12.107s, episode steps: 68, steps per second: 6, episode reward: 0.144, mean reward: 0.002 [-0.003, 0.009], mean action: 0.064 [-1.214, 1.119], mean observation: 0.001 [-20.189, 17.549], loss: --, mean_squared_error: --, mean_q: --
  958/2000: episode: 13, duration: 13.114s, episode steps: 68, steps per second: 5, episode reward: 0.143, mean reward: 0.002 [-0.003, 0.009], mean action: 0.106 [-1.144, 1.235], mean observation: 0.004 [-19.343, 17.863], loss: --, mean_squared_error: --, mean_q: --
 1041/2000: episode: 14, duration: 13.366s, episode steps: 83, steps per second: 6, episode reward: 0.062, mean reward: 0.001 [-0.004, 0.009], mean action: 0.111 [-1.140, 1.135], mean observation: -0

 1949/2000: episode: 17, duration: 7.937s, episode steps: 134, steps per second: 17, episode reward: -0.742, mean reward: -0.006 [-0.020, 0.007], mean action: -0.040 [-1.117, 1.098], mean observation: 0.020 [-32.519, 15.308], loss: 0.000152, mean_squared_error: 0.000303, mean_q: 0.467680
done, took 237.125 seconds


iteration: 6
Training for 2000 steps ...
   94/2000: episode: 1, duration: 7.968s, episode steps: 94, steps per second: 12, episode reward: 0.272, mean reward: 0.003 [-0.001, 0.009], mean action: 0.073 [-1.144, 1.242], mean observation: 0.018 [-13.086, 17.499], loss: --, mean_squared_error: --, mean_q: --
  185/2000: episode: 2, duration: 6.643s, episode steps: 91, steps per second: 14, episode reward: 0.283, mean reward: 0.003 [-0.001, 0.009], mean action: 0.088 [-1.077, 1.181], mean observation: 0.019 [-15.033, 17.262], loss: --, mean_squared_error: --, mean_q: --
  284/2000: episode: 3, duration: 7.350s, episode steps: 99, steps per second: 13, episode reward: 0.224, mea

 1000/2000: episode: 11, duration: 3.260s, episode steps: 91, steps per second: 28, episode reward: 0.473, mean reward: 0.005 [-0.001, 0.013], mean action: 0.282 [-1.095, 1.117], mean observation: 0.128 [-9.738, 18.260], loss: --, mean_squared_error: --, mean_q: --
 1091/2000: episode: 12, duration: 4.077s, episode steps: 91, steps per second: 22, episode reward: 0.456, mean reward: 0.005 [-0.001, 0.013], mean action: 0.309 [-1.164, 1.211], mean observation: 0.128 [-9.660, 18.004], loss: 0.001412, mean_squared_error: 0.002824, mean_q: 0.573506
 1171/2000: episode: 13, duration: 3.679s, episode steps: 80, steps per second: 22, episode reward: 0.421, mean reward: 0.005 [-0.002, 0.014], mean action: 0.272 [-1.168, 1.127], mean observation: 0.126 [-9.713, 18.031], loss: 0.000778, mean_squared_error: 0.001556, mean_q: 0.585379
 1261/2000: episode: 14, duration: 4.328s, episode steps: 90, steps per second: 21, episode reward: 0.448, mean reward: 0.005 [-0.001, 0.013], mean action: 0.258 [-1.

done, took 98.276 seconds


iteration: 9
Training for 2000 steps ...
   77/2000: episode: 1, duration: 2.844s, episode steps: 77, steps per second: 27, episode reward: 0.420, mean reward: 0.005 [-0.002, 0.013], mean action: 0.242 [-1.134, 1.151], mean observation: 0.124 [-9.458, 20.074], loss: --, mean_squared_error: --, mean_q: --
  156/2000: episode: 2, duration: 2.909s, episode steps: 79, steps per second: 27, episode reward: 0.436, mean reward: 0.006 [-0.002, 0.013], mean action: 0.220 [-1.142, 1.122], mean observation: 0.123 [-9.366, 20.045], loss: --, mean_squared_error: --, mean_q: --
  233/2000: episode: 3, duration: 2.769s, episode steps: 77, steps per second: 28, episode reward: 0.415, mean reward: 0.005 [-0.002, 0.013], mean action: 0.233 [-1.162, 1.160], mean observation: 0.124 [-9.418, 19.983], loss: --, mean_squared_error: --, mean_q: --
  312/2000: episode: 4, duration: 2.872s, episode steps: 79, steps per second: 28, episode reward: 0.433, mean reward: 0.005 [-0.002, 0.

  477/2000: episode: 6, duration: 2.714s, episode steps: 80, steps per second: 29, episode reward: 0.447, mean reward: 0.006 [-0.000, 0.013], mean action: 0.398 [-1.187, 1.179], mean observation: 0.123 [-9.195, 19.156], loss: --, mean_squared_error: --, mean_q: --
  555/2000: episode: 7, duration: 2.547s, episode steps: 78, steps per second: 31, episode reward: 0.432, mean reward: 0.006 [-0.000, 0.013], mean action: 0.400 [-1.120, 1.186], mean observation: 0.124 [-9.225, 19.311], loss: --, mean_squared_error: --, mean_q: --
  635/2000: episode: 8, duration: 2.701s, episode steps: 80, steps per second: 30, episode reward: 0.444, mean reward: 0.006 [-0.000, 0.013], mean action: 0.412 [-1.242, 1.224], mean observation: 0.124 [-9.309, 19.426], loss: --, mean_squared_error: --, mean_q: --
  715/2000: episode: 9, duration: 2.609s, episode steps: 80, steps per second: 31, episode reward: 0.444, mean reward: 0.006 [-0.000, 0.013], mean action: 0.399 [-1.110, 1.120], mean observation: 0.124 [-9

  990/2000: episode: 12, duration: 2.880s, episode steps: 83, steps per second: 29, episode reward: 0.473, mean reward: 0.006 [-0.000, 0.014], mean action: 0.293 [-1.119, 1.151], mean observation: 0.124 [-16.184, 17.632], loss: --, mean_squared_error: --, mean_q: --
 1072/2000: episode: 13, duration: 3.449s, episode steps: 82, steps per second: 24, episode reward: 0.459, mean reward: 0.006 [-0.000, 0.014], mean action: 0.299 [-1.139, 1.168], mean observation: 0.125 [-15.733, 17.484], loss: 0.000201, mean_squared_error: 0.000403, mean_q: 0.708665
 1154/2000: episode: 14, duration: 3.601s, episode steps: 82, steps per second: 23, episode reward: 0.458, mean reward: 0.006 [-0.000, 0.014], mean action: 0.305 [-1.173, 1.216], mean observation: 0.125 [-17.183, 17.703], loss: 0.000214, mean_squared_error: 0.000427, mean_q: 0.703556
 1234/2000: episode: 15, duration: 3.407s, episode steps: 80, steps per second: 23, episode reward: 0.444, mean reward: 0.006 [-0.000, 0.014], mean action: 0.296 [

 1404/2000: episode: 18, duration: 3.046s, episode steps: 72, steps per second: 24, episode reward: 0.425, mean reward: 0.006 [-0.001, 0.014], mean action: 0.369 [-1.157, 1.149], mean observation: 0.125 [-9.621, 17.845], loss: 0.000066, mean_squared_error: 0.000133, mean_q: 0.720553
 1475/2000: episode: 19, duration: 2.928s, episode steps: 71, steps per second: 24, episode reward: 0.420, mean reward: 0.006 [-0.001, 0.014], mean action: 0.354 [-1.087, 1.205], mean observation: 0.126 [-9.687, 17.821], loss: 0.000079, mean_squared_error: 0.000158, mean_q: 0.698341
 1548/2000: episode: 20, duration: 3.157s, episode steps: 73, steps per second: 23, episode reward: 0.434, mean reward: 0.006 [-0.001, 0.014], mean action: 0.348 [-1.138, 1.205], mean observation: 0.125 [-9.731, 17.687], loss: 0.000098, mean_squared_error: 0.000197, mean_q: 0.710705
 1621/2000: episode: 21, duration: 3.071s, episode steps: 73, steps per second: 24, episode reward: 0.431, mean reward: 0.006 [-0.001, 0.014], mean 

 1607/2000: episode: 22, duration: 3.117s, episode steps: 74, steps per second: 24, episode reward: 0.425, mean reward: 0.006 [-0.000, 0.014], mean action: 0.217 [-1.097, 1.156], mean observation: 0.135 [-9.969, 18.013], loss: 0.000154, mean_squared_error: 0.000307, mean_q: 0.720255
 1681/2000: episode: 23, duration: 3.002s, episode steps: 74, steps per second: 25, episode reward: 0.431, mean reward: 0.006 [-0.000, 0.014], mean action: 0.192 [-1.123, 1.126], mean observation: 0.133 [-10.093, 17.961], loss: 0.000135, mean_squared_error: 0.000269, mean_q: 0.733543
 1759/2000: episode: 24, duration: 3.341s, episode steps: 78, steps per second: 23, episode reward: 0.457, mean reward: 0.006 [-0.000, 0.014], mean action: 0.196 [-1.091, 1.126], mean observation: 0.130 [-10.076, 18.204], loss: 0.000188, mean_squared_error: 0.000376, mean_q: 0.725469
 1835/2000: episode: 25, duration: 3.270s, episode steps: 76, steps per second: 23, episode reward: 0.441, mean reward: 0.006 [0.000, 0.014], mean

 1891/2000: episode: 25, duration: 3.113s, episode steps: 73, steps per second: 23, episode reward: 0.414, mean reward: 0.006 [-0.000, 0.014], mean action: 0.225 [-1.110, 1.189], mean observation: 0.135 [-9.910, 18.032], loss: 0.000082, mean_squared_error: 0.000164, mean_q: 0.739529
 1965/2000: episode: 26, duration: 2.957s, episode steps: 74, steps per second: 25, episode reward: 0.423, mean reward: 0.006 [0.000, 0.013], mean action: 0.227 [-1.081, 1.210], mean observation: 0.137 [-10.021, 18.239], loss: 0.000229, mean_squared_error: 0.000457, mean_q: 0.711784
done, took 74.676 seconds


iteration: 15
Training for 2000 steps ...
   74/2000: episode: 1, duration: 2.386s, episode steps: 74, steps per second: 31, episode reward: 0.421, mean reward: 0.006 [-0.000, 0.014], mean action: 0.234 [-1.074, 1.242], mean observation: 0.136 [-10.052, 17.912], loss: --, mean_squared_error: --, mean_q: --
  147/2000: episode: 2, duration: 2.302s, episode steps: 73, steps per second: 32, episode rewar

  145/2000: episode: 2, duration: 2.518s, episode steps: 73, steps per second: 29, episode reward: 0.406, mean reward: 0.006 [-0.002, 0.014], mean action: 0.240 [-1.125, 1.163], mean observation: 0.137 [-10.011, 18.040], loss: --, mean_squared_error: --, mean_q: --
  216/2000: episode: 3, duration: 2.545s, episode steps: 71, steps per second: 28, episode reward: 0.392, mean reward: 0.006 [-0.002, 0.014], mean action: 0.221 [-1.147, 1.103], mean observation: 0.138 [-10.057, 18.275], loss: --, mean_squared_error: --, mean_q: --
  287/2000: episode: 4, duration: 2.427s, episode steps: 71, steps per second: 29, episode reward: 0.393, mean reward: 0.006 [-0.002, 0.014], mean action: 0.229 [-1.175, 1.179], mean observation: 0.138 [-10.093, 18.138], loss: --, mean_squared_error: --, mean_q: --
  359/2000: episode: 5, duration: 2.413s, episode steps: 72, steps per second: 30, episode reward: 0.400, mean reward: 0.006 [-0.002, 0.014], mean action: 0.226 [-1.145, 1.155], mean observation: 0.138 

  852/2000: episode: 8, duration: 3.705s, episode steps: 108, steps per second: 29, episode reward: 0.600, mean reward: 0.006 [-0.001, 0.013], mean action: 0.236 [-1.177, 1.120], mean observation: 0.106 [-9.992, 18.257], loss: --, mean_squared_error: --, mean_q: --
  959/2000: episode: 9, duration: 3.652s, episode steps: 107, steps per second: 29, episode reward: 0.597, mean reward: 0.006 [-0.001, 0.014], mean action: 0.306 [-1.139, 1.208], mean observation: 0.106 [-10.001, 18.271], loss: --, mean_squared_error: --, mean_q: --
 1065/2000: episode: 10, duration: 4.197s, episode steps: 106, steps per second: 25, episode reward: 0.595, mean reward: 0.006 [-0.001, 0.014], mean action: 0.282 [-1.120, 1.159], mean observation: 0.106 [-10.043, 18.230], loss: 0.000051, mean_squared_error: 0.000101, mean_q: 0.723865
 1167/2000: episode: 11, duration: 4.407s, episode steps: 102, steps per second: 23, episode reward: 0.564, mean reward: 0.006 [-0.001, 0.012], mean action: 0.279 [-1.131, 1.203], m

 1825/2000: episode: 19, duration: 3.792s, episode steps: 94, steps per second: 25, episode reward: 0.526, mean reward: 0.006 [-0.001, 0.012], mean action: 0.132 [-1.135, 1.175], mean observation: 0.131 [-9.028, 21.204], loss: 0.000087, mean_squared_error: 0.000174, mean_q: 0.700485
 1914/2000: episode: 20, duration: 3.487s, episode steps: 89, steps per second: 26, episode reward: 0.500, mean reward: 0.006 [-0.001, 0.012], mean action: 0.102 [-1.154, 1.171], mean observation: 0.141 [-9.099, 20.042], loss: 0.000052, mean_squared_error: 0.000104, mean_q: 0.709838
done, took 73.056 seconds


iteration: 19
Training for 2000 steps ...
   90/2000: episode: 1, duration: 2.955s, episode steps: 90, steps per second: 30, episode reward: 0.505, mean reward: 0.006 [-0.001, 0.013], mean action: 0.078 [-1.137, 1.167], mean observation: 0.125 [-38.574, 20.801], loss: --, mean_squared_error: --, mean_q: --
  179/2000: episode: 2, duration: 2.960s, episode steps: 89, steps per second: 30, episode rewar

  562/2000: episode: 7, duration: 2.630s, episode steps: 80, steps per second: 30, episode reward: 0.462, mean reward: 0.006 [-0.001, 0.012], mean action: 0.073 [-1.209, 1.127], mean observation: 0.133 [-9.593, 19.121], loss: --, mean_squared_error: --, mean_q: --
  642/2000: episode: 8, duration: 2.704s, episode steps: 80, steps per second: 30, episode reward: 0.468, mean reward: 0.006 [-0.001, 0.012], mean action: 0.081 [-1.228, 1.130], mean observation: 0.130 [-9.739, 18.945], loss: --, mean_squared_error: --, mean_q: --
  722/2000: episode: 9, duration: 2.717s, episode steps: 80, steps per second: 29, episode reward: 0.465, mean reward: 0.006 [-0.001, 0.012], mean action: 0.106 [-1.075, 1.109], mean observation: 0.132 [-9.781, 18.857], loss: --, mean_squared_error: --, mean_q: --
  802/2000: episode: 10, duration: 2.664s, episode steps: 80, steps per second: 30, episode reward: 0.464, mean reward: 0.006 [-0.001, 0.012], mean action: 0.095 [-1.155, 1.200], mean observation: 0.123 [-

 1562/2000: episode: 15, duration: 3.662s, episode steps: 85, steps per second: 23, episode reward: 0.481, mean reward: 0.006 [-0.000, 0.014], mean action: 0.161 [-1.126, 1.231], mean observation: 0.121 [-38.707, 20.725], loss: 0.000071, mean_squared_error: 0.000142, mean_q: 0.693118
 1666/2000: episode: 16, duration: 4.761s, episode steps: 104, steps per second: 22, episode reward: 0.833, mean reward: 0.008 [-0.000, 0.020], mean action: 0.231 [-1.113, 1.177], mean observation: 0.101 [-11.256, 22.055], loss: 0.000153, mean_squared_error: 0.000305, mean_q: 0.698670
 1753/2000: episode: 17, duration: 3.907s, episode steps: 87, steps per second: 22, episode reward: 0.479, mean reward: 0.006 [-0.000, 0.013], mean action: 0.193 [-1.217, 1.162], mean observation: 0.120 [-44.438, 14.063], loss: 0.000175, mean_squared_error: 0.000351, mean_q: 0.692226
 1840/2000: episode: 18, duration: 3.671s, episode steps: 87, steps per second: 24, episode reward: 0.488, mean reward: 0.006 [0.000, 0.013], me

  855/2000: episode: 7, duration: 6.604s, episode steps: 121, steps per second: 18, episode reward: 0.726, mean reward: 0.006 [-0.001, 0.015], mean action: 0.328 [-1.144, 1.217], mean observation: 0.122 [-42.801, 18.680], loss: --, mean_squared_error: --, mean_q: --
  978/2000: episode: 8, duration: 6.434s, episode steps: 123, steps per second: 19, episode reward: 0.789, mean reward: 0.006 [-0.001, 0.015], mean action: 0.359 [-1.270, 1.256], mean observation: 0.110 [-42.467, 18.069], loss: --, mean_squared_error: --, mean_q: --
 1093/2000: episode: 9, duration: 9.399s, episode steps: 115, steps per second: 12, episode reward: 0.708, mean reward: 0.006 [-0.001, 0.016], mean action: 0.357 [-1.155, 1.270], mean observation: 0.082 [-50.280, 18.256], loss: 0.000137, mean_squared_error: 0.000275, mean_q: 0.681926
 1233/2000: episode: 10, duration: 7.725s, episode steps: 140, steps per second: 18, episode reward: 0.922, mean reward: 0.007 [-0.001, 0.018], mean action: 0.389 [-1.105, 1.193], m

 1518/2000: episode: 8, duration: 8.070s, episode steps: 145, steps per second: 18, episode reward: 0.896, mean reward: 0.006 [-0.001, 0.017], mean action: 0.282 [-1.171, 1.192], mean observation: 0.115 [-9.224, 19.300], loss: 0.000134, mean_squared_error: 0.000268, mean_q: 0.656648
 1653/2000: episode: 9, duration: 6.873s, episode steps: 135, steps per second: 20, episode reward: 0.812, mean reward: 0.006 [-0.001, 0.014], mean action: 0.298 [-1.186, 1.261], mean observation: 0.110 [-9.217, 19.278], loss: 0.000096, mean_squared_error: 0.000192, mean_q: 0.675534
 1792/2000: episode: 10, duration: 7.211s, episode steps: 139, steps per second: 19, episode reward: 0.854, mean reward: 0.006 [-0.001, 0.015], mean action: 0.306 [-1.159, 1.218], mean observation: 0.113 [-8.956, 18.255], loss: 0.000141, mean_squared_error: 0.000281, mean_q: 0.664662
 1929/2000: episode: 11, duration: 7.093s, episode steps: 137, steps per second: 19, episode reward: 0.939, mean reward: 0.007 [-0.000, 0.018], mea

 1952/2000: episode: 13, duration: 7.254s, episode steps: 168, steps per second: 23, episode reward: -0.782, mean reward: -0.005 [-0.020, 0.006], mean action: 0.217 [-1.269, 1.208], mean observation: 0.045 [-29.571, 20.236], loss: 0.000121, mean_squared_error: 0.000243, mean_q: 0.661821
done, took 79.236 seconds


iteration: 28
Training for 2000 steps ...
  141/2000: episode: 1, duration: 5.232s, episode steps: 141, steps per second: 27, episode reward: -0.800, mean reward: -0.006 [-0.020, 0.006], mean action: 0.289 [-1.159, 1.259], mean observation: 0.027 [-27.748, 14.526], loss: --, mean_squared_error: --, mean_q: --
  275/2000: episode: 2, duration: 4.865s, episode steps: 134, steps per second: 28, episode reward: -0.774, mean reward: -0.006 [-0.020, 0.007], mean action: 0.335 [-1.168, 1.276], mean observation: 0.023 [-30.822, 15.049], loss: --, mean_squared_error: --, mean_q: --
  414/2000: episode: 3, duration: 5.044s, episode steps: 139, steps per second: 28, episode reward: -0.7

 1132/2000: episode: 6, duration: 9.640s, episode steps: 197, steps per second: 20, episode reward: 0.860, mean reward: 0.004 [-0.002, 0.016], mean action: 0.295 [-1.272, 1.156], mean observation: 0.120 [-11.792, 18.719], loss: 0.000217, mean_squared_error: 0.000434, mean_q: 0.649764
 1254/2000: episode: 7, duration: 7.449s, episode steps: 122, steps per second: 16, episode reward: 0.864, mean reward: 0.007 [-0.003, 0.017], mean action: 0.433 [-1.229, 1.184], mean observation: 0.099 [-14.075, 16.701], loss: 0.000234, mean_squared_error: 0.000468, mean_q: 0.646097
 1380/2000: episode: 8, duration: 7.427s, episode steps: 126, steps per second: 17, episode reward: 0.705, mean reward: 0.006 [-0.004, 0.013], mean action: 0.438 [-1.129, 1.187], mean observation: 0.088 [-11.495, 16.688], loss: 0.000140, mean_squared_error: 0.000280, mean_q: 0.651383
 1487/2000: episode: 9, duration: 5.799s, episode steps: 107, steps per second: 18, episode reward: 0.688, mean reward: 0.006 [-0.003, 0.013], me

   92/2000: episode: 1, duration: 4.394s, episode steps: 92, steps per second: 21, episode reward: 0.677, mean reward: 0.007 [-0.002, 0.013], mean action: 0.380 [-1.229, 1.149], mean observation: 0.072 [-15.171, 22.966], loss: --, mean_squared_error: --, mean_q: --
  183/2000: episode: 2, duration: 4.330s, episode steps: 91, steps per second: 21, episode reward: 0.668, mean reward: 0.007 [-0.003, 0.013], mean action: 0.374 [-1.182, 1.195], mean observation: 0.070 [-19.863, 22.902], loss: --, mean_squared_error: --, mean_q: --
  273/2000: episode: 3, duration: 4.615s, episode steps: 90, steps per second: 20, episode reward: 0.656, mean reward: 0.007 [-0.003, 0.013], mean action: 0.340 [-1.201, 1.162], mean observation: 0.068 [-36.168, 22.980], loss: --, mean_squared_error: --, mean_q: --
  364/2000: episode: 4, duration: 4.304s, episode steps: 91, steps per second: 21, episode reward: 0.670, mean reward: 0.007 [-0.003, 0.013], mean action: 0.390 [-1.213, 1.166], mean observation: 0.070 

  824/2000: episode: 9, duration: 4.334s, episode steps: 91, steps per second: 21, episode reward: 0.677, mean reward: 0.007 [-0.000, 0.013], mean action: 0.338 [-1.116, 1.111], mean observation: 0.071 [-55.405, 18.280], loss: --, mean_squared_error: --, mean_q: --
  915/2000: episode: 10, duration: 4.388s, episode steps: 91, steps per second: 21, episode reward: 0.675, mean reward: 0.007 [-0.000, 0.013], mean action: 0.328 [-1.231, 1.152], mean observation: 0.075 [-43.315, 21.954], loss: --, mean_squared_error: --, mean_q: --
 1006/2000: episode: 11, duration: 4.475s, episode steps: 91, steps per second: 20, episode reward: 0.675, mean reward: 0.007 [-0.000, 0.013], mean action: 0.321 [-1.133, 1.156], mean observation: 0.076 [-39.113, 22.655], loss: 0.000174, mean_squared_error: 0.000349, mean_q: 0.643793
 1097/2000: episode: 12, duration: 5.341s, episode steps: 91, steps per second: 17, episode reward: 0.676, mean reward: 0.007 [-0.000, 0.013], mean action: 0.277 [-1.172, 1.221], mea

 1648/2000: episode: 18, duration: 4.816s, episode steps: 91, steps per second: 19, episode reward: 0.706, mean reward: 0.008 [-0.001, 0.014], mean action: 0.353 [-1.176, 1.157], mean observation: 0.076 [-27.041, 18.833], loss: 0.000174, mean_squared_error: 0.000348, mean_q: 0.636109
 1739/2000: episode: 19, duration: 5.141s, episode steps: 91, steps per second: 18, episode reward: 0.727, mean reward: 0.008 [-0.001, 0.015], mean action: 0.308 [-1.136, 1.148], mean observation: 0.078 [-29.725, 18.669], loss: 0.000151, mean_squared_error: 0.000301, mean_q: 0.638733
 1829/2000: episode: 20, duration: 5.122s, episode steps: 90, steps per second: 18, episode reward: 0.710, mean reward: 0.008 [-0.001, 0.015], mean action: 0.311 [-1.126, 1.201], mean observation: 0.079 [-26.437, 18.458], loss: 0.000142, mean_squared_error: 0.000284, mean_q: 0.653161
 1921/2000: episode: 21, duration: 5.116s, episode steps: 92, steps per second: 18, episode reward: 0.725, mean reward: 0.008 [0.000, 0.015], mea

  543/2000: episode: 6, duration: 4.153s, episode steps: 91, steps per second: 22, episode reward: 0.723, mean reward: 0.008 [-0.001, 0.015], mean action: 0.285 [-1.219, 1.131], mean observation: 0.076 [-22.829, 18.667], loss: --, mean_squared_error: --, mean_q: --
  634/2000: episode: 7, duration: 4.088s, episode steps: 91, steps per second: 22, episode reward: 0.717, mean reward: 0.008 [-0.001, 0.015], mean action: 0.315 [-1.157, 1.176], mean observation: 0.075 [-29.435, 18.504], loss: --, mean_squared_error: --, mean_q: --
  725/2000: episode: 8, duration: 4.281s, episode steps: 91, steps per second: 21, episode reward: 0.727, mean reward: 0.008 [-0.001, 0.015], mean action: 0.302 [-1.155, 1.156], mean observation: 0.078 [-28.289, 18.427], loss: --, mean_squared_error: --, mean_q: --
  816/2000: episode: 9, duration: 4.074s, episode steps: 91, steps per second: 22, episode reward: 0.734, mean reward: 0.008 [-0.001, 0.015], mean action: 0.304 [-1.192, 1.155], mean observation: 0.073 

 1393/2000: episode: 15, duration: 5.289s, episode steps: 95, steps per second: 18, episode reward: 0.672, mean reward: 0.007 [0.000, 0.013], mean action: 0.266 [-1.132, 1.202], mean observation: 0.084 [-53.264, 17.397], loss: 0.000185, mean_squared_error: 0.000369, mean_q: 0.642526
 1484/2000: episode: 16, duration: 5.187s, episode steps: 91, steps per second: 18, episode reward: 0.662, mean reward: 0.007 [-0.000, 0.013], mean action: 0.306 [-1.172, 1.158], mean observation: 0.074 [-50.409, 17.256], loss: 0.000179, mean_squared_error: 0.000357, mean_q: 0.639459
 1575/2000: episode: 17, duration: 4.836s, episode steps: 91, steps per second: 19, episode reward: 0.696, mean reward: 0.008 [-0.000, 0.014], mean action: 0.290 [-1.105, 1.247], mean observation: 0.072 [-49.820, 17.505], loss: 0.000305, mean_squared_error: 0.000609, mean_q: 0.644381
 1665/2000: episode: 18, duration: 4.848s, episode steps: 90, steps per second: 19, episode reward: 0.681, mean reward: 0.008 [-0.000, 0.014], mea

  280/2000: episode: 3, duration: 4.035s, episode steps: 93, steps per second: 23, episode reward: 0.699, mean reward: 0.008 [-0.001, 0.014], mean action: 0.202 [-1.144, 1.156], mean observation: 0.079 [-40.661, 22.088], loss: --, mean_squared_error: --, mean_q: --
  373/2000: episode: 4, duration: 4.167s, episode steps: 93, steps per second: 22, episode reward: 0.694, mean reward: 0.007 [-0.001, 0.014], mean action: 0.211 [-1.101, 1.168], mean observation: 0.079 [-43.710, 21.581], loss: --, mean_squared_error: --, mean_q: --
  467/2000: episode: 5, duration: 4.152s, episode steps: 94, steps per second: 23, episode reward: 0.716, mean reward: 0.008 [-0.001, 0.014], mean action: 0.209 [-1.188, 1.248], mean observation: 0.079 [-41.121, 21.948], loss: --, mean_squared_error: --, mean_q: --
  560/2000: episode: 6, duration: 4.085s, episode steps: 93, steps per second: 23, episode reward: 0.689, mean reward: 0.007 [-0.001, 0.014], mean action: 0.210 [-1.184, 1.141], mean observation: 0.078 

  559/2000: episode: 5, duration: 4.918s, episode steps: 112, steps per second: 23, episode reward: 0.759, mean reward: 0.007 [-0.001, 0.013], mean action: 0.118 [-1.162, 1.166], mean observation: 0.097 [-42.050, 16.448], loss: --, mean_squared_error: --, mean_q: --
  671/2000: episode: 6, duration: 5.006s, episode steps: 112, steps per second: 22, episode reward: 0.759, mean reward: 0.007 [-0.001, 0.013], mean action: 0.109 [-1.222, 1.183], mean observation: 0.097 [-38.932, 16.331], loss: --, mean_squared_error: --, mean_q: --
  783/2000: episode: 7, duration: 4.925s, episode steps: 112, steps per second: 23, episode reward: 0.761, mean reward: 0.007 [-0.001, 0.013], mean action: 0.115 [-1.191, 1.210], mean observation: 0.095 [-43.000, 16.482], loss: --, mean_squared_error: --, mean_q: --
  895/2000: episode: 8, duration: 4.950s, episode steps: 112, steps per second: 23, episode reward: 0.765, mean reward: 0.007 [-0.001, 0.014], mean action: 0.119 [-1.168, 1.227], mean observation: 0.

  448/2000: episode: 4, duration: 5.187s, episode steps: 112, steps per second: 22, episode reward: 0.756, mean reward: 0.007 [-0.000, 0.013], mean action: 0.253 [-1.180, 1.167], mean observation: 0.103 [-53.289, 15.248], loss: --, mean_squared_error: --, mean_q: --
  559/2000: episode: 5, duration: 5.272s, episode steps: 111, steps per second: 21, episode reward: 0.741, mean reward: 0.007 [-0.000, 0.013], mean action: 0.255 [-1.192, 1.219], mean observation: 0.102 [-52.052, 15.424], loss: --, mean_squared_error: --, mean_q: --
  672/2000: episode: 6, duration: 5.277s, episode steps: 113, steps per second: 21, episode reward: 0.769, mean reward: 0.007 [-0.000, 0.013], mean action: 0.277 [-1.150, 1.196], mean observation: 0.102 [-53.228, 15.189], loss: --, mean_squared_error: --, mean_q: --
  784/2000: episode: 7, duration: 5.145s, episode steps: 112, steps per second: 22, episode reward: 0.756, mean reward: 0.007 [-0.000, 0.012], mean action: 0.249 [-1.275, 1.136], mean observation: 0.

 1550/2000: episode: 16, duration: 5.799s, episode steps: 110, steps per second: 19, episode reward: 0.814, mean reward: 0.007 [-0.001, 0.015], mean action: 0.264 [-1.155, 1.165], mean observation: 0.105 [-16.653, 19.038], loss: 0.000153, mean_squared_error: 0.000306, mean_q: 0.632864
 1689/2000: episode: 17, duration: 6.725s, episode steps: 139, steps per second: 21, episode reward: -0.778, mean reward: -0.006 [-0.021, 0.011], mean action: 0.294 [-1.192, 1.231], mean observation: 0.024 [-17.779, 21.427], loss: 0.000245, mean_squared_error: 0.000490, mean_q: 0.627381
 1836/2000: episode: 18, duration: 6.988s, episode steps: 147, steps per second: 21, episode reward: -0.784, mean reward: -0.005 [-0.021, 0.011], mean action: 0.340 [-1.239, 1.175], mean observation: 0.025 [-41.244, 21.944], loss: 0.000174, mean_squared_error: 0.000347, mean_q: 0.629485
done, took 102.345 seconds


iteration: 45
Training for 2000 steps ...
  120/2000: episode: 1, duration: 4.975s, episode steps: 120, steps

 1184/2000: episode: 11, duration: 6.056s, episode steps: 110, steps per second: 18, episode reward: 0.755, mean reward: 0.007 [-0.001, 0.013], mean action: 0.226 [-1.122, 1.243], mean observation: 0.100 [-12.179, 18.996], loss: 0.000116, mean_squared_error: 0.000231, mean_q: 0.622618
 1289/2000: episode: 12, duration: 5.667s, episode steps: 105, steps per second: 19, episode reward: 0.713, mean reward: 0.007 [0.000, 0.013], mean action: 0.238 [-1.174, 1.165], mean observation: 0.092 [-18.171, 20.123], loss: 0.000118, mean_squared_error: 0.000235, mean_q: 0.628609
 1380/2000: episode: 13, duration: 5.581s, episode steps: 91, steps per second: 16, episode reward: 0.637, mean reward: 0.007 [-0.003, 0.012], mean action: 0.324 [-1.088, 1.167], mean observation: 0.072 [-18.691, 19.366], loss: 0.000163, mean_squared_error: 0.000326, mean_q: 0.630719
 1471/2000: episode: 14, duration: 5.287s, episode steps: 91, steps per second: 17, episode reward: 0.636, mean reward: 0.007 [-0.002, 0.013], m

  323/2000: episode: 3, duration: 4.496s, episode steps: 108, steps per second: 24, episode reward: 0.742, mean reward: 0.007 [0.000, 0.013], mean action: 0.226 [-1.172, 1.136], mean observation: 0.098 [-10.645, 19.628], loss: --, mean_squared_error: --, mean_q: --
  430/2000: episode: 4, duration: 4.720s, episode steps: 107, steps per second: 23, episode reward: 0.741, mean reward: 0.007 [0.000, 0.013], mean action: 0.254 [-1.075, 1.203], mean observation: 0.096 [-9.613, 20.464], loss: --, mean_squared_error: --, mean_q: --
  537/2000: episode: 5, duration: 4.542s, episode steps: 107, steps per second: 24, episode reward: 0.741, mean reward: 0.007 [0.000, 0.013], mean action: 0.230 [-1.150, 1.173], mean observation: 0.097 [-10.513, 20.147], loss: --, mean_squared_error: --, mean_q: --
  645/2000: episode: 6, duration: 4.644s, episode steps: 108, steps per second: 23, episode reward: 0.746, mean reward: 0.007 [0.000, 0.013], mean action: 0.214 [-1.139, 1.234], mean observation: 0.097 [

 1584/2000: episode: 15, duration: 5.755s, episode steps: 105, steps per second: 18, episode reward: 0.725, mean reward: 0.007 [0.001, 0.012], mean action: 0.229 [-1.164, 1.264], mean observation: 0.093 [-9.256, 18.939], loss: 0.000084, mean_squared_error: 0.000168, mean_q: 0.634827
 1688/2000: episode: 16, duration: 5.730s, episode steps: 104, steps per second: 18, episode reward: 0.729, mean reward: 0.007 [0.001, 0.012], mean action: 0.223 [-1.185, 1.229], mean observation: 0.095 [-13.508, 19.086], loss: 0.000096, mean_squared_error: 0.000192, mean_q: 0.618128
 1794/2000: episode: 17, duration: 5.644s, episode steps: 106, steps per second: 19, episode reward: 0.742, mean reward: 0.007 [0.001, 0.014], mean action: 0.244 [-1.163, 1.210], mean observation: 0.096 [-9.180, 19.331], loss: 0.000314, mean_squared_error: 0.000628, mean_q: 0.633805
 1901/2000: episode: 18, duration: 5.898s, episode steps: 107, steps per second: 18, episode reward: 0.746, mean reward: 0.007 [0.000, 0.013], mean

  975/2000: episode: 9, duration: 4.719s, episode steps: 108, steps per second: 23, episode reward: 0.748, mean reward: 0.007 [-0.000, 0.014], mean action: 0.232 [-1.239, 1.192], mean observation: 0.096 [-12.253, 19.109], loss: --, mean_squared_error: --, mean_q: --
 1083/2000: episode: 10, duration: 5.249s, episode steps: 108, steps per second: 21, episode reward: 0.749, mean reward: 0.007 [-0.000, 0.014], mean action: 0.229 [-1.254, 1.171], mean observation: 0.095 [-13.103, 19.028], loss: 0.000133, mean_squared_error: 0.000267, mean_q: 0.633453
 1190/2000: episode: 11, duration: 5.566s, episode steps: 107, steps per second: 19, episode reward: 0.737, mean reward: 0.007 [-0.000, 0.014], mean action: 0.211 [-1.128, 1.266], mean observation: 0.095 [-12.583, 18.995], loss: 0.000149, mean_squared_error: 0.000299, mean_q: 0.625206
 1298/2000: episode: 12, duration: 5.379s, episode steps: 108, steps per second: 20, episode reward: 0.755, mean reward: 0.007 [-0.000, 0.013], mean action: 0.21

  104/2000: episode: 1, duration: 5.214s, episode steps: 104, steps per second: 20, episode reward: 0.751, mean reward: 0.007 [-0.001, 0.014], mean action: 0.156 [-1.134, 1.159], mean observation: 0.088 [-10.166, 17.795], loss: --, mean_squared_error: --, mean_q: --
  209/2000: episode: 2, duration: 5.299s, episode steps: 105, steps per second: 20, episode reward: 0.771, mean reward: 0.007 [-0.001, 0.014], mean action: 0.175 [-1.131, 1.206], mean observation: 0.089 [-14.975, 17.915], loss: --, mean_squared_error: --, mean_q: --
  313/2000: episode: 3, duration: 5.124s, episode steps: 104, steps per second: 20, episode reward: 0.746, mean reward: 0.007 [-0.002, 0.013], mean action: 0.190 [-1.104, 1.199], mean observation: 0.088 [-10.188, 18.033], loss: --, mean_squared_error: --, mean_q: --
  419/2000: episode: 4, duration: 5.366s, episode steps: 106, steps per second: 20, episode reward: 0.781, mean reward: 0.007 [-0.002, 0.014], mean action: 0.168 [-1.239, 1.140], mean observation: 0.

 1133/2000: episode: 12, duration: 5.381s, episode steps: 96, steps per second: 18, episode reward: 0.704, mean reward: 0.007 [-0.002, 0.014], mean action: 0.126 [-1.138, 1.149], mean observation: 0.085 [-10.421, 17.854], loss: 0.000100, mean_squared_error: 0.000201, mean_q: 0.644375
 1231/2000: episode: 13, duration: 5.522s, episode steps: 98, steps per second: 18, episode reward: 0.715, mean reward: 0.007 [-0.001, 0.013], mean action: 0.218 [-1.092, 1.163], mean observation: 0.082 [-10.457, 18.042], loss: 0.000089, mean_squared_error: 0.000178, mean_q: 0.640513
 1328/2000: episode: 14, duration: 5.748s, episode steps: 97, steps per second: 17, episode reward: 0.704, mean reward: 0.007 [-0.002, 0.014], mean action: 0.223 [-1.116, 1.226], mean observation: 0.080 [-10.388, 18.031], loss: 0.000161, mean_squared_error: 0.000322, mean_q: 0.640127
 1422/2000: episode: 15, duration: 5.456s, episode steps: 94, steps per second: 17, episode reward: 0.692, mean reward: 0.007 [-0.002, 0.014], me

done, took 104.026 seconds


iteration: 56
Training for 2000 steps ...
   99/2000: episode: 1, duration: 4.536s, episode steps: 99, steps per second: 22, episode reward: 0.690, mean reward: 0.007 [-0.002, 0.013], mean action: 0.123 [-1.093, 1.147], mean observation: 0.084 [-18.466, 17.831], loss: --, mean_squared_error: --, mean_q: --
  198/2000: episode: 2, duration: 4.774s, episode steps: 99, steps per second: 21, episode reward: 0.687, mean reward: 0.007 [-0.002, 0.013], mean action: 0.139 [-1.148, 1.169], mean observation: 0.082 [-18.530, 18.117], loss: --, mean_squared_error: --, mean_q: --
  298/2000: episode: 3, duration: 4.657s, episode steps: 100, steps per second: 21, episode reward: 0.703, mean reward: 0.007 [-0.002, 0.013], mean action: 0.125 [-1.260, 1.161], mean observation: 0.085 [-19.554, 17.748], loss: --, mean_squared_error: --, mean_q: --
  398/2000: episode: 4, duration: 4.479s, episode steps: 100, steps per second: 22, episode reward: 0.700, mean reward: 0.007 [-0.

 1221/2000: episode: 12, duration: 5.642s, episode steps: 103, steps per second: 18, episode reward: 0.719, mean reward: 0.007 [-0.003, 0.014], mean action: 0.155 [-1.206, 1.257], mean observation: 0.084 [-20.258, 17.960], loss: 0.000083, mean_squared_error: 0.000167, mean_q: 0.631239
 1325/2000: episode: 13, duration: 5.705s, episode steps: 104, steps per second: 18, episode reward: 0.704, mean reward: 0.007 [-0.003, 0.014], mean action: 0.135 [-1.186, 1.158], mean observation: 0.081 [-18.738, 17.976], loss: 0.000183, mean_squared_error: 0.000366, mean_q: 0.629262
 1432/2000: episode: 14, duration: 5.631s, episode steps: 107, steps per second: 19, episode reward: 0.725, mean reward: 0.007 [-0.003, 0.014], mean action: 0.086 [-1.181, 1.218], mean observation: 0.087 [-18.563, 18.025], loss: 0.000098, mean_squared_error: 0.000195, mean_q: 0.640337
 1537/2000: episode: 15, duration: 5.530s, episode steps: 105, steps per second: 19, episode reward: 0.738, mean reward: 0.007 [-0.003, 0.014]

  313/2000: episode: 3, duration: 4.541s, episode steps: 105, steps per second: 23, episode reward: 0.748, mean reward: 0.007 [-0.002, 0.014], mean action: 0.170 [-1.157, 1.307], mean observation: 0.090 [-18.965, 18.000], loss: --, mean_squared_error: --, mean_q: --
  417/2000: episode: 4, duration: 4.463s, episode steps: 104, steps per second: 23, episode reward: 0.741, mean reward: 0.007 [-0.002, 0.014], mean action: 0.155 [-1.124, 1.190], mean observation: 0.087 [-19.200, 17.841], loss: --, mean_squared_error: --, mean_q: --
  521/2000: episode: 5, duration: 4.536s, episode steps: 104, steps per second: 23, episode reward: 0.738, mean reward: 0.007 [-0.002, 0.014], mean action: 0.151 [-1.118, 1.172], mean observation: 0.089 [-17.616, 17.959], loss: --, mean_squared_error: --, mean_q: --
  625/2000: episode: 6, duration: 4.472s, episode steps: 104, steps per second: 23, episode reward: 0.742, mean reward: 0.007 [-0.002, 0.014], mean action: 0.174 [-1.113, 1.248], mean observation: 0.

 1480/2000: episode: 14, duration: 5.106s, episode steps: 104, steps per second: 20, episode reward: 0.714, mean reward: 0.007 [-0.002, 0.013], mean action: 0.151 [-1.228, 1.121], mean observation: 0.085 [-10.357, 17.406], loss: 0.000083, mean_squared_error: 0.000167, mean_q: 0.628890
 1585/2000: episode: 15, duration: 5.449s, episode steps: 105, steps per second: 19, episode reward: 0.709, mean reward: 0.007 [-0.002, 0.013], mean action: 0.125 [-1.113, 1.152], mean observation: 0.086 [-35.046, 17.652], loss: 0.000339, mean_squared_error: 0.000677, mean_q: 0.625982
 1690/2000: episode: 16, duration: 5.580s, episode steps: 105, steps per second: 19, episode reward: 0.682, mean reward: 0.006 [-0.001, 0.013], mean action: 0.068 [-1.136, 1.154], mean observation: 0.080 [-41.539, 17.529], loss: 0.000170, mean_squared_error: 0.000340, mean_q: 0.617165
 1798/2000: episode: 17, duration: 5.652s, episode steps: 108, steps per second: 19, episode reward: 0.717, mean reward: 0.007 [-0.001, 0.013]

  970/2000: episode: 9, duration: 4.824s, episode steps: 107, steps per second: 22, episode reward: 0.706, mean reward: 0.007 [-0.001, 0.014], mean action: 0.121 [-1.132, 1.246], mean observation: 0.082 [-26.263, 17.561], loss: --, mean_squared_error: --, mean_q: --
 1077/2000: episode: 10, duration: 5.619s, episode steps: 107, steps per second: 19, episode reward: 0.707, mean reward: 0.007 [-0.001, 0.014], mean action: 0.111 [-1.241, 1.169], mean observation: 0.084 [-15.292, 17.511], loss: 0.000202, mean_squared_error: 0.000405, mean_q: 0.619385
 1218/2000: episode: 11, duration: 6.309s, episode steps: 141, steps per second: 22, episode reward: -0.801, mean reward: -0.006 [-0.020, 0.013], mean action: 0.167 [-1.240, 1.217], mean observation: 0.009 [-16.838, 13.578], loss: 0.000057, mean_squared_error: 0.000114, mean_q: 0.615956
 1352/2000: episode: 12, duration: 6.012s, episode steps: 134, steps per second: 22, episode reward: -0.766, mean reward: -0.006 [-0.021, 0.013], mean action: 

  783/2000: episode: 7, duration: 4.894s, episode steps: 112, steps per second: 23, episode reward: 0.736, mean reward: 0.007 [-0.001, 0.014], mean action: 0.094 [-1.275, 1.207], mean observation: 0.091 [-17.013, 17.429], loss: --, mean_squared_error: --, mean_q: --
  895/2000: episode: 8, duration: 4.800s, episode steps: 112, steps per second: 23, episode reward: 0.737, mean reward: 0.007 [-0.001, 0.014], mean action: 0.102 [-1.104, 1.127], mean observation: 0.088 [-16.921, 17.512], loss: --, mean_squared_error: --, mean_q: --
 1007/2000: episode: 9, duration: 4.888s, episode steps: 112, steps per second: 23, episode reward: 0.738, mean reward: 0.007 [-0.001, 0.014], mean action: 0.095 [-1.199, 1.327], mean observation: 0.088 [-16.519, 17.394], loss: 0.000068, mean_squared_error: 0.000136, mean_q: 0.630446
 1120/2000: episode: 10, duration: 5.895s, episode steps: 113, steps per second: 19, episode reward: 0.732, mean reward: 0.006 [-0.001, 0.014], mean action: 0.109 [-1.194, 1.210], m

 1907/2000: episode: 19, duration: 6.035s, episode steps: 104, steps per second: 17, episode reward: 0.678, mean reward: 0.007 [-0.002, 0.014], mean action: 0.088 [-1.204, 1.177], mean observation: 0.078 [-41.362, 17.908], loss: 0.000093, mean_squared_error: 0.000185, mean_q: 0.606218
done, took 109.550 seconds


iteration: 66
Training for 2000 steps ...
  103/2000: episode: 1, duration: 5.073s, episode steps: 103, steps per second: 20, episode reward: 0.680, mean reward: 0.007 [-0.002, 0.014], mean action: 0.108 [-1.131, 1.213], mean observation: 0.080 [-37.291, 17.953], loss: --, mean_squared_error: --, mean_q: --
  205/2000: episode: 2, duration: 5.200s, episode steps: 102, steps per second: 20, episode reward: 0.663, mean reward: 0.006 [-0.002, 0.014], mean action: 0.101 [-1.124, 1.181], mean observation: 0.075 [-26.622, 17.984], loss: --, mean_squared_error: --, mean_q: --
  307/2000: episode: 3, duration: 5.018s, episode steps: 102, steps per second: 20, episode reward: 0.664, me

 1130/2000: episode: 11, duration: 6.327s, episode steps: 102, steps per second: 16, episode reward: 0.663, mean reward: 0.006 [-0.002, 0.014], mean action: 0.076 [-1.214, 1.168], mean observation: 0.073 [-47.339, 17.916], loss: 0.000212, mean_squared_error: 0.000425, mean_q: 0.600053
 1228/2000: episode: 12, duration: 6.586s, episode steps: 98, steps per second: 15, episode reward: 0.627, mean reward: 0.006 [-0.002, 0.014], mean action: 0.049 [-1.140, 1.206], mean observation: 0.061 [-34.576, 17.563], loss: 0.000057, mean_squared_error: 0.000115, mean_q: 0.592616
 1325/2000: episode: 13, duration: 6.443s, episode steps: 97, steps per second: 15, episode reward: 0.620, mean reward: 0.006 [-0.003, 0.014], mean action: 0.038 [-1.149, 1.159], mean observation: 0.058 [-41.165, 17.539], loss: 0.000247, mean_squared_error: 0.000493, mean_q: 0.605548
 1429/2000: episode: 14, duration: 6.516s, episode steps: 104, steps per second: 16, episode reward: 0.672, mean reward: 0.006 [-0.002, 0.014], 

  198/2000: episode: 2, duration: 5.091s, episode steps: 99, steps per second: 19, episode reward: 0.608, mean reward: 0.006 [-0.002, 0.014], mean action: 0.077 [-1.158, 1.285], mean observation: 0.053 [-42.979, 17.512], loss: --, mean_squared_error: --, mean_q: --
  297/2000: episode: 3, duration: 5.117s, episode steps: 99, steps per second: 19, episode reward: 0.609, mean reward: 0.006 [-0.002, 0.014], mean action: 0.051 [-1.220, 1.295], mean observation: 0.060 [-42.851, 17.390], loss: --, mean_squared_error: --, mean_q: --
  396/2000: episode: 4, duration: 5.178s, episode steps: 99, steps per second: 19, episode reward: 0.607, mean reward: 0.006 [-0.002, 0.014], mean action: 0.071 [-1.157, 1.148], mean observation: 0.058 [-42.430, 17.438], loss: --, mean_squared_error: --, mean_q: --
  495/2000: episode: 5, duration: 5.084s, episode steps: 99, steps per second: 19, episode reward: 0.610, mean reward: 0.006 [-0.002, 0.014], mean action: 0.088 [-1.190, 1.148], mean observation: 0.060 

 1277/2000: episode: 12, duration: 6.702s, episode steps: 108, steps per second: 16, episode reward: 0.653, mean reward: 0.006 [-0.002, 0.014], mean action: 0.202 [-1.191, 1.159], mean observation: 0.067 [-43.676, 20.114], loss: 0.000129, mean_squared_error: 0.000258, mean_q: 0.597779
 1380/2000: episode: 13, duration: 6.528s, episode steps: 103, steps per second: 16, episode reward: 0.629, mean reward: 0.006 [-0.002, 0.014], mean action: 0.149 [-1.172, 1.109], mean observation: 0.071 [-45.279, 17.335], loss: 0.000200, mean_squared_error: 0.000400, mean_q: 0.595656
 1483/2000: episode: 14, duration: 6.524s, episode steps: 103, steps per second: 16, episode reward: 0.624, mean reward: 0.006 [-0.002, 0.014], mean action: 0.153 [-1.198, 1.103], mean observation: 0.057 [-39.147, 19.648], loss: 0.000680, mean_squared_error: 0.001360, mean_q: 0.600285
 1587/2000: episode: 15, duration: 6.218s, episode steps: 104, steps per second: 17, episode reward: 0.629, mean reward: 0.006 [-0.001, 0.014]

  443/2000: episode: 4, duration: 5.552s, episode steps: 110, steps per second: 20, episode reward: 0.679, mean reward: 0.006 [-0.002, 0.014], mean action: 0.048 [-1.241, 1.247], mean observation: 0.079 [-38.218, 20.026], loss: --, mean_squared_error: --, mean_q: --
  554/2000: episode: 5, duration: 5.613s, episode steps: 111, steps per second: 20, episode reward: 0.685, mean reward: 0.006 [-0.001, 0.014], mean action: 0.068 [-1.172, 1.232], mean observation: 0.080 [-33.934, 17.476], loss: --, mean_squared_error: --, mean_q: --
  667/2000: episode: 6, duration: 5.548s, episode steps: 113, steps per second: 20, episode reward: 0.699, mean reward: 0.006 [-0.001, 0.014], mean action: 0.071 [-1.213, 1.243], mean observation: 0.081 [-27.346, 17.701], loss: --, mean_squared_error: --, mean_q: --
  778/2000: episode: 7, duration: 5.408s, episode steps: 111, steps per second: 21, episode reward: 0.680, mean reward: 0.006 [-0.002, 0.014], mean action: 0.062 [-1.187, 1.232], mean observation: 0.

 1742/2000: episode: 16, duration: 6.693s, episode steps: 119, steps per second: 18, episode reward: 0.735, mean reward: 0.006 [0.000, 0.014], mean action: 0.054 [-1.227, 1.142], mean observation: 0.095 [-36.681, 17.919], loss: 0.000304, mean_squared_error: 0.000608, mean_q: 0.581927
 1869/2000: episode: 17, duration: 6.478s, episode steps: 127, steps per second: 20, episode reward: 0.770, mean reward: 0.006 [0.000, 0.013], mean action: 0.034 [-1.217, 1.257], mean observation: 0.102 [-11.112, 17.813], loss: 0.000180, mean_squared_error: 0.000361, mean_q: 0.602786
 1990/2000: episode: 18, duration: 6.475s, episode steps: 121, steps per second: 19, episode reward: 0.742, mean reward: 0.006 [-0.000, 0.013], mean action: 0.010 [-1.196, 1.138], mean observation: 0.093 [-39.706, 17.754], loss: 0.000313, mean_squared_error: 0.000627, mean_q: 0.581198
done, took 106.680 seconds


iteration: 74
Training for 2000 steps ...
  124/2000: episode: 1, duration: 5.276s, episode steps: 124, steps per s

 1416/2000: episode: 12, duration: 6.973s, episode steps: 117, steps per second: 17, episode reward: 0.680, mean reward: 0.006 [0.000, 0.014], mean action: -0.059 [-1.151, 1.124], mean observation: 0.106 [-14.701, 17.524], loss: 0.000074, mean_squared_error: 0.000148, mean_q: 0.587558
 1530/2000: episode: 13, duration: 6.974s, episode steps: 114, steps per second: 16, episode reward: 0.692, mean reward: 0.006 [0.000, 0.014], mean action: -0.046 [-1.255, 1.087], mean observation: 0.104 [-14.318, 17.165], loss: 0.000146, mean_squared_error: 0.000291, mean_q: 0.592864
 1641/2000: episode: 14, duration: 6.737s, episode steps: 111, steps per second: 16, episode reward: 0.647, mean reward: 0.006 [0.000, 0.014], mean action: 0.003 [-1.152, 1.245], mean observation: 0.102 [-13.109, 17.333], loss: 0.000106, mean_squared_error: 0.000213, mean_q: 0.584262
 1753/2000: episode: 15, duration: 6.596s, episode steps: 112, steps per second: 17, episode reward: 0.660, mean reward: 0.006 [0.000, 0.014], 

  991/2000: episode: 8, duration: 5.935s, episode steps: 119, steps per second: 20, episode reward: 0.699, mean reward: 0.006 [0.000, 0.013], mean action: -0.101 [-1.223, 1.160], mean observation: 0.101 [-44.930, 17.872], loss: --, mean_squared_error: --, mean_q: --
 1118/2000: episode: 9, duration: 7.143s, episode steps: 127, steps per second: 18, episode reward: 0.761, mean reward: 0.006 [0.000, 0.014], mean action: -0.139 [-1.232, 1.166], mean observation: 0.098 [-14.226, 17.782], loss: 0.000120, mean_squared_error: 0.000240, mean_q: 0.581994
 1233/2000: episode: 10, duration: 7.101s, episode steps: 115, steps per second: 16, episode reward: 0.685, mean reward: 0.006 [0.000, 0.013], mean action: -0.102 [-1.192, 1.233], mean observation: 0.101 [-37.448, 21.106], loss: 0.000106, mean_squared_error: 0.000213, mean_q: 0.582332
 1358/2000: episode: 11, duration: 7.496s, episode steps: 125, steps per second: 17, episode reward: 0.697, mean reward: 0.006 [0.000, 0.013], mean action: -0.053

  316/2000: episode: 3, duration: 6.254s, episode steps: 105, steps per second: 17, episode reward: 0.629, mean reward: 0.006 [-0.001, 0.014], mean action: 0.056 [-1.194, 1.232], mean observation: 0.064 [-53.558, 17.295], loss: --, mean_squared_error: --, mean_q: --
  422/2000: episode: 4, duration: 6.196s, episode steps: 106, steps per second: 17, episode reward: 0.638, mean reward: 0.006 [-0.001, 0.014], mean action: 0.052 [-1.133, 1.177], mean observation: 0.070 [-52.251, 19.781], loss: --, mean_squared_error: --, mean_q: --
  527/2000: episode: 5, duration: 6.296s, episode steps: 105, steps per second: 17, episode reward: 0.621, mean reward: 0.006 [-0.001, 0.014], mean action: 0.026 [-1.249, 1.137], mean observation: 0.077 [-53.057, 17.216], loss: --, mean_squared_error: --, mean_q: --
  633/2000: episode: 6, duration: 6.277s, episode steps: 106, steps per second: 17, episode reward: 0.632, mean reward: 0.006 [-0.001, 0.014], mean action: 0.052 [-1.164, 1.184], mean observation: 0.

 1451/2000: episode: 14, duration: 6.986s, episode steps: 102, steps per second: 15, episode reward: 0.627, mean reward: 0.006 [-0.001, 0.013], mean action: 0.096 [-1.173, 1.130], mean observation: 0.069 [-51.274, 18.221], loss: 0.000223, mean_squared_error: 0.000447, mean_q: 0.580384
 1554/2000: episode: 15, duration: 7.254s, episode steps: 103, steps per second: 14, episode reward: 0.625, mean reward: 0.006 [-0.001, 0.013], mean action: 0.112 [-1.222, 1.192], mean observation: 0.075 [-51.021, 13.576], loss: 0.000122, mean_squared_error: 0.000244, mean_q: 0.598785
 1658/2000: episode: 16, duration: 7.282s, episode steps: 104, steps per second: 14, episode reward: 0.626, mean reward: 0.006 [-0.001, 0.014], mean action: 0.059 [-1.110, 1.134], mean observation: 0.070 [-53.522, 15.068], loss: 0.000082, mean_squared_error: 0.000163, mean_q: 0.590986
 1764/2000: episode: 17, duration: 7.599s, episode steps: 106, steps per second: 14, episode reward: 0.645, mean reward: 0.006 [-0.001, 0.013]

  534/2000: episode: 5, duration: 5.991s, episode steps: 107, steps per second: 18, episode reward: 0.653, mean reward: 0.006 [-0.001, 0.013], mean action: 0.176 [-1.130, 1.156], mean observation: 0.072 [-51.017, 18.128], loss: --, mean_squared_error: --, mean_q: --
  641/2000: episode: 6, duration: 5.960s, episode steps: 107, steps per second: 18, episode reward: 0.650, mean reward: 0.006 [-0.001, 0.013], mean action: 0.156 [-1.103, 1.155], mean observation: 0.073 [-50.127, 18.008], loss: --, mean_squared_error: --, mean_q: --
  747/2000: episode: 7, duration: 5.932s, episode steps: 106, steps per second: 18, episode reward: 0.642, mean reward: 0.006 [-0.001, 0.013], mean action: 0.186 [-1.121, 1.211], mean observation: 0.073 [-50.118, 18.007], loss: --, mean_squared_error: --, mean_q: --
  854/2000: episode: 8, duration: 5.981s, episode steps: 107, steps per second: 18, episode reward: 0.649, mean reward: 0.006 [-0.001, 0.013], mean action: 0.157 [-1.164, 1.140], mean observation: 0.

 1841/2000: episode: 17, duration: 7.670s, episode steps: 114, steps per second: 15, episode reward: 0.704, mean reward: 0.006 [-0.001, 0.014], mean action: -0.026 [-1.148, 1.094], mean observation: 0.092 [-49.029, 17.954], loss: 0.000332, mean_squared_error: 0.000665, mean_q: 0.589554
 1952/2000: episode: 18, duration: 7.713s, episode steps: 111, steps per second: 14, episode reward: 0.682, mean reward: 0.006 [-0.001, 0.014], mean action: -0.028 [-1.155, 1.224], mean observation: 0.082 [-51.346, 17.884], loss: 0.000106, mean_squared_error: 0.000212, mean_q: 0.576160
done, took 128.553 seconds


iteration: 84
Training for 2000 steps ...
  113/2000: episode: 1, duration: 6.768s, episode steps: 113, steps per second: 17, episode reward: 0.677, mean reward: 0.006 [-0.001, 0.013], mean action: -0.019 [-1.112, 1.171], mean observation: 0.086 [-49.281, 18.117], loss: --, mean_squared_error: --, mean_q: --
  225/2000: episode: 2, duration: 6.616s, episode steps: 112, steps per second: 17, epi

 1188/2000: episode: 11, duration: 6.829s, episode steps: 109, steps per second: 16, episode reward: 0.652, mean reward: 0.006 [-0.001, 0.013], mean action: 0.098 [-1.217, 1.167], mean observation: 0.092 [-14.045, 17.575], loss: 0.000068, mean_squared_error: 0.000135, mean_q: 0.596074
 1296/2000: episode: 12, duration: 6.860s, episode steps: 108, steps per second: 16, episode reward: 0.642, mean reward: 0.006 [-0.001, 0.014], mean action: 0.088 [-1.160, 1.197], mean observation: 0.090 [-21.741, 17.687], loss: 0.000333, mean_squared_error: 0.000666, mean_q: 0.598362
 1402/2000: episode: 13, duration: 7.095s, episode steps: 106, steps per second: 15, episode reward: 0.624, mean reward: 0.006 [-0.001, 0.013], mean action: 0.121 [-1.157, 1.182], mean observation: 0.087 [-41.302, 17.785], loss: 0.000056, mean_squared_error: 0.000112, mean_q: 0.590015
 1510/2000: episode: 14, duration: 6.780s, episode steps: 108, steps per second: 16, episode reward: 0.628, mean reward: 0.006 [-0.002, 0.014]

  455/2000: episode: 4, duration: 5.920s, episode steps: 114, steps per second: 19, episode reward: 0.692, mean reward: 0.006 [-0.001, 0.013], mean action: 0.095 [-1.199, 1.150], mean observation: 0.083 [-27.878, 22.192], loss: --, mean_squared_error: --, mean_q: --
  568/2000: episode: 5, duration: 5.896s, episode steps: 113, steps per second: 19, episode reward: 0.691, mean reward: 0.006 [-0.001, 0.013], mean action: 0.121 [-1.110, 1.190], mean observation: 0.083 [-36.828, 19.762], loss: --, mean_squared_error: --, mean_q: --
  682/2000: episode: 6, duration: 5.933s, episode steps: 114, steps per second: 19, episode reward: 0.703, mean reward: 0.006 [-0.001, 0.013], mean action: 0.106 [-1.070, 1.182], mean observation: 0.082 [-34.392, 24.235], loss: --, mean_squared_error: --, mean_q: --
  794/2000: episode: 7, duration: 5.855s, episode steps: 112, steps per second: 19, episode reward: 0.697, mean reward: 0.006 [-0.001, 0.013], mean action: 0.088 [-1.186, 1.170], mean observation: 0.

 1792/2000: episode: 16, duration: 7.186s, episode steps: 110, steps per second: 15, episode reward: 0.688, mean reward: 0.006 [-0.001, 0.013], mean action: 0.134 [-1.167, 1.242], mean observation: 0.080 [-27.499, 17.583], loss: 0.000240, mean_squared_error: 0.000479, mean_q: 0.578031
 1908/2000: episode: 17, duration: 6.896s, episode steps: 116, steps per second: 17, episode reward: 0.712, mean reward: 0.006 [-0.001, 0.013], mean action: 0.119 [-1.222, 1.200], mean observation: 0.085 [-10.448, 17.587], loss: 0.000077, mean_squared_error: 0.000154, mean_q: 0.578037
done, took 118.189 seconds


iteration: 89
Training for 2000 steps ...
  115/2000: episode: 1, duration: 6.288s, episode steps: 115, steps per second: 18, episode reward: 0.697, mean reward: 0.006 [-0.001, 0.013], mean action: 0.093 [-1.150, 1.114], mean observation: 0.083 [-14.073, 17.553], loss: --, mean_squared_error: --, mean_q: --
  230/2000: episode: 2, duration: 5.851s, episode steps: 115, steps per second: 20, episod

 1373/2000: episode: 12, duration: 7.841s, episode steps: 147, steps per second: 19, episode reward: 0.767, mean reward: 0.005 [0.000, 0.012], mean action: -0.003 [-1.187, 1.150], mean observation: 0.108 [-10.306, 19.646], loss: 0.000205, mean_squared_error: 0.000409, mean_q: 0.593911
 1524/2000: episode: 13, duration: 8.428s, episode steps: 151, steps per second: 18, episode reward: 0.798, mean reward: 0.005 [0.000, 0.014], mean action: -0.061 [-1.243, 1.256], mean observation: 0.112 [-34.167, 19.807], loss: 0.000548, mean_squared_error: 0.001095, mean_q: 0.578509
 1659/2000: episode: 14, duration: 7.049s, episode steps: 135, steps per second: 19, episode reward: 0.787, mean reward: 0.006 [0.000, 0.013], mean action: -0.103 [-1.202, 1.117], mean observation: 0.111 [-9.644, 18.597], loss: 0.000138, mean_squared_error: 0.000275, mean_q: 0.585843
 1793/2000: episode: 15, duration: 7.121s, episode steps: 134, steps per second: 19, episode reward: 0.777, mean reward: 0.006 [0.000, 0.013], 

done, took 96.756 seconds


iteration: 93
Training for 2000 steps ...
  133/2000: episode: 1, duration: 6.023s, episode steps: 133, steps per second: 22, episode reward: 0.753, mean reward: 0.006 [0.000, 0.012], mean action: -0.036 [-1.275, 1.236], mean observation: 0.116 [-20.452, 20.039], loss: --, mean_squared_error: --, mean_q: --
  269/2000: episode: 2, duration: 6.269s, episode steps: 136, steps per second: 22, episode reward: 0.790, mean reward: 0.006 [0.000, 0.012], mean action: -0.032 [-1.241, 1.212], mean observation: 0.116 [-39.094, 19.367], loss: --, mean_squared_error: --, mean_q: --
  404/2000: episode: 3, duration: 6.207s, episode steps: 135, steps per second: 22, episode reward: 0.786, mean reward: 0.006 [0.000, 0.012], mean action: -0.051 [-1.315, 1.148], mean observation: 0.115 [-35.797, 19.875], loss: --, mean_squared_error: --, mean_q: --
  540/2000: episode: 4, duration: 6.105s, episode steps: 136, steps per second: 22, episode reward: 0.796, mean reward: 0.006 [0.

  107/2000: episode: 1, duration: 5.178s, episode steps: 107, steps per second: 21, episode reward: 0.655, mean reward: 0.006 [-0.001, 0.013], mean action: 0.088 [-1.108, 1.242], mean observation: 0.093 [-10.299, 17.898], loss: --, mean_squared_error: --, mean_q: --
  218/2000: episode: 2, duration: 5.445s, episode steps: 111, steps per second: 20, episode reward: 0.596, mean reward: 0.005 [-0.001, 0.013], mean action: 0.072 [-1.131, 1.181], mean observation: 0.111 [-10.343, 18.225], loss: --, mean_squared_error: --, mean_q: --
  327/2000: episode: 3, duration: 5.396s, episode steps: 109, steps per second: 20, episode reward: 0.602, mean reward: 0.006 [-0.001, 0.013], mean action: 0.072 [-1.173, 1.120], mean observation: 0.110 [-10.287, 17.843], loss: --, mean_squared_error: --, mean_q: --
  437/2000: episode: 4, duration: 5.654s, episode steps: 110, steps per second: 19, episode reward: 0.591, mean reward: 0.005 [-0.001, 0.013], mean action: 0.071 [-1.259, 1.210], mean observation: 0.

 1610/2000: episode: 14, duration: 6.582s, episode steps: 106, steps per second: 16, episode reward: 0.573, mean reward: 0.005 [-0.001, 0.013], mean action: 0.105 [-1.215, 1.200], mean observation: 0.081 [-45.343, 17.955], loss: 0.000125, mean_squared_error: 0.000250, mean_q: 0.575718
 1732/2000: episode: 15, duration: 10.215s, episode steps: 122, steps per second: 12, episode reward: 0.394, mean reward: 0.003 [-0.001, 0.013], mean action: 0.092 [-1.146, 1.235], mean observation: 0.098 [-42.069, 17.856], loss: 0.000198, mean_squared_error: 0.000396, mean_q: 0.574846
 1845/2000: episode: 16, duration: 6.952s, episode steps: 113, steps per second: 16, episode reward: 0.691, mean reward: 0.006 [-0.001, 0.013], mean action: 0.107 [-1.208, 1.170], mean observation: 0.089 [-14.719, 17.841], loss: 0.000106, mean_squared_error: 0.000213, mean_q: 0.581787
 1955/2000: episode: 17, duration: 6.161s, episode steps: 110, steps per second: 18, episode reward: 0.700, mean reward: 0.006 [-0.001, 0.013

 1103/2000: episode: 10, duration: 6.007s, episode steps: 109, steps per second: 18, episode reward: 0.705, mean reward: 0.006 [-0.001, 0.013], mean action: 0.224 [-1.122, 1.256], mean observation: 0.083 [-9.959, 17.988], loss: 0.000141, mean_squared_error: 0.000282, mean_q: 0.569630
 1212/2000: episode: 11, duration: 6.106s, episode steps: 109, steps per second: 18, episode reward: 0.721, mean reward: 0.007 [-0.002, 0.013], mean action: 0.229 [-1.105, 1.145], mean observation: 0.085 [-10.033, 18.109], loss: 0.000354, mean_squared_error: 0.000708, mean_q: 0.562056
 1324/2000: episode: 12, duration: 6.087s, episode steps: 112, steps per second: 18, episode reward: 0.736, mean reward: 0.007 [-0.000, 0.013], mean action: 0.227 [-1.152, 1.230], mean observation: 0.088 [-10.172, 17.897], loss: 0.000118, mean_squared_error: 0.000237, mean_q: 0.565228
 1438/2000: episode: 13, duration: 6.185s, episode steps: 114, steps per second: 18, episode reward: 0.763, mean reward: 0.007 [0.000, 0.013], 

  326/2000: episode: 3, duration: 4.622s, episode steps: 108, steps per second: 23, episode reward: 0.715, mean reward: 0.007 [-0.001, 0.014], mean action: 0.231 [-1.114, 1.169], mean observation: 0.086 [-17.076, 18.155], loss: --, mean_squared_error: --, mean_q: --
  435/2000: episode: 4, duration: 4.771s, episode steps: 109, steps per second: 23, episode reward: 0.721, mean reward: 0.007 [-0.001, 0.014], mean action: 0.225 [-1.147, 1.155], mean observation: 0.088 [-11.967, 17.800], loss: --, mean_squared_error: --, mean_q: --
  544/2000: episode: 5, duration: 4.802s, episode steps: 109, steps per second: 23, episode reward: 0.723, mean reward: 0.007 [-0.001, 0.014], mean action: 0.260 [-1.149, 1.312], mean observation: 0.089 [-11.733, 17.866], loss: --, mean_squared_error: --, mean_q: --
  652/2000: episode: 6, duration: 4.660s, episode steps: 108, steps per second: 23, episode reward: 0.716, mean reward: 0.007 [-0.001, 0.014], mean action: 0.251 [-1.074, 1.180], mean observation: 0.

 1761/2000: episode: 15, duration: 6.944s, episode steps: 123, steps per second: 18, episode reward: 0.693, mean reward: 0.006 [-0.000, 0.014], mean action: 0.193 [-1.165, 1.182], mean observation: 0.081 [-16.097, 18.443], loss: 0.000117, mean_squared_error: 0.000234, mean_q: 0.544518
 1876/2000: episode: 16, duration: 6.106s, episode steps: 115, steps per second: 19, episode reward: 0.689, mean reward: 0.006 [-0.001, 0.014], mean action: 0.169 [-1.201, 1.223], mean observation: 0.081 [-13.954, 18.030], loss: 0.000126, mean_squared_error: 0.000252, mean_q: 0.551589
 1993/2000: episode: 17, duration: 6.166s, episode steps: 117, steps per second: 19, episode reward: 0.706, mean reward: 0.006 [-0.000, 0.013], mean action: 0.141 [-1.118, 1.167], mean observation: 0.084 [-17.755, 18.076], loss: 0.000065, mean_squared_error: 0.000131, mean_q: 0.556598
done, took 98.583 seconds


iteration: 102
Training for 2000 steps ...
  114/2000: episode: 1, duration: 5.001s, episode steps: 114, steps per

 1163/2000: episode: 11, duration: 6.728s, episode steps: 116, steps per second: 17, episode reward: 0.669, mean reward: 0.006 [-0.001, 0.013], mean action: 0.007 [-1.102, 1.221], mean observation: 0.088 [-14.412, 17.434], loss: 0.000150, mean_squared_error: 0.000300, mean_q: 0.548491
 1282/2000: episode: 12, duration: 7.219s, episode steps: 119, steps per second: 16, episode reward: 0.677, mean reward: 0.006 [-0.001, 0.014], mean action: 0.003 [-1.171, 1.251], mean observation: 0.089 [-14.975, 17.484], loss: 0.000314, mean_squared_error: 0.000627, mean_q: 0.554476
 1377/2000: episode: 13, duration: 6.158s, episode steps: 95, steps per second: 15, episode reward: 0.575, mean reward: 0.006 [0.000, 0.013], mean action: 0.036 [-1.155, 1.147], mean observation: 0.069 [-45.801, 26.344], loss: 0.000201, mean_squared_error: 0.000401, mean_q: 0.546562
 1478/2000: episode: 14, duration: 6.342s, episode steps: 101, steps per second: 16, episode reward: 0.646, mean reward: 0.006 [-0.000, 0.014], 

  527/2000: episode: 5, duration: 5.280s, episode steps: 105, steps per second: 20, episode reward: 0.675, mean reward: 0.006 [-0.000, 0.013], mean action: 0.263 [-1.251, 1.200], mean observation: 0.087 [-35.641, 17.886], loss: --, mean_squared_error: --, mean_q: --
  625/2000: episode: 6, duration: 5.000s, episode steps: 98, steps per second: 20, episode reward: 0.602, mean reward: 0.006 [-0.000, 0.013], mean action: 0.260 [-1.146, 1.159], mean observation: 0.073 [-39.889, 22.357], loss: --, mean_squared_error: --, mean_q: --
  729/2000: episode: 7, duration: 5.292s, episode steps: 104, steps per second: 20, episode reward: 0.641, mean reward: 0.006 [-0.000, 0.013], mean action: 0.238 [-1.142, 1.189], mean observation: 0.067 [-44.970, 18.083], loss: --, mean_squared_error: --, mean_q: --
  835/2000: episode: 8, duration: 5.351s, episode steps: 106, steps per second: 20, episode reward: 0.680, mean reward: 0.006 [-0.000, 0.013], mean action: 0.288 [-1.118, 1.176], mean observation: 0.0

  126/2000: episode: 1, duration: 6.339s, episode steps: 126, steps per second: 20, episode reward: 0.681, mean reward: 0.005 [-0.002, 0.013], mean action: -0.088 [-1.161, 1.140], mean observation: 0.089 [-31.017, 13.001], loss: --, mean_squared_error: --, mean_q: --
  256/2000: episode: 2, duration: 5.862s, episode steps: 130, steps per second: 22, episode reward: 0.688, mean reward: 0.005 [-0.002, 0.013], mean action: -0.109 [-1.206, 1.161], mean observation: 0.092 [-10.053, 12.905], loss: --, mean_squared_error: --, mean_q: --
  380/2000: episode: 3, duration: 5.982s, episode steps: 124, steps per second: 21, episode reward: 0.676, mean reward: 0.005 [-0.002, 0.013], mean action: -0.053 [-1.107, 1.169], mean observation: 0.089 [-43.020, 12.979], loss: --, mean_squared_error: --, mean_q: --
  501/2000: episode: 4, duration: 5.765s, episode steps: 121, steps per second: 21, episode reward: 0.657, mean reward: 0.005 [-0.002, 0.013], mean action: -0.064 [-1.284, 1.201], mean observation

  103/2000: episode: 1, duration: 4.442s, episode steps: 103, steps per second: 23, episode reward: 0.576, mean reward: 0.006 [-0.001, 0.013], mean action: -0.053 [-1.204, 1.207], mean observation: 0.105 [-10.130, 13.137], loss: --, mean_squared_error: --, mean_q: --
  205/2000: episode: 2, duration: 4.300s, episode steps: 102, steps per second: 24, episode reward: 0.574, mean reward: 0.006 [-0.001, 0.013], mean action: -0.030 [-1.199, 1.180], mean observation: 0.104 [-10.071, 13.016], loss: --, mean_squared_error: --, mean_q: --
  309/2000: episode: 3, duration: 4.462s, episode steps: 104, steps per second: 23, episode reward: 0.581, mean reward: 0.006 [-0.001, 0.012], mean action: -0.025 [-1.174, 1.218], mean observation: 0.107 [-10.074, 13.008], loss: --, mean_squared_error: --, mean_q: --
  413/2000: episode: 4, duration: 4.426s, episode steps: 104, steps per second: 23, episode reward: 0.572, mean reward: 0.005 [-0.001, 0.012], mean action: -0.060 [-1.166, 1.126], mean observation

 1951/2000: episode: 13, duration: 8.461s, episode steps: 193, steps per second: 23, episode reward: -0.885, mean reward: -0.005 [-0.021, 0.011], mean action: 0.019 [-1.231, 1.166], mean observation: 0.051 [-10.001, 12.836], loss: 0.000127, mean_squared_error: 0.000255, mean_q: 0.520581
done, took 89.968 seconds


iteration: 111
Training for 2000 steps ...
  178/2000: episode: 1, duration: 6.047s, episode steps: 178, steps per second: 29, episode reward: -0.879, mean reward: -0.005 [-0.021, 0.011], mean action: 0.017 [-1.159, 1.134], mean observation: 0.049 [-10.064, 12.996], loss: --, mean_squared_error: --, mean_q: --
  354/2000: episode: 2, duration: 6.268s, episode steps: 176, steps per second: 28, episode reward: -0.843, mean reward: -0.005 [-0.021, 0.011], mean action: 0.030 [-1.173, 1.207], mean observation: 0.051 [-10.173, 13.149], loss: --, mean_squared_error: --, mean_q: --
  531/2000: episode: 3, duration: 6.054s, episode steps: 177, steps per second: 29, episode reward: -0.

  932/2000: episode: 7, duration: 4.453s, episode steps: 133, steps per second: 30, episode reward: -0.775, mean reward: -0.006 [-0.019, 0.013], mean action: 0.026 [-1.136, 1.229], mean observation: 0.035 [-15.428, 17.867], loss: --, mean_squared_error: --, mean_q: --
 1068/2000: episode: 8, duration: 5.122s, episode steps: 136, steps per second: 27, episode reward: -0.831, mean reward: -0.006 [-0.019, 0.014], mean action: -0.005 [-1.146, 1.229], mean observation: 0.055 [-14.880, 18.035], loss: 0.000063, mean_squared_error: 0.000126, mean_q: 0.523137
 1204/2000: episode: 9, duration: 5.570s, episode steps: 136, steps per second: 24, episode reward: -0.804, mean reward: -0.006 [-0.018, 0.013], mean action: -0.004 [-1.206, 1.164], mean observation: 0.039 [-14.933, 18.133], loss: 0.000172, mean_squared_error: 0.000344, mean_q: 0.526089
 1338/2000: episode: 10, duration: 6.013s, episode steps: 134, steps per second: 22, episode reward: -0.868, mean reward: -0.006 [-0.019, 0.013], mean acti

  865/2000: episode: 6, duration: 4.204s, episode steps: 145, steps per second: 34, episode reward: -0.860, mean reward: -0.006 [-0.019, 0.011], mean action: -0.012 [-1.138, 1.160], mean observation: 0.052 [-21.204, 20.269], loss: --, mean_squared_error: --, mean_q: --
 1008/2000: episode: 7, duration: 4.189s, episode steps: 143, steps per second: 34, episode reward: -0.843, mean reward: -0.006 [-0.019, 0.011], mean action: -0.015 [-1.122, 1.234], mean observation: 0.052 [-21.796, 20.456], loss: 0.000083, mean_squared_error: 0.000166, mean_q: 0.502070
 1153/2000: episode: 8, duration: 5.499s, episode steps: 145, steps per second: 26, episode reward: -0.851, mean reward: -0.006 [-0.019, 0.011], mean action: -0.041 [-1.224, 1.274], mean observation: 0.054 [-21.044, 20.409], loss: 0.000119, mean_squared_error: 0.000237, mean_q: 0.500857
 1298/2000: episode: 9, duration: 5.595s, episode steps: 145, steps per second: 26, episode reward: -0.832, mean reward: -0.006 [-0.019, 0.011], mean acti

 1327/2000: episode: 11, duration: 6.068s, episode steps: 108, steps per second: 18, episode reward: 0.542, mean reward: 0.005 [-0.001, 0.011], mean action: -0.003 [-1.162, 1.169], mean observation: 0.107 [-33.769, 20.171], loss: 0.000159, mean_squared_error: 0.000319, mean_q: 0.505943
 1437/2000: episode: 12, duration: 6.229s, episode steps: 110, steps per second: 18, episode reward: 0.546, mean reward: 0.005 [-0.001, 0.011], mean action: 0.023 [-1.126, 1.106], mean observation: 0.103 [-13.237, 20.251], loss: 0.000077, mean_squared_error: 0.000153, mean_q: 0.509858
 1548/2000: episode: 13, duration: 5.881s, episode steps: 111, steps per second: 19, episode reward: 0.535, mean reward: 0.005 [-0.000, 0.011], mean action: -0.011 [-1.177, 1.146], mean observation: 0.107 [-13.808, 20.247], loss: 0.000054, mean_squared_error: 0.000107, mean_q: 0.503434
 1661/2000: episode: 14, duration: 6.784s, episode steps: 113, steps per second: 17, episode reward: 0.413, mean reward: 0.004 [-0.000, 0.01

  499/2000: episode: 5, duration: 4.369s, episode steps: 102, steps per second: 23, episode reward: 0.454, mean reward: 0.004 [-0.001, 0.011], mean action: 0.005 [-1.134, 1.186], mean observation: 0.142 [-21.920, 19.911], loss: --, mean_squared_error: --, mean_q: --
  598/2000: episode: 6, duration: 4.241s, episode steps: 99, steps per second: 23, episode reward: 0.451, mean reward: 0.005 [-0.001, 0.011], mean action: -0.014 [-1.276, 1.147], mean observation: 0.140 [-22.468, 19.882], loss: --, mean_squared_error: --, mean_q: --
  696/2000: episode: 7, duration: 4.200s, episode steps: 98, steps per second: 23, episode reward: 0.416, mean reward: 0.004 [-0.001, 0.011], mean action: -0.006 [-1.205, 1.152], mean observation: 0.143 [-16.593, 19.638], loss: --, mean_squared_error: --, mean_q: --
  795/2000: episode: 8, duration: 4.350s, episode steps: 99, steps per second: 23, episode reward: 0.422, mean reward: 0.004 [-0.001, 0.011], mean action: -0.015 [-1.245, 1.185], mean observation: 0.

 1533/2000: episode: 15, duration: 4.790s, episode steps: 99, steps per second: 21, episode reward: 0.429, mean reward: 0.004 [-0.001, 0.010], mean action: -0.085 [-1.074, 1.174], mean observation: 0.135 [-10.754, 15.699], loss: 0.000093, mean_squared_error: 0.000186, mean_q: 0.508624
 1639/2000: episode: 16, duration: 5.475s, episode steps: 106, steps per second: 19, episode reward: 0.464, mean reward: 0.004 [-0.001, 0.010], mean action: -0.082 [-1.174, 1.180], mean observation: 0.137 [-34.501, 15.761], loss: 0.000256, mean_squared_error: 0.000513, mean_q: 0.508054
 1749/2000: episode: 17, duration: 5.572s, episode steps: 110, steps per second: 20, episode reward: 0.480, mean reward: 0.004 [-0.001, 0.010], mean action: -0.102 [-1.227, 1.128], mean observation: 0.138 [-10.762, 15.748], loss: 0.000069, mean_squared_error: 0.000138, mean_q: 0.505594
 1859/2000: episode: 18, duration: 5.669s, episode steps: 110, steps per second: 19, episode reward: 0.492, mean reward: 0.004 [-0.001, 0.01

  625/2000: episode: 6, duration: 4.269s, episode steps: 104, steps per second: 24, episode reward: 0.431, mean reward: 0.004 [-0.001, 0.011], mean action: -0.194 [-1.186, 1.121], mean observation: 0.135 [-41.320, 15.852], loss: --, mean_squared_error: --, mean_q: --
  728/2000: episode: 7, duration: 4.185s, episode steps: 103, steps per second: 25, episode reward: 0.428, mean reward: 0.004 [-0.001, 0.011], mean action: -0.189 [-1.152, 1.175], mean observation: 0.137 [-40.152, 15.616], loss: --, mean_squared_error: --, mean_q: --
  832/2000: episode: 8, duration: 4.216s, episode steps: 104, steps per second: 25, episode reward: 0.435, mean reward: 0.004 [-0.001, 0.011], mean action: -0.221 [-1.274, 1.169], mean observation: 0.132 [-40.931, 15.897], loss: --, mean_squared_error: --, mean_q: --
  940/2000: episode: 9, duration: 4.421s, episode steps: 108, steps per second: 24, episode reward: 0.476, mean reward: 0.004 [-0.001, 0.011], mean action: -0.226 [-1.196, 1.147], mean observation

 1825/2000: episode: 17, duration: 5.744s, episode steps: 109, steps per second: 19, episode reward: 0.449, mean reward: 0.004 [-0.001, 0.010], mean action: -0.164 [-1.183, 1.113], mean observation: 0.141 [-41.970, 15.746], loss: 0.000295, mean_squared_error: 0.000591, mean_q: 0.518794
 1937/2000: episode: 18, duration: 5.785s, episode steps: 112, steps per second: 19, episode reward: 0.462, mean reward: 0.004 [-0.002, 0.011], mean action: -0.170 [-1.184, 1.136], mean observation: 0.140 [-41.969, 15.794], loss: 0.000069, mean_squared_error: 0.000138, mean_q: 0.510851
done, took 89.465 seconds


iteration: 124
Training for 2000 steps ...
  108/2000: episode: 1, duration: 4.742s, episode steps: 108, steps per second: 23, episode reward: 0.441, mean reward: 0.004 [-0.001, 0.010], mean action: -0.146 [-1.123, 1.152], mean observation: 0.138 [-42.184, 15.541], loss: --, mean_squared_error: --, mean_q: --
  215/2000: episode: 2, duration: 4.633s, episode steps: 107, steps per second: 23, epi

 1232/2000: episode: 11, duration: 5.939s, episode steps: 119, steps per second: 20, episode reward: 0.526, mean reward: 0.004 [-0.001, 0.011], mean action: -0.129 [-1.211, 1.151], mean observation: 0.146 [-16.424, 15.914], loss: 0.000143, mean_squared_error: 0.000286, mean_q: 0.532940
 1350/2000: episode: 12, duration: 5.614s, episode steps: 118, steps per second: 21, episode reward: 0.507, mean reward: 0.004 [-0.001, 0.010], mean action: -0.119 [-1.180, 1.262], mean observation: 0.143 [-15.418, 16.284], loss: 0.000179, mean_squared_error: 0.000357, mean_q: 0.521229
 1461/2000: episode: 13, duration: 5.054s, episode steps: 111, steps per second: 22, episode reward: 0.465, mean reward: 0.004 [-0.001, 0.010], mean action: -0.123 [-1.189, 1.185], mean observation: 0.138 [-15.306, 15.546], loss: 0.000114, mean_squared_error: 0.000229, mean_q: 0.514442
 1585/2000: episode: 14, duration: 5.668s, episode steps: 124, steps per second: 22, episode reward: 0.532, mean reward: 0.004 [-0.001, 0.0

 1220/2000: episode: 9, duration: 7.933s, episode steps: 150, steps per second: 19, episode reward: 0.508, mean reward: 0.003 [-0.002, 0.011], mean action: -0.010 [-1.234, 1.220], mean observation: 0.143 [-18.373, 15.878], loss: 0.000109, mean_squared_error: 0.000217, mean_q: 0.515414
 1393/2000: episode: 10, duration: 9.187s, episode steps: 173, steps per second: 19, episode reward: 0.495, mean reward: 0.003 [-0.003, 0.011], mean action: 0.039 [-1.257, 1.336], mean observation: 0.138 [-21.754, 15.775], loss: 0.000269, mean_squared_error: 0.000539, mean_q: 0.518877
 1518/2000: episode: 11, duration: 6.266s, episode steps: 125, steps per second: 20, episode reward: 0.486, mean reward: 0.004 [-0.001, 0.010], mean action: -0.056 [-1.159, 1.159], mean observation: 0.139 [-32.079, 17.958], loss: 0.000197, mean_squared_error: 0.000393, mean_q: 0.527357
 1657/2000: episode: 12, duration: 7.523s, episode steps: 139, steps per second: 18, episode reward: 0.430, mean reward: 0.003 [-0.001, 0.010

 1071/2000: episode: 6, duration: 9.555s, episode steps: 176, steps per second: 18, episode reward: 0.698, mean reward: 0.004 [-0.003, 0.015], mean action: 0.006 [-1.192, 1.244], mean observation: 0.133 [-31.532, 17.692], loss: 0.000185, mean_squared_error: 0.000369, mean_q: 0.535754
 1235/2000: episode: 7, duration: 9.295s, episode steps: 164, steps per second: 18, episode reward: 0.720, mean reward: 0.004 [-0.003, 0.014], mean action: -0.023 [-1.346, 1.237], mean observation: 0.135 [-31.436, 17.762], loss: 0.000250, mean_squared_error: 0.000501, mean_q: 0.531270
 1398/2000: episode: 8, duration: 8.828s, episode steps: 163, steps per second: 18, episode reward: 0.720, mean reward: 0.004 [-0.003, 0.014], mean action: 0.028 [-1.328, 1.273], mean observation: 0.132 [-31.954, 17.987], loss: 0.000192, mean_squared_error: 0.000385, mean_q: 0.534768
 1554/2000: episode: 9, duration: 8.772s, episode steps: 156, steps per second: 18, episode reward: 0.700, mean reward: 0.004 [-0.003, 0.014], m

  667/2000: episode: 3, duration: 10.426s, episode steps: 206, steps per second: 20, episode reward: 0.758, mean reward: 0.004 [-0.003, 0.013], mean action: 0.028 [-1.143, 1.352], mean observation: 0.116 [-24.446, 17.433], loss: --, mean_squared_error: --, mean_q: --
  867/2000: episode: 4, duration: 9.855s, episode steps: 200, steps per second: 20, episode reward: 0.789, mean reward: 0.004 [-0.003, 0.013], mean action: -0.009 [-1.336, 1.277], mean observation: 0.120 [-25.012, 17.610], loss: --, mean_squared_error: --, mean_q: --
 1070/2000: episode: 5, duration: 10.790s, episode steps: 203, steps per second: 19, episode reward: 0.782, mean reward: 0.004 [-0.003, 0.013], mean action: -0.034 [-1.293, 1.249], mean observation: 0.121 [-24.731, 17.533], loss: 0.000085, mean_squared_error: 0.000169, mean_q: 0.541746
 1253/2000: episode: 6, duration: 10.334s, episode steps: 183, steps per second: 18, episode reward: 0.774, mean reward: 0.004 [-0.003, 0.013], mean action: 0.000 [-1.185, 1.244

 1774/2000: episode: 3, duration: 15.315s, episode steps: 344, steps per second: 22, episode reward: -0.790, mean reward: -0.002 [-0.020, 0.011], mean action: 0.145 [-1.414, 1.419], mean observation: 0.075 [-15.088, 14.999], loss: 0.000217, mean_squared_error: 0.000435, mean_q: 0.568039
done, took 76.753 seconds


iteration: 139
Training for 2000 steps ...
 1000/2000: episode: 1, duration: 39.514s, episode steps: 1000, steps per second: 25, episode reward: 0.004, mean reward: 0.000 [-0.003, 0.011], mean action: 0.117 [-1.360, 1.326], mean observation: 0.107 [-16.419, 15.104], loss: --, mean_squared_error: --, mean_q: --
 1266/2000: episode: 2, duration: 10.198s, episode steps: 266, steps per second: 26, episode reward: -0.832, mean reward: -0.003 [-0.020, 0.011], mean action: 0.166 [-1.257, 1.241], mean observation: 0.060 [-16.123, 14.955], loss: 0.000165, mean_squared_error: 0.000330, mean_q: 0.579192
done, took 84.035 seconds


iteration: 140
Training for 2000 steps ...
 1000/2000: e

  628/2000: episode: 3, duration: 10.808s, episode steps: 206, steps per second: 19, episode reward: 0.725, mean reward: 0.004 [-0.003, 0.013], mean action: -0.003 [-1.204, 1.283], mean observation: 0.111 [-38.485, 15.100], loss: --, mean_squared_error: --, mean_q: --
 1433/2000: episode: 4, duration: 33.779s, episode steps: 805, steps per second: 24, episode reward: 0.755, mean reward: 0.001 [-0.003, 0.013], mean action: 0.011 [-1.400, 1.391], mean observation: 0.118 [-15.328, 14.675], loss: 0.000183, mean_squared_error: 0.000365, mean_q: 0.590711
done, took 95.370 seconds


iteration: 145
Training for 2000 steps ...
  322/2000: episode: 1, duration: 13.370s, episode steps: 322, steps per second: 24, episode reward: 0.736, mean reward: 0.002 [-0.002, 0.013], mean action: 0.004 [-1.309, 1.352], mean observation: 0.116 [-15.529, 15.051], loss: --, mean_squared_error: --, mean_q: --
  569/2000: episode: 2, duration: 10.916s, episode steps: 247, steps per second: 23, episode reward: 0.731

  771/2000: episode: 3, duration: 8.767s, episode steps: 228, steps per second: 26, episode reward: 0.744, mean reward: 0.003 [-0.001, 0.014], mean action: -0.105 [-1.185, 1.470], mean observation: 0.113 [-24.183, 17.643], loss: --, mean_squared_error: --, mean_q: --
 1021/2000: episode: 4, duration: 9.888s, episode steps: 250, steps per second: 25, episode reward: 0.756, mean reward: 0.003 [-0.001, 0.013], mean action: -0.141 [-1.227, 1.195], mean observation: 0.111 [-22.844, 17.148], loss: 0.000124, mean_squared_error: 0.000248, mean_q: 0.615747
 1340/2000: episode: 5, duration: 14.750s, episode steps: 319, steps per second: 22, episode reward: 0.734, mean reward: 0.002 [-0.002, 0.014], mean action: -0.124 [-1.358, 1.165], mean observation: 0.111 [-23.351, 17.215], loss: 0.000292, mean_squared_error: 0.000583, mean_q: 0.614296
 1536/2000: episode: 6, duration: 10.757s, episode steps: 196, steps per second: 18, episode reward: 0.748, mean reward: 0.004 [-0.002, 0.013], mean action: -0

 1438/2000: episode: 7, duration: 11.319s, episode steps: 213, steps per second: 19, episode reward: 0.777, mean reward: 0.004 [-0.000, 0.013], mean action: 0.146 [-1.227, 1.170], mean observation: 0.113 [-22.351, 16.886], loss: 0.000195, mean_squared_error: 0.000390, mean_q: 0.613777
 1678/2000: episode: 8, duration: 13.459s, episode steps: 240, steps per second: 18, episode reward: 0.802, mean reward: 0.003 [-0.000, 0.012], mean action: 0.198 [-1.180, 1.341], mean observation: 0.120 [-19.949, 16.635], loss: 0.000297, mean_squared_error: 0.000595, mean_q: 0.615715
 1886/2000: episode: 9, duration: 11.407s, episode steps: 208, steps per second: 18, episode reward: 0.815, mean reward: 0.004 [-0.000, 0.012], mean action: 0.153 [-1.141, 1.266], mean observation: 0.125 [-19.946, 16.263], loss: 0.000312, mean_squared_error: 0.000625, mean_q: 0.607687
done, took 97.082 seconds


iteration: 153
Training for 2000 steps ...
  229/2000: episode: 1, duration: 9.959s, episode steps: 229, steps per

 1382/2000: episode: 8, duration: 8.846s, episode steps: 167, steps per second: 19, episode reward: 0.780, mean reward: 0.005 [-0.001, 0.013], mean action: 0.055 [-1.117, 1.156], mean observation: 0.108 [-22.016, 15.988], loss: 0.000261, mean_squared_error: 0.000523, mean_q: 0.616082
 1541/2000: episode: 9, duration: 8.520s, episode steps: 159, steps per second: 19, episode reward: 0.770, mean reward: 0.005 [-0.001, 0.013], mean action: 0.024 [-1.241, 1.253], mean observation: 0.103 [-23.295, 16.523], loss: 0.000279, mean_squared_error: 0.000558, mean_q: 0.606445
 1695/2000: episode: 10, duration: 7.916s, episode steps: 154, steps per second: 19, episode reward: 0.771, mean reward: 0.005 [-0.001, 0.013], mean action: -0.062 [-1.180, 1.140], mean observation: 0.114 [-32.961, 16.250], loss: 0.000219, mean_squared_error: 0.000438, mean_q: 0.606458
 1855/2000: episode: 11, duration: 8.582s, episode steps: 160, steps per second: 19, episode reward: 0.788, mean reward: 0.005 [-0.002, 0.013],

  337/2000: episode: 2, duration: 7.507s, episode steps: 169, steps per second: 23, episode reward: 0.787, mean reward: 0.005 [-0.001, 0.013], mean action: 0.051 [-1.131, 1.197], mean observation: 0.113 [-12.571, 16.246], loss: --, mean_squared_error: --, mean_q: --
  504/2000: episode: 3, duration: 7.264s, episode steps: 167, steps per second: 23, episode reward: 0.781, mean reward: 0.005 [-0.000, 0.013], mean action: 0.049 [-1.134, 1.284], mean observation: 0.111 [-11.878, 15.822], loss: --, mean_squared_error: --, mean_q: --
  680/2000: episode: 4, duration: 7.982s, episode steps: 176, steps per second: 22, episode reward: 0.808, mean reward: 0.005 [-0.001, 0.013], mean action: 0.050 [-1.224, 1.243], mean observation: 0.121 [-12.278, 16.043], loss: --, mean_squared_error: --, mean_q: --
  846/2000: episode: 5, duration: 7.389s, episode steps: 166, steps per second: 22, episode reward: 0.783, mean reward: 0.005 [-0.001, 0.013], mean action: 0.023 [-1.155, 1.126], mean observation: 0.

 1228/2000: episode: 8, duration: 8.466s, episode steps: 149, steps per second: 18, episode reward: 0.766, mean reward: 0.005 [-0.002, 0.013], mean action: -0.064 [-1.216, 1.194], mean observation: 0.111 [-48.948, 16.429], loss: 0.000438, mean_squared_error: 0.000876, mean_q: 0.604735
 1397/2000: episode: 9, duration: 9.416s, episode steps: 169, steps per second: 18, episode reward: 0.821, mean reward: 0.005 [-0.002, 0.013], mean action: -0.017 [-1.205, 1.271], mean observation: 0.127 [-15.202, 15.538], loss: 0.000151, mean_squared_error: 0.000303, mean_q: 0.616677
 1574/2000: episode: 10, duration: 7.820s, episode steps: 177, steps per second: 23, episode reward: -0.725, mean reward: -0.004 [-0.021, 0.010], mean action: -0.188 [-1.174, 1.160], mean observation: 0.083 [-8.200, 17.717], loss: 0.000244, mean_squared_error: 0.000487, mean_q: 0.599019
 1730/2000: episode: 11, duration: 8.870s, episode steps: 156, steps per second: 18, episode reward: 0.774, mean reward: 0.005 [-0.001, 0.01

 1257/2000: episode: 5, duration: 13.034s, episode steps: 234, steps per second: 18, episode reward: 0.844, mean reward: 0.004 [-0.001, 0.013], mean action: -0.068 [-1.254, 1.252], mean observation: 0.121 [-12.537, 16.893], loss: 0.000214, mean_squared_error: 0.000429, mean_q: 0.613262
 1483/2000: episode: 6, duration: 12.205s, episode steps: 226, steps per second: 19, episode reward: 0.830, mean reward: 0.004 [-0.002, 0.013], mean action: -0.157 [-1.355, 1.364], mean observation: 0.124 [-12.604, 16.996], loss: 0.000170, mean_squared_error: 0.000340, mean_q: 0.617657
 1670/2000: episode: 7, duration: 10.394s, episode steps: 187, steps per second: 18, episode reward: 0.831, mean reward: 0.004 [-0.002, 0.013], mean action: -0.093 [-1.201, 1.256], mean observation: 0.119 [-12.270, 16.765], loss: 0.000109, mean_squared_error: 0.000218, mean_q: 0.618543
 1886/2000: episode: 8, duration: 11.771s, episode steps: 216, steps per second: 18, episode reward: 0.830, mean reward: 0.004 [-0.002, 0.0

  536/2000: episode: 2, duration: 13.430s, episode steps: 268, steps per second: 20, episode reward: 0.777, mean reward: 0.003 [-0.003, 0.013], mean action: 0.022 [-1.249, 1.356], mean observation: 0.118 [-34.454, 18.867], loss: --, mean_squared_error: --, mean_q: --
  804/2000: episode: 3, duration: 13.143s, episode steps: 268, steps per second: 20, episode reward: 0.785, mean reward: 0.003 [-0.003, 0.013], mean action: -0.008 [-1.296, 1.189], mean observation: 0.119 [-34.519, 19.009], loss: --, mean_squared_error: --, mean_q: --
 1069/2000: episode: 4, duration: 13.673s, episode steps: 265, steps per second: 19, episode reward: 0.772, mean reward: 0.003 [-0.003, 0.013], mean action: 0.022 [-1.237, 1.266], mean observation: 0.114 [-34.945, 19.064], loss: 0.000299, mean_squared_error: 0.000598, mean_q: 0.632893
 1275/2000: episode: 5, duration: 13.263s, episode steps: 206, steps per second: 16, episode reward: 0.771, mean reward: 0.004 [-0.003, 0.013], mean action: 0.051 [-1.151, 1.349

 1955/2000: episode: 7, duration: 17.121s, episode steps: 307, steps per second: 18, episode reward: 0.765, mean reward: 0.002 [-0.002, 0.013], mean action: 0.130 [-1.257, 1.290], mean observation: 0.118 [-24.748, 17.436], loss: 0.000211, mean_squared_error: 0.000422, mean_q: 0.630831
done, took 104.675 seconds


iteration: 171
Training for 2000 steps ...
  295/2000: episode: 1, duration: 13.465s, episode steps: 295, steps per second: 22, episode reward: 0.736, mean reward: 0.002 [-0.002, 0.013], mean action: 0.166 [-1.138, 1.328], mean observation: 0.116 [-24.998, 17.582], loss: --, mean_squared_error: --, mean_q: --
  583/2000: episode: 2, duration: 13.200s, episode steps: 288, steps per second: 22, episode reward: 0.724, mean reward: 0.003 [-0.003, 0.013], mean action: 0.152 [-1.271, 1.274], mean observation: 0.115 [-25.123, 17.701], loss: --, mean_squared_error: --, mean_q: --
  846/2000: episode: 3, duration: 12.274s, episode steps: 263, steps per second: 21, episode reward: 0.738

  584/2000: episode: 3, duration: 9.677s, episode steps: 198, steps per second: 20, episode reward: 0.704, mean reward: 0.004 [-0.002, 0.013], mean action: 0.036 [-1.211, 1.278], mean observation: 0.110 [-24.542, 17.715], loss: --, mean_squared_error: --, mean_q: --
  785/2000: episode: 4, duration: 9.968s, episode steps: 201, steps per second: 20, episode reward: 0.705, mean reward: 0.004 [-0.001, 0.013], mean action: 0.066 [-1.151, 1.211], mean observation: 0.110 [-24.752, 17.740], loss: --, mean_squared_error: --, mean_q: --
  980/2000: episode: 5, duration: 9.603s, episode steps: 195, steps per second: 20, episode reward: 0.706, mean reward: 0.004 [-0.002, 0.013], mean action: 0.052 [-1.214, 1.205], mean observation: 0.108 [-23.342, 17.137], loss: --, mean_squared_error: --, mean_q: --
 1176/2000: episode: 6, duration: 11.297s, episode steps: 196, steps per second: 17, episode reward: 0.709, mean reward: 0.004 [-0.002, 0.014], mean action: 0.067 [-1.171, 1.156], mean observation: 0

 1471/2000: episode: 6, duration: 9.001s, episode steps: 188, steps per second: 21, episode reward: 0.479, mean reward: 0.003 [-0.002, 0.012], mean action: -0.044 [-1.240, 1.300], mean observation: 0.136 [-22.785, 17.009], loss: 0.000227, mean_squared_error: 0.000453, mean_q: 0.602196
 1659/2000: episode: 7, duration: 9.630s, episode steps: 188, steps per second: 20, episode reward: 0.462, mean reward: 0.002 [-0.002, 0.012], mean action: -0.048 [-1.270, 1.169], mean observation: 0.134 [-23.273, 17.193], loss: 0.000223, mean_squared_error: 0.000446, mean_q: 0.605073
 1841/2000: episode: 8, duration: 9.324s, episode steps: 182, steps per second: 20, episode reward: 0.458, mean reward: 0.003 [-0.002, 0.012], mean action: -0.052 [-1.223, 1.221], mean observation: 0.132 [-22.205, 16.787], loss: 0.000148, mean_squared_error: 0.000296, mean_q: 0.610768
done, took 92.446 seconds


iteration: 177
Training for 2000 steps ...
  145/2000: episode: 1, duration: 5.626s, episode steps: 145, steps per

  351/2000: episode: 2, duration: 6.661s, episode steps: 172, steps per second: 26, episode reward: 0.460, mean reward: 0.003 [-0.002, 0.011], mean action: -0.090 [-1.155, 1.168], mean observation: 0.124 [-24.701, 18.116], loss: --, mean_squared_error: --, mean_q: --
  531/2000: episode: 3, duration: 7.126s, episode steps: 180, steps per second: 25, episode reward: 0.457, mean reward: 0.003 [-0.002, 0.011], mean action: -0.131 [-1.249, 1.190], mean observation: 0.123 [-24.592, 17.858], loss: --, mean_squared_error: --, mean_q: --
  709/2000: episode: 4, duration: 6.975s, episode steps: 178, steps per second: 26, episode reward: 0.463, mean reward: 0.003 [-0.002, 0.011], mean action: -0.095 [-1.335, 1.229], mean observation: 0.124 [-24.625, 17.933], loss: --, mean_squared_error: --, mean_q: --
  887/2000: episode: 5, duration: 6.843s, episode steps: 178, steps per second: 26, episode reward: 0.458, mean reward: 0.003 [-0.002, 0.011], mean action: -0.104 [-1.363, 1.266], mean observation

  194/2000: episode: 1, duration: 7.581s, episode steps: 194, steps per second: 26, episode reward: 0.458, mean reward: 0.002 [-0.003, 0.011], mean action: -0.144 [-1.150, 1.193], mean observation: 0.132 [-22.316, 20.377], loss: --, mean_squared_error: --, mean_q: --
  391/2000: episode: 2, duration: 7.529s, episode steps: 197, steps per second: 26, episode reward: 0.455, mean reward: 0.002 [-0.003, 0.011], mean action: -0.177 [-1.247, 1.194], mean observation: 0.132 [-23.027, 20.392], loss: --, mean_squared_error: --, mean_q: --
  590/2000: episode: 3, duration: 7.659s, episode steps: 199, steps per second: 26, episode reward: 0.465, mean reward: 0.002 [-0.003, 0.011], mean action: -0.178 [-1.235, 1.105], mean observation: 0.131 [-23.864, 19.818], loss: --, mean_squared_error: --, mean_q: --
  787/2000: episode: 4, duration: 7.594s, episode steps: 197, steps per second: 26, episode reward: 0.458, mean reward: 0.002 [-0.003, 0.011], mean action: -0.142 [-1.180, 1.240], mean observation

 1877/2000: episode: 9, duration: 8.003s, episode steps: 170, steps per second: 21, episode reward: 0.495, mean reward: 0.003 [-0.002, 0.011], mean action: -0.070 [-1.244, 1.213], mean observation: 0.130 [-23.891, 18.882], loss: 0.000082, mean_squared_error: 0.000164, mean_q: 0.591629
done, took 89.850 seconds


iteration: 185
Training for 2000 steps ...
  215/2000: episode: 1, duration: 8.121s, episode steps: 215, steps per second: 26, episode reward: 0.520, mean reward: 0.002 [-0.002, 0.010], mean action: -0.070 [-1.238, 1.179], mean observation: 0.127 [-24.111, 18.770], loss: --, mean_squared_error: --, mean_q: --
  457/2000: episode: 2, duration: 9.135s, episode steps: 242, steps per second: 26, episode reward: 0.491, mean reward: 0.002 [-0.002, 0.010], mean action: -0.091 [-1.276, 1.215], mean observation: 0.124 [-23.477, 18.934], loss: --, mean_squared_error: --, mean_q: --
  681/2000: episode: 3, duration: 8.401s, episode steps: 224, steps per second: 27, episode reward: 0.514, 

 1932/2000: episode: 11, duration: 5.804s, episode steps: 175, steps per second: 30, episode reward: -0.810, mean reward: -0.005 [-0.020, 0.008], mean action: -0.165 [-1.317, 1.193], mean observation: 0.062 [-32.989, 18.274], loss: 0.000174, mean_squared_error: 0.000349, mean_q: 0.576842
done, took 84.251 seconds


iteration: 188
Training for 2000 steps ...
  257/2000: episode: 1, duration: 6.757s, episode steps: 257, steps per second: 38, episode reward: -0.855, mean reward: -0.003 [-0.019, 0.011], mean action: -0.121 [-1.234, 1.203], mean observation: 0.073 [-32.580, 18.221], loss: --, mean_squared_error: --, mean_q: --
  520/2000: episode: 2, duration: 6.904s, episode steps: 263, steps per second: 38, episode reward: -0.862, mean reward: -0.003 [-0.019, 0.011], mean action: -0.120 [-1.158, 1.125], mean observation: 0.073 [-32.623, 18.171], loss: --, mean_squared_error: --, mean_q: --
  770/2000: episode: 3, duration: 6.603s, episode steps: 250, steps per second: 38, episode reward: 

  403/2000: episode: 2, duration: 8.776s, episode steps: 204, steps per second: 23, episode reward: 0.750, mean reward: 0.004 [-0.002, 0.013], mean action: -0.082 [-1.283, 1.249], mean observation: 0.112 [-24.279, 16.957], loss: --, mean_squared_error: --, mean_q: --
  605/2000: episode: 3, duration: 8.761s, episode steps: 202, steps per second: 23, episode reward: 0.745, mean reward: 0.004 [-0.002, 0.013], mean action: -0.091 [-1.249, 1.273], mean observation: 0.111 [-24.951, 17.077], loss: --, mean_squared_error: --, mean_q: --
  807/2000: episode: 4, duration: 8.773s, episode steps: 202, steps per second: 23, episode reward: 0.737, mean reward: 0.004 [-0.002, 0.013], mean action: -0.057 [-1.187, 1.305], mean observation: 0.112 [-24.549, 16.823], loss: --, mean_squared_error: --, mean_q: --
 1008/2000: episode: 5, duration: 8.868s, episode steps: 201, steps per second: 23, episode reward: 0.744, mean reward: 0.004 [-0.002, 0.013], mean action: -0.120 [-1.305, 1.130], mean observation

  533/2000: episode: 3, duration: 7.139s, episode steps: 178, steps per second: 25, episode reward: 0.773, mean reward: 0.004 [-0.001, 0.013], mean action: -0.043 [-1.285, 1.145], mean observation: 0.113 [-23.049, 17.048], loss: --, mean_squared_error: --, mean_q: --
  707/2000: episode: 4, duration: 6.802s, episode steps: 174, steps per second: 26, episode reward: 0.748, mean reward: 0.004 [-0.001, 0.013], mean action: 0.030 [-1.082, 1.270], mean observation: 0.110 [-23.153, 17.075], loss: --, mean_squared_error: --, mean_q: --
  878/2000: episode: 5, duration: 7.013s, episode steps: 171, steps per second: 24, episode reward: 0.747, mean reward: 0.004 [-0.001, 0.013], mean action: -0.015 [-1.221, 1.117], mean observation: 0.110 [-22.812, 16.975], loss: --, mean_squared_error: --, mean_q: --
 1054/2000: episode: 6, duration: 7.080s, episode steps: 176, steps per second: 25, episode reward: 0.731, mean reward: 0.004 [-0.001, 0.012], mean action: 0.014 [-1.258, 1.322], mean observation: 

  663/2000: episode: 5, duration: 5.439s, episode steps: 139, steps per second: 26, episode reward: 0.481, mean reward: 0.003 [-0.002, 0.011], mean action: -0.050 [-1.327, 1.244], mean observation: 0.122 [-22.107, 16.719], loss: --, mean_squared_error: --, mean_q: --
  801/2000: episode: 6, duration: 5.353s, episode steps: 138, steps per second: 26, episode reward: 0.473, mean reward: 0.003 [-0.002, 0.012], mean action: -0.041 [-1.141, 1.199], mean observation: 0.122 [-23.025, 17.122], loss: --, mean_squared_error: --, mean_q: --
  928/2000: episode: 7, duration: 4.579s, episode steps: 127, steps per second: 28, episode reward: 0.424, mean reward: 0.003 [-0.002, 0.012], mean action: -0.046 [-1.150, 1.209], mean observation: 0.122 [-21.966, 16.730], loss: --, mean_squared_error: --, mean_q: --
 1056/2000: episode: 8, duration: 5.157s, episode steps: 128, steps per second: 25, episode reward: 0.446, mean reward: 0.003 [-0.002, 0.012], mean action: -0.071 [-1.312, 1.200], mean observation

  207/2000: episode: 2, duration: 4.211s, episode steps: 107, steps per second: 25, episode reward: 0.445, mean reward: 0.004 [-0.001, 0.013], mean action: -0.044 [-1.132, 1.156], mean observation: 0.125 [-10.389, 14.975], loss: --, mean_squared_error: --, mean_q: --
  312/2000: episode: 3, duration: 3.941s, episode steps: 105, steps per second: 27, episode reward: 0.428, mean reward: 0.004 [-0.001, 0.013], mean action: -0.033 [-1.087, 1.104], mean observation: 0.126 [-10.479, 15.133], loss: --, mean_squared_error: --, mean_q: --
  419/2000: episode: 4, duration: 4.251s, episode steps: 107, steps per second: 25, episode reward: 0.424, mean reward: 0.004 [-0.001, 0.012], mean action: -0.044 [-1.178, 1.171], mean observation: 0.125 [-10.304, 14.663], loss: --, mean_squared_error: --, mean_q: --
  524/2000: episode: 5, duration: 4.090s, episode steps: 105, steps per second: 26, episode reward: 0.429, mean reward: 0.004 [-0.001, 0.013], mean action: -0.041 [-1.221, 1.202], mean observation

 1339/2000: episode: 13, duration: 4.311s, episode steps: 105, steps per second: 24, episode reward: 0.439, mean reward: 0.004 [-0.002, 0.011], mean action: -0.183 [-1.197, 1.257], mean observation: 0.139 [-17.200, 15.333], loss: 0.000053, mean_squared_error: 0.000106, mean_q: 0.486861
 1443/2000: episode: 14, duration: 4.471s, episode steps: 104, steps per second: 23, episode reward: 0.435, mean reward: 0.004 [-0.002, 0.010], mean action: -0.201 [-1.110, 1.113], mean observation: 0.138 [-16.928, 14.785], loss: 0.000076, mean_squared_error: 0.000151, mean_q: 0.489129
 1548/2000: episode: 15, duration: 4.458s, episode steps: 105, steps per second: 24, episode reward: 0.434, mean reward: 0.004 [-0.002, 0.011], mean action: -0.229 [-1.179, 1.072], mean observation: 0.138 [-16.461, 15.151], loss: 0.000058, mean_squared_error: 0.000117, mean_q: 0.487902
 1648/2000: episode: 16, duration: 4.382s, episode steps: 100, steps per second: 23, episode reward: 0.426, mean reward: 0.004 [-0.002, 0.0

  287/2000: episode: 3, duration: 3.068s, episode steps: 96, steps per second: 31, episode reward: 0.423, mean reward: 0.004 [-0.002, 0.012], mean action: -0.237 [-1.152, 1.095], mean observation: 0.135 [-16.704, 15.105], loss: --, mean_squared_error: --, mean_q: --
  382/2000: episode: 4, duration: 3.042s, episode steps: 95, steps per second: 31, episode reward: 0.423, mean reward: 0.004 [-0.002, 0.012], mean action: -0.219 [-1.174, 1.180], mean observation: 0.134 [-16.489, 14.851], loss: --, mean_squared_error: --, mean_q: --
  477/2000: episode: 5, duration: 3.076s, episode steps: 95, steps per second: 31, episode reward: 0.425, mean reward: 0.004 [-0.002, 0.012], mean action: -0.234 [-1.151, 1.221], mean observation: 0.134 [-16.796, 14.765], loss: --, mean_squared_error: --, mean_q: --
  573/2000: episode: 6, duration: 2.957s, episode steps: 96, steps per second: 32, episode reward: 0.429, mean reward: 0.004 [-0.002, 0.012], mean action: -0.243 [-1.140, 1.134], mean observation: 0.

 1280/2000: episode: 13, duration: 4.096s, episode steps: 98, steps per second: 24, episode reward: 0.435, mean reward: 0.004 [-0.002, 0.012], mean action: -0.292 [-1.129, 1.148], mean observation: 0.131 [-16.302, 15.094], loss: 0.000084, mean_squared_error: 0.000167, mean_q: 0.476662
 1378/2000: episode: 14, duration: 3.838s, episode steps: 98, steps per second: 26, episode reward: 0.438, mean reward: 0.004 [-0.002, 0.012], mean action: -0.318 [-1.197, 1.107], mean observation: 0.131 [-17.004, 14.742], loss: 0.000138, mean_squared_error: 0.000277, mean_q: 0.475024
 1480/2000: episode: 15, duration: 3.939s, episode steps: 102, steps per second: 26, episode reward: 0.436, mean reward: 0.004 [-0.002, 0.011], mean action: -0.304 [-1.158, 1.150], mean observation: 0.132 [-16.185, 14.827], loss: 0.000052, mean_squared_error: 0.000105, mean_q: 0.470479
 1581/2000: episode: 16, duration: 3.878s, episode steps: 101, steps per second: 26, episode reward: 0.434, mean reward: 0.004 [-0.002, 0.012

  177/2000: episode: 2, duration: 2.810s, episode steps: 88, steps per second: 31, episode reward: 0.428, mean reward: 0.005 [-0.002, 0.012], mean action: -0.223 [-1.127, 1.085], mean observation: 0.139 [-10.110, 14.183], loss: --, mean_squared_error: --, mean_q: --
  266/2000: episode: 3, duration: 2.844s, episode steps: 89, steps per second: 31, episode reward: 0.435, mean reward: 0.005 [-0.002, 0.012], mean action: -0.218 [-1.210, 1.110], mean observation: 0.139 [-10.328, 14.829], loss: --, mean_squared_error: --, mean_q: --
  356/2000: episode: 4, duration: 2.854s, episode steps: 90, steps per second: 32, episode reward: 0.438, mean reward: 0.005 [-0.002, 0.012], mean action: -0.213 [-1.182, 1.108], mean observation: 0.140 [-10.448, 15.131], loss: --, mean_squared_error: --, mean_q: --
  446/2000: episode: 5, duration: 2.869s, episode steps: 90, steps per second: 31, episode reward: 0.437, mean reward: 0.005 [-0.002, 0.012], mean action: -0.210 [-1.180, 1.110], mean observation: 0.

  960/2000: episode: 10, duration: 2.991s, episode steps: 97, steps per second: 32, episode reward: 0.456, mean reward: 0.005 [-0.002, 0.010], mean action: -0.170 [-1.163, 1.185], mean observation: 0.141 [-16.527, 14.700], loss: --, mean_squared_error: --, mean_q: --
 1057/2000: episode: 11, duration: 3.400s, episode steps: 97, steps per second: 29, episode reward: 0.451, mean reward: 0.005 [-0.002, 0.011], mean action: -0.180 [-1.155, 1.181], mean observation: 0.140 [-16.454, 15.223], loss: 0.000070, mean_squared_error: 0.000140, mean_q: 0.456551
 1152/2000: episode: 12, duration: 3.826s, episode steps: 95, steps per second: 25, episode reward: 0.437, mean reward: 0.005 [-0.001, 0.011], mean action: -0.191 [-1.163, 1.232], mean observation: 0.143 [-10.309, 17.831], loss: 0.000248, mean_squared_error: 0.000496, mean_q: 0.452012
 1248/2000: episode: 13, duration: 3.893s, episode steps: 96, steps per second: 25, episode reward: 0.441, mean reward: 0.005 [-0.002, 0.011], mean action: -0.2

 1925/2000: episode: 20, duration: 4.125s, episode steps: 95, steps per second: 23, episode reward: 0.408, mean reward: 0.004 [-0.000, 0.010], mean action: -0.236 [-1.276, 1.094], mean observation: 0.135 [-10.320, 19.932], loss: 0.000118, mean_squared_error: 0.000237, mean_q: 0.438230
done, took 71.373 seconds


iteration: 207
Training for 2000 steps ...
   95/2000: episode: 1, duration: 3.279s, episode steps: 95, steps per second: 29, episode reward: 0.387, mean reward: 0.004 [-0.000, 0.011], mean action: -0.251 [-1.223, 1.226], mean observation: 0.134 [-10.281, 19.937], loss: --, mean_squared_error: --, mean_q: --
  195/2000: episode: 2, duration: 3.303s, episode steps: 100, steps per second: 30, episode reward: 0.452, mean reward: 0.005 [-0.000, 0.010], mean action: -0.257 [-1.174, 1.173], mean observation: 0.141 [-10.311, 19.525], loss: --, mean_squared_error: --, mean_q: --
  292/2000: episode: 3, duration: 3.184s, episode steps: 97, steps per second: 30, episode reward: 0.413, me

 1391/2000: episode: 11, duration: 6.941s, episode steps: 135, steps per second: 19, episode reward: 0.731, mean reward: 0.005 [-0.001, 0.013], mean action: -0.150 [-1.173, 1.195], mean observation: 0.098 [-23.054, 19.956], loss: 0.000104, mean_squared_error: 0.000209, mean_q: 0.434671
 1547/2000: episode: 12, duration: 7.699s, episode steps: 156, steps per second: 20, episode reward: 0.750, mean reward: 0.005 [-0.000, 0.013], mean action: -0.164 [-1.127, 1.218], mean observation: 0.101 [-10.431, 19.962], loss: 0.000102, mean_squared_error: 0.000204, mean_q: 0.439232
 1802/2000: episode: 13, duration: 12.273s, episode steps: 255, steps per second: 21, episode reward: -0.877, mean reward: -0.003 [-0.021, 0.009], mean action: -0.131 [-1.282, 1.328], mean observation: 0.049 [-39.823, 20.157], loss: 0.000142, mean_squared_error: 0.000285, mean_q: 0.428616
done, took 98.796 seconds


iteration: 209
Training for 2000 steps ...
  220/2000: episode: 1, duration: 8.592s, episode steps: 220, ste

 1153/2000: episode: 6, duration: 8.060s, episode steps: 190, steps per second: 24, episode reward: -0.794, mean reward: -0.004 [-0.021, 0.007], mean action: -0.023 [-1.241, 1.297], mean observation: 0.048 [-19.051, 20.102], loss: 0.000075, mean_squared_error: 0.000149, mean_q: 0.402398
 1340/2000: episode: 7, duration: 9.062s, episode steps: 187, steps per second: 21, episode reward: -0.762, mean reward: -0.004 [-0.021, 0.007], mean action: 0.063 [-1.136, 1.379], mean observation: 0.049 [-23.253, 20.042], loss: 0.000095, mean_squared_error: 0.000191, mean_q: 0.409903
 1537/2000: episode: 8, duration: 9.229s, episode steps: 197, steps per second: 21, episode reward: -0.774, mean reward: -0.004 [-0.021, 0.007], mean action: 0.058 [-1.326, 1.197], mean observation: 0.050 [-18.483, 19.746], loss: 0.000123, mean_squared_error: 0.000247, mean_q: 0.414849
 1713/2000: episode: 9, duration: 8.176s, episode steps: 176, steps per second: 22, episode reward: -0.786, mean reward: -0.004 [-0.021, 0

 1208/2000: episode: 10, duration: 5.002s, episode steps: 119, steps per second: 24, episode reward: -0.739, mean reward: -0.006 [-0.021, 0.009], mean action: 0.087 [-1.166, 1.137], mean observation: 0.011 [-20.896, 19.055], loss: 0.000088, mean_squared_error: 0.000176, mean_q: 0.388373
 1324/2000: episode: 11, duration: 5.231s, episode steps: 116, steps per second: 22, episode reward: -0.706, mean reward: -0.006 [-0.021, 0.009], mean action: 0.130 [-1.126, 1.214], mean observation: 0.015 [-23.853, 19.110], loss: 0.000081, mean_squared_error: 0.000162, mean_q: 0.385710
 1440/2000: episode: 12, duration: 4.995s, episode steps: 116, steps per second: 23, episode reward: -0.697, mean reward: -0.006 [-0.020, 0.008], mean action: 0.005 [-1.240, 1.200], mean observation: 0.016 [-25.110, 18.995], loss: 0.000130, mean_squared_error: 0.000261, mean_q: 0.391988
 1558/2000: episode: 13, duration: 5.417s, episode steps: 118, steps per second: 22, episode reward: -0.716, mean reward: -0.006 [-0.021

 1778/2000: episode: 4, duration: 13.051s, episode steps: 228, steps per second: 17, episode reward: 0.719, mean reward: 0.003 [-0.002, 0.013], mean action: 0.056 [-1.370, 1.339], mean observation: 0.112 [-10.408, 19.795], loss: 0.000164, mean_squared_error: 0.000328, mean_q: 0.370824
 1985/2000: episode: 5, duration: 12.288s, episode steps: 207, steps per second: 17, episode reward: 0.642, mean reward: 0.003 [-0.002, 0.011], mean action: -0.020 [-1.277, 1.221], mean observation: 0.117 [-16.212, 20.067], loss: 0.000204, mean_squared_error: 0.000407, mean_q: 0.374353
done, took 96.440 seconds


iteration: 217
Training for 2000 steps ...
  194/2000: episode: 1, duration: 9.596s, episode steps: 194, steps per second: 20, episode reward: 0.588, mean reward: 0.003 [-0.001, 0.011], mean action: 0.015 [-1.349, 1.298], mean observation: 0.115 [-12.388, 19.729], loss: --, mean_squared_error: --, mean_q: --
  378/2000: episode: 2, duration: 8.985s, episode steps: 184, steps per second: 20, episo

 1683/2000: episode: 9, duration: 11.215s, episode steps: 188, steps per second: 17, episode reward: 0.731, mean reward: 0.004 [-0.003, 0.013], mean action: -0.218 [-1.278, 1.142], mean observation: 0.146 [-20.568, 18.279], loss: 0.000230, mean_squared_error: 0.000459, mean_q: 0.386691
 1964/2000: episode: 10, duration: 16.479s, episode steps: 281, steps per second: 17, episode reward: 0.735, mean reward: 0.003 [-0.001, 0.013], mean action: -0.164 [-1.234, 1.265], mean observation: 0.128 [-25.064, 14.832], loss: 0.000133, mean_squared_error: 0.000265, mean_q: 0.377213
done, took 108.932 seconds


iteration: 220
Training for 2000 steps ...
  234/2000: episode: 1, duration: 12.025s, episode steps: 234, steps per second: 19, episode reward: 0.728, mean reward: 0.003 [-0.002, 0.013], mean action: -0.163 [-1.159, 1.210], mean observation: 0.125 [-31.552, 14.978], loss: --, mean_squared_error: --, mean_q: --
  469/2000: episode: 2, duration: 11.894s, episode steps: 235, steps per second: 20,

 1270/2000: episode: 5, duration: 14.389s, episode steps: 257, steps per second: 18, episode reward: 0.835, mean reward: 0.003 [-0.002, 0.014], mean action: -0.092 [-1.302, 1.285], mean observation: 0.127 [-21.495, 16.738], loss: 0.000333, mean_squared_error: 0.000667, mean_q: 0.389697
 1523/2000: episode: 6, duration: 13.793s, episode steps: 253, steps per second: 18, episode reward: 0.757, mean reward: 0.003 [-0.002, 0.013], mean action: -0.040 [-1.257, 1.250], mean observation: 0.117 [-21.926, 16.846], loss: 0.000164, mean_squared_error: 0.000327, mean_q: 0.382835
 1772/2000: episode: 7, duration: 13.949s, episode steps: 249, steps per second: 18, episode reward: 0.739, mean reward: 0.003 [-0.002, 0.013], mean action: -0.042 [-1.254, 1.154], mean observation: 0.113 [-20.247, 16.275], loss: 0.000153, mean_squared_error: 0.000306, mean_q: 0.382901
done, took 101.245 seconds


iteration: 224
Training for 2000 steps ...
  271/2000: episode: 1, duration: 12.360s, episode steps: 271, step

  799/2000: episode: 5, duration: 4.192s, episode steps: 162, steps per second: 39, episode reward: -0.689, mean reward: -0.004 [-0.019, 0.009], mean action: -0.078 [-1.178, 1.201], mean observation: 0.088 [-26.492, 18.694], loss: --, mean_squared_error: --, mean_q: --
  965/2000: episode: 6, duration: 4.416s, episode steps: 166, steps per second: 38, episode reward: -0.701, mean reward: -0.004 [-0.019, 0.009], mean action: -0.092 [-1.263, 1.218], mean observation: 0.088 [-26.223, 18.715], loss: --, mean_squared_error: --, mean_q: --
 1134/2000: episode: 7, duration: 5.915s, episode steps: 169, steps per second: 29, episode reward: -0.704, mean reward: -0.004 [-0.019, 0.009], mean action: -0.118 [-1.318, 1.162], mean observation: 0.085 [-25.349, 18.530], loss: 0.000155, mean_squared_error: 0.000309, mean_q: 0.395636
 1401/2000: episode: 8, duration: 12.422s, episode steps: 267, steps per second: 21, episode reward: 0.853, mean reward: 0.003 [-0.003, 0.013], mean action: -0.120 [-1.301,

 1140/2000: episode: 6, duration: 10.238s, episode steps: 181, steps per second: 18, episode reward: 0.786, mean reward: 0.004 [-0.001, 0.014], mean action: -0.007 [-1.218, 1.193], mean observation: 0.108 [-49.700, 17.431], loss: 0.000188, mean_squared_error: 0.000377, mean_q: 0.380932
 1302/2000: episode: 7, duration: 8.865s, episode steps: 162, steps per second: 18, episode reward: 0.747, mean reward: 0.005 [-0.001, 0.013], mean action: 0.013 [-1.169, 1.152], mean observation: 0.099 [-31.603, 17.386], loss: 0.000361, mean_squared_error: 0.000722, mean_q: 0.382404
 1458/2000: episode: 8, duration: 8.800s, episode steps: 156, steps per second: 18, episode reward: 0.729, mean reward: 0.005 [-0.001, 0.013], mean action: 0.021 [-1.116, 1.177], mean observation: 0.100 [-22.770, 17.107], loss: 0.000267, mean_squared_error: 0.000534, mean_q: 0.387546
 1613/2000: episode: 9, duration: 8.425s, episode steps: 155, steps per second: 18, episode reward: 0.743, mean reward: 0.005 [-0.001, 0.013], 

 1890/2000: episode: 5, duration: 8.118s, episode steps: 155, steps per second: 19, episode reward: 0.779, mean reward: 0.005 [-0.002, 0.013], mean action: -0.123 [-1.227, 1.234], mean observation: 0.110 [-24.343, 17.988], loss: 0.000325, mean_squared_error: 0.000649, mean_q: 0.399483
done, took 71.434 seconds


iteration: 238
Training for 2000 steps ...
  214/2000: episode: 1, duration: 9.024s, episode steps: 214, steps per second: 24, episode reward: 0.777, mean reward: 0.004 [-0.002, 0.014], mean action: -0.114 [-1.230, 1.178], mean observation: 0.115 [-28.238, 18.569], loss: --, mean_squared_error: --, mean_q: --
 1214/2000: episode: 2, duration: 42.995s, episode steps: 1000, steps per second: 23, episode reward: 0.703, mean reward: 0.001 [-0.003, 0.012], mean action: -0.061 [-1.308, 1.540], mean observation: 0.119 [-26.068, 17.930], loss: 0.000220, mean_squared_error: 0.000440, mean_q: 0.415541
 1370/2000: episode: 3, duration: 8.306s, episode steps: 156, steps per second: 19, epi

 1762/2000: episode: 11, duration: 7.988s, episode steps: 158, steps per second: 20, episode reward: 0.775, mean reward: 0.005 [-0.001, 0.013], mean action: -0.062 [-1.144, 1.268], mean observation: 0.105 [-18.823, 15.091], loss: 0.000126, mean_squared_error: 0.000253, mean_q: 0.406952
 1917/2000: episode: 12, duration: 7.706s, episode steps: 155, steps per second: 20, episode reward: 0.788, mean reward: 0.005 [-0.001, 0.013], mean action: -0.057 [-1.262, 1.189], mean observation: 0.107 [-19.185, 15.339], loss: 0.000428, mean_squared_error: 0.000856, mean_q: 0.406441
done, took 91.784 seconds


iteration: 241
Training for 2000 steps ...
  154/2000: episode: 1, duration: 6.012s, episode steps: 154, steps per second: 26, episode reward: 0.770, mean reward: 0.005 [-0.001, 0.013], mean action: -0.091 [-1.173, 1.183], mean observation: 0.104 [-19.172, 15.491], loss: --, mean_squared_error: --, mean_q: --
  310/2000: episode: 2, duration: 5.915s, episode steps: 156, steps per second: 26, epi

done, took 88.718 seconds


iteration: 243
Training for 2000 steps ...
  126/2000: episode: 1, duration: 5.038s, episode steps: 126, steps per second: 25, episode reward: 0.718, mean reward: 0.006 [-0.001, 0.014], mean action: -0.134 [-1.305, 1.311], mean observation: 0.095 [-43.421, 15.183], loss: --, mean_squared_error: --, mean_q: --
  252/2000: episode: 2, duration: 5.092s, episode steps: 126, steps per second: 25, episode reward: 0.714, mean reward: 0.006 [-0.001, 0.014], mean action: -0.098 [-1.289, 1.222], mean observation: 0.090 [-43.505, 15.155], loss: --, mean_squared_error: --, mean_q: --
  376/2000: episode: 3, duration: 4.811s, episode steps: 124, steps per second: 26, episode reward: 0.703, mean reward: 0.006 [-0.001, 0.014], mean action: -0.133 [-1.294, 1.170], mean observation: 0.089 [-43.855, 14.636], loss: --, mean_squared_error: --, mean_q: --
  502/2000: episode: 4, duration: 5.002s, episode steps: 126, steps per second: 25, episode reward: 0.729, mean reward: 0.006

 1580/2000: episode: 4, duration: 12.021s, episode steps: 223, steps per second: 19, episode reward: 0.730, mean reward: 0.003 [-0.004, 0.013], mean action: -0.168 [-1.208, 1.297], mean observation: 0.115 [-42.528, 17.145], loss: 0.000194, mean_squared_error: 0.000388, mean_q: 0.420347
 1843/2000: episode: 5, duration: 13.503s, episode steps: 263, steps per second: 19, episode reward: 0.774, mean reward: 0.003 [-0.004, 0.013], mean action: -0.172 [-1.286, 1.315], mean observation: 0.119 [-23.745, 17.854], loss: 0.000133, mean_squared_error: 0.000267, mean_q: 0.416292
done, took 89.575 seconds


iteration: 247
Training for 2000 steps ...
  227/2000: episode: 1, duration: 9.320s, episode steps: 227, steps per second: 24, episode reward: 0.771, mean reward: 0.003 [-0.004, 0.013], mean action: -0.107 [-1.151, 1.231], mean observation: 0.116 [-22.644, 17.460], loss: --, mean_squared_error: --, mean_q: --
  466/2000: episode: 2, duration: 10.044s, episode steps: 239, steps per second: 24, ep

 1678/2000: episode: 7, duration: 14.275s, episode steps: 325, steps per second: 23, episode reward: 0.489, mean reward: 0.002 [-0.002, 0.010], mean action: -0.089 [-1.361, 1.324], mean observation: 0.131 [-18.862, 15.695], loss: 0.000187, mean_squared_error: 0.000375, mean_q: 0.436425
 1990/2000: episode: 8, duration: 15.811s, episode steps: 312, steps per second: 20, episode reward: 0.644, mean reward: 0.002 [-0.003, 0.012], mean action: -0.106 [-1.227, 1.118], mean observation: 0.123 [-18.881, 16.076], loss: 0.000243, mean_squared_error: 0.000486, mean_q: 0.440494
done, took 85.841 seconds


iteration: 253
Training for 2000 steps ...
  284/2000: episode: 1, duration: 11.609s, episode steps: 284, steps per second: 24, episode reward: 0.654, mean reward: 0.002 [-0.003, 0.012], mean action: -0.154 [-1.390, 1.480], mean observation: 0.123 [-19.349, 15.968], loss: --, mean_squared_error: --, mean_q: --
  542/2000: episode: 2, duration: 10.991s, episode steps: 258, steps per second: 23, e

 1456/2000: episode: 7, duration: 8.133s, episode steps: 245, steps per second: 30, episode reward: -0.824, mean reward: -0.003 [-0.019, 0.010], mean action: -0.149 [-1.190, 1.336], mean observation: 0.100 [-23.118, 17.660], loss: 0.000088, mean_squared_error: 0.000176, mean_q: 0.453506
 1734/2000: episode: 8, duration: 8.972s, episode steps: 278, steps per second: 31, episode reward: -0.853, mean reward: -0.003 [-0.019, 0.011], mean action: -0.127 [-1.157, 1.212], mean observation: 0.098 [-21.870, 17.181], loss: 0.000159, mean_squared_error: 0.000318, mean_q: 0.454745
 1925/2000: episode: 9, duration: 6.626s, episode steps: 191, steps per second: 29, episode reward: -0.809, mean reward: -0.004 [-0.019, 0.010], mean action: -0.161 [-1.295, 1.176], mean observation: 0.096 [-21.964, 17.259], loss: 0.000197, mean_squared_error: 0.000393, mean_q: 0.448830
done, took 59.351 seconds


iteration: 260
Training for 2000 steps ...
  226/2000: episode: 1, duration: 5.734s, episode steps: 226, ste

 1844/2000: episode: 8, duration: 8.838s, episode steps: 173, steps per second: 20, episode reward: 0.463, mean reward: 0.003 [-0.003, 0.010], mean action: -0.256 [-1.304, 1.160], mean observation: 0.136 [-18.624, 20.097], loss: 0.000153, mean_squared_error: 0.000306, mean_q: 0.501312
done, took 93.433 seconds


iteration: 265
Training for 2000 steps ...
  184/2000: episode: 1, duration: 7.652s, episode steps: 184, steps per second: 24, episode reward: 0.464, mean reward: 0.003 [-0.004, 0.010], mean action: -0.179 [-1.304, 1.214], mean observation: 0.132 [-18.049, 20.146], loss: --, mean_squared_error: --, mean_q: --
  362/2000: episode: 2, duration: 7.485s, episode steps: 178, steps per second: 24, episode reward: 0.445, mean reward: 0.002 [-0.004, 0.010], mean action: -0.199 [-1.211, 1.189], mean observation: 0.131 [-18.366, 19.910], loss: --, mean_squared_error: --, mean_q: --
  554/2000: episode: 3, duration: 7.881s, episode steps: 192, steps per second: 24, episode reward: 0.428, 

  404/2000: episode: 2, duration: 12.698s, episode steps: 258, steps per second: 20, episode reward: 0.896, mean reward: 0.003 [-0.004, 0.013], mean action: -0.254 [-1.356, 1.141], mean observation: 0.154 [-13.932, 19.794], loss: --, mean_squared_error: --, mean_q: --
  619/2000: episode: 3, duration: 10.940s, episode steps: 215, steps per second: 20, episode reward: 0.834, mean reward: 0.004 [-0.003, 0.013], mean action: -0.258 [-1.205, 1.230], mean observation: 0.142 [-11.328, 19.610], loss: --, mean_squared_error: --, mean_q: --
  863/2000: episode: 4, duration: 12.075s, episode steps: 244, steps per second: 20, episode reward: 0.866, mean reward: 0.004 [-0.003, 0.013], mean action: -0.270 [-1.218, 1.174], mean observation: 0.151 [-16.914, 19.731], loss: --, mean_squared_error: --, mean_q: --
 1103/2000: episode: 5, duration: 12.875s, episode steps: 240, steps per second: 19, episode reward: 0.873, mean reward: 0.004 [-0.003, 0.013], mean action: -0.267 [-1.276, 1.232], mean observa

 1332/2000: episode: 10, duration: 8.041s, episode steps: 131, steps per second: 16, episode reward: 0.602, mean reward: 0.005 [-0.004, 0.012], mean action: -0.167 [-1.224, 1.165], mean observation: 0.110 [-11.377, 19.740], loss: 0.000180, mean_squared_error: 0.000360, mean_q: 0.496208
 1560/2000: episode: 11, duration: 13.613s, episode steps: 228, steps per second: 17, episode reward: 0.769, mean reward: 0.003 [-0.004, 0.013], mean action: -0.226 [-1.297, 1.276], mean observation: 0.134 [-37.283, 19.835], loss: 0.000068, mean_squared_error: 0.000136, mean_q: 0.491205
 1747/2000: episode: 12, duration: 11.067s, episode steps: 187, steps per second: 17, episode reward: 0.703, mean reward: 0.004 [-0.004, 0.014], mean action: -0.206 [-1.274, 1.190], mean observation: 0.127 [-24.515, 20.192], loss: 0.000120, mean_squared_error: 0.000241, mean_q: 0.495805
 1886/2000: episode: 13, duration: 8.897s, episode steps: 139, steps per second: 16, episode reward: 0.593, mean reward: 0.004 [-0.004, 0

  704/2000: episode: 3, duration: 11.446s, episode steps: 235, steps per second: 21, episode reward: 0.712, mean reward: 0.003 [-0.004, 0.014], mean action: -0.182 [-1.350, 1.293], mean observation: 0.125 [-24.151, 19.690], loss: --, mean_squared_error: --, mean_q: --
  931/2000: episode: 4, duration: 10.926s, episode steps: 227, steps per second: 21, episode reward: 0.704, mean reward: 0.003 [-0.003, 0.014], mean action: -0.190 [-1.436, 1.351], mean observation: 0.127 [-11.772, 19.965], loss: --, mean_squared_error: --, mean_q: --
 1170/2000: episode: 5, duration: 13.199s, episode steps: 239, steps per second: 18, episode reward: 0.709, mean reward: 0.003 [-0.004, 0.013], mean action: -0.188 [-1.257, 1.243], mean observation: 0.124 [-14.844, 20.077], loss: 0.000101, mean_squared_error: 0.000202, mean_q: 0.501901
 1282/2000: episode: 6, duration: 7.721s, episode steps: 112, steps per second: 15, episode reward: 0.555, mean reward: 0.005 [-0.003, 0.013], mean action: -0.159 [-1.136, 1.0

 1050/2000: episode: 5, duration: 9.498s, episode steps: 208, steps per second: 22, episode reward: 0.676, mean reward: 0.003 [-0.003, 0.013], mean action: -0.065 [-1.290, 1.314], mean observation: 0.116 [-45.008, 17.650], loss: 0.000213, mean_squared_error: 0.000426, mean_q: 0.489307
 1219/2000: episode: 6, duration: 9.074s, episode steps: 169, steps per second: 19, episode reward: 0.611, mean reward: 0.004 [-0.003, 0.016], mean action: -0.085 [-1.179, 1.201], mean observation: 0.102 [-45.575, 15.534], loss: 0.000097, mean_squared_error: 0.000194, mean_q: 0.495232
 1380/2000: episode: 7, duration: 7.916s, episode steps: 161, steps per second: 20, episode reward: 0.541, mean reward: 0.003 [-0.004, 0.015], mean action: -0.143 [-1.208, 1.150], mean observation: 0.106 [-15.105, 15.036], loss: 0.000133, mean_squared_error: 0.000265, mean_q: 0.492804
 1506/2000: episode: 8, duration: 8.544s, episode steps: 126, steps per second: 15, episode reward: 0.578, mean reward: 0.005 [-0.004, 0.013],

 1146/2000: episode: 10, duration: 5.609s, episode steps: 114, steps per second: 20, episode reward: 0.710, mean reward: 0.006 [-0.001, 0.012], mean action: -0.162 [-1.263, 1.241], mean observation: 0.127 [-15.147, 14.710], loss: 0.000031, mean_squared_error: 0.000062, mean_q: 0.493156
 1262/2000: episode: 11, duration: 5.523s, episode steps: 116, steps per second: 21, episode reward: 0.723, mean reward: 0.006 [-0.001, 0.013], mean action: -0.175 [-1.123, 1.302], mean observation: 0.127 [-15.297, 15.118], loss: 0.000134, mean_squared_error: 0.000268, mean_q: 0.492243
 1541/2000: episode: 12, duration: 12.028s, episode steps: 279, steps per second: 23, episode reward: -0.784, mean reward: -0.003 [-0.020, 0.011], mean action: -0.100 [-1.185, 1.270], mean observation: 0.086 [-30.909, 19.096], loss: 0.000172, mean_squared_error: 0.000344, mean_q: 0.487038
 1913/2000: episode: 13, duration: 16.338s, episode steps: 372, steps per second: 23, episode reward: -0.751, mean reward: -0.002 [-0.01

  920/2000: episode: 10, duration: 2.593s, episode steps: 93, steps per second: 36, episode reward: 0.449, mean reward: 0.005 [-0.003, 0.010], mean action: -0.344 [-1.174, 1.069], mean observation: 0.140 [-17.641, 15.261], loss: --, mean_squared_error: --, mean_q: --
 1012/2000: episode: 11, duration: 2.725s, episode steps: 92, steps per second: 34, episode reward: 0.440, mean reward: 0.005 [-0.003, 0.010], mean action: -0.313 [-1.137, 1.128], mean observation: 0.141 [-17.510, 15.532], loss: 0.000038, mean_squared_error: 0.000077, mean_q: 0.495192
 1100/2000: episode: 12, duration: 3.512s, episode steps: 88, steps per second: 25, episode reward: 0.419, mean reward: 0.005 [-0.003, 0.010], mean action: -0.326 [-1.131, 1.108], mean observation: 0.137 [-17.108, 15.231], loss: 0.000032, mean_squared_error: 0.000063, mean_q: 0.486448
 1196/2000: episode: 13, duration: 3.594s, episode steps: 96, steps per second: 27, episode reward: 0.452, mean reward: 0.005 [-0.002, 0.010], mean action: -0.3

 1524/2000: episode: 4, duration: 7.497s, episode steps: 171, steps per second: 23, episode reward: 0.559, mean reward: 0.003 [-0.002, 0.011], mean action: -0.068 [-1.194, 1.178], mean observation: 0.120 [-13.537, 18.511], loss: 0.000062, mean_squared_error: 0.000124, mean_q: 0.460464
 1703/2000: episode: 5, duration: 7.854s, episode steps: 179, steps per second: 23, episode reward: 0.582, mean reward: 0.003 [-0.001, 0.011], mean action: -0.082 [-1.118, 1.211], mean observation: 0.121 [-13.660, 17.858], loss: 0.000031, mean_squared_error: 0.000062, mean_q: 0.468737
 1895/2000: episode: 6, duration: 9.755s, episode steps: 192, steps per second: 20, episode reward: 0.549, mean reward: 0.003 [-0.002, 0.011], mean action: -0.086 [-1.198, 1.181], mean observation: 0.121 [-24.215, 17.882], loss: 0.000079, mean_squared_error: 0.000158, mean_q: 0.465609
done, took 83.670 seconds


iteration: 287
Training for 2000 steps ...
  209/2000: episode: 1, duration: 8.400s, episode steps: 209, steps per

 1656/2000: episode: 2, duration: 31.666s, episode steps: 735, steps per second: 23, episode reward: 0.543, mean reward: 0.001 [-0.002, 0.010], mean action: 0.001 [-1.325, 1.412], mean observation: 0.117 [-13.876, 18.312], loss: 0.000082, mean_squared_error: 0.000164, mean_q: 0.456786
done, took 76.842 seconds


iteration: 291
Training for 2000 steps ...
 1000/2000: episode: 1, duration: 35.095s, episode steps: 1000, steps per second: 28, episode reward: 0.135, mean reward: 0.000 [-0.003, 0.010], mean action: 0.093 [-1.481, 1.553], mean observation: 0.106 [-16.891, 18.183], loss: --, mean_squared_error: --, mean_q: --
 1692/2000: episode: 2, duration: 33.521s, episode steps: 692, steps per second: 21, episode reward: 0.540, mean reward: 0.001 [-0.003, 0.011], mean action: -0.007 [-1.337, 1.432], mean observation: 0.116 [-18.858, 18.423], loss: 0.000070, mean_squared_error: 0.000139, mean_q: 0.452885
 1933/2000: episode: 3, duration: 11.668s, episode steps: 241, steps per second: 21, ep

 1865/2000: episode: 10, duration: 6.248s, episode steps: 157, steps per second: 25, episode reward: 0.520, mean reward: 0.003 [-0.001, 0.010], mean action: 0.175 [-1.360, 1.263], mean observation: 0.127 [-10.229, 18.516], loss: 0.000151, mean_squared_error: 0.000302, mean_q: 0.449471
done, took 78.606 seconds


iteration: 295
Training for 2000 steps ...
  168/2000: episode: 1, duration: 5.697s, episode steps: 168, steps per second: 29, episode reward: 0.555, mean reward: 0.003 [-0.001, 0.010], mean action: 0.225 [-1.131, 1.154], mean observation: 0.130 [-15.739, 18.023], loss: --, mean_squared_error: --, mean_q: --
  342/2000: episode: 2, duration: 5.696s, episode steps: 174, steps per second: 31, episode reward: 0.540, mean reward: 0.003 [-0.001, 0.010], mean action: 0.216 [-1.172, 1.189], mean observation: 0.130 [-15.497, 18.331], loss: --, mean_squared_error: --, mean_q: --
  505/2000: episode: 3, duration: 5.436s, episode steps: 163, steps per second: 30, episode reward: 0.548, me

 1514/2000: episode: 3, duration: 6.510s, episode steps: 195, steps per second: 30, episode reward: -0.867, mean reward: -0.004 [-0.019, 0.010], mean action: -0.003 [-1.179, 1.166], mean observation: 0.082 [-10.845, 20.253], loss: 0.000089, mean_squared_error: 0.000178, mean_q: 0.418207
 1678/2000: episode: 4, duration: 5.686s, episode steps: 164, steps per second: 29, episode reward: -0.873, mean reward: -0.005 [-0.020, 0.010], mean action: -0.003 [-1.219, 1.159], mean observation: 0.064 [-14.154, 20.589], loss: 0.000064, mean_squared_error: 0.000128, mean_q: 0.417564
 1830/2000: episode: 5, duration: 5.276s, episode steps: 152, steps per second: 29, episode reward: -0.858, mean reward: -0.006 [-0.020, 0.010], mean action: 0.011 [-1.118, 1.142], mean observation: 0.065 [-14.277, 20.423], loss: 0.000095, mean_squared_error: 0.000191, mean_q: 0.414162
 1992/2000: episode: 6, duration: 5.353s, episode steps: 162, steps per second: 30, episode reward: -0.861, mean reward: -0.005 [-0.020, 

 1396/2000: episode: 7, duration: 8.798s, episode steps: 236, steps per second: 27, episode reward: -0.802, mean reward: -0.003 [-0.020, 0.011], mean action: 0.006 [-1.378, 1.333], mean observation: 0.062 [-23.119, 20.198], loss: 0.000051, mean_squared_error: 0.000102, mean_q: 0.384874
 1549/2000: episode: 8, duration: 5.610s, episode steps: 153, steps per second: 27, episode reward: -0.815, mean reward: -0.005 [-0.020, 0.011], mean action: -0.009 [-1.253, 1.168], mean observation: 0.033 [-26.117, 20.396], loss: 0.000035, mean_squared_error: 0.000071, mean_q: 0.386748
 1693/2000: episode: 9, duration: 5.420s, episode steps: 144, steps per second: 27, episode reward: -0.813, mean reward: -0.006 [-0.021, 0.011], mean action: 0.051 [-1.189, 1.176], mean observation: 0.028 [-27.144, 20.541], loss: 0.000032, mean_squared_error: 0.000064, mean_q: 0.384321
 1862/2000: episode: 10, duration: 6.050s, episode steps: 169, steps per second: 28, episode reward: -0.811, mean reward: -0.005 [-0.020, 

 1000/2000: episode: 1, duration: 23.785s, episode steps: 1000, steps per second: 42, episode reward: 0.101, mean reward: 0.000 [-0.002, 0.010], mean action: 0.067 [-1.475, 1.273], mean observation: 0.113 [-10.715, 19.838], loss: --, mean_squared_error: --, mean_q: --
 2000/2000: episode: 2, duration: 45.313s, episode steps: 1000, steps per second: 22, episode reward: 0.082, mean reward: 0.000 [-0.002, 0.010], mean action: 0.147 [-1.373, 1.520], mean observation: 0.116 [-10.720, 19.902], loss: 0.000064, mean_squared_error: 0.000127, mean_q: 0.396501
done, took 69.116 seconds


iteration: 317
Training for 2000 steps ...
 1000/2000: episode: 1, duration: 24.634s, episode steps: 1000, steps per second: 41, episode reward: 0.096, mean reward: 0.000 [-0.004, 0.007], mean action: 0.271 [-1.343, 1.424], mean observation: 0.119 [-8.179, 21.110], loss: --, mean_squared_error: --, mean_q: --
 2000/2000: episode: 2, duration: 35.887s, episode steps: 1000, steps per second: 28, episode reward: 0.0

 2000/2000: episode: 2, duration: 40.160s, episode steps: 1000, steps per second: 25, episode reward: 0.138, mean reward: 0.000 [-0.002, 0.007], mean action: 0.134 [-1.368, 1.422], mean observation: 0.119 [-15.285, 20.689], loss: 0.000141, mean_squared_error: 0.000281, mean_q: 0.392358
done, took 76.011 seconds


iteration: 327
Training for 2000 steps ...
 1000/2000: episode: 1, duration: 41.578s, episode steps: 1000, steps per second: 24, episode reward: 0.142, mean reward: 0.000 [-0.001, 0.007], mean action: 0.073 [-1.470, 1.474], mean observation: 0.122 [-14.978, 20.981], loss: --, mean_squared_error: --, mean_q: --
 2000/2000: episode: 2, duration: 43.841s, episode steps: 1000, steps per second: 23, episode reward: 0.107, mean reward: 0.000 [-0.001, 0.008], mean action: 0.112 [-1.311, 1.397], mean observation: 0.123 [-13.077, 20.827], loss: 0.000115, mean_squared_error: 0.000230, mean_q: 0.388702
done, took 85.436 seconds


iteration: 328
Training for 2000 steps ...
 1000/2000: epi

  807/2000: episode: 5, duration: 6.039s, episode steps: 168, steps per second: 28, episode reward: 0.565, mean reward: 0.003 [-0.002, 0.011], mean action: 0.239 [-1.201, 1.153], mean observation: 0.113 [-8.595, 13.885], loss: --, mean_squared_error: --, mean_q: --
  967/2000: episode: 6, duration: 5.620s, episode steps: 160, steps per second: 28, episode reward: 0.564, mean reward: 0.004 [-0.003, 0.011], mean action: 0.246 [-1.202, 1.234], mean observation: 0.115 [-8.393, 13.203], loss: --, mean_squared_error: --, mean_q: --
 1125/2000: episode: 7, duration: 6.884s, episode steps: 158, steps per second: 23, episode reward: 0.537, mean reward: 0.003 [-0.002, 0.011], mean action: 0.269 [-1.181, 1.237], mean observation: 0.115 [-9.161, 13.512], loss: 0.000027, mean_squared_error: 0.000053, mean_q: 0.387164
 1275/2000: episode: 8, duration: 6.371s, episode steps: 150, steps per second: 24, episode reward: 0.612, mean reward: 0.004 [-0.003, 0.011], mean action: 0.243 [-1.196, 1.247], mean 

 1537/2000: episode: 10, duration: 8.485s, episode steps: 188, steps per second: 22, episode reward: 0.634, mean reward: 0.003 [-0.002, 0.012], mean action: 0.489 [-1.212, 1.269], mean observation: 0.107 [-22.874, 19.189], loss: 0.000095, mean_squared_error: 0.000190, mean_q: 0.388586
 1702/2000: episode: 11, duration: 7.409s, episode steps: 165, steps per second: 22, episode reward: 0.625, mean reward: 0.004 [-0.002, 0.012], mean action: 0.460 [-1.224, 1.240], mean observation: 0.099 [-28.687, 19.418], loss: 0.000070, mean_squared_error: 0.000141, mean_q: 0.380015
 1854/2000: episode: 12, duration: 7.250s, episode steps: 152, steps per second: 21, episode reward: 0.640, mean reward: 0.004 [-0.001, 0.012], mean action: 0.496 [-1.225, 1.300], mean observation: 0.105 [-21.663, 19.617], loss: 0.000167, mean_squared_error: 0.000334, mean_q: 0.380623
done, took 82.778 seconds


iteration: 341
Training for 2000 steps ...
  143/2000: episode: 1, duration: 5.400s, episode steps: 143, steps per

 1062/2000: episode: 7, duration: 5.146s, episode steps: 144, steps per second: 28, episode reward: -0.804, mean reward: -0.006 [-0.020, 0.009], mean action: 0.395 [-1.127, 1.205], mean observation: 0.058 [-40.803, 12.538], loss: 0.000212, mean_squared_error: 0.000424, mean_q: 0.370460
 1242/2000: episode: 8, duration: 6.090s, episode steps: 180, steps per second: 30, episode reward: -0.748, mean reward: -0.004 [-0.020, 0.011], mean action: 0.391 [-1.142, 1.193], mean observation: 0.078 [-15.060, 15.844], loss: 0.000131, mean_squared_error: 0.000263, mean_q: 0.373299
done, took 69.977 seconds


iteration: 345
Training for 2000 steps ...
 1000/2000: episode: 1, duration: 32.557s, episode steps: 1000, steps per second: 31, episode reward: 0.144, mean reward: 0.000 [-0.002, 0.008], mean action: 0.369 [-1.344, 1.387], mean observation: 0.127 [-23.418, 17.353], loss: --, mean_squared_error: --, mean_q: --
 2000/2000: episode: 2, duration: 41.988s, episode steps: 1000, steps per second: 24, 

 1386/2000: episode: 2, duration: 15.210s, episode steps: 386, steps per second: 25, episode reward: 0.502, mean reward: 0.001 [-0.002, 0.010], mean action: 0.415 [-1.426, 1.236], mean observation: 0.133 [-9.661, 15.112], loss: 0.000053, mean_squared_error: 0.000106, mean_q: 0.362189
 1594/2000: episode: 3, duration: 8.156s, episode steps: 208, steps per second: 26, episode reward: 0.485, mean reward: 0.002 [-0.001, 0.010], mean action: 0.493 [-1.196, 1.283], mean observation: 0.138 [-9.780, 16.115], loss: 0.000106, mean_squared_error: 0.000211, mean_q: 0.361905
 1842/2000: episode: 4, duration: 9.541s, episode steps: 248, steps per second: 26, episode reward: 0.497, mean reward: 0.002 [-0.001, 0.010], mean action: 0.446 [-1.339, 1.264], mean observation: 0.139 [-9.595, 14.968], loss: 0.000143, mean_squared_error: 0.000286, mean_q: 0.358033
done, took 61.847 seconds


iteration: 356
Training for 2000 steps ...
 1000/2000: episode: 1, duration: 21.552s, episode steps: 1000, steps per se

 1630/2000: episode: 12, duration: 4.656s, episode steps: 116, steps per second: 25, episode reward: 0.609, mean reward: 0.005 [-0.002, 0.013], mean action: 0.345 [-1.143, 1.223], mean observation: 0.144 [-24.140, 17.546], loss: 0.000016, mean_squared_error: 0.000033, mean_q: 0.347356
 1748/2000: episode: 13, duration: 4.721s, episode steps: 118, steps per second: 25, episode reward: 0.635, mean reward: 0.005 [-0.002, 0.014], mean action: 0.338 [-1.088, 1.251], mean observation: 0.142 [-23.544, 18.047], loss: 0.000056, mean_squared_error: 0.000111, mean_q: 0.347460
 1868/2000: episode: 14, duration: 4.732s, episode steps: 120, steps per second: 25, episode reward: 0.629, mean reward: 0.005 [-0.003, 0.014], mean action: 0.308 [-1.200, 1.186], mean observation: 0.143 [-18.617, 17.457], loss: 0.000072, mean_squared_error: 0.000143, mean_q: 0.344193
 1988/2000: episode: 15, duration: 4.725s, episode steps: 120, steps per second: 25, episode reward: 0.625, mean reward: 0.005 [-0.002, 0.014]

 1321/2000: episode: 12, duration: 3.597s, episode steps: 105, steps per second: 29, episode reward: 0.493, mean reward: 0.005 [-0.001, 0.011], mean action: 0.255 [-1.119, 1.161], mean observation: 0.152 [-25.521, 15.346], loss: 0.000120, mean_squared_error: 0.000239, mean_q: 0.338595
 1424/2000: episode: 13, duration: 3.518s, episode steps: 103, steps per second: 29, episode reward: 0.502, mean reward: 0.005 [-0.001, 0.011], mean action: 0.209 [-1.119, 1.190], mean observation: 0.155 [-21.307, 17.709], loss: 0.000136, mean_squared_error: 0.000272, mean_q: 0.337879
 1525/2000: episode: 14, duration: 3.411s, episode steps: 101, steps per second: 30, episode reward: 0.504, mean reward: 0.005 [-0.000, 0.011], mean action: 0.238 [-1.187, 1.163], mean observation: 0.155 [-17.332, 17.606], loss: 0.000064, mean_squared_error: 0.000127, mean_q: 0.342891
 1625/2000: episode: 15, duration: 3.549s, episode steps: 100, steps per second: 28, episode reward: 0.489, mean reward: 0.005 [-0.001, 0.010]

  464/2000: episode: 4, duration: 3.091s, episode steps: 114, steps per second: 37, episode reward: 0.496, mean reward: 0.004 [-0.001, 0.011], mean action: 0.249 [-1.174, 1.138], mean observation: 0.152 [-16.948, 18.097], loss: --, mean_squared_error: --, mean_q: --
  579/2000: episode: 5, duration: 3.162s, episode steps: 115, steps per second: 36, episode reward: 0.499, mean reward: 0.004 [-0.001, 0.011], mean action: 0.252 [-1.159, 1.120], mean observation: 0.153 [-19.393, 17.457], loss: --, mean_squared_error: --, mean_q: --
  694/2000: episode: 6, duration: 3.127s, episode steps: 115, steps per second: 37, episode reward: 0.508, mean reward: 0.004 [-0.001, 0.011], mean action: 0.251 [-1.180, 1.135], mean observation: 0.153 [-18.312, 17.471], loss: --, mean_squared_error: --, mean_q: --
  809/2000: episode: 7, duration: 3.125s, episode steps: 115, steps per second: 37, episode reward: 0.499, mean reward: 0.004 [-0.001, 0.011], mean action: 0.229 [-1.211, 1.156], mean observation: 0.

 1833/2000: episode: 4, duration: 5.971s, episode steps: 151, steps per second: 25, episode reward: 0.455, mean reward: 0.003 [-0.001, 0.010], mean action: 0.267 [-1.153, 1.131], mean observation: 0.137 [-13.427, 16.192], loss: 0.000069, mean_squared_error: 0.000139, mean_q: 0.331197
done, took 62.381 seconds


iteration: 371
Training for 2000 steps ...
  167/2000: episode: 1, duration: 4.893s, episode steps: 167, steps per second: 34, episode reward: 0.466, mean reward: 0.003 [-0.001, 0.010], mean action: 0.211 [-1.117, 1.232], mean observation: 0.137 [-12.298, 16.037], loss: --, mean_squared_error: --, mean_q: --
  375/2000: episode: 2, duration: 6.922s, episode steps: 208, steps per second: 30, episode reward: 0.456, mean reward: 0.002 [-0.001, 0.010], mean action: 0.222 [-1.211, 1.200], mean observation: 0.133 [-12.596, 16.182], loss: --, mean_squared_error: --, mean_q: --
  700/2000: episode: 3, duration: 11.370s, episode steps: 325, steps per second: 29, episode reward: 0.479, me

 1836/2000: episode: 13, duration: 6.313s, episode steps: 139, steps per second: 22, episode reward: -0.754, mean reward: -0.005 [-0.019, 0.008], mean action: 0.276 [-1.180, 1.344], mean observation: 0.069 [-33.287, 16.466], loss: 0.000135, mean_squared_error: 0.000269, mean_q: 0.331334
 1984/2000: episode: 14, duration: 5.912s, episode steps: 148, steps per second: 25, episode reward: -0.762, mean reward: -0.005 [-0.019, 0.009], mean action: 0.269 [-1.142, 1.175], mean observation: 0.078 [-16.696, 15.149], loss: 0.000076, mean_squared_error: 0.000151, mean_q: 0.330323
done, took 74.856 seconds


iteration: 375
Training for 2000 steps ...
  150/2000: episode: 1, duration: 4.447s, episode steps: 150, steps per second: 34, episode reward: -0.750, mean reward: -0.005 [-0.019, 0.009], mean action: 0.262 [-1.215, 1.206], mean observation: 0.077 [-13.854, 15.986], loss: --, mean_squared_error: --, mean_q: --
  295/2000: episode: 2, duration: 4.603s, episode steps: 145, steps per second: 32, 

  244/2000: episode: 3, duration: 2.180s, episode steps: 81, steps per second: 37, episode reward: 0.453, mean reward: 0.006 [0.001, 0.011], mean action: 0.195 [-1.073, 1.153], mean observation: 0.147 [-9.194, 19.026], loss: --, mean_squared_error: --, mean_q: --
  325/2000: episode: 4, duration: 2.168s, episode steps: 81, steps per second: 37, episode reward: 0.452, mean reward: 0.006 [0.001, 0.011], mean action: 0.196 [-1.128, 1.124], mean observation: 0.147 [-9.243, 19.030], loss: --, mean_squared_error: --, mean_q: --
  406/2000: episode: 5, duration: 2.176s, episode steps: 81, steps per second: 37, episode reward: 0.452, mean reward: 0.006 [0.001, 0.011], mean action: 0.188 [-1.156, 1.194], mean observation: 0.147 [-9.298, 19.088], loss: --, mean_squared_error: --, mean_q: --
  487/2000: episode: 6, duration: 2.221s, episode steps: 81, steps per second: 36, episode reward: 0.454, mean reward: 0.006 [0.001, 0.011], mean action: 0.194 [-1.062, 1.204], mean observation: 0.147 [-9.330

  731/2000: episode: 9, duration: 2.192s, episode steps: 82, steps per second: 37, episode reward: 0.456, mean reward: 0.006 [0.001, 0.011], mean action: 0.211 [-1.119, 1.172], mean observation: 0.147 [-9.858, 20.160], loss: --, mean_squared_error: --, mean_q: --
  813/2000: episode: 10, duration: 2.224s, episode steps: 82, steps per second: 37, episode reward: 0.460, mean reward: 0.006 [0.001, 0.011], mean action: 0.196 [-1.154, 1.118], mean observation: 0.146 [-9.878, 20.480], loss: --, mean_squared_error: --, mean_q: --
  894/2000: episode: 11, duration: 2.161s, episode steps: 81, steps per second: 37, episode reward: 0.453, mean reward: 0.006 [0.001, 0.011], mean action: 0.209 [-1.178, 1.186], mean observation: 0.145 [-9.828, 20.034], loss: --, mean_squared_error: --, mean_q: --
  975/2000: episode: 12, duration: 2.165s, episode steps: 81, steps per second: 37, episode reward: 0.452, mean reward: 0.006 [0.001, 0.011], mean action: 0.196 [-1.142, 1.168], mean observation: 0.146 [-9.

 1167/2000: episode: 14, duration: 3.094s, episode steps: 83, steps per second: 27, episode reward: 0.456, mean reward: 0.005 [-0.000, 0.011], mean action: 0.194 [-1.180, 1.199], mean observation: 0.145 [-10.327, 20.148], loss: 0.000023, mean_squared_error: 0.000046, mean_q: 0.309873
 1250/2000: episode: 15, duration: 3.039s, episode steps: 83, steps per second: 27, episode reward: 0.453, mean reward: 0.005 [-0.000, 0.011], mean action: 0.160 [-1.189, 1.114], mean observation: 0.146 [-10.335, 20.101], loss: 0.000126, mean_squared_error: 0.000252, mean_q: 0.315866
 1334/2000: episode: 16, duration: 3.094s, episode steps: 84, steps per second: 27, episode reward: 0.463, mean reward: 0.006 [0.000, 0.011], mean action: 0.196 [-1.156, 1.195], mean observation: 0.146 [-10.425, 20.072], loss: 0.000142, mean_squared_error: 0.000284, mean_q: 0.317596
 1417/2000: episode: 17, duration: 3.056s, episode steps: 83, steps per second: 27, episode reward: 0.456, mean reward: 0.005 [-0.000, 0.011], mea

 1663/2000: episode: 20, duration: 3.019s, episode steps: 83, steps per second: 27, episode reward: 0.453, mean reward: 0.005 [-0.000, 0.011], mean action: 0.187 [-1.141, 1.177], mean observation: 0.145 [-10.321, 19.935], loss: 0.000078, mean_squared_error: 0.000156, mean_q: 0.306052
 1746/2000: episode: 21, duration: 3.041s, episode steps: 83, steps per second: 27, episode reward: 0.457, mean reward: 0.006 [0.000, 0.011], mean action: 0.208 [-1.089, 1.183], mean observation: 0.146 [-10.318, 20.073], loss: 0.000126, mean_squared_error: 0.000251, mean_q: 0.310411
 1828/2000: episode: 22, duration: 2.975s, episode steps: 82, steps per second: 28, episode reward: 0.451, mean reward: 0.005 [0.001, 0.011], mean action: 0.192 [-1.134, 1.107], mean observation: 0.146 [-10.322, 20.135], loss: 0.000093, mean_squared_error: 0.000187, mean_q: 0.308300
 1909/2000: episode: 23, duration: 3.102s, episode steps: 81, steps per second: 26, episode reward: 0.449, mean reward: 0.006 [0.001, 0.012], mean 

  166/2000: episode: 2, duration: 2.254s, episode steps: 83, steps per second: 37, episode reward: 0.459, mean reward: 0.006 [0.000, 0.011], mean action: 0.217 [-1.116, 1.303], mean observation: 0.145 [-10.313, 19.974], loss: --, mean_squared_error: --, mean_q: --
  249/2000: episode: 3, duration: 2.241s, episode steps: 83, steps per second: 37, episode reward: 0.453, mean reward: 0.005 [0.000, 0.011], mean action: 0.183 [-1.124, 1.122], mean observation: 0.146 [-10.370, 20.171], loss: --, mean_squared_error: --, mean_q: --
  332/2000: episode: 4, duration: 2.254s, episode steps: 83, steps per second: 37, episode reward: 0.457, mean reward: 0.006 [0.000, 0.011], mean action: 0.182 [-1.198, 1.176], mean observation: 0.146 [-10.349, 20.043], loss: --, mean_squared_error: --, mean_q: --
  415/2000: episode: 5, duration: 2.325s, episode steps: 83, steps per second: 36, episode reward: 0.457, mean reward: 0.006 [0.000, 0.011], mean action: 0.206 [-1.099, 1.198], mean observation: 0.146 [-10

  664/2000: episode: 8, duration: 2.300s, episode steps: 83, steps per second: 36, episode reward: 0.458, mean reward: 0.006 [0.000, 0.011], mean action: 0.209 [-1.071, 1.260], mean observation: 0.146 [-10.344, 20.018], loss: --, mean_squared_error: --, mean_q: --
  747/2000: episode: 9, duration: 2.276s, episode steps: 83, steps per second: 36, episode reward: 0.460, mean reward: 0.006 [0.000, 0.011], mean action: 0.201 [-1.240, 1.200], mean observation: 0.146 [-10.340, 20.197], loss: --, mean_squared_error: --, mean_q: --
  830/2000: episode: 10, duration: 2.230s, episode steps: 83, steps per second: 37, episode reward: 0.459, mean reward: 0.006 [0.001, 0.011], mean action: 0.202 [-1.274, 1.192], mean observation: 0.146 [-10.383, 20.182], loss: --, mean_squared_error: --, mean_q: --
  913/2000: episode: 11, duration: 2.244s, episode steps: 83, steps per second: 37, episode reward: 0.459, mean reward: 0.006 [0.001, 0.011], mean action: 0.201 [-1.096, 1.131], mean observation: 0.146 [-

 1135/2000: episode: 14, duration: 3.079s, episode steps: 81, steps per second: 26, episode reward: 0.446, mean reward: 0.006 [0.000, 0.011], mean action: 0.192 [-1.129, 1.145], mean observation: 0.145 [-10.391, 20.102], loss: 0.000055, mean_squared_error: 0.000110, mean_q: 0.295050
 1218/2000: episode: 15, duration: 3.119s, episode steps: 83, steps per second: 27, episode reward: 0.457, mean reward: 0.006 [0.001, 0.011], mean action: 0.192 [-1.071, 1.202], mean observation: 0.146 [-10.362, 20.033], loss: 0.000017, mean_squared_error: 0.000034, mean_q: 0.293906
 1301/2000: episode: 16, duration: 3.063s, episode steps: 83, steps per second: 27, episode reward: 0.460, mean reward: 0.006 [0.000, 0.011], mean action: 0.180 [-1.159, 1.150], mean observation: 0.146 [-10.287, 20.285], loss: 0.000081, mean_squared_error: 0.000161, mean_q: 0.286311
 1384/2000: episode: 17, duration: 3.069s, episode steps: 83, steps per second: 27, episode reward: 0.461, mean reward: 0.006 [0.001, 0.011], mean a

 1715/2000: episode: 20, duration: 3.662s, episode steps: 99, steps per second: 27, episode reward: 0.470, mean reward: 0.005 [0.000, 0.011], mean action: 0.190 [-1.141, 1.216], mean observation: 0.144 [-13.554, 20.860], loss: 0.000027, mean_squared_error: 0.000055, mean_q: 0.284888
 1808/2000: episode: 21, duration: 3.395s, episode steps: 93, steps per second: 27, episode reward: 0.468, mean reward: 0.005 [0.001, 0.010], mean action: 0.181 [-1.151, 1.182], mean observation: 0.144 [-13.328, 20.942], loss: 0.000093, mean_squared_error: 0.000186, mean_q: 0.286440
 1901/2000: episode: 22, duration: 3.404s, episode steps: 93, steps per second: 27, episode reward: 0.469, mean reward: 0.005 [0.001, 0.011], mean action: 0.220 [-1.085, 1.175], mean observation: 0.145 [-13.533, 20.931], loss: 0.000021, mean_squared_error: 0.000041, mean_q: 0.287751
 1999/2000: episode: 23, duration: 3.624s, episode steps: 98, steps per second: 27, episode reward: 0.474, mean reward: 0.005 [0.001, 0.010], mean a

  546/2000: episode: 6, duration: 2.554s, episode steps: 91, steps per second: 36, episode reward: 0.470, mean reward: 0.005 [0.001, 0.010], mean action: 0.079 [-1.164, 1.147], mean observation: 0.142 [-21.935, 21.388], loss: --, mean_squared_error: --, mean_q: --
  637/2000: episode: 7, duration: 2.549s, episode steps: 91, steps per second: 36, episode reward: 0.469, mean reward: 0.005 [0.001, 0.010], mean action: 0.089 [-1.236, 1.144], mean observation: 0.142 [-21.753, 21.351], loss: --, mean_squared_error: --, mean_q: --
  728/2000: episode: 8, duration: 2.617s, episode steps: 91, steps per second: 35, episode reward: 0.465, mean reward: 0.005 [0.001, 0.010], mean action: 0.101 [-1.119, 1.155], mean observation: 0.142 [-21.970, 21.105], loss: --, mean_squared_error: --, mean_q: --
  819/2000: episode: 9, duration: 2.583s, episode steps: 91, steps per second: 35, episode reward: 0.469, mean reward: 0.005 [0.001, 0.010], mean action: 0.092 [-1.115, 1.154], mean observation: 0.142 [-21

 1347/2000: episode: 15, duration: 3.532s, episode steps: 92, steps per second: 26, episode reward: 0.478, mean reward: 0.005 [0.000, 0.011], mean action: 0.217 [-1.175, 1.166], mean observation: 0.147 [-14.029, 16.063], loss: 0.000041, mean_squared_error: 0.000082, mean_q: 0.278389
 1438/2000: episode: 16, duration: 3.535s, episode steps: 91, steps per second: 26, episode reward: 0.470, mean reward: 0.005 [0.000, 0.010], mean action: 0.182 [-1.169, 1.140], mean observation: 0.147 [-14.804, 15.559], loss: 0.000030, mean_squared_error: 0.000061, mean_q: 0.282745
 1525/2000: episode: 17, duration: 3.321s, episode steps: 87, steps per second: 26, episode reward: 0.473, mean reward: 0.005 [-0.000, 0.012], mean action: 0.196 [-1.143, 1.201], mean observation: 0.148 [-8.949, 17.185], loss: 0.000100, mean_squared_error: 0.000200, mean_q: 0.275157
 1616/2000: episode: 18, duration: 3.502s, episode steps: 91, steps per second: 26, episode reward: 0.472, mean reward: 0.005 [0.000, 0.011], mean a

  187/2000: episode: 2, duration: 2.565s, episode steps: 93, steps per second: 36, episode reward: 0.480, mean reward: 0.005 [-0.001, 0.013], mean action: 0.151 [-1.161, 1.347], mean observation: 0.149 [-11.335, 14.030], loss: --, mean_squared_error: --, mean_q: --
  280/2000: episode: 3, duration: 2.570s, episode steps: 93, steps per second: 36, episode reward: 0.478, mean reward: 0.005 [-0.001, 0.013], mean action: 0.164 [-1.125, 1.201], mean observation: 0.147 [-12.677, 14.274], loss: --, mean_squared_error: --, mean_q: --
  372/2000: episode: 4, duration: 2.636s, episode steps: 92, steps per second: 35, episode reward: 0.480, mean reward: 0.005 [0.000, 0.012], mean action: 0.156 [-1.126, 1.144], mean observation: 0.149 [-11.152, 14.306], loss: --, mean_squared_error: --, mean_q: --
  463/2000: episode: 5, duration: 2.605s, episode steps: 91, steps per second: 35, episode reward: 0.481, mean reward: 0.005 [0.000, 0.012], mean action: 0.175 [-1.206, 1.212], mean observation: 0.148 [-

  449/2000: episode: 5, duration: 2.533s, episode steps: 90, steps per second: 36, episode reward: 0.479, mean reward: 0.005 [-0.001, 0.012], mean action: 0.185 [-1.138, 1.171], mean observation: 0.150 [-10.600, 14.127], loss: --, mean_squared_error: --, mean_q: --
  541/2000: episode: 6, duration: 2.662s, episode steps: 92, steps per second: 35, episode reward: 0.484, mean reward: 0.005 [-0.001, 0.012], mean action: 0.185 [-1.147, 1.228], mean observation: 0.152 [-11.228, 14.427], loss: --, mean_squared_error: --, mean_q: --
  631/2000: episode: 7, duration: 2.561s, episode steps: 90, steps per second: 35, episode reward: 0.481, mean reward: 0.005 [-0.001, 0.013], mean action: 0.213 [-1.121, 1.176], mean observation: 0.152 [-10.877, 14.299], loss: --, mean_squared_error: --, mean_q: --
  720/2000: episode: 8, duration: 2.473s, episode steps: 89, steps per second: 36, episode reward: 0.476, mean reward: 0.005 [-0.000, 0.012], mean action: 0.199 [-1.161, 1.149], mean observation: 0.150 

### Test Agent

In [None]:
# Finally, evaluate our algorithm for 5 episodes.
agent.test(nb_episodes=5, visualize=True, nb_max_episode_steps=1000)