# **Reinforcement Learning: Taller 4**
## Estudiantes: Juan Pablo Reyes Fajardo y Santiago Rodríguez Ávila 

In [6]:
import gymnasium as gym
import numpy as np
import itertools
import operator
from tqdm import tqdm

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

import visualkeras

In [None]:
env = gym.make('CartPole-v1')

# 1. RL Tabular

In [None]:
# Funcion Auxiliar para diccionarios
def key_max(d):
        return max(d.items(), key=operator.itemgetter(1))
def key_min(d):
        return min(d.items(), key=operator.itemgetter(1))

Política $\epsilon$ - Greedy 

(Útil más adelante)

In [None]:
def eps_greedy(Q_, state,epsilon=0.1):
    rand = np.random.uniform()
    if rand>epsilon:
        return key_max(Q_[state])[0],1-epsilon
    else:
        return key_min(Q_[state])[0],epsilon

## Discretización

Inicialmente se realizan múltiples experimentos para determinar límites razonables para las variables a discretizar (aquellas cuyo espacio de observación es infinito)

In [None]:
velocidades_absolutas_maximas={"Lineal":[],"Angular":[]}

observation, info = env.reset()

abs_lineal=abs(observation[1])
abs_angular=abs(observation[3])

for _ in range(int(1e6)):
    
    action = env.action_space.sample()  # agent policy that uses the observation and info
    observation, reward, terminated, truncated, info = env.step(action)
    
    if abs(observation[1])>abs_lineal:
        abs_lineal=abs(observation[1])
    if abs(observation[3])>abs_angular:
        abs_angular=abs(observation[3])

    if terminated or truncated:
        observation, info = env.reset()
        velocidades_absolutas_maximas["Lineal"].append(abs_lineal)
        velocidades_absolutas_maximas["Angular"].append(abs_angular)

env.close()

vel_lin_abs_max=np.mean(velocidades_absolutas_maximas["Lineal"])
vel_ang_abs_max=np.mean(velocidades_absolutas_maximas["Angular"])
print(f'Promedio de velocidad lineal absoluta máxima: \
      {vel_lin_abs_max} \
      \nPromedio de velocidad angular absoluta máxima: \
      {vel_ang_abs_max}')

Discretización de estados:

In [None]:
# Límites del espacio de observación del MDP real
cart_high_var = env.observation_space.high
cart_low_var = env.observation_space.low

# Espacios de observación discretizados
observation_space_discrete_400=[np.linspace(cart_low_var[0], cart_high_var[0], num= 5),\
                            np.linspace(-vel_lin_abs_max, vel_lin_abs_max, num= 4),\
                            np.linspace(cart_low_var[2], cart_high_var[2], num = 5),\
                            np.linspace(-vel_ang_abs_max, vel_ang_abs_max, num= 4)]\

observation_space_discrete_4096=[np.linspace(cart_low_var[0], cart_high_var[0], num= 8),\
                            np.linspace(-vel_lin_abs_max, vel_lin_abs_max, num= 8),\
                            np.linspace(cart_low_var[2], cart_high_var[2], num = 8),\
                            np.linspace(-vel_ang_abs_max, vel_ang_abs_max, num= 8)]

# Uso de iteradores para obtener todos los estados a partir del espacio de obsevación discreto
states_400=list(itertools.product(*observation_space_discrete_400))
states_4096=list(itertools.product(*observation_space_discrete_4096))

def init_Q_400():
    Q_table={}
    for i in states_400:
        Q_table[i] = {0:0,1:0}
    return Q_table

def init_Q_4096():
    Q_table={}
    for i in states_4096:
        Q_table[i] = {0:0,1:0}
    return Q_table
         
def discretize_400(new_state):
    # car pos, car vel, pole angle, pole vel
    discretizacion=[0]*4
    for i in range(len(discretizacion)):
        dif = [(abs(x - new_state[i])) for x in observation_space_discrete_400[i]]
        discretizacion[i] = observation_space_discrete_400[i][dif.index(min(dif))]
        
    return tuple(discretizacion)

def discretize_4096(new_state):
    # car pos, car vel, pole angle, pole vel
    discretizacion=[0]*4
    for i in range(len(discretizacion)):
        dif = [(abs(x - new_state[i])) for x in observation_space_discrete_4096[i]]
        discretizacion[i] = observation_space_discrete_4096[i][dif.index(min(dif))]
        
    return tuple(discretizacion)

## Estimación de Q con Q-Learning 

### 400 Estados

In [None]:
def Q_Learning_400(gamma,alpha):
    terminated=False
    observation, info = env.reset()
    observation=discretize_400(observation)
    while not terminated:
        
        action,_=eps_greedy(Q,observation,0.1)
        
        observation_, reward, terminated, truncated, info = env.step(action)
        observation_=discretize_400(observation_)
        
        Q_=key_max(Q[observation_])[1]

        Q[observation][action]+=alpha*(reward+gamma*Q_-Q[observation][action])
        observation=observation_

In [None]:
env = gym.make('CartPole-v1',render_mode='human')
Q=init_Q_400()
policy=dict.fromkeys(states_400, 0)
for i in range(0,10):
    Q_Learning_400(0.9,0.1)

In [None]:
def Q_Learning_4096(gamma,alpha):
    terminated=False
    observation, info = env.reset()
    observation=discretize_4096(observation)
    while not terminated:
        
        action,_=eps_greedy(Q,observation,0.1)
        
        observation_, reward, terminated, truncated, info = env.step(action)
        observation_=discretize_4096(observation_)
        
        Q_=key_max(Q[observation_])[1]

        Q[observation][action]+=alpha*(reward+gamma*Q_-Q[observation][action])
        observation=observation_

In [None]:
env = gym.make('CartPole-v1')
Q=init_Q_4096()
policy=dict.fromkeys(states_4096, 0)
    
for i in tqdm(range(int(1e6))):
    Q_Learning_4096(0.9,0.1)

In [None]:
for state_4096 in states_4096:
    policy[state_4096],_=eps_greedy(Q,tuple(state_4096),0)

In [None]:

env = gym.make('CartPole-v1',render_mode="human")


rwds=[]
for _ in range(100):
    observation, info = env.reset()
    r=0
    while True:
        action = policy[discretize_4096(observation)]
        observation, reward, terminated, truncated, info = env.step(action)
        r+=reward

        if terminated or truncated:
            observation, info = env.reset()
            rwds.append(r)
            break
env.close()
print(np.mean(rwds))

In [8]:

ENV_NAME = 'CartPole-v1'


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)

nb_actions = env.action_space.n

model = Sequential(
    [
        Flatten(input_shape=(1,) + env.observation_space.shape),
        Dense(16, activation="relu"),
        Dense(2, activation="linear"),
    ]
)


print(model.summary())

policy = EpsGreedyQPolicy()

memory = SequentialMemory(limit=50000, window_length=1)

dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)

dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

dqn.fit(env, nb_steps=100, visualize=True, verbose=2)

visualkeras.layered_view(model)

dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 4)                 0         
                                                                 
 dense_2 (Dense)             (None, 16)                80        
                                                                 
 dense_3 (Dense)             (None, 2)                 34        
                                                                 
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None
Training for 100 steps ...


ValueError: Error when checking input: expected flatten_1_input to have shape (1, 4) but got array with shape (1, 2)

In [1]:

!python --version
print('NumPy', np.__version__)
print('Tensorflow', tensorflow.__version__)

Python 3.9.13


NameError: name 'np' is not defined

In [None]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory


ENV_NAME = 'CartPole-v0'


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=500000, window_length=1)
policy = EpsGreedyQPolicy()

dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
dqn.fit(env, nb_steps=500000, visualize=True, verbose=2)

# After training is done, we save the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_9 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_10 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_11 (Activation)   (None, 16)                0         
__________



     12/500000: episode: 1, duration: 1.083s, episode steps: 12, steps per second: 11, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.114 [-1.751, 0.973], loss: 0.497553, mean_absolute_error: 0.509256, mean_q: -0.016498
     25/500000: episode: 2, duration: 0.130s, episode steps: 13, steps per second: 100, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.923 [0.000, 1.000], mean observation: -0.074 [-3.253, 2.194], loss: 0.440268, mean_absolute_error: 0.485556, mean_q: 0.067063




     34/500000: episode: 3, duration: 0.121s, episode steps: 9, steps per second: 74, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.127 [-2.809, 1.812], loss: 0.323854, mean_absolute_error: 0.429185, mean_q: 0.236010
     44/500000: episode: 4, duration: 0.156s, episode steps: 10, steps per second: 64, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.157 [-3.116, 1.934], loss: 0.248621, mean_absolute_error: 0.377039, mean_q: 0.416382
     52/500000: episode: 5, duration: 0.086s, episode steps: 8, steps per second: 93, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.138 [-2.563, 1.604], loss: 0.172425, mean_absolute_error: 0.296372, mean_q: 0.644834
     61/500000: episode: 6, duration: 0.083s, episode steps: 9, steps per second: 109, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000]

    330/500000: episode: 32, duration: 0.098s, episode steps: 9, steps per second: 92, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.141 [-1.807, 2.880], loss: 0.110956, mean_absolute_error: 1.358550, mean_q: 2.549176
    348/500000: episode: 33, duration: 0.175s, episode steps: 18, steps per second: 103, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.611 [0.000, 1.000], mean observation: -0.053 [-1.998, 1.386], loss: 0.135675, mean_absolute_error: 1.410125, mean_q: 2.586421
    361/500000: episode: 34, duration: 0.124s, episode steps: 13, steps per second: 105, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.615 [0.000, 1.000], mean observation: -0.083 [-1.451, 1.026], loss: 0.109037, mean_absolute_error: 1.464179, mean_q: 2.694920
    375/500000: episode: 35, duration: 0.110s, episode steps: 14, steps per second: 127, episode reward: 14.000, mean reward: 1.000 [1.00

    789/500000: episode: 61, duration: 0.385s, episode steps: 37, steps per second: 96, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: 0.005 [-1.188, 0.958], loss: 0.447074, mean_absolute_error: 3.003776, mean_q: 5.594283
    863/500000: episode: 62, duration: 0.646s, episode steps: 74, steps per second: 115, episode reward: 74.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: -0.144 [-1.688, 1.261], loss: 0.473266, mean_absolute_error: 3.174543, mean_q: 5.922286
    885/500000: episode: 63, duration: 0.173s, episode steps: 22, steps per second: 127, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.071 [-0.752, 1.411], loss: 0.412897, mean_absolute_error: 3.331103, mean_q: 6.267746
    904/500000: episode: 64, duration: 0.156s, episode steps: 19, steps per second: 122, episode reward: 19.000, mean reward: 1.000 [1.0

   3047/500000: episode: 90, duration: 0.850s, episode steps: 108, steps per second: 127, episode reward: 108.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.223 [-1.241, 0.739], loss: 1.572787, mean_absolute_error: 11.461629, mean_q: 23.290697
   3150/500000: episode: 91, duration: 0.831s, episode steps: 103, steps per second: 124, episode reward: 103.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.466 [0.000, 1.000], mean observation: -0.220 [-1.230, 0.534], loss: 1.693874, mean_absolute_error: 11.992811, mean_q: 24.310013
   3291/500000: episode: 92, duration: 1.128s, episode steps: 141, steps per second: 125, episode reward: 141.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.204 [-1.659, 0.905], loss: 1.806672, mean_absolute_error: 12.633895, mean_q: 25.601759
   3408/500000: episode: 93, duration: 0.943s, episode steps: 117, steps per second: 124, episode reward: 117.000, mean r

   7571/500000: episode: 118, duration: 1.662s, episode steps: 200, steps per second: 120, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.208 [-0.971, 1.797], loss: 4.440762, mean_absolute_error: 28.987276, mean_q: 59.010757
   7771/500000: episode: 119, duration: 1.640s, episode steps: 200, steps per second: 122, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.074 [-0.785, 1.136], loss: 5.734856, mean_absolute_error: 29.929678, mean_q: 60.810898
   7971/500000: episode: 120, duration: 1.606s, episode steps: 200, steps per second: 125, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.151 [-0.926, 1.512], loss: 5.482301, mean_absolute_error: 30.399019, mean_q: 61.803638
   8171/500000: episode: 121, duration: 1.731s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean 

  12344/500000: episode: 146, duration: 1.429s, episode steps: 180, steps per second: 126, episode reward: 180.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.351 [-0.993, 2.424], loss: 8.731461, mean_absolute_error: 39.786831, mean_q: 80.556511
  12513/500000: episode: 147, duration: 1.350s, episode steps: 169, steps per second: 125, episode reward: 169.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.527 [0.000, 1.000], mean observation: 0.368 [-0.869, 2.407], loss: 8.780008, mean_absolute_error: 39.700893, mean_q: 80.448067
  12667/500000: episode: 148, duration: 1.204s, episode steps: 154, steps per second: 128, episode reward: 154.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: 0.406 [-0.784, 2.402], loss: 6.413282, mean_absolute_error: 40.389721, mean_q: 81.765907
  12848/500000: episode: 149, duration: 1.488s, episode steps: 181, steps per second: 122, episode reward: 181.000, mean 

  16882/500000: episode: 174, duration: 1.218s, episode steps: 149, steps per second: 122, episode reward: 149.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.537 [0.000, 1.000], mean observation: 0.376 [-0.930, 2.422], loss: 16.751335, mean_absolute_error: 50.237713, mean_q: 101.200592
  17040/500000: episode: 175, duration: 1.266s, episode steps: 158, steps per second: 125, episode reward: 158.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.389 [-0.756, 2.415], loss: 13.844019, mean_absolute_error: 50.561970, mean_q: 101.976616
  17209/500000: episode: 176, duration: 1.386s, episode steps: 169, steps per second: 122, episode reward: 169.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: 0.359 [-0.996, 2.431], loss: 17.128485, mean_absolute_error: 50.844692, mean_q: 102.437569
  17380/500000: episode: 177, duration: 1.414s, episode steps: 171, steps per second: 121, episode reward: 171.000,

  21537/500000: episode: 202, duration: 1.322s, episode steps: 159, steps per second: 120, episode reward: 159.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.535 [0.000, 1.000], mean observation: 0.414 [-1.451, 2.416], loss: 6.274466, mean_absolute_error: 50.344322, mean_q: 101.443031
  21709/500000: episode: 203, duration: 1.438s, episode steps: 172, steps per second: 120, episode reward: 172.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.386 [-1.263, 2.429], loss: 9.578525, mean_absolute_error: 50.232693, mean_q: 101.175369
  21864/500000: episode: 204, duration: 1.232s, episode steps: 155, steps per second: 126, episode reward: 155.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.431 [-1.391, 2.411], loss: 10.729667, mean_absolute_error: 50.137936, mean_q: 101.043022
  22014/500000: episode: 205, duration: 1.212s, episode steps: 150, steps per second: 124, episode reward: 150.000, m

  26110/500000: episode: 230, duration: 1.695s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.177 [-1.190, 1.442], loss: 16.042097, mean_absolute_error: 53.509102, mean_q: 107.754715
  26310/500000: episode: 231, duration: 1.647s, episode steps: 200, steps per second: 121, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.182 [-1.542, 1.618], loss: 10.822270, mean_absolute_error: 53.901173, mean_q: 108.691093
  26372/500000: episode: 232, duration: 0.501s, episode steps: 62, steps per second: 124, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.007 [-1.508, 1.848], loss: 27.600344, mean_absolute_error: 54.642231, mean_q: 109.459473
  26572/500000: episode: 233, duration: 1.612s, episode steps: 200, steps per second: 124, episode reward: 200.000, m

  30814/500000: episode: 258, duration: 1.602s, episode steps: 200, steps per second: 125, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.087 [-1.502, 1.445], loss: 19.714748, mean_absolute_error: 55.689953, mean_q: 112.150726
  31014/500000: episode: 259, duration: 1.631s, episode steps: 200, steps per second: 123, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.144 [-1.517, 1.178], loss: 14.996939, mean_absolute_error: 55.930298, mean_q: 113.147400
  31214/500000: episode: 260, duration: 1.655s, episode steps: 200, steps per second: 121, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.095 [-1.729, 1.671], loss: 20.996849, mean_absolute_error: 55.910450, mean_q: 112.803253
  31414/500000: episode: 261, duration: 1.650s, episode steps: 200, steps per second: 121, episode reward: 200.0

  36180/500000: episode: 286, duration: 1.731s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.303 [-1.834, 0.849], loss: 17.599890, mean_absolute_error: 52.255672, mean_q: 105.277176
  36380/500000: episode: 287, duration: 1.694s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.360 [-1.898, 0.885], loss: 20.809614, mean_absolute_error: 52.281063, mean_q: 105.338829
  36580/500000: episode: 288, duration: 1.728s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.383 [-2.143, 0.519], loss: 15.633018, mean_absolute_error: 52.764248, mean_q: 106.379684
  36780/500000: episode: 289, duration: 1.722s, episode steps: 200, steps per second: 116, episode reward: 200.0

  41418/500000: episode: 314, duration: 1.365s, episode steps: 159, steps per second: 117, episode reward: 159.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.383 [-2.402, 0.441], loss: 22.622396, mean_absolute_error: 53.878143, mean_q: 108.502373
  41618/500000: episode: 315, duration: 2.151s, episode steps: 200, steps per second: 93, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.129 [-0.999, 0.983], loss: 22.447277, mean_absolute_error: 53.853081, mean_q: 108.766403
  41818/500000: episode: 316, duration: 1.737s, episode steps: 200, steps per second: 115, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.109 [-0.816, 0.807], loss: 27.033464, mean_absolute_error: 54.392139, mean_q: 109.787468
  41954/500000: episode: 317, duration: 1.143s, episode steps: 136, steps per second: 119, episode reward: 136.000,

  46896/500000: episode: 342, duration: 1.765s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.000 [-0.562, 0.376], loss: 22.523130, mean_absolute_error: 64.947838, mean_q: 131.325912
  47096/500000: episode: 343, duration: 1.771s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.004 [-0.741, 0.552], loss: 30.447886, mean_absolute_error: 65.782005, mean_q: 132.692322
  47296/500000: episode: 344, duration: 1.776s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.006 [-1.015, 0.736], loss: 35.640553, mean_absolute_error: 65.729645, mean_q: 132.284622
  47496/500000: episode: 345, duration: 1.760s, episode steps: 200, steps per second: 114, episode reward: 200.000,

  52174/500000: episode: 370, duration: 1.712s, episode steps: 190, steps per second: 111, episode reward: 190.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.401 [-2.402, 0.814], loss: 13.459195, mean_absolute_error: 64.254982, mean_q: 129.313644
  52374/500000: episode: 371, duration: 1.790s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.328 [-1.923, 0.985], loss: 27.719213, mean_absolute_error: 64.683693, mean_q: 129.581818
  52574/500000: episode: 372, duration: 1.732s, episode steps: 200, steps per second: 115, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.326 [-1.930, 0.671], loss: 34.440002, mean_absolute_error: 64.194092, mean_q: 128.391388
  52774/500000: episode: 373, duration: 1.766s, episode steps: 200, steps per second: 113, episode reward: 200.0

  57744/500000: episode: 398, duration: 1.825s, episode steps: 200, steps per second: 110, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.157 [-0.958, 0.696], loss: 28.070000, mean_absolute_error: 66.199341, mean_q: 132.946762
  57944/500000: episode: 399, duration: 2.017s, episode steps: 200, steps per second: 99, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.181 [-1.556, 1.484], loss: 25.627924, mean_absolute_error: 66.024757, mean_q: 132.985413
  58144/500000: episode: 400, duration: 1.791s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.199 [-1.113, 0.701], loss: 25.600508, mean_absolute_error: 65.922462, mean_q: 132.713196
  58344/500000: episode: 401, duration: 1.724s, episode steps: 200, steps per second: 116, episode reward: 200.00

  63108/500000: episode: 426, duration: 1.555s, episode steps: 193, steps per second: 124, episode reward: 193.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.482 [0.000, 1.000], mean observation: -0.387 [-2.422, 1.537], loss: 50.398891, mean_absolute_error: 72.622002, mean_q: 145.762955
  63308/500000: episode: 427, duration: 1.636s, episode steps: 200, steps per second: 122, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.340 [-2.084, 0.745], loss: 22.016062, mean_absolute_error: 72.573708, mean_q: 146.444214
  63508/500000: episode: 428, duration: 1.607s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.332 [-2.199, 1.214], loss: 21.880293, mean_absolute_error: 73.226028, mean_q: 147.853043
  63708/500000: episode: 429, duration: 1.662s, episode steps: 200, steps per second: 120, episode reward: 200.0

  68708/500000: episode: 454, duration: 1.815s, episode steps: 200, steps per second: 110, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.186 [-0.988, 0.745], loss: 21.696466, mean_absolute_error: 67.030067, mean_q: 134.499496
  68908/500000: episode: 455, duration: 1.743s, episode steps: 200, steps per second: 115, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.201 [-1.217, 0.833], loss: 21.698740, mean_absolute_error: 67.500366, mean_q: 135.491165
  69092/500000: episode: 456, duration: 1.619s, episode steps: 184, steps per second: 114, episode reward: 184.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.381 [-2.400, 1.276], loss: 27.215954, mean_absolute_error: 66.938347, mean_q: 134.053543
  69292/500000: episode: 457, duration: 1.787s, episode steps: 200, steps per second: 112, episode reward: 200.0

  74292/500000: episode: 482, duration: 1.769s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.008 [-1.124, 1.107], loss: 29.967680, mean_absolute_error: 53.442333, mean_q: 106.701767
  74492/500000: episode: 483, duration: 1.731s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.018 [-0.817, 0.844], loss: 16.725447, mean_absolute_error: 53.372196, mean_q: 106.836174
  74692/500000: episode: 484, duration: 1.766s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.018 [-1.257, 1.403], loss: 14.266499, mean_absolute_error: 53.537617, mean_q: 107.210960
  74892/500000: episode: 485, duration: 1.749s, episode steps: 200, steps per second: 114, episode reward: 200.000,

  79892/500000: episode: 510, duration: 1.680s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.013 [-1.000, 0.739], loss: 12.164704, mean_absolute_error: 49.578468, mean_q: 99.181404
  80092/500000: episode: 511, duration: 1.618s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.008 [-1.044, 0.892], loss: 9.713131, mean_absolute_error: 49.609730, mean_q: 99.145546
  80292/500000: episode: 512, duration: 1.690s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.055 [-0.887, 0.787], loss: 15.079620, mean_absolute_error: 49.265560, mean_q: 98.373146
  80492/500000: episode: 513, duration: 1.744s, episode steps: 200, steps per second: 115, episode reward: 200.000, m

  84990/500000: episode: 538, duration: 1.590s, episode steps: 200, steps per second: 126, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.041 [-1.715, 1.806], loss: 12.653613, mean_absolute_error: 45.292118, mean_q: 90.347282
  85190/500000: episode: 539, duration: 1.654s, episode steps: 200, steps per second: 121, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.029 [-1.141, 0.943], loss: 10.513284, mean_absolute_error: 45.029919, mean_q: 89.936501
  85390/500000: episode: 540, duration: 1.655s, episode steps: 200, steps per second: 121, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.078 [-1.395, 1.625], loss: 7.940914, mean_absolute_error: 45.349503, mean_q: 90.514137
  85590/500000: episode: 541, duration: 1.647s, episode steps: 200, steps per second: 121, episode reward: 200.000, 

  89768/500000: episode: 566, duration: 1.280s, episode steps: 150, steps per second: 117, episode reward: 150.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.457 [-2.404, 0.748], loss: 12.313596, mean_absolute_error: 43.586666, mean_q: 87.190331
  89968/500000: episode: 567, duration: 1.755s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.414 [-2.373, 1.483], loss: 14.041553, mean_absolute_error: 43.420177, mean_q: 86.747421
  90112/500000: episode: 568, duration: 1.159s, episode steps: 144, steps per second: 124, episode reward: 144.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.444 [-2.423, 0.868], loss: 10.452408, mean_absolute_error: 43.908081, mean_q: 88.002632
  90252/500000: episode: 569, duration: 1.168s, episode steps: 140, steps per second: 120, episode reward: 140.000,

  94838/500000: episode: 594, duration: 1.698s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.019 [-1.181, 1.381], loss: 10.344338, mean_absolute_error: 45.337162, mean_q: 90.482460
  95038/500000: episode: 595, duration: 1.708s, episode steps: 200, steps per second: 117, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.013 [-1.251, 1.215], loss: 13.105999, mean_absolute_error: 45.193428, mean_q: 90.052734
  95238/500000: episode: 596, duration: 1.678s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.029 [-1.525, 1.517], loss: 16.112947, mean_absolute_error: 45.261494, mean_q: 90.322334
  95259/500000: episode: 597, duration: 0.187s, episode steps: 21, steps per second: 112, episode reward: 21.000, mean

 100077/500000: episode: 622, duration: 1.837s, episode steps: 200, steps per second: 109, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.129 [-1.104, 1.160], loss: 14.122185, mean_absolute_error: 41.913349, mean_q: 83.444923
 100277/500000: episode: 623, duration: 1.722s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.014 [-1.483, 1.358], loss: 14.315076, mean_absolute_error: 41.763897, mean_q: 83.148552
 100477/500000: episode: 624, duration: 1.677s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.019 [-0.659, 0.888], loss: 9.006528, mean_absolute_error: 41.132412, mean_q: 82.214973
 100677/500000: episode: 625, duration: 1.655s, episode steps: 200, steps per second: 121, episode reward: 200.000, me

 105387/500000: episode: 650, duration: 1.700s, episode steps: 190, steps per second: 112, episode reward: 190.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.281 [-2.424, 1.002], loss: 11.691085, mean_absolute_error: 38.606598, mean_q: 76.922562
 105587/500000: episode: 651, duration: 1.799s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.153 [-1.882, 0.986], loss: 8.298207, mean_absolute_error: 38.487736, mean_q: 76.809280
 105787/500000: episode: 652, duration: 1.812s, episode steps: 200, steps per second: 110, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.173 [-1.648, 1.888], loss: 7.814432, mean_absolute_error: 38.591278, mean_q: 77.160736
 105902/500000: episode: 653, duration: 0.993s, episode steps: 115, steps per second: 116, episode reward: 115.000, me

 110348/500000: episode: 678, duration: 1.044s, episode steps: 129, steps per second: 124, episode reward: 129.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.527 [0.000, 1.000], mean observation: 0.351 [-0.769, 1.549], loss: 13.876936, mean_absolute_error: 38.172852, mean_q: 76.080490
 110548/500000: episode: 679, duration: 1.684s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.173 [-1.288, 1.530], loss: 8.459229, mean_absolute_error: 38.193348, mean_q: 76.225121
 110626/500000: episode: 680, duration: 0.635s, episode steps: 78, steps per second: 123, episode reward: 78.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.078 [-0.983, 1.952], loss: 8.120513, mean_absolute_error: 38.322933, mean_q: 76.494644
 110826/500000: episode: 681, duration: 1.697s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean r

 115145/500000: episode: 706, duration: 1.257s, episode steps: 156, steps per second: 124, episode reward: 156.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.416 [-0.889, 2.747], loss: 9.975401, mean_absolute_error: 39.138329, mean_q: 78.185387
 115207/500000: episode: 707, duration: 0.532s, episode steps: 62, steps per second: 116, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.039 [-0.835, 1.736], loss: 8.063914, mean_absolute_error: 39.217571, mean_q: 78.291351
 115407/500000: episode: 708, duration: 1.685s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.155 [-1.714, 1.680], loss: 13.230325, mean_absolute_error: 39.254772, mean_q: 78.263161
 115607/500000: episode: 709, duration: 1.726s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean r

 120094/500000: episode: 734, duration: 1.673s, episode steps: 200, steps per second: 120, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.137 [-0.997, 1.284], loss: 10.734838, mean_absolute_error: 41.169601, mean_q: 82.296913
 120294/500000: episode: 735, duration: 1.662s, episode steps: 200, steps per second: 120, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.137 [-1.239, 1.501], loss: 12.888020, mean_absolute_error: 41.199814, mean_q: 82.529411
 120494/500000: episode: 736, duration: 1.685s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.141 [-1.388, 1.255], loss: 11.939819, mean_absolute_error: 41.694557, mean_q: 83.750954
 120694/500000: episode: 737, duration: 1.773s, episode steps: 200, steps per second: 113, episode reward: 200.000, me

 125328/500000: episode: 762, duration: 1.626s, episode steps: 200, steps per second: 123, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.129 [-0.995, 1.492], loss: 17.617937, mean_absolute_error: 55.248623, mean_q: 110.946327
 125528/500000: episode: 763, duration: 1.668s, episode steps: 200, steps per second: 120, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.099 [-1.016, 1.097], loss: 26.767872, mean_absolute_error: 55.388481, mean_q: 110.961113
 125728/500000: episode: 764, duration: 1.608s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.143 [-0.999, 0.988], loss: 19.937572, mean_absolute_error: 55.148266, mean_q: 110.863327
 125928/500000: episode: 765, duration: 1.614s, episode steps: 200, steps per second: 124, episode reward: 200.000,

 130829/500000: episode: 790, duration: 1.798s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.535 [0.000, 1.000], mean observation: -0.001 [-1.345, 2.913], loss: 22.186090, mean_absolute_error: 55.254238, mean_q: 110.809929
 131029/500000: episode: 791, duration: 1.812s, episode steps: 200, steps per second: 110, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.535 [0.000, 1.000], mean observation: -0.118 [-1.861, 2.750], loss: 26.466597, mean_absolute_error: 55.365822, mean_q: 111.116035
 131229/500000: episode: 792, duration: 1.800s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.535 [0.000, 1.000], mean observation: -0.056 [-1.904, 2.724], loss: 19.360168, mean_absolute_error: 55.691479, mean_q: 112.301933
 131429/500000: episode: 793, duration: 1.791s, episode steps: 200, steps per second: 112, episode reward: 200.0

 134569/500000: episode: 818, duration: 1.656s, episode steps: 180, steps per second: 109, episode reward: 180.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.288 [-1.868, 1.101], loss: 32.368797, mean_absolute_error: 63.784515, mean_q: 128.230408
 134769/500000: episode: 819, duration: 1.761s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.206 [-1.219, 0.917], loss: 26.526962, mean_absolute_error: 63.948692, mean_q: 128.720016
 134969/500000: episode: 820, duration: 1.877s, episode steps: 200, steps per second: 107, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.211 [-1.294, 0.971], loss: 20.412148, mean_absolute_error: 63.506180, mean_q: 127.741463
 135169/500000: episode: 821, duration: 1.846s, episode steps: 200, steps per second: 108, episode reward: 200.0

 139788/500000: episode: 846, duration: 1.815s, episode steps: 200, steps per second: 110, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.008 [-1.717, 1.526], loss: 13.955335, mean_absolute_error: 58.056168, mean_q: 116.452110
 139988/500000: episode: 847, duration: 1.842s, episode steps: 200, steps per second: 109, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.113 [-1.117, 1.406], loss: 19.583397, mean_absolute_error: 58.043663, mean_q: 116.525429
 140179/500000: episode: 848, duration: 1.655s, episode steps: 191, steps per second: 115, episode reward: 191.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.497 [0.000, 1.000], mean observation: -0.132 [-1.309, 1.305], loss: 18.823978, mean_absolute_error: 57.949413, mean_q: 116.273582
 140379/500000: episode: 849, duration: 1.799s, episode steps: 200, steps per second: 111, episode reward: 200.00

 144157/500000: episode: 874, duration: 1.064s, episode steps: 122, steps per second: 115, episode reward: 122.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: 0.070 [-1.737, 1.692], loss: 18.371363, mean_absolute_error: 56.966347, mean_q: 114.547493
 144357/500000: episode: 875, duration: 1.755s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.174 [-1.494, 1.322], loss: 24.551888, mean_absolute_error: 57.660255, mean_q: 115.734001
 144369/500000: episode: 876, duration: 0.108s, episode steps: 12, steps per second: 111, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.120 [-0.929, 1.484], loss: 21.860933, mean_absolute_error: 57.465855, mean_q: 115.305779
 144435/500000: episode: 877, duration: 0.566s, episode steps: 66, steps per second: 117, episode reward: 66.000, mea

 148905/500000: episode: 903, duration: 1.763s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.176 [-1.752, 1.977], loss: 17.515587, mean_absolute_error: 54.983559, mean_q: 110.161278
 149064/500000: episode: 904, duration: 1.325s, episode steps: 159, steps per second: 120, episode reward: 159.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: -0.402 [-2.401, 0.803], loss: 18.240866, mean_absolute_error: 54.471451, mean_q: 109.142616
 149253/500000: episode: 905, duration: 1.683s, episode steps: 189, steps per second: 112, episode reward: 189.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.487 [0.000, 1.000], mean observation: -0.307 [-2.421, 1.449], loss: 27.636698, mean_absolute_error: 54.358387, mean_q: 108.668373
 149453/500000: episode: 906, duration: 1.825s, episode steps: 200, steps per second: 110, episode reward: 200.00

 153395/500000: episode: 931, duration: 1.625s, episode steps: 178, steps per second: 110, episode reward: 178.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.101 [-2.579, 1.842], loss: 19.264563, mean_absolute_error: 55.130325, mean_q: 110.921608
 153579/500000: episode: 932, duration: 1.663s, episode steps: 184, steps per second: 111, episode reward: 184.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.473 [0.000, 1.000], mean observation: -0.424 [-2.415, 1.759], loss: 25.522758, mean_absolute_error: 55.407780, mean_q: 111.586449
 153779/500000: episode: 933, duration: 1.766s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.350 [-2.022, 1.518], loss: 26.485144, mean_absolute_error: 56.014404, mean_q: 112.399506
 153872/500000: episode: 934, duration: 0.776s, episode steps: 93, steps per second: 120, episode reward: 93.000

 158374/500000: episode: 959, duration: 1.659s, episode steps: 200, steps per second: 121, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.158 [-1.369, 1.259], loss: 30.617039, mean_absolute_error: 61.724277, mean_q: 124.057556
 158574/500000: episode: 960, duration: 1.852s, episode steps: 200, steps per second: 108, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.151 [-1.402, 1.365], loss: 25.554798, mean_absolute_error: 62.326660, mean_q: 125.530617
 158774/500000: episode: 961, duration: 1.764s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.124 [-1.263, 1.112], loss: 23.535961, mean_absolute_error: 62.336601, mean_q: 125.465370
 158974/500000: episode: 962, duration: 1.847s, episode steps: 200, steps per second: 108, episode reward: 200.000,

 163099/500000: episode: 988, duration: 0.913s, episode steps: 113, steps per second: 124, episode reward: 113.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.485 [-1.165, 2.403], loss: 25.085896, mean_absolute_error: 62.401127, mean_q: 125.635757
 163205/500000: episode: 989, duration: 0.869s, episode steps: 106, steps per second: 122, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.502 [-0.930, 2.405], loss: 37.173035, mean_absolute_error: 62.414307, mean_q: 125.415138
 163320/500000: episode: 990, duration: 0.901s, episode steps: 115, steps per second: 128, episode reward: 115.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.493 [-0.970, 2.414], loss: 33.989574, mean_absolute_error: 62.758533, mean_q: 126.429749
 163335/500000: episode: 991, duration: 0.132s, episode steps: 15, steps per second: 114, episode reward: 15.000, m

 165939/500000: episode: 1016, duration: 1.687s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.396 [-0.763, 2.312], loss: 35.102798, mean_absolute_error: 71.607979, mean_q: 144.580795
 166139/500000: episode: 1017, duration: 1.667s, episode steps: 200, steps per second: 120, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.329 [-0.989, 2.103], loss: 36.205185, mean_absolute_error: 72.121765, mean_q: 145.177963
 166287/500000: episode: 1018, duration: 1.199s, episode steps: 148, steps per second: 123, episode reward: 148.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.403 [-0.648, 2.409], loss: 63.404812, mean_absolute_error: 71.641129, mean_q: 143.843796
 166487/500000: episode: 1019, duration: 1.758s, episode steps: 200, steps per second: 114, episode reward: 200.

 170910/500000: episode: 1044, duration: 1.749s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.299 [-2.631, 1.867], loss: 22.804506, mean_absolute_error: 59.615166, mean_q: 119.532784
 171110/500000: episode: 1045, duration: 1.668s, episode steps: 200, steps per second: 120, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.334 [-2.605, 1.804], loss: 30.029160, mean_absolute_error: 59.291836, mean_q: 118.862267
 171274/500000: episode: 1046, duration: 1.447s, episode steps: 164, steps per second: 113, episode reward: 164.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.457 [0.000, 1.000], mean observation: -0.370 [-2.614, 0.880], loss: 22.219152, mean_absolute_error: 59.071255, mean_q: 118.564224
 171474/500000: episode: 1047, duration: 1.746s, episode steps: 200, steps per second: 115, episode reward: 2

 175540/500000: episode: 1072, duration: 1.661s, episode steps: 185, steps per second: 111, episode reward: 185.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.459 [0.000, 1.000], mean observation: -0.321 [-2.724, 1.214], loss: 19.622953, mean_absolute_error: 51.168900, mean_q: 102.617622
 175740/500000: episode: 1073, duration: 1.766s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.062 [-0.785, 1.661], loss: 15.090061, mean_absolute_error: 51.050518, mean_q: 102.570023
 175788/500000: episode: 1074, duration: 0.397s, episode steps: 48, steps per second: 121, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: -0.182 [-2.225, 1.848], loss: 25.014795, mean_absolute_error: 51.438801, mean_q: 102.875450
 175948/500000: episode: 1075, duration: 1.338s, episode steps: 160, steps per second: 120, episode reward: 160.

 179735/500000: episode: 1100, duration: 0.707s, episode steps: 87, steps per second: 123, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.494 [0.000, 1.000], mean observation: -0.064 [-1.749, 1.494], loss: 23.665234, mean_absolute_error: 50.800488, mean_q: 102.079071
 179935/500000: episode: 1101, duration: 1.676s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.039 [-1.854, 2.305], loss: 20.486628, mean_absolute_error: 51.016232, mean_q: 102.692024
 180096/500000: episode: 1102, duration: 1.327s, episode steps: 161, steps per second: 121, episode reward: 161.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.064 [-2.403, 1.828], loss: 25.215313, mean_absolute_error: 51.213181, mean_q: 102.997002
 180296/500000: episode: 1103, duration: 1.766s, episode steps: 200, steps per second: 113, episode reward: 200.

 184221/500000: episode: 1128, duration: 0.940s, episode steps: 115, steps per second: 122, episode reward: 115.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.460 [-3.004, 1.736], loss: 19.097946, mean_absolute_error: 51.328468, mean_q: 103.061363
 184368/500000: episode: 1129, duration: 1.167s, episode steps: 147, steps per second: 126, episode reward: 147.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.307 [-3.520, 2.008], loss: 19.134855, mean_absolute_error: 51.309624, mean_q: 102.981331
 184534/500000: episode: 1130, duration: 1.424s, episode steps: 166, steps per second: 117, episode reward: 166.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.332 [-2.664, 1.861], loss: 15.128757, mean_absolute_error: 51.234547, mean_q: 102.908348
 184731/500000: episode: 1131, duration: 1.744s, episode steps: 197, steps per second: 113, episode reward: 1

 187615/500000: episode: 1156, duration: 0.832s, episode steps: 99, steps per second: 119, episode reward: 99.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.424 [0.000, 1.000], mean observation: -0.539 [-2.962, 0.757], loss: 24.790096, mean_absolute_error: 51.572289, mean_q: 103.904625
 187710/500000: episode: 1157, duration: 0.770s, episode steps: 95, steps per second: 123, episode reward: 95.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.411 [0.000, 1.000], mean observation: -0.508 [-3.489, 1.649], loss: 14.753289, mean_absolute_error: 51.624123, mean_q: 104.438126
 187810/500000: episode: 1158, duration: 0.827s, episode steps: 100, steps per second: 121, episode reward: 100.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: -0.518 [-3.718, 1.927], loss: 18.467680, mean_absolute_error: 51.957325, mean_q: 104.892899
 187915/500000: episode: 1159, duration: 0.879s, episode steps: 105, steps per second: 119, episode reward: 105.0

 190635/500000: episode: 1184, duration: 0.941s, episode steps: 113, steps per second: 120, episode reward: 113.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.451 [0.000, 1.000], mean observation: -0.498 [-2.414, 0.607], loss: 16.043373, mean_absolute_error: 56.931648, mean_q: 114.934776
 190741/500000: episode: 1185, duration: 0.859s, episode steps: 106, steps per second: 123, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.443 [0.000, 1.000], mean observation: -0.519 [-2.415, 0.755], loss: 29.290421, mean_absolute_error: 56.916363, mean_q: 114.545792
 190844/500000: episode: 1186, duration: 0.858s, episode steps: 103, steps per second: 120, episode reward: 103.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.447 [0.000, 1.000], mean observation: -0.538 [-2.602, 0.748], loss: 24.372950, mean_absolute_error: 57.485817, mean_q: 115.811531
 190959/500000: episode: 1187, duration: 0.926s, episode steps: 115, steps per second: 124, episode reward: 1

 193916/500000: episode: 1212, duration: 1.093s, episode steps: 133, steps per second: 122, episode reward: 133.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: -0.413 [-3.012, 1.562], loss: 31.358751, mean_absolute_error: 61.777210, mean_q: 124.184433
 194038/500000: episode: 1213, duration: 1.020s, episode steps: 122, steps per second: 120, episode reward: 122.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.451 [0.000, 1.000], mean observation: -0.419 [-3.228, 2.091], loss: 17.591482, mean_absolute_error: 62.207458, mean_q: 125.789543
 194173/500000: episode: 1214, duration: 1.126s, episode steps: 135, steps per second: 120, episode reward: 135.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.442 [-2.410, 1.040], loss: 26.405735, mean_absolute_error: 62.306335, mean_q: 125.746178
 194307/500000: episode: 1215, duration: 1.125s, episode steps: 134, steps per second: 119, episode reward: 1

 198971/500000: episode: 1240, duration: 1.718s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.012 [-1.500, 1.371], loss: 24.197491, mean_absolute_error: 62.428921, mean_q: 125.881149
 199171/500000: episode: 1241, duration: 1.678s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.018 [-0.798, 0.725], loss: 18.470661, mean_absolute_error: 62.172379, mean_q: 125.611839
 199371/500000: episode: 1242, duration: 1.732s, episode steps: 200, steps per second: 115, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.010 [-0.829, 0.746], loss: 19.496433, mean_absolute_error: 62.022011, mean_q: 125.326797
 199571/500000: episode: 1243, duration: 1.705s, episode steps: 200, steps per second: 117, episode reward: 200

 203767/500000: episode: 1268, duration: 0.727s, episode steps: 87, steps per second: 120, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.460 [0.000, 1.000], mean observation: 0.035 [-2.139, 2.376], loss: 24.159662, mean_absolute_error: 58.613045, mean_q: 117.827217
 203814/500000: episode: 1269, duration: 0.385s, episode steps: 47, steps per second: 122, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: 0.001 [-1.576, 1.898], loss: 29.824978, mean_absolute_error: 58.281940, mean_q: 117.226959
 204014/500000: episode: 1270, duration: 1.621s, episode steps: 200, steps per second: 123, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.033 [-1.631, 1.716], loss: 19.843739, mean_absolute_error: 57.574776, mean_q: 116.088226
 204135/500000: episode: 1271, duration: 0.937s, episode steps: 121, steps per second: 129, episode reward: 121.000,

 208587/500000: episode: 1296, duration: 1.795s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.476 [-1.426, 2.415], loss: 11.666675, mean_absolute_error: 52.344357, mean_q: 105.505295
 208787/500000: episode: 1297, duration: 1.761s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.265 [-2.130, 1.730], loss: 13.479947, mean_absolute_error: 52.042347, mean_q: 104.816025
 208919/500000: episode: 1298, duration: 1.112s, episode steps: 132, steps per second: 119, episode reward: 132.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: -0.199 [-2.637, 2.094], loss: 14.853431, mean_absolute_error: 51.908630, mean_q: 104.178101
 209119/500000: episode: 1299, duration: 1.816s, episode steps: 200, steps per second: 110, episode reward: 20

 212499/500000: episode: 1324, duration: 0.217s, episode steps: 24, steps per second: 110, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.583 [0.000, 1.000], mean observation: 0.114 [-0.816, 1.255], loss: 12.061517, mean_absolute_error: 46.848377, mean_q: 94.243309
 212605/500000: episode: 1325, duration: 0.946s, episode steps: 106, steps per second: 112, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: 0.500 [-1.359, 2.821], loss: 14.522631, mean_absolute_error: 47.206429, mean_q: 94.745750
 212764/500000: episode: 1326, duration: 1.367s, episode steps: 159, steps per second: 116, episode reward: 159.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.503 [0.000, 1.000], mean observation: 0.500 [-1.584, 2.611], loss: 16.827032, mean_absolute_error: 47.070412, mean_q: 94.818756
 212796/500000: episode: 1327, duration: 0.311s, episode steps: 32, steps per second: 103, episode reward: 32.000, me

 217224/500000: episode: 1352, duration: 1.846s, episode steps: 200, steps per second: 108, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.275 [-1.519, 1.707], loss: 7.639101, mean_absolute_error: 52.149792, mean_q: 105.508408
 217424/500000: episode: 1353, duration: 1.803s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.337 [-1.415, 1.992], loss: 11.582202, mean_absolute_error: 52.383556, mean_q: 105.658585
 217624/500000: episode: 1354, duration: 1.757s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.174 [-1.732, 1.734], loss: 20.541306, mean_absolute_error: 52.460136, mean_q: 105.668030
 217824/500000: episode: 1355, duration: 1.959s, episode steps: 200, steps per second: 102, episode reward: 200.0

 222531/500000: episode: 1380, duration: 0.243s, episode steps: 27, steps per second: 111, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: 0.099 [-0.799, 1.166], loss: 21.810472, mean_absolute_error: 48.351383, mean_q: 97.572556
 222731/500000: episode: 1381, duration: 1.805s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.283 [-1.531, 1.486], loss: 16.379929, mean_absolute_error: 47.717697, mean_q: 96.274040
 222931/500000: episode: 1382, duration: 1.780s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.282 [-1.440, 1.698], loss: 21.358570, mean_absolute_error: 47.572445, mean_q: 95.666168
 223131/500000: episode: 1383, duration: 1.731s, episode steps: 200, steps per second: 116, episode reward: 200.000, 

 227369/500000: episode: 1408, duration: 1.791s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.216 [-2.263, 2.621], loss: 12.076535, mean_absolute_error: 43.852539, mean_q: 88.145760
 227414/500000: episode: 1409, duration: 0.399s, episode steps: 45, steps per second: 113, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: -0.016 [-1.638, 1.553], loss: 16.958899, mean_absolute_error: 43.604683, mean_q: 87.550224
 227614/500000: episode: 1410, duration: 1.830s, episode steps: 200, steps per second: 109, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.464 [-1.012, 2.246], loss: 9.915142, mean_absolute_error: 43.665897, mean_q: 88.069611
 227687/500000: episode: 1411, duration: 0.629s, episode steps: 73, steps per second: 116, episode reward: 73.000, me

 232178/500000: episode: 1436, duration: 0.887s, episode steps: 106, steps per second: 119, episode reward: 106.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.100 [-1.306, 1.354], loss: 21.042601, mean_absolute_error: 45.280979, mean_q: 90.918976
 232378/500000: episode: 1437, duration: 1.747s, episode steps: 200, steps per second: 115, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.246 [-2.269, 1.915], loss: 8.772726, mean_absolute_error: 45.027657, mean_q: 90.713127
 232578/500000: episode: 1438, duration: 1.728s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.258 [-1.869, 1.589], loss: 18.257977, mean_absolute_error: 44.656387, mean_q: 89.559036
 232695/500000: episode: 1439, duration: 1.008s, episode steps: 117, steps per second: 116, episode reward: 117.00

 236505/500000: episode: 1465, duration: 1.312s, episode steps: 148, steps per second: 113, episode reward: 148.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.541 [0.000, 1.000], mean observation: 0.022 [-2.329, 2.941], loss: 6.166907, mean_absolute_error: 41.399078, mean_q: 83.274086
 236552/500000: episode: 1466, duration: 0.429s, episode steps: 47, steps per second: 109, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.447 [0.000, 1.000], mean observation: -0.155 [-2.448, 2.200], loss: 14.144370, mean_absolute_error: 41.703560, mean_q: 83.442833
 236657/500000: episode: 1467, duration: 0.869s, episode steps: 105, steps per second: 121, episode reward: 105.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: -0.119 [-2.227, 1.995], loss: 10.566971, mean_absolute_error: 41.863445, mean_q: 84.050331
 236689/500000: episode: 1468, duration: 0.293s, episode steps: 32, steps per second: 109, episode reward: 32.000, m

 239956/500000: episode: 1493, duration: 0.868s, episode steps: 101, steps per second: 116, episode reward: 101.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.011 [-1.611, 1.495], loss: 15.718637, mean_absolute_error: 40.795994, mean_q: 82.137695
 240156/500000: episode: 1494, duration: 1.795s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.028 [-2.456, 2.277], loss: 12.864606, mean_absolute_error: 40.874557, mean_q: 82.166931
 240356/500000: episode: 1495, duration: 1.881s, episode steps: 200, steps per second: 106, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.068 [-2.430, 2.104], loss: 13.676791, mean_absolute_error: 40.761703, mean_q: 82.354118
 240556/500000: episode: 1496, duration: 1.759s, episode steps: 200, steps per second: 114, episode reward: 200.00

 243759/500000: episode: 1521, duration: 0.241s, episode steps: 28, steps per second: 116, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.052 [-1.911, 1.314], loss: 13.280135, mean_absolute_error: 38.967999, mean_q: 78.167053
 243857/500000: episode: 1522, duration: 0.848s, episode steps: 98, steps per second: 116, episode reward: 98.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: -0.097 [-2.141, 2.043], loss: 10.196816, mean_absolute_error: 38.802032, mean_q: 77.979324
 243903/500000: episode: 1523, duration: 0.404s, episode steps: 46, steps per second: 114, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.457 [0.000, 1.000], mean observation: -0.157 [-2.516, 2.285], loss: 11.282732, mean_absolute_error: 39.085514, mean_q: 78.494278
 244004/500000: episode: 1524, duration: 0.846s, episode steps: 101, steps per second: 119, episode reward: 101.000, m

 247212/500000: episode: 1550, duration: 1.530s, episode steps: 164, steps per second: 107, episode reward: 164.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: 0.474 [-1.684, 2.415], loss: 10.214026, mean_absolute_error: 38.484615, mean_q: 77.375824
 247335/500000: episode: 1551, duration: 1.054s, episode steps: 123, steps per second: 117, episode reward: 123.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.504 [0.000, 1.000], mean observation: -0.185 [-1.912, 1.920], loss: 13.248992, mean_absolute_error: 38.635273, mean_q: 77.244339
 247535/500000: episode: 1552, duration: 1.802s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: -0.078 [-2.003, 2.046], loss: 11.454882, mean_absolute_error: 38.401806, mean_q: 77.278381
 247567/500000: episode: 1553, duration: 0.281s, episode steps: 32, steps per second: 114, episode reward: 32.000

 251537/500000: episode: 1578, duration: 1.747s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.054 [-2.092, 2.264], loss: 7.862479, mean_absolute_error: 38.017956, mean_q: 76.716446
 251737/500000: episode: 1579, duration: 1.896s, episode steps: 200, steps per second: 105, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.004 [-2.091, 2.614], loss: 15.949161, mean_absolute_error: 38.027294, mean_q: 76.552010
 251937/500000: episode: 1580, duration: 1.791s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.046 [-2.286, 2.420], loss: 9.955457, mean_absolute_error: 38.134991, mean_q: 76.881599
 252137/500000: episode: 1581, duration: 1.789s, episode steps: 200, steps per second: 112, episode reward: 200.00

 256063/500000: episode: 1606, duration: 0.274s, episode steps: 31, steps per second: 113, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: -0.029 [-1.488, 1.162], loss: 13.261134, mean_absolute_error: 39.140121, mean_q: 78.782661
 256256/500000: episode: 1607, duration: 1.693s, episode steps: 193, steps per second: 114, episode reward: 193.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.503 [0.000, 1.000], mean observation: 0.417 [-1.356, 2.403], loss: 14.351689, mean_absolute_error: 38.813519, mean_q: 78.372871
 256456/500000: episode: 1608, duration: 1.789s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.466 [-0.891, 2.394], loss: 9.446841, mean_absolute_error: 38.727310, mean_q: 78.354713
 256656/500000: episode: 1609, duration: 1.769s, episode steps: 200, steps per second: 113, episode reward: 200.000, 

 260313/500000: episode: 1634, duration: 1.047s, episode steps: 123, steps per second: 117, episode reward: 123.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: -0.062 [-2.083, 1.983], loss: 9.416745, mean_absolute_error: 37.874104, mean_q: 76.486755
 260390/500000: episode: 1635, duration: 0.681s, episode steps: 77, steps per second: 113, episode reward: 77.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.225 [-1.823, 2.238], loss: 10.467446, mean_absolute_error: 37.709862, mean_q: 76.183121
 260563/500000: episode: 1636, duration: 1.577s, episode steps: 173, steps per second: 110, episode reward: 173.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.465 [-1.472, 2.418], loss: 10.606610, mean_absolute_error: 38.022839, mean_q: 76.482361
 260586/500000: episode: 1637, duration: 0.214s, episode steps: 23, steps per second: 108, episode reward: 23.000, me

 264442/500000: episode: 1663, duration: 1.771s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.173 [-1.472, 2.279], loss: 9.666718, mean_absolute_error: 38.453114, mean_q: 77.378105
 264536/500000: episode: 1664, duration: 0.808s, episode steps: 94, steps per second: 116, episode reward: 94.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: -0.193 [-1.690, 1.415], loss: 11.909484, mean_absolute_error: 38.708984, mean_q: 77.980476
 264704/500000: episode: 1665, duration: 1.491s, episode steps: 168, steps per second: 113, episode reward: 168.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.018 [-1.742, 2.400], loss: 12.756845, mean_absolute_error: 38.485954, mean_q: 77.666306
 264822/500000: episode: 1666, duration: 1.044s, episode steps: 118, steps per second: 113, episode reward: 118.000,

 268847/500000: episode: 1691, duration: 1.314s, episode steps: 152, steps per second: 116, episode reward: 152.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: 0.453 [-1.413, 2.413], loss: 7.075787, mean_absolute_error: 38.869648, mean_q: 78.419044
 269047/500000: episode: 1692, duration: 1.775s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.092 [-1.899, 2.021], loss: 9.128819, mean_absolute_error: 39.017639, mean_q: 78.450615
 269247/500000: episode: 1693, duration: 1.795s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.111 [-1.842, 2.227], loss: 7.068451, mean_absolute_error: 38.574688, mean_q: 77.858780
 269427/500000: episode: 1694, duration: 1.644s, episode steps: 180, steps per second: 109, episode reward: 180.000, m

 272836/500000: episode: 1719, duration: 1.825s, episode steps: 200, steps per second: 110, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.121 [-1.987, 1.872], loss: 9.041141, mean_absolute_error: 38.202942, mean_q: 77.188583
 273036/500000: episode: 1720, duration: 1.689s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.109 [-2.076, 1.732], loss: 7.787601, mean_absolute_error: 38.170849, mean_q: 77.172806
 273236/500000: episode: 1721, duration: 1.701s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.081 [-1.361, 1.676], loss: 12.132151, mean_absolute_error: 37.979336, mean_q: 76.804489
 273436/500000: episode: 1722, duration: 1.671s, episode steps: 200, steps per second: 120, episode reward: 200.00

 277859/500000: episode: 1747, duration: 1.712s, episode steps: 200, steps per second: 117, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.009 [-1.634, 1.865], loss: 7.970906, mean_absolute_error: 38.804039, mean_q: 78.625336
 278059/500000: episode: 1748, duration: 1.681s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.018 [-1.842, 1.926], loss: 9.122066, mean_absolute_error: 38.883438, mean_q: 78.746239
 278259/500000: episode: 1749, duration: 1.764s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.010 [-1.677, 1.868], loss: 8.779367, mean_absolute_error: 39.053165, mean_q: 78.960960
 278459/500000: episode: 1750, duration: 1.753s, episode steps: 200, steps per second: 114, episode reward: 200.000,

 282892/500000: episode: 1775, duration: 1.812s, episode steps: 200, steps per second: 110, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.122 [-1.818, 1.772], loss: 13.514265, mean_absolute_error: 40.301125, mean_q: 81.096329
 283092/500000: episode: 1776, duration: 1.753s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.101 [-1.876, 2.199], loss: 12.772187, mean_absolute_error: 39.956383, mean_q: 80.736679
 283292/500000: episode: 1777, duration: 1.757s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.107 [-2.105, 1.883], loss: 12.544075, mean_absolute_error: 40.044762, mean_q: 80.785843
 283492/500000: episode: 1778, duration: 1.778s, episode steps: 200, steps per second: 112, episode reward: 200.000

 287565/500000: episode: 1803, duration: 1.743s, episode steps: 200, steps per second: 115, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.251 [-1.149, 1.703], loss: 14.140550, mean_absolute_error: 41.825508, mean_q: 84.708054
 287765/500000: episode: 1804, duration: 1.792s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.305 [-1.839, 1.872], loss: 14.011319, mean_absolute_error: 41.913734, mean_q: 85.082588
 287965/500000: episode: 1805, duration: 1.770s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.257 [-1.445, 1.727], loss: 8.281940, mean_absolute_error: 42.130741, mean_q: 85.928947
 288165/500000: episode: 1806, duration: 1.748s, episode steps: 200, steps per second: 114, episode reward: 200.000,

 292767/500000: episode: 1831, duration: 1.757s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.006 [-1.525, 1.821], loss: 11.255601, mean_absolute_error: 42.969433, mean_q: 87.179977
 292967/500000: episode: 1832, duration: 1.723s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.076 [-1.557, 1.634], loss: 13.986999, mean_absolute_error: 42.935158, mean_q: 86.631622
 293167/500000: episode: 1833, duration: 1.748s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.252 [-1.606, 1.851], loss: 13.821482, mean_absolute_error: 42.722389, mean_q: 86.323662
 293367/500000: episode: 1834, duration: 1.727s, episode steps: 200, steps per second: 116, episode reward: 200.000

 297354/500000: episode: 1859, duration: 1.596s, episode steps: 184, steps per second: 115, episode reward: 184.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.265 [-1.149, 1.888], loss: 9.878450, mean_absolute_error: 41.476555, mean_q: 84.142647
 297554/500000: episode: 1860, duration: 1.780s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.429 [-1.605, 2.456], loss: 15.186353, mean_absolute_error: 41.485409, mean_q: 83.736649
 297754/500000: episode: 1861, duration: 1.770s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.411 [-0.942, 2.208], loss: 14.365879, mean_absolute_error: 41.490376, mean_q: 83.926292
 297952/500000: episode: 1862, duration: 1.797s, episode steps: 198, steps per second: 110, episode reward: 198.000,

 302012/500000: episode: 1887, duration: 1.579s, episode steps: 175, steps per second: 111, episode reward: 175.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.474 [-1.063, 2.406], loss: 8.711987, mean_absolute_error: 41.454632, mean_q: 84.049881
 302212/500000: episode: 1888, duration: 1.752s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.279 [-1.068, 1.720], loss: 11.197938, mean_absolute_error: 41.689529, mean_q: 84.478554
 302412/500000: episode: 1889, duration: 1.747s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.282 [-1.482, 2.111], loss: 12.221652, mean_absolute_error: 41.394638, mean_q: 83.984550
 302612/500000: episode: 1890, duration: 1.799s, episode steps: 200, steps per second: 111, episode reward: 200.000,

 306556/500000: episode: 1915, duration: 1.770s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.190 [-1.187, 1.711], loss: 12.669525, mean_absolute_error: 42.263271, mean_q: 85.666756
 306756/500000: episode: 1916, duration: 1.793s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.291 [-1.879, 2.024], loss: 15.649639, mean_absolute_error: 42.611553, mean_q: 86.392395
 306782/500000: episode: 1917, duration: 0.221s, episode steps: 26, steps per second: 118, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: -0.024 [-1.829, 1.398], loss: 23.734129, mean_absolute_error: 42.364651, mean_q: 85.048531
 306982/500000: episode: 1918, duration: 1.773s, episode steps: 200, steps per second: 113, episode reward: 200.000,

 310922/500000: episode: 1943, duration: 2.243s, episode steps: 200, steps per second: 89, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.165 [-1.684, 1.527], loss: 7.636026, mean_absolute_error: 41.334110, mean_q: 83.539864
 311122/500000: episode: 1944, duration: 2.290s, episode steps: 200, steps per second: 87, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.041 [-1.866, 2.047], loss: 8.770672, mean_absolute_error: 41.274620, mean_q: 83.401894
 311171/500000: episode: 1945, duration: 0.546s, episode steps: 49, steps per second: 90, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.102 [-1.937, 1.949], loss: 4.125997, mean_absolute_error: 40.979229, mean_q: 83.113724
 311220/500000: episode: 1946, duration: 0.541s, episode steps: 49, steps per second: 91, episode reward: 49.000, mean rew

 314321/500000: episode: 1971, duration: 1.207s, episode steps: 145, steps per second: 120, episode reward: 145.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.394 [-1.984, 1.723], loss: 8.481707, mean_absolute_error: 40.060642, mean_q: 81.092651
 314452/500000: episode: 1972, duration: 1.178s, episode steps: 131, steps per second: 111, episode reward: 131.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.466 [0.000, 1.000], mean observation: -0.437 [-2.799, 1.641], loss: 12.060019, mean_absolute_error: 39.872219, mean_q: 80.751526
 314471/500000: episode: 1973, duration: 0.202s, episode steps: 19, steps per second: 94, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.060 [-1.660, 1.187], loss: 13.526069, mean_absolute_error: 39.552292, mean_q: 80.801498
 314671/500000: episode: 1974, duration: 1.771s, episode steps: 200, steps per second: 113, episode reward: 200.000,

 319032/500000: episode: 1999, duration: 1.715s, episode steps: 200, steps per second: 117, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.025 [-2.157, 2.138], loss: 12.160184, mean_absolute_error: 39.703892, mean_q: 80.216393
 319093/500000: episode: 2000, duration: 0.514s, episode steps: 61, steps per second: 119, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.492 [0.000, 1.000], mean observation: -0.054 [-1.608, 1.586], loss: 11.665347, mean_absolute_error: 39.780457, mean_q: 80.348473
 319128/500000: episode: 2001, duration: 0.303s, episode steps: 35, steps per second: 116, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: 0.079 [-2.020, 1.962], loss: 19.881475, mean_absolute_error: 40.147438, mean_q: 80.713951
 319167/500000: episode: 2002, duration: 0.327s, episode steps: 39, steps per second: 119, episode reward: 39.000, mea

 323659/500000: episode: 2027, duration: 1.579s, episode steps: 200, steps per second: 127, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.003 [-2.094, 1.985], loss: 12.925139, mean_absolute_error: 40.101151, mean_q: 81.097214
 323730/500000: episode: 2028, duration: 0.556s, episode steps: 71, steps per second: 128, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.437 [0.000, 1.000], mean observation: 0.142 [-1.913, 2.716], loss: 14.219652, mean_absolute_error: 40.181320, mean_q: 81.189499
 323930/500000: episode: 2029, duration: 1.632s, episode steps: 200, steps per second: 123, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.043 [-1.734, 1.528], loss: 10.320708, mean_absolute_error: 39.945389, mean_q: 80.922249
 324130/500000: episode: 2030, duration: 1.647s, episode steps: 200, steps per second: 121, episode reward: 200.000,

 328143/500000: episode: 2055, duration: 1.649s, episode steps: 200, steps per second: 121, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.124 [-1.752, 1.966], loss: 12.879153, mean_absolute_error: 40.267426, mean_q: 81.674652
 328343/500000: episode: 2056, duration: 1.659s, episode steps: 200, steps per second: 121, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.121 [-1.566, 1.714], loss: 10.058211, mean_absolute_error: 40.450314, mean_q: 82.022003
 328428/500000: episode: 2057, duration: 0.672s, episode steps: 85, steps per second: 126, episode reward: 85.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: 0.097 [-1.741, 1.773], loss: 15.257142, mean_absolute_error: 40.431896, mean_q: 81.951561
 328628/500000: episode: 2058, duration: 1.651s, episode steps: 200, steps per second: 121, episode reward: 200.000, 

 332006/500000: episode: 2083, duration: 0.584s, episode steps: 70, steps per second: 120, episode reward: 70.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.055 [-1.680, 1.761], loss: 16.137209, mean_absolute_error: 41.997715, mean_q: 85.271515
 332110/500000: episode: 2084, duration: 0.806s, episode steps: 104, steps per second: 129, episode reward: 104.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.143 [-1.360, 1.778], loss: 12.316146, mean_absolute_error: 41.667622, mean_q: 84.321587
 332310/500000: episode: 2085, duration: 1.642s, episode steps: 200, steps per second: 122, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.194 [-1.922, 1.868], loss: 8.408943, mean_absolute_error: 41.604599, mean_q: 84.581932
 332510/500000: episode: 2086, duration: 1.656s, episode steps: 200, steps per second: 121, episode reward: 200.000, m

 336545/500000: episode: 2111, duration: 1.617s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.236 [-1.723, 2.079], loss: 16.459742, mean_absolute_error: 42.489952, mean_q: 86.003029
 336700/500000: episode: 2112, duration: 1.289s, episode steps: 155, steps per second: 120, episode reward: 155.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: 0.214 [-1.883, 2.063], loss: 9.535049, mean_absolute_error: 42.556389, mean_q: 86.631790
 336900/500000: episode: 2113, duration: 1.696s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.222 [-2.061, 2.105], loss: 15.616284, mean_absolute_error: 42.583496, mean_q: 86.326950
 336912/500000: episode: 2114, duration: 0.105s, episode steps: 12, steps per second: 114, episode reward: 12.000, m

 340726/500000: episode: 2139, duration: 1.488s, episode steps: 170, steps per second: 114, episode reward: 170.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.176 [-1.915, 1.551], loss: 13.716663, mean_absolute_error: 43.365372, mean_q: 88.179436
 340926/500000: episode: 2140, duration: 1.925s, episode steps: 200, steps per second: 104, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.190 [-1.071, 1.429], loss: 12.619851, mean_absolute_error: 43.265732, mean_q: 88.104118
 341070/500000: episode: 2141, duration: 1.250s, episode steps: 144, steps per second: 115, episode reward: 144.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: 0.088 [-1.711, 1.481], loss: 11.388365, mean_absolute_error: 43.495090, mean_q: 88.336792
 341270/500000: episode: 2142, duration: 1.894s, episode steps: 200, steps per second: 106, episode reward: 200.000

 345125/500000: episode: 2167, duration: 1.250s, episode steps: 154, steps per second: 123, episode reward: 154.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: 0.145 [-2.142, 1.948], loss: 11.744643, mean_absolute_error: 44.422398, mean_q: 90.188034
 345325/500000: episode: 2168, duration: 1.724s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.102 [-1.971, 1.770], loss: 17.322844, mean_absolute_error: 44.397110, mean_q: 89.848297
 345525/500000: episode: 2169, duration: 1.741s, episode steps: 200, steps per second: 115, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.186 [-1.651, 1.912], loss: 16.481382, mean_absolute_error: 44.240444, mean_q: 89.559708
 345725/500000: episode: 2170, duration: 1.850s, episode steps: 200, steps per second: 108, episode reward: 200.000

 348613/500000: episode: 2196, duration: 0.103s, episode steps: 12, steps per second: 117, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.132 [-1.526, 2.511], loss: 9.440408, mean_absolute_error: 41.269588, mean_q: 84.117279
 348749/500000: episode: 2197, duration: 1.084s, episode steps: 136, steps per second: 125, episode reward: 136.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.507 [0.000, 1.000], mean observation: 0.183 [-1.760, 2.050], loss: 7.642839, mean_absolute_error: 42.173584, mean_q: 85.690300
 348949/500000: episode: 2198, duration: 1.644s, episode steps: 200, steps per second: 122, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.201 [-1.695, 1.869], loss: 9.251082, mean_absolute_error: 41.910213, mean_q: 84.841972
 348986/500000: episode: 2199, duration: 0.315s, episode steps: 37, steps per second: 117, episode reward: 37.000, mean 

 352813/500000: episode: 2224, duration: 1.269s, episode steps: 161, steps per second: 127, episode reward: 161.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.503 [0.000, 1.000], mean observation: 0.134 [-2.035, 1.741], loss: 13.840553, mean_absolute_error: 42.116642, mean_q: 85.566246
 352873/500000: episode: 2225, duration: 0.497s, episode steps: 60, steps per second: 121, episode reward: 60.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: 0.188 [-1.761, 2.072], loss: 14.551075, mean_absolute_error: 41.697876, mean_q: 84.492607
 353039/500000: episode: 2226, duration: 1.296s, episode steps: 166, steps per second: 128, episode reward: 166.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.506 [0.000, 1.000], mean observation: 0.235 [-1.373, 1.526], loss: 10.773912, mean_absolute_error: 42.067596, mean_q: 85.387764
 353239/500000: episode: 2227, duration: 1.714s, episode steps: 200, steps per second: 117, episode reward: 200.000, 

 356598/500000: episode: 2252, duration: 1.801s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.131 [-1.508, 1.880], loss: 8.309871, mean_absolute_error: 40.137489, mean_q: 81.765686
 356798/500000: episode: 2253, duration: 1.904s, episode steps: 200, steps per second: 105, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.154 [-1.960, 1.959], loss: 15.098064, mean_absolute_error: 40.409031, mean_q: 81.833397
 356931/500000: episode: 2254, duration: 1.162s, episode steps: 133, steps per second: 114, episode reward: 133.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.023 [-1.596, 2.219], loss: 18.086723, mean_absolute_error: 40.047665, mean_q: 81.311516
 357111/500000: episode: 2255, duration: 1.648s, episode steps: 180, steps per second: 109, episode reward: 180.000

 359792/500000: episode: 2280, duration: 0.401s, episode steps: 35, steps per second: 87, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.066 [-1.877, 1.763], loss: 14.486449, mean_absolute_error: 40.083344, mean_q: 81.492203
 359865/500000: episode: 2281, duration: 0.810s, episode steps: 73, steps per second: 90, episode reward: 73.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.133 [-1.169, 1.338], loss: 14.396460, mean_absolute_error: 40.602638, mean_q: 82.595345
 360054/500000: episode: 2282, duration: 2.135s, episode steps: 189, steps per second: 89, episode reward: 189.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.503 [0.000, 1.000], mean observation: 0.421 [-1.022, 2.050], loss: 15.657666, mean_absolute_error: 40.603497, mean_q: 82.772034
 360254/500000: episode: 2283, duration: 2.289s, episode steps: 200, steps per second: 87, episode reward: 200.000, mean r

 363385/500000: episode: 2308, duration: 0.299s, episode steps: 26, steps per second: 87, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.047 [-0.788, 1.104], loss: 5.307710, mean_absolute_error: 40.760479, mean_q: 83.462807
 363425/500000: episode: 2309, duration: 0.452s, episode steps: 40, steps per second: 89, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.525 [0.000, 1.000], mean observation: 0.072 [-2.003, 1.895], loss: 13.408099, mean_absolute_error: 41.225365, mean_q: 83.699547
 363625/500000: episode: 2310, duration: 2.230s, episode steps: 200, steps per second: 90, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.105 [-1.949, 1.690], loss: 9.635141, mean_absolute_error: 40.796234, mean_q: 83.214157
 363820/500000: episode: 2311, duration: 2.212s, episode steps: 195, steps per second: 88, episode reward: 195.000, mean rew

 367318/500000: episode: 2336, duration: 1.718s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.017 [-2.044, 1.736], loss: 14.011559, mean_absolute_error: 40.153675, mean_q: 81.658157
 367518/500000: episode: 2337, duration: 1.676s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.026 [-1.885, 1.759], loss: 13.236970, mean_absolute_error: 40.071537, mean_q: 81.481346
 367612/500000: episode: 2338, duration: 0.766s, episode steps: 94, steps per second: 123, episode reward: 94.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: 0.009 [-1.794, 1.714], loss: 10.366278, mean_absolute_error: 40.054344, mean_q: 81.841934
 367725/500000: episode: 2339, duration: 0.918s, episode steps: 113, steps per second: 123, episode reward: 113.000,

 370831/500000: episode: 2364, duration: 1.712s, episode steps: 200, steps per second: 117, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.045 [-1.659, 1.576], loss: 13.791025, mean_absolute_error: 40.614220, mean_q: 82.678108
 371031/500000: episode: 2365, duration: 1.688s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.105 [-1.472, 1.369], loss: 8.301458, mean_absolute_error: 40.988174, mean_q: 83.840294
 371078/500000: episode: 2366, duration: 0.378s, episode steps: 47, steps per second: 124, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.574 [0.000, 1.000], mean observation: 0.107 [-1.820, 1.398], loss: 7.031234, mean_absolute_error: 41.401775, mean_q: 85.036781
 371278/500000: episode: 2367, duration: 1.692s, episode steps: 200, steps per second: 118, episode reward: 200.000, me

 375087/500000: episode: 2392, duration: 1.655s, episode steps: 200, steps per second: 121, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.078 [-1.501, 1.534], loss: 10.740942, mean_absolute_error: 41.295948, mean_q: 84.343681
 375287/500000: episode: 2393, duration: 1.624s, episode steps: 200, steps per second: 123, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.129 [-2.000, 1.717], loss: 13.034628, mean_absolute_error: 40.967361, mean_q: 83.457893
 375487/500000: episode: 2394, duration: 1.753s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.076 [-1.583, 1.536], loss: 11.665936, mean_absolute_error: 40.921928, mean_q: 83.465446
 375687/500000: episode: 2395, duration: 1.726s, episode steps: 200, steps per second: 116, episode reward: 200.000

 378989/500000: episode: 2420, duration: 1.112s, episode steps: 128, steps per second: 115, episode reward: 128.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: 0.119 [-1.944, 1.718], loss: 9.596876, mean_absolute_error: 40.586250, mean_q: 82.933105
 379189/500000: episode: 2421, duration: 1.843s, episode steps: 200, steps per second: 109, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.042 [-2.228, 1.655], loss: 8.293900, mean_absolute_error: 40.783634, mean_q: 83.185997
 379245/500000: episode: 2422, duration: 0.487s, episode steps: 56, steps per second: 115, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.482 [0.000, 1.000], mean observation: 0.040 [-1.978, 1.749], loss: 15.889918, mean_absolute_error: 40.644478, mean_q: 83.218834
 379382/500000: episode: 2423, duration: 1.104s, episode steps: 137, steps per second: 124, episode reward: 137.000, me

 382487/500000: episode: 2448, duration: 1.167s, episode steps: 140, steps per second: 120, episode reward: 140.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.085 [-2.090, 1.751], loss: 13.082961, mean_absolute_error: 40.621609, mean_q: 83.427773
 382687/500000: episode: 2449, duration: 1.777s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.007 [-2.095, 1.861], loss: 16.304539, mean_absolute_error: 40.518574, mean_q: 82.769508
 382887/500000: episode: 2450, duration: 1.718s, episode steps: 200, steps per second: 116, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: 0.119 [-1.655, 1.457], loss: 12.019751, mean_absolute_error: 40.425499, mean_q: 82.767792
 382947/500000: episode: 2451, duration: 0.491s, episode steps: 60, steps per second: 122, episode reward: 60.000,

 385691/500000: episode: 2477, duration: 0.452s, episode steps: 54, steps per second: 119, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.106 [-1.203, 1.415], loss: 11.618419, mean_absolute_error: 40.667721, mean_q: 83.262352
 385737/500000: episode: 2478, duration: 0.373s, episode steps: 46, steps per second: 123, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: 0.117 [-1.259, 1.499], loss: 12.320109, mean_absolute_error: 40.522511, mean_q: 83.005394
 385937/500000: episode: 2479, duration: 1.680s, episode steps: 200, steps per second: 119, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.080 [-1.842, 1.757], loss: 14.184287, mean_absolute_error: 40.779476, mean_q: 83.670837
 385966/500000: episode: 2480, duration: 0.245s, episode steps: 29, steps per second: 119, episode reward: 29.000, mean

 389553/500000: episode: 2505, duration: 1.021s, episode steps: 126, steps per second: 123, episode reward: 126.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.492 [0.000, 1.000], mean observation: -0.352 [-1.707, 1.512], loss: 11.223884, mean_absolute_error: 41.015694, mean_q: 83.559189
 389701/500000: episode: 2506, duration: 1.223s, episode steps: 148, steps per second: 121, episode reward: 148.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.534 [0.000, 1.000], mean observation: -0.107 [-1.899, 1.899], loss: 12.807270, mean_absolute_error: 40.746883, mean_q: 83.189430
 389796/500000: episode: 2507, duration: 0.832s, episode steps: 95, steps per second: 114, episode reward: 95.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: -0.058 [-2.057, 2.292], loss: 15.091535, mean_absolute_error: 40.715034, mean_q: 82.871582
 389996/500000: episode: 2508, duration: 1.681s, episode steps: 200, steps per second: 119, episode reward: 200.00

 392804/500000: episode: 2533, duration: 0.738s, episode steps: 88, steps per second: 119, episode reward: 88.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.221 [-1.450, 0.887], loss: 8.388360, mean_absolute_error: 39.984116, mean_q: 81.952782
 392854/500000: episode: 2534, duration: 0.408s, episode steps: 50, steps per second: 123, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: 0.090 [-2.199, 2.128], loss: 15.843173, mean_absolute_error: 39.780602, mean_q: 81.247444
 393054/500000: episode: 2535, duration: 1.755s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.083 [-1.474, 1.563], loss: 10.602487, mean_absolute_error: 39.885658, mean_q: 81.518951
 393067/500000: episode: 2536, duration: 0.128s, episode steps: 13, steps per second: 101, episode reward: 13.000, mean

 396620/500000: episode: 2561, duration: 1.673s, episode steps: 200, steps per second: 120, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.165 [-1.931, 2.038], loss: 11.043073, mean_absolute_error: 40.071751, mean_q: 81.761925
 396820/500000: episode: 2562, duration: 1.714s, episode steps: 200, steps per second: 117, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.115 [-2.122, 1.643], loss: 13.982146, mean_absolute_error: 40.141571, mean_q: 82.075027
 397020/500000: episode: 2563, duration: 1.711s, episode steps: 200, steps per second: 117, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.115 [-1.791, 1.913], loss: 14.455153, mean_absolute_error: 40.028683, mean_q: 81.837929
 397220/500000: episode: 2564, duration: 1.675s, episode steps: 200, steps per second: 119, episode reward: 200.000

 399891/500000: episode: 2589, duration: 0.667s, episode steps: 81, steps per second: 122, episode reward: 81.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: -0.167 [-2.064, 1.687], loss: 18.196272, mean_absolute_error: 40.731796, mean_q: 82.350090
 399945/500000: episode: 2590, duration: 0.451s, episode steps: 54, steps per second: 120, episode reward: 54.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.537 [0.000, 1.000], mean observation: 0.187 [-1.297, 1.698], loss: 19.414967, mean_absolute_error: 40.289974, mean_q: 81.870148
 399976/500000: episode: 2591, duration: 0.260s, episode steps: 31, steps per second: 119, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.419 [0.000, 1.000], mean observation: -0.047 [-1.939, 2.131], loss: 15.424267, mean_absolute_error: 40.513863, mean_q: 82.622368
 400146/500000: episode: 2592, duration: 1.414s, episode steps: 170, steps per second: 120, episode reward: 170.000, me

 402993/500000: episode: 2617, duration: 1.663s, episode steps: 194, steps per second: 117, episode reward: 194.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.521 [0.000, 1.000], mean observation: 0.218 [-1.908, 2.119], loss: 11.923902, mean_absolute_error: 39.324139, mean_q: 80.390190
 403018/500000: episode: 2618, duration: 0.203s, episode steps: 25, steps per second: 123, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.073 [-0.980, 1.396], loss: 10.230780, mean_absolute_error: 38.863068, mean_q: 79.835365
 403218/500000: episode: 2619, duration: 1.752s, episode steps: 200, steps per second: 114, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.343 [-0.848, 1.651], loss: 17.108173, mean_absolute_error: 39.283375, mean_q: 79.853867
 403418/500000: episode: 2620, duration: 1.688s, episode steps: 200, steps per second: 118, episode reward: 200.000, 

 407259/500000: episode: 2646, duration: 1.787s, episode steps: 200, steps per second: 112, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.237 [-2.067, 2.076], loss: 9.931877, mean_absolute_error: 39.969486, mean_q: 81.906105
 407459/500000: episode: 2647, duration: 1.804s, episode steps: 200, steps per second: 111, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: 0.305 [-1.645, 2.405], loss: 10.360490, mean_absolute_error: 40.018551, mean_q: 81.797050
 407473/500000: episode: 2648, duration: 0.127s, episode steps: 14, steps per second: 110, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: 0.075 [-1.022, 1.519], loss: 13.790668, mean_absolute_error: 40.029858, mean_q: 82.051086
 407670/500000: episode: 2649, duration: 1.665s, episode steps: 197, steps per second: 118, episode reward: 197.000, m

 411141/500000: episode: 2674, duration: 1.579s, episode steps: 186, steps per second: 118, episode reward: 186.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.199 [-1.606, 1.886], loss: 10.876229, mean_absolute_error: 39.938255, mean_q: 82.187584
 411226/500000: episode: 2675, duration: 0.743s, episode steps: 85, steps per second: 114, episode reward: 85.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.541 [0.000, 1.000], mean observation: -0.050 [-1.946, 2.090], loss: 18.772366, mean_absolute_error: 40.378792, mean_q: 82.251564
 411399/500000: episode: 2676, duration: 1.472s, episode steps: 173, steps per second: 118, episode reward: 173.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: 0.097 [-2.093, 1.725], loss: 14.532557, mean_absolute_error: 40.293518, mean_q: 82.145065
 411469/500000: episode: 2677, duration: 0.615s, episode steps: 70, steps per second: 114, episode reward: 70.000, m

 413498/500000: episode: 2702, duration: 0.262s, episode steps: 30, steps per second: 115, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.433 [0.000, 1.000], mean observation: -0.045 [-1.529, 1.702], loss: 15.048120, mean_absolute_error: 40.338154, mean_q: 82.229614
 413645/500000: episode: 2703, duration: 1.226s, episode steps: 147, steps per second: 120, episode reward: 147.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.002 [-2.455, 2.429], loss: 11.637955, mean_absolute_error: 40.186684, mean_q: 82.567940
 413712/500000: episode: 2704, duration: 0.579s, episode steps: 67, steps per second: 116, episode reward: 67.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.493 [0.000, 1.000], mean observation: -0.136 [-1.557, 1.225], loss: 12.025319, mean_absolute_error: 40.225029, mean_q: 83.011803
 413766/500000: episode: 2705, duration: 0.438s, episode steps: 54, steps per second: 123, episode reward: 54.000, m

 416142/500000: episode: 2730, duration: 1.699s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.100 [-1.536, 2.027], loss: 12.353785, mean_absolute_error: 41.946712, mean_q: 86.340721
 416342/500000: episode: 2731, duration: 1.928s, episode steps: 200, steps per second: 104, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.128 [-1.528, 1.345], loss: 14.830303, mean_absolute_error: 42.325226, mean_q: 87.092560
 416456/500000: episode: 2732, duration: 1.195s, episode steps: 114, steps per second: 95, episode reward: 114.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.491 [0.000, 1.000], mean observation: -0.164 [-1.674, 1.425], loss: 16.658138, mean_absolute_error: 42.064987, mean_q: 86.644958
 416656/500000: episode: 2733, duration: 2.143s, episode steps: 200, steps per second: 93, episode reward: 200.00

 421365/500000: episode: 2758, duration: 1.768s, episode steps: 200, steps per second: 113, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.051 [-1.249, 1.184], loss: 10.220470, mean_absolute_error: 44.574570, mean_q: 91.806618
 421429/500000: episode: 2759, duration: 0.580s, episode steps: 64, steps per second: 110, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: -0.103 [-1.624, 1.367], loss: 15.804037, mean_absolute_error: 44.634033, mean_q: 91.785278
 421629/500000: episode: 2760, duration: 1.702s, episode steps: 200, steps per second: 118, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.105 [-1.137, 1.314], loss: 13.496194, mean_absolute_error: 44.265938, mean_q: 90.981682
 421829/500000: episode: 2761, duration: 1.693s, episode steps: 200, steps per second: 118, episode reward: 200.000

 426259/500000: episode: 2786, duration: 0.469s, episode steps: 59, steps per second: 126, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.559 [0.000, 1.000], mean observation: 0.185 [-1.854, 2.225], loss: 13.207333, mean_absolute_error: 42.915646, mean_q: 87.355263
 426281/500000: episode: 2787, duration: 0.179s, episode steps: 22, steps per second: 123, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: 0.086 [-1.196, 1.706], loss: 26.700842, mean_absolute_error: 43.097527, mean_q: 87.332321
 426429/500000: episode: 2788, duration: 1.192s, episode steps: 148, steps per second: 124, episode reward: 148.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.534 [0.000, 1.000], mean observation: 0.011 [-2.160, 2.265], loss: 13.672609, mean_absolute_error: 42.846447, mean_q: 87.555161
 426543/500000: episode: 2789, duration: 0.890s, episode steps: 114, steps per second: 128, episode reward: 114.000, me

 429435/500000: episode: 2814, duration: 1.618s, episode steps: 200, steps per second: 124, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.160 [-1.827, 2.289], loss: 17.152031, mean_absolute_error: 42.477589, mean_q: 86.485062
 429481/500000: episode: 2815, duration: 0.369s, episode steps: 46, steps per second: 124, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.098 [-1.514, 1.426], loss: 4.572591, mean_absolute_error: 42.186874, mean_q: 86.220726
 429570/500000: episode: 2816, duration: 0.707s, episode steps: 89, steps per second: 126, episode reward: 89.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.325 [-1.638, 2.577], loss: 11.112828, mean_absolute_error: 42.318066, mean_q: 86.308273
 429583/500000: episode: 2817, duration: 0.112s, episode steps: 13, steps per second: 116, episode reward: 13.000, mean

 432571/500000: episode: 2843, duration: 1.450s, episode steps: 183, steps per second: 126, episode reward: 183.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: -0.133 [-1.433, 1.022], loss: 14.399395, mean_absolute_error: 41.794525, mean_q: 85.595161
 432771/500000: episode: 2844, duration: 1.564s, episode steps: 200, steps per second: 128, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.095 [-1.480, 1.882], loss: 11.348741, mean_absolute_error: 41.518097, mean_q: 85.458809
 432798/500000: episode: 2845, duration: 0.211s, episode steps: 27, steps per second: 128, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.630 [0.000, 1.000], mean observation: -0.005 [-1.978, 1.584], loss: 15.453953, mean_absolute_error: 41.731148, mean_q: 84.959259
 432934/500000: episode: 2846, duration: 1.028s, episode steps: 136, steps per second: 132, episode reward: 136.00

In [None]:
#SARSA algorithm
def SARSA_algorithm(num_episodes):
    allowed_actions=[0,1]
    # Variables: alpha, gamma y epsilon.
    alpha = 0.3
    gamma = 1
    epsilon = 0.1
    cont = 0
    #Para 5000 episodios de entrenamiento
    for i in range(num_episodes):
        # Inicializa las variables para cada episodio
        state=env.reset()
        num_steps = 0
        state = discretize(state[0])


        # Seleccion accion "a" de forma epsilon-greedy
        if epsilon< np.random.uniform():        
            act_arg = np.array([Q_table[tuple(state), act] for act in range(2)])
            action = allowed_actions[np.argmax(act_arg)]
        else:
            action = env.action_space.sample()

        # Inicia el episodio
        continue_episode = True
        while continue_episode:        
            # Obtengo s'
            new_state, reward, done, _,_=env.step(action)
            new_state = discretize(new_state)

            # Revisa que new_state no sea un estado terminal
            if done:
                # Valor q(s',a') terminal
                q_value_next_step = 0
            else:
                #Obtengo a' de s' con epsilon greedy
                if epsilon< np.random.uniform():        
                    act_arg = np.array([Q_table[tuple(state), act] for act in range(2)])
                    new_action = allowed_actions[np.argmax(act_arg)]
                else:
                    new_action = env.action_space.sample()

                # Valor q(s',a') no terminal
                q_value_next_step = Q_table[tuple(new_state),new_action]


            # Calculo de actualizacion q(s,a) <- q(s,a) + alpha*(R + gamma*q(s',a') - q(s,a))
            Q_table[tuple(state), action] += alpha*(reward + gamma*q_value_next_step - Q_table[tuple(state),action])

            # asigna a = a' y s = s'
            state = new_state
            action = new_action


            # Parte que termina el episodio si se llega a algun estado terminal
            if done:
                continue_episode = False
        cont+=1
        if cont%100==0:
            print(cont)

## SARSA con 400 estados:

In [None]:
set_states()
set_q_table()

In [None]:
discrete_values = num_states(400)
SARSA_algorithm(5000)

In [None]:
allowed_actions= [0,1]
prom=[]
prom_rand=[]
for i in range(100):
    G_pi = 0
    state =env.reset()
    state = state[0]
    state = discretize(state)
    act_arg = np.array([Q_table[tuple(state), act] for act in range(2)])
    action = allowed_actions[np.argmax(act_arg)]
    done = False
    n=1
    while not done:
        new_state, reward, done, _,_=env.step(action)
        G_pi= G_pi + reward
        discrete_state = discretize(new_state)
        act_arg = np.array([Q_table[tuple(discrete_state), act] for act in range(2)])
        action = allowed_actions[np.argmax(act_arg)]
    prom.append(G_pi)
print('---'*5)
for i in range(100):
    G_pi = 0
    state =env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        new_state, reward, done, _,_=env.step(action)
        G_pi= G_pi + reward
    prom_rand.append(G_pi)

In [None]:
print("Retorno obtenido con la funcion q:",sum(prom)/len(prom))
print("Retorno obtenido de acciones al azar:",sum(prom_rand)/len(prom_rand))

## SARSA con 4000 estados:

In [None]:
set_states()
set_q_table()

In [None]:
discrete_values = num_states(4000)
SARSA_algorithm(1000)

In [None]:
prom=[]
prom_rand=[]
allowed_actions= [0,1]
for i in range(100):
    G_pi = 0
    state =env.reset()
    state = state[0]
    state = discretize(state)
    act_arg = np.array([Q_table[tuple(state), act] for act in range(2)])
    action = allowed_actions[np.argmax(act_arg)]
    done = False
    n=1
    while not done:
        new_state, reward, done, _,_=env.step(action)
        G_pi= G_pi + reward
        discrete_state = discretize(new_state)
        act_arg = np.array([Q_table[tuple(discrete_state), act] for act in range(2)])
        action = allowed_actions[np.argmax(act_arg)]
    prom.append(G_pi)
print('---'*5)
for i in range(100):
    G_pi = 0
    state =env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        new_state, reward, done, _,_=env.step(action)
        G_pi= G_pi + reward
    prom_rand.append(G_pi)

In [None]:
print("Retorno obtenido con la funcion q:",sum(prom)/len(prom))
print("Retorno obtenido de acciones al azar:",sum(prom_rand)/len(prom_rand))

In [None]:
env.close()