In [8]:

import matplotlib.pyplot as plt
import numpy as np
from dm_control import viewer
from tqdm import tqdm
import simulation.dm_control.simulation_control.environments as environments
from simulation.dm_control.ddpg.ddpg import DDPGagent, OUNoise
from simulation.dm_control.simulation_api import SimulationAPI

In [9]:
random_state = np.random.RandomState(42)

RESUME_TRAINING = True
PATH_MODEL = 'passive_hand_2'
NUM_EPISODES = 3000
BATCH_SIZE = 128
DURATION = 100
ACTOR_LEARNING_RATE = 1e-4
CRITIC_LEARNING_RATE = 1e-3
GAMMA = 0.99
TAU = 1e-2

# for parametrization
sapi = SimulationAPI()
sapi.rebuild_XML()
env = environments.load(domain_name='passive_hand', task_name='lift_sparse')
action_spec = env.action_spec()
dim_action = action_spec.shape[0]
dim_obs = 21

updated object_translate
updated object_change_slope
updated robot_change_finger_length
updated robot_change_joint_stiffness
updated robot_change_finger_spring_default
updated robot_change_thumb_spring_default
updated robot_change_friction


In [10]:
def parse_obs(obs):
    x = np.array([])
    for k, v in obs.items():
        if k == 'simulation_time': continue
        x = np.append(x, v)
    return x

agent = DDPGagent(
    dim_obs,
    dim_action,
    actor_learning_rate=ACTOR_LEARNING_RATE,
    critic_learning_rate=CRITIC_LEARNING_RATE,
    gamma=GAMMA,
    tau=TAU
)

if RESUME_TRAINING: agent.load(PATH_MODEL)

noise = OUNoise(dim_action, action_spec.minimum, action_spec.maximum)


def denorm(a):  #  use on model output before passing to env
    act_k = (action_spec.maximum - action_spec.minimum) / 2.
    act_b = (action_spec.maximum + action_spec.minimum) / 2.
    return a * act_k + act_b

In [None]:
rewards = []
avg_rewards = []

for episode in tqdm(range(NUM_EPISODES)):
    time_step = env.reset()
    state = parse_obs(time_step.observation)
    noise.reset()
    episode_reward = 0
    episode_reward_history = []
    for step in range(DURATION):
        action = agent.get_action(state)
        action = noise.get_action(action, step)
        try:
            time_step_2 = env.step(denorm(action))
        except:
            print(f'Physics Error: {action}')
            break
        state_2 = parse_obs(time_step_2.observation)
        reward = time_step_2.reward
        agent.memory.push(state, action, reward, state_2, -1)
        state = state_2
        if len(agent.memory) > BATCH_SIZE:
            agent.update(BATCH_SIZE)
        episode_reward += reward
        episode_reward_history.append(reward)
    print(f"episode: {episode}, "
          f"reward: {np.round(episode_reward, decimals=2)}, "
          f"average_reward: {np.mean(rewards[-10:])}")
    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))

agent.save(PATH_MODEL)

plt.plot(rewards)
plt.plot(avg_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

  0%|          | 1/3000 [00:00<21:48,  2.29it/s]

episode: 0, reward: -262.26, average_reward: nan


  0%|          | 2/3000 [00:01<41:57,  1.19it/s]

episode: 1, reward: -1247.11, average_reward: -262.2585981891645


  0%|          | 3/3000 [00:02<54:52,  1.10s/it]

episode: 2, reward: -260.53, average_reward: -754.6842938123141


  0%|          | 4/3000 [00:04<1:00:21,  1.21s/it]

episode: 3, reward: -277.11, average_reward: -589.9655844500145


  0%|          | 5/3000 [00:05<1:02:47,  1.26s/it]

episode: 4, reward: -1534.46, average_reward: -511.7519980710249


  0%|          | 6/3000 [00:07<1:04:45,  1.30s/it]

episode: 5, reward: -939.45, average_reward: -716.2941736726956


  0%|          | 7/3000 [00:08<1:06:19,  1.33s/it]

episode: 6, reward: -271.64, average_reward: -753.4868637149415


  0%|          | 8/3000 [00:09<1:06:53,  1.34s/it]

episode: 7, reward: -258.16, average_reward: -684.6515568695764


  0%|          | 9/3000 [00:11<1:08:22,  1.37s/it]

episode: 8, reward: -271.68, average_reward: -631.3406343097504


  0%|          | 10/3000 [00:12<1:08:13,  1.37s/it]

episode: 9, reward: -283.35, average_reward: -591.3784322308186


  0%|          | 11/3000 [00:14<1:09:02,  1.39s/it]

episode: 10, reward: -258.37, average_reward: -560.5751351908593


  0%|          | 12/3000 [00:15<1:11:00,  1.43s/it]

episode: 11, reward: -218.39, average_reward: -560.1859606562315


  0%|          | 13/3000 [00:17<1:11:46,  1.44s/it]

episode: 12, reward: -268.1, average_reward: -457.3141665517104


  0%|          | 14/3000 [00:18<1:12:10,  1.45s/it]

episode: 13, reward: -280.93, average_reward: -458.07102777410034


  0%|          | 15/3000 [00:19<1:12:27,  1.46s/it]

episode: 14, reward: -270.82, average_reward: -458.45251984504387


  1%|          | 16/3000 [00:21<1:12:33,  1.46s/it]

episode: 15, reward: -256.83, average_reward: -332.088660364137


  1%|          | 17/3000 [00:22<1:13:20,  1.48s/it]

episode: 16, reward: -208.06, average_reward: -263.82698387145956


  1%|          | 18/3000 [00:24<1:13:29,  1.48s/it]

episode: 17, reward: -233.71, average_reward: -257.4685293416256


  1%|          | 19/3000 [00:25<1:10:58,  1.43s/it]

episode: 18, reward: -1621.66, average_reward: -255.0235894987833


  1%|          | 20/3000 [00:27<1:10:02,  1.41s/it]

episode: 19, reward: -284.9, average_reward: -390.0217920979189


  1%|          | 21/3000 [00:28<1:09:17,  1.40s/it]

episode: 20, reward: -269.31, average_reward: -390.1767713888804


  1%|          | 22/3000 [00:29<1:08:21,  1.38s/it]

episode: 21, reward: -273.59, average_reward: -391.27128091718674


  1%|          | 23/3000 [00:31<1:07:54,  1.37s/it]

episode: 22, reward: -274.81, average_reward: -396.790694507442


  1%|          | 24/3000 [00:32<1:08:33,  1.38s/it]

episode: 23, reward: -224.0, average_reward: -397.46235862624826


  1%|          | 25/3000 [00:33<1:07:54,  1.37s/it]

episode: 24, reward: -265.24, average_reward: -391.7696096533307


  1%|          | 26/3000 [00:35<1:07:12,  1.36s/it]

episode: 25, reward: -255.05, average_reward: -391.2107590927993


  1%|          | 27/3000 [00:36<1:07:00,  1.35s/it]

episode: 26, reward: -275.11, average_reward: -391.0324995920116


  1%|          | 28/3000 [00:37<1:07:00,  1.35s/it]

episode: 27, reward: -278.07, average_reward: -397.7382279210022


  1%|          | 29/3000 [00:39<1:08:18,  1.38s/it]

episode: 28, reward: -269.8, average_reward: -402.17383971684944


  1%|          | 30/3000 [00:40<1:11:29,  1.44s/it]

episode: 29, reward: -284.4, average_reward: -266.9873747663063


  1%|          | 31/3000 [00:42<1:11:37,  1.45s/it]

episode: 30, reward: -280.8, average_reward: -266.9381103803336


  1%|          | 32/3000 [00:43<1:11:43,  1.45s/it]

episode: 31, reward: -277.51, average_reward: -268.0868185492094


  1%|          | 33/3000 [00:45<1:11:31,  1.45s/it]

episode: 32, reward: -273.83, average_reward: -268.47904640651296


  1%|          | 34/3000 [00:46<1:13:47,  1.49s/it]

episode: 33, reward: -273.19, average_reward: -268.38106219954113


  1%|          | 35/3000 [00:48<1:16:03,  1.54s/it]

episode: 34, reward: -248.99, average_reward: -273.29986426462284


  1%|          | 36/3000 [00:50<1:14:52,  1.52s/it]

episode: 35, reward: -1376.31, average_reward: -271.67480009113825


  1%|          | 37/3000 [00:51<1:13:46,  1.49s/it]

episode: 36, reward: -279.52, average_reward: -383.8002791852008


  1%|▏         | 38/3000 [00:52<1:12:40,  1.47s/it]

episode: 37, reward: -261.2, average_reward: -384.2406355069584


  1%|▏         | 39/3000 [00:54<1:10:55,  1.44s/it]

episode: 38, reward: -283.14, average_reward: -382.5535249802607


  1%|▏         | 40/3000 [00:55<1:10:05,  1.42s/it]

episode: 39, reward: -281.34, average_reward: -383.88722519164054


  1%|▏         | 41/3000 [00:57<1:09:43,  1.41s/it]

episode: 40, reward: -291.9, average_reward: -383.5806141194063


  1%|▏         | 42/3000 [00:58<1:09:13,  1.40s/it]

episode: 41, reward: -257.31, average_reward: -384.6906446579259


  1%|▏         | 43/3000 [00:59<1:09:00,  1.40s/it]

episode: 42, reward: -271.36, average_reward: -382.67057238142


  1%|▏         | 44/3000 [01:01<1:09:08,  1.40s/it]

episode: 43, reward: -260.79, average_reward: -382.42280025427965


  2%|▏         | 45/3000 [01:02<1:09:29,  1.41s/it]

episode: 44, reward: -274.78, average_reward: -381.18362541193403


  2%|▏         | 46/3000 [01:04<1:09:01,  1.40s/it]

episode: 45, reward: -267.49, average_reward: -383.7634824425766


  2%|▏         | 47/3000 [01:05<1:09:05,  1.40s/it]

episode: 46, reward: -274.73, average_reward: -272.88144414187957


  2%|▏         | 48/3000 [01:06<1:08:08,  1.39s/it]

episode: 47, reward: -290.28, average_reward: -272.4028260871052


  2%|▏         | 49/3000 [01:08<1:07:03,  1.36s/it]

episode: 48, reward: -284.73, average_reward: -275.3111269388197


  2%|▏         | 50/3000 [01:09<1:06:34,  1.35s/it]

episode: 49, reward: -279.98, average_reward: -275.4702467439275


  2%|▏         | 51/3000 [01:10<1:05:58,  1.34s/it]

episode: 50, reward: -286.12, average_reward: -275.33438772982134


  2%|▏         | 52/3000 [01:12<1:07:10,  1.37s/it]

episode: 51, reward: -234.81, average_reward: -274.75636348308177


  2%|▏         | 53/3000 [01:13<1:06:17,  1.35s/it]

episode: 52, reward: -281.8, average_reward: -272.50667848468146


  2%|▏         | 54/3000 [01:14<1:05:48,  1.34s/it]

episode: 53, reward: -1680.26, average_reward: -273.55104599474055


  2%|▏         | 55/3000 [01:16<1:07:07,  1.37s/it]

episode: 54, reward: -252.09, average_reward: -415.4978314624612


  2%|▏         | 56/3000 [01:17<1:07:48,  1.38s/it]

episode: 55, reward: -282.4, average_reward: -413.2287233705928


  2%|▏         | 57/3000 [01:18<1:07:07,  1.37s/it]

episode: 56, reward: -1415.82, average_reward: -414.72015600614003


  2%|▏         | 58/3000 [01:20<1:06:22,  1.35s/it]

episode: 57, reward: -1874.21, average_reward: -528.8295424305445


  2%|▏         | 59/3000 [01:21<1:07:14,  1.37s/it]

episode: 58, reward: -265.46, average_reward: -687.222714379116


  2%|▏         | 60/3000 [01:23<1:07:37,  1.38s/it]

episode: 59, reward: -292.29, average_reward: -685.2960552533488


  2%|▏         | 61/3000 [01:24<1:08:03,  1.39s/it]

episode: 60, reward: -784.58, average_reward: -686.5277499856613


  2%|▏         | 62/3000 [01:25<1:07:20,  1.38s/it]

episode: 61, reward: -348.99, average_reward: -736.3735461602298


  2%|▏         | 63/3000 [01:27<1:06:40,  1.36s/it]

episode: 62, reward: -268.55, average_reward: -747.7917339011493


  2%|▏         | 64/3000 [01:28<1:06:55,  1.37s/it]

episode: 63, reward: -269.69, average_reward: -746.4667811517535


  2%|▏         | 65/3000 [01:29<1:07:15,  1.37s/it]

episode: 64, reward: -246.9, average_reward: -605.4091714883789


  2%|▏         | 66/3000 [01:31<1:09:17,  1.42s/it]

episode: 65, reward: -233.47, average_reward: -604.8899989964782


  2%|▏         | 67/3000 [01:32<1:09:56,  1.43s/it]

episode: 66, reward: -271.54, average_reward: -599.997071389903


  2%|▏         | 68/3000 [01:34<1:09:23,  1.42s/it]

episode: 67, reward: -262.26, average_reward: -485.56894523039125


  2%|▏         | 69/3000 [01:35<1:09:05,  1.41s/it]

episode: 68, reward: -277.6, average_reward: -324.37310883610337


  2%|▏         | 70/3000 [01:37<1:08:20,  1.40s/it]

episode: 69, reward: -284.11, average_reward: -325.5874079058438


  2%|▏         | 71/3000 [01:38<1:07:51,  1.39s/it]

episode: 70, reward: -271.23, average_reward: -324.7693057809524


  2%|▏         | 72/3000 [01:39<1:07:50,  1.39s/it]

episode: 71, reward: -272.32, average_reward: -273.4348864043875


  2%|▏         | 73/3000 [01:41<1:10:00,  1.43s/it]

episode: 72, reward: -279.98, average_reward: -265.767904490902


  2%|▏         | 74/3000 [01:42<1:09:35,  1.43s/it]

episode: 73, reward: -285.14, average_reward: -266.91053018458206


  2%|▎         | 75/3000 [01:44<1:08:19,  1.40s/it]

episode: 74, reward: -279.34, average_reward: -268.45548008383514


  3%|▎         | 76/3000 [01:45<1:08:11,  1.40s/it]

episode: 75, reward: -277.28, average_reward: -271.69929310448777


  3%|▎         | 77/3000 [01:46<1:08:11,  1.40s/it]

episode: 76, reward: -265.58, average_reward: -276.0806034446582


  3%|▎         | 78/3000 [01:48<1:07:45,  1.39s/it]

episode: 77, reward: -261.78, average_reward: -275.4843431117176


  3%|▎         | 79/3000 [01:49<1:06:43,  1.37s/it]

episode: 78, reward: -286.63, average_reward: -275.43688960960327


  3%|▎         | 80/3000 [01:50<1:05:36,  1.35s/it]

episode: 79, reward: -1278.99, average_reward: -276.3395813948276


  3%|▎         | 81/3000 [01:52<1:06:15,  1.36s/it]

episode: 80, reward: -269.83, average_reward: -375.8273288712937


  3%|▎         | 82/3000 [01:53<1:06:20,  1.36s/it]

episode: 81, reward: -267.04, average_reward: -375.6867820410377


  3%|▎         | 83/3000 [01:55<1:09:41,  1.43s/it]

episode: 82, reward: -367.6, average_reward: -375.1585091987273


  3%|▎         | 84/3000 [01:56<1:09:57,  1.44s/it]

episode: 83, reward: -791.33, average_reward: -383.9207568870789


  3%|▎         | 85/3000 [01:58<1:09:43,  1.43s/it]

episode: 84, reward: -228.88, average_reward: -434.5399306172546


  3%|▎         | 86/3000 [01:59<1:08:24,  1.41s/it]

episode: 85, reward: -290.22, average_reward: -429.49356984305524


  3%|▎         | 87/3000 [02:00<1:08:34,  1.41s/it]

episode: 86, reward: -266.99, average_reward: -430.78757882028447


  3%|▎         | 88/3000 [02:02<1:07:33,  1.39s/it]

episode: 87, reward: -272.36, average_reward: -430.92904529902955


  3%|▎         | 89/3000 [02:03<1:07:04,  1.38s/it]

episode: 88, reward: -276.81, average_reward: -431.98694733161574


  3%|▎         | 90/3000 [02:05<1:07:00,  1.38s/it]

episode: 89, reward: -255.33, average_reward: -431.00501353863547


  3%|▎         | 91/3000 [02:06<1:06:33,  1.37s/it]

episode: 90, reward: -263.91, average_reward: -328.6389010912109


  3%|▎         | 92/3000 [02:07<1:06:02,  1.36s/it]

episode: 91, reward: -260.72, average_reward: -328.0474214592619


  3%|▎         | 93/3000 [02:09<1:05:46,  1.36s/it]

episode: 92, reward: -270.58, average_reward: -327.41522044445156


  3%|▎         | 94/3000 [02:10<1:05:12,  1.35s/it]

episode: 93, reward: -277.25, average_reward: -317.7129504559426


  3%|▎         | 95/3000 [02:11<1:06:12,  1.37s/it]

episode: 94, reward: -267.49, average_reward: -266.3046795177211


  3%|▎         | 96/3000 [02:13<1:06:10,  1.37s/it]

episode: 95, reward: -279.19, average_reward: -270.16630526490275


  3%|▎         | 97/3000 [02:14<1:07:52,  1.40s/it]

episode: 96, reward: -360.31, average_reward: -269.0627154179571


  3%|▎         | 98/3000 [02:16<1:07:54,  1.40s/it]

episode: 97, reward: -287.54, average_reward: -278.39434322968407


  3%|▎         | 99/3000 [02:17<1:08:06,  1.41s/it]

episode: 98, reward: -274.98, average_reward: -279.91189246318964


  3%|▎         | 100/3000 [02:18<1:09:06,  1.43s/it]

episode: 99, reward: -214.1, average_reward: -279.72893047345605


  3%|▎         | 101/3000 [02:20<1:08:06,  1.41s/it]

episode: 100, reward: -277.95, average_reward: -275.60541428340673


  3%|▎         | 102/3000 [02:21<1:09:00,  1.43s/it]

episode: 101, reward: -249.84, average_reward: -277.0092087581361


  3%|▎         | 103/3000 [02:23<1:07:24,  1.40s/it]

episode: 102, reward: -280.62, average_reward: -275.9217443966377


  3%|▎         | 104/3000 [02:24<1:07:03,  1.39s/it]

episode: 103, reward: -281.72, average_reward: -276.92626996960473


  4%|▎         | 105/3000 [02:25<1:05:59,  1.37s/it]

episode: 104, reward: -225.67, average_reward: -277.3735820360434


  4%|▎         | 106/3000 [02:27<1:05:02,  1.35s/it]

episode: 105, reward: -263.98, average_reward: -273.1910178306746


  4%|▎         | 107/3000 [02:28<1:05:37,  1.36s/it]

episode: 106, reward: -256.94, average_reward: -271.6705138021674


  4%|▎         | 108/3000 [02:29<1:05:02,  1.35s/it]

episode: 107, reward: -278.08, average_reward: -261.3334520116089


  4%|▎         | 109/3000 [02:31<1:05:26,  1.36s/it]

episode: 108, reward: -277.48, average_reward: -260.3875036991531


  4%|▎         | 110/3000 [02:32<1:05:22,  1.36s/it]

episode: 109, reward: -277.2, average_reward: -260.63705906978373


  4%|▎         | 111/3000 [02:33<1:05:15,  1.36s/it]

episode: 110, reward: -268.97, average_reward: -266.9479839584406


  4%|▎         | 112/3000 [02:35<1:05:11,  1.35s/it]

episode: 111, reward: -278.54, average_reward: -266.05017773906957


  4%|▍         | 113/3000 [02:36<1:06:41,  1.39s/it]

episode: 112, reward: -267.72, average_reward: -268.9197549754149


  4%|▍         | 114/3000 [02:38<1:07:08,  1.40s/it]

episode: 113, reward: -267.43, average_reward: -267.62950830351326


  4%|▍         | 115/3000 [02:39<1:06:57,  1.39s/it]

episode: 114, reward: -287.87, average_reward: -266.20099952832635


  4%|▍         | 116/3000 [02:41<1:09:08,  1.44s/it]

episode: 115, reward: -236.62, average_reward: -272.42182767224233


  4%|▍         | 117/3000 [02:42<1:08:16,  1.42s/it]

episode: 116, reward: -275.39, average_reward: -269.6858393800841


  4%|▍         | 118/3000 [02:43<1:07:49,  1.41s/it]

episode: 117, reward: -240.74, average_reward: -271.5309437103149


  4%|▍         | 119/3000 [02:45<1:07:18,  1.40s/it]

episode: 118, reward: -264.44, average_reward: -267.7977410446399


  4%|▍         | 120/3000 [02:46<1:06:33,  1.39s/it]

episode: 119, reward: -276.43, average_reward: -266.4936696556557


  4%|▍         | 121/3000 [02:47<1:05:38,  1.37s/it]

episode: 120, reward: -276.33, average_reward: -266.41626933028044


  4%|▍         | 122/3000 [02:49<1:04:51,  1.35s/it]

episode: 121, reward: -277.82, average_reward: -267.15176308215376


  4%|▍         | 123/3000 [02:50<1:05:35,  1.37s/it]

episode: 122, reward: -246.3, average_reward: -267.08030919482627


  4%|▍         | 124/3000 [02:51<1:05:19,  1.36s/it]

episode: 123, reward: -277.83, average_reward: -264.9380033594071


  4%|▍         | 125/3000 [02:53<1:05:01,  1.36s/it]

episode: 124, reward: -286.62, average_reward: -265.9773965334497


  4%|▍         | 126/3000 [02:54<1:04:42,  1.35s/it]

episode: 125, reward: -274.06, average_reward: -265.85153478360223


  4%|▍         | 127/3000 [02:55<1:04:13,  1.34s/it]

episode: 126, reward: -285.14, average_reward: -269.5951347568854


  4%|▍         | 128/3000 [02:57<1:03:43,  1.33s/it]

episode: 127, reward: -278.16, average_reward: -270.570450416374


  4%|▍         | 129/3000 [02:58<1:03:09,  1.32s/it]

episode: 128, reward: -288.72, average_reward: -274.31202217755447


  4%|▍         | 130/3000 [02:59<1:04:20,  1.35s/it]

episode: 129, reward: -252.78, average_reward: -276.74036509662454


  4%|▍         | 131/3000 [03:01<1:03:39,  1.33s/it]

episode: 130, reward: -283.99, average_reward: -274.37507354323265


  4%|▍         | 132/3000 [03:02<1:03:35,  1.33s/it]

episode: 131, reward: -279.47, average_reward: -275.14089983864875


  4%|▍         | 133/3000 [03:04<1:04:38,  1.35s/it]

episode: 132, reward: -250.69, average_reward: -275.3050591115822


  4%|▍         | 134/3000 [03:05<1:04:03,  1.34s/it]

episode: 133, reward: -277.38, average_reward: -275.74399579902706


  4%|▍         | 135/3000 [03:06<1:04:21,  1.35s/it]

episode: 134, reward: -286.11, average_reward: -275.6992597010037


  5%|▍         | 136/3000 [03:07<1:03:26,  1.33s/it]

episode: 135, reward: -281.95, average_reward: -275.64851196219024


  5%|▍         | 137/3000 [03:09<1:06:18,  1.39s/it]

episode: 136, reward: -226.3, average_reward: -276.43746578626326


  5%|▍         | 138/3000 [03:10<1:05:33,  1.37s/it]

episode: 137, reward: -283.71, average_reward: -270.5530056562613


  5%|▍         | 139/3000 [03:12<1:05:33,  1.37s/it]

episode: 138, reward: -277.71, average_reward: -271.1077245822474


  5%|▍         | 140/3000 [03:13<1:04:57,  1.36s/it]

episode: 139, reward: -286.97, average_reward: -270.00691486350087


  5%|▍         | 141/3000 [03:14<1:04:23,  1.35s/it]

episode: 140, reward: -274.5, average_reward: -273.42647437596895


  5%|▍         | 142/3000 [03:16<1:04:45,  1.36s/it]

episode: 141, reward: -739.87, average_reward: -272.4779977669023


  5%|▍         | 143/3000 [03:17<1:05:05,  1.37s/it]

episode: 142, reward: -275.55, average_reward: -318.5184018240288


  5%|▍         | 144/3000 [03:19<1:04:58,  1.36s/it]

episode: 143, reward: -252.95, average_reward: -321.00497732170913


  5%|▍         | 145/3000 [03:20<1:04:12,  1.35s/it]

episode: 144, reward: -275.9, average_reward: -318.56231035215467


  5%|▍         | 146/3000 [03:21<1:04:08,  1.35s/it]

episode: 145, reward: -260.11, average_reward: -317.54178942874523


  5%|▍         | 147/3000 [03:22<1:03:51,  1.34s/it]

episode: 146, reward: -271.45, average_reward: -315.35796800876324


  5%|▍         | 148/3000 [03:24<1:03:45,  1.34s/it]

episode: 147, reward: -273.53, average_reward: -319.87291232471847


  5%|▍         | 149/3000 [03:25<1:03:49,  1.34s/it]

episode: 148, reward: -272.38, average_reward: -318.85504941461534


  5%|▌         | 150/3000 [03:26<1:03:02,  1.33s/it]

episode: 149, reward: -289.02, average_reward: -318.3217153801067


  5%|▌         | 151/3000 [03:28<1:02:48,  1.32s/it]

episode: 150, reward: -278.95, average_reward: -318.52687444056573


  5%|▌         | 152/3000 [03:29<1:02:35,  1.32s/it]

episode: 151, reward: -266.75, average_reward: -318.9717871771267


  5%|▌         | 153/3000 [03:30<1:03:19,  1.33s/it]

episode: 152, reward: -268.65, average_reward: -271.6593755002862


  5%|▌         | 154/3000 [03:32<1:03:35,  1.34s/it]

episode: 153, reward: -279.96, average_reward: -270.9690813670412


  5%|▌         | 155/3000 [03:33<1:03:20,  1.34s/it]

episode: 154, reward: -1086.28, average_reward: -273.66931946185287


  5%|▌         | 156/3000 [03:34<1:03:05,  1.33s/it]

episode: 155, reward: -270.0, average_reward: -354.7066550427335


  5%|▌         | 157/3000 [03:36<1:02:52,  1.33s/it]

episode: 156, reward: -273.33, average_reward: -355.69521941221194


  5%|▌         | 158/3000 [03:37<1:04:28,  1.36s/it]

episode: 157, reward: -215.6, average_reward: -355.8832273267815


  5%|▌         | 159/3000 [03:39<1:04:17,  1.36s/it]

episode: 158, reward: -288.64, average_reward: -350.09076609958254


  5%|▌         | 160/3000 [03:40<1:04:12,  1.36s/it]

episode: 159, reward: -258.03, average_reward: -351.7171530531391


  5%|▌         | 161/3000 [03:41<1:03:41,  1.35s/it]

episode: 160, reward: -263.58, average_reward: -348.6180809127378


  5%|▌         | 162/3000 [03:43<1:03:29,  1.34s/it]

episode: 161, reward: -269.66, average_reward: -347.08132589282286


  5%|▌         | 163/3000 [03:44<1:03:15,  1.34s/it]

episode: 162, reward: -272.59, average_reward: -347.3727774183088


  5%|▌         | 164/3000 [03:45<1:03:01,  1.33s/it]

episode: 163, reward: -277.27, average_reward: -347.76697251359604


  6%|▌         | 165/3000 [03:47<1:03:21,  1.34s/it]

episode: 164, reward: -263.45, average_reward: -347.49883457257204


  6%|▌         | 166/3000 [03:48<1:04:13,  1.36s/it]

episode: 165, reward: -253.11, average_reward: -265.2159008174159


  6%|▌         | 167/3000 [03:49<1:03:38,  1.35s/it]

episode: 166, reward: -282.58, average_reward: -263.52723254843636


  6%|▌         | 168/3000 [03:51<1:03:53,  1.35s/it]

episode: 167, reward: -264.07, average_reward: -264.4525432413033


  6%|▌         | 169/3000 [03:52<1:03:28,  1.35s/it]

episode: 168, reward: -292.35, average_reward: -269.2991662990864


  6%|▌         | 170/3000 [03:53<1:04:55,  1.38s/it]

episode: 169, reward: -288.44, average_reward: -269.6697433017374


  6%|▌         | 171/3000 [03:55<1:04:46,  1.37s/it]

episode: 170, reward: -277.87, average_reward: -272.7102094446626


  6%|▌         | 172/3000 [03:56<1:05:12,  1.38s/it]

episode: 171, reward: -243.56, average_reward: -274.1388545608903


  6%|▌         | 173/3000 [03:58<1:06:25,  1.41s/it]

episode: 172, reward: -234.85, average_reward: -271.5289831836704


  6%|▌         | 174/3000 [03:59<1:05:46,  1.40s/it]

episode: 173, reward: -287.75, average_reward: -267.7547064604859


  6%|▌         | 175/3000 [04:00<1:05:46,  1.40s/it]

episode: 174, reward: -269.45, average_reward: -268.80273878627446


  6%|▌         | 176/3000 [04:02<1:05:11,  1.39s/it]

episode: 175, reward: -276.24, average_reward: -269.4034713136706


  6%|▌         | 177/3000 [04:03<1:04:28,  1.37s/it]

episode: 176, reward: -357.18, average_reward: -271.71676159827416


  6%|▌         | 178/3000 [04:04<1:04:03,  1.36s/it]

episode: 177, reward: -282.11, average_reward: -279.17642987049646


  6%|▌         | 179/3000 [04:06<1:03:23,  1.35s/it]

episode: 178, reward: -279.9, average_reward: -280.9804811175917


  6%|▌         | 180/3000 [04:07<1:03:15,  1.35s/it]

episode: 179, reward: -276.26, average_reward: -279.73570631914976


  6%|▌         | 181/3000 [04:09<1:03:36,  1.35s/it]

episode: 180, reward: -273.39, average_reward: -278.51786324382584


  6%|▌         | 182/3000 [04:10<1:03:42,  1.36s/it]

episode: 181, reward: -275.45, average_reward: -278.069561409175


  6%|▌         | 183/3000 [04:11<1:03:30,  1.35s/it]

episode: 182, reward: -277.08, average_reward: -281.2586990326943


  6%|▌         | 184/3000 [04:13<1:03:08,  1.35s/it]

episode: 183, reward: -264.68, average_reward: -285.48208052387616


  6%|▌         | 185/3000 [04:14<1:02:49,  1.34s/it]

episode: 184, reward: -269.39, average_reward: -283.17465361466054


  6%|▌         | 186/3000 [04:15<1:03:23,  1.35s/it]

episode: 185, reward: -260.44, average_reward: -283.16856959957533


  6%|▌         | 187/3000 [04:17<1:03:11,  1.35s/it]

episode: 186, reward: -274.97, average_reward: -281.58851548920273


  6%|▋         | 188/3000 [04:18<1:02:52,  1.34s/it]

episode: 187, reward: -280.52, average_reward: -273.36750302826573


  6%|▋         | 189/3000 [04:19<1:03:23,  1.35s/it]

episode: 188, reward: -282.29, average_reward: -273.2080047245769


  6%|▋         | 190/3000 [04:21<1:03:06,  1.35s/it]

episode: 189, reward: -277.2, average_reward: -273.44696442208794


  6%|▋         | 191/3000 [04:22<1:03:35,  1.36s/it]

episode: 190, reward: -265.32, average_reward: -273.5412505749253


  6%|▋         | 192/3000 [04:23<1:03:41,  1.36s/it]

episode: 191, reward: -283.77, average_reward: -272.73467097871423


  6%|▋         | 193/3000 [04:25<1:03:13,  1.35s/it]

episode: 192, reward: -277.38, average_reward: -273.56632581345633


  6%|▋         | 194/3000 [04:26<1:04:06,  1.37s/it]

episode: 193, reward: -250.86, average_reward: -273.5961781831832


  6%|▋         | 195/3000 [04:27<1:03:52,  1.37s/it]

episode: 194, reward: -247.04, average_reward: -272.21411844870926


  7%|▋         | 196/3000 [04:29<1:03:11,  1.35s/it]

episode: 195, reward: -279.48, average_reward: -269.9783917472429


  7%|▋         | 197/3000 [04:30<1:04:19,  1.38s/it]

episode: 196, reward: -260.18, average_reward: -271.88274272576865


  7%|▋         | 198/3000 [04:32<1:03:08,  1.35s/it]

episode: 197, reward: -273.46, average_reward: -270.4035245824375


  7%|▋         | 199/3000 [04:33<1:02:59,  1.35s/it]

episode: 198, reward: -264.28, average_reward: -269.6980764916847


  7%|▋         | 200/3000 [04:34<1:03:19,  1.36s/it]

episode: 199, reward: -274.35, average_reward: -267.8971873940801


  7%|▋         | 201/3000 [04:36<1:02:52,  1.35s/it]

episode: 200, reward: -267.91, average_reward: -267.61187427010975


  7%|▋         | 202/3000 [04:37<1:02:45,  1.35s/it]

episode: 201, reward: -274.69, average_reward: -267.8710036919416


  7%|▋         | 203/3000 [04:38<1:02:48,  1.35s/it]

episode: 202, reward: -282.86, average_reward: -266.9625109424072


  7%|▋         | 204/3000 [04:40<1:03:04,  1.35s/it]

episode: 203, reward: -263.26, average_reward: -267.51049604776006


  7%|▋         | 205/3000 [04:41<1:03:17,  1.36s/it]

episode: 204, reward: -282.04, average_reward: -268.75092105996885


  7%|▋         | 206/3000 [04:42<1:03:13,  1.36s/it]

episode: 205, reward: -263.83, average_reward: -272.2509291635673


  7%|▋         | 207/3000 [04:44<1:03:09,  1.36s/it]

episode: 206, reward: -283.87, average_reward: -270.6854769438572


  7%|▋         | 208/3000 [04:45<1:02:55,  1.35s/it]

episode: 207, reward: -271.54, average_reward: -273.0543656460093


  7%|▋         | 209/3000 [04:46<1:02:58,  1.35s/it]

episode: 208, reward: -272.99, average_reward: -272.8619201626509


  7%|▋         | 210/3000 [04:48<1:03:11,  1.36s/it]

episode: 209, reward: -280.05, average_reward: -273.732758752105


  7%|▋         | 211/3000 [04:49<1:03:39,  1.37s/it]

episode: 210, reward: -277.91, average_reward: -274.3026580283024


  7%|▋         | 212/3000 [04:51<1:04:21,  1.38s/it]

episode: 211, reward: -291.71, average_reward: -275.3020401751518


  7%|▋         | 213/3000 [04:52<1:03:51,  1.37s/it]

episode: 212, reward: -272.3, average_reward: -277.00430092515415


  7%|▋         | 214/3000 [04:53<1:03:38,  1.37s/it]

episode: 213, reward: -260.09, average_reward: -275.94809138869476


  7%|▋         | 215/3000 [04:55<1:03:14,  1.36s/it]

episode: 214, reward: -271.92, average_reward: -275.6305440440998


  7%|▋         | 216/3000 [04:56<1:03:34,  1.37s/it]

episode: 215, reward: -270.69, average_reward: -274.6190394611557


  7%|▋         | 217/3000 [04:57<1:03:32,  1.37s/it]

episode: 216, reward: -265.88, average_reward: -275.30534512445263


  7%|▋         | 218/3000 [04:59<1:03:33,  1.37s/it]

episode: 217, reward: -284.83, average_reward: -273.5070952642356


  7%|▋         | 219/3000 [05:00<1:03:25,  1.37s/it]

episode: 218, reward: -1087.98, average_reward: -274.83688056920136


  7%|▋         | 220/3000 [05:02<1:03:43,  1.38s/it]

episode: 219, reward: -273.47, average_reward: -356.33647515934723


  7%|▋         | 221/3000 [05:03<1:03:40,  1.37s/it]

episode: 220, reward: -277.4, average_reward: -355.6787893360504


  7%|▋         | 222/3000 [05:04<1:04:18,  1.39s/it]

episode: 221, reward: -282.88, average_reward: -355.62821371657543


  7%|▋         | 223/3000 [05:06<1:03:46,  1.38s/it]

episode: 222, reward: -262.82, average_reward: -354.7452860601286


  7%|▋         | 224/3000 [05:07<1:04:22,  1.39s/it]

episode: 223, reward: -236.07, average_reward: -353.79755307274587


  8%|▊         | 225/3000 [05:09<1:05:17,  1.41s/it]

episode: 224, reward: -230.62, average_reward: -351.39555831271383


  8%|▊         | 226/3000 [05:10<1:04:37,  1.40s/it]

episode: 225, reward: -283.36, average_reward: -347.2659277099532


  8%|▊         | 227/3000 [05:11<1:03:57,  1.38s/it]

episode: 226, reward: -284.95, average_reward: -348.53247596688175


  8%|▊         | 228/3000 [05:13<1:03:04,  1.37s/it]

episode: 227, reward: -288.85, average_reward: -350.4389163873035


  8%|▊         | 229/3000 [05:14<1:03:30,  1.38s/it]

episode: 228, reward: -272.43, average_reward: -350.8402452759678


  8%|▊         | 230/3000 [05:15<1:03:21,  1.37s/it]

episode: 229, reward: -268.06, average_reward: -269.284487785349


  8%|▊         | 231/3000 [05:17<1:03:04,  1.37s/it]

episode: 230, reward: -281.37, average_reward: -268.7432360903136


  8%|▊         | 232/3000 [05:18<1:03:19,  1.37s/it]

episode: 231, reward: -259.43, average_reward: -269.1399870218553


  8%|▊         | 233/3000 [05:20<1:04:18,  1.39s/it]

episode: 232, reward: -271.29, average_reward: -266.79504025699885


  8%|▊         | 234/3000 [05:21<1:04:04,  1.39s/it]

episode: 233, reward: -259.72, average_reward: -267.6416745182413


  8%|▊         | 235/3000 [05:22<1:04:17,  1.39s/it]

episode: 234, reward: -278.77, average_reward: -270.006445739372


  8%|▊         | 236/3000 [05:24<1:03:31,  1.38s/it]

episode: 235, reward: -265.57, average_reward: -274.821258904152


  8%|▊         | 237/3000 [05:25<1:03:21,  1.38s/it]

episode: 236, reward: -752.22, average_reward: -273.0420185818847


  8%|▊         | 238/3000 [05:26<1:02:55,  1.37s/it]

episode: 237, reward: -1627.1, average_reward: -319.76895451034136


  8%|▊         | 239/3000 [05:28<1:02:11,  1.35s/it]

episode: 238, reward: -288.08, average_reward: -453.5936990997143


  8%|▊         | 240/3000 [05:29<1:02:34,  1.36s/it]

episode: 239, reward: -245.92, average_reward: -455.1586849979734


  8%|▊         | 241/3000 [05:30<1:02:28,  1.36s/it]

episode: 240, reward: -274.85, average_reward: -452.94520446961303


  8%|▊         | 242/3000 [05:32<1:01:50,  1.35s/it]

episode: 241, reward: -268.3, average_reward: -452.2931721983888


  8%|▊         | 243/3000 [05:33<1:02:07,  1.35s/it]

episode: 242, reward: -249.22, average_reward: -453.18022100324214


  8%|▊         | 244/3000 [05:35<1:02:09,  1.35s/it]

episode: 243, reward: -281.54, average_reward: -450.97318492568445


  8%|▊         | 245/3000 [05:36<1:02:12,  1.35s/it]

episode: 244, reward: -283.2, average_reward: -453.15591827702417


  8%|▊         | 246/3000 [05:37<1:02:01,  1.35s/it]

episode: 245, reward: -272.89, average_reward: -453.59869527140665


  8%|▊         | 247/3000 [05:39<1:01:44,  1.35s/it]

episode: 246, reward: -254.47, average_reward: -454.3305590754773


  8%|▊         | 248/3000 [05:40<1:01:41,  1.34s/it]

episode: 247, reward: -274.62, average_reward: -404.5559553383442


  8%|▊         | 249/3000 [05:41<1:01:36,  1.34s/it]

episode: 248, reward: -269.44, average_reward: -269.3084264772362


  8%|▊         | 250/3000 [05:43<1:01:36,  1.34s/it]

episode: 249, reward: -265.5, average_reward: -267.4446337689908


  8%|▊         | 251/3000 [05:44<1:01:48,  1.35s/it]

episode: 250, reward: -241.15, average_reward: -269.40262755936914


  8%|▊         | 252/3000 [05:45<1:01:39,  1.35s/it]

episode: 251, reward: -937.79, average_reward: -266.03312841640354


  8%|▊         | 253/3000 [05:47<1:02:00,  1.35s/it]

episode: 252, reward: -354.0, average_reward: -332.9820538592129


  8%|▊         | 254/3000 [05:48<1:01:40,  1.35s/it]

episode: 253, reward: -281.59, average_reward: -343.4607399857486


  8%|▊         | 255/3000 [05:49<1:02:37,  1.37s/it]

episode: 254, reward: -231.48, average_reward: -343.46528425435616


  9%|▊         | 256/3000 [05:51<1:02:08,  1.36s/it]

episode: 255, reward: -259.71, average_reward: -338.2935984960095


  9%|▊         | 257/3000 [05:52<1:02:33,  1.37s/it]

episode: 256, reward: -249.2, average_reward: -336.9758607633572


  9%|▊         | 258/3000 [05:53<1:01:40,  1.35s/it]

episode: 257, reward: -275.84, average_reward: -336.4490290393402


  9%|▊         | 259/3000 [05:55<1:02:01,  1.36s/it]

episode: 258, reward: -285.5, average_reward: -336.5707003157738


  9%|▊         | 260/3000 [05:56<1:01:51,  1.35s/it]

episode: 259, reward: -1474.49, average_reward: -338.1770876581248


  9%|▊         | 261/3000 [05:58<1:01:53,  1.36s/it]

episode: 260, reward: -259.76, average_reward: -459.0759675810629


  9%|▊         | 262/3000 [05:59<1:02:09,  1.36s/it]

episode: 261, reward: -254.25, average_reward: -460.9364413391921


  9%|▉         | 263/3000 [06:00<1:02:02,  1.36s/it]

episode: 262, reward: -265.11, average_reward: -392.5820799448214


  9%|▉         | 264/3000 [06:02<1:01:10,  1.34s/it]

episode: 263, reward: -260.22, average_reward: -383.69247804964044


  9%|▉         | 265/3000 [06:03<1:01:27,  1.35s/it]

episode: 264, reward: -331.77, average_reward: -381.5555745734049


  9%|▉         | 266/3000 [06:04<1:01:15,  1.34s/it]

episode: 265, reward: -272.65, average_reward: -391.583853474918


  9%|▉         | 267/3000 [06:06<1:01:20,  1.35s/it]

episode: 266, reward: -276.66, average_reward: -392.8783570421651


  9%|▉         | 268/3000 [06:07<1:00:36,  1.33s/it]

episode: 267, reward: -282.82, average_reward: -395.6241842552493


  9%|▉         | 269/3000 [06:08<1:01:43,  1.36s/it]

episode: 268, reward: -233.77, average_reward: -396.322659815836


  9%|▉         | 270/3000 [06:10<1:01:51,  1.36s/it]

episode: 269, reward: -256.2, average_reward: -391.14888661347055


  9%|▉         | 271/3000 [06:11<1:01:30,  1.35s/it]

episode: 270, reward: -287.42, average_reward: -269.31919504413156


  9%|▉         | 272/3000 [06:12<1:01:22,  1.35s/it]

episode: 271, reward: -1529.41, average_reward: -272.0852622333192


  9%|▉         | 273/3000 [06:14<1:01:28,  1.35s/it]

episode: 272, reward: -269.34, average_reward: -399.6020643356677


  9%|▉         | 274/3000 [06:15<1:01:01,  1.34s/it]

episode: 273, reward: -268.36, average_reward: -400.0255019911685


  9%|▉         | 275/3000 [06:16<1:01:01,  1.34s/it]

episode: 274, reward: -251.92, average_reward: -400.83956262524487


  9%|▉         | 276/3000 [06:18<1:00:46,  1.34s/it]

episode: 275, reward: -269.34, average_reward: -392.8546035521996


  9%|▉         | 277/3000 [06:19<1:00:45,  1.34s/it]

episode: 276, reward: -274.98, average_reward: -392.52336982191156


  9%|▉         | 278/3000 [06:20<1:00:11,  1.33s/it]

episode: 277, reward: -283.46, average_reward: -392.35553154255075


  9%|▉         | 279/3000 [06:22<1:00:32,  1.34s/it]

episode: 278, reward: -273.82, average_reward: -392.4195974432642


  9%|▉         | 280/3000 [06:23<1:00:48,  1.34s/it]

episode: 279, reward: -285.32, average_reward: -396.42556334351866


  9%|▉         | 281/3000 [06:24<1:01:04,  1.35s/it]

episode: 280, reward: -243.27, average_reward: -399.3381230789288


  9%|▉         | 282/3000 [06:26<1:00:38,  1.34s/it]

episode: 281, reward: -283.16, average_reward: -394.92325988857533


  9%|▉         | 283/3000 [06:27<1:00:33,  1.34s/it]

episode: 282, reward: -276.45, average_reward: -270.2981786435493


  9%|▉         | 284/3000 [06:28<1:00:21,  1.33s/it]

episode: 283, reward: -280.18, average_reward: -271.0088003781432


 10%|▉         | 285/3000 [06:30<1:01:07,  1.35s/it]

episode: 284, reward: -257.73, average_reward: -272.1908550991883


 10%|▉         | 286/3000 [06:31<1:01:27,  1.36s/it]

episode: 285, reward: -253.45, average_reward: -272.77238507536265


 10%|▉         | 287/3000 [06:33<1:01:33,  1.36s/it]

episode: 286, reward: -278.12, average_reward: -271.18327837880696


 10%|▉         | 288/3000 [06:34<1:01:07,  1.35s/it]

episode: 287, reward: -275.47, average_reward: -271.4971208469715


 10%|▉         | 289/3000 [06:35<1:00:37,  1.34s/it]

episode: 288, reward: -291.0, average_reward: -270.69789545079027


 10%|▉         | 290/3000 [06:37<1:01:07,  1.35s/it]

episode: 289, reward: -270.63, average_reward: -272.4151171131763


 10%|▉         | 291/3000 [06:38<1:01:11,  1.36s/it]

episode: 290, reward: -266.27, average_reward: -270.9457863006647


 10%|▉         | 292/3000 [06:39<1:00:56,  1.35s/it]

episode: 291, reward: -275.43, average_reward: -273.2461108633514


 10%|▉         | 293/3000 [06:41<1:00:50,  1.35s/it]

episode: 292, reward: -282.73, average_reward: -272.47253150659583


 10%|▉         | 294/3000 [06:42<1:01:46,  1.37s/it]

episode: 293, reward: -721.19, average_reward: -273.1010756778411


 10%|▉         | 295/3000 [06:43<1:01:22,  1.36s/it]

episode: 294, reward: -269.41, average_reward: -317.2016963641525


 10%|▉         | 296/3000 [06:45<1:01:16,  1.36s/it]

episode: 295, reward: -285.84, average_reward: -318.36980852355816


 10%|▉         | 297/3000 [06:46<1:00:51,  1.35s/it]

episode: 296, reward: -278.26, average_reward: -321.60883062329424


 10%|▉         | 298/3000 [06:47<1:00:41,  1.35s/it]

episode: 297, reward: -281.23, average_reward: -321.62302854578235


 10%|▉         | 299/3000 [06:49<1:00:21,  1.34s/it]

episode: 298, reward: -266.7, average_reward: -322.19941011637013


 10%|█         | 300/3000 [06:50<1:00:03,  1.33s/it]

episode: 299, reward: -279.03, average_reward: -319.76939658448646


 10%|█         | 301/3000 [06:51<59:50,  1.33s/it]  

episode: 300, reward: -273.83, average_reward: -320.60932437865483


 10%|█         | 302/3000 [06:53<1:00:08,  1.34s/it]

episode: 301, reward: -274.18, average_reward: -321.36519446655177


 10%|█         | 303/3000 [06:54<1:00:56,  1.36s/it]

episode: 302, reward: -256.14, average_reward: -321.2405411838178


 10%|█         | 304/3000 [06:56<1:02:39,  1.39s/it]

episode: 303, reward: -270.27, average_reward: -318.5813025412955


 10%|█         | 305/3000 [06:57<1:02:55,  1.40s/it]

episode: 304, reward: -268.89, average_reward: -273.48947406019073


 10%|█         | 306/3000 [06:59<1:04:20,  1.43s/it]

episode: 305, reward: -259.59, average_reward: -273.43727054893964


 10%|█         | 307/3000 [07:00<1:03:49,  1.42s/it]

episode: 306, reward: -271.15, average_reward: -270.8118718041152


 10%|█         | 308/3000 [07:01<1:03:44,  1.42s/it]

episode: 307, reward: -284.77, average_reward: -270.1008119813735


 10%|█         | 309/3000 [07:03<1:04:35,  1.44s/it]

episode: 308, reward: -257.01, average_reward: -270.45449807048124


 10%|█         | 310/3000 [07:04<1:03:55,  1.43s/it]

episode: 309, reward: -279.03, average_reward: -269.4861889041417


 10%|█         | 311/3000 [07:06<1:03:42,  1.42s/it]

episode: 310, reward: -1600.54, average_reward: -269.4865131353747


 10%|█         | 312/3000 [07:07<1:02:37,  1.40s/it]

episode: 311, reward: -275.33, average_reward: -402.15775361569297


 10%|█         | 313/3000 [07:08<1:01:46,  1.38s/it]

episode: 312, reward: -276.93, average_reward: -402.2724201659417


 10%|█         | 314/3000 [07:10<1:01:55,  1.38s/it]

episode: 313, reward: -277.85, average_reward: -404.351026458207


 10%|█         | 315/3000 [07:11<1:01:44,  1.38s/it]

episode: 314, reward: -248.59, average_reward: -405.10877981198087


 11%|█         | 316/3000 [07:12<1:01:48,  1.38s/it]

episode: 315, reward: -257.55, average_reward: -403.0785260542425


 11%|█         | 317/3000 [07:14<1:01:22,  1.37s/it]

episode: 316, reward: -273.33, average_reward: -402.8749023157416


 11%|█         | 318/3000 [07:15<1:01:44,  1.38s/it]

episode: 317, reward: -256.21, average_reward: -403.0923266207009


 11%|█         | 319/3000 [07:17<1:02:07,  1.39s/it]

episode: 318, reward: -252.11, average_reward: -400.2366361198135


 11%|█         | 320/3000 [07:18<1:03:05,  1.41s/it]

episode: 319, reward: -246.01, average_reward: -399.74618277096033


 11%|█         | 321/3000 [07:19<1:03:08,  1.41s/it]

episode: 320, reward: -282.16, average_reward: -396.4440533978712


 11%|█         | 322/3000 [07:21<1:02:22,  1.40s/it]

episode: 321, reward: -277.06, average_reward: -264.6062233143864


 11%|█         | 323/3000 [07:22<1:01:50,  1.39s/it]

episode: 322, reward: -271.19, average_reward: -264.77995107189406


 11%|█         | 324/3000 [07:24<1:01:53,  1.39s/it]

episode: 323, reward: -270.72, average_reward: -264.2065715057501


 11%|█         | 325/3000 [07:25<1:02:06,  1.39s/it]

episode: 324, reward: -252.47, average_reward: -263.49349621358476


 11%|█         | 326/3000 [07:26<1:01:29,  1.38s/it]

episode: 325, reward: -268.16, average_reward: -263.88126752336785


 11%|█         | 327/3000 [07:28<1:01:26,  1.38s/it]

episode: 326, reward: -825.48, average_reward: -264.94210128889074


 11%|█         | 328/3000 [07:29<1:02:00,  1.39s/it]

episode: 327, reward: -235.93, average_reward: -320.15768297224867


 11%|█         | 329/3000 [07:31<1:02:17,  1.40s/it]

episode: 328, reward: -264.54, average_reward: -318.1291556686375


 11%|█         | 330/3000 [07:32<1:02:07,  1.40s/it]

episode: 329, reward: -1551.09, average_reward: -319.372074661033


 11%|█         | 331/3000 [07:33<1:01:22,  1.38s/it]

episode: 330, reward: -277.9, average_reward: -449.8804396641409


 11%|█         | 332/3000 [07:35<1:01:48,  1.39s/it]

episode: 331, reward: -353.55, average_reward: -449.45442455645417


 11%|█         | 333/3000 [07:36<1:01:23,  1.38s/it]

episode: 332, reward: -264.13, average_reward: -457.1026211274255


 11%|█         | 334/3000 [07:37<1:01:49,  1.39s/it]

episode: 333, reward: -286.15, average_reward: -456.3964819178739


 11%|█         | 335/3000 [07:39<1:01:22,  1.38s/it]

episode: 334, reward: -259.15, average_reward: -457.9401301479926


 11%|█         | 336/3000 [07:40<1:01:19,  1.38s/it]

episode: 335, reward: -249.64, average_reward: -458.6084243639583


 11%|█         | 337/3000 [07:42<1:01:46,  1.39s/it]

episode: 336, reward: -259.45, average_reward: -456.75683232459625


 11%|█▏        | 338/3000 [07:43<1:01:46,  1.39s/it]

episode: 337, reward: -274.25, average_reward: -400.15323919362237


 11%|█▏        | 339/3000 [07:44<1:02:18,  1.40s/it]

episode: 338, reward: -251.21, average_reward: -403.98509751723014


 11%|█▏        | 340/3000 [07:46<1:01:44,  1.39s/it]

episode: 339, reward: -284.48, average_reward: -402.6521295527606


 11%|█▏        | 341/3000 [07:47<1:01:24,  1.39s/it]

episode: 340, reward: -278.86, average_reward: -275.99076266254195


 11%|█▏        | 342/3000 [07:49<1:01:04,  1.38s/it]

episode: 341, reward: -270.46, average_reward: -276.08650997814146


 11%|█▏        | 343/3000 [07:50<1:00:30,  1.37s/it]

episode: 342, reward: -1340.54, average_reward: -267.77789830997557


 11%|█▏        | 344/3000 [07:51<1:00:19,  1.36s/it]

episode: 343, reward: -289.8, average_reward: -375.41849313514825


 12%|█▏        | 345/3000 [07:53<1:00:56,  1.38s/it]

episode: 344, reward: -277.39, average_reward: -375.78353323070496


 12%|█▏        | 346/3000 [07:54<1:01:51,  1.40s/it]

episode: 345, reward: -151.0, average_reward: -377.60796865296527


 12%|█▏        | 347/3000 [07:55<1:01:26,  1.39s/it]

episode: 346, reward: -285.03, average_reward: -367.7433672476135


 12%|█▏        | 348/3000 [07:57<1:01:00,  1.38s/it]

episode: 347, reward: -261.65, average_reward: -370.30187175615873


 12%|█▏        | 349/3000 [07:58<1:00:58,  1.38s/it]

episode: 348, reward: -142.71, average_reward: -369.0422376948338


 12%|█▏        | 350/3000 [08:00<1:01:34,  1.39s/it]

episode: 349, reward: -1048.88, average_reward: -358.19242921157314


 12%|█▏        | 351/3000 [08:01<1:01:25,  1.39s/it]

episode: 350, reward: -277.57, average_reward: -434.6326399036684


 12%|█▏        | 352/3000 [08:02<1:01:01,  1.38s/it]

episode: 351, reward: -258.85, average_reward: -434.5030753807876


 12%|█▏        | 353/3000 [08:04<1:02:07,  1.41s/it]

episode: 352, reward: -256.6, average_reward: -433.3419885333668


 12%|█▏        | 354/3000 [08:05<1:02:22,  1.41s/it]

episode: 353, reward: -230.32, average_reward: -324.94827626189306


 12%|█▏        | 355/3000 [08:07<1:02:58,  1.43s/it]

episode: 354, reward: -283.2, average_reward: -318.9996308549708


 12%|█▏        | 356/3000 [08:08<1:02:39,  1.42s/it]

episode: 355, reward: -249.2, average_reward: -319.5801269319551


 12%|█▏        | 357/3000 [08:10<1:02:50,  1.43s/it]

episode: 356, reward: -603.66, average_reward: -329.40090562887036


 12%|█▏        | 358/3000 [08:11<1:01:49,  1.40s/it]

episode: 357, reward: -289.17, average_reward: -361.2633901116437


 12%|█▏        | 359/3000 [08:12<1:01:49,  1.40s/it]

episode: 358, reward: -599.71, average_reward: -364.0151180924499


 12%|█▏        | 360/3000 [08:14<1:01:39,  1.40s/it]

episode: 359, reward: -228.22, average_reward: -409.71504703036663


 12%|█▏        | 361/3000 [08:15<1:01:21,  1.40s/it]

episode: 360, reward: -271.21, average_reward: -327.6487114962606


 12%|█▏        | 362/3000 [08:17<1:01:52,  1.41s/it]

episode: 361, reward: -1152.97, average_reward: -327.0127988477813


 12%|█▏        | 363/3000 [08:18<1:01:51,  1.41s/it]

episode: 362, reward: -267.72, average_reward: -416.42468632151815


 12%|█▏        | 364/3000 [08:19<1:03:03,  1.44s/it]

episode: 363, reward: -242.13, average_reward: -417.53655348305176


 12%|█▏        | 365/3000 [08:21<1:01:32,  1.40s/it]

episode: 364, reward: -271.87, average_reward: -418.7183688007725


 12%|█▏        | 366/3000 [08:22<1:02:48,  1.43s/it]

episode: 365, reward: -217.61, average_reward: -417.5859124371324


 12%|█▏        | 367/3000 [08:24<1:01:41,  1.41s/it]

episode: 366, reward: -279.75, average_reward: -414.4262753888547


 12%|█▏        | 368/3000 [08:25<1:01:35,  1.40s/it]

episode: 367, reward: -270.64, average_reward: -382.0359787578268


 12%|█▏        | 369/3000 [08:26<1:02:06,  1.42s/it]

episode: 368, reward: -280.41, average_reward: -380.1828496536142


 12%|█▏        | 370/3000 [08:28<1:02:32,  1.43s/it]

episode: 369, reward: -255.84, average_reward: -348.25302311428266


 12%|█▏        | 371/3000 [08:29<1:02:03,  1.42s/it]

episode: 370, reward: -1434.94, average_reward: -351.01489406873657


 12%|█▏        | 372/3000 [08:31<1:01:44,  1.41s/it]

episode: 371, reward: -289.12, average_reward: -467.3885563900749


 12%|█▏        | 373/3000 [08:32<1:00:43,  1.39s/it]

episode: 372, reward: -280.5, average_reward: -381.0035413423388


 12%|█▏        | 374/3000 [08:33<1:01:11,  1.40s/it]

episode: 373, reward: -922.52, average_reward: -382.2815274601862


 12%|█▎        | 375/3000 [08:35<1:02:49,  1.44s/it]

episode: 374, reward: -758.52, average_reward: -450.32028814024534


 13%|█▎        | 376/3000 [08:36<1:02:59,  1.44s/it]

episode: 375, reward: -269.2, average_reward: -498.9843729022961


 13%|█▎        | 377/3000 [08:38<1:01:46,  1.41s/it]

episode: 376, reward: -244.39, average_reward: -504.14349612560727


 13%|█▎        | 378/3000 [08:39<1:01:36,  1.41s/it]

episode: 377, reward: -955.77, average_reward: -500.60748195338886


 13%|█▎        | 379/3000 [08:41<1:01:02,  1.40s/it]

episode: 378, reward: -272.09, average_reward: -569.120578730787


 13%|█▎        | 380/3000 [08:42<1:00:10,  1.38s/it]

episode: 379, reward: -273.4, average_reward: -568.2883814087362


 13%|█▎        | 381/3000 [08:43<1:00:26,  1.38s/it]

episode: 380, reward: -261.3, average_reward: -570.0442503047742


 13%|█▎        | 382/3000 [08:45<1:00:58,  1.40s/it]

episode: 381, reward: -263.93, average_reward: -452.68008179267343


 13%|█▎        | 383/3000 [08:45<46:04,  1.06s/it]  

Physics Error: [-1.          0.82791604 -1.         -1.          0.64148278]
episode: 382, reward: 5.35, average_reward: -450.16170323348354


 13%|█▎        | 384/3000 [08:47<53:41,  1.23s/it]

episode: 383, reward: -851.77, average_reward: -421.57669417568366


 13%|█▎        | 385/3000 [08:48<56:58,  1.31s/it]

episode: 384, reward: -245.14, average_reward: -414.501611112007


 13%|█▎        | 386/3000 [08:49<57:40,  1.32s/it]

episode: 385, reward: -452.77, average_reward: -363.1636952759335


 13%|█▎        | 387/3000 [08:51<58:03,  1.33s/it]

episode: 386, reward: -271.01, average_reward: -381.52120471163477


 13%|█▎        | 388/3000 [08:52<59:00,  1.36s/it]

episode: 387, reward: -249.08, average_reward: -384.1831207870197


 13%|█▎        | 389/3000 [08:54<59:37,  1.37s/it]

episode: 388, reward: -1105.33, average_reward: -313.51427511631135


 13%|█▎        | 390/3000 [08:55<59:35,  1.37s/it]

episode: 389, reward: -267.2, average_reward: -396.8382327938939


 13%|█▎        | 391/3000 [08:56<59:50,  1.38s/it]

episode: 390, reward: -270.29, average_reward: -396.2185508074707


 13%|█▎        | 392/3000 [08:58<1:02:21,  1.43s/it]

episode: 391, reward: -282.76, average_reward: -397.1169992125041


 13%|█▎        | 393/3000 [08:59<1:03:08,  1.45s/it]

episode: 392, reward: -255.57, average_reward: -398.99956363724294


 13%|█▎        | 394/3000 [09:01<1:01:58,  1.43s/it]

episode: 393, reward: -285.18, average_reward: -425.09126231654534


 13%|█▎        | 395/3000 [09:02<1:01:06,  1.41s/it]

episode: 394, reward: -294.1, average_reward: -368.43262244392105


 13%|█▎        | 396/3000 [09:04<1:00:25,  1.39s/it]

episode: 395, reward: -268.32, average_reward: -373.3286966372899


 13%|█▎        | 397/3000 [09:05<1:00:02,  1.38s/it]

episode: 396, reward: -273.95, average_reward: -354.88377365709925


 13%|█▎        | 398/3000 [09:06<59:55,  1.38s/it]  

episode: 397, reward: -283.08, average_reward: -355.17744427756395


 13%|█▎        | 399/3000 [09:08<59:33,  1.37s/it]

episode: 398, reward: -1137.87, average_reward: -358.57779669969955


 13%|█▎        | 400/3000 [09:09<59:30,  1.37s/it]

episode: 399, reward: -280.6, average_reward: -361.83219158845435


 13%|█▎        | 401/3000 [09:10<59:44,  1.38s/it]

episode: 400, reward: -276.79, average_reward: -363.17220377506254


 13%|█▎        | 402/3000 [09:12<1:01:50,  1.43s/it]

episode: 401, reward: -271.3, average_reward: -363.8230455984175


 13%|█▎        | 403/3000 [09:13<1:01:35,  1.42s/it]

episode: 402, reward: -249.8, average_reward: -362.6770293024073


 13%|█▎        | 404/3000 [09:15<1:01:27,  1.42s/it]

episode: 403, reward: -280.98, average_reward: -362.10011030207005


 14%|█▎        | 405/3000 [09:16<1:01:26,  1.42s/it]

episode: 404, reward: -278.94, average_reward: -361.67979115549343


 14%|█▎        | 406/3000 [09:18<1:00:12,  1.39s/it]

episode: 405, reward: -274.43, average_reward: -360.16400781330697


 14%|█▎        | 407/3000 [09:19<59:58,  1.39s/it]  

episode: 406, reward: -248.69, average_reward: -360.7743090654542


 14%|█▎        | 408/3000 [09:20<1:00:01,  1.39s/it]

episode: 407, reward: -270.86, average_reward: -358.2479671745843


 14%|█▎        | 409/3000 [09:22<59:44,  1.38s/it]  

episode: 408, reward: -279.64, average_reward: -357.0255791519888


 14%|█▎        | 410/3000 [09:23<59:24,  1.38s/it]

episode: 409, reward: -266.71, average_reward: -271.20218794779964


 14%|█▎        | 411/3000 [09:24<59:42,  1.38s/it]

episode: 410, reward: -280.66, average_reward: -269.81291334297987


 14%|█▎        | 412/3000 [09:26<59:42,  1.38s/it]

episode: 411, reward: -268.04, average_reward: -270.19908264141674


 14%|█▍        | 413/3000 [09:27<58:57,  1.37s/it]

episode: 412, reward: -278.07, average_reward: -269.8735666481349


 14%|█▍        | 414/3000 [09:28<58:29,  1.36s/it]

episode: 413, reward: -280.05, average_reward: -272.7010318159968


 14%|█▍        | 415/3000 [09:30<58:36,  1.36s/it]

episode: 414, reward: -263.76, average_reward: -272.60750232946646


 14%|█▍        | 416/3000 [09:31<58:14,  1.35s/it]

episode: 415, reward: -278.07, average_reward: -271.0895127220554


 14%|█▍        | 417/3000 [09:33<58:18,  1.35s/it]

episode: 416, reward: -292.33, average_reward: -271.4541079015672


 14%|█▍        | 418/3000 [09:34<58:51,  1.37s/it]

episode: 417, reward: -249.34, average_reward: -275.81857345637144


 14%|█▍        | 419/3000 [09:35<59:08,  1.38s/it]

episode: 418, reward: -248.28, average_reward: -273.6663911947278


 14%|█▍        | 420/3000 [09:37<59:02,  1.37s/it]

episode: 419, reward: -250.05, average_reward: -270.5301440795109


 14%|█▍        | 421/3000 [09:38<58:03,  1.35s/it]

episode: 420, reward: -282.79, average_reward: -268.86485909734586


 14%|█▍        | 422/3000 [09:39<58:23,  1.36s/it]

episode: 421, reward: -262.3, average_reward: -269.0779811891179


 14%|█▍        | 423/3000 [09:41<57:48,  1.35s/it]

episode: 422, reward: -268.62, average_reward: -268.50397055331877


 14%|█▍        | 424/3000 [09:42<57:57,  1.35s/it]

episode: 423, reward: -279.78, average_reward: -267.55850765776455


 14%|█▍        | 425/3000 [09:44<59:28,  1.39s/it]

episode: 424, reward: -233.77, average_reward: -267.53146220500537


 14%|█▍        | 426/3000 [09:45<59:00,  1.38s/it]

episode: 425, reward: -279.91, average_reward: -264.5330366289066


 14%|█▍        | 427/3000 [09:46<59:35,  1.39s/it]

episode: 426, reward: -253.16, average_reward: -264.71653410899165


 14%|█▍        | 428/3000 [09:48<59:49,  1.40s/it]

episode: 427, reward: -261.82, average_reward: -260.7996876408237


 14%|█▍        | 429/3000 [09:49<1:00:14,  1.41s/it]

episode: 428, reward: -276.81, average_reward: -262.04838171203676


 14%|█▍        | 430/3000 [09:51<59:46,  1.40s/it]  

episode: 429, reward: -262.29, average_reward: -264.90211607370077


 14%|█▍        | 431/3000 [09:52<59:49,  1.40s/it]

episode: 430, reward: -285.28, average_reward: -266.1253341671471


 14%|█▍        | 432/3000 [09:53<59:01,  1.38s/it]

episode: 431, reward: -263.36, average_reward: -266.3748840309861


 14%|█▍        | 433/3000 [09:55<58:12,  1.36s/it]

episode: 432, reward: -283.66, average_reward: -266.48035145362394


 14%|█▍        | 434/3000 [09:56<58:25,  1.37s/it]

episode: 433, reward: -273.17, average_reward: -267.98504292012547


 14%|█▍        | 435/3000 [09:57<58:23,  1.37s/it]

episode: 434, reward: -270.2, average_reward: -267.3242009589685


 15%|█▍        | 436/3000 [09:59<58:36,  1.37s/it]

episode: 435, reward: -268.46, average_reward: -270.96635605633816


 15%|█▍        | 437/3000 [10:00<58:05,  1.36s/it]

episode: 436, reward: -281.43, average_reward: -269.82175366147465


 15%|█▍        | 438/3000 [10:01<58:39,  1.37s/it]

episode: 437, reward: -283.32, average_reward: -272.6489330136309


 15%|█▍        | 439/3000 [10:03<59:15,  1.39s/it]

episode: 438, reward: -1446.02, average_reward: -274.7986284950815


 15%|█▍        | 440/3000 [10:04<57:50,  1.36s/it]

episode: 439, reward: -1650.26, average_reward: -391.719017719531


 15%|█▍        | 441/3000 [10:06<58:49,  1.38s/it]

episode: 440, reward: -234.25, average_reward: -530.5165241748521


 15%|█▍        | 442/3000 [10:07<58:27,  1.37s/it]

episode: 441, reward: -279.37, average_reward: -525.4134826896629


 15%|█▍        | 443/3000 [10:08<57:38,  1.35s/it]

episode: 442, reward: -278.65, average_reward: -527.0147410627984


 15%|█▍        | 444/3000 [10:10<57:37,  1.35s/it]

episode: 443, reward: -276.9, average_reward: -526.513474154623


 15%|█▍        | 445/3000 [10:11<58:28,  1.37s/it]

episode: 444, reward: -285.49, average_reward: -526.8863062418386


 15%|█▍        | 446/3000 [10:12<58:23,  1.37s/it]

episode: 445, reward: -283.07, average_reward: -528.415452895489


 15%|█▍        | 447/3000 [10:14<1:00:19,  1.42s/it]

episode: 446, reward: -235.22, average_reward: -529.8762887219759


 15%|█▍        | 448/3000 [10:15<59:44,  1.40s/it]  

episode: 447, reward: -278.75, average_reward: -525.2549859963592


 15%|█▍        | 449/3000 [10:17<58:56,  1.39s/it]

episode: 448, reward: -283.41, average_reward: -524.7976857281095


 15%|█▌        | 450/3000 [10:18<58:37,  1.38s/it]

episode: 449, reward: -289.09, average_reward: -408.53664337019006


 15%|█▌        | 451/3000 [10:19<58:11,  1.37s/it]

episode: 450, reward: -262.1, average_reward: -272.41918402552574


 15%|█▌        | 452/3000 [10:21<57:40,  1.36s/it]

episode: 451, reward: -288.87, average_reward: -275.2041014961427


 15%|█▌        | 453/3000 [10:22<57:22,  1.35s/it]

episode: 452, reward: -264.85, average_reward: -276.1542584713995


 15%|█▌        | 454/3000 [10:23<57:25,  1.35s/it]

episode: 453, reward: -268.0, average_reward: -274.774051536481


 15%|█▌        | 455/3000 [10:25<57:22,  1.35s/it]

episode: 454, reward: -288.56, average_reward: -273.8840426386196


 15%|█▌        | 456/3000 [10:26<58:10,  1.37s/it]

episode: 455, reward: -1046.02, average_reward: -274.1917404316608


 15%|█▌        | 457/3000 [10:28<58:25,  1.38s/it]

episode: 456, reward: -171.67, average_reward: -350.48627477812744


 15%|█▌        | 458/3000 [10:29<58:21,  1.38s/it]

episode: 457, reward: -258.79, average_reward: -344.1312637794264


 15%|█▌        | 459/3000 [10:30<59:01,  1.39s/it]

episode: 458, reward: -252.94, average_reward: -342.13519349679865


 15%|█▌        | 460/3000 [10:32<58:10,  1.37s/it]

episode: 459, reward: -290.02, average_reward: -339.0880697970219


 15%|█▌        | 461/3000 [10:33<57:58,  1.37s/it]

episode: 460, reward: -243.77, average_reward: -339.1814143207694


 15%|█▌        | 462/3000 [10:34<57:54,  1.37s/it]

episode: 461, reward: -287.69, average_reward: -337.3479362188208


 15%|█▌        | 463/3000 [10:36<57:48,  1.37s/it]

episode: 462, reward: -284.25, average_reward: -337.2292212696166


 15%|█▌        | 464/3000 [10:37<57:32,  1.36s/it]

episode: 463, reward: -268.27, average_reward: -339.1697001812309


 16%|█▌        | 465/3000 [10:38<57:14,  1.35s/it]

episode: 464, reward: -291.15, average_reward: -339.19760738053145


 16%|█▌        | 466/3000 [10:40<57:12,  1.35s/it]

episode: 465, reward: -271.83, average_reward: -339.4556940495883


 16%|█▌        | 467/3000 [10:41<57:56,  1.37s/it]

episode: 466, reward: -306.56, average_reward: -262.0368535984347


 16%|█▌        | 468/3000 [10:43<57:35,  1.36s/it]

episode: 467, reward: -1237.46, average_reward: -275.5259785765094


 16%|█▌        | 469/3000 [10:44<57:13,  1.36s/it]

episode: 468, reward: -294.53, average_reward: -373.39343185605946


 16%|█▌        | 470/3000 [10:45<56:56,  1.35s/it]

episode: 469, reward: -267.78, average_reward: -377.5531007259899


 16%|█▌        | 471/3000 [10:47<56:37,  1.34s/it]

episode: 470, reward: -271.13, average_reward: -375.32881579879097


 16%|█▌        | 472/3000 [10:48<56:58,  1.35s/it]

episode: 471, reward: -272.15, average_reward: -378.0649854199535


 16%|█▌        | 473/3000 [10:49<57:49,  1.37s/it]

episode: 472, reward: -242.84, average_reward: -376.5113162512845


 16%|█▌        | 474/3000 [10:51<57:26,  1.36s/it]

episode: 473, reward: -280.27, average_reward: -372.3699645353647


 16%|█▌        | 475/3000 [10:52<57:25,  1.36s/it]

episode: 474, reward: -269.25, average_reward: -373.5698832100728


 16%|█▌        | 476/3000 [10:53<57:56,  1.38s/it]

episode: 475, reward: -260.7, average_reward: -371.3802527484148


 16%|█▌        | 477/3000 [10:55<57:26,  1.37s/it]

episode: 476, reward: -284.27, average_reward: -370.26746317110565


 16%|█▌        | 478/3000 [10:56<56:35,  1.35s/it]

episode: 477, reward: -269.12, average_reward: -368.0377933575418


 16%|█▌        | 479/3000 [10:57<56:34,  1.35s/it]

episode: 478, reward: -276.77, average_reward: -271.2036509821633


 16%|█▌        | 480/3000 [10:59<56:14,  1.34s/it]

episode: 479, reward: -285.29, average_reward: -269.4276888691514


 16%|█▌        | 481/3000 [11:00<56:33,  1.35s/it]

episode: 480, reward: -274.74, average_reward: -271.179032798044


 16%|█▌        | 482/3000 [11:02<57:00,  1.36s/it]

episode: 481, reward: -273.63, average_reward: -271.54028883650153


 16%|█▌        | 483/3000 [11:03<56:53,  1.36s/it]

episode: 482, reward: -1251.55, average_reward: -271.68855916350327


 16%|█▌        | 484/3000 [11:04<56:42,  1.35s/it]

episode: 483, reward: -273.28, average_reward: -372.55996729883174


 16%|█▌        | 485/3000 [11:06<56:23,  1.35s/it]

episode: 484, reward: -282.96, average_reward: -371.8603992360022


 16%|█▌        | 486/3000 [11:07<56:01,  1.34s/it]

episode: 485, reward: -266.35, average_reward: -373.23147870083665


 16%|█▌        | 487/3000 [11:08<55:53,  1.33s/it]

episode: 486, reward: -276.33, average_reward: -373.7968554844805


 16%|█▋        | 488/3000 [11:10<56:34,  1.35s/it]

episode: 487, reward: -247.83, average_reward: -373.0035467498893


 16%|█▋        | 489/3000 [11:11<57:17,  1.37s/it]

episode: 488, reward: -249.32, average_reward: -370.8743371743582


 16%|█▋        | 490/3000 [11:12<57:54,  1.38s/it]

episode: 489, reward: -261.49, average_reward: -368.1291363115294


 16%|█▋        | 491/3000 [11:14<57:09,  1.37s/it]

episode: 490, reward: -263.39, average_reward: -365.7491649509984


 16%|█▋        | 492/3000 [11:15<56:12,  1.34s/it]

episode: 491, reward: -277.39, average_reward: -364.6135317538762


 16%|█▋        | 493/3000 [11:16<56:29,  1.35s/it]

episode: 492, reward: -267.43, average_reward: -364.9890679137722


 16%|█▋        | 494/3000 [11:18<57:01,  1.37s/it]

episode: 493, reward: -255.94, average_reward: -266.57685435668657


 16%|█▋        | 495/3000 [11:19<56:33,  1.35s/it]

episode: 494, reward: -275.79, average_reward: -264.8426698402571


 17%|█▋        | 496/3000 [11:20<56:16,  1.35s/it]

episode: 495, reward: -279.55, average_reward: -264.12539392062314


 17%|█▋        | 497/3000 [11:22<55:50,  1.34s/it]

episode: 496, reward: -273.92, average_reward: -265.4446642233387


 17%|█▋        | 498/3000 [11:23<55:58,  1.34s/it]

episode: 497, reward: -263.3, average_reward: -265.2036524887282


 17%|█▋        | 499/3000 [11:24<55:49,  1.34s/it]

episode: 498, reward: -286.42, average_reward: -266.750442129047


 17%|█▋        | 500/3000 [11:26<55:53,  1.34s/it]

episode: 499, reward: -263.41, average_reward: -270.46035155804697


 17%|█▋        | 501/3000 [11:27<56:06,  1.35s/it]

episode: 500, reward: -1699.98, average_reward: -270.6518606578758


 17%|█▋        | 502/3000 [11:29<56:17,  1.35s/it]

episode: 501, reward: -291.57, average_reward: -414.3117336283423


 17%|█▋        | 503/3000 [11:30<57:37,  1.38s/it]

episode: 502, reward: -265.0, average_reward: -415.7300239744492


 17%|█▋        | 504/3000 [11:31<58:45,  1.41s/it]

episode: 503, reward: -278.95, average_reward: -415.4864932202055


 17%|█▋        | 505/3000 [11:33<58:58,  1.42s/it]

episode: 504, reward: -471.33, average_reward: -417.7881752598525


 17%|█▋        | 506/3000 [11:34<59:31,  1.43s/it]

episode: 505, reward: -231.22, average_reward: -437.3424067327213


 17%|█▋        | 507/3000 [11:36<58:39,  1.41s/it]

episode: 506, reward: -270.71, average_reward: -432.5102761440859


 17%|█▋        | 508/3000 [11:37<57:32,  1.39s/it]

episode: 507, reward: -280.71, average_reward: -432.1888112672403


 17%|█▋        | 509/3000 [11:39<58:36,  1.41s/it]

episode: 508, reward: -223.47, average_reward: -433.9301770031916


 17%|█▋        | 510/3000 [11:40<57:51,  1.39s/it]

episode: 509, reward: -267.07, average_reward: -427.6351004452643


 17%|█▋        | 511/3000 [11:41<57:58,  1.40s/it]

episode: 510, reward: -249.76, average_reward: -428.00112596867586


 17%|█▋        | 512/3000 [11:43<57:25,  1.39s/it]

episode: 511, reward: -265.48, average_reward: -282.9785338217831


 17%|█▋        | 513/3000 [11:44<57:02,  1.38s/it]

episode: 512, reward: -282.3, average_reward: -280.3691707787576


 17%|█▋        | 514/3000 [11:45<56:34,  1.37s/it]

episode: 513, reward: -885.49, average_reward: -282.0996422910317


 17%|█▋        | 515/3000 [11:47<57:13,  1.38s/it]

episode: 514, reward: -245.81, average_reward: -342.75326477364825


 17%|█▋        | 516/3000 [11:48<56:21,  1.36s/it]

episode: 515, reward: -266.72, average_reward: -320.20125186336077


 17%|█▋        | 517/3000 [11:49<56:05,  1.36s/it]

episode: 516, reward: -252.93, average_reward: -323.75076024023633


 17%|█▋        | 518/3000 [11:51<56:52,  1.37s/it]

episode: 517, reward: -268.67, average_reward: -321.97300989271605


 17%|█▋        | 519/3000 [11:52<56:47,  1.37s/it]

episode: 518, reward: -259.4, average_reward: -320.76905548098955


 17%|█▋        | 520/3000 [11:53<55:52,  1.35s/it]

episode: 519, reward: -280.14, average_reward: -324.36216212499465


 17%|█▋        | 521/3000 [11:55<56:15,  1.36s/it]

episode: 520, reward: -263.17, average_reward: -325.669317178029


 17%|█▋        | 522/3000 [11:56<55:57,  1.36s/it]

episode: 521, reward: -266.53, average_reward: -327.0104229351574


 17%|█▋        | 523/3000 [11:58<55:33,  1.35s/it]

episode: 522, reward: -278.27, average_reward: -327.11559159766017


 17%|█▋        | 524/3000 [11:59<55:02,  1.33s/it]

episode: 523, reward: -273.24, average_reward: -326.71239141934575


 18%|█▊        | 525/3000 [12:00<54:45,  1.33s/it]

episode: 524, reward: -288.91, average_reward: -265.4874995190868


 18%|█▊        | 526/3000 [12:01<54:34,  1.32s/it]

episode: 525, reward: -273.76, average_reward: -269.79795036262476


 18%|█▊        | 527/3000 [12:03<55:46,  1.35s/it]

episode: 526, reward: -241.92, average_reward: -270.50238378655047


 18%|█▊        | 528/3000 [12:04<55:18,  1.34s/it]

episode: 527, reward: -277.33, average_reward: -269.4015857359055


 18%|█▊        | 529/3000 [12:06<55:10,  1.34s/it]

episode: 528, reward: -276.85, average_reward: -270.2677180189129


 18%|█▊        | 530/3000 [12:07<55:34,  1.35s/it]

episode: 529, reward: -263.35, average_reward: -272.01294600343033


 18%|█▊        | 531/3000 [12:08<55:34,  1.35s/it]

episode: 530, reward: -272.58, average_reward: -270.3338668286418


 18%|█▊        | 532/3000 [12:10<55:15,  1.34s/it]

episode: 531, reward: -258.19, average_reward: -271.27475990403883


 18%|█▊        | 533/3000 [12:11<55:17,  1.34s/it]

episode: 532, reward: -253.56, average_reward: -270.4414150434315


 18%|█▊        | 534/3000 [12:12<55:15,  1.34s/it]

episode: 533, reward: -276.36, average_reward: -267.9705317893431


 18%|█▊        | 535/3000 [12:14<54:34,  1.33s/it]

episode: 534, reward: -279.64, average_reward: -268.2821425744042


 18%|█▊        | 536/3000 [12:15<54:55,  1.34s/it]

episode: 535, reward: -1003.0, average_reward: -267.35428249547715


 18%|█▊        | 537/3000 [12:16<55:43,  1.36s/it]

episode: 536, reward: -244.98, average_reward: -340.2776252563207


 18%|█▊        | 538/3000 [12:18<55:39,  1.36s/it]

episode: 537, reward: -281.01, average_reward: -340.58364365610976


 18%|█▊        | 539/3000 [12:19<55:09,  1.34s/it]

episode: 538, reward: -271.09, average_reward: -340.9517154940531


 18%|█▊        | 540/3000 [12:20<55:31,  1.35s/it]

episode: 539, reward: -275.21, average_reward: -340.3751917916905


 18%|█▊        | 541/3000 [12:22<55:26,  1.35s/it]

episode: 540, reward: -259.42, average_reward: -341.5616239070057


 18%|█▊        | 542/3000 [12:23<55:15,  1.35s/it]

episode: 541, reward: -267.1, average_reward: -340.2454649317377


 18%|█▊        | 543/3000 [12:24<54:53,  1.34s/it]

episode: 542, reward: -272.1, average_reward: -341.1359772916934


 18%|█▊        | 544/3000 [12:26<54:59,  1.34s/it]

episode: 543, reward: -277.59, average_reward: -342.9897832638056


 18%|█▊        | 545/3000 [12:27<56:25,  1.38s/it]

episode: 544, reward: -249.74, average_reward: -343.113071365785


 18%|█▊        | 546/3000 [12:29<56:18,  1.38s/it]

episode: 545, reward: -279.94, average_reward: -340.1234236190548


 18%|█▊        | 547/3000 [12:30<55:32,  1.36s/it]

episode: 546, reward: -277.47, average_reward: -267.8172750056709


 18%|█▊        | 548/3000 [12:31<56:20,  1.38s/it]

episode: 547, reward: -275.36, average_reward: -271.066523729406


 18%|█▊        | 549/3000 [12:33<57:23,  1.40s/it]

episode: 548, reward: -271.11, average_reward: -270.5010460060488


 18%|█▊        | 550/3000 [12:34<57:21,  1.40s/it]

episode: 549, reward: -234.11, average_reward: -270.50349026220096


 18%|█▊        | 551/3000 [12:36<56:40,  1.39s/it]

episode: 550, reward: -258.69, average_reward: -266.3938205241764


 18%|█▊        | 552/3000 [12:37<57:27,  1.41s/it]

episode: 551, reward: -981.63, average_reward: -266.3215153821299


 18%|█▊        | 553/3000 [12:38<57:43,  1.42s/it]

episode: 552, reward: -249.41, average_reward: -337.7749969183331


 18%|█▊        | 554/3000 [12:40<56:57,  1.40s/it]

episode: 553, reward: -278.23, average_reward: -335.5058471681075


 18%|█▊        | 555/3000 [12:41<56:05,  1.38s/it]

episode: 554, reward: -279.79, average_reward: -335.5695258611773


 19%|█▊        | 556/3000 [12:42<55:54,  1.37s/it]

episode: 555, reward: -282.5, average_reward: -338.5749344472006


 19%|█▊        | 557/3000 [12:44<56:03,  1.38s/it]

episode: 556, reward: -316.11, average_reward: -338.8308836082716


 19%|█▊        | 558/3000 [12:45<55:16,  1.36s/it]

episode: 557, reward: -281.97, average_reward: -342.69410394021736


 19%|█▊        | 559/3000 [12:47<56:09,  1.38s/it]

episode: 558, reward: -1234.9, average_reward: -343.35567956166255


 19%|█▊        | 560/3000 [12:48<55:45,  1.37s/it]

episode: 559, reward: -287.93, average_reward: -439.7342399040701


 19%|█▊        | 561/3000 [12:49<56:10,  1.38s/it]

episode: 560, reward: -303.81, average_reward: -445.1162351252992


 19%|█▊        | 562/3000 [12:51<56:54,  1.40s/it]

episode: 561, reward: -356.56, average_reward: -449.62803078940044


 19%|█▉        | 563/3000 [12:52<57:24,  1.41s/it]

episode: 562, reward: -274.99, average_reward: -387.1209959304953


 19%|█▉        | 564/3000 [12:54<57:14,  1.41s/it]

episode: 563, reward: -279.99, average_reward: -389.6788130249437


 19%|█▉        | 565/3000 [12:55<56:35,  1.39s/it]

episode: 564, reward: -274.25, average_reward: -389.85501041811136


 19%|█▉        | 566/3000 [12:56<56:31,  1.39s/it]

episode: 565, reward: -262.59, average_reward: -389.3004100902709


 19%|█▉        | 567/3000 [12:58<56:19,  1.39s/it]

episode: 566, reward: -285.46, average_reward: -387.3099204411607


 19%|█▉        | 568/3000 [12:59<56:51,  1.40s/it]

episode: 567, reward: -1331.89, average_reward: -384.2450448322758


 19%|█▉        | 569/3000 [13:01<56:29,  1.39s/it]

episode: 568, reward: -291.32, average_reward: -489.2365284316103


 19%|█▉        | 570/3000 [13:02<56:39,  1.40s/it]

episode: 569, reward: -284.45, average_reward: -394.87860538219945


 19%|█▉        | 571/3000 [13:03<56:04,  1.39s/it]

episode: 570, reward: -265.26, average_reward: -394.52999274103934


 19%|█▉        | 572/3000 [13:05<56:45,  1.40s/it]

episode: 571, reward: -212.89, average_reward: -390.6749606619918


 19%|█▉        | 573/3000 [13:06<55:49,  1.38s/it]

episode: 572, reward: -281.91, average_reward: -376.30765796157146


 19%|█▉        | 574/3000 [13:08<56:09,  1.39s/it]

episode: 573, reward: -1155.44, average_reward: -377.0004748640841


 19%|█▉        | 575/3000 [13:09<56:03,  1.39s/it]

episode: 574, reward: -268.43, average_reward: -464.5451910926637


 19%|█▉        | 576/3000 [13:10<55:37,  1.38s/it]

episode: 575, reward: -286.67, average_reward: -463.9634620681398


 19%|█▉        | 577/3000 [13:12<55:10,  1.37s/it]

episode: 576, reward: -282.02, average_reward: -466.37175054489546


 19%|█▉        | 578/3000 [13:13<55:56,  1.39s/it]

episode: 577, reward: -284.29, average_reward: -466.0282824628939


 19%|█▉        | 579/3000 [13:14<56:28,  1.40s/it]

episode: 578, reward: -272.44, average_reward: -361.2688159074179


 19%|█▉        | 580/3000 [13:16<56:42,  1.41s/it]

episode: 579, reward: -1079.37, average_reward: -359.3811021178848


 19%|█▉        | 581/3000 [13:17<55:56,  1.39s/it]

episode: 580, reward: -275.99, average_reward: -438.87295329374604


 19%|█▉        | 582/3000 [13:19<55:30,  1.38s/it]

episode: 581, reward: -289.56, average_reward: -439.9455389501169


 19%|█▉        | 583/3000 [13:20<57:17,  1.42s/it]

episode: 582, reward: -274.3, average_reward: -447.6125503516929


 19%|█▉        | 584/3000 [13:22<56:52,  1.41s/it]

episode: 583, reward: -266.34, average_reward: -446.85118008358097


 20%|█▉        | 585/3000 [13:23<57:18,  1.42s/it]

episode: 584, reward: -275.48, average_reward: -357.94160522471236


 20%|█▉        | 586/3000 [13:24<57:10,  1.42s/it]

episode: 585, reward: -275.95, average_reward: -358.6464005400182


 20%|█▉        | 587/3000 [13:26<56:22,  1.40s/it]

episode: 586, reward: -249.32, average_reward: -357.57394771133283


 20%|█▉        | 588/3000 [13:27<55:59,  1.39s/it]

episode: 587, reward: -283.75, average_reward: -354.30341126500446


 20%|█▉        | 589/3000 [13:28<55:37,  1.38s/it]

episode: 588, reward: -1427.59, average_reward: -354.24907570152675


 20%|█▉        | 590/3000 [13:30<55:35,  1.38s/it]

episode: 589, reward: -280.89, average_reward: -469.7643066138955


 20%|█▉        | 591/3000 [13:31<56:18,  1.40s/it]

episode: 590, reward: -260.5, average_reward: -389.91697903023703


 20%|█▉        | 592/3000 [13:33<57:11,  1.42s/it]

episode: 591, reward: -266.13, average_reward: -388.3680664926033


 20%|█▉        | 593/3000 [13:34<58:34,  1.46s/it]

episode: 592, reward: -239.91, average_reward: -386.0253436141957


 20%|█▉        | 594/3000 [13:36<58:25,  1.46s/it]

episode: 593, reward: -241.93, average_reward: -382.58609685649174


 20%|█▉        | 595/3000 [13:37<57:47,  1.44s/it]

episode: 594, reward: -277.67, average_reward: -380.14526557230243


 20%|█▉        | 596/3000 [13:39<57:58,  1.45s/it]

episode: 595, reward: -242.17, average_reward: -380.36459969594847


 20%|█▉        | 597/3000 [13:40<57:45,  1.44s/it]

episode: 596, reward: -250.04, average_reward: -376.9867821488823


 20%|█▉        | 598/3000 [13:41<57:31,  1.44s/it]

episode: 597, reward: -279.62, average_reward: -377.05922686613064


 20%|█▉        | 599/3000 [13:43<58:29,  1.46s/it]

episode: 598, reward: -269.74, average_reward: -376.6458013455823


 20%|██        | 600/3000 [13:44<58:24,  1.46s/it]

episode: 599, reward: -234.65, average_reward: -260.86015494688206


 20%|██        | 601/3000 [13:46<58:34,  1.46s/it]

episode: 600, reward: -272.48, average_reward: -256.23591981223245


 20%|██        | 602/3000 [13:47<57:19,  1.43s/it]

episode: 601, reward: -278.07, average_reward: -257.43376907455274


 20%|██        | 603/3000 [13:49<56:04,  1.40s/it]

episode: 602, reward: -272.38, average_reward: -258.62779762724807


 20%|██        | 604/3000 [13:50<55:34,  1.39s/it]

episode: 603, reward: -272.07, average_reward: -261.8754166205611


 20%|██        | 605/3000 [13:51<55:02,  1.38s/it]

episode: 604, reward: -278.79, average_reward: -264.888843016789


 20%|██        | 606/3000 [13:53<55:39,  1.40s/it]

episode: 605, reward: -263.7, average_reward: -265.00069518000305


 20%|██        | 607/3000 [13:54<57:00,  1.43s/it]

episode: 606, reward: -245.6, average_reward: -267.1531925332236


 20%|██        | 608/3000 [13:56<57:08,  1.43s/it]

episode: 607, reward: -218.91, average_reward: -266.7093690295599


 20%|██        | 609/3000 [13:57<56:34,  1.42s/it]

episode: 608, reward: -268.79, average_reward: -260.639242175496


 20%|██        | 610/3000 [13:59<57:16,  1.44s/it]

episode: 609, reward: -274.66, average_reward: -260.5446432197883


 20%|██        | 611/3000 [14:00<58:14,  1.46s/it]

episode: 610, reward: -261.51, average_reward: -264.54537744513533


 20%|██        | 612/3000 [14:02<58:33,  1.47s/it]

episode: 611, reward: -259.46, average_reward: -263.44899049602026


 20%|██        | 613/3000 [14:03<58:22,  1.47s/it]

episode: 612, reward: -272.39, average_reward: -261.5872242787707


 20%|██        | 614/3000 [14:05<58:56,  1.48s/it]

episode: 613, reward: -260.95, average_reward: -261.5881491714309


 20%|██        | 615/3000 [14:06<57:33,  1.45s/it]

episode: 614, reward: -272.76, average_reward: -260.4766997929262


 21%|██        | 616/3000 [14:07<57:59,  1.46s/it]

episode: 615, reward: -254.58, average_reward: -259.8741642819836


 21%|██        | 617/3000 [14:09<57:35,  1.45s/it]

episode: 616, reward: -265.34, average_reward: -258.9624524463767


 21%|██        | 618/3000 [14:10<57:23,  1.45s/it]

episode: 617, reward: -259.04, average_reward: -260.9356880894772


 21%|██        | 619/3000 [14:12<56:33,  1.43s/it]

episode: 618, reward: -270.87, average_reward: -264.947853731275


 21%|██        | 620/3000 [14:13<56:15,  1.42s/it]

episode: 619, reward: -259.37, average_reward: -265.1560954548234


 21%|██        | 621/3000 [14:15<56:21,  1.42s/it]

episode: 620, reward: -232.05, average_reward: -263.6276067415084


 21%|██        | 622/3000 [14:16<56:24,  1.42s/it]

episode: 621, reward: -921.84, average_reward: -260.6811801189586


 21%|██        | 623/3000 [14:17<55:33,  1.40s/it]

episode: 622, reward: -271.92, average_reward: -326.92000921401205


 21%|██        | 624/3000 [14:19<54:42,  1.38s/it]

episode: 623, reward: -1531.83, average_reward: -326.8724737959242


 21%|██        | 625/3000 [14:20<54:20,  1.37s/it]

episode: 624, reward: -1348.23, average_reward: -453.9600429514976


 21%|██        | 626/3000 [14:21<54:06,  1.37s/it]

episode: 625, reward: -276.84, average_reward: -561.506749299775


 21%|██        | 627/3000 [14:23<53:47,  1.36s/it]

episode: 626, reward: -276.18, average_reward: -563.7324872876451


 21%|██        | 628/3000 [14:24<54:06,  1.37s/it]

episode: 627, reward: -289.01, average_reward: -564.8166586750373


 21%|██        | 629/3000 [14:25<54:15,  1.37s/it]

episode: 628, reward: -275.06, average_reward: -567.8135212511995


 21%|██        | 630/3000 [14:27<54:07,  1.37s/it]

episode: 629, reward: -272.28, average_reward: -568.2323467704744


 21%|██        | 631/3000 [14:28<54:10,  1.37s/it]

episode: 630, reward: -1285.45, average_reward: -569.5229122747951


 21%|██        | 632/3000 [14:30<54:21,  1.38s/it]

episode: 631, reward: -275.33, average_reward: -674.8627617060343


 21%|██        | 633/3000 [14:31<53:57,  1.37s/it]

episode: 632, reward: -281.5, average_reward: -610.21137788149


 21%|██        | 634/3000 [14:32<53:29,  1.36s/it]

episode: 633, reward: -268.72, average_reward: -611.1694773355987


 21%|██        | 635/3000 [14:34<53:13,  1.35s/it]

episode: 634, reward: -262.12, average_reward: -484.8588358585583


 21%|██        | 636/3000 [14:35<54:05,  1.37s/it]

episode: 635, reward: -247.4, average_reward: -376.2471918145659


 21%|██        | 637/3000 [14:36<53:59,  1.37s/it]

episode: 636, reward: -289.36, average_reward: -373.30354925049545


 21%|██▏       | 638/3000 [14:38<53:57,  1.37s/it]

episode: 637, reward: -1119.28, average_reward: -374.62168200808935


 21%|██▏       | 639/3000 [14:39<53:39,  1.36s/it]

episode: 638, reward: -1571.22, average_reward: -457.64916489314203


 21%|██▏       | 640/3000 [14:41<53:59,  1.37s/it]

episode: 639, reward: -265.88, average_reward: -587.2647997481514


 21%|██▏       | 641/3000 [14:42<53:49,  1.37s/it]

episode: 640, reward: -289.02, average_reward: -586.6247364289251


 21%|██▏       | 642/3000 [14:43<54:22,  1.38s/it]

episode: 641, reward: -244.33, average_reward: -486.98210224630975


 21%|██▏       | 643/3000 [14:45<54:00,  1.37s/it]

episode: 642, reward: -285.71, average_reward: -483.88238853789414


 21%|██▏       | 644/3000 [14:46<53:57,  1.37s/it]

episode: 643, reward: -259.86, average_reward: -484.30341851805025


 22%|██▏       | 645/3000 [14:47<53:49,  1.37s/it]

episode: 644, reward: -286.9, average_reward: -483.4176502395277


 22%|██▏       | 646/3000 [14:49<53:10,  1.36s/it]

episode: 645, reward: -269.49, average_reward: -485.8965284360779


 22%|██▏       | 647/3000 [14:50<53:03,  1.35s/it]

episode: 646, reward: -293.88, average_reward: -488.10588720069666


 22%|██▏       | 648/3000 [14:51<53:01,  1.35s/it]

episode: 647, reward: -274.57, average_reward: -488.5575890028605


 22%|██▏       | 649/3000 [14:53<53:30,  1.37s/it]

episode: 648, reward: -278.47, average_reward: -404.0863557575153


 22%|██▏       | 650/3000 [14:54<53:29,  1.37s/it]

episode: 649, reward: -273.26, average_reward: -274.81150200771424


 22%|██▏       | 651/3000 [14:55<53:12,  1.36s/it]

episode: 650, reward: -275.73, average_reward: -275.5496889926229


 22%|██▏       | 652/3000 [14:57<53:34,  1.37s/it]

episode: 651, reward: -282.82, average_reward: -274.22090698439837


 22%|██▏       | 653/3000 [14:58<53:46,  1.37s/it]

episode: 652, reward: -1118.11, average_reward: -278.0698055951092


 22%|██▏       | 654/3000 [15:00<53:44,  1.37s/it]

episode: 653, reward: -254.92, average_reward: -361.3097886611986


 22%|██▏       | 655/3000 [15:01<54:03,  1.38s/it]

episode: 654, reward: -249.97, average_reward: -360.8151634010304


 22%|██▏       | 656/3000 [15:02<53:36,  1.37s/it]

episode: 655, reward: -1444.9, average_reward: -357.1220459059426


 22%|██▏       | 657/3000 [15:04<53:36,  1.37s/it]

episode: 656, reward: -258.48, average_reward: -474.6625728775531


 22%|██▏       | 658/3000 [15:05<53:25,  1.37s/it]

episode: 657, reward: -283.06, average_reward: -471.12288281196425


 22%|██▏       | 659/3000 [15:06<53:02,  1.36s/it]

episode: 658, reward: -272.03, average_reward: -471.9718545922833


 22%|██▏       | 660/3000 [15:08<53:10,  1.36s/it]

episode: 659, reward: -238.23, average_reward: -471.32796246670296


 22%|██▏       | 661/3000 [15:09<52:48,  1.35s/it]

episode: 660, reward: -265.87, average_reward: -467.8247343530632


 22%|██▏       | 662/3000 [15:11<52:50,  1.36s/it]

episode: 661, reward: -266.81, average_reward: -466.8386182693333


 22%|██▏       | 663/3000 [15:12<52:43,  1.35s/it]

episode: 662, reward: -280.44, average_reward: -465.23718047222775


 22%|██▏       | 664/3000 [15:13<54:00,  1.39s/it]

episode: 663, reward: -247.24, average_reward: -381.47079793906084


 22%|██▏       | 665/3000 [15:15<53:13,  1.37s/it]

episode: 664, reward: -272.73, average_reward: -380.7033129949019


 22%|██▏       | 666/3000 [15:16<53:02,  1.36s/it]

episode: 665, reward: -280.8, average_reward: -382.97937884810574


 22%|██▏       | 667/3000 [15:17<53:30,  1.38s/it]

episode: 666, reward: -268.86, average_reward: -266.5697220776384


 22%|██▏       | 668/3000 [15:19<54:33,  1.40s/it]

episode: 667, reward: -268.96, average_reward: -267.6082546435436


 22%|██▏       | 669/3000 [15:20<53:58,  1.39s/it]

episode: 668, reward: -274.02, average_reward: -266.198327764593


 22%|██▏       | 670/3000 [15:22<53:21,  1.37s/it]

episode: 669, reward: -249.45, average_reward: -266.39730744287783


 22%|██▏       | 671/3000 [15:23<53:15,  1.37s/it]

episode: 670, reward: -283.86, average_reward: -267.51982885261697


 22%|██▏       | 672/3000 [15:24<54:46,  1.41s/it]

episode: 671, reward: -219.91, average_reward: -269.31850866885856


 22%|██▏       | 673/3000 [15:26<54:01,  1.39s/it]

episode: 672, reward: -251.61, average_reward: -264.62893475059144


 22%|██▏       | 674/3000 [15:27<54:00,  1.39s/it]

episode: 673, reward: -285.32, average_reward: -261.7459752213525


 22%|██▎       | 675/3000 [15:29<53:18,  1.38s/it]

episode: 674, reward: -298.37, average_reward: -265.553604464463


 23%|██▎       | 676/3000 [15:30<52:53,  1.37s/it]

episode: 675, reward: -263.83, average_reward: -268.11732914456593


 23%|██▎       | 677/3000 [15:31<53:00,  1.37s/it]

episode: 676, reward: -278.68, average_reward: -266.41982341981105


 23%|██▎       | 678/3000 [15:33<52:52,  1.37s/it]

episode: 677, reward: -281.3, average_reward: -267.4013125619841


 23%|██▎       | 679/3000 [15:34<52:21,  1.35s/it]

episode: 678, reward: -278.2, average_reward: -268.63516315516335


 23%|██▎       | 680/3000 [15:35<52:21,  1.35s/it]

episode: 679, reward: -283.29, average_reward: -269.0529881312784


 23%|██▎       | 681/3000 [15:37<52:08,  1.35s/it]

episode: 680, reward: -283.27, average_reward: -272.437039726085


 23%|██▎       | 682/3000 [15:38<51:57,  1.34s/it]

episode: 681, reward: -281.39, average_reward: -272.3787057748487


 23%|██▎       | 683/3000 [15:39<52:23,  1.36s/it]

episode: 682, reward: -240.18, average_reward: -278.5263289758892


 23%|██▎       | 684/3000 [15:41<52:46,  1.37s/it]

episode: 683, reward: -242.75, average_reward: -277.3829318863494


 23%|██▎       | 685/3000 [15:42<51:59,  1.35s/it]

episode: 684, reward: -257.14, average_reward: -273.1264140435168


 23%|██▎       | 686/3000 [15:43<51:41,  1.34s/it]

episode: 685, reward: -283.24, average_reward: -269.0031633322134


 23%|██▎       | 687/3000 [15:45<52:03,  1.35s/it]

episode: 686, reward: -273.63, average_reward: -270.94426157089987


 23%|██▎       | 688/3000 [15:46<51:54,  1.35s/it]

episode: 687, reward: -279.98, average_reward: -270.43884655681154


 23%|██▎       | 689/3000 [15:47<51:49,  1.35s/it]

episode: 688, reward: -272.2, average_reward: -270.30677649278186


 23%|██▎       | 690/3000 [15:49<51:59,  1.35s/it]

episode: 689, reward: -261.21, average_reward: -269.70714349239154


 23%|██▎       | 691/3000 [15:50<52:13,  1.36s/it]

episode: 690, reward: -254.99, average_reward: -267.498754099492


 23%|██▎       | 692/3000 [15:52<52:28,  1.36s/it]

episode: 691, reward: -268.95, average_reward: -264.6706702315052


 23%|██▎       | 693/3000 [15:53<52:00,  1.35s/it]

episode: 692, reward: -270.64, average_reward: -263.4270850618351


 23%|██▎       | 694/3000 [15:54<53:31,  1.39s/it]

episode: 693, reward: -226.76, average_reward: -266.4726147510316


 23%|██▎       | 695/3000 [15:56<53:47,  1.40s/it]

episode: 694, reward: -283.84, average_reward: -264.87340172815436


 23%|██▎       | 696/3000 [15:57<53:24,  1.39s/it]

episode: 695, reward: -1220.6, average_reward: -267.5439795769526


 23%|██▎       | 697/3000 [15:59<53:14,  1.39s/it]

episode: 696, reward: -260.8, average_reward: -361.2798009293337


 23%|██▎       | 698/3000 [16:00<52:34,  1.37s/it]

episode: 697, reward: -272.95, average_reward: -359.99719740457874


 23%|██▎       | 699/3000 [16:01<51:50,  1.35s/it]

Physics Error: [ 0.75636497  0.57033379 -0.73417473  0.24577097  1.        ]
episode: 698, reward: 210733024.65, average_reward: -359.2948548921943


 23%|██▎       | 700/3000 [16:03<51:47,  1.35s/it]

episode: 699, reward: -276.02, average_reward: 21072970.390305545


 23%|██▎       | 701/3000 [16:04<53:27,  1.40s/it]

episode: 700, reward: -228.55, average_reward: 21072968.909707777


 23%|██▎       | 702/3000 [16:05<53:26,  1.40s/it]

episode: 701, reward: -272.21, average_reward: 21072971.554575566


 23%|██▎       | 703/3000 [16:07<53:12,  1.39s/it]

episode: 702, reward: -1432.51, average_reward: 21072971.228704188


 23%|██▎       | 704/3000 [16:08<52:56,  1.38s/it]

episode: 703, reward: -264.53, average_reward: 21072855.041167032


 24%|██▎       | 705/3000 [16:10<52:48,  1.38s/it]

episode: 704, reward: -261.24, average_reward: 21072851.26421086


 24%|██▎       | 706/3000 [16:11<52:22,  1.37s/it]

episode: 705, reward: -280.13, average_reward: 21072853.52437715


 24%|██▎       | 707/3000 [16:12<52:15,  1.37s/it]

episode: 706, reward: -278.64, average_reward: 21072947.57052218


 24%|██▎       | 708/3000 [16:14<52:28,  1.37s/it]

episode: 707, reward: -273.61, average_reward: 21072945.786832567


 24%|██▎       | 709/3000 [16:15<52:01,  1.36s/it]

episode: 708, reward: -290.21, average_reward: 21072945.72152916


 24%|██▎       | 710/3000 [16:16<51:52,  1.36s/it]

episode: 709, reward: -1368.26, average_reward: -385.764263989347


 24%|██▎       | 711/3000 [16:18<52:21,  1.37s/it]

episode: 710, reward: -266.7, average_reward: -494.9889622624975


 24%|██▎       | 712/3000 [16:19<52:13,  1.37s/it]

episode: 711, reward: -268.05, average_reward: -498.8045479876243


 24%|██▍       | 713/3000 [16:20<52:41,  1.38s/it]

episode: 712, reward: -263.29, average_reward: -498.3884060773362


 24%|██▍       | 714/3000 [16:22<52:07,  1.37s/it]

episode: 713, reward: -247.83, average_reward: -381.4658806645253


 24%|██▍       | 715/3000 [16:23<51:57,  1.36s/it]

episode: 714, reward: -270.55, average_reward: -379.7957404658342


 24%|██▍       | 716/3000 [16:25<52:48,  1.39s/it]

episode: 715, reward: -245.44, average_reward: -380.7263551426437


 24%|██▍       | 717/3000 [16:26<52:48,  1.39s/it]

episode: 716, reward: -278.97, average_reward: -377.2572567474351


 24%|██▍       | 718/3000 [16:27<53:20,  1.40s/it]

episode: 717, reward: -245.54, average_reward: -377.29041400169064


 24%|██▍       | 719/3000 [16:29<53:07,  1.40s/it]

episode: 718, reward: -270.5, average_reward: -374.48389374642477


 24%|██▍       | 720/3000 [16:30<52:07,  1.37s/it]

episode: 719, reward: -1476.31, average_reward: -372.51285756066835


 24%|██▍       | 721/3000 [16:31<51:39,  1.36s/it]

episode: 720, reward: -281.65, average_reward: -383.3176583664728


 24%|██▍       | 722/3000 [16:33<51:21,  1.35s/it]

episode: 721, reward: -281.51, average_reward: -384.8127610651422


 24%|██▍       | 723/3000 [16:34<51:53,  1.37s/it]

episode: 722, reward: -261.07, average_reward: -386.1588280627212


 24%|██▍       | 724/3000 [16:36<51:27,  1.36s/it]

episode: 723, reward: -1809.92, average_reward: -385.9369355108492


 24%|██▍       | 725/3000 [16:37<51:19,  1.35s/it]

episode: 724, reward: -264.85, average_reward: -542.1465069303107


 24%|██▍       | 726/3000 [16:38<51:23,  1.36s/it]

episode: 725, reward: -273.08, average_reward: -541.5768945870957


 24%|██▍       | 727/3000 [16:40<51:19,  1.36s/it]

episode: 726, reward: -266.43, average_reward: -544.3402942212841


 24%|██▍       | 728/3000 [16:41<51:34,  1.36s/it]

episode: 727, reward: -290.66, average_reward: -543.0862723325849


 24%|██▍       | 729/3000 [16:42<51:38,  1.36s/it]

episode: 728, reward: -270.57, average_reward: -547.5984124887606


 24%|██▍       | 730/3000 [16:44<51:41,  1.37s/it]

episode: 729, reward: -299.05, average_reward: -547.6053810243783


 24%|██▍       | 731/3000 [16:45<52:53,  1.40s/it]

episode: 730, reward: -237.12, average_reward: -429.87928800071325


 24%|██▍       | 732/3000 [16:47<52:36,  1.39s/it]

episode: 731, reward: -282.99, average_reward: -425.4265238317663


 24%|██▍       | 733/3000 [16:48<52:06,  1.38s/it]

episode: 732, reward: -257.25, average_reward: -425.5745176833736


 24%|██▍       | 734/3000 [16:49<52:53,  1.40s/it]

episode: 733, reward: -231.96, average_reward: -425.1929104194004


 24%|██▍       | 735/3000 [16:51<52:11,  1.38s/it]

episode: 734, reward: -267.79, average_reward: -267.3959961286212


 25%|██▍       | 736/3000 [16:52<51:54,  1.38s/it]

episode: 735, reward: -245.63, average_reward: -267.68992496739554


 25%|██▍       | 737/3000 [16:53<51:47,  1.37s/it]

episode: 736, reward: -278.6, average_reward: -264.94490189324216


 25%|██▍       | 738/3000 [16:55<51:35,  1.37s/it]

episode: 737, reward: -255.81, average_reward: -266.16216191402344


 25%|██▍       | 739/3000 [16:56<52:05,  1.38s/it]

episode: 738, reward: -269.25, average_reward: -262.67726919269927


 25%|██▍       | 740/3000 [16:58<51:44,  1.37s/it]

episode: 739, reward: -430.99, average_reward: -262.5457705236895


 25%|██▍       | 741/3000 [16:59<51:05,  1.36s/it]

episode: 740, reward: -281.77, average_reward: -275.739977988294


 25%|██▍       | 742/3000 [17:00<50:53,  1.35s/it]

episode: 741, reward: -1530.62, average_reward: -280.20482077304104


 25%|██▍       | 743/3000 [17:02<51:24,  1.37s/it]

episode: 742, reward: -1275.26, average_reward: -404.96742725116735


 25%|██▍       | 744/3000 [17:03<51:02,  1.36s/it]

episode: 743, reward: -278.31, average_reward: -506.76789463559516


 25%|██▍       | 745/3000 [17:04<51:25,  1.37s/it]

episode: 744, reward: -263.45, average_reward: -511.4030936247103


 25%|██▍       | 746/3000 [17:06<51:42,  1.38s/it]

episode: 745, reward: -283.83, average_reward: -510.96896736448326


 25%|██▍       | 747/3000 [17:07<51:46,  1.38s/it]

episode: 746, reward: -280.5, average_reward: -514.7896797978783


 25%|██▍       | 748/3000 [17:08<51:11,  1.36s/it]

episode: 747, reward: -265.09, average_reward: -514.9798466421246


 25%|██▍       | 749/3000 [17:10<50:52,  1.36s/it]

episode: 748, reward: -261.26, average_reward: -515.9076239790295


 25%|██▌       | 750/3000 [17:11<51:00,  1.36s/it]

episode: 749, reward: -246.05, average_reward: -515.1082412139679


 25%|██▌       | 751/3000 [17:13<51:18,  1.37s/it]

episode: 750, reward: -285.64, average_reward: -496.61394589375794


 25%|██▌       | 752/3000 [17:14<51:14,  1.37s/it]

episode: 751, reward: -278.95, average_reward: -497.0010379559217


 25%|██▌       | 753/3000 [17:15<51:18,  1.37s/it]

episode: 752, reward: -279.18, average_reward: -371.8347570529206


 25%|██▌       | 754/3000 [17:17<52:46,  1.41s/it]

episode: 753, reward: -235.28, average_reward: -272.22736890266486


 25%|██▌       | 755/3000 [17:18<51:45,  1.38s/it]

episode: 754, reward: -285.92, average_reward: -267.9248013011981


 25%|██▌       | 756/3000 [17:19<51:03,  1.37s/it]

episode: 755, reward: -293.44, average_reward: -270.17218370035596


 25%|██▌       | 757/3000 [17:21<51:14,  1.37s/it]

episode: 756, reward: -261.81, average_reward: -271.13271668639027


 25%|██▌       | 758/3000 [17:22<50:50,  1.36s/it]

episode: 757, reward: -276.54, average_reward: -269.2630322846875


 25%|██▌       | 759/3000 [17:24<51:20,  1.37s/it]

episode: 758, reward: 6505.82, average_reward: -270.40812136904884


 25%|██▌       | 760/3000 [17:25<51:33,  1.38s/it]

episode: 759, reward: -270.68, average_reward: 406.29934274005984


 25%|██▌       | 761/3000 [17:26<51:10,  1.37s/it]

episode: 760, reward: -286.94, average_reward: 403.8366444352849


 25%|██▌       | 762/3000 [17:28<50:50,  1.36s/it]

episode: 761, reward: -283.76, average_reward: 403.7067381139028


 25%|██▌       | 763/3000 [17:29<52:13,  1.40s/it]

episode: 762, reward: -281.38, average_reward: 403.22568949492694


 25%|██▌       | 764/3000 [17:31<51:51,  1.39s/it]

episode: 763, reward: -271.67, average_reward: 403.00636191423655


 26%|██▌       | 765/3000 [17:32<51:49,  1.39s/it]

episode: 764, reward: -257.11, average_reward: 399.36750029392164


 26%|██▌       | 766/3000 [17:33<51:29,  1.38s/it]

episode: 765, reward: -281.39, average_reward: 402.248606629988


 26%|██▌       | 767/3000 [17:35<51:36,  1.39s/it]

episode: 766, reward: -286.03, average_reward: 403.4535578009154


 26%|██▌       | 768/3000 [17:36<51:25,  1.38s/it]

episode: 767, reward: -272.74, average_reward: 401.03101513875964


 26%|██▌       | 769/3000 [17:37<51:05,  1.37s/it]

episode: 768, reward: -269.07, average_reward: 401.41114209155756


 26%|██▌       | 770/3000 [17:39<51:08,  1.38s/it]

episode: 769, reward: -1529.8, average_reward: -276.07721501018375


 26%|██▌       | 771/3000 [17:40<50:59,  1.37s/it]

episode: 770, reward: -280.09, average_reward: -401.9898611684263


 26%|██▌       | 772/3000 [17:42<51:55,  1.40s/it]

episode: 771, reward: -230.55, average_reward: -401.30471493972885


 26%|██▌       | 773/3000 [17:43<51:59,  1.40s/it]

episode: 772, reward: -251.25, average_reward: -395.9837028231933


 26%|██▌       | 774/3000 [17:44<51:53,  1.40s/it]

episode: 773, reward: -270.49, average_reward: -392.9707026222465


 26%|██▌       | 775/3000 [17:46<51:51,  1.40s/it]

episode: 774, reward: -248.54, average_reward: -392.85247372914796


 26%|██▌       | 776/3000 [17:47<52:26,  1.41s/it]

episode: 775, reward: -247.63, average_reward: -391.9951309877615


 26%|██▌       | 777/3000 [17:49<52:33,  1.42s/it]

episode: 776, reward: -268.82, average_reward: -388.6187687779021


 26%|██▌       | 778/3000 [17:50<51:40,  1.40s/it]

episode: 777, reward: -273.05, average_reward: -386.89736119785795


 26%|██▌       | 779/3000 [17:51<51:19,  1.39s/it]

episode: 778, reward: -271.4, average_reward: -386.9286482206729


 26%|██▌       | 780/3000 [17:53<51:26,  1.39s/it]

episode: 779, reward: -279.04, average_reward: -387.1614429697384


 26%|██▌       | 781/3000 [17:54<51:30,  1.39s/it]

episode: 780, reward: -258.11, average_reward: -262.08537495518465


 26%|██▌       | 782/3000 [17:56<51:28,  1.39s/it]

episode: 781, reward: -270.54, average_reward: -259.8870281577177


 26%|██▌       | 783/3000 [17:57<50:37,  1.37s/it]

episode: 782, reward: -273.14, average_reward: -263.8850833772966


 26%|██▌       | 784/3000 [17:58<50:33,  1.37s/it]

episode: 783, reward: -284.17, average_reward: -266.074129814983


 26%|██▌       | 785/3000 [18:00<51:11,  1.39s/it]

episode: 784, reward: -236.91, average_reward: -267.4419291429749


 26%|██▌       | 786/3000 [18:01<51:06,  1.39s/it]

episode: 785, reward: -285.19, average_reward: -266.279306912225


 26%|██▌       | 787/3000 [18:03<52:05,  1.41s/it]

episode: 786, reward: -275.49, average_reward: -270.03536997763115


 26%|██▋       | 788/3000 [18:04<52:06,  1.41s/it]

episode: 787, reward: -269.62, average_reward: -270.7027950956999


 26%|██▋       | 789/3000 [18:05<52:08,  1.41s/it]

episode: 788, reward: -280.13, average_reward: -270.3593436139823


 26%|██▋       | 790/3000 [18:07<52:04,  1.41s/it]

episode: 789, reward: -267.67, average_reward: -271.2327258683401


 26%|██▋       | 791/3000 [18:08<52:43,  1.43s/it]

episode: 790, reward: -222.21, average_reward: -270.0954626604898


 26%|██▋       | 792/3000 [18:10<52:40,  1.43s/it]

episode: 791, reward: -286.82, average_reward: -266.5052724000694


 26%|██▋       | 793/3000 [18:11<52:44,  1.43s/it]

episode: 792, reward: -274.1, average_reward: -268.1340390160894


 26%|██▋       | 794/3000 [18:13<53:18,  1.45s/it]

episode: 793, reward: -234.42, average_reward: -268.2303105302427


 26%|██▋       | 795/3000 [18:14<53:54,  1.47s/it]

episode: 794, reward: -260.52, average_reward: -263.2554696130918


 27%|██▋       | 796/3000 [18:16<54:00,  1.47s/it]

episode: 795, reward: -257.76, average_reward: -265.6163517114445


 27%|██▋       | 797/3000 [18:17<54:08,  1.47s/it]

episode: 796, reward: -224.0, average_reward: -262.87354134210835


 27%|██▋       | 798/3000 [18:18<53:13,  1.45s/it]

episode: 797, reward: -291.94, average_reward: -257.72443287962636


 27%|██▋       | 799/3000 [18:20<52:09,  1.42s/it]

episode: 798, reward: -300.23, average_reward: -259.9567602829903


 27%|██▋       | 800/3000 [18:21<51:26,  1.40s/it]

episode: 799, reward: -268.99, average_reward: -261.9665918906699


 27%|██▋       | 801/3000 [18:23<50:47,  1.39s/it]

episode: 800, reward: -271.99, average_reward: -262.09871738668886


 27%|██▋       | 802/3000 [18:24<50:40,  1.38s/it]

episode: 801, reward: -286.64, average_reward: -267.0773163940959


 27%|██▋       | 803/3000 [18:25<51:13,  1.40s/it]

episode: 802, reward: -274.03, average_reward: -267.05901776959325


 27%|██▋       | 804/3000 [18:27<51:16,  1.40s/it]

episode: 803, reward: -279.06, average_reward: -267.05229130966353


 27%|██▋       | 805/3000 [18:28<51:08,  1.40s/it]

episode: 804, reward: -261.62, average_reward: -271.5160779095096


 27%|██▋       | 806/3000 [18:30<50:41,  1.39s/it]

episode: 805, reward: -275.19, average_reward: -271.6261695837199


 27%|██▋       | 807/3000 [18:31<50:26,  1.38s/it]

episode: 806, reward: -264.38, average_reward: -273.36931326928453


 27%|██▋       | 808/3000 [18:32<50:52,  1.39s/it]

episode: 807, reward: -252.66, average_reward: -277.4069427824904


 27%|██▋       | 809/3000 [18:34<51:10,  1.40s/it]

episode: 808, reward: -310.25, average_reward: -273.47876352093084


 27%|██▋       | 810/3000 [18:35<50:37,  1.39s/it]

episode: 809, reward: -284.52, average_reward: -274.4808363484018


 27%|██▋       | 811/3000 [18:36<50:07,  1.37s/it]

episode: 810, reward: -276.82, average_reward: -276.0335708065754


 27%|██▋       | 812/3000 [18:38<49:43,  1.36s/it]

episode: 811, reward: -271.74, average_reward: -276.51601402840436


 27%|██▋       | 813/3000 [18:39<50:46,  1.39s/it]

episode: 812, reward: -255.17, average_reward: -275.0258806401685


 27%|██▋       | 814/3000 [18:41<50:14,  1.38s/it]

episode: 813, reward: -275.1, average_reward: -273.1392894879475


 27%|██▋       | 815/3000 [18:42<50:44,  1.39s/it]

episode: 814, reward: -237.64, average_reward: -272.7441145114786


 27%|██▋       | 816/3000 [18:43<51:02,  1.40s/it]

episode: 815, reward: -260.91, average_reward: -270.3460071050439


 27%|██▋       | 817/3000 [18:45<50:54,  1.40s/it]

episode: 816, reward: -253.68, average_reward: -268.91796213599815


 27%|██▋       | 818/3000 [18:46<50:56,  1.40s/it]

episode: 817, reward: -1011.28, average_reward: -267.8485193760472


 27%|██▋       | 819/3000 [18:48<50:14,  1.38s/it]

episode: 818, reward: -274.87, average_reward: -343.7108267190629


 27%|██▋       | 820/3000 [18:49<49:37,  1.37s/it]

episode: 819, reward: -271.46, average_reward: -340.17243336879505


 27%|██▋       | 821/3000 [18:50<49:16,  1.36s/it]

episode: 820, reward: -389.73, average_reward: -338.8666808821324


 27%|██▋       | 822/3000 [18:52<49:12,  1.36s/it]

episode: 821, reward: -268.98, average_reward: -350.1578407540562


 27%|██▋       | 823/3000 [18:53<49:24,  1.36s/it]

episode: 822, reward: -253.5, average_reward: -349.88233182796785


 27%|██▋       | 824/3000 [18:54<49:17,  1.36s/it]

episode: 823, reward: -288.89, average_reward: -349.71571294621435


 28%|██▊       | 825/3000 [18:56<49:04,  1.35s/it]

episode: 824, reward: -260.68, average_reward: -351.09416714308884


 28%|██▊       | 826/3000 [18:57<50:21,  1.39s/it]

episode: 825, reward: -235.46, average_reward: -353.39747221122934


 28%|██▊       | 827/3000 [18:59<50:50,  1.40s/it]

episode: 826, reward: -261.72, average_reward: -350.8521008879246


 28%|██▊       | 828/3000 [19:00<50:59,  1.41s/it]

episode: 827, reward: -263.29, average_reward: -351.6561650915031


 28%|██▊       | 829/3000 [19:01<50:31,  1.40s/it]

episode: 828, reward: -285.6, average_reward: -276.8569170594602


 28%|██▊       | 830/3000 [19:03<50:05,  1.39s/it]

episode: 829, reward: -281.55, average_reward: -277.93023906523825


 28%|██▊       | 831/3000 [19:04<50:52,  1.41s/it]

episode: 830, reward: -262.08, average_reward: -278.9396104390442


 28%|██▊       | 832/3000 [19:06<50:19,  1.39s/it]

episode: 831, reward: -265.35, average_reward: -266.17521356133915


 28%|██▊       | 833/3000 [19:07<50:12,  1.39s/it]

episode: 832, reward: -255.9, average_reward: -265.81207153636217


 28%|██▊       | 834/3000 [19:08<49:44,  1.38s/it]

episode: 833, reward: -274.37, average_reward: -266.052550496168


 28%|██▊       | 835/3000 [19:10<49:31,  1.37s/it]

episode: 834, reward: -278.03, average_reward: -264.60035055748733


 28%|██▊       | 836/3000 [19:11<49:12,  1.36s/it]

episode: 835, reward: -276.4, average_reward: -266.3356346352457


 28%|██▊       | 837/3000 [19:12<49:20,  1.37s/it]

episode: 836, reward: -273.56, average_reward: -270.4301599343136


 28%|██▊       | 838/3000 [19:14<49:13,  1.37s/it]

episode: 837, reward: -1146.72, average_reward: -271.61378261645405


 28%|██▊       | 839/3000 [19:15<50:16,  1.40s/it]

episode: 838, reward: -263.7, average_reward: -359.9566596511296


 28%|██▊       | 840/3000 [19:17<52:01,  1.45s/it]

episode: 839, reward: -236.25, average_reward: -357.76663244392813


 28%|██▊       | 841/3000 [19:18<51:22,  1.43s/it]

episode: 840, reward: -263.9, average_reward: -353.23578805361353


 28%|██▊       | 842/3000 [19:20<51:25,  1.43s/it]

episode: 841, reward: -270.43, average_reward: -353.41773402033954


 28%|██▊       | 843/3000 [19:21<52:21,  1.46s/it]

episode: 842, reward: -251.63, average_reward: -353.92599591016005


 28%|██▊       | 844/3000 [19:23<52:59,  1.47s/it]

episode: 843, reward: -253.27, average_reward: -353.49891596118204


 28%|██▊       | 845/3000 [19:24<51:54,  1.45s/it]

episode: 844, reward: -279.37, average_reward: -351.38928333387196


 28%|██▊       | 846/3000 [19:25<51:39,  1.44s/it]

episode: 845, reward: -240.94, average_reward: -351.52314383756044


 28%|██▊       | 847/3000 [19:27<50:26,  1.41s/it]

episode: 846, reward: -283.25, average_reward: -347.97668997809984


 28%|██▊       | 848/3000 [19:28<49:47,  1.39s/it]

episode: 847, reward: -279.1, average_reward: -348.94547862633914


 28%|██▊       | 849/3000 [19:29<49:39,  1.39s/it]

episode: 848, reward: -278.22, average_reward: -262.18366901879625


 28%|██▊       | 850/3000 [19:31<49:43,  1.39s/it]

episode: 849, reward: -273.33, average_reward: -263.63587467504465


 28%|██▊       | 851/3000 [19:32<49:14,  1.38s/it]

episode: 850, reward: -288.43, average_reward: -267.34416384364846


 28%|██▊       | 852/3000 [19:34<49:00,  1.37s/it]

episode: 851, reward: -275.75, average_reward: -269.79729960132056


 28%|██▊       | 853/3000 [19:35<49:09,  1.37s/it]

episode: 852, reward: -269.39, average_reward: -270.3285382142703


 28%|██▊       | 854/3000 [19:36<48:26,  1.35s/it]

episode: 853, reward: -272.08, average_reward: -272.10377628038043


 28%|██▊       | 855/3000 [19:38<48:25,  1.35s/it]

episode: 854, reward: -250.82, average_reward: -273.98505263914933


 29%|██▊       | 856/3000 [19:39<48:43,  1.36s/it]

episode: 855, reward: -265.45, average_reward: -271.13009433260146


 29%|██▊       | 857/3000 [19:40<49:52,  1.40s/it]

episode: 856, reward: -670.75, average_reward: -273.5808999836155


 29%|██▊       | 858/3000 [19:42<49:13,  1.38s/it]

episode: 857, reward: -279.75, average_reward: -312.33122781016755


 29%|██▊       | 859/3000 [19:43<50:16,  1.41s/it]

episode: 858, reward: -272.75, average_reward: -312.3957611321388


 29%|██▊       | 860/3000 [19:45<49:40,  1.39s/it]

episode: 859, reward: -275.03, average_reward: -311.84874497124554


 29%|██▊       | 861/3000 [19:46<49:21,  1.38s/it]

episode: 860, reward: -275.12, average_reward: -312.01934628682005


 29%|██▊       | 862/3000 [19:47<50:34,  1.42s/it]

episode: 861, reward: -269.31, average_reward: -310.68756820299603


 29%|██▉       | 863/3000 [19:49<49:55,  1.40s/it]

episode: 862, reward: -291.45, average_reward: -310.0439447699831


 29%|██▉       | 864/3000 [19:50<50:10,  1.41s/it]

episode: 863, reward: -277.54, average_reward: -312.25080635434995


 29%|██▉       | 865/3000 [19:52<49:56,  1.40s/it]

episode: 864, reward: -270.53, average_reward: -312.7963925694066


 29%|██▉       | 866/3000 [19:53<50:13,  1.41s/it]

episode: 865, reward: -261.15, average_reward: -314.7676949957853


 29%|██▉       | 867/3000 [19:54<49:36,  1.40s/it]

episode: 866, reward: -281.43, average_reward: -314.33769564487613


 29%|██▉       | 868/3000 [19:56<48:56,  1.38s/it]

episode: 867, reward: -279.41, average_reward: -275.4061193458876


 29%|██▉       | 869/3000 [19:57<49:24,  1.39s/it]

episode: 868, reward: -166.24, average_reward: -275.3724638184619


 29%|██▉       | 870/3000 [19:59<49:38,  1.40s/it]

episode: 869, reward: -935.61, average_reward: -264.7210030055373


 29%|██▉       | 871/3000 [20:00<49:31,  1.40s/it]

episode: 870, reward: -292.53, average_reward: -330.7780791551552


 29%|██▉       | 872/3000 [20:01<49:28,  1.39s/it]

episode: 871, reward: -283.7, average_reward: -332.51948436339325


 29%|██▉       | 873/3000 [20:03<49:18,  1.39s/it]

episode: 872, reward: -257.28, average_reward: -333.95845059276814


 29%|██▉       | 874/3000 [20:04<50:21,  1.42s/it]

episode: 873, reward: -238.43, average_reward: -330.54069766180277


 29%|██▉       | 875/3000 [20:06<50:12,  1.42s/it]

episode: 874, reward: -241.4, average_reward: -326.6300792515502


 29%|██▉       | 876/3000 [20:07<50:14,  1.42s/it]

episode: 875, reward: -280.79, average_reward: -323.71743854361154


 29%|██▉       | 877/3000 [20:09<50:17,  1.42s/it]

episode: 876, reward: -263.54, average_reward: -325.6814987068321


 29%|██▉       | 878/3000 [20:10<50:22,  1.42s/it]

episode: 877, reward: -531.45, average_reward: -323.89191537722996


 29%|██▉       | 879/3000 [20:11<49:55,  1.41s/it]

episode: 878, reward: -1702.75, average_reward: -349.0957000073285


 29%|██▉       | 880/3000 [20:13<49:33,  1.40s/it]

episode: 879, reward: -266.76, average_reward: -502.7467601475487


 29%|██▉       | 881/3000 [20:14<49:42,  1.41s/it]

episode: 880, reward: -1064.05, average_reward: -435.8619918679557


 29%|██▉       | 882/3000 [20:16<49:13,  1.39s/it]

episode: 881, reward: -264.94, average_reward: -513.0138737130495


 29%|██▉       | 883/3000 [20:17<48:46,  1.38s/it]

episode: 882, reward: -264.32, average_reward: -511.13819829457026


 29%|██▉       | 884/3000 [20:18<48:41,  1.38s/it]

episode: 883, reward: -257.91, average_reward: -511.84282158996496


 30%|██▉       | 885/3000 [20:20<50:04,  1.42s/it]

episode: 884, reward: -225.43, average_reward: -513.7906323382174


 30%|██▉       | 886/3000 [20:21<51:18,  1.46s/it]

episode: 885, reward: -272.58, average_reward: -512.1935467203714


 30%|██▉       | 887/3000 [20:23<52:07,  1.48s/it]

episode: 886, reward: -258.49, average_reward: -511.3725426151553


 30%|██▉       | 888/3000 [20:24<50:53,  1.45s/it]

episode: 887, reward: -1712.19, average_reward: -510.8681624702423


 30%|██▉       | 889/3000 [20:26<50:55,  1.45s/it]

episode: 888, reward: -271.48, average_reward: -628.9420143589083


 30%|██▉       | 890/3000 [20:27<51:36,  1.47s/it]

episode: 889, reward: -259.25, average_reward: -485.8154792891419


 30%|██▉       | 891/3000 [20:29<51:11,  1.46s/it]

episode: 890, reward: -270.02, average_reward: -485.06443807885097


 30%|██▉       | 892/3000 [20:30<51:30,  1.47s/it]

episode: 891, reward: -255.32, average_reward: -405.66179052713284


 30%|██▉       | 893/3000 [20:31<51:00,  1.45s/it]

episode: 892, reward: -278.37, average_reward: -404.69950475128974


 30%|██▉       | 894/3000 [20:33<51:08,  1.46s/it]

episode: 893, reward: -248.13, average_reward: -406.10378327916663


 30%|██▉       | 895/3000 [20:34<50:17,  1.43s/it]

episode: 894, reward: -268.64, average_reward: -405.1256743957815


 30%|██▉       | 896/3000 [20:36<49:26,  1.41s/it]

episode: 895, reward: -270.44, average_reward: -409.44685596201816


 30%|██▉       | 897/3000 [20:37<48:57,  1.40s/it]

episode: 896, reward: -287.11, average_reward: -409.2331093054897


 30%|██▉       | 898/3000 [20:38<48:59,  1.40s/it]

episode: 897, reward: -273.85, average_reward: -412.0949804964983


 30%|██▉       | 899/3000 [20:40<48:41,  1.39s/it]

episode: 898, reward: -261.34, average_reward: -268.26087325093874


 30%|███       | 900/3000 [20:41<48:10,  1.38s/it]

episode: 899, reward: -273.87, average_reward: -267.2471149733018


 30%|███       | 901/3000 [20:43<48:04,  1.37s/it]

episode: 900, reward: -269.97, average_reward: -268.7096647380764


 30%|███       | 902/3000 [20:44<48:21,  1.38s/it]

episode: 901, reward: -262.52, average_reward: -268.7039210398191


 30%|███       | 903/3000 [20:45<48:19,  1.38s/it]

episode: 902, reward: -254.44, average_reward: -269.42355980351806


 30%|███       | 904/3000 [20:47<49:07,  1.41s/it]

episode: 903, reward: -231.34, average_reward: -267.0309697377292


 30%|███       | 905/3000 [20:48<48:55,  1.40s/it]

episode: 904, reward: -265.4, average_reward: -265.35181149001295


 30%|███       | 906/3000 [20:50<48:26,  1.39s/it]

episode: 905, reward: -287.76, average_reward: -265.027681069039


 30%|███       | 907/3000 [20:51<48:11,  1.38s/it]

episode: 906, reward: -263.25, average_reward: -266.76031278244375


 30%|███       | 908/3000 [20:52<48:12,  1.38s/it]

episode: 907, reward: -268.15, average_reward: -264.3735771488201


 30%|███       | 909/3000 [20:54<48:15,  1.38s/it]

episode: 908, reward: -268.77, average_reward: -263.8038641564367


 30%|███       | 910/3000 [20:55<47:55,  1.38s/it]

episode: 909, reward: -276.41, average_reward: -264.5468990179851


 30%|███       | 911/3000 [20:56<47:41,  1.37s/it]

episode: 910, reward: -269.15, average_reward: -264.8005519065015


 30%|███       | 912/3000 [20:58<47:08,  1.35s/it]

episode: 911, reward: -271.9, average_reward: -264.7192123194397


 30%|███       | 913/3000 [20:59<47:09,  1.36s/it]

episode: 912, reward: -280.36, average_reward: -265.6570927358069


 30%|███       | 914/3000 [21:00<47:08,  1.36s/it]

episode: 913, reward: -288.67, average_reward: -268.24944921339784


 30%|███       | 915/3000 [21:02<46:52,  1.35s/it]

episode: 914, reward: -288.02, average_reward: -273.98281400286527


 31%|███       | 916/3000 [21:03<46:32,  1.34s/it]

episode: 915, reward: -273.98, average_reward: -276.244777461407


 31%|███       | 917/3000 [21:05<47:39,  1.37s/it]

episode: 916, reward: -465.55, average_reward: -274.86674060619504


 31%|███       | 918/3000 [21:06<47:56,  1.38s/it]

episode: 917, reward: -260.96, average_reward: -295.09676351808633


 31%|███       | 919/3000 [21:07<47:41,  1.38s/it]

episode: 918, reward: -292.36, average_reward: -294.3779593833146


 31%|███       | 920/3000 [21:09<47:14,  1.36s/it]

episode: 919, reward: -286.02, average_reward: -296.73627768801646


 31%|███       | 921/3000 [21:10<48:13,  1.39s/it]

episode: 920, reward: -245.42, average_reward: -297.696869040281


 31%|███       | 922/3000 [21:11<47:29,  1.37s/it]

episode: 921, reward: -281.75, average_reward: -295.3235055637839


 31%|███       | 923/3000 [21:13<47:25,  1.37s/it]

episode: 922, reward: -268.39, average_reward: -296.30845598931614


 31%|███       | 924/3000 [21:14<47:33,  1.37s/it]

episode: 923, reward: -272.9, average_reward: -295.1107377523901


 31%|███       | 925/3000 [21:16<47:16,  1.37s/it]

episode: 924, reward: -279.41, average_reward: -293.5339057969453


 31%|███       | 926/3000 [21:17<47:49,  1.38s/it]

episode: 925, reward: -230.69, average_reward: -292.67266863155953


 31%|███       | 927/3000 [21:18<47:51,  1.38s/it]

episode: 926, reward: -258.21, average_reward: -288.34326493255367


 31%|███       | 928/3000 [21:20<48:15,  1.40s/it]

episode: 927, reward: -276.92, average_reward: -267.60942676681657


 31%|███       | 929/3000 [21:21<48:06,  1.39s/it]

episode: 928, reward: -280.34, average_reward: -269.2055211122454


 31%|███       | 930/3000 [21:22<47:32,  1.38s/it]

episode: 929, reward: -282.03, average_reward: -268.0039199712276


 31%|███       | 931/3000 [21:24<48:13,  1.40s/it]

episode: 930, reward: -243.9, average_reward: -267.60521578935095


 31%|███       | 932/3000 [21:25<48:11,  1.40s/it]

episode: 931, reward: -277.16, average_reward: -267.4530201659481


 31%|███       | 933/3000 [21:27<47:39,  1.38s/it]

episode: 932, reward: -270.6, average_reward: -266.9941389864697


 31%|███       | 934/3000 [21:28<47:27,  1.38s/it]

episode: 933, reward: -263.41, average_reward: -267.21524841493306


 31%|███       | 935/3000 [21:29<47:23,  1.38s/it]

episode: 934, reward: -267.77, average_reward: -266.26576561140234


 31%|███       | 936/3000 [21:31<47:14,  1.37s/it]

episode: 935, reward: -284.45, average_reward: -265.10134627889613


 31%|███       | 937/3000 [21:32<47:21,  1.38s/it]

episode: 936, reward: -278.43, average_reward: -270.47696330012656


 31%|███▏      | 938/3000 [21:34<47:03,  1.37s/it]

episode: 937, reward: -278.46, average_reward: -272.4988225332278


 31%|███▏      | 939/3000 [21:35<46:30,  1.35s/it]

episode: 938, reward: -274.79, average_reward: -272.65257744463827


 31%|███▏      | 940/3000 [21:36<46:35,  1.36s/it]

episode: 939, reward: -282.57, average_reward: -272.0971843028311


 31%|███▏      | 941/3000 [21:38<46:52,  1.37s/it]

episode: 940, reward: -269.53, average_reward: -272.15116835741526


 31%|███▏      | 942/3000 [21:39<46:53,  1.37s/it]

episode: 941, reward: -352.58, average_reward: -274.71455890541733


 31%|███▏      | 943/3000 [21:40<46:31,  1.36s/it]

episode: 942, reward: -273.15, average_reward: -282.2568432469209


 31%|███▏      | 944/3000 [21:42<46:45,  1.36s/it]

episode: 943, reward: -248.21, average_reward: -282.51240095233373


 32%|███▏      | 945/3000 [21:43<46:02,  1.34s/it]

episode: 944, reward: -2269.26, average_reward: -280.99257614306373


 32%|███▏      | 946/3000 [21:44<45:54,  1.34s/it]

episode: 945, reward: -269.89, average_reward: -481.1417134284564


 32%|███▏      | 947/3000 [21:46<46:18,  1.35s/it]

episode: 946, reward: -1006.77, average_reward: -479.68640808237467


 32%|███▏      | 948/3000 [21:47<46:44,  1.37s/it]

episode: 947, reward: -273.11, average_reward: -552.5206075354733


 32%|███▏      | 949/3000 [21:48<46:47,  1.37s/it]

episode: 948, reward: -1228.4, average_reward: -551.9859675640698


 32%|███▏      | 950/3000 [21:50<46:17,  1.35s/it]

episode: 949, reward: -278.47, average_reward: -647.3471934098106


 32%|███▏      | 951/3000 [21:51<46:28,  1.36s/it]

episode: 950, reward: -253.25, average_reward: -646.9375690525569


 32%|███▏      | 952/3000 [21:52<46:18,  1.36s/it]

episode: 951, reward: -269.1, average_reward: -645.3098996169122


 32%|███▏      | 953/3000 [21:54<47:14,  1.38s/it]

episode: 952, reward: -241.1, average_reward: -636.9615690957976


 32%|███▏      | 954/3000 [21:55<46:39,  1.37s/it]

episode: 953, reward: -281.14, average_reward: -633.7563413708936


 32%|███▏      | 955/3000 [21:57<46:44,  1.37s/it]

episode: 954, reward: -275.92, average_reward: -637.0494699554079


 32%|███▏      | 956/3000 [21:58<46:42,  1.37s/it]

episode: 955, reward: -266.39, average_reward: -437.71553689546846


 32%|███▏      | 957/3000 [21:59<46:38,  1.37s/it]

episode: 956, reward: -272.21, average_reward: -437.36526502631114


 32%|███▏      | 958/3000 [22:01<46:22,  1.36s/it]

episode: 957, reward: -266.06, average_reward: -363.90979841388344


 32%|███▏      | 959/3000 [22:02<45:54,  1.35s/it]

episode: 958, reward: -1655.88, average_reward: -363.20498469182394


 32%|███▏      | 960/3000 [22:03<46:11,  1.36s/it]

episode: 959, reward: -270.6, average_reward: -405.95328289614474


 32%|███▏      | 961/3000 [22:05<46:34,  1.37s/it]

episode: 960, reward: -973.37, average_reward: -405.16652731295096


 32%|███▏      | 962/3000 [22:06<45:54,  1.35s/it]

episode: 961, reward: -1770.05, average_reward: -477.17859646103335


 32%|███▏      | 963/3000 [22:08<46:07,  1.36s/it]

episode: 962, reward: -255.65, average_reward: -627.2740155086444


 32%|███▏      | 964/3000 [22:09<46:18,  1.36s/it]

episode: 963, reward: -279.25, average_reward: -628.7286610111018


 32%|███▏      | 965/3000 [22:10<46:18,  1.37s/it]

episode: 964, reward: -287.24, average_reward: -628.5391577009199


 32%|███▏      | 966/3000 [22:12<46:24,  1.37s/it]

episode: 965, reward: -259.25, average_reward: -629.670930770867


 32%|███▏      | 967/3000 [22:13<46:43,  1.38s/it]

episode: 966, reward: -272.61, average_reward: -628.9568247374704


 32%|███▏      | 968/3000 [22:14<46:41,  1.38s/it]

episode: 967, reward: -271.68, average_reward: -628.9965942008604


 32%|███▏      | 969/3000 [22:16<46:09,  1.36s/it]

episode: 968, reward: -271.35, average_reward: -629.5577397860692


 32%|███▏      | 970/3000 [22:17<46:15,  1.37s/it]

episode: 969, reward: -278.31, average_reward: -491.1046151233004


 32%|███▏      | 971/3000 [22:19<47:12,  1.40s/it]

episode: 970, reward: -245.01, average_reward: -491.87531682144834


 32%|███▏      | 972/3000 [22:20<47:11,  1.40s/it]

episode: 971, reward: -255.01, average_reward: -419.03896200858514


 32%|███▏      | 973/3000 [22:21<46:27,  1.38s/it]

episode: 972, reward: -281.0, average_reward: -267.5348330081456


 32%|███▏      | 974/3000 [22:23<46:31,  1.38s/it]

episode: 973, reward: -272.84, average_reward: -270.07043811012693


 32%|███▎      | 975/3000 [22:23<39:41,  1.18s/it]

Physics Error: [ 1.         -1.          0.74448566 -1.         -0.36839707]
episode: 974, reward: 13326.64, average_reward: -269.42952420600943


 33%|███▎      | 976/3000 [22:25<41:54,  1.24s/it]

episode: 975, reward: -287.05, average_reward: 1091.9580889644667


 33%|███▎      | 977/3000 [22:26<43:15,  1.28s/it]

episode: 976, reward: -280.25, average_reward: 1089.177806833863


 33%|███▎      | 978/3000 [22:28<44:10,  1.31s/it]

episode: 977, reward: -268.58, average_reward: 1088.4139508336134


 33%|███▎      | 979/3000 [22:29<45:58,  1.37s/it]

episode: 978, reward: -238.57, average_reward: 1088.7230768183467


 33%|███▎      | 980/3000 [22:30<46:18,  1.38s/it]

episode: 979, reward: -281.28, average_reward: 1092.0013873829982


 33%|███▎      | 981/3000 [22:32<46:42,  1.39s/it]

episode: 980, reward: -251.45, average_reward: 1091.7049919963902


 33%|███▎      | 982/3000 [22:33<47:06,  1.40s/it]

episode: 981, reward: -251.93, average_reward: 1091.0611333059387


 33%|███▎      | 983/3000 [22:35<47:53,  1.42s/it]

episode: 982, reward: -275.94, average_reward: 1091.368716481678


 33%|███▎      | 984/3000 [22:36<49:54,  1.49s/it]

episode: 983, reward: -222.69, average_reward: 1091.8752777178847


 33%|███▎      | 985/3000 [22:38<48:55,  1.46s/it]

episode: 984, reward: -273.97, average_reward: 1096.8897696659765


 33%|███▎      | 986/3000 [22:39<49:17,  1.47s/it]

episode: 985, reward: -781.98, average_reward: -263.17094710209386


 33%|███▎      | 987/3000 [22:41<48:53,  1.46s/it]

episode: 986, reward: -284.58, average_reward: -312.66341879004915


 33%|███▎      | 988/3000 [22:42<49:04,  1.46s/it]

episode: 987, reward: -262.45, average_reward: -313.0968684917699


 33%|███▎      | 989/3000 [22:44<49:17,  1.47s/it]

episode: 988, reward: -264.18, average_reward: -312.4836665076658


 33%|███▎      | 990/3000 [22:45<48:28,  1.45s/it]

episode: 989, reward: -273.84, average_reward: -315.0452037694862


 33%|███▎      | 991/3000 [22:46<47:48,  1.43s/it]

episode: 990, reward: -267.4, average_reward: -314.3021415019618


 33%|███▎      | 992/3000 [22:48<48:11,  1.44s/it]

episode: 991, reward: -279.56, average_reward: -315.89710726677976


 33%|███▎      | 993/3000 [22:50<49:53,  1.49s/it]

episode: 992, reward: -829.38, average_reward: -318.6602212293018


 33%|███▎      | 994/3000 [22:51<49:29,  1.48s/it]

episode: 993, reward: -267.35, average_reward: -374.00428737556797


 33%|███▎      | 995/3000 [22:52<49:05,  1.47s/it]

episode: 994, reward: -1641.8, average_reward: -378.4696810790574


 33%|███▎      | 996/3000 [22:54<49:09,  1.47s/it]

episode: 995, reward: -276.32, average_reward: -515.253083671825


 33%|███▎      | 997/3000 [22:55<48:53,  1.46s/it]

episode: 996, reward: -1150.29, average_reward: -464.68698522237185


 33%|███▎      | 998/3000 [22:57<48:12,  1.45s/it]

episode: 997, reward: -271.99, average_reward: -551.2580178927449


 33%|███▎      | 999/3000 [22:58<48:10,  1.44s/it]

episode: 998, reward: -1153.08, average_reward: -552.2119416989225


 33%|███▎      | 1000/3000 [23:00<48:22,  1.45s/it]

episode: 999, reward: -255.01, average_reward: -641.1017579993869


 33%|███▎      | 1001/3000 [23:01<47:40,  1.43s/it]

episode: 1000, reward: -251.29, average_reward: -639.218738094611


 33%|███▎      | 1002/3000 [23:02<47:18,  1.42s/it]

episode: 1001, reward: -275.83, average_reward: -637.6072982674759


 33%|███▎      | 1003/3000 [23:04<47:22,  1.42s/it]

episode: 1002, reward: -275.21, average_reward: -637.2340711833118


 33%|███▎      | 1004/3000 [23:05<47:35,  1.43s/it]

episode: 1003, reward: -266.11, average_reward: -581.817596075557


 34%|███▎      | 1005/3000 [23:07<47:28,  1.43s/it]

episode: 1004, reward: -279.04, average_reward: -581.6940718636748


 34%|███▎      | 1006/3000 [23:08<48:18,  1.45s/it]

episode: 1005, reward: -279.15, average_reward: -445.4174758173241


 34%|███▎      | 1007/3000 [23:10<48:50,  1.47s/it]

episode: 1006, reward: -268.22, average_reward: -445.70045109688454


 34%|███▎      | 1008/3000 [23:11<48:35,  1.46s/it]

episode: 1007, reward: -263.43, average_reward: -357.49265225501324


 34%|███▎      | 1009/3000 [23:13<48:13,  1.45s/it]

episode: 1008, reward: -279.07, average_reward: -356.6368802807268


 34%|███▎      | 1010/3000 [23:14<48:14,  1.45s/it]

episode: 1009, reward: -1615.98, average_reward: -269.2360835427482


 34%|███▎      | 1011/3000 [23:16<47:48,  1.44s/it]

episode: 1010, reward: -277.63, average_reward: -405.3330358256434


 34%|███▎      | 1012/3000 [23:17<47:45,  1.44s/it]

episode: 1011, reward: -261.84, average_reward: -407.9670565010097


 34%|███▍      | 1013/3000 [23:18<48:08,  1.45s/it]

episode: 1012, reward: -324.47, average_reward: -406.56819487118395


 34%|███▍      | 1014/3000 [23:20<48:01,  1.45s/it]

episode: 1013, reward: -261.46, average_reward: -411.49433711004593


 34%|███▍      | 1015/3000 [23:21<47:26,  1.43s/it]

episode: 1014, reward: -1552.85, average_reward: -411.02935316821583


 34%|███▍      | 1016/3000 [23:23<46:48,  1.42s/it]

episode: 1015, reward: -269.45, average_reward: -538.4105414856587


 34%|███▍      | 1017/3000 [23:24<46:43,  1.41s/it]

episode: 1016, reward: -291.68, average_reward: -537.4413599757588


 34%|███▍      | 1018/3000 [23:25<46:16,  1.40s/it]

episode: 1017, reward: -285.14, average_reward: -539.7875533910756


 34%|███▍      | 1019/3000 [23:27<46:13,  1.40s/it]

episode: 1018, reward: -268.82, average_reward: -541.9581892479342


 34%|███▍      | 1020/3000 [23:28<46:31,  1.41s/it]

episode: 1019, reward: -350.97, average_reward: -540.9332099078937


 34%|███▍      | 1021/3000 [23:30<46:10,  1.40s/it]

episode: 1020, reward: -278.87, average_reward: -414.4321428505451


 34%|███▍      | 1022/3000 [23:31<45:55,  1.39s/it]

episode: 1021, reward: -262.2, average_reward: -414.5564123031293


 34%|███▍      | 1023/3000 [23:32<45:57,  1.39s/it]

episode: 1022, reward: -267.3, average_reward: -414.59162900923695


 34%|███▍      | 1024/3000 [23:34<45:31,  1.38s/it]

episode: 1023, reward: -272.31, average_reward: -408.874398198539


 34%|███▍      | 1025/3000 [23:35<46:30,  1.41s/it]

episode: 1024, reward: -280.43, average_reward: -409.9589384361032


 34%|███▍      | 1026/3000 [23:37<46:42,  1.42s/it]

episode: 1025, reward: -247.5, average_reward: -282.7168714494766


 34%|███▍      | 1027/3000 [23:38<46:51,  1.43s/it]

episode: 1026, reward: -1596.66, average_reward: -280.52193504196146


 34%|███▍      | 1028/3000 [23:40<47:04,  1.43s/it]

episode: 1027, reward: -1663.47, average_reward: -411.02023265355035


 34%|███▍      | 1029/3000 [23:41<47:08,  1.44s/it]

episode: 1028, reward: -281.67, average_reward: -548.8533153725308


 34%|███▍      | 1030/3000 [23:43<48:55,  1.49s/it]

episode: 1029, reward: -286.09, average_reward: -550.1380662198155


 34%|███▍      | 1031/3000 [23:44<48:41,  1.48s/it]

episode: 1030, reward: -1814.1, average_reward: -543.6500223627176


 34%|███▍      | 1032/3000 [23:46<48:16,  1.47s/it]

episode: 1031, reward: -248.86, average_reward: -697.1736620981537


 34%|███▍      | 1033/3000 [23:47<48:44,  1.49s/it]

episode: 1032, reward: -276.46, average_reward: -695.8398375885047


 34%|███▍      | 1034/3000 [23:48<47:52,  1.46s/it]

episode: 1033, reward: -265.47, average_reward: -696.75576536108


 34%|███▍      | 1035/3000 [23:50<47:07,  1.44s/it]

episode: 1034, reward: -284.15, average_reward: -696.071745144757


 35%|███▍      | 1036/3000 [23:51<46:21,  1.42s/it]

episode: 1035, reward: -288.22, average_reward: -696.4445314587165


 35%|███▍      | 1037/3000 [23:53<46:03,  1.41s/it]

episode: 1036, reward: -260.78, average_reward: -700.5157999999911


 35%|███▍      | 1038/3000 [23:54<45:32,  1.39s/it]

episode: 1037, reward: -283.89, average_reward: -566.9273025627774


 35%|███▍      | 1039/3000 [23:55<45:12,  1.38s/it]

episode: 1038, reward: -275.68, average_reward: -428.968833269883


 35%|███▍      | 1040/3000 [23:57<44:51,  1.37s/it]

episode: 1039, reward: -289.94, average_reward: -428.36951493055574


 35%|███▍      | 1041/3000 [23:58<44:53,  1.37s/it]

episode: 1040, reward: -271.3, average_reward: -428.75422864868585


 35%|███▍      | 1042/3000 [23:59<44:55,  1.38s/it]

episode: 1041, reward: -280.8, average_reward: -274.47426294240097


 35%|███▍      | 1043/3000 [24:01<45:35,  1.40s/it]

episode: 1042, reward: -276.83, average_reward: -277.6684758383723


 35%|███▍      | 1044/3000 [24:02<46:05,  1.41s/it]

episode: 1043, reward: -262.83, average_reward: -277.7050073733026


 35%|███▍      | 1045/3000 [24:04<46:15,  1.42s/it]

episode: 1044, reward: -254.28, average_reward: -277.441005481497


 35%|███▍      | 1046/3000 [24:05<46:20,  1.42s/it]

episode: 1045, reward: -762.36, average_reward: -274.4537809043059


 35%|███▍      | 1047/3000 [24:07<45:38,  1.40s/it]

episode: 1046, reward: -1809.85, average_reward: -321.86769928790784


 35%|███▍      | 1048/3000 [24:08<45:11,  1.39s/it]

episode: 1047, reward: -1634.78, average_reward: -476.7754849048677


 35%|███▍      | 1049/3000 [24:09<45:03,  1.39s/it]

episode: 1048, reward: -276.7, average_reward: -611.8653198182947


 35%|███▌      | 1050/3000 [24:11<45:14,  1.39s/it]

episode: 1049, reward: -251.53, average_reward: -611.9670758167084


 35%|███▌      | 1051/3000 [24:12<46:17,  1.42s/it]

episode: 1050, reward: -614.45, average_reward: -608.1264598493534


 35%|███▌      | 1052/3000 [24:14<46:41,  1.44s/it]

episode: 1051, reward: -250.11, average_reward: -642.441078350859


 35%|███▌      | 1053/3000 [24:15<45:55,  1.42s/it]

episode: 1052, reward: -277.04, average_reward: -639.3722176091588


 35%|███▌      | 1054/3000 [24:16<45:37,  1.41s/it]

episode: 1053, reward: -283.63, average_reward: -639.3931329439046


 35%|███▌      | 1055/3000 [24:18<45:27,  1.40s/it]

episode: 1054, reward: -266.08, average_reward: -641.4730260326985


 35%|███▌      | 1056/3000 [24:19<45:02,  1.39s/it]

episode: 1055, reward: -287.71, average_reward: -642.6525104846522


 35%|███▌      | 1057/3000 [24:20<44:20,  1.37s/it]

episode: 1056, reward: -287.89, average_reward: -595.1880712560394


 35%|███▌      | 1058/3000 [24:22<44:14,  1.37s/it]

episode: 1057, reward: -287.55, average_reward: -442.9911829085971


 35%|███▌      | 1059/3000 [24:23<43:56,  1.36s/it]

episode: 1058, reward: -280.5, average_reward: -308.2674725313603


 35%|███▌      | 1060/3000 [24:25<44:20,  1.37s/it]

episode: 1059, reward: -281.2, average_reward: -308.6475479899649


 35%|███▌      | 1061/3000 [24:26<44:10,  1.37s/it]

episode: 1060, reward: -264.35, average_reward: -311.61403102559086


 35%|███▌      | 1062/3000 [24:27<44:05,  1.37s/it]

episode: 1061, reward: -248.77, average_reward: -276.6043717446851


 35%|███▌      | 1063/3000 [24:29<45:04,  1.40s/it]

episode: 1062, reward: -267.87, average_reward: -276.47038974176195


 35%|███▌      | 1064/3000 [24:30<44:43,  1.39s/it]

episode: 1063, reward: -265.92, average_reward: -275.5533550218677


 36%|███▌      | 1065/3000 [24:32<45:01,  1.40s/it]

episode: 1064, reward: -284.42, average_reward: -273.7827638951371


 36%|███▌      | 1066/3000 [24:33<45:18,  1.41s/it]

episode: 1065, reward: -259.99, average_reward: -275.616862477662


 36%|███▌      | 1067/3000 [24:34<45:44,  1.42s/it]

episode: 1066, reward: -260.33, average_reward: -272.8442819522696


 36%|███▌      | 1068/3000 [24:35<40:33,  1.26s/it]

Physics Error: [-1.  1. -1. -1.  1.]
episode: 1067, reward: -118.66, average_reward: -270.08872733311864


 36%|███▌      | 1069/3000 [24:37<41:49,  1.30s/it]

episode: 1068, reward: -284.81, average_reward: -253.2001571119652


 36%|███▌      | 1070/3000 [24:38<42:32,  1.32s/it]

episode: 1069, reward: -278.31, average_reward: -253.63116438786233


 36%|███▌      | 1071/3000 [24:40<43:45,  1.36s/it]

episode: 1070, reward: -271.6, average_reward: -253.34249488007976


 36%|███▌      | 1072/3000 [24:41<43:20,  1.35s/it]

episode: 1071, reward: -289.82, average_reward: -254.06669901016426


 36%|███▌      | 1073/3000 [24:42<43:37,  1.36s/it]

episode: 1072, reward: -280.45, average_reward: -258.17165551978087


 36%|███▌      | 1074/3000 [24:44<43:16,  1.35s/it]

episode: 1073, reward: -277.57, average_reward: -259.4302477817502


 36%|███▌      | 1075/3000 [24:45<43:50,  1.37s/it]

episode: 1074, reward: -263.01, average_reward: -260.59490312050275


 36%|███▌      | 1076/3000 [24:46<44:54,  1.40s/it]

episode: 1075, reward: -232.07, average_reward: -258.4539794194863


 36%|███▌      | 1077/3000 [24:48<44:31,  1.39s/it]

episode: 1076, reward: -278.03, average_reward: -255.66243722380864


 36%|███▌      | 1078/3000 [24:49<44:35,  1.39s/it]

episode: 1077, reward: -290.29, average_reward: -257.4328150655241


 36%|███▌      | 1079/3000 [24:51<44:06,  1.38s/it]

episode: 1078, reward: -261.31, average_reward: -274.5957319334777


 36%|███▌      | 1080/3000 [24:52<43:48,  1.37s/it]

episode: 1079, reward: -269.81, average_reward: -272.24580221065355


 36%|███▌      | 1081/3000 [24:53<44:50,  1.40s/it]

episode: 1080, reward: -227.39, average_reward: -271.39562541243595


 36%|███▌      | 1082/3000 [24:55<43:53,  1.37s/it]

episode: 1081, reward: -267.71, average_reward: -266.9754042971604


 36%|███▌      | 1083/3000 [24:56<44:00,  1.38s/it]

episode: 1082, reward: -280.1, average_reward: -264.76434108086636


 36%|███▌      | 1084/3000 [24:57<43:51,  1.37s/it]

episode: 1083, reward: -275.47, average_reward: -264.72954282167655


 36%|███▌      | 1085/3000 [24:59<43:56,  1.38s/it]

episode: 1084, reward: -261.68, average_reward: -264.5195420710394


 36%|███▌      | 1086/3000 [25:00<44:58,  1.41s/it]

episode: 1085, reward: -235.19, average_reward: -264.38645388796067


 36%|███▌      | 1087/3000 [25:02<44:36,  1.40s/it]

episode: 1086, reward: -277.65, average_reward: -264.6984134878997


 36%|███▋      | 1088/3000 [25:03<44:09,  1.39s/it]

episode: 1087, reward: -254.14, average_reward: -264.6604107208925


 36%|███▋      | 1089/3000 [25:04<43:40,  1.37s/it]

episode: 1088, reward: -279.91, average_reward: -261.0451713361823


 36%|███▋      | 1090/3000 [25:06<44:06,  1.39s/it]

episode: 1089, reward: -240.51, average_reward: -262.9058071680433


 36%|███▋      | 1091/3000 [25:07<43:37,  1.37s/it]

episode: 1090, reward: -271.44, average_reward: -259.97548878946134


 36%|███▋      | 1092/3000 [25:09<43:41,  1.37s/it]

episode: 1091, reward: -285.84, average_reward: -264.37996168692126


 36%|███▋      | 1093/3000 [25:10<44:49,  1.41s/it]

episode: 1092, reward: -255.68, average_reward: -266.19281743963467


 36%|███▋      | 1094/3000 [25:11<44:14,  1.39s/it]

episode: 1093, reward: -291.76, average_reward: -263.75030315885374


 36%|███▋      | 1095/3000 [25:13<44:55,  1.42s/it]

episode: 1094, reward: -272.15, average_reward: -265.37985760198615


 37%|███▋      | 1096/3000 [25:14<44:49,  1.41s/it]

episode: 1095, reward: -271.02, average_reward: -266.4267425114451


 37%|███▋      | 1097/3000 [25:16<43:59,  1.39s/it]

episode: 1096, reward: -287.24, average_reward: -270.0093559397193


 37%|███▋      | 1098/3000 [25:17<43:46,  1.38s/it]

episode: 1097, reward: -282.19, average_reward: -270.9678251836916


 37%|███▋      | 1099/3000 [25:18<44:12,  1.40s/it]

episode: 1098, reward: -269.56, average_reward: -273.77331631810114


 37%|███▋      | 1100/3000 [25:20<44:11,  1.40s/it]

episode: 1099, reward: -272.03, average_reward: -272.7375919674333


 37%|███▋      | 1101/3000 [25:21<43:38,  1.38s/it]

episode: 1100, reward: -282.31, average_reward: -275.88989422795464


 37%|███▋      | 1102/3000 [25:23<43:50,  1.39s/it]

episode: 1101, reward: -240.0, average_reward: -276.9769219187093


 37%|███▋      | 1103/3000 [25:24<43:55,  1.39s/it]

episode: 1102, reward: -282.29, average_reward: -272.3930720964985


 37%|███▋      | 1104/3000 [25:25<43:35,  1.38s/it]

episode: 1103, reward: -265.61, average_reward: -275.0538967811218


 37%|███▋      | 1105/3000 [25:27<44:02,  1.39s/it]

episode: 1104, reward: -265.49, average_reward: -272.4388054941456


 37%|███▋      | 1106/3000 [25:28<44:48,  1.42s/it]

episode: 1105, reward: -280.59, average_reward: -271.77282788130015


 37%|███▋      | 1107/3000 [25:30<44:56,  1.42s/it]

episode: 1106, reward: -279.74, average_reward: -272.72991886610964


 37%|███▋      | 1108/3000 [25:31<44:54,  1.42s/it]

episode: 1107, reward: -282.95, average_reward: -271.97983956451077


 37%|███▋      | 1109/3000 [25:32<44:39,  1.42s/it]

episode: 1108, reward: -952.28, average_reward: -272.0557814039163


 37%|███▋      | 1110/3000 [25:34<44:17,  1.41s/it]

episode: 1109, reward: -262.85, average_reward: -340.32846317006073


 37%|███▋      | 1111/3000 [25:35<44:06,  1.40s/it]

episode: 1110, reward: -1726.74, average_reward: -339.40997648092497


 37%|███▋      | 1112/3000 [25:37<44:21,  1.41s/it]

episode: 1111, reward: -725.77, average_reward: -483.85323922499464


 37%|███▋      | 1113/3000 [25:38<44:02,  1.40s/it]

episode: 1112, reward: -270.66, average_reward: -532.429787483484


 37%|███▋      | 1114/3000 [25:39<43:22,  1.38s/it]

episode: 1113, reward: -1267.4, average_reward: -531.2671774158921


 37%|███▋      | 1115/3000 [25:41<43:09,  1.37s/it]

episode: 1114, reward: -274.87, average_reward: -631.4464737753611


 37%|███▋      | 1116/3000 [25:42<43:14,  1.38s/it]

episode: 1115, reward: -266.32, average_reward: -632.3845907791743


 37%|███▋      | 1117/3000 [25:43<43:04,  1.37s/it]

episode: 1116, reward: -273.21, average_reward: -630.957981000561


 37%|███▋      | 1118/3000 [25:45<42:31,  1.36s/it]

episode: 1117, reward: -279.78, average_reward: -630.3052031201826


 37%|███▋      | 1119/3000 [25:46<42:54,  1.37s/it]

episode: 1118, reward: -276.02, average_reward: -629.9876600291278


 37%|███▋      | 1120/3000 [25:47<42:31,  1.36s/it]

episode: 1119, reward: -263.06, average_reward: -562.361532520424


 37%|███▋      | 1121/3000 [25:49<42:38,  1.36s/it]

episode: 1120, reward: -272.48, average_reward: -562.383089859861


 37%|███▋      | 1122/3000 [25:50<43:21,  1.39s/it]

episode: 1121, reward: -184.69, average_reward: -416.9573391294604


 37%|███▋      | 1123/3000 [25:52<43:06,  1.38s/it]

episode: 1122, reward: -932.53, average_reward: -362.84988817253463


 37%|███▋      | 1124/3000 [25:53<43:09,  1.38s/it]

episode: 1123, reward: -901.28, average_reward: -429.037072753547


 38%|███▊      | 1125/3000 [25:54<43:17,  1.39s/it]

episode: 1124, reward: -273.46, average_reward: -392.4250134652954


 38%|███▊      | 1126/3000 [25:56<43:20,  1.39s/it]

episode: 1125, reward: -269.75, average_reward: -392.2838442199571


 38%|███▊      | 1127/3000 [25:57<43:53,  1.41s/it]

episode: 1126, reward: -252.01, average_reward: -392.62656533190454


 38%|███▊      | 1128/3000 [25:59<43:55,  1.41s/it]

episode: 1127, reward: -271.41, average_reward: -390.50695808794006


 38%|███▊      | 1129/3000 [26:00<43:33,  1.40s/it]

episode: 1128, reward: -280.57, average_reward: -389.6706087183434


 38%|███▊      | 1130/3000 [26:01<43:32,  1.40s/it]

episode: 1129, reward: -277.05, average_reward: -390.12544416532626


 38%|███▊      | 1131/3000 [26:03<43:11,  1.39s/it]

episode: 1130, reward: -273.6, average_reward: -391.52422708068366


 38%|███▊      | 1132/3000 [26:04<42:50,  1.38s/it]

episode: 1131, reward: -272.92, average_reward: -391.6359571883753


 38%|███▊      | 1133/3000 [26:06<42:28,  1.37s/it]

episode: 1132, reward: -279.98, average_reward: -400.45914009200357


 38%|███▊      | 1134/3000 [26:07<42:54,  1.38s/it]

episode: 1133, reward: -264.77, average_reward: -335.2038971502762


 38%|███▊      | 1135/3000 [26:08<42:29,  1.37s/it]

episode: 1134, reward: -263.09, average_reward: -271.55285263320735


 38%|███▊      | 1136/3000 [26:10<42:15,  1.36s/it]

episode: 1135, reward: -275.11, average_reward: -270.51619112289575


 38%|███▊      | 1137/3000 [26:11<42:34,  1.37s/it]

episode: 1136, reward: -263.51, average_reward: -271.05277316446734


 38%|███▊      | 1138/3000 [26:12<42:31,  1.37s/it]

episode: 1137, reward: -267.67, average_reward: -272.2023354726913


 38%|███▊      | 1139/3000 [26:14<42:53,  1.38s/it]

episode: 1138, reward: -269.27, average_reward: -271.82837068206317


 38%|███▊      | 1140/3000 [26:15<43:21,  1.40s/it]

episode: 1139, reward: -278.44, average_reward: -270.6987624611966


 38%|███▊      | 1141/3000 [26:17<45:22,  1.46s/it]

episode: 1140, reward: -247.55, average_reward: -270.83832282334157


 38%|███▊      | 1142/3000 [26:18<44:20,  1.43s/it]

episode: 1141, reward: -274.06, average_reward: -268.23312503286513


 38%|███▊      | 1143/3000 [26:20<45:05,  1.46s/it]

episode: 1142, reward: -231.46, average_reward: -268.34718853752463


 38%|███▊      | 1144/3000 [26:21<44:18,  1.43s/it]

episode: 1143, reward: -265.44, average_reward: -263.49523641677763


 38%|███▊      | 1145/3000 [26:23<44:19,  1.43s/it]

episode: 1144, reward: -274.68, average_reward: -263.56173177445163


 38%|███▊      | 1146/3000 [26:24<44:06,  1.43s/it]

episode: 1145, reward: -260.69, average_reward: -264.72108828544884


 38%|███▊      | 1147/3000 [26:25<43:57,  1.42s/it]

episode: 1146, reward: -1374.26, average_reward: -263.2783081697511


 38%|███▊      | 1148/3000 [26:27<44:34,  1.44s/it]

episode: 1147, reward: -253.5, average_reward: -374.3534142794043


 38%|███▊      | 1149/3000 [26:28<44:40,  1.45s/it]

episode: 1148, reward: -285.81, average_reward: -372.9364129171464


 38%|███▊      | 1150/3000 [26:30<45:02,  1.46s/it]

In [None]:
def memory_policy():
    t = 0

    def policy(time_step):
        nonlocal t
        state = parse_obs(time_step.observation)
        action = agent.get_action(state)
        action = noise.get_action(action, t)
        action = denorm(action)
        t += 1
        return action

    return policy


policy = memory_policy()
viewer.launch(env, policy=policy)