In [1]:
import copy

import matplotlib
import numpy as np
from dm_control import viewer
import matplotlib.pyplot as plt
from matplotlib import animation
from tqdm import tqdm
from simulation.dm_control.ddpg.ddpg import DDPGagent, OUNoise
import simulation.dm_control.simulation_control.environments as environments

In [2]:
random_state = np.random.RandomState(42)

SAVE_VIDEOS = False
RESUME_TRAINING = False
PATH_MODEL = 'passive_hand'
NUM_EPISODES = 200
BATCH_SIZE = 128
DURATION = 100
ACTOR_LEARNING_RATE=1e-4
CRITIC_LEARNING_RATE=1e-3
GAMMA=0.99
TAU=1e-2

env = environments.load(domain_name='passive_hand', task_name='lift_sparse')
action_spec = env.action_spec()
dim_action = action_spec.shape[0]
dim_obs = 21

In [3]:
agent = DDPGagent(
    dim_obs,
    dim_action,
    actor_learning_rate=ACTOR_LEARNING_RATE,
    critic_learning_rate=CRITIC_LEARNING_RATE,
    gamma=GAMMA,
    tau=TAU
)
if RESUME_TRAINING: agent.load(PATH_MODEL)
noise = OUNoise(dim_action, action_spec.minimum, action_spec.maximum)

def denorm(a): #  use on model output before passing to env
    act_k = (action_spec.maximum - action_spec.minimum) / 2.
    act_b = (action_spec.maximum + action_spec.minimum) / 2.
    return a * act_k + act_b

def norm(a): # use on env output before passing to model
    act_k_inv = 2. / (action_spec.maximum - action_spec.minimum)
    act_b = (action_spec.maximum + action_spec.minimum) / 2.
    return act_k_inv * (a - act_b)

def parse_obs(obs):
    """
    Take only gripper position and object position???
    """
    x = np.array([])
    for k, v in obs.items():
        if k == 'simulation_time': continue
        x = np.append(x, v)
    return x

In [4]:
def save_video(frames, path, framerate=30):
    height, width, _ = frames[0].shape
    dpi = 70
    orig_backend = matplotlib.get_backend()
    matplotlib.use('Agg')  # Switch to headless 'Agg' to inhibit figure rendering.
    fig, ax = plt.subplots(1, 1, figsize=(width / dpi, height / dpi), dpi=dpi)
    matplotlib.use(orig_backend)  # Switch back to the original backend.
    ax.set_axis_off()
    ax.set_aspect('equal')
    ax.set_position([0, 0, 1, 1])
    im = ax.imshow(frames[0])
    def update(frame):
        im.set_data(frame)
        return [im]
    interval = 1000/framerate
    anim = animation.FuncAnimation(fig=fig, func=update, frames=frames,
                                   interval=interval, blit=True, repeat=False)
    anim.save(f'{path}.mp4', writer='ffmpeg')


In [None]:
frames = []
rewards = []
avg_rewards = []

for episode in tqdm(range(NUM_EPISODES)):
    frames.clear()

    time_step = env.reset()
    state = parse_obs(time_step.observation)
    noise.reset()
    episode_reward = 0
    episode_reward_history = []
    for step in range(DURATION):
        if SAVE_VIDEOS:
            camera0 = env.physics.render(camera_id=3, height=200, width=200)
            frames.append(np.hstack((camera0,)))


        action = agent.get_action(state)
        action = noise.get_action(action, step)
        try:
            time_step_2 = env.step(denorm(action))
        except:
            print(f'Physics Error: {action}')
            break
        state_2 = parse_obs(time_step_2.observation)
        reward = time_step_2.reward
        agent.memory.push(state, action, reward, state_2, -1)
        state = state_2
        if len(agent.memory) > BATCH_SIZE:
            agent.update(BATCH_SIZE)
        episode_reward += reward
        episode_reward_history.append(reward)
    if SAVE_VIDEOS:
        dest = f'ep{episode}_rew{int(episode_reward)}'
        with open('history.txt', 'a') as f:
            f.write(f'{dest}\n{str(episode_reward_history)}\n')
        save_video(frames, f'ep{episode}_rew{int(episode_reward)}', framerate=1./env.control_timestep())
    print(f"episode: {episode}, "
      f"reward: {np.round(episode_reward, decimals=2)}, "
      f"average_reward: {np.mean(rewards[-10:])}")
    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))

agent.save(PATH_MODEL)

plt.plot(rewards)
plt.plot(avg_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  0%|          | 1/200 [00:00<01:13,  2.72it/s]

episode: 0, reward: -59.45, average_reward: nan


  1%|          | 2/200 [00:01<02:56,  1.12it/s]

episode: 1, reward: -76.18, average_reward: -59.445239261050794


  2%|▏         | 3/200 [00:03<03:42,  1.13s/it]

episode: 2, reward: -46.22, average_reward: -67.81367064400875


  2%|▏         | 4/200 [00:04<04:06,  1.26s/it]

episode: 3, reward: -47.78, average_reward: -60.61485863047003


  2%|▎         | 5/200 [00:05<04:17,  1.32s/it]

episode: 4, reward: -37.67, average_reward: -57.4063276567937


  3%|▎         | 6/200 [00:07<04:25,  1.37s/it]

episode: 5, reward: -32.18, average_reward: -53.45823261578715


  4%|▎         | 7/200 [00:08<04:28,  1.39s/it]

episode: 6, reward: -40.81, average_reward: -49.911401872965605


  4%|▍         | 8/200 [00:10<04:28,  1.40s/it]

episode: 7, reward: -78.35, average_reward: -48.61053777887065


  4%|▍         | 9/200 [00:11<04:31,  1.42s/it]

episode: 8, reward: -35.45, average_reward: -52.32744489479244


  5%|▌         | 10/200 [00:13<04:29,  1.42s/it]

episode: 9, reward: -36.9, average_reward: -50.452726613490334


  6%|▌         | 11/200 [00:14<04:29,  1.43s/it]

episode: 10, reward: -42.51, average_reward: -49.097928471185114


  6%|▌         | 12/200 [00:15<04:28,  1.43s/it]

episode: 11, reward: -34.34, average_reward: -47.40441686300634


  6%|▋         | 13/200 [00:17<04:27,  1.43s/it]

episode: 12, reward: -40.36, average_reward: -43.2203789455498


  7%|▋         | 14/200 [00:18<04:28,  1.44s/it]

episode: 13, reward: -40.23, average_reward: -42.63481952938722


  8%|▊         | 15/200 [00:20<04:30,  1.46s/it]

episode: 14, reward: -44.83, average_reward: -41.879548703157205


  8%|▊         | 16/200 [00:21<04:23,  1.43s/it]

episode: 15, reward: -107.67, average_reward: -42.59639196482247


  8%|▊         | 17/200 [00:23<04:19,  1.42s/it]

episode: 16, reward: -65.57, average_reward: -50.14536178532355


  9%|▉         | 18/200 [00:24<04:24,  1.45s/it]

episode: 17, reward: -89.98, average_reward: -52.62188915652299


 10%|▉         | 19/200 [00:26<04:23,  1.45s/it]

episode: 18, reward: -38.42, average_reward: -53.785788355988316


 10%|█         | 20/200 [00:27<04:23,  1.46s/it]

episode: 19, reward: -45.39, average_reward: -54.08227491682029


 10%|█         | 21/200 [00:29<04:20,  1.46s/it]

episode: 20, reward: -56.55, average_reward: -54.93051530991818


 11%|█         | 22/200 [00:30<04:20,  1.46s/it]

episode: 21, reward: -47.5, average_reward: -56.334627644669276


 12%|█▏        | 23/200 [00:31<04:15,  1.44s/it]

episode: 22, reward: -60.38, average_reward: -57.65052745049964


 12%|█▏        | 24/200 [00:33<04:14,  1.44s/it]

episode: 23, reward: -66.29, average_reward: -59.65196261728606


 12%|█▎        | 25/200 [00:34<04:14,  1.46s/it]

episode: 24, reward: -35.08, average_reward: -62.25807275231265


 13%|█▎        | 26/200 [00:36<04:14,  1.46s/it]

episode: 25, reward: -32.83, average_reward: -61.282220885870665


 14%|█▎        | 27/200 [00:37<04:09,  1.44s/it]

episode: 26, reward: -97.54, average_reward: -53.798076262392215


 14%|█▍        | 28/200 [00:39<04:07,  1.44s/it]

episode: 27, reward: -47.84, average_reward: -56.99460434223935


 14%|█▍        | 29/200 [00:40<04:04,  1.43s/it]

episode: 28, reward: -43.06, average_reward: -52.78042423519931


 15%|█▌        | 30/200 [00:42<04:04,  1.44s/it]

episode: 29, reward: -42.64, average_reward: -53.24466623573376


 16%|█▌        | 31/200 [00:43<04:03,  1.44s/it]

episode: 30, reward: -48.99, average_reward: -52.96975230327389


 16%|█▌        | 32/200 [00:44<04:04,  1.46s/it]

episode: 31, reward: -44.64, average_reward: -52.21314694798673


 16%|█▋        | 33/200 [00:46<04:01,  1.45s/it]

episode: 32, reward: -45.22, average_reward: -51.92657524346579


 17%|█▋        | 34/200 [00:47<04:03,  1.47s/it]

episode: 33, reward: -45.79, average_reward: -50.410595363918986


 18%|█▊        | 35/200 [00:49<04:01,  1.46s/it]

episode: 34, reward: -43.14, average_reward: -48.36025440729286


 18%|█▊        | 36/200 [00:50<04:02,  1.48s/it]

episode: 35, reward: -40.51, average_reward: -49.16662116370464


 18%|█▊        | 37/200 [00:52<03:56,  1.45s/it]

episode: 36, reward: -57.37, average_reward: -49.93539925037213


 19%|█▉        | 38/200 [00:53<03:53,  1.44s/it]

episode: 37, reward: -45.1, average_reward: -45.91905165234563


 20%|█▉        | 39/200 [00:55<03:50,  1.43s/it]

episode: 38, reward: -52.85, average_reward: -45.644818096474395


 20%|██        | 40/200 [00:56<03:45,  1.41s/it]

episode: 39, reward: -50.13, average_reward: -46.623608423941015


 20%|██        | 41/200 [00:57<03:44,  1.41s/it]

episode: 40, reward: -38.94, average_reward: -47.37270067060082


 21%|██        | 42/200 [00:59<03:42,  1.41s/it]

episode: 41, reward: -31.18, average_reward: -46.368503580654775


 22%|██▏       | 43/200 [01:00<03:41,  1.41s/it]

episode: 42, reward: -32.58, average_reward: -45.023475110848224


 22%|██▏       | 44/200 [01:02<03:37,  1.40s/it]

episode: 43, reward: -48.41, average_reward: -43.760297827813524


 22%|██▎       | 45/200 [01:03<03:37,  1.41s/it]

episode: 44, reward: -42.2, average_reward: -44.022498825818516


 23%|██▎       | 46/200 [01:04<03:36,  1.41s/it]

episode: 45, reward: -35.09, average_reward: -43.92869806256742


 24%|██▎       | 47/200 [01:06<03:33,  1.39s/it]

episode: 46, reward: -86.9, average_reward: -43.38663210625069


 24%|██▍       | 48/200 [01:07<03:31,  1.39s/it]

episode: 47, reward: -32.85, average_reward: -46.339824256226194


 24%|██▍       | 49/200 [01:09<03:30,  1.39s/it]

episode: 48, reward: -51.87, average_reward: -45.11496937149981


 25%|██▌       | 50/200 [01:10<03:29,  1.39s/it]

episode: 49, reward: -40.36, average_reward: -45.01645636363229


 26%|██▌       | 51/200 [01:11<03:27,  1.39s/it]

episode: 50, reward: -47.54, average_reward: -44.03934221662644


 26%|██▌       | 52/200 [01:13<03:25,  1.39s/it]

episode: 51, reward: -32.37, average_reward: -44.899358060156786


 26%|██▋       | 53/200 [01:14<03:24,  1.39s/it]

episode: 52, reward: -41.23, average_reward: -45.01816198420781


 27%|██▋       | 54/200 [01:15<03:22,  1.39s/it]

episode: 53, reward: -34.18, average_reward: -45.88318666995941


 28%|██▊       | 55/200 [01:17<03:20,  1.39s/it]

episode: 54, reward: -40.95, average_reward: -44.46065992650292


 28%|██▊       | 56/200 [01:18<03:21,  1.40s/it]

episode: 55, reward: -40.6, average_reward: -44.335729802272176


 28%|██▊       | 57/200 [01:20<03:20,  1.40s/it]

episode: 56, reward: -32.38, average_reward: -44.88604299850177


 29%|██▉       | 58/200 [01:21<03:19,  1.40s/it]

episode: 57, reward: -53.1, average_reward: -39.43401728059302


 30%|██▉       | 59/200 [01:22<03:17,  1.40s/it]

episode: 58, reward: -46.29, average_reward: -41.45840477336397


 30%|███       | 60/200 [01:24<03:17,  1.41s/it]

episode: 59, reward: -33.88, average_reward: -40.90098409179775


 30%|███       | 61/200 [01:25<03:14,  1.40s/it]

episode: 60, reward: -39.44, average_reward: -40.25293669799142


 31%|███       | 62/200 [01:27<03:09,  1.37s/it]

episode: 61, reward: -55.15, average_reward: -39.4429786522488


 32%|███▏      | 63/200 [01:28<03:10,  1.39s/it]

episode: 62, reward: -45.48, average_reward: -41.721093034889634


 32%|███▏      | 64/200 [01:29<03:07,  1.38s/it]

episode: 63, reward: -31.67, average_reward: -42.14552106536202


 32%|███▎      | 65/200 [01:31<03:07,  1.39s/it]

episode: 64, reward: -49.03, average_reward: -41.89449962135926


 33%|███▎      | 66/200 [01:32<03:04,  1.38s/it]

episode: 65, reward: -39.34, average_reward: -42.70252789302168


 34%|███▎      | 67/200 [01:34<03:04,  1.39s/it]

episode: 66, reward: -47.22, average_reward: -42.57702454833718


 34%|███▍      | 68/200 [01:35<03:02,  1.38s/it]

episode: 67, reward: -31.03, average_reward: -44.06064995588396


 34%|███▍      | 69/200 [01:36<03:01,  1.38s/it]

episode: 68, reward: -59.75, average_reward: -41.854252872433804


 35%|███▌      | 70/200 [01:38<02:59,  1.38s/it]

episode: 69, reward: -33.91, average_reward: -43.19996441582502


 36%|███▌      | 71/200 [01:39<03:00,  1.40s/it]

episode: 70, reward: -44.59, average_reward: -43.203420205388184


 36%|███▌      | 72/200 [01:41<03:00,  1.41s/it]

episode: 71, reward: -35.79, average_reward: -43.71818309878973


 36%|███▋      | 73/200 [01:42<02:58,  1.41s/it]

episode: 72, reward: -36.46, average_reward: -41.78151928052449


 37%|███▋      | 74/200 [01:43<02:54,  1.39s/it]

episode: 73, reward: -63.02, average_reward: -40.87947810037531


 38%|███▊      | 75/200 [01:45<02:55,  1.40s/it]

episode: 74, reward: -40.58, average_reward: -44.01420688502374


 38%|███▊      | 76/200 [01:46<02:52,  1.39s/it]

episode: 75, reward: -50.84, average_reward: -43.16890455360889


 38%|███▊      | 77/200 [01:48<02:52,  1.40s/it]

episode: 76, reward: -39.64, average_reward: -44.31905984624346


 39%|███▉      | 78/200 [01:49<02:50,  1.40s/it]

episode: 77, reward: -39.3, average_reward: -43.56127135248833


 40%|███▉      | 79/200 [01:50<02:49,  1.40s/it]

episode: 78, reward: -27.68, average_reward: -44.38766762020637


 40%|████      | 80/200 [01:52<02:48,  1.40s/it]

episode: 79, reward: -36.36, average_reward: -41.18056034246671


 40%|████      | 81/200 [01:53<02:48,  1.42s/it]

episode: 80, reward: -34.01, average_reward: -41.42550874450243


 41%|████      | 82/200 [01:55<02:49,  1.44s/it]

episode: 81, reward: -41.9, average_reward: -40.3676780590114


 42%|████▏     | 83/200 [01:56<02:48,  1.44s/it]

episode: 82, reward: -32.16, average_reward: -40.979065813372685


 42%|████▏     | 84/200 [01:58<02:46,  1.43s/it]

episode: 83, reward: -42.46, average_reward: -40.548793248558724


 42%|████▎     | 85/200 [01:59<02:44,  1.43s/it]

episode: 84, reward: -39.44, average_reward: -38.49309724654163


 43%|████▎     | 86/200 [02:00<02:41,  1.42s/it]

episode: 85, reward: -30.32, average_reward: -38.379201415222255


 44%|████▎     | 87/200 [02:02<02:39,  1.41s/it]

episode: 86, reward: -45.72, average_reward: -36.32648960244232


 44%|████▍     | 88/200 [02:03<02:36,  1.40s/it]

episode: 87, reward: -34.29, average_reward: -36.9342919743493


 44%|████▍     | 89/200 [02:05<02:36,  1.41s/it]

episode: 88, reward: -40.17, average_reward: -36.433252407005874


 45%|████▌     | 90/200 [02:06<02:36,  1.42s/it]

episode: 89, reward: -35.96, average_reward: -37.68208513953862


 46%|████▌     | 91/200 [02:07<02:33,  1.41s/it]

episode: 90, reward: -50.71, average_reward: -37.641550383891776


 46%|████▌     | 92/200 [02:09<02:31,  1.40s/it]

episode: 91, reward: -38.7, average_reward: -39.31165093224793


 46%|████▋     | 93/200 [02:10<02:28,  1.39s/it]

episode: 92, reward: -54.39, average_reward: -38.99134754018981


 47%|████▋     | 94/200 [02:12<02:27,  1.39s/it]

episode: 93, reward: -35.2, average_reward: -41.21496144889677


 48%|████▊     | 95/200 [02:13<02:24,  1.38s/it]

episode: 94, reward: -63.51, average_reward: -40.48826589462668


 48%|████▊     | 96/200 [02:14<02:23,  1.38s/it]

episode: 95, reward: -36.69, average_reward: -42.895349019125625


 48%|████▊     | 97/200 [02:16<02:22,  1.38s/it]

episode: 96, reward: -41.36, average_reward: -43.533173545571415


 49%|████▉     | 98/200 [02:17<02:21,  1.38s/it]

episode: 97, reward: -39.41, average_reward: -43.09683351687501


 50%|████▉     | 99/200 [02:19<02:22,  1.41s/it]

episode: 98, reward: -43.89, average_reward: -43.60904996066732


 50%|█████     | 100/200 [02:20<02:20,  1.40s/it]

episode: 99, reward: -34.91, average_reward: -43.98192343862984


 50%|█████     | 101/200 [02:21<02:18,  1.40s/it]

episode: 100, reward: -35.34, average_reward: -43.87753309190577


 51%|█████     | 102/200 [02:23<02:17,  1.41s/it]

episode: 101, reward: -31.41, average_reward: -42.339872851758756


 52%|█████▏    | 103/200 [02:24<02:15,  1.39s/it]

episode: 102, reward: -57.19, average_reward: -41.610706092003554


 52%|█████▏    | 104/200 [02:25<02:14,  1.40s/it]

episode: 103, reward: -46.08, average_reward: -41.89023133948684


 52%|█████▎    | 105/200 [02:27<02:12,  1.39s/it]

episode: 104, reward: -50.39, average_reward: -42.97893343928276


 53%|█████▎    | 106/200 [02:28<02:09,  1.38s/it]

episode: 105, reward: -58.6, average_reward: -41.66639637446751


 54%|█████▎    | 107/200 [02:30<02:08,  1.38s/it]

episode: 106, reward: -41.38, average_reward: -43.857391029357224


 54%|█████▍    | 108/200 [02:31<02:06,  1.38s/it]

episode: 107, reward: -47.79, average_reward: -43.859757721095306


 55%|█████▍    | 109/200 [02:32<02:06,  1.39s/it]

episode: 108, reward: -33.31, average_reward: -44.69795226771405


 55%|█████▌    | 110/200 [02:33<01:43,  1.15s/it]

Physics Error: [ 1.          1.         -0.771772   -0.83223305  1.        ]
episode: 109, reward: -922041.11, average_reward: -43.6398900417624


 56%|█████▌    | 111/200 [02:34<01:48,  1.21s/it]

episode: 110, reward: -35.09, average_reward: -92244.26004184913


 56%|█████▌    | 112/200 [02:36<01:51,  1.26s/it]

episode: 111, reward: -46.33, average_reward: -92244.23570402236


 56%|█████▋    | 113/200 [02:37<01:52,  1.29s/it]

episode: 112, reward: -70.77, average_reward: -92245.72791775457


 57%|█████▋    | 114/200 [02:38<01:52,  1.31s/it]

episode: 113, reward: -69.06, average_reward: -92247.08594701973


 57%|█████▊    | 115/200 [02:40<01:52,  1.32s/it]

episode: 114, reward: -78.85, average_reward: -92249.38368978832


 58%|█████▊    | 116/200 [02:41<01:52,  1.34s/it]

episode: 115, reward: -114.59, average_reward: -92252.22973845786


 58%|█████▊    | 117/200 [02:43<01:54,  1.38s/it]

episode: 116, reward: -66.11, average_reward: -92257.82830779003


 59%|█████▉    | 118/200 [02:44<01:53,  1.39s/it]

episode: 117, reward: -98.28, average_reward: -92260.30098338643


 60%|█████▉    | 119/200 [02:45<01:53,  1.40s/it]

episode: 118, reward: -52.48, average_reward: -92265.34982300164


 60%|██████    | 120/200 [02:47<01:52,  1.40s/it]

episode: 119, reward: -63.76, average_reward: -92267.26691011446


 60%|██████    | 121/200 [02:48<01:50,  1.40s/it]

episode: 120, reward: -70.01, average_reward: -69.53194318222594


 61%|██████    | 122/200 [02:50<01:50,  1.42s/it]

episode: 121, reward: -44.79, average_reward: -73.02353051102125


 62%|██████▏   | 123/200 [02:51<01:47,  1.40s/it]

episode: 122, reward: -116.99, average_reward: -72.86971811222647


 62%|██████▏   | 124/200 [02:52<01:46,  1.40s/it]

episode: 123, reward: -114.44, average_reward: -77.49202977229149


 62%|██████▎   | 125/200 [02:54<01:46,  1.42s/it]

episode: 124, reward: -87.89, average_reward: -82.02960301907402


 63%|██████▎   | 126/200 [02:55<01:46,  1.44s/it]

episode: 125, reward: -87.62, average_reward: -82.9337904461998


 64%|██████▎   | 127/200 [02:57<01:44,  1.44s/it]

episode: 126, reward: -35.78, average_reward: -80.23681959785598


 64%|██████▍   | 128/200 [02:58<01:42,  1.43s/it]

episode: 127, reward: -30.81, average_reward: -77.20410637351799


 64%|██████▍   | 129/200 [03:00<01:41,  1.43s/it]

episode: 128, reward: -31.15, average_reward: -70.4571357324523


 65%|██████▌   | 130/200 [03:01<01:39,  1.42s/it]

episode: 129, reward: -40.81, average_reward: -68.32323126735699


 66%|██████▌   | 131/200 [03:03<01:37,  1.42s/it]

episode: 130, reward: -65.83, average_reward: -66.02778508509661


 66%|██████▌   | 132/200 [03:04<01:36,  1.41s/it]

episode: 131, reward: -69.95, average_reward: -65.61025624471472


 66%|██████▋   | 133/200 [03:05<01:35,  1.43s/it]

episode: 132, reward: -65.28, average_reward: -68.12586653813909


 67%|██████▋   | 134/200 [03:07<01:34,  1.43s/it]

episode: 133, reward: -56.96, average_reward: -62.95485265815125


 68%|██████▊   | 135/200 [03:08<01:32,  1.43s/it]

episode: 134, reward: -82.67, average_reward: -57.20690788949629


 68%|██████▊   | 136/200 [03:10<01:31,  1.42s/it]

episode: 135, reward: -64.54, average_reward: -56.685358328793725


 68%|██████▊   | 137/200 [03:11<01:30,  1.44s/it]

episode: 136, reward: -105.1, average_reward: -54.37704228485179


 69%|██████▉   | 138/200 [03:13<01:29,  1.44s/it]

episode: 137, reward: -89.63, average_reward: -61.30929383665832


 70%|██████▉   | 139/200 [03:13<01:12,  1.19s/it]

Physics Error: [-0.5860078   1.         -1.          0.05265313 -0.60111653]
episode: 138, reward: -245087.7, average_reward: -67.19132118319929


 70%|███████   | 140/200 [03:15<01:16,  1.27s/it]

episode: 139, reward: -94.67, average_reward: -24572.84690762173


 70%|███████   | 141/200 [03:16<01:20,  1.37s/it]

episode: 140, reward: -86.05, average_reward: -24578.232609536783


 71%|███████   | 142/200 [03:18<01:21,  1.40s/it]

episode: 141, reward: -110.1, average_reward: -24580.253999744273


 72%|███████▏  | 143/200 [03:19<01:20,  1.40s/it]

episode: 142, reward: -69.01, average_reward: -24584.269055733414


 72%|███████▏  | 144/200 [03:20<01:18,  1.40s/it]

episode: 143, reward: -62.64, average_reward: -24584.641643776424


 72%|███████▎  | 145/200 [03:22<01:16,  1.40s/it]

episode: 144, reward: -152.92, average_reward: -24585.210104834947


 73%|███████▎  | 146/200 [03:23<01:15,  1.40s/it]

episode: 145, reward: -96.44, average_reward: -24592.235282506845


In [None]:
t=-1
def policy(time_step):
    global t
    t += 1
    state = parse_obs(time_step.observation)
    action = agent.get_action(state)
    # print(time_step)
    action = noise.get_action(action, t)
    action = denorm(action)
    # print(action)
    # print(calc_reward(time_step.observation))
    return action

viewer.launch(env, policy=policy)

In [None]:

#@title Loading and simulating a `suite` task{vertical-output: true}

# Load the environment
# random_state = np.random.RandomState(42)
# env = suite.load('hopper', 'stand', task_kwargs={'random': random_state})

# Simulate episode with random actions
duration = 10  # Seconds
frames = []
ticks = []
rewards = []
observations = []

spec = env.action_spec()
time_step = env.reset()

while env.physics.data.time < duration:
    # action = random_state.uniform(spec.minimum, spec.maximum, spec.shape)
    action = agent.get_action(state)
    action = denorm(action)

    time_step = env.step(action)

    camera0 = env.physics.render(camera_id=3, height=200, width=200)
    # camera1 = env.physics.render(camera_id=3, height=500, width=500)
    frames.append(np.hstack((camera0,)))
    rewards.append(time_step.reward)
    observations.append(copy.deepcopy(time_step.observation))
    ticks.append(env.physics.data.time)

save_video(frames, 'lmao', framerate=1./env.control_timestep())

# Show video and plot reward and observations
# num_sensors = len(time_step.observation)
#
# _, ax = plt.subplots(1 + num_sensors, 1, sharex=True, figsize=(4, 8))
# ax[0].plot(ticks, rewards)
# ax[0].set_ylabel('reward')
# ax[-1].set_xlabel('time')
#
# for i, key in enumerate(time_step.observation):
#   data = np.asarray([observations[j][key] for j in range(len(observations))])
#   ax[i+1].plot(ticks, data, label=key)
#   ax[i+1].set_ylabel(key)