In [5]:
import numpy as np
from dm_control import viewer
import matplotlib.pyplot as plt
from tqdm import tqdm
from simulation.dm_control.ddpg.ddpg import DDPGagent, OUNoise
import simulation.dm_control.simulation_control.environments as environments

In [6]:
random_state = np.random.RandomState(42)

PATH_MODEL = 'ddpg_actor.pt'
NUM_EPISODES = 500
BATCH_SIZE = 128
DURATION = 200
ACTOR_LEARNING_RATE=1e-4
CRITIC_LEARNING_RATE=1e-3
GAMMA=0.99
TAU=1e-2

env = environments.load(domain_name='passive_hand', task_name='lift_sparse')
action_spec = env.action_spec()
dim_action = action_spec.shape[0]
dim_obs = 6

In [7]:
agent = DDPGagent(
    dim_obs,
    dim_action,
    actor_learning_rate=ACTOR_LEARNING_RATE,
    critic_learning_rate=CRITIC_LEARNING_RATE,
    gamma=GAMMA,
    tau=TAU
)
noise = OUNoise(dim_action, action_spec.minimum, action_spec.maximum)

def denorm(a): #  use on model output before passing to env
    act_k = (action_spec.maximum - action_spec.minimum) / 2.
    act_b = (action_spec.maximum + action_spec.minimum) / 2.
    return a * act_k + act_b

def norm(a): # use on env output before passing to model
    act_k_inv = 2. / (action_spec.maximum - action_spec.minimum)
    act_b = (action_spec.maximum + action_spec.minimum) / 2.
    return act_k_inv * (a - act_b)

def parse(obs):
    """
    Take only gripper position and object position???
    """
    grip_pos = obs['grip_pos']
    object_pos = obs['object_pos']
    return np.append(grip_pos, object_pos)

def calc_reward(obs):
    """
    Calculate the reward based on stuffs
    observation=OrderedDict(
    [('grip_pos', array([1.38313716, 0.74702476, 0.58570326])),
    ('grip_velp', array([-1.02185151,  0.06579234, -0.62871064])),
    ('grip_velr', array([-0.01182298,  0.16122969, -0.00438519])),
    ('grip_rot', array([-0.00114049, -0.01091738,  0.00111363])),
    ('object_pos', array([1.45921682, 0.74832965, 0.41110006])),
    ('object_rel_pos', array([ 0.07607965,  0.00130489, -0.1746032 ])),
    ('object_velp', array([-0.10321769, -0.1453771 , -0.1968166 ])),
    ('object_velr', array([ 1.90336989, -0.97895901, -8.12644349])),
    ('object_rel_velp', array([ 0.91863382, -0.21116944,  0.43189404])),
    ('simulation_time', 0.12000000000000009)]))
    """
    grip_pos = obs['grip_pos']
    object_pos = obs['object_pos']
    obj_height = object_pos[2] - 0.41110006
    rel_dist = np.sum((object_pos - grip_pos) ** 2)**(1/2)
    # print(f'h: {obj_height}, d: {rel_dist}, r: {obj_height - rel_dist}')
    return obj_height - rel_dist


In [None]:
rewards = []
avg_rewards = []

for episode in tqdm(range(NUM_EPISODES)):
    time_step = env.reset()
    state = parse(time_step.observation)
    noise.reset()
    episode_reward = 0
    for step in range(DURATION):
        action = agent.get_action(state)
        action = noise.get_action(action, step)
        try:
            time_step_2 = env.step(denorm(action))
        except:
            print(f'Physics Error: {action}')
            continue
        state_2 = parse(time_step_2.observation)
        reward = calc_reward(time_step_2.observation)
        agent.memory.push(state, action, reward, state_2, -1)
        state = state_2
        if len(agent.memory) > BATCH_SIZE:
            agent.update(BATCH_SIZE)
        episode_reward += reward
    print(f"episode: {episode}, "
      f"reward: {np.round(episode_reward, decimals=2)}, "
      f"average_reward: {np.mean(rewards[-10:])}")
    rewards.append(episode_reward)
    avg_rewards.append(np.mean(rewards[-10:]))

agent.save(PATH_MODEL)

plt.plot(rewards)
plt.plot(avg_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 22%|██▏       | 111/500 [05:26<20:49,  3.21s/it]

episode: 0, reward: -97.27, average_reward: nan
episode: 1, reward: -117.83, average_reward: -97.27305677930904
episode: 2, reward: -92.38, average_reward: -107.55364877345659
episode: 3, reward: -153.12, average_reward: -102.4953980628493
episode: 4, reward: -104.81, average_reward: -115.15163667034633
episode: 5, reward: -131.17, average_reward: -113.08317737813145
episode: 6, reward: -228.97, average_reward: -116.09690395391101
episode: 7, reward: -231.22, average_reward: -132.22161001836182
episode: 8, reward: -228.21, average_reward: -144.5966594262432
episode: 9, reward: -221.41, average_reward: -153.8872847844104
episode: 10, reward: -211.41, average_reward: -160.63986992346207
episode: 11, reward: -114.14, average_reward: -172.0537181940212
episode: 12, reward: -70.0, average_reward: -171.6842842785997
episode: 13, reward: -150.38, average_reward: -169.44594957236828
episode: 14, reward: -98.1, average_reward: -169.17230978810807
episode: 15, reward: -89.0, average_reward: -168

In [None]:
agent.load(PATH_MODEL)
# Define a uniform random policy.
def random_policy(time_step):
    state = parse(time_step.observation)
    x = denorm(agent.get_action(state))
    print(x)
    return x

# Launch the viewer application.
viewer.launch(env, policy=random_policy)
