In [1]:
import os
import os.path as osp
import gym
import numpy as np
import pickle

from scripts.run_evaluate_policy_all_levels import TestSample
from rrc_simulation.gym_wrapper.envs import cube_env, control_env
from rrc_simulation.gym_wrapper.envs.control_env import ResidualPolicyWrapper
from rrc_simulation.tasks import move_cube
from rrc_simulation.control import control_policy

from stable_baselines import HER, SAC

In [2]:
def get_ac_log(rpath):
    with open(rpath, 'rb') as fh:
        action_log = pickle.load(fh)
    return action_log

In [19]:
failed_root_dir = './scripts/output/runs_190920/failed_samples/'
runs = os.listdir(failed_root_dir)
run_info = [fp.split('_') for fp in runs]
run_ids = ['_'.join(r[:2]) for r in run_info]
levels = [int(r[2][-1]) for r in run_info]
run_itr = [int(r[3][1:]) for r in run_info]

failure_runs = {}
for r, rid in zip(runs, run_ids):
    rpath = osp.join(failed_root_dir, r)
    if rid in failure_runs:
        failure_runs[rid].append(get_ac_log(rpath))
    else:
        failure_runs[rid] = [get_ac_log(rpath)]
        
run_samples = {}
run_log_dir = './scripts/output/runs_190920/'
for run_dir in os.listdir(run_log_dir):
    if run_dir == 'failed_samples': continue
    test_data = osp.join(run_log_dir, run_dir, 'test_data.p')
    with open(test_data, 'rb') as fh:
        samples = pickle.load(fh)
    run_samples[run_dir] = samples
        
rews = []
for r in run_ids:
    for ac_log in failure_runs[r]:
        rews.append(ac_log['final_accum_reward'])
np.mean(rews), np.std(rews), np.min(rews), np.max(rews)

(-927.5112018659132,
 473.8744518089906,
 -2008.2112119621045,
 -407.17932998294583)

In [12]:
failure_runs['run_1'][-1]['final_object_pose'], run_samples['run_1'][-1].goal_pose_json

({'position': [-0.002880135403864411,
   -0.0016676770374104387,
   0.08839914713378444],
  'orientation': [-0.0853753975359607,
   0.0017868020914121323,
   -0.5064862088123347,
   0.8580090728639009]},
 '{"position": [-0.039717745661856145, 0.012994802397090024, 0.07257097640082488], "orientation": [0, 0, 0, 1]}')

In [None]:
# initial_pose = move_cube.Pose(position=np.array([0,0,0.0325]), orientation=np.array([0,0,0,1]))
initial_pose = move_cube.Pose(np.array([0.02176933,0.11905757,0.0325]),
                              np.array([0,0,0.47478757,0.88010043]))
goal_pose =  move_cube.Pose(position=np.array([0,0,0.0825]), orientation=np.array([0,0,0,1]))

def run_eval(sample, n_eps=3):
    intial_pose = move_cube.Pose.from_json(sample.init_pose_json)
    goal_pose = move_cube.Pose.from_json(sample.goal_pose_json)
    difficulty = sample.difficulty
    initializer = cube_env.FixedInitializer(
        difficulty, initial_pose, goal_pose
    )
    action_type = cube_env.ActionType.TORQUE_AND_POSITION

    env = gym.make(
        "rrc_simulation.gym_wrapper:real_robot_challenge_phase_1-v1",
        initializer=initializer,
        action_type=action_type,
        visualization=False,
    )

    policy = control_policy.HierarchicalControllerPolicy(action_space=env.action_space,
                initial_pose=initial_pose, goal_pose=goal_pose,
                load_dir=rl_load_dir)
    env = ResidualPolicyWrapper(env, policy)
    rews, infos = [], []
    for _ in range(n_eps):
        obs, done = env.reset(), False
        while not done:
            env.step(policy.predict(obs))

In [59]:
rl_load_dir = './scripts/models/push_reorient/push_reorient_s0/'

In [61]:
from spinup.utils.test_policy import load_policy_and_env

e, p = load_policy_and_env(rl_load_dir)



Loading from ./scripts/models/push_reorient/push_reorient_s0/pyt_save/model99.pt.




In [29]:
sorted([p for p in os.listdir(rl_load_dir) if '.zip' in p])

['1e6-steps.zip', '2e6-steps.zip']

In [31]:
spinup_exp_root = '/scr1/Developer/Projects/spinningup/notebooks/data/'
rl_load_dir = osp.join(spinup_exp_root, 'HER-SAC_sparse_push/2020-09-18_12-28-22/')
saves = sorted([p for p in os.listdir(rl_load_dir) if '.zip' in p])
rl_load_path = osp.join(rl_load_dir, saves[-1])

In [42]:
sample = run_samples[run_ids[0]][run_itr[0]]

initial_pose = move_cube.Pose.from_json(sample.init_pose_json)
goal_pose = move_cube.Pose.from_json(sample.goal_pose_json)
difficulty = sample.difficulty
initializer = cube_env.FixedInitializer(
    difficulty, initial_pose, goal_pose
)

env = gym.make(
    "rrc_simulation.gym_wrapper:real_robot_challenge_phase_1-v1",
    initializer=initializer,
    action_type=cube_env.ActionType.TORQUE_AND_POSITION,
    visualization=False,
)

rl_load_dir = '/scr1/Developer/Projects/spinningup/notebooks/data/HER-SAC_sparse_push/2020-09-21_12-38-42/'

policy = control_policy.HierarchicalControllerPolicy(action_space=env.action_space,
            initial_pose=initial_pose, goal_pose=goal_pose,
            load_dir=rl_load_path)
env = ResidualPolicyWrapper(env, policy)

init position: [ 0.11182217 -0.06591327  0.0325    ], goal position: [-0.00309177 -0.0334373   0.0325    ], dist: 0.11941482992693092
init orientation: [ 0.          0.          0.99113608 -0.13285057], goal orientation: [0 0 0 1]


TypeError: load_policy() takes 2 positional arguments but 3 were given

In [11]:
is_done = False
observation = env.reset()
accumulated_reward = 0

if isinstance(policy, (control_policy.HierarchicalControllerPolicy, 
                       control_policy.ImpedanceControllerPolicy)):
    policy.set_waypoints(env.platform, observation)

while not is_done:
    action = policy.predict(observation)
    observation, reward, is_done, info = env.step(action)
    accumulated_reward += reward

print(accumulated_reward, info)

float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32


KeyboardInterrupt: 

In [4]:
reward

-0.25868281864735637