diff --git a/examples/torch/mtsac_metaworld_ml1_pick_place.py b/examples/torch/mtsac_metaworld_ml1_pick_place.py
index 9a63897e46..03c121b23e 100755
--- a/examples/torch/mtsac_metaworld_ml1_pick_place.py
+++ b/examples/torch/mtsac_metaworld_ml1_pick_place.py
@@ -89,6 +89,7 @@ def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
                   qf2=qf2,
                   gradient_steps_per_itr=150,
                   max_path_length=150,
+                  max_eval_path_length=150,
                   eval_env=ml1_test_envs,
                   env_spec=ml1_train_envs.spec,
                   num_tasks=50,
@@ -101,7 +102,10 @@ def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
     if _gpu is not None:
         set_gpu_mode(True, _gpu)
     mtsac.to()
-    runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
+    runner.setup(algo=mtsac,
+                 env=ml1_train_envs,
+                 sampler_cls=LocalSampler,
+                 n_workers=1)
     runner.train(n_epochs=epochs, batch_size=batch_size)
diff --git a/examples/torch/mtsac_metaworld_mt10.py b/examples/torch/mtsac_metaworld_mt10.py
index 60646e4ff8..1d548c4cfd 100755
--- a/examples/torch/mtsac_metaworld_mt10.py
+++ b/examples/torch/mtsac_metaworld_mt10.py
@@ -61,7 +61,6 @@ def mtsac_metaworld_mt10(ctxt=None, seed=1, _gpu=None):
         min_std=np.exp(-20.),
         max_std=np.exp(2.),
     )
-
     qf1 = ContinuousMLPQFunction(env_spec=mt10_train_envs.spec,
                                  hidden_sizes=[400, 400, 400],
                                  hidden_nonlinearity=F.relu)
@@ -72,17 +71,16 @@ def mtsac_metaworld_mt10(ctxt=None, seed=1, _gpu=None):
     replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
-    timesteps = 20000000
+    timesteps = int(20e6)
     batch_size = int(150 * mt10_train_envs.num_tasks)
-    num_evaluation_points = 500
-    epochs = timesteps // batch_size
-    epoch_cycles = epochs // num_evaluation_points
-    epochs = epochs // epoch_cycles
+    epochs = 250
+    epoch_cycles = timesteps // (epochs * batch_size)
     mtsac = MTSAC(policy=policy,
                   qf1=qf1,
                   qf2=qf2,
                   gradient_steps_per_itr=150,
                   max_path_length=150,
+                  max_eval_path_length=150,
                   eval_env=mt10_test_envs,
                   env_spec=mt10_train_envs.spec,
                   num_tasks=10,
@@ -95,7 +93,10 @@ def mtsac_metaworld_mt10(ctxt=None, seed=1, _gpu=None):
     if _gpu is not None:
         set_gpu_mode(True, _gpu)
     mtsac.to()
-    runner.setup(algo=mtsac, env=mt10_train_envs, sampler_cls=LocalSampler)
+    runner.setup(algo=mtsac,
+                 env=mt10_train_envs,
+                 sampler_cls=LocalSampler,
+                 n_workers=1)
     runner.train(n_epochs=epochs, batch_size=batch_size)
diff --git a/examples/torch/mtsac_metaworld_mt50.py b/examples/torch/mtsac_metaworld_mt50.py
index 224c08ad93..8b965caf34 100755
--- a/examples/torch/mtsac_metaworld_mt50.py
+++ b/examples/torch/mtsac_metaworld_mt50.py
@@ -84,10 +84,11 @@ def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
                   qf1=qf1,
                   qf2=qf2,
                   gradient_steps_per_itr=150,
-                  max_path_length=250,
+                  max_path_length=150,
+                  max_eval_path_length=150,
                   eval_env=mt50_test_envs,
                   env_spec=mt50_train_envs.spec,
-                  num_tasks=10,
+                  num_tasks=50,
                   steps_per_epoch=epoch_cycles,
                   replay_buffer=replay_buffer,
                   min_buffer_size=7500,
@@ -96,7 +97,10 @@ def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
                   buffer_batch_size=6400)
     set_gpu_mode(use_gpu, _gpu)
     mtsac.to()
-    runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
+    runner.setup(algo=mtsac,
+                 env=mt50_train_envs,
+                 sampler_cls=LocalSampler,
+                 n_workers=1)
     runner.train(n_epochs=epochs, batch_size=batch_size)
diff --git a/examples/torch/sac_half_cheetah_batch.py b/examples/torch/sac_half_cheetah_batch.py
index 4a58cd9f43..7c6b160989 100755
--- a/examples/torch/sac_half_cheetah_batch.py
+++ b/examples/torch/sac_half_cheetah_batch.py
@@ -56,7 +56,8 @@ def sac_half_cheetah_batch(ctxt=None, seed=1):
               qf1=qf1,
               qf2=qf2,
              gradient_steps_per_itr=1000,
-             max_path_length=500,
+             max_path_length=1000,
+             max_eval_path_length=1000,
              replay_buffer=replay_buffer,
              min_buffer_size=1e4,
              target_update_tau=5e-3,
diff --git a/src/garage/envs/garage_env.py b/src/garage/envs/garage_env.py
index d55b631173..d8f69121b7 100644
--- a/src/garage/envs/garage_env.py
+++ b/src/garage/envs/garage_env.py
@@ -161,8 +161,9 @@ def step(self, action):
         # will be saved inside env_infos as
         # 'GarageEnv.TimeLimitTerminated'
         if 'TimeLimit.truncated' in info:
-            info['GarageEnv.TimeLimitTerminated'] = done  # done = True always
-            done = not info['TimeLimit.truncated']
+            info['GarageEnv.TimeLimitTerminated'] = info['TimeLimit.truncated']
+            if info['TimeLimit.truncated']:
+                done = False
         else:
             info['TimeLimit.truncated'] = False
             info['GarageEnv.TimeLimitTerminated'] = False
diff --git a/src/garage/torch/algos/mtsac.py b/src/garage/torch/algos/mtsac.py
index a25318be8c..ca23153ced 100644
--- a/src/garage/torch/algos/mtsac.py
+++ b/src/garage/torch/algos/mtsac.py
@@ -34,6 +34,8 @@ class MTSAC(SAC):
             by calling env.spec.
         num_tasks (int): The number of tasks being learned.
         max_path_length (int): The max path length of the algorithm.
+        max_eval_path_length (int or None): Maximum length of paths used for
+            off-policy evaluation. If None, defaults to `max_path_length`.
         eval_env (garage.envs.GarageEnv): The environment used for collecting
             evaluation trajectories.
         gradient_steps_per_itr (int): Number of optimization steps that should
@@ -78,7 +80,9 @@ def __init__(
             replay_buffer,
             env_spec,
             num_tasks,
+            *,  # Everything after this is keyword-only.
             max_path_length,
+            max_eval_path_length=None,
             eval_env,
             gradient_steps_per_itr,
             fixed_alpha=None,
@@ -93,8 +97,9 @@ def __init__(
             reward_scale=1.0,
             optimizer=torch.optim.Adam,
             steps_per_epoch=1,
-            num_evaluation_trajectories=5,
-    ):
+            # yapf: disable
+            num_evaluation_trajectories=5):
+        # yapf: enable
 
         super().__init__(
             policy=policy,
@@ -103,6 +108,7 @@ def __init__(
             replay_buffer=replay_buffer,
             env_spec=env_spec,
             max_path_length=max_path_length,
+            max_eval_path_length=max_eval_path_length,
             gradient_steps_per_itr=gradient_steps_per_itr,
             fixed_alpha=fixed_alpha,
             target_entropy=target_entropy,
@@ -147,6 +153,11 @@ def _get_log_alpha(self, samples_data):
                 the replay buffer. It should have the keys 'observation',
                 'action', 'reward', 'terminal', and 'next_observations'.
 
+        Raises:
+            ValueError: If the number of tasks, num_tasks, passed to
+                this algorithm doesn't match the length of the task
+                one-hot id in the observation vector.
+
         Note:
             samples_data's entries should be torch.Tensor's with the
             following shapes:
@@ -163,6 +174,13 @@ def _get_log_alpha(self, samples_data):
         obs = samples_data['observation']
         log_alpha = self._log_alpha
         one_hots = obs[:, -self._num_tasks:]
+        if (log_alpha.shape[0] != one_hots.shape[1]
+                or one_hots.shape[1] != self._num_tasks
+                or log_alpha.shape[0] != self._num_tasks):
+            raise ValueError(
+                'The number of tasks in the environment does '
+                'not match self._num_tasks. Are you sure that you passed '
+                'the correct number of tasks?')
         ret = torch.mm(one_hots, log_alpha.unsqueeze(0).t()).squeeze()
         return ret
@@ -186,6 +204,7 @@ def _evaluate_policy(self, epoch):
                 obtain_evaluation_samples(
                     self.policy,
                     self._eval_env,
+                    max_path_length=self._max_eval_path_length,
                     num_trajs=self._num_evaluation_trajectories))
         eval_trajs = TrajectoryBatch.concatenate(*eval_trajs)
         last_return = log_multitask_performance(epoch, eval_trajs,
diff --git a/src/garage/torch/algos/sac.py b/src/garage/torch/algos/sac.py
index 7eaa2f0530..cbb04c1d88 100644
--- a/src/garage/torch/algos/sac.py
+++ b/src/garage/torch/algos/sac.py
@@ -44,11 +44,9 @@ class SAC(RLAlgorithm):
         env_spec (garage.envs.env_spec.EnvSpec): The env_spec attribute of the
             environment that the agent is being trained in. Usually accessable
             by calling env.spec.
-        max_path_length (int): Max path length of the algorithm.
+        max_path_length (int): Max path length of the environment.
         max_eval_path_length (int or None): Maximum length of paths used for
             off-policy evaluation. If None, defaults to `max_path_length`.
         gradient_steps_per_itr (int): Number of optimization steps that should
-        max_path_length(int): Max path length of the environment.
-        gradient_steps_per_itr(int): Number of optimization steps that should
             occur before the training step is over and a new batch of
             transitions is collected by the sampler.
@@ -129,7 +128,7 @@ def __init__(
         self._discount = discount
         self._reward_scale = reward_scale
         self.max_path_length = max_path_length
-        self._max_eval_path_length = max_eval_path_length
+        self._max_eval_path_length = (max_eval_path_length or max_path_length)
         # used by OffPolicyVectorizedSampler
         self.policy = policy
@@ -461,6 +460,7 @@ def _evaluate_policy(self, epoch):
         eval_trajectories = obtain_evaluation_samples(
             self.policy,
             self._eval_env,
+            max_path_length=self._max_eval_path_length,
             num_trajs=self._num_evaluation_trajectories)
         last_return = log_performance(epoch, eval_trajectories,
diff --git a/tests/garage/envs/test_garage_env.py b/tests/garage/envs/test_garage_env.py
index b87bc0f071..c026ae0e91 100644
--- a/tests/garage/envs/test_garage_env.py
+++ b/tests/garage/envs/test_garage_env.py
@@ -1,6 +1,10 @@
+from gym.wrappers import TimeLimit
+import numpy as np
 import pytest
 
 from garage.envs import EnvSpec, GarageEnv
+from garage.envs.grid_world_env import GridWorldEnv
+from garage.np.policies import ScriptedPolicy
 
 
 class TestGarageEnv:
@@ -32,3 +36,55 @@ def test_time_limit_env(self):
             garage_env.spec.action_space.sample())
         assert not done and info['TimeLimit.truncated']
         assert info['GarageEnv.TimeLimitTerminated']
+
+
+def test_garage_env_idempotent():
+    # test if garage env can wrap itself
+    env_no_wrap = GridWorldEnv(desc='4x4')
+    env_single_wrap = GarageEnv(GridWorldEnv(desc='4x4'))
+    env_double_wrap = GarageEnv(GarageEnv(GridWorldEnv(desc='4x4')))
+
+    policy = ScriptedPolicy(
+        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
+    obs_nw = env_no_wrap.reset()
+    obs_sw = env_single_wrap.reset()
+    obs_dw = env_double_wrap.reset()
+
+    for _ in range(16):
+        assert np.all(np.equal(obs_nw, obs_sw))
+        assert np.all(np.equal(obs_nw, obs_dw))
+        assert np.all(np.equal(obs_sw, obs_dw))
+        step_nw = env_no_wrap.step(policy.get_action(obs_nw)[0])
+        step_sw = env_single_wrap.step(policy.get_action(obs_sw)[0])
+        step_dw = env_double_wrap.step(policy.get_action(obs_dw)[0])
+        obs_nw = step_nw[0]
+        obs_sw = step_sw[0]
+        obs_dw = step_dw[0]
+        # test that single wrapped and double wrapped envs return the same
+        # values
+        assert np.all(np.equal(step_sw, step_dw))
+
+
+def test_garage_env_idempotent_time_limit():
+    # test if garage env can wrap itself if environments
+    # are wrapped with timelimits
+    env_single_wrap = GarageEnv(
+        TimeLimit(GridWorldEnv(desc='4x4'), max_episode_steps=16))
+    env_double_wrap = GarageEnv(
+        GarageEnv(TimeLimit(GridWorldEnv(desc='4x4'), max_episode_steps=16)))
+    # purposefully greater than the max path length to expose
+    # time limit truncations
+    num_steps = 20
+    policy = ScriptedPolicy(
+        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
+    obs_sw = env_single_wrap.reset()
+    obs_dw = env_double_wrap.reset()
+    assert np.all(np.equal(obs_sw, obs_dw))
+    for _ in range(num_steps):
+        step_sw = env_single_wrap.step(policy.get_action(obs_sw)[0])
+        step_dw = env_double_wrap.step(policy.get_action(obs_dw)[0])
+        obs_sw = step_sw[0]
+        obs_dw = step_dw[0]
+        # test that single wrapped and double wrapped envs return the same
+        # values
+        assert np.all(np.equal(step_sw, step_dw))
diff --git a/tests/garage/torch/algos/test_mtsac.py b/tests/garage/torch/algos/test_mtsac.py
index 7069fec447..e6eda152f9 100644
--- a/tests/garage/torch/algos/test_mtsac.py
+++ b/tests/garage/torch/algos/test_mtsac.py
@@ -69,6 +69,63 @@ def test_mtsac_get_log_alpha(monkeypatch):
     assert log_alpha.size() == torch.Size([mtsac._buffer_batch_size])
 
 
+@pytest.mark.mujoco
+def test_mtsac_get_log_alpha_incorrect_num_tasks(monkeypatch):
+    """Check that an exception is raised if num_tasks is incorrect.
+
+    If the num_tasks passed to MTSAC does not match the number of tasks in
+    the environment, then the algorithm should raise a ValueError.
+
+    MTSAC uses disentangled alphas, meaning it learns one alpha per task,
+    so the length of the log_alpha tensor must match num_tasks.
+
+    """
+    env_names = ['CartPole-v0', 'CartPole-v1']
+    task_envs = [GarageEnv(env_name=name) for name in env_names]
+    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
+    deterministic.set_seed(0)
+    policy = TanhGaussianMLPPolicy(
+        env_spec=env.spec,
+        hidden_sizes=[1, 1],
+        hidden_nonlinearity=torch.nn.ReLU,
+        output_nonlinearity=None,
+        min_std=np.exp(-20.),
+        max_std=np.exp(2.),
+    )
+
+    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
+                                 hidden_sizes=[1, 1],
+                                 hidden_nonlinearity=F.relu)
+
+    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
+                                 hidden_sizes=[1, 1],
+                                 hidden_nonlinearity=F.relu)
+    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
+
+    buffer_batch_size = 2
+    mtsac = MTSAC(policy=policy,
+                  qf1=qf1,
+                  qf2=qf2,
+                  gradient_steps_per_itr=150,
+                  max_path_length=150,
+                  eval_env=env,
+                  env_spec=env.spec,
+                  num_tasks=4,
+                  steps_per_epoch=5,
+                  replay_buffer=replay_buffer,
+                  min_buffer_size=1e3,
+                  target_update_tau=5e-3,
+                  discount=0.99,
+                  buffer_batch_size=buffer_batch_size)
+    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))
+    error_string = ('The number of tasks in the environment does '
+                    'not match self._num_tasks. Are you sure that you passed '
+                    'the correct number of tasks?')
+    obs = torch.Tensor([env.reset()] * buffer_batch_size)
+    with pytest.raises(ValueError, match=error_string):
+        mtsac._get_log_alpha(dict(observation=obs))
+
+
 @pytest.mark.mujoco
 def test_mtsac_inverted_double_pendulum():
     """Performance regression test of MTSAC on 2 InvDoublePendulum envs."""