Backport Various SAC and MTSAC Bug Fixes (#2029)
Backport #1905, #1975, and #1908 to fix
max_eval_path_length not being used by
MTSAC and SAC, and to add a check for an
incorrect num_tasks being set in MTSAC.
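
For reference, below is a minimal, self-contained sketch of the num_tasks
consistency check that the mtsac.py diff in this commit adds to
MTSAC._get_log_alpha. The helper name check_task_one_hots and the example
tensors are illustrative only; the real check lives inside the class and
compares against self._num_tasks.

    import torch


    def check_task_one_hots(obs, log_alpha, num_tasks):
        """Validate the trailing one-hot task ids against num_tasks and
        return the per-sample log-alpha (one entropy coefficient per task)."""
        one_hots = obs[:, -num_tasks:]  # task one-hot appended to each obs
        if (log_alpha.shape[0] != one_hots.shape[1]
                or one_hots.shape[1] != num_tasks
                or log_alpha.shape[0] != num_tasks):
            raise ValueError('The number of tasks in the environment does '
                             'not match num_tasks.')
        return torch.mm(one_hots, log_alpha.unsqueeze(0).t()).squeeze()


    # Two observations whose last three dims are one-hot ids for 3 tasks.
    obs = torch.tensor([[0.1, 0.2, 1., 0., 0.],
                        [0.3, 0.4, 0., 1., 0.]])
    log_alpha = torch.zeros(3)
    check_task_one_hots(obs, log_alpha, num_tasks=3)   # ok, returns shape (2,)
    # check_task_one_hots(obs, log_alpha, num_tasks=4) raises ValueError.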

TimeLimit.truncated modified only when necessary

This issue occurs when GarageEnvs are nested, or
when TimeLimit.truncated = False is already present
in the environment's info keys.
Previously, our TimeLimit.truncated logic was
written with the assumption that the key was only
added when a time-limit truncation actually
occurred. If an environment already has
TimeLimit.truncated = False in its keys, the
previous behavior was to set done = True, which is
incorrect.

That was causing performance degradation
in MTSAC and MTPPO/TRPO.

Now done is left as the environment reported it
unless TimeLimit.truncated is actually True (a
genuine truncation), in which case done is cleared;
a key that is merely present with a value of False
no longer forces done to True.
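
A minimal sketch of the corrected bookkeeping, mirroring the change to
GarageEnv.step() in src/garage/envs/garage_env.py below. The standalone
helper handle_time_limit is illustrative; in the diff this logic runs inline
on the done/info values returned by the wrapped environment.

    def handle_time_limit(done, info):
        """Clear done only when the wrapper actually truncated the episode."""
        if 'TimeLimit.truncated' in info:
            # The key may already be present with a False value (e.g. nested
            # GarageEnvs); copy it instead of assuming a truncation happened.
            info['GarageEnv.TimeLimitTerminated'] = info['TimeLimit.truncated']
            if info['TimeLimit.truncated']:
                done = False
        else:
            info['TimeLimit.truncated'] = False
            info['GarageEnv.TimeLimitTerminated'] = False
        return done, info


    # A step from a nested GarageEnv already carries the key with value False;
    # previously done was forced to True here, now it is left untouched.
    handle_time_limit(False, {'TimeLimit.truncated': False})
    # -> (False, {'TimeLimit.truncated': False,
    #             'GarageEnv.TimeLimitTerminated': False})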
avnishn committed Sep 11, 2020
1 parent d8f0235 commit 5ce8850
Showing 9 changed files with 160 additions and 19 deletions.
6 changes: 5 additions & 1 deletion examples/torch/mtsac_metaworld_ml1_pick_place.py
@@ -89,6 +89,7 @@ def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
qf2=qf2,
gradient_steps_per_itr=150,
max_path_length=150,
max_eval_path_length=150,
eval_env=ml1_test_envs,
env_spec=ml1_train_envs.spec,
num_tasks=50,
@@ -101,7 +102,10 @@ def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
if _gpu is not None:
set_gpu_mode(True, _gpu)
mtsac.to()
runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
runner.setup(algo=mtsac,
env=ml1_train_envs,
sampler_cls=LocalSampler,
n_workers=1)
runner.train(n_epochs=epochs, batch_size=batch_size)


15 changes: 8 additions & 7 deletions examples/torch/mtsac_metaworld_mt10.py
@@ -61,7 +61,6 @@ def mtsac_metaworld_mt10(ctxt=None, seed=1, _gpu=None):
min_std=np.exp(-20.),
max_std=np.exp(2.),
)

qf1 = ContinuousMLPQFunction(env_spec=mt10_train_envs.spec,
hidden_sizes=[400, 400, 400],
hidden_nonlinearity=F.relu)
@@ -72,17 +71,16 @@ def mtsac_metaworld_mt10(ctxt=None, seed=1, _gpu=None):

replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

timesteps = 20000000
timesteps = int(20e6)
batch_size = int(150 * mt10_train_envs.num_tasks)
num_evaluation_points = 500
epochs = timesteps // batch_size
epoch_cycles = epochs // num_evaluation_points
epochs = epochs // epoch_cycles
epochs = 250
epoch_cycles = timesteps // (epochs * batch_size)
mtsac = MTSAC(policy=policy,
qf1=qf1,
qf2=qf2,
gradient_steps_per_itr=150,
max_path_length=150,
max_eval_path_length=150,
eval_env=mt10_test_envs,
env_spec=mt10_train_envs.spec,
num_tasks=10,
@@ -95,7 +93,10 @@ def mtsac_metaworld_mt10(ctxt=None, seed=1, _gpu=None):
if _gpu is not None:
set_gpu_mode(True, _gpu)
mtsac.to()
runner.setup(algo=mtsac, env=mt10_train_envs, sampler_cls=LocalSampler)
runner.setup(algo=mtsac,
env=mt10_train_envs,
sampler_cls=LocalSampler,
n_workers=1)
runner.train(n_epochs=epochs, batch_size=batch_size)


10 changes: 7 additions & 3 deletions examples/torch/mtsac_metaworld_mt50.py
@@ -84,10 +84,11 @@ def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
qf1=qf1,
qf2=qf2,
gradient_steps_per_itr=150,
max_path_length=250,
max_path_length=150,
max_eval_path_length=150,
eval_env=mt50_test_envs,
env_spec=mt50_train_envs.spec,
num_tasks=10,
num_tasks=50,
steps_per_epoch=epoch_cycles,
replay_buffer=replay_buffer,
min_buffer_size=7500,
@@ -96,7 +97,10 @@ def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
buffer_batch_size=6400)
set_gpu_mode(use_gpu, _gpu)
mtsac.to()
runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
runner.setup(algo=mtsac,
env=mt50_train_envs,
sampler_cls=LocalSampler,
n_workers=1)
runner.train(n_epochs=epochs, batch_size=batch_size)


3 changes: 2 additions & 1 deletion examples/torch/sac_half_cheetah_batch.py
@@ -56,7 +56,8 @@ def sac_half_cheetah_batch(ctxt=None, seed=1):
qf1=qf1,
qf2=qf2,
gradient_steps_per_itr=1000,
max_path_length=500,
max_path_length=1000,
max_eval_path_length=1000,
replay_buffer=replay_buffer,
min_buffer_size=1e4,
target_update_tau=5e-3,
5 changes: 3 additions & 2 deletions src/garage/envs/garage_env.py
@@ -161,8 +161,9 @@ def step(self, action):
# will be saved inside env_infos as
# 'GarageEnv.TimeLimitTerminated'
if 'TimeLimit.truncated' in info:
info['GarageEnv.TimeLimitTerminated'] = done # done = True always
done = not info['TimeLimit.truncated']
info['GarageEnv.TimeLimitTerminated'] = info['TimeLimit.truncated']
if info['TimeLimit.truncated']:
done = False
else:
info['TimeLimit.truncated'] = False
info['GarageEnv.TimeLimitTerminated'] = False
23 changes: 21 additions & 2 deletions src/garage/torch/algos/mtsac.py
@@ -34,6 +34,8 @@ class MTSAC(SAC):
by calling env.spec.
num_tasks (int): The number of tasks being learned.
max_path_length (int): The max path length of the algorithm.
max_eval_path_length (int or None): Maximum length of paths used for
off-policy evaluation. If None, defaults to `max_path_length`.
eval_env (garage.envs.GarageEnv): The environment used for collecting
evaluation trajectories.
gradient_steps_per_itr (int): Number of optimization steps that should
@@ -78,7 +80,9 @@ def __init__(
replay_buffer,
env_spec,
num_tasks,
*, # Everything after this is numbers.
max_path_length,
max_eval_path_length=None,
eval_env,
gradient_steps_per_itr,
fixed_alpha=None,
@@ -93,8 +97,9 @@ def __init__(
reward_scale=1.0,
optimizer=torch.optim.Adam,
steps_per_epoch=1,
num_evaluation_trajectories=5,
):
# yapf: disable
num_evaluation_trajectories=5):
# yapf: enable

super().__init__(
policy=policy,
@@ -103,6 +108,7 @@ def __init__(
replay_buffer=replay_buffer,
env_spec=env_spec,
max_path_length=max_path_length,
max_eval_path_length=max_eval_path_length,
gradient_steps_per_itr=gradient_steps_per_itr,
fixed_alpha=fixed_alpha,
target_entropy=target_entropy,
@@ -147,6 +153,11 @@ def _get_log_alpha(self, samples_data):
the replay buffer. It should have the keys 'observation',
'action', 'reward', 'terminal', and 'next_observations'.
Raises:
ValueError: If the number of tasks, num_tasks passed to
this algorithm doesn't match the length of the task
one-hot id in the observation vector.
Note:
samples_data's entries should be torch.Tensor's with the following
shapes:
@@ -163,6 +174,13 @@ def _get_log_alpha(self, samples_data):
obs = samples_data['observation']
log_alpha = self._log_alpha
one_hots = obs[:, -self._num_tasks:]
if (log_alpha.shape[0] != one_hots.shape[1]
or one_hots.shape[1] != self._num_tasks
or log_alpha.shape[0] != self._num_tasks):
raise ValueError(
'The number of tasks in the environment does '
'not match self._num_tasks. Are you sure that you passed '
'The correct number of tasks?')
ret = torch.mm(one_hots, log_alpha.unsqueeze(0).t()).squeeze()
return ret

@@ -186,6 +204,7 @@ def _evaluate_policy(self, epoch):
obtain_evaluation_samples(
self.policy,
self._eval_env,
max_path_length=self._max_eval_path_length,
num_trajs=self._num_evaluation_trajectories))
eval_trajs = TrajectoryBatch.concatenate(*eval_trajs)
last_return = log_multitask_performance(epoch, eval_trajs,
6 changes: 3 additions & 3 deletions src/garage/torch/algos/sac.py
@@ -44,11 +44,10 @@ class SAC(RLAlgorithm):
env_spec (garage.envs.env_spec.EnvSpec): The env_spec attribute of the
environment that the agent is being trained in. Usually accessable
by calling env.spec.
max_path_length (int): Max path length of the algorithm.
max_path_length (int): Max path length of the environment.
max_eval_path_length (int or None): Maximum length of paths used for
off-policy evaluation. If None, defaults to `max_path_length`.
gradient_steps_per_itr (int): Number of optimization steps that should
max_path_length(int): Max path length of the environment.
gradient_steps_per_itr(int): Number of optimization steps that should
occur before the training step is over and a new batch of
transitions is collected by the sampler.
@@ -129,7 +128,7 @@ def __init__(
self._discount = discount
self._reward_scale = reward_scale
self.max_path_length = max_path_length
self._max_eval_path_length = max_eval_path_length
self._max_eval_path_length = (max_eval_path_length or max_path_length)

# used by OffPolicyVectorizedSampler
self.policy = policy
@@ -461,6 +460,7 @@ def _evaluate_policy(self, epoch):
eval_trajectories = obtain_evaluation_samples(
self.policy,
self._eval_env,
max_path_length=self._max_eval_path_length,
num_trajs=self._num_evaluation_trajectories)
last_return = log_performance(epoch,
eval_trajectories,
56 changes: 56 additions & 0 deletions tests/garage/envs/test_garage_env.py
@@ -1,6 +1,10 @@
from gym.wrappers import TimeLimit
import numpy as np
import pytest

from garage.envs import EnvSpec, GarageEnv
from garage.envs.grid_world_env import GridWorldEnv
from garage.np.policies import ScriptedPolicy


class TestGarageEnv:
@@ -32,3 +36,55 @@ def test_time_limit_env(self):
garage_env.spec.action_space.sample())
assert not done and info['TimeLimit.truncated']
assert info['GarageEnv.TimeLimitTerminated']


def test_garage_env_idempotent():
# test if garage env can wrap itself
env_no_wrap = GridWorldEnv(desc='4x4')
env_single_wrap = GarageEnv(GridWorldEnv(desc='4x4'))
env_double_wrap = GarageEnv(GarageEnv(GridWorldEnv(desc='4x4')))

policy = ScriptedPolicy(
scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
obs_nw = env_no_wrap.reset()
obs_sw = env_single_wrap.reset()
obs_dw = env_double_wrap.reset()

for _ in range(16):
assert np.all(np.equal(obs_nw, obs_sw))
assert np.all(np.equal(obs_nw, obs_dw))
assert np.all(np.equal(obs_sw, obs_dw))
step_nw = env_no_wrap.step(policy.get_action(obs_nw)[0])
step_sw = env_single_wrap.step(policy.get_action(obs_sw)[0])
step_dw = env_double_wrap.step(policy.get_action(obs_dw)[0])
obs_nw = step_nw[0]
obs_sw = step_sw[0]
obs_dw = step_dw[0]
# test that single wrapped and double wrapped envs return the same
# values
assert np.all(np.equal(step_sw, step_dw))


def test_garage_env_idempotent_time_limit():
# test if garage env can wrap itself if environments
# are wrapped with timelimits
env_single_wrap = GarageEnv(
TimeLimit(GridWorldEnv(desc='4x4'), max_episode_steps=16))
env_double_wrap = GarageEnv(
GarageEnv(TimeLimit(GridWorldEnv(desc='4x4'), max_episode_steps=16)))
# purposefully greater than the max path length to expose
# time limit truncations
num_steps = 20
policy = ScriptedPolicy(
scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
obs_sw = env_single_wrap.reset()
obs_dw = env_double_wrap.reset()
assert np.all(np.equal(obs_sw, obs_dw))
for _ in range(num_steps):
step_sw = env_single_wrap.step(policy.get_action(obs_sw)[0])
step_dw = env_double_wrap.step(policy.get_action(obs_dw)[0])
obs_sw = step_sw[0]
obs_dw = step_dw[0]
# test that single wrapped and double wrapped envs return the same
# values
assert np.all(np.equal(step_sw, step_dw))
55 changes: 55 additions & 0 deletions tests/garage/torch/algos/test_mtsac.py
@@ -69,6 +69,61 @@ def test_mtsac_get_log_alpha(monkeypatch):
assert log_alpha.size() == torch.Size([mtsac._buffer_batch_size])


@pytest.mark.mujoco
def test_mtsac_get_log_alpha_incorrect_num_tasks(monkeypatch):
"""Check that if the num_tasks passed does not match the number of tasks
in the environment, then the algorithm should raise an exception.
MTSAC uses disentangled alphas, meaning one alpha per task.
"""
env_names = ['CartPole-v0', 'CartPole-v1']
task_envs = [GarageEnv(env_name=name) for name in env_names]
env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
deterministic.set_seed(0)
policy = TanhGaussianMLPPolicy(
env_spec=env.spec,
hidden_sizes=[1, 1],
hidden_nonlinearity=torch.nn.ReLU,
output_nonlinearity=None,
min_std=np.exp(-20.),
max_std=np.exp(2.),
)

qf1 = ContinuousMLPQFunction(env_spec=env.spec,
hidden_sizes=[1, 1],
hidden_nonlinearity=F.relu)

qf2 = ContinuousMLPQFunction(env_spec=env.spec,
hidden_sizes=[1, 1],
hidden_nonlinearity=F.relu)
replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

buffer_batch_size = 2
mtsac = MTSAC(policy=policy,
qf1=qf1,
qf2=qf2,
gradient_steps_per_itr=150,
max_path_length=150,
eval_env=env,
env_spec=env.spec,
num_tasks=4,
steps_per_epoch=5,
replay_buffer=replay_buffer,
min_buffer_size=1e3,
target_update_tau=5e-3,
discount=0.99,
buffer_batch_size=buffer_batch_size)
monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))
error_string = ('The number of tasks in the environment does '
'not match self._num_tasks. Are you sure that you passed '
'The correct number of tasks?')
obs = torch.Tensor([env.reset()] * buffer_batch_size)
with pytest.raises(ValueError, match=error_string):
mtsac._get_log_alpha(dict(observation=obs))


@pytest.mark.mujoco
def test_mtsac_inverted_double_pendulum():
"""Performance regression test of MTSAC on 2 InvDoublePendulum envs."""
