Backport Various SAC and MTSAC Bug Fixes (#2029)
Backport #1905, #1975, and #1908 to fix
max_eval_path_length not being used by
MTSAC and SAC, and to add a check for an
incorrect num_tasks being set in MTSAC.
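
For reference, below is a minimal, self-contained sketch of the num_tasks
consistency check that the mtsac.py diff in this commit adds to
MTSAC._get_log_alpha. The helper name check_task_one_hots and the example
tensors are illustrative only; the real check lives inside the class and
compares against self._num_tasks.

    import torch


    def check_task_one_hots(obs, log_alpha, num_tasks):
        """Validate the trailing one-hot task ids against num_tasks and
        return the per-sample log-alpha (one entropy coefficient per task)."""
        one_hots = obs[:, -num_tasks:]  # task one-hot appended to each obs
        if (log_alpha.shape[0] != one_hots.shape[1]
                or one_hots.shape[1] != num_tasks
                or log_alpha.shape[0] != num_tasks):
            raise ValueError('The number of tasks in the environment does '
                             'not match num_tasks.')
        return torch.mm(one_hots, log_alpha.unsqueeze(0).t()).squeeze()


    # Two observations whose last three dims are one-hot ids for 3 tasks.
    obs = torch.tensor([[0.1, 0.2, 1., 0., 0.],
                        [0.3, 0.4, 0., 1., 0.]])
    log_alpha = torch.zeros(3)
    check_task_one_hots(obs, log_alpha, num_tasks=3)   # ok, returns shape (2,)
    # check_task_one_hots(obs, log_alpha, num_tasks=4) raises ValueError.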

TimeLimit.truncated modified only when necessary

This issue occurs when GarageEnvs are nested, or
when TimeLimit.truncated = False is already present
in the environment's info keys.
Previously, our TimeLimit.truncated logic was
written with the assumption that the key was only
added when a time-limit truncation actually
occurred. If an environment already has
TimeLimit.truncated = False in its keys, the
previous behavior was to set done = True, which is
incorrect.

That was causing performance degradation
in MTSAC and MTPPO/TRPO.

Now done is left as the environment reported it
unless TimeLimit.truncated is actually True (a
genuine truncation), in which case done is cleared;
a key that is merely present with a value of False
no longer forces done to True.
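
A minimal sketch of the corrected bookkeeping, mirroring the change to
GarageEnv.step() in src/garage/envs/garage_env.py below. The standalone
helper handle_time_limit is illustrative; in the diff this logic runs inline
on the done/info values returned by the wrapped environment.

    def handle_time_limit(done, info):
        """Clear done only when the wrapper actually truncated the episode."""
        if 'TimeLimit.truncated' in info:
            # The key may already be present with a False value (e.g. nested
            # GarageEnvs); copy it instead of assuming a truncation happened.
            info['GarageEnv.TimeLimitTerminated'] = info['TimeLimit.truncated']
            if info['TimeLimit.truncated']:
                done = False
        else:
            info['TimeLimit.truncated'] = False
            info['GarageEnv.TimeLimitTerminated'] = False
        return done, info


    # A step from a nested GarageEnv already carries the key with value False;
    # previously done was forced to True here, now it is left untouched.
    handle_time_limit(False, {'TimeLimit.truncated': False})
    # -> (False, {'TimeLimit.truncated': False,
    #             'GarageEnv.TimeLimitTerminated': False})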
avnishn committed Sep 11, 2020
1 parent d8f0235 commit 5ce8850
Showing 9 changed files with 160 additions and 19 deletions.
6 changes: 5 additions & 1 deletion examples/torch/mtsac_metaworld_ml1_pick_place.py
@@ -89,6 +89,7 @@ def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
qf2=qf2,
gradient_steps_per_itr=150,
max_path_length=150,
max_eval_path_length=150,
eval_env=ml1_test_envs,
env_spec=ml1_train_envs.spec,
num_tasks=50,
@@ -101,7 +102,10 @@ def mtsac_metaworld_ml1_pick_place(ctxt=None, seed=1, _gpu=None):
if _gpu is not None:
set_gpu_mode(True, _gpu)
mtsac.to()
runner.setup(algo=mtsac, env=ml1_train_envs, sampler_cls=LocalSampler)
runner.setup(algo=mtsac,
env=ml1_train_envs,
sampler_cls=LocalSampler,
n_workers=1)
runner.train(n_epochs=epochs, batch_size=batch_size)


15 changes: 8 additions & 7 deletions examples/torch/mtsac_metaworld_mt10.py
@@ -61,7 +61,6 @@ def mtsac_metaworld_mt10(ctxt=None, seed=1, _gpu=None):
min_std=np.exp(-20.),
max_std=np.exp(2.),
)

qf1 = ContinuousMLPQFunction(env_spec=mt10_train_envs.spec,
hidden_sizes=[400, 400, 400],
hidden_nonlinearity=F.relu)
@@ -72,17 +71,16 @@ def mtsac_metaworld_mt10(ctxt=None, seed=1, _gpu=None):

replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

timesteps = 20000000
timesteps = int(20e6)
batch_size = int(150 * mt10_train_envs.num_tasks)
num_evaluation_points = 500
epochs = timesteps // batch_size
epoch_cycles = epochs // num_evaluation_points
epochs = epochs // epoch_cycles
epochs = 250
epoch_cycles = timesteps // (epochs * batch_size)
mtsac = MTSAC(policy=policy,
qf1=qf1,
qf2=qf2,
gradient_steps_per_itr=150,
max_path_length=150,
max_eval_path_length=150,
eval_env=mt10_test_envs,
env_spec=mt10_train_envs.spec,
num_tasks=10,
@@ -95,7 +93,10 @@ def mtsac_metaworld_mt10(ctxt=None, seed=1, _gpu=None):
if _gpu is not None:
set_gpu_mode(True, _gpu)
mtsac.to()
runner.setup(algo=mtsac, env=mt10_train_envs, sampler_cls=LocalSampler)
runner.setup(algo=mtsac,
env=mt10_train_envs,
sampler_cls=LocalSampler,
n_workers=1)
runner.train(n_epochs=epochs, batch_size=batch_size)


10 changes: 7 additions & 3 deletions examples/torch/mtsac_metaworld_mt50.py
@@ -84,10 +84,11 @@ def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
qf1=qf1,
qf2=qf2,
gradient_steps_per_itr=150,
max_path_length=250,
max_path_length=150,
max_eval_path_length=150,
eval_env=mt50_test_envs,
env_spec=mt50_train_envs.spec,
num_tasks=10,
num_tasks=50,
steps_per_epoch=epoch_cycles,
replay_buffer=replay_buffer,
min_buffer_size=7500,
@@ -96,7 +97,10 @@ def mtsac_metaworld_mt50(ctxt=None, seed=1, use_gpu=False, _gpu=0):
buffer_batch_size=6400)
set_gpu_mode(use_gpu, _gpu)
mtsac.to()
runner.setup(algo=mtsac, env=mt50_train_envs, sampler_cls=LocalSampler)
runner.setup(algo=mtsac,
env=mt50_train_envs,
sampler_cls=LocalSampler,
n_workers=1)
runner.train(n_epochs=epochs, batch_size=batch_size)


3 changes: 2 additions & 1 deletion examples/torch/sac_half_cheetah_batch.py
@@ -56,7 +56,8 @@ def sac_half_cheetah_batch(ctxt=None, seed=1):
qf1=qf1,
qf2=qf2,
gradient_steps_per_itr=1000,
max_path_length=500,
max_path_length=1000,
max_eval_path_length=1000,
replay_buffer=replay_buffer,
min_buffer_size=1e4,
target_update_tau=5e-3,
5 changes: 3 additions & 2 deletions src/garage/envs/garage_env.py
@@ -161,8 +161,9 @@ def step(self, action):
# will be saved inside env_infos as
# 'GarageEnv.TimeLimitTerminated'
if 'TimeLimit.truncated' in info:
info['GarageEnv.TimeLimitTerminated'] = done # done = True always
done = not info['TimeLimit.truncated']
info['GarageEnv.TimeLimitTerminated'] = info['TimeLimit.truncated']
if info['TimeLimit.truncated']:
done = False
else:
info['TimeLimit.truncated'] = False
info['GarageEnv.TimeLimitTerminated'] = False
23 changes: 21 additions & 2 deletions src/garage/torch/algos/mtsac.py
@@ -34,6 +34,8 @@ class MTSAC(SAC):
by calling env.spec.
num_tasks (int): The number of tasks being learned.
max_path_length (int): The max path length of the algorithm.
max_eval_path_length (int or None): Maximum length of paths used for
off-policy evaluation. If None, defaults to `max_path_length`.
eval_env (garage.envs.GarageEnv): The environment used for collecting
evaluation trajectories.
gradient_steps_per_itr (int): Number of optimization steps that should
@@ -78,7 +80,9 @@ def __init__(
replay_buffer,
env_spec,
num_tasks,
*, # Everything after this is numbers.
max_path_length,
max_eval_path_length=None,
eval_env,
gradient_steps_per_itr,
fixed_alpha=None,
@@ -93,8 +97,9 @@ def __init__(
reward_scale=1.0,
optimizer=torch.optim.Adam,
steps_per_epoch=1,
num_evaluation_trajectories=5,
):
# yapf: disable
num_evaluation_trajectories=5):
# yapf: enable

super().__init__(
policy=policy,
@@ -103,6 +108,7 @@ def __init__(
replay_buffer=replay_buffer,
env_spec=env_spec,
max_path_length=max_path_length,
max_eval_path_length=max_eval_path_length,
gradient_steps_per_itr=gradient_steps_per_itr,
fixed_alpha=fixed_alpha,
target_entropy=target_entropy,
@@ -147,6 +153,11 @@ def _get_log_alpha(self, samples_data):
the replay buffer. It should have the keys 'observation',
'action', 'reward', 'terminal', and 'next_observations'.
Raises:
ValueError: If the number of tasks, num_tasks passed to
this algorithm doesn't match the length of the task
one-hot id in the observation vector.
Note:
samples_data's entries should be torch.Tensor's with the following
shapes:
@@ -163,6 +174,13 @@ def _get_log_alpha(self, samples_data):
obs = samples_data['observation']
log_alpha = self._log_alpha
one_hots = obs[:, -self._num_tasks:]
if (log_alpha.shape[0] != one_hots.shape[1]
or one_hots.shape[1] != self._num_tasks
or log_alpha.shape[0] != self._num_tasks):
raise ValueError(
'The number of tasks in the environment does '
'not match self._num_tasks. Are you sure that you passed '
'The correct number of tasks?')
ret = torch.mm(one_hots, log_alpha.unsqueeze(0).t()).squeeze()
return ret

@@ -186,6 +204,7 @@ def _evaluate_policy(self, epoch):
obtain_evaluation_samples(
self.policy,
self._eval_env,
max_path_length=self._max_eval_path_length,
num_trajs=self._num_evaluation_trajectories))
eval_trajs = TrajectoryBatch.concatenate(*eval_trajs)
last_return = log_multitask_performance(epoch, eval_trajs,
6 changes: 3 additions & 3 deletions src/garage/torch/algos/sac.py
@@ -44,11 +44,10 @@ class SAC(RLAlgorithm):
env_spec (garage.envs.env_spec.EnvSpec): The env_spec attribute of the
environment that the agent is being trained in. Usually accessable
by calling env.spec.
max_path_length (int): Max path length of the algorithm.
max_path_length (int): Max path length of the environment.
max_eval_path_length (int or None): Maximum length of paths used for
off-policy evaluation. If None, defaults to `max_path_length`.
gradient_steps_per_itr (int): Number of optimization steps that should
max_path_length(int): Max path length of the environment.
gradient_steps_per_itr(int): Number of optimization steps that should
occur before the training step is over and a new batch of
transitions is collected by the sampler.
@@ -129,7 +128,7 @@ def __init__(
self._discount = discount
self._reward_scale = reward_scale
self.max_path_length = max_path_length
self._max_eval_path_length = max_eval_path_length
self._max_eval_path_length = (max_eval_path_length or max_path_length)

# used by OffPolicyVectorizedSampler
self.policy = policy
@@ -461,6 +460,7 @@ def _evaluate_policy(self, epoch):
eval_trajectories = obtain_evaluation_samples(
self.policy,
self._eval_env,
max_path_length=self._max_eval_path_length,
num_trajs=self._num_evaluation_trajectories)
last_return = log_performance(epoch,
eval_trajectories,
56 changes: 56 additions & 0 deletions tests/garage/envs/test_garage_env.py
@@ -1,6 +1,10 @@
from gym.wrappers import TimeLimit
import numpy as np
import pytest

from garage.envs import EnvSpec, GarageEnv
from garage.envs.grid_world_env import GridWorldEnv
from garage.np.policies import ScriptedPolicy


class TestGarageEnv:
@@ -32,3 +36,55 @@ def test_time_limit_env(self):
garage_env.spec.action_space.sample())
assert not done and info['TimeLimit.truncated']
assert info['GarageEnv.TimeLimitTerminated']


def test_garage_env_idempotent():
# test if garage env can wrap itself
env_no_wrap = GridWorldEnv(desc='4x4')
env_single_wrap = GarageEnv(GridWorldEnv(desc='4x4'))
env_double_wrap = GarageEnv(GarageEnv(GridWorldEnv(desc='4x4')))

policy = ScriptedPolicy(
scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
obs_nw = env_no_wrap.reset()
obs_sw = env_single_wrap.reset()
obs_dw = env_double_wrap.reset()

for _ in range(16):
assert np.all(np.equal(obs_nw, obs_sw))
assert np.all(np.equal(obs_nw, obs_dw))
assert np.all(np.equal(obs_sw, obs_dw))
step_nw = env_no_wrap.step(policy.get_action(obs_nw)[0])
step_sw = env_single_wrap.step(policy.get_action(obs_sw)[0])
step_dw = env_double_wrap.step(policy.get_action(obs_dw)[0])
obs_nw = step_nw[0]
obs_sw = step_sw[0]
obs_dw = step_dw[0]
# test that single wrapped and double wrapped envs return the same
# values
assert np.all(np.equal(step_sw, step_dw))


def test_garage_env_idempotent_time_limit():
# test if garage env can wrap itself if environments
# are wrapped with timelimits
env_single_wrap = GarageEnv(
TimeLimit(GridWorldEnv(desc='4x4'), max_episode_steps=16))
env_double_wrap = GarageEnv(
GarageEnv(TimeLimit(GridWorldEnv(desc='4x4'), max_episode_steps=16)))
# purposefully greater than the max path length to expose
# time limit truncations
num_steps = 20
policy = ScriptedPolicy(
scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
obs_sw = env_single_wrap.reset()
obs_dw = env_double_wrap.reset()
assert np.all(np.equal(obs_sw, obs_dw))
for _ in range(num_steps):
step_sw = env_single_wrap.step(policy.get_action(obs_sw)[0])
step_dw = env_double_wrap.step(policy.get_action(obs_dw)[0])
obs_sw = step_sw[0]
obs_dw = step_dw[0]
# test that single wrapped and double wrapped envs return the same
# values
assert np.all(np.equal(step_sw, step_dw))
55 changes: 55 additions & 0 deletions tests/garage/torch/algos/test_mtsac.py
@@ -69,6 +69,61 @@ def test_mtsac_get_log_alpha(monkeypatch):
assert log_alpha.size() == torch.Size([mtsac._buffer_batch_size])


@pytest.mark.mujoco
def test_mtsac_get_log_alpha_incorrect_num_tasks(monkeypatch):
"""Check that if the num_tasks passed does not match the number of tasks
in the environment, then the algorithm should raise an exception.
MTSAC uses disentangled alphas, meaning one alpha per task.
"""
env_names = ['CartPole-v0', 'CartPole-v1']
task_envs = [GarageEnv(env_name=name) for name in env_names]
env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
deterministic.set_seed(0)
policy = TanhGaussianMLPPolicy(
env_spec=env.spec,
hidden_sizes=[1, 1],
hidden_nonlinearity=torch.nn.ReLU,
output_nonlinearity=None,
min_std=np.exp(-20.),
max_std=np.exp(2.),
)

qf1 = ContinuousMLPQFunction(env_spec=env.spec,
hidden_sizes=[1, 1],
hidden_nonlinearity=F.relu)

qf2 = ContinuousMLPQFunction(env_spec=env.spec,
hidden_sizes=[1, 1],
hidden_nonlinearity=F.relu)
replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

buffer_batch_size = 2
mtsac = MTSAC(policy=policy,
qf1=qf1,
qf2=qf2,
gradient_steps_per_itr=150,
max_path_length=150,
eval_env=env,
env_spec=env.spec,
num_tasks=4,
steps_per_epoch=5,
replay_buffer=replay_buffer,
min_buffer_size=1e3,
target_update_tau=5e-3,
discount=0.99,
buffer_batch_size=buffer_batch_size)
monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))
error_string = ('The number of tasks in the environment does '
'not match self._num_tasks. Are you sure that you passed '
'The correct number of tasks?')
obs = torch.Tensor([env.reset()] * buffer_batch_size)
with pytest.raises(ValueError, match=error_string):
mtsac._get_log_alpha(dict(observation=obs))


@pytest.mark.mujoco
def test_mtsac_inverted_double_pendulum():
"""Performance regression test of MTSAC on 2 InvDoublePendulum envs."""
