pytorch · vmoens · Dec 14, 2022 · Dec 14, 2022 · Dec 14, 2022 · Dec 14, 2022
diff --git a/examples/a2c/a2c.py b/examples/a2c/a2c.py
@@ -143,19 +143,16 @@ def main(cfg: "DictConfig"):  # noqa: F821
         cfg=cfg,
     )
 
-    if not cfg.advantage_in_loss:
-        critic_model = model.get_value_operator()
-        advantage = TDEstimate(
-            cfg.gamma,
-            value_network=critic_model,
-            average_rewards=True,
-            gradient_mode=False,
-        )
-        advantage = advantage.to(device)
-        trainer.register_op(
-            "process_optim_batch",
-            advantage,
-        )
+    critic_model = model.get_value_operator()
+    advantage = TDEstimate(
+        cfg.gamma,
+        value_network=critic_model,
+        average_rewards=True,
+    )
+    trainer.register_op(
+        "process_optim_batch",
+        advantage,
+    )
 
     final_seed = collector.set_seed(cfg.seed)
     print(f"init seed: {cfg.seed}, final seed: {final_seed}")

diff --git a/examples/a2c/config.yaml b/examples/a2c/config.yaml
@@ -26,7 +26,6 @@ gamma: 0.99
 entropy_coef: 0.01  # Entropy factor for the A2C loss
 critic_coef: 0.25  # Critic factor for the A2C loss
 critic_loss_function: l2  # loss function for the value network. Either one of l1, l2 or smooth_l1 (default).
-advantage_in_loss: False  # if True, the advantage is computed on the sub-batch
 
 # Trainer
 optim_steps_per_batch: 1  # Number of optimization steps in between two collection of data.

diff --git a/examples/ppo/config.yaml b/examples/ppo/config.yaml
@@ -28,4 +28,3 @@ loss_function: smooth_l1
 batch_transform: 1
 entropy_coef: 0.1
 default_policy_scale: 1.0
-advantage_in_loss: 1
diff --git a/examples/ppo/ppo.py b/examples/ppo/ppo.py
@@ -169,23 +169,21 @@ def main(cfg: "DictConfig"):  # noqa: F821
     if cfg.loss == "kl":
         trainer.register_op("pre_optim_steps", loss_module.reset)
 
-    if not cfg.advantage_in_loss:
-        critic_model = model.get_value_operator()
-        advantage = GAE(
-            cfg.gamma,
-            cfg.lmbda,
-            value_network=critic_model,
-            average_rewards=True,
-            gradient_mode=False,
-        )
-        trainer.register_op(
-            "process_optim_batch",
-            advantage,
-        )
-        trainer._process_optim_batch_ops = [
-            trainer._process_optim_batch_ops[-1],
-            *trainer._process_optim_batch_ops[:-1],
-        ]
+    critic_model = model.get_value_operator()
+    advantage = GAE(
+        cfg.gamma,
+        cfg.lmbda,
+        value_network=critic_model,
+        average_gae=True,
+    )
+    trainer.register_op(
+        "process_optim_batch",
+        lambda tensordict: advantage(tensordict.to(device)),
+    )
+    trainer._process_optim_batch_ops = [
+        trainer._process_optim_batch_ops[-1],
+        *trainer._process_optim_batch_ops[:-1],
+    ]
 
     final_seed = collector.set_seed(cfg.seed)
     print(f"init seed: {cfg.seed}, final seed: {final_seed}")