pytorch · vmoens · Mar 19, 2024 · Mar 18, 2024 · Mar 19, 2024
diff --git a/torchrl/objectives/ppo.py b/torchrl/objectives/ppo.py
@@ -856,12 +856,13 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
         gain1 = log_weight.exp() * advantage
 
         log_weight_clip = log_weight.clamp(*self._clip_bounds)
+        clip_fraction = (log_weight_clip != log_weight).to(log_weight.dtype).mean()
         ratio = log_weight_clip.exp()
         gain2 = ratio * advantage
 
         gain = torch.stack([gain1, gain2], -1).min(dim=-1)[0]
         td_out = TensorDict({"loss_objective": -gain}, batch_size=[])
-        td_out.set("clip_fraction", ratio.abs().detach())
+        td_out.set("clip_fraction", clip_fraction)
 
         if self.entropy_bonus:
             entropy = self.get_entropy_bonus(dist)

diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py
@@ -530,18 +530,16 @@ def _clip_value_loss(
     and returns the most pessimistic value prediction between clipped and non-clipped options.
     It also computes the clip fraction.
     """
-    state_value_clipped = old_state_value + (state_value - old_state_value).clamp(
-        -clip_value, clip_value
-    )
+    pre_clipped = state_value - old_state_value
+    clipped = pre_clipped.clamp(-clip_value, clip_value)
+    with torch.no_grad():
+        clip_fraction = (pre_clipped != clipped).to(state_value.dtype).mean()
+    state_value_clipped = old_state_value + clipped
     loss_value_clipped = distance_loss(
         target_return,
         state_value_clipped,
         loss_function=loss_critic_type,
     )
     # Chose the most pessimistic value prediction between clipped and non-clipped
     loss_value = torch.max(loss_value, loss_value_clipped)
-    with torch.no_grad():
-        clip_fraction = (
-            (state_value / old_state_value).clamp(1 - clip_value, 1 + clip_value).abs()
-        )
     return loss_value, clip_fraction