4 files changed, +24 -6 lines changed

@@ -36,8 +36,13 @@ class FullAttention(nn.Module):
         scale (float): Scaling factor for attention scores.
         attention_dropout (float): Dropout rate for attention scores.
         output_attention (bool): Whether to output attention weights.
-        use_efficient_attention (bool): Whether to use torch's native efficient
-            scaled dot product attention implementation.
+        use_efficient_attention (bool): Whether to use PyTorch's native,
+            optimized Scaled Dot Product Attention implementation, which can
+            reduce computation time and memory consumption for longer sequences.
+            PyTorch automatically selects the optimal backend (FlashAttention-2,
+            Memory-Efficient Attention, or its own C++ implementation) based
+            on the input properties, hardware capabilities, and build
+            configuration.
     """

     def __init__(
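For context, here is a minimal sketch of what a flag like `use_efficient_attention` typically switches between inside such an attention module: an explicit softmax(QK^T / sqrt(d)) V computation versus PyTorch's `torch.nn.functional.scaled_dot_product_attention`, which picks its backend automatically. This is not the repository's actual code; the function name `attention_forward` and the tensor shapes are illustrative.

```python
import math

import torch
import torch.nn.functional as F


def attention_forward(q, k, v, use_efficient_attention=False, dropout_p=0.1):
    """Sketch only: q, k, v are (batch, heads, seq_len, head_dim) tensors."""
    if use_efficient_attention:
        # PyTorch selects the backend (FlashAttention-2, memory-efficient
        # attention, or the C++ math fallback) based on dtypes, shapes,
        # hardware, and how the binary was built.
        return F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
    # Explicit reference implementation of scaled dot-product attention.
    scale = 1.0 / math.sqrt(q.size(-1))
    scores = torch.matmul(q, k.transpose(-2, -1)) * scale
    weights = F.dropout(F.softmax(scores, dim=-1), p=dropout_p)
    return torch.matmul(weights, v)
```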
@@ -121,7 +121,11 @@ def __init__(
             ('relu' or 'gelu').
         use_efficient_attention (bool, optional): If set to True, will use
             PyTorch's native, optimized Scaled Dot Product Attention
-            Implementation.
+            implementation, which can reduce computation time and memory
+            consumption for longer sequences. PyTorch automatically selects the
+            optimal backend (FlashAttention-2, Memory-Efficient Attention, or
+            its own C++ implementation) based on the input properties,
+            hardware capabilities, and build configuration.
         patch_length (int, optional): Length of each non-overlapping patch for
             endogenous variable tokenization.
         use_norm (bool, optional): Whether to apply normalization to input data.
@@ -59,7 +59,11 @@ class TimeXer(TslibBaseModel):
         Activation function to use in the feed-forward network. Common choices are 'relu', 'gelu', etc.
     use_efficient_attention: bool, default=False
         If set to True, will use PyTorch's native, optimized Scaled Dot Product
-        Attention Implementation.
+        Attention implementation, which can reduce computation time and memory
+        consumption for longer sequences. PyTorch automatically selects the
+        optimal backend (FlashAttention-2, Memory-Efficient Attention, or its
+        own C++ implementation) based on the input properties, hardware
+        capabilities, and build configuration.
     endogenous_vars: Optional[list[str]], default=None
         List of endogenous variable names to be used in the model. If None, all historical values
         for the target variable are used.
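The automatic backend selection mentioned in these docstrings can also be constrained explicitly for debugging or benchmarking. The following self-contained sketch is independent of this repository's code and assumes PyTorch 2.3 or newer for `torch.nn.attention.sdpa_kernel`.

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel  # PyTorch >= 2.3 assumed

q = k = v = torch.randn(2, 4, 256, 32)  # (batch, heads, seq_len, head_dim)

# Default: PyTorch chooses among FlashAttention-2, memory-efficient attention,
# and the C++ math fallback based on inputs, hardware, and build configuration.
out_auto = F.scaled_dot_product_attention(q, k, v)

# Restrict dispatch to the always-available math backend; on supported GPUs one
# could instead pass SDPBackend.FLASH_ATTENTION or SDPBackend.EFFICIENT_ATTENTION
# to verify that a faster backend is actually usable for these inputs.
with sdpa_kernel(SDPBackend.MATH):
    out_math = F.scaled_dot_product_attention(q, k, v)
```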
@@ -37,8 +37,13 @@ class FullAttention(nn.Module):
         scale (float): Scaling factor for attention scores.
         attention_dropout (float): Dropout rate for attention scores.
         output_attention (bool): Whether to output attention weights.
-        use_efficient_attention (bool): Whether to use torch's native efficient
-            scaled dot product attention implementation.
+        use_efficient_attention (bool): Whether to use PyTorch's native,
+            optimized Scaled Dot Product Attention implementation, which can
+            reduce computation time and memory consumption for longer sequences.
+            PyTorch automatically selects the optimal backend (FlashAttention-2,
+            Memory-Efficient Attention, or its own C++ implementation) based
+            on the input properties, hardware capabilities, and build
+            configuration.
     """

     def __init__(