From 86c047f5e289875e18dcdf933bb32c3ae26808e7 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Thu, 16 May 2024 17:48:36 -0700 Subject: [PATCH 01/14] Initial pass at lr sched tutorial --- .../compiling_optimizer_lr_scheduler.rst | 125 ++++++++++++++++++ recipes_source/recipes_index.rst | 9 ++ 2 files changed, 134 insertions(+) create mode 100644 recipes_source/compiling_optimizer_lr_scheduler.rst diff --git a/recipes_source/compiling_optimizer_lr_scheduler.rst b/recipes_source/compiling_optimizer_lr_scheduler.rst new file mode 100644 index 00000000000..acc08d30243 --- /dev/null +++ b/recipes_source/compiling_optimizer_lr_scheduler.rst @@ -0,0 +1,125 @@ +(beta) Running the compiled optimizer with an LR Scheduler +========================================================================================== + +**Author:** `Michael Lazos `_ + +The optimizer is a key algorithm for training any deep learning model. +In this example, we will show how to pair the ``torch.compile``d optimizer +with the LR schedulers to accelerate training convergence + +.. note:: + + This tutorial requires PyTorch 2.2.0 or later. + +Model Setup +~~~~~~~~~~~~~~~~~~~~~ +For this example, we'll use a simple sequence of linear layers. + +.. code-block:: python + + import torch + + model = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] + ) + input = torch.rand(1024, device="cuda") + output = model(input) + output.sum().backward() + +Setting up and running the compiled optimizer with LR Scheduler +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In this example, we'll use the Adam optimizer with ConstantLR Scheduler +and create a helper function to wrap the step() +in ``torch.compile()``. + +.. note:: + + ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 + +.. code-block:: python + + # exit cleanly if we are on a device that doesn't support torch.compile + if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + + # !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the + # the optimizer with an LR Scheduler. + # Without this, torch.compile will recompile as the value of the LR + # changes. + opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) + sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) + + @torch.compile(fullgraph=False) + def fn(): + opt.step() + sched.step() + + + # Warmup runs to compile the function + for _ in range(5): + fn() + print(opt.param_groups[0]["lr"]) + +Sample Output: + +>> tensor(0.0047) +>> tensor(0.0060) +>> tensor(0.0073) +>> tensor(0.0087) +>> tensor(0.0100) + +Extension: What happens with a non-tensor LR? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For the curious, we will show how to peek into what happens with ``torch.compile`` when we don't wrap the +LR in a tensor. + +.. 
code-block:: python + # exit cleanly if we are on a device that doesn't support torch.compile + if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + + # No longer wrap the LR in a tensor here + opt = torch.optim.Adam(model.parameters(), lr=0.01) + sched = torch.optim.ConstantLR(opt, factor=0.001, iters=4) + + @torch.compile(fullgraph=False) + def fn(): + opt.step() + sched.step() + + # Setup logging to view recompiles + torch._logging.set_logs(recompiles=True) + + # Warmup runs to compile the function + # We will now recompile on each iteration + # as the value of the lr is mutated. + for _ in range(5): + fn() + +Sample Output: + +>>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +>> triggered by the following guard failure(s): +>> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +>>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +>> triggered by the following guard failure(s): +>> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +>> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +>>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +>> triggered by the following guard failure(s): +>> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +>> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +>> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +>>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +>> triggered by the following guard failure(s): +>> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 +>> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +>> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +>> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 + +With this example, we can see that we recompile the optimizer 4 additional +due to the guard failure on the 'lr' in param_groups[0] diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 8da2c647f63..b395b13a153 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -307,6 +307,15 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/compiling_optimizer.html :tags: Model-Optimization +.. (beta) Running the compiled optimizer with an LR Scheduler + +.. customcarditem:: + :header: (beta) Running the compiled optimizer with an LR Scheduler + :card_description: Speed up training with LRScheduler and torch.compiled optimizer + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/compiling_optimizer_lr_scheduler.html + :tags: Model-Optimization + .. Using User-Defined Triton Kernels with ``torch.compile`` .. 
customcarditem:: From c30649c0fef6611f2e042c7c9de4f45fe862f7b0 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Fri, 17 May 2024 10:35:18 -0700 Subject: [PATCH 02/14] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- recipes_source/compiling_optimizer_lr_scheduler.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.rst b/recipes_source/compiling_optimizer_lr_scheduler.rst index acc08d30243..bb069df38b3 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.rst +++ b/recipes_source/compiling_optimizer_lr_scheduler.rst @@ -4,8 +4,8 @@ **Author:** `Michael Lazos `_ The optimizer is a key algorithm for training any deep learning model. -In this example, we will show how to pair the ``torch.compile``d optimizer -with the LR schedulers to accelerate training convergence +In this example, we will show how to pair the an optimizer, which has been compiled using ``torch.compile``, +with the LR schedulers to accelerate training convergence. .. note:: @@ -27,14 +27,14 @@ For this example, we'll use a simple sequence of linear layers. output.sum().backward() Setting up and running the compiled optimizer with LR Scheduler -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In this example, we'll use the Adam optimizer with ConstantLR Scheduler -and create a helper function to wrap the step() +and create a helper function to wrap the ``step()`` in ``torch.compile()``. .. note:: - ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 + ``torch.compile`` is only supported on CUDA devices that have a compute capability of 7.0 or higher. .. code-block:: python From 67d5e45d03f9990b116f02245aafedb633051ed3 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Fri, 17 May 2024 12:02:32 -0700 Subject: [PATCH 03/14] Convert to py, comments --- .../compiling_optimizer_lr_scheduler.py | 146 ++++++++++++++++++ .../compiling_optimizer_lr_scheduler.rst | 125 --------------- 2 files changed, 146 insertions(+), 125 deletions(-) create mode 100644 recipes_source/compiling_optimizer_lr_scheduler.py delete mode 100644 recipes_source/compiling_optimizer_lr_scheduler.rst diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py new file mode 100644 index 00000000000..fc13e497f2a --- /dev/null +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -0,0 +1,146 @@ +""" +(beta) Running the compiled optimizer with an LR Scheduler + +**Author:** `Michael Lazos `_ +""" + +######################################################### +# The optimizer is a key algorithm for training any deep learning model. +# In this example, we will show how to pair the ``torch.compile``d optimizer +# with the LR schedulers to accelerate training convergence +# +# .. note:: +# +# This tutorial requires PyTorch 2.2.0 or later. + +##################################################################### +# Model Setup +# ~~~~~~~~~~~~~~~~~~~~~ +# For this example, we'll use a simple sequence of linear layers. 
+# + +import torch + +# Create simple model +model = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] +) +input = torch.rand(1024, device="cuda") + +# run forward pass +output = model(input) + +# run backward to populate the grads for our optimizer below +output.sum().backward() + + +##################################################################### +# Setting up and running the compiled optimizer with LR Scheduler +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# In this section, we'll use the Adam optimizer with LinearLR Scheduler +# and create a helper function to wrap the ``step()`` call for each +# in ``torch.compile()``. +# +# .. note:: +# +# ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 + + +# exit cleanly if we are on a device that doesn't support torch.compile +if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + +# !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the +# the optimizer with an LR Scheduler. +# Without this, torch.compile will recompile as the value of the LR +# changes. +opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) +sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) + +@torch.compile(fullgraph=False) +def fn(): + opt.step() + sched.step() + + +# Warmup runs to compile the function +for _ in range(5): + fn() + print(opt.param_groups[0]["lr"]) + +######################################################################## +# Sample Output: +# +# >> tensor(0.0047) +# >> tensor(0.0060) +# >> tensor(0.0073) +# >> tensor(0.0087) +# >> tensor(0.0100) + +###################################################################### +# Conclusion +# ~~~~~~~~~~ +# +# In this tutorial we showed how to pair the ``torch.compile``d optimizer +# with an LR Scheduler to accelerate training convergence. We used a model consisting +# of a simple sequence of linear layers with the Adam optimizer paired +# with a LinearLR scheduler to demonstrate the LR changing across iterations. +# +# See also: +# * tutorial on the compiled optimizer - `Compiled optimizer tutorial`_ +# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2`_ +# +# .. Compiled optimizer tutorial: https://pytorch.org/tutorials/recipes/compiling_optimizer.html +# .. Compiling the optimizer with PT2: https://dev-discuss.pytorch.org/t/compiling-the-optimizer-with-pt2/1669 + +###################################################################### +# Extension: What happens with a non-tensor LR? +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# For the curious, we will show how to peek into what happens with ``torch.compile`` when we don't wrap the +# LR in a tensor. + +# No longer wrap the LR in a tensor here +opt = torch.optim.Adam(model.parameters(), lr=0.01) +sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) + +@torch.compile(fullgraph=False) +def fn(): + opt.step() + sched.step() + +# Setup logging to view recompiles +torch._logging.set_logs(recompiles=True) + +# Warmup runs to compile the function +# We will now recompile on each iteration +# as the value of the lr is mutated. 
+for _ in range(5): + fn() + + +###################################################################### +# Sample Output: +# +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 +# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# +# With this example, we can see that we recompile the optimizer 4 additional +# due to the guard failure on the 'lr' in param_groups[0] diff --git a/recipes_source/compiling_optimizer_lr_scheduler.rst b/recipes_source/compiling_optimizer_lr_scheduler.rst deleted file mode 100644 index acc08d30243..00000000000 --- a/recipes_source/compiling_optimizer_lr_scheduler.rst +++ /dev/null @@ -1,125 +0,0 @@ -(beta) Running the compiled optimizer with an LR Scheduler -========================================================================================== - -**Author:** `Michael Lazos `_ - -The optimizer is a key algorithm for training any deep learning model. -In this example, we will show how to pair the ``torch.compile``d optimizer -with the LR schedulers to accelerate training convergence - -.. note:: - - This tutorial requires PyTorch 2.2.0 or later. - -Model Setup -~~~~~~~~~~~~~~~~~~~~~ -For this example, we'll use a simple sequence of linear layers. - -.. code-block:: python - - import torch - - model = torch.nn.Sequential( - *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] - ) - input = torch.rand(1024, device="cuda") - output = model(input) - output.sum().backward() - -Setting up and running the compiled optimizer with LR Scheduler -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In this example, we'll use the Adam optimizer with ConstantLR Scheduler -and create a helper function to wrap the step() -in ``torch.compile()``. - -.. note:: - - ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 - -.. code-block:: python - - # exit cleanly if we are on a device that doesn't support torch.compile - if torch.cuda.get_device_capability() < (7, 0): - print("Exiting because torch.compile is not supported on this device.") - import sys - sys.exit(0) - - # !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the - # the optimizer with an LR Scheduler. - # Without this, torch.compile will recompile as the value of the LR - # changes. 
- opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) - sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) - - @torch.compile(fullgraph=False) - def fn(): - opt.step() - sched.step() - - - # Warmup runs to compile the function - for _ in range(5): - fn() - print(opt.param_groups[0]["lr"]) - -Sample Output: - ->> tensor(0.0047) ->> tensor(0.0060) ->> tensor(0.0073) ->> tensor(0.0087) ->> tensor(0.0100) - -Extension: What happens with a non-tensor LR? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For the curious, we will show how to peek into what happens with ``torch.compile`` when we don't wrap the -LR in a tensor. - -.. code-block:: python - # exit cleanly if we are on a device that doesn't support torch.compile - if torch.cuda.get_device_capability() < (7, 0): - print("Exiting because torch.compile is not supported on this device.") - import sys - sys.exit(0) - - # No longer wrap the LR in a tensor here - opt = torch.optim.Adam(model.parameters(), lr=0.01) - sched = torch.optim.ConstantLR(opt, factor=0.001, iters=4) - - @torch.compile(fullgraph=False) - def fn(): - opt.step() - sched.step() - - # Setup logging to view recompiles - torch._logging.set_logs(recompiles=True) - - # Warmup runs to compile the function - # We will now recompile on each iteration - # as the value of the lr is mutated. - for _ in range(5): - fn() - -Sample Output: - ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 ->> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 - -With this example, we can see that we recompile the optimizer 4 additional -due to the guard failure on the 'lr' in param_groups[0] From 16b41c2e7f1e1bbbd2ae2165b5cbc7219ea0d37a Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Fri, 17 May 2024 16:27:58 -0700 Subject: [PATCH 04/14] Remove rst --- .../compiling_optimizer_lr_scheduler.rst | 125 ------------------ 1 file changed, 125 deletions(-) delete mode 100644 recipes_source/compiling_optimizer_lr_scheduler.rst diff --git a/recipes_source/compiling_optimizer_lr_scheduler.rst b/recipes_source/compiling_optimizer_lr_scheduler.rst deleted file mode 100644 index bb069df38b3..00000000000 --- a/recipes_source/compiling_optimizer_lr_scheduler.rst +++ /dev/null @@ -1,125 +0,0 @@ -(beta) Running the compiled optimizer with an LR Scheduler -========================================================================================== - -**Author:** `Michael Lazos `_ - -The optimizer is a key algorithm for training any deep 
learning model. -In this example, we will show how to pair the an optimizer, which has been compiled using ``torch.compile``, -with the LR schedulers to accelerate training convergence. - -.. note:: - - This tutorial requires PyTorch 2.2.0 or later. - -Model Setup -~~~~~~~~~~~~~~~~~~~~~ -For this example, we'll use a simple sequence of linear layers. - -.. code-block:: python - - import torch - - model = torch.nn.Sequential( - *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] - ) - input = torch.rand(1024, device="cuda") - output = model(input) - output.sum().backward() - -Setting up and running the compiled optimizer with LR Scheduler -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In this example, we'll use the Adam optimizer with ConstantLR Scheduler -and create a helper function to wrap the ``step()`` -in ``torch.compile()``. - -.. note:: - - ``torch.compile`` is only supported on CUDA devices that have a compute capability of 7.0 or higher. - -.. code-block:: python - - # exit cleanly if we are on a device that doesn't support torch.compile - if torch.cuda.get_device_capability() < (7, 0): - print("Exiting because torch.compile is not supported on this device.") - import sys - sys.exit(0) - - # !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the - # the optimizer with an LR Scheduler. - # Without this, torch.compile will recompile as the value of the LR - # changes. - opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) - sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) - - @torch.compile(fullgraph=False) - def fn(): - opt.step() - sched.step() - - - # Warmup runs to compile the function - for _ in range(5): - fn() - print(opt.param_groups[0]["lr"]) - -Sample Output: - ->> tensor(0.0047) ->> tensor(0.0060) ->> tensor(0.0073) ->> tensor(0.0087) ->> tensor(0.0100) - -Extension: What happens with a non-tensor LR? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For the curious, we will show how to peek into what happens with ``torch.compile`` when we don't wrap the -LR in a tensor. - -.. code-block:: python - # exit cleanly if we are on a device that doesn't support torch.compile - if torch.cuda.get_device_capability() < (7, 0): - print("Exiting because torch.compile is not supported on this device.") - import sys - sys.exit(0) - - # No longer wrap the LR in a tensor here - opt = torch.optim.Adam(model.parameters(), lr=0.01) - sched = torch.optim.ConstantLR(opt, factor=0.001, iters=4) - - @torch.compile(fullgraph=False) - def fn(): - opt.step() - sched.step() - - # Setup logging to view recompiles - torch._logging.set_logs(recompiles=True) - - # Warmup runs to compile the function - # We will now recompile on each iteration - # as the value of the lr is mutated. 
- for _ in range(5): - fn() - -Sample Output: - ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 ->> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 - -With this example, we can see that we recompile the optimizer 4 additional -due to the guard failure on the 'lr' in param_groups[0] From 8a73fc0fa2115c0ddf550949c7bf2747f1f8f7d3 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Mon, 20 May 2024 10:49:09 -0700 Subject: [PATCH 05/14] PR comments --- recipes_source/compiling_optimizer_lr_scheduler.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index 7b15f02310e..4a08c3f78b4 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -11,7 +11,7 @@ # # .. note:: # -# This tutorial requires PyTorch 2.2.0 or later. +# This tutorial requires PyTorch 2.4.0 or later. ##################################################################### # Model Setup @@ -52,7 +52,7 @@ import sys sys.exit(0) -# !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the +# !!! IMPORTANT !!! Wrap the lr in a Tensor if we are pairing the # the optimizer with an LR Scheduler. # Without this, torch.compile will recompile as the value of the LR # changes. @@ -89,11 +89,9 @@ def fn(): # with a LinearLR scheduler to demonstrate the LR changing across iterations. # # See also: -# * tutorial on the compiled optimizer - `Compiled optimizer tutorial`_ -# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2`_ -# -# .. Compiled optimizer tutorial: https://pytorch.org/tutorials/recipes/compiling_optimizer.html -# .. Compiling the optimizer with PT2: https://dev-discuss.pytorch.org/t/compiling-the-optimizer-with-pt2/1669 +# * tutorial on the compiled optimizer - `Compiled optimizer tutorial `_ +# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2 `_ + ###################################################################### # Extension: What happens with a non-tensor LR? 
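The recipe these patches build up compiles ``opt.step()`` and ``sched.step()`` together and keeps the LR in a tensor so the compiled step can be reused as the scheduler updates it. The sketch below shows one way that compiled pair might sit inside a complete training loop. The loss function, iteration count, and ``zero_grad`` placement are illustrative assumptions and not part of the recipe; the model, optimizer, and scheduler mirror the code added above.

.. code-block:: python

    import torch

    # Model, optimizer, and scheduler as in the recipe above.
    model = torch.nn.Sequential(
        *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)]
    )
    # Keep the LR in a tensor so torch.compile does not recompile as it changes.
    opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01))
    sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5)

    @torch.compile(fullgraph=False)
    def step_fn():
        opt.step()
        sched.step()

    for _ in range(10):
        input = torch.rand(1024, device="cuda")
        loss = model(input).sum()  # placeholder loss, for illustration only
        loss.backward()
        step_fn()
        opt.zero_grad(set_to_none=True)

As in the recipe, the first few calls trigger compilation; after that the same compiled step is reused because the tensor LR changes by value only.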
From 7a0f79282c27898b5b9fd683ab76af7e131b5917 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Mon, 20 May 2024 11:07:44 -0700 Subject: [PATCH 06/14] Move conclusion to end --- .../compiling_optimizer_lr_scheduler.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index 4a08c3f78b4..e8ec8e721b3 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -79,20 +79,6 @@ def fn(): # >> tensor(0.0087) # >> tensor(0.0100) -###################################################################### -# Conclusion -# ~~~~~~~~~~ -# -# In this tutorial we showed how to pair the ``torch.compile``d optimizer -# with an LR Scheduler to accelerate training convergence. We used a model consisting -# of a simple sequence of linear layers with the Adam optimizer paired -# with a LinearLR scheduler to demonstrate the LR changing across iterations. -# -# See also: -# * tutorial on the compiled optimizer - `Compiled optimizer tutorial `_ -# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2 `_ - - ###################################################################### # Extension: What happens with a non-tensor LR? # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -142,3 +128,16 @@ def fn(): # # With this example, we can see that we recompile the optimizer 4 additional # due to the guard failure on the 'lr' in param_groups[0] + +###################################################################### +# Conclusion +# ~~~~~~~~~~ +# +# In this tutorial we showed how to pair the ``torch.compile``d optimizer +# with an LR Scheduler to accelerate training convergence. We used a model consisting +# of a simple sequence of linear layers with the Adam optimizer paired +# with a LinearLR scheduler to demonstrate the LR changing across iterations. +# +# See also: +# * tutorial on the compiled optimizer - `Compiled optimizer tutorial `_ +# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2 `_ From 258955cec89956025972327f8577e2c0e6d1b4da Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Mon, 20 May 2024 12:27:19 -0700 Subject: [PATCH 07/14] Update version number --- recipes_source/compiling_optimizer_lr_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index e8ec8e721b3..33b1ea94304 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -11,7 +11,7 @@ # # .. note:: # -# This tutorial requires PyTorch 2.4.0 or later. +# This tutorial requires PyTorch 2.3.0 or later. ##################################################################### # Model Setup From 9d07b46f607563a7cc9112cb2bbac48ee38b75b3 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 08:17:23 -0700 Subject: [PATCH 08/14] Put on linux.g5.4xlarge.nvidia.gpu --- .jenkins/metadata.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 9cf51efaf9b..b9c29a9e289 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -36,6 +36,9 @@ "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." 
}, + "recipes_source/compiling_optimizer_lr_scheduler.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "intermediate_source/torch_compile_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, From 8b0edb0f3fa02cd04c1a87a1e2281832cca553a2 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 09:08:53 -0700 Subject: [PATCH 09/14] Update --- .jenkins/metadata.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index b9c29a9e289..14e6cd5a619 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -28,6 +28,10 @@ "intermediate_source/model_parallel_tutorial.py": { "needs": "linux.16xlarge.nvidia.gpu" }, + "advanced_source/pendulum.py": { + "needs": "linux.4xlarge.nvidia.gpy", + "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." + }, "intermediate_source/torchvision_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." From 6f413828a1a78fdac7ec7417886e13bf0d978bf1 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 09:52:05 -0700 Subject: [PATCH 10/14] Update --- .jenkins/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 14e6cd5a619..254b31fc948 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -29,7 +29,7 @@ "needs": "linux.16xlarge.nvidia.gpu" }, "advanced_source/pendulum.py": { - "needs": "linux.4xlarge.nvidia.gpy", + "needs": "linux.g5.4xlarge.nvidia.gpy", "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." }, "intermediate_source/torchvision_tutorial.py": { From d759583110a1e9ff951b520bcfe7647e18c5722d Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 10:32:51 -0700 Subject: [PATCH 11/14] Update metadata.json --- .jenkins/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 254b31fc948..4814f9a7d2b 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -29,7 +29,7 @@ "needs": "linux.16xlarge.nvidia.gpu" }, "advanced_source/pendulum.py": { - "needs": "linux.g5.4xlarge.nvidia.gpy", + "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." 
}, "intermediate_source/torchvision_tutorial.py": { From dc62ced354b23b9a855ad67dd8ed2894e7d72b00 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 12:42:14 -0700 Subject: [PATCH 12/14] Update compiling_optimizer_lr_scheduler.py --- .../compiling_optimizer_lr_scheduler.py | 59 +++++++++---------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index 33b1ea94304..ae958a09ad0 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -1,5 +1,6 @@ """ (beta) Running the compiled optimizer with an LR Scheduler +============================================================ **Author:** `Michael Lazos `_ """ @@ -37,6 +38,7 @@ ##################################################################### # Setting up and running the compiled optimizer with LR Scheduler # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# # In this section, we'll use the Adam optimizer with LinearLR Scheduler # and create a helper function to wrap the ``step()`` call for each of them # in ``torch.compile()``. @@ -46,7 +48,7 @@ # ``torch.compile`` is only supported on CUDA devices that have a compute capability of 7.0 or higher. -# exit cleanly if we are on a device that doesn't support torch.compile +# exit cleanly if we are on a device that doesn't support ``torch.compile`` if torch.cuda.get_device_capability() < (7, 0): print("Exiting because torch.compile is not supported on this device.") import sys @@ -70,14 +72,6 @@ def fn(): fn() print(opt.param_groups[0]["lr"]) -######################################################################## -# Sample Output: -# -# >> tensor(0.0047) -# >> tensor(0.0060) -# >> tensor(0.0073) -# >> tensor(0.0087) -# >> tensor(0.0100) ###################################################################### # Extension: What happens with a non-tensor LR? @@ -106,28 +100,30 @@ def fn(): ###################################################################### # Sample Output: +# +# .. 
code-block:: bash # -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 -# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 +# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 # -# With this example, we can see that we recompile the optimizer 4 additional -# due to the guard failure on the 'lr' in param_groups[0] +# With this example, we can see that we recompile the optimizer 4 additional times +# due to the guard failure on the 'lr' in param_groups[0]. ###################################################################### # Conclusion @@ -139,5 +135,6 @@ def fn(): # with a LinearLR scheduler to demonstrate the LR changing across iterations. # # See also: -# * tutorial on the compiled optimizer - `Compiled optimizer tutorial `_ -# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2 `_ +# +# * `Compiled optimizer tutorial `__ - an intro into the compiled optimizer. +# * `Compiling the optimizer with PT2 `__ - deeper technical details on the compiled optimizer. 
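The extension section above demonstrates recompiles when the LR is a plain float. As a complementary check, offered as a sketch that is not part of these patches (the model setup is repeated only so it runs on its own), the same recompile logging can be enabled for the tensor-LR version, where no ``Recompiling function step`` messages are expected after the initial compilation.

.. code-block:: python

    import torch

    model = torch.nn.Sequential(
        *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)]
    )
    model(torch.rand(1024, device="cuda")).sum().backward()

    # Tensor LR, as the recipe recommends.
    opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01))
    sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5)

    @torch.compile(fullgraph=False)
    def fn():
        opt.step()
        sched.step()

    # Same logging switch the extension section uses to surface recompiles.
    torch._logging.set_logs(recompiles=True)

    for _ in range(5):
        fn()  # expected: no recompile messages once compilation has happened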
From 45c53eacb3a291060208754e90b88bab715b2304 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 12:43:13 -0700 Subject: [PATCH 13/14] Update compiling_optimizer_lr_scheduler.py --- .../compiling_optimizer_lr_scheduler.py | 25 +------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index ae958a09ad0..2bae48fc7ee 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -99,30 +99,7 @@ def fn(): ###################################################################### -# Sample Output: -# -# .. code-block:: bash -# -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 -# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# -# With this example, we can see that we recompile the optimizer 4 additional times +# With this example, we can see that we recompile the optimizer a few times # due to the guard failure on the 'lr' in param_groups[0]. ###################################################################### From 56c0b833d27c58e4903add8aa1bae3e9cf1d6d49 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 14:44:07 -0700 Subject: [PATCH 14/14] Update compiling_optimizer_lr_scheduler.py --- recipes_source/compiling_optimizer_lr_scheduler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index 2bae48fc7ee..c0402729403 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -7,7 +7,7 @@ ######################################################### # The optimizer is a key algorithm for training any deep learning model. -# In this example, we will show how to pair the an optimizer, which has been compiled using ``torch.compile``, +# In this example, we will show how to pair the optimizer, which has been compiled using ``torch.compile``, # with the LR schedulers to accelerate training convergence. # # .. note:: @@ -100,13 +100,13 @@ def fn(): ###################################################################### # With this example, we can see that we recompile the optimizer a few times -# due to the guard failure on the 'lr' in param_groups[0]. 
+# due to the guard failure on the ``lr`` in ``param_groups[0]``. ###################################################################### # Conclusion # ~~~~~~~~~~ # -# In this tutorial we showed how to pair the ``torch.compile``d optimizer +# In this tutorial we showed how to pair the optimizer compiled with ``torch.compile`` # with an LR Scheduler to accelerate training convergence. We used a model consisting # of a simple sequence of linear layers with the Adam optimizer paired # with a LinearLR scheduler to demonstrate the LR changing across iterations.
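One practical follow-up, offered as a sketch rather than as part of the recipe: with the LR wrapped in a tensor, ``opt.param_groups[0]["lr"]`` holds a tensor instead of a float, so logging or checkpointing code that expects a plain number can call ``.item()`` on it. The single parameter and toy loss below are illustrative assumptions.

.. code-block:: python

    import torch

    # A single parameter stands in for a real model (illustration only).
    p = torch.nn.Parameter(torch.randn(4, device="cuda"))
    opt = torch.optim.Adam([p], lr=torch.tensor(0.01))
    sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5)

    for _ in range(5):
        (p * p).sum().backward()
        opt.step()
        sched.step()
        # .item() converts the tensor LR to a plain Python float for logging.
        print(f"current lr: {opt.param_groups[0]['lr'].item():.4f}")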