From 86c047f5e289875e18dcdf933bb32c3ae26808e7 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Thu, 16 May 2024 17:48:36 -0700 Subject: [PATCH 01/14] Initial pass at lr sched tutorial --- .../compiling_optimizer_lr_scheduler.rst | 125 ++++++++++++++++++ recipes_source/recipes_index.rst | 9 ++ 2 files changed, 134 insertions(+) create mode 100644 recipes_source/compiling_optimizer_lr_scheduler.rst diff --git a/recipes_source/compiling_optimizer_lr_scheduler.rst b/recipes_source/compiling_optimizer_lr_scheduler.rst new file mode 100644 index 00000000000..acc08d30243 --- /dev/null +++ b/recipes_source/compiling_optimizer_lr_scheduler.rst @@ -0,0 +1,125 @@ +(beta) Running the compiled optimizer with an LR Scheduler +========================================================================================== + +**Author:** `Michael Lazos `_ + +The optimizer is a key algorithm for training any deep learning model. +In this example, we will show how to pair the ``torch.compile``d optimizer +with the LR schedulers to accelerate training convergence + +.. note:: + + This tutorial requires PyTorch 2.2.0 or later. + +Model Setup +~~~~~~~~~~~~~~~~~~~~~ +For this example, we'll use a simple sequence of linear layers. + +.. code-block:: python + + import torch + + model = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] + ) + input = torch.rand(1024, device="cuda") + output = model(input) + output.sum().backward() + +Setting up and running the compiled optimizer with LR Scheduler +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +In this example, we'll use the Adam optimizer with ConstantLR Scheduler +and create a helper function to wrap the step() +in ``torch.compile()``. + +.. note:: + + ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 + +.. code-block:: python + + # exit cleanly if we are on a device that doesn't support torch.compile + if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + + # !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the + # the optimizer with an LR Scheduler. + # Without this, torch.compile will recompile as the value of the LR + # changes. + opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) + sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) + + @torch.compile(fullgraph=False) + def fn(): + opt.step() + sched.step() + + + # Warmup runs to compile the function + for _ in range(5): + fn() + print(opt.param_groups[0]["lr"]) + +Sample Output: + +>> tensor(0.0047) +>> tensor(0.0060) +>> tensor(0.0073) +>> tensor(0.0087) +>> tensor(0.0100) + +Extension: What happens with a non-tensor LR? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For the curious, we will show how to peek into what happens with ``torch.compile`` when we don't wrap the +LR in a tensor. + +.. 
code-block:: python + # exit cleanly if we are on a device that doesn't support torch.compile + if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + + # No longer wrap the LR in a tensor here + opt = torch.optim.Adam(model.parameters(), lr=0.01) + sched = torch.optim.ConstantLR(opt, factor=0.001, iters=4) + + @torch.compile(fullgraph=False) + def fn(): + opt.step() + sched.step() + + # Setup logging to view recompiles + torch._logging.set_logs(recompiles=True) + + # Warmup runs to compile the function + # We will now recompile on each iteration + # as the value of the lr is mutated. + for _ in range(5): + fn() + +Sample Output: + +>>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +>> triggered by the following guard failure(s): +>> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +>>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +>> triggered by the following guard failure(s): +>> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +>> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +>>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +>> triggered by the following guard failure(s): +>> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +>> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +>> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +>>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +>> triggered by the following guard failure(s): +>> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 +>> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +>> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +>> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 + +With this example, we can see that we recompile the optimizer 4 additional +due to the guard failure on the 'lr' in param_groups[0] diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 8da2c647f63..b395b13a153 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -307,6 +307,15 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/compiling_optimizer.html :tags: Model-Optimization +.. (beta) Running the compiled optimizer with an LR Scheduler + +.. customcarditem:: + :header: (beta) Running the compiled optimizer with an LR Scheduler + :card_description: Speed up training with LRScheduler and torch.compiled optimizer + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../recipes/compiling_optimizer_lr_scheduler.html + :tags: Model-Optimization + .. Using User-Defined Triton Kernels with ``torch.compile`` .. 
customcarditem:: From c30649c0fef6611f2e042c7c9de4f45fe862f7b0 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Fri, 17 May 2024 10:35:18 -0700 Subject: [PATCH 02/14] Apply suggestions from code review Co-authored-by: Svetlana Karslioglu --- recipes_source/compiling_optimizer_lr_scheduler.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.rst b/recipes_source/compiling_optimizer_lr_scheduler.rst index acc08d30243..bb069df38b3 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.rst +++ b/recipes_source/compiling_optimizer_lr_scheduler.rst @@ -4,8 +4,8 @@ **Author:** `Michael Lazos `_ The optimizer is a key algorithm for training any deep learning model. -In this example, we will show how to pair the ``torch.compile``d optimizer -with the LR schedulers to accelerate training convergence +In this example, we will show how to pair the an optimizer, which has been compiled using ``torch.compile``, +with the LR schedulers to accelerate training convergence. .. note:: @@ -27,14 +27,14 @@ For this example, we'll use a simple sequence of linear layers. output.sum().backward() Setting up and running the compiled optimizer with LR Scheduler -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In this example, we'll use the Adam optimizer with ConstantLR Scheduler -and create a helper function to wrap the step() +and create a helper function to wrap the ``step()`` in ``torch.compile()``. .. note:: - ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 + ``torch.compile`` is only supported on CUDA devices that have a compute capability of 7.0 or higher. .. code-block:: python From 67d5e45d03f9990b116f02245aafedb633051ed3 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Fri, 17 May 2024 12:02:32 -0700 Subject: [PATCH 03/14] Convert to py, comments --- .../compiling_optimizer_lr_scheduler.py | 146 ++++++++++++++++++ .../compiling_optimizer_lr_scheduler.rst | 125 --------------- 2 files changed, 146 insertions(+), 125 deletions(-) create mode 100644 recipes_source/compiling_optimizer_lr_scheduler.py delete mode 100644 recipes_source/compiling_optimizer_lr_scheduler.rst diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py new file mode 100644 index 00000000000..fc13e497f2a --- /dev/null +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -0,0 +1,146 @@ +""" +(beta) Running the compiled optimizer with an LR Scheduler + +**Author:** `Michael Lazos `_ +""" + +######################################################### +# The optimizer is a key algorithm for training any deep learning model. +# In this example, we will show how to pair the ``torch.compile``d optimizer +# with the LR schedulers to accelerate training convergence +# +# .. note:: +# +# This tutorial requires PyTorch 2.2.0 or later. + +##################################################################### +# Model Setup +# ~~~~~~~~~~~~~~~~~~~~~ +# For this example, we'll use a simple sequence of linear layers. 
+# + +import torch + +# Create simple model +model = torch.nn.Sequential( + *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] +) +input = torch.rand(1024, device="cuda") + +# run forward pass +output = model(input) + +# run backward to populate the grads for our optimizer below +output.sum().backward() + + +##################################################################### +# Setting up and running the compiled optimizer with LR Scheduler +#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# In this section, we'll use the Adam optimizer with LinearLR Scheduler +# and create a helper function to wrap the ``step()`` call for each +# in ``torch.compile()``. +# +# .. note:: +# +# ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 + + +# exit cleanly if we are on a device that doesn't support torch.compile +if torch.cuda.get_device_capability() < (7, 0): + print("Exiting because torch.compile is not supported on this device.") + import sys + sys.exit(0) + +# !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the +# the optimizer with an LR Scheduler. +# Without this, torch.compile will recompile as the value of the LR +# changes. +opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) +sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) + +@torch.compile(fullgraph=False) +def fn(): + opt.step() + sched.step() + + +# Warmup runs to compile the function +for _ in range(5): + fn() + print(opt.param_groups[0]["lr"]) + +######################################################################## +# Sample Output: +# +# >> tensor(0.0047) +# >> tensor(0.0060) +# >> tensor(0.0073) +# >> tensor(0.0087) +# >> tensor(0.0100) + +###################################################################### +# Conclusion +# ~~~~~~~~~~ +# +# In this tutorial we showed how to pair the ``torch.compile``d optimizer +# with an LR Scheduler to accelerate training convergence. We used a model consisting +# of a simple sequence of linear layers with the Adam optimizer paired +# with a LinearLR scheduler to demonstrate the LR changing across iterations. +# +# See also: +# * tutorial on the compiled optimizer - `Compiled optimizer tutorial`_ +# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2`_ +# +# .. Compiled optimizer tutorial: https://pytorch.org/tutorials/recipes/compiling_optimizer.html +# .. Compiling the optimizer with PT2: https://dev-discuss.pytorch.org/t/compiling-the-optimizer-with-pt2/1669 + +###################################################################### +# Extension: What happens with a non-tensor LR? +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# For the curious, we will show how to peek into what happens with ``torch.compile`` when we don't wrap the +# LR in a tensor. + +# No longer wrap the LR in a tensor here +opt = torch.optim.Adam(model.parameters(), lr=0.01) +sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) + +@torch.compile(fullgraph=False) +def fn(): + opt.step() + sched.step() + +# Setup logging to view recompiles +torch._logging.set_logs(recompiles=True) + +# Warmup runs to compile the function +# We will now recompile on each iteration +# as the value of the lr is mutated. 
+for _ in range(5): + fn() + + +###################################################################### +# Sample Output: +# +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 +# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# +# With this example, we can see that we recompile the optimizer 4 additional +# due to the guard failure on the 'lr' in param_groups[0] diff --git a/recipes_source/compiling_optimizer_lr_scheduler.rst b/recipes_source/compiling_optimizer_lr_scheduler.rst deleted file mode 100644 index acc08d30243..00000000000 --- a/recipes_source/compiling_optimizer_lr_scheduler.rst +++ /dev/null @@ -1,125 +0,0 @@ -(beta) Running the compiled optimizer with an LR Scheduler -========================================================================================== - -**Author:** `Michael Lazos `_ - -The optimizer is a key algorithm for training any deep learning model. -In this example, we will show how to pair the ``torch.compile``d optimizer -with the LR schedulers to accelerate training convergence - -.. note:: - - This tutorial requires PyTorch 2.2.0 or later. - -Model Setup -~~~~~~~~~~~~~~~~~~~~~ -For this example, we'll use a simple sequence of linear layers. - -.. code-block:: python - - import torch - - model = torch.nn.Sequential( - *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] - ) - input = torch.rand(1024, device="cuda") - output = model(input) - output.sum().backward() - -Setting up and running the compiled optimizer with LR Scheduler -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In this example, we'll use the Adam optimizer with ConstantLR Scheduler -and create a helper function to wrap the step() -in ``torch.compile()``. - -.. note:: - - ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 - -.. code-block:: python - - # exit cleanly if we are on a device that doesn't support torch.compile - if torch.cuda.get_device_capability() < (7, 0): - print("Exiting because torch.compile is not supported on this device.") - import sys - sys.exit(0) - - # !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the - # the optimizer with an LR Scheduler. - # Without this, torch.compile will recompile as the value of the LR - # changes. 
- opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) - sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) - - @torch.compile(fullgraph=False) - def fn(): - opt.step() - sched.step() - - - # Warmup runs to compile the function - for _ in range(5): - fn() - print(opt.param_groups[0]["lr"]) - -Sample Output: - ->> tensor(0.0047) ->> tensor(0.0060) ->> tensor(0.0073) ->> tensor(0.0087) ->> tensor(0.0100) - -Extension: What happens with a non-tensor LR? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For the curious, we will show how to peek into what happens with ``torch.compile`` when we don't wrap the -LR in a tensor. - -.. code-block:: python - # exit cleanly if we are on a device that doesn't support torch.compile - if torch.cuda.get_device_capability() < (7, 0): - print("Exiting because torch.compile is not supported on this device.") - import sys - sys.exit(0) - - # No longer wrap the LR in a tensor here - opt = torch.optim.Adam(model.parameters(), lr=0.01) - sched = torch.optim.ConstantLR(opt, factor=0.001, iters=4) - - @torch.compile(fullgraph=False) - def fn(): - opt.step() - sched.step() - - # Setup logging to view recompiles - torch._logging.set_logs(recompiles=True) - - # Warmup runs to compile the function - # We will now recompile on each iteration - # as the value of the lr is mutated. - for _ in range(5): - fn() - -Sample Output: - ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 ->> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 - -With this example, we can see that we recompile the optimizer 4 additional -due to the guard failure on the 'lr' in param_groups[0] From 16b41c2e7f1e1bbbd2ae2165b5cbc7219ea0d37a Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Fri, 17 May 2024 16:27:58 -0700 Subject: [PATCH 04/14] Remove rst --- .../compiling_optimizer_lr_scheduler.rst | 125 ------------------ 1 file changed, 125 deletions(-) delete mode 100644 recipes_source/compiling_optimizer_lr_scheduler.rst diff --git a/recipes_source/compiling_optimizer_lr_scheduler.rst b/recipes_source/compiling_optimizer_lr_scheduler.rst deleted file mode 100644 index bb069df38b3..00000000000 --- a/recipes_source/compiling_optimizer_lr_scheduler.rst +++ /dev/null @@ -1,125 +0,0 @@ -(beta) Running the compiled optimizer with an LR Scheduler -========================================================================================== - -**Author:** `Michael Lazos `_ - -The optimizer is a key algorithm for training any deep 
learning model. -In this example, we will show how to pair the an optimizer, which has been compiled using ``torch.compile``, -with the LR schedulers to accelerate training convergence. - -.. note:: - - This tutorial requires PyTorch 2.2.0 or later. - -Model Setup -~~~~~~~~~~~~~~~~~~~~~ -For this example, we'll use a simple sequence of linear layers. - -.. code-block:: python - - import torch - - model = torch.nn.Sequential( - *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)] - ) - input = torch.rand(1024, device="cuda") - output = model(input) - output.sum().backward() - -Setting up and running the compiled optimizer with LR Scheduler -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In this example, we'll use the Adam optimizer with ConstantLR Scheduler -and create a helper function to wrap the ``step()`` -in ``torch.compile()``. - -.. note:: - - ``torch.compile`` is only supported on CUDA devices that have a compute capability of 7.0 or higher. - -.. code-block:: python - - # exit cleanly if we are on a device that doesn't support torch.compile - if torch.cuda.get_device_capability() < (7, 0): - print("Exiting because torch.compile is not supported on this device.") - import sys - sys.exit(0) - - # !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the - # the optimizer with an LR Scheduler. - # Without this, torch.compile will recompile as the value of the LR - # changes. - opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01)) - sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5) - - @torch.compile(fullgraph=False) - def fn(): - opt.step() - sched.step() - - - # Warmup runs to compile the function - for _ in range(5): - fn() - print(opt.param_groups[0]["lr"]) - -Sample Output: - ->> tensor(0.0047) ->> tensor(0.0060) ->> tensor(0.0073) ->> tensor(0.0087) ->> tensor(0.0100) - -Extension: What happens with a non-tensor LR? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For the curious, we will show how to peek into what happens with ``torch.compile`` when we don't wrap the -LR in a tensor. - -.. code-block:: python - # exit cleanly if we are on a device that doesn't support torch.compile - if torch.cuda.get_device_capability() < (7, 0): - print("Exiting because torch.compile is not supported on this device.") - import sys - sys.exit(0) - - # No longer wrap the LR in a tensor here - opt = torch.optim.Adam(model.parameters(), lr=0.01) - sched = torch.optim.ConstantLR(opt, factor=0.001, iters=4) - - @torch.compile(fullgraph=False) - def fn(): - opt.step() - sched.step() - - # Setup logging to view recompiles - torch._logging.set_logs(recompiles=True) - - # Warmup runs to compile the function - # We will now recompile on each iteration - # as the value of the lr is mutated. 
- for _ in range(5): - fn() - -Sample Output: - ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 ->>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 ->> triggered by the following guard failure(s): ->> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 ->> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 ->> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 ->> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 - -With this example, we can see that we recompile the optimizer 4 additional -due to the guard failure on the 'lr' in param_groups[0] From 8a73fc0fa2115c0ddf550949c7bf2747f1f8f7d3 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Mon, 20 May 2024 10:49:09 -0700 Subject: [PATCH 05/14] PR comments --- recipes_source/compiling_optimizer_lr_scheduler.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index 7b15f02310e..4a08c3f78b4 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -11,7 +11,7 @@ # # .. note:: # -# This tutorial requires PyTorch 2.2.0 or later. +# This tutorial requires PyTorch 2.4.0 or later. ##################################################################### # Model Setup @@ -52,7 +52,7 @@ import sys sys.exit(0) -# !!! IMPORTANT !!! Wrap the lr in a tensor if we are pairing the +# !!! IMPORTANT !!! Wrap the lr in a Tensor if we are pairing the # the optimizer with an LR Scheduler. # Without this, torch.compile will recompile as the value of the LR # changes. @@ -89,11 +89,9 @@ def fn(): # with a LinearLR scheduler to demonstrate the LR changing across iterations. # # See also: -# * tutorial on the compiled optimizer - `Compiled optimizer tutorial`_ -# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2`_ -# -# .. Compiled optimizer tutorial: https://pytorch.org/tutorials/recipes/compiling_optimizer.html -# .. Compiling the optimizer with PT2: https://dev-discuss.pytorch.org/t/compiling-the-optimizer-with-pt2/1669 +# * tutorial on the compiled optimizer - `Compiled optimizer tutorial `_ +# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2 `_ + ###################################################################### # Extension: What happens with a non-tensor LR? 
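The recipe these patches build up compiles ``opt.step()`` and ``sched.step()`` together and keeps the LR in a tensor so the compiled step can be reused as the scheduler updates it. The sketch below shows one way that compiled pair might sit inside a complete training loop. The loss function, iteration count, and ``zero_grad`` placement are illustrative assumptions and not part of the recipe; the model, optimizer, and scheduler mirror the code added above.

.. code-block:: python

    import torch

    # Model, optimizer, and scheduler as in the recipe above.
    model = torch.nn.Sequential(
        *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)]
    )
    # Keep the LR in a tensor so torch.compile does not recompile as it changes.
    opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01))
    sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5)

    @torch.compile(fullgraph=False)
    def step_fn():
        opt.step()
        sched.step()

    for _ in range(10):
        input = torch.rand(1024, device="cuda")
        loss = model(input).sum()  # placeholder loss, for illustration only
        loss.backward()
        step_fn()
        opt.zero_grad(set_to_none=True)

As in the recipe, the first few calls trigger compilation; after that the same compiled step is reused because the tensor LR changes by value only.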
From 7a0f79282c27898b5b9fd683ab76af7e131b5917 Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Mon, 20 May 2024 11:07:44 -0700 Subject: [PATCH 06/14] Move conclusion to end --- .../compiling_optimizer_lr_scheduler.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index 4a08c3f78b4..e8ec8e721b3 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -79,20 +79,6 @@ def fn(): # >> tensor(0.0087) # >> tensor(0.0100) -###################################################################### -# Conclusion -# ~~~~~~~~~~ -# -# In this tutorial we showed how to pair the ``torch.compile``d optimizer -# with an LR Scheduler to accelerate training convergence. We used a model consisting -# of a simple sequence of linear layers with the Adam optimizer paired -# with a LinearLR scheduler to demonstrate the LR changing across iterations. -# -# See also: -# * tutorial on the compiled optimizer - `Compiled optimizer tutorial `_ -# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2 `_ - - ###################################################################### # Extension: What happens with a non-tensor LR? # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -142,3 +128,16 @@ def fn(): # # With this example, we can see that we recompile the optimizer 4 additional # due to the guard failure on the 'lr' in param_groups[0] + +###################################################################### +# Conclusion +# ~~~~~~~~~~ +# +# In this tutorial we showed how to pair the ``torch.compile``d optimizer +# with an LR Scheduler to accelerate training convergence. We used a model consisting +# of a simple sequence of linear layers with the Adam optimizer paired +# with a LinearLR scheduler to demonstrate the LR changing across iterations. +# +# See also: +# * tutorial on the compiled optimizer - `Compiled optimizer tutorial `_ +# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2 `_ From 258955cec89956025972327f8577e2c0e6d1b4da Mon Sep 17 00:00:00 2001 From: Michael Lazos Date: Mon, 20 May 2024 12:27:19 -0700 Subject: [PATCH 07/14] Update version number --- recipes_source/compiling_optimizer_lr_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index e8ec8e721b3..33b1ea94304 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -11,7 +11,7 @@ # # .. note:: # -# This tutorial requires PyTorch 2.4.0 or later. +# This tutorial requires PyTorch 2.3.0 or later. ##################################################################### # Model Setup From 9d07b46f607563a7cc9112cb2bbac48ee38b75b3 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 08:17:23 -0700 Subject: [PATCH 08/14] Put on linux.g5.4xlarge.nvidia.gpu --- .jenkins/metadata.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 9cf51efaf9b..b9c29a9e289 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -36,6 +36,9 @@ "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." 
}, + "recipes_source/compiling_optimizer_lr_scheduler.py": { + "needs": "linux.g5.4xlarge.nvidia.gpu" + }, "intermediate_source/torch_compile_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu" }, From 8b0edb0f3fa02cd04c1a87a1e2281832cca553a2 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 09:08:53 -0700 Subject: [PATCH 09/14] Update --- .jenkins/metadata.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index b9c29a9e289..14e6cd5a619 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -28,6 +28,10 @@ "intermediate_source/model_parallel_tutorial.py": { "needs": "linux.16xlarge.nvidia.gpu" }, + "advanced_source/pendulum.py": { + "needs": "linux.4xlarge.nvidia.gpy", + "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." + }, "intermediate_source/torchvision_tutorial.py": { "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." From 6f413828a1a78fdac7ec7417886e13bf0d978bf1 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 09:52:05 -0700 Subject: [PATCH 10/14] Update --- .jenkins/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 14e6cd5a619..254b31fc948 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -29,7 +29,7 @@ "needs": "linux.16xlarge.nvidia.gpu" }, "advanced_source/pendulum.py": { - "needs": "linux.4xlarge.nvidia.gpy", + "needs": "linux.g5.4xlarge.nvidia.gpy", "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." }, "intermediate_source/torchvision_tutorial.py": { From d759583110a1e9ff951b520bcfe7647e18c5722d Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 10:32:51 -0700 Subject: [PATCH 11/14] Update metadata.json --- .jenkins/metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 254b31fc948..4814f9a7d2b 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -29,7 +29,7 @@ "needs": "linux.16xlarge.nvidia.gpu" }, "advanced_source/pendulum.py": { - "needs": "linux.g5.4xlarge.nvidia.gpy", + "needs": "linux.g5.4xlarge.nvidia.gpu", "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." 
}, "intermediate_source/torchvision_tutorial.py": { From dc62ced354b23b9a855ad67dd8ed2894e7d72b00 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 12:42:14 -0700 Subject: [PATCH 12/14] Update compiling_optimizer_lr_scheduler.py --- .../compiling_optimizer_lr_scheduler.py | 59 +++++++++---------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index 33b1ea94304..ae958a09ad0 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -1,5 +1,6 @@ """ (beta) Running the compiled optimizer with an LR Scheduler +============================================================ **Author:** `Michael Lazos `_ """ @@ -37,6 +38,7 @@ ##################################################################### # Setting up and running the compiled optimizer with LR Scheduler # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# # In this section, we'll use the Adam optimizer with LinearLR Scheduler # and create a helper function to wrap the ``step()`` call for each of them # in ``torch.compile()``. @@ -46,7 +48,7 @@ # ``torch.compile`` is only supported on CUDA devices that have a compute capability of 7.0 or higher. -# exit cleanly if we are on a device that doesn't support torch.compile +# exit cleanly if we are on a device that doesn't support ``torch.compile`` if torch.cuda.get_device_capability() < (7, 0): print("Exiting because torch.compile is not supported on this device.") import sys @@ -70,14 +72,6 @@ def fn(): fn() print(opt.param_groups[0]["lr"]) -######################################################################## -# Sample Output: -# -# >> tensor(0.0047) -# >> tensor(0.0060) -# >> tensor(0.0073) -# >> tensor(0.0087) -# >> tensor(0.0100) ###################################################################### # Extension: What happens with a non-tensor LR? @@ -106,28 +100,30 @@ def fn(): ###################################################################### # Sample Output: +# +# .. 
code-block:: bash # -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 -# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 +# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 +# >> triggered by the following guard failure(s): +# >> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 +# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 +# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 +# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 # -# With this example, we can see that we recompile the optimizer 4 additional -# due to the guard failure on the 'lr' in param_groups[0] +# With this example, we can see that we recompile the optimizer 4 additional times +# due to the guard failure on the 'lr' in param_groups[0]. ###################################################################### # Conclusion @@ -139,5 +135,6 @@ def fn(): # with a LinearLR scheduler to demonstrate the LR changing across iterations. # # See also: -# * tutorial on the compiled optimizer - `Compiled optimizer tutorial `_ -# * deeper technical details on the compiled optimizer see `Compiling the optimizer with PT2 `_ +# +# * `Compiled optimizer tutorial `__ - an intro into the compiled optimizer. +# * `Compiling the optimizer with PT2 `__ - deeper technical details on the compiled optimizer. 
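The extension section above demonstrates recompiles when the LR is a plain float. As a complementary check, offered as a sketch that is not part of these patches (the model setup is repeated only so it runs on its own), the same recompile logging can be enabled for the tensor-LR version, where no ``Recompiling function step`` messages are expected after the initial compilation.

.. code-block:: python

    import torch

    model = torch.nn.Sequential(
        *[torch.nn.Linear(1024, 1024, False, device="cuda") for _ in range(10)]
    )
    model(torch.rand(1024, device="cuda")).sum().backward()

    # Tensor LR, as the recipe recommends.
    opt = torch.optim.Adam(model.parameters(), lr=torch.tensor(0.01))
    sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5)

    @torch.compile(fullgraph=False)
    def fn():
        opt.step()
        sched.step()

    # Same logging switch the extension section uses to surface recompiles.
    torch._logging.set_logs(recompiles=True)

    for _ in range(5):
        fn()  # expected: no recompile messages once compilation has happened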
From 45c53eacb3a291060208754e90b88bab715b2304 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 12:43:13 -0700 Subject: [PATCH 13/14] Update compiling_optimizer_lr_scheduler.py --- .../compiling_optimizer_lr_scheduler.py | 25 +------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index ae958a09ad0..2bae48fc7ee 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -99,30 +99,7 @@ def fn(): ###################################################################### -# Sample Output: -# -# .. code-block:: bash -# -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# >>[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adam.py:191 -# >> triggered by the following guard failure(s): -# >> - L['self'].param_groups[0]['lr'] == 0.007333333333333335 -# >> - L['self'].param_groups[0]['lr'] == 0.006000000000000001 -# >> - L['self'].param_groups[0]['lr'] == 0.004666666666666667 -# >> - L['self'].param_groups[0]['lr'] == 0.003333333333333333 -# -# With this example, we can see that we recompile the optimizer 4 additional times +# With this example, we can see that we recompile the optimizer a few times # due to the guard failure on the 'lr' in param_groups[0]. ###################################################################### From 56c0b833d27c58e4903add8aa1bae3e9cf1d6d49 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Tue, 21 May 2024 14:44:07 -0700 Subject: [PATCH 14/14] Update compiling_optimizer_lr_scheduler.py --- recipes_source/compiling_optimizer_lr_scheduler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes_source/compiling_optimizer_lr_scheduler.py b/recipes_source/compiling_optimizer_lr_scheduler.py index 2bae48fc7ee..c0402729403 100644 --- a/recipes_source/compiling_optimizer_lr_scheduler.py +++ b/recipes_source/compiling_optimizer_lr_scheduler.py @@ -7,7 +7,7 @@ ######################################################### # The optimizer is a key algorithm for training any deep learning model. -# In this example, we will show how to pair the an optimizer, which has been compiled using ``torch.compile``, +# In this example, we will show how to pair the optimizer, which has been compiled using ``torch.compile``, # with the LR schedulers to accelerate training convergence. # # .. note:: @@ -100,13 +100,13 @@ def fn(): ###################################################################### # With this example, we can see that we recompile the optimizer a few times -# due to the guard failure on the 'lr' in param_groups[0]. 
+# due to the guard failure on the ``lr`` in ``param_groups[0]``. ###################################################################### # Conclusion # ~~~~~~~~~~ # -# In this tutorial we showed how to pair the ``torch.compile``d optimizer +# In this tutorial we showed how to pair the optimizer compiled with ``torch.compile`` # with an LR Scheduler to accelerate training convergence. We used a model consisting # of a simple sequence of linear layers with the Adam optimizer paired # with a LinearLR scheduler to demonstrate the LR changing across iterations.
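One practical follow-up, offered as a sketch rather than as part of the recipe: with the LR wrapped in a tensor, ``opt.param_groups[0]["lr"]`` holds a tensor instead of a float, so logging or checkpointing code that expects a plain number can call ``.item()`` on it. The single parameter and toy loss below are illustrative assumptions.

.. code-block:: python

    import torch

    # A single parameter stands in for a real model (illustration only).
    p = torch.nn.Parameter(torch.randn(4, device="cuda"))
    opt = torch.optim.Adam([p], lr=torch.tensor(0.01))
    sched = torch.optim.lr_scheduler.LinearLR(opt, total_iters=5)

    for _ in range(5):
        (p * p).sum().backward()
        opt.step()
        sched.step()
        # .item() converts the tensor LR to a plain Python float for logging.
        print(f"current lr: {opt.param_groups[0]['lr'].item():.4f}")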