diff --git a/_static/img/invpendulum.gif b/_static/img/invpendulum.gif new file mode 100644 index 00000000000..3102c5b55cf Binary files /dev/null and b/_static/img/invpendulum.gif differ diff --git a/conf.py b/conf.py index 9e3c3068dc4..94b8f819743 100644 --- a/conf.py +++ b/conf.py @@ -76,6 +76,8 @@ intersphinx_mapping = { "torch": ("https://pytorch.org/docs/stable/", None), + "tensordict": ("https://pytorch-labs.github.io/tensordict/", None), + "torchrl": ("https://pytorch.org/rl/", None), "torchaudio": ("https://pytorch.org/audio/stable/", None), "torchtext": ("https://pytorch.org/text/stable/", None), "torchvision": ("https://pytorch.org/vision/stable/", None), diff --git a/index.rst b/index.rst index d5a44b7efcf..71fcb717403 100644 --- a/index.rst +++ b/index.rst @@ -269,6 +269,13 @@ What's new in PyTorch tutorials? :link: intermediate/reinforcement_q_learning.html :tags: Reinforcement-Learning +.. customcarditem:: + :header: Reinforcement Learning (PPO) with TorchRL + :card_description: Learn how to use PyTorch and TorchRL to train a Proximal Policy Optimization agent on the Inverted Pendulum task from Gym. + :image: _static/img/invpendulum.gif + :link: intermediate/reinforcement_ppo.html + :tags: Reinforcement-Learning + .. customcarditem:: :header: Train a Mario-playing RL Agent :card_description: Use PyTorch to train a Double Q-learning agent to play Mario. @@ -874,6 +881,7 @@ Additional Resources :caption: Reinforcement Learning intermediate/reinforcement_q_learning + intermediate/reinforcement_ppo intermediate/mario_rl_tutorial .. toctree:: diff --git a/intermediate_source/reinforcement_ppo.py b/intermediate_source/reinforcement_ppo.py new file mode 100644 index 00000000000..6b0e8522f8d --- /dev/null +++ b/intermediate_source/reinforcement_ppo.py @@ -0,0 +1,702 @@ +# -*- coding: utf-8 -*- +""" +Reinforcement Learning (PPO) with TorchRL Tutorial +================================================== +**Author**: `Vincent Moens `_ + +This tutorial demonstrates how to use PyTorch and :py:mod:`torchrl` to train a parametric policy +network to solve the Inverted Pendulum task from the `OpenAI-Gym/Farama-Gymnasium +control library `__. + +.. figure:: /_static/img/invpendulum.gif + :alt: Inverted pendulum + + Inverted pendulum + +Key learnings: + +- How to create an environment in TorchRL, transform its outputs, and collect data from this env; +- How to make your classes talk to each other using :class:`tensordict.TensorDict`; +- The basics of building your training loop with TorchRL: + + - How to compute the advantage signal for policy gradient methods; + - How to create a stochastic policy using a probabilistic neural network; + - How to create a dynamic replay buffer and sample from it without repetition. + +We will cover six crucial components of TorchRL: + +* `environments `__ +* `transforms `__ +* `models (policy and value function) `__ +* `loss modules `__ +* `data collectors `__ +* `replay buffers `__ + +""" + +###################################################################### +# If you are running this in Google Colab, make sure you install the following dependencies: +# +# .. code-block:: bash +# +# !pip3 install torchrl +# !pip3 install gym[mujoco] +# !pip3 install tqdm +# +# Proximal Policy Optimization (PPO) is a policy-gradient algorithm where a +# batch of data is being collected and directly consumed to train the policy to maximise +# the expected return given some proximality constraints. 
+# You can think of it as a sophisticated version of `REINFORCE `_,
+# the foundational policy-optimization algorithm. For more information, see the
+# `Proximal Policy Optimization Algorithms `_ paper.
+#
+# PPO is usually regarded as a fast and efficient method for online, on-policy
+# reinforcement learning. TorchRL provides a loss module that does all the work
+# for you, so that you can rely on this implementation and focus on solving your
+# problem rather than re-inventing the wheel every time you want to train a policy.
+#
+# For completeness, here is a brief overview of what the loss computes, even though
+# this is taken care of by our :class:`ClipPPOLoss` module. The algorithm works as follows:
+#
+# 1. We will sample a batch of data by playing the
+#    policy in the environment for a given number of steps.
+# 2. Then, we will perform a given number of optimization steps with random
+#    sub-samples of this batch using a clipped version of the REINFORCE loss.
+# 3. The clipping will put a pessimistic bound on our loss: lower return estimates will
+#    be favored compared to higher ones.
+#
+# The precise formula of the loss is:
+#
+# .. math::
+#
+#     L(s,a,\theta_k,\theta) = \min\left(
+#     \frac{\pi_{\theta}(a|s)}{\pi_{\theta_k}(a|s)}  A^{\pi_{\theta_k}}(s,a), \;\;
+#     g(\epsilon, A^{\pi_{\theta_k}}(s,a))
+#     \right),
+#
+# where :math:`g(\epsilon, A) = (1+\epsilon)A` if :math:`A \geq 0` and
+# :math:`g(\epsilon, A) = (1-\epsilon)A` otherwise.
+#
+# There are two components in that loss: in the first part of the minimum operator,
+# we simply compute an importance-weighted version of the REINFORCE loss (that is, a
+# REINFORCE loss corrected for the fact that the current policy
+# configuration lags the one that was used for the data collection).
+# The second part of that minimum operator is a similar loss where the ratios
+# have been clipped when they exceed or fall below a given pair of thresholds.
+#
+# This loss ensures that whether the advantage is positive or negative, policy
+# updates that would produce significant shifts from the previous configuration
+# are discouraged.
+#
+# This tutorial is structured as follows:
+#
+# 1. First, we will define a set of hyperparameters we will be using for training.
+#
+# 2. Next, we will focus on creating our environment, or simulator, using TorchRL's
+#    wrappers and transforms.
+#
+# 3. Next, we will design the policy network and the value model,
+#    which is indispensable for the loss function. These modules will be used
+#    to configure our loss module.
+#
+# 4. Next, we will create the replay buffer and data loader.
+#
+# 5. Finally, we will run our training loop and analyze the results.
+#
+# Throughout this tutorial, we'll be using the :mod:`tensordict` library.
+# :class:`tensordict.TensorDict` is the lingua franca of TorchRL: it helps us abstract away
+# what a module reads and writes, so that we can worry less about the specific data
+# layout and more about the algorithm itself.
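+#
+# To make the clipped objective concrete, here is a minimal sketch of how it could
+# be computed from log-probabilities and advantages. The tensors below are made up
+# for illustration, and the :class:`ClipPPOLoss` module used later in this tutorial
+# takes care of all of this for us:
+#
+# .. code-block:: python
+#
+#    import torch
+#
+#    log_prob = torch.randn(10, requires_grad=True)  # log-prob under the current policy
+#    log_prob_old = torch.randn(10)  # log-prob under the data-collection policy
+#    advantage = torch.randn(10)  # advantage estimates
+#    epsilon = 0.2
+#
+#    # importance weights between the current and data-collection policies
+#    ratio = (log_prob - log_prob_old).exp()
+#    # unclipped and clipped surrogate objectives
+#    surrogate1 = ratio * advantage
+#    surrogate2 = ratio.clamp(1 - epsilon, 1 + epsilon) * advantage
+#    # we minimize the negative of the pessimistic (clipped) objective
+#    loss = -torch.min(surrogate1, surrogate2).mean()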
+#
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import torch
+from tensordict.nn import TensorDictModule
+from tensordict.nn.distributions import NormalParamExtractor
+from torch import nn
+from torchrl.collectors import SyncDataCollector
+from torchrl.data.replay_buffers import ReplayBuffer
+from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
+from torchrl.data.replay_buffers.storages import LazyTensorStorage
+from torchrl.envs import (
+    Compose,
+    DoubleToFloat,
+    ObservationNorm,
+    StepCounter,
+    TransformedEnv,
+)
+from torchrl.envs.libs.gym import GymEnv
+from torchrl.envs.utils import check_env_specs, set_exploration_mode
+from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
+from torchrl.objectives import ClipPPOLoss
+from torchrl.objectives.value import GAE
+from tqdm import tqdm

+######################################################################
+# Define Hyperparameters
+# ----------------------
+#
+# We set the hyperparameters for our algorithm. Depending on the resources
+# available, one may choose to execute the policy on GPU or on another
+# device.
+# The ``frame_skip`` controls for how many frames a single
+# action is executed. The rest of the arguments that count frames
+# must be corrected for this value (since one environment step will
+# actually return ``frame_skip`` frames).
+#

+device = "cpu" if not torch.has_cuda else "cuda:0"
+num_cells = 256  # number of cells in each layer
+lr = 3e-4
+max_grad_norm = 1.0

+######################################################################
+# Data collection parameters
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# When collecting data, we will be able to choose how big each batch will be
+# by defining a ``frames_per_batch`` parameter. We will also define how many
+# frames (that is, the number of interactions with the simulator) we will allow ourselves to
+# use. In general, the goal of an RL algorithm is to learn to solve the task
+# as fast as it can in terms of environment interactions: the lower the ``total_frames``,
+# the better.
+# We also define a ``frame_skip``: in some contexts, repeating the same action
+# multiple times over the course of a trajectory may be beneficial, as it makes
+# the behavior more consistent and less erratic. However, "skipping"
+# too many frames will hamper training by reducing the reactivity of the actor
+# to observation changes.
+#
+# When using ``frame_skip``, it is good practice to
+# correct the other frame counts by the number of frames we are grouping
+# together. If we configure a total count of X frames for training but
+# use a ``frame_skip`` of Y, we will actually be collecting ``X * Y`` frames in
+# total, which exceeds our predefined budget.
+#
+frame_skip = 1
+frames_per_batch = 1000 // frame_skip
+# For a complete training, bring the number of frames up to 1M
+total_frames = 50_000 // frame_skip

+######################################################################
+# PPO parameters
+# ~~~~~~~~~~~~~~
+#
+# At each data collection (or batch collection), we will run the optimization
+# over a certain number of *epochs*, each time consuming the entire data we just
+# acquired in a nested training loop. Here, the ``sub_batch_size`` is different from the
+# ``frames_per_batch`` above: recall that we are working with a "batch of data"
+# coming from our collector, whose size is defined by ``frames_per_batch``, and which
+# we will further split into smaller sub-batches during the inner training loop.
+# The size of these sub-batches is controlled by ``sub_batch_size``. For
+# instance, with ``frames_per_batch=1000`` and ``sub_batch_size=64``, each epoch
+# will run ``1000 // 64 = 15`` optimization steps.
+#
+sub_batch_size = 64  # cardinality of the sub-samples gathered from the current data in the inner loop
+num_epochs = 10  # optimization steps per batch of data collected
+clip_epsilon = (
+    0.2  # clip value for PPO loss: see the equation in the intro for more context.
+)
+gamma = 0.99
+lmbda = 0.95
+entropy_eps = 1e-4

+######################################################################
+# Define an environment
+# ---------------------
+#
+# In RL, an *environment* is usually the way we refer to a simulator or a
+# control system. Various libraries provide simulation environments for reinforcement
+# learning, including Gymnasium (previously OpenAI Gym), DeepMind control suite, and
+# many others.
+# As a general-purpose library, TorchRL's goal is to provide an interchangeable interface
+# to a large panel of RL simulators, allowing you to easily swap one environment
+# for another. For example, creating a wrapped gym environment can be achieved in a few lines of code:
+#

+base_env = GymEnv("InvertedDoublePendulum-v4", device=device, frame_skip=frame_skip)

+######################################################################
+# There are a few things to notice in this code: first, we created
+# the environment by calling the ``GymEnv`` wrapper. If extra keyword arguments
+# are passed, they will be transmitted to the ``gym.make`` method, hence covering
+# the most common environment construction commands.
+# Alternatively, one could also directly create a gym environment using ``gym.make(env_name, **kwargs)``
+# and wrap it in a ``GymWrapper`` class.
+#
+# Note the ``device`` argument: for gym, this only controls the device where
+# the input actions and observed states will be stored, but the execution will always
+# be done on CPU. The reason for this is simply that gym does not support on-device
+# execution, unless specified otherwise. For other libraries, we have control over
+# the execution device and, as much as we can, we try to stay consistent in terms of
+# storing and execution backends.
+#
+# Transforms
+# ~~~~~~~~~~
+#
+# We will append some transforms to our environments to prepare the data for
+# the policy. In Gym, this is usually achieved via wrappers. TorchRL takes a different
+# approach, more similar to other PyTorch domain libraries, through the use of transforms.
+# To add transforms to an environment, one should simply wrap it in a :class:`TransformedEnv`
+# instance and append the sequence of transforms to it. The transformed environment will inherit
+# the device and meta-data of the wrapped environment, and transform these depending on the sequence
+# of transforms it contains.
+#
+# Normalization
+# ~~~~~~~~~~~~~
+#
+# The first transform to encode is a normalization transform.
+# As a rule of thumb, it is preferable to have data that loosely
+# matches a unit Gaussian distribution: to obtain this, we will
+# run a certain number of random steps in the environment and compute
+# the summary statistics of these observations.
+#
+# We'll append two other transforms: the :class:`DoubleToFloat` transform will
+# convert double entries to single-precision numbers, ready to be read by the
+# policy. The :class:`StepCounter` transform will be used to count the steps before
+# the environment is terminated. We will use this count as a supplementary measure
+# of performance.
+#
+# As we will see later, many of TorchRL's classes rely on :class:`tensordict.TensorDict`
+# to communicate.
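+# You can think of it as a Python dictionary with some extra tensor features:
+# it carries a batch size, and indexing or reshaping it applies to every entry
+# at once. Here is a minimal sketch (the keys below are made up for
+# illustration; the entries our environment actually produces are shown later):
+#
+# .. code-block:: python
+#
+#    import torch
+#    from tensordict import TensorDict
+#
+#    # a batch of 3 elements; every entry shares the leading batch dimension
+#    data = TensorDict(
+#        {"observation": torch.zeros(3, 4), "action": torch.zeros(3, 1)},
+#        batch_size=[3],
+#    )
+#    print(data["observation"].shape)  # torch.Size([3, 4])
+#    print(data[0])  # indexing applies to every entry at once
+#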
+# In practice, this means that many modules we will be working
+# with need to be told what key to read (``in_keys``) and what key to write
+# (``out_keys``) in the tensordict they will receive. Usually, if ``out_keys``
+# is omitted, it is assumed that the ``in_keys`` entries will be updated
+# in-place. For our transforms, the only entry we are interested in is referred
+# to as ``"observation"`` and our transform layers will be told to modify this
+# entry and this entry only:
+#

+env = TransformedEnv(
+    base_env,
+    Compose(
+        # normalize observations
+        ObservationNorm(in_keys=["observation"]),
+        DoubleToFloat(in_keys=["observation"]),
+        StepCounter(),
+    ),
+)

+######################################################################
+# As you may have noticed, we have created a normalization layer but we did not
+# set its normalization parameters. To do this, :class:`ObservationNorm` can
+# automatically gather the summary statistics of our environment:
+#
+env.transform[0].init_stats(num_iter=1000, reduce_dim=0, cat_dim=0)

+######################################################################
+# The :class:`ObservationNorm` transform has now been populated with a
+# location and a scale that will be used to normalize the data.
+#
+# Let us do a little sanity check for the shape of our summary stats:
+#
+print("normalization constant shape:", env.transform[0].loc.shape)

+######################################################################
+# An environment is not only defined by its simulator and transforms, but also
+# by a series of metadata that describe what can be expected during its
+# execution.
+# For efficiency purposes, TorchRL is quite stringent when it comes to
+# environment specs, but you can easily check that your environment specs are
+# adequate.
+# In our example, the :class:`GymWrapper` and the :class:`GymEnv` that inherits
+# from it already take care of setting the proper specs for your environment, so
+# you should not have to worry about this.
+#
+# Nevertheless, let's see a concrete example using our transformed
+# environment by looking at its specs.
+# There are three specs to look at: ``observation_spec``, which defines what
+# is to be expected when executing an action in the environment;
+# ``reward_spec``, which indicates the reward domain; and finally the
+# ``input_spec`` (which contains the ``action_spec``), which represents
+# everything an environment requires to execute a single step.
+#
+print("observation_spec:", env.observation_spec)
+print("reward_spec:", env.reward_spec)
+print("input_spec:", env.input_spec)
+print("action_spec (as defined by input_spec):", env.action_spec)

+######################################################################
+# The :func:`check_env_specs` function runs a small rollout and compares its
+# output against the environment specs. If no error is raised, we can be
+# confident that the specs are properly defined:
+#
+check_env_specs(env)

+######################################################################
+# For fun, let's see what a simple random rollout looks like. You can
+# call ``env.rollout(n_steps)`` and get an overview of what the environment inputs
+# and outputs look like. Actions will automatically be drawn from the action spec
+# domain, so you don't need to care about designing a random sampler.
+#
+# Typically, at each step, an RL environment receives an
+# action as input, and outputs an observation, a reward and a done state.
+# The observation may be composite, meaning that it could be composed of more than one
+# tensor. This is not a problem for TorchRL, since the whole set of observations
+# is automatically packed in the output :class:`tensordict.TensorDict`. After executing a rollout
+# (i.e., a sequence of environment steps and random action generations) over a given
+# number of steps, we will retrieve a :class:`tensordict.TensorDict` instance with a shape
+# that matches this trajectory length:
+#
+rollout = env.rollout(3)
+print("rollout of three steps:", rollout)
+print("Shape of the rollout TensorDict:", rollout.batch_size)

+######################################################################
+# Our rollout data has a shape of ``torch.Size([3])``, which matches the number of steps
+# we ran it for. The ``"next"`` entry points to the data coming after the current step.
+# In most cases, the ``"next"`` data at time ``t`` matches the data at ``t+1``, but this
+# may not be the case if we are using some specific transformations (e.g., multi-step).
+#
+# Policy
+# ------
+#
+# PPO utilizes a stochastic policy to handle exploration. This means that our
+# neural network will have to output the parameters of a distribution, rather
+# than a single value corresponding to the action taken.
+#
+# As the data is continuous, we use a Tanh-Normal distribution to respect the
+# action space boundaries. TorchRL provides such a distribution, and the only
+# thing we need to care about is to build a neural network that outputs the
+# right number of parameters for the policy to work with (a location, or mean,
+# and a scale):
+#
+# .. math::
+#
+#     f_{\theta}(\text{observation}) = \mu_{\theta}(\text{observation}), \sigma^{+}_{\theta}(\text{observation})
+#
+# The only extra difficulty that is brought up here is to split our output in two
+# equal parts and map the second to a strictly positive space.
+#
+# We design the policy in three steps:
+#
+# 1. Define a neural network ``D_obs`` -> ``2 * D_action``. Indeed, our ``loc`` (mu) and ``scale`` (sigma) both have dimension ``D_action``;
+#
+# 2. Append a :class:`NormalParamExtractor` to extract a location and a scale (i.e., split the input in two equal parts
+#    and apply a positive transformation to the scale parameter);
+#
+# 3. Create a probabilistic :class:`TensorDictModule` that can build this distribution and sample from it.
+#

+actor_net = nn.Sequential(
+    nn.LazyLinear(num_cells, device=device),
+    nn.Tanh(),
+    nn.LazyLinear(num_cells, device=device),
+    nn.Tanh(),
+    nn.LazyLinear(num_cells, device=device),
+    nn.Tanh(),
+    nn.LazyLinear(2 * env.action_spec.shape[-1], device=device),
+    NormalParamExtractor(),
+)

+######################################################################
+# To enable the policy to "talk" with the environment through the tensordict
+# data carrier, we wrap the ``nn.Module`` in a :class:`TensorDictModule`. This
+# class will simply read the ``in_keys`` it is provided with and write the
+# outputs in-place at the registered ``out_keys``.
+#
+policy_module = TensorDictModule(
+    actor_net, in_keys=["observation"], out_keys=["loc", "scale"]
+)

+######################################################################
+# We now need to build a distribution out of the location and scale of our
+# normal distribution. To do so, we instruct the :class:`ProbabilisticActor`
+# class to build a :class:`TanhNormal` out of the location and scale
+# parameters.
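+# Schematically, a sample from such a distribution is obtained by drawing a value
+# from the underlying Normal distribution and squashing it into the action bounds
+# through a rescaled ``tanh`` (a simplified view of the actual implementation):
+#
+# .. math::
+#
+#     x \sim \mathcal{N}(\mu, \sigma), \qquad
+#     a = a_{\min} + \frac{a_{\max} - a_{\min}}{2} \left(\tanh(x) + 1\right).
+#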
+# We also provide the minimum and maximum values of this
+# distribution, which we gather from the environment specs.
+#
+# The name of the ``in_keys`` (and hence the name of the ``out_keys`` from
+# the :class:`TensorDictModule` above) cannot be set to any value one may
+# like, as the :class:`TanhNormal` distribution constructor will expect the
+# ``loc`` and ``scale`` keyword arguments. That being said,
+# :class:`ProbabilisticActor` also accepts ``Dict[str, str]`` typed ``in_keys``
+# where the key-value pair indicates what ``in_key`` string should be used
+# for each keyword argument.
+#
+policy_module = ProbabilisticActor(
+    module=policy_module,
+    spec=env.action_spec,
+    in_keys=["loc", "scale"],
+    distribution_class=TanhNormal,
+    distribution_kwargs={
+        "min": env.action_spec.space.minimum,
+        "max": env.action_spec.space.maximum,
+    },
+    return_log_prob=True,
+    # we'll need the log-prob for the numerator of the importance weights
+)

+######################################################################
+# Value network
+# -------------
+#
+# The value network is a crucial component of the PPO algorithm, even though it
+# won't be used at inference time. This module will read the observations and
+# return an estimation of the discounted return for the following trajectory.
+# This allows us to amortize learning by relying on some utility estimate that
+# is learned on the fly during training. Our value network shares the same
+# structure as the policy, but for simplicity we assign it its own set of
+# parameters.
+#
+value_net = nn.Sequential(
+    nn.LazyLinear(num_cells, device=device),
+    nn.Tanh(),
+    nn.LazyLinear(num_cells, device=device),
+    nn.Tanh(),
+    nn.LazyLinear(num_cells, device=device),
+    nn.Tanh(),
+    nn.LazyLinear(1, device=device),
+)

+value_module = ValueOperator(
+    module=value_net,
+    in_keys=["observation"],
+)

+######################################################################
+# Let's try our policy and value modules. As we said earlier, the usage of
+# :class:`TensorDictModule` makes it possible to directly read the output
+# of the environment to run these modules, as they know what information to read
+# and where to write it:
+#
+print("Running policy:", policy_module(env.reset()))
+print("Running value:", value_module(env.reset()))

+######################################################################
+# Data collector
+# --------------
+#
+# TorchRL provides a set of :class:`DataCollector` classes. Briefly, these
+# classes execute three operations: reset an environment, compute an action
+# given the latest observation, execute a step in the environment, and repeat
+# the last two steps until the environment reaches a stop signal (or ``"done"``
+# state).
+#
+# They allow you to control how many frames to collect at each iteration
+# (through the ``frames_per_batch`` parameter),
+# when to reset the environment (through the ``max_frames_per_traj`` argument),
+# on which ``device`` the policy should be executed, etc. They are also
+# designed to work efficiently with batched and multiprocessed environments.
+#
+# The simplest data collector is the :class:`SyncDataCollector`: it is an
+# iterator that you can use to get batches of data of a given length, and
+# that will stop once a total number of frames (``total_frames``) have been
+# collected.
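+# In a nutshell, iterating over it could look like this (a sketch; the actual
+# loop we use is shown in the training section below):
+#
+# .. code-block:: python
+#
+#    for data in collector:
+#        # each ``data`` is a TensorDict holding ``frames_per_batch`` transitions
+#        print(data.batch_size)  # torch.Size([1000])
+#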
+# Other data collectors (``MultiSyncDataCollector`` and
+# ``MultiaSyncDataCollector``) will execute the same operations in a synchronous
+# and asynchronous manner, respectively, over a set of multiprocessed workers.
+#
+# As with the policy and environment before, the data collector will return
+# :class:`tensordict.TensorDict` instances with a total number of elements that will
+# match ``frames_per_batch``. Using :class:`tensordict.TensorDict` to pass data to the
+# training loop allows you to write data-loading pipelines
+# that are 100% oblivious to the actual specificities of the rollout content.
+#
+collector = SyncDataCollector(
+    env,
+    policy_module,
+    frames_per_batch=frames_per_batch,
+    total_frames=total_frames,
+    split_trajs=False,
+    device=device,
+)

+######################################################################
+# Replay buffer
+# -------------
+#
+# Replay buffers are a common building block of off-policy RL algorithms.
+# In on-policy contexts, a replay buffer is refilled every time a batch of
+# data is collected, and its data is repeatedly consumed for a certain number
+# of epochs.
+#
+# TorchRL's replay buffers are built using a common container,
+# :class:`ReplayBuffer`, which takes as arguments the components of the buffer:
+# a storage, a writer, a sampler and possibly some transforms. Only the
+# storage (which indicates the replay buffer capacity) is mandatory. We
+# also specify a sampler without repetition to avoid sampling the same item
+# multiple times in one epoch.
+# Using a replay buffer for PPO is not mandatory and we could simply
+# sample the sub-batches from the collected batch, but using these classes
+# makes it easy for us to build the inner training loop in a reproducible way.
+#

+replay_buffer = ReplayBuffer(
+    storage=LazyTensorStorage(frames_per_batch),
+    sampler=SamplerWithoutReplacement(),
+)

+######################################################################
+# Loss function
+# -------------
+#
+# The PPO loss can be directly imported from TorchRL for convenience using the
+# :class:`ClipPPOLoss` class. This is the easiest way of utilizing PPO:
+# it hides away the mathematical operations of PPO and the control flow that
+# goes with it.
+#
+# PPO requires some "advantage estimation" to be computed. In short, an advantage
+# is a value that reflects an expectation of the return while dealing with
+# the bias/variance tradeoff.
+# To compute the advantage, one just needs to (1) build the advantage module, which
+# utilizes our value operator, and (2) pass each batch of data through it before each
+# epoch.
+# The GAE module will update the input tensordict with new ``"advantage"`` and
+# ``"value_target"`` entries.
+# The ``"value_target"`` is a gradient-free tensor that represents the empirical
+# value that the value network should approximate given the input observation.
+# Both of these will be used by :class:`ClipPPOLoss` to
+# return the policy and value losses.
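+#
+# For reference, GAE computes the advantage as an exponentially weighted sum of
+# temporal-difference errors:
+#
+# .. math::
+#
+#     A_t = \sum_{l \geq 0} (\gamma \lambda)^{l} \delta_{t+l}, \qquad
+#     \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t),
+#
+# where :math:`\gamma` is the discount factor and :math:`\lambda` trades bias for
+# variance (:math:`\lambda = 0` gives the one-step TD error, while :math:`\lambda = 1`
+# recovers the Monte-Carlo return minus the value baseline).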
+#

+advantage_module = GAE(
+    gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True
+)

+loss_module = ClipPPOLoss(
+    actor=policy_module,
+    critic=value_module,
+    advantage_key="advantage",
+    clip_epsilon=clip_epsilon,
+    entropy_bonus=bool(entropy_eps),
+    entropy_coef=entropy_eps,
+    # these keys match by default but we set this for completeness
+    value_target_key=advantage_module.value_target_key,
+    critic_coef=1.0,
+    gamma=0.99,
+    loss_critic_type="smooth_l1",
+)

+optim = torch.optim.Adam(loss_module.parameters(), lr)
+scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+    optim, total_frames // frames_per_batch, 0.0
+)

+######################################################################
+# Training loop
+# -------------
+# We now have all the pieces needed to code our training loop.
+# The steps include:
+#
+# * Collect data
+#
+#   * Compute advantage
+#
+#     * Loop over the collected data to compute loss values
+#     * Back propagate
+#     * Optimize
+#     * Repeat
+#
+#   * Repeat
+#
+# * Repeat
+#

+logs = defaultdict(list)
+pbar = tqdm(total=total_frames * frame_skip)
+eval_str = ""

+# We iterate over the collector until it reaches the total number of frames it was
+# designed to collect:
+for i, tensordict_data in enumerate(collector):
+    # We now have a batch of data to work with. Let's learn something from it.
+    for _ in range(num_epochs):
+        # We'll need an "advantage" signal to make PPO work.
+        # We re-compute it at each epoch as its value depends on the value
+        # network, which is updated in the inner loop.
+        advantage_module(tensordict_data)
+        data_view = tensordict_data.reshape(-1)
+        replay_buffer.extend(data_view.cpu())
+        for _ in range(frames_per_batch // sub_batch_size):
+            subdata, *_ = replay_buffer.sample(sub_batch_size)
+            loss_vals = loss_module(subdata.to(device))
+            loss_value = (
+                loss_vals["loss_objective"]
+                + loss_vals["loss_critic"]
+                + loss_vals["loss_entropy"]
+            )

+            # Optimization: backward, grad clipping and optimization step
+            loss_value.backward()
+            # this is not strictly mandatory but it's good practice to keep
+            # your gradient norm bounded
+            torch.nn.utils.clip_grad_norm_(loss_module.parameters(), max_grad_norm)
+            optim.step()
+            optim.zero_grad()

+    logs["reward"].append(tensordict_data["next", "reward"].mean().item())
+    pbar.update(tensordict_data.numel() * frame_skip)
+    cum_reward_str = (
+        f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
+    )
+    logs["step_count"].append(tensordict_data["step_count"].max().item())
+    stepcount_str = f"step count (max): {logs['step_count'][-1]}"
+    logs["lr"].append(optim.param_groups[0]["lr"])
+    lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"
+    if i % 10 == 0:
+        # We evaluate the policy once every 10 batches of data.
+        # Evaluation is rather simple: execute the policy without exploration
+        # (take the expected value of the action distribution) for a given
+        # number of steps (1000, which is our env horizon).
+        # The ``rollout`` method of the env can take a policy as argument:
+        # it will then execute this policy at each step.
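+        # ``set_exploration_mode("mean")`` makes the stochastic policy return the
+        # mean of its action distribution instead of a random sample, giving a
+        # deterministic view of what the policy has learned.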
+        with set_exploration_mode("mean"), torch.no_grad():
+            # execute a rollout with the trained policy
+            eval_rollout = env.rollout(1000, policy_module)
+            logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())
+            logs["eval reward (sum)"].append(
+                eval_rollout["next", "reward"].sum().item()
+            )
+            logs["eval step_count"].append(eval_rollout["step_count"].max().item())
+            eval_str = (
+                f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} "
+                f"(init: {logs['eval reward (sum)'][0]: 4.4f}), "
+                f"eval step-count: {logs['eval step_count'][-1]}"
+            )
+            del eval_rollout
+    pbar.set_description(", ".join([eval_str, cum_reward_str, stepcount_str, lr_str]))

+    # We're also using a learning rate scheduler. Like the gradient clipping,
+    # this is a nice-to-have but nothing necessary for PPO to work.
+    scheduler.step()

+######################################################################
+# Results
+# -------
+#
+# Before the 1M-frame cap is reached, the algorithm should have reached a maximum
+# step count of 1000 steps, which is the maximum number of steps before a
+# trajectory is truncated.
+#
+plt.figure(figsize=(10, 10))
+plt.subplot(2, 2, 1)
+plt.plot(logs["reward"])
+plt.title("training rewards (average)")
+plt.subplot(2, 2, 2)
+plt.plot(logs["step_count"])
+plt.title("Max step count (training)")
+plt.subplot(2, 2, 3)
+plt.plot(logs["eval reward (sum)"])
+plt.title("Return (test)")
+plt.subplot(2, 2, 4)
+plt.plot(logs["eval step_count"])
+plt.title("Max step count (test)")
+plt.show()

+######################################################################
+# Conclusion and next steps
+# -------------------------
+#
+# In this tutorial, we have learned:
+#
+# 1. How to create and customize an environment with :py:mod:`torchrl`;
+# 2. How to write a model and a loss function;
+# 3. How to set up a typical training loop.
+#
+# If you want to experiment with this tutorial a bit more, you can apply the following modifications:
+#
+# * From an efficiency perspective,
+#   we could run several simulations in parallel to speed up data collection.
+#   Check :class:`torchrl.envs.ParallelEnv` for further information.
+#
+# * From a logging perspective, one could add a :class:`torchrl.record.VideoRecorder` transform to
+#   the environment after enabling rendering, to obtain a visual recording of the
+#   inverted pendulum in action. Check :py:mod:`torchrl.record` to
+#   learn more.
+#
diff --git a/requirements.txt b/requirements.txt
index 7f6760428f5..321ce5a3ca9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,6 +25,7 @@ tensorboard
 jinja2==3.0.3
 pytorch-lightning
 torchx
+torchrl
 ax-platform
 nbformat>=4.2.0
 datasets
@@ -50,9 +51,9 @@ scikit-image
 scipy
 pillow==9.3.0
 wget
-gym==0.25.1
 gym-super-mario-bros==7.4.0
-gymnasium==0.27.0
+pyopengl
+gymnasium[mujoco]==0.27.0
 timm
 iopath
 pygame==2.1.2