In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

if __name__ == "__main__":
    # Jupyter kernels do not define __file__, so build the notebook's path
    # from the resolved current working directory instead.
    __this_file = Path(".").resolve().joinpath("actor_critic.ipynb")

In [2]:
import torch

# Run on the first CUDA device when one is available; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Only report the selected device when executed directly (not on import).
if __name__ == "__main__":
    print(DEVICE)

cuda:0


In [3]:
from torch import nn

# Actor Critic

Actor-Critic hybridizes policy-based methods (the actor) and value-based methods (the critic). This gives you many of the advantages of both: for example, you can use the off-policy learning of value-based methods (though A2C does not), while keeping the continuous action spaces and stochasticity of policy-based methods.

The actor is a policy that outputs a probability distribution over actions. The critic is a value function that estimates the expected return of a state. The actor is trained to maximize the expected return, while the critic is trained to minimize the error between the estimated return and the actual return.

The objective function is the same as in REINFORCE, except that the baseline is the value network's estimate of the expected return rather than a batch average, and we subtract that value from the return to get the advantage. With a one-step return this is $A(s_t, a_t) = r_t + \gamma V(s_{t+1}) - V(s_t)$. Having a better baseline stabilizes the scale of the policy gradient.

The Hugging Face tutorial on A2C wants us to use stable-baselines3, but I call that cheating. Instead, we will implement A2C from scratch using PyTorch.

In [None]:
from continuing_education.policy_gradient_methods.reinforce import SamplePolicy


class Actor(SamplePolicy):
    """Policy network for the actor.

    Adds nothing to ``SamplePolicy``: the actor is exactly the policy we use in
    REINFORCE, re-exported under the name used by actor-critic methods.
    """

In [None]:
from continuing_education.lib.episodes import SARS


class ValueCritic(nn.Module):
    """State-value network V(s) that serves as the critic.

    This is a value network, rather than a Q network, to reduce the number of
    samples needed to train the network.

    The network is a multi-layer perceptron
    ``state_size -> hidden_sizes[0] -> ... -> hidden_sizes[-1] -> 1`` with a ReLU
    after every hidden layer and no activation on the scalar output.

    Subclassing ``nn.Module`` makes the weights reachable through the standard
    ``parameters()`` / ``state_dict()`` / ``train()`` / ``eval()`` API, so the
    critic can be handed directly to an optimizer.
    """

    def __init__(
        self, *, state_size: int, hidden_sizes: list[int], gamma: float
    ) -> None:
        """Build the value network and move it to ``DEVICE``.

        Args:
            state_size: Number of features in a single state vector.
            hidden_sizes: Width of each hidden layer, in order. Must be non-empty.
            gamma: Discount factor applied to the next state's value in
                ``advantage``.

        Raises:
            AssertionError: If ``hidden_sizes`` is empty.
        """
        super().__init__()
        assert len(hidden_sizes) > 0, "Need at least one hidden layer"
        self.gamma = gamma
        self.state_size = state_size
        self.hidden_sizes = hidden_sizes

        # Each nn.Linear maps (..., in_features) -> (..., out_features), so a
        # (batch_size, state_size) input produces a (batch_size, 1) output.
        widths = [state_size, *hidden_sizes]
        layers: list[nn.Module] = []
        for in_features, out_features in zip(widths, widths[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
        # Final projection to a single scalar value; no activation, because the
        # value estimate is unbounded.
        layers.append(nn.Linear(hidden_sizes[-1], 1))
        self.network = nn.Sequential(*layers).to(DEVICE)

    def forward(self, state: torch.Tensor) -> torch.Tensor:
        """Return the estimated value V(state).

        Args:
            state: State tensor whose last dimension is ``state_size``. It is
                moved to ``DEVICE`` before being fed to the network.

        Returns:
            The network output, with a trailing dimension of size 1.
        """
        return self.network(state.to(DEVICE))

    def advantage(self, sars: SARS) -> torch.Tensor:
        """Compute the one-step advantage ``r + gamma * V(s') - V(s)``.

        Args:
            sars: Transition providing ``state``, ``next_state`` and ``reward``.

        Returns:
            The advantage estimate as a tensor on ``DEVICE``. No ``detach`` or
            ``no_grad`` is applied here, so gradients flow through both value
            terms; detach at the call site if the bootstrap target should be
            treated as a constant.

        NOTE(review): the next-state value is always bootstrapped; terminal
        transitions are not masked here -- confirm the caller handles them.
        """
        # Cast to float32 explicitly: observations that arrive as float64 NumPy
        # arrays would otherwise fail against the float32 nn.Linear weights.
        state = torch.as_tensor(sars.state, dtype=torch.float32, device=DEVICE)
        next_state = torch.as_tensor(
            sars.next_state, dtype=torch.float32, device=DEVICE
        )
        return sars.reward + self.gamma * self(next_state) - self(state)

# References

1. Mnih, V., Badia, A. P., Mirza, M., Graves, A., Lillicrap, T. P., Harley, T., … Kavukcuoglu, K. (2016). Asynchronous Methods for Deep Reinforcement Learning. arXiv [Cs.LG]. Retrieved from http://arxiv.org/abs/1602.01783
2. https://huggingface.co/blog/deep-rl-a2c
3. UNIT 6. ACTOR CRITIC METHODS WITH ROBOTICS ENVIRONMENTS. Hugging Face. (n.d.). https://huggingface.co/learn/deep-rl-course/unit6/introduction
4. https://samuelebolotta.medium.com/3-actor-critic-algorithms-779f14465b74