# Reinforce

Tutorial: https://huggingface.co/learn/deep-rl-course/unit4/introduction

In [1]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [2]:
import pytest
import ipytest

ipytest.autoconfig()

# Environment

First we will create the cartpole environment.

The observation_space of cartpole is a 4-dimensional float vector,
and the action_space is a discrete space with 2 possible actions (left or right).

In [3]:
import gym

env = gym.make("CartPole-v1")
observation_space_shape = env.observation_space.shape
action_space_size = env.action_space.n  # type: ignore
print("State size:", observation_space_shape)
print("Action size:", action_space_size)
state = env.reset()
print(f"Example state: {state}")
action_return = env.step(1)
print(f"Action return: {action_return}")

State size: (4,)
Action size: 2
Example state: (array([ 0.01437391, -0.01910752,  0.00174418,  0.04432971], dtype=float32), {})
Action return: (array([ 0.01399176,  0.17598937,  0.00263077, -0.2478024 ], dtype=float32), 1.0, False, False, {})


  if not isinstance(terminated, (bool, np.bool8)):


# Model

This is the policy network, in the paper represented by $\pi_{\theta}(s_t)$

Meaning the policy $\pi$ given the parameters $\theta$ (which in this code
represents the weights and biases of self.input, self.hidden and self.output) when
doing a forward pass with the state $s$ at time $t$ as input.

The network is very simple feed forward network, with relu activation functions and a softmax output.

The output of the forward method is what the paper calls $\pi_{\theta}(a_i | s_t)$, which is a PDF due to the `softmax`.

The action method is a translation from a numpy state vector into an int action, using the forward pass of the network and the REINFORCE score function.


In [4]:
from typing import NewType
import numpy.typing as npt
import numpy as np

# Lets make some types to make type annotation easier
State = NewType("State", npt.NDArray[np.float64])
Action = NewType("Action", int)
Reward = NewType("Reward", float)

In [5]:
from typing import List, Tuple
from torch import nn
from torch.distributions import Categorical


class Policy(nn.Module):
    def __init__(
        self, state_size: int, action_size: int, hidden_sizes: List[int]
    ) -> None:
        super().__init__()
        assert len(hidden_sizes) > 0, "Need at least one hidden layer"
        network = [nn.Linear(state_size, hidden_sizes[0]), nn.ReLU()]
        for i in range(len(hidden_sizes) - 1):
            network.append(nn.Linear(hidden_sizes[i], hidden_sizes[i + 1]))
            network.append(nn.ReLU())
        network.append(nn.Linear(hidden_sizes[-1], action_size))
        network.append(nn.Softmax())
        self.network = nn.Sequential(*network)

    def forward(self, state: torch.Tensor) -> torch.Tensor:
        return self.network(state)

    def act(self, state: State) -> Tuple[Action, float]:
        # First we got to convert out of numpy and into pytorch
        state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)
        print(f"state: {state}")
        print(f"state_unsqueezed: {np.expand_dims(state, axis=0)}")
        pdf = self.forward(
            state_tensor
        ).cpu()  # TODO: If softmax produces a PDF, why do we need Categorical to sample from it?
        print(f"PDF: {pdf}")
        multinomial = Categorical(
            pdf
        )  # TODO: Study up on multinomial distributions and log probs
        # TODO: Take the argmax of the pdf instead and see what happens!
        print(f"Multinomial: {multinomial}")
        action_idx = np.argmax(multinomial)  # type: ignore
        return Action(action_idx.item()), multinomial.log_prob(action_idx)

# Training

Training is done by assembling a sample of trajectories, which are lists of tuples of (state, action, reward).

In [6]:
from dataclasses import dataclass


@dataclass
class SAR:
    state: State
    action: Action
    reward: Reward
    log_prob: float


Trajectory = NewType("Trajectory", List[SAR])
RewardTrajectory = NewType("RewardTrajectory", List[Reward])

In [7]:
def collect_episode(policy: Policy) -> Tuple[Trajectory, Reward]:
    """Returns the trajectory and the sum of all rewards."""
    state, _ = env.reset()
    done = False
    trajectory = []
    while not done:
        action, log_prob = policy.act(state)
        state, reward, done, _, _ = env.step(action)
        trajectory.append(
            SAR(
                state=State(state),
                action=action,
                reward=Reward(reward),
                log_prob=log_prob,
            )
        )
    return Trajectory(trajectory), Reward(sum(sar.reward for sar in trajectory))

This represents the formula $R(\tau)$ in the tutorial. It's a simple reward decay formula.

In [8]:
def cumulative_return(trajectory: RewardTrajectory, gamma: float = 0.5) -> float:
    if len(trajectory) == 0:
        raise ValueError("Trajectory needs at least one item.")
    if len(trajectory) == 1:
        return 0.0
    out: float = trajectory[1]
    if len(trajectory) == 2:
        return out
    for i in range(2, len(trajectory)):
        out += gamma * trajectory[i]
        gamma *= gamma
    return out


# Its important to test equations like this!
@pytest.mark.parametrize(
    "test_input,expected",
    [([0], 0), ([1, 1], 1), ([1, 1, 1], 1.5), ([1, 1, 1, 1], 1.75)],
)
def test_cumulative_return(test_input: RewardTrajectory, expected: float) -> None:
    assert cumulative_return(test_input, gamma=0.5) == expected

# Run Tests

In [9]:
ipytest.run("-vv")

platform darwin -- Python 3.10.6, pytest-8.1.1, pluggy-1.4.0 -- /Users/ryan.peach/Library/Caches/pypoetry/virtualenvs/continuing-education-vJKa4-To-py3.10/bin/python
cachedir: .pytest_cache
rootdir: /Users/ryan.peach/Documents/ryanpeach/continuing_education
configfile: pyproject.toml
plugins: anyio-4.3.0
[1mcollecting ... [0mcollected 4 items

t_bd785e35317a4246a16b95ee6dbc4fd0.py::test_cumulative_return[test_input0-0] [32mPASSED[0m[32m          [ 25%][0m
t_bd785e35317a4246a16b95ee6dbc4fd0.py::test_cumulative_return[test_input1-1] [32mPASSED[0m[32m          [ 50%][0m
t_bd785e35317a4246a16b95ee6dbc4fd0.py::test_cumulative_return[test_input2-1.5] [32mPASSED[0m[32m        [ 75%][0m
t_bd785e35317a4246a16b95ee6dbc4fd0.py::test_cumulative_return[test_input3-1.75] [32mPASSED[0m[32m       [100%][0m



<ExitCode.OK: 0>