Install Gymnasium

Import the prerequisite packages

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions
import numpy as np
import gymnasium as gym  


ModuleNotFoundError: No module named 'torch'

Check the available environments

Create a new CartPole-v1 environment

In [4]:
# NOTE(review): duplicates the gymnasium import in the first cell; harmless but redundant.
import gymnasium as gym
# Build a CartPole-v1 environment instance and bind it to the notebook-level name `env`.
env = gym.make('CartPole-v1')


Check the observation space

In [5]:
# Display the observation space: its lower bounds, upper bounds, shape and dtype.
print("observation space: ", env.observation_space)

observation space:  Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)


Check an example of an observation

In [6]:
# reset() returns a pair: the first observation and an auxiliary info value.
observation, info = env.reset()
print("observation: ", observation)


observation:  [-0.01170683 -0.04804509  0.04595344 -0.0026174 ]


Check the action space

In [7]:
# Display the action space, i.e. the set of actions the agent may pass to env.step().
print("action space: ", env.action_space)

action space:  Discrete(2)


In [8]:
# Rebind `env` to a brand-new CartPole-v1 instance for training.
# NOTE(review): the instance created earlier is replaced without being closed.
env = gym.make('CartPole-v1')

Choose a random seed for the training 

In [9]:
# Single seed value shared by the environment, NumPy and PyTorch.
SEED = 1234

# Reset the environment with the seed; the trailing semicolon suppresses the
# cell's output (the value returned by reset()).
env.reset(seed=SEED);

In [10]:
# Seed NumPy's and PyTorch's global random number generators with the same value.
np.random.seed(SEED)
# manual_seed returns the Generator object, which the notebook echoes as the cell output.
torch.manual_seed(SEED)


<torch._C.Generator at 0x7fc22b771790>

Create the neural net for the policy

In [11]:
class PolicyNetwork(nn.Module):
    """Two-layer fully connected network producing one raw score per action.

    The forward pass returns unnormalized scores; no softmax is applied here.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output scores.
        dropout: Dropout probability applied to the hidden layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()

        # The attribute names become the parameter names of the module, and the
        # creation order fixes the order in which initial weights are drawn.
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the raw output scores for the input tensor ``x``."""
        # Linear -> dropout -> ReLU on the hidden layer, then the output layer.
        hidden = F.relu(self.dropout(self.layer1(x)))
        return self.layer2(hidden)


Write the function to calculate the returns

In [12]:
def calculate_stepwise_returns(rewards, discount_factor):
    """Compute the normalized discounted return for every step of an episode.

    For step ``t`` the raw return is ``r_t + g * r_{t+1} + g**2 * r_{t+2} + ...``
    with ``g = discount_factor``. The raw returns are then standardized by
    subtracting their mean and dividing by their sample standard deviation.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        discount_factor: Multiplier applied to future rewards at each step.

    Returns:
        A 1-D floating-point tensor with one entry per reward. When the
        standard deviation is undefined (fewer than two steps) or zero (all
        returns equal), the mean-centred returns (all zeros) are returned
        instead of NaN.
    """
    returns = []
    running_return = 0

    # Walk backwards so each step reuses the return of the step after it.
    # Appending and reversing once is linear; inserting at index 0 is quadratic.
    for reward in reversed(rewards):
        running_return = reward + running_return * discount_factor
        returns.append(running_return)
    returns.reverse()

    # Force a floating dtype: integer rewards with an integer discount would
    # otherwise yield an integer tensor, for which mean()/std() raise.
    returns = torch.tensor(returns, dtype=torch.get_default_dtype())

    # The sample standard deviation of fewer than two values is NaN.
    if returns.numel() < 2:
        return torch.zeros_like(returns)

    centered = returns - returns.mean()
    std = returns.std()
    # Identical returns give std == 0; dividing would produce 0/0 = NaN.
    if std.item() == 0:
        return centered
    return centered / std


Implement the forward pass

In [13]:
def forward_pass(env, policy, discount_factor):
    """Play one episode with ``policy`` and collect what the update step needs.

    The policy is put in training mode, the environment is reset, and actions
    are sampled from the softmax of the policy's output until the environment
    reports termination or truncation.

    Args:
        env: Environment exposing ``reset()`` (returning observation and info)
            and ``step(action)`` (returning observation, reward, terminated,
            truncated, info).
        policy: Module mapping a batched observation tensor to action scores.
        discount_factor: Discount passed to ``calculate_stepwise_returns``.

    Returns:
        A tuple ``(episode_return, stepwise_returns, log_prob_actions)``: the
        undiscounted sum of rewards, the per-step returns from
        ``calculate_stepwise_returns``, and the concatenated log-probabilities
        of the sampled actions.
    """
    policy.train()  # training mode, so dropout layers (if any) stay active

    step_log_probs = []
    step_rewards = []
    total_reward = 0

    state, _ = env.reset()
    while True:
        # Add a leading batch dimension of size 1 before calling the policy.
        state_batch = torch.FloatTensor(state).unsqueeze(0)
        scores = policy(state_batch)
        action_dist = distributions.Categorical(F.softmax(scores, dim = -1))
        chosen = action_dist.sample()
        step_log_probs.append(action_dist.log_prob(chosen))

        state, reward, terminated, truncated, _ = env.step(chosen.item())
        step_rewards.append(reward)
        total_reward += reward

        # The episode ends on either a terminal state or a truncation.
        if terminated or truncated:
            break

    return (
        total_reward,
        calculate_stepwise_returns(step_rewards, discount_factor),
        torch.cat(step_log_probs),
    )


Calculate the loss as the negative sum of the action log-probabilities, each weighted by its (normalized) return

In [14]:
def calculate_loss(stepwise_returns, log_prob_actions):
    """Return the negated sum of return-weighted action log-probabilities.

    Args:
        stepwise_returns: Tensor of per-step returns.
        log_prob_actions: Tensor of log-probabilities of the actions taken,
            multiplied element-wise with ``stepwise_returns``.

    Returns:
        A scalar tensor; minimizing it increases the log-probability of
        actions that have positive returns.
    """
    weighted_log_probs = stepwise_returns * log_prob_actions
    return -weighted_log_probs.sum()


Use back propagation to update the policy

In [15]:
def update_policy(stepwise_returns, log_prob_actions, optimizer):
    """Take one optimizer step on the loss of a single episode.

    Args:
        stepwise_returns: Tensor of per-step returns; it is detached so no
            gradient flows through it.
        log_prob_actions: Log-probabilities of the sampled actions, still
            attached to the policy's computation graph.
        optimizer: Optimizer holding the policy parameters.

    Returns:
        The loss value as a Python float.
    """
    # Clear gradients left over from the previous update before backpropagating.
    optimizer.zero_grad()

    # Returns act as fixed weights; only the log-probabilities carry gradients.
    fixed_returns = stepwise_returns.detach()
    episode_loss = calculate_loss(fixed_returns, log_prob_actions)

    episode_loss.backward()
    optimizer.step()

    return episode_loss.item()



Declare the hyperparameters, run the training episodes, and print the results

In [16]:
def main():
    """Train a policy network on the notebook-level ``env`` and print progress.

    Runs up to ``MAX_EPOCHS`` episodes. After each episode the policy is
    updated once and the mean return over the last ``N_TRIALS`` episodes is
    computed. Training stops early once that mean reaches
    ``REWARD_THRESHOLD``, but only after at least ``N_TRIALS`` episodes have
    been played, so a few lucky early episodes cannot be mistaken for
    convergence.

    Relies on the global ``env`` and on ``PolicyNetwork``, ``forward_pass``
    and ``update_policy`` defined elsewhere in the notebook. Returns nothing.
    """
    # Training schedule and stopping criterion.
    MAX_EPOCHS = 500          # upper bound on the number of episodes
    DISCOUNT_FACTOR = 0.99
    N_TRIALS = 25             # size of the moving-average window
    REWARD_THRESHOLD = 475
    PRINT_INTERVAL = 10

    # Network shape is derived from the environment's spaces.
    INPUT_DIM = env.observation_space.shape[0]
    HIDDEN_DIM = 128
    OUTPUT_DIM = env.action_space.n
    DROPOUT = 0.5

    episode_returns = []

    policy = PolicyNetwork(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)

    LEARNING_RATE = 0.01
    optimizer = optim.Adam(policy.parameters(), lr = LEARNING_RATE)

    for episode in range(1, MAX_EPOCHS+1):
        episode_return, stepwise_returns, log_prob_actions = forward_pass(env, policy, DISCOUNT_FACTOR)
        update_policy(stepwise_returns, log_prob_actions, optimizer)

        episode_returns.append(episode_return)
        mean_episode_return = np.mean(episode_returns[-N_TRIALS:])

        if episode % PRINT_INTERVAL == 0:
            print(f'| Episode: {episode:3} | Mean Rewards: {mean_episode_return:5.1f} |')

        # Require a full averaging window before declaring success; with fewer
        # episodes the mean is dominated by one or two samples.
        window_is_full = len(episode_returns) >= N_TRIALS
        if window_is_full and mean_episode_return >= REWARD_THRESHOLD:
            print(f'Reached reward threshold in {episode} episodes')
            break


Run the program.
If training does not reach the reward threshold within the episode limit, run it again or change the SEED value.

In [17]:
# Start training; a progress line is printed every PRINT_INTERVAL episodes.
main()

| Episode:  10 | Mean Rewards:  24.4 |
| Episode:  20 | Mean Rewards:  18.1 |
| Episode:  30 | Mean Rewards:  14.4 |
| Episode:  40 | Mean Rewards:  14.3 |
| Episode:  50 | Mean Rewards:  23.8 |
| Episode:  60 | Mean Rewards:  44.1 |
| Episode:  70 | Mean Rewards:  71.1 |
| Episode:  80 | Mean Rewards: 113.9 |
| Episode:  90 | Mean Rewards: 129.1 |
| Episode: 100 | Mean Rewards: 148.7 |
| Episode: 110 | Mean Rewards: 225.4 |
| Episode: 120 | Mean Rewards: 319.4 |
| Episode: 130 | Mean Rewards: 359.7 |
| Episode: 140 | Mean Rewards: 361.8 |
| Episode: 150 | Mean Rewards: 422.7 |
| Episode: 160 | Mean Rewards: 420.8 |
| Episode: 170 | Mean Rewards: 279.8 |
| Episode: 180 | Mean Rewards: 125.2 |
| Episode: 190 | Mean Rewards: 126.1 |
| Episode: 200 | Mean Rewards: 169.5 |
| Episode: 210 | Mean Rewards: 240.3 |
| Episode: 220 | Mean Rewards: 263.0 |
| Episode: 230 | Mean Rewards: 276.6 |
| Episode: 240 | Mean Rewards: 273.4 |
| Episode: 250 | Mean Rewards: 309.0 |
| Episode: 260 | Mean Rew