<a href="https://colab.research.google.com/github/prantoran/ai-prac/blob/master/rl/actor_critic_gym_cartpole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


In [2]:
# Define the actor-critic network
class ActorCritic(nn.Module):
    """Actor-critic network with a shared two-layer trunk and two heads.

    The trunk is two 64-unit fully connected layers with ReLU activations.
    The policy head (`fc_pi`) produces a probability for each action and the
    value head (`fc_v`) produces a single state-value estimate.

    Args:
        state_dim: Size of the input state vector.
        action_dim: Number of outputs of the policy head (one per action).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc_pi = nn.Linear(64, action_dim)
        self.fc_v = nn.Linear(64, 1)

    def forward(self, state):
        """Compute action probabilities and the state value.

        Args:
            state: Float tensor whose last dimension is `state_dim`; either a
                single state `(state_dim,)` or a batch `(batch, state_dim)`.

        Returns:
            Tuple `(pi, v)`: `pi` has `action_dim` entries along the last
            dimension and sums to 1 along it; `v` has size 1 along the last
            dimension.
        """
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        # Normalise over the action axis (the last one). With dim=0 a batched
        # input would be normalised across the batch instead of across actions;
        # for a single 1-D state dim=-1 and dim=0 are the same axis.
        pi = torch.softmax(self.fc_pi(x), dim=-1)
        v = self.fc_v(x)

        return pi, v


In [3]:
# Training hyperparameters
num_episodes = 1000
discount_factor = 0.99
learning_rate = 1e-3

# CartPole environment
env = gym.make('CartPole-v1')

# Actor-critic network sized from the environment's observation and action spaces
agent = ActorCritic(
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n,
)

# Adam optimizer over all of the network's parameters
optimizer = optim.Adam(agent.parameters(), lr=learning_rate)


In [4]:
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<CartPoleEnv<CartPole-v1>>>>>

In [5]:
type(env)

In [6]:
env.observation_space

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)

In [8]:
type(env.observation_space)

In [7]:
env.observation_space.shape

(4,)

In [17]:
env.observation_space.shape[0]

4

In [9]:
env.action_space

Discrete(2)

In [10]:
type(env.action_space)

In [11]:
env.action_space.n

np.int64(2)

# Training

For each episode,


- We begin by resetting the environment and the per-episode variables (`done`, `total_reward`).
- Until the episode ends, we:
  - Select an action using the agent’s policy
  - Take a step in the environment using the action
  - Calculate the TD error and, from it, the actor and critic losses
  - Update the network parameters using the loss
  - Set the new state as the current state

In [29]:
# Training loop: one-step actor-critic, with a network update after every environment step
for episode in range(num_episodes):
    # Initialize the environment
    state, _ = env.reset()  # state: numpy.ndarray, e.g. [ 0.00016056  0.02509673 -0.01012855 -0.0311483 ]
    done = False
    total_reward = 0

    while not done:
        # Select an action using the agent's policy
        probs, val = agent(torch.tensor(state, dtype=torch.float32))
        # probs: e.g. tensor([0.2676, 0.7324], grad_fn=<SoftmaxBackward0>), size == action_dim
        # val:   e.g. tensor([27.9125], grad_fn=<ViewBackward0>)

        # Sample an action index according to the policy probabilities
        # (probs is detached and converted to a numpy array for np.random.choice)
        action = np.random.choice(np.arange(len(probs)), p=probs.detach().numpy())

        # Take a step in the environment.
        # Gymnasium returns (obs, reward, terminated, truncated, info): the episode
        # is over when either flag is set, so both must end the inner loop.
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        total_reward += reward

        # TD target: computed without gradient tracking so it acts as a fixed
        # regression target for the critic. Bootstrapping is switched off only on
        # true termination; a time-limit truncation still bootstraps from next_val.
        with torch.no_grad():
            _, next_val = agent(torch.tensor(next_state, dtype=torch.float32))
            td_target = reward + discount_factor * next_val * float(not terminated)

        # TD error; gradient flows only through val (the critic's current estimate)
        err = td_target - val

        # The actor uses the TD error as a constant weight, so the policy-gradient
        # term does not back-propagate into the value head.
        actor_loss = -torch.log(probs[action]) * err.detach()
        critic_loss = torch.square(err)
        loss = actor_loss + critic_loss

        # Update the network
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Set the state to the next state
        state = next_state

    # Print the total reward for the episode
    print(f'Episode {episode}: Total reward = {total_reward}')


Episode 0: Total reward = 113.0
