<a href="https://colab.research.google.com/github/samuelajala01/cartpole-v1-gym/blob/main/cartpole_v1_gym.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install gymnasium



In [3]:
import gymnasium as gym
# Print the id of every environment registered with Gymnasium, one per line.
for env_id in gym.envs.registry.keys():
    print(env_id)

CartPole-v0
CartPole-v1
MountainCar-v0
MountainCarContinuous-v0
Pendulum-v1
Acrobot-v1
phys2d/CartPole-v0
phys2d/CartPole-v1
phys2d/Pendulum-v0
LunarLander-v3
LunarLanderContinuous-v3
BipedalWalker-v3
BipedalWalkerHardcore-v3
CarRacing-v3
Blackjack-v1
FrozenLake-v1
FrozenLake8x8-v1
CliffWalking-v1
CliffWalkingSlippery-v1
Taxi-v3
tabular/Blackjack-v0
tabular/CliffWalking-v0
Reacher-v2
Reacher-v4
Reacher-v5
Pusher-v2
Pusher-v4
Pusher-v5
InvertedPendulum-v2
InvertedPendulum-v4
InvertedPendulum-v5
InvertedDoublePendulum-v2
InvertedDoublePendulum-v4
InvertedDoublePendulum-v5
HalfCheetah-v2
HalfCheetah-v3
HalfCheetah-v4
HalfCheetah-v5
Hopper-v2
Hopper-v3
Hopper-v4
Hopper-v5
Swimmer-v2
Swimmer-v3
Swimmer-v4
Swimmer-v5
Walker2d-v2
Walker2d-v3
Walker2d-v4
Walker2d-v5
Ant-v2
Ant-v3
Ant-v4
Ant-v5
Humanoid-v2
Humanoid-v3
Humanoid-v4
Humanoid-v5
HumanoidStandup-v2
HumanoidStandup-v4
HumanoidStandup-v5
GymV21Environment-v0
GymV26Environment-v0


In [4]:
import gymnasium as gym
# Instantiate the CartPole-v1 environment; later cells use this module-level `env`.
env = gym.make("CartPole-v1")

In [5]:
# Display the environment's observation space.
print("observation space: ", env.observation_space)

observation space:  Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)


In [36]:
# Start a new episode (no seed is passed here) and show the initial observation.
# `info` is returned by reset() but not used in this cell.
observation, info = env.reset()
print("observation: ", observation)

observation:  [ 0.00635547 -0.03516999 -0.00747336  0.04203025]


In [7]:
# Display the environment's action space.
print("action space: ", env.action_space)

action space:  Discrete(2)


In [8]:
# Single seed value reused for the environment, NumPy and PyTorch.
SEED = 1111
# Reset the environment with that seed; as the cell's last expression, the
# returned (observation, info) tuple is displayed.
env.reset(seed=SEED)

(array([ 0.03593444,  0.01567786, -0.00182151,  0.01612927], dtype=float32),
 {})

In [28]:

import numpy as np
import torch,torch.nn
import torch.optim as optim

# Seed NumPy's and PyTorch's global random number generators with the shared SEED.
np.random.seed(SEED)
# torch.manual_seed returns the Generator object, which is shown as the cell output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7e11f4496610>

In [29]:
import torch.nn.functional as F

class PolicyNetwork(torch.nn.Module):
    """Two-layer fully connected network with dropout on the hidden layer.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output values produced per input row.
        dropout: Drop probability handed to ``torch.nn.Dropout``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        # Layers are created in this order so seeded initialisation is stable.
        self.layer1 = torch.nn.Linear(input_dim, hidden_dim)
        self.layer2 = torch.nn.Linear(hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        """Return the raw (un-normalised) outputs of the second linear layer."""
        # Dropout is applied to the pre-activation, followed by ReLU.
        hidden = F.relu(self.dropout(self.layer1(x)))
        return self.layer2(hidden)

In [30]:
def calculate_stepwise_returns(rewards, discount_factor):
    """Compute normalised discounted returns for every step of an episode.

    The return at step ``t`` is ``rewards[t] + discount_factor * return[t + 1]``.
    The resulting vector is shifted to zero mean and, when possible, scaled
    to unit (sample) standard deviation.

    Args:
        rewards: Sequence of per-step rewards, in the order they were received.
        discount_factor: Multiplier applied to the return of the following step.

    Returns:
        A 1-D floating-point tensor with one entry per reward. If the standard
        deviation is undefined (fewer than two steps) or zero (all returns
        equal), the returns are only mean-centred instead of producing NaN/inf.
        An empty ``rewards`` sequence yields an empty tensor.
    """
    returns = []
    running_return = 0
    # Accumulate from the last step backwards, then flip once at the end;
    # this avoids the quadratic cost of inserting at the front of the list.
    for reward in reversed(rewards):
        running_return = reward + running_return * discount_factor
        returns.append(running_return)
    returns.reverse()

    returns = torch.tensor(returns)
    if not returns.is_floating_point():
        # mean()/std() are not defined for integer tensors.
        returns = returns.float()
    if returns.numel() == 0:
        return returns

    centered = returns - returns.mean()
    if returns.numel() < 2:
        # The sample standard deviation of a single value is NaN.
        return centered
    std = returns.std()
    if std == 0:
        # Constant returns: dividing would give 0/0.
        return centered
    return centered / std

In [32]:
def calculate_loss(stepwise_returns, log_prob_actions):
    """Return the negated sum of ``stepwise_returns * log_prob_actions``.

    Args:
        stepwise_returns: Per-step return values.
        log_prob_actions: Per-step log-probabilities of the chosen actions;
            multiplied element-wise with ``stepwise_returns``.

    Returns:
        The scalar loss: minus the sum of the element-wise products.
    """
    weighted_log_probs = stepwise_returns * log_prob_actions
    total = weighted_log_probs.sum()
    return -total

In [33]:
def update_policy(stepwise_returns, log_prob_actions, optimizer):
    """Run one optimisation step on the policy-gradient loss.

    Args:
        stepwise_returns: Per-step returns; detached so that no gradient
            flows through them.
        log_prob_actions: Per-step log-probabilities of the chosen actions.
        optimizer: Optimizer whose parameters are updated.

    Returns:
        The loss value as a Python float.
    """
    # Clear gradients left over from the previous update before backprop.
    optimizer.zero_grad()

    policy_loss = calculate_loss(stepwise_returns.detach(), log_prob_actions)
    policy_loss.backward()
    optimizer.step()

    return policy_loss.item()

In [34]:
def main():
    """Train a policy network on the module-level ``env`` and print progress.

    Each episode is collected with ``forward_pass`` and followed by one
    ``update_policy`` step. Every ``PRINT_INTERVAL`` episodes the mean return
    of the most recent ``N_TRIALS`` episodes is printed. Training stops early
    once that mean reaches ``REWARD_THRESHOLD``, otherwise after
    ``MAX_EPOCHS`` episodes.

    NOTE(review): ``forward_pass`` is not defined in the cells visible here --
    confirm it is defined before this cell runs on a fresh kernel.
    """
    # Training-loop settings.
    MAX_EPOCHS = 500
    PRINT_INTERVAL = 10
    N_TRIALS = 25
    REWARD_THRESHOLD = 475
    DISCOUNT_FACTOR = 0.99

    # Network and optimiser settings; layer sizes come from the environment.
    INPUT_DIM = env.observation_space.shape[0]
    OUTPUT_DIM = env.action_space.n
    HIDDEN_DIM = 128
    DROPOUT = 0.5
    LEARNING_RATE = 0.01

    policy = PolicyNetwork(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)
    optimizer = optim.Adam(policy.parameters(), lr=LEARNING_RATE)

    return_history = []
    for episode in range(1, MAX_EPOCHS + 1):
        episode_return, stepwise_returns, log_prob_actions = forward_pass(
            env, policy, DISCOUNT_FACTOR
        )
        update_policy(stepwise_returns, log_prob_actions, optimizer)

        return_history.append(episode_return)
        # Average over a sliding window of the latest N_TRIALS episodes.
        recent_mean = np.mean(return_history[-N_TRIALS:])

        if episode % PRINT_INTERVAL == 0:
            print(f'| Episode: {episode:3} | Mean Rewards: {recent_mean:5.1f} |')

        if recent_mean >= REWARD_THRESHOLD:
            print(f'Reached reward threshold in {episode} episodes')
            break

In [35]:
# Run the training loop defined in main(); progress lines are printed below.
main()

| Episode:  10 | Mean Rewards:  33.4 |
| Episode:  20 | Mean Rewards:  50.2 |
| Episode:  30 | Mean Rewards:  57.0 |
| Episode:  40 | Mean Rewards:  85.7 |
| Episode:  50 | Mean Rewards: 105.9 |
| Episode:  60 | Mean Rewards: 131.2 |
| Episode:  70 | Mean Rewards: 145.2 |
| Episode:  80 | Mean Rewards: 213.2 |
| Episode:  90 | Mean Rewards: 193.4 |
| Episode: 100 | Mean Rewards: 119.0 |
| Episode: 110 | Mean Rewards:  65.2 |
| Episode: 120 | Mean Rewards:  95.4 |
| Episode: 130 | Mean Rewards: 198.8 |
| Episode: 140 | Mean Rewards: 242.8 |
| Episode: 150 | Mean Rewards: 195.4 |
| Episode: 160 | Mean Rewards: 143.1 |
| Episode: 170 | Mean Rewards: 121.1 |
| Episode: 180 | Mean Rewards:  92.5 |
| Episode: 190 | Mean Rewards:  60.0 |
| Episode: 200 | Mean Rewards:  53.7 |
| Episode: 210 | Mean Rewards:  75.6 |
| Episode: 220 | Mean Rewards: 124.8 |
| Episode: 230 | Mean Rewards: 256.1 |
| Episode: 240 | Mean Rewards: 397.4 |
| Episode: 250 | Mean Rewards: 370.2 |
| Episode: 260 | Mean Rew