### CartPole with PPO

In [None]:
import torch
from tqdm import tqdm
import gymnasium as gym
from RLTools.RLPolicies.PPO import PPO
# torch based implementation

# Train a PPO agent on CartPole-v1, performing one optimizer step per episode.

state_size = 4    # input width of both networks (CartPole-v1 observations have 4 entries)
action_size = 1   # not referenced in this cell
hidden_size = 4
output_size = 2   # width of the policy head (CartPole-v1 has 2 discrete actions)

env = gym.make('CartPole-v1')

# Policy network: maps an observation to a softmax distribution over actions.
p_net = torch.nn.Sequential(
    torch.nn.Linear(state_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, output_size),
    torch.nn.Softmax(dim=-1)
)

# Value network: maps an observation to a single scalar.
v_net = torch.nn.Sequential(
    torch.nn.Linear(state_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, 1)
)

policy = PPO(p_net, v_net)

optimizer = torch.optim.Adam(policy.parameters(), 0.001)

# policy.swap() is called every `update_policy_every` episodes (never on the first one).
# NOTE(review): presumably this syncs the "old" policy with the trained one -- confirm in RLTools.
update_policy_every = 5

counter = 0
try:
    for _ in tqdm(range(10000)):
        observation, info = env.reset()

        episode_over = False

        # Per-episode trajectory buffers handed to policy.reward().
        rewards = []
        actions = []
        observations = [observation]
        entropies = []

        while not episode_over:
            # The per-step log-probability is returned but not consumed by this loop.
            action, _log_prob, entropy = policy.sample_training(observation)
            actions.append(action)
            entropies.append(entropy)

            observation, reward, terminated, truncated, info = env.step(action)
            rewards.append(reward)

            episode_over = terminated or truncated
            # The final observation is not stored, so `observations`, `actions`
            # and `rewards` all end up with the same length.
            if not episode_over:
                observations.append(observation)

        loss = policy.reward(observations, actions, rewards, entropies).mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if counter % update_policy_every == 0 and counter > 0:
            policy.swap()
        counter += 1
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()

### Visualization

In [None]:
# Render 10 CartPole-v1 episodes with the trained policy.
# Relies on `gym` and `policy` from the CartPole training cell above.
env = gym.make('CartPole-v1', render_mode="human")

try:
    for _ in range(10):
        observation, info = env.reset()

        episode_over = False
        counter = 0  # steps taken in the current episode

        while not episode_over:
            action = policy.sample_best(observation)

            observation, reward, terminated, truncated, info = env.step(action)
            counter += 1

            # Stop as soon as the environment reports the episode finished;
            # stepping a terminated environment without reset() is not supported
            # by Gymnasium. The step cap keeps each rendered episode short.
            episode_over = terminated or truncated or counter > 100
finally:
    # Close the render window even if playback is interrupted or raises.
    env.close()

### Now let's try Lunar Lander

In [1]:
import torch
from tqdm import tqdm
import gymnasium as gym
from RLTools.RLPolicies.PPO import PPO
from torch.utils.tensorboard import SummaryWriter
# torch based implementation

# Train a PPO agent on LunarLander-v3, one optimizer step per episode,
# logging loss and episode return to TensorBoard.

state_size = 8    # input width of both networks (LunarLander-v3 observations have 8 entries)
action_size = 1   # not referenced in this cell
hidden_size = 16
output_size = 4   # width of the policy head (LunarLander-v3 has 4 discrete actions)

env = gym.make('LunarLander-v3')

# Policy network: maps an observation to a softmax distribution over actions.
p_net = torch.nn.Sequential(
    torch.nn.Linear(state_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, output_size),
    torch.nn.Softmax(dim=-1)
)

# Value network: maps an observation to a single scalar.
v_net = torch.nn.Sequential(
    torch.nn.Linear(state_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, 1)
)

# Use the GPU when one is available; fall back to CPU so the cell still runs elsewhere.
policy = PPO(p_net, v_net, device='cuda' if torch.cuda.is_available() else 'cpu')

optimizer = torch.optim.Adam(policy.parameters(), 0.01)

# policy.swap() is called every `update_policy_every` episodes (never on the first one).
# NOTE(review): presumably this syncs the "old" policy with the trained one -- confirm in RLTools.
update_policy_every = 5

import datetime
# One TensorBoard run directory per execution, named by start time.
log_dir = 'logs/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = SummaryWriter(log_dir)

counter = 0
try:
    for episode in tqdm(range(200000)):
        observation, info = env.reset()

        episode_over = False

        # Per-episode trajectory buffers handed to policy.reward().
        rewards = []
        actions = []
        observations = [observation]
        entropies = []

        while not episode_over:
            # The per-step log-probability is returned but not consumed by this loop.
            action, _log_prob, entropy = policy.sample_training(observation)
            actions.append(action)
            entropies.append(entropy)

            observation, reward, terminated, truncated, info = env.step(action)
            rewards.append(reward)

            episode_over = terminated or truncated
            # The final observation is not stored, so `observations`, `actions`
            # and `rewards` all end up with the same length.
            if not episode_over:
                observations.append(observation)

        loss = policy.reward(observations, actions, rewards, entropies).mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        writer.add_scalar("loss", loss.item(), episode)
        writer.add_scalar("reward", sum(rewards), episode)

        if counter % update_policy_every == 0 and counter > 0:
            policy.swap()
        counter += 1
finally:
    # Training is typically stopped with a KeyboardInterrupt; make sure pending
    # TensorBoard events are written and the environment is released anyway.
    writer.close()
    env.close()

  states = torch.tensor(states, dtype=torch.float32).to(self.device)
  3%|▎         | 5756/200000 [43:43<24:35:38,  2.19it/s]


KeyboardInterrupt: 

In [None]:
# Render 10 LunarLander-v3 episodes with the trained policy.
# Relies on `gym` and `policy` from the Lunar Lander training cell above.
env = gym.make('LunarLander-v3', render_mode="human")

try:
    for _ in range(10):
        observation, info = env.reset()

        episode_over = False

        while not episode_over:
            action = policy.sample_best(observation)

            observation, reward, terminated, truncated, info = env.step(action)

            episode_over = terminated or truncated
finally:
    # Close the render window even if playback is interrupted or raises.
    env.close()

### Vectorized Lunar Lander

In [4]:
import torch
from tqdm import tqdm
import gymnasium as gym
from RLTools.RLPolicies.PPO import PPO, BPPO
from torch.utils.tensorboard import SummaryWriter
import numpy as np
# torch based implementation

# Train a batched PPO agent (BPPO) on `n_env` LunarLander-v3 environments stepped in
# parallel. One "episode" of the outer loop runs until every sub-environment has
# finished at least once; only each sub-environment's first episode is used.

state_size = 8    # input width of both networks (LunarLander-v3 observations have 8 entries)
action_size = 1   # not referenced in this cell
hidden_size = 16
output_size = 4   # width of the policy head (LunarLander-v3 has 4 discrete actions)

n_env = 64
env = gym.vector.AsyncVectorEnv([lambda: gym.make('LunarLander-v3') for _ in range(n_env)]) # reminder gym.vector.AsyncVectorEnv allows different envs!


# Policy network: maps an observation to a softmax distribution over actions.
p_net = torch.nn.Sequential(
    torch.nn.Linear(state_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, output_size),
    torch.nn.Softmax(dim=-1)
)

# Value network: maps an observation to a single scalar.
v_net = torch.nn.Sequential(
    torch.nn.Linear(state_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, hidden_size),
    torch.nn.LeakyReLU(),
    torch.nn.Linear(hidden_size, 1)
)

# Use the GPU when one is available; fall back to CPU so the cell still runs elsewhere.
policy = BPPO(p_net, v_net, device='cuda' if torch.cuda.is_available() else 'cpu')

optimizer = torch.optim.Adam(policy.parameters(), 0.01)

update_policy_every = 5      # call policy.swap() every this many optimizer steps
update_gradients_every = 1   # take an optimizer step every this many outer episodes

import datetime
# One TensorBoard run directory per execution, named by start time.
log_dir = 'logs/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = SummaryWriter(log_dir)

counter_for_swap = 0
counter_for_backward = 0
try:
    for episode in tqdm(range(200000)):
        observation, info = env.reset()

        episode_over = False

        # Per-step buffers; every entry has a leading dimension of n_env.
        rewards = []
        actions = []
        observations = [observation]
        entropies = []
        active_steps = []   # active_steps[t][i] is True while env i is still in its first episode

        done_envs = np.zeros(n_env, dtype=bool)
        while not episode_over:
            # The per-step log-probability is returned but not consumed by this loop.
            action, _log_prob, entropy = policy.sample_training(observation)
            actions.append(action)
            entropies.append(entropy)
            # Snapshot (a fresh array) of which envs had not finished *before* this step.
            # Sub-environments keep being stepped (and get auto-reset) after they finish,
            # so everything they produce from then on must be masked out.
            active_steps.append(~done_envs)

            observation, reward, terminated, truncated, info = env.step(action.cpu().numpy())
            rewards.append(reward)

            done_envs |= (terminated | truncated)
            episode_over = np.all(done_envs)
            # The final observation is not stored, so all buffers have the same length T.
            if not episode_over:
                observations.append(observation)

        # Mask: True for steps that belong to each env's first episode.
        mask = torch.as_tensor(np.stack(active_steps, axis=1), dtype=torch.bool, device=policy.device)  # [batch_size, T]
        pad = ~mask

        # Tensorize in one shot (time is stacked on axis 1) and zero every step after an
        # env's first episode, i.e. the zero padding the per-env sequences would have
        # received had they been cut at their real length.
        # assumes actions/entropies returned by sample_training are shaped [n_env] -- TODO confirm
        obs_tensor = torch.as_tensor(np.stack(observations, axis=1), dtype=torch.float32).to(policy.device)
        obs_tensor = obs_tensor.masked_fill(pad.unsqueeze(-1), 0.0)                                   # [batch_size, T, state_size]
        actions_tensor = torch.stack(actions, dim=1).to(device=policy.device, dtype=torch.int64)
        actions_tensor = actions_tensor.masked_fill(pad, 0)                                           # [batch_size, T]
        rewards_tensor = torch.as_tensor(np.stack(rewards, axis=1), dtype=torch.float32).to(policy.device)
        rewards_tensor = rewards_tensor.masked_fill(pad, 0.0)                                         # [batch_size, T]
        entropies_tensor = torch.stack(entropies, dim=1).to(policy.device).masked_fill(pad, 0.0)      # [batch_size, T]

        loss = policy.reward(obs_tensor, actions_tensor, rewards_tensor, entropies_tensor, mask).mean()
        # NOTE(review): the very first episode never triggers an update, and with
        # update_gradients_every > 1 the losses of the skipped episodes are discarded
        # rather than accumulated -- confirm this is intended.
        if counter_for_backward % update_gradients_every == 0 and counter_for_backward > 0:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            counter_for_swap += 1
            writer.add_scalar("loss", loss.item(), episode)
            # Mean return of the first episode of each env (later steps are zeroed above).
            writer.add_scalar("reward", rewards_tensor.sum(dim=1).mean().item(), episode)

            # Checked only right after an optimizer step so that the swap cannot fire
            # repeatedly while counter_for_swap sits on a multiple of update_policy_every.
            if counter_for_swap % update_policy_every == 0:
                policy.swap()

        counter_for_backward += 1
finally:
    # Training is typically stopped with a KeyboardInterrupt; flush TensorBoard events
    # and shut the worker processes down cleanly.
    writer.close()
    env.close()

  logger.warn(
  self._raise_if_errors(successes)
  File "/local0/scratch/git/RLTests/.venv/lib/python3.11/site-packages/gymnasium/vector/async_vector_env.py", line 701, in _async_worker
    command, data = pipe.recv()
                    ^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
          ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/connection.py", line 430, in _recv_bytes
    buf = self._recv(4)
          ^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/connection.py", line 395, in _recv
    chunk = read(handle, remaining)
            ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
[0m
  self._raise_if_errors(successes)
  self._raise_if_errors(successes)
  File "/local0/scratch/git/RLTests/.venv/lib/python3.11/site-packages/gymnasium/vector/async_vector_env.py", line 723, in _async_worker
    ) = env.step(data)
        ^^^^^^^^^^^^^^
  File "/local0/scratch/git/RLTests/.venv/lib/python3

KeyboardInterrupt: 

### Visualization

In [9]:
# Render 10 LunarLander-v3 episodes with the policy trained in the vectorized cell.
# Relies on `gym` and `policy` from that cell.
env = gym.make('LunarLander-v3', render_mode="human")

try:
    for _ in range(10):
        observation, info = env.reset()

        episode_over = False

        while not episode_over:
            action = policy.sample_best(observation)

            # sample_best returns a tensor here; the environment is given a NumPy value.
            observation, reward, terminated, truncated, info = env.step(action.cpu().numpy())

            episode_over = terminated or truncated
finally:
    # Close the render window even if playback is interrupted or raises.
    env.close()