### CartPole with PPO

In [None]:
import torch
from tqdm import tqdm
import gymnasium as gym
from RLTools.RLPolicies.PPO import PPO
# torch based implementation

# Layer widths for the two networks below. `state_size` is the input width of
# both nets and `output_size` is the width of the policy head; presumably they
# mirror CartPole-v1's observation/action spaces (TODO confirm against the
# environment). `action_size` is not read anywhere in the visible code.
state_size, action_size, hidden_size, output_size = 4, 1, 4, 2

env = gym.make('CartPole-v1')

nn = torch.nn

# Policy network: one hidden layer followed by a softmax over the last
# dimension, so the output sums to 1 across its `output_size` entries.
p_net = nn.Sequential(
    nn.Linear(in_features=state_size, out_features=hidden_size),
    nn.LeakyReLU(),
    nn.Linear(in_features=hidden_size, out_features=output_size),
    nn.Softmax(dim=-1),
)

# Value network: same hidden layout, with a single unbounded scalar output.
v_net = nn.Sequential(
    nn.Linear(in_features=state_size, out_features=hidden_size),
    nn.LeakyReLU(),
    nn.Linear(in_features=hidden_size, out_features=1),
)

policy = PPO(p_net, v_net)

# Adam over whatever parameters the PPO wrapper exposes.
optimizer = torch.optim.Adam(policy.parameters(), lr=0.001)

# Read by the training loop to decide how often `policy.swap()` is called.
update_policy_every = 5

# Train on CartPole: collect one full episode, then take one optimizer step.
# The try/finally guarantees the environment is released even when training
# raises or is interrupted (e.g. KeyboardInterrupt in a long notebook run).
try:
    for episode in tqdm(range(10000)):
        observation, info = env.reset()

        rewards = []
        actions = []
        # Starts with the reset observation; the final post-step observation is
        # never stored, so observations/actions/rewards end up the same length.
        observations = [observation]
        entropies = []

        episode_over = False
        while not episode_over:
            # The sampled log-probability is not passed to `policy.reward`
            # below, so it is not accumulated.
            action, _log_prob, entropy = policy.sample_training(observation)
            actions.append(action)
            entropies.append(entropy)

            observation, reward, terminated, truncated, info = env.step(action)
            rewards.append(reward)

            episode_over = terminated or truncated
            if not episode_over:
                observations.append(observation)

        loss = policy.reward(observations, actions, rewards, entropies).mean()
        loss.backward()
        optimizer.step()
        # Clear gradients right after the step so the next episode starts clean.
        optimizer.zero_grad()

        # Same schedule as before: swap after episode indices 5, 10, 15, ...
        if episode > 0 and episode % update_policy_every == 0:
            policy.swap()
finally:
    env.close()

### Visualization

In [None]:
env = gym.make('CartPole-v1', render_mode="human")

# Upper bound on rendered steps per episode (the loop stops once the step
# counter exceeds this value, as the original `counter > 100` check did).
max_render_steps = 100

# Render 10 greedy episodes; always close the window, even on error.
try:
    for _ in range(10):
        observation, info = env.reset()

        counter = 0
        episode_over = False
        while not episode_over:
            action = policy.sample_best(observation)

            observation, reward, terminated, truncated, info = env.step(action)
            counter += 1

            # Stop on environment termination/truncation as well as on the step
            # cap: stepping a finished episode without calling reset() is
            # undefined behaviour in Gymnasium.
            episode_over = terminated or truncated or counter > max_render_steps
finally:
    env.close()

### Now let's try Lunar Lander

In [None]:
import torch
from tqdm import tqdm
import gymnasium as gym
from RLTools.RLPolicies.PPO import PPO
from torch.utils.tensorboard import SummaryWriter
# torch based implementation

import datetime

# Layer widths for the two networks below. `state_size` is the input width of
# both nets and `output_size` is the width of the policy head; presumably they
# mirror LunarLander-v3's observation/action spaces (TODO confirm against the
# environment). `action_size` is not read anywhere in the visible code.
state_size, action_size, hidden_size, output_size = 8, 1, 16, 4

env = gym.make('LunarLander-v3')

nn = torch.nn

# Policy network: one hidden layer followed by a softmax over the last
# dimension, so the output sums to 1 across its `output_size` entries.
p_net = nn.Sequential(
    nn.Linear(in_features=state_size, out_features=hidden_size),
    nn.LeakyReLU(),
    nn.Linear(in_features=hidden_size, out_features=output_size),
    nn.Softmax(dim=-1),
)

# Value network: same hidden layout, with a single unbounded scalar output.
v_net = nn.Sequential(
    nn.Linear(in_features=state_size, out_features=hidden_size),
    nn.LeakyReLU(),
    nn.Linear(in_features=hidden_size, out_features=1),
)

policy = PPO(p_net, v_net)

# Adam over whatever parameters the PPO wrapper exposes.
optimizer = torch.optim.Adam(policy.parameters(), lr=0.001)

# Read by the training loop to decide how often `policy.swap()` is called.
update_policy_every = 5

# One TensorBoard run directory per launch, named after the local start time.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'logs/' + run_stamp + "/"
writer = SummaryWriter(log_dir=log_dir)

# Train on LunarLander: collect one full episode, take one optimizer step, and
# log the loss and undiscounted episode return to TensorBoard.
# The try/finally guarantees that pending TensorBoard events are flushed and
# the environment is released even when training raises or is interrupted.
try:
    for episode in tqdm(range(10000)):
        observation, info = env.reset()

        rewards = []
        actions = []
        # Starts with the reset observation; the final post-step observation is
        # never stored, so observations/actions/rewards end up the same length.
        observations = [observation]
        entropies = []

        episode_over = False
        while not episode_over:
            # The sampled log-probability is not passed to `policy.reward`
            # below, so it is not accumulated.
            action, _log_prob, entropy = policy.sample_training(observation)
            actions.append(action)
            entropies.append(entropy)

            observation, reward, terminated, truncated, info = env.step(action)
            rewards.append(reward)

            episode_over = terminated or truncated
            if not episode_over:
                observations.append(observation)

        loss = policy.reward(observations, actions, rewards, entropies).mean()
        loss.backward()
        optimizer.step()
        # Clear gradients right after the step so the next episode starts clean.
        optimizer.zero_grad()

        writer.add_scalar("loss", loss.item(), episode)
        writer.add_scalar("reward", sum(rewards), episode)

        # Same schedule as before: swap after episode indices 5, 10, 15, ...
        if episode > 0 and episode % update_policy_every == 0:
            policy.swap()
finally:
    # close() flushes buffered events; without it the tail of the run can be
    # missing from the event file.
    writer.close()
    env.close()

In [None]:
env = gym.make('LunarLander-v3', render_mode="human")

# Render 10 greedy episodes, each running until the environment reports
# termination or truncation; always close the window, even on error.
try:
    for _ in range(10):
        observation, info = env.reset()

        episode_over = False
        while not episode_over:
            action = policy.sample_best(observation)

            observation, reward, terminated, truncated, info = env.step(action)
            episode_over = terminated or truncated
finally:
    env.close()