# PPO

### Let's run PPO on LunarLander!

#### Imports

In [1]:
from optimrl.algorithms.ppo import PPOLossFunction, PPOOptimizer
import gym
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
import torch.distributions as td
import numpy as np
from tqdm import tqdm

#### Seeds

In [2]:
# SEED seeds the Python, NumPy and PyTorch RNGs below; EVAL_SEED is only
# defined here for later use.
SEED, EVAL_SEED = 1, 2

for seed_rng in (random.seed, np.random.seed):
    seed_rng(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1058dca90>

#### Grab observation and action dimensions

In [3]:
# Build one environment and read the sizes of its observation/action spaces.
env = gym.make("LunarLander-v2")
obs_space, act_space = env.observation_space, env.action_space
# Last axis of the observation shape, and the count of the action space.
obs_dims, act_dims = obs_space.shape[-1], act_space.n
print(f"Obs dims: {obs_dims}, Action dims: {act_dims}")

Obs dims: 8, Action dims: 4


#### Create policy

In [4]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return it.

    The weight is filled with an orthogonal matrix scaled by ``std``; the
    bias, when the layer has one, is filled with ``bias_const``.

    Args:
        layer: Module exposing a ``weight`` tensor and optionally a ``bias``.
        std: Gain passed to ``torch.nn.init.orthogonal_``.
        bias_const: Constant written into every bias element.

    Returns:
        The same ``layer`` object, so the call can be nested inside
        ``nn.Sequential(...)``.
    """
    torch.nn.init.orthogonal_(layer.weight, gain=std)
    # Layers built with ``bias=False`` expose ``bias`` as None; skip them
    # instead of failing inside ``constant_``.
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer

class PPOCategoricalPolicy(nn.Module):
    """Actor-critic policy with a categorical action distribution.

    Two independent MLPs with 128-unit hidden layers map an observation to
    ``act_dims`` action logits (``actor``) and to a single value output
    (``critic``).
    """

    def __init__(
        self,
        obs_dims,
        act_dims
    ):
        """Build the actor and critic networks.

        Args:
            obs_dims: Size of the observation vector fed to both networks.
            act_dims: Number of discrete actions (actor output width).
        """
        super().__init__()
        self.obs_dims = obs_dims
        self.act_dims = act_dims

        self.critic = nn.Sequential(
            layer_init(nn.Linear(self.obs_dims, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 1)),
        )
        # NOTE(review): the first actor activation is ReLU while every other
        # hidden activation in this class is Tanh -- confirm this is intended.
        self.actor = nn.Sequential(
            layer_init(nn.Linear(self.obs_dims, 128)),
            nn.ReLU(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, self.act_dims)),
        )

    def forward(self, obs: torch.Tensor, *args, **kwargs):
        """Sample actions for ``obs`` and return them with related outputs.

        Extra positional and keyword arguments are accepted and ignored.

        Returns:
            A dict with keys ``"actions"`` (sampled from the categorical
            distribution), ``"values"`` (critic output), ``"log_probs"``
            (log-probability of the sampled actions), ``"dist"`` (the
            distribution object), ``"logits"`` (actor output) and
            ``"entropy"`` (entropy of the distribution).
        """
        logits = self.actor(obs)
        values = self.critic(obs)
        dist = td.Categorical(logits=logits)
        actions = dist.sample()
        entropy = dist.entropy()
        log_probs = dist.log_prob(actions)
        return {"actions":actions, "values":values, "log_probs":log_probs, "dist":dist, "logits":logits, "entropy":entropy}

    def act(self, obs, prev_output=None):
        """Run the policy on raw observations without tracking gradients.

        Args:
            obs: Observations as a NumPy array (or tensor). They are converted
                to the dtype and device of the model parameters, so the call
                also works after the policy has been moved to another device
                and for non-float32 inputs.
            prev_output: Optional dict forwarded to ``forward`` as the
                ``prev_output`` keyword; defaults to a fresh empty dict.

        Returns:
            A tuple ``(actions, out)`` where ``actions`` is a NumPy array on
            the CPU and ``out`` is the full dict returned by ``forward``.
        """
        # A None default avoids sharing one mutable dict across calls.
        if prev_output is None:
            prev_output = {}
        reference = next(self.parameters())
        with torch.no_grad():
            obs_tensor = torch.as_tensor(
                obs, dtype=reference.dtype, device=reference.device
            )
            out = self.forward(obs_tensor, prev_output=prev_output)
            return out["actions"].detach().cpu().numpy(), out

#### Train

##### Setup optimizer

In [5]:
# Prefer CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Size the networks from obs_dims / act_dims (read from the environment in an
# earlier cell) rather than repeating the literals 8 and 4.
policy = PPOCategoricalPolicy(obs_dims, act_dims)
loss_fn = PPOLossFunction(clip_ratio=0.2, ent_coef=0.01, vf_coef=1.0)
optimizer = PPOOptimizer(policy, loss_fn, pi_lr=0.0002, n_updates=5, num_minibatches=1)

# Move both the policy and the optimizer to the selected device.
policy = policy.to(device)
optimizer = optimizer.to(device)

##### Create vector envs

In [6]:
def make_env(env_name):
    """Create the Gym environment ``env_name`` wrapped in ``RecordEpisodeStatistics``."""
    return gym.wrappers.RecordEpisodeStatistics(gym.make(env_name))

def _make_lunar_lander():
    """Factory handed to ``SyncVectorEnv``; builds one wrapped LunarLander env."""
    return make_env("LunarLander-v2")


# Four synchronous copies for training, a single one for evaluation.
train_envs = gym.vector.SyncVectorEnv([_make_lunar_lander] * 4)
test_envs = gym.vector.SyncVectorEnv([_make_lunar_lander])

##### Main loop

In [7]:
# NUM_STEPS: number of iterations of the main loop below.
# MAX_ENV_STEPS: value passed to every optimizer.rollout(...) call.
NUM_STEPS, MAX_ENV_STEPS = 20_000, 256

In [8]:
# One entry per iteration: the "sum_rewards" stat of the train / eval rollout.
mean_train_rewards = []
mean_test_rewards = []
bar = tqdm(range(NUM_STEPS))

for i in bar:
    # Rollouts run under no_grad; only optimizer.step below runs with
    # gradient tracking enabled.
    with torch.no_grad():
        train_rollout = optimizer.rollout(train_envs, policy, MAX_ENV_STEPS, SEED, evaluate=False)
        eval_rollout = optimizer.rollout(test_envs, policy, MAX_ENV_STEPS, EVAL_SEED, evaluate=True)
    loss, rewards, stats = optimizer.step(
        train_rollout.to_torch(
            device=optimizer.device
        )
    )
    mean_train_rewards.append(train_rollout.stats["sum_rewards"])
    mean_test_rewards.append(eval_rollout.stats["sum_rewards"])
    if i % 50 == 0:
        # Report the mean over the last 20 iterations. tqdm.write is used
        # instead of print so the message does not tear the progress bar.
        tqdm.write(f"Train: {np.mean(mean_train_rewards[-20:])} Test: {np.mean(mean_test_rewards[-20:])}")

  if not isinstance(terminated, (bool, np.bool8)):
  0%|              | 2/20000 [00:00<56:02,  5.95it/s]

Train: -393.9243048403554 Test: -373.5218742747213


  0%|             | 52/20000 [00:08<56:52,  5.85it/s]

Train: -273.5642783934559 Test: -175.4668695229023


  1%|          | 102/20000 [00:18<1:01:57,  5.35it/s]

Train: -254.56763639811135 Test: -131.64911344327896


  1%|            | 152/20000 [00:27<56:16,  5.88it/s]

Train: -217.76374657240564 Test: -141.10066555474953


  1%|          | 201/20000 [00:38<1:01:35,  5.36it/s]

Train: -144.05246665770775 Test: -102.68073270098117


  1%|▏           | 252/20000 [00:48<56:19,  5.84it/s]

Train: -201.76728110507602 Test: -114.17133899898866


  2%|▏         | 301/20000 [00:58<1:28:51,  3.69it/s]

Train: -70.97767920094306 Test: -41.99667969367815


  2%|▏         | 351/20000 [01:12<1:30:10,  3.63it/s]

Train: -4.048877855830725 Test: -1.0958662008003877


  2%|▏         | 401/20000 [01:25<1:30:49,  3.60it/s]

Train: 8.491294481650153 Test: 10.302631858558048


  2%|▏         | 451/20000 [01:40<1:43:16,  3.15it/s]

Train: 14.229816917864113 Test: 29.157609616637803


  3%|▎         | 501/20000 [01:56<1:51:07,  2.92it/s]

Train: 39.40215731603261 Test: 47.35812656231085


  3%|▎         | 551/20000 [02:14<1:53:08,  2.86it/s]

Train: 28.741395309290958 Test: 26.539667491681023


  3%|▎         | 601/20000 [02:30<1:44:09,  3.10it/s]

Train: 69.04661865034937 Test: 52.765414533312104


  3%|▎         | 651/20000 [02:44<1:38:55,  3.26it/s]

Train: 72.08714115042994 Test: 63.66232220177574


  3%|▎         | 669/20000 [02:50<1:22:00,  3.93it/s]

KeyboardInterrupt

