# PPO

### Let's run PPO on LunarLander!

#### Imports

In [1]:
from optimrl.algorithms.ppo import PPOLossFunction, PPOOptimizer
import gym
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
import torch.distributions as td
import numpy as np
from tqdm import tqdm

#### Seeds

In [2]:
# Training and evaluation use distinct seed constants.
SEED, EVAL_SEED = 1, 2

# Seed the three RNG sources this notebook imports: Python's `random`,
# NumPy's legacy global RNG, and PyTorch's default generator.
# `torch.manual_seed` stays last so the cell still displays its Generator.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x115a70af0>

#### Grab obs + act dims

In [3]:
# Throwaway environment instance, created only to read the space sizes.
env = gym.make("LunarLander-v2")
# Last axis of the observation shape, and the size of the discrete action set.
obs_dims, act_dims = env.observation_space.shape[-1], env.action_space.n
print(f"Obs dims: {obs_dims}, Action dims: {act_dims}")

Obs dims: 8, Action dims: 4


#### Create policy

In [4]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return the same object.

    Args:
        layer: Module exposing ``weight`` and ``bias`` tensors
            (used below with ``nn.Linear``).
        std: Gain passed to the orthogonal weight initialiser.
        bias_const: Constant written into every bias entry.

    Returns:
        The ``layer`` that was passed in, so the call can be nested inside
        an ``nn.Sequential(...)`` definition.
    """
    nn.init.orthogonal_(layer.weight, gain=std)
    nn.init.constant_(layer.bias, val=bias_const)
    return layer

class PPOCategoricalPolicy(nn.Module):
    """Actor-critic policy over a discrete (categorical) action space.

    Two separate three-layer MLPs with hidden width 128 read the same
    observation: ``actor`` emits one logit per action, ``critic`` emits a
    single value per observation.
    """

    def __init__(self, obs_dims, act_dims):
        """Build the actor and critic networks.

        Args:
            obs_dims: Input width of both networks.
            act_dims: Number of discrete actions (actor output width).
        """
        super().__init__()
        self.obs_dims = obs_dims
        self.act_dims = act_dims

        # The critic is built before the actor; keep this order so a fixed
        # seed keeps producing the same initial weights.
        self.critic = nn.Sequential(
            layer_init(nn.Linear(self.obs_dims, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 1)),
        )
        # NOTE(review): the first actor activation is ReLU while every other
        # hidden activation in this class is Tanh. This looks unintentional;
        # it is left unchanged so existing training results stay
        # reproducible -- confirm before changing.
        self.actor = nn.Sequential(
            layer_init(nn.Linear(self.obs_dims, 128)),
            nn.ReLU(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, self.act_dims)),
        )

    def forward(self, obs: torch.Tensor, actions=None):
        """Evaluate the policy and value function on ``obs``.

        Args:
            obs: Observation tensor fed to both networks.
            actions: Actions to score. When ``None``, actions are sampled
                from the categorical distribution instead.

        Returns:
            Dict with keys ``"actions"``, ``"values"``, ``"log_probs"``,
            ``"dist"``, ``"logits"`` and ``"entropy"``.
        """
        logits = self.actor(obs)
        values = self.critic(obs)
        dist = td.Categorical(logits=logits)
        if actions is None:
            actions = dist.sample()
        return {
            "actions": actions,
            "values": values,
            "log_probs": dist.log_prob(actions),
            "dist": dist,
            "logits": logits,
            "entropy": dist.entropy(),
        }

    def act(self, obs, prev_output=None):
        """Sample actions for ``obs`` without tracking gradients.

        Args:
            obs: Observations as a NumPy array (a tensor is also accepted).
                They are converted to the dtype and device of the policy's
                parameters, so float64 input and a policy that was moved to
                another device both work.
            prev_output: Unused; accepted for interface compatibility.

        Returns:
            Tuple ``(actions, out)``: the sampled actions as a NumPy array,
            and the full ``forward`` output dict.
        """
        # Use a parameter as the dtype/device reference: `torch.from_numpy`
        # would always yield a CPU tensor with the array's own dtype.
        reference = next(self.parameters())
        obs_tensor = torch.as_tensor(
            obs, dtype=reference.dtype, device=reference.device
        )
        with torch.no_grad():
            out = self.forward(obs_tensor)
        return out["actions"].detach().cpu().numpy(), out

#### Train

##### Setup optimizer

In [5]:
# Prefer CUDA when it is available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Size the networks from the environment's spaces (`obs_dims` / `act_dims`,
# computed when the env was probed) instead of hard-coding 8 and 4, so the
# notebook keeps working if the environment is swapped. `int(...)` converts
# the NumPy integer from `action_space.n` to a plain Python int.
policy = PPOCategoricalPolicy(int(obs_dims), int(act_dims))
loss_fn = PPOLossFunction(clip_ratio=0.2, ent_coef=0.01, vf_coef=1.0)
optimizer = PPOOptimizer(policy, loss_fn, pi_lr=0.0002, n_updates=5, num_minibatches=1)

# Move both the policy and the optimizer wrapper to the selected device.
policy = policy.to(device)
optimizer = optimizer.to(device)

##### Create vector envs

In [6]:
def make_env(env_name):
    """Create the Gym environment ``env_name`` wrapped in ``RecordEpisodeStatistics``."""
    return gym.wrappers.RecordEpisodeStatistics(gym.make(env_name))

# Four synchronous environment copies for training rollouts.
train_env_fns = [lambda: make_env("LunarLander-v2") for _ in range(4)]
train_envs = gym.vector.SyncVectorEnv(train_env_fns)

# A single environment copy for evaluation rollouts.
test_env_fns = [lambda: make_env("LunarLander-v2") for _ in range(1)]
test_envs = gym.vector.SyncVectorEnv(test_env_fns)

##### Main loop

In [7]:
# Number of outer training iterations (one rollout + one optimizer step each).
NUM_STEPS = 20_000
# Step budget handed to every `optimizer.rollout(...)` call.
MAX_ENV_STEPS = 256

In [8]:
# One "sum_rewards" entry is appended per iteration; the progress line below
# averages the most recent 20 entries of each list.
mean_train_rewards, mean_test_rewards = [], []
bar = tqdm(np.arange(NUM_STEPS))

for i in bar:
    # Rollouts only collect data, so gradient tracking is switched off.
    with torch.no_grad():
        train_rollout = optimizer.rollout(train_envs, policy, MAX_ENV_STEPS, SEED, evaluate=False)
        eval_rollout = optimizer.rollout(test_envs, policy, MAX_ENV_STEPS, EVAL_SEED, evaluate=True)

    # Only the training rollout is used for the update; it is converted to
    # tensors on the optimizer's device first.
    train_batch = train_rollout.to_torch(device=optimizer.device)
    loss, rewards, stats = optimizer.step(train_batch)

    mean_train_rewards.append(train_rollout.stats["sum_rewards"])
    mean_test_rewards.append(eval_rollout.stats["sum_rewards"])

    # Report progress every 50th iteration only.
    if i % 50 != 0:
        continue
    print(f"Train: {np.mean(mean_train_rewards[-20:])} Test: {np.mean(mean_test_rewards[-20:])}")

  if not isinstance(terminated, (bool, np.bool8)):
  0%|              | 2/20000 [00:00<56:01,  5.95it/s]

Train: -393.9243048403554 Test: -373.5218742747213


  0%|             | 52/20000 [00:08<55:27,  6.00it/s]

Train: -274.72977256507704 Test: -151.4301853190978


  1%|            | 101/20000 [00:17<58:39,  5.65it/s]

Train: -226.08828297263986 Test: -118.25417795389743


  1%|            | 152/20000 [00:26<59:04,  5.60it/s]

Train: -185.47341344921622 Test: -126.04781152072255


  1%|          | 201/20000 [00:38<1:17:17,  4.27it/s]

Train: -146.15805684895832 Test: -143.67406486778333


  1%|▏         | 251/20000 [00:48<1:13:28,  4.48it/s]

Train: -57.111805110556645 Test: -39.88096994938027


  2%|▏         | 301/20000 [00:59<1:14:17,  4.42it/s]

Train: -24.35812990504827 Test: -46.02572250673692


  2%|▏         | 351/20000 [01:13<1:53:45,  2.88it/s]

Train: -50.54331651953147 Test: -51.48056194621903


  2%|▏         | 401/20000 [01:28<1:39:24,  3.29it/s]

Train: 16.423162516561657 Test: 36.05912656126602


  2%|▏         | 451/20000 [01:42<1:44:59,  3.10it/s]

Train: 39.92314675425124 Test: 30.777093664942107


  3%|▎         | 501/20000 [01:58<1:43:12,  3.15it/s]

Train: 50.2628253629227 Test: 49.48842975058435


  3%|▎         | 551/20000 [02:14<1:44:34,  3.10it/s]

Train: 64.89179249971582 Test: 48.05182488810087


  3%|▎         | 601/20000 [02:31<1:51:50,  2.89it/s]

Train: 74.11895321577315 Test: 72.88248089889167


  3%|▎         | 651/20000 [02:48<1:50:34,  2.92it/s]

Train: 75.90713243423974 Test: 56.513849171164885


  4%|▎         | 701/20000 [03:05<1:46:42,  3.01it/s]

Train: 81.36831672448784 Test: 85.4187235381402


  4%|▍         | 751/20000 [03:22<1:53:03,  2.84it/s]

Train: 84.41375933213045 Test: 91.01992543915401


  4%|▍         | 801/20000 [03:39<1:58:25,  2.70it/s]

Train: 86.49154817891915 Test: 88.24281142622044


  4%|▍         | 831/20000 [03:50<1:28:31,  3.61it/s]


KeyboardInterrupt: 