# PPO

### Let's run PPO on LunarLander!

#### Imports

In [1]:
from optimrl.algorithms.ppo import PPOLossFunction, PPOOptimizer
from optimrl.policy import GymPolicy
import gym
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
import torch.distributions as td
import numpy as np
from tqdm import tqdm

#### Seeds

In [2]:
# SEED drives the RNGs below and the training rollouts; EVAL_SEED is the
# separate seed handed to the evaluation rollouts.
SEED, EVAL_SEED = 1, 2

# Seed every random number generator this notebook relies on.
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1079cca50>

#### Grab observation and action dimensions

In [3]:
# Instantiate the environment once to read the sizes of its spaces.
env = gym.make("LunarLander-v2")
obs_dims, act_dims = env.observation_space.shape[-1], env.action_space.n
print(f"Obs dims: {obs_dims}, Action dims: {act_dims}")

Obs dims: 8, Action dims: 4


#### Create policy

In [4]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise `layer` in place and hand it back.

    The weight matrix receives an orthogonal initialisation scaled by `std`
    and every bias entry is set to `bias_const`.

    Args:
        layer: Module exposing `weight` and `bias` tensors (e.g. `nn.Linear`).
        std: Gain applied to the orthogonal weight matrix.
        bias_const: Constant written into the bias.

    Returns:
        The same `layer` object, so the call can be nested inside a
        `nn.Sequential(...)` definition.
    """
    weight, bias = layer.weight, layer.bias
    torch.nn.init.orthogonal_(weight, gain=std)
    torch.nn.init.constant_(bias, val=bias_const)
    return layer

class PPOCategoricalPolicy(GymPolicy):
    """Actor-critic policy with a categorical (discrete) action head.

    Two independent MLPs consume the same observation: the actor emits one
    logit per action and the critic emits a single value estimate.
    """

    def __init__(
        self,
        obs_dims,
        act_dims
    ):
        """Build the actor and critic networks.

        Args:
            obs_dims: Width of the observation vector fed to both networks.
            act_dims: Number of discrete actions (width of the actor output).
        """
        super().__init__()
        self.obs_dims = obs_dims
        self.act_dims = act_dims

        self.critic = nn.Sequential(
            layer_init(nn.Linear(self.obs_dims, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, 1)),
        )
        # NOTE(review): the actor mixes ReLU and Tanh while the critic uses
        # Tanh throughout -- confirm this asymmetry is intentional.
        self.actor = nn.Sequential(
            layer_init(nn.Linear(self.obs_dims, 128)),
            nn.ReLU(),
            layer_init(nn.Linear(128, 128)),
            nn.Tanh(),
            layer_init(nn.Linear(128, self.act_dims)),
        )

    def forward(self, obs: torch.Tensor, *args, **kwargs):
        """Sample actions for `obs` and return everything the loss needs.

        Args:
            obs: Observation tensor whose last dimension is `obs_dims`.
            *args, **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            Dict with keys "actions" (sampled from the categorical
            distribution), "values" (critic output), "log_probs" (of the
            sampled actions), "dist" (the `Categorical` object), "logits"
            (raw actor output) and "entropy" (of the distribution).
        """
        logits = self.actor(obs)
        values = self.critic(obs)
        dist = td.Categorical(logits=logits)
        actions = dist.sample()
        entropy = dist.entropy()
        log_probs = dist.log_prob(actions)
        return {"actions":actions, "values":values, "log_probs":log_probs, "dist":dist, "logits":logits, "entropy":entropy}

    def act(self, obs: np.ndarray, prev_output=None):
        """Pick actions for a NumPy observation batch without tracking gradients.

        Args:
            obs: Observations as a NumPy array (converted to a tensor here).
            prev_output: Optional output of the previous call; forwarded to
                `forward` as the `prev_output` keyword. Defaults to an empty
                dict created per call.

        Returns:
            Tuple `(actions, out)` where `actions` is a NumPy array of the
            sampled actions and `out` is the full dict returned by `forward`.
        """
        # A fresh dict per call avoids sharing one mutable default object.
        if prev_output is None:
            prev_output = {}

        # The input must match the weights' device and dtype; otherwise the
        # forward pass fails once the policy has been moved with `.to(device)`.
        first_weight = self.actor[0].weight
        with torch.no_grad():
            obs_tensor = torch.as_tensor(
                obs, dtype=first_weight.dtype, device=first_weight.device
            )
            out = self.forward(obs_tensor, prev_output=prev_output)
        return out["actions"].detach().cpu().numpy(), out

#### Train

##### Setup optimizer

In [5]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Size the networks from the dimensions read off the environment rather than
# hard-coding LunarLander's 8 observations / 4 actions.
policy = PPOCategoricalPolicy(obs_dims, act_dims)
loss_fn = PPOLossFunction(clip_ratio=0.2, ent_coef=0.01, vf_coef=1.0)
optimizer = PPOOptimizer(policy, loss_fn, pi_lr=0.0002, n_updates=5, num_minibatches=1)

# Move both the policy and the optimizer onto the chosen device.
policy = policy.to(device)
optimizer = optimizer.to(device)

##### Create vector envs

In [6]:
def make_env(env_name):
    """Create the gym environment `env_name` wrapped in `RecordEpisodeStatistics`."""
    return gym.wrappers.RecordEpisodeStatistics(gym.make(env_name))

# Four environments for collecting training data, a single one for evaluation.
# Each factory is invoked by SyncVectorEnv to build its own environment.
train_envs = gym.vector.SyncVectorEnv(
    [lambda: make_env("LunarLander-v2") for _ in range(4)],
)
test_envs = gym.vector.SyncVectorEnv(
    [lambda: make_env("LunarLander-v2")],
)

##### Main loop

In [8]:
# Number of iterations of the training loop.
NUM_STEPS = 20_000
# Step budget passed to each `optimizer.rollout` call.
MAX_ENV_STEPS = 256

In [9]:
# One entry per iteration: the "sum_rewards" stat of the train / eval rollout.
mean_train_rewards = []
mean_test_rewards = []
bar = tqdm(np.arange(NUM_STEPS))

for i in bar:
    # Rollouts only gather data, so gradient tracking is switched off.
    with torch.no_grad():
        train_rollout = optimizer.rollout(train_envs, policy, MAX_ENV_STEPS, SEED, evaluate=False)
        eval_rollout = optimizer.rollout(test_envs, policy, MAX_ENV_STEPS, EVAL_SEED, evaluate=True)

    # Only the training rollout is used for the optimisation step.
    train_batch = train_rollout.to_torch(device=optimizer.device)
    loss, rewards, stats = optimizer.step(train_batch)

    mean_train_rewards.append(train_rollout.stats["sum_rewards"])
    mean_test_rewards.append(eval_rollout.stats["sum_rewards"])

    if i % 50 != 0:
        continue

    # Every 50 iterations, report the average of the 20 most recent entries.
    train_avg = np.mean(mean_train_rewards[-20:])
    test_avg = np.mean(mean_test_rewards[-20:])
    print(f"Train: {train_avg} Test: {test_avg}")

  if not isinstance(terminated, (bool, np.bool8)):
  0%|           | 2/20000 [00:00<58:57,  5.65it/s]

Train: -393.9243048403554 Test: -373.5218742747213


  0%|          | 51/20000 [00:08<58:16,  5.70it/s]

Train: -273.5642783934559 Test: -175.4668695229023


  1%|       | 102/20000 [00:18<1:00:40,  5.47it/s]

Train: -254.56763639811135 Test: -131.64911344327896


  1%|         | 152/20000 [00:27<57:54,  5.71it/s]

Train: -217.76374657240564 Test: -141.10066555474953


  1%|       | 202/20000 [00:38<1:03:08,  5.23it/s]

Train: -144.05246665770775 Test: -102.68073270098117


  1%|       | 252/20000 [00:49<1:08:13,  4.82it/s]

Train: -76.38105119147858 Test: -98.79461653134295


  2%|       | 301/20000 [01:00<1:16:01,  4.32it/s]

Train: -5.3193237054889835 Test: -24.242900900393096


  2%|       | 351/20000 [01:13<1:26:28,  3.79it/s]

Train: 1.0020061876276016 Test: -10.782976824479098


  2%|▏      | 401/20000 [01:27<1:36:35,  3.38it/s]

Train: 21.21756061264304 Test: -3.426255855493906


  2%|▏      | 451/20000 [01:41<1:28:39,  3.67it/s]

Train: 57.737975783986585 Test: -20.79175366656107


  3%|▏      | 501/20000 [01:56<1:45:47,  3.07it/s]

Train: 37.771156970021785 Test: 9.045226947642893


  3%|▏      | 551/20000 [02:12<1:39:05,  3.27it/s]

Train: 44.891096920855794 Test: 53.98073737554519


  3%|▏      | 601/20000 [02:27<1:29:07,  3.63it/s]

Train: 83.54920384532238 Test: 29.617076124684974


  3%|▏      | 651/20000 [02:42<1:39:12,  3.25it/s]

Train: 85.35021258918049 Test: 71.44511289284927


  4%|▏      | 701/20000 [02:58<1:43:04,  3.12it/s]

Train: 102.76942228943685 Test: 62.74157267552177


  4%|▎      | 751/20000 [03:14<1:43:14,  3.11it/s]

Train: 93.60628461185051 Test: 69.60163909319085


  4%|▎      | 801/20000 [03:31<1:48:18,  2.95it/s]

Train: 111.6116886770515 Test: 87.29225770937755


  4%|▎      | 851/20000 [03:48<1:49:13,  2.92it/s]

Train: 109.12187029349818 Test: 107.31142128989909


  5%|▎      | 901/20000 [04:05<1:34:31,  3.37it/s]

Train: 84.68232680964925 Test: 85.2608123190793


  5%|▎      | 951/20000 [04:22<1:52:21,  2.83it/s]

Train: 110.65297784919444 Test: 114.98998684806027


  5%|▎     | 1001/20000 [04:39<1:51:15,  2.85it/s]

Train: 125.58778862965798 Test: 119.26638858275062


  5%|▎     | 1011/20000 [04:42<1:28:32,  3.57it/s]


KeyboardInterrupt: 