In [1]:
from typing import Any, Dict  # noqa

import gym
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

from ezrl.algorithms.ppo import PPOOptimizer
from ezrl.policy import ACPolicy



def ppo_rollout(
    policy: ACPolicy, env_name: str = None, env=None, env_creation_fn=None
) -> Dict[str, np.ndarray]:
    """Play one full episode with ``policy`` and record the trajectory.

    Uses the classic gym interface: ``env.reset()`` returns the observation
    and ``env.step(action)`` returns ``(observation, reward, done, info)``.

    Args:
        policy: Actor-critic policy. It must expose ``device``, ``act(obs)``
            returning ``(action, out)`` where ``out["log_probs"]`` is a tensor,
            and ``critic(obs)`` returning a tensor.
        env_name: Environment id handed to ``env_creation_fn``; only used when
            ``env`` is not given.
        env: Ready-made environment. Takes precedence over ``env_name``. Note
            that it is closed when the episode ends.
        env_creation_fn: Factory called as ``env_creation_fn(env_name)``;
            defaults to ``gym.make``.

    Returns:
        Dict with per-step arrays under the keys ``"observations"``,
        ``"actions"``, ``"rewards"``, ``"log_probs"`` and ``"values"``.

    Raises:
        ValueError: If neither ``env_name`` nor ``env`` is provided.
    """
    if env_name is None and env is None:
        raise ValueError("env_name or env must be provided!")
    if env is None:
        if env_creation_fn is None:
            env_creation_fn = gym.make
        env = env_creation_fn(env_name)

    observations, actions, rewards, log_probs, values = ([], [], [], [], [])
    try:
        done = False
        observation = env.reset()
        # Pure data collection: no autograd graph is needed here.
        with torch.no_grad():
            while not done:
                # Cast to float32 so float64 observations do not trip the
                # policy's float32 layers (the training loop casts the same way).
                obs = torch.from_numpy(observation).float().to(policy.device)
                action, out = policy.act(obs)
                v = policy.critic(obs)
                next_observation, reward, done, info = env.step(action)

                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                log_probs.append(out["log_probs"].detach().cpu().numpy())
                values.append(v.detach().cpu().numpy())

                observation = next_observation
    finally:
        # Always release the environment, even if the policy or env raised.
        env.close()

    return {
        "observations": np.array(observations),
        "actions": np.array(actions),
        "rewards": np.array(rewards),
        "log_probs": np.array(log_probs),
        "values": np.array(values),
    }


In [2]:
import torch.nn as nn
import torch.distributions as td

class LunarLanderACPolicy(ACPolicy):
    """Actor-critic policy built from two independent tanh MLPs.

    The actor maps an 8-dimensional observation to logits over 4 discrete
    actions; the critic maps the same observation to a single value estimate.
    """

    def __init__(self):
        super().__init__()
        self.input_dims = 8
        self.output_dims = 4

        hidden = 32

        # Actor head: observation -> unnormalised action logits.
        actor_layers = [
            nn.Linear(self.input_dims, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, self.output_dims, bias=False),
        ]
        self.policy_net = nn.Sequential(*actor_layers)

        # Critic head: observation -> scalar value.
        critic_layers = [
            nn.Linear(self.input_dims, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, 1, bias=False),
        ]
        self.critic_net = nn.Sequential(*critic_layers)

    def forward(self, obs: Any) -> Dict[str, Any]:
        """Sample an action for ``obs``.

        Returns a dict holding the sampled ``"action"``, the categorical
        ``"dist"`` it was drawn from, and the action's ``"log_probs"``.
        """
        dist = td.Categorical(logits=self.policy_net(obs))
        sampled = dist.sample()
        return {
            "action": sampled,
            "dist": dist,
            "log_probs": dist.log_prob(sampled),
        }

    def critic(self, obs: Any):
        """Return the critic's value estimate for ``obs`` with size-1 dims removed."""
        value = self.critic_net(obs)
        return value.squeeze()

    def act(self, obs: Any):
        """Return ``(action, out)``: the sampled action as a Python number plus the full forward dict."""
        result = self.forward(obs)
        chosen = result["action"].item()
        return chosen, result

In [3]:
policy = LunarLanderACPolicy()
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing on the `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy = policy.to(device)

In [4]:
from ezrl.utils import get_tensorboard_logger

In [5]:
from typing import Any, Dict  # noqa

import gym
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

from ezrl.optimizer import RLOptimizer
from ezrl.policy import ACPolicy


def ppo_rollout(
    policy: ACPolicy, env_name: str = None, env=None, env_creation_fn=None
) -> Dict[str, np.ndarray]:
    """Play one full episode with ``policy`` and record the trajectory.

    Uses the classic gym interface: ``env.reset()`` returns the observation
    and ``env.step(action)`` returns ``(observation, reward, done, info)``.

    Args:
        policy: Actor-critic policy. It must expose ``device``, ``act(obs)``
            returning ``(action, out)`` where ``out["log_probs"]`` is a tensor,
            and ``critic(obs)`` returning a tensor.
        env_name: Environment id handed to ``env_creation_fn``; only used when
            ``env`` is not given.
        env: Ready-made environment. Takes precedence over ``env_name``. Note
            that it is closed when the episode ends.
        env_creation_fn: Factory called as ``env_creation_fn(env_name)``;
            defaults to ``gym.make``.

    Returns:
        Dict with per-step arrays under the keys ``"observations"``,
        ``"actions"``, ``"rewards"``, ``"log_probs"`` and ``"values"``.

    Raises:
        ValueError: If neither ``env_name`` nor ``env`` is provided.
    """
    if env_name is None and env is None:
        raise ValueError("env_name or env must be provided!")
    if env is None:
        if env_creation_fn is None:
            env_creation_fn = gym.make
        env = env_creation_fn(env_name)

    observations, actions, rewards, log_probs, values = ([], [], [], [], [])
    try:
        done = False
        observation = env.reset()
        # Pure data collection: no autograd graph is needed here.
        with torch.no_grad():
            while not done:
                # Cast to float32 so float64 observations do not trip the
                # policy's float32 layers (the training loop casts the same way).
                obs = torch.from_numpy(observation).float().to(policy.device)
                action, out = policy.act(obs)
                v = policy.critic(obs)
                next_observation, reward, done, info = env.step(action)

                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                log_probs.append(out["log_probs"].detach().cpu().numpy())
                values.append(v.detach().cpu().numpy())

                observation = next_observation
    finally:
        # Always release the environment, even if the policy or env raised.
        env.close()

    return {
        "observations": np.array(observations),
        "actions": np.array(actions),
        "rewards": np.array(rewards),
        "log_probs": np.array(log_probs),
        "values": np.array(values),
    }


class PPOOptimizer(RLOptimizer):
    """PPO-style optimizer using the clipped surrogate objective.

    One Adam optimizer is created over all of ``policy.parameters()``, and
    the loss is the sum of the clipped actor loss and an MSE value loss.

    NOTE(review): ``vf_coef``, ``entropy_weight`` and ``lam`` are stored but
    are not read anywhere in this class; ``loss_fn`` currently uses an
    unweighted ``actor_loss + value_loss``. Confirm whether that is intended
    before relying on those hyperparameters.
    """

    def __init__(
        self,
        policy: ACPolicy,
        pi_lr: float = 0.0005,
        vf_coef: float = 0.5,
        entropy_weight: float = 0.001,
        gamma: float = 0.99,
        lam: float = 0.95,
        clip_ratio: float = 0.2,
        train_pi_iters: int = 5,
        num_rollouts: int = 1,
    ):
        """Store the hyperparameters and build the underlying Adam optimizer.

        Args:
            policy: Actor-critic policy whose parameters are optimized.
            pi_lr: Adam learning rate.
            vf_coef: Stored only (see class note).
            entropy_weight: Stored only (see class note).
            gamma: Discount factor used for returns.
            lam: Stored only (see class note).
            clip_ratio: Half-width of the clipping interval around 1 for the
                probability ratio.
            train_pi_iters: Number of gradient steps taken by ``update``.
            num_rollouts: Stored only; ``rollout`` takes its own
                ``num_rollouts`` argument.
        """
        self.policy = policy
        self.pi_lr = pi_lr
        self.vf_coef = vf_coef
        self.entropy_weight = entropy_weight
        self.gamma = gamma
        self.lam = lam
        self.clip_ratio = clip_ratio
        self.train_pi_iters = train_pi_iters
        self.num_rollouts = num_rollouts
        self.setup_optimizer()

    @staticmethod
    def _standardize(x: np.ndarray) -> np.ndarray:
        """Centre ``x`` and scale it to unit std; only centre it when std is ~0."""
        centered = x - np.mean(x)
        std = np.std(x)
        # A one-step or constant sequence has zero std; dividing would yield
        # NaN/inf and poison every parameter on the next optimizer step.
        if std > 1e-8:
            return centered / std
        return centered

    def discount_rewards(self, rews: torch.Tensor) -> torch.Tensor:
        """Return the discounted reward-to-go for every step of ``rews``."""
        n = len(rews)
        rtgs = torch.zeros_like(rews)
        for i in reversed(range(n)):
            rtgs[i] = rews[i] + self.gamma * (rtgs[i + 1] if i + 1 < n else 0)
        return rtgs

    def setup_optimizer(self):
        """Create the Adam optimizer over all policy parameters."""
        self.optimizer = optim.Adam(self.policy.parameters(), lr=self.pi_lr)

    def calculate_advantages(
        self, returns: np.ndarray, values: np.ndarray, normalize: bool = True
    ):
        """Compute ``returns - values``, optionally standardized.

        Both inputs are squeezed before subtracting so that a trailing size-1
        axis on one of them cannot broadcast the difference into a matrix.
        """
        advantages = np.squeeze(returns) - np.squeeze(values)

        if normalize:
            advantages = self._standardize(advantages)
        return advantages.squeeze()

    def calculate_returns(
        self, rewards: np.ndarray, discount_factor: float, normalize: bool = True
    ):
        """Compute discounted returns for one episode, optionally standardized.

        Args:
            rewards: Per-step rewards in time order.
            discount_factor: Multiplier applied to the running future return.
            normalize: When true, standardize the returns.

        Returns:
            Squeezed array of per-step returns.
        """
        returns = []
        running = 0

        # Accumulate back-to-front, then flip once; this avoids the quadratic
        # cost of inserting at the head of the list on every step.
        for r in reversed(rewards):
            running = r + running * discount_factor
            returns.append(running)
        returns.reverse()

        returns = np.squeeze(np.array(returns))

        if normalize:
            returns = self._standardize(returns)
        return returns

    def value_loss(self, values, returns):
        """Mean squared error between predicted ``values`` and ``returns``."""
        assert tuple(values.squeeze().size()) == tuple(returns.squeeze().size())
        return F.mse_loss(values.squeeze(), returns.squeeze())

    def actor_loss(self, log_probs, old_logprobs, advantages):
        """Clipped surrogate policy loss (negated so that it is minimized)."""
        ratio = torch.exp(log_probs.squeeze() - old_logprobs.squeeze())
        assert tuple(ratio.size()) == tuple(advantages.size())
        surr1 = ratio * advantages
        surr2 = (
            torch.clamp(ratio, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio)
            * advantages
        )
        loss_pi = -(torch.min(surr1, surr2)).mean()
        return loss_pi

    def zero_grad(self):
        """Clear the gradients held by the underlying optimizer."""
        self.optimizer.zero_grad()

    def loss_fn(
        self,
        observations: torch.Tensor,
        actions: torch.Tensor,
        returns: torch.Tensor,
        advantages: torch.Tensor,
        old_log_probs: torch.Tensor,
    ):
        """Evaluate the loss on one batch.

        Returns:
            Tuple ``(loss, actor_loss, value_loss)`` where
            ``loss = actor_loss + value_loss``.
        """
        out = self.policy(observations)
        dist = out["dist"]
        # Re-score the *recorded* actions under the current policy.
        log_probs = self.policy.log_prob(dist, actions)
        actor_loss = self.actor_loss(log_probs, old_log_probs, advantages)

        values = self.policy.critic(observations)
        value_loss = self.value_loss(values, returns)
        # NOTE(review): vf_coef / entropy_weight are not applied here.
        loss = actor_loss + value_loss
        return loss, actor_loss, value_loss

    def step(self):
        """Apply one optimizer step using the current gradients."""
        self.optimizer.step()

    def update(
        self,
        observations: torch.Tensor,
        actions: torch.Tensor,
        returns: torch.Tensor,
        advantages: torch.Tensor,
        log_probs: torch.Tensor,
    ) -> Any:
        """Run ``train_pi_iters`` gradient steps on a single batch.

        Returns:
            Three arrays holding the total, actor and value loss per step.
        """
        losses = []
        actor_losses = []
        value_losses = []
        for _ in range(self.train_pi_iters):
            self.zero_grad()
            loss, actor_loss, value_loss = self.loss_fn(
                observations, actions, returns, advantages, log_probs
            )
            loss.backward()
            self.step()
            losses.append(loss.item())
            actor_losses.append(actor_loss.item())
            value_losses.append(value_loss.item())
        return np.array(losses), np.array(actor_losses), np.array(value_losses)

    def rollout_fn(self):
        """Return the default rollout function for this algorithm."""
        return ppo_rollout

    def rollout(
        self, rollout_fn=None, pool=None, num_rollouts: int = 1, *args, **kwargs
    ) -> list:
        """Collect ``num_rollouts`` episodes and attach returns and advantages.

        Args:
            rollout_fn: Callable invoked as
                ``rollout_fn(self.policy, *args, **kwargs)``; defaults to
                ``self.rollout_fn()``.
            pool: Optional pool object exposing ``starmap(func, iterable)``.
                When given, the rollouts are dispatched through it.
            num_rollouts: Number of episodes to collect.
            *args, **kwargs: Forwarded to ``rollout_fn``.

        Returns:
            List with one dict per episode: whatever ``rollout_fn`` returned,
            plus ``"returns"`` and ``"advantages"`` entries.
        """
        if rollout_fn is None:
            rollout_fn = self.rollout_fn()
        if pool is None:
            rollouts = [
                rollout_fn(self.policy, *args, **kwargs) for _ in range(num_rollouts)
            ]
        else:
            # starmap only forwards positional arguments, so keyword arguments
            # are bound up front with functools.partial.
            from functools import partial

            worker = partial(rollout_fn, **kwargs) if kwargs else rollout_fn
            rollouts = list(
                pool.starmap(
                    worker, [(self.policy, *args) for _ in range(num_rollouts)]
                )
            )
        for r in rollouts:
            r["returns"] = self.calculate_returns(r["rewards"], self.gamma)
            r["advantages"] = self.calculate_advantages(r["returns"], r["values"])
        return rollouts


In [6]:
# PPO optimizer with its default hyperparameters, bound to the policy built above.
optimizer = PPOOptimizer(policy=policy)

In [7]:
# Logger whose `add_scalar` receives the per-iteration training metrics below.
writer = get_tensorboard_logger(
    "PPOOptimizer",
)


Follow tensorboard logs with: tensorboard --logdir '/home/shyam/Code/ez-rl/examples/tensorboard_logs/PPOOptimizer_2022-03-01 01:02:25.520193'


In [8]:
from tqdm import tqdm

# Outer loop: collect fresh rollouts, take a few clipped-gradient steps on each
# one, then log the averaged metrics for this iteration.
bar = tqdm(np.arange(50000))

for i in bar:
    rollouts = optimizer.rollout(ppo_rollout, env_name="LunarLander-v2")

    losses, actor_losses, value_losses, rewards = [], [], [], []

    for r in rollouts:
        # Move every rollout array onto the policy's device as float32.
        observations, actions, returns, advantages, log_probs = (
            torch.from_numpy(r[field]).float().to(policy.device)
            for field in (
                "observations",
                "actions",
                "returns",
                "advantages",
                "log_probs",
            )
        )

        for _ in range(5):  # train each episode for 5 iterations
            optimizer.zero_grad()
            loss, actor_loss, value_loss = optimizer.loss_fn(
                observations, actions, returns, advantages, log_probs
            )
            loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), 10.0)
            optimizer.step()

            # Summed gradient per parameter; only the last step's values
            # survive to be logged.
            grad_dict = {}
            for n, W in policy.named_parameters():
                if W.grad is None:
                    continue
                grad_dict["{}_grad".format(n)] = float(W.grad.sum().item())

        # Only the final inner step's losses are recorded for each rollout.
        rewards.append(np.sum(r["rewards"]))
        losses.append(loss.item())
        actor_losses.append(actor_loss.item())
        value_losses.append(value_loss.item())

    metrics_dict = {
        "loss": np.mean(losses),
        "actor_loss": np.mean(actor_losses),
        "value_loss": np.mean(value_losses),
        "sum_reward": np.mean(rewards),
        **grad_dict,
    }

    for key in metrics_dict:
        writer.add_scalar(key, metrics_dict[key], i)

    mean_reward = np.mean(rewards)
    bar.set_description("Loss: {}, Reward: {}".format(loss, mean_reward))

Loss: 1.0612518787384033, Reward: -67.23527357043673:   0%|          | 236/50000 [01:39<28:01:36,  2.03s/it]  