In [1]:
from ezrl.policy import GymPolicy
from ezrl.algorithms.es import ESOptimizer

In [2]:
from typing import Any, Dict
import torch.nn as nn
import torch.distributions as td

class LunarLanderPolicy(GymPolicy):
    """Small tanh MLP policy that samples a discrete action from its logits.

    The layer sizes are constructor parameters so that ``input_dims`` /
    ``output_dims`` and the actual ``nn.Linear`` shapes can never drift apart.
    The defaults reproduce the sizes that were previously hard-coded
    (8 inputs, two hidden layers of 32 units, 4 outputs).

    Args:
        input_dims: Number of features in one observation.
        output_dims: Number of discrete actions (size of the logits vector).
        hidden_dims: Width of each of the two hidden layers.
    """

    def __init__(self, input_dims: int = 8, output_dims: int = 4, hidden_dims: int = 32):
        super().__init__()
        self.input_dims = input_dims
        self.output_dims = output_dims

        self.net = nn.Sequential(
            nn.Linear(input_dims, hidden_dims),
            nn.Tanh(),
            nn.Linear(hidden_dims, hidden_dims),
            nn.Tanh(),
            # The final layer has no bias term; it only produces action logits.
            nn.Linear(hidden_dims, output_dims, bias=False),
        )

    def forward(self, obs: Any) -> Dict[str, Any]:
        """Sample an action for ``obs``.

        Args:
            obs: Tensor whose last dimension has ``input_dims`` features.

        Returns:
            A dict with a single key ``"action"`` holding the sampled action
            index tensor (one index per leading batch element).
        """
        logits = self.net(obs)
        # Stochastic policy: draw from the categorical distribution rather
        # than taking the argmax.
        dist = td.Categorical(logits=logits)
        action = dist.sample()
        return {"action": action}

    def act(self, obs: Any):
        """Return ``(action, forward_output)`` for a single observation.

        ``.item()`` only works on a one-element tensor, so ``obs`` must
        describe exactly one observation (e.g. a batch of size 1).
        """
        out = self.forward(obs)
        return out["action"].item(), out

In [3]:
import gym
import torch
import numpy as np

def es_rollout(policy: GymPolicy, env_name: str = None, env=None, env_creation_fn=None):
    """Play one episode with ``policy`` and return the reward of every step.

    Uses the classic gym interface: ``env.reset()`` returns the observation
    and ``env.step()`` returns ``(observation, reward, done, info)``.

    Args:
        policy: Policy exposing ``act(obs_tensor) -> (action, extra)`` and a
            ``device`` attribute.
        env_name: Name passed to ``env_creation_fn`` when ``env`` is not given.
        env: Ready-made environment to use. It is left open after the episode
            so the caller can keep reusing it.
        env_creation_fn: Factory called with ``env_name``; defaults to
            ``gym.make``.

    Returns:
        1-D ``np.ndarray`` with one reward per environment step.

    Raises:
        ValueError: If neither ``env_name`` nor ``env`` is provided.
    """
    if env_name is None and env is None:
        raise ValueError("env_name or env must be provided!")

    # Only an environment built here is ours to close; without this every
    # rollout would leak one environment instance.
    owns_env = env is None
    if owns_env:
        if env_creation_fn is None:
            env_creation_fn = gym.make
        env = env_creation_fn(env_name)

    rewards = []
    try:
        observation = env.reset()
        done = False
        # Rollouts only evaluate the policy, so no autograd graph is needed.
        with torch.no_grad():
            while not done:
                action, _ = policy.act(
                    # Add a leading batch dimension of 1 and move to the policy's device.
                    torch.from_numpy(observation).unsqueeze(0).to(policy.device)
                )
                observation, reward, done, _ = env.step(action)
                rewards.append(reward)
    finally:
        if owns_env:
            # Tolerate custom factories whose objects do not define close().
            close = getattr(env, "close", None)
            if close is not None:
                close()

    return np.array(rewards)


In [4]:
from ezrl.utils import get_tensorboard_logger

In [5]:
policy = LunarLanderPolicy()
# Prefer the GPU but fall back to CPU, so the notebook still runs on machines
# without a usable CUDA device instead of failing at .to(device).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
policy = policy.to(device)

In [6]:
# Writer that receives the per-iteration scalars logged by the training loop
# (via writer.add_scalar); creating it prints the tensorboard log directory.
writer = get_tensorboard_logger("ESOptimizer")
# Optimizer driving the policy's parameters; lr is presumably the learning
# rate / step size — confirm against ESOptimizer.
optimizer = ESOptimizer(policy, lr=0.02)

Follow tensorboard logs with: tensorboard --logdir '/home/shyam/Code/ez-rl/examples/tensorboard_logs/ESOptimizer_2022-02-28 19:32:42.916647'


In [7]:
from tqdm import tqdm

# Training loop: each iteration collects rollouts, turns them into a loss,
# takes one optimizer step, and logs scalars to tensorboard.
bar = tqdm(range(50000))

for i in bar:
    rewards, epsilon, mean = optimizer.rollout(es_rollout, env_name="LunarLander-v2")
    optimizer.zero_grad()
    loss = optimizer.loss_fn(rewards, epsilon, mean)
    loss.backward()
    optimizer.step()

    # Sum of each parameter's gradient, as a cheap signal that gradients flow.
    grad_dict = {
        "{}_grad".format(n): float(torch.sum(W.grad).item())
        for n, W in policy.named_parameters()
        if W.grad is not None
    }

    avg_reward = np.mean(rewards)
    # backward() without arguments implies a scalar loss, so .item() is safe
    # and yields a plain float for logging and display.
    loss_value = loss.item()

    # "loss" previously logged avg_reward by mistake; it now logs the loss.
    # NOTE(review): the "sum_reward" tag actually carries the mean of
    # `rewards`; the tag is kept unchanged so existing logs stay comparable.
    metrics_dict = {"loss": loss_value, "sum_reward": avg_reward, **grad_dict}

    for key, value in metrics_dict.items():
        writer.add_scalar(key, value, i)

    bar.set_description("Loss: {}, Reward: {}".format(loss_value, avg_reward))

  0%|          | 0/50000 [00:00<?, ?it/s]


AttributeError: 'ESLoss' object has no attribute 'epsilons'