In [1]:
from optim_rl.policy import GymPolicy
from optim_rl.algorithms.es import ESOptimizer



In [2]:
import gym
from gym import wrappers as w
from gym.spaces import Discrete, Box
import pybullet_envs
import numpy as np
import torch
import torch.nn as nn
from typing import List, Any
import pickle
import os
import time

from torchvision import datasets, transforms
import torchvision.transforms as T

gym.logger.set_level(40)


  warn(f"Failed to load image Python extension: {e}")


In [3]:
from typing import Any, Dict
import torch.nn as nn
import torch.distributions as td

class LunarLanderPolicy(GymPolicy):
    def __init__(self):
        super().__init__()
        self.input_dims = 8
        self.output_dims = 4

        self.net = nn.Sequential(
            nn.Linear(8, 28, bias=False),
            nn.Tanh(),
            nn.Linear(28, 28, bias=False),
            nn.Tanh(),
            nn.Linear(28, 4, bias=False)
        )

    def forward(self, obs: Any) -> Dict[str, Any]:
        logits = self.net(obs)
        dist = td.Categorical(logits=logits)
        action = dist.sample()
        return {"action":action}

    def act(self, obs: Any):
        out = self.forward(obs)
        return out["action"].item(), out

In [4]:
import gym
import torch
import numpy as np

def es_rollout(policy: GymPolicy, env_name: str = None, env=None, env_creation_fn=None):
    if env_name is None and env is None:
        raise ValueError("env_name or env must be provided!")
    if env is None:
        if env_creation_fn is None:
            env_creation_fn = gym.make
        env = env_creation_fn(env_name)
    done = False
    rewards = []
    observation = env.reset()
    with torch.no_grad():
        while not done:
            action, _ = policy.act(
                torch.from_numpy(observation).unsqueeze(0).to(policy.device)
            )
            next_observation, reward, done, info = env.step(action)

            rewards.append(reward)

            observation = next_observation
    env.close()
    return {"rewards":np.array(rewards)}


In [5]:
from optim_rl.utils import get_tensorboard_logger

In [6]:
policy = LunarLanderPolicy()
device = torch.device('cuda')
policy = policy.to(device)

In [7]:
optimizer = ESOptimizer(policy, lr=0.02)

In [8]:
writer = get_tensorboard_logger("ESOptimizer")

Follow tensorboard logs with: tensorboard --logdir '/home/shyam/Code/ez-rl/examples/tensorboard_logs/ESOptimizer_2023-02-19 18:44:47.328402'


In [9]:
from tqdm import tqdm

bar = tqdm(np.arange(50000))

for i in bar:
    rewards, epsilon, mean = optimizer.rollout(es_rollout, env_name = "LunarLander-v2")
    optimizer.zero_grad()
    loss = optimizer.loss_fn(rewards, epsilon, mean)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(policy.parameters(), 10.0)
    optimizer.step()

    grad_dict = {}
    for n, W in policy.named_parameters():
        if W.grad is not None:
            grad_dict["{}_grad".format(n)] = float(torch.sum(W.grad).item())

    avg_reward = np.mean(rewards)


    metrics_dict = {"loss":avg_reward, "sum_reward":avg_reward, **grad_dict}

    for key in metrics_dict:
        writer.add_scalar(key, metrics_dict[key], i)


    bar.set_description("Loss: {}, Reward: {}".format(np.mean(loss.rewards), avg_reward))

Loss: -48.468420333291434, Reward: -48.468420333291434:   1%|     | 314/50000 [03:04<8:07:38,  1.70it/s]

KeyboardInterrupt

