In [1]:
from ezrl.policy import GymPolicy
from ezrl.algorithms.es import ESOptimizer

In [4]:
from hypernn.torch.hypernet import TorchHyperNetwork
from hypernn.torch.weight_generator import TorchWeightGenerator
from hypernn.torch.embedding_module import TorchEmbeddingModule

In [29]:
from typing import Optional, Any, Dict

class DefaultTorchEmbeddingModule(TorchEmbeddingModule):
    def __init__(self, embedding_dim: int, num_embeddings: int, input_shape: Optional[Any] = None):
        super().__init__(embedding_dim, num_embeddings, input_shape)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)

    def forward(self, inp: Optional[Any] = None, *args, **kwargs):
        indices = torch.arange(self.num_embeddings).to(self.device)
        return self.embedding(indices), {}


In [77]:
import torch.nn.functional as F
import torch

class DeepTorchWeightGenerator(TorchWeightGenerator):
    def __init__(self, embedding_dim: int, num_embeddings: int, hidden_dim: int, input_shape: Optional[Any] = None):
        super().__init__(embedding_dim, num_embeddings, hidden_dim, input_shape)
        self.linear1 = nn.Linear(embedding_dim, 8, bias=False)
        self.linear2 = nn.Linear(8, hidden_dim, bias=False)

    def forward(
        self, embedding: torch.Tensor, inp: Optional[Any] = None
    ) -> torch.Tensor:
        x = self.linear1(embedding)
        x = F.relu(x)
        return self.linear2(x).view(-1), {}



In [18]:
target_network = nn.Sequential(
            nn.Linear(28, 28, bias=False),
            nn.Tanh(),
            nn.Linear(28, 28, bias=False),
            nn.Tanh(),
            nn.Linear(28, 8, bias=False)
        )


In [19]:
pytorch_total_params = sum(p.numel() for p in target_network.parameters() if p.requires_grad)
pytorch_total_params

1792

In [21]:
import torch
import torch.nn as nn

def make_sequental_mlp(input_space, action_space, hidden_dim, bias, layers): 
    
    fc_in = nn.Linear(input_space, hidden_dim, bias=bias)
    fc_out = nn.Linear(hidden_dim, action_space , bias=bias)
    tanh = torch.nn.Tanh()
    layer_list = [fc_in, tanh]
    for i in range(1, layers-1):
        layer_list.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
        layer_list.append(torch.nn.Tanh())
    layer_list.append(fc_out)
    
    return torch.nn.Sequential(*layer_list)


class MLP(nn.Module):        
    
    def __init__(self, input_space, action_space, hidden_dim, bias, layers):
        super(MLP, self).__init__()
        self.out = make_sequental_mlp(input_space, action_space, hidden_dim, bias, layers)

    def forward(self, x):
        return self.out(x)
        

In [23]:
import gym
from gym import wrappers as w
from gym.spaces import Discrete, Box
import pybullet_envs
import numpy as np
import torch
import torch.nn as nn
from typing import List, Any
import pickle
import os
import time

from torchvision import datasets, transforms
import torchvision.transforms as T

gym.logger.set_level(40)


In [24]:
env = gym.make("AntBulletEnv-v0")

pybullet build time: Dec 23 2020 01:49:05


In [26]:
mlp = MLP(env.observation_space.shape[0], env.action_space.shape[0], 28, False, 3)

In [28]:
pytorch_total_params = sum(p.numel() for p in mlp.parameters() if p.requires_grad)
pytorch_total_params

1792

In [98]:
EMBEDDING_DIM = 7
NUM_EMBEDDINGS = 56

embedding_network = DefaultTorchEmbeddingModule.from_target(target_network, EMBEDDING_DIM, NUM_EMBEDDINGS)
weight_generator = DeepTorchWeightGenerator.from_target(target_network, EMBEDDING_DIM, NUM_EMBEDDINGS)

In [99]:
hypernetwork = TorchHyperNetwork.from_target(
                            mlp,
                            embedding_module=embedding_network,
                            weight_generator=weight_generator,
                        )
pytorch_total_params = sum(p.numel() for p in hypernetwork.parameters() if p.requires_grad)
pytorch_total_params

704

In [46]:
from typing import Any, Dict
import torch.nn as nn
import torch.distributions as td

class LunarLanderPolicy(GymPolicy):
    def __init__(self):
        super().__init__()
        self.input_dims = 8
        self.output_dims = 4

        self.net = nn.Sequential(
            nn.Linear(8, 28, bias=False),
            nn.Tanh(),
            nn.Linear(28, 28, bias=False),
            nn.Tanh(),
            nn.Linear(28, 8, bias=False)
        )

    def forward(self, obs: Any) -> Dict[str, Any]:
        logits = self.net(obs)
        dist = td.Categorical(logits=logits)
        action = dist.sample()
        return {"action":action}

    def act(self, obs: Any):
        out = self.forward(obs)
        return out["action"].item(), out

In [3]:
import gym
import torch
import numpy as np

def es_rollout(policy: GymPolicy, env_name: str = None, env=None, env_creation_fn=None):
    if env_name is None and env is None:
        raise ValueError("env_name or env must be provided!")
    if env is None:
        if env_creation_fn is None:
            env_creation_fn = gym.make
        env = env_creation_fn(env_name)
    done = False
    rewards = []
    observation = env.reset()
    with torch.no_grad():
        while not done:
            action, _ = policy.act(
                torch.from_numpy(observation).unsqueeze(0).to(policy.device)
            )
            next_observation, reward, done, info = env.step(action)

            rewards.append(reward)

            observation = next_observation
    env.close()
    return np.array(rewards)


In [4]:
from ezrl.utils import get_tensorboard_logger

In [5]:
policy = LunarLanderPolicy()
device = torch.device('cuda')
policy = policy.to(device)

In [6]:
writer = get_tensorboard_logger("ESOptimizer")
optimizer = ESOptimizer(policy, lr=0.02)

Follow tensorboard logs with: tensorboard --logdir '/home/shyam/Code/ez-rl/examples/tensorboard_logs/ESOptimizer_2022-02-28 19:35:45.936224'


In [7]:
from tqdm import tqdm

bar = tqdm(np.arange(50000))

for i in bar:
    rewards, epsilon, mean = optimizer.rollout(es_rollout, env_name = "LunarLander-v2")
    optimizer.zero_grad()
    loss = optimizer.loss_fn(rewards, epsilon, mean)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(policy.parameters(), 10.0)
    optimizer.step()

    grad_dict = {}
    for n, W in policy.named_parameters():
        if W.grad is not None:
            grad_dict["{}_grad".format(n)] = float(torch.sum(W.grad).item())

    avg_reward = np.mean(rewards)


    metrics_dict = {"loss":avg_reward, "sum_reward":avg_reward, **grad_dict}

    for key in metrics_dict:
        writer.add_scalar(key, metrics_dict[key], i)


    bar.set_description("Loss: {}, Reward: {}".format(loss, avg_reward))

Loss: <ezrl.algorithms.es.ESLoss object at 0x7f195b1a13d0>, Reward: 148.65967579629182:   5%|▌         | 2670/50000 [1:26:35<25:34:53,  1.95s/it] 


KeyboardInterrupt: 

In [8]:
from ezrl.utils import render

In [12]:
env = gym.make("LunarLander-v2")

In [13]:
render(env, policy)

([array([[[  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0],
          ...,
          [  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0]],
  
         [[  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0],
          ...,
          [  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0]],
  
         [[  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0],
          ...,
          [  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0]],
  
         ...,
  
         [[255, 255, 255],
          [255, 255, 255],
          [255, 255, 255],
          ...,
          [255, 255, 255],
          [255, 255, 255],
          [255, 255, 255]],
  
         [[255, 255, 255],
          [255, 255, 255],
          [255, 255, 255],
          ...,
          [255, 255, 255],
          [255, 255, 255],
          [255, 255, 255]],
  
         [[255, 255, 255],
          [255, 255, 255],
          [255, 255, 255