In [1]:
import os, random, time
from collections import deque
from dataclasses import dataclass
import math

import gymnasium as gym
import numpy as np
import torch
from tqdm import tqdm

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from stable_baselines3 import HerReplayBuffer, DDPG, DQN, SAC, TD3
from stable_baselines3.her.goal_selection_strategy import GoalSelectionStrategy
from gymnasium.core import Wrapper
from callbacks.Eval_Callback import Eval_Callback

random.seed(1)                                                 
np.random.seed(1)
torch.manual_seed(1)
torch.backends.cudnn.deterministic = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
env = gym.make("FetchPickAndPlace-v2", render_mode="rgb_array")

model_class = DDPG  # works also with SAC, DDPG and TD3

eval_callback = Eval_Callback(eval_env=env, eval_freq=50000, n_eval_episodes=10)

# Available strategies (cf paper): future, final, episode
goal_selection_strategy = "future" # equivalent to GoalSelectionStrategy.FUTURE

# Initialize the model
model = model_class("MultiInputPolicy", env=env, tau=0.05, batch_size=1024, learning_rate=0.001, gamma=0.95,   policy_kwargs=dict(n_critics=2, net_arch=[256, 256, 256]), replay_buffer_class=HerReplayBuffer, replay_buffer_kwargs=dict(n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,),verbose=1)

model.learn(total_timesteps=500000, log_interval=10, callback=eval_callback)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  logger.warn(


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -45      |
|    success_rate    | 0.1      |
| time/              |          |
|    episodes        | 10       |
|    fps             | 20       |
|    time_elapsed    | 24       |
|    total_timesteps | 500      |
| train/             |          |
|    actor_loss      | 1.89     |
|    critic_loss     | 0.199    |
|    learning_rate   | 0.001    |
|    n_updates       | 350      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -45      |
|    success_rate    | 0.1      |
| time/              |          |
|    episodes        | 20       |
|    fps             | 19       |
|    time_elapsed    | 51       |
|    total_timesteps | 1000     |
| train/             |          |
|    actor_loss      | 1.64     |
|    critic_loss     | 0.0842   |
|    learning_

KeyboardInterrupt: 

In [4]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)                # Initialize layer weights according to orthogonal method.
    torch.nn.init.constant_(layer.bias, bias_const)             # Set the bias of the layer.
    return layer

In [8]:
class ProbabilisticModel(nn.Module):
    def __init__(self, input_size, initial_var=1, min_var=1e-8, max_var=100,
                 mean_scale=1, var_scale=1,
                 use_spectral_norm_mean=False,
                 use_spectral_norm_var=False):
        super(ProbabilisticModel, self).__init__()

        self.min_var = min_var
        self.max_var = max_var
        self.init_var_offset = np.log(np.exp(initial_var - min_var) - 1)

        self.mean_scale = mean_scale
        self.var_scale = var_scale

        self.mlp = nn.Sequential(
            layer_init(nn.Linear(input_size, 256)),
            nn.Tanh(),
            layer_init(nn.Linear(256, 256)),
            nn.Tanh(),
            layer_init(nn.Linear(256, 256)))
        
        if use_spectral_norm_mean:
            self.mean = nn.utils.spectral_norm(nn.Linear(256, 1))
        else:
            self.mean = nn.Linear(256, 3)

        if use_spectral_norm_var:
            self.var = nn.utils.spectral_norm(nn.Linear(256, 1))
        else:
            self.var = nn.Linear(256, 3)
        

    def forward(self, x):
        x = self.mlp(x)
        mean = self.mean(x) * self.mean_scale
        var = self.var(x) * self.var_scale

        var = F.softplus(var + self.init_var_offset) + self.min_var
        var = torch.clamp(var, self.min_var, self.max_var)                                                                  # Ensure std is positive
        return mean, var
    
_LOG_2PI = math.log(2 * math.pi)

# Define gaussian negative log likelihood
def gaussian_log_likelihood_loss(pred, target, with_logvar=True,
                                 fixed_variance=None, detach_mean=False,
                                 detach_var=False):
    mean = pred[0]
    if detach_mean:
        mean = mean.detach()

    if with_logvar:
        logvar = pred[1]
        if detach_var:
            logvar = logvar.detach()

        if fixed_variance is not None:
            logvar = torch.ones_like(mean) * math.log(fixed_variance)
        ll = -0.5 * ((target - mean)**2 * (-logvar).exp() + logvar + _LOG_2PI)
    else:
        var = pred[1]
        if detach_var:
            var = var.detach()

        if fixed_variance is not None:
            var = torch.ones_like(mean) * fixed_variance
        ll = -0.5 * ((target - mean)**2 / var + torch.log(var) + _LOG_2PI)

    return -torch.sum(ll, axis=-1)




In [9]:
class CIDWrapper(Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.based_bonus = 0.001
        self.eps = []
                             
        self.cid_model = ProbabilisticModel(input_size=29).to(device)
        self.optimizer = optim.Adam(self.cid_model.parameters(), lr=1e-4)
        self.criterion = gaussian_log_likelihood_loss
        self.model_trained = False
        self.batch_size = 500
        self.step_retrained_model = 0    
        self.tracking_step = 0   

        self.history = deque(maxlen=50)
        self.record_obs = None
        self.lambda_params = 0.2
        self.maximum_bonus = 10
        self.K = 64

    def kl_div(self, m1, v1, m2, v2):
        """KL divergence between two Gaussians"""
        d = m1.shape[-1]
        return (0.5 * (-d + ((v1 + (m2 - m1)**2) / v2 + torch.log(v2) - torch.log(v1)).sum(dim=-1)))

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        self.record_obs = obs[0]
        return obs
        
    def step(self, action):
        self.eps.append([self.record_obs,action])

        obs, reward, terminated, truncated, info = self.env.step(action)

        self.record_obs = obs
 
        if terminated or truncated: 
            self.history.append(self.eps.copy())
            self.eps = []

        # Train r_model
        self.step_retrained_model += 1
        self.tracking_step += 1
        if self.step_retrained_model == 10000 and len(self.history) != 0:
            X, y = self.create_training_data()
            self.train_cid_model(X, y)
            self.step_retrained_model = 0
            self.model_trained = True

        if self.model_trained:
            bonus = 0

            for k in range(self.K):
                scoring_step = np.concatenate((obs["observation"], action))
                scoring_step_t = torch.tensor(scoring_step, dtype=torch.float32).to(device)
                scoring_mean, scoring_var = self.cid_model(scoring_step_t)
                scoring_mean, scoring_var = scoring_mean.detach(), scoring_var.detach()

                sampled_actions = [np.random.uniform(low=-1.0, high=1.0, size=(4,)) for _ in range(10)]
                sampled_obs_action = [np.concatenate((obs["observation"], sampled_action)) for sampled_action in sampled_actions]
                sampled_obs_action_t = [torch.tensor(obs_action, dtype=torch.float32).to(device) for obs_action in sampled_obs_action]
                sampled_mean_var = [self.cid_model(obs_action_t) for obs_action_t in sampled_obs_action_t]
                sampled_mean_var = [[mean_var[0].detach(), mean_var[1].detach()] for mean_var in sampled_mean_var]
                mean_tensors = [mean_var[0] for mean_var in sampled_mean_var]
                var_tensors = [mean_var[1] for mean_var in sampled_mean_var]
                sampled_mean, sampled_var = torch.mean(torch.stack(mean_tensors)), torch.mean(torch.stack(var_tensors))

                bonus += self.kl_div(scoring_mean, scoring_var, sampled_mean, sampled_var)
            
            bonus /= self.K
            
            if bonus > self.maximum_bonus:
                bonus = self.maximum_bonus
            
            reward += bonus * self.lambda_params

        return obs, reward, terminated, truncated, info
    
    def create_training_data(self):
        X = []
        y = []
        for episode in self.history:
            for i in range(len(episode)-1):
                observation, action = episode[i][0]["observation"], episode[i][1]
                input = np.concatenate((observation, action))
                target = episode[i+1][0]["achieved_goal"] 
                X.append(input)
                y.append(target)

        return X, y
 

    def train_cid_model(self, X, y):
        if self.tracking_step <= 10000:
            training_epochs = 100
        elif 10000 < self.tracking_step <= 250000:
            training_epochs = 50
        elif 25000 < self.tracking_step <= 500000:
            training_epochs = 20
        else:
            return
        for _ in tqdm(range(training_epochs)):
            shuffle_list = list(zip(X, y))
            random.shuffle(shuffle_list)
            X_shuffle, y_shuffle = zip(*shuffle_list)
            for input_sequence, target_output in zip(X_shuffle[:self.batch_size], y_shuffle[:self.batch_size]):
                input_sequence_t = torch.tensor(input_sequence, dtype=torch.float32).to(device)
                target_output_t = torch.tensor(target_output, dtype=torch.float32).to(device)

                self.optimizer.zero_grad()
                mean, var = self.cid_model(input_sequence_t)
                predict = [mean, var]
                loss = self.criterion(predict, target_output_t)
                # print(loss)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

In [10]:
env = gym.make("FetchPickAndPlace-v2", render_mode="rgb_array")
train_env = CIDWrapper(env)

model_class = DDPG  # works also with SAC, DDPG and TD3

eval_callback = Eval_Callback(eval_env=env, eval_freq=50000, n_eval_episodes=10)

# Available strategies (cf paper): future, final, episode
goal_selection_strategy = "future" # equivalent to GoalSelectionStrategy.FUTURE

# Initialize the model
model = model_class("MultiInputPolicy", env=train_env, tau=0.05, batch_size=1024, learning_rate=0.001, gamma=0.95,   policy_kwargs=dict(n_critics=2, net_arch=[256, 256, 256]), replay_buffer_class=HerReplayBuffer, replay_buffer_kwargs=dict(n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,),verbose=1)

model.learn(total_timesteps=500000, log_interval=10, callback=eval_callback)



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -50      |
|    success_rate    | 0.0      |
| time/              |          |
|    episodes        | 10       |
|    fps             | 43       |
|    time_elapsed    | 11       |
|    total_timesteps | 500      |
| train/             |          |
|    actor_loss      | 2.35     |
|    critic_loss     | 0.158    |
|    learning_rate   | 0.001    |
|    n_updates       | 350      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -50      |
|    success_rate    | 0.0      |
| time/              |          |
|    episodes        | 20       |
|    fps             | 37       |
|    time_elapsed    | 26       |
|    total_timesteps | 1000     |
| train/             

100%|██████████| 100/100 [03:57<00:00,  2.37s/it]


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -47.4    |
|    success_rate    | 0.05     |
| time/              |          |
|    episodes        | 200      |
|    fps             | 17       |
|    time_elapsed    | 569      |
|    total_timesteps | 10000    |
| train/             |          |
|    actor_loss      | 4.34     |
|    critic_loss     | 0.361    |
|    learning_rate   | 0.001    |
|    n_updates       | 9850     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | -37.9    |
|    success_rate    | 0.04     |
| time/              |          |
|    episodes        | 210      |
|    fps             | 9        |
|    time_elapsed    | 1088     |
|    total_timesteps | 10500    |
| train/             |          |
|    actor_loss      | 3.58     |
|    critic_loss     | 0.375    |
|    learning_

100%|██████████| 50/50 [01:21<00:00,  1.63s/it]


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 53       |
|    success_rate    | 0.06     |
| time/              |          |
|    episodes        | 400      |
|    fps             | 2        |
|    time_elapsed    | 8976     |
|    total_timesteps | 20000    |
| train/             |          |
|    actor_loss      | -6.18    |
|    critic_loss     | 0.221    |
|    learning_rate   | 0.001    |
|    n_updates       | 19850    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 53.5     |
|    success_rate    | 0.07     |
| time/              |          |
|    episodes        | 410      |
|    fps             | 2        |
|    time_elapsed    | 9310     |
|    total_timesteps | 20500    |
| train/             |          |
|    actor_loss      | -7.21    |
|    critic_loss     | 0.203    |
|    learning_

100%|██████████| 50/50 [01:20<00:00,  1.61s/it]


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 51.5     |
|    success_rate    | 0.03     |
| time/              |          |
|    episodes        | 600      |
|    fps             | 1        |
|    time_elapsed    | 15686    |
|    total_timesteps | 30000    |
| train/             |          |
|    actor_loss      | -2.46    |
|    critic_loss     | 0.289    |
|    learning_rate   | 0.001    |
|    n_updates       | 29850    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 50       |
|    ep_rew_mean     | 52       |
|    success_rate    | 0.04     |
| time/              |          |
|    episodes        | 610      |
|    fps             | 1        |
|    time_elapsed    | 16023    |
|    total_timesteps | 30500    |
| train/             |          |
|    actor_loss      | -2.65    |
|    critic_loss     | 0.582    |
|    learning_