# Inverse model on FetchNoTask

## Instantiate and train the model

Create the environment.

In [1]:
import gym
import gym_robotics
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecTransposeImage

# Number of environment copies handed to make_vec_env (also reused for the replay buffers).
n_envs = 8

# Vectorized FetchNoTask environment, built with image_obs_space=False.
env = make_vec_env(
    "__root__/FetchNoTask-v1",
    n_envs=n_envs,
    env_kwargs={"image_obs_space": False},
)


Initialize the device.

In [2]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

Create a train and a test replay buffer and fill them with transitions collected using random actions.

In [3]:
import numpy as np
from stable_baselines3.common.buffers import ReplayBuffer

from copy import deepcopy


def collect_rollouts(n, env, replay_buffer):
    """Fill ``replay_buffer`` with transitions obtained from random actions.

    The environment is reset once, then stepped with actions drawn from
    ``env.action_space.sample()`` (one per sub-environment) until at least
    ``n`` timesteps have been gathered. Each call to ``env.step`` counts for
    ``env.num_envs`` timesteps.

    Args:
        n: Minimum number of timesteps to collect.
        env: Vectorized environment exposing ``reset``, ``step``,
            ``action_space`` and ``num_envs``.
        replay_buffer: Object whose ``add`` method receives
            ``(obs, next_obs, action, rewards, dones, infos)``.

    Returns:
        None. The transitions are written into ``replay_buffer``.
    """
    previous_obs = env.reset()
    collected = 0

    while collected < n:
        # One independently sampled random action per sub-environment
        per_env_actions = [env.action_space.sample() for _ in range(env.num_envs)]
        action = np.array(per_env_actions)

        stepped_obs, rewards, dones, infos = env.step(action)
        collected += env.num_envs

        # Work on a copy so that patching terminal observations below does
        # not alter the observation carried over to the next iteration
        stored_next_obs = deepcopy(stepped_obs)

        # When an episode ends, the observation returned by step() is replaced
        # by the "terminal_observation" found in the info dict (if any), so the
        # stored transition ends on the last observation of the finished episode
        for idx, finished in enumerate(dones):
            if not finished:
                continue
            terminal_obs = infos[idx].get("terminal_observation")
            if terminal_obs is not None:
                stored_next_obs[idx] = terminal_obs

        replay_buffer.add(previous_obs, stored_next_obs, action, rewards, dones, infos)

        previous_obs = stepped_obs


# Both buffers are built from the environment's spaces, the same number of
# parallel environments and the same device; only their size differs.
train_buffer = ReplayBuffer(10_000, env.observation_space, env.action_space, device=device, n_envs=n_envs)
test_buffer = ReplayBuffer(1_000, env.observation_space, env.action_space, device=device, n_envs=n_envs)

# Fill each buffer with as many timesteps as the size it was created with.
collect_rollouts(n=10_000, env=env, replay_buffer=train_buffer)
collect_rollouts(n=1_000, env=env, replay_buffer=test_buffer)

Instantiate the model and the optimizer.

In [4]:
from lge.inverse_model import LinearInverseModel

from torch import optim

# Inverse model sized from the first dimension of the env's observation and
# action spaces, with a latent size of 16, moved to the selected device.
inverse_model = LinearInverseModel(
    obs_size=env.observation_space.shape[0],
    action_size=env.action_space.shape[0],
    latent_size=16,
)
inverse_model = inverse_model.to(device)

# Adam over all of the model's parameters.
optimizer = optim.Adam(inverse_model.parameters(), lr=1e-3)

Train the model.

In [5]:
import torch.nn.functional as F


# Each iteration performs a single gradient step on one mini-batch of 128
# transitions; the test loss is reported every 100 iterations.
for epoch in range(3_000):
    # Sample a mini-batch from the training buffer
    sample = train_buffer.sample(128)
    observations = sample.observations.float()
    next_observations = sample.next_observations.float()
    actions = sample.actions

    # Predict the action from the (observation, next observation) pair
    inverse_model.train()
    pred_actions = inverse_model(observations, next_observations)

    # Compute the loss between predicted and true actions
    loss = F.mse_loss(pred_actions, actions)

    # Step the optimizer
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        # Sample a mini-batch from the held-out test buffer
        sample = test_buffer.sample(128)
        observations = sample.observations.float()
        next_observations = sample.next_observations.float()
        actions = sample.actions

        # Evaluation only: disable autograd so no graph is built for a loss
        # that is never back-propagated
        inverse_model.eval()
        with torch.no_grad():
            pred_actions = inverse_model(observations, next_observations)
            test_loss = F.mse_loss(pred_actions, actions)
        print("epoch: {:5d}\tprediction loss: train {:.5f}, test: {:.5f} ".format(epoch, loss.item(), test_loss.item()))

epoch:     0	prediction loss: train 0.36344, test: 0.31484 
epoch:   100	prediction loss: train 0.34127, test: 0.33228 
epoch:   200	prediction loss: train 0.35044, test: 0.31731 
epoch:   300	prediction loss: train 0.36411, test: 0.33084 
epoch:   400	prediction loss: train 0.32048, test: 0.31557 
epoch:   500	prediction loss: train 0.33997, test: 0.31311 
epoch:   600	prediction loss: train 0.32696, test: 0.31758 
epoch:   700	prediction loss: train 0.30179, test: 0.28957 
epoch:   800	prediction loss: train 0.27369, test: 0.24094 
epoch:   900	prediction loss: train 0.22831, test: 0.19520 
epoch:  1000	prediction loss: train 0.25117, test: 0.19578 
epoch:  1100	prediction loss: train 0.24141, test: 0.19008 
epoch:  1200	prediction loss: train 0.21563, test: 0.18296 
epoch:  1300	prediction loss: train 0.21224, test: 0.18053 
epoch:  1400	prediction loss: train 0.23942, test: 0.17828 
epoch:  1500	prediction loss: train 0.20874, test: 0.15739 
epoch:  1600	prediction loss: train 0.20

See the result for one transition.

In [7]:
# Sample one transition from the test buffer
sample = test_buffer.sample(1)
observations = sample.observations.float()
next_observations = sample.next_observations.float()
actions = sample.actions

# Predict the action. This is inference only, so autograd is disabled: the
# prediction is then a plain tensor without an attached grad_fn.
inverse_model.eval()
with torch.no_grad():
    pred_actions = inverse_model(observations, next_observations)

# See the result
print("true action\t\t", actions)
print("predicted action\t", pred_actions)


true action		 tensor([[-0.8390,  0.3584, -0.8453, -0.5738]])
predicted action	 tensor([[-0.6592,  0.2004, -0.5361, -0.0446]], grad_fn=<AddmmBackward0>)
