In [4]:
import torch
import numpy as np
import pickle

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Product, ConstantKernel as C

import gym_sin
from gym import spaces

from utilities.arguments import get_args
from learner.posterior_multi_task import PosteriorMTAgent
from inference.inference_network import InferenceNetwork
from task.ExploreTaskGenerator import ExploreTaskGenerator
from utilities.folder_management import handle_folder_creation


In [5]:
# Environment id; passed to agent.train and to get_vec_envs_multi_task further down.
env_name = "gaussexplore-v0"

# Continuous action space with a single component bounded to [-1, 1].
action_space = spaces.Box(
    low=np.array([-1]),
    high=np.array([1]),
)

# Size of the latent variable (InferenceNetwork z_dim and the agents' latent_dim).
latent_dim = 1

# Torch device string handed to the agents.
device = "cpu"

# Task-family settings forwarded to ExploreTaskGenerator; the x bounds are also
# reused as the agents' min_action / max_action.
x_min, x_max = -100, 100
noise_std = 0.001
std = 10
mean_min, mean_max = 40, 60

# Passed to the agents as vae_min_seq / vae_max_seq; later used as the
# low / high bounds when drawing the number of context steps.
vae_min_seq, vae_max_seq = 1, 20

# Bounds passed to the agents (and to augment_obs_posterior) as min_old / max_old
# together with rescale_obs=True.
min_old = [-100, 0]
max_old = [100, 25]

# Observation shape given to the agents.
obs_shape = (2,)

In [6]:
# Build the task generator from the notebook-level settings defined above.
task_generator = ExploreTaskGenerator(
    x_min=x_min,
    x_max=x_max,
    noise_std=noise_std,
    std=std,
    mean_max=mean_max,
    mean_min=mean_min,
    amplitude=1,
)

# Create the task family. The return value is kept in `f`, but none of the
# cells shown in this notebook read it afterwards.
f = task_generator.create_task_family(
    n_tasks=5000,
    n_batches=1,
    test_perc=0,
    batch_size=1,
)

In [7]:
# Sample with a count of 1 and display the result. The call returns a 4-tuple
# that a later cell unpacks as (envs_kwargs, prev_task, prior_list, new_tasks).
task_generator.sample_pair_tasks(1)

([{'min_x': -100,
   'max_x': 100,
   'noise_std': 0.001,
   'std': 10,
   'mean': 58.48133087158203,
   'amplitude': 1}],
 None,
 [tensor([[50.],
          [20.]])],
 tensor([[58.4813]]))

In [8]:
# Inference network and its Adam optimiser for the first agent; both are handed
# to PosteriorMTAgent as variational_model / vae_optim.
vi = InferenceNetwork(z_dim=latent_dim, n_in=4)
vi_optim = torch.optim.Adam(params=vi.parameters(), lr=0.001)

In [9]:
# First agent: 32 parallel processes, 20 steps per rollout.
agent = PosteriorMTAgent(
    # Environment interface.
    action_space=action_space,
    obs_shape=obs_shape,
    device=device,
    env_dim=0,
    max_action=x_max,
    min_action=x_min,
    # Rollout settings.
    gamma=1,
    num_steps=20,
    num_processes=32,
    use_gae=False,
    gae_lambda=0.95,
    use_proper_time_limits=False,
    # Policy optimisation settings.
    clip_param=0.1,
    ppo_epoch=4,
    num_mini_batch=8,
    value_loss_coef=0.5,
    entropy_coef=0.001,
    lr=5e-5,
    eps=1e-6,
    max_grad_norm=0.5,
    use_linear_lr_decay=False,
    # Policy network settings.
    recurrent_policy=False,
    hidden_size=8,
    use_elu=True,
    # Latent inference model and its optimiser.
    latent_dim=latent_dim,
    variational_model=vi,
    vae_optim=vi_optim,
    vae_min_seq=vae_min_seq,
    vae_max_seq=vae_max_seq,
    max_sigma=30,
    use_decay_kld=True,
    decay_kld_rate=1,
    # Observation rescaling.
    rescale_obs=True,
    max_old=max_old,
    min_old=min_old,
)

In [10]:
# Launch training for the first agent.
# NOTE(review): the recorded run of this cell stopped with
# "TypeError: encode() takes 3 positional arguments but 4 were given" --
# the call site of encode() is not visible here; check it against the
# InferenceNetwork.encode signature before re-running.
res_eval, res_vae, test_list = agent.train(
    training_iter=60000,
    env_name=env_name,
    seed=0,
    task_generator=task_generator,
    eval_interval=100,
    log_dir=".",
    use_env_obs=False,
    init_vae_steps=1,
    sw_size=10,
    num_random_task_to_eval=32,
    num_test_processes=2,
    gp_list_sequences=[],
    prior_sequences=[],
    init_prior_test_sequences=[],
    verbose=True,
)

RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING
RESETTING


TypeError: encode() takes 3 positional arguments but 4 were given

In [12]:
# Fresh inference network and Adam optimiser for the second agent (agent_2).
vi_2 = InferenceNetwork(z_dim=latent_dim, n_in=4)
vi_optim_2 = torch.optim.Adam(params=vi_2.parameters(), lr=5e-4)

In [13]:
# Second agent: a single process, a smaller learning rate, and its own
# inference network / optimiser (vi_2, vi_optim_2). All other settings match
# the first agent.
agent_2 = PosteriorMTAgent(
    # Environment interface.
    action_space=action_space,
    obs_shape=obs_shape,
    device=device,
    env_dim=0,
    max_action=x_max,
    min_action=x_min,
    # Rollout settings.
    gamma=1,
    num_steps=20,
    num_processes=1,
    use_gae=False,
    gae_lambda=0.95,
    use_proper_time_limits=False,
    # Policy optimisation settings.
    clip_param=0.1,
    ppo_epoch=4,
    num_mini_batch=8,
    value_loss_coef=0.5,
    entropy_coef=0.001,
    lr=5e-6,
    eps=1e-6,
    max_grad_norm=0.5,
    use_linear_lr_decay=False,
    # Policy network settings.
    recurrent_policy=False,
    hidden_size=8,
    use_elu=True,
    # Latent inference model and its optimiser.
    latent_dim=latent_dim,
    variational_model=vi_2,
    vae_optim=vi_optim_2,
    vae_min_seq=vae_min_seq,
    vae_max_seq=vae_max_seq,
    max_sigma=30,
    use_decay_kld=True,
    decay_kld_rate=1,
    # Observation rescaling.
    rescale_obs=True,
    max_old=max_old,
    min_old=min_old,
)

from ppo_a2c.envs import get_vec_envs_multi_task
from utilities.observation_utils import augment_obs_posterior 
from ppo_a2c.storage import RolloutStorage

# Sample one task per process and (re)build agent_2's vectorised environments
# from the sampled environment kwargs.
envs_kwargs, prev_task, prior_list, new_tasks = task_generator.sample_pair_tasks(
    agent_2.num_processes
)
agent_2.envs = get_vec_envs_multi_task(
    env_name,
    0,  # presumably the seed -- TODO confirm against get_vec_envs_multi_task
    agent_2.num_processes,
    agent_2.gamma,
    ".",  # presumably the log directory -- TODO confirm
    agent_2.device,
    True,
    envs_kwargs,
    agent_2.envs,  # the agent's current envs attribute is passed back in
    num_frame_stack=None,
)

# Per-process prior buffers. Every row is overwritten in the loop below, so the
# uninitialised memory returned by torch.empty is never read.
prior = torch.empty(agent_2.num_processes, 2 * agent_2.latent_dim)
mu_prior = torch.empty(agent_2.num_processes, agent_2.latent_dim)
logvar_prior = torch.empty(agent_2.num_processes, agent_2.latent_dim)

for t_idx in range(agent_2.num_processes):
    # Work on a detached copy so the sampled prior is never modified.
    task_prior = prior_list[t_idx].clone().detach()
    # Flattened prior: both entries side by side in one row.
    prior[t_idx] = task_prior.reshape(2 * agent_2.latent_dim)
    # First entry is stored as-is, second entry is stored as its logarithm.
    mu_prior[t_idx] = task_prior[0]
    logvar_prior[t_idx] = task_prior[1].log()

# Sample data under the current policy: reset the environments and pass the
# initial observation through augment_obs_posterior together with the prior.
obs = augment_obs_posterior(
    agent_2.envs.reset(),
    agent_2.latent_dim,
    prior,
    False,
    rescale_obs=agent_2.rescale_obs,
    is_prior=True,
    max_old=agent_2.max_old,
    min_old=agent_2.min_old,
)

# Rollout buffer seeded with the augmented initial observation.
rollouts_multi_task = RolloutStorage(
    agent_2.num_steps,
    agent_2.num_processes,
    agent_2.obs_shape,
    agent_2.action_space,
    agent_2.actor_critic.recurrent_hidden_state_size,
)
rollouts_multi_task.obs[0].copy_(obs)
rollouts_multi_task.to(agent_2.device)

# Number of environment steps to roll out. torch.randint excludes `high`, so the
# value lies in [vae_min_seq, vae_max_seq - 1].
num_data_context = torch.randint(low=agent_2.vae_min_seq, high=agent_2.vae_max_seq, size=(1,)).item()
# NOTE(review): `context` is allocated but never filled in this cell -- confirm
# whether it is still needed.
context = torch.empty(agent_2.num_processes, num_data_context, 2)

action_list = []
for step in range(num_data_context):
    with torch.no_grad():
        # NOTE(review): the policy is always queried on the initial entries
        # (index 0) of the rollout storage; the observation returned by
        # envs.step below is discarded. Confirm this is intentional.
        _, action, _, _ = agent_2.actor_critic.act(
            rollouts_multi_task.obs[0], rollouts_multi_task.recurrent_hidden_states[0],
            rollouts_multi_task.masks[0])
    _, reward, _, _ = agent_2.envs.step(action)
    action_list.append(action)

# Stack the per-step action tensors directly instead of round-tripping through
# Python lists (which drops dtype/device), then remove the trailing action axis.
# Assumes each `action` is a torch tensor -- TODO confirm against actor_critic.act.
action_list = torch.stack(action_list).squeeze(2)

# Fraction of sampled actions that are negative, averaged over every element so
# the denominator is right even when squeeze(2) leaves the tensor unchanged.
# Reshaped to a one-element tensor to match the previously displayed output.
(action_list < 0).float().mean().reshape(1)

RESETTING
RESETTING


tensor([0.6364])

In [14]:
# Display the shape of the action tensor built in the previous cell.
action_list.shape

torch.Size([11, 1])

In [19]:
# Reset agent_2's environments; the returned observation is only displayed,
# not stored in a variable.
agent_2.envs.reset()

RESETTING


tensor([[0.4092]])

In [20]:
# Step the environment 20 times with a constant action of 1 and report each
# step at which the first process (agent_2 runs a single one) signals `done`.
n_manual_steps = 20
for s in range(n_manual_steps):
    a = torch.ones(1, 1)
    obs, r, done, _ = agent_2.envs.step(a)
    episode_over = done[0] == True
    if episode_over:
        print("Here {}".format(s))
        print(obs)

RESETTING
Here 9
tensor([[0.1077]])
RESETTING
Here 19
tensor([[0.0859]])


In [83]:
# Display the `done` flag of the first process left over from the stepping loop.
done[0]

True

In [88]:
# Keep only element 2 of the returned tuple -- the entry that is named
# `prior_list` where the same call is unpacked earlier in the notebook.
p = task_generator.sample_pair_tasks(10)[2]

In [92]:
# Display the shape of the first sampled prior tensor.
p[0].shape

torch.Size([2, 1])