In [1]:
import torch
import numpy as np
import pickle


from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Product, ConstantKernel as C

import gym_sin
from gym import spaces

from active_learning.recurrent import RL2
from task.GuassianTaskGenerator import GaussianTaskGenerator

In [2]:
def get_const_task_sequence(alpha, n_restarts, num_test_processes, n_tasks=50):
    """Build a test sequence in which every task has identical parameters.

    Args:
        alpha: Squared and passed as ``alpha`` to every
            ``GaussianProcessRegressor`` (presumably an observation-noise
            standard deviation -- TODO confirm against the consumer).
        n_restarts: Forwarded as ``n_restarts_optimizer`` to every regressor.
        num_test_processes: Number of regressors in each inner list and number
            of initial-prior tensors.
        n_tasks: Number of task-kwargs dicts to generate. Defaults to 50, the
            length that used to be hard-coded.

    Returns:
        tuple: ``(gp_list, test_kwargs, init_prior_test)`` where

        * ``gp_list`` is a list of two lists, each holding
          ``num_test_processes`` unfitted regressors built with the same
          ``ConstantKernel * RBF`` kernel object;
        * ``test_kwargs`` is a list of ``n_tasks`` independent dicts, all with
          ``mean=-5`` and ``std=15``;
        * ``init_prior_test`` is a list of ``num_test_processes`` float32
          tensors equal to ``[[-10], [5]]`` (presumably prior mean and prior
          std -- TODO confirm).
    """
    kernel = C(1.0, (1e-5, 1e5)) * RBF(1, (1e-5, 1e5))

    # Two groups of regressors; NOTE(review): presumably one group per prior
    # parameter being modelled -- confirm against the code consuming gp_list.
    gp_list = [[GaussianProcessRegressor(kernel=kernel,
                                         alpha=alpha ** 2,
                                         normalize_y=True,
                                         n_restarts_optimizer=n_restarts)
                for _ in range(num_test_processes)]
               for _ in range(2)]

    init_prior_test = [torch.tensor([[-10], [5]], dtype=torch.float32)
                       for _ in range(num_test_processes)]

    # A fresh dict per task so that mutating one entry cannot affect another.
    test_kwargs = [{'amplitude': 1,
                    'mean': -5,
                    'std': 15,
                    'noise_std': 0.001,
                    'scale_reward': False}
                   for _ in range(n_tasks)]

    return gp_list, test_kwargs, init_prior_test

def get_linear_task_sequence(alpha, n_restarts, num_test_processes, n_tasks=50):
    """Build a test sequence whose task mean decreases by one at every step.

    Task ``idx`` uses ``mean = 30 - idx`` with a fixed ``std`` of 15.

    Args:
        alpha: Squared and passed as ``alpha`` to every
            ``GaussianProcessRegressor`` (presumably an observation-noise
            standard deviation -- TODO confirm against the consumer).
        n_restarts: Forwarded as ``n_restarts_optimizer`` to every regressor.
        num_test_processes: Number of regressors in each inner list and number
            of initial-prior tensors.
        n_tasks: Number of task-kwargs dicts to generate. Defaults to 50, the
            length that used to be hard-coded.

    Returns:
        tuple: ``(gp_list, test_kwargs, init_prior_test)`` where

        * ``gp_list`` is a list of two lists, each holding
          ``num_test_processes`` unfitted regressors built with the same
          ``ConstantKernel * RBF`` kernel object;
        * ``test_kwargs`` is a list of ``n_tasks`` dicts with means
          ``30, 29, ..., 30 - (n_tasks - 1)``;
        * ``init_prior_test`` is a list of ``num_test_processes`` float32
          tensors equal to ``[[30], [5]]`` (presumably prior mean and prior
          std -- TODO confirm).
    """
    kernel = C(1.0, (1e-5, 1e5)) * RBF(1, (1e-5, 1e5))

    # Two groups of regressors; NOTE(review): presumably one group per prior
    # parameter being modelled -- confirm against the code consuming gp_list.
    gp_list = [[GaussianProcessRegressor(kernel=kernel,
                                         alpha=alpha ** 2,
                                         normalize_y=True,
                                         n_restarts_optimizer=n_restarts)
                for _ in range(num_test_processes)]
               for _ in range(2)]

    init_prior_test = [torch.tensor([[30], [5]], dtype=torch.float32)
                       for _ in range(num_test_processes)]

    # The mean starts at the initial prior mean (30) and drifts down linearly.
    test_kwargs = [{'amplitude': 1,
                    'mean': 30 - idx,
                    'std': 15,
                    'noise_std': 0.001,
                    'scale_reward': False}
                   for idx in range(n_tasks)]

    return gp_list, test_kwargs, init_prior_test

def get_phase_task_sequence(alpha, n_restarts, num_test_processes, n_tasks=50):
    """Build a test sequence whose task mean switches between two levels.

    The mean is 0 for tasks 0-14, 10 for tasks 15-29 and 0 again from task 30
    onwards; ``std`` is 15 throughout. The switch points are fixed, so a
    smaller ``n_tasks`` truncates the schedule and a larger one extends the
    final phase.

    Args:
        alpha: Squared and passed as ``alpha`` to every
            ``GaussianProcessRegressor`` (presumably an observation-noise
            standard deviation -- TODO confirm against the consumer).
        n_restarts: Forwarded as ``n_restarts_optimizer`` to every regressor.
        num_test_processes: Number of regressors in each inner list and number
            of initial-prior tensors.
        n_tasks: Number of task-kwargs dicts to generate. Defaults to 50, the
            length that used to be hard-coded.

    Returns:
        tuple: ``(gp_list, test_kwargs, init_prior_test)`` where

        * ``gp_list`` is a list of two lists, each holding
          ``num_test_processes`` unfitted regressors built with the same
          ``ConstantKernel * RBF`` kernel object;
        * ``test_kwargs`` is a list of ``n_tasks`` dicts following the
          schedule above;
        * ``init_prior_test`` is a list of ``num_test_processes`` float32
          tensors equal to ``[[-5], [5]]`` (presumably prior mean and prior
          std -- TODO confirm).
    """
    kernel = C(1.0, (1e-5, 1e5)) * RBF(1, (1e-5, 1e5))

    # Two groups of regressors; NOTE(review): presumably one group per prior
    # parameter being modelled -- confirm against the code consuming gp_list.
    gp_list = [[GaussianProcessRegressor(kernel=kernel,
                                         alpha=alpha ** 2,
                                         normalize_y=True,
                                         n_restarts_optimizer=n_restarts)
                for _ in range(num_test_processes)]
               for _ in range(2)]

    init_prior_test = [torch.tensor([[-5], [5]], dtype=torch.float32)
                       for _ in range(num_test_processes)]

    # Only the middle phase (15 <= idx < 30) differs from the baseline mean.
    test_kwargs = [{'amplitude': 1,
                    'mean': 10 if 15 <= idx < 30 else 0,
                    'std': 15,
                    'noise_std': 0.001,
                    'scale_reward': False}
                   for idx in range(n_tasks)]

    return gp_list, test_kwargs, init_prior_test

def get_abrupt_and_smooth(alpha, n_restarts, num_test_processes, n_tasks=80):
    """Build a test sequence with one abrupt jump followed by a linear ramp.

    Schedule of the task mean (``std`` is 15 throughout):

    * tasks 0-14: constant at -30;
    * task 15: jumps to -20, then increases by one per task up to 14 at
      task 49;
    * tasks 50 onwards: constant at 15.

    The breakpoints are fixed, so a smaller ``n_tasks`` truncates the schedule
    and a larger one extends the final plateau.

    Args:
        alpha: Squared and passed as ``alpha`` to every
            ``GaussianProcessRegressor`` (presumably an observation-noise
            standard deviation -- TODO confirm against the consumer).
        n_restarts: Forwarded as ``n_restarts_optimizer`` to every regressor.
        num_test_processes: Number of regressors in each inner list and number
            of initial-prior tensors.
        n_tasks: Number of task-kwargs dicts to generate. Defaults to 80, the
            length that used to be hard-coded.

    Returns:
        tuple: ``(gp_list, test_kwargs, init_prior_test)`` where

        * ``gp_list`` is a list of two lists, each holding
          ``num_test_processes`` unfitted regressors built with the same
          ``ConstantKernel * RBF`` kernel object;
        * ``test_kwargs`` is a list of ``n_tasks`` dicts following the
          schedule above;
        * ``init_prior_test`` is a list of ``num_test_processes`` float32
          tensors equal to ``[[-30], [5]]`` (presumably prior mean and prior
          std -- TODO confirm).
    """
    kernel = C(1.0, (1e-5, 1e5)) * RBF(1, (1e-5, 1e5))

    # Two groups of regressors; NOTE(review): presumably one group per prior
    # parameter being modelled -- confirm against the code consuming gp_list.
    gp_list = [[GaussianProcessRegressor(kernel=kernel,
                                         alpha=alpha ** 2,
                                         normalize_y=True,
                                         n_restarts_optimizer=n_restarts)
                for _ in range(num_test_processes)]
               for _ in range(2)]

    init_prior_test = [torch.tensor([[-30], [5]], dtype=torch.float32)
                       for _ in range(num_test_processes)]

    def _mean_at(idx):
        # Piecewise schedule for the task mean at position idx.
        if idx < 15:
            return -30
        if idx < 50:
            return -20 + (idx - 15)
        # The ramp formula evaluated at idx == 50, i.e. where the ramp stops.
        return -20 + 50 - 15

    test_kwargs = [{'amplitude': 1,
                    'mean': _mean_at(idx),
                    'std': 15,
                    'noise_std': 0.001,
                    'scale_reward': False}
                   for idx in range(n_tasks)]

    return gp_list, test_kwargs, init_prior_test


In [3]:
# Environment and task-family configuration.
# Gym environment id. NOTE(review): presumably registered as a side effect of
# `import gym_sin` -- confirm.
env_name = "gauss-v0"

# One-dimensional continuous action space with bounds -1 and 1.
action_space = spaces.Box(low=np.array([-1]), high=np.array([1]))
# Not referenced again in the visible cells.
latent_dim = 1

# The values below are passed positionally to GaussianTaskGenerator; their
# meaning is inferred from the names only -- TODO confirm against its signature.
x_min = -100
x_max = 100

min_mean = -40
max_mean = 40

prior_mu_min = -10
prior_mu_max = 10
prior_std_min = 1
prior_std_max = 10

std = 15
amplitude=1

# Reassigned to the same value in the hyperparameter cell that builds the agent.
device = "cpu"

task_generator = GaussianTaskGenerator(x_min, x_max, min_mean, max_mean,
                 prior_mu_min, prior_mu_max, prior_std_min, prior_std_max, std, amplitude)
# `fam` is not used by the visible cells. NOTE(review): create_task_family may
# be called for its effect on task_generator -- confirm before removing.
fam = task_generator.create_task_family(n_tasks=5000, n_batches=1, test_perc=0, batch_size=1)

In [4]:
# Hyperparameters for the RL2 agent. Every value in this cell is passed
# positionally to RL2 below, so the order of the arguments in that call must
# match RL2's signature -- TODO confirm when changing either side.
hidden_size = 32
use_elu = True
# NOTE(review): clip_param / ppo_epoch / num_mini_batch / value_loss_coef /
# entropy_coef look like PPO settings -- confirm inside RL2.
clip_param = 0.2
ppo_epoch = 4 
num_mini_batch = 8
value_loss_coef = 0.5
entropy_coef = 0.
lr = 0.0001
eps = 1e-6
max_grad_norm = 0.5

use_obs_env = False
obs_shape = (2,)
num_processes = 32
gamma = 1
# Same value as the `device` assigned in the environment-configuration cell.
device = "cpu"
num_steps = 150
action_dim = 1
use_gae = False
# NOTE(review): presumably ignored while use_gae is False -- confirm.
gae_lambda = 0.95
use_proper_time_limits = False

# `action_space` comes from the environment-configuration cell.
agent = RL2(hidden_size, use_elu, clip_param, ppo_epoch, num_mini_batch, value_loss_coef,
                 entropy_coef, lr, eps, max_grad_norm, action_space, obs_shape, use_obs_env,
                 num_processes, gamma, device, num_steps, action_dim, use_gae, gae_lambda,
                 use_proper_time_limits)

In [5]:
# Settings for the training run; all of them are forwarded to agent.train.
n_iter = 5000
# Same id as the env_name assigned in the environment-configuration cell.
env_name = 'gauss-v0'
seed = 0
# The captured log reports an evaluation every 20 iterations.
eval_interval = 20
num_test_processes = 1
# The captured log reports "Evaluation using 32 tasks".
num_random_task_to_eval = 32

# Keep only the task-kwargs list (element 1) of the constant sequence; the GP
# list and the initial priors it also returns are discarded here.
# The arguments are alpha=0.25, n_restarts=1, num_test_processes=1; the last one
# is a literal and is not tied to the num_test_processes variable above.
test_kwargs_sequences = [get_const_task_sequence(0.25, 1, 1)[1]]

In [None]:
# Train the agent; the "Iteration ..." / "Evaluation ..." lines below are this
# cell's captured output.
# NOTE(review): eval_list and test_list are not saved or used in the visible
# cells; log_dir="." presumably writes logs to the working directory -- confirm.
eval_list, test_list = agent.train(n_iter, env_name, seed, task_generator,
            eval_interval, num_test_processes, num_random_task_to_eval,
            test_kwargs_sequences=test_kwargs_sequences, log_dir=".", verbose=True)

Iteration 0 / 5000
Evaluation using 32 tasks. Mean reward: 15.670923062499998
Iteration 20 / 5000
Evaluation using 32 tasks. Mean reward: 14.608395156250001
Iteration 40 / 5000
Evaluation using 32 tasks. Mean reward: 15.230333593749998
Iteration 60 / 5000
Evaluation using 32 tasks. Mean reward: 16.773078062499998
Iteration 80 / 5000
Evaluation using 32 tasks. Mean reward: 16.14672053125
Iteration 100 / 5000
Evaluation using 32 tasks. Mean reward: 16.49008359375
Iteration 120 / 5000
Evaluation using 32 tasks. Mean reward: 16.240571125
Iteration 140 / 5000
Evaluation using 32 tasks. Mean reward: 16.36997071875
Iteration 160 / 5000
Evaluation using 32 tasks. Mean reward: 16.21297646875
Iteration 180 / 5000
Evaluation using 32 tasks. Mean reward: 15.97599171875
Iteration 200 / 5000
Evaluation using 32 tasks. Mean reward: 17.0086595
Iteration 220 / 5000
Evaluation using 32 tasks. Mean reward: 17.51644803125
Iteration 240 / 5000
Evaluation using 32 tasks. Mean reward: 18.59178071875
Iteratio

Evaluation using 32 tasks. Mean reward: 48.57933953125
Iteration 2140 / 5000
Evaluation using 32 tasks. Mean reward: 48.77452446875
Iteration 2160 / 5000
Evaluation using 32 tasks. Mean reward: 45.80615190625
Iteration 2180 / 5000
Evaluation using 32 tasks. Mean reward: 45.7687906875
Iteration 2200 / 5000
Evaluation using 32 tasks. Mean reward: 47.30917303125
Iteration 2220 / 5000
Evaluation using 32 tasks. Mean reward: 44.666329968750006
Iteration 2240 / 5000
Evaluation using 32 tasks. Mean reward: 51.202162875
Iteration 2260 / 5000
Evaluation using 32 tasks. Mean reward: 46.778560562500004
Iteration 2280 / 5000
Evaluation using 32 tasks. Mean reward: 37.2735596875
Iteration 2300 / 5000
Evaluation using 32 tasks. Mean reward: 49.693194625
Iteration 2320 / 5000
Evaluation using 32 tasks. Mean reward: 49.9084514375
Iteration 2340 / 5000
Evaluation using 32 tasks. Mean reward: 50.341419218750005
Iteration 2360 / 5000
Evaluation using 32 tasks. Mean reward: 47.493546656250004
Iteration 23