In [1]:
# Select matplotlib's interactive "notebook" backend for the figures created below.
%matplotlib notebook

In [2]:
import tensorflow as tf

import numpy as np

from pilco.environments import Environment
from pilco.policies import RBFPolicy, SineBoundedActionPolicy
from pilco.costs import EQCost
from pilco.agents import EQGPAgent

import not_tf_opt as ntfo

from tqdm.notebook import trange

import matplotlib.pyplot as plt

In [3]:
# Floating-point precision shared by the cost, the policy and the agent.
dtype = tf.float64

# Create pendulum environment from Gym
env = Environment(name='Pendulum-v0')
env.reset()

# Cost target: the zero vector over the 2-dimensional state (state_dim=2 below).
# NOTE(review): presumably the state is (angle, angular velocity), so zero means
# "upright and stationary" -- confirm against the Environment wrapper.
# The tensor is created with the shared dtype so it is not silently float32
# while everything else is float64.
target_loc = tf.zeros([1, 2], dtype=dtype)
target_scale = 1.

eq_cost = EQCost(target_loc=target_loc,
                 target_scale=target_scale,
                 dtype=dtype)

# Unbounded RBF policy mapping the 2-d state to a 1-d action.
rbf_policy = RBFPolicy(state_dim=2,
                       action_dim=1,
                       num_rbf_features=50,
                       dtype=dtype)

# Wrap the RBF policy so that its actions are bounded to [-2, 2].
eq_policy = SineBoundedActionPolicy(rbf_policy,
                                    lower=-2,
                                    upper=2)

# Agent combining the bounded policy and the cost; its dynamics model is fed
# through eq_agent.observe(...) in the cells below.
eq_agent = EQGPAgent(state_dim=2,
                     action_dim=1,
                     policy=eq_policy,
                     cost=eq_cost,
                     dtype=dtype)

In [4]:
# Seed the agent's dataset with transitions generated by uniformly random actions.
num_episodes = 50
num_steps = 1

eq_agent.policy.reset()

# Half-widths of the box from which each episode's start state is drawn.
# NOTE(review): presumably (angle, angular velocity) limits -- confirm.
start_half_widths = np.array([np.pi, 8])

for episode in trange(num_episodes):

    state = env.reset()

    # Replace the reset state by a draw from [-pi, pi) x [-8, 8) and write it
    # into the nested environment object so that the next step starts there.
    unit_draw = 2 * np.random.uniform(size=(2,)) - 1
    state = start_half_widths * unit_draw
    env.env.env.state = state

    for step in range(num_steps):

        # Scalar action drawn uniformly from [-2, 2).
        uniform_sample = tf.random.uniform(shape=())
        action = uniform_sample * 4. - 2

        # env.step takes the action as a length-1 numpy array.
        transition = env.step(action[None].numpy())
        state, action, next_state = transition

        eq_agent.observe(state, action, next_state)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [None]:
# Alternate between (1) optimising the policy against the agent's moment-matched
# predictions and (2) running the optimised policy in the environment to collect
# more transitions for the agent.
horizon = 30                # number of moment-matching steps per cost evaluation
num_optim_steps = 20        # maximum Adam steps per episode
num_episodes = 5
num_experience_steps = 20   # environment steps collected per episode

# Stop optimising once the cost changes by less than this between two steps.
tolerance = 1e-5

# Initial state distribution used for every cost evaluation.
init_state = tf.constant([[-np.pi, 0.]], dtype=tf.float64)
init_cov = tf.eye(2, dtype=tf.float64)

policy_learning_rate = 1e-2

for episode in range(num_episodes):

    print('Optimising policy')

    # Kernel length scales are set from the data collected so far; no
    # gradient-based optimisation of the GP hyperparameters is run here.
    eq_agent.set_eq_scales_from_data()

    eq_agent.policy.reset()

    # The policy has just been reset, so start from a fresh optimiser as well:
    # Adam's running moment estimates from the previous episode's parameters
    # would otherwise bias the first updates of the new parameters.
    optimiser = tf.optimizers.Adam(policy_learning_rate)

    prev_loss = np.inf
    for n in trange(num_optim_steps):

        cost = 0.
        loc = init_state
        cov = init_cov

        # Only the policy parameters are differentiated.
        with tf.GradientTape(watch_accessed_variables=False) as tape:

            tape.watch(eq_agent.policy.parameters)

            for t in range(horizon):

                # Moments passed through the policy, then through the agent,
                # yielding the next (loc, cov) pair.
                mean_full, cov_full = eq_agent.policy.match_moments(loc, cov)

                loc, cov = eq_agent.match_moments(mean_full, cov_full)

                # Accumulate the expected cost over the horizon.
                cost = cost + eq_agent.cost.expected_cost(loc[None, :], cov)

        gradients = tape.gradient(cost, eq_agent.policy.parameters)

        optimiser.apply_gradients(zip(gradients, eq_agent.policy.parameters))

        if tf.abs(cost - prev_loss) < tolerance:
            print("Early convergence!")
            break

        prev_loss = cost

        print(cost)

    print(cost)

    print(f'Performing episode {episode + 1}:')

    state = env.reset()

    for step in trange(num_experience_steps):

        action = eq_agent.act(state)

        # NOTE(review): env.step is assumed to return (state before the step,
        # action, state after the step) -- confirm against Environment.step.
        prev_state, action, next_state = env.step(action[None].numpy())
        eq_agent.observe(prev_state, action, next_state)

        # Advance to the post-step state so that the next action is chosen from
        # the current state rather than from the one the previous action was
        # already based on.
        state = next_state

Optimising policy


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

tf.Tensor(29.131525225518242, shape=(), dtype=float64)
tf.Tensor(29.118752046138884, shape=(), dtype=float64)
tf.Tensor(29.10602143914646, shape=(), dtype=float64)
tf.Tensor(29.09365455038195, shape=(), dtype=float64)
tf.Tensor(29.08202243224251, shape=(), dtype=float64)
tf.Tensor(29.071515168019896, shape=(), dtype=float64)
tf.Tensor(29.062400793209118, shape=(), dtype=float64)
tf.Tensor(29.05465529532259, shape=(), dtype=float64)
tf.Tensor(29.047948915784172, shape=(), dtype=float64)
tf.Tensor(29.041768512035983, shape=(), dtype=float64)
tf.Tensor(29.035589175304796, shape=(), dtype=float64)
tf.Tensor(29.029023638894742, shape=(), dtype=float64)
tf.Tensor(29.021881890317427, shape=(), dtype=float64)
tf.Tensor(29.014176061538862, shape=(), dtype=float64)
tf.Tensor(29.00610361818694, shape=(), dtype=float64)
tf.Tensor(28.99800255428612, shape=(), dtype=float64)
tf.Tensor(28.990266509120048, shape=(), dtype=float64)
tf.Tensor(28.983222818510974, shape=(), dtype=float64)
tf.Tensor(28.977

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Optimising policy


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

tf.Tensor(29.44031796333766, shape=(), dtype=float64)
tf.Tensor(29.437975631651206, shape=(), dtype=float64)
tf.Tensor(29.434946148391, shape=(), dtype=float64)
tf.Tensor(29.43133650723909, shape=(), dtype=float64)
tf.Tensor(29.427213695302576, shape=(), dtype=float64)
tf.Tensor(29.42263481743317, shape=(), dtype=float64)
tf.Tensor(29.417656870573847, shape=(), dtype=float64)
tf.Tensor(29.41234013637726, shape=(), dtype=float64)
tf.Tensor(29.40674860303413, shape=(), dtype=float64)
tf.Tensor(29.400948663130478, shape=(), dtype=float64)


In [None]:
# Pull the agent's dynamics dataset out for plotting.
# x is taken as stored on the agent; y is read from a variable and converted
# to a numpy array.
x = eq_agent.dynamics_inputs

outputs_variable = eq_agent.dynamics_outputs
y = outputs_variable.value().numpy()

In [None]:
# Axes3D is not referenced by name below.
# NOTE(review): presumably imported only so that the '3d' projection is
# registered on older matplotlib releases -- confirm before removing.
from mpl_toolkits.mplot3d import Axes3D

# 3-D scatter plot of the first three columns of the dynamics inputs.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')

input_columns = [x[:, column] for column in range(3)]
ax.scatter(*input_columns)

plt.show()

In [None]:
# Scatter plot of the first output column against the second, on a new figure.
outputs_fig, outputs_ax = plt.subplots()
outputs_ax.scatter(y[:, 0], y[:, 1])

In [None]:
# Scatter plot of the first input column against the first output column,
# on a new figure.
in_out_fig, in_out_ax = plt.subplots()
in_out_ax.scatter(x[:, 0], y[:, 0])