In [1]:
%matplotlib inline

In [2]:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

import numpy as np

from pilco.policies import RBFPolicy, SineBoundedActionPolicy

from pilco.agents.agents import EQGPAgent
from pilco.costs.costs import EQCost

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from tqdm import trange

# Policy: match moments (closed form and MC)

## RBF Policy

In [2]:
rbf_policy = RBFPolicy(2, 1, 5, dtype=tf.float32)
rbf_policy.reset()

In [3]:
loc = tf.zeros(2, dtype=tf.float32)
cov = tf.eye(2, dtype=tf.float32)

mean_full, cov_full = rbf_policy.match_moments(loc, cov)

print('All eigenvalues are postive:', bool(tf.reduce_all(tf.cast(tf.linalg.eig(cov_full)[0], dtype=tf.float32) > 0)))

print(f'mean_full:\n{mean_full.numpy()}')
print(f'cov_full:\n{cov_full.numpy()}')

All eigenvalues are postive: True
mean_full:
[[ 0.          0.         -0.85934794]]
cov_full:
[[ 1.          0.          0.92527944]
 [ 0.          1.         -0.20175733]
 [ 0.92527944 -0.20175733  1.3220766 ]]


In [4]:
num_samples = 10**3

states = []
actions = []

for i in trange(num_samples):
    
    s = tf.random.normal(mean=0., stddev=1., shape=(2,))
    
    u = rbf_policy(s)
    
    states.append(s)
    actions.append(u)
    
s = tf.convert_to_tensor(states)
u = tf.convert_to_tensor(actions)

100%|██████████| 1000/1000 [00:00<00:00, 1356.74it/s]


In [5]:
su_samples = tf.concat([s, u[..., None]], axis=-1)

print('MC mean_full:')
mean_full = tf.reduce_mean(su_samples, axis=0)[None, ...]
print(mean_full.numpy())

print('MC cov_full:')
cov_full = (tf.einsum('ij, ik -> jk', su_samples, su_samples) / su_samples.shape[0])
cov_full = cov_full - (tf.einsum('ij, ik -> jk', mean_full, mean_full) / mean_full.shape[0])
print(cov_full.numpy())

MC mean_full:
[[ 0.01983453  0.02856064 -0.824412  ]]
MC cov_full:
[[ 1.1124946  -0.0107068   1.0042939 ]
 [-0.0107068   1.0111924  -0.18531339]
 [ 1.0042939  -0.18531339  1.4053669 ]]


## Sine Bounded RBF Policy

In [3]:
rbf_policy = RBFPolicy(2, 1, 5, dtype=tf.float32)
sb_rbf_policy = SineBoundedActionPolicy(rbf_policy, lower=-2, upper=10)
sb_rbf_policy.reset()

In [4]:
loc = tf.zeros(2, dtype=tf.float32)
cov = tf.eye(2, dtype=tf.float32)

# mean_full_ = tf.convert_to_tensor([[ 0.,        0.,         -0.25994033]], dtype=tf.float32)
# cov_full_ = tf.convert_to_tensor([[1.,         0.,         0.09250697],
#  [0.,         1.,         0.06342697],
#  [0.09250697, 0.06342697, 0.16243385]], dtype=tf.float32)

# joint_dist_ = tfd.MultivariateNormalTriL(loc=mean_full_,
#                                         scale_tril=tf.linalg.cholesky(cov_full_))

mean_full, cov_full = sb_rbf_policy.match_moments(loc, cov)

print('All eigenvalues are postive:', bool(tf.reduce_all(tf.cast(tf.linalg.eig(cov_full)[0], dtype=tf.float32) > 0)))

print(f'mean_full:\n{mean_full.numpy()}')
print(f'cov_full:\n{cov_full.numpy()}')

All eigenvalues are postive: True
mean_full:
[[0.        0.        6.6483717]]
cov_full:
[[ 1.          0.         -1.3690133 ]
 [ 0.          1.          0.84770405]
 [-1.3690133   0.84770405  9.849387  ]]


In [5]:
num_samples = 10**3

states = []
actions = []

for i in trange(num_samples):
    
#     samp = joint_dist_.sample()
#     s = samp[0, :2]
    s = tf.random.normal(mean=0., stddev=1., shape=(2,))
    
    u = sb_rbf_policy(s)
    
    states.append(s)
    actions.append(u)
    
s = tf.convert_to_tensor(states)
u = tf.convert_to_tensor(actions)

100%|██████████| 1000/1000 [00:00<00:00, 1135.98it/s]


In [6]:
su_samples = tf.concat([s, u[..., None]], axis=-1)

print('MC mean_full:')
mean_full = tf.reduce_mean(su_samples, axis=0)[None, ...]
print(mean_full.numpy())

print('MC cov_full:')
cov_full = (tf.einsum('ij, ik -> jk', su_samples, su_samples) / su_samples.shape[0])
cov_full = cov_full - (tf.einsum('ij, ik -> jk', mean_full, mean_full) / mean_full.shape[0])
print(cov_full)

MC mean_full:
[[ 0.05841371 -0.0142893   6.3827133 ]]
MC cov_full:
tf.Tensor(
[[ 1.0405152  -0.09474523 -1.6307832 ]
 [-0.09474523  0.94430625  1.0460137 ]
 [-1.6307832   1.0460137   8.115406  ]], shape=(3, 3), dtype=float32)


# Agent: match moments (closed form and MC)

## Add dummy data to agent

In [7]:
tf.random.set_seed(24)

rbf_policy = RBFPolicy(state_dim=2,
                       action_dim=1,
                       num_rbf_features=5,
                       dtype=tf.float64)
rbf_policy.reset()

eq_cost = EQCost(target_loc=tf.ones((1, 3)),
                 target_scale=1.,
                 dtype=tf.float64)

agent = EQGPAgent(state_dim=2,
                  action_dim=1,
                  policy=rbf_policy,
                  cost=eq_cost,
                  dtype=tf.float64)

states = []
actions = []
next_states = []

for i in range(50):
    
    state = tf.random.normal(mean=0., stddev=5., shape=(2,), dtype=tf.float64) 
    next_state = tf.random.normal(mean=0., stddev=5., shape=(2,), dtype=tf.float64)
    action = tf.random.normal(mean=0., stddev=5., shape=(1,), dtype=tf.float64)
    
    states.append(state)
    actions.append(action)
    next_states.append(next_state)
    
    agent.observe(state, action, next_state)

## Match moments analytically

In [8]:
state_loc = tf.zeros(2, dtype=tf.float64)
state_cov = tf.eye(2, dtype=tf.float64)

# Match moments for the joint state-action distribution
mean_full, cov_full = rbf_policy.match_moments(state_loc, state_cov)

joint_dist = tfd.MultivariateNormalTriL(loc=mean_full,
                                        scale_tril=tf.linalg.cholesky(cov_full))

# Stuff we're tying to get to match
analytic_moments = agent.match_moments(mean_full, cov_full)

<class 'pilco.agents.agents.EQGPAgent'> cov before tf.Tensor(
[[1.35712445 0.02847125]
 [0.02847125 1.31747261]], shape=(2, 2), dtype=float64)
<class 'pilco.agents.agents.EQGPAgent'> cross_cov_s tf.Tensor(
[[-0.3693801  -0.30024672]
 [-0.37663962  0.30347296]], shape=(2, 2), dtype=float64)


## Match moments by MC

In [9]:
num_samples = 10**3

means = []
covs = []
state_actions = []

# MC approx
for i in trange(num_samples):
    
    state_action = joint_dist.sample()
    
    mean, cov = agent.gp_posterior_predictive(state_action)
    
    # S x 1 x N
    k = agent.exponentiated_quadratic(state_action, agent.dynamics_inputs)
    
    means.append(mean)
    covs.append(cov)
    state_actions.append(state_action)
    
means = tf.concat(means, axis=0)
covs = tf.stack(covs, axis=0)
state_actions = tf.stack(state_actions, axis=0)

100%|██████████| 1000/1000 [00:09<00:00, 106.61it/s]


In [24]:
emp_mean = tf.reduce_mean(means, axis=0) + mean_full[:, :agent.state_dim]
print(emp_mean)

emp_var = tf.reduce_mean(covs, axis=[0, 2, 3])
emp_cov = tf.reduce_mean(means[:, :, None] * means[:, None, :], axis=0) - emp_mean * tf.transpose(emp_mean)
emp_cov = emp_cov + tf.linalg.diag(emp_var)

states = state_actions[:, :, :agent.state_dim]
emp_cross_cov = tf.reduce_mean(states * means[:, :, None], axis=0)
emp_cross_cov = emp_cross_cov - tf.reduce_mean(states, axis=0) * tf.reduce_mean(means[:, :, None], axis=0)

emp_cov = cov_full[:agent.state_dim, :agent.state_dim] + emp_cov + emp_cross_cov + tf.transpose(emp_cross_cov)
print(emp_cov)

tf.Tensor([[ 0.5365647  -0.03342383]], shape=(1, 2), dtype=float64)
tf.Tensor(
[[ 1.60035328 -0.63295332]
 [-0.63295332  2.84512342]], shape=(2, 2), dtype=float64)


In [25]:
agent.match_moments(mean_full, cov_full)

<class 'pilco.agents.agents.EQGPAgent'> cov before tf.Tensor(
[[1.35712445 0.02847125]
 [0.02847125 1.31747261]], shape=(2, 2), dtype=float64)
<class 'pilco.agents.agents.EQGPAgent'> cross_cov_s tf.Tensor(
[[-0.3693801  -0.30024672]
 [-0.37663962  0.30347296]], shape=(2, 2), dtype=float64)


(<tf.Tensor: shape=(2,), dtype=float64, numpy=array([ 0.55350528, -0.03865606])>,
 <tf.Tensor: shape=(2, 2), dtype=float64, numpy=
 array([[ 1.61836424, -0.64841509],
        [-0.64841509,  2.92441853]])>)

In [26]:
num_samples = 10**3

emp_costs = []

for s in trange(num_samples):
    
    sample = joint_dist.sample()
    
    c = eq_cost(sample)
    
    emp_costs.append(c)
    
emp_costs = tf.stack(emp_costs)

100%|██████████| 1000/1000 [00:03<00:00, 310.57it/s]


In [27]:
emp_cost = tf.reduce_mean(emp_costs)
emp_cost

<tf.Tensor: shape=(), dtype=float64, numpy=0.9314016540191794>

In [29]:
eq_cost.expected_cost(loc=mean_full,
                      cov=cov_full)

<tf.Tensor: shape=(), dtype=float64, numpy=0.9312332897741745>