In [1]:
import tensorflow as tf
from pilco.policies.rbf_policy import RBFPolicy
from pilco.agents.agents import EQGPAgent
import matplotlib.pyplot as plt

from tqdm import trange

In [2]:
# Build the RBF policy exercised by the moment-matching checks below, using the
# same keyword spelling as the later policy/agent setup cell.
rbf_policy = RBFPolicy(
    state_dim=2,
    action_dim=1,
    num_rbf_features=5,
    dtype=tf.float32,
)
rbf_policy.reset()

In [3]:
# Analytic moment matching: push a zero-mean, identity-covariance state
# distribution through the policy and assemble the joint (state, action)
# mean and covariance.
loc = tf.zeros(2, dtype=tf.float32)
cov = tf.eye(2, dtype=tf.float32)

mean_u, cov_su, cov_uu = rbf_policy.match_moments(loc, cov)

# Joint covariance, built column-block by column-block:
#   cov1 = [cov; cov_su^T]   (state columns)
#   cov2 = [cov_su; cov_uu]  (action column)
# The indexing below treats cov_su as a length-2 vector and cov_uu / mean_u
# as scalars, which is what the concatenations require to line up.
cov1 = tf.concat([cov, cov_su[None, :]], axis=0)
cov2 = tf.concat([cov_su[:, None], cov_uu[None, None]], axis=0)

cov_full = tf.concat([cov1, cov2], axis=1)

mean_full = tf.concat([loc, mean_u[None]], axis=0)

# cov_full is symmetric by construction (symmetric `cov`, and `cov_su` placed
# in both the off-diagonal row and column), so use the symmetric eigenvalue
# solver: it returns real eigenvalues directly, instead of complex ones whose
# imaginary part would be silently dropped by a cast to float.
eigenvalues = tf.linalg.eigvalsh(cov_full)
print('All eigenvalues are postive:', bool(tf.reduce_all(eigenvalues > 0)))

print('mean_full:')
print(mean_full.numpy())
print('cov_full:')
print(cov_full.numpy())

All eigenvalues are postive: True
mean_full:
[0.         0.         0.42394078]
cov_full:
[[ 1.          0.         -0.23522164]
 [ 0.          1.          0.08149082]
 [-0.23522164  0.08149082  0.23706882]]


In [4]:
# Monte Carlo sampling: draw standard-normal 2-D states and record the action
# the policy returns for each of them.
num_samples = 10 ** 1

states, actions = [], []

for sample_idx in trange(num_samples):
    state_sample = tf.random.normal(shape=(2,), mean=0., stddev=1.)
    action_sample = rbf_policy(state_sample)

    states.append(state_sample)
    actions.append(action_sample)

# Stack the per-sample tensors; `s` and `u` are what the next cell consumes.
s = tf.convert_to_tensor(states)
u = tf.convert_to_tensor(actions)

100%|██████████| 10/10 [00:00<00:00, 585.89it/s]


In [5]:
# Empirical (Monte Carlo) joint moments of the sampled (state, action) pairs.
# NOTE: this rebinds `mean_full` and `cov_full`, replacing the analytic values
# computed by the moment-matching cell above. `mean_full` keeps a leading
# axis of size 1 here.
su_samples = tf.concat([s, u[..., None]], axis=-1)

print('MC mean_full:')
mean_full = tf.reduce_mean(su_samples, axis=0, keepdims=True)
print(mean_full.numpy())

print('MC cov_full:')
# Covariance as E[x x^T] - E[x] E[x]^T (normalised by the sample count, so
# this is the biased estimator).
second_moment = tf.einsum('ij, ik -> jk', su_samples, su_samples) / su_samples.shape[0]
mean_outer = tf.einsum('ij, ik -> jk', mean_full, mean_full) / mean_full.shape[0]
cov_full = second_moment - mean_outer
print(cov_full.numpy())

MC mean_full:
[[-0.6797276   0.03726443  0.91941345]]
MC cov_full:
[[ 0.38322514 -0.28995478 -0.2973714 ]
 [-0.28995478  0.9781152   0.14863373]
 [-0.2973714   0.14863373  0.44626868]]


In [6]:
# Fresh policy plus a GP agent wrapping it; both share the same state/action
# dimensions and dtype.
shared_kwargs = dict(state_dim=2, action_dim=1, dtype=tf.float32)

rbf_policy = RBFPolicy(num_rbf_features=5, **shared_kwargs)
rbf_policy.reset()

agent = EQGPAgent(policy=rbf_policy, **shared_kwargs)

# Feed the agent ten random (state, action, next_state) transitions, each
# component drawn from a standard normal, so it has data to work with.
num_observations = 10
for step in range(num_observations):
    state = tf.random.normal(shape=(2,), mean=0., stddev=1.)
    action = tf.random.normal(shape=(1,), mean=0., stddev=1.)
    next_state = tf.random.normal(shape=(2,), mean=0., stddev=1.)

    agent.observe(state, action, next_state)

In [7]:
# Run the agent's moment matching on the joint (state, action) moments.
# When the cells are run in order, `mean_full` / `cov_full` are the Monte Carlo
# estimates from the sampling cell (they were rebound there), and `mean_full`
# carries a leading axis of size 1.
# NOTE(review): if the analytic moments from the policy were intended here,
# the rebinding above needs distinct names — confirm.
agent.match_moments(mean_full, cov_full)

(2,) (2, 10)


<tf.Tensor: shape=(2,), dtype=float32, numpy=array([1.1391567 , 0.15179396], dtype=float32)>