In [1]:
%matplotlib inline

In [1]:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

import numpy as np

from pilco.policies import RBFPolicy, SineBoundedActionPolicy

from pilco.agents.agents import EQGPAgent
from pilco.costs.costs import EQCost

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from tqdm import trange

# Policy: match moments (closed form and MC)

## RBF Policy

In [21]:
rbf_policy = RBFPolicy(2, 1, 5, dtype=tf.float32)
rbf_policy.reset()

In [22]:
loc = tf.zeros(2, dtype=tf.float32)
cov = tf.eye(2, dtype=tf.float32)

mean_full, cov_full = rbf_policy.match_moments(loc, cov)

print('All eigenvalues are postive:', bool(tf.reduce_all(tf.cast(tf.linalg.eig(cov_full)[0], dtype=tf.float32) > 0)))

print(f'mean_full:\n{mean_full.numpy()}')
print(f'cov_full:\n{cov_full.numpy()}')

All eigenvalues are postive: True
mean_full:
[[0.        0.        0.0799581]]
cov_full:
[[ 1.0000000e+00  0.0000000e+00 -2.7423289e-01]
 [ 0.0000000e+00  1.0000000e+00 -3.6088098e-04]
 [-2.7423289e-01 -3.6088098e-04  2.6626879e-01]]


In [23]:
num_samples = 10**3

states = []
actions = []

for i in trange(num_samples):
    
    s = tf.random.normal(mean=0., stddev=1., shape=(2,))
    
    u = rbf_policy(s)
    
    states.append(s)
    actions.append(u)
    
s = tf.convert_to_tensor(states)
u = tf.convert_to_tensor(actions)

100%|██████████| 1000/1000 [00:00<00:00, 1045.95it/s]


In [24]:
su_samples = tf.concat([s, u[..., None]], axis=-1)

print('MC mean_full:')
mean_full = tf.reduce_mean(su_samples, axis=0)[None, ...]
print(mean_full.numpy())

print('MC cov_full:')
cov_full = (tf.einsum('ij, ik -> jk', su_samples, su_samples) / su_samples.shape[0])
cov_full = cov_full - (tf.einsum('ij, ik -> jk', mean_full, mean_full) / mean_full.shape[0])
print(cov_full.numpy())

MC mean_full:
[[0.00547609 0.03609228 0.09434412]]
MC cov_full:
[[ 0.92497575 -0.02206209 -0.27942798]
 [-0.02206209  0.933189    0.00312486]
 [-0.27942798  0.00312486  0.27524218]]


## Sine Bounded RBF Policy

In [25]:
rbf_policy = RBFPolicy(2, 1, 5, dtype=tf.float32)
sb_rbf_policy = SineBoundedActionPolicy(rbf_policy, lower=-2, upper=10)
sb_rbf_policy.reset()

In [26]:
loc = tf.zeros(2, dtype=tf.float32)
cov = tf.eye(2, dtype=tf.float32)

# mean_full_ = tf.convert_to_tensor([[ 0.,        0.,         -0.25994033]], dtype=tf.float32)
# cov_full_ = tf.convert_to_tensor([[1.,         0.,         0.09250697],
#  [0.,         1.,         0.06342697],
#  [0.09250697, 0.06342697, 0.16243385]], dtype=tf.float32)

# joint_dist_ = tfd.MultivariateNormalTriL(loc=mean_full_,
#                                         scale_tril=tf.linalg.cholesky(cov_full_))

mean_full, cov_full = sb_rbf_policy.match_moments(loc, cov)

print('All eigenvalues are postive:', bool(tf.reduce_all(tf.cast(tf.linalg.eig(cov_full)[0], dtype=tf.float32) > 0)))

print(f'mean_full:\n{mean_full.numpy()}')
print(f'cov_full:\n{cov_full.numpy()}')

All eigenvalues are postive: True
mean_full:
[[0.        0.        5.5460596]]
cov_full:
[[ 1.0000000e+00  0.0000000e+00 -6.3147694e-03]
 [ 0.0000000e+00  1.0000000e+00  2.1010876e+00]
 [-6.3147694e-03  2.1010876e+00  1.1517870e+01]]


In [27]:
num_samples = 10**3

states = []
actions = []

for i in trange(num_samples):
    
#     samp = joint_dist_.sample()
#     s = samp[0, :2]
    s = tf.random.normal(mean=0., stddev=1., shape=(2,))
    
    u = sb_rbf_policy(s)
    
    states.append(s)
    actions.append(u)
    
s = tf.convert_to_tensor(states)
u = tf.convert_to_tensor(actions)

100%|██████████| 1000/1000 [00:01<00:00, 837.51it/s]


In [28]:
su_samples = tf.concat([s, u[..., None]], axis=-1)

print('MC mean_full:')
mean_full = tf.reduce_mean(su_samples, axis=0)[None, ...]
print(mean_full.numpy())

print('MC cov_full:')
cov_full = (tf.einsum('ij, ik -> jk', su_samples, su_samples) / su_samples.shape[0])
cov_full = cov_full - (tf.einsum('ij, ik -> jk', mean_full, mean_full) / mean_full.shape[0])
print(cov_full)

MC mean_full:
[[-0.03194174  0.02510313  5.328069  ]]
MC cov_full:
tf.Tensor(
[[ 1.0528259   0.06048473  0.1408468 ]
 [ 0.06048473  0.9832092   2.2496135 ]
 [ 0.1408468   2.2496135  11.543514  ]], shape=(3, 3), dtype=float32)


# Agent: match moments (closed form and MC)

## Add dummy data to agent

In [2]:
tf.random.set_seed(24)

rbf_policy = RBFPolicy(state_dim=2,
                       action_dim=1,
                       num_rbf_features=5,
                       dtype=tf.float64)
# sb_rbf_policy = SineBoundedActionPolicy(rbf_policy,
#                                         lower=-20,
#                                         upper=15)
rbf_policy.reset()
# sb_rbf_policy.reset()

eq_cost = EQCost(target_loc=tf.ones((1, 3)),
                 target_scale=1.,
                 dtype=tf.float64)

agent = EQGPAgent(state_dim=2,
                  action_dim=1,
                  policy=rbf_policy,
                  cost=eq_cost,
                  dtype=tf.float64)

states = []
actions = []
next_states = []

for i in range(50):
    
    state = tf.random.normal(mean=0., stddev=5., shape=(2,), dtype=tf.float64) 
    next_state = tf.random.normal(mean=0., stddev=5., shape=(2,), dtype=tf.float64)
    action = tf.random.normal(mean=0., stddev=5., shape=(1,), dtype=tf.float64)
    
    states.append(state)
    actions.append(action)
    next_states.append(next_state)
    
    agent.observe(state, action, next_state)

## Match moments analytically

In [3]:
state_loc = 3 * tf.ones(2, dtype=tf.float64)
state_cov = 9 * tf.eye(2, dtype=tf.float64)

# Match moments for the joint state-action distribution
mean_full, cov_full = rbf_policy.match_moments(state_loc, state_cov)

joint_dist = tfd.MultivariateNormalTriL(loc=mean_full,
                                        scale_tril=tf.linalg.cholesky(cov_full))

## Match moments by MC

In [4]:
num_samples = 10**3

means = []
covs = []
state_actions = []

# MC approx
for i in trange(num_samples):
    
    state_action = joint_dist.sample()
    
    #Note: mean is the expectation of the deltas!
    mean, cov = agent.gp_posterior_predictive(state_action)
    means.append(mean)
    
    covs.append(cov)
    state_actions.append(state_action)
    
means = tf.concat(means, axis=0)
covs = tf.stack(covs, axis=0)
state_actions = tf.stack(state_actions, axis=0)

100%|██████████| 1000/1000 [00:11<00:00, 86.02it/s]


In [9]:
emp_mean = tf.reduce_mean(means, axis=0) + mean_full[:, :agent.state_dim]
print(f"Emp mean:\n{emp_mean}")

emp_var = tf.reduce_mean(covs, axis=[0, 2, 3])
print(f"Emp diagonal cov components:\n{emp_var}")
emp_cov = tf.reduce_mean(means[:, :, None] * means[:, None, :], axis=0) - emp_mean * tf.transpose(emp_mean)
emp_cov = emp_cov + tf.linalg.diag(emp_var)
print(f"Emp cov without cross cov:\n{emp_cov}")

states = state_actions[:, :, :agent.state_dim]
emp_cross_cov = tf.reduce_mean(states * means[:, :, None], axis=0)
emp_cross_cov = emp_cross_cov - tf.reduce_mean(states, axis=0) * tf.reduce_mean(means[:, :, None], axis=0)
print(f"Emp cross cov:\n{tf.transpose(emp_cross_cov)}")

emp_cov = cov_full[:agent.state_dim, :agent.state_dim] + emp_cov + emp_cross_cov + tf.transpose(emp_cross_cov)
print(f"Emp cross cov:\n{emp_cov}")

Emp mean:
[[2.99909105 2.99902564]]
Emp diagonal cov components:
[0.9999897 0.9999897]
Emp cov without cross cov:
[[-7.99429197 -8.9940664 ]
 [-8.9940664  -7.99386007]]
Emp cross cov:
[[ 0.00115037  0.00123314]
 [-0.00085346 -0.00091487]]
Emp cross cov:
[[ 1.00800877 -8.99368672]
 [-8.99368672  1.00431018]]


In [8]:
m, c = agent.match_moments(mean_full, cov_full)
print(f"Analytic mean:\n{m}")
print(f"Analytic cov:\n{c}")

Cov diag components:
[0.99988921 0.99988921]
cov without cross cov:
 [[1.00276027 0.00304514]
 [0.00304514 1.00317058]]
Cross cov:
[[ 0.00179781  0.00194422]
 [-0.00166921 -0.0018463 ]]
Cov with cov full:
[[1.00027603e+01 3.04514331e-03]
 [3.04514331e-03 1.00031706e+01]]
Analytic mean:
[2.99842198 2.99830533]
Analytic cov:
[[1.00063559e+01 3.32015972e-03]
 [3.32015972e-03 9.99947798e+00]]


In [7]:
tf.reduce_max(tf.abs(emp_cov - c))

<tf.Tensor: shape=(), dtype=float64, numpy=9.000144481671695>

In [26]:
num_samples = 10**3

emp_costs = []

for s in trange(num_samples):
    
    sample = joint_dist.sample()
    
    c = eq_cost(sample)
    
    emp_costs.append(c)
    
emp_costs = tf.stack(emp_costs)

100%|██████████| 1000/1000 [00:03<00:00, 310.57it/s]


In [27]:
emp_cost = tf.reduce_mean(emp_costs)
emp_cost

<tf.Tensor: shape=(), dtype=float64, numpy=0.9314016540191794>

In [29]:
eq_cost.expected_cost(loc=mean_full,
                      cov=cov_full)

<tf.Tensor: shape=(), dtype=float64, numpy=0.9312332897741745>