In [3]:
%matplotlib inline

In [4]:
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

import numpy as np

from pilco.policies import RBFPolicy, SineBoundedActionPolicy

from pilco.agents.agents import EQGPAgent
from pilco.costs.costs import EQCost
from pilco.environments import Environment

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from tqdm import trange

# Policy: match moments (closed form and MC)

## RBF Policy

In [3]:
rbf_policy = RBFPolicy(2, 1, 5, dtype=tf.float32)
rbf_policy.reset()

In [4]:
loc = tf.zeros(2, dtype=tf.float32)
cov = tf.eye(2, dtype=tf.float32)

mean_full, cov_full = rbf_policy.match_moments(loc, cov)

print('All eigenvalues are postive:', bool(tf.reduce_all(tf.cast(tf.linalg.eig(cov_full)[0], dtype=tf.float32) > 0)))

print(f'mean_full:\n{mean_full.numpy()}')
print(f'cov_full:\n{cov_full.numpy()}')

All eigenvalues are postive: True
mean_full:
[[0.       0.       0.539999]]
cov_full:
[[ 1.          0.         -0.18453534]
 [ 0.          1.          0.01663259]
 [-0.18453534  0.01663259  0.10607606]]


In [5]:
num_samples = 10**3

states = []
actions = []

for i in trange(num_samples):
    
    s = tf.random.normal(mean=0., stddev=1., shape=(2,))
    
    u = rbf_policy(s)
    
    states.append(s)
    actions.append(u)
    
s = tf.convert_to_tensor(states)
u = tf.convert_to_tensor(actions)

100%|██████████| 1000/1000 [00:01<00:00, 954.11it/s]


In [6]:
su_samples = tf.concat([s, u[..., None]], axis=-1)

print('MC mean_full:')
mean_full = tf.reduce_mean(su_samples, axis=0)[None, ...]
print(mean_full.numpy())

print('MC cov_full:')
cov_full = (tf.einsum('ij, ik -> jk', su_samples, su_samples) / su_samples.shape[0])
cov_full = cov_full - (tf.einsum('ij, ik -> jk', mean_full, mean_full) / mean_full.shape[0])
print(cov_full.numpy())

MC mean_full:
[[ 0.00160797 -0.00733619  0.5346511 ]]
MC cov_full:
[[ 1.0191172   0.04218094 -0.1819982 ]
 [ 0.04218094  1.038064    0.01108719]
 [-0.1819982   0.01108719  0.10729006]]


## Sine Bounded RBF Policy

In [7]:
rbf_policy = RBFPolicy(2, 1, 5, dtype=tf.float32)
sb_rbf_policy = SineBoundedActionPolicy(rbf_policy, lower=-2, upper=10)
sb_rbf_policy.reset()

In [8]:
loc = tf.zeros(2, dtype=tf.float32)
cov = tf.eye(2, dtype=tf.float32)

# mean_full_ = tf.convert_to_tensor([[ 0.,        0.,         -0.25994033]], dtype=tf.float32)
# cov_full_ = tf.convert_to_tensor([[1.,         0.,         0.09250697],
#  [0.,         1.,         0.06342697],
#  [0.09250697, 0.06342697, 0.16243385]], dtype=tf.float32)

# joint_dist_ = tfd.MultivariateNormalTriL(loc=mean_full_,
#                                         scale_tril=tf.linalg.cholesky(cov_full_))

mean_full, cov_full = sb_rbf_policy.match_moments(loc, cov)

print('All eigenvalues are postive:', bool(tf.reduce_all(tf.cast(tf.linalg.eig(cov_full)[0], dtype=tf.float32) > 0)))

print(f'mean_full:\n{mean_full.numpy()}')
print(f'cov_full:\n{cov_full.numpy()}')

All eigenvalues are postive: True
mean_full:
[[0.        0.        5.9844503]]
cov_full:
[[ 1.          0.         -1.0597148 ]
 [ 0.          1.         -0.45615625]
 [-1.0597148  -0.45615625  3.722326  ]]


In [9]:
num_samples = 10**3

states = []
actions = []

for i in trange(num_samples):
    
#     samp = joint_dist_.sample()
#     s = samp[0, :2]
    s = tf.random.normal(mean=0., stddev=1., shape=(2,))
    
    u = sb_rbf_policy(s)
    
    states.append(s)
    actions.append(u)
    
s = tf.convert_to_tensor(states)
u = tf.convert_to_tensor(actions)

100%|██████████| 1000/1000 [00:01<00:00, 792.39it/s]


In [10]:
su_samples = tf.concat([s, u[..., None]], axis=-1)

print('MC mean_full:')
mean_full = tf.reduce_mean(su_samples, axis=0)[None, ...]
print(mean_full.numpy())

print('MC cov_full:')
cov_full = (tf.einsum('ij, ik -> jk', su_samples, su_samples) / su_samples.shape[0])
cov_full = cov_full - (tf.einsum('ij, ik -> jk', mean_full, mean_full) / mean_full.shape[0])
print(cov_full)

MC mean_full:
[[-0.03008542 -0.01884396  6.001739  ]]
MC cov_full:
tf.Tensor(
[[ 1.0014623   0.05778026 -1.0733577 ]
 [ 0.05778026  1.038319   -0.5239479 ]
 [-1.0733577  -0.5239479   3.3305779 ]], shape=(3, 3), dtype=float32)


# Agent: match moments (closed form and MC)

## Add dummy data to agent

In [8]:
tf.random.set_seed(24)

rbf_policy = RBFPolicy(state_dim=2,
                       action_dim=1,
                       num_rbf_features=5,
                       dtype=tf.float64)

sb_rbf_policy = SineBoundedActionPolicy(rbf_policy,
                                        lower=-20,
                                        upper=15)

# rbf_policy.reset()
sb_rbf_policy.reset()

eq_cost = EQCost(target_loc=tf.ones((1, 3)),
                 target_scale=1.,
                 dtype=tf.float64)

eq_agent = EQGPAgent(state_dim=2,
                     action_dim=1,
                     policy=sb_rbf_policy,
                     cost=eq_cost,
                     dtype=tf.float64)

# Create pendulum environment from Gym
env = Environment(name='Pendulum-v0')
env.reset()

num_episodes = 50
num_steps = 1

for episode in range(num_episodes):
    
    state = env.reset()
    
    state = np.array([np.pi, 8]) * (2 * np.random.uniform(size=(2,)) - 1)
    env.env.env.state = state
    
    
    for step in range(num_steps):
        
        action = tf.random.uniform(shape=()) * 4. - 2
        state, action, next_state = env.step(action[None].numpy())
        
        eq_agent.observe(state, action, next_state)

## Match moments analytically

In [10]:
state_loc = 1. * tf.ones(2, dtype=tf.float64)
state_cov = 10. * tf.eye(2, dtype=tf.float64)

# Match moments for the joint state-action distribution
mean_full, cov_full = sb_rbf_policy.match_moments(state_loc, state_cov)

# mean_full = 0. * tf.ones((1, 3), dtype=tf.float64)
# cov_full = 1. * tf.eye(3, dtype=tf.float64)

joint_dist = tfd.MultivariateNormalTriL(loc=mean_full,
                                        scale_tril=tf.linalg.cholesky(cov_full))

## Match moments by MC

In [11]:
num_samples = 10**3

means = []
covs = []
state_actions = []

# MC approx
for i in trange(num_samples):
    
    state_action = joint_dist.sample()
    
    #Note: mean is the expectation of the deltas!
    mean, cov = eq_agent.gp_posterior_predictive(state_action)
    means.append(mean)
    
    covs.append(cov)
    state_actions.append(state_action)
    
means = tf.concat(means, axis=0)
covs = tf.stack(covs, axis=0)
state_actions = tf.stack(state_actions, axis=0)

100%|██████████| 1000/1000 [00:09<00:00, 105.44it/s]


In [15]:
emp_mean = tf.reduce_mean(means, axis=0)

cov_mean_delta = tf.reduce_mean(means[:, None, :] * means[:, :, None], axis=0)
cov_mean_delta = cov_mean_delta - emp_mean * tf.transpose(emp_mean)
print(f'Cov[E(Δ | x)]:\n{cov_mean_delta}')
mean_cov_delta = tf.linalg.diag(tf.reduce_mean(covs, axis=[0, 1]))
print(f'E[Cov(Δ | x)]:\n{mean_cov_delta}')

states = state_actions[:, :, :eq_agent.state_dim]
emp_cross_cov = tf.reduce_mean(states * means[:, :, None], axis=0)
emp_cross_cov = emp_cross_cov - tf.reduce_mean(states, axis=0) * tf.reduce_mean(means[:, :, None], axis=0)
print(f"Cov[x, Δ]:\n{tf.transpose(emp_cross_cov)}")

emp_mean = tf.reduce_mean(means, axis=0) + mean_full[:, :eq_agent.state_dim]
emp_cov = cov_full[:eq_agent.state_dim, :eq_agent.state_dim] 
emp_cov = emp_cov + cov_mean_delta + mean_cov_delta + emp_cross_cov + tf.transpose(emp_cross_cov)
print(f"Emp mean:\n{emp_mean}")
print(f"Emp cov:\n{emp_cov}")

Cov[E(Δ | x)]:
[[ 3.62445213e-05 -1.09271150e-04]
 [-1.05383661e-04  5.77354493e-04]]
E[Cov(Δ | x)]:
[[0.99857792 0.        ]
 [0.         0.99857792]]
Cov[x, Δ]:
[[-0.00130591  0.0063959 ]
 [ 0.00221726 -0.00497356]]
Emp mean:
[[1.00058466 0.99794347]]
Emp cov:
[[1.09960024e+01 8.50388573e-03]
 [8.50777322e-03 1.09892082e+01]]


In [16]:
m, c = eq_agent.match_moments(mean_full, cov_full)
print(50 * '=')
print(f"Analytic mean:\n{m}")
print(f"Analytic cov:\n{c}")

Cov diag components:
[0.9975902 0.9975902]
cov without cross cov:
 [[ 9.97700928e-01 -2.38692532e-04]
 [-2.38692532e-04  9.98784661e-01]]
Cross cov:
[[-0.00185533  0.00850044]
 [ 0.00395453 -0.00873363]]
Cov with cov full:
[[ 1.09977009e+01 -2.38692532e-04]
 [-2.38692532e-04  1.09987847e+01]]
Analytic mean:
[1.0009076  0.99734086]
Analytic cov:
[[10.99399027  0.01221628]
 [ 0.01221628 10.98131739]]


In [7]:
num_samples = 10**3

emp_costs = []

for s in trange(num_samples):
    
    sample = joint_dist.sample()
    
    c = eq_cost(sample)
    
    emp_costs.append(c)
    
emp_costs = tf.stack(emp_costs)

  0%|          | 0/1000 [00:00<?, ?it/s]


NameError: name 'joint_dist' is not defined

In [None]:
emp_cost = tf.reduce_mean(emp_costs)
emp_cost

In [None]:
eq_cost.expected_cost(loc=mean_full,
                      cov=cov_full)

# Checking accuracy of GP dynamics model

In [8]:
def sample_transitions_uniformly(num_episodes, num_steps, seed):
    
    np.random.seed(seed)
    
    # Create pendulum environment from Gym
    env = Environment(name='Pendulum-v0')
    env.reset()
    
    state_actions = []
    next_states = []

    for episode in range(num_episodes):

        state = env.reset()

        state = np.array([np.pi, 8]) * (2 * np.random.uniform(size=(2,)) - 1)
        env.env.env.state = state


        for step in range(num_steps):

            action = tf.random.uniform(shape=()) * 4. - 2
            state, action, next_state = env.step(action[None].numpy())
            
            state_action = np.concatenate([state, action], axis=0)
            
            state_actions.append(state_action)
            next_states.append(next_state)
            
    state_actions = np.stack(state_actions, axis=0)
    next_states = np.stack(next_states, axis=0)
            
    return state_actions, next_states

In [20]:
def evaluate_agent_dynamics(agent, test_data):
    
    test_inputs, test_outputs = test_data
    
    pred_means, pred_vars = agent.gp_posterior_predictive(test_inputs)
    pred_means = pred_means + test_inputs[:, :2]
    
    sq_diff = tf.math.squared_difference(pred_means,
                                         test_outputs)
    
    max_diff = tf.reduce_max(sq_diff ** 0.5, axis=0)
    min_diff = tf.reduce_min(sq_diff ** 0.5, axis=0)
    
    rmse = tf.reduce_mean(sq_diff, axis=0) ** 0.5
    smse = tf.reduce_mean(sq_diff / pred_vars, axis=0)
    
    rmse = [round(num, 3) for num in rmse.numpy()]
    smse = [round(num, 3) for num in smse.numpy()]
    max_diff = [round(num, 3) for num in max_diff.numpy()]
    min_diff = [round(num, 3) for num in min_diff.numpy()]
    
    print(f'RMSE: {rmse} SMSE {smse} Min {min_diff} Max {max_diff}')

In [60]:
rbf_policy = RBFPolicy(state_dim=2,
                       action_dim=1,
                       num_rbf_features=5,
                       dtype=tf.float64)
rbf_policy.reset()

eq_cost = EQCost(target_loc=tf.ones((1, 3)),
                 target_scale=1.,
                 dtype=tf.float64)

eq_agent = EQGPAgent(state_dim=2,
                     action_dim=1,
                     policy=rbf_policy,
                     cost=eq_cost,
                     dtype=tf.float64)

train_state_actions, train_next_states = sample_transitions_uniformly(100, 1, seed=0)

eq_agent.observe(train_state_actions[:, :2], train_state_actions[:, 2:3], train_next_states)

eq_agent.set_eq_scales_from_data()

<tf.Tensor: shape=(50,), dtype=float64, numpy=
array([ 0.        ,  1.10063546,  1.62490614,  2.01091775,  2.25060616,
        2.49596148,  2.72551987,  2.93264035,  3.08131323,  3.25744446,
        3.40711989,  3.54545977,  3.70617489,  3.86446703,  4.016283  ,
        4.17855919,  4.35790156,  4.5213279 ,  4.63969456,  4.82173785,
        4.97942248,  5.14372672,  5.3145127 ,  5.4690288 ,  5.63773962,
        5.81552562,  5.9987452 ,  6.19443251,  6.38362588,  6.57661843,
        6.74367591,  6.9952209 ,  7.23087437,  7.43436501,  7.69809812,
        7.97421631,  8.26074753,  8.5363077 ,  8.83518304,  9.18606769,
        9.53671423,  9.85136613, 10.19600471, 10.58047959, 11.08966064,
       11.75246638, 12.47036638, 13.27642822, 14.0227504 , 16.40752203])>

In [61]:
test_data = sample_transitions_uniformly(1000, 1, seed=1)

evaluate_agent_dynamics(eq_agent, test_data)

RMSE: [0.009, 0.077] SMSE [0.004, 0.576] Min [0.0, 0.0] Max [0.068, 0.555]
