In [1]:
import tensorflow as tf

import numpy as np

from pilco.environments import Environment
from pilco.policies import RBFPolicy
from pilco.costs import EQCost
from pilco.agents import EQGPAgent

from tqdm.notebook import trange

In [2]:
dtype = tf.float64

# Create pendulum environment from Gym
env = Environment(name='Pendulum-v0')
env.reset()

# Create stuff for our controller
# Upright position, stationary and zero action position for pendulum is [0, 0, 0]
target_loc = tf.zeros([1, 2])
target_scale = 1.

eq_cost = EQCost(target_loc=target_loc,
                 target_scale=target_scale,
                 dtype=dtype)

# create EQ policy
eq_policy = RBFPolicy(state_dim=2,
                      action_dim=1,
                      num_rbf_features=100,
                      dtype=dtype)

# create agent
eq_agent = EQGPAgent(state_dim=2,
                     action_dim=1,
                     policy=eq_policy,
                     cost=eq_cost,
                     dtype=dtype)

In [3]:
num_episodes = 1
num_steps = 20

eq_agent.policy.reset()

for episode in range(num_episodes):
    
    print(f"Episode {episode + 1}")
    
    state = env.reset()
    
    for step in trange(num_steps):
        
        action = eq_agent.act(state)
        state, action, next_state = env.step(action[None].numpy())
        eq_agent.observe(state, action, next_state)
        
        #env.env.render()
        
        print(f"state, action, next_state: {state}, {action}, {next_state}")

Episode 1


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

state, action, next_state: [-0.92758772  0.36685919], [-12.94512724], [-0.95425134 -0.53327241]
state, action, next_state: [-0.95425134 -0.53327241], [-12.94512724], [-1.0265105  -1.44518322]
state, action, next_state: [-1.0265105  -1.44518322], [-12.92327], [-1.14585081 -2.38680621]
state, action, next_state: [-1.14585081 -2.38680621], [-10.78427358], [-1.31435592 -3.3701021 ]
state, action, next_state: [-1.31435592 -3.3701021 ], [-7.2978862], [-1.53413473 -4.39557632]
state, action, next_state: [-1.53413473 -4.39557632], [-3.78411042], [-1.80638835 -5.44507235]
state, action, next_state: [-1.80638835 -5.44507235], [-1.3566641], [-2.12528106 -6.37785421]
state, action, next_state: [-2.12528106 -6.37785421], [-0.25248388], [-2.47794884 -7.05335567]
state, action, next_state: [-2.47794884 -7.05335567], [0.02141182], [-2.85355571 -7.51213742]
state, action, next_state: [-2.85355571 -7.51213742], [0.04262904], [-3.23949551 -7.718796  ]
state, action, next_state: [-3.23949551 -7.718796  ],

In [4]:
horizon = 10
num_optim_steps = 50
num_episodes = 10

init_state = tf.constant([[-np.pi, 0.]], dtype=tf.float64)
init_cov = tf.eye(2, dtype=tf.float64)

optimiser = tf.optimizers.Adam(1. / horizon)

for episode in range(num_episodes):

    print('Optimising policy')
    
    eq_agent.policy.reset()
    
    for n in trange(num_optim_steps):

        cost = 0.
        loc = init_state
        cov = init_cov

        with tf.GradientTape(watch_accessed_variables=False) as tape:

            tape.watch(eq_agent.policy.parameters)

            for t in range(horizon):

                mean_full, cov_full = eq_agent.policy.match_moments(loc, cov)

                loc, cov = eq_agent.match_moments(mean_full, cov_full)

                cost = cost + eq_agent.cost.expected_cost(loc[None, :], cov)

        gradients = tape.gradient(cost, eq_agent.policy.parameters)

        optimiser.apply_gradients(zip(gradients, eq_agent.policy.parameters))

        # print(cost)
    
    print(cost)
        
    print(f'Performing episode {episode + 1}:')
    
    state = env.reset()
    
    for step in trange(num_steps):
        
        action = eq_agent.act(state)
        state, action, next_state = env.step(action[None].numpy())
        eq_agent.observe(state, action, next_state)

Optimising policy


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/stratis/Documents/sbrml/pilco/venv-pilco/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-6a8fdd73cd68>", line 32, in <module>
    cost = cost + eq_agent.cost.expected_cost(loc[None, :], cov)
  File "/Users/stratis/Documents/sbrml/pilco/pilco/costs/costs.py", line 65, in expected_cost
    cov_plus_target_scale = cov + I * self.target_scale**2
  File "/Users/stratis/Documents/sbrml/pilco/venv-pilco/lib/python3.7/site-packages/tensorflow_core/python/ops/math_ops.py", line 903, in binary_op_wrapper
    elif not isinstance(y, sparse_tensor.SparseTensor):
  File "/Users/stratis/Documents/sbrml/pilco/venv-pilco/bin/../lib/python3.7/abc.py", line 137, in __instancecheck__
    def __instancecheck__(cls, instance):
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call la

KeyboardInterrupt: 

In [None]:
gradients