In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [3]:
import platform

import numpy as np
import seaborn as sns
import tensorflow as tf

# Report the host name and configure TensorFlow's GPU memory handling.
# platform.node() replaces the IPython-only `!uname -n` shell escape, so the
# cell is plain Python and also works where `uname` is unavailable.
host = platform.node()
print("Running on " + host)
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)
print("Tensorflow version is: %s" % tf.__version__)
print("GPU available for TF:  %s" % tf.test.is_gpu_available())

Running on scylla
2 Physical GPUs, 2 Logical GPUs
Tensorflow version is: 2.0.0-beta0
GPU available for TF:  True


In [4]:
import tensorflow.keras.layers as tkl

In [5]:
# Presumably the batch size of this worked example (the arrays defined next all have a
# leading dimension of 1); not referenced by any later cell shown here.
N_BATCH=1

In [6]:
# One hand-picked sample: state s_i, next state s_i1, action a_i and reward r_i,
# each stored as a float32 row vector with a leading batch dimension of 1.
s_i = np.array([[3., 2., 1.]], dtype=np.float32)
s_i1 = np.array([[4., 3., 2.]], dtype=np.float32)
a_i = np.array([[6., 5.]], dtype=np.float32)
r_i = np.array([[3.]], dtype=np.float32)

In [7]:
# Symbolic placeholders from which the Keras functional models below are built.
# The batch dimension is implicit, so each shape is the per-sample feature size.
inp_s_i = tf.keras.Input(name="s_i", shape=(3,), dtype=tf.float32)
#inp_s_i1 = tf.keras.Input(shape=[3], dtype=tf.float32)
inp_a_i = tf.keras.Input(name="a_i", shape=(2,), dtype=tf.float32)
inp_sa_i = tf.keras.Input(name="sa_i", shape=(5,), dtype=tf.float32)
inp_r_i = tf.keras.Input(shape=(1,), dtype=tf.float32)

### Compute Q-targets with $Q^\prime(s_{i+1}, \mu^\prime(s_{i+1}))$


In [8]:
# Target actor mu': a single Dense layer mapping the 3-d state to a 2-d action,
# loaded with hand-picked weights so the result can be checked by hand.
mu_p_kernel = np.array([[2., -1.], [1., 3.], [4., -3.]])
mu_p_bias = np.array([0., 0.])

_mu_p = tkl.Dense(units=2)(inp_s_i)
mu_p = tf.keras.Model(inputs=inp_s_i, outputs=_mu_p)
mu_p.set_weights([mu_p_kernel, mu_p_bias])

# Sanity check: [3, 2, 1] @ kernel = [12, 0]
mu_p(s_i).numpy() == np.array([12., 0.])

array([[ True,  True]])

In [9]:
# Target critic Q'(s, mu'(s)): the state is concatenated with the target actor's
# output and fed through a single linear unit, so q_p maps a state straight to a Q-value.
_qa = tf.concat([inp_s_i, _mu_p], axis=1)
_q_p = tkl.Dense(units=1)(_qa)
q_p = tf.keras.Model(inputs=inp_s_i, outputs=_q_p)
# q_p contains the mu' layer too, so set_weights takes four arrays:
# actor kernel, actor bias, critic kernel, critic bias.
q_p.set_weights([
    np.array([[2., -1.], [1., 3.], [4., -3.]]),
    np.array([0., 0.]),
    np.array([[1.], [3.], [2.], [1.], [-3.]]),
    np.array([0.])
])
# Q'(s_{i+1}): mu'(s_i1) = [19, -1], so [4, 3, 2, 19, -1] . [1, 3, 2, 1, -3] = 39
q_p(s_i1)

<tf.Tensor: id=100, shape=(1, 1), dtype=float32, numpy=array([[39.]], dtype=float32)>

In [10]:
# Confirm that the reward and Q'(s_{i+1}) both have shape (1, 1) before they are added.
(r_i.shape, q_p(s_i1).shape)

((1, 1), TensorShape([1, 1]))

In [11]:
# Q-target: y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1}))
gamma = 0.1
q_next = q_p(s_i1)
y_i = r_i + gamma * q_next
y_i

<tf.Tensor: id=125, shape=(1, 1), dtype=float32, numpy=array([[6.9]], dtype=float32)>

### Update the primary critic $Q(s,a)$

In [12]:
# Join state and action side by side to form the critic's 5-d input row.
sa_i = np.concatenate((s_i, a_i), axis=1)
sa_i

array([[3., 2., 1., 6., 5.]], dtype=float32)

In [13]:
# Primary critic Q(s, a): one linear unit over the concatenated (state, action) row.
# The Dense layer is kept in QWeights so that it can be applied to other tensors as well.
QWeights = tkl.Dense(units=1)
_qsa = QWeights(inp_sa_i)
q = tf.keras.Model(inputs=[inp_sa_i], outputs=_qsa)

q_kernel = np.array([[1.], [3.], [2.], [1.], [-3.]])
q_bias = np.array([0.])
q.set_weights([q_kernel, q_bias])

# [3, 2, 1, 6, 5] . [1, 3, 2, 1, -3] = 2
q(sa_i)

<tf.Tensor: id=162, shape=(1, 1), dtype=float32, numpy=array([[2.]], dtype=float32)>

In [14]:
# Ingredients of the critic update: plain SGD with step size 0.01 and a squared-error loss.
sgd = tf.keras.optimizers.SGD(learning_rate=0.01)
L = tf.keras.losses.mean_squared_error

In [15]:
# Attach the SGD optimizer and the MSE loss to the critic so it can be trained.
q.compile(optimizer=sgd, loss=L)

In [16]:
# One gradient step on the single sample; the returned loss is (2 - 6.9)^2 = 24.01.
q.train_on_batch(x=sa_i, y=y_i)

24.01

In [17]:
# Inspect the critic's kernel and bias after the update step.
q.trainable_variables

[<tf.Variable 'dense_2/kernel:0' shape=(5, 1) dtype=float32, numpy=
 array([[ 1.294],
        [ 3.196],
        [ 2.098],
        [ 1.588],
        [-2.51 ]], dtype=float32)>,
 <tf.Variable 'dense_2/bias:0' shape=(1,) dtype=float32, numpy=array([0.098], dtype=float32)>]

What we see is the new weight matrix of $Q(s, a | \phi)$ after a single gradient descent step.

$$
\phi_{t+1} = \phi_t - 0.01 \frac{dL}{dQ} \cdot \frac{dQ}{d\phi}
$$

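Here's the proof for the kernel (the bias follows the same rule with $dQ/db = 1$, giving $0 - 0.01 \cdot 2 \cdot (2 - 6.9) = 0.098$):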

In [18]:
# Reproduce the kernel update by hand: phi_{t+1} = phi_t - lr * dL/dQ * dQ/dphi
phi_t = np.array([[1.], [3.], [2.], [1.], [-3.]])      # kernel before the step
dq_dphi = np.array([[3], [2], [1], [6], [5]])          # dQ/dphi equals the input row sa_i
step_scale = 0.01 * 2 * (2 - 6.9)                      # learning rate * dL/dQ, with Q = 2 and y = 6.9
phi_t - step_scale * dq_dphi

array([[ 1.294],
       [ 3.196],
       [ 2.098],
       [ 1.588],
       [-2.51 ]])

### The Primary Actor

In [19]:
# Primary actor mu(s): a single Dense layer from the 3-d state to a 2-d action,
# loaded with hand-picked weights.
mu_kernel = np.array([[2., -1.], [1., 3.], [4., -3.]])
mu_bias = np.array([0., 0.])

_mu = tkl.Dense(units=2)(inp_s_i)
mu = tf.keras.Model(inputs=inp_s_i, outputs=_mu)
mu.set_weights([mu_kernel, mu_bias])

# [3, 2, 1] @ kernel = [12, 0]
mu(s_i)

<tf.Tensor: id=448, shape=(1, 2), dtype=float32, numpy=array([[12.,  0.]], dtype=float32)>

In [20]:
# Chain actor and critic into Q(s, mu(s)): the state is concatenated with the actor
# output and passed through QWeights, the same Dense layer object the critic q uses,
# so qs and q share the critic's kernel and bias.
_qs = tf.concat([inp_s_i, _mu], axis=1)
_q = QWeights(_qs)
qs = tf.keras.Model(inputs = inp_s_i, outputs = _q)
# Evaluate Q(s_i, mu(s_i)) with the already-updated critic weights.
qs(s_i)

<tf.Tensor: id=479, shape=(1, 1), dtype=float32, numpy=array([[31.526]], dtype=float32)>

In [21]:
# qs exposes both the actor variables and the shared critic variables.
qs.trainable_variables

[<tf.Variable 'dense_3/kernel:0' shape=(3, 2) dtype=float32, numpy=
 array([[ 2., -1.],
        [ 1.,  3.],
        [ 4., -3.]], dtype=float32)>,
 <tf.Variable 'dense_3/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>,
 <tf.Variable 'dense_2/kernel:0' shape=(5, 1) dtype=float32, numpy=
 array([[ 1.294],
        [ 3.196],
        [ 2.098],
        [ 1.588],
        [-2.51 ]], dtype=float32)>,
 <tf.Variable 'dense_2/bias:0' shape=(1,) dtype=float32, numpy=array([0.098], dtype=float32)>]

### Update the actor policy using $\nabla_\theta J$

Now, ```qs``` is the *full* Q-function $Q(s, \mu(s | \theta) | \phi)$, the derivative of which with respect to $\theta$ is, by virtue of the chain rule, just our policy gradient:

$$
\nabla_\theta J \approx \frac{1}{N_B}\sum_{i=1}^{N_B}\nabla_a Q(s, a|\phi)|_{s=s_i, a=\mu(s_i)}
\nabla_\theta \mu(s|\theta)|_{s_i}
$$


Now, we compute the gradients and use an SGD optimizer to handle the parameter updates. Note that we intend to maximize the Q-function itself, so the negative Q-function plays the role of a *loss*.

In [43]:
# Record the forward pass so the policy gradient can be taken afterwards.
# The loss is the negated batch mean of Q(s, mu(s)): tape.gradient sums over a
# non-scalar target, so reducing explicitly yields the 1/N_B average of the
# policy-gradient formula (for this single-sample batch the value is unchanged).
with tf.GradientTape() as tape:
    loss = -tf.reduce_mean(qs(s_i))

In [44]:
# theta: the actor's trainable parameters (kernel and bias of mu's Dense layer).
theta = mu.trainable_variables
theta

[<tf.Variable 'dense_4/kernel:0' shape=(3, 2) dtype=float32, numpy=
 array([[ 2.004764, -1.00753 ],
        [ 1.003176,  2.99498 ],
        [ 4.001588, -3.00251 ]], dtype=float32)>,
 <tf.Variable 'dense_4/bias:0' shape=(2,) dtype=float32, numpy=array([ 0.001588, -0.00251 ], dtype=float32)>]

In [45]:
# d(loss)/d(theta): one gradient tensor per actor variable, in the same order as theta.
gradients = tape.gradient(loss, theta)

In [46]:
# Display the gradient of the negated Q-value with respect to the actor kernel and bias.
gradients

[<tf.Tensor: id=916, shape=(3, 2), dtype=float32, numpy=
 array([[-4.764    ,  7.5299997],
        [-3.176    ,  5.02     ],
        [-1.588    ,  2.51     ]], dtype=float32)>,
 <tf.Tensor: id=915, shape=(2,), dtype=float32, numpy=array([-1.588,  2.51 ], dtype=float32)>]

In [47]:
# Fresh SGD optimizer for the actor step (learning rate 0.001). This rebinds the name
# `sgd`, which earlier referred to the optimizer passed to q.compile.
sgd = tf.keras.optimizers.SGD(learning_rate=1e-3)

In [48]:
# Descent step on the loss -Q, i.e. an ascent step on Q; only the variables in theta
# are passed to the optimizer. The trailing semicolon suppresses the cell output.
sgd.apply_gradients(zip(gradients, theta));

In [49]:
# List all variables of qs again to see which ones the actor step changed.
qs.trainable_variables

[<tf.Variable 'dense_4/kernel:0' shape=(3, 2) dtype=float32, numpy=
 array([[ 2.0095282, -1.01506  ],
        [ 1.006352 ,  2.9899602],
        [ 4.0031757, -3.0050201]], dtype=float32)>,
 <tf.Variable 'dense_4/bias:0' shape=(2,) dtype=float32, numpy=array([ 0.003176, -0.00502 ], dtype=float32)>,
 <tf.Variable 'dense_2/kernel:0' shape=(5, 1) dtype=float32, numpy=
 array([[ 1.294],
        [ 3.196],
        [ 2.098],
        [ 1.588],
        [-2.51 ]], dtype=float32)>,
 <tf.Variable 'dense_2/bias:0' shape=(1,) dtype=float32, numpy=array([0.098], dtype=float32)>]

We observe that only $\theta$ has been updated. We now expect $Q(s, \mu(s))$ to have increased a bit; that is to say, we updated the policy such that the objective $J$ moves towards its maximum.

In [50]:
# Re-evaluate Q(s_i, mu(s_i)) with the updated actor.
qs(s_i)

<tf.Tensor: id=971, shape=(1, 1), dtype=float32, numpy=array([[31.790653]], dtype=float32)>

Q.E.D.!

### An Alternative: Avoiding the Explicit Calculus
We can avoid the explicit gradient calculus by faking the loss function such that we can use the ```model.fit(...)``` API. Note that we're not actually fitting anything - we're just maximizing $Q$ by minimizing $-Q$.

In [22]:
@tf.function
def qloss(y_true, y_pred):
    """Pseudo-loss whose minimisation maximises the model output.

    Args:
        y_true: Dummy targets required by the Keras loss signature; ignored.
        y_pred: Output of the compiled model for the current batch, i.e.
            Q(s, mu(s)) when the model is ``qs``.

    Returns:
        ``-y_pred``, so that a gradient-descent step on the loss is an
        ascent step on Q.
    """
    # Use the prediction Keras passes in rather than re-evaluating qs on the
    # global s_i: the loss then follows whatever batch is being trained on.
    return -y_pred

In [23]:
# Freeze the shared critic layer before compiling, so that training qs only moves the actor weights.
sgd = tf.keras.optimizers.SGD(learning_rate=0.001)
QWeights.trainable = False
qs.compile(optimizer=sgd, loss=qloss)

In [25]:
# One training step on s_i. The zero targets are placeholders: qloss never reads y_true.
qs.train_on_batch(s_i, np.zeros_like(s_i))

-31.658327

Here we can see that ```model.train_on_batch(...)``` does exactly the same thing that the gradient step did in the previous section. 

In [26]:
# Unfreeze the critic layer again, then list all variables of qs (actor and critic).
QWeights.trainable = True
qs.trainable_variables

[<tf.Variable 'dense_3/kernel:0' shape=(3, 2) dtype=float32, numpy=
 array([[ 2.0095282, -1.01506  ],
        [ 1.006352 ,  2.9899602],
        [ 4.0031757, -3.0050201]], dtype=float32)>,
 <tf.Variable 'dense_3/bias:0' shape=(2,) dtype=float32, numpy=array([ 0.003176, -0.00502 ], dtype=float32)>,
 <tf.Variable 'dense_2/kernel:0' shape=(5, 1) dtype=float32, numpy=
 array([[ 1.294],
        [ 3.196],
        [ 2.098],
        [ 1.588],
        [-2.51 ]], dtype=float32)>,
 <tf.Variable 'dense_2/bias:0' shape=(1,) dtype=float32, numpy=array([0.098], dtype=float32)>]

### Updating the Target Network

In [27]:
# Interpolation factor of the target update: new_target = (1 - tau) * target + tau * primary.
tau = 1e-1

In [28]:
# Pair each primary-network variable with the target-network variable at the same position.
# Materialise the pairs in a list: a bare zip is a one-shot iterator, so re-running the
# cell that consumes it would silently produce no pairs at all.
pairs = list(zip(qs.trainable_weights, q_p.trainable_weights))

In [29]:
# Move every target variable a fraction tau towards its primary counterpart.
new_weights = []
for primary, target in pairs:
    new_weights.append((1-tau) * target + tau * primary)

In [30]:
# Display the interpolated weights that will be written into the target network.
new_weights

[<tf.Tensor: id=735, shape=(3, 2), dtype=float32, numpy=
 array([[ 2.0009527, -1.001506 ],
        [ 1.0006351,  2.9989958],
        [ 4.0003176, -3.0005019]], dtype=float32)>,
 <tf.Tensor: id=742, shape=(2,), dtype=float32, numpy=array([ 0.0003176, -0.000502 ], dtype=float32)>,
 <tf.Tensor: id=749, shape=(5, 1), dtype=float32, numpy=
 array([[ 1.0294   ],
        [ 3.0196   ],
        [ 2.0098   ],
        [ 1.0588   ],
        [-2.9509997]], dtype=float32)>,
 <tf.Tensor: id=756, shape=(1,), dtype=float32, numpy=array([0.0098], dtype=float32)>]

In [31]:
# Target-network variables before new_weights is written back: still the initial values.
q_p.trainable_weights

[<tf.Variable 'dense/kernel:0' shape=(3, 2) dtype=float32, numpy=
 array([[ 2., -1.],
        [ 1.,  3.],
        [ 4., -3.]], dtype=float32)>,
 <tf.Variable 'dense/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>,
 <tf.Variable 'dense_1/kernel:0' shape=(5, 1) dtype=float32, numpy=
 array([[ 1.],
        [ 3.],
        [ 2.],
        [ 1.],
        [-3.]], dtype=float32)>,
 <tf.Variable 'dense_1/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]

In [32]:
# Write the interpolated weights into the target network.
q_p.set_weights(new_weights)

In [33]:
# Re-evaluate Q'(s_{i+1}) with the updated target weights.
q_p(s_i1)

<tf.Tensor: id=786, shape=(1, 1), dtype=float32, numpy=array([[40.31217]], dtype=float32)>