In [11]:
import tensorflow as tf
import numpy as np
from Tensor import Tensor
from typing import List


class ActorModel(tf.keras.Model):
    def __init__(self, input_dim: int):
        super().__init__(self)

        #self.l1 = tf.keras.layers.Dense(32)
        self.actions = tf.keras.layers.Dense(1)

    def call(self, state: Tensor, training: bool = None, mask: bool = None) -> Tensor:
        #x = self.l1(state)
        actions = self.actions(state)
        return actions

class CriticModel(tf.keras.Model):
    def __init__(self, input_dim: int):
        super().__init__(self)
        self.l1 = tf.keras.layers.Dense(2)
        self.value = tf.keras.layers.Dense(1)

    def call(self, state: Tensor, training: bool = None, mask: bool = None) -> Tensor:
        c = self.l1(state)
        value = self.value(c)
        return value

class ZModel():
    def __init__(self, input_dim: int):
        self.actor = ActorModel(1)
        self.critic = CriticModel(1)
        self.cadam = tf.optimizers.Adam()
        self.aadam = tf.optimizers.Adam()

    def update(self, state: Tensor, reward: Tensor):
        with tf.GradientTape() as actor_tape:
            action = self.actor(state, training=True, mask=None)

        with tf.GradientTape() as c_tape1, tf.GradientTape() as c_tape2:
            action_var = tf.Variable(action)
            ci = tf.concat([action_var, state], axis=1)
            critic_value = self.critic.call(ci)
            critic_loss = tf.reduce_mean(tf.square(critic_value - reward))

        # critic_grads = c_tape1.gradient(critic_loss, self.critic.trainable_variables)
        critic_grads = c_tape1.gradient(critic_loss, self.critic.trainable_variables)
        critic_grad_vars = zip(critic_grads, self.critic.trainable_variables)
        print("critic_grads--------------------------------------------------------------------------------:")
        for g in critic_grads:
            print("g")
            print( g)

        critic_dvda = c_tape2.gradient(critic_value, action_var)
        print("critic_dvda::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::")
        print(critic_dvda)
        print(tf.concat([state, action, reward, critic_dvda], axis=1))
        print("critic_dvda::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::")
        print(critic_dvda.shape)
        for i in range(state.shape[0]):
            print("d:  state: % 7.3f  action: % 7.3f  reward: % 7.3f critic_dvda % 7.3f " %
                  ( state[i,0],
                    action[i,0],
                    reward[i,0],
                    critic_dvda[i,0])
                  )

        actor_grads = actor_tape.gradient(action, self.actor.trainable_variables, -critic_dvda)
        actor_grad_vars = zip(actor_grads, self.actor.trainable_variables)

        self.cadam.apply_gradients(critic_grad_vars)
        self.aadam.apply_gradients(actor_grad_vars)

        return critic_loss

In [12]:
import random
def r(net_out: float, state: float) -> float:
    delta = net_out - state
    probability = 1 / (1 + delta * delta)
    return probability
    # if random.uniform(0, 1) < probability:
    #     return 1
    # else:
    #     return 0

def create_block(model: ZModel):
    block_size = 16
    state = tf.random.normal(shape=(block_size, 1))
    actions= model.actor(state)
    rewards = np.array([r(state[i,0],actions[i, 0] ) for i in range(block_size)]).reshape((block_size,1))
    return state, actions, rewards


class RingSum:
    def __init__(self):
        self.sum = 0
        self.count = 0
        self.max = 10
        self.values = []
        self.index = 0
    def add(self, x):
        if len(self.values) < self.max:
            self.values.append(x)
            self.sum += x
        else:
            xx = self.values[self.index]
            self.values[self.index] = x
            self.sum -= xx
            self.sum += x
            self.index = (self.index + 1) % self.max
        return self.sum

    def mean(self):
        return self.sum / len(self.values)

def train():
    model = ZModel(1)
    rewards = RingSum()
    losses = RingSum()
    for i in range(3):
        state, action, reward = create_block(model)
        loss = model.update(state, reward)
        rewards.add(np.mean(reward))
        losses.add(np.mean(loss))
        if i > 0 and i % 100 == 0:
            print("%4d %.2f %.2f" % (i, rewards.mean(), losses.mean()))


train()

critic_grads--------------------------------------------------------------------------------:
g
tf.Tensor(
[[-0.7785758 -0.8087036]
 [-2.3961384 -2.4888597]], shape=(2, 2), dtype=float32)
g
tf.Tensor([-1.5490271 -1.6089684], shape=(2,), dtype=float32)
g
tf.Tensor(
[[0.28351495]
 [1.3802123 ]], shape=(2, 1), dtype=float32)
g
tf.Tensor([-1.4480264], shape=(1,), dtype=float32)
critic_dvda::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
tf.Tensor(
[[-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]
 [-1.967444]], shape=(16, 1), dtype=float32)
tf.Tensor(
[[-1.0764124  -0.349758    0.65443885 -1.967444  ]
 [ 1.0589856   0.3440955   0.66178364 -1.967444  ]
 [ 0.689916    0.22417396  0.8217497  -1.967444  ]
 [-0.88178575 -0.28651807  0.73836505 -1.967444  ]
 [-0.15901344 -0.05166813  0.98860824 -1.967444  ]
 [ 0.46051