In [24]:
import tensorflow as tf
import numpy as np
from Tensor import Tensor
from typing import List


class ActorModel(tf.keras.Model):
    """Policy network: one linear ``Dense`` layer producing a single action per state row.

    Args:
        input_dim: accepted for interface compatibility only. It is not used
            to size anything here; the ``Dense`` layer creates its weights the
            first time it is called.
    """

    def __init__(self, input_dim: int):
        # The instance must not be forwarded as a positional argument:
        # ``super().__init__(self)`` hands Keras a stray positional parameter,
        # which is at best ignored and at worst rejected by the base class.
        super().__init__()

        self.actions = tf.keras.layers.Dense(1)

    def call(self, state: Tensor, training: bool = None, mask: bool = None) -> Tensor:
        """Return the action for each row of ``state``.

        ``training`` and ``mask`` are part of the Keras ``call`` signature and
        are not used by this model.
        """
        return self.actions(state)

class CriticModel(tf.keras.Model):
    """Value network: ``Dense(2)`` followed by ``Dense(1)``, returning one value per input row.

    NOTE(review): neither layer is given an activation, so the network is an
    affine function of its input and the gradient of the value with respect to
    any input column is the same for every row. If a state-dependent dV/da is
    wanted, the hidden layer needs a nonlinearity -- confirm intent.

    Args:
        input_dim: accepted for interface compatibility only. It is not used
            to size anything here; the layers create their weights the first
            time they are called.
    """

    def __init__(self, input_dim: int):
        # The instance must not be forwarded as a positional argument:
        # ``super().__init__(self)`` hands Keras a stray positional parameter,
        # which is at best ignored and at worst rejected by the base class.
        super().__init__()
        self.l1 = tf.keras.layers.Dense(2)
        self.value = tf.keras.layers.Dense(1)

    def call(self, state: Tensor, training: bool = None, mask: bool = None) -> Tensor:
        """Return the critic value for each row of ``state``.

        ``training`` and ``mask`` are part of the Keras ``call`` signature and
        are not used by this model.
        """
        hidden = self.l1(state)
        return self.value(hidden)

class ZModel():
    """Actor/critic pair with one optimizer each and a combined update step.

    The actor maps a state to an action. The critic maps the column-wise
    concatenation ``[action, state]`` to a value that is regressed onto the
    observed reward. The actor is then updated along the critic's dV/da.
    """

    def __init__(self, input_dim: int, log_every: int = 100):
        """
        Args:
            input_dim: number of state columns. Forwarded to the sub-models.
            log_every: print diagnostics on every ``log_every``-th call to
                ``update`` (counting from 0). ``0`` or ``None`` disables them.
        """
        self.actor = ActorModel(input_dim)
        # The critic input is the single action column followed by the state.
        self.critic = CriticModel(input_dim + 1)
        self.cadam = tf.optimizers.Adam()
        self.aadam = tf.optimizers.Adam()

        self.log_every = log_every
        self.global_step = 0

    def _print_diagnostics(self, state, action, reward, critic_value, critic_grads, critic_dvda):
        """Dump critic gradients, dV/da and a per-sample table to stdout."""
        print("global_step ", self.global_step)
        print("critic_grads--------------------------------------------------------------------------------:")
        for g in critic_grads:
            print("g")
            print( g)

        print("critic_dvda::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::")
        print(critic_dvda)
        print(tf.concat([state, action, reward, critic_dvda], axis=1))
        print("critic_dvda::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::")
        print(critic_dvda.shape)
        cl = tf.square(critic_value - reward)
        for i in range(state.shape[0]):
            print("d:  state: % 7.3f  action: % 7.3f  reward: % 7.3f c_value % 7.3f critic_dvda % 7.3f cl % 7.3f" %
                  ( state[i,0],
                    action[i,0],
                    reward[i,0],
                    critic_value[i, 0],
                    critic_dvda[i,0],
                    cl[i,0],
                    )
                  )

    def update(self, state: Tensor, reward: Tensor):
        """Run one critic step and one actor step on a batch.

        Args:
            state: batch of states, one row per sample.
            reward: reward per sample, broadcastable against the critic
                output. It is cast to the critic's dtype, so NumPy arrays of a
                different float width are accepted.

        Returns:
            The critic's mean-squared-error loss for this batch, computed
            before the weights are updated.
        """
        with tf.GradientTape() as actor_tape:
            action = self.actor(state, training=True, mask=None)

        # One persistent tape serves both critic gradients. Watching the
        # action tensor yields dV/da without allocating a tf.Variable on
        # every step.
        with tf.GradientTape(persistent=True) as critic_tape:
            critic_tape.watch(action)
            critic_input = tf.concat([action, state], axis=1)
            # Go through __call__ (not .call) so Keras does its usual bookkeeping.
            critic_value = self.critic(critic_input, training=True)
            reward = tf.cast(reward, critic_value.dtype)
            critic_loss = tf.reduce_mean(tf.square(critic_value - reward))

        critic_grads = critic_tape.gradient(critic_loss, self.critic.trainable_variables)
        critic_dvda = critic_tape.gradient(critic_value, action)
        del critic_tape  # release the resources held by the persistent tape

        if self.log_every and self.global_step % self.log_every == 0:
            self._print_diagnostics(state, action, reward, critic_value, critic_grads, critic_dvda)

        # Feeding -dV/da as the upstream gradient makes the optimizer's
        # descent step move the actor's outputs toward higher critic value.
        actor_grads = actor_tape.gradient(
            action, self.actor.trainable_variables, output_gradients=-critic_dvda
        )

        self.cadam.apply_gradients(zip(critic_grads, self.critic.trainable_variables))
        self.aadam.apply_gradients(zip(actor_grads, self.actor.trainable_variables))
        self.global_step += 1

        return critic_loss

In [25]:
import random
def r(net_out: float, state: float) -> float:
    """Reward for an output given a state: ``1 / (1 + (net_out - state)**2)``.

    The value is 1 when the two arguments are equal and decays toward 0 as
    they move apart. It is returned directly as the reward; a variant that
    instead sampled a 0/1 reward with this probability is not used.
    """
    gap = net_out - state
    squared_gap = gap * gap
    return 1 / (1 + squared_gap)

def create_block(model: ZModel, block_size: int = 16):
    """Sample a batch of random states and score the actor's actions on them.

    Args:
        model: provides the ``actor`` used to choose an action per state.
        block_size: number of samples in the batch.

    Returns:
        ``(state, actions, rewards)``. ``state`` is drawn from a standard
        normal with shape ``(block_size, 1)``, ``actions`` is the actor's
        output for it, and ``rewards`` is a NumPy array holding ``r`` applied
        element-wise to the two.
    """
    state = tf.random.normal(shape=(block_size, 1))
    actions = model.actor(state)
    # r() uses only arithmetic operators, so it can score the whole batch in
    # one call instead of indexing the tensors one sample at a time. The
    # reward depends on the squared difference, so argument order is immaterial.
    rewards = np.asarray(r(actions, state)).reshape((block_size, 1))
    return state, actions, rewards


class RingSum:
    """Running sum and mean over the most recent ``max_size`` values added."""

    def __init__(self, max_size: int = 10):
        """
        Args:
            max_size: how many of the latest values are kept; must be >= 1.

        Raises:
            ValueError: if ``max_size`` is smaller than 1.
        """
        if max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.sum = 0
        # Not updated anywhere in this class; retained so existing readers of
        # the attribute keep working.
        self.count = 0
        self.max = max_size
        self.values = []
        # Slot holding the oldest value once the buffer is full.
        self.index = 0

    def add(self, x):
        """Add ``x``, evicting the oldest value if the window is full.

        Returns:
            The sum of the values currently in the window.
        """
        if len(self.values) < self.max:
            self.values.append(x)
            self.sum += x
        else:
            evicted = self.values[self.index]
            self.values[self.index] = x
            self.sum -= evicted
            self.sum += x
            self.index = (self.index + 1) % self.max
        return self.sum

    def mean(self):
        """Return the mean of the values in the window, or ``0.0`` if it is empty."""
        if not self.values:
            return 0.0
        return self.sum / len(self.values)

def train(steps: int = 2000, log_every: int = 100):
    """Train a fresh ``ZModel`` on randomly generated blocks.

    Args:
        steps: number of update steps to run.
        log_every: print the step number and the windowed mean reward and
            critic loss every ``log_every`` steps (step 0 is skipped).
            ``0`` or ``None`` disables these lines.
    """
    model = ZModel(1)
    reward_window = RingSum()
    loss_window = RingSum()
    for step in range(steps):
        state, _action, reward = create_block(model)
        loss = model.update(state, reward)
        # Store plain Python floats so the running sums are accumulated in
        # double precision.
        reward_window.add(float(np.mean(reward)))
        loss_window.add(float(np.mean(loss)))
        if log_every and step > 0 and step % log_every == 0:
            print("%4d %.2f %.2f" % (step, reward_window.mean(), loss_window.mean()))


# Run the training loop. Everything this cell prints comes from train()
# and from the periodic diagnostics inside ZModel.update().
train()

global_step  0
critic_grads--------------------------------------------------------------------------------:
g
tf.Tensor(
[[-0.0568437   0.0948899 ]
 [-0.41267583  0.688885  ]], shape=(2, 2), dtype=float32)
g
tf.Tensor([-0.6791256  1.1336731], shape=(2,), dtype=float32)
g
tf.Tensor(
[[ 0.05869703]
 [-0.44498602]], shape=(2, 1), dtype=float32)
g
tf.Tensor([-1.5800308], shape=(1,), dtype=float32)
critic_dvda::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
tf.Tensor(
[[-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]
 [-0.15976788]], shape=(16, 1), dtype=float32)
tf.Tensor(
[[-0.5054363  -0.06962091  0.840382   -0.15976788]
 [ 0.9212      0.12688994  0.6131477  -0.15976788]
 [ 0.7651532   0.10539541  0.696728   -0.15976788]
 [ 0.8055      0.11095294  0.67458373 -0.15976788]
 [-0.305