In [1]:
import numpy as np
import tensorflow as tf
from node.base import get_node_function
from node.fix_grid import RKSolver


# For reproducibility: seed both the NumPy and the TensorFlow global random
# number generators before anything below is created.
np.random.seed(42)
tf.random.set_seed(42)


@tf.function
def dsigmoid(x):
    """Return the element-wise derivative of the logistic sigmoid at `x`.

    Uses the identity sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x)).
    """
    s = tf.nn.sigmoid(x)
    return s * (1 - s)


@tf.function
def inv_sigmoid(x):
    """Return the element-wise logit of `x`, i.e. log(x) - log(1 - x).

    A small constant (1e-8) is added inside both logarithms so that neither
    of them is evaluated at exactly zero when `x` is 0 or 1.
    """
    eps = 1e-8
    log_x = tf.math.log(x + eps)
    log_one_minus_x = tf.math.log(1 - x + eps)
    return log_x - log_one_minus_x


@tf.function
def softmax(x, axis):
    """Return `tf.nn.log_softmax` of `x` taken along `axis`.

    NOTE(review): despite the name, the value returned is the *logarithm* of
    the softmax (same shape as `x`); it is neither the softmax probabilities
    nor a smooth maximum (log-sum-exp). Confirm that this is intended.
    """
    log_probabilities = tf.nn.log_softmax(x, axis)
    return log_probabilities


@tf.function
def softmin(x, axis):
    """Return the negated `tf.nn.log_softmax` of `-x` taken along `axis`.

    NOTE(review): the result has the same shape as `x`; it is not a smooth
    minimum (that would be `-logsumexp(-x)`). Confirm that this is intended.
    """
    negated_log_probabilities = tf.nn.log_softmax(-x, axis)
    return -negated_log_probabilities


@tf.function
def softrescale(x, axis):
    """Smoothly rescale `x` into the unit interval along `axis`.

    This is the differentiable analogue of min-max scaling: the hard maximum
    and minimum are replaced by the log-sum-exp soft bounds

        soft_max = logsumexp(x)    (>= max(x))
        soft_min = -logsumexp(-x)  (<= min(x))

    so every output lies in [0, 1].

    The previous implementation took its bounds from log-softmax helpers. In
    that form the `x` terms cancel, and the result no longer depended on the
    individual entries of `x` along `axis`.

    Args:
        x: floating point tensor.
        axis: axis (or axes) along which the soft bounds are computed.

    Returns:
        Tensor of the same shape as `x`.
    """
    soft_max = tf.reduce_logsumexp(x, axis, keepdims=True)
    soft_min = -tf.reduce_logsumexp(-x, axis, keepdims=True)
    # The width is zero only for a length-one axis; return 0 there, not NaN.
    return tf.math.divide_no_nan(x - soft_min, soft_max - soft_min)


@tf.function
def rescale(x, axis):
    """Min-max rescale `x` into [0, 1] along `axis`.

    Args:
        x: floating point tensor.
        axis: axis (or axes) over which the minimum and maximum are taken.

    Returns:
        Tensor of the same shape as `x`, equal to
        (x - min) / (max - min) along `axis`. Slices whose entries are all
        equal (max == min) are mapped to 0 rather than to NaN.
    """
    lower = tf.reduce_min(x, axis, keepdims=True)
    upper = tf.reduce_max(x, axis, keepdims=True)
    # `divide_no_nan` yields 0 where the denominator is exactly 0 and is
    # identical to ordinary division everywhere else.
    return tf.math.divide_no_nan(x - lower, upper - lower)


@tf.function
def get_accuracy(y_true, y_pred):
    """Return the fraction of examples whose predicted class is correct.

    Args:
        y_true: tensor whose last axis holds the reference class scores
            (e.g. one-hot labels).
        y_pred: tensor of the same shape holding the predicted class scores.

    Returns:
        Scalar float32 tensor in [0, 1].
    """
    true_classes = tf.argmax(y_true, axis=-1)
    predicted_classes = tf.argmax(y_pred, axis=-1)
    is_correct = tf.equal(true_classes, predicted_classes)
    # Cast to float *before* averaging: `tf.reduce_mean` on an integer tensor
    # performs integer division, which would truncate the accuracy to 0 or 1.
    return tf.reduce_mean(tf.cast(is_correct, tf.float32))


# Size of one flattened 28 x 28 input vector.
input_dim = 28 * 28
# Two dense layers mapping an `input_dim` vector back to an `input_dim`
# vector; `MyLayer` below evaluates this network inside its vector field.
network = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    # tanh output for bounding the scale of phase vector field
    tf.keras.layers.Dense(input_dim, activation='tanh'),
])
# Create the weights now, for inputs of shape `[batch, input_dim]`.
network.build([None, input_dim])


class MyLayer(tf.keras.layers.Layer):
    """Keras layer that applies the function built by `get_node_function`.

    The layer defines a vector field `pvf(t, x)` from `network` and hands it,
    together with `RKSolver(dt)` and the start time `0.`, to
    `get_node_function`. `call` evaluates the resulting function at the end
    time `tN = num_grids * dt`.

    NOTE(review): presumably the returned function integrates
    dx/dt = pvf(t, x) from 0 to `tN` with a fixed-step Runge-Kutta solver;
    confirm against `node.base` / `node.fix_grid`.
    """

    def __init__(self, network, dt, num_grids, **kwargs):
        """Store the configuration and build the vector field and node function.

        Args:
            network: callable (here a Keras model) applied to the state `x`.
            dt: step size passed to `RKSolver`.
            num_grids: number of steps; the end time is `num_grids * dt`.
            **kwargs: forwarded to `tf.keras.layers.Layer.__init__`.
        """
        super().__init__(**kwargs)
        self.network = network
        self.dt = dt
        self.num_grids = num_grids

        t0 = tf.constant(0.)
        self.tN = t0 + num_grids * dt

        # Disabled alternative vector field, kept for reference:
#         def pvf(t, x):
#             r"""
#             $x^{\prime} = \sigma\left(\sigma^{-1}(x) + \Delta t f(t, x; \theta) \right)$,
#             element-wisely.
#             """
#             return dsigmoid(inv_sigmoid(x)) * self.network(x)

        def pvf(t, x):
            """Vector field: Jacobian of `rescale` at `x` contracted with `network(x)`.

            `g.gradient(r, [x], [f])` passes `f` as the output gradients, so
            the result is the vector-Jacobian product of `f = network(x)` with
            d rescale(x) / dx. The time argument `t` is not used.
            """
            with tf.GradientTape() as g:
                g.watch(x)
                f = self.network(x)
                r = rescale(x, axis=-1)
            return g.gradient(r, [x], [f])[0]

        self._pvf = pvf
        self._node_fn = get_node_function(RKSolver(self.dt), 0., pvf)

    def call(self, x):
        """Evaluate the node function at the end time `self.tN` for input `x`."""
        y = self._node_fn(self.tN, x)
        return y


def process(X, y):
    """Turn raw images and integer labels into float32 model inputs.

    Args:
        X: image array; it is divided by 255 (presumably 8-bit pixel
            intensities) and flattened to rows of length 28 * 28.
        y: integer class labels; one-hot encoded with depth 10.

    Returns:
        Tuple `(X, y)` of float32 tensors.
    """
    flat_pixels = tf.reshape(X / 255., [-1, 28 * 28])
    one_hot_labels = tf.one_hot(y, 10)
    return (tf.cast(flat_pixels, tf.float32),
            tf.cast(one_hot_labels, tf.float32))


# Load MNIST and convert both splits to flattened float32 images and one-hot
# float32 labels (see `process`).
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, y_train = process(x_train, y_train)
x_test, y_test = process(x_test, y_test)

# Model: the `MyLayer` defined above followed by a 10-way softmax classifier.
my_layer = MyLayer(network, dt=1e-2, num_grids=10)
# NOTE(review): the L2 kernel regularizer only contributes through
# `model.losses`; `train_one_step` minimises `loss_fn(y, outputs)` alone, so
# as written the regularizer has no effect on training -- confirm intent.
output_layer = tf.keras.layers.Dense(
    10, activation='softmax',
    kernel_regularizer=tf.keras.regularizers.l2(1.))
model = tf.keras.Sequential([my_layer, output_layer])
model.build([None, 28 * 28])

# Optimizer (default hyper-parameters) and loss for one-hot targets.
optimizer = tf.compat.v1.train.AdamOptimizer()
loss_fn = tf.losses.CategoricalCrossentropy()


@tf.function
def train_one_step(x, y):
    """Run one optimisation step on the batch `(x, y)`.

    Computes `loss_fn(y, model(x))`, applies its gradients with respect to the
    model's trainable variables through `optimizer`, and reports the loss and
    the accuracy measured on the same forward pass.

    NOTE(review): only `loss_fn` is minimised here; `model.losses` (e.g. a
    kernel regularizer) is not added to the objective -- confirm intent.

    Returns:
        Tuple `(loss, accuracy)` for this batch.
    """
    with tf.GradientTape() as tape:
        predictions = model(x)
        batch_loss = loss_fn(y, predictions)
    variables = model.trainable_variables
    gradients = tape.gradient(batch_loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss, get_accuracy(y, predictions)


@tf.function
def train(dataset, log_every=1):
    """Train on every batch of `dataset`, printing progress as it goes.

    Args:
        dataset: iterable of `(x, y)` batches (e.g. a `tf.data.Dataset`).
        log_every: print step, loss and accuracy every `log_every` steps.

    Returns:
        Tuple `(loss, accuracy)` of the last processed batch; `(inf, 0.)` if
        `dataset` is empty.
    """
    # The loop state is initialised as tensors of fixed dtype so it keeps
    # the same type on every iteration of the graph-mode loop. The original
    # `step = 0,` created a tuple (trailing comma), which supports neither
    # `%` nor `+= 1`.
    step = tf.constant(0)
    loss = tf.constant(float('inf'), dtype=tf.float32)
    accuracy = tf.constant(0., dtype=tf.float32)
    for x, y in dataset:
        loss, accuracy = train_one_step(x, y)
        # Normalise dtypes so they match the initial loop state whatever
        # `train_one_step` returns.
        loss = tf.cast(loss, tf.float32)
        accuracy = tf.cast(accuracy, tf.float32)
        if step % log_every == 0:
            tf.print(step, loss, accuracy)
        step += 1
    return loss, accuracy


def clip(min, max, x):
    """Clamp every element of `x` to the closed interval [`min`, `max`].

    Args:
        min: lower bound (scalar or array broadcastable against `x`).
        max: upper bound (scalar or array broadcastable against `x`).
        x: array-like of values to clamp.

    Returns:
        `np.ndarray` with the shape of `x`; values below `min` are replaced by
        `min`, values above `max` by `max`. NaNs are passed through.
    """
    # `np.clip` clamps in a single pass, without materialising full-size
    # bound arrays. `np.asarray` keeps the return type an ndarray even for
    # scalar input, where `np.clip` alone would return a NumPy scalar.
    return np.asarray(np.clip(x, min, max))

In [2]:
num_epochs = 3
batch_size = 128

# Only these digit classes take part in training.
involved_labels = {1, 3, 5}

# Integer class of every training example (index of the one-hot entry).
labels = np.argmax(y_train, axis=-1)

# Select the involved classes with a single vectorised mask instead of a
# Python loop over every example. `np.isin` needs a sequence: a `set` would
# be treated as one opaque object, hence `sorted(...)`.
is_involved = np.isin(labels, sorted(involved_labels))
X = np.asarray(x_train)[is_involved]
y = np.asarray(y_train)[is_involved]
print(len(X))
dataset = tf.data.Dataset.from_tensor_slices((X, y))
dataset = dataset.repeat(num_epochs).batch(batch_size)

train(dataset)

KeyboardInterrupt: 

In [None]:
from node.utils import tracer

def flip(array, ratio):
    """Randomly replace entries of `array` by `1 - entry`.

    Each entry is flipped independently when a fresh uniform sample from
    `np.random.random` is below `ratio`; all other entries are kept.

    Args:
        array: NumPy array (anything with a `.shape` that supports `1 - array`).
        ratio: flip probability per entry.

    Returns:
        Array of the same shape as `array`.
    """
    flip_mask = np.random.random(size=array.shape) < ratio
    flipped = 1 - array
    return np.where(flip_mask, flipped, array)

# Time grid on which the trajectories are recorded.
t0 = 0.
t1 = 1.
dt = 1e-2
traj_size = int((t1 - t0) / dt) + 1

n_data = 20
flip_ratio = 0.1

# `data` was not defined anywhere in the notebook, so this cell failed with a
# NameError on a fresh kernel.
# NOTE(review): the first `n_data` training images are used so that
# `labels[i]` (computed from `y_train`) lines up with trajectory `i`; replace
# this if another input (e.g. `flip(..., flip_ratio)`) was intended.
data = x_train[:n_data]

# Use the same step size for the solver as for the recording grid.
trace = tracer(RKSolver(dt), my_layer._pvf)
trajectories = trace(t0, t1, dt, data)
# Swap the first two axes so that axis 0 indexes the example and axis 1 the
# time step.
trajectories = tf.transpose(trajectories, [1, 0, 2])
trajectories = trajectories.numpy()


def get_trajectory(x):
    """Trace a single input and return its trajectory as image frames.

    Input shape `[28 * 28]`, output shape `[frames, 28, 28]`.
    """
    # `trace` is given a batch of one; `[:, 0, :]` drops that batch axis.
    trajectory = trace(t0, t1, dt, [x]).numpy()[:, 0, :]
    # Keep the frame axis: reshaping to `[28, 28]` only works for one frame.
    return np.reshape(trajectory, [-1, 28, 28])

In [None]:
# Apply the trained softmax output layer to the last time step (index -1 on
# axis 1) of every trajectory.
preds = output_layer(trajectories[:,-1,:]).numpy()

In [None]:
# Inspect one example: its predicted class distribution, then the reference
# label next to the predicted class.
# NOTE(review): `labels` is indexed like the full training set; make sure
# trajectory `i` was traced from training example `i`.
i = 10
print(preds[i])
labels[i], np.argmax(preds[i])

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import animation


def rescale_frame(array):
    """Min-max normalise a NumPy array to [0, 1] over all of its elements.

    This is deliberately not called `rescale`. Redefining that name here
    would replace the TensorFlow `rescale(x, axis)` that the layer's vector
    field looks up by name, breaking any later re-run of the model cells.

    Args:
        array: array-like of numbers.

    Returns:
        Array of the same shape; a constant input maps to all zeros instead
        of NaN.
    """
    array = np.asarray(array)
    lowest = np.min(array)
    span = np.max(array) - lowest
    # A constant array has zero span; divide by 1 so the result is 0, not 0/0.
    scale = span if span > 0 else 1
    return (array - lowest) / scale


def visualize_trajectory(trajectory):
    """Animate a trajectory of images, one frame per time step.

    Args:
        trajectory: np.array
            Shape `[frames, x_pixel, y_pixel]`.

    Returns: animation.FuncAnimation
    """
    fig, ax = plt.subplots()
    # Every frame is normalised to [0, 1], so pin the colour limits to that
    # range instead of inferring them from the raw first frame.
    img = ax.imshow(rescale_frame(trajectory[0]), cmap='gray',
                    vmin=0., vmax=1.)

    def init():
        img.set_data([[]])
        return img,

    def animate(i):
        img.set_data(rescale_frame(trajectory[i]))
        return img,

    # Take the frame count from the data itself rather than from a global, so
    # trajectories of any length animate without indexing past the end.
    anim = animation.FuncAnimation(
        fig, animate, init_func=init, frames=len(trajectory), blit=True)
    return anim


# Render one animation per trajectory and save it as an mp4 whose file name
# carries the example index and its label.
# NOTE(review): the output directory `../dat/trajectory/` is assumed to exist
# already; it is not created here.
for i, trajectory in enumerate(trajectories):
    label = labels[i]
    # Each trajectory is a stack of flattened images; restore 28 x 28 frames.
    anim = visualize_trajectory(trajectory.reshape([-1, 28, 28]))
    anim.save(f'../dat/trajectory/anim_i{i}_l{label}.mp4')
    plt.show()

In [None]:
# Display the integer labels for reference.
labels

## Conclusion

### Approach 1

~~1. Animation plotting shows that attractors exist for all the displayed instances.~~

~~1. By tracing the flipping ratio, we find that, while setting $\tilde{L} = 3$ in the training process, the flip ratio decreases from $\sim 0.1$ to $\sim 0.001$ only after $\tilde{L} > 30$ approximately for all trials. That is, the static phase vector field is trained without reaching the attractors. And when reaching the attractors, instances in the same class have little difference (but not vanishing), instances from different classes become evidently more distinct.~~

~~1. The attractors for the same class, even though close to each other, are far from single. It seems to confirm the conclusion in the study of Hebbian learning that high-dimensional dynamic systems have extremely many attractors.~~

1. After re-scaling by `lambda x: (x - min(x)) / (max(x) - min(x))`, there do exist attractors having the properties described above. So it seems that the phase point flies straight toward some direction that is specific to each class; in other words, it is "attracted to a direction".

1. It seems that we encounter chaos along the phase trajectory.

### Approach 2

$x^{\prime} = \sigma\left(\sigma^{-1}(x) + \Delta t f(t, x; \theta) \right)$

1. Attractors are reached at $L \sim 300$.
1. There are quite a lot of attractors.
1. Indeed, in this case, the points with $x^{\alpha} \in \{0, 1\}$ for all $\alpha$ are attractors.
1. However, this approach introduces an artificial region of fixed points (i.e. $x^{\alpha} \in \{0, 1\}$ for all $\alpha$), which is not what we hope for.

## References:

1. [Chaos appears in RGE (as a high-dimensional non-linear ODE)](https://physics.stackexchange.com/a/55057) (see also the related [paper](https://arxiv.org/abs/hep-th/0304178)).