In [1]:
# Create a transformer model to predict next element of a stream cipher based on the previous elements
# Create a LSTM model to predict the next bit of a LFSR
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tqdm import tqdm


def create_data(
    num_samples=10000, initial_key1=None, initial_key2=None, ln=5, output_ln=5
):
    if initial_key1 is None:
        initial_key1 = np.random.randint(0, 2, 5)
        print("Initial key 1: ", initial_key1)
    if initial_key2 is None:
        initial_key2 = np.random.randint(0, 2, 5)
        print("Initial key 2: ", initial_key2)

    # data = initial_key
    key1 = initial_key1
    key2 = initial_key2
    x = []
    y = []
    # st = set()
    data = []
    for _ in range(num_samples):
        nxt1 = key1[0] ^ key1[1]
        nxt2 = key2[0] ^ key2[2]
        data.append(nxt1 ^ nxt2)
        if len(data) >= ln + output_ln:
            x.append([2] + data[-(ln + output_ln) : -output_ln] + [3])
            # st.add(tuple(data[-10 * ln : -9 * ln]))
            y.append([2] + data[-output_ln:] + [3])
        key1 = np.roll(key1, -1)
        key2 = np.roll(key2, -1)
        key1[-1] = nxt1
        key2[-1] = nxt2
    # print("Unique samples: ", len(st))
    return np.array(x), np.array(y)

2024-05-04 04:05:23.406903: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-04 04:05:25.235665: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/TensorRT-8.4.0.6//lib:/usr/local/cuda-11.6/lib64:/usr/local/apps/python-3.10.2/lib:/home2/shivam.sood/.mujoco/mujoco210/bin:/home2/shivam.sood/.mujoco/mujoco210/bin:/usr/local/apps/cuDNN/8.4.0-cuda-11.6/lib
2024-05-04 04:05:25.235835: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.

In [2]:
X, Y = create_data(num_samples=10000, ln=10, output_ln=100, initial_key1=np.array([1, 0, 1, 0, 1]),
        initial_key2=np.array([0, 0, 1, 1, 1]),)
X_train, Y_train = X[:500 - 110], Y[:500 - 110]
X_test, Y_test = X[500 - 110:], Y[500 - 110:]
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(390, 12) (390, 102) (9501, 12) (9501, 102)


In [3]:
def positional_embedding(length, depth):
    depth = depth // 2

    positions = np.arange(length)[:, np.newaxis]
    depths = np.arange(depth)[np.newaxis, :] / depth

    angle_rates = 1 / (10000**depths)
    angle_rads = positions * angle_rates
    # print(angle_rads.shape, np.sin(angle_rads).shape, np.cos(angle_rads).shape)
    positional_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)], axis=-1
    )

    return tf.cast(positional_encoding, dtype=tf.float32)


class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model, max_len):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.positional_encoding = positional_embedding(max_len, d_model)

    def call(self, x):
        length = tf.shape(x)[1]
        # print(length)
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.positional_encoding[tf.newaxis, :length, :]

        return x


class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()


class CrossAttention(BaseAttention):
    def call(self, x, y):
        attn_output = self.mha(x, y, y)
        x = self.add([attn_output, x])
        x = self.layernorm(x)

        return x


class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(x, x, x)
        x = self.add([attn_output, x])
        x = self.layernorm(x)

        return x


class CausalSelfAttention(BaseAttention):
    def call(self, x):

        attn_output = self.mha(x, x, x, use_causal_mask=True)
        x = self.add([attn_output, x])
        x = self.layernorm(x)

        return x


class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff):
        super().__init__()
        self.seq = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(dff, activation="relu"),
                tf.keras.layers.Dense(d_model),
            ]
        )
        self.add = tf.keras.layers.Add()
        self.layernorm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        ff_output = self.seq(x)
        x = self.add([ff_output, x])
        x = self.layernorm(x)

        return x


class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff):
        super().__init__()
        self.self_attn = GlobalSelfAttention(num_heads=num_heads, key_dim=d_model)
        self.feed_forward = FeedForward(d_model, dff)

    def call(self, x):
        x = self.self_attn(x)
        x = self.feed_forward(x)
        return x


class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_emb = PositionalEmbedding(vocab_size, d_model, 12)

        self.enc_layers = [
            EncoderLayer(d_model, num_heads, dff) for _ in range(num_layers)
        ]

    def call(self, x):
        x = self.pos_emb(x)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x


class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff):
        super().__init__()
        self.self_attn = CausalSelfAttention(num_heads=num_heads, key_dim=d_model)
        self.cross_attn = CrossAttention(num_heads=num_heads, key_dim=d_model)
        self.feed_forward = FeedForward(d_model, dff)

    def call(self, x, enc_out):
        x = self.self_attn(x)
        x = self.cross_attn(x, enc_out)
        x = self.feed_forward(x)

        return x


class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_emb = PositionalEmbedding(vocab_size, d_model, 101)

        self.dec_layers = [
            DecoderLayer(d_model, num_heads, dff) for _ in range(num_layers)
        ]

    def call(self, x, enc_out):
        x = self.pos_emb(x)

        for i in range(self.num_layers):
            x = self.dec_layers[i](x, enc_out)

        return x


class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, vocab_size)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, vocab_size)
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs):
        x, y = inputs
        enc_out = self.encoder(x)
        dec_out = self.decoder(y, enc_out)

        return self.final_layer(dec_out)

In [4]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
# num_layers = 6
# d_model = 512
# dff = 2048
# num_heads = 8

In [5]:
transformer = Transformer(num_layers, d_model, num_heads, dff, 4)


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# transformer.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])

2024-05-04 04:05:28.844393: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-04 04:05:29.426939: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10398 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:02:00.0, compute capability: 6.1


In [57]:
# print(transformer((X_train[0], Y_train[0, :-1])).shape)

In [61]:
transformer.fit((X_train, Y_train[:, :-1]), Y_train[:, 1:], epochs=1000, batch_size=256, validation_data=((X_test, Y_test[:, :-1]), Y_test[:, 1:]), validation_freq=10, verbose=2, validation_steps=10)




Epoch 1/1000
2/2 - 1s - loss: 0.0553 - sparse_categorical_accuracy: 0.9809 - 529ms/epoch - 264ms/step
Epoch 2/1000
2/2 - 0s - loss: 0.0419 - sparse_categorical_accuracy: 0.9848 - 495ms/epoch - 247ms/step
Epoch 3/1000
2/2 - 0s - loss: 0.0331 - sparse_categorical_accuracy: 0.9883 - 496ms/epoch - 248ms/step
Epoch 4/1000
2/2 - 0s - loss: 0.0237 - sparse_categorical_accuracy: 0.9924 - 495ms/epoch - 248ms/step
Epoch 5/1000
2/2 - 0s - loss: 0.0157 - sparse_categorical_accuracy: 0.9957 - 497ms/epoch - 249ms/step
Epoch 6/1000
2/2 - 0s - loss: 0.0122 - sparse_categorical_accuracy: 0.9965 - 495ms/epoch - 248ms/step
Epoch 7/1000
2/2 - 1s - loss: 0.0068 - sparse_categorical_accuracy: 0.9988 - 500ms/epoch - 250ms/step
Epoch 8/1000
2/2 - 0s - loss: 0.0060 - sparse_categorical_accuracy: 0.9990 - 495ms/epoch - 247ms/step
Epoch 9/1000
2/2 - 0s - loss: 0.0041 - sparse_categorical_accuracy: 0.9994 - 499ms/epoch - 249ms/step
Epoch 10/1000
2/2 - 9s - loss: 0.0026 - sparse_categorical_accuracy: 0.9998 - val_


KeyboardInterrupt



In [62]:
transformer.summary()

Model: "transformer_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_5 (Encoder)         multiple                  2639360   
                                                                 
 decoder_5 (Decoder)         multiple                  4750336   
                                                                 
 dense_109 (Dense)           multiple                  516       
                                                                 
Total params: 7,390,212
Trainable params: 7,390,212
Non-trainable params: 0
_________________________________________________________________


In [63]:
transformer.evaluate((X_test, Y_test[:, :-1]), Y_test[:, 1:])



[2.1919262409210205, 0.7949616312980652]

In [64]:
print(X_test[0][1:-1])
print(Y_test[0][1:-1])

[1 0 0 0 0 0 0 1 1 1]
[0 0 0 1 1 0 0 1 1 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 0 1 0 0 1 1 1
 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1
 0 0 0 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 1 0 0 1 0 1]


In [65]:
seq = np.array([2.0])
for i in range(101):
    nxt = transformer((X_test[0].reshape(1, -1), seq.reshape(1, -1)))
    # print(np.argmax(nxt[0, -1]))
    seq = np.concatenate([seq, [np.argmax(nxt[0, -1])]])

print(seq[1:-1])

[1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1.
 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0.
 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 0.
 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1.
 0. 1. 1. 0.]


In [68]:
print(np.sum(np.rint(seq) == Y_test[0]))

57


In [22]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
# num_layers = 6
# d_model = 512
# dff = 2048
# num_heads = 8


def create_data(
    num_samples=10000, initial_key1=None, initial_key2=None, ln=5, output_ln=5
):
    if initial_key1 is None:
        initial_key1 = np.random.randint(0, 2, 5)
        print("Initial key 1: ", initial_key1)
    if initial_key2 is None:
        initial_key2 = np.random.randint(0, 2, 5)
        print("Initial key 2: ", initial_key2)

    # data = initial_key
    key1 = initial_key1
    key2 = initial_key2
    x = []
    y = []
    # st = set()
    data = []
    for _ in range(num_samples):
        nxt1 = key1[0] ^ key1[1]
        nxt2 = key2[0] ^ key2[2]
        data.append(nxt1 ^ nxt2)
        if len(data) >= ln + output_ln:
            x.append(data[-(ln + output_ln) : -output_ln])
            # st.add(tuple(data[-10 * ln : -9 * ln]))
            y.append(data[-output_ln:] )
        key1 = np.roll(key1, -1)
        key2 = np.roll(key2, -1)
        key1[-1] = nxt1
        key2[-1] = nxt2
    # print("Unique samples: ", len(st))
    return np.array(x), np.array(y)

X, Y = create_data(num_samples=10000, ln=10, output_ln=1, initial_key1=np.array([1, 0, 1, 0, 1]),
        initial_key2=np.array([0, 0, 1, 1, 1]),)
X_train, Y_train = X[:5000 - 11], Y[:5000 - 11]
X_test, Y_test = X[5000 - 11:], Y[5000 - 11:]
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_emb = tf.keras.layers.Embedding(vocab_size, d_model)

        self.enc_layers = [
            EncoderLayer(d_model, num_heads, dff) for _ in range(num_layers)
        ]

    def call(self, x):
        x = self.pos_emb(x)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)

        return x


class FinalEncoder(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, 2)
        self.dense = tf.keras.layers.Dense(1000)
        self.dense2 = tf.keras.layers.Dense(1)
        self.flatten = tf.keras.layers.Flatten()

    def call(self, x):
        tmp = self.dense(self.flatten(self.encoder(x)))
        # print(tmp.shape)
        return self.dense2(tmp)

(4989, 10) (4989, 1) (5001, 10) (5001, 1)


In [23]:
enc = FinalEncoder(num_layers, d_model, num_heads, dff)
optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)
enc.compile(optimizer=optimizer, loss=tf.keras.losses.BinaryCrossentropy(
    from_logits=True), metrics=["accuracy"])


In [24]:
enc.fit(X_train, Y_train, epochs=1000, batch_size=32, validation_data=(X_test, Y_test), validation_freq=10, verbose=2, validation_steps=100)

Epoch 1/1000
156/156 - 13s - loss: 0.7361 - accuracy: 0.5077 - 13s/epoch - 85ms/step
Epoch 2/1000
156/156 - 3s - loss: 0.7514 - accuracy: 0.5097 - 3s/epoch - 21ms/step
Epoch 3/1000
156/156 - 3s - loss: 0.7422 - accuracy: 0.5039 - 3s/epoch - 21ms/step
Epoch 4/1000
156/156 - 3s - loss: 0.7284 - accuracy: 0.5077 - 3s/epoch - 21ms/step
Epoch 5/1000
156/156 - 3s - loss: 0.7310 - accuracy: 0.4955 - 3s/epoch - 21ms/step
Epoch 6/1000
156/156 - 3s - loss: 0.7245 - accuracy: 0.4997 - 3s/epoch - 21ms/step
Epoch 7/1000
156/156 - 3s - loss: 0.7261 - accuracy: 0.4937 - 3s/epoch - 21ms/step
Epoch 8/1000
156/156 - 3s - loss: 0.7263 - accuracy: 0.5003 - 3s/epoch - 21ms/step
Epoch 9/1000
156/156 - 3s - loss: 0.7383 - accuracy: 0.5031 - 3s/epoch - 21ms/step
Epoch 10/1000
156/156 - 5s - loss: 0.7182 - accuracy: 0.4935 - val_loss: 0.7045 - val_accuracy: 0.4978 - 5s/epoch - 30ms/step
Epoch 11/1000
156/156 - 3s - loss: 0.7087 - accuracy: 0.4953 - 3s/epoch - 20ms/step
Epoch 12/1000
156/156 - 3s - loss: 0.7218

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x14e4fd7f7760>
Traceback (most recent call last):
  File "/usr/local/apps/python-3.10.2/lib/python3.10/weakref.py", line 370, in remove
    def remove(k, selfref=ref(self)):
KeyboardInterrupt: 


KeyboardInterrupt: 

In [9]:
enc.summary()

Model: "final_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_1 (Encoder)         multiple                  10504192  
                                                                 
 dense_19 (Dense)            multiple                  513       
                                                                 
Total params: 10,504,705
Trainable params: 10,504,705
Non-trainable params: 0
_________________________________________________________________
