In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Embedding, Dense, Dropout, TimeDistributed, GlobalAveragePooling2D, Input
from tensorflow.keras.models import Model
from tensorflow.keras.applications import EfficientNetV2S
from tensorflow.keras.optimizers import Adam

class PositionalEncoding(tf.keras.layers.Layer):
    """Add fixed sinusoidal positional encodings to a sequence of embeddings.

    The encoding table is computed once in the constructor with NumPy and
    stored as a float32 tensor of shape ``(1, sequence_length, d_model)``.
    Even feature indices hold ``sin(pos * rate)`` and odd feature indices
    hold ``cos(pos * rate)``, where ``rate = 1 / 10000 ** (2 * (i // 2) / d_model)``.

    Args:
        sequence_length: Number of positions to precompute encodings for.
        d_model: Size of the feature (last) axis of the inputs.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, sequence_length, d_model, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can reproduce the constructor arguments.
        self.sequence_length = sequence_length
        self.d_model = d_model
        self.pos_encoding = self.positional_encoding(sequence_length, d_model)

    def get_angles(self, position, i, d_model):
        """Return ``position * rate`` for every (position, feature index) pair.

        ``i // 2`` makes each sin/cos pair of feature indices share one rate.
        """
        angles = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return position * angles

    def positional_encoding(self, position, d_model):
        """Build the ``(1, position, d_model)`` float32 encoding table."""
        angle_rads = self.get_angles(np.arange(position)[:, np.newaxis],
                                     np.arange(d_model)[np.newaxis, :],
                                     d_model)

        # Apply each trigonometric function exactly once. Taking sin of the
        # even columns twice would store sin(sin(angle)) there.
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        pos_encoding = angle_rads[np.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, inputs):
        """Add the encodings for the first ``inputs.shape[1]`` positions."""
        return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "d_model": self.d_model,
        })
        return config

class TransformerEncoderLayer(tf.keras.layers.Layer):
    """One Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input (residual connection) and is then layer-normalized.

    Args:
        d_model: Output width of the feed-forward network; also passed to
            ``MultiHeadAttention`` as ``key_dim``.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can reproduce the constructor arguments.
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        # NOTE(review): key_dim is the size of EACH head; Transformer
        # implementations commonly use d_model // num_heads here. Left
        # unchanged because altering it changes the parameter count -- confirm.
        self.mha = MultiHeadAttention(key_dim=d_model, num_heads=num_heads)
        self.ffn = tf.keras.Sequential(
            [Dense(dff, activation='relu'), Dense(d_model)]
        )

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, training=None):
        """Run the block on ``x``.

        Args:
            x: Input sequence; used as query, value and key of the attention.
            training: Forwarded to the dropout layers. Defaults to ``None``
                so the layer can be called without an explicit flag.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        attn_output = self.mha(x, x, x)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "dff": self.dff,
            "rate": self.rate,
        })
        return config

class TransformerEncoder(tf.keras.layers.Layer):
    """Stack of ``num_layers`` ``TransformerEncoderLayer`` blocks.

    Dropout is applied to the input once, then the blocks run in order.

    Args:
        num_layers: Number of encoder blocks to stack.
        d_model: Passed to every ``TransformerEncoderLayer``.
        num_heads: Passed to every ``TransformerEncoderLayer``.
        dff: Passed to every ``TransformerEncoderLayer``.
        rate: Dropout rate for the input dropout and for every block.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.num_layers = num_layers
        # Kept so get_config() can reproduce the constructor arguments.
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        self.enc_layers = [TransformerEncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = Dropout(rate)

    def call(self, x, training=None):
        """Apply input dropout, then every encoder block in sequence.

        Args:
            x: Input sequence.
            training: Forwarded to the dropout layer and to each block.
                Defaults to ``None`` so the layer can be called without an
                explicit flag.

        Returns:
            The output of the last encoder block.
        """
        x = self.dropout(x, training=training)
        for enc_layer in self.enc_layers:
            x = enc_layer(x, training=training)
        return x

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "num_layers": self.num_layers,
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "dff": self.dff,
            "rate": self.rate,
        })
        return config

def create_model_with_transformer(sequence_length, frame_shape, num_classes, num_layers=2, d_model=256, num_heads=4, dff=1024, rate=0.1):
    """Assemble a video model: frozen EfficientNetV2S per frame + Transformer encoder.

    Pipeline: every frame goes through an ImageNet-pretrained EfficientNetV2S
    (no top, weights frozen), is global-average-pooled, linearly projected to
    ``d_model``, given a positional encoding and passed through a
    ``TransformerEncoder``. A softmax ``Dense`` head is applied to every time
    step, so the model emits one class distribution per frame.

    Args:
        sequence_length: Number of frames per clip.
        frame_shape: Shape tuple of a single frame, e.g. ``(224, 224, 3)``.
        num_classes: Width of the softmax output.
        num_layers, d_model, num_heads, dff, rate: Forwarded to
            ``TransformerEncoder``; ``d_model`` also sets the projection width.

    Returns:
        An uncompiled ``Model`` mapping a clip to per-frame class probabilities.
    """
    clip_shape = (sequence_length,) + frame_shape
    video_input = Input(shape=clip_shape)

    # Pretrained convolutional backbone; frozen so only the layers added
    # below are trained.
    backbone = EfficientNetV2S(weights='imagenet', include_top=False, input_shape=frame_shape)
    backbone.trainable = False

    # Per-frame feature extraction: feature maps -> pooled vector -> d_model.
    frame_maps = TimeDistributed(backbone)(video_input)
    frame_vectors = TimeDistributed(GlobalAveragePooling2D())(frame_maps)
    frame_tokens = TimeDistributed(Dense(d_model))(frame_vectors)

    # Temporal modelling over the sequence of frame tokens.
    positioned_tokens = PositionalEncoding(sequence_length, d_model)(frame_tokens)
    encoder = TransformerEncoder(num_layers, d_model, num_heads, dff, rate)
    encoded_sequence = encoder(positioned_tokens)

    # Classification head shared across time steps.
    frame_classifier = TimeDistributed(Dense(num_classes, activation='softmax'))
    outputs = frame_classifier(encoded_sequence)

    return Model(inputs=video_input, outputs=outputs)

# Build the network for 16-frame clips of 224x224 RGB frames and 5 classes.
model = create_model_with_transformer(
    sequence_length=16,
    frame_shape=(224, 224, 3),
    num_classes=5,
)

# Adam with a learning rate of 6e-5; categorical cross-entropy expects
# one-hot encoded targets.
model.compile(
    optimizer=Adam(learning_rate=0.00006),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview.
model.summary()

In [None]:
# Train for 200 epochs; the returned object is kept in `history`.
# NOTE(review): validation runs on `test_generator` -- confirm that the test
# split is really meant to double as the validation set.
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=200,
)

# Then save the model, run predictions, perform error analysis, etc.