In [None]:
pip install -U albumentations

# Model Building

In [None]:
from __future__ import absolute_import, division,print_function, unicode_literals
import six
from keras.models import Model
from keras.layers import Input, Activation, Dense, Flatten, Dropout, BatchNormalization, AveragePooling3D
from tensorflow.keras.layers import Conv3D, MaxPooling3D
from tensorflow.keras.layers import Add
from keras.regularizers import l2
from tensorflow.keras import backend as K


def _bn_relu(input):
    """Apply batch normalization along CHANNEL_AXIS followed by a ReLU activation."""
    normalized = BatchNormalization(axis=CHANNEL_AXIS)(input)
    relu = Activation("relu")
    return relu(normalized)


def _conv_bn_relu3D(**conv_params):
    """Return a closure that applies Conv3D -> BatchNormalization -> ReLU.

    Required keys: ``filters`` and ``kernel_size``.
    Optional keys (with defaults): ``strides`` ((1, 1, 1)),
    ``kernel_initializer`` ("he_normal"), ``padding`` ("same") and
    ``kernel_regularizer`` (l2(1e-4)). Any other key is ignored.
    """
    conv_kwargs = {
        "filters": conv_params["filters"],
        "kernel_size": conv_params["kernel_size"],
        "strides": conv_params.get("strides", (1, 1, 1)),
        "kernel_initializer": conv_params.get("kernel_initializer", "he_normal"),
        "padding": conv_params.get("padding", "same"),
        "kernel_regularizer": conv_params.get("kernel_regularizer", l2(1e-4)),
    }

    def apply(tensor):
        # A new Conv3D layer is created on every call of the closure.
        convolved = Conv3D(**conv_kwargs)(tensor)
        return _bn_relu(convolved)

    return apply


def _bn_relu_conv3d(**conv_params):
    """Return a closure that applies BatchNormalization -> ReLU -> Conv3D.

    This is the pre-activation ordering (normalize and activate before the
    convolution). Required keys: ``filters`` and ``kernel_size``.
    Optional keys (with defaults): ``strides`` ((1, 1, 1)),
    ``kernel_initializer`` ("he_normal"), ``padding`` ("same") and
    ``kernel_regularizer`` (l2(1e-4)). Any other key is ignored.
    """
    conv_kwargs = {
        "filters": conv_params["filters"],
        "kernel_size": conv_params["kernel_size"],
        "strides": conv_params.get("strides", (1, 1, 1)),
        "kernel_initializer": conv_params.get("kernel_initializer", "he_normal"),
        "padding": conv_params.get("padding", "same"),
        "kernel_regularizer": conv_params.get("kernel_regularizer", l2(1e-4)),
    }

    def apply(tensor):
        activated = _bn_relu(tensor)
        return Conv3D(**conv_kwargs)(activated)

    return apply


def _shortcut3d(input, residual):
    """Add `input` to `residual`, projecting `input` first when shapes differ.

    If the residual branch reduced any of the three spatial dimensions, or
    changed the number of channels, `input` goes through a 1x1x1 strided
    Conv3D so both tensors have the same shape before the element-wise sum.

    Args:
        input: tensor entering the residual block.
        residual: tensor produced by the block's convolution branch.

    Returns:
        The element-wise sum of the (possibly projected) input and `residual`.
    """
    input_shape = K.int_shape(input)
    residual_shape = K.int_shape(residual)

    def _stride(axis):
        # Ceiling division. A strided conv with "same" padding yields
        # ceil(n / stride) outputs, so floor division would return stride 1
        # for odd sizes (e.g. 7 -> 4) and Add() would fail on a shape mismatch.
        return -(-input_shape[axis] // residual_shape[axis])

    strides = (_stride(DIM1_AXIS), _stride(DIM2_AXIS), _stride(DIM3_AXIS))
    equal_channels = input_shape[CHANNEL_AXIS] == residual_shape[CHANNEL_AXIS]

    shortcut = input
    if any(stride > 1 for stride in strides) or not equal_channels:
        # 1x1x1 projection: matches channel count and spatial size of residual.
        shortcut = Conv3D(
            filters=residual_shape[CHANNEL_AXIS],
            kernel_size=(1, 1, 1),
            strides=strides,
            kernel_initializer="he_normal",
            padding="same",
            kernel_regularizer=l2(1e-4)
        )(input)

    return Add()([shortcut, residual])


def _residual_block3d(block_function, filters, kernel_regularizer, repetitions, is_first_layer=False):
    def f(input):
        for i in range(repetitions):
            strides = (1, 1, 1)
            if i == 0 and not is_first_layer:
                strides = (2, 2, 2)
            input = block_function(filters=filters, strides=strides,
                                   kernel_regularizer=kernel_regularizer,
                                   is_first_block_of_first_layer=(
                                       is_first_layer and i == 0)
                                   )(input)
        return input

    return f


def basic_block(filters, strides=(1, 1, 1), kernel_regularizer=l2(1e-4),
                is_first_block_of_first_layer=False):
    """Return a closure building a residual block of two 3x3x3 convolutions.

    The first convolution uses `strides`; the second uses the default stride.
    The block input is merged with the result through `_shortcut3d`.
    """
    def apply(tensor):
        if not is_first_block_of_first_layer:
            first_conv = _bn_relu_conv3d(
                filters=filters,
                kernel_size=(3, 3, 3),
                strides=strides,
                kernel_regularizer=kernel_regularizer,
            )(tensor)
        else:
            # The stem already applied bn -> relu -> maxpool, so the very first
            # block starts with a plain convolution instead of bn -> relu -> conv.
            first_conv = Conv3D(
                filters=filters,
                kernel_size=(3, 3, 3),
                strides=strides,
                padding="same",
                kernel_initializer="he_normal",
                kernel_regularizer=kernel_regularizer,
            )(tensor)

        second_conv = _bn_relu_conv3d(
            filters=filters,
            kernel_size=(3, 3, 3),
            kernel_regularizer=kernel_regularizer,
        )(first_conv)
        return _shortcut3d(tensor, second_conv)

    return apply


def bottleneck(filters, strides=(1, 1, 1), kernel_regularizer=l2(1e-4),
               is_first_block_of_first_layer=False):
    """Return a closure building a bottleneck residual block.

    The block is 1x1x1 -> 3x3x3 -> 1x1x1 convolutions, where the last
    convolution outputs ``filters * 4`` channels. Extended from raghakot's
    2D impl.
    """
    def apply(tensor):
        if not is_first_block_of_first_layer:
            reduced = _bn_relu_conv3d(
                filters=filters,
                kernel_size=(1, 1, 1),
                strides=strides,
                kernel_regularizer=kernel_regularizer,
            )(tensor)
        else:
            # The stem already applied bn -> relu -> maxpool, so the very first
            # block starts with a plain convolution instead of bn -> relu -> conv.
            reduced = Conv3D(
                filters=filters,
                kernel_size=(1, 1, 1),
                strides=strides,
                padding="same",
                kernel_initializer="he_normal",
                kernel_regularizer=kernel_regularizer,
            )(tensor)

        spatial = _bn_relu_conv3d(
            filters=filters,
            kernel_size=(3, 3, 3),
            kernel_regularizer=kernel_regularizer,
        )(reduced)
        expanded = _bn_relu_conv3d(
            filters=filters * 4,
            kernel_size=(1, 1, 1),
            kernel_regularizer=kernel_regularizer,
        )(spatial)

        return _shortcut3d(tensor, expanded)

    return apply


def _handle_data_format():
    """Set the module-level axis indices from the Keras image data format.

    For 'channels_last' the three convolution dimensions are axes 1-3 and the
    channel axis is 4; otherwise the channel axis is 1 and the convolution
    dimensions are axes 2-4.
    """
    global DIM1_AXIS, DIM2_AXIS, DIM3_AXIS, CHANNEL_AXIS
    channels_last = K.image_data_format() == 'channels_last'
    if channels_last:
        DIM1_AXIS, DIM2_AXIS, DIM3_AXIS, CHANNEL_AXIS = 1, 2, 3, 4
    else:
        CHANNEL_AXIS, DIM1_AXIS, DIM2_AXIS, DIM3_AXIS = 1, 2, 3, 4


def _get_block(identifier):
    """Resolve a block factory given either its name or the callable itself.

    A string is looked up among this module's globals; anything else is
    returned unchanged.

    Raises:
        ValueError: if the name is not found (or maps to a falsy value).
    """
    if not isinstance(identifier, six.string_types):
        return identifier

    resolved = globals().get(identifier)
    if resolved:
        return resolved
    raise ValueError('Invalid {}'.format(identifier))


class Resnet3DBuilder(object):
    """Factory of 3D ResNet Keras models (all builders are static methods)."""

    @staticmethod
    def build(input_shape, num_outputs, block_fn, repetitions, reg_factor, drop_rate=0):
        """Assemble a 3D ResNet.

        Args:
            input_shape: 4-tuple, (conv_dim1, conv_dim2, conv_dim3, channels)
                for 'channels_last' or (channels, conv_dim1, conv_dim2,
                conv_dim3) otherwise.
            num_outputs: number of units of the final Dense layer; softmax is
                used when it is greater than 1, sigmoid otherwise.
            block_fn: residual block factory (e.g. `basic_block`,
                `bottleneck`) or the name of one defined in this module.
            repetitions: iterable with the number of blocks of each stage.
            reg_factor: L2 regularization factor for the convolution and
                Dense kernels.
            drop_rate: rate of the Dropout layer applied after every stage.

        Returns:
            The Keras `Model` mapping the input to the Dense output.

        Raises:
            ValueError: if `input_shape` does not have exactly 4 entries.
        """
        _handle_data_format()
        if len(input_shape) != 4:
            raise ValueError("Input shape should be a tuple "
                             "(conv_dim1, conv_dim2, conv_dim3, channels) "
                             "for tensorflow as backend or "
                             "(channels, conv_dim1, conv_dim2, conv_dim3) "
                             "for theano as backend")

        block_fn = _get_block(block_fn)
        input = Input(shape=input_shape)

        # Stem: strided 7x7x7 convolution followed by max pooling.
        conv1 = _conv_bn_relu3D(filters=64, kernel_size=(7, 7, 7),
                                strides=(2, 2, 2),
                                kernel_regularizer=l2(reg_factor)
                                )(input)
        pool1 = MaxPooling3D(pool_size=(3, 3, 3), strides=(2, 2, 2),
                             padding="same")(conv1)

        # Residual stages: the filter count doubles after every stage and a
        # Dropout layer follows each stage.
        block = pool1
        filters = 64
        for i, r in enumerate(repetitions):
            block = _residual_block3d(block_fn, filters=filters,
                                      kernel_regularizer=l2(reg_factor),
                                      repetitions=r, is_first_layer=(i == 0)
                                      )(block)
            filters *= 2
            block = Dropout(drop_rate)(block)

        # Last activation (the blocks are pre-activation, so the final
        # bn -> relu has to be applied explicitly).
        block_output = _bn_relu(block)

        # Average pooling over the whole remaining volume, then classification.
        block_shape = K.int_shape(block)
        pool2 = AveragePooling3D(
            pool_size=(block_shape[DIM1_AXIS],
                       block_shape[DIM2_AXIS],
                       block_shape[DIM3_AXIS]),
            strides=(1, 1, 1))(block_output)
        flatten1 = Flatten()(pool2)

        activation = "softmax" if num_outputs > 1 else "sigmoid"
        dense = Dense(units=num_outputs,
                      kernel_initializer="he_normal",
                      activation=activation,
                      kernel_regularizer=l2(reg_factor))(flatten1)

        model = Model(inputs=input, outputs=dense)
        return model

    @staticmethod
    def build_resnet_101(input_shape, num_outputs, reg_factor=1e-4, drop_rate=0):
        """Build resnet 101."""
        return Resnet3DBuilder.build(input_shape, num_outputs, bottleneck,
                                     [3, 4, 23, 3], reg_factor=reg_factor, drop_rate=drop_rate)

    # The two builders below were missing @staticmethod, so they only worked
    # when called on the class and failed when called on an instance.
    @staticmethod
    def build_resnet_50(input_shape, num_outputs, reg_factor=1e-4, drop_rate=0):
        """Build resnet 50."""
        return Resnet3DBuilder.build(input_shape, num_outputs, bottleneck,
                                     [3, 4, 6, 3], reg_factor=reg_factor, drop_rate=drop_rate)

    @staticmethod
    def build_resnet_18(input_shape, num_outputs, reg_factor=1e-4, drop_rate=0):
        """Build resnet 18."""
        return Resnet3DBuilder.build(input_shape, num_outputs, basic_block,
                                     [2, 2, 2, 2], reg_factor=reg_factor, drop_rate=drop_rate)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np


class STMEM(tf.keras.Model):
    """
    A custom TensorFlow model for spatiotemporal modeling using frame differences
    and a prebuilt 3D ResNet.

    Every segment of `new_length` RGB frames is collapsed into a single
    3-channel map (a convolution over the stacked frames and their frame
    differences, gated by a sigmoid attention computed from the differences).
    The resulting `num_segments` maps are then fed to the 3D ResNet.

    Args:
        num_segments (int): Number of segments in the video.
        new_length (int): Number of frames per segment (must be >= 2; cast to int).
        prebuilt_resnet (tf.keras.Model): Prebuilt 3D ResNet model; called on a
            tensor of shape (B, num_segments, H, W, 3).
        img_size (tuple): (height, width) of the input frames.
    """
    def __init__(self, num_segments, new_length, prebuilt_resnet, img_size = (224,224)):
        super(STMEM, self).__init__()
        self.num_segments = num_segments
        self.new_length = int(new_length)
        if self.new_length < 2:
            # With a single frame per segment there is no frame difference.
            raise ValueError("new_length must be at least 2 to compute frame differences")
        self.height, self.width = img_size

        # Convolution applied to the stacked frames plus frame differences.
        self.m1 = layers.Conv2D(
            filters=3,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding='same',
            activation=None,
            kernel_initializer=tf.keras.initializers.HeNormal(),
            kernel_regularizer=tf.keras.regularizers.l2(1e-4),
        )

        # Convolution applied to the max-pooled frame difference (attention branch).
        self.m2 = layers.Conv2D(
            filters=3,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding='same',
            activation=None,
            kernel_initializer=tf.keras.initializers.HeNormal(),
            kernel_regularizer=tf.keras.regularizers.l2(1e-4),
        )

        self.sigmoid = tf.keras.activations.sigmoid

        # The prebuilt 3D ResNet model
        self.resnet = prebuilt_resnet  # This is passed as a parameter

    def call(self, input_tensor):
        """Run the motion-enhancement stage and the 3D ResNet.

        Args:
            input_tensor: tensor of shape (B, D, H, W, 3) with
                D == num_segments * new_length, H == self.height and
                W == self.width.

        Returns:
            The output of the prebuilt ResNet.

        Raises:
            ValueError: if the input is not rank 5 or its (static) depth is not
                num_segments * new_length.
        """
        if len(input_tensor.shape) != 5:
            raise ValueError("Input tensor must have shape (B, D, H, W, C)")

        segments = self.num_segments
        length = self.new_length
        depth = input_tensor.shape[1]
        if depth is not None and depth != segments * length:
            raise ValueError(
                "Input depth {} does not equal num_segments * new_length = {}".format(
                    depth, segments * length))

        batch = tf.shape(input_tensor)[0]

        # Split the depth axis into segments and merge segments with the batch:
        # (B, S*L, H, W, 3) -> (B*S, L, H, W, 3). The depth axis is S-major, so
        # this plain reshape keeps every frame intact.
        clips = tf.reshape(
            input_tensor, [batch * segments, length, self.height, self.width, 3])

        # Fold the frames into the channel axis, frame-major (f0.rgb, f1.rgb, ...):
        # (B*S, L, H, W, 3) -> (B*S, H, W, L, 3) -> (B*S, H, W, L*3).
        # Reshaping a channels-last tensor directly, as the channels-first
        # formulation does, would mix spatial positions into the channel axis.
        clips = tf.transpose(clips, [0, 2, 3, 1, 4])
        stacked = tf.reshape(
            clips, [batch * segments, self.height, self.width, length * 3])

        # Frame difference: frame t+1 minus frame t for every RGB channel.
        frame_diff = stacked[:, :, :, 3:] - stacked[:, :, :, : (length - 1) * 3]

        input_and_frame_diff = tf.concat([stacked, frame_diff], axis=-1)
        input_and_frame_diff = self.m1(input_and_frame_diff)

        # Attention branch: max of the differences over the frame axis.
        frame_diff = tf.reshape(
            frame_diff, [batch * segments, self.height, self.width, length - 1, 3])
        frame_diff = tf.reduce_max(frame_diff, axis=3)  # max over length axis
        frame_diff = self.m2(frame_diff)
        frame_diff = self.sigmoid(frame_diff)

        output = frame_diff * input_and_frame_diff  # (B * num_segments, H, W, 3)

        # Reshape back to (B, num_segments, H, W, 3)
        output = tf.reshape(output, [batch, segments, self.height, self.width, 3])

        # Now pass the output through the prebuilt 3D ResNet
        resnet_output = self.resnet(output)  # Output from ResNet

        return resnet_output


# Example usage:

if __name__ == '__main__':
    # Smoke test. Example input of shape (B, D, H, W, C) = (2, 36, 224, 224, 3),
    # i.e. 6 segments of 6 frames each.
    a = tf.convert_to_tensor(np.random.rand(2, 36, 224, 224, 3).astype(np.float32))
    
    resnet = Resnet3DBuilder.build_resnet_50(
        input_shape=(6, 224, 224, 3),  # (num_segments, 224, 224, 3 RGB channels)
        num_outputs=18)

    model = STMEM(num_segments=6, new_length=6, prebuilt_resnet=resnet)
    out = model(a)
    print(out.shape)


In [None]:
print(out)

# Data frame processing

In [None]:
import pandas as pd
import os
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from tensorflow.keras.utils import Sequence
from sklearn.preprocessing import OneHotEncoder

import albumentations as A
from albumentations.core.composition import ReplayCompose

# Data directories and file paths

In [None]:
# Annotation CSV files (despite the *_DIR suffix, these two are file paths).
TRAIN_DATA_DIR = "/kaggle/input/20-bnjester-csv-files/Train.csv"
VAL_DATA_DIR = "/kaggle/input/20-bnjester-csv-files/Validation.csv"
# Directories whose entries are listed later to obtain the available video ids.
Cropped_TRAIN_DIR = "/kaggle/input/hand-cropped-20jester-train-dataset/Cropped_Train_Data"
Cropped_VAL_DIR = "/kaggle/input/hand-cropped-20jester-validation-dataset/Cropped_Validation_Data"

In [None]:
# One-hot encoder for the gesture labels; sparse_output=False requests a dense
# array from transform() instead of a sparse matrix.
label_encoder = OneHotEncoder(sparse_output=False)

In [None]:
# The 18 gesture class names. Each name is wrapped in its own one-element list
# so the array has one row per class and a single column.
labels = np.array([["Doing other things"], ["No gesture"],
         ["Rolling Hand Backward"], ["Rolling Hand Forward"],
         ["Shaking Hand"], 
         ["Sliding Two Fingers Down"], ["Sliding Two Fingers Left"], ["Sliding Two Fingers Right"], ["Sliding Two Fingers Up"], 
         ["Stop Sign"], 
         ["Swiping Down"], ["Swiping Left"], ["Swiping Right"], ["Swiping Up"],
         ["Thumb Down"], ["Thumb Up"],
         ["Turning Hand Clockwise"], ["Turning Hand Counterclockwise"]])


label_encoder.fit(labels) # fit the encoder on the full list of class names

In [None]:
# Entry names in the cropped-data directories are parsed as integer video ids
# (int() raises ValueError on any non-numeric entry name).
train_video_id_ls = list(map(int, os.listdir(Cropped_TRAIN_DIR)))
val_video_id_ls = list(map(int, os.listdir(Cropped_VAL_DIR)))

train_df = pd.read_csv(TRAIN_DATA_DIR)
val_df = pd.read_csv(VAL_DATA_DIR)

# Filter (not sort): keep only the rows whose video_id is present on disk.
sort_train_df = train_df[train_df["video_id"].isin(train_video_id_ls)] # keep only the used data
sort_val_df = val_df[val_df["video_id"].isin(val_video_id_ls)]

In [None]:
print("train_df len: ", len(train_df))
print("sort_train_df len: ", len(sort_train_df))
print("val_df len: ", len(val_df))
print("sort_val_df len: ", len(sort_val_df))

## Dataset

## parameters

In [None]:
batch_size = 64

## data

In [None]:


def augmentation(img, input_shape):
    """Run the training augmentation pipeline on one image.

    Args:
        img: image array passed to the pipeline as ``image``.
        input_shape: sequence whose first two entries are the target
            (height, width) of the resize step.

    Returns:
        The dictionary produced by the ReplayCompose pipeline; callers read
        its "image" and "replay" entries.
    """
    target_height = input_shape[0]
    target_width = input_shape[1]

    steps = [
        A.RandomBrightnessContrast(p=0.2),
        A.Resize(height=target_height, width=target_width),
    ]
    pipeline = ReplayCompose(steps)

    return pipeline(image=img)

In [None]:
class VideoDataset(Sequence):
    """Keras Sequence yielding batches of videos stored as numbered JPEG frames.

    Each video lives in ``<video_dir>/<video_id>/`` with frames named
    ``00001.jpg``, ``00002.jpg``, ... The first ``input_shape[0]`` frames are
    loaded, resized to ``input_shape[1] x input_shape[2]`` and scaled to [0, 1].

    Args:
        usage: "train" enables augmentation (the same replayed transform is
            applied to every frame of a video); any other value only resizes.
        data_frame: DataFrame with ``video_id`` and ``label`` columns.
        video_dir: directory containing one sub-folder per video id.
        batch_size: number of videos per batch.
        input_shape: (D, H, W, C) shape of one video sample.
        shuffle: whether to shuffle the rows at construction time and on
            every `on_epoch_end` call.
    """

    def __init__(self, usage, data_frame, video_dir, batch_size, input_shape, shuffle=True):
        # Initialize the Keras base class (newer Keras versions warn otherwise).
        super().__init__()
        self.usage = usage
        self.data = data_frame
        self.video_dir = video_dir
        self.batch_size = batch_size
        self.input_shape = input_shape  # (D, H, W, C)
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.data) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch `idx` as (X, y).

        X is a float32 array of shape (batch, D, H, W, C); y is the one-hot
        encoding of the labels produced by the module-level `label_encoder`.
        """
        batch_data = self.data.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
        X = np.zeros((len(batch_data), *self.input_shape), dtype=np.float32)
        y = []

        # The frame count comes from input_shape instead of a hard-coded 36 so
        # the dataset stays consistent with the declared sample shape.
        num_frames = self.input_shape[0]
        target_height, target_width = self.input_shape[1], self.input_shape[2]

        for i, row in enumerate(batch_data.itertuples()):
            folder = os.path.join(self.video_dir, str(row.video_id))

            frames = []
            replay = None  # augmentation parameters shared by all frames of this video

            for j in range(1, num_frames + 1):
                img_path = os.path.join(folder, f"{j:05d}.jpg")
                # Context manager guarantees the image file handle is released.
                with Image.open(img_path) as frame_file:
                    img = np.array(frame_file.convert("RGB"))

                if self.usage == "train":
                    if replay is None:
                        # Sample the random transform once, on the first frame.
                        aug = augmentation(img=img, input_shape=(target_height, target_width))
                        replay = aug["replay"]

                    # Replay the recorded transform so every frame is augmented identically.
                    img = A.ReplayCompose.replay(replay, image=img)["image"]
                else:
                    # cv2.resize takes the target size as (width, height).
                    img = cv2.resize(img, (target_width, target_height))

                frames.append(img.astype(np.float32) / 255.0)

            X[i] = np.stack(frames, axis=0)  # (D, H, W, C)

            y.append([row.label])

        y = label_encoder.transform(y)
        return X, y

    def on_epoch_end(self):
        """Shuffle the rows (when `shuffle` is enabled)."""
        if self.shuffle:
            self.data = self.data.sample(frac=1).reset_index(drop=True)

In [None]:
# Video datasets (training and validation)
train_dataset = VideoDataset(
    usage = "train",
    data_frame= sort_train_df,          # data frame read from the CSV file
    video_dir=Cropped_TRAIN_DIR,   # path to the folder containing the videos as frames
    batch_size=batch_size,                   # number of videos per batch
    input_shape=(36, 128, 128, 3),   # 36 frames, size 128x128, 3 color channels
    shuffle=True                    # whether to shuffle the data every epoch
)

val_dataset = VideoDataset(
    usage = "val",
    data_frame= sort_val_df,          # data frame read from the CSV file
    video_dir=Cropped_VAL_DIR,   # path to the folder containing the videos as frames
    batch_size=batch_size,                   # number of videos per batch
    input_shape=(36, 128, 128, 3),   # 36 frames, size 128x128, 3 color channels
    shuffle=True                    # whether to shuffle the data every epoch
)

In [None]:
# Sanity-check one training batch: pixel value range, label batch shape,
# one encoded label, and the number of batches per epoch.
X_batch, y_batch = train_dataset[0]
print(np.max(X_batch))
print(np.min(X_batch))
print(y_batch.shape)

#print(X_batch[0])
print(y_batch[0])

len(train_dataset)

In [None]:
# Data generator

def generator(dataset):
    """Yield every batch of `dataset` once, then signal the end of the epoch.

    Keras only calls `on_epoch_end` on a Sequence it iterates itself. When the
    Sequence is wrapped in a plain generator, that hook never fires and the
    data would keep its initial order for every epoch. Calling the hook after
    a full pass restores the per-epoch behaviour.

    Args:
        dataset: indexable object supporting `len()` and integer indexing;
            may optionally define `on_epoch_end()`.

    Yields:
        `dataset[i]` for i in `range(len(dataset))`.
    """
    for i in range(len(dataset)):
        yield dataset[i]

    # Objects without the hook (e.g. plain lists) are supported unchanged.
    epoch_end_hook = getattr(dataset, "on_epoch_end", None)
    if callable(epoch_end_hook):
        epoch_end_hook()

# **Model**

In [None]:
!nvidia-smi

In [None]:
from tensorflow.keras.utils import plot_model
import tensorflow as tf

#from tensorflow.keras import mixed_precision
#mixed_precision.set_global_policy('mixed_float16')

from tensorflow.keras.optimizers import Adam
import wandb
from wandb.integration.keras import WandbCallback


## hyperparameters

In [None]:
PROJECT = "HandActionReg"
RESUME = "allow"
# Never commit an API key with the notebook: it is read from the environment
# (set WANDB_API_KEY, e.g. from the platform's secret store, before running).
# When the variable is unset this is None. NOTE(review): wandb.login(key=None)
# presumably falls back to its own credential lookup/prompt; confirm.
# The key that was previously hard-coded here should be revoked and rotated.
WANDB_KEY = os.environ.get("WANDB_API_KEY")

In [None]:
wandb.login(
    key = WANDB_KEY,
)

In [None]:
learning_rate = 1e-4
epochs = 50

## model

In [None]:
# Initialize the MirroredStrategy; the model is built and compiled inside its scope.
strategy = tf.distribute.MirroredStrategy()

In [None]:
# Build and compile the model inside the distribution strategy scope.
with strategy.scope():
    print("Number of GPUs: ", strategy.num_replicas_in_sync)
    print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

    num_segments = 6
    INPUT_DEPTH = 36  # total frames per video; must equal the dataset's frame count

    resnet = Resnet3DBuilder.build_resnet_50(
        input_shape=(num_segments, 128, 128, 3),  # (segments, 128, 128, 3 RGB channels)
        num_outputs=18)
    

    # INPUT_DEPTH/num_segments is a float (6.0); STMEM casts new_length to int.
    model = STMEM(
        num_segments=num_segments,
        new_length=INPUT_DEPTH/num_segments,
        prebuilt_resnet=resnet,
        img_size = (128,128)
    )

    # Compile the model
    model.compile(optimizer=Adam(learning_rate = learning_rate, weight_decay = 1e-6, clipnorm=1.0),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', 'precision', 'recall'])

In [None]:
model.summary()

In [None]:
#plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
#BUFFERED_BATCHES = 300


AUTOTUNE = tf.data.AUTOTUNE


def _sequence_to_tf_dataset(sequence):
    """Wrap a batch-yielding Sequence in a prefetching tf.data.Dataset.

    Each element is one full batch (X, y), so the leading dimension of both
    specs is None (the last batch may be smaller).
    """
    return tf.data.Dataset.from_generator(
        lambda: generator(sequence),
        output_signature=(
            tf.TensorSpec(shape=(None, INPUT_DEPTH, 128, 128, 3), dtype=tf.float32),  # X.shape
            tf.TensorSpec(shape=(None, 18), dtype=tf.float32),  # y.shape
        ),
    ).prefetch(AUTOTUNE)


# Same pipeline for both splits; previously two copy-pasted definitions.
gen_train_dataset = _sequence_to_tf_dataset(train_dataset)
gen_val_dataset = _sequence_to_tf_dataset(val_dataset)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TerminateOnNaN 

# Stop training when val_loss has not improved for 7 epochs.
early_stop = EarlyStopping(
    monitor='val_loss',    # Metric to monitor
    patience=7,            # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity
)

# Reduce the learning rate on a val_loss plateau. Its patience (5) is shorter
# than early stopping's (7), so the rate is lowered before training is stopped.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,          # reduce LR by this factor
    patience=5,          # wait this many epochs
    min_lr=1e-6          # lower bound
)

# Keep only the best model (lowest val_loss) on disk.
checkpoint = ModelCheckpoint(
    filepath='/kaggle/working/STMEM_3D_RestNet50.keras',        # Filepath to save the model
    monitor='val_loss',              # Monitor validation loss
    save_best_only=True,             # Only save when val_loss improves
    save_weights_only=False,         # Save full model (set to True to save only weights)
    mode='min',                      # 'min' means lower val_loss is better
    verbose=1                        # Logs when the model is saved
)

In [None]:
wandb.init(
    project=PROJECT,
    resume=RESUME,
    name="STMEM_hand_init",
    config={
         "learning_rate": learning_rate,
         "epochs": epochs,
         "batch_size": batch_size,
    },
)

In [None]:
# The training dataset is repeated indefinitely, so steps_per_epoch (one full
# pass over train_dataset) defines where each epoch ends. The validation
# dataset is finite and is consumed in full.
model.fit(gen_train_dataset.repeat(), 
          epochs= epochs, 
          steps_per_epoch=len(train_dataset), 
          validation_data=gen_val_dataset, 
          verbose=1,
          callbacks=[
                TerminateOnNaN(),
                WandbCallback(save_model=False),  # W&B logging callback; model saving through W&B is disabled
                early_stop,
                reduce_lr,
                checkpoint,
            ])