# 1. Import Library

In [None]:
%pip install mediapipe

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import mediapipe as mp
from tqdm import tqdm
import gc
import cv2
import tensorflow as tf
import pandas as pd
from pathlib import Path
import multiprocessing
import concurrent.futures
from tensorflow.keras.utils import to_categorical

# 2. EDA

In [None]:
# Load the training annotations and discard the "format" column.
df = pd.read_csv("/kaggle/input/20bn-jester/Train.csv").drop(columns="format")

In [None]:
df.head()

In [None]:
df['label'].nunique()

In [None]:
df['label'].value_counts()

In [None]:
# Gesture classes used in this project, one entry per line.
actions = [
    "Doing other things",
    "No gesture",
    "Rolling Hand Backward",
    "Rolling Hand Forward",
    "Shaking Hand",
    "Sliding Two Fingers Down",
    "Sliding Two Fingers Left",
    "Sliding Two Fingers Right",
    "Sliding Two Fingers Up",
    "Stop Sign",
    "Swiping Down",
    "Swiping Left",
    "Swiping Right",
    "Swiping Up",
    "Thumb Down",
    "Thumb Up",
    "Turning Hand Clockwise",
    "Turning Hand Counterclockwise",
]

In [None]:
# Keep only the rows whose label is one of the selected gestures.
df_filtered = df.loc[df['label'].isin(actions)]

In [None]:
df_filtered.info()

In [None]:
# Number of clips for every (label, label_id) pair among the selected gestures.
label_counts = df_filtered.groupby(['label', 'label_id']).size()
label_stats = label_counts.reset_index(name='count')
print(label_stats)

In [None]:
from pathlib import Path

# Directory whose sub-folders are named after video ids.
train_dir = Path('/kaggle/input/20bn-jester/Train')

# Get the list of all sub-folders
all_folders = list(filter(Path.is_dir, train_dir.iterdir()))

In [None]:
num_video = len(all_folders)

In [None]:
num_video

In [None]:
# Separate the dataset into action-specific subsets for parallel joint landmark extraction.

def make_folder(action):
    """Return the frame folders of every video labelled ``action``.

    The video ids for ``action`` are read from the global ``df``; the result
    is the subset of the global ``all_folders`` (``Path`` objects) whose
    directory name equals one of those ids.
    """
    wanted_ids = set(df.loc[df['label'] == action, 'video_id'].astype(str))
    return [folder for folder in all_folders if folder.name in wanted_ids]

# 3. Keypoint using mediapipe holistics

In [None]:
mp_holistic = mp.solutions.holistic  # Holistic solution module; used below for its connection sets
mp_drawing = mp.solutions.drawing_utils  # Drawing helpers used by draw_landmarks

In [None]:
# Pose landmarks kept from the holistic output.
# The list order is the row order of the pose part of each keypoint array
# built by extract_keypoint, so it has to stay in sync with the
# poseid2fileid mapping used for the bone stream.
from mediapipe.python.solutions.holistic import PoseLandmark
included_landmarks = [
    # right arm: rows 0-2
    PoseLandmark.RIGHT_SHOULDER, 
    PoseLandmark.RIGHT_ELBOW, 
    PoseLandmark.RIGHT_WRIST, 

    # left arm: rows 3-5
    PoseLandmark.LEFT_SHOULDER, 
    PoseLandmark.LEFT_ELBOW, 
    PoseLandmark.LEFT_WRIST, 
    ]

In [None]:
"""Processes an RGB image and returns the pose landmarks, left and right hand landmarks, and face landmarks on the most prominent person detected.
    Returns:
      A NamedTuple with fields describing the landmarks on the most prominent
      person detected:
        1) "pose_landmarks" field that contains the pose landmarks.
        2) "pose_world_landmarks" field that contains the pose landmarks in
        real-world 3D coordinates that are in meters with the origin at the
        center between hips.
        3) "left_hand_landmarks" field that contains the left-hand landmarks.
        4) "right_hand_landmarks" field that contains the right-hand landmarks.
        5) "face_landmarks" field that contains the face landmarks.
        6) "segmentation_mask" field that contains the segmentation mask if
           "enable_segmentation" is set to true.
    """
def mediapipe_detection(image, model):
    """Run ``model`` on a BGR frame and return the frame with the result.

    Args:
        image: Frame in OpenCV BGR channel order.
        model: Object exposing ``process(rgb_image)``; callers in this file
            pass a MediaPipe Holistic instance.

    Returns:
        ``(image, res)``: the frame after a BGR -> RGB -> BGR round trip, and
        whatever ``model.process`` returned for the RGB version.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The array is marked read-only only for the duration of the call.
    rgb.flags.writeable = False
    res = model.process(rgb)
    rgb.flags.writeable = True
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return bgr, res

In [None]:
def draw_landmarks(image, res):
    """Draw the pose, left-hand and right-hand landmarks of ``res`` on ``image``.

    Each landmark set is handed to ``mp_drawing.draw_landmarks`` together with
    its connection set; nothing is returned.
    """
    overlays = (
        (res.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (res.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (res.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmark_list, connections in overlays:
        mp_drawing.draw_landmarks(image, landmark_list, connections)

# 4. Extract keypoint features

In [None]:
def extract_keypoint(res):
    """Flatten a holistic result into a ``(48, 3)`` array of x, y, z values.

    Row layout: the pose landmarks listed in ``included_landmarks`` first,
    then the 21 left-hand landmarks, then the 21 right-hand landmarks.
    Any part that was not detected is filled with zeros.
    """
    def _xyz(points):
        # Concatenate the x, y, z of every landmark into one flat list.
        flat = []
        for pt in points:
            flat.extend([pt.x, pt.y, pt.z])
        return flat

    coords = []

    pose = res.pose_landmarks
    if pose:
        coords += _xyz(pose.landmark[idx] for idx in included_landmarks)
    else:
        coords += [0, 0, 0] * len(included_landmarks)

    # Left hand first, then right hand, 21 landmarks each.
    for hand in (res.left_hand_landmarks, res.right_hand_landmarks):
        if hand:
            coords += _xyz(hand.landmark)
        else:
            coords += [0, 0, 0] * 21

    return np.array(coords).reshape(48, 3)

## 4.1. Set Folder for Collections

In [None]:
# In the following section, we will process the training dataset; similar steps will be applied to the validation and test datasets.

# Output directory for the joint-stream arrays of the training split.
data_path = "/kaggle/working/data/joint_stream/train"
os.makedirs(data_path, exist_ok=True)

# 5. Collect keypoints

## 5.1. Joint stream dataset

In [None]:
#number of frames each video
sequence_length = 37

In [None]:
def process_video(action, video_index, video_folder, data_path, sequence_length=37):
    """Extract keypoints from one video's frames and save them as one .npy file.

    Frames are read from ``video_folder`` as ``00001.jpg``, ``00002.jpg``, ...
    up to ``sequence_length``; later frames are never read. Missing or
    unreadable frames are skipped.

    The saved array always has ``sequence_length`` entries on its first axis.
    When fewer frames could be processed, the last extracted keypoints are
    repeated to fill the gap (the same padding the webcam inference code
    applies to its frames). Without this, short clips would produce arrays of
    differing length that cannot be stacked into one training tensor.

    Args:
        action: Label of the video; only passed through to the return value.
        video_index: Identifier used as the output file name (``<id>.npy``).
        video_folder: ``Path`` of the folder holding the frame images.
        data_path: Directory the ``.npy`` file is written to.
        sequence_length: Number of frames per saved sequence.

    Returns:
        ``(action, video_index, frame_processed)`` where ``frame_processed``
        is the number of frames actually read (before padding). When it is 0
        no file is written.
    """
    sequence = []  # Keypoints of every successfully processed frame

    # One Holistic instance per video, used for all of its frames.
    with mp.solutions.holistic.Holistic(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5,
        model_complexity=1
    ) as holistic:
        for frame_num in range(1, sequence_length + 1):
            frame_path = video_folder / f"{frame_num:05d}.jpg"
            if not frame_path.exists():
                continue  # Frame does not exist

            frame = cv2.imread(str(frame_path))
            if frame is None:
                continue  # Frame could not be decoded

            _, res = mediapipe_detection(frame, holistic)
            sequence.append(extract_keypoint(res))

    frame_processed = len(sequence)
    if frame_processed == 0:
        # Nothing usable: an empty array would only break downstream loading.
        return action, video_index, frame_processed

    # Pad short clips so every saved file has the same temporal length.
    sequence.extend([sequence[-1]] * (sequence_length - frame_processed))

    npy_path = os.path.join(data_path, f"{video_index}.npy")
    np.save(npy_path, np.array(sequence))

    return action, video_index, frame_processed

In [None]:
"""
Output structure:

joint_stream/
    ├── train/
    │     ├── videoid1.npy
    │     ├── videoid2.npy
    │     └── ...
    └── val/
          ├── videoid1.npy
          ├── videoid2.npy
          └── ...

Each .npy file has shape (T, V, C) = (37, 48, 3), where:
    - T = 37 frames (temporal dimension)
    - V = 48 joints (including 6 body pose landmarks and 21 landmarks for each left and right hand)
    - C = 3 channels (x, y, z coordinates)
"""

num_workers = max(1, multiprocessing.cpu_count() - 1)

# Frame folders to process, grouped by gesture label.
action_videos = {action: make_folder(action) for action in actions}
total_videos = sum(len(folders) for folders in action_videos.values())

# Multi-thread Processing
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Remember which job each future belongs to so a failure can name its video.
    futures = {}

    for action in actions:
        for video_folder in action_videos[action]:
            video_id = int(video_folder.name)
            future = executor.submit(
                process_video,
                action,
                video_id,
                video_folder,
                data_path,
                sequence_length
            )
            futures[future] = (action, video_id)

    with tqdm(total=total_videos, desc="Processing all videos") as pbar:
        for future in concurrent.futures.as_completed(futures):
            job_action, job_video_id = futures[future]
            try:
                action, video_id, frames = future.result()
                pbar.set_description(f"Processed: {action} - video {video_id} ({frames}/{sequence_length} frames)")
            except Exception as e:
                print(f"Error processing video {job_video_id} ({job_action}): {e}")
            finally:
                # Count failed jobs too, so the bar reaches its total.
                pbar.update(1)

            # Periodic memory cleanup
            if pbar.n % 10 == 0:
                gc.collect()

## 5.2. Joint motion stream, Bone stream, Bone motion stream Dataset

In [None]:
# Bones of the upper body as pairs of MediaPipe pose landmark IDs
# (presumably shoulders 11/12, elbows 13/14, wrists 15/16 — confirm against PoseLandmark).
POSE_CONNECTIONS = frozenset([(11, 12), (11, 13), (13, 15), (12, 14), (14, 16)])

In [None]:
# Mapping from pose landmark IDs to row indices in the saved keypoint arrays
poseid2fileid = {12: 0, 14: 1, 16: 2, 11: 3, 13: 4, 15: 5}

# Translate every pose connection into a pair of row indices.
# NOTE(review): POSE_CONNECTIONS is a frozenset, so the order of this list
# (and therefore the order of the pose bones in the bone streams) is the
# set's iteration order, not the order in which the pairs were written.
# Training and inference must build this list the same way.
POSE_CONNECTIONS_FILE_INDEX = [
    (poseid2fileid[a], poseid2fileid[b])
    for (a, b) in POSE_CONNECTIONS
]

# Index pairs when the connections are taken in the order they are written in POSE_CONNECTIONS:
#[(3, 0), (3, 4), (4, 5), (0, 1), (1, 2)]

In [None]:
"""
The 21 hand landmarks:
  WRIST = 0
  THUMB_CMC = 1
  THUMB_MCP = 2
  THUMB_IP = 3
  THUMB_TIP = 4
  INDEX_FINGER_MCP = 5
  INDEX_FINGER_PIP = 6
  INDEX_FINGER_DIP = 7
  INDEX_FINGER_TIP = 8
  MIDDLE_FINGER_MCP = 9
  MIDDLE_FINGER_PIP = 10
  MIDDLE_FINGER_DIP = 11
  MIDDLE_FINGER_TIP = 12
  RING_FINGER_MCP = 13
  RING_FINGER_PIP = 14
  RING_FINGER_DIP = 15
  RING_FINGER_TIP = 16
  PINKY_MCP = 17
  PINKY_PIP = 18
  PINKY_DIP = 19
  PINKY_TIP = 20
"""

"Connection" 
# Each tuple is a pair of hand landmark IDs (see the table above) joined by a bone.
HAND_PALM_CONNECTIONS = ((0, 1), (0, 5), (9, 13), (13, 17), (5, 9), (0, 17))

HAND_THUMB_CONNECTIONS = ((1, 2), (2, 3), (3, 4))

HAND_INDEX_FINGER_CONNECTIONS = ((5, 6), (6, 7), (7, 8))

HAND_MIDDLE_FINGER_CONNECTIONS = ((9, 10), (10, 11), (11, 12))

HAND_RING_FINGER_CONNECTIONS = ((13, 14), (14, 15), (15, 16))

HAND_PINKY_FINGER_CONNECTIONS = ((17, 18), (18, 19), (19, 20))

# Union of all groups: 6 palm bones + 5 fingers x 3 bones = 21 connections.
# NOTE(review): being a frozenset, its iteration order is not the order written above.
HAND_CONNECTIONS = frozenset().union(*[
    HAND_PALM_CONNECTIONS, HAND_THUMB_CONNECTIONS,
    HAND_INDEX_FINGER_CONNECTIONS, HAND_MIDDLE_FINGER_CONNECTIONS,
    HAND_RING_FINGER_CONNECTIONS, HAND_PINKY_FINGER_CONNECTIONS
])

In [None]:
# Hand landmark ID -> row index of the left hand in a keypoint array:
# the offset 6 skips the six pose rows that come first.
handid2fileid = {a: a + 6 for a in range(21)}
print(handid2fileid)

In [None]:
# Hand connections expressed as row-index pairs of the keypoint array.

# Left hand: rows given directly by handid2fileid.
LEFT_HAND_CONNECTIONS_FILE_INDEX = [
    (handid2fileid[a], handid2fileid[b])
    for (a, b) in HAND_CONNECTIONS
]

# Right hand: same pairs shifted by 21 rows, past the left-hand block.
RIGHT_HAND_CONNECTIONS_FILE_INDEX = [
    (handid2fileid[a] + 21, handid2fileid[b] + 21)
    for (a, b) in HAND_CONNECTIONS
]

In [None]:
# Joint-stream arrays that serve as input for the derived streams
train_src = "/kaggle/input/msstnetdataset/data/joint_stream/train"
val_src = "/kaggle/input/msstnetdataset/data/joint_stream/val"

# Output locations of the derived streams:
# - joint_motion_stream: frame-to-frame differences of the joints, shape (T-1, V, C)
# - bone_stream: vectors between connected joints (pose and hands), shape (T, V-1, C)
# - bone_motion_stream: frame-to-frame differences of the bones, shape (T-1, V-1, C)

train_dst1 = "/kaggle/working/data/joint_motion_stream/train"
val_dst1 = "/kaggle/working/data/joint_motion_stream/val"
train_dst2 = "/kaggle/working/data/bone_stream/train"
val_dst2 = "/kaggle/working/data/bone_stream/val"
train_dst3 = "/kaggle/working/data/bone_motion_stream/train"
val_dst3 = "/kaggle/working/data/bone_motion_stream/val"

# Create every destination directory up front.
for _dst in (train_dst1, val_dst1, train_dst2, val_dst2, train_dst3, val_dst3):
    os.makedirs(_dst, exist_ok=True)

def process_in_batches(src_folder, dst_folder1, dst_folder2, dst_folder3, batch_size=100):
    """Derive the three extra streams from every joint-stream file.

    For each ``.npy`` file in ``src_folder`` (an array indexed as
    ``[frame, joint, channel]``) three files with the same name are written:

    * ``dst_folder1``: joint motion, difference between consecutive frames.
    * ``dst_folder2``: bones, ``joint[b] - joint[a]`` for every pair in the
      pose, left-hand and right-hand connection index lists, in that order.
    * ``dst_folder3``: bone motion, difference between consecutive bone frames.

    Files are handled in groups of ``batch_size`` only to print progress and
    run the garbage collector between groups. A file that fails is reported
    and skipped; outputs already written for it are left in place.

    Args:
        src_folder: Directory containing the joint-stream ``.npy`` files.
        dst_folder1: Output directory for the joint motion stream.
        dst_folder2: Output directory for the bone stream.
        dst_folder3: Output directory for the bone motion stream.
        batch_size: Number of files per progress group.
    """
    import gc

    def bone_vectors(joints, connections):
        # One vector per (a, b) pair, stacked on the joint axis.
        return np.stack([joints[:, b, :] - joints[:, a, :] for (a, b) in connections], axis=1)

    connection_groups = (
        POSE_CONNECTIONS_FILE_INDEX,
        LEFT_HAND_CONNECTIONS_FILE_INDEX,
        RIGHT_HAND_CONNECTIONS_FILE_INDEX,
    )

    all_files = [f for f in os.listdir(src_folder) if f.endswith(".npy")]
    total_files = len(all_files)
    total_batches = (total_files + batch_size - 1) // batch_size

    for i in range(0, total_files, batch_size):
        batch_files = all_files[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{total_batches} ({len(batch_files)} files)")

        for filename in tqdm(batch_files):
            try:
                joints = np.load(os.path.join(src_folder, filename))

                # 1. Joint motion stream
                np.save(os.path.join(dst_folder1, filename), joints[1:] - joints[:-1])

                # 2. Bone stream
                bones = np.concatenate(
                    [bone_vectors(joints, group) for group in connection_groups],
                    axis=1,
                )
                np.save(os.path.join(dst_folder2, filename), bones)

                # 3. Bone motion stream
                np.save(os.path.join(dst_folder3, filename), bones[1:] - bones[:-1])
            except Exception as e:
                # Best effort: report the file and carry on with the next one.
                print(f"Error processing {filename}: {e}")

        gc.collect()

print("Processing training data...")
process_in_batches(train_src, train_dst1, train_dst2, train_dst3)

print("Processing validation data...")
process_in_batches(val_src, val_dst1, val_dst2, val_dst3)

print("Data processing completed!")

## 5.3. Create x, y for Training and Validation

In [None]:
label2id = {action: idx for idx, action in enumerate(actions)}  # gesture name -> class index

def data_train(folder, path_csv):
    """Load every keypoint file in ``folder`` together with its one-hot label.

    Args:
        folder: ``Path`` of a directory with ``<video_id>.npy`` files.
        path_csv: CSV with at least the columns ``video_id`` and ``label``.

    Returns:
        ``(x, y)``: ``x`` is a float32 array stacking the loaded files in
        sorted path order, ``y`` the matching integer one-hot labels with
        ``len(actions)`` columns.

    Raises:
        KeyError: If a file's video id has no row with a label from
            ``actions`` in the CSV.
    """
    annotations = pd.read_csv(path_csv)
    annotations = annotations[annotations['label'].isin(actions)]
    # Build the id -> label table once instead of scanning the frame per file.
    # drop_duplicates keeps the first row of an id, like the former label[0].
    annotations = annotations.drop_duplicates('video_id')
    id2label = dict(zip(annotations['video_id'], annotations['label']))

    x = []
    y = []
    # Sorted so the sample order does not depend on the file system.
    for f in sorted(folder.iterdir()):
        if not (f.is_file() and f.suffix == ".npy"):
            continue

        video_id = int(f.stem)
        if video_id not in id2label:
            raise KeyError(f"No label from `actions` found for video {video_id} in {path_csv}")

        x.append(np.load(f.as_posix()))
        y.append(label2id[id2label[video_id]])

    y = to_categorical(y, num_classes=len(actions)).astype(int)
    return np.array(x, dtype=np.float32), np.array(y)

In [None]:
# Apply the same steps to train and val sets of joint motion, bone, and bone motion streams.

x_train, y_train = data_train(Path("/kaggle/input/msstnetdataset/data/joint_stream/train"),"/kaggle/input/20bn-jester/Train.csv")
x_val, y_val = data_train(Path("/kaggle/input/msstnetdataset/data/joint_stream/val"),"/kaggle/input/20bn-jester/Validation.csv")

In [None]:
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

# 6. Build Model

## 6.1. Setting Up Checkpoints and Libraries

In [None]:
import datetime
import glob
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Concatenate, Conv2D, Dense, InputLayer, Activation, GlobalAveragePooling2D, BatchNormalization, ReLU, AveragePooling2D, Layer

In [None]:
KAGGLE_WORKING = '/kaggle/working'  # Path for output files

checkpoint_dir = os.path.join(KAGGLE_WORKING, 'checkpoints')
os.makedirs(checkpoint_dir, exist_ok=True)

# Template filled with `epoch` and `val_categorical_acc` when a periodic
# checkpoint is saved. The format spec is ':.4f'; the former ': .4f' (space
# sign flag) put a blank into every file name.
checkpoint_path = os.path.join(checkpoint_dir, 'model_epoch_{epoch:03d}_acc_{val_categorical_acc:.4f}.keras')
best_model_path = os.path.join(checkpoint_dir, 'best_model.keras')
# One TensorBoard log directory per run, named after the start time.
log_dir = os.path.join(KAGGLE_WORKING, 'logs', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

In [None]:
def find_latest_checkpoint():
    """Return the periodic checkpoint with the highest epoch number.

    Looks for ``model_epoch_*.keras`` files in the global ``checkpoint_dir``.

    Returns:
        Path of the checkpoint with the largest epoch, or ``None`` when the
        directory holds no such file.
    """
    def epoch_of(path):
        # Parse the file name only, so a directory containing '_epoch_'
        # cannot confuse the split.
        return int(os.path.basename(path).split('_epoch_')[1].split('_')[0])

    checkpoint_files = glob.glob(os.path.join(checkpoint_dir, 'model_epoch_*.keras'))
    if not checkpoint_files:
        return None

    return sorted(checkpoint_files, key=epoch_of)[-1]

In [None]:
# Check for existing checkpoint to resume training
latest_checkpoint = find_latest_checkpoint()
initial_epoch = 0

In [None]:
if latest_checkpoint:
    print(f"Found checkpoint: {latest_checkpoint}")
    try:
        # Extract epoch number from checkpoint filename
        epoch_str = os.path.basename(latest_checkpoint).split('_epoch_')[1].split('_')[0]
        resumed_epoch = int(epoch_str)

        # Load the model
        # NOTE(review): a checkpoint containing the custom MSST_Layer presumably
        # needs custom_objects={'MSST_Layer': MSST_Layer}; that class is defined
        # in a later cell, so confirm that loading works here.
        model = load_model(latest_checkpoint)
        print("Model loaded successfully from checkpoint")

        # Only adopt the epoch once the model really is loaded.
        initial_epoch = resumed_epoch
        print(f"Resuming from epoch {initial_epoch}")
    except Exception as e:
        print(f"Error loading checkpoint: {e}")
        print("Creating new model instead")
        latest_checkpoint = None
        # A fresh model has to start from the first epoch.
        initial_epoch = 0

## 6.2. Define MSST (Multi-Scale Spatio-Temporal) Module

In [None]:
"""
MSST (Multi-Scale Spatio-Temporal) Module

Architecture:
- Input is fed into 5 parallel branches
- Each branch begins with a 1×1 convolution to reduce dimensions
- Branches have different convolutional patterns:
  * Branch 1: Simple 1×1 convolution (preserves spatial information)
  * Branch 2: 1×1 followed by a single 3×3 convolution (small receptive field)
  * Branch 3: 1×1 followed by separate 5×1 and 1×5 convolutions (medium receptive field)
  * Branch 4: 1×1 followed by separate 7×1 and 1×7 convolutions (larger receptive field)
  * Branch 5: 1×1 followed by separate 11×1 and 1×11 convolutions (largest receptive field)
- All branches are concatenated at the end to produce output feature maps

Each convolution is typically followed by batch normalization and ReLU activation. This design efficiently captures multi-scale spatial information
"""

class MSST_Layer(Layer):
    """Five parallel convolution branches whose outputs are concatenated.

    Every branch is a chain of Conv2D -> BatchNormalization -> ReLU groups
    with 'same' padding. ``stride`` is applied by the first convolution of
    each branch only; later convolutions use stride (1, 1).

    NOTE(review): branch 2 is built as 1x1 (``filter1`` filters) followed by
    one 3x3 (``filter2`` filters), while branches 3-5 use a 1x1, k x 1, 1 x k
    chain with a single filter count. Confirm this asymmetry is intended.
    """

    def __init__(self, stride, filter1, filter2, filter3, filter4, filter5, **kwargs):
        """Store the stride and the five filter counts; layers are created in build()."""
        super(MSST_Layer, self).__init__(**kwargs)
        self.stride = stride
        self.filters = [filter1, filter2, filter3, filter4, filter5]
        self.concat = Concatenate()

    def build(self, input_shape):
        """Create the layer lists of the five branches."""
        # Branch 1: a single 1x1 convolution.
        self.branch1 = self._make_branch([(1, 1)], self.filters[0], self.stride)
        # Branch 2: 1x1 with filter1 filters, then 3x3 with filter2 filters.
        self.branch2 = self._make_branch([(1, 1), (3, 3)], [self.filters[0], self.filters[1]], self.stride)
        # Branches 3-5: 1x1, k x 1, 1 x k with k = 5, 7, 11.
        self.branch3 = self._make_branch([(1, 1), (5, 1), (1, 5)], [self.filters[2]]*3, self.stride)
        self.branch4 = self._make_branch([(1, 1), (7, 1), (1, 7)], [self.filters[3]]*3, self.stride)
        self.branch5 = self._make_branch([(1, 1), (11, 1), (1, 11)], [self.filters[4]]*3, self.stride)
        super().build(input_shape)

    def _make_branch(self, kernel_sizes, filters, first_stride):
        """Return the layers of one branch as a flat list.

        ``filters`` may be a single int (used for every convolution) or one
        value per kernel size. Only the first convolution uses
        ``first_stride``.
        """
        layers = []
        if isinstance(filters, int):
            filters = [filters] * len(kernel_sizes)
        for i, (kernel, f) in enumerate(zip(kernel_sizes, filters)):
            stride = first_stride if i == 0 else (1, 1)
            layers.append(Conv2D(f, kernel_size=kernel, strides=stride, padding='same'))
            layers.append(BatchNormalization())
            layers.append(ReLU())
        return layers

    def _apply_branch(self, inputs, branch_layers, training):
        """Run ``inputs`` through ``branch_layers`` in order.

        The ``training`` flag is forwarded to BatchNormalization layers only.
        """
        x = inputs
        for layer in branch_layers:
            if isinstance(layer, BatchNormalization):
                x = layer(x, training=training)
            else:
                x = layer(x)
        return x

    def call(self, inputs, training=None):
        """Apply all five branches to ``inputs`` and concatenate their outputs."""
        b1 = self._apply_branch(inputs, self.branch1, training)
        b2 = self._apply_branch(inputs, self.branch2, training)
        b3 = self._apply_branch(inputs, self.branch3, training)
        b4 = self._apply_branch(inputs, self.branch4, training)
        b5 = self._apply_branch(inputs, self.branch5, training)
        return self.concat([b1, b2, b3, b4, b5])

    def get_config(self):
        """Return the base config extended with ``stride`` and ``filter1``..``filter5``."""
        config = super(MSST_Layer, self).get_config()
        keys = ['filter1', 'filter2', 'filter3', 'filter4', 'filter5']
        config.update({'stride': self.stride, **{k: v for k, v in zip(keys, self.filters)}})
        return config


In [None]:
# Architecture of the MSSTNet-based action recognition:
# Train 4 models on 4 datasets: joint stream, joint motion stream, bone stream, and bone motion stream.
# Then combine them as an ensemble by averaging the predicted class probabilities of the four models.
# The following section processes the joint stream dataset. Similar steps are applied to the other datasets.

input_shape = (37, 48, 3) 

## 6.3. Define MSSTNET

In [None]:
# Build a fresh network only when no checkpoint was loaded.
if not latest_checkpoint:
    print("Creating new model")
    model = Sequential()
    model.add(InputLayer(input_shape = input_shape))
    
    # Stage 1: three MSST layers, all with stride (1, 1)
    model.add(MSST_Layer(stride=(1,1), filter1=44, filter2=60, filter3=60, filter4=60,filter5=60))
    model.add(MSST_Layer(stride=(1,1), filter1=48, filter2=80, filter3=80, filter4=80,filter5=80))
    model.add(MSST_Layer(stride=(1,1), filter1=56, filter2=120, filter3=120, filter4=120,filter5=120))
    
    # Pooling with stride 2 on the second spatial axis only
    model.add(AveragePooling2D(pool_size=(3,3), strides=(1,2)))
    model.add(BatchNormalization())
    model.add(ReLU())
    
    # Stage 2: the second MSST layer uses stride 2 on the first spatial axis
    model.add(MSST_Layer(stride=(1,1), filter1=160, filter2=160, filter3=160, filter4=160,filter5=160))
    model.add(MSST_Layer(stride=(2,1), filter1=72, filter2=200, filter3=200, filter4=200,filter5=200))
    
    model.add(AveragePooling2D(pool_size=(3,3), strides=(1,2)))
    model.add(BatchNormalization())
    model.add(ReLU())
    
    # Stage 3
    model.add(MSST_Layer(stride=(1,1), filter1=240, filter2=240, filter3=240, filter4=240,filter5=240))
    model.add(MSST_Layer(stride=(1,1), filter1=320, filter2=320, filter3=320, filter4=320,filter5=320))
    
    # Classification head
    model.add(GlobalAveragePooling2D())
    model.add(BatchNormalization())
    model.add(ReLU())
    
    # 18 output units: one per entry of `actions`
    model.add(Dense(18, activation='softmax'))

In [None]:
# Compile model
# This cell runs whether `model` was just created or loaded from a checkpoint,
# so a loaded model is compiled again with a new Adam optimizer as well.

optimizer = Adam(learning_rate = 0.01)
model.compile(optimizer = optimizer,
             loss = 'categorical_crossentropy',
             metrics = ['categorical_accuracy']
             )

In [None]:
model.summary()

# 7. Training

In [None]:
# Custom callback to save model every 5 epochs
class PeriodicSaver(tf.keras.callbacks.Callback):
    """Save the full model every ``every`` epochs.

    Args:
        save_path: File name template with the placeholders ``{epoch}`` and
            ``{val_categorical_acc}``.
        every: Save interval in epochs.
    """

    def __init__(self, save_path, every=5):
        super().__init__()
        self.save_path = save_path
        self.every = every

    def on_epoch_end(self, epoch, logs=None):
        """Save the model when the 1-based epoch number is a multiple of ``every``."""
        if (epoch + 1) % self.every != 0:
            return

        # `logs` defaults to None; fall back to an empty dict so the lookup
        # below cannot fail.
        logs = logs or {}
        val_acc = logs.get('val_categorical_accuracy', 0)
        filename = self.save_path.format(epoch=epoch + 1, val_categorical_acc=val_acc)
        self.model.save(filename)
        print(f"Saved periodic model at epoch {epoch + 1} to {filename}")

# Callbacks for optimized training
# Periodic checkpoint every 5 epochs
periodic_saver = PeriodicSaver(save_path=checkpoint_path, every=5)

# Keeps only the model with the best validation accuracy
best_checkpoint = ModelCheckpoint(
    filepath=best_model_path,
    monitor='val_categorical_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max',
)

# Stops after 7 epochs without improvement and restores the best weights
early_stopping = EarlyStopping(
    monitor='val_categorical_accuracy',
    patience=7,
    restore_best_weights=True,
    verbose=1,
)

# Halves the learning rate after 4 epochs without improvement
lr_scheduler = ReduceLROnPlateau(
    monitor='val_categorical_accuracy',
    factor=0.5,
    patience=4,
    min_lr=1e-5,
    verbose=1,
)

# TensorBoard logging
tensorboard = TensorBoard(log_dir=log_dir)

callbacks = [periodic_saver, best_checkpoint, early_stopping, lr_scheduler, tensorboard]

In [None]:
epochs = 30       
batch_size = 32

In [None]:
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=epochs,
    initial_epoch=initial_epoch,
    batch_size=batch_size,
    callbacks=callbacks,
    verbose=1
)

In [None]:
# Plot training history: accuracy on the left, loss on the right
plt.figure(figsize=(12, 4))

# (train key, validation key, title, y-axis label) for each panel
panels = (
    ('categorical_accuracy', 'val_categorical_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
)

for position, (train_key, val_key, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label='Train')
    plt.plot(history.history[val_key], label='Validation')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()

# Write the figure to disk before displaying it
plt.savefig(os.path.join(KAGGLE_WORKING, 'training_history.png'))
plt.show()

# 8. Testing

In [None]:
from tensorflow import keras

def load_model(model_path):
    """Load a saved Keras model, passing ``MSST_Layer`` as a custom object."""
    custom = {'MSST_Layer': MSST_Layer}
    return keras.models.load_model(model_path, custom_objects=custom)

# Placeholder paths: point each one at the best model trained on that stream.
model_path1 = "path/to/best_model_for_joint_stream.keras"
model_path2 = "path/to/best_model_for_joint_motion_stream.keras"
model_path3 = "path/to/best_model_for_bone_stream.keras"
model_path4 = "path/to/best_model_for_bone_motion_stream.keras"

# Load models (one per stream, in the order of the paths above)
model1 = load_model(model_path1)
model2 = load_model(model_path2)
model3 = load_model(model_path3)
model4 = load_model(model_path4)

In [None]:
# Need output shape: (1, 37, 48, 3)
def output_process(frames, model):
    """Turn captured frames into the four model inputs.

    Args:
        frames: Sequence with at least ``sequence_length`` BGR frames; only
            the first ``sequence_length`` are used.
        model: MediaPipe model handed to ``mediapipe_detection`` (callers in
            this file pass a Holistic instance), not a Keras model.

    Returns:
        Four arrays, each with a leading batch axis of size 1:
        joint stream ``(1, T, 48, 3)``, joint motion stream
        ``(1, T-1, 48, 3)``, bone stream ``(1, T, 47, 3)`` and bone motion
        stream ``(1, T-1, 47, 3)``, where ``T`` is ``sequence_length``.
    """
    def bone_vectors(joints, connections):
        # One vector per (a, b) pair, stacked on the joint axis.
        return np.stack([joints[:, b, :] - joints[:, a, :] for (a, b) in connections], axis=1)

    keypoint_frames = []

    # Process each frame to extract keypoints
    for frame_num in range(0, sequence_length):
        frame = frames[frame_num]
        if frame is None:
            print("IO Error")
            # Keep the temporal length fixed: reuse the previous keypoints,
            # or zeros (the shape extract_keypoint returns) for a leading gap.
            filler = keypoint_frames[-1] if keypoint_frames else np.zeros((48, 3))
            keypoint_frames.append(filler)
            continue

        _, res = mediapipe_detection(frame, model)
        keypoint_frames.append(extract_keypoint(res))

    # 1. Joint stream
    sequence = np.array(keypoint_frames)  # Shape: (sequence_length, 48, 3)

    # 2. Joint motion stream
    sequence2 = sequence[1:] - sequence[:-1]  # Shape: (sequence_length - 1, 48, 3)

    # 3. Bone stream
    sequence3 = np.concatenate(
        [
            bone_vectors(sequence, POSE_CONNECTIONS_FILE_INDEX),
            bone_vectors(sequence, LEFT_HAND_CONNECTIONS_FILE_INDEX),
            bone_vectors(sequence, RIGHT_HAND_CONNECTIONS_FILE_INDEX),
        ],
        axis=1,
    )  # Shape: (sequence_length, 47, 3)

    # 4. Bone motion stream
    sequence4 = sequence3[1:] - sequence3[:-1]  # Shape: (sequence_length - 1, 47, 3)

    return (
        np.expand_dims(sequence, axis=0),
        np.expand_dims(sequence2, axis=0),
        np.expand_dims(sequence3, axis=0),
        np.expand_dims(sequence4, axis=0),
    )

In [None]:
# Initialize webcam
cap = cv2.VideoCapture(0)

# List to store captured frames
frames = []

# Flag to indicate whether we are currently recording frames
is_recording = False

print("Press 's' to start capturing, and 'q' to stop capturing.")

try:
    while True:
        # Capture a frame from the webcam
        ret, frame = cap.read()
        if not ret:
            break

        # If recording, save the current frame
        if is_recording:
            frames.append(frame)

        # Show the current frame
        cv2.imshow("Webcam", frame)

        # Wait for a key press
        key = cv2.waitKey(1) & 0xFF

        # Start recording when 's' is pressed
        if key == ord('s'):
            is_recording = True
        # Stop recording and exit when 'q' is pressed (ignored before recording)
        elif key == ord('q') and is_recording:
            break
finally:
    # Release the webcam and close all OpenCV windows, even after an error
    cap.release()
    cv2.destroyAllWindows()

if not frames:
    # Nothing was recorded (camera failure, or capture ended before 's').
    print("No frames were captured; skipping prediction.")
else:
    total_frames = len(frames)
    if total_frames > sequence_length:
        # Sample evenly over the whole recording so the end of the gesture
        # is not cut off.
        picks = np.linspace(0, total_frames - 1, sequence_length).round().astype(int)
        frames = [frames[i] for i in picks]
    else:
        # If not enough frames, duplicate the last frame
        frames = frames + [frames[-1]] * (sequence_length - total_frames)

    # x_test for each model
    with mp.solutions.holistic.Holistic(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5,
        model_complexity=1
    ) as holistic:
        sequence1, sequence2, sequence3, sequence4 = output_process(frames, holistic)

    def predict(model, sequence, name):
        """Print the prediction of one stream model and return its raw output."""
        y = model.predict(sequence)
        print(f"{name} model: {actions[np.argmax(y)]} with predicted index: {np.argmax(y)}")
        print("***")
        return y

    # Predict using each stream-specific model (each model runs only once)
    y1 = predict(model1, sequence1, "joint stream")
    y2 = predict(model2, sequence2, "joint motion stream")
    y3 = predict(model3, sequence3, "bone stream")
    y4 = predict(model4, sequence4, "bone motion stream")

    # Ensemble prediction: average the outputs of all models
    avg_percent = 1/4 * (y1 + y2 + y3 + y4)

    print(f"Assemble model: {actions[np.argmax(avg_percent)]} with predicted index: {np.argmax(avg_percent)}")
