# Pose-based Approaches

In [1]:
import numpy as np 
import pandas as pd 

In [2]:
import os
import glob
import numpy as np

data_path = "landmarks_mapped/landmarks_mapped"

# List all .npy files. glob returns matches in an arbitrary, filesystem-dependent
# order, so sort them: the sample order feeds the seeded train/test split later
# in the notebook and has to be stable for that split to be reproducible.
files = sorted(glob.glob(os.path.join(data_path, "*.npy")))

print("Total files found:", len(files))

if not files:
    # Fail here with a clear message instead of an IndexError on data[0] below.
    raise FileNotFoundError(f"No .npy files found in {data_path!r}")

data = []    # one landmark array per file
labels = []  # one label per file (a list: repeated labels are kept on purpose)

for file in files:
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects
    # stored in a file, so only load files from a trusted source. If the files
    # hold plain numeric arrays, allow_pickle=False would be the safer setting.
    arr = np.load(file, allow_pickle=True)
    data.append(arr)

    # The label is the filename text before the first "_" (or before the
    # first "." when the name has no underscore).
    filename = os.path.basename(file)
    label = filename.split("_")[0].split(".")[0]
    labels.append(label)

# Both label lines below report the per-sample label count, not the number of
# distinct classes.
print("Loaded samples:", len(data))
print("labels:", len(labels))
print("Number of labels:", len(labels))
print("First sample shape:", data[0].shape)



Total files found: 11980
Loaded samples: 11980
labels: 11980
Number of labels: 11980
First sample shape: (74, 438)


## Preprocessing Stage 

### Anchor-Based Normalization

In [None]:


# Cut every loaded video into its four landmark streams by slicing column
# ranges of the last (feature) axis.
# NOTE(review): each stream stays a flat (num_frames, num_features) array here
# (see the printed shapes), while anchor_normalize and augment_keypoints_list
# later index a frame as landmarks[point] / [..., :2]. Confirm whether a
# reshape to (num_frames, num_points, dims) is intended before those steps.
pose_videos = [video[:, 0:132] for video in data]
face_videos = [video[:, 132:312] for video in data]
lh_videos = [video[:, 312:375] for video in data]
rh_videos = [video[:, 375:438] for video in data]

# Check shapes of the first video
print("First video shapes:")
for stream_name, stream in (
    ("Pose:", pose_videos),
    ("Face:", face_videos),
    ("Left hand:", lh_videos),
    ("Right hand:", rh_videos),
):
    print(stream_name, stream[0].shape)




First video shapes:
Pose: (74, 132)
Face: (74, 180)
Left hand: (74, 63)
Right hand: (74, 63)


In [8]:
def hand_keypoints_reconstruction(hand_videos):
    """
    Fill in hand frames that contain only zeros (treated as "not detected").

    hand_videos: iterable of arrays shaped (num_frames, num_keypoints) or
                 (num_frames, num_keypoints, dims)
    Returns: list of reconstructed copies; the inputs are left untouched.

    Per video:
      * a video with no non-zero frame at all is returned as an unchanged copy;
      * an all-zero first / last frame is replaced by the mean of the
        non-zero frames;
      * every other all-zero frame is a distance-weighted blend of the
        nearest non-zero frame on each side.
    """

    def _is_blank(frame):
        # A frame counts as missing when every value in it is exactly zero.
        return np.all(frame == 0)

    restored = []

    for clip in hand_videos:
        clip = clip.copy()  # work on a copy so the caller's array is preserved
        total = clip.shape[0]

        # Reduce over everything except the frame axis to find detected frames.
        frame_axes = 1 if clip.ndim == 2 else (1, 2)
        detected = clip[np.any(clip != 0, axis=frame_axes)]

        if len(detected) == 0:
            restored.append(clip)
            continue

        mean_frame = np.mean(detected, axis=0)

        # Anchor both ends so interior gaps always have a neighbour to blend with.
        for edge in (0, -1):
            if _is_blank(clip[edge]):
                clip[edge] = mean_frame

        for idx in range(total):
            if not _is_blank(clip[idx]):
                continue

            # Distance back to the closest non-blank frame.
            back = 1
            while idx - back >= 0 and _is_blank(clip[idx - back]):
                back += 1
            # Distance forward to the closest non-blank frame.
            fwd = 1
            while idx + fwd < total and _is_blank(clip[idx + fwd]):
                fwd += 1

            has_before = idx - back >= 0
            has_after = idx + fwd < total

            if has_before and has_after:
                # The nearer neighbour gets the larger weight.
                clip[idx] = (fwd * clip[idx - back] + back * clip[idx + fwd]) / (back + fwd)
            elif has_before:
                clip[idx] = clip[idx - back]
            elif has_after:
                clip[idx] = clip[idx + fwd]
            else:
                clip[idx] = mean_frame

        restored.append(clip)

    return restored


In [9]:
# Fill undetected (all-zero) frames in both hand streams.
lh_reconstructed, rh_reconstructed = map(
    hand_keypoints_reconstruction, (lh_videos, rh_videos)
)

In [10]:

# -----------------------------
# Anchor-based normalization for a single frame
# -----------------------------
def anchor_normalize(landmarks, anchor_index=0, scale_points=None):
    """
    Shift one frame of landmarks so the anchor keypoint sits at the origin and,
    optionally, divide by the distance between two reference keypoints.

    landmarks: array-like frame, expected as (num_points, dims) so that
               landmarks[i] is one keypoint
    anchor_index: index of the keypoint that is moved to the origin
    scale_points: optional (p1, p2) pair of keypoint indices whose distance
                  becomes the unit length
    Returns: float32 array with the same shape as the input.
      * unchanged when the whole frame, or the anchor keypoint, is all zeros
        (zeros are treated as "not detected");
      * shifted but not scaled when a scale keypoint is all zeros in the
        input, when the reference distance is <= 1e-6, or when the frame is
        1-D (a flat vector has no per-point rows, so landmarks[p] would be a
        single coordinate and no point-to-point distance can be measured).
    """
    landmarks = np.asarray(landmarks, dtype=np.float32)

    # If frame has no detected keypoints -> return zeros
    if np.all(landmarks == 0):
        return landmarks

    # If anchor keypoint is missing -> skip normalization
    if np.all(landmarks[anchor_index] == 0):
        return landmarks

    # Shift keypoints so anchor = origin
    normalized = landmarks - landmarks[anchor_index]

    if scale_points is None or landmarks.ndim < 2:
        return normalized

    p1, p2 = scale_points

    # Test for missing scale keypoints on the RAW coordinates. After the shift
    # the anchor itself is always all-zero and a truly missing point is not, so
    # checking the shifted values both blocked scaling whenever a scale point
    # was the anchor and failed to notice missing points.
    if np.all(landmarks[p1] == 0) or np.all(landmarks[p2] == 0):
        return normalized

    scale = np.linalg.norm(normalized[p1] - normalized[p2])
    if scale > 1e-6:   # avoid division by zero
        normalized /= scale

    return normalized

# -----------------------------
# Normalize a single video (list or array of frames)
# -----------------------------
def normalize_video_list(video_frames, anchor_index=0, scale_points=None):
    """
    Run anchor_normalize on every frame of one video.

    video_frames: iterable of frames (array or list); each frame is handed to
                  anchor_normalize as-is
    anchor_index, scale_points: forwarded unchanged to anchor_normalize
    Returns: Python list holding one normalized frame per input frame
    """
    return [
        anchor_normalize(frame, anchor_index, scale_points)
        for frame in video_frames
    ]

# -----------------------------
# Normalize all streams (list of videos)
# -----------------------------
def normalize_all_streams_list(pose_list, lh_list, rh_list, face_list):
    """
    Anchor-normalize every video of the pose, left-hand, right-hand and face
    streams, always using keypoint 0 as the anchor.

    pose_list, lh_list, rh_list, face_list: lists of videos; the per-frame
        code indexes a frame as frame[point], i.e. it expects
        (num_frames_i, num_points, dims)
    Returns: (pose, lh, rh, face) lists; each video becomes a list of
        normalized frames.

    Only the pose stream requests scale normalization, using keypoints (0, 1).
    """

    def _normalize_stream(videos, scale_points=None):
        # Same anchor for every stream; only the scale reference varies.
        return [
            normalize_video_list(v, anchor_index=0, scale_points=scale_points)
            for v in videos
        ]

    pose_norm_list = _normalize_stream(pose_list, scale_points=(0, 1))
    lh_norm_list = _normalize_stream(lh_list)
    rh_norm_list = _normalize_stream(rh_list)
    face_norm_list = _normalize_stream(face_list)

    return pose_norm_list, lh_norm_list, rh_norm_list, face_norm_list


In [11]:
# Hands go in after reconstruction; pose and face are used as split.
pose_norm, lh_norm, rh_norm, face_norm = normalize_all_streams_list(
    pose_list=pose_videos,
    lh_list=lh_reconstructed,
    rh_list=rh_reconstructed,
    face_list=face_videos,
)


### Data Augmentation

In [19]:
import numpy as np

def augment_keypoints_list(videos, max_angle_deg=13, noise_std=1e-3, rng=None):
    """
    Apply data augmentation: random rotation + Gaussian noise.

    videos: list of videos OR a NumPy array
            (num_videos, num_frames, num_points, dims)
    max_angle_deg: one rotation angle per video is drawn uniformly from
                   [-max_angle_deg, +max_angle_deg] degrees
    noise_std: standard deviation of the zero-mean Gaussian noise added to
               every value (including values that were exactly zero)
    rng: None keeps the previous behaviour and draws from NumPy's global
         random state (controllable with np.random.seed). An int seed or a
         np.random.Generator makes the augmentation reproducible without
         touching global state.
    Returns: list of augmented float32 videos; the inputs are not modified.

    The rotation acts on the first two entries of the LAST axis, so x and y
    must be the leading values of that axis, e.g. (num_frames, num_points,
    dims). For a flat (num_frames, num_features) video only the first two
    feature columns are rotated.
    """
    # np.random (module) and Generator both expose uniform() and normal().
    sampler = np.random if rng is None else np.random.default_rng(rng)

    # If input is a NumPy array (num_videos, num_frames, num_points, dims), convert to list
    if isinstance(videos, np.ndarray) and videos.ndim == 4:
        videos = [videos[i] for i in range(videos.shape[0])]

    augmented_videos = []

    for keypoints in videos:
        # np.array always copies, so the caller's data stays untouched
        keypoints_aug = np.array(keypoints, dtype=np.float32)

        # 1. Random rotation (x-y plane), one angle for the whole video
        angle = np.deg2rad(sampler.uniform(-max_angle_deg, max_angle_deg))
        cos, sin = np.cos(angle), np.sin(angle)
        rotation_matrix = np.array([[cos, -sin],
                                    [sin,  cos]])
        if keypoints_aug.shape[-1] >= 2:
            keypoints_aug[..., :2] = keypoints_aug[..., :2] @ rotation_matrix.T

        # 2. Add Gaussian noise
        keypoints_aug += sampler.normal(0, noise_std, keypoints_aug.shape)

        # Store augmented video
        augmented_videos.append(keypoints_aug)

    return augmented_videos



In [20]:

# Augment pose and both hand streams; the face stream is used un-augmented.
# NOTE(review): every call draws its own random angle per video, so the pose
# and hand streams of one sample are rotated by different angles. Confirm
# that is intended.
# NOTE(review): this runs before the train/test split further down, so the
# held-out samples are augmented as well.
pose_aug, lh_aug, rh_aug = (
    augment_keypoints_list(stream) for stream in (pose_norm, lh_norm, rh_norm)
)

### Prepare Inputs for the Model

In [25]:
# Pool every stream's videos only to find the longest one (in frames).
all_videos = [*pose_aug, *lh_aug, *rh_aug, *face_norm]
max_frames = max(map(len, all_videos))

print("Maximum frames:", max_frames)

Maximum frames: 233


In [24]:
# Raw string labels as a NumPy array, one entry per loaded sample.
Y = np.array(labels)

In [27]:
def pad_videos_list(videos, max_frames):
    """
    Bring a list of videos to exactly max_frames frames each.

    videos: list of np.arrays, each (num_frames, num_points, dims) OR
            (num_frames, feature_dim); all videos must share the same
            per-frame shape
    max_frames: int, target number of frames
    Returns: float32 np.array of shape
             (num_videos, max_frames, *video_shape[1:])

    Shorter videos are zero-padded at the end. Longer videos are truncated to
    their first max_frames frames so the documented output shape always holds.
    An empty input returns an empty float32 array of shape (0,).
    Raises: ValueError if a video is not 2-D / 3-D or if the per-frame shapes
            of the videos differ.
    """
    videos = list(videos)
    if not videos:
        return np.array([], dtype=np.float32)

    first = np.asarray(videos[0], dtype=np.float32)
    if first.ndim not in (2, 3):
        raise ValueError(f"Unsupported video shape: {first.shape}")
    frame_shape = first.shape[1:]

    # Allocate the zero-filled result once and copy each video into its slot;
    # this avoids holding a padded list AND the stacked array at the same time.
    padded = np.zeros((len(videos), max_frames) + frame_shape, dtype=np.float32)

    for i, video in enumerate(videos):
        video = np.asarray(video, dtype=np.float32)
        if video.ndim not in (2, 3):
            raise ValueError(f"Unsupported video shape: {video.shape}")
        if video.shape[1:] != frame_shape:
            raise ValueError(
                f"Video {i} has frame shape {video.shape[1:]}, expected {frame_shape}"
            )
        keep = min(video.shape[0], max_frames)
        padded[i, :keep] = video[:keep]

    return padded



In [28]:
# Pad every stream to the shared max_frames length (face is the normalized,
# un-augmented stream).
pose_input, lh_input, rh_input, face_input = [
    pad_videos_list(stream, max_frames)
    for stream in (pose_aug, lh_aug, rh_aug, face_norm)
]

In [29]:
# Sanity check: show the shape of the padded pose stream.
pose_input.shape 

(11980, 233, 132)

In [30]:
# Join the padded streams along the feature axis. The resulting column order
# is pose, left hand, right hand, face, which differs from the loaded files,
# where the face columns come directly after the pose columns.
X = np.concatenate(
    (pose_input, lh_input, rh_input, face_input),
    axis=-1,
)
print("Final input shape:", X.shape)  # (num_videos, max_frames, total_features)


Final input shape: (11980, 233, 438)


In [32]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the per-sample string labels; `le` is kept so the
# integer ids can be mapped back to label names later.
le = LabelEncoder()
y = le.fit_transform(labels)  # convert string labels to integers
print("Label shape:", y.shape)


Label shape: (11980,)


In [36]:
from sklearn.model_selection import train_test_split

# `y` holds the integer class ids produced by the LabelEncoder cell above.
# NOTE(review): X was built from augmented streams before this split, so the
# test portion contains augmented samples too. Confirm that is intended.

# Split data: 80% train, 20% test
# random_state fixes the shuffle; stratify=y passes the labels so the split is
# stratified by class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)


Train shape: (9584, 233, 438) (9584,)
Test shape: (2396, 233, 438) (2396,)


In [39]:


# Save features
np.save("X_train.npy", X_train)
np.save("X_test.npy", X_test)

# Save labels (integer class ids produced by the LabelEncoder `le`)
np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)

# Save the id -> label-name lookup as well. The y arrays only hold integer
# ids, so without the encoder's class list a later notebook cannot turn
# predictions back into label names. classes_[i] is the label for id i.
np.save("label_classes.npy", le.classes_)

print("Data saved successfully!")


Data saved successfully!
