In [None]:
# libraries
import torch
import cv2
import mediapipe as mp
import time
import os
# for loading latest checkpoint
import glob
# for loading gesture recognition model
import pickle
import numpy as np

In [None]:
# Get latest checkpoint file

def find_latest_checkpoint(directory, tag=None):
    """Return the most recently modified gesture-recognition checkpoint.

    Args:
        directory: Folder to search. It is treated as a literal path, so glob
            metacharacters in it (``*``, ``?``, ``[``) are escaped.
        tag: Optional suffix selecting ``model_gesture_recog_{tag}.pth``. It is
            placed into the glob pattern unchanged. When falsy, every
            ``model_gesture_recog_*.pth`` file in ``directory`` is considered.

    Returns:
        The matching path with the newest modification time.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    if tag:
        filename = f"model_gesture_recog_{tag}.pth"
    else:
        # check for all the checkpoint files
        filename = "model_gesture_recog_*.pth"

    # Escape only the directory part: a folder such as "runs[1]" would otherwise
    # be parsed as a character class and silently match nothing.
    pattern = f"{glob.escape(str(directory))}/{filename}"

    checkpoint_files = glob.glob(pattern)
    if not checkpoint_files:
        raise FileNotFoundError("No checkpoints found.")

    # Only the newest file is needed, so pick the max by modification time
    # instead of sorting the whole list.
    return max(checkpoint_files, key=os.path.getmtime)
    
        
# Example of a checkpoint path (kept for reference):
# checkpoint_path = './checkpoints/model_gesture_recog_20241203_164311'
# The checkpoint folder is given relative to the current working directory.
directory = '../checkpoints'
# Newest checkpoint by modification time; raises FileNotFoundError if none exist.
# NOTE(review): the cells visible here load the model from a pickle and do not
# read `latest_ckpt` again -- confirm whether this lookup is still needed.
latest_ckpt = find_latest_checkpoint(directory)
print(f"latest checkpoint: {latest_ckpt}")


latest checkpoint: ../checkpoints/model_gesture_recog_20241203_164311.pth


In [8]:
# Choose the compute backend: Apple MPS first, then NVIDIA CUDA, else the CPU.
_DEVICE_BANNERS = {
    "mps": "Using device: MPS (Metal Performance Shaders)",
    "cuda": "Using device: CUDA",
    "cpu": "Using device: CPU",
}

# The conditional expression short-circuits, so CUDA is only probed when MPS
# is unavailable.
_backend = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

device = torch.device(_backend)
print(_DEVICE_BANNERS[_backend])

Using device: MPS (Metal Performance Shaders)


In [19]:
# Load the pickled gesture-recognition model and prepare it for inference.
import sys

# Make the parent directory importable so `training.ffn` resolves. The guard
# keeps re-runs of this cell from appending the same entry again and again.
_project_root = os.path.abspath("..")
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Presumably imported so pickle can resolve the model class while loading;
# the name itself is not referenced below -- confirm before removing.
from training.ffn import GestureNet

pickle_path = "../checkpoints/pickles/model_gesture_recog.pkl"
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by a trusted training run.
with open(pickle_path, "rb") as f:
    gest_recog_model = pickle.load(f)

gest_recog_model = gest_recog_model.to(device)
# Switch to evaluation mode; as the last expression of the cell, the returned
# module is also displayed as the cell output.
gest_recog_model.eval()


GestureNet(
  (network_stack): Sequential(
    (0): Linear(in_features=63, out_features=128, bias=True)
    (1): ReLU()
    (2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=128, out_features=128, bias=True)
    (5): ReLU()
    (6): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=7, bias=True)
  )
)

In [15]:
# Set up the MediaPipe hand-landmark detector and its drawing helpers.
_HANDS_OPTIONS = {
    "static_image_mode": False,
    "max_num_hands": 2,
    "min_detection_confidence": 0.5,
    "min_tracking_confidence": 0.5,
}

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(**_HANDS_OPTIONS)

mp_draw = mp.solutions.drawing_utils


I0000 00:00:1733264392.656997  515167 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M2


W0000 00:00:1733264392.733980  559909 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1733264392.743277  559909 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [16]:
# Open video capture device index 0; the capture loop reads frames from `cap`
# and releases it when it finishes.
cap = cv2.VideoCapture(0)


In [21]:
# Real-time loop: read frames, detect hand landmarks, classify each detected
# hand, and show the frame until 'q' is pressed or the stream ends.
# The try/finally guarantees the camera, the OpenCV window and the MediaPipe
# graph are released even if the loop raises or the kernel is interrupted.
try:
    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            # No frame was delivered; stop instead of processing stale data.
            break

        # Convert frame to RGB before handing it to MediaPipe
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Process frame with MediaPipe
        result = hands.process(rgb_frame)

        # `multi_hand_landmarks` is falsy when no hand was detected.
        for hand_landmarks in result.multi_hand_landmarks or ():
            # Flatten the (x, y, z) coordinates of every landmark into a single
            # feature vector, in landmark order.
            features = [
                coord
                for lm in hand_landmarks.landmark
                for coord in (lm.x, lm.y, lm.z)
            ]

            # Add the batch dimension the model expects
            input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

            # Make predictions with the loaded model (no gradients needed)
            with torch.no_grad():
                output = gest_recog_model(input_tensor.to(device))
                predicted_class = torch.argmax(output, dim=1).item()

            # Display Prediction
            print(f"Predicted Gesture: {predicted_class}")

        # Show the frame
        cv2.imshow("MediaPipe Hands", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # After this cleanup, the cells creating `cap` and `hands` must be re-run
    # before this loop can be started again.
    cap.release()
    cv2.destroyAllWindows()
    hands.close()

Predicted Gesture: 1
Predicted Gesture: 1
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 0
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 4
Predicted Gesture: 1
Predicted Gesture: 4
Predicted Gesture: 1
Predicted Gesture: 1
Predicted Gesture: 4
Predicted Gesture: 1
Predicted Gesture: 1
Predicted Gesture: 4
Predicted Gesture: 1
Predicted Gesture: 1
Predicted Gesture: 1
Predicted Gesture: 1
Predicted Gesture: 4
Predicted Gesture: 1
Predicted Gesture: 1
Predicted Ges