Extracting all the keypoints from the training data

In [2]:
import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense, Dropout # type: ignore
from tensorflow.keras.utils import to_categorical # type: ignore

In [5]:
# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.7)

# Dataset directory (UPDATE THIS PATH)
DATASET_DIR = '/Users/js/Desktop/Sign Recognition Application/Sign_to_Sentence Project/Asl_Sign_Data/asl_alphabet_train/asl_alphabet_train'

# Initialize lists to store extracted data
landmark_data = []
labels = []

# Listing all folders (A-Z, space, delete, nothing)
class_labels = os.listdir(DATASET_DIR)

# Process each folder (each letter class)
for label in class_labels:
    folder_path = os.path.join(DATASET_DIR, label)
    
    if not os.path.isdir(folder_path):
        continue 

    # Process each image in the folder
    for file in os.listdir(folder_path):
        if file.endswith(".png") or file.endswith(".jpg"): 
            img_path = os.path.join(folder_path, file)

            
            image = cv2.imread(img_path)

            # Check if image is valid
            if image is None:
                print(f"❌ ERROR: Failed to load {file}")
                continue

            print(f"🔍 Processing: {file} | Class: {label}")

            # Convert image to RGB (MediaPipe requires RGB)
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Process image with MediaPipe
            results = hands.process(image_rgb)

            # If a hand is detected, extract landmarks
            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Extract landmark points (x, y, z) for 21 keypoints
                    landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]).flatten()

                    # Save data
                    landmark_data.append(landmarks)
                    labels.append(label)
                    print(f"📝 Saved: {file} -> {label}")

# Convert to DataFrame and Save
df = pd.DataFrame(landmark_data)
df["label"] = labels

# Check if data was collected
if len(df) == 0:
    print("❌ ERROR: No hand landmarks were saved. Check dataset format.")
else:
    df.to_csv("asl_mediapipe_keypoints_dataset.csv", index=False)
    print(f"✅ Dataset saved as 'asl_mediapipe_keypoints_dataset.csv' with {len(df)} samples")


I0000 00:00:1739387749.617455       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1 Pro


🔍 Processing: R2837.jpg | Class: R
📝 Saved: R2837.jpg -> R
🔍 Processing: R2189.jpg | Class: R
🔍 Processing: R1480.jpg | Class: R
📝 Saved: R1480.jpg -> R
🔍 Processing: R1494.jpg | Class: R
📝 Saved: R1494.jpg -> R
📝 Saved: R1494.jpg -> R
🔍 Processing: R2823.jpg | Class: R
📝 Saved: R2823.jpg -> R
🔍 Processing: R228.jpg | Class: R
📝 Saved: R228.jpg -> R
🔍 Processing: R200.jpg | Class: R
📝 Saved: R200.jpg -> R
🔍 Processing: R566.jpg | Class: R
📝 Saved: R566.jpg -> R
🔍 Processing: R572.jpg | Class: R
📝 Saved: R572.jpg -> R
🔍 Processing: R214.jpg | Class: R
📝 Saved: R214.jpg -> R
🔍 Processing: R1325.jpg | Class: R
🔍 Processing: R1443.jpg | Class: R
📝 Saved: R1443.jpg -> R
🔍 Processing: R599.jpg | Class: R
📝 Saved: R599.jpg -> R
🔍 Processing: R1457.jpg | Class: R
📝 Saved: R1457.jpg -> R
🔍 Processing: R1331.jpg | Class: R
📝 Saved: R1331.jpg -> R
🔍 Processing: R2638.jpg | Class: R
🔍 Processing: R1319.jpg | Class: R
📝 Saved: R1319.jpg -> R
🔍 Processing: R2610.jpg | Class: R
🔍 Processing: R2176.jp

Preprocessing the Mediapipe Keypoints file data

In [6]:
# Load dataset
df = pd.read_csv("asl_mediapipe_keypoints_dataset.csv")

# Separate features and labels
X = df.iloc[:, :-1].values 
y = df["label"].values 

# Encode labels as numbers
encoder = LabelEncoder()

# Convert labels to integer indices
y_encoded = encoder.fit_transform(y)  # Convert labels to integer indices

# One-hot encoding
y_categorical = to_categorical(y_encoded, num_classes=len(np.unique(y_encoded))) 

# Split dataset into training & testing
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

Creation of a Multi-Level-Perceptron Model

In [7]:
# Define MLP Model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])

# Compile the Model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


2025-02-13 01:26:53.258428: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-02-13 01:26:53.258493: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-02-13 01:26:53.258496: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-02-13 01:26:53.258883: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-13 01:26:53.259212: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Training the MLP Model

In [8]:
model.fit(X_train,
          y_train,
          epochs=10,
          batch_size=128,
          validation_split=0.2)

# Save Model
model.save("asl_mediapipe_mlp_model.h5")
print("✅ Model saved as 'asl_mediapipe_mlp_model.h5'")

Epoch 1/10


2025-02-13 01:35:40.149372: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2025-02-13 01:35:43.097976: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
✅ Model saved as 'asl_mediapipe_mlp_model.h5'


  saving_api.save_model(


Test Accuracy of the trained Model

In [9]:
model = tf.keras.models.load_model("asl_mediapipe_mlp_model.h5")

# Evaluate on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

 14/374 [>.............................] - ETA: 3s - loss: 0.1609 - accuracy: 0.9509

2025-02-13 01:36:47.088288: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Test Accuracy: 95.52%


Testing the Mediapipe Approach for Sign Recognition

In [3]:
# Load the trained MLP model
mlp_model = tf.keras.models.load_model("asl_mediapipe_mlp_model.h5")

# Load dataset to rebuild LabelEncoder
df = pd.read_csv("asl_mediapipe_keypoints_dataset.csv")
encoder = LabelEncoder()
encoder.fit(df["label"]) 

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(min_detection_confidence=0.7, min_tracking_confidence=0.7)

# Open webcam
cap = cv2.VideoCapture(0)

# Store the predicted sentence
predicted_sentence = ""

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Flip the frame horizontally for a mirrored effect
    frame = cv2.flip(frame, 1)

    # Convert frame to RGB
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)

    if results.multi_hand_landmarks:
        for hand_landmarks, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # Extract and normalize landmark coordinates
            landmarks = np.array([[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark])

            # Check if the hand is RIGHT-HANDED and mirror coordinates
            # This is done because the training data cointains only signs done by left hand
            if handedness.classification[0].label == "Right":
                landmarks[:, 0] = 1 - landmarks[:, 0]  # Flip x-coordinates for right hand

            # Flatten for model input
            input_data = landmarks.flatten().reshape(1, -1)

            # Make prediction using MLP model
            prediction = mlp_model.predict(input_data)
            predicted_class = np.argmax(prediction)  # Get class index
            predicted_label = encoder.inverse_transform([predicted_class])[0]  # Convert index to sign label

            # Append predicted letter to the sentence (Handle special cases)
            if predicted_label == "SPACE":
                predicted_sentence += " "
            elif predicted_label == "DELETE":
                predicted_sentence = predicted_sentence[:-1]  # Remove last character
            elif predicted_label == "NOTHING":
                pass  # Ignore 'nothing' class
            else:
                predicted_sentence += predicted_label

    # Create a black bar at the bottom
    bar_height = 60
    frame_height, frame_width, _ = frame.shape
    cv2.rectangle(frame, (0, frame_height - bar_height), (frame_width, frame_height), (0, 0, 0), -1)

    # Display the predicted sentence in the black bar
    cv2.putText(frame, predicted_sentence, (50, frame_height - 20),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

    # Display the frame
    cv2.imshow("Sign Prediction (MediaPipe MLP) - Left & Right Hand Support", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


2025-02-13 14:23:34.782391: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-02-13 14:23:34.782434: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-02-13 14:23:34.782443: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-02-13 14:23:34.782489: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-02-13 14:23:34.782511: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
I0000 00:00:1739436815.533058       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
2025-02-13



KeyboardInterrupt: 