In [None]:
import mediapipe as mp
import cv2
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 

In [None]:
# Root folder of the training images; it is scanned for one sub-folder per
# class label, each holding that class's image files.
DATA_DIR = "../data/asl_alphabet_train/asl_alphabet_train"

# Destination path for the extracted landmark features (CSV).
OUTPUT_FILE = "../data/asl_landmarks_train.csv"

In [None]:
# Short aliases for the MediaPipe sub-modules used in this notebook.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# One shared hand detector, reused for every image in the dataset.
hands_model = mp_hands.Hands(
    # Inputs are independent still photos, not consecutive video frames.
    static_image_mode=True,
    # Report at most one hand per image.
    max_num_hands=1,
    # Minimum detection confidence passed to MediaPipe (see the MediaPipe
    # Hands documentation for the exact semantics of this threshold).
    min_detection_confidence=0.5,
)

I0000 00:00:1763191963.424189 10285612 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 90.5), renderer: Apple M3


In [None]:
# Build two parallel lists: image_paths[i] is a file on disk and labels[i] is
# the name of the class folder it was found in.

# Extensions are matched against the lower-cased file name so that files such
# as `A1.JPG` are not silently dropped.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

image_paths = []
labels = []

# Both directory levels are sorted: os.listdir() returns entries in arbitrary
# order, and everything built from `image_paths` inherits its order, so an
# unsorted listing would make the results differ between machines and runs.
for label in sorted(os.listdir(DATA_DIR)):
    class_dir = os.path.join(DATA_DIR, label)

    # Skip stray files sitting next to the class folders.
    if not os.path.isdir(class_dir):
        continue

    for image_file in sorted(os.listdir(class_dir)):
        if image_file.lower().endswith(IMAGE_EXTENSIONS):
            image_paths.append(os.path.join(class_dir, image_file))
            labels.append(label)

print(f"Found {len(image_paths)} images belonging to {len(set(labels))} classes.")

Found 87000 images belonging to 29 classes.


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1763191963.451303 10285754 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1763191963.458626 10285754 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [8]:
# Landmark extraction: run the hand detector on every image and turn each
# detected hand into one flat feature row.
#
# Every image contributes exactly one row to `processed_data` (real features
# or an all-NaN placeholder), so the list stays index-aligned with
# `image_paths`. Failures are recorded in `skipped_images` as
# (index, path, reason) tuples and can be filtered out later.

NUM_LANDMARKS = 21        # landmarks per hand (per the MediaPipe Hands docs)
COORDS_PER_LANDMARK = 3   # x, y, z
NUM_FEATURES = NUM_LANDMARKS * COORDS_PER_LANDMARK  # 63

LOAD_FAILURE = "Failed to load"
DETECTION_FAILURE = "MediaPipe failed to detect hand"


def _landmarks_to_row(hand_landmarks):
    """Flatten one detected hand into wrist-relative [x, y, z, ...] features.

    The wrist (landmark 0) is subtracted from every landmark, including the
    wrist itself, so the features do not depend on where the hand sits in the
    frame. As a consequence the first three values of the row are always 0.0.

    Args:
        hand_landmarks: One entry of `results.multi_hand_landmarks`; its
            `.landmark` sequence is iterated and each item's x/y/z is read.

    Returns:
        A flat list holding three floats per landmark, in landmark order.
    """
    wrist = hand_landmarks.landmark[0]
    row = []
    for landmark in hand_landmarks.landmark:
        # NOTE(review): MediaPipe documents z as depth relative to the wrist,
        # so the z subtraction is presumably close to a no-op - confirm.
        row.extend([
            landmark.x - wrist.x,
            landmark.y - wrist.y,
            landmark.z - wrist.z,
        ])
    return row


def _percent_of(count, total):
    """Return `count` as a percentage of `total`, or 0.0 when `total` is 0."""
    return count / total * 100 if total else 0.0


processed_data = []
skipped_images = []
successful_count = 0

# tqdm wraps the iterable to display a progress bar.
for idx, file_path in enumerate(tqdm(image_paths)):

    # 1. Read the image with OpenCV; a None result means it could not be loaded.
    image = cv2.imread(file_path)
    if image is None:
        processed_data.append([np.nan] * NUM_FEATURES)
        skipped_images.append((idx, file_path, LOAD_FAILURE))
        continue

    # 2. Convert from BGR (OpenCV's default) to RGB (MediaPipe's requirement).
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # 3. Run the hand detector.
    results = hands_model.process(image_rgb)

    # 4. No hand found: keep a NaN placeholder so row i still maps to image i.
    if not results.multi_hand_landmarks:
        processed_data.append([np.nan] * NUM_FEATURES)
        skipped_images.append((idx, file_path, DETECTION_FAILURE))
        continue

    # 5. Only the first hand is used (the detector was built with max_num_hands=1).
    processed_data.append(_landmarks_to_row(results.multi_hand_landmarks[0]))
    successful_count += 1

# Invariant relied on downstream: exactly one row per input image.
assert len(processed_data) == len(image_paths), "row/image count mismatch"

total_images = len(image_paths)
load_failures = sum(1 for _, _, reason in skipped_images if reason == LOAD_FAILURE)
detection_failures = len(skipped_images) - load_failures

# The first line reports rows produced (placeholders included); the lines
# below break that total down into successes and each kind of failure.
print(f"\nProcessed {len(processed_data)} images.")
print(f"   Successfully processed: {successful_count}/{total_images} images ({_percent_of(successful_count, total_images):.2f}%)")
print(f"   Detection failures: {detection_failures}/{total_images} images ({_percent_of(detection_failures, total_images):.2f}%)")
print(f"   Load failures: {load_failures}/{total_images} images ({_percent_of(load_failures, total_images):.2f}%)")

  0%|          | 0/87000 [00:00<?, ?it/s]


Successfully processed 87000 images.
   Successfully processed: 63676/87000 images (73.19%)
   Detection failures: 23324/87000 images (26.81%)
