## Imports


In [1]:
import os
import cv2
import numpy as np
import mediapipe as mp
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import joblib


## Hand Landmark Extraction Using MediaPipe

This section extracts hand landmarks from images using **MediaPipe Hands** and saves the processed data as a CSV file. It performs the following steps:

- Loads and processes images from a specified dataset directory.
- Uses **MediaPipe Hands** to detect hand landmarks.
- Extracts the (x, y) coordinates of each of the 21 landmarks, as reported by MediaPipe (no additional rescaling is applied).
- Saves the processed data into a structured DataFrame.
- Outputs the data as a CSV file for further training.

In [None]:
# Initialize MediaPipe Hands
# This module-level detector is the one `extract_landmarks` calls for every image.
# static_image_mode=True: per the MediaPipe Hands documentation, each input is
# treated as an unrelated still image rather than a frame of a video stream.
# max_num_hands=1 limits detection to one hand per image;
# min_detection_confidence=0.5 is the detection confidence threshold.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, 
                      max_num_hands=1,
                      min_detection_confidence=0.5)


def extract_landmarks(image_path):
    """Detect a hand in an image file and return its landmark coordinates.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.

    Returns:
        A flat list ``[x0, y0, x1, y1, ...]`` containing the ``x`` and ``y``
        attributes of every landmark of the first detected hand, or ``None``
        when the image cannot be read or no hand is detected.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Could not read image: {image_path}")
        return None

    # OpenCV loads images as BGR; convert to RGB before handing to MediaPipe.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)

    # No hand detected in this image.
    if not results.multi_hand_landmarks:
        return None

    # Only the first detected hand is used.
    hand_landmarks = results.multi_hand_landmarks[0]

    # The coordinates are stored exactly as MediaPipe reports them; no
    # rescaling by the image width/height is performed here.
    return [
        coordinate
        for landmark in hand_landmarks.landmark
        for coordinate in (landmark.x, landmark.y)
    ]

def process_dataset(base_dir, max_images=100):
    """Build a landmark DataFrame from a directory of per-gesture image folders.

    Every sub-directory of ``base_dir`` is treated as one gesture class whose
    name becomes the label. Images for which ``extract_landmarks`` returns
    ``None`` are skipped and do not count towards ``max_images``.

    Args:
        base_dir: Directory containing one sub-directory per gesture.
        max_images: Maximum number of successfully processed images per
            gesture. The default of 100 keeps processing time short while
            experimenting; pass ``None`` to process every image.

    Returns:
        A ``pandas.DataFrame`` with columns ``x0, y0, ..., x20, y20`` plus a
        ``gesture`` column holding the folder name of each sample.
    """
    data = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes both the
    # row order and the subset kept under ``max_images`` reproducible.
    for gesture_folder in sorted(os.listdir(base_dir)):
        gesture_path = os.path.join(base_dir, gesture_folder)
        if not os.path.isdir(gesture_path):
            continue

        counter = 0
        for image_file in sorted(os.listdir(gesture_path)):
            if max_images is not None and counter >= max_images:
                break

            landmarks = extract_landmarks(os.path.join(gesture_path, image_file))
            if landmarks is None:
                continue

            data.append(landmarks)
            labels.append(gesture_folder)
            counter += 1

        print(f"Processed images for gesture: {gesture_folder}")

    # Column names follow the flat [x0, y0, x1, y1, ...] layout of each row.
    landmark_names = [f"{axis}{i}" for i in range(21) for axis in ("x", "y")]

    df = pd.DataFrame(data, columns=landmark_names)
    df['gesture'] = labels

    return df

# Root folder of the training images; process_dataset treats each
# sub-directory as one gesture class.
dataset = "images/asl_alphabet_train"
landmarks_df = process_dataset(dataset)
# Persist the extracted landmarks (without the DataFrame index) so later
# steps can reload them from the CSV.
landmarks_df.to_csv("hand_landmarks_dataset.csv", index=False)


## Convert dataframe to trainable format

The `convert_to_trainable_format` function processes a dataset containing hand gesture data and prepares it for machine learning training. It performs the following steps:

- Handles missing values by dropping them if any exist.
- Encodes categorical labels (gesture names) into numerical values.
- Normalizes feature values using `StandardScaler`.
- Splits the dataset into training and testing sets.

In [None]:
def convert_to_trainable_format(df):
    """Turn the landmark DataFrame into scaled train/test arrays.

    Rows with missing values are dropped (with a warning), the ``gesture``
    column is label-encoded, the data is split 80/20 with stratification, and
    the features are standardized.

    Args:
        df: DataFrame with a ``gesture`` label column; every other column is
            used as a feature.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, label_encoder, scaler)``
        where the ``X_*`` arrays are standardized features, the ``y_*`` arrays
        are encoded labels, and ``label_encoder`` / ``scaler`` are the fitted
        preprocessing objects.
    """
    if df.isnull().values.any():
        print("Warning: Dataset contains missing values")
        df = df.dropna()

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df['gesture'])

    X = df.drop('gesture', axis=1).values

    # Split BEFORE scaling so the held-out samples never influence the
    # scaler's mean/variance (avoids train/test data leakage).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training split only, then apply the same
    # transformation to the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print(f"Training set size: {X_train.shape[0]}")
    print(f"Testing set size: {X_test.shape[0]}")
    print(f"Number of gestures: {len(label_encoder.classes_)}")
    print(f"Gestures: {label_encoder.classes_}")

    return X_train, X_test, y_train, y_test, label_encoder, scaler

# Load the landmark CSV from disk and prepare the train/test arrays together
# with the fitted label encoder and scaler.
csv_path = "hand_landmarks_dataset.csv"
df = pd.read_csv(csv_path)
X_train, X_test, y_train, y_test, label_encoder, scaler = convert_to_trainable_format(df)

## Training

In [None]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test, label_encoder):
    """Fit several classifiers, report their test performance and save them.

    Each candidate is trained on the training split, scored on the test split
    (accuracy plus a per-class classification report) and dumped to
    ``gesture_recognition_<name>.pkl`` with joblib.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Encoded training labels.
        y_test: Encoded test labels.
        label_encoder: Fitted encoder whose ``classes_`` supply the class
            names shown in the classification report.

    Returns:
        The fitted estimator that achieved the highest test accuracy.
    """
    candidates = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM': SVC(kernel='rbf', C=10, gamma='scale', probability=True, random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42),
    }

    scores = {}

    for label, estimator in candidates.items():
        print(f"\nTraining {label}...")
        estimator.fit(X_train, y_train)

        # Score the fitted estimator on the held-out split.
        predictions = estimator.predict(X_test)
        score = accuracy_score(y_test, predictions)
        scores[label] = score

        print(f"{label} Accuracy: {score:.4f}")
        print("\nClassification Report:")
        report = classification_report(
            y_test, predictions, target_names=label_encoder.classes_
        )
        print(report)

        # Persist every candidate, e.g. "Neural Network" -> "..._neural_network.pkl".
        slug = label.replace(' ', '_').lower()
        joblib.dump(estimator, f"gesture_recognition_{slug}.pkl")

    # Pick the candidate with the highest accuracy (first one wins on ties).
    winner = max(scores, key=scores.get)
    print(f"\nBest model: {winner} with accuracy {scores[winner]:.4f}")

    return candidates[winner]

# Train all candidate models; the function also writes each one to its own
# .pkl file and returns the one with the highest test accuracy.
best_model = train_and_evaluate_models(X_train, X_test, y_train, y_test, label_encoder)

# Save preprocessing tools for later use
# (inference must apply the same scaling and label mapping as training).
joblib.dump(scaler, "gesture_recognition_scaler.pkl")
joblib.dump(label_encoder, "gesture_recognition_label_encoder.pkl")

In [None]:
# Load the persisted artifacts for live inference.
# NOTE(review): the neural-network model is loaded unconditionally, regardless
# of which model train_and_evaluate_models reported as best.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
model = joblib.load("gesture_recognition_neural_network.pkl")  
scaler = joblib.load("gesture_recognition_scaler.pkl")
label_encoder = joblib.load("gesture_recognition_label_encoder.pkl")

# Initialize MediaPipe Hands
# This rebinds the module-level `hands` with static_image_mode=False (video
# stream mode per the MediaPipe documentation) and a tracking confidence
# threshold of 0.8; at most one hand is reported per frame.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, 
                       max_num_hands=1,
                       min_detection_confidence=0.5,
                       min_tracking_confidence=0.8)
mp_drawing = mp.solutions.drawing_utils

def recognize_gesture_from_camera():
    """Run live gesture recognition on the default camera until ESC is pressed.

    Frames are read from camera index 0, mirrored, and passed to the
    module-level MediaPipe ``hands`` detector. For each detected hand the
    landmarks are drawn on the frame, scaled with the module-level ``scaler``,
    classified with ``model`` and the decoded gesture name is overlaid on the
    preview window.

    The loop ends when ESC is pressed, when the capture device closes, or
    after too many consecutive failed frame reads. The camera and the preview
    window are always released on exit, even if an exception occurs.
    """
    # Give up after this many failed reads in a row instead of spinning
    # forever when the camera stops delivering frames.
    max_failed_reads = 30

    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Could not open camera.")

    failed_reads = 0
    try:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                print("Ignoring empty camera frame.")
                failed_reads += 1
                if failed_reads >= max_failed_reads:
                    print("Camera stopped delivering frames; exiting.")
                    break
                continue
            failed_reads = 0

            # Mirror the frame so the preview behaves like a mirror.
            image = cv2.flip(image, 1)
            # Convert from BGR (OpenCV) to RGB (MediaPipe).
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Process the image and detect hands
            results = hands.process(image_rgb)

            if results.multi_hand_landmarks:
                for hand_landmarks in results.multi_hand_landmarks:
                    # Draw hand landmarks on the preview frame.
                    mp_drawing.draw_landmarks(
                        image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                    # Same flat [x0, y0, x1, y1, ...] layout as the training data.
                    landmarks = [
                        coordinate
                        for landmark in hand_landmarks.landmark
                        for coordinate in (landmark.x, landmark.y)
                    ]

                    # One sample of shape (1, n_features) for scaler/model.
                    landmarks_array = np.array(landmarks).reshape(1, -1)
                    landmarks_scaled = scaler.transform(landmarks_array)

                    prediction = model.predict(landmarks_scaled)[0]
                    gesture_name = label_encoder.inverse_transform([prediction])[0]

                    # Display prediction
                    cv2.putText(image, f"Gesture: {gesture_name}", (10, 50),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Display the image
            cv2.imshow('Hand Gesture Recognition', image)
            if cv2.waitKey(5) & 0xFF == 27:  # ESC key to exit
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Start the live recognition loop; press ESC in the preview window to exit.
recognize_gesture_from_camera()

I0000 00:00:1742201245.178661  323478 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1742201245.181581  358762 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.3.4-arch1.1), renderer: AMD Radeon Graphics (radeonsi, renoir, LLVM 19.1.7, DRM 3.59, 6.13.1-arch2-1)
W0000 00:00:1742201245.202229  358748 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1742201245.223174  358754 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
