In [1]:
import os
import cv2
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Set path to your dataset
TRAIN_DIR = "train"  # Adjust if your path is different
IMG_SIZE = 64

X = []
y = []

# Loop through each gesture folder inside 'train'
for gesture_folder in tqdm(os.listdir(TRAIN_DIR), desc="Processing gestures"):
    gesture_path = os.path.join(TRAIN_DIR, gesture_folder)

    if not os.path.isdir(gesture_path):
        continue

    for image_file in os.listdir(gesture_path):
        if not image_file.lower().endswith(".jpg"):
            continue

        image_path = os.path.join(gesture_path, image_file)
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        if img is None:
            continue

        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        img = img / 255.0  # normalize to 0–1 range

        X.append(img)
        y.append(gesture_folder)

# Convert to NumPy arrays
X = np.array(X).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
y = np.array(y)

# Encode class names to numbers
le = LabelEncoder()
y_encoded = le.fit_transform(y)
np.save("label_classes.npy", le.classes_)  # Save gesture names
print("✅ Saved label classes to 'label_classes.npy'")

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# One-hot encode labels for softmax
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

print("✅ Preprocessing complete!")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")


Processing gestures: 100%|█████████████████████████████████████████████████████████████| 13/13 [00:13<00:00,  1.05s/it]


✅ Saved label classes to 'label_classes.npy'
✅ Preprocessing complete!
X_train shape: (9360, 64, 64, 1)
y_train shape: (9360,)


In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from keras.saving import save_model
from tensorflow.keras.callbacks import EarlyStopping

# Build the CNN Model
model = Sequential([
    Input(shape=(IMG_SIZE, IMG_SIZE, 1)),
    
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    
    Dense(len(np.unique(y_encoded)), activation='softmax')  # 13 gestures
])

# Compile
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Optional: Stop early if not improving
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train
history = model.fit(
    X_train, y_train_cat,
    epochs=15,
    batch_size=32,
    validation_data=(X_test, y_test_cat),
    callbacks=[early_stop]
)

# Evaluate
test_loss, test_accuracy = model.evaluate(X_test, y_test_cat)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Save
save_model(model, "gesture_new_model.keras")
print("Model saved as 'gesture_new_model.keras'")


Epoch 1/15
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 103ms/step - accuracy: 0.8938 - loss: 0.3748 - val_accuracy: 1.0000 - val_loss: 1.4504e-05
Epoch 2/15
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 89ms/step - accuracy: 0.9974 - loss: 0.0083 - val_accuracy: 1.0000 - val_loss: 1.4738e-07
Epoch 3/15
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 97ms/step - accuracy: 0.9988 - loss: 0.0044 - val_accuracy: 1.0000 - val_loss: 4.4393e-07
Epoch 4/15
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 82ms/step - accuracy: 0.9996 - loss: 0.0023 - val_accuracy: 1.0000 - val_loss: 1.0733e-07
Epoch 5/15
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 94ms/step - accuracy: 0.9982 - loss: 0.0041 - val_accuracy: 1.0000 - val_loss: 6.1133e-09
Epoch 6/15
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 141ms/step - accuracy: 0.9996 - loss: 0.0013 - val_accuracy: 1.0000 - val_loss: 1

In [5]:
import cv2
import time
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder

# Load model and labels
IMG_SIZE = 64
model = load_model("gesture_new_model.keras")

le = LabelEncoder()
le.classes_ = np.load("label_classes.npy")

# Start webcam
cap = cv2.VideoCapture(0)
print("Webcam started... Press 'q' to quit.")
time.sleep(2)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Define Region of Interest (ROI)
    x1, y1, x2, y2 = 200, 100, 400, 300
    roi = frame[y1:y2, x1:x2]

    # Preprocess ROI
    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (IMG_SIZE, IMG_SIZE))
    normalized = resized / 255.0
    reshaped = normalized.reshape(1, IMG_SIZE, IMG_SIZE, 1)

    # Predict gesture
    pred = model.predict(reshaped, verbose=0)
    predicted_class = np.argmax(pred)
    gesture_name = le.inverse_transform([predicted_class])[0]

    # Draw results
    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(frame, f"Gesture: {gesture_name}", (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255), 2)

    cv2.imshow("Gesture Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


Webcam started... Press 'q' to quit.


In [8]:
import cv2
import numpy as np
import mediapipe as mp
import time
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder

# Load trained model and label encoder
model = load_model("gesture_cnn_model.keras")
le = LabelEncoder()
le.classes_ = np.load("label_classes.npy")

IMG_SIZE = 64  # Must match training image size

# Initialize MediaPipe Hand detector
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False,
                       max_num_hands=1,
                       min_detection_confidence=0.7)
mp_draw = mp.solutions.drawing_utils

# Start webcam
cap = cv2.VideoCapture(0)
print("📷 Webcam started... Press 'q' to quit.")
time.sleep(2)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    h, w, _ = frame.shape
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(frame_rgb)

    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            # Get bounding box coordinates
            x_coords = [lm.x for lm in hand_landmarks.landmark]
            y_coords = [lm.y for lm in hand_landmarks.landmark]
            x_min = int(min(x_coords) * w) - 15
            x_max = int(max(x_coords) * w) + 15
            y_min = int(min(y_coords) * h) - 15
            y_max = int(max(y_coords) * h) + 15

            # Clamp coordinates within frame
            x_min, y_min = max(0, x_min), max(0, y_min)
            x_max, y_max = min(w, x_max), min(h, y_max)

            roi = frame[y_min:y_max, x_min:x_max]
            if roi.size == 0:
                continue

            # Preprocess ROI
            gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
            resized = cv2.resize(gray, (IMG_SIZE, IMG_SIZE))
            normalized = resized / 255.0
            reshaped = normalized.reshape(1, IMG_SIZE, IMG_SIZE, 1)

            # Predict
            pred = model.predict(reshaped, verbose=0)
            predicted_class = np.argmax(pred)
            confidence = np.max(pred)

            # Display prediction if confident enough
            gesture_name = le.inverse_transform([predicted_class])[0]

            # Draw box and label
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
            cv2.putText(frame, f"{gesture_name} ({confidence*100:.1f}%)", (x_min, y_min - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)

            # Draw hand landmarks
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    # Show webcam frame
    cv2.imshow("Real-Time Gesture Recognition", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Clean up
cap.release()
cv2.destroyAllWindows()
print("Webcam closed.")


📷 Webcam started... Press 'q' to quit.
Webcam closed.
