In [30]:
import cv2
import os
import mediapipe as mp
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical

In [31]:
# Initialize MediaPipe Hands.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()
mp_drawing = mp.solutions.drawing_utils

In [32]:
# Data Collection Function
def collect_data(label, save_path='data', num_samples=100):
    os.makedirs(save_path, exist_ok=True)
    label_path = os.path.join(save_path, label)
    os.makedirs(label_path, exist_ok=True)
    
    cap = cv2.VideoCapture(0)
    count = 0
    while count < num_samples:
        ret, frame = cap.read()
        if not ret:
            break
        cv2.imshow("Frame", frame)
        
        # Save frame for dataset
        frame_path = os.path.join(label_path, f'{label}_{count}.jpg')
        cv2.imwrite(frame_path, frame)
        count += 1
        
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
    
    cap.release()
    cv2.destroyAllWindows()

In [33]:
# Preprocess frames and extract landmarks
def preprocess_frame(frame):
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            landmarks = [[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark]
            return landmarks
    return None

In [34]:
# Load Data
def load_data(data_path='data'):
    X, y = [], []
    labels = os.listdir(data_path)
    print(f"Labels found: {labels}")  # Debugging statement
    for label in labels:
        label_path = os.path.join(data_path, label)
        for img_file in os.listdir(label_path):
            img_path = os.path.join(label_path, img_file)
            frame = cv2.imread(img_path)
            landmarks = preprocess_frame(frame)
            if landmarks:
                X.append(landmarks)
                y.append(labels.index(label))
    return np.array(X), np.array(y), labels

In [35]:
# Training Function
def train_model(X_train, y_train, num_classes):
    model = Sequential([
        Flatten(input_shape=(21, 3)),  # 21 landmarks with 3 coordinates (x, y, z)
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=10, validation_split=0.2)
    
    return model

In [36]:
# Real-Time Inference
def real_time_inference(model, labels):
    cap = cv2.VideoCapture(0)
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        landmarks = preprocess_frame(frame)
        if landmarks:
            landmarks = np.array(landmarks).reshape(1, 21, 3)
            prediction = model.predict(landmarks)
            predicted_label = np.argmax(prediction)
            print(f"Predicted sign: {labels[predicted_label]}")
        
        cv2.imshow("Frame", frame)
        key = cv2.waitKey(1)
        if key == ord('q'):
            break
    
    cap.release()
    cv2.destroyAllWindows()

In [37]:
# Step 1: Collect data for gestures (run multiple times for different labels)
collect_data('hello')
collect_data('thanks')

In [38]:
# Step 2: Load data
X_train, y_train, labels = load_data()

Labels found: ['hello', 'thanks']


In [39]:
# Step 3: Ensure data is not empty
if X_train.size == 0 or y_train.size == 0:
    raise ValueError("No data loaded. Please check the data directory and preprocessing steps.")

In [40]:
# Convert labels to categorical
y_train = to_categorical(y_train)

In [41]:
# Step 4: Train the model
num_classes = y_train.shape[1]
model = train_model(X_train, y_train, num_classes)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 201ms/step - accuracy: 0.7500 - loss: 0.6455 - val_accuracy: 0.0000e+00 - val_loss: 1.1692
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.7500 - loss: 0.5947 - val_accuracy: 0.0000e+00 - val_loss: 1.0798
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.7500 - loss: 0.5525 - val_accuracy: 0.0000e+00 - val_loss: 1.0762
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.7539 - loss: 0.5090 - val_accuracy: 0.0000e+00 - val_loss: 1.0042
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.7383 - loss: 0.4885 - val_accuracy: 0.0000e+00 - val_loss: 0.8323
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.7461 - loss: 0.4525 - val_accuracy: 0.0000e+00 - val_loss: 0.8346
Epoch 7/10
[1m3/3[0

In [42]:
# Step 5: Real-Time Inference
real_time_inference(model, labels)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
Predicted sign: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Predicted sign: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Predicted sign: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Predicted sign: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Predicted sign: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Predicted sign: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Predicted sign: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Predicted sign: hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Predicted sign: thanks
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Predicted sign: thanks
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s