# Data Collection & Save Landmarks

https://www.instructables.com/How-to-Train-Custom-Hand-Gestures-Using-Mediapipe/

### Come Here gesture

In [None]:
import cv2
import mediapipe as mp
import os
import csv
from datetime import datetime

# === MediaPipe Setup ===
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)

# === Output Folders and Files ===
IMAGE_FOLDER = 'gesture_come'
CSV_FILE = 'gesture_come.csv'
LABEL = 'come_here'

# Create image directory if not exists
os.makedirs(IMAGE_FOLDER, exist_ok=True)

# Initialize CSV file (write header if it doesn't exist)
if not os.path.exists(CSV_FILE):
    with open(CSV_FILE, mode='w', newline='') as file:
        writer = csv.writer(file)
        header = ['label'] + [f'{coord}_{i}' for i in range(21) for coord in ('x', 'y', 'z')]
        writer.writerow(header)

# === Landmark Extraction Function ===
def extract_landmarks(image):
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.append([lm.x, lm.y, lm.z])
            return landmarks
    return None

# === Save Landmark to CSV ===
def save_landmarks(landmarks, label):
    with open(CSV_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        row = [label] + [coord for point in landmarks for coord in point]
        writer.writerow(row)

# === OpenCV Camera Capture ===
cap = cv2.VideoCapture(0)
img_count = 0

print("[INFO] Press 'c' to capture frame, 'q' to quit.")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Show instructions on screen
    cv2.putText(frame, "Press 'c' to capture, 'q' to quit", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 191, 0), 2)

    cv2.imshow("Capture Gesture - Come Here", frame)

    key = cv2.waitKey(1) & 0xFF

    if key == ord('c'):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S%f")
        filename = f"{IMAGE_FOLDER}/comehere_{timestamp}.jpg"
        cv2.imwrite(filename, frame)
        print(f"[INFO] Saved image: {filename}")

        # Extract and save landmarks
        landmarks = extract_landmarks(frame)
        if landmarks:
            save_landmarks(landmarks, LABEL)
            print("[INFO] Landmarks saved to CSV.")
        else:
            print("[WARNING] No hand detected. Try again.")

    elif key == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


### Stop and Okay gesture

In [1]:
import cv2
import mediapipe as mp
import os
import csv
from datetime import datetime

# ==== CHANGE THIS LABEL TO MATCH YOUR CURRENT GESTURE ====
LABEL = 'stop'  # or 'okay', or 'come'

# === Setup ===
IMAGE_FOLDER = f'gesture_{LABEL}'
CSV_FILE = f'gesture_{LABEL}.csv'

# MediaPipe hands setup
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Create folder and CSV if they don't exist
os.makedirs(IMAGE_FOLDER, exist_ok=True)
if not os.path.exists(CSV_FILE):
    with open(CSV_FILE, mode='w', newline='') as file:
        writer = csv.writer(file)
        header = ['label'] + [f'{coord}_{i}' for i in range(21) for coord in ('x', 'y', 'z')]
        writer.writerow(header)

# Extract landmarks
def extract_landmarks(image):
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            return [[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark], hand_landmarks
    return None, None

def save_landmarks(landmarks, label):
    with open(CSV_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        row = [label] + [coord for point in landmarks for coord in point]
        writer.writerow(row)

# Start camera
cap = cv2.VideoCapture(0)
print(f"[INFO] Capturing samples for gesture: {LABEL}")
print("[INFO] Press 'c' to capture, 'q' to quit.")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.flip(frame, 1)

    landmarks, hand_landmarks = extract_landmarks(frame)

    if hand_landmarks:
        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    cv2.putText(frame, f"Gesture: {LABEL} | Press 'c' to capture", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 191, 0), 2)

    cv2.imshow("Collect Gesture Data", frame)
    key = cv2.waitKey(1) & 0xFF

    if key == ord('c') and landmarks:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S%f")
        filename = f"{IMAGE_FOLDER}/{LABEL}_{timestamp}.jpg"
        cv2.imwrite(filename, frame)
        save_landmarks(landmarks, LABEL)
        print(f"[CAPTURED] {filename}")
    elif key == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

[INFO] Capturing samples for gesture: stop
[INFO] Press 'c' to capture, 'q' to quit.
[CAPTURED] gesture_stop/stop_20250321_192224500372.jpg
[CAPTURED] gesture_stop/stop_20250321_192227759029.jpg
[CAPTURED] gesture_stop/stop_20250321_192230862450.jpg
[CAPTURED] gesture_stop/stop_20250321_192234927261.jpg
[CAPTURED] gesture_stop/stop_20250321_192239660739.jpg
[CAPTURED] gesture_stop/stop_20250321_192246241367.jpg
[CAPTURED] gesture_stop/stop_20250321_192250332225.jpg
[CAPTURED] gesture_stop/stop_20250321_192253692153.jpg
[CAPTURED] gesture_stop/stop_20250321_192300606869.jpg
[CAPTURED] gesture_stop/stop_20250321_192311205749.jpg
[CAPTURED] gesture_stop/stop_20250321_192341723422.jpg
[CAPTURED] gesture_stop/stop_20250321_192356794997.jpg
[CAPTURED] gesture_stop/stop_20250321_192359162437.jpg
[CAPTURED] gesture_stop/stop_20250321_192400924161.jpg
[CAPTURED] gesture_stop/stop_20250321_192403004416.jpg
[CAPTURED] gesture_stop/stop_20250321_192404458972.jpg
[CAPTURED] gesture_stop/stop_202503

In [2]:
import cv2
import mediapipe as mp
import os
import csv
from datetime import datetime

# ==== CHANGE THIS LABEL TO MATCH YOUR CURRENT GESTURE ====
LABEL = 'okay'  # or 'okay', or 'come'

# === Setup ===
IMAGE_FOLDER = f'gesture_{LABEL}'
CSV_FILE = f'gesture_{LABEL}.csv'

# MediaPipe hands setup
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Create folder and CSV if they don't exist
os.makedirs(IMAGE_FOLDER, exist_ok=True)
if not os.path.exists(CSV_FILE):
    with open(CSV_FILE, mode='w', newline='') as file:
        writer = csv.writer(file)
        header = ['label'] + [f'{coord}_{i}' for i in range(21) for coord in ('x', 'y', 'z')]
        writer.writerow(header)

# Extract landmarks
def extract_landmarks(image):
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            return [[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark], hand_landmarks
    return None, None

def save_landmarks(landmarks, label):
    with open(CSV_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        row = [label] + [coord for point in landmarks for coord in point]
        writer.writerow(row)

# Start camera
cap = cv2.VideoCapture(0)
print(f"[INFO] Capturing samples for gesture: {LABEL}")
print("[INFO] Press 'c' to capture, 'q' to quit.")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.flip(frame, 1)

    landmarks, hand_landmarks = extract_landmarks(frame)

    if hand_landmarks:
        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    cv2.putText(frame, f"Gesture: {LABEL} | Press 'c' to capture", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 191, 0), 2)

    cv2.imshow("Collect Gesture Data", frame)
    key = cv2.waitKey(1) & 0xFF

    if key == ord('c') and landmarks:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S%f")
        filename = f"{IMAGE_FOLDER}/{LABEL}_{timestamp}.jpg"
        cv2.imwrite(filename, frame)
        save_landmarks(landmarks, LABEL)
        print(f"[CAPTURED] {filename}")
    elif key == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

[INFO] Capturing samples for gesture: okay
[INFO] Press 'c' to capture, 'q' to quit.
[CAPTURED] gesture_okay/okay_20250321_194455097549.jpg
[CAPTURED] gesture_okay/okay_20250321_194500794012.jpg
[CAPTURED] gesture_okay/okay_20250321_194503528136.jpg
[CAPTURED] gesture_okay/okay_20250321_194505326446.jpg
[CAPTURED] gesture_okay/okay_20250321_194506920669.jpg
[CAPTURED] gesture_okay/okay_20250321_194511895916.jpg
[CAPTURED] gesture_okay/okay_20250321_194515227319.jpg
[CAPTURED] gesture_okay/okay_20250321_194518026675.jpg
[CAPTURED] gesture_okay/okay_20250321_194521495674.jpg
[CAPTURED] gesture_okay/okay_20250321_194525193651.jpg
[CAPTURED] gesture_okay/okay_20250321_194529384477.jpg
[CAPTURED] gesture_okay/okay_20250321_194531290073.jpg
[CAPTURED] gesture_okay/okay_20250321_194533331476.jpg
[CAPTURED] gesture_okay/okay_20250321_194535751853.jpg
[CAPTURED] gesture_okay/okay_20250321_194538263527.jpg
[CAPTURED] gesture_okay/okay_20250321_194540189867.jpg
[CAPTURED] gesture_okay/okay_202503

In [3]:
import cv2
import mediapipe as mp
import os
import csv
from datetime import datetime

# ==== CHANGE THIS LABEL TO MATCH YOUR CURRENT GESTURE ====
LABEL = 'come'  # or 'okay', or 'come'

# === Setup ===
IMAGE_FOLDER = f'gesture_{LABEL}'
CSV_FILE = f'gesture_{LABEL}.csv'

# MediaPipe hands setup
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Create folder and CSV if they don't exist
os.makedirs(IMAGE_FOLDER, exist_ok=True)
if not os.path.exists(CSV_FILE):
    with open(CSV_FILE, mode='w', newline='') as file:
        writer = csv.writer(file)
        header = ['label'] + [f'{coord}_{i}' for i in range(21) for coord in ('x', 'y', 'z')]
        writer.writerow(header)

# Extract landmarks
def extract_landmarks(image):
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            return [[lm.x, lm.y, lm.z] for lm in hand_landmarks.landmark], hand_landmarks
    return None, None

def save_landmarks(landmarks, label):
    with open(CSV_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        row = [label] + [coord for point in landmarks for coord in point]
        writer.writerow(row)

# Start camera
cap = cv2.VideoCapture(0)
print(f"[INFO] Capturing samples for gesture: {LABEL}")
print("[INFO] Press 'c' to capture, 'q' to quit.")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.flip(frame, 1)

    landmarks, hand_landmarks = extract_landmarks(frame)

    if hand_landmarks:
        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    cv2.putText(frame, f"Gesture: {LABEL} | Press 'c' to capture", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 191, 0), 2)

    cv2.imshow("Collect Gesture Data", frame)
    key = cv2.waitKey(1) & 0xFF

    if key == ord('c') and landmarks:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S%f")
        filename = f"{IMAGE_FOLDER}/{LABEL}_{timestamp}.jpg"
        cv2.imwrite(filename, frame)
        save_landmarks(landmarks, LABEL)
        print(f"[CAPTURED] {filename}")
    elif key == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

[INFO] Capturing samples for gesture: come
[INFO] Press 'c' to capture, 'q' to quit.
[CAPTURED] gesture_come/come_20250321_195256511789.jpg
[CAPTURED] gesture_come/come_20250321_195300301269.jpg
[CAPTURED] gesture_come/come_20250321_195303133227.jpg
[CAPTURED] gesture_come/come_20250321_195314734275.jpg
[CAPTURED] gesture_come/come_20250321_195316315452.jpg
[CAPTURED] gesture_come/come_20250321_195319341152.jpg
[CAPTURED] gesture_come/come_20250321_195323805007.jpg
[CAPTURED] gesture_come/come_20250321_195326439677.jpg
[CAPTURED] gesture_come/come_20250321_195329068046.jpg
[CAPTURED] gesture_come/come_20250321_195418905620.jpg
[CAPTURED] gesture_come/come_20250321_195424534570.jpg
[CAPTURED] gesture_come/come_20250321_195426844010.jpg
[CAPTURED] gesture_come/come_20250321_195428447429.jpg
[CAPTURED] gesture_come/come_20250321_195429277971.jpg
[CAPTURED] gesture_come/come_20250321_195430809105.jpg
[CAPTURED] gesture_come/come_20250321_195434878579.jpg
[CAPTURED] gesture_come/come_202503

----------------------------

# Combine Data

In [4]:
import pandas as pd
import os

# File paths
csv_files = ['gesture_come.csv', 'gesture_stop.csv', 'gesture_okay.csv']
output_file = 'gesture_data.csv'

# Combine all CSVs
dataframes = []
for file in csv_files:
    if os.path.exists(file):
        df = pd.read_csv(file)
        dataframes.append(df)
    else:
        print(f"[WARNING] File not found: {file}")

# Concatenate and shuffle
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)  # Shuffle rows

# Save combined file
combined_df.to_csv(output_file, index=False)
print(f"[INFO] Combined CSV saved to: {output_file}")

[INFO] Combined CSV saved to: gesture_data.csv


-----

# Train the Model and fit the model on training data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# Load the CSV dataset
data = pd.read_csv("gesture_data.csv")
MODEL_NAME = 'gesture_model'

# Split into features and labels
X = data.drop('label', axis=1).values  # shape: (n_samples, 63)
y = data['label'].values

# Encode labels (e.g., 'come_here' -> 0)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)  # For multi-class classification

# Save the mapping (optional)
label_map = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:", label_map)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

# Build the model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(y_categorical.shape[1], activation='softmax')  # Output: num_classes
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=32, callbacks=[early_stop])

# Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Save the model and label encoder
model.save(f"{MODEL_NAME}.h5")
np.save("gesture_label_classes.npy", label_encoder.classes_)
print("[INFO] Model and label encoder saved.")


Label Mapping: {'come': 0, 'okay': 1, 'stop': 2}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 134ms/step - accuracy: 0.2924 - loss: 1.1442 - val_accuracy: 0.4375 - val_loss: 1.0978
Epoch 2/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.3124 - loss: 1.1150 - val_accuracy: 0.4062 - val_loss: 1.0812
Epoch 3/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.3231 - loss: 1.1056 - val_accuracy: 0.3750 - val_loss: 1.0608
Epoch 4/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.4429 - loss: 1.0563 - val_accuracy: 0.4375 - val_loss: 1.0464
Epoch 5/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.4825 - loss: 1.0493 - val_accuracy: 0.4688 - val_loss: 1.0331
Epoch 6/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.4240 - loss: 1.0353 - val_accuracy: 0.5312 - val_loss: 1.0190
Epoch 7/50
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━



Test Accuracy: 90.62%
[INFO] Model and label encoder saved.


---

# Hand Gesture Predicition using the trained model

In [None]:
import cv2
import numpy as np
import tensorflow as tf
import mediapipe as mp

MODEL_NAME = 'gesture_model'

# === Load trained model and labels ===
model = tf.keras.models.load_model(f"{MODEL_NAME}.h5")
class_names = np.load("gesture_label_classes.npy", allow_pickle=True)

# === MediaPipe Hands Setup ===
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# === Function to extract 21 landmarks ===
def extract_landmarks(image):
    results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.extend([lm.x, lm.y, lm.z])  # Flatten to 63-length vector
            return np.array(landmarks), hand_landmarks
    return None, None

# === Webcam Stream for Prediction ===
cap = cv2.VideoCapture(0)
print("[INFO] Press 'q' to quit.")

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Flip for mirror view
    frame = cv2.flip(frame, 1)

    # Extract hand landmarks
    input_data, hand_landmarks = extract_landmarks(frame)
    
    if input_data is not None:
        # Predict with model
        input_data = np.expand_dims(input_data, axis=0)  # Reshape to (1, 63)
        prediction = model.predict(input_data)[0]
        predicted_index = np.argmax(prediction)
        confidence = prediction[predicted_index]
        predicted_label = class_names[predicted_index]

        # Show prediction
        display_text = f"{predicted_label} ({confidence*100:.1f}%)"
        cv2.putText(frame, display_text, (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 191, 0), 2)

        # Draw hand landmarks
        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    else:
        cv2.putText(frame, "No hand detected", (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 100, 100), 2)

    cv2.imshow("Live Gesture Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()




[INFO] Press 'q' to quit.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━