In [2]:
import cv2
import os

# Create folders if they don't exist
dataset_dir = 'gesture1'
gestures = ['scroll_up', 'scroll_down', 'back', 'forward', 'screenshot', 'close_window', 'openapp','none']
for gesture in gestures:
    if not os.path.exists(os.path.join(dataset_dir, gesture)):
        os.makedirs(os.path.join(dataset_dir, gesture))

# Initialize OpenCV camera
cap = cv2.VideoCapture(0)
img_count = 0
gesture_index = 0  # Index to track gestures (0 for scroll_up, 1 for scroll_down, etc.)
is_capturing = False  # Flag to indicate if capturing is active

# Define function to capture and save images
def capture_images():
    global img_count, gesture_index, is_capturing
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Flip the frame to mirror
        frame = cv2.flip(frame, 1)

        # Define region of interest (ROI) for gesture detection
        height, width, _ = frame.shape
        rect_x, rect_y = width // 4, height // 4
        rect_w, rect_h = width // 2, height // 2

        # Draw rectangle on frame to show ROI
        cv2.rectangle(frame, (rect_x, rect_y), (rect_x + rect_w, rect_y + rect_h), (0, 255, 0), 2)

        # Show instructions
        cv2.putText(frame, f"Gesture: {gestures[gesture_index]}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
        cv2.putText(frame, f"Capturing {img_count+1}/50 images", (10, 60),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        if not is_capturing:
            # Display message to start capturing
            cv2.putText(frame, "Press 's' to start capturing", (10, 90),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow("Capture Gesture", frame)

        if is_capturing:
            if img_count < 50:
                # Crop the image to the region of interest
                cropped_frame = frame[rect_y:rect_y + rect_h, rect_x:rect_x + rect_w]
                
                # Save the cropped image in the respective folder
                img_filename = os.path.join(dataset_dir, gestures[gesture_index], f'{gestures[gesture_index]}_{img_count}.jpg')
                cv2.imwrite(img_filename, cropped_frame)
                img_count += 1
                print(f"Captured {img_count} images for gesture {gestures[gesture_index]}")

            else:
                print(f"Press 'n' to switch to the next gesture")
                key = cv2.waitKey(1) & 0xFF
                if key == ord('n'):  # Move to the next gesture
                    gesture_index += 1
                    if gesture_index >= len(gestures):
                        print("Finished capturing all gestures.")
                        break  # Stop if all gestures are captured
                    img_count = 0  # Reset image count for the next gesture
                    print(f"Moving to next gesture: {gestures[gesture_index]}")
                elif key == ord('q'):  # Quit the capture
                    break

        # Start capturing when 's' is pressed
        key = cv2.waitKey(1) & 0xFF
        if key == ord('s') and not is_capturing:
            is_capturing = True
            print("Started capturing images.")

        # Quit capture if 'q' is pressed
        if key == ord('q'):
            break

    cap.release()
    cv2.destroyAllWindows()

# Start the image capture process
capture_images()


Started capturing images.
Captured 1 images for gesture scroll_up
Captured 2 images for gesture scroll_up
Captured 3 images for gesture scroll_up
Captured 4 images for gesture scroll_up
Captured 5 images for gesture scroll_up
Captured 6 images for gesture scroll_up
Captured 7 images for gesture scroll_up
Captured 8 images for gesture scroll_up
Captured 9 images for gesture scroll_up
Captured 10 images for gesture scroll_up
Captured 11 images for gesture scroll_up
Captured 12 images for gesture scroll_up
Captured 13 images for gesture scroll_up
Captured 14 images for gesture scroll_up
Captured 15 images for gesture scroll_up
Captured 16 images for gesture scroll_up
Captured 17 images for gesture scroll_up
Captured 18 images for gesture scroll_up
Captured 19 images for gesture scroll_up
Captured 20 images for gesture scroll_up
Captured 21 images for gesture scroll_up
Captured 22 images for gesture scroll_up
Captured 23 images for gesture scroll_up
Captured 24 images for gesture scroll_up

In [5]:
import cv2
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Directory paths
dataset_dir = 'gesture1'
gestures=['scroll_up', 'scroll_down', 'back', 'forward', 'screenshot', 'close_window', 'openapp','none']
image_size = (64, 64)  # Resize images for consistent input size

# Load images and labels
def load_images_and_labels():
    images = []
    labels = []
    for label, gesture in enumerate(gestures):
        gesture_dir = os.path.join(dataset_dir, gesture)
        for img_name in os.listdir(gesture_dir):
            img_path = os.path.join(gesture_dir, img_name)
            
            # Read, resize, and normalize the image
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, image_size)
            
            images.append(img)
            labels.append(label)
    
    images = np.array(images)
    labels = np.array(labels)
    return images, labels

# Load dataset
X, y = load_images_and_labels()

# Normalize images to range [0, 1]
X = X / 255.0

# Convert labels to one-hot encoding
y = to_categorical(y, num_classes=len(gestures))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define CNN model
def create_cnn_model(input_shape):
    model = Sequential()
    
    # First Conv Layer
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    # Second Conv Layer
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    
    # Flatten the output
    model.add(Flatten())
    
    # Fully connected layer
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    
    # Output layer (softmax for multi-class classification)
    model.add(Dense(len(gestures), activation='softmax'))
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Create the CNN model
model = create_cnn_model(input_shape=(64, 64, 3))

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print(f"Test accuracy: {test_acc * 100:.2f}%")

# Save the trained model
model.save('gesturefinal.h5')



Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 179ms/step - accuracy: 0.1362 - loss: 2.1918 - val_accuracy: 0.0875 - val_loss: 2.0906
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 90ms/step - accuracy: 0.1645 - loss: 2.0738 - val_accuracy: 0.0625 - val_loss: 2.0742
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 90ms/step - accuracy: 0.1980 - loss: 2.0518 - val_accuracy: 0.4500 - val_loss: 2.0323
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - accuracy: 0.2835 - loss: 2.0065 - val_accuracy: 0.5875 - val_loss: 1.9907
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - accuracy: 0.2723 - loss: 1.9533 - val_accuracy: 0.4125 - val_loss: 1.8723
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 89ms/step - accuracy: 0.4524 - loss: 1.8468 - val_accuracy: 0.5500 - val_loss: 1.6807
Epoch 7/10
[1m10/10[0m [32m━━━



Test accuracy: 100.00%


In [6]:
import cv2
import numpy as np
from tensorflow.keras.models import load_model

# Load the trained model
model = load_model('gesturefinal.h5')

# List of gestures
gestures = ['scroll_up', 'scroll_down', 'back', 'forward', 'screenshot', 'close_window', 'openapp','none']

# Initialize OpenCV camera
cap = cv2.VideoCapture(0)

# Define the region of interest (ROI)
image_size = (64, 64)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Flip the frame
    frame = cv2.flip(frame, 1)

    # Region of interest (ROI)
    height, width, _ = frame.shape
    rect_x, rect_y = width // 4, height // 4
    rect_w, rect_h = width // 2, height // 2

    # Draw rectangle around ROI
    cv2.rectangle(frame, (rect_x, rect_y), (rect_x + rect_w, rect_y + rect_h), (0, 255, 0), 2)

    # Crop the image to the region of interest
    roi = frame[rect_y:rect_y + rect_h, rect_x:rect_x + rect_w]
    roi = cv2.resize(roi, image_size)
    roi = np.expand_dims(roi, axis=0) / 255.0  # Normalize

    # Predict gesture
    prediction = model.predict(roi)
    predicted_class = np.argmax(prediction, axis=1)
    predicted_gesture = gestures[predicted_class[0]]

    # Display predicted gesture on the frame
    cv2.putText(frame, f'Gesture: {predicted_gesture}', (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    # Display the frame
    cv2.imshow('Gesture Recognition', frame)

    # Break if 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 352ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [15]:
import cv2
import numpy as np
import pyautogui
from tensorflow.keras.models import load_model
import time

# Load the trained model (gesture_model.h5 should be the saved model)
model = load_model('gesturefinal.h5')

# List of gestures (make sure they match the order in your training data)
gestures = ['scroll_up', 'scroll_down', 'back', 'forward', 'screenshot', 'close_window', 'openapp', 'none']

# Initialize the webcam
cap = cv2.VideoCapture(0)

# Define the confidence threshold (increase for more reliable actions)
threshold = 0.7  # Increased threshold for more reliable gesture recognition

# Frames to skip for smoother performance (optional)
frame_skip = 3  # Process every 3rd frame for better performance
frame_count = 0

# Flag to prevent multiple screenshots or actions in rapid succession
last_action_time = time.time()
action_cooldown = 1  # seconds between actions

# Store the time when the gesture for screenshot was detected
screenshot_time = 0

while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        break

    # Increment frame count and skip frames for performance
    frame_count += 1
    if frame_count % frame_skip != 0:
        continue  # Skip this frame

    # Flip the frame horizontally for a mirrored view (optional)
    frame = cv2.flip(frame, 1)

    # Define the region of interest (ROI) for gesture detection
    height, width, _ = frame.shape
    rect_x = width // 4
    rect_y = height // 4
    rect_w = width // 2
    rect_h = height // 2

    # Draw the rectangular box to highlight the region of interest (ROI)
    cv2.rectangle(frame, (rect_x, rect_y), (rect_x + rect_w, rect_y + rect_h), (0, 255, 0), 2)

    # Crop the region of interest (center part of the image)
    cropped_frame = frame[rect_y:rect_y + rect_h, rect_x:rect_x + rect_w]

    # Resize the cropped image to the size expected by the model (64x64)
    cropped_frame_resized = cv2.resize(cropped_frame, (64, 64))

    # Normalize the pixel values to be between 0 and 1
    cropped_frame_normalized = cropped_frame_resized / 255.0

    # Expand the dimensions to match the input shape of the model (batch size, height, width, channels)
    cropped_frame_input = np.expand_dims(cropped_frame_normalized, axis=0)

    # Make a prediction using the trained model
    prediction = model.predict(cropped_frame_input)

    # Get the predicted class (index of the maximum value in prediction)
    predicted_class = np.argmax(prediction, axis=1)

    # Get the confidence score (maximum value of the prediction)
    confidence = np.max(prediction)

    # Check if the confidence is above the threshold
    if confidence > threshold:
        predicted_gesture = gestures[predicted_class[0]]
    else:
        print(f"Low confidence: {confidence*100:.2f}%")
        predicted_gesture = 'none'  # No action detected if confidence is too low

    # Display the predicted gesture and confidence on the frame
    cv2.putText(frame, f"Predicted: {predicted_gesture} ({confidence*100:.2f}%)", 
                (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

    # Get current time to manage action cooldown
    current_time = time.time()

    # Control computer screen using pyautogui based on recognized gestures
    if current_time - last_action_time > action_cooldown:  # Ensure actions happen after cooldown
        try:
            if predicted_gesture == 'scroll_up':
                pyautogui.scroll(500)  # Scroll up
                print("Scrolled up successfully")
                
            elif predicted_gesture == 'scroll_down':
                pyautogui.scroll(-500)  # Scroll down
                print("Scrolled down successfully")
                
            elif predicted_gesture == 'back':
                pyautogui.click(x=63 ,y=187)

            elif predicted_gesture == 'forward':
                pyautogui.click(x=180,y=188) 
                print("Swipe left simulated")

            elif predicted_gesture == 'screenshot':
                # Delay before performing the action
                time.sleep(2)
                
                # Capture screenshot and save
                screenshot = pyautogui.screenshot()
                screenshot.save("screenshot.png")
                print("Screenshot taken and saved as 'screenshot.png'")

                # Display the screenshot using OpenCV
                img = np.array(screenshot)
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                cv2.imshow("Screenshot", img)  # Show the screenshot
                cv2.waitKey(3000)  # Display the screenshot for 3 seconds
                
                # Close the screenshot window
                cv2.destroyWindow("Screenshot")
                
            elif predicted_gesture == 'close_window':
                pyautogui.click(x=3488, y=54)  
                print("Closing the window...")

            elif predicted_gesture == 'openapp':
                pyautogui.press('winleft')
                time.sleep(1)  
                time.sleep(0.2)
                pyautogui.write('Notepad')
                pyautogui.press('enter') 
                print("Escape key pressed")
            
            else:
                print("No action detected.")
            
            # Update last action time after a successful action
            last_action_time = current_time

        except Exception as e:
            print(f"Error performing action: {predicted_gesture}. Error: {e}")

    # Show the image with the prediction
    cv2.imshow("Gesture Recognition", frame)

    # Exit the loop if 'q' is pressed
    key = cv2.waitKey(1) & 0xFF
    if key == ord('q'):
        break

# Release the webcam and close the OpenCV window
cap.release()
cv2.destroyAllWindows()




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 269ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
No action detected.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
No action detected.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
No action detected.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
