In [None]:
from model import CNNModel
import cv2
import torch
import numpy as np
import pandas as pd
import pyautogui
import mediapipe as mp

In [None]:
# Run inference on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Build the gesture classifier on the selected device and restore its weights.
# CNNModel(1, 4): presumably (input channels, number of classes) — TODO confirm in model.py.
model = CNNModel(1, 4)
model = model.to(device)

# NOTE(review): torch.load unpickles the checkpoint, so only load files from a trusted source.
state_dict = torch.load('slm_4class.pth', map_location=torch.device(device))
model.load_state_dict(state_dict)

# Switch to evaluation mode; this notebook only runs inference.
model.eval()


# MediaPipe hand tracker: at most one hand, detections below 0.7 confidence are dropped.
mpHands = mp.solutions.hands
mpDraw = mp.solutions.drawing_utils
hands = mpHands.Hands(min_detection_confidence=0.7, max_num_hands=1)

# Human-readable label for each model output index (position i <-> class i).
classNames = [
    'Right Click',
    'Move Down',
    'Move Left',
    'Double Click',
]

In [None]:
# Index of the capture device handed to OpenCV.
CAMERA_INDEX = 0
cap = cv2.VideoCapture(CAMERA_INDEX)

def preprocess_and_predict(hand_img, model):
    """Classify a cropped hand image with the gesture model.

    The crop is converted to grayscale, resized to 28x28, scaled to [0, 1]
    and passed to ``model`` as a single-item batch of shape (1, 1, 28, 28).

    Args:
        hand_img: Non-empty BGR image array containing the hand region.
        model: Callable mapping a (1, 1, 28, 28) float tensor to per-class
            scores of shape (1, num_classes). Assumed to already live on
            ``device`` — this function does not move it.

    Returns:
        Tuple ``(predicted, name)`` where ``predicted`` is a 1-element tensor
        holding the index of the highest-scoring class and ``name`` is the
        matching entry of ``classNames``.
    """
    gray = cv2.cvtColor(hand_img, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (28, 28), interpolation=cv2.INTER_AREA)

    # Build the batch straight from the NumPy array; a pandas round-trip adds
    # per-frame overhead without changing the values.
    img_tensor = torch.tensor(resized, dtype=torch.float32).reshape((-1, 1, 28, 28)) / 255.0
    img_tensor = img_tensor.to(device)

    with torch.no_grad():
        outputs = model(img_tensor)

    # argmax over the class dimension yields the same indices as torch.max(..., 1)[1].
    predicted = outputs.argmax(dim=1)
    return predicted, classNames[predicted.item()]


# Margin (in pixels) added on every side of the landmark bounding box before cropping.
BBOX_PADDING = 20

# Capture loop: track one hand per frame, classify the cropped hand region and
# translate the predicted gesture into a mouse action. The try/finally ensures
# the camera and the preview window are released even when the loop is left by
# an exception or a kernel interrupt.
try:
    while cap.isOpened():
        success, image = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            continue

        h, w, _ = image.shape
        # Mirror the frame horizontally and convert to RGB for MediaPipe.
        image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
        results = hands.process(image)

        # Back to BGR: OpenCV drawing/display and the classifier preprocessing use BGR.
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        if results.multi_hand_landmarks:
            for handLMs in results.multi_hand_landmarks:
                # Scale landmark coordinates to pixel positions.
                xs = [int(lm.x * w) for lm in handLMs.landmark]
                ys = [int(lm.y * h) for lm in handLMs.landmark]

                # Bounding box of the landmarks, padded and clamped to the frame.
                # Seeding min with w/h and max with 0 keeps the box inside the
                # frame even if a landmark falls outside it.
                x_min = max(min(xs + [w]) - BBOX_PADDING, 0)
                x_max = min(max(xs + [0]) + BBOX_PADDING, w)
                y_min = max(min(ys + [h]) - BBOX_PADDING, 0)
                y_max = min(max(ys + [0]) + BBOX_PADDING, h)

                # Crop the hand region from the image
                hand_img = image[y_min:y_max, x_min:x_max]

                if hand_img.size > 0:
                    predicted, gesture_name = preprocess_and_predict(hand_img, model)
                    class_idx = predicted.item()

                    cv2.putText(image, gesture_name, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

                    # Map the model's output to mouse actions.
                    # NOTE(review): this runs on every processed frame, so a held
                    # gesture repeats its action each frame — confirm this is intended.
                    if class_idx == 0:  # Right Click
                        pyautogui.click(button='right')
                    elif class_idx == 1:  # Down
                        pyautogui.move(0, 10)
                    elif class_idx == 2:  # Left
                        pyautogui.move(-10, 0)
                    elif class_idx == 3:  # Double Click
                        pyautogui.click(clicks=2)

                cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

        cv2.imshow('MediaPipe Hands with Gesture Recognition', image)
        if cv2.waitKey(5) & 0xFF == 27:  # ESC key to exit
            break
finally:
    # Always free the camera and close the preview window.
    cap.release()
    cv2.destroyAllWindows()