In [1]:
import torch
import torchvision.transforms as transforms
import cv2
import numpy as np
import subprocess
import mediapipe as mp
from torchvision.models import resnet18

In [2]:

# Build a ResNet18 without torchvision's weights (pretrained=False), swap in a
# classification head sized for the gesture classes, then restore the weights
# stored in a local checkpoint file.
NUM_GESTURE_CLASSES = 18  # Assuming 18 classes for hand gestures
CHECKPOINT_PATH = '/Users/bunty/Desktop/HCI-Project-GitHub/SOEN-6751-Project/Model/ResNet18.pth'

model = resnet18(pretrained=False)
num_ftrs = model.fc.in_features
model.fc = torch.nn.Linear(num_ftrs, NUM_GESTURE_CLASSES)

# The checkpoint is indexed like a dict; only its 'MODEL_STATE' entry is used.
# map_location places the loaded tensors on the CPU.
checkpoint = torch.load(CHECKPOINT_PATH, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['MODEL_STATE'])

# Put the network in evaluation mode; as the cell's last expression this also
# displays the model structure.
model.eval()



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [3]:
# Preprocessing pipeline applied to an image before classification:
# convert to a PIL image, resize to the network input size, convert to a
# tensor, then normalise each channel with the given mean / std.
_INPUT_SIZE = (224, 224)
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]

_preprocessing_steps = [
    transforms.ToPILImage(),
    transforms.Resize(_INPUT_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
]
transform = transforms.Compose(_preprocessing_steps)

# Function to predict gesture from image
def predict_gesture(image):
    """Classify a single image and return the index of the top-scoring class.

    The image is run through the module-level ``transform`` pipeline, given a
    leading batch dimension, and passed to the module-level ``model`` with
    gradient tracking disabled.

    Args:
        image: Any input accepted by ``transform`` (the camera loop passes an
            OpenCV frame).

    Returns:
        int: Position of the largest value along dimension 1 of the model
        output.
    """
    batch = transform(image).unsqueeze(0)  # add the batch dimension
    with torch.no_grad():
        scores = model(batch)
    best = torch.max(scores, 1).indices
    return best.item()
# # Function to play or pause Spotify
# def play_pause_spotify():
#     subprocess.run(["osascript", "-e", "tell application \"Spotify\" to playpause"])
# Function to increase Volume
def increase_volume():
    """Run an AppleScript through ``osascript`` that adds 5 to the output volume."""
    script = "set volume output volume (output volume of (get volume settings) + 5)"
    command = ["osascript", "-e", script]
    subprocess.run(command)
#Function to decrease Volume
def decrease_volume():
    """Run an AppleScript through ``osascript`` that subtracts 5 from the output volume."""
    script = "set volume output volume (output volume of (get volume settings) - 5)"
    command = ["osascript", "-e", script]
    subprocess.run(command)
# Function to mute volume
# def mute_volume():
#     subprocess.run(["osascript", "-e", "set volume with output muted"])

# Function to play music in Spotify
def play_spotify():
    """Run an AppleScript through ``osascript`` telling the Spotify application to play."""
    script = 'tell application "Spotify" to play'
    command = ["osascript", "-e", script]
    subprocess.run(command)

# Function to pause music in Spotify
def pause_spotify():
    """Run an AppleScript through ``osascript`` telling the Spotify application to pause."""
    script = 'tell application "Spotify" to pause'
    command = ["osascript", "-e", script]
    subprocess.run(command)

# Initialize the MediaPipe hand detector and its drawing utilities
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()
mp_drawing = mp.solutions.drawing_utils

# Predicted class index -> action to trigger. Indices without an entry do nothing.
GESTURE_ACTIONS = {
    4: increase_volume,
    8: pause_spotify,
    11: pause_spotify,
    1: decrease_volume,
    10: play_spotify,
}

# (first landmark index, BGR colour) for each finger overlay. In MediaPipe's
# 21-point hand model index 0 is the wrist and the fingers occupy
# 1-4, 5-8, 9-12, 13-16 and 17-20, so the slices must start at 1, not 0.
FINGER_OVERLAYS = [
    (1, (0, 0, 255)),
    (5, (0, 255, 0)),
    (9, (255, 0, 0)),
    (13, (0, 255, 255)),
    (17, (255, 0, 255)),
]
LANDMARKS_PER_FINGER = 4

LANDMARK_SPEC = mp_drawing.DrawingSpec(color=(255, 255, 255), thickness=2, circle_radius=2)
CONNECTION_SPEC = mp_drawing.DrawingSpec(color=(255, 255, 255), thickness=2, circle_radius=2)


def _to_pixel(landmark, width, height):
    """Scale a landmark's normalised (x, y) to integer pixel coordinates."""
    return int(landmark.x * width), int(landmark.y * height)


def _is_hidden(landmark):
    """Return True only when a visibility value is present and not positive.

    NOTE(review): MediaPipe Hands does not appear to populate `visibility`
    (an unset field reads as 0.0), which made the previous unconditional
    `visibility > 0` test reject every finger - confirm against the
    installed MediaPipe version.
    """
    return landmark.HasField('visibility') and landmark.visibility <= 0


# Open the camera
cap = cv2.VideoCapture(0)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to capture image")
            break

        # MediaPipe expects RGB; OpenCV delivers BGR. cvtColor returns a new
        # array, so frame_rgb stays free of the overlays drawn on `frame` below.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect hands in the frame
        results = hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            # Classify the clean RGB copy rather than the annotated BGR frame:
            # `transform` starts with ToPILImage and normalises with
            # RGB-ordered statistics, and the drawn overlays are not part of
            # the scene. Assumes the checkpoint was trained on RGB images -
            # TODO confirm against the training pipeline.
            prediction = predict_gesture(frame_rgb)

            height, width = frame.shape[:2]
            for hand_landmarks in results.multi_hand_landmarks:
                # Draw the hand skeleton in white
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS,
                                          landmark_drawing_spec=LANDMARK_SPEC,
                                          connection_drawing_spec=CONNECTION_SPEC)
                # Overlay one coloured base-to-tip line per finger
                for first_index, finger_color in FINGER_OVERLAYS:
                    finger_landmarks = hand_landmarks.landmark[first_index:first_index + LANDMARKS_PER_FINGER]
                    if any(_is_hidden(landmark) for landmark in finger_landmarks):
                        continue
                    cv2.line(frame,
                             _to_pixel(finger_landmarks[0], width, height),
                             _to_pixel(finger_landmarks[-1], width, height),
                             finger_color, 2)

            # Display the prediction
            cv2.putText(frame, f"Gesture: {prediction}", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

            # Trigger the mapped action, if any. Note this runs on every frame
            # in which the gesture is recognised.
            action = GESTURE_ACTIONS.get(prediction)
            if action is not None:
                action()
        else:
            cv2.putText(frame, "No hand detected", (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        cv2.imshow('Hand Gesture Recognition', frame)

        # Exit when 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera, the detector and the windows - including when
    # the kernel is interrupted (KeyboardInterrupt) or a frame raises.
    cap.release()
    hands.close()
    cv2.destroyAllWindows()


I0000 00:00:1710480574.944105 1723948 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1 Pro
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


KeyboardInterrupt: 