In [None]:
# pip install keyboard pyautogui pygetwindow pystray pillow opencv-python mediapipe scikit-learn tensorflow pandas numpy

In [4]:
import os
import cv2
import time
import subprocess
import numpy as np
import pyautogui
import keyboard
import pygetwindow as gw
import mediapipe as mp
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [5]:
#Collect samples

# Shortcuts to MediaPipe's hand-tracking solution and its drawing helpers,
# shared by the collection and live-control cells below.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Root folder for recorded samples; collect_data creates one sub-folder per gesture.
DATA_PATH = "static_gestures"
# Number of landmark samples collect_data saves for a gesture before it stops.
NUM_SAMPLES = 200

def collect_data(gesture):
    """Record hand-landmark samples for one gesture from the default webcam.

    Frames are read from camera 0 until NUM_SAMPLES frames containing a hand
    have been saved, the camera stops delivering frames, or 'q' is pressed in
    the preview window. Each saved sample is the (x, y) pair of every landmark
    of the first detected hand, flattened into one array and written to
    ``DATA_PATH/<gesture>/<index>.npy``.

    Files are numbered from 0 on every call, so re-recording a gesture
    overwrites its earlier samples.

    Args:
        gesture: Label of the gesture being recorded; used as the sub-folder name.
    """
    os.makedirs(os.path.join(DATA_PATH, gesture), exist_ok=True)
    cap = cv2.VideoCapture(0)

    # try/finally so the camera and preview window are released even when the
    # loop is interrupted (e.g. kernel interrupt) or a frame raises.
    try:
        with mp_hands.Hands(min_detection_confidence=0.7) as hands:
            count = 0
            while count < NUM_SAMPLES:
                ret, frame = cap.read()
                if not ret:
                    break

                # MediaPipe is fed RGB input; OpenCV delivers BGR frames.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                result = hands.process(rgb)

                if result.multi_hand_landmarks:
                    hand_landmarks = result.multi_hand_landmarks[0]
                    mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                    # Extract 42 features (x, y for 21 landmarks)
                    landmarks = [
                        coord
                        for lm in hand_landmarks.landmark
                        for coord in (lm.x, lm.y)
                    ]

                    np.save(os.path.join(DATA_PATH, gesture, f"{count}.npy"), np.array(landmarks))
                    count += 1
                    cv2.putText(frame, f"Saved: {count}/{NUM_SAMPLES}", (30, 50),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                cv2.imshow("Static Gesture Collection", frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        cap.release()
        cv2.destroyAllWindows()

In [6]:
#Model Training

def train_model():
    """Train a dense classifier on the saved landmark samples and save it.

    Every sub-folder of DATA_PATH is treated as one gesture class and every
    ``.npy`` file inside it as one sample. Labels are encoded with
    LabelEncoder, which numbers classes in sorted order of their names, so
    output unit ``i`` of the model corresponds to the i-th gesture name
    alphabetically. The trained model is written to ``static_gesture_model.h5``.

    Raises:
        ValueError: If no ``.npy`` samples are found under DATA_PATH.
    """
    X, y = [], []
    # Only directories are gesture classes; stray files in DATA_PATH would
    # otherwise make the inner os.listdir fail. Sorted for a reproducible order.
    gestures = sorted(
        name for name in os.listdir(DATA_PATH)
        if os.path.isdir(os.path.join(DATA_PATH, name))
    )

    for gesture in gestures:
        gesture_dir = os.path.join(DATA_PATH, gesture)
        for file in sorted(os.listdir(gesture_dir)):
            if not file.endswith(".npy"):
                continue  # ignore anything that is not a saved sample
            X.append(np.load(os.path.join(gesture_dir, file)))
            y.append(gesture)

    if not X:
        raise ValueError(
            f"No .npy samples found under {DATA_PATH!r}; run collect_data first."
        )

    X = np.array(X)
    y = LabelEncoder().fit_transform(y)
    y = to_categorical(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = Sequential([
        Dense(128, activation="relu", input_shape=(X.shape[1],)),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.3),
        Dense(y.shape[1], activation="softmax")
    ])

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    model.fit(X_train, y_train, epochs=30, validation_data=(X_test, y_test))
    model.save("static_gesture_model.h5")

In [7]:
#Music control

# Class names indexed by the model's output unit. train_model encodes labels
# with LabelEncoder, which numbers classes in sorted order of their names, so
# this list must be sorted as well; otherwise GESTURES[argmax] names the wrong
# gesture ("palm" sorts before "peace", not last).
GESTURES = sorted(["fist", "left", "open_music", "peace", "pinch", "right", "thumbs_up", "palm"])

# Gesture name -> action understood by control_music. Gestures without an
# entry (e.g. "thumbs_up") are recognised but trigger nothing.
gesture_actions = {
    "fist": "play_pause",
    "right": "next_track",
    "left": "previous_track",
    "palm": "volume_up",
    "pinch": "volume_down",
    "peace": "stop",
    "open_music": "open_music"
}

# Timestamp (time.time()) of the last trigger per action, used for the cooldown.
last_triggered = {}
# Minimum number of seconds between two triggers of the same action.
COOLDOWN = 1.5
# URL opened when the "open_music" action has no existing window to act on.
YTM_COMMAND = "https://music.youtube.com"
# Window title used to look up the YouTube Music window.
YTM_TITLE = "YouTube Music"

def get_ytm_window():
    """Return the first window reported for YTM_TITLE, or None when there is none."""
    matches = gw.getWindowsWithTitle(YTM_TITLE)
    if not matches:
        return None
    return matches[0]

def close_ytm():
    """Close the YouTube Music window if one is found.

    Does nothing when no window is found. Any error raised while closing is
    printed rather than propagated.
    """
    window = get_ytm_window()
    if not window:
        return

    try:
        window.close()
        # Pause for a second after issuing the close.
        time.sleep(1)
    except Exception as err:
        print("Error closing window:", err)

def control_music(action):
    """Carry out one music-control action.

    Media actions are sent as a single media-key press. ``"open_music"``
    toggles the YouTube Music window: it launches the site when no window
    exists, closes the window when it is the active one, and otherwise
    restores it. Unknown actions are ignored.

    Args:
        action: One of "play_pause", "next_track", "previous_track",
            "volume_up", "volume_down", "stop" or "open_music".
    """
    # Action name -> key name passed to keyboard.send.
    media_keys = {
        "play_pause": "play/pause media",
        "next_track": "next track",
        "previous_track": "previous track",
        "volume_up": "volume up",
        "volume_down": "volume down",
        "stop": "stop media",
    }

    hotkey = media_keys.get(action)
    if hotkey is not None:
        keyboard.send(hotkey)
        return

    if action != "open_music":
        return

    window = get_ytm_window()
    if not window:
        print("Launching YouTube Music...")
        subprocess.Popen(["explorer", YTM_COMMAND])
        # Blocks the caller for 10 s before sending the key press.
        time.sleep(10)
        pyautogui.press("space")  # auto-play
        return

    if window.isActive:
        print("Closing YouTube Music...")
        close_ytm()
        return

    print("Switching focus...")
    window.restore()

In [8]:
#Main Program

def run_gesture_control():
    """Run the live webcam loop that maps recognised gestures to music actions.

    Loads ``static_gesture_model.h5``, then for every frame from camera 0
    detects up to two hands, classifies each hand's flattened (x, y) landmarks
    and, when the top class scores above 0.8, labels the hand in the preview
    and triggers the mapped action through control_music. The same action is
    not triggered again until COOLDOWN seconds have passed. The loop ends when
    the camera stops delivering frames or 'q' is pressed in the preview window.
    """
    model = tf.keras.models.load_model("static_gesture_model.h5")
    cap = cv2.VideoCapture(0)

    # try/finally so the camera and preview window are released even when the
    # loop is interrupted (e.g. kernel interrupt) or an action handler raises.
    try:
        with mp_hands.Hands(max_num_hands=2, min_detection_confidence=0.7,
                            min_tracking_confidence=0.7) as hands:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # MediaPipe is fed RGB input; OpenCV delivers BGR frames.
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                result = hands.process(rgb)

                if result.multi_hand_landmarks:
                    for hand_landmarks, hand_info in zip(result.multi_hand_landmarks, result.multi_handedness):
                        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                        # Same feature layout as the training samples: x, y per landmark.
                        landmarks = [
                            coord
                            for lm in hand_landmarks.landmark
                            for coord in (lm.x, lm.y)
                        ]

                        prediction = model.predict(np.expand_dims(landmarks, axis=0), verbose=0)[0]
                        pred_class = GESTURES[np.argmax(prediction)]
                        confidence = np.max(prediction)

                        if confidence > 0.8:
                            h, w, _ = frame.shape
                            xs = [int(lm.x * w) for lm in hand_landmarks.landmark]
                            ys = [int(lm.y * h) for lm in hand_landmarks.landmark]
                            x_min, y_min = min(xs), min(ys)

                            label = f"{hand_info.classification[0].label}: {pred_class} ({confidence:.2f})"
                            # Clamp so the label stays inside the frame when the
                            # hand touches the top or left edge.
                            label_pos = (max(x_min, 0), max(y_min - 10, 25))
                            cv2.putText(frame, label, label_pos,
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

                            action = gesture_actions.get(pred_class)
                            if action:
                                now = time.time()
                                # Per-action cooldown so a held gesture does not fire every frame.
                                if now - last_triggered.get(action, 0) > COOLDOWN:
                                    control_music(action)
                                    last_triggered[action] = now

                cv2.imshow("Music Control via Hand Gestures", frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        cap.release()
        cv2.destroyAllWindows()


In [None]:
# collect_data requires the gesture label to record (calling it without one
# raises TypeError). Change the name and re-run this cell once per gesture;
# the names must be the ones listed in GESTURES.
collect_data("fist")

In [None]:
# Train on the samples saved under DATA_PATH and write static_gesture_model.h5.
train_model()

In [10]:
# Start the live webcam loop; press 'q' in the preview window to stop it.
run_gesture_control()



Launching YouTube Music...
