# Speech to Control Code

In [2]:
import speech_recognition as sr
import pyautogui
import threading
import time

# Single recognizer instance reused by record_text() for every capture.
r = sr.Recognizer()
# Shorten pyautogui's built-in pause between its calls to 0.05 seconds.
pyautogui.PAUSE = 0.05

# Maps a recognized word to the key name handed to pyautogui.press().
# Besides the four direction words, "write" and "town" are mapped to
# right/down: the captured output below this cell shows the recognizer
# returning those words. "aap" is presumably a similar mishearing of
# "up" -- confirm against real recognizer output.
keywords = {
    "left": "left",
    "right": "right",
    "down": "down",
    "up": "up",
    "write": "right",
    "town": "down",
    "aap": "up",
}

def record_text():
    """Capture one short phrase from the microphone and transcribe it.

    Opens the default microphone, samples ambient noise briefly, records a
    phrase of at most 1.5 seconds and sends it to the Google recognizer.

    Returns:
        The lower-cased transcript, or ``None`` when the audio could not be
        understood or the recognition request failed. Retrying is left to the
        caller, which simply calls this function again.
    """
    # NOTE: the previous version wrapped this body in ``while True`` although
    # every path returned on the first pass; the loop never repeated, so it
    # has been removed to make the single-attempt contract explicit.
    try:
        with sr.Microphone() as source:
            r.adjust_for_ambient_noise(source, duration=0.04)
            print("Listening... Please say a direction or keyword...")
            audio = r.listen(source, phrase_time_limit=1.5)
            recognized = r.recognize_google(audio, language="en-US").lower()
            print(f"Recognized text: {recognized}")
            return recognized
    except sr.UnknownValueError:
        print("Could not understand audio. Please try again.")
        # Short pause before the caller starts the next listening attempt.
        time.sleep(0.5)
        return None
    except sr.RequestError as e:
        print(f"Could not request results: {e}")
        return None

def process_and_trigger_keypress(text):
    """Press the key mapped to each known keyword found in *text*.

    Args:
        text: Recognized phrase; it is split on whitespace and every word is
            looked up in the module-level ``keywords`` mapping.

    Each distinct keyword fires once, in the order it was spoken. The earlier
    implementation iterated a set intersection, so the press order for a
    phrase such as "left up" depended on string hashing rather than on the
    phrase itself.
    """
    # dict.fromkeys drops repeated words while keeping first-occurrence order.
    for word in dict.fromkeys(text.split()):
        action = keywords.get(word)
        if action is None:
            continue
        print(f"Triggering '{action}' key for '{word}'")
        pyautogui.press(action)

def process_in_thread(detected_text):
    """Hand *detected_text* to process_and_trigger_keypress on a new thread.

    The thread is started and not joined, so the caller returns immediately.
    """
    worker = threading.Thread(
        target=process_and_trigger_keypress,
        args=(detected_text,),
    )
    worker.start()

def main():
    """Listen in a loop, dispatching each phrase until "please stop" is heard.

    Empty or failed recognitions (``None`` / empty string) are skipped and the
    loop listens again.
    """
    while True:
        detected_text = record_text()
        if not detected_text:
            continue

        if "please stop" in detected_text:
            print("Stopping the program as 'please stop' was detected.")
            return

        process_in_thread(detected_text)


if __name__ == "__main__":
    main()


Listening... Please say a direction or keyword...
Recognized text: write direction
Triggering 'right' key for 'write'
Listening... Please say a direction or keyword...
Recognized text: down direction
Triggering 'down' key for 'down'
Listening... Please say a direction or keyword...
Recognized text: left direction
Triggering 'left' key for 'left'
Listening... Please say a direction or keyword...
Recognized text: down direction
Triggering 'down' key for 'down'
Listening... Please say a direction or keyword...
Recognized text: write direction
Triggering 'right' key for 'write'
Listening... Please say a direction or keyword...
Recognized text: right direction
Triggering 'right' key for 'right'
Listening... Please say a direction or keyword...
Recognized text: town direction
Triggering 'down' key for 'town'
Listening... Please say a direction or keyword...
Recognized text: left direction
Triggering 'left' key for 'left'
Listening... Please say a direction or keyword...
Recognized text: town

# Head Gesture to Control Code

In [1]:
import cv2
import mediapipe as mp
import time

# Short alias for MediaPipe's holistic solution; the capture loop below opens
# mp_holistic.Holistic and reads face_landmarks from its results.
mp_holistic = mp.solutions.holistic

def mediapipe_detection(image, model):
    """Run *model* on a BGR frame.

    Returns:
        ``(bgr_image, results)`` where ``bgr_image`` is the frame after a
        BGR -> RGB -> BGR round trip and ``results`` is whatever
        ``model.process`` returned for the RGB version.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The array is flagged read-only only for the duration of the model call.
    rgb.flags.writeable = False
    results = model.process(rgb)
    rgb.flags.writeable = True
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR), results

def get_nose_coordinates(face_landmarks, image_shape):
    """Return landmark 1 (used here as the nose tip) in pixel coordinates.

    Args:
        face_landmarks: Object exposing ``.landmark`` with normalized
            ``x``/``y`` attributes, or a falsy value when no face was found.
        image_shape: ``(height, width, channels)`` of the frame.

    Returns:
        ``(x, y)`` as ints, or ``(None, None)`` when *face_landmarks* is falsy.
    """
    if not face_landmarks:
        return None, None

    rows, cols, _ = image_shape
    tip = face_landmarks.landmark[1]
    return int(tip.x * cols), int(tip.y * rows)

def get_lips_distance(face_landmarks, image_shape):
    """Return the pixel distance between landmarks 13 and 14 (the lips).

    Args:
        face_landmarks: Object exposing ``.landmark`` with normalized
            ``x``/``y`` attributes, or a falsy value when no face was found.
        image_shape: ``(height, width, channels)`` of the frame.

    Returns:
        The Euclidean distance in whole pixels (truncated), or ``0`` when
        *face_landmarks* is falsy.
    """
    if not face_landmarks:
        return 0

    import math  # local import: this cell's import list does not include math

    height, width, _ = image_shape
    upper_lip = face_landmarks.landmark[13]  # Upper lip
    lower_lip = face_landmarks.landmark[14]  # Lower lip

    # Scale each axis by its own image dimension before measuring. The old
    # code measured in normalized units and multiplied by height only, which
    # mis-scaled the horizontal component on non-square frames.
    dx_pixels = (upper_lip.x - lower_lip.x) * width
    dy_pixels = (upper_lip.y - lower_lip.y) * height
    return int(math.hypot(dx_pixels, dy_pixels))

# Preview loop: calibrates a reference nose position for 5 seconds, then
# overlays the detected head direction and mouth-open ("Space") gesture on the
# webcam feed. No keys are pressed in this cell. Press 'q' to quit.
cap = cv2.VideoCapture(0)

# One title for every imshow call. The calibration phase used to show a
# differently named window, which left two windows open.
window_name = "Head Movement and Space Gesture Detection"

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        reference_position = None
        start_time = time.time()
        initialized = False

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            image, results = mediapipe_detection(frame, holistic)

            if results.face_landmarks:
                nose_x, nose_y = get_nose_coordinates(results.face_landmarks, frame.shape)

                if not initialized:
                    elapsed_time = time.time() - start_time
                    # Clamp so the countdown never shows a negative number when
                    # a face first appears after the 5 seconds have passed.
                    remaining = max(0, 5 - int(elapsed_time))
                    cv2.putText(frame, f"Initializing... {remaining} sec",
                                (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    if elapsed_time >= 5:
                        # The nose position on this frame becomes the neutral pose.
                        reference_position = (nose_x, nose_y)
                        initialized = True
                    else:
                        cv2.imshow(window_name, frame)
                        if cv2.waitKey(10) & 0xFF == ord('q'):
                            break
                        continue

                if reference_position:
                    ref_x, ref_y = reference_position
                    dx = nose_x - ref_x
                    dy = nose_y - ref_y

                    # Vertical movement takes priority over horizontal. The
                    # horizontal labels are deliberately swapped relative to the
                    # image x axis (presumably because the unflipped camera image
                    # mirrors the user -- confirm on the target setup).
                    direction = None
                    if dy < -50:
                        direction = "Up"
                    elif dy > 50:
                        direction = "Down"
                    elif dx < -50:
                        direction = "Right"
                    elif dx > 50:
                        direction = "Left"

                    if direction:
                        cv2.putText(frame, f"Head Movement: {direction}", (10, 100),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                # Mouth opening detection
                lip_distance = get_lips_distance(results.face_landmarks, frame.shape)
                if lip_distance > 20:  # Threshold for mouth opening
                    cv2.putText(frame, "Gesture: Space", (10, 150),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)

                cv2.circle(frame, (nose_x, nose_y), 5, (0, 255, 0), -1)
                cv2.putText(frame, f"Nose Tip: ({nose_x}, {nose_y})", (nose_x + 10, nose_y - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            cv2.imshow(window_name, frame)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if the loop raised
    # (for example on a kernel interrupt).
    cap.release()
    cv2.destroyAllWindows()


In [7]:
import cv2
# Was "import mediapipe as mpq": the code below refers to "mp", so the cell
# only worked when an earlier cell had already imported mediapipe as mp.
import mediapipe as mp
import time
import pyautogui

# Short alias for MediaPipe's holistic solution used by the capture loop below.
mp_holistic = mp.solutions.holistic

def mediapipe_detection(image, model):
    """Feed a BGR frame to *model* and return ``(bgr_image, results)``.

    The frame is converted to RGB for ``model.process`` and converted back to
    BGR before being returned alongside the model's results.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Read-only while the model runs; writeable again afterwards.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

def get_nose_coordinates(face_landmarks, image_shape):
    """Convert landmark 1 (used here as the nose tip) to pixel coordinates.

    Args:
        face_landmarks: Object exposing ``.landmark`` with normalized
            ``x``/``y`` attributes, or a falsy value when no face was found.
        image_shape: ``(height, width, channels)`` of the frame.

    Returns:
        ``(x, y)`` as ints, or ``(None, None)`` when *face_landmarks* is falsy.
    """
    if not face_landmarks:
        return None, None

    frame_h, frame_w, _ = image_shape
    nose = face_landmarks.landmark[1]
    pixel_x = int(nose.x * frame_w)
    pixel_y = int(nose.y * frame_h)
    return pixel_x, pixel_y

def get_lips_distance(face_landmarks, image_shape):
    """Return the pixel distance between landmarks 13 and 14 (the lips).

    Args:
        face_landmarks: Object exposing ``.landmark`` with normalized
            ``x``/``y`` attributes, or a falsy value when no face was found.
        image_shape: ``(height, width, channels)`` of the frame.

    Returns:
        The Euclidean distance in whole pixels (truncated), or ``0`` when
        *face_landmarks* is falsy.
    """
    if not face_landmarks:
        return 0

    import math  # local import: this cell's import list does not include math

    height, width, _ = image_shape
    upper_lip = face_landmarks.landmark[13]
    lower_lip = face_landmarks.landmark[14]

    # Convert each axis to pixels with its own dimension before measuring.
    # Multiplying the normalized distance by height alone (as before)
    # mis-scaled the horizontal component on non-square frames.
    dx_pixels = (upper_lip.x - lower_lip.x) * width
    dy_pixels = (upper_lip.y - lower_lip.y) * height
    return int(math.hypot(dx_pixels, dy_pixels))

# Control loop: calibrates a reference nose position for 5 seconds, then
# presses arrow keys for head movement and space for an open mouth.
# Press 'q' to quit.
cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        reference_position = None
        start_time = time.time()
        initialized = False

        # One flag per key so a held gesture presses its key once, not on
        # every frame.
        key_pressed = {'up': False, 'down': False, 'left': False, 'right': False, 'space': False}
        direction_keys = ('up', 'down', 'left', 'right')

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            image, results = mediapipe_detection(frame, holistic)

            if results.face_landmarks:
                nose_x, nose_y = get_nose_coordinates(results.face_landmarks, frame.shape)

                if not initialized:
                    elapsed_time = time.time() - start_time
                    cv2.putText(frame, f"Initializing... {5 - int(elapsed_time)} sec",
                                (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    if elapsed_time >= 5:
                        # The nose position on this frame becomes the neutral pose.
                        reference_position = (nose_x, nose_y)
                        initialized = True
                    else:
                        cv2.imshow("Head Movement Detection", frame)
                        if cv2.waitKey(10) & 0xFF == ord('q'):
                            break
                        continue

                if reference_position:
                    ref_x, ref_y = reference_position
                    dx = nose_x - ref_x
                    dy = nose_y - ref_y

                    direction = None
                    if dy < -50 and not key_pressed['up']:
                        direction = "Up"
                        pyautogui.press('up')
                        key_pressed['up'] = True
                    elif dy > 50 and not key_pressed['down']:
                        direction = "Down"
                        pyautogui.press('down')
                        key_pressed['down'] = True
                    elif dx > 50 and not key_pressed['left']:
                        direction = "Left"
                        pyautogui.press('left')
                        key_pressed['left'] = True
                    elif dx < -50 and not key_pressed['right']:
                        direction = "Right"
                        pyautogui.press('right')
                        key_pressed['right'] = True

                    if direction:
                        cv2.putText(frame, f"Head Movement: {direction}", (10, 100),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                    # Re-arm the direction keys once the head is back near the
                    # reference. Only the movement flags are cleared: clearing
                    # 'space' here too (as before) made space fire on every
                    # frame while the mouth stayed open with the head centred.
                    if abs(dy) < 50 and abs(dx) < 50:
                        for key in direction_keys:
                            key_pressed[key] = False

                lip_distance = get_lips_distance(results.face_landmarks, frame.shape)

                if lip_distance > 20:
                    if not key_pressed['space']:
                        cv2.putText(frame, "Space", (10, 150),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                        pyautogui.press('space')
                        key_pressed['space'] = True
                else:
                    # Mouth closed again: allow the next opening to press space.
                    key_pressed['space'] = False

                cv2.circle(frame, (nose_x, nose_y), 5, (0, 255, 0), -1)
                cv2.putText(frame, f"Nose Tip: ({nose_x}, {nose_y})", (nose_x + 10, nose_y - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

            # The flag resets now live inside the face branch above. They used
            # to run here unconditionally and raised NameError on lip_distance
            # / dx / dy when no face had been detected yet.

            cv2.imshow("Head Movement Detection", frame)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()


# Hand to Control Code

In [6]:
import cv2
import mediapipe as mp

# Short alias for MediaPipe's hands solution; the capture loop below opens
# mp_hands.Hands and reads multi_hand_landmarks / multi_handedness from it.
mp_hands = mp.solutions.hands

def mediapipe_detection(image, model):
    """Run *model* on a BGR frame.

    Returns:
        ``(bgr_image, results)``: the frame after a BGR -> RGB -> BGR round
        trip, and the value returned by ``model.process`` for the RGB frame.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # The array is flagged read-only only for the duration of the model call.
    rgb.flags.writeable = False
    results = model.process(rgb)
    rgb.flags.writeable = True
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR), results

def highlight_finger_points(image, hand_landmarks):
    """Draw filled circles on the fingertip and MCP landmarks of every hand.

    Args:
        image: Frame to draw on (modified in place).
        hand_landmarks: Iterable of hands, each exposing ``.landmark`` with
            normalized ``x``/``y``; nothing is drawn when it is falsy.
    """
    if not hand_landmarks:
        return

    # (landmark indices, colour) pairs, drawn in this order for each hand:
    # fingertips first, then the MCP joints.
    marker_groups = (
        ((8, 12, 16, 20), (0, 255, 0)),
        ((5, 9, 13, 17), (0, 165, 255)),
    )

    for hand in hand_landmarks:
        for indices, colour in marker_groups:
            for idx in indices:
                point = hand.landmark[idx]
                rows, cols, _ = image.shape
                centre = (int(point.x * cols), int(point.y * rows))
                cv2.circle(image, centre, 8, colour, -1)

def check_conditions(results, image):
    """Classify the first recognizable hand pose in *results*.

    A finger counts as raised when its tip row is strictly above (smaller y
    than) its MCP row after conversion to integer pixels.

    Returns:
        The lower-cased handedness label when all four fingers are raised,
        ``"up"`` when only the index finger is raised, ``"down"`` when all
        four tips are strictly below their MCPs, ``"space"`` when the pinky is
        raised, otherwise ``""`` (also when no hand is present).
    """
    if not results.multi_hand_landmarks:
        return ""

    tip_ids = (8, 12, 16, 20)
    mcp_ids = (5, 9, 13, 17)

    for hand, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
        rows, _, _ = image.shape
        tip_rows = [int(hand.landmark[i].y * rows) for i in tip_ids]
        mcp_rows = [int(hand.landmark[i].y * rows) for i in mcp_ids]

        raised = [tip < mcp for tip, mcp in zip(tip_rows, mcp_rows)]
        lowered = [tip > mcp for tip, mcp in zip(tip_rows, mcp_rows)]

        if all(raised):
            return handedness.classification[0].label.lower()
        if raised[0] and not any(raised[1:]):
            return "up"
        if all(lowered):
            return "down"
        if raised[3]:
            return "space"

    return ""

def display_text(image, text):
    """Overlay ``Action: <text>`` at (50, 50) on *image* in place."""
    cv2.putText(
        image,
        f"Action: {text}",
        (50, 50),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 255, 0),
        2,
    )

# Preview loop: shows the webcam feed with finger markers and the detected
# action label. No keys are pressed in this cell. Press 'q' to quit.
cap = cv2.VideoCapture(0)

try:
    with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            image, results = mediapipe_detection(frame, hands)

            if results.multi_hand_landmarks:
                highlight_finger_points(image, results.multi_hand_landmarks)

            text = check_conditions(results, image)

            # check_conditions returns the handedness label for an open hand;
            # the displayed action is the opposite side (presumably to
            # compensate for the unflipped camera image -- confirm).
            if text == "left":
                text = "right"
            elif text == "right":
                text = "left"

            display_text(image, text)

            cv2.imshow("Hand Gesture Detection", image)

            if cv2.waitKey(10) & 0xFF == ord("q"):
                break
finally:
    # Always free the camera and close the window, even if the loop raised
    # (for example on a kernel interrupt); previously this only ran on a
    # clean exit from the loop.
    cap.release()
    cv2.destroyAllWindows()


In [15]:
import cv2
import mediapipe as mp
import pyautogui
import time  # Import time module

# Short alias for MediaPipe's hands solution; the capture loop below opens
# mp_hands.Hands and reads multi_hand_landmarks / multi_handedness from it.
mp_hands = mp.solutions.hands

def mediapipe_detection(image, model):
    """Feed a BGR frame to *model* and return ``(bgr_image, results)``.

    The frame is converted to RGB for ``model.process`` and converted back to
    BGR before being returned alongside the model's results.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Read-only while the model runs; writeable again afterwards.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

def highlight_finger_points(image, hand_landmarks):
    """Mark fingertip and MCP landmarks of each hand with filled circles.

    Args:
        image: Frame to draw on (modified in place).
        hand_landmarks: Iterable of hands, each exposing ``.landmark`` with
            normalized ``x``/``y``; nothing is drawn when it is falsy.
    """
    if not hand_landmarks:
        return

    tip_colour = (0, 255, 0)
    mcp_colour = (0, 165, 255)
    # Fingertips are drawn before the MCP joints for each hand.
    draw_plan = [
        ([8, 12, 16, 20], tip_colour),
        ([5, 9, 13, 17], mcp_colour),
    ]

    for hand in hand_landmarks:
        for landmark_ids, colour in draw_plan:
            for landmark_id in landmark_ids:
                lm = hand.landmark[landmark_id]
                img_h, img_w, _ = image.shape
                cv2.circle(image, (int(lm.x * img_w), int(lm.y * img_h)), 8, colour, -1)

def check_conditions(results, image):
    """Map the first recognizable hand pose in *results* to an action name.

    A finger counts as raised when its tip row is strictly above (smaller y
    than) its MCP row after conversion to integer pixels.

    Returns:
        ``"right"`` for an open hand labelled "left" and ``"left"`` for any
        other open hand, ``"up"`` when only the index finger is raised,
        ``"down"`` when all four tips are strictly below their MCPs,
        ``"space"`` when the pinky is raised, otherwise ``""`` (also when no
        hand is present).
    """
    if not results.multi_hand_landmarks:
        return ""

    tip_ids = (8, 12, 16, 20)
    mcp_ids = (5, 9, 13, 17)

    for hand, handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
        rows, _, _ = image.shape
        tip_rows = [int(hand.landmark[i].y * rows) for i in tip_ids]
        mcp_rows = [int(hand.landmark[i].y * rows) for i in mcp_ids]

        # Handedness label as reported by the model, lower-cased.
        side = handedness.classification[0].label.lower()

        raised = [tip < mcp for tip, mcp in zip(tip_rows, mcp_rows)]
        lowered = [tip > mcp for tip, mcp in zip(tip_rows, mcp_rows)]

        if all(raised):
            # Open hand: the action is the opposite of the reported side.
            if side == "left":
                return "right"
            return "left"
        if raised[0] and not any(raised[1:]):
            return "up"
        if all(lowered):
            return "down"
        if raised[3]:
            return "space"

    return ""

def trigger_key_action(action, last_action, last_action_time):
    """Press the key for *action* when the gesture changed and 0.15 s elapsed.

    Args:
        action: Gesture name detected on the current frame ("" for none).
        last_action: Gesture name recorded by the previous accepted change.
        last_action_time: ``time.time()`` stamp of that previous change.

    Returns:
        ``(action, now)`` when the change is accepted (a key is pressed only
        for "right", "left", "up", "down" or "space"); otherwise the unchanged
        ``(last_action, last_action_time)``.
    """
    now = time.time()

    if action == last_action:
        return last_action, last_action_time

    cooled_down = now - last_action_time >= 0.15
    if not cooled_down:
        return last_action, last_action_time

    # Every supported action name is also the key name pyautogui expects.
    if action in ("right", "left", "up", "down", "space"):
        pyautogui.press(action)
    return action, now

def display_text(image, text):
    """Overlay *text* at (50, 50) on *image* in place."""
    cv2.putText(
        image,
        text,
        (50, 50),
        cv2.FONT_HERSHEY_SIMPLEX,
        1,
        (0, 255, 0),
        2,
    )

# Control loop: detects the hand gesture on each webcam frame, presses the
# matching key through trigger_key_action and shows the annotated feed.
# Press 'q' to quit.
cap = cv2.VideoCapture(0)

try:
    with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        # State threaded through trigger_key_action between frames.
        last_action = ""
        last_action_time = time.time()

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            image, results = mediapipe_detection(frame, hands)

            if results.multi_hand_landmarks:
                highlight_finger_points(image, results.multi_hand_landmarks)

            action = check_conditions(results, image)
            last_action, last_action_time = trigger_key_action(action, last_action, last_action_time)

            if action:
                display_text(image, action)

            cv2.imshow("Hand Gesture Detection", image)

            if cv2.waitKey(10) & 0xFF == ord("q"):
                break
finally:
    # Always free the camera and close the window, even if the loop raised
    # (for example on a kernel interrupt); previously this only ran on a
    # clean exit from the loop.
    cap.release()
    cv2.destroyAllWindows()
