Imports

In [149]:
import mediapipe as mp
import cv2
import numpy as np

The `FaseGestureDetector` class analyzes facial gesture and features using Mediapipe landmarks. 
It provides methods to extract 2D coordinates of ears, nose, and lips, and determines head direction 
(`left`, `right`, `center`, or `undifined`) and lips position (`mouth_open`, `mouth_closed`, or `undifined`).

In [166]:
class FaseGestureDetector():
    def __init__(self, head_dir_fact=0.5, lips_pos_fact = 1):
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_holistic = mp.solutions.holistic
        self.holistic = self.mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

        self.draw_fasemesh_landmarks = False
        self.draw_pose_landmarks = True
        self.head_dir_fact = head_dir_fact  # determines the head direction detection sensitivity. (0< and >1) 
        self.lips_pos_fact = lips_pos_fact  # determins the lips position detection sensitivity. (<0 and <2)

    def ears(self, pose_landmarks, image_shape):
        """ Returns 2D coordinates of left and right ears based on the pose landmarks estimated by mediapipe"""
        if pose_landmarks:
            landmarks = pose_landmarks.landmark
            left_ear_2d = np.array([landmarks[mp.solutions.pose.PoseLandmark.LEFT_EAR].x * image_shape[1],
                                landmarks[mp.solutions.pose.PoseLandmark.LEFT_EAR].y * image_shape[0]], dtype=int)
            right_ear_2d = np.array([landmarks[mp.solutions.pose.PoseLandmark.RIGHT_EAR].x * image_shape[1],
                                landmarks[mp.solutions.pose.PoseLandmark.RIGHT_EAR].y * image_shape[0]], dtype=int)
            return left_ear_2d, right_ear_2d
        return None, None
    
    def nose(self, pose_landmarks, image_shape):
        """ Returns 2D coordinates of the nose based on the pose landmarks estimated by mediapipe"""
        if pose_landmarks:
            landmarks = pose_landmarks.landmark
            nose_2d = np.array([landmarks[mp.solutions.pose.PoseLandmark.NOSE].x * image_shape[1],
                                landmarks[mp.solutions.pose.PoseLandmark.NOSE].y * image_shape[0]], dtype=int)
            return nose_2d
        return None
    
    def lips(self, facemesh_landmarks, image_shape):
        """ Returns 2D coordinates of upper and lower lips base of the facemesh landmarks estimated by mediapipe"""
        if facemesh_landmarks:
            landmarks = facemesh_landmarks.landmark
            upper_lip_2d = np.array([landmarks[0].x * image_shape[1],
                                landmarks[0].y * image_shape[0]], dtype=int)      # Index 0 in the facemesh landmarks belongs to the upper lip
            lower_lip_2d = np.array([landmarks[17].x * image_shape[1],
                                landmarks[17].y * image_shape[0]], dtype=int)     # Index 17 in the facemesh landmarks belongs to the lower lip
            return upper_lip_2d, lower_lip_2d
        return None, None
    
    def detect_face_gesture(self, frame):
        """
        Detects the face gesture based on the pose and facemesh landmarks provided by Mediapipe.

        Returns:
            A dictionary containing:
            - 'head_direction': The direction of the head ('left', 'right', 'center', or None).
            - 'lips_position': The position of the lips ('mouth_open', 'mouth_closed', or None).
            An Image with the drawings of the landmarks.
        """
        face_gesture = {'face_detected': False,
                        'head_direction': None, 
                        'lips_position': None}
        
        # Recolor the frame from GBR to RGB
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Make Detections
        results = self.holistic.process(frame)
        # Recolor image back to BGR for rendering
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        # (image height, image width) of the image in pixels
        image_shape = (frame.shape[0], frame.shape[1])
        
        if not results.pose_landmarks:
            return face_gesture, frame
        
        # drawing the landmarks on the image
        if self.draw_pose_landmarks:
            self.mp_drawing.draw_landmarks(frame, results.pose_landmarks, self.mp_holistic.POSE_CONNECTIONS)
        if self.draw_fasemesh_landmarks:
            self.mp_drawing.draw_landmarks(frame, results.face_landmarks, self.mp_holistic.FACEMESH_TESSELATION)
        
        # 2D coordinates of the left and right ears in pixels: (px, py)
        left_ear_2d, right_ear_2d = self.ears(results.pose_landmarks, image_shape)
        # 2D coordinates of the nose in pixels: (px, py)
        nose_2d = self.nose(results.pose_landmarks, image_shape)
        # 2D coordinates of the upper and lower lips in pixels: (px, py)
        upper_lip_2d, lower_lip_2d = self.lips(results.face_landmarks, image_shape)

        # detecting the head direction
        if left_ear_2d is not None and right_ear_2d is not None and nose_2d is not None: 
            # when the face is turned to left, from camera perspective (x-y plane), left ear is closer to the nose
            if abs(left_ear_2d[0] - nose_2d[0]) / abs(right_ear_2d[0] - nose_2d[0]) < 1 - self.head_dir_fact:
                face_gesture['head_direction'] = "left"
            # when the face is turned to write, from camera perspective (x-y plane), right ear is closer to the nose
            elif abs(left_ear_2d[0] - nose_2d[0]) / abs(right_ear_2d[0] - nose_2d[0]) > 1 + self.head_dir_fact:
                face_gesture['head_direction'] = "right"
            # when the face is looking forward, left and right ears are in a same distance from the noce from the camera perspective
            else:
                face_gesture['head_direction'] = "center"   

        # detecting the lips' position
        if upper_lip_2d is not None and lower_lip_2d is not None:
            if lower_lip_2d[1] - upper_lip_2d[1] > 30 * self.lips_pos_fact:
                face_gesture['lips_position'] = "mouth_open"
            else:
                face_gesture['lips_position']= "mouth_closed"

        return face_gesture, frame
        

In [167]:
# Initialize the face gesture detector
face_gesture_detector = FaseGestureDetector()
face_gesture_detector.draw_pose_landmarks = True
face_gesture_detector.draw_fasemesh_landmarks = False

cap = cv2.VideoCapture(0)
    
while cap.isOpened():
    ret, frame = cap.read()
        
    face_gesture, image = face_gesture_detector.detect_face_gesture(frame)

    cv2.putText(image, f"head direction: {face_gesture['head_direction']}", (20, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 255), 2)
    cv2.putText(image, f"lips position: {face_gesture['lips_position']}", (20, 80), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 255), 2)

    cv2.imshow('Raw Webcam Feed', image)

    if cv2.waitKey(10) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

  if abs(left_ear_2d[0] - nose_2d[0]) / abs(right_ear_2d[0] - nose_2d[0]) < 1 - self.head_dir_fact:
  elif abs(left_ear_2d[0] - nose_2d[0]) / abs(right_ear_2d[0] - nose_2d[0]) > 1 + self.head_dir_fact:


KeyboardInterrupt: 

In [165]:
cap.release()
cv2.destroyAllWindows()