In [11]:
# USAGE
# python speaking_detection.py --shape-predictor shape_predictor_68_face_landmarks.dat
# python speaking_detection.py --shape-predictor shape_predictor_68_face_landmarks.dat --picamera 1

# import the necessary packages
import face_recognition
import face_recognition.api
from imutils.video import VideoStream
from imutils import face_utils
import datetime
import argparse
import imutils
import time
import dlib
import cv2
import numpy as np


def is_speaking(prev_img, curr_img, debug=False, threshold=500, width=400, height=400):
    """
    Args:
        prev_img:
        curr_img:
    Returns:
        Bool value if a person is speaking or not
    """
    prev_img = cv2.resize(prev_img, (width, height))
    curr_img = cv2.resize(curr_img, (width, height))

    diff = cv2.absdiff(prev_img, curr_img)
    norm = np.sum(diff) / (width*height) * 100
    if debug:
        print(norm)
    return norm > threshold

# initialize dlib's face detector (HOG-based) and then create
# the facial landmark predictor
print("[INFO] loading facial landmark predictor...")
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

# grab the indices of the facial landmarks for mouth
m_start, m_end = face_utils.FACIAL_LANDMARKS_IDXS['mouth']

# initialize the video stream and allow the cammera sensor to warmup
print("[INFO] camera sensor warming up...")
camera = cv2.VideoCapture(0)
time.sleep(2.0)

prev_mouth_img = None
i = 0
margin = 10
# loop over the frames from the video stream
while True:
    # grab the frame from the threaded video stream, resize it to
    # have a maximum width of 400 pixels, and convert it to
    # grayscale
    ret, frame = camera.read()
    frame = imutils.resize(frame, width=800)
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # detect faces in the grayscale frame
    faces = detector(gray, 0)
    largest_face = max(faces, key=lambda x: x[2] * x[3] if len(faces) else 0, default=None)

    # loop over the face detections
    if largest_face is not None:
        rect = largest_face
        # determine the facial landmarks for the face region, then
        # convert the facial landmark (x, y)-coordinates to a NumPy
        # array
        shape = predictor(gray, rect)
        shape = face_utils.shape_to_np(shape)

        # face_recognition.face_locations(frame)

        mouth_shape = shape[m_start:m_end+1]

        leftmost_x = min(x for x, y in mouth_shape) - margin
        bottom_y = min(y for x, y in mouth_shape) - margin
        rightmost_x = max(x for x, y in mouth_shape) + margin
        top_y = max(y for x, y in mouth_shape) + margin

        w = rightmost_x - leftmost_x
        h = top_y - bottom_y

        x = int(leftmost_x - 0.1 * w)
        y = int(bottom_y - 0.1 * h)

        w = int(1.2 * w)
        h = int(1.2 * h)

        mouth_img = gray[bottom_y:top_y, leftmost_x:rightmost_x]

        # loop over the (x, y)-coordinates for the facial landmarks
        # and draw them on the image
        # for (x, y) in mouth_shape:
            # cv2.circle(frame, (x, y), 1, (0, 0, 255), -1)
        cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)

        # confer this
        # https://github.com/seanexplode/LipReader/blob/master/TrackFaces.c#L68
        if prev_mouth_img is None:
            prev_mouth_img = mouth_img
        if is_speaking(prev_mouth_img, mouth_img, threshold=700,
                                debug=True):
            print(str(i), "speaking")
            i += 1

        prev_mouth_img = mouth_img
        
    # show the frame
    cv2.imshow("Frame", frame)
    key = cv2.waitKey(1) & 0xFF

    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
        break

# do a bit of cleanup
cv2.destroyAllWindows()
vs.stop()

[INFO] loading facial landmark predictor...
[INFO] camera sensor warming up...


TypeError: '_dlib_pybind11.rectangle' object is not subscriptable

In [6]:
import cv2
from cv2.data import haarcascades
import face_recognition
from deepface.DeepFace import extract_faces
import numpy as np

camera = cv2.VideoCapture(0)
face_cascade = cv2.CascadeClassifier(haarcascades + "haarcascade_frontalface_default.xml")


while True:

    ret, frame = camera.read()

    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Convert grayscale frame to RGB format
    rgb_frame = cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2RGB)

    # Detect faces in the frame
    faces = face_cascade.detectMultiScale(gray_frame, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
    largest_face = max(faces, key=lambda x: x[2] * x[3], default=None)

    if largest_face is not None:
        x, y, w, h = largest_face
        face_roi = rgb_frame[y : y + h, x : x + w]

        face_landmarks_list = face_recognition.face_landmarks(face_roi, model="large")

        for face_landmarks in face_landmarks_list:
            # Print the location of each facial feature in this image
            facial_features = [
                'chin',
                'left_eyebrow',
                'right_eyebrow',
                'nose_bridge',
                'nose_tip',
                'left_eye',
                'right_eye',
                'top_lip',
                'bottom_lip'
            ]

            # Let's trace out each facial feature in the image with a line!
            for facial_feature in facial_features:
                cv2.polylines(frame, [np.array(face_landmarks[facial_feature], np.int32) + (x, y)], isClosed=False, color=(0, 255, 0), thickness=2)

        
    # Display the resulting frame
    cv2.imshow("Real-time Emotion Detection", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

KeyboardInterrupt: 

: 

In [6]:
import cv2
from gaze_tracking import GazeTracking

gaze = GazeTracking()
webcam = cv2.VideoCapture(0)

while True:
    _, frame = webcam.read()
    gaze.refresh(frame)

    new_frame = gaze.annotated_frame()
    text = ""

    if gaze.is_right():
        text = "Looking right"
    elif gaze.is_left():
        text = "Looking left"
    elif gaze.is_center():
        text = "Looking center"

    cv2.putText(new_frame, text, (60, 60), cv2.FONT_HERSHEY_DUPLEX, 2, (255, 0, 0), 2)
    cv2.imshow("Demo", new_frame)

    if cv2.waitKey(1) == 27:
        break

Action: emotion: 100%|██████████| 4/4 [00:00<00:00, 12.13it/s]


In [2]:
import cv2
from cv2.data import haarcascades
from deepface.DeepFace import extract_faces

camera = cv2.VideoCapture(0)
face_cascade = cv2.CascadeClassifier(haarcascades + "haarcascade_frontalface_default.xml")


while True:

    ret, frame = camera.read()

    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Convert grayscale frame to RGB format
    rgb_frame = cv2.cvtColor(gray_frame, cv2.COLOR_GRAY2RGB)

    # Detect faces in the frame
    faces = face_cascade.detectMultiScale(gray_frame, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
    largest_face = max(faces, key=lambda x: x[2] * x[3], default=None)

    if largest_face is not None:
        x, y, w, h = largest_face
        face_roi = rgb_frame[y : y + h, x : x + w]

        result = analyze(face_roi, actions=["emotion"], enforce_detection=False)

        # Determine the dominant emotion
        emotion = result[0]["dominant_emotion"]
        print(result[0]["face_confidence"])

        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)
        cv2.putText(frame, emotion, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)

        # Define the colors for each emotion
        emotion_colors = {
            'angry': (0, 0, 255),
            'disgust': (0, 255, 0),
            'fear': (255, 0, 0),
            'happy': (255, 255, 0),
            'sad': (0, 255, 255),
            'surprise': (255, 0, 255),
            'neutral': (128, 128, 128)
        }

        # Define the position and size of the emotion bars
        bar_x = 30
        bar_y = 20
        bar_width = 150
        bar_height = 20
        bar_distance = 10

        emotions: dict[str, float] = result[0]["emotion"]
        i = 0
        # Draw the emotion bars
        for emotion, probability in emotions.items():
            # Calculate the height of the bar based on the probability
            bar_length = int(probability / 100 * bar_width)
            
            bar_offset = i * (bar_height + bar_distance)

            # Calculate the position of the top-left corner of the bar
            bar_top_left = (bar_x, bar_y + bar_offset)

            # Calculate the position of the bottom-right corner of the bar
            bar_bottom_right = (bar_x + bar_length, bar_y + bar_height + bar_offset)

            # Draw the bar
            cv2.rectangle(frame, bar_top_left, bar_bottom_right, emotion_colors[emotion], -1)

            # Add the emotion label
            cv2.putText(frame, emotion, (bar_x + bar_width + 30, bar_bottom_right[1]), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255, 255, 255), 2)
            i += 1

    # for result in results:
    #     # Extract the face ROI (Region of Interest)
    #     # face_roi = rgb_frame[y : y + h, x : x + w]

    #     # Perform emotion analysis on the face ROI
    #     # result = analyze(face_roi, actions=["emotion"], enforce_detection=False)
    #     x,y,w,h,_,_ = result["region"].values()

    #     # Determine the dominant emotion
    #     emotion = result["dominant_emotion"]

    #     # Draw rectangle around face and label with predicted emotion
    #     cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)
    #     cv2.putText(frame, emotion, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)


    # Display the resulting frame
    cv2.imshow("Real-time Emotion Detection", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0.96
0.93
0.92
0.92
0.94
0.94
0.94
0.94
0.94
0.91
0.93
0.92
0.92
0.92
0.92
0.91
0.92
0.91
0.92
0.94
0.95
0.96
0.95
0.94
0.93
0
0.93
0.93
0.94
0.95
0.94
0.93
0.94
0.96
0.96
0.97
0
0
0
0
0
0
0
0
0
0
0
0.99
0.97
0.98
0.98
0.99
0.98
0.96
0.95
0.97
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0.98
0.98
0.96
0.98
0.96
0.96
0.95
0.94
0.93
0.92
0.92
0.93
0.94
0.94
0.95
0.96
0.93
0.94
0.95
0.92
0.93
0.94
0.92
0.91
0.91
0.94
0.94
0.94
0.94
0.94
0.96
0.96
0.96
0.96
0.96
0.95
0.95
0.92
0.94
0.94
0.95
0.94
0.96
0.95
0.94
0.93
0.93
0.95
0.94
0.95
0.95
0.95
0.98
0.96
0
0
0
0
0.99
0.95
0.95
0.96
0.96
0.98
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0.96
0
0.96
0
0.98
0.96
0
0
0.96
0
0.97
0.97
0.96
0.97
0
0
0.98
0.93
0.97
0.95
0
0
0.97
0.97
0.93
0.95
0.97
0.96
0.98
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0.99
0.99
0.98
0.99
0.99
0.99
0
0.98
0
0.98
0.97
1.0
0.99
0.99
0.98
0.99
0.9

KeyboardInterrupt: 

: 