In [1]:
from ultralytics import YOLO
import mediapipe as mp
import cv2
import numpy as np
import math
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from scipy.spatial.transform import Rotation as R

In [44]:
class HeadPose:
    """Approximate per-face head rotation for still images and videos.

    Faces are located with a YOLO face detector, every face crop is passed to
    MediaPipe FaceMesh, and yaw / pitch / roll are estimated from a handful of
    mesh landmarks with simple geometric heuristics (no PnP solve is done).
    """

    # Landmark labels and the FaceMesh indices they are read from (same order).
    _POINT_NAMES = ('nose', 'chin', 'left eye inner', 'right eye inner', 'mouth left', 'mouth right', 'right eye outer', 'left eye outer')
    _POINT_INDICES = (1, 152, 130, 359, 78, 308, 362, 133)

    def __init__(self, model_path=r'C:/Users/Stepan/Documents/ClassMood/yunet_headpose/yolov8n-face-lindevs.pt'):
        """Load the face detector and the FaceMesh model.

        Args:
            model_path: weights file for the YOLO face detector. Defaults to
                the location the notebook was originally written against.
        """
        self.bb_detection = YOLO(model_path)
        self.face_mesh = mp.solutions.face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=1,
            refine_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        # Filled in by calibrate_matrix(); None until calibration has run.
        self.matrix = None
        self.dist_coeffs = None

    @staticmethod
    def _read_image(path):
        """Read an image from disk, raising instead of returning None."""
        image = cv2.imread(str(path))
        if image is None:
            raise OSError(f'Cannot read image: {path}')
        return image

    @staticmethod
    def _sample_video_frames(video, n_samples=5):
        """Return roughly evenly spaced frames of a video (every total // n_samples-th frame)."""
        cap = cv2.VideoCapture(video)
        try:
            if not cap.isOpened():
                raise OSError(f'Cannot open video: {video}')

            total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
            fps = cap.get(cv2.CAP_PROP_FPS)
            if fps <= 0:
                raise ValueError(f'Video reports a non-positive FPS: {video}')
            if (total_frames / fps) < 2:
                raise Exception('Пожалуйста, введите видео длинной более двух секунд')

            # Never let the stride reach 0, otherwise the modulo below divides by zero.
            frame_skip = max(int(total_frames // n_samples), 1)
            frames = []
            frame_count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_count % frame_skip == 0:
                    frames.append(frame)
                frame_count += 1
            return frames
        finally:
            # Always hand the file/device handle back, even when validation fails.
            cap.release()

    def calibrate_matrix(self, chb_size, images=None, video=None):
        """Calibrate the camera from chessboard photos or a chessboard video.

        Stores the distortion coefficients in ``self.dist_coeffs`` and the
        matrix returned by ``cv2.getOptimalNewCameraMatrix`` in ``self.matrix``.

        Args:
            chb_size: two-item sequence; one is subtracted from each entry to
                get the inner-corner grid passed to
                ``cv2.findChessboardCorners`` (so it presumably counts squares
                per side - confirm against the printed board).
            images: list of image paths (at least 10). Takes precedence over
                ``video`` when non-empty.
            video: video source, used when ``images`` is empty or None.

        Raises:
            Exception: fewer than 10 images, or a video shorter than 2 seconds.
            ValueError: neither input given, bad FPS, or no chessboard found.
            OSError: an image or the video cannot be read.
        """
        chessboard_size = (chb_size[0] - 1, chb_size[1] - 1)
        # Board corners on the z = 0 plane, in units of one square.
        objp = np.zeros((chessboard_size[0] * chessboard_size[1], 3), dtype=np.float32)
        objp[:, :2] = np.mgrid[0:chessboard_size[0], 0:chessboard_size[1]].T.reshape(-1, 2)
        objpoints, imgpoints = [], []

        if images:
            if len(images) < 10:
                raise Exception('Пожалуйста, введите не менее 10 фотографий')
            # Lazy generator: only one decoded photo is alive at a time.
            frames = (self._read_image(image_path) for image_path in images)
        else:
            if video is None:
                raise ValueError('Provide either `images` or `video` for calibration')
            frames = self._sample_video_frames(video)

        subpix_criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001)
        image_size = None
        for frame in frames:
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            image_size = (gray.shape[1], gray.shape[0])  # (w, h) of the latest frame
            found, corners = cv2.findChessboardCorners(gray, chessboard_size, None)

            if found:
                objpoints.append(objp)
                corners_refined = cv2.cornerSubPix(gray, corners, (11, 11), (-1, -1), subpix_criteria)
                imgpoints.append(corners_refined)

        if not objpoints:
            raise ValueError('Chessboard pattern was not detected in any calibration frame')

        ret, camera_matrix, self.dist_coeffs, rvecs, tvecs = cv2.calibrateCamera(objpoints, imgpoints, image_size, None, None)
        # getOptimalNewCameraMatrix returns (matrix, roi); only the matrix is kept.
        self.matrix = cv2.getOptimalNewCameraMatrix(camera_matrix, self.dist_coeffs, image_size, 1, image_size)[0]

    @staticmethod
    def _approximate_angles(landmarks_3d, roi_w, frame_w, coords):
        """Turn named landmarks of one face into a (yaw, pitch, roll) tuple.

        Args:
            landmarks_3d: mapping of landmark label -> (x, y, z).
            roi_w: width in pixels of the face crop the landmarks came from.
            frame_w: width in pixels of the full frame.
            coords: detector box (x1, y1, x2, y2) of the face in the full frame.
        """
        nose, chin = landmarks_3d['nose'], landmarks_3d['chin']
        left_inner, right_inner = landmarks_3d['left eye inner'], landmarks_3d['right eye inner']
        left_outer, right_outer = landmarks_3d['left eye outer'], landmarks_3d['right eye outer']

        # NOTE(review): atan2 receives (dx, dy), so two level points with the
        # first one further right give 90 degrees rather than 0 - confirm that
        # this convention is intended before relying on absolute roll values.
        roll = math.degrees(math.atan2(right_outer[0] - left_outer[0], right_outer[1] - left_outer[1]))

        # Yaw heuristic: difference between the nose-to-eye distances of both sides.
        left_dist = math.hypot(left_inner[0] - nose[0], left_inner[1] - nose[1])
        right_dist = math.hypot(right_inner[0] - nose[0], right_inner[1] - nose[1])
        yaw_approx = (left_dist - right_dist) / roi_w * 100000

        # Pitch heuristic: summed depth of nose tip and chin.
        pitch_approx = (nose[2] + chin[2]) * -100

        # Yaw is additionally scaled by the box position (x1 + x2) relative to the frame width.
        yaw = yaw_approx / (frame_w / max((coords[0] + coords[2]), 0.1 ** 6))
        return (yaw, pitch_approx, roll)

    def frame_headpose(self, path):
        """Estimate head rotation for every face in one image.

        Args:
            path: image file path, or an already decoded BGR ``numpy`` array.

        Returns:
            ``{face_id: (yaw, pitch, roll)}`` where ``face_id`` is the index of
            the detector box. Faces for which FaceMesh finds no landmarks (or
            whose crop is empty) are left out. ``None`` if the detector
            returns no results at all.

        Raises:
            OSError: ``path`` is not an array and cannot be read as an image.
        """
        # Decide explicitly between "array" and "path" instead of relying on
        # cv2.imread raising, which also hid unreadable files.
        image = path if isinstance(path, np.ndarray) else self._read_image(path)

        bb_results = self.bb_detection(path, conf=0.4)
        if not bb_results:
            return None

        h, w = image.shape[:2]
        # .cpu() keeps this working when the detector runs on a GPU.
        boxes = bb_results[0].boxes.xyxy.cpu().numpy()
        head_rotations = dict()

        for face_id, coords in enumerate(boxes):
            # Clamp to the frame: a negative start index would wrap around in the slice.
            x1, y1 = max(int(coords[0]), 0), max(int(coords[1]), 0)
            x2, y2 = int(coords[2]), int(coords[3])
            face_roi = image[y1:y2, x1:x2]
            if face_roi.size == 0:
                continue

            mesh_results = self.face_mesh.process(cv2.cvtColor(face_roi, cv2.COLOR_BGR2RGB))
            # multi_face_landmarks is None when FaceMesh finds no face in the crop.
            if not mesh_results.multi_face_landmarks:
                continue

            roi_w = face_roi.shape[1]
            face_landmarks = mesh_results.multi_face_landmarks[-1].landmark
            landmarks_3d = {
                name: (face_landmarks[idx].x, face_landmarks[idx].y, face_landmarks[idx].z)
                for name, idx in zip(self._POINT_NAMES, self._POINT_INDICES)
            }
            head_rotations[face_id] = self._approximate_angles(landmarks_3d, roi_w, w, coords)

        return head_rotations

    def video_headpose(self, path):
        """Estimate head rotation on every fifth frame of a video.

        Args:
            path: video source passed to ``cv2.VideoCapture``.

        Returns:
            ``{timestamp_seconds: {face_id: {'yaw', 'pitch', 'roll'}}}`` with
            all values rounded to three decimals. Empty if the source cannot
            be opened or yields no frames.

        Raises:
            ValueError: the source opens but reports a non-positive FPS.
        """
        main_df = dict()
        cap = cv2.VideoCapture(path)
        try:
            if not cap.isOpened():
                return main_df

            fps = cap.get(cv2.CAP_PROP_FPS)
            if fps <= 0:
                raise ValueError(f'Video reports a non-positive FPS: {path}')

            frame_count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % 5 == 0:
                    # frame_headpose may return None; treat that as "no faces".
                    rotations = self.frame_headpose(frame) or {}
                    main_df[round(frame_count / fps, 3)] = {
                        face_id: {'yaw': round(yaw, 3), 'pitch': round(pitch, 3), 'roll': round(roll, 3)}
                        for face_id, (yaw, pitch, roll) in rotations.items()
                    }

                frame_count += 1
        finally:
            cap.release()

        return main_df

In [45]:
# Build the estimator and calibrate it on ten chessboard photos (12.jpg ... 21.jpg).
headpose = HeadPose()
headpose.calibrate_matrix(
    chb_size=(8, 8),
    images=[
        f'c:/Users/Stepan/Documents/ClassMood/yunet_headpose/images/calib_images/{photo_no}.jpg'
        for photo_no in range(12, 22)
    ],
)

In [46]:
# Run the single-image estimator on test shots 1-7 and print all results as one list.
print([
    headpose.frame_headpose(
        path=f'C:/Users/Stepan/Documents/ClassMood/yunet_headpose/images/test_images/shift ({shot_no}).jpg'
    )
    for shot_no in range(1, 8)
])


image 1/1 C:\Users\Stepan\Documents\ClassMood\yunet_headpose\images\test_images\shift (1).jpg: 640x480 1 face, 35.5ms
Speed: 3.2ms preprocess, 35.5ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 C:\Users\Stepan\Documents\ClassMood\yunet_headpose\images\test_images\shift (2).jpg: 640x480 1 face, 32.0ms
Speed: 2.9ms preprocess, 32.0ms inference, 4.5ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 C:\Users\Stepan\Documents\ClassMood\yunet_headpose\images\test_images\shift (3).jpg: 640x480 1 face, 31.8ms
Speed: 2.4ms preprocess, 31.8ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 C:\Users\Stepan\Documents\ClassMood\yunet_headpose\images\test_images\shift (4).jpg: 640x480 1 face, 31.1ms
Speed: 2.2ms preprocess, 31.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 480)

image 1/1 C:\Users\Stepan\Documents\ClassMood\yunet_headpose\images\test_images\shift (5).jpg: 640x480 1 face, 30.6ms
Speed: 3.0ms prep

In [71]:
# Inspect raw FaceMesh landmarks for the first detected face of one test image.
# frame_headpose() returns a dict of angles, not FaceMesh results, so the mesh
# is computed here directly from the detector's face crop.
test_image_path = 'C:/Users/Stepan/Documents/ClassMood/yunet_headpose/images/test_images/test (2).jpg'
test_image = cv2.imread(test_image_path)
assert test_image is not None, f'Cannot read image: {test_image_path}'

face_boxes = headpose.bb_detection(test_image_path, conf=0.4)[0].boxes.xyxy.cpu().numpy()
assert len(face_boxes) > 0, 'Detector found no face in the test image'
x1, y1, x2, y2 = (max(int(v), 0) for v in face_boxes[0])

mesh_results = headpose.face_mesh.process(cv2.cvtColor(test_image[y1:y2, x1:x2], cv2.COLOR_BGR2RGB))
assert mesh_results.multi_face_landmarks, 'FaceMesh found no landmarks in the face crop'

landmarks_3d = dict(zip(('nose', 'chin', 'left eye inner', 'right eye inner', 'mouth left', 'mouth right', 'right eye outer', 'left eye outer'), [(mesh_results.multi_face_landmarks[0].landmark[i].x, mesh_results.multi_face_landmarks[0].landmark[i].y, mesh_results.multi_face_landmarks[0].landmark[i].z) for i in (1, 152, 130, 359, 78, 308, 362, 133)]))
roll = math.degrees(math.atan2(landmarks_3d['right eye outer'][0] - landmarks_3d['left eye outer'][0], landmarks_3d['right eye outer'][1] - landmarks_3d['left eye outer'][1]))
yaw_approx = (landmarks_3d['right eye inner'][0] - landmarks_3d['left eye inner'][0] - landmarks_3d['right eye outer'][2] + landmarks_3d['left eye outer'][2]) * 50
pitch_approx = (landmarks_3d['nose'][2] + landmarks_3d['chin'][2]) * -100

# 2-D distance from each inner eye landmark to the mouth corner on the same side.
left_eye_to_mouth = math.hypot(landmarks_3d['left eye inner'][0] - landmarks_3d['mouth left'][0], landmarks_3d['left eye inner'][1] - landmarks_3d['mouth left'][1])
right_eye_to_mouth = math.hypot(landmarks_3d['right eye inner'][0] - landmarks_3d['mouth right'][0], landmarks_3d['right eye inner'][1] - landmarks_3d['mouth right'][1])
print('inner eye to mouth, left and right:', left_eye_to_mouth, right_eye_to_mouth)


image 1/1 C:\Users\Stepan\Documents\ClassMood\yunet_headpose\images\test_images\test (2).jpg: 640x480 2 faces, 29.0ms
Speed: 1.8ms preprocess, 29.0ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 480)
inner eye to mouth, left and right: 0.4470449854166757 0.384401154430455


In [121]:
# Fresh estimator, this time calibrated from a chessboard video instead of photos.
headpose = HeadPose()
headpose.calibrate_matrix(
    chb_size=(8, 8),
    video='C:/Users/Stepan/Documents/ClassMood/yunet_headpose/videos/calib/vid1.MOV',
)

In [None]:
# Per-face (yaw, pitch, roll) estimate for one full-face test image.
result = headpose.frame_headpose(
    r'C:/Users/Stepan/Documents/ClassMood/yunet_headpose/images/full_face_images/1.png'
)

In [None]:
# Display the value returned by frame_headpose() for the full-face test image.
result

In [122]:
# Head-pose estimates for every fifth frame of the test clip.
result1 = headpose.video_headpose(
    path=r'c:/Users/Stepan/Documents/ClassMood/yunet_headpose/videos/vid2.MOV'
)


0: 640x384 1 face, 34.3ms
Speed: 2.1ms preprocess, 34.3ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 384)
286

0: 640x384 1 face, 29.9ms
Speed: 1.6ms preprocess, 29.9ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 384)
285

0: 640x384 1 face, 26.5ms
Speed: 1.4ms preprocess, 26.5ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 384)
280

0: 640x384 1 face, 26.7ms
Speed: 1.2ms preprocess, 26.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)
284

0: 640x384 1 face, 26.3ms
Speed: 1.1ms preprocess, 26.3ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)
286

0: 640x384 1 face, 27.9ms
Speed: 1.5ms preprocess, 27.9ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)
284

0: 640x384 1 face, 24.9ms
Speed: 1.1ms preprocess, 24.9ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 384)
283

0: 640x384 1 face, 34.4ms
Speed: 1.9ms preprocess, 34.4ms inference, 0.9ms postprocess per image at sha

In [123]:
# Display the mapping returned by video_headpose(): timestamp in seconds -> face id -> yaw/pitch/roll.
result1

{0.0: {0: {'yaw': -9.071, 'pitch': 3.763, 'roll': 88.509}},
 0.167: {0: {'yaw': -11.485, 'pitch': 3.748, 'roll': 88.425}},
 0.333: {0: {'yaw': -15.315, 'pitch': 3.092, 'roll': 88.528}},
 0.5: {0: {'yaw': -6.745, 'pitch': 3.722, 'roll': 88.602}},
 0.667: {0: {'yaw': -8.866, 'pitch': 4.698, 'roll': 87.901}},
 0.833: {0: {'yaw': -5.991, 'pitch': 3.09, 'roll': 89.163}},
 1.0: {0: {'yaw': 9.399, 'pitch': 3.932, 'roll': 89.35}},
 1.167: {0: {'yaw': 24.382, 'pitch': 3.18, 'roll': 90.298}},
 1.333: {0: {'yaw': 49.504, 'pitch': 4.322, 'roll': 90.684}},
 1.5: {0: {'yaw': 79.847, 'pitch': 5.599, 'roll': 93.916}},
 1.667: {0: {'yaw': 97.385, 'pitch': 5.456, 'roll': 91.834}},
 1.833: {0: {'yaw': 123.303, 'pitch': 6.255, 'roll': 92.661}},
 2.0: {0: {'yaw': 120.203, 'pitch': 5.475, 'roll': 92.981}},
 2.167: {0: {'yaw': 131.051, 'pitch': 8.048, 'roll': 93.208}},
 2.333: {0: {'yaw': 127.304, 'pitch': 6.342, 'roll': 93.761}},
 2.5: {0: {'yaw': 119.683, 'pitch': 5.641, 'roll': 93.002}},
 2.667: {0: {'yaw