In [3]:
from ultralytics import YOLO
import mediapipe as mp
import cv2
import numpy as np
import math
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from scipy.spatial.transform import Rotation as R

In [60]:
class HeadPose:
    def __init__(self):
        self.bb_detection = YOLO(r'C:\Users\Stepan\Documents\ClassMood\yunet_headpose\yolov8n-face-lindevs.pt')
        self.face_mesh = mp.solutions.face_mesh.FaceMesh(
            static_image_mode=True,
            max_num_faces=1,
            refine_landmarks=True,
            min_detection_confidence=0.3,
            min_tracking_confidence=0.5
        )

    def calibrate_matrix(self, chb_size, images=None, video=None):
        chessboard_size = (chb_size[0] - 1, chb_size[1] - 1)
        objp = np.zeros((chessboard_size[0] * chessboard_size[1], 3), dtype=np.float32)
        objp[:, :2] = np.mgrid[0:chessboard_size[0], 0:chessboard_size[1]].T.reshape(-1, 2)
        objpoints, imgpoints = [], []
        calibration_data = []

        if images:
            if len(images) < 10:
                raise Exception('Пожалуйста, введите не менее 10 фотографий')
            calibration_data = images
            calibration_mode = 'img'
        else:
            cap = cv2.VideoCapture(video)
            if (cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS)) < 2:
                raise Exception('Пожалуйста, введите видео длинной более двух секунд')
            
            frame_skip = cap.get(cv2.CAP_PROP_FRAME_COUNT) // 5
            frame_count = 0

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_count % frame_skip == 0:
                    calibration_data.append(frame)
                frame_count += 1
            calibration_mode = 'vid'

        for path in calibration_data:
            if calibration_mode == 'img':
                img = cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2GRAY)
            else:
                img = cv2.cvtColor(path, cv2.COLOR_BGR2GRAY)
            found, corners = cv2.findChessboardCorners(img, chessboard_size, None)

            if found:
                objpoints.append(objp)

                corners_refined = cv2.cornerSubPix(img, corners, (11, 11), (-1, -1), (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001))
                imgpoints.append(corners_refined)
        h, w = img.shape[:2]

        ret, camera_matrix, self.dist_coeffs, rvecs, tvecs = cv2.calibrateCamera(objpoints, imgpoints, (w, h), None, None)
        self.matrix = cv2.getOptimalNewCameraMatrix(camera_matrix, self.dist_coeffs, (w, h), 1, (w, h))[0]

    def frame_headpose(self, path):
        bb_results = self.bb_detection(path, conf=0.4)
        try:
            image = cv2.imread(path)
        except:
            image = path

        if not bb_results:
            return None
        
        boxes = bb_results[0].boxes.xyxy.numpy()
        head_rotations = dict()

        model_points_8 = np.array([
            (0.000000, -3.406404, 5.979507),           # 0 — нос 1
            (0.000000, 6.545390, 5.027311),      # 1 — подбородок 152
            (-1.785794, -0.978284, 4.850470),     # 2 — левый глаз (внутренний) 130
            (1.785794, -0.978284, 4.850470),      # 3 — правый глаз (внутренний) 359
            (-7.270895, -2.890917, -2.252455),     # 4 — левый глаз (внешний) 133
            (7.270895, -2.890917, -2.252455),      # 5 — правый глаз (внешний) 362
            (-2.056846, -4.477671, 4.520883),      # 6 — левый угол рта 78
            (2.056846, -4.477671, 4.520883)        # 7 — правый угол рта 308
        ], dtype=np.float64)
        
        for face_id, coords in enumerate(boxes):
            face_roi = image[int(coords[1]):int(coords[3]), int(coords[0]):int(coords[2])]
            mesh_results = self.face_mesh.process(cv2.cvtColor(face_roi, cv2.COLOR_BGR2RGB))
            h, w = face_roi.shape[:2]
            landmarks_2d = np.array([(mesh_results.multi_face_landmarks[0].landmark[i].x * w, mesh_results.multi_face_landmarks[0].landmark[i].y * h) for i in (1, 152, 130, 359, 133, 362, 78, 308)], dtype=np.float64)
            
            success, rvec, tvec = cv2.solvePnP(
                model_points_8,
                landmarks_2d,
                self.matrix,
                self.dist_coeffs,
                flags=cv2.SOLVEPNP_SQPNP
            )

            if success:
                rotation_matrix = cv2.Rodrigues(rvec)[0]
                euler_angles = R.from_matrix(rotation_matrix).as_euler('xyz', degrees=True)

                head_rotations[face_id] = euler_angles.tolist()

        return head_rotations
    
    def video_headpose(self, path):
        cap = cv2.VideoCapture(path)
        frame_count = 0
        fps = cap.get(cv2.CAP_PROP_FPS)
        main_df = dict()

        while cap.isOpened():
            ret, frame = cap.read()

            if not ret:
                break
            
            if frame_count % 5 == 0:
                temp = self.frame_headpose(frame)
                main_df[round(frame_count / fps, 3)] = {i: {'yaw': round(temp[i][0], 3), 'pitch': round(temp[i][1], 3), 'roll': round(temp[i][2], 3)} for i in temp.keys()}

            frame_count += 1
        
        return main_df

In [61]:
headpose = HeadPose()
headpose.calibrate_matrix((8, 8), video='C:/Users/Stepan/Documents/ClassMood/yunet_headpose/videos/calib/vid1.MOV')

In [63]:
result = headpose.frame_headpose(path=r'C:\Users\Stepan\Documents\ClassMood\yunet_headpose\images\full_face_images\1.png')


image 1/1 C:\Users\Stepan\Documents\ClassMood\yunet_headpose\images\full_face_images\1.png: 640x544 1 face, 41.2ms
Speed: 4.1ms preprocess, 41.2ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 544)


In [64]:
result

{0: [-97.21984893183745, -1.97108583414799, -2.2642545248036985]}

In [62]:
result1 = headpose.video_headpose(r'c:\Users\Stepan\Documents\ClassMood\yunet_headpose\videos\vid2.MOV')


0: 640x384 1 face, 31.8ms
Speed: 1.8ms preprocess, 31.8ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 face, 26.8ms
Speed: 2.4ms preprocess, 26.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 face, 29.3ms
Speed: 1.3ms preprocess, 29.3ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 face, 27.6ms
Speed: 2.7ms preprocess, 27.6ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 face, 27.2ms
Speed: 1.3ms preprocess, 27.2ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 face, 26.5ms
Speed: 1.6ms preprocess, 26.5ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 face, 26.1ms
Speed: 1.1ms preprocess, 26.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 face, 26.3ms
Speed: 1.0ms preprocess, 26.3ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 384)

0: 640x

In [59]:
result1

{0.0: {0: {'yaw': 95.55, 'pitch': -9.893, 'roll': -13.894}},
 0.167: {0: {'yaw': 95.543, 'pitch': -10.007, 'roll': -14.069}},
 0.333: {0: {'yaw': 95.801, 'pitch': -10.095, 'roll': -14.597}},
 0.5: {0: {'yaw': 95.535, 'pitch': -10.044, 'roll': -14.077}},
 0.667: {0: {'yaw': 95.443, 'pitch': -9.877, 'roll': -13.892}},
 0.833: {0: {'yaw': 95.568, 'pitch': -10.053, 'roll': -14.702}},
 1.0: {0: {'yaw': 95.576, 'pitch': -9.529, 'roll': -14.263}},
 1.167: {0: {'yaw': 95.439, 'pitch': -8.746, 'roll': -13.54}},
 1.333: {0: {'yaw': 95.217, 'pitch': -7.694, 'roll': -12.41}},
 1.5: {0: {'yaw': 95.174, 'pitch': -6.441, 'roll': -11.939}},
 1.667: {0: {'yaw': 95.228, 'pitch': -6.814, 'roll': -11.918}},
 1.833: {0: {'yaw': 95.14, 'pitch': -6.22, 'roll': -11.006}},
 2.0: {0: {'yaw': 95.098, 'pitch': -5.725, 'roll': -10.568}},
 2.167: {0: {'yaw': 95.394, 'pitch': -5.409, 'roll': -10.991}},
 2.333: {0: {'yaw': 95.386, 'pitch': -5.392, 'roll': -10.65}},
 2.5: {0: {'yaw': 95.19, 'pitch': -5.512, 'roll': -1

In [None]:
for i in [130, 359, 133, 362, 1, 78, 308, 152]:
                cv2.circle(face_roi, (int(mesh_results.multi_face_landmarks[0].landmark[i].x * w), int(mesh_results.multi_face_landmarks[0].landmark[i].y * h)), 1, (0, 255, 255), -1)
                print(int(mesh_results.multi_face_landmarks[0].landmark[i].x * w), int(mesh_results.multi_face_landmarks[0].landmark[i].y * h))
            cv2.imwrite(f'C:/Users/Stepan/Documents/ClassMood/yunet_headpose/images/images_with_landmarks/face{face_num}.png', face_roi)