# Deploy to FastAPI

In [1]:
from fastapi import FastAPI, Request
from pydantic import BaseModel
import base64
import mediapipe as mp
import cv2
import numpy as np
import uvicorn
import nest_asyncio
import tensorflow as tf
from helpers import relative, relativeT

mp_face_mesh = mp.solutions.face_mesh  # MediaPipe face-mesh solution module; FaceMesh objects are created from it later
font = cv2.FONT_HERSHEY_COMPLEX  # OpenCV font constant kept for text overlays
# Face detector loaded once, at import time, from a TensorFlow SavedModel directory.
detector_model = tf.saved_model.load('./models/tf_retinaface_mbv2/')

# Colour tuples; the names match the values when read in RGB channel order.
blue = (0, 0, 255)
red = (255, 0, 0)
green = (0,128,0)

2024-09-09 18:24:59.494719: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-09 18:24:59.507281: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-09 18:24:59.511034: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-09 18:24:59.520390: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1725881101.325884   11994 cuda_executor.c

In [2]:
def one_face(frame, bbs, pointss):
    """
    Pick the detected face whose bounding-box centre is nearest the image centre.

    Parameters
    ----------
    frame : uint8
        RGB image (numpy array); only its height and width are read.
    bbs : float64, Size = (N, 4) or wider
        bounding boxes for all detected faces; columns 0-3 are x1, y1, x2, y2.
    pointss : float32, Size = (10, N)
        landmarks for all detected faces, one column per face.

    Returns
    -------
    bb : float64
        the row of ``bbs`` belonging to the selected face.
    points : float32
        the column of ``pointss`` belonging to the selected face.

    """
    img_h, img_w = frame.shape[0], frame.shape[1]

    # signed offsets of every box centre from the image centre
    dx = (bbs[:, 0] + bbs[:, 2]) / 2 - img_w / 2
    dy = (bbs[:, 1] + bbs[:, 3]) / 2 - img_h / 2

    # city-block distance; the smallest one wins
    nearest = np.argmin(np.abs(dx) + np.abs(dy))
    return bbs[nearest], pointss[:, nearest]

def are_coordinates_in_frame(frame, box, pts, margin=1.5):
    """
    Check that a face's box and landmarks fall inside an enlarged frame area.

    Horizontal coordinates are compared with the (enlarged) width and
    vertical coordinates with the (enlarged) height, so non-square frames
    are handled correctly.

    Parameters
    ----------
    frame : uint8
        RGB image (numpy array); only its height and width are read.
    box : float64
        bounding box as (x1, y1, x2, y2); any further entries are ignored.
    pts : float32, Size = (10,)
        landmarks: five x coordinates followed by five y coordinates.
    margin : float, optional
        allowance added on top of the frame size, as a fraction of it; the
        upper limits are ``(1 + margin) * width`` and ``(1 + margin) * height``.

    Returns
    -------
    boolean
        True when every coordinate is > 0 and below its upper limit.
    """
    height, width = frame.shape[:2]
    max_x = width + margin * width
    max_y = height + margin * height

    box = np.asarray(box)
    pts = np.asarray(pts)
    xs = np.concatenate((box[[0, 2]], pts[:5]))
    ys = np.concatenate((box[[1, 3]], pts[5:10]))

    if np.any(xs <= 0) or np.any(xs >= max_x):
        return False
    if np.any(ys <= 0) or np.any(ys >= max_y):
        return False

    return True

def are_centered(frame, box, pts):
    """
    Check that a face's box and landmarks lie strictly inside the frame.

    Horizontal coordinates are compared with the frame width and vertical
    coordinates with the frame height, so non-square frames are handled
    correctly.

    Parameters
    ----------
    frame : uint8
        RGB image (numpy array); only its height and width are read.
    box : float64
        bounding box as (x1, y1, x2, y2); any further entries are ignored.
    pts : float32, Size = (10,)
        landmarks: five x coordinates followed by five y coordinates.

    Returns
    -------
    boolean
        True when every x is in (0, width) and every y is in (0, height).
    """
    height, width = frame.shape[:2]

    box = np.asarray(box)
    pts = np.asarray(pts)
    xs = np.concatenate((box[[0, 2]], pts[:5]))
    ys = np.concatenate((box[[1, 3]], pts[5:10]))

    if np.any(xs <= 0) or np.any(xs >= width):
        return False
    if np.any(ys <= 0) or np.any(ys >= height):
        return False

    return True
    
            
def draw_landmarks(frame, bb, points):
    '''
    Draw a face's bounding box and its five landmarks onto ``frame`` in place.

    Parameters
    ----------
    frame : uint8
        RGB image; it is modified by the drawing calls.
    bb : float64
        bounding box for the selected face; entries 0-3 are x1, y1, x2, y2.
    points : float32, Size = (10,)
        landmarks for the selected face: five x coordinates followed by the
        five matching y coordinates (left eye, right eye, nose, mouth-left,
        mouth-right).

    Returns
    -------
    None.

    '''
    bb = bb.astype(int)
    points = points.astype(int)

    # face rectangle
    cv2.rectangle(frame, (int(bb[0]), int(bb[1])), (int(bb[2]), int(bb[3])), red, 1)

    # landmark i sits at (points[i], points[i + 5])
    for x, y in zip(points[:5], points[5:10]):
        cv2.circle(frame, (int(x), int(y)), 2, blue, 2)

def find_smile(points):
    """
    Ratio of mouth width to eye spacing, both measured along x.

    Parameters
    ----------
    points : float32
        coordinates of landmarks; entries 0-1 are the eye x positions and
        entries 3-4 the mouth-corner x positions.

    Returns
    -------
    smile_ratio : float32
        mouth-corner distance divided by eye distance; used as a smile
        indicator.
    """
    left_eye_x, right_eye_x = points[0], points[1]
    mouth_left_x, mouth_right_x = points[3], points[4]
    return (mouth_right_x - mouth_left_x) / (right_eye_x - left_eye_x)

def find_roll(points):
    """
    Vertical offset between the two eyes.

    Parameters
    ----------
    points : float32
        coordinates of landmarks; entries 5 and 6 are the eye y positions.

    Returns
    -------
    float32
        ``points[6] - points[5]``, an indication of roll.

    """
    first_eye_y = points[5]
    second_eye_y = points[6]
    return second_eye_y - first_eye_y

def find_yaw(points):
    """
    Difference between the two eye-to-nose distances along x.

    Parameters
    ----------
    points : float32, Size = (10,)
        coordinates of landmarks; entries 0-2 are the x positions of the
        two eyes and the nose.
    Returns
    -------
    float32
        (nose - first eye) minus (second eye - nose); an indication of yaw.

    """
    first_eye_x, second_eye_x, nose_x = points[0], points[1], points[2]
    first_gap = nose_x - first_eye_x
    second_gap = second_eye_x - nose_x
    return first_gap - second_gap

def find_pitch(points):
    """
    Ratio of eye-to-nose height to nose-to-mouth height.

    Parameters
    ----------
    points : float32, Size = (10,)
        coordinates of landmarks; entries 5-6 are the eye y positions,
        entry 7 the nose y position and entries 8-9 the mouth-corner y
        positions.
    Returns
    -------
    float32
        an indication of pitch.
    """
    nose_y = points[7]
    eyes_mid_y = (points[5] + points[6]) / 2
    mouth_mid_y = (points[8] + points[9]) / 2
    eyes_to_nose = eyes_mid_y - nose_y
    nose_to_mouth = nose_y - mouth_mid_y
    return eyes_to_nose / nose_to_mouth

def find_pose(points):
    """
    Estimate head pose indicators from five facial landmarks.

    Parameters
    ----------
    points : float32, Size = (10,)
        coordinates of landmarks for the selected face: five x values
        followed by five y values (eyes, nose, mouth corners).
    Returns
    -------
    float32, float32, float32
        the eye-line angle in degrees, then the horizontal and vertical
        "frontal" scores (0 is treated as frontal; each score falls back
        to 0 when its reference span is zero).
    """
    xs = points[0:5]
    ys = points[5:10]

    # slope of the eye line; the horizontal gap is clamped to at least 1
    eye_dx = max((xs[1] - xs[0]), 1)
    eye_dy = (ys[1] - ys[0])
    angle = np.arctan(eye_dy / eye_dx)

    cos_a = np.cos(angle)
    sin_a = np.sin(angle)
    nose_x = xs[2]
    nose_y = ys[2]

    # rotate all landmarks by `angle` so the eye line is levelled
    xr = (cos_a * xs + sin_a * ys + (1 - cos_a) * nose_x / 2 - sin_a * nose_y / 2)
    yr = (-sin_a * xs + cos_a * ys + sin_a * nose_x / 2 + (1 - cos_a) * nose_y / 2)

    # reference spans: mean of eye width and mouth width, mean eye-to-mouth height
    span_x = (xr[1] - xr[0] + xr[4] - xr[3]) / 2
    span_y = (yr[3] - yr[0] + yr[4] - yr[1]) / 2

    # nose offsets measured against landmarks 1 and 4 (x) and 3 and 4 (y)
    nose_dx = (xr[1] - xr[2] + xr[4] - xr[2]) / 2
    nose_dy = (yr[3] - yr[2] + yr[4] - yr[2]) / 2

    # map the nose offset, relative to the span, onto a -90..90 style score
    if span_x != 0:
        x_frontal = (-90+90 / 0.5 * nose_dx / span_x)
    else:
        x_frontal = 0
    if span_y != 0:
        y_frontal = (-90+90 / 0.5 * nose_dy / span_y)
    else:
        y_frontal = 0

    angle_deg = angle * 180 / np.pi
    return angle_deg, x_frontal, y_frontal

def detect_faces(image, image_shape_max=640):
    '''
    Run the retinaface detector, shrinking large images first for speed.

    When the longer image side exceeds ``image_shape_max`` the image is
    downscaled before detection and the resulting boxes and landmarks are
    scaled back to the original image size.

    Parameters
    ----------
    image : uint8
        image for face detection.
    image_shape_max : int, optional
        maximum size (in pixels) of the longer image side; a falsy value
        disables the downscaling.

    Returns
    -------
    float array
        landmarks.
    float array
        bounding boxes (first four columns of the detector output).
    float array
        detection scores (last column of the detector output).
    '''
    if image_shape_max:
        scale_factor = max([1, max(image.shape[:2])/image_shape_max])
    else:
        scale_factor = 1

    if scale_factor <= 1:
        # small enough already: detect on the original image
        bbs_all, points_all = retinaface(image)
    else:
        shrink = 1 / scale_factor
        scaled_image = cv2.resize(image, (0, 0), fx=shrink, fy=shrink)
        bbs_all, points_all = retinaface(scaled_image)
        # map results back to the coordinates of the full-size image
        bbs_all[:, :4] *= scale_factor
        points_all *= scale_factor

    return points_all, bbs_all[:, :4], bbs_all[:, -1]

def retinaface(image):
    """
    Run the loaded retinaface model on one image.

    Returns ``(bbs, lms)``: ``bbs`` has one row per face holding
    x1, y1, x2, y2 and the score; ``lms`` has one row per face holding
    five x values followed by five y values. Both are expressed in pixels
    of ``image`` (the model output is multiplied by the image width and
    height, so it is presumably normalised to 0..1).
    """
    height, width = image.shape[0], image.shape[1]

    # pad, add a batch axis and hand the model a float32 tensor
    padded, pad_params = pad_input_image(image)
    batch = tf.convert_to_tensor(padded[np.newaxis, ...])
    batch = tf.cast(batch, tf.float32)

    detections = detector_model(batch).numpy()
    detections = recover_pad_output(detections, pad_params)
    face_count = len(detections)

    bbs = np.zeros((face_count, 5))
    lms = np.zeros((face_count, 10))

    # box corners: even columns are x, odd columns are y; the score is last
    bbs[:, [0, 2]] = detections[:, [0, 2]] * width
    bbs[:, [1, 3]] = detections[:, [1, 3]] * height
    bbs[:, 4] = detections[:, -1]

    # landmarks are interleaved x, y pairs in columns 4..13
    x_columns = [4, 6, 8, 10, 12]
    y_columns = [5, 7, 9, 11, 13]
    lms[:, 0:5] = detections[:, x_columns] * width
    lms[:, 5:10] = detections[:, y_columns] * height

    return bbs, lms

def pad_input_image(img, max_steps=32):
    """
    Pad an image on the bottom and right so both sides are multiples of
    ``max_steps`` (required for retinaface).

    The border is filled with the per-channel mean of the image. Returns
    the padded image and ``(img_h, img_w, img_pad_h, img_pad_w)``.
    """
    img_h, img_w, _ = img.shape

    # distance from each side up to the next multiple of max_steps (0 if aligned)
    img_pad_h = (-img_h) % max_steps
    img_pad_w = (-img_w) % max_steps

    fill_value = np.mean(img, axis=(0, 1)).astype(np.uint8).tolist()
    padded = cv2.copyMakeBorder(img, 0, img_pad_h, 0, img_pad_w,
                                cv2.BORDER_CONSTANT, value=fill_value)

    return padded, (img_h, img_w, img_pad_h, img_pad_w)

def recover_pad_output(outputs, pad_params):
    """
    Undo the effect of input padding on the detector output.

    The first 14 columns of ``outputs`` are treated as seven (x, y) pairs;
    x is rescaled by padded-width / width and y by padded-height / height.
    ``outputs`` is modified in place and also returned.
    """
    img_h, img_w, img_pad_h, img_pad_w = pad_params
    scale_xy = [(img_pad_w + img_w) / img_w, (img_pad_h + img_h) / img_h]

    xy_pairs = np.reshape(outputs[:, :14], [-1, 7, 2])
    rescaled = xy_pairs * scale_xy
    outputs[:, :14] = np.reshape(rescaled, [-1, 14])

    return outputs

In [3]:
def detect_pose_gaze(img):
    """
    Decide from head pose whether the most central face is looking away.

    Parameters
    ----------
    img : uint8 or None
        BGR image (numpy array). ``None`` (for example a failed image
        decode) is reported as an error instead of raising.

    Returns
    -------
    dict
        'is_gazing'   : True when the horizontal or vertical pose score is
                        outside its accepted range.
        'is_no_face'  : True when the detector found no face.
        'is_error'    : True when the image is missing or detection failed.
        'is_centered' : result of ``are_centered`` for the selected face.
    """
    result = {'is_gazing': False, 'is_no_face': False, 'is_error': False, 'is_centered': False}

    if img is None:
        print("Error: no image to process.")
        result['is_error'] = True
        return result

    # the detector works on a mirrored RGB copy of the frame
    image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    image_rgb = cv2.flip(image_rgb, 1)

    try:
        landmarks, bboxes, _ = detect_faces(image_rgb, 720)
    except Exception:
        # best effort: report the failure to the caller instead of raising,
        # but let KeyboardInterrupt / SystemExit propagate
        print("Error: face detector error.")
        result['is_error'] = True
        return result

    if len(bboxes) == 0:
        result['is_no_face'] = True
        return result

    # landmarks are transposed so that each face is one column
    bb, lmarks_5 = one_face(image_rgb, bboxes.copy(), np.transpose(landmarks))
    if not are_coordinates_in_frame(image_rgb, bb, lmarks_5):
        return result

    result['is_centered'] = bool(are_centered(image_rgb, bb, lmarks_5))
    _, Xfrontal, Yfrontal = find_pose(lmarks_5)
    result['is_gazing'] = bool(Xfrontal > 50 or Xfrontal < -50
                               or Yfrontal > 10 or Yfrontal < -35)
    return result

In [4]:
def detect_left_right_gaze(img):
    """
    Decide from iris position whether both eyes look to the same side.

    Parameters
    ----------
    img : uint8 or None
        BGR image (numpy array). ``None`` (for example a failed image
        decode) is reported as an error instead of raising.

    Returns
    -------
    dict
        'is_gazing'   : True when both pupil ratios are >= 0.6 or both are
                        < 0.4.
        'is_no_face'  : True when the face mesh found no face.
        'is_error'    : True when the image is missing or an eye has zero
                        width in pixels.
        'is_centered' : always True on the normal paths, False on error.
        'left_pupil', 'right_pupil' : pupil positions as returned by
                        ``relative``; only present when a face was measured.
    """
    error_response = {'is_gazing': False, 'is_no_face': False, 'is_error': True, 'is_centered': False}
    if img is None:
        return error_response

    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # frame to RGB for the face-mesh model
    rgb.flags.writeable = False  # mark the processed copy read-only, not the caller's array

    # the context manager releases the model's resources after this frame
    with mp_face_mesh.FaceMesh(
            max_num_faces=1,  # number of faces to track in each frame
            refine_landmarks=True,  # includes iris landmarks in the face mesh model
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5) as face_mesh:
        results = face_mesh.process(rgb)

    if not results.multi_face_landmarks:
        return {'is_gazing': False, 'is_no_face': True, 'is_error': False, 'is_centered': True}

    points = results.multi_face_landmarks[0]
    shape = rgb.shape

    # `relative` is a project helper; presumably it maps a normalised
    # landmark to pixel coordinates for the given image shape
    right_pupil = relative(points.landmark[468], shape)
    left_pupil = relative(points.landmark[473], shape)

    # outer and inner eye corners
    left_corner = relative(points.landmark[263], shape)
    right_corner = relative(points.landmark[33], shape)
    right_corner_inside = relative(points.landmark[133], shape)
    left_corner_inside = relative(points.landmark[362], shape)

    left_width = left_corner[0] - left_corner_inside[0]
    right_width = right_corner_inside[0] - right_corner[0]
    if left_width == 0 or right_width == 0:
        # degenerate eye: the ratios below would divide by zero
        return error_response

    # horizontal pupil position as a fraction of the eye width
    left_pupil_ratio = (left_pupil[0] - left_corner_inside[0]) / left_width
    right_pupil_ratio = (right_pupil[0] - right_corner[0]) / right_width

    left_gazing = left_pupil_ratio >= 0.6 and right_pupil_ratio >= 0.6
    right_gazing = left_pupil_ratio < 0.4 and right_pupil_ratio < 0.4

    is_gazing = bool(left_gazing or right_gazing)
    return {'is_gazing': is_gazing, 'is_no_face': False, 'is_error': False, 'is_centered': True, 'left_pupil': left_pupil, 'right_pupil': right_pupil}

In [5]:
# Patch asyncio before the app is created; presumably needed so that
# uvicorn.run() can be called from a notebook cell where an event loop is
# already running — TODO confirm.
nest_asyncio.apply()
# FastAPI application object that the route decorators below attach to.
app = FastAPI()

# class Frame(BaseModel):
#     frame: str
#     timestamp: str

@app.post("/eye-tracking")
async def eye_tracking(request: Request):
    """
    Run the iris-based gaze check on an image sent as the raw request body.

    The body must hold the bytes of an encoded image (anything
    ``cv2.imdecode`` accepts). An empty or undecodable body yields a
    response with 'is_error' set instead of an unhandled exception.
    """
    error_response = {'is_gazing': False, 'is_no_face': False, 'is_error': True, 'is_centered': False}

    image_bytes = await request.body()
    if not image_bytes:
        # cv2.imdecode raises on an empty buffer
        return error_response

    # decode the encoded image bytes into a BGR array
    nparr = np.frombuffer(image_bytes, np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if img is None:
        # the bytes were not a decodable image
        return error_response

    gazing_response = detect_left_right_gaze(img)

    return gazing_response
    
@app.post("/pose-estimate")
async def pose_estimate(request: Request):
    """
    Run the head-pose gaze check on an image sent as the raw request body.

    The body must hold the bytes of an encoded image (anything
    ``cv2.imdecode`` accepts). An empty or undecodable body yields a
    response with 'is_error' set instead of an unhandled exception.
    """
    error_response = {'is_gazing': False, 'is_no_face': False, 'is_error': True, 'is_centered': False}

    image_bytes = await request.body()
    if not image_bytes:
        # cv2.imdecode raises on an empty buffer
        return error_response

    # decode the encoded image bytes into a BGR array
    nparr = np.frombuffer(image_bytes, np.uint8)
    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if img is None:
        # the bytes were not a decodable image
        return error_response

    gazing_response = detect_pose_gaze(img)

    return gazing_response
        
def start_fastapi(host="0.0.0.0", port=8080):
    """
    Serve ``app`` with uvicorn.

    Parameters
    ----------
    host : str, optional
        address to bind to; the default "0.0.0.0" listens on all interfaces.
    port : int, optional
        TCP port to listen on.
    """
    uvicorn.run(app, host=host, port=port)


    #cv2.putText(frame, f"Avg Brightness : {avg_brightness}", (10, 300), font, 0.4, green, 1)

In [None]:
# Launch the API server with its default host and port.
start_fastapi()