Face Landmarks Detection


### Getting the required face-alignment package
https://github.com/1adrianb/face-alignment

In [None]:
!pip install face-alignment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting face-alignment
  Downloading face_alignment-1.3.5.tar.gz (27 kB)
Building wheels for collected packages: face-alignment
  Building wheel for face-alignment (setup.py) ... done
  Created wheel for face-alignment: filename=face_alignment-1.3.5-py2.py3-none-any.whl size=28241 sha256=f5e243391d864297c473de81a5fe07f7ff79c38b7535642926b987225e37e9fd
  Stored in directory: /root/.cache/pip/wheels/c9/ba/4d/2d368f55e5f929f9472da59e356fbdf1483f885de80a5bc620
Successfully built face-alignment
Installing collected packages: face-alignment
Successfully installed face-alignment-1.3.5


### Access Webcam

In [None]:
# import dependencies
from IPython.display import display, Javascript, Image
from google.colab.output import eval_js
from base64 import b64decode, b64encode
import matplotlib.pyplot as plt
import face_alignment
from skimage import io, img_as_ubyte
import cv2
import numpy as np
import PIL
import io
import html
import time

Helper functions that convert between image types

In [None]:
# decode a webcam capture sent from JavaScript into an OpenCV image
def js_to_image(js_reply):
  """
  Turn a base64 data URL produced in the browser into an OpenCV image.

  Params:
          js_reply: data URL string ("<header>,<base64 data>") holding the webcam image
  Returns:
          img: OpenCV BGR image
  """
  # the base64 data is the segment that follows the first comma
  payload = js_reply.split(',')[1]
  # view the decoded bytes as a flat uint8 buffer for OpenCV
  raw = np.frombuffer(b64decode(payload), dtype=np.uint8)
  # decode the encoded image as a 3-channel colour image
  return cv2.imdecode(raw, flags=cv2.IMREAD_COLOR)

# encode an RGBA overlay array as a PNG data URL to be overlayed on video stream
def lm_to_bytes(lm_array):
  """
  Params:
          lm_array: Numpy array (RGBA pixels) containing landmarks to overlay on video stream.
  Returns:
        bytes: 'data:image/png;base64,...' string holding the PNG-encoded overlay
  """
  png_buffer = io.BytesIO()
  # interpret the array as RGBA and write it as PNG into the in-memory buffer
  PIL.Image.fromarray(lm_array, 'RGBA').save(png_buffer, format='png')
  # base64-encode the PNG bytes and prefix the data URL header
  encoded = b64encode(png_buffer.getvalue()).decode('utf-8')
  return 'data:image/png;base64,{}'.format(encoded)

### Face landmarks detection for images from webcam

In [None]:
# capture one image from webcam
def take_photo(filename='photo.jpg', quality=0.8):
  """
  Grab a single webcam frame in the browser, mark the detected face
  landmarks on it and write the result to disk.

  Params:
          filename: path the (annotated) image is written to
          quality: JPEG quality passed to canvas.toDataURL in the browser
  Returns:
          filename: the path that was written; when no landmarks are
                    returned the photo is saved without any markers
  """
  capture_js = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
    ''')
  display(capture_js)

  # run the capture widget; resolves to the photo as a JPEG data URL
  photo_url = eval_js('takePhoto({})'.format(quality))
  # decode to an OpenCV image, then switch BGR -> RGB for the landmark model
  rgb = cv2.cvtColor(js_to_image(photo_url), cv2.COLOR_BGR2RGB)

  # build the landmark model, then time only the inference call
  detector = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, flip_input=False, device='cuda', face_detector='blazeface')
  started = time.time()
  faces = detector.get_landmarks(rgb)
  print(f'Execution time for a single image: {time.time() - started}')

  # back to BGR so cv2 drawing and cv2.imwrite see the expected channel order
  annotated = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
  # an empty/None result means there is nothing to draw; the plain photo is saved
  for face in (faces or []):
    for (x, y) in face:
      cv2.circle(annotated, (int(x), int(y)), radius=1, color=(0,0,255), thickness=-1)

  # save image
  cv2.imwrite(filename, annotated)
  return filename

Running the next code block opens a webcam preview in its output area. Press "Capture" to take the photo.

In [None]:
try:
  # capture, annotate and save one webcam photo
  filename = take_photo('photo.jpg')
  print(f'Saved to {filename}')

  # Show the image which was just taken
  display(Image(filename))
except Exception as exc:
  # Raised when there is no webcam, or when the page was not granted
  # permission to access it; report the message instead of a traceback
  print(exc)

### Face landmarks detection for real-time videos from webcam

The function below starts the video stream, using JavaScript similar to that used above for single images.

In [None]:
# JavaScript to properly create our live video stream using our webcam as input
def video_stream():
  """
  Inject the JavaScript that drives the live webcam stream into the cell output.

  The script defines four functions in the output frame:
    removeDom()        - stops the video track and discards the created elements
    onAnimationFrame() - on each animation frame, if a capture is pending,
                         draws the video onto a 640x480 canvas and resolves
                         the pending promise with a JPEG data URL ("" when
                         shutdown has been requested)
    createDom()        - builds the status label, video element, overlay
                         image and stop instruction once, and starts the webcam
    stream_frame(label, imgData)
                       - updates the status label / overlay image when the
                         arguments are non-empty, then waits for the next
                         captured frame; returns '' after a stop click,
                         otherwise an object with 'create', 'show' and
                         'capture' timings (ms) and the frame under 'img'

  Clicking the video, the overlay image or the instruction text sets the
  shutdown flag. Nothing is returned; the script is only displayed.
  """
  js = Javascript('''
    var video;
    var div = null;
    var stream;
    var captureCanvas;
    var imgElement;
    var labelElement;

    var pendingResolve = null;
    var shutdown = false;

    function removeDom() {
       stream.getVideoTracks()[0].stop();
       video.remove();
       div.remove();
       video = null;
       div = null;
       stream = null;
       imgElement = null;
       captureCanvas = null;
       labelElement = null;
    }

    function onAnimationFrame() {
      if (!shutdown) {
        window.requestAnimationFrame(onAnimationFrame);
      }
      if (pendingResolve) {
        var result = "";
        if (!shutdown) {
          captureCanvas.getContext('2d').drawImage(video, 0, 0, 640, 480);
          result = captureCanvas.toDataURL('image/jpeg', 0.8)
        }
        var lp = pendingResolve;
        pendingResolve = null;
        lp(result);
      }
    }

    async function createDom() {
      if (div !== null) {
        return stream;
      }

      div = document.createElement('div');
      div.style.border = '2px solid black';
      div.style.padding = '3px';
      div.style.width = '100%';
      div.style.maxWidth = '600px';
      document.body.appendChild(div);

      const modelOut = document.createElement('div');
      modelOut.innerHTML = "<span>Status:</span>";
      labelElement = document.createElement('span');
      labelElement.innerText = 'No data';
      labelElement.style.fontWeight = 'bold';
      modelOut.appendChild(labelElement);
      div.appendChild(modelOut);

      video = document.createElement('video');
      video.style.display = 'block';
      video.width = div.clientWidth - 6;
      video.setAttribute('playsinline', '');
      video.onclick = () => { shutdown = true; };
      stream = await navigator.mediaDevices.getUserMedia(
          {video: { facingMode: "environment"}});
      div.appendChild(video);

      imgElement = document.createElement('img');
      imgElement.style.position = 'absolute';
      imgElement.style.zIndex = 1;
      imgElement.onclick = () => { shutdown = true; };
      div.appendChild(imgElement);

      const instruction = document.createElement('div');
      instruction.innerHTML =
          '<span style="color: red; font-weight: bold;">' +
          'When finished, click here or on the video to stop this demo</span>';
      div.appendChild(instruction);
      instruction.onclick = () => { shutdown = true; };

      video.srcObject = stream;
      await video.play();

      captureCanvas = document.createElement('canvas');
      captureCanvas.width = 640; //video.videoWidth;
      captureCanvas.height = 480; //video.videoHeight;
      window.requestAnimationFrame(onAnimationFrame);

      return stream;
    }
    async function stream_frame(label, imgData) {
      if (shutdown) {
        removeDom();
        shutdown = false;
        return '';
      }

      var preCreate = Date.now();
      stream = await createDom();

      var preShow = Date.now();
      if (label != "") {
        labelElement.innerHTML = label;
      }

      if (imgData != "") {
        var videoRect = video.getClientRects()[0];
        imgElement.style.top = videoRect.top + "px";
        imgElement.style.left = videoRect.left + "px";
        imgElement.style.width = videoRect.width + "px";
        imgElement.style.height = videoRect.height + "px";
        imgElement.src = imgData;
      }

      var preCapture = Date.now();
      var result = await new Promise(function(resolve, reject) {
        pendingResolve = resolve;
      });
      shutdown = false;

      return {'create': preShow - preCreate,
              'show': preCapture - preShow,
              'capture': Date.now() - preCapture,
              'img': result};
    }
    ''')

  # displaying the Javascript object is what runs the script in the output frame
  display(js)

def video_frame(label, img):
  """
  Send the status label and overlay image to the browser and fetch the next webcam frame.

  Params:
          label: text/HTML for the status line ('' leaves the current label untouched)
          img: data URL of the overlay image ('' leaves the current overlay untouched)
  Returns:
          data: the value the JavaScript stream_frame() call resolves to
  """
  # local import: json is only needed to build the JavaScript call
  import json

  # json.dumps emits a double-quoted, fully escaped string literal, so quotes,
  # backslashes or newlines in the label can no longer break (or inject into)
  # the evaluated JavaScript. str() keeps the old .format() coercion.
  js_call = 'stream_frame({}, {})'.format(json.dumps(str(label)), json.dumps(str(img)))
  data = eval_js(js_call)
  return data

Run the face landmarks detection on the webcam video.
There is a lag of around 10 seconds at the beginning of the video. The duration of the lag seems to depend on the network connection.

In [None]:
# start streaming video from webcam
video_stream()
# label for video
label_html = 'Capturing...'
# initialize landmarks overlay to empty (nothing is drawn on the first frame)
lm = ''
count = 0
# load the model once, outside the loop, and report how long that took
t_load = time.time()
fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, flip_input=False, device='cuda', face_detector='blazeface')
print(f'Model load time: {time.time() - t_load}')

while True:
    t_start = time.time()
    # push the previous overlay to the browser and wait for the next frame
    js_reply = video_frame(label_html, lm)
    # stream_frame() returns '' after a stop click; it can also hand back an
    # empty 'img' when the click lands while a capture is pending, which
    # js_to_image() cannot decode
    if not js_reply or not js_reply["img"]:
        break

    # convert JS response to OpenCV image (BGR)
    frame = js_to_image(js_reply["img"])
    # feed the model RGB, exactly as take_photo() does
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # create transparent RGBA overlay matching the 640x480 capture canvas
    lm_array = np.zeros([480, 640, 4], dtype=np.uint8)

    t_model_start = time.time()
    # an empty/None result means no face was found; the overlay then carries
    # only the timing text instead of stale landmarks from an earlier frame
    preds = fa.get_landmarks(frame) or []
    # calculate model execution time
    model_exec_time = time.time() - t_model_start

    # draw every landmark of every detected face; the overlay is encoded as
    # RGBA by lm_to_bytes(), so (0, 0, 255) renders blue
    for pred in preds:
        for (x, y) in pred:
            cv2.circle(lm_array, (int(x), int(y)), radius=2, color=(0, 0, 255), thickness=-1)

    # calculate execution time of the whole iteration (transfer + model + drawing)
    exec_time = time.time() - t_start
    # calculate fps defined as 1/execution time of a frame
    fps = 1 / exec_time
    stats = (
        'Execution time: {:.4f}'.format(exec_time),
        'Model execution time: {:.4f}'.format(model_exec_time),
        'FPS: {:.2f}'.format(fps),
    )
    # one text row every 20 px, starting at y=20
    for row, text in enumerate(stats, start=1):
        cv2.putText(lm_array, text, (20, 20 * row), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (255, 0, 0), 2)

    # make every drawn pixel opaque and leave the rest fully transparent
    lm_array[:, :, 3] = (lm_array.max(axis=2) > 0).astype(np.uint8) * 255
    # convert overlay of landmarks into bytes
    lm = lm_to_bytes(lm_array)


<IPython.core.display.Javascript object>

Model load time: 0.356860876083374


Comparing the total execution time with the model execution time, we can see that the bottleneck in this case is sending and receiving frames between the local webcam and Google's servers.

### Reference:

Webcam input help guide: https://colab.research.google.com/notebooks/snippets/advanced_outputs.ipynb

