

Face Detection Algorithm credit: [Facial landmarks with dlib, OpenCV, and Python](https://pyimagesearch.com/2017/04/03/facial-landmarks-dlib-opencv-python/)


In [1]:
# Install/upgrade imutils (its face_utils helpers are imported in the next cell).
! pip install --upgrade imutils
# Download dlib's pre-trained 68-point facial landmark model (bzip2-compressed).
! wget http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
# Decompress in place, leaving /content/shape_predictor_68_face_landmarks.dat.
# NOTE(review): wget saves into the current working directory, so this assumes
# the notebook runs with /content as its working directory — confirm.
! bzip2 -d /content/shape_predictor_68_face_landmarks.dat.bz2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2022-12-07 05:14:05--  http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
Resolving dlib.net (dlib.net)... 107.180.26.78
Connecting to dlib.net (dlib.net)|107.180.26.78|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64040097 (61M)
Saving to: ‘shape_predictor_68_face_landmarks.dat.bz2’


2022-12-07 05:14:07 (31.1 MB/s) - ‘shape_predictor_68_face_landmarks.dat.bz2’ saved [64040097/64040097]



In [2]:
from imutils import face_utils
import numpy as np
import imutils
import dlib
import cv2
from os import listdir
from google.colab.patches import cv2_imshow
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F

from google.colab import drive
# Mount Google Drive at /content/drive: the RAVDESS videos read later and the
# CSV files written later all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def rect_to_bb(rect):
	"""Convert a dlib-style rectangle into an OpenCV-style bounding box.

	Args:
		rect: object exposing ``left()``, ``top()``, ``right()`` and
			``bottom()`` accessors (e.g. a rectangle returned by the dlib
			face detector).

	Returns:
		Tuple ``(x, y, w, h)``: the top-left corner followed by the width
		and height computed from the rectangle's edges.
	"""
	left, top = rect.left(), rect.top()
	# width/height are the edge differences, measured from the top-left corner
	width = rect.right() - left
	height = rect.bottom() - top
	return (left, top, width, height)

In [4]:
def shape_to_np(shape, dtype="int"):
	"""Convert a dlib landmark detection into an ``(n, 2)`` NumPy array.

	Args:
		shape: landmark container exposing ``part(i)`` (returning an object
			with ``.x`` and ``.y``) and, optionally, ``num_parts``.
		dtype: NumPy dtype of the returned coordinate array.

	Returns:
		Array of shape ``(n, 2)`` whose row ``i`` is the ``(x, y)``
		coordinate of landmark ``i``.
	"""
	# Use the landmark count reported by the detection itself so predictors
	# other than the 68-point model also work; fall back to the historical
	# 68 for objects that do not expose num_parts.
	num_parts = getattr(shape, "num_parts", 68)
	coords = np.zeros((num_parts, 2), dtype=dtype)
	for i in range(num_parts):
		point = shape.part(i)
		coords[i] = (point.x, point.y)
	return coords

In [5]:
# Path of the decompressed 68-point landmark model fetched by the setup cell.
pretrained_dlib_detector = "/content/shape_predictor_68_face_landmarks.dat"

# dlib's frontal face detector (HOG-based): finds face rectangles in an image.
detector = dlib.get_frontal_face_detector()

# Landmark predictor: locates the facial landmarks inside a detected face
# rectangle using the pre-trained model file above.
predictor = dlib.shape_predictor(pretrained_dlib_detector)

Each of the 7356 RAVDESS files has a unique filename. The filename consists of a 7-part numerical identifier (e.g., 02-01-06-01-02-01-12.mp4). These identifiers define the stimulus characteristics: 

Filename identifiers 

Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
<br>
Vocal channel (01 = speech, 02 = song).
<br>
Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
<br>
Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
<br>
Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
<br>
Repetition (01 = 1st repetition, 02 = 2nd repetition).
<br>
Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

Filename example: 02-01-06-01-02-01-12.mp4 

Video-only (02)
<br>
Speech (01)
<br>
Fearful (06)
<br>
Normal intensity (01)
<br>
Statement "dogs" (02)
<br>
1st Repetition (01)
<br>
12th Actor (12)
<br>
Female, as the actor ID number is even.

For this task, we will be using:
<br>
Modality = 02
<br>
Vocal channel = 01
<br>
Emotion = [01, 02, 03, 04, 05, 06, 07, 08]
<br>
Emotional intensity = [01, 02]
<br>
Statement = [01, 02]
<br>
Repetition = [01, 02]
<br>
Actor = [01, ... , 24]

In [6]:
def get_distance(landmarks_coordinate):
  """Return the flattened matrix of pairwise Euclidean landmark distances.

  Args:
    landmarks_coordinate: array-like of shape ``(n, d)`` holding one
      coordinate per row (the notebook passes the ``(68, 2)`` landmark array).

  Returns:
    1-D float array of length ``n * n`` where element ``a * n + b`` is the
    distance between landmark ``a`` and landmark ``b`` (4624 values for the
    68-point model).
  """
  # Cast to float up front so integer coordinates cannot wrap on subtraction.
  points = np.asarray(landmarks_coordinate, dtype=float)
  # Broadcast to an (n, n, d) array of coordinate differences, replacing the
  # n*n Python-level norm calls with a single vectorised computation.
  deltas = points[:, np.newaxis, :] - points[np.newaxis, :, :]
  # Row-major flattening reproduces the a * n + b index layout.
  return np.linalg.norm(deltas, axis=-1).ravel()

In [None]:
from IPython.display import clear_output

# Class names, indexed by (emotion code from the filename) - 1.
emotion_labels = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

# Root folder containing one sub-folder of videos per actor.
video_speech_path = '/content/drive/MyDrive/CV Project Workspace/Project_Code/Data/RAVDESS/Video_Speech/'

# Pre-allocated storage, capped at 1700 examples per class:
# emotion_data[label][k] is the k-th pairwise-distance embedding (68*68 values)
# collected for that emotion. Rows that are never filled stay all-zero.
emotion_data = np.zeros([8, 1700, 68*68])
# emotion_ids[label] is the next free row in emotion_data[label], i.e. the
# number of examples stored so far for that class.
emotion_ids = [0, 0, 0, 0, 0, 0, 0, 0]

# Denominator for the progress readout.
# NOTE(review): presumably 60 video-only speech clips for each of 24 actors — confirm.
NUM_VIDEOS = 60*24

# Only every 15th frame of each video is processed.
down_sample_rate = 15

video_num = 0
for actor in listdir(video_speech_path):
  for video in listdir(video_speech_path+actor):
    print('processing...' , video_num/NUM_VIDEOS)
    clear_output(wait=True)
    # keep only video-only (modality 02) speech (vocal channel 01) files
    if video[:5] != '02-01':
      continue
    video_num += 1

    video_path = video_speech_path + actor + "/" + video
    # The third dash-separated filename field is the emotion code (01..08);
    # shift it to a zero-based class index.
    emotion_label = int(video.split('-')[2]) - 1

    cap = cv2.VideoCapture(video_path)
    try:
      frame_num = 0
      while True:
        success, img = cap.read()
        if not success:
          break

        if frame_num % down_sample_rate == 0:
          # resize the frame and convert it to grayscale for the detector
          image = imutils.resize(img, width=500)
          gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
          # detect faces in the grayscale image
          rects = detector(gray, 1)

          # every detected face contributes one example
          for rect in rects:
            # locate the facial landmarks and convert them to a NumPy array
            shape = shape_to_np(predictor(gray, rect))

            next_row = emotion_ids[emotion_label]
            if next_row >= emotion_data.shape[1]:
              # fail with an explicit message instead of an opaque index error
              raise IndexError(
                  f"emotion_data holds at most {emotion_data.shape[1]} examples per class; "
                  f"class '{emotion_labels[emotion_label]}' exceeded that capacity")

            # pairwise landmark distances form the feature vector of this frame
            emotion_data[emotion_label][next_row] = get_distance(shape)
            emotion_ids[emotion_label] = next_row + 1
        frame_num += 1
    finally:
      # always free the decoder/file handle, even if processing fails
      cap.release()

for c in range(len(emotion_ids)) :
    print(f"Number of examples in class {emotion_labels[c]} are {emotion_ids[c]}")



processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing... 0.0
processing

Save preprocessed data

In [None]:
# Output folder for the per-class CSV files and the class names used as file stems.
file_path = '/content/drive/MyDrive/CV Project Workspace/Project_Code/VideoFaceData/'
file_name = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

# labels[k] is the class index of row k in data.
labels = []
class_arrays = []
for i, class_name in enumerate(file_name):
  new_data = np.array(emotion_data[i])
  # drop the unused (all-zero) rows of the pre-allocated buffer
  new_data = new_data[~np.all(new_data == 0, axis=1)]
  # one label per row of THIS class, so labels stay aligned with the rows
  labels.extend([i] * new_data.shape[0])
  with open(file_path + class_name + ".csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerows(new_data)
  class_arrays.append(new_data)

# Stack all classes in label order into a single (num_examples, 68*68) array.
data = np.concatenate(class_arrays, axis=0)
assert len(labels) == data.shape[0], "labels and data rows are out of sync"


In [None]:
# Sanity check: the number of feature rows, the number of examples counted
# during extraction, and the number of labels should all agree.
print(data.shape)
# use the live per-class counters rather than numbers copied from an old run
print(sum(emotion_ids))
# labels is a plain Python list (it has no .shape), so report its length
print(len(labels))
print(max(labels), min(labels))

(11307, 4624)
11307


In [None]:
# Write the combined feature matrix and its labels to CSV, one example per row.
with open(file_path + 'total_data' + ".csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerows(data)
with open(file_path + 'total_labels' + ".csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # writerows expects an iterable of rows; wrap each scalar label in its own
    # single-column row (passing the flat list of ints raises csv.Error)
    writer.writerows([label] for label in labels)