# Load libraries

In [1]:
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio
from IPython.display import Image
import matplotlib.pyplot as plt
from imutils import face_utils
import imutils
import dlib
import cv2
from google.colab.patches import cv2_imshow
import csv

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# --- Dataset locations (relative to the notebook's working directory) ---
AUDIO_DATA_PATH = 'drive/MyDrive/CV Project Workspace/Project_Code/Data/RAVDESS/Audio_Speech_Actors_01-24/'
VIDEO_DATA_PATH = 'drive/MyDrive/CV Project Workspace/Project_Code/Data/RAVDESS/Video_Speech/'
# Root folder for the extracted feature CSVs; one sub-folder per actor is used.
OUT_DIR = 'drive/MyDrive/CV Project Workspace/Project_Code/DNN_multimodal_data/'

# Emotion names keyed by zero-based class index (0..7).
EMOTIONS = dict(enumerate(
    ['neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
))
# Actor folder names: 'Actor_01' through 'Actor_24'.
ACTORS = ['Actor_%02d' % actor_id for actor_id in range(1, 25)]

# Sampling rate (Hz) that audio is resampled to when it is loaded.
# https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate
AUDIO_SAMPLE_RATE = 10000
# Only every N-th video frame is processed for face landmarks.
VIDEO_FRAME_DOWNSAMPLE_RATE = 15

# Video face detection and facial landmark feature extraction

In [3]:
# Upgrade imutils to the latest release available from the package index.
! pip install --upgrade imutils
# Download dlib's bzip2-compressed 68-point facial landmark model.
! wget http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
# Decompress the model so the .dat file can be loaded by dlib.shape_predictor.
# NOTE(review): the absolute /content path assumes wget ran with /content as
# the working directory (the Colab default) — confirm when running elsewhere.
! bzip2 -d /content/shape_predictor_68_face_landmarks.dat.bz2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2022-12-17 15:43:30--  http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
Resolving dlib.net (dlib.net)... 107.180.26.78
Connecting to dlib.net (dlib.net)|107.180.26.78|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64040097 (61M)
Saving to: ‘shape_predictor_68_face_landmarks.dat.bz2’


2022-12-17 15:43:44 (4.85 MB/s) - ‘shape_predictor_68_face_landmarks.dat.bz2’ saved [64040097/64040097]



In [4]:
# Initialize dlib's face detector (HOG-based) and then create the facial
# landmark predictor from the pre-trained 68-point model file.
# Note: despite its name, `pretrained_dlib_detector` holds the path of the
# landmark *predictor* model; the name is kept so other cells keep working.
pretrained_dlib_detector = "/content/shape_predictor_68_face_landmarks.dat"
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(pretrained_dlib_detector)

In [5]:
def rect_to_bb(rect):
    """Convert a dlib-style rectangle to an OpenCV-style bounding box.

    Args:
        rect: object exposing ``left()``, ``top()``, ``right()`` and
            ``bottom()`` accessors (e.g. a rectangle returned by the dlib
            face detector).

    Returns:
        Tuple ``(x, y, w, h)``: the top-left corner followed by the width
        (``right - left``) and height (``bottom - top``).
    """
    left, top = rect.left(), rect.top()
    width = rect.right() - left
    height = rect.bottom() - top
    return (left, top, width, height)

In [6]:
def shape_to_np(shape, dtype="int"):
    """Convert a dlib landmark detection into an ``(N, 2)`` NumPy array.

    The number of landmarks is taken from ``shape.num_parts`` rather than
    being fixed at 68, so the function works with any landmark model
    (for the 68-point predictor the result is unchanged).

    Args:
        shape: landmark detection exposing ``num_parts`` and ``part(i)``,
            where each part has ``x`` and ``y`` attributes.
        dtype: NumPy dtype of the returned array (default ``"int"``).

    Returns:
        Array of shape ``(shape.num_parts, 2)`` whose row ``i`` holds the
        ``(x, y)`` coordinates of landmark ``i``.
    """
    num_points = shape.num_parts
    coords = np.zeros((num_points, 2), dtype=dtype)
    for i in range(num_points):
        # Fetch each landmark once instead of once per coordinate.
        point = shape.part(i)
        coords[i] = (point.x, point.y)
    return coords

In [7]:
def get_distance(landmarks_coordinate):
  """Return the flattened matrix of pairwise Euclidean landmark distances.

  Entry ``a * n + b`` of the result is the distance between landmark ``a``
  and landmark ``b``, where ``n`` is the number of landmarks. For the usual
  68 landmarks this is a vector of length 68*68. The distances are computed
  in one broadcasted operation instead of one ``np.linalg.norm`` call per
  pair, and the landmark count is taken from the input rather than fixed.

  Args:
    landmarks_coordinate: array-like of shape ``(n, d)`` with one row of
      coordinates per landmark.

  Returns:
    1-D float64 array of length ``n * n`` (empty for empty input).

  Raises:
    ValueError: if the non-empty input is not two-dimensional.
  """
  # Work in float so unsigned/integer inputs cannot wrap on subtraction.
  points = np.asarray(landmarks_coordinate, dtype=float)
  if points.size == 0:
    return np.zeros(0)
  if points.ndim != 2:
    raise ValueError("landmarks_coordinate must have shape (n_points, n_dims)")
  # (n, 1, d) - (1, n, d) broadcasts to every pairwise difference (n, n, d).
  pairwise_diff = points[:, np.newaxis, :] - points[np.newaxis, :, :]
  # Row-major flattening reproduces the a * n + b ordering.
  return np.linalg.norm(pairwise_diff, axis=-1).ravel()

# Audio feature extraction

In [8]:
def getMELspectrogram(audio, sample_rate):
    """Compute a dB-scaled mel spectrogram of an audio signal.

    Args:
        audio: audio samples, passed to librosa as ``y``.
        sample_rate: sampling rate of ``audio``; also sets the upper mel
            frequency bound to ``sample_rate / 2``.

    Returns:
        The mel power spectrogram (128 mel bands, 1024-point FFT, 512-sample
        Hamming window, hop of 256 samples) converted to decibels with
        ``np.max`` as the reference.
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sample_rate,
        n_fft=1024,
        win_length=512,
        window='hamming',
        hop_length=256,
        n_mels=128,
        fmax=sample_rate/2,
    )
    return librosa.power_to_db(power_spec, ref=np.max)

# Walk through the data

In [9]:
### This is going to take about 1 hour
# For every actor, walk the video folder and, for each selected video:
#   1. sample every VIDEO_FRAME_DOWNSAMPLE_RATE-th frame, detect faces and
#      build one pairwise-landmark-distance row per detected face;
#   2. load the matching audio clip, zero-pad it to a fixed length and
#      compute its mel spectrogram;
#   3. write both feature matrices as CSV files under OUT_DIR/<actor>/.
AUDIO_OFFSET_SECONDS = 0.5    # start offset passed to librosa.load
AUDIO_DURATION_SECONDS = 3    # clip length passed to librosa.load; shorter clips are zero-padded

for i in ACTORS:
  file_path = VIDEO_DATA_PATH+i+"/"
  for dirname, _, files in os.walk(file_path):
    for CURRENT_FILE in files:
      # filter and only process speech video
      if CURRENT_FILE[:5] != '02-01':
        continue

      ### set video and audio path
      VIDEO_PATH = os.path.join(dirname, CURRENT_FILE)
      print(i, CURRENT_FILE)
      # The audio file shares the video's name except for the leading '03' and the extension.
      AUDIO_PATH = AUDIO_DATA_PATH+i+'/'+'03'+CURRENT_FILE[2:20]+'.wav'
      OUT_PATH = OUT_DIR+i+'/'
      # Make sure the per-actor output folder exists before writing into it.
      os.makedirs(OUT_PATH, exist_ok=True)
      # Zero-based emotion index parsed from the file name. It is not written
      # by this cell; kept because later cells may read it.
      emotion_label = int(CURRENT_FILE[7])-1

      ### face recognition
      face_data = []
      cap = cv2.VideoCapture(VIDEO_PATH)
      try:
        frame_num = 0
        while True:
          success, img = cap.read()
          if not success:
            break
          # Only every VIDEO_FRAME_DOWNSAMPLE_RATE-th frame is processed.
          keep_frame = frame_num % VIDEO_FRAME_DOWNSAMPLE_RATE == 0
          frame_num += 1
          if not keep_frame:
            continue
          # resize the frame and convert it to grayscale
          image = imutils.resize(img, width=500)
          gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
          # detect faces in the grayscale image (second argument: upsample the image once)
          rects = detector(gray, 1)
          # One feature row is appended per detected face, so a frame with
          # several detections contributes several rows.
          for rect in rects:
            # determine the facial landmarks for the face region, then
            # convert the facial landmark (x, y)-coordinates to a NumPy array
            shape = predictor(gray, rect)
            shape = shape_to_np(shape)
            # data of a frame in a video
            distance_data = get_distance(shape)
            face_data.append(distance_data)
      finally:
        # Always free the capture handle, even if a frame fails to process.
        cap.release()
      face_data = np.array(face_data)

      ### Audio feature
      audio, sample_rate = librosa.load(AUDIO_PATH, duration=AUDIO_DURATION_SECONDS, offset=AUDIO_OFFSET_SECONDS, sr=AUDIO_SAMPLE_RATE)
      # Zero-pad to a fixed length so every spectrogram has the same width.
      signal = np.zeros(int(AUDIO_SAMPLE_RATE*AUDIO_DURATION_SECONDS))
      signal[:len(audio)] = audio
      mel_spectrogram = getMELspectrogram(signal, AUDIO_SAMPLE_RATE)

      ### write data
      with open(OUT_PATH + CURRENT_FILE[:20] + '-face' + '.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerows(face_data)
      with open(OUT_PATH + CURRENT_FILE[:20] + '-audio' + '.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerows(mel_spectrogram)

Actor_01 02-01-01-01-02-02-01.mp4
Actor_01 02-01-01-01-01-01-01.mp4
Actor_01 02-01-01-01-02-01-01.mp4
Actor_01 02-01-02-01-01-02-01.mp4
Actor_01 02-01-01-01-01-02-01.mp4
Actor_01 02-01-02-01-01-01-01.mp4
Actor_01 02-01-02-01-02-01-01.mp4
Actor_01 02-01-02-01-02-02-01.mp4
Actor_01 02-01-02-02-02-02-01.mp4
Actor_01 02-01-02-02-01-02-01.mp4
Actor_01 02-01-02-02-01-01-01.mp4
Actor_01 02-01-02-02-02-01-01.mp4
Actor_01 02-01-03-01-02-01-01.mp4
Actor_01 02-01-03-02-01-01-01.mp4
Actor_01 02-01-03-01-01-01-01.mp4
Actor_01 02-01-03-01-01-02-01.mp4
Actor_01 02-01-03-01-02-02-01.mp4
Actor_01 02-01-03-02-01-02-01.mp4
Actor_01 02-01-04-01-01-01-01.mp4
Actor_01 02-01-03-02-02-01-01.mp4
Actor_01 02-01-04-01-02-01-01.mp4
Actor_01 02-01-04-01-02-02-01.mp4
Actor_01 02-01-04-01-01-02-01.mp4
Actor_01 02-01-03-02-02-02-01.mp4
Actor_01 02-01-05-01-01-02-01.mp4
Actor_01 02-01-04-02-02-02-01.mp4
Actor_01 02-01-05-01-01-01-01.mp4
Actor_01 02-01-04-02-01-02-01.mp4
Actor_01 02-01-04-02-02-01-01.mp4
Actor_01 02-01