In [1]:
import numpy as np
import pandas as pd
import os

### Download videos for training

Downloads videos as mp4 and saves the audio as mp3. mp4 may or may not contain audio due to the API being limited, though this should not matter (except for the small negative impact on file size). Currently videos are being downloaded in 720p since 1080p seems bugged

In [2]:
from pytube import YouTube 
  
urls = ["https://www.youtube.com/watch?v=Yt3-a9mExZg", "https://www.youtube.com/watch?v=cGt8bEcd9Ms"]
pathVideo = 'Data/Video'
pathAudio = 'Data/Audio'

for i, url in enumerate(urls):
    yt = YouTube(url)
    yt.streams.filter(abr="160kbps", progressive=False).first().download(filename="Audio"+str(i)+".mp3", output_path = pathAudio)
    yt.streams.filter(res="720p", progressive=False).first().download(filename="Video"+str(i)+".mp4", output_path= pathVideo)
##

### Split video into frames

Goes through each downloaded video. Creates a directory per video inside 'tempFrames' and then saves the frames here. We can set the amount of frames we want per minute. Note that file sizes of these images can balloon quickly, exceeding the filesize of the video itself.

In [3]:
import cv2
FRAMES_PER_MINUTE = 10
frame_counter = 60000 / FRAMES_PER_MINUTE
pathVideo = 'Data/Video/'
videoNames = []
for filename in os.listdir(pathVideo):
    f = os.path.join(pathVideo, filename)
    if os.path.isfile(f): ## checking if it is a file
        videoNames.append(filename)

for video in videoNames:
    path = pathVideo + video
    outputPath = 'Data/tempFrames/' + video[:-4] + '/' ## make directory per video
    if not os.path.exists(outputPath):
        os.mkdir(outputPath)
        print(f'Created new dir {outputPath}')
    vidcap = cv2.VideoCapture(pathVideo + video)
    success,image = vidcap.read()
    print(f'Succesful videocapture?: {success}')
    count = 0
    while success:
        cv2.imwrite(outputPath + "%d_seconds.jpg" % (count*(frame_counter/1000)), image)     # save frame as JPEG file   
        count += 1
        vidcap.set(cv2.CAP_PROP_POS_MSEC,(count*frame_counter))   
        success,image = vidcap.read()
        

Succesful videocapture?: False
Succesful videocapture?: True
Succesful videocapture?: True


### Segmenting faces/webcam feeds

By segmenting individuals, we can create temporary folders containing the frames per individual participant. This might be neccessary depending on the emotion detection model that we apply. It seems better to avoid this step is possible, as it introduces an extra model (segmentation), and also requires the creation of more files which need to be operated on. This could be rather inefficient

In [4]:
## Some code here for extracting individuals, saving new cropped images

### Video emotion detection

Now we run an emotion detection model to acquire some score. For every frame, we should get some "emotion scores" per individual. We can then extract some values such as minimum, maximum and mean scores per person. We can eventually combine this into a final score. For this to be as accurate as possible we will measure audio later 

In [1]:
from deepface import DeepFace
import cv2
import matplotlib.pyplot as plt
face_model = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')


In [34]:
capture = cv2.VideoCapture('/Data/Video/Video0.mp4')
length = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
numberOfFrames = 10
currentFrame = 0
for i in range(12):
    _, frame = capture.read()
    if currentFrame != numberOfFrames:
        currentFrame += 1
        continue
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_model.detectMultiScale(gray, 1.1, 5)

    for (x, y, w ,h) in faces:
        cv2.rectangle(frame,(x,y),(x+w,y+h),(255,0,0),2)

    for face in faces:
        print(face)
        emotion = DeepFace.analyze(frame, actions = ['gender','emotion'])
        print(emotion)

    currentFrame = 0
    cv2.imshow('x',frame)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    cv2.waitKey(1)

[1151  277   64   64]


Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.06it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.08it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.55it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.28it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.46it/s]


[{'gender': {'Woman': 5.294562131166458, 'Man': 94.7054386138916}, 'dominant_gender': 'Man', 'region': {'x': 472, 'y': 545, 'w': 115, 'h': 115}, 'emotion': {'angry': 0.023690535410953933, 'disgust': 6.643793822520522e-06, 'fear': 99.4342624803938, 'happy': 4.84737114918727e-06, 'sad': 0.5307663082788174, 'surprise': 0.011244136782712074, 'neutral': 2.5825453183185254e-05}, 'dominant_emotion': 'fear'}, {'gender': {'Woman': 0.2635783748701215, 'Man': 99.73642230033875}, 'dominant_gender': 'Man', 'region': {'x': 928, 'y': 111, 'w': 110, 'h': 110}, 'emotion': {'angry': 1.4324935153126717, 'disgust': 0.022462835477199405, 'fear': 23.60522598028183, 'happy': 0.005210157178225927, 'sad': 74.85388517379761, 'surprise': 6.297289019130403e-05, 'neutral': 0.08065410074777901}, 'dominant_emotion': 'sad'}, {'gender': {'Woman': 0.00018886947827922995, 'Man': 99.99980926513672}, 'dominant_gender': 'Man', 'region': {'x': 914, 'y': 486, 'w': 114, 'h': 114}, 'emotion': {'angry': 2.2502200677990913, 'dis

Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  7.96it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  6.73it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.40it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.58it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.56it/s]


[{'gender': {'Woman': 5.294562131166458, 'Man': 94.7054386138916}, 'dominant_gender': 'Man', 'region': {'x': 472, 'y': 545, 'w': 115, 'h': 115}, 'emotion': {'angry': 0.023690535410953933, 'disgust': 6.643793822520522e-06, 'fear': 99.4342624803938, 'happy': 4.84737114918727e-06, 'sad': 0.5307663082788174, 'surprise': 0.011244136782712074, 'neutral': 2.5825453183185254e-05}, 'dominant_emotion': 'fear'}, {'gender': {'Woman': 0.2635783748701215, 'Man': 99.73642230033875}, 'dominant_gender': 'Man', 'region': {'x': 928, 'y': 111, 'w': 110, 'h': 110}, 'emotion': {'angry': 1.4324935153126717, 'disgust': 0.022462835477199405, 'fear': 23.60522598028183, 'happy': 0.005210157178225927, 'sad': 74.85388517379761, 'surprise': 6.297289019130403e-05, 'neutral': 0.08065410074777901}, 'dominant_emotion': 'sad'}, {'gender': {'Woman': 0.02211132232332602, 'Man': 99.97789263725281}, 'dominant_gender': 'Man', 'region': {'x': 273, 'y': 101, 'w': 145, 'h': 145}, 'emotion': {'angry': 4.973066598176956, 'disgust

Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.38it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.48it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.33it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.59it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.21it/s]


[{'gender': {'Woman': 5.294562131166458, 'Man': 94.7054386138916}, 'dominant_gender': 'Man', 'region': {'x': 472, 'y': 545, 'w': 115, 'h': 115}, 'emotion': {'angry': 0.023690535410953933, 'disgust': 6.643793822520522e-06, 'fear': 99.4342624803938, 'happy': 4.84737114918727e-06, 'sad': 0.5307663082788174, 'surprise': 0.011244136782712074, 'neutral': 2.5825453183185254e-05}, 'dominant_emotion': 'fear'}, {'gender': {'Woman': 0.2635783748701215, 'Man': 99.73642230033875}, 'dominant_gender': 'Man', 'region': {'x': 928, 'y': 111, 'w': 110, 'h': 110}, 'emotion': {'angry': 1.4324935153126717, 'disgust': 0.022462835477199405, 'fear': 23.60522598028183, 'happy': 0.005210157178225927, 'sad': 74.85388517379761, 'surprise': 6.297289019130403e-05, 'neutral': 0.08065410074777901}, 'dominant_emotion': 'sad'}, {'gender': {'Woman': 0.02211132232332602, 'Man': 99.97789263725281}, 'dominant_gender': 'Man', 'region': {'x': 273, 'y': 101, 'w': 145, 'h': 145}, 'emotion': {'angry': 4.973066598176956, 'disgust

Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.68it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.67it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  7.75it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.26it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.06it/s]


[{'gender': {'Woman': 0.2635783748701215, 'Man': 99.73642230033875}, 'dominant_gender': 'Man', 'region': {'x': 928, 'y': 111, 'w': 110, 'h': 110}, 'emotion': {'angry': 1.4324935153126717, 'disgust': 0.022462835477199405, 'fear': 23.60522598028183, 'happy': 0.005210157178225927, 'sad': 74.85388517379761, 'surprise': 6.297289019130403e-05, 'neutral': 0.08065410074777901}, 'dominant_emotion': 'sad'}, {'gender': {'Woman': 0.02211132232332602, 'Man': 99.97789263725281}, 'dominant_gender': 'Man', 'region': {'x': 273, 'y': 101, 'w': 145, 'h': 145}, 'emotion': {'angry': 4.973066598176956, 'disgust': 4.165743661133092e-06, 'fear': 5.77203743159771, 'happy': 0.0018566197468317114, 'sad': 82.11588263511658, 'surprise': 0.004311347584007308, 'neutral': 7.132849097251892}, 'dominant_emotion': 'sad'}, {'gender': {'Woman': 5.294562131166458, 'Man': 94.7054386138916}, 'dominant_gender': 'Man', 'region': {'x': 472, 'y': 545, 'w': 115, 'h': 115}, 'emotion': {'angry': 0.023690535410953933, 'disgust': 6.6

Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.01it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  5.21it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.86it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.02it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.33it/s]


[{'gender': {'Woman': 5.294562131166458, 'Man': 94.7054386138916}, 'dominant_gender': 'Man', 'region': {'x': 472, 'y': 545, 'w': 115, 'h': 115}, 'emotion': {'angry': 0.023690535410953933, 'disgust': 6.643793822520522e-06, 'fear': 99.4342624803938, 'happy': 4.84737114918727e-06, 'sad': 0.5307663082788174, 'surprise': 0.011244136782712074, 'neutral': 2.5825453183185254e-05}, 'dominant_emotion': 'fear'}, {'gender': {'Woman': 0.2635783748701215, 'Man': 99.73642230033875}, 'dominant_gender': 'Man', 'region': {'x': 928, 'y': 111, 'w': 110, 'h': 110}, 'emotion': {'angry': 1.4324935153126717, 'disgust': 0.022462835477199405, 'fear': 23.60522598028183, 'happy': 0.005210157178225927, 'sad': 74.85388517379761, 'surprise': 6.297289019130403e-05, 'neutral': 0.08065410074777901}, 'dominant_emotion': 'sad'}, {'gender': {'Woman': 0.02211132232332602, 'Man': 99.97789263725281}, 'dominant_gender': 'Man', 'region': {'x': 273, 'y': 101, 'w': 145, 'h': 145}, 'emotion': {'angry': 4.973066598176956, 'disgust

Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.73it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  6.47it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  7.51it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.69it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.14it/s]


[{'gender': {'Woman': 5.294562131166458, 'Man': 94.7054386138916}, 'dominant_gender': 'Man', 'region': {'x': 472, 'y': 545, 'w': 115, 'h': 115}, 'emotion': {'angry': 0.023690535410953933, 'disgust': 6.643793822520522e-06, 'fear': 99.4342624803938, 'happy': 4.84737114918727e-06, 'sad': 0.5307663082788174, 'surprise': 0.011244136782712074, 'neutral': 2.5825453183185254e-05}, 'dominant_emotion': 'fear'}, {'gender': {'Woman': 0.2635783748701215, 'Man': 99.73642230033875}, 'dominant_gender': 'Man', 'region': {'x': 928, 'y': 111, 'w': 110, 'h': 110}, 'emotion': {'angry': 1.4324935153126717, 'disgust': 0.022462835477199405, 'fear': 23.60522598028183, 'happy': 0.005210157178225927, 'sad': 74.85388517379761, 'surprise': 6.297289019130403e-05, 'neutral': 0.08065410074777901}, 'dominant_emotion': 'sad'}, {'gender': {'Woman': 0.02211132232332602, 'Man': 99.97789263725281}, 'dominant_gender': 'Man', 'region': {'x': 273, 'y': 101, 'w': 145, 'h': 145}, 'emotion': {'angry': 4.973066598176956, 'disgust

Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.24it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  7.94it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.33it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.52it/s]
Action: emotion: 100%|██████████| 2/2 [00:00<00:00,  8.83it/s]


[{'gender': {'Woman': 5.294562131166458, 'Man': 94.7054386138916}, 'dominant_gender': 'Man', 'region': {'x': 472, 'y': 545, 'w': 115, 'h': 115}, 'emotion': {'angry': 0.023690535410953933, 'disgust': 6.643793822520522e-06, 'fear': 99.4342624803938, 'happy': 4.84737114918727e-06, 'sad': 0.5307663082788174, 'surprise': 0.011244136782712074, 'neutral': 2.5825453183185254e-05}, 'dominant_emotion': 'fear'}, {'gender': {'Woman': 0.2635783748701215, 'Man': 99.73642230033875}, 'dominant_gender': 'Man', 'region': {'x': 928, 'y': 111, 'w': 110, 'h': 110}, 'emotion': {'angry': 1.4324935153126717, 'disgust': 0.022462835477199405, 'fear': 23.60522598028183, 'happy': 0.005210157178225927, 'sad': 74.85388517379761, 'surprise': 6.297289019130403e-05, 'neutral': 0.08065410074777901}, 'dominant_emotion': 'sad'}, {'gender': {'Woman': 0.02211132232332602, 'Man': 99.97789263725281}, 'dominant_gender': 'Man', 'region': {'x': 273, 'y': 101, 'w': 145, 'h': 145}, 'emotion': {'angry': 4.973066598176956, 'disgust

### Audio analysis

#### Speech to text

In [7]:
from datasets import load_dataset
from transformers import pipeline

dataset = load_dataset("anton-l/superb_demo", "er", split="session1")

classifier = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")
labels = classifier(dataset[0]["file"], top_k=5)



Found cached dataset superb_demo (C:/Users/ramon.cremers/.cache/huggingface/datasets/anton-l___superb_demo/er/1.9.0/77d23894ff429329a7fe80f9007cabb0deec321316f8dda1a1e9d10ffa089d08)


In [8]:
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

def map_to_array(example):
    speech, _ = librosa.load(example["file"], sr=16000, mono=True)
    example["speech"] = speech
    return example

# load a demo dataset and read audio files
dataset = load_dataset("anton-l/superb_demo", "er", split="session1")
print(dataset)
dataset = dataset.map(map_to_array)

model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-er")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-er")

# compute attention masks and normalize the waveform if needed
inputs = feature_extractor(dataset[:4]["speech"], sampling_rate=16000, padding=True, return_tensors="pt")

logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)
labels = [model.config.id2label[_id] for _id in predicted_ids.tolist()]


: 

: 

In [None]:
labels

['hap', 'hap', 'ang', 'hap']

: 