## Video Dataset

Up until this point we have been training models on single frames from a video. Let's try training them on multiple frames instead.

To start, we'll take all of the videos in `HardDeepFakes` and create multi-frame subsets of them (the cells below request 16 frames per video).

In [1]:
import os
import glob
import torch

from pathlib import Path
import numpy as np
import torch.nn.functional as F
import fastai
from fastai.vision import *
from fastai.basics import *
from video_utils import read_random_sequential_frames, plot_detections, load_all_metadata
from video_utils import read_frames
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

from EasyBlazeFace import EasyBlazeFace
from EasyRetinaFace import EasyRetinaFace

In [2]:
# Load the video metadata with the `video_utils` helper.
# NOTE(review): the return type is not visible here, and `all_metadata` is not
# referenced by the later cells shown in this notebook - confirm it is still needed.
all_metadata = load_all_metadata()

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [15]:
def get_faces_from_multiple_frames(detector, path, num_frames=10):
    """
    Given the path to a video (.mp4), read frames from the start of the video
    (`read_frames` is called with `start=0, end=num_frames`) and return the
    face regions cropped out of those frames.

    A single bounding box is grown to cover the face's detections in every
    frame, so each returned crop has the same spatial extent across frames.

    Args:
        detector: Object exposing `get_detections_with_multiple_crops(frames)`,
            which must return one collection of detections per frame. Each
            detection unpacks into five values: x_min, y_min, x_max, y_max
            and a fifth value that is ignored here (presumably a confidence
            score - TODO confirm).
        path: Path to the video file.
        num_frames: Passed to `read_frames` as the `end` frame index.

    Returns:
        A list of crops, one per detection found in the first frame that had
        any detections. Each crop is `frames[:, y_min:y_max, x_min:x_max]`,
        i.e. it spans all frames that were read. Returns an empty list when no
        frame has a detection, or when any later frame (among those with
        detections) does not contain exactly one detection.

    Note:
        `frames` is sliced on three axes at once, so it is assumed to be a
        NumPy-style array laid out as (n, height, width, ...) - TODO confirm
        against `read_frames`.
    """
    frames = read_frames(path, start=0, end=num_frames)

    # Get a set of detections for these frames
    detections_for_frames = detector.get_detections_with_multiple_crops(frames)

    # Drop frames without any detection. `len(x) != 0` is used rather than
    # truthiness because the per-frame detections may be arrays.
    detections_for_frames = [x for x in detections_for_frames if len(x) != 0]

    if len(detections_for_frames) == 0:
        return []

    frame_height, frame_width, _ = frames[0].shape

    # A list of the detections for each face in the video.
    # Each face has one set of coordinates that contains ALL of the bounding
    # boxes from every frame. It is seeded from the first frame that has
    # detections.
    largest_detections = [
        [x_min, y_min, x_max, y_max]
        for x_min, y_min, x_max, y_max, _ in detections_for_frames[0]
    ]

    for detections in detections_for_frames[1:]:

        # If there are any detections that indicate multiple people, ignore this video
        if len(detections) != 1:
            return []

        x_min, y_min, x_max, y_max, _ = detections[0]

        # TODO: Generalize for videos with multiple people
        current_x_min, current_y_min, current_x_max, current_y_max = largest_detections[0]

        # Expand the bounding box if necessary to include this one
        largest_detections[0] = [
            min(x_min, current_x_min),
            min(y_min, current_y_min),
            max(x_max, current_x_max),
            max(y_max, current_y_max),
        ]

    # Now that we have a set of detections, apply them against the frames and
    # return only the portions of the frames that contain the face
    faces = []
    for x_min, y_min, x_max, y_max in largest_detections:
        # Make sure dets are within the frame. This is done here, for every
        # box, so that boxes which never went through the expansion loop above
        # (a single frame with detections, or extra first-frame boxes) are
        # clamped too; a negative coordinate would otherwise be interpreted as
        # a from-the-end index by the slice below.
        x_min = max(x_min, 0)
        y_min = max(y_min, 0)
        x_max = min(x_max, frame_width)
        y_max = min(y_max, frame_height)

        # Get only the face from the frames
        face_frames = frames[:, int(y_min):int(y_max), int(x_min):int(x_max)]
        faces.append(face_frames)

    return faces

In [16]:
def create_frames_from_videos(df, folder='train', num_frames=10):
    """
    Crop the face out of each video listed in `df` and save the cropped
    frames, stacked along the last axis, as one `.npy` file per video.

    Files are written to `../data/<folder>/<fname>_<label>.npy`. Videos for
    which no usable face is found are skipped. Any exception raised while
    processing a video is printed and the loop moves on to the next row, so a
    single bad video does not abort the run.

    Args:
        df: DataFrame whose rows provide the 'directory', 'fname' and 'label'
            columns.
        folder: Name of the sub-folder of `../data` to write into.
        num_frames: Forwarded to `get_faces_from_multiple_frames`.

    Returns:
        None.
    """
    # Videos where no face could be extracted (only used for progress messages)
    no_good = []

    detector = EasyBlazeFace()

    # np.save does not create missing directories; without this every row
    # would fail and only be reported through the per-row error print below.
    os.makedirs("../data/" + folder, exist_ok=True)

    # iterrows() is a generator without a length, so give tqdm the total
    # explicitly to get a real progress bar.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        try:
            video_path = row['directory'] + "/" + row['fname']
            suffix = row['label']

            # Get detections from video with BlazeFace by default
            faces = get_faces_from_multiple_frames(detector, video_path, num_frames=num_frames)

            if len(faces) == 0:
                no_good.append(video_path)
                if len(no_good) % 500 == 0:
                    print("No faces: {}".format(len(no_good)))
                continue

            # Assume a single person
            faces = faces[0]

            # Stack frames along the last (channel) dimension.
            # Assuming each frame is (h, w, c): (n, h, w, c) -> (h, w, n * c)
            faces = np.concatenate([f.squeeze() for f in faces], axis=-1)

            # Save as .npy file
            save_path = "../data/" + folder + "/" + row['fname'] + "_" + suffix + ".npy"
            np.save(save_path, faces)
        except Exception as e:
            # Deliberate best-effort: report the failing video and keep going
            print("ERROR")
            print(row['fname'])
            print(e)
            print()

In [None]:
# Make training files: ensure the output folder exists, load the training CSV,
# then save a stacked .npy (num_frames=16) under ../data/train for each video
# in which a face is found.
os.makedirs('../data/train', exist_ok=True)

train = pd.read_csv('../data/cropped_faces/train.csv')

create_frames_from_videos(train, folder="train", num_frames=16)

In [None]:
# Make validation files: ensure the output folder exists, load the validation
# CSV, then save a stacked .npy (num_frames=16) under ../data/valid for each
# video in which a face is found.
os.makedirs('../data/valid', exist_ok=True)

valid = pd.read_csv('../data/cropped_faces/valid.csv')

create_frames_from_videos(valid, folder="valid", num_frames=16)