## Video Dataset

Up until this point we have been training models on single frames from a video. Let's try training them on multiple frames instead.

To start, we'll take all of the videos in `HardDeepFakes` and create multi-frame clips from them (the cells below take 5 clips of 10 sequential frames from each video).

In [1]:
import os
import glob
import torch

from pathlib import Path
import numpy as np
import torch.nn.functional as F
import fastai
from fastai.vision import *
from fastai.basics import *
from utils import read_random_sequential_frames, plot_detections
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

from EasyBlazeFace import EasyBlazeFace
from EasyRetinaFace import EasyRetinaFace

In [2]:
# Collect the metadata.json of every training part (0-49) into one dataframe
metadata_list = []

for i in range(50):
    folder = Path(f"../data/dfdc_train_part_{i}")
    metadata_file_path = folder/'metadata.json'

    # Transpose so each former column becomes a row, then move the resulting
    # index into an ordinary 'fname' column.
    metadata = (
        pd.read_json(metadata_file_path)
        .T
        .reset_index()
        .rename({'index':'fname'}, axis=1)
    )

    # Remember which part folder the row came from so the video can be located later
    metadata['directory'] = str(folder)

    metadata_list.append(metadata)


all_metadata = pd.concat(metadata_list)

In [3]:
# Hand-picked list of fake video file names used to build the clip dataset.
# Each name is matched against the 'fname' column of `all_metadata` further down.
# NOTE(review): presumably this is the `HardDeepFakes` set mentioned in the intro — confirm.
fake_files = ['aaiqsitvnd.mp4', 'afvlrevses.mp4', 'azqycoiiuk.mp4', 'aawkmrlilr.mp4', 'bjxzcaifpw.mp4',
              'cjstqmiyud.mp4', 'ajxpaiqcee.mp4', 'yumeecupaw.mp4', 'bbbmfffsad.mp4', 'bbaeewtqei.mp4',
               'aafcgzwvmy.mp4', 'dwqhiwmswx.mp4', 'cczisiwrmd.mp4', 'danyfkofxk.mp4', 'aetpnobkcv.mp4',
               'agswjtuhss.mp4', 'ctjewcqxee.mp4', 'baxdvqglnu.mp4', 'houzrrfbqg.mp4', 'acfaxepklq.mp4',
               'tfaxqkuhoh.mp4', 'cmoqplbifs.mp4', 'aihvsjluzl.mp4', 'aazkzzkbns.mp4', 'ahqausishe.mp4',
               'atwmgvaauf.mp4', 'aafezqchru.mp4', 'aetbyrujti.mp4', 'dqqwmmwlbg.mp4', 'zuweqkkudv.mp4',
               'cathvygbkb.mp4', 'bgkmgallha.mp4', 'alhsvhumdw.mp4', 'bbfewvawtu.mp4', 'dkrvorliqc.mp4',
               'abkxlatant.mp4', 'ajnzuiktuo.mp4', 'bdbryopkaa.mp4', 'xnmmweqmdt.mp4', 'bvhzopqkek.mp4',
               'aaqaifqrwn.mp4', 'bsprlfyyyc.mp4', 'agbfymrrhv.mp4', 'aiuvaldnqj.mp4', 'cmgmhevsmr.mp4',
               'aabkwhhgwv.mp4', 'djjafmdtxy.mp4', 'zwwvpycsis.mp4', 'dmzbzcjsrg.mp4', 'efghsdmasb.mp4',
               'coulkppulq.mp4', 'adtovspidj.mp4', 'arqhhrzzfe.mp4', 'cjetyhnwpr.mp4', 'akqjhilhtc.mp4',
               'alddoocxqo.mp4', 'aahzjreawd.mp4', 'zvhdwittmk.mp4', 'avfoukjslp.mp4', 'abteztchqz.mp4',
               'abfvpzjkwr.mp4', 'aakkdgsmvl.mp4', 'cdyoiiuszk.mp4', 'beshruwzkt.mp4', 'aakjukgjme.mp4',
               'aqgkloqxld.mp4', 'hfkwogkzid.mp4', 'avocsaizhx.mp4', 'ammkabwonw.mp4', 'avcnyamvrb.mp4',
               'anrlivgkfq.mp4', 'cnbtegsjlx.mp4', 'cbulmphxfg.mp4', 'acgyspwyqr.mp4', 'abhdaoikio.mp4',
               'azivnorcbt.mp4', 'agsrndrnjg.mp4', 'aemldzfofe.mp4', 'aaeucwtkdx.mp4', 'cqnutosgsm.mp4',
               'ackyrwxeew.mp4', 'aagundkpoh.mp4', 'bzqemivbgx.mp4', 'akaqyxcfep.mp4', 'bnbxarxyqk.mp4',
               'dywkdfclzy.mp4', 'aqfeqkckjs.mp4', 'azpamhpoyy.mp4', 'aoclawrydd.mp4', 'ahpaydsovz.mp4',
               'ahencqpiin.mp4', 'aaaoqepxnf.mp4', 'caqderfjql.mp4', 'aawhvztjdt.mp4', 'ayclhmjajg.mp4',
               'bmjmjmbglm.mp4', 'bacnffbyky.mp4', 'abhgyltuqg.mp4', 'azylwizjmc.mp4', 'asgzesadhh.mp4']

# Sanity check: display how many fake videos were selected
len(fake_files)

100

In [4]:
# For each fake video, take the 'original' value of its first matching metadata
# row (presumably the real source video it was generated from — confirm against
# the metadata schema). Order follows `fake_files`, so the two lists stay aligned.
real_files = [
    all_metadata.loc[all_metadata['fname'] == fake_name, 'original'].iloc[0]
    for fake_name in fake_files
]

len(real_files)

100

In [5]:
# First 80 entries go to training, the remainder to validation. The same cut is
# applied to both lists so each fake stays in the same split as its original.
# NOTE(review): if two fakes share one original, that original can land in both
# splits — worth checking `real_files` for duplicates.
train_fake_files, val_fake_files = fake_files[:80], fake_files[80:]
train_real_files, val_real_files = real_files[:80], real_files[80:]

In [6]:
def get_faces_from_multiple_frames(detector, path, num_frames=10):
    """
    Given the path to a video (.mp4) read `num_frames` sequential random frames
    and return any faces found within the frames.

    For the tracked face, the bounding boxes of every frame are merged into one
    box that encloses all of them, and that single box is used to crop every
    frame, so all crops of a clip share the same size.

    Args:
        detector: object exposing `detect_on_multiple_frames(frames)`, which must
            return one collection of detections per frame. Each detection is
            unpacked as `(x_min, y_min, x_max, y_max, <ignored fifth value>)`.
        path: path of the video, passed to `read_random_sequential_frames`.
        num_frames: number of frames requested from `read_random_sequential_frames`.

    Returns:
        A list with one entry per face; each entry is `frames` cropped to that
        face's box (`frames[:, y_min:y_max, x_min:x_max]`). Returns `[]` when no
        frame contains a detection.
    """

    frames = read_random_sequential_frames(path, num_frames=num_frames)

    # Get a set of detections for these frames
    detections_for_frames = detector.detect_on_multiple_frames(frames)

    if len(detections_for_frames) == 0:
        return []

    # A list of the detections for each face in the video.
    # Each face has one set of coordinates that contains ALL of the bounding boxes from every frame.
    largest_detections = []

    for detections in detections_for_frames:

        # A detector can miss the face in individual frames; skip those frames
        # instead of failing on `detections[0]`.
        if detections is None or len(detections) == 0:
            continue

        # The first frame that has detections seeds the list of boxes.
        if not largest_detections:
            for x_min, y_min, x_max, y_max, _ in detections:
                largest_detections.append([x_min, y_min, x_max, y_max])
            continue

        # TODO: Generalize for videos with multiple people
        x_min, y_min, x_max, y_max, _ = detections[0]
        current_x_min, current_y_min, current_x_max, current_y_max = largest_detections[0]

        # Expand the bounding box if necessary to include this one
        largest_detections[0] = [
            min(x_min, current_x_min),
            min(y_min, current_y_min),
            max(x_max, current_x_max),
            max(y_max, current_y_max),
        ]

    # Now that we have a set of detections, apply them against the frames and
    # return only the portions of the frames that contain the face
    faces = []
    for x_min, y_min, x_max, y_max in largest_detections:
        # Clamp to 0: a negative slice bound would be interpreted as an offset
        # from the end of the axis and yield a wrong (often empty) crop.
        # Bounds past the frame edge need no clamping; slicing truncates them.
        left, top = max(int(x_min), 0), max(int(y_min), 0)
        right, bottom = max(int(x_max), 0), max(int(y_max), 0)

        # Get only the face from the frames
        face_frames = frames[:, top:bottom, left:right]
        faces.append(face_frames)

    return faces

In [7]:
def _print_clip_error(file, error):
    """Print a failure report for `file` followed by the exception that caused it."""
    print("ERROR")
    print(file)
    print()

    print(error)


def create_frames_from_videos(files, suffix, folder='train', num_frames=10, clips_per_video=5):
    """
    Extract face clips from each video in `files` and save them as .npy files.

    For every video, `clips_per_video` clips of `num_frames` frames are read.
    Faces are detected with BlazeFace first; if it finds nothing, RetinaFace is
    tried. The frames of the first face are concatenated along the last axis and
    saved to `../data/<folder>/<fname>_<clip index>_<suffix>.npy`.

    Failures are best-effort: any exception for a clip is printed and the loop
    moves on to the next clip.

    Args:
        files: iterable of video file names; each must match a value in the
            'fname' column of the module-level `all_metadata` dataframe.
        suffix: label appended to each saved file name (the notebook passes
            "FAKE" or "REAL").
        folder: sub-directory of `../data` to write into; created if missing.
        num_frames: number of frames per clip.
        clips_per_video: number of clips to extract from each video.

    Returns:
        None. Output is written to disk.
    """

    easyBlazeFace = EasyBlazeFace()
    easyRetinaFace = EasyRetinaFace()

    # np.save does not create missing directories, so make sure the target exists
    output_dir = "../data/" + folder
    os.makedirs(output_dir, exist_ok=True)

    for file in tqdm(files):

        # The metadata row is the same for every clip of a video: look it up once
        try:
            row = all_metadata.loc[all_metadata['fname'] == file].iloc[0]
            video_path = row['directory'] + "/" + row['fname']
        except Exception as e:
            _print_clip_error(file, e)
            continue

        for i in range(clips_per_video):
            try:
                # Get detections from video with BlazeFace by default
                faces = get_faces_from_multiple_frames(easyBlazeFace, video_path, num_frames=num_frames)

                if len(faces) == 0:
                    # If BlazeFace cannot find detections, try again with RetinaFace
                    faces = get_faces_from_multiple_frames(easyRetinaFace, video_path, num_frames=num_frames)

                    # If we still can't find any faces, just log and continue
                    if len(faces) == 0:
                        print("NOTHING FOR:", video_path)
                        continue

                # TODO: Generalize for videos with multiple people
                faces = faces[0]

                # Stack frames along the last (channel) axis.
                # Assuming the clip is (n, h, w, c), this gives (h, w, n * c).
                faces = np.concatenate([f.squeeze() for f in faces], axis=-1)

                # Save as .npy file
                save_path = output_dir + "/" + row['fname'] + "_" + str(i) + "_" + suffix + ".npy"
                np.save(save_path, faces)
            except Exception as e:
                _print_clip_error(file, e)

In [8]:
# Generate the training clips: fakes first, then their originals
os.makedirs('../data/train', exist_ok=True)

for files, suffix in ((train_fake_files, "FAKE"), (train_real_files, "REAL")):
    create_frames_from_videos(files, suffix=suffix, folder="train", num_frames=10)

Loading pretrained model from Pytorch_Retinaface/weights/Resnet50_Final.pth
remove prefix 'module.'
Missing keys:0
Unused checkpoint keys:0
Used keys:456


HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))


Loading pretrained model from Pytorch_Retinaface/weights/Resnet50_Final.pth
remove prefix 'module.'
Missing keys:0
Unused checkpoint keys:0
Used keys:456


HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))




In [9]:
# Generate the validation clips: fakes first, then their originals
os.makedirs('../data/val', exist_ok=True)

for files, suffix in ((val_fake_files, "FAKE"), (val_real_files, "REAL")):
    create_frames_from_videos(files, suffix=suffix, folder="val", num_frames=10)

Loading pretrained model from Pytorch_Retinaface/weights/Resnet50_Final.pth
remove prefix 'module.'
Missing keys:0
Unused checkpoint keys:0
Used keys:456


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Loading pretrained model from Pytorch_Retinaface/weights/Resnet50_Final.pth
remove prefix 'module.'
Missing keys:0
Unused checkpoint keys:0
Used keys:456


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


