## Create Easy Dataset

I have manually curated 100 **FAKE** videos in which I believe it is trivially easy for a human to identify that there is a deepfake present in the video.

Most of these videos contain one person in a well-lit environment.

In [1]:
import os
import cv2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from pathlib import Path
from os import listdir
from tqdm.notebook import tqdm

from EasyBlazeFace import EasyBlazeFace
from EasyRetinaFace import EasyRetinaFace

Since we're working with files across all directories, let's join all the metadata files together into a single dataframe.

In [2]:
# Build one combined metadata frame covering every dfdc_train_part_* folder
metadata_list = []

for part in range(50):
    folder = Path("../data/dfdc_train_part_" + str(part))

    # After transposing, the index holds the video file names; expose it as
    # an 'fname' column and remember which folder each video lives in.
    part_metadata = (
        pd.read_json(folder/'metadata.json')
        .T
        .reset_index()
        .rename({'index': 'fname'}, axis=1)
        .assign(directory=str(folder))
    )
    metadata_list.append(part_metadata)

all_metadata = pd.concat(metadata_list)

In [3]:
# Inspect the last rows of the combined metadata frame
all_metadata.tail()

Unnamed: 0,fname,label,split,original,directory
3129,pdooqxqfrm.mp4,FAKE,train,ikebomnsiq.mp4,../data/dfdc_train_part_49
3130,djjdcnhlma.mp4,FAKE,train,kudvvlgiff.mp4,../data/dfdc_train_part_49
3131,fgmbxfqoze.mp4,REAL,train,,../data/dfdc_train_part_49
3132,cywebjaezn.mp4,REAL,train,,../data/dfdc_train_part_49
3133,ohmkmcfxul.mp4,FAKE,train,hysmzkqsdl.mp4,../data/dfdc_train_part_49


In [4]:
# A list of easy FAKE videos from folders 0 to 19
# (hand-curated file names; each one is later looked up by 'fname' in all_metadata)
fake_files = ['avwwbtbtqr.mp4', 'atosiqqljt.mp4', 'adcbeqixvb.mp4', 'copybkdfji.mp4', 'bokwfrmeyv.mp4', 'eqmokglwxl.mp4',
          'iceyyvssxc.mp4', 'aofuhtnouj.mp4', 'asjzklsxgb.mp4', 'cfgcngmdjn.mp4', 'agikkrfetn.mp4', 'bnhjdjlszu.mp4',
          'fdhmvpdhmg.mp4', 'jhnrzqlhxk.mp4', 'addyjceutz.mp4', 'mzccvujkfo.mp4', 'abbazlhmhz.mp4', 'agrzhzapku.mp4',
          'afrgmowivl.mp4', 'brrdxgrikt.mp4', 'aadlvezwah.mp4', 'drvtugrrjx.mp4', 'rlkygewnwi.mp4', 'golmvrouze.mp4',
          'ahbroumuqx.mp4', 'afcxyiwatf.mp4', 'asngnptgeu.mp4', 'bynggtiynv.mp4', 'fviqzhbyfd.mp4', 'aatzuimcwr.mp4',
          'feyilafcbb.mp4', 'aysxspqqdm.mp4', 'bvmplbmqde.mp4', 'blbcnohsup.mp4', 'elxxsfuyhl.mp4', 'fukhunlqmn.mp4',
          'iilrffkxoh.mp4', 'abssjquwjy.mp4', 'ezlehpbfya.mp4', 'aebgvmafzx.mp4', 'aasjtiyjip.mp4', 'deagrwoqak.mp4',
          'ccqrbbudlr.mp4', 'bsctqevgne.mp4', 'sbmfakhsez.mp4', 'asifdtncje.mp4', 'bnzlmqqdyh.mp4', 'gvekjxzsik.mp4',
          'hogmvuuhhl.mp4', 'bxdbfogqbx.mp4', 'azlwgfnjpi.mp4', 'fknonbqgdm.mp4', 'iipackcsoi.mp4', 'rbfbzjmxot.mp4',
          'bgxvtdyush.mp4', 'acsnnvnvhy.mp4', 'dgkdcvnhrr.mp4', 'abvmydmmpd.mp4', 'utjwjpkidv.mp4', 'fbenvdzzqa.mp4',
          'acpaumltcm.mp4', 'dhprsxhlmn.mp4', 'ipblyaohlp.mp4', 'blplivuqcr.mp4', 'aknnjyuypy.mp4', 'eduebqhpua.mp4',
          'dpygtovvqe.mp4', 'bflarmyzne.mp4', 'adarbftbnt.mp4', 'bqgreinner.mp4', 'clzyspsagz.mp4', 'hkfecpjiua.mp4',
          'dtrasqdgid.mp4', 'fargvrswxy.mp4', 'flnyxtghwk.mp4', 'dlrfpvskey.mp4', 'fpakfdhuqh.mp4', 'gmcpavifht.mp4',
          'fkzmdpmcbc.mp4', 'lzrzihalqz.mp4', 'fhxomiwsov.mp4', 'ibelgqucrk.mp4', 'ahthfectji.mp4', 'amsqbtaqxr.mp4',
          'ccjorzzmdo.mp4', 'bqrqsamrah.mp4', 'gsvlvvaliu.mp4', 'gglukbwbin.mp4', 'aisumjkudv.mp4', 'bgpnbmjuqq.mp4',
          'jeivlervlc.mp4', 'aauhqwwncp.mp4', 'blggyzbach.mp4', 'cpsttjlkme.mp4', 'hsagxqxhni.mp4', 'hvgwvloonr.mp4',
          'aaagqkcdis.mp4', 'ajiwnpctlh.mp4', 'axfekcjvqk.mp4', 'abjvfvhtnp.mp4']
# Sanity check: number of curated videos
len(fake_files)

100

We would like to build up a training dataset using these and other videos. We currently have 100 fake videos.

We would like:

- 100 Fake Videos
- 100 Real Videos corresponding to the fake videos

In [5]:
# Map each curated FAKE video to the REAL video recorded in its 'original' column.
# The metadata is indexed by file name once instead of scanning the whole frame
# for every file; only the first row per name is kept, which matches taking the
# first match of a per-file lookup.
original_by_fname = (
    all_metadata
    .drop_duplicates(subset='fname')
    .set_index('fname')['original']
)

# Fail early and name the offending files rather than raising an opaque IndexError.
missing_files = [file for file in fake_files if file not in original_by_fname.index]
if missing_files:
    raise KeyError(f"No metadata found for: {missing_files}")

# NOTE(review): nothing here de-duplicates originals, so two fakes derived from
# the same source video would yield the same entry twice - confirm if that matters.
real_files = original_by_fname.loc[fake_files].tolist()

In [6]:
# Sanity check: one original per curated fake video
len(real_files)

100

## Generate Image Dataset

We want to break each of these videos apart into frames, then sample some of those frames to build an image dataset.

In [7]:
import random

In [8]:
def read_frame_as_size(video_path, size=(128, 128)):
    """Read the first frame of a video as an RGB image resized to `size`.

    Args:
        video_path: Path of the video; converted with `str()` before opening.
        size: Target size handed to `cv2.resize` (OpenCV expects width, height).

    Returns:
        The first frame, converted from BGR to RGB and resized.

    Raises:
        cv2.error: If no frame could be read from the video.
    """
    capture = cv2.VideoCapture(str(video_path))
    try:
        ret, frame = capture.read()
    finally:
        # Always free the capture, even when reading fails
        capture.release()

    if not ret:
        # Same exception type an empty frame used to trigger inside cvtColor,
        # but with a message that names the video.
        raise cv2.error(f"Could not read a frame from {video_path}")

    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return cv2.resize(frame, size)


def read_frame(video_path):
    """Read the first frame of a video as an RGB image.

    Args:
        video_path: Path of the video; converted with `str()` before opening.

    Returns:
        The first frame, converted from BGR to RGB.

    Raises:
        cv2.error: If no frame could be read from the video.
    """
    capture = cv2.VideoCapture(str(video_path))
    try:
        ret, frame = capture.read()
    finally:
        # Always free the capture, even when reading fails
        capture.release()

    if not ret:
        # Same exception type an empty frame used to trigger inside cvtColor,
        # but with a message that names the video.
        raise cv2.error(f"Could not read a frame from {video_path}")

    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)


def read_all_frames(video_path):
    """Decode every frame of a video and return them as a list of RGB images.

    Frames are read in order until the capture reports that no frame is left;
    each one is converted from BGR to RGB before being collected.
    """
    capture = cv2.VideoCapture(str(video_path))
    rgb_frames = []

    ok, bgr_frame = capture.read()
    while ok:
        rgb_frames.append(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB))
        ok, bgr_frame = capture.read()

    capture.release()
    return rgb_frames


def read_random_frame(video_path):
    """Read one randomly chosen frame of a video as an RGB image.

    The frame index is drawn with `random.random()` scaled by the frame count
    the capture reports, so results depend on the state of the `random` module.

    Args:
        video_path: Path of the video; converted with `str()` before opening.

    Returns:
        The selected frame, converted from BGR to RGB.

    Raises:
        cv2.error: If the selected frame could not be read.
    """
    capture = cv2.VideoCapture(str(video_path))
    try:
        frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp so a negative reported frame count never yields a negative index
        random_frame = max(0, int(random.random() * frame_count))
        # Set to read specific frame
        capture.set(cv2.CAP_PROP_POS_FRAMES, random_frame)
        ret, frame = capture.read()
    finally:
        # Always free the capture, even when seeking or reading fails
        capture.release()

    if not ret:
        # Same exception type an empty frame used to trigger inside cvtColor,
        # but with a message that names the video and frame.
        raise cv2.error(
            f"Could not read frame {random_frame} from {video_path}"
        )

    return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

In [9]:
# Face detector used by create_images_from_videos.
# NOTE(review): despite the variable name, this is an EasyRetinaFace instance;
# the name is kept because create_images_from_videos refers to `blazeface`.
blazeface = EasyRetinaFace()

Loading pretrained model from Pytorch_Retinaface/weights/Resnet50_Final.pth
remove prefix 'module.'
Missing keys:0
Unused checkpoint keys:0
Used keys:456


In [10]:
def _crop_to_box(frame, detection):
    """Crop `frame` to the detection's box, clamped to the frame bounds."""
    x_min, y_min, x_max, y_max, _prob = detection
    height, width = frame.shape[:2]

    def clamp(value, upper):
        # Negative values would otherwise be read as "count from the end" by slicing
        return min(max(int(value), 0), upper)

    left, right = clamp(x_min, width), clamp(x_max, width)
    top, bottom = clamp(y_min, height), clamp(y_max, height)
    return frame[top:bottom, left:right]


def create_images_from_videos(files, suffix, folder='train', num_frames=10):
    """Save face crops taken from randomly sampled frames of each video.

    For every name in `files`, the matching row of the global `all_metadata`
    frame gives the video location. `num_frames` random frames are read, the
    global `blazeface` detector runs on each one, and the first detection is
    cropped and written to ``../data/<folder>/<fname>_<i>_<suffix>.png``.
    Frames with no detection, or whose box lies entirely outside the frame,
    are reported on stdout and skipped.

    Args:
        files: Iterable of video file names present in `all_metadata['fname']`.
        suffix: Label appended to each output file name (e.g. "FAKE" / "REAL").
        folder: Sub-folder of ``../data`` to write into; must already exist.
        num_frames: Number of random frames sampled per video.
    """
    for file in tqdm(files):
        row = all_metadata.loc[all_metadata['fname'] == file].iloc[0]
        video_path = row['directory'] + "/" + row['fname']

        for i in range(num_frames):
            random_frame = read_random_frame(video_path)

            # Face Detection
            detections = blazeface.detect(random_frame)
            if len(detections) == 0:
                print("Couldn't find image for", video_path)
                continue

            # Crop the face; boxes may extend past the frame edges, so the
            # coordinates are clamped before slicing.
            raw_crop = _crop_to_box(random_frame, detections[0])
            if raw_crop.size == 0:
                print("Couldn't find image for", video_path)
                continue

            # Save to disk
            im = Image.fromarray(raw_crop)
            im.save("../data/" + folder + "/" + row['fname'] + "_" + str(i) + "_" + suffix + ".png")

In [11]:
# Both lists should be the same length before they are split below
len(fake_files), len(real_files)

(100, 100)

In [12]:
# The first N_TRAIN videos of each list go to training, the rest to validation
N_TRAIN = 80

train_fake_files, val_fake_files = fake_files[:N_TRAIN], fake_files[N_TRAIN:]
train_real_files, val_real_files = real_files[:N_TRAIN], real_files[N_TRAIN:]

In [13]:
# Make training files: FAKE crops first, then REAL crops, all in ../data/train
os.makedirs('../data/train', exist_ok=True)

for split_files, label in ((train_fake_files, "FAKE"), (train_real_files, "REAL")):
    create_images_from_videos(split_files, suffix=label, folder="train", num_frames=10)



HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))

Couldn't find image for ../data/dfdc_train_part_18/asifdtncje.mp4
Couldn't find image for ../data/dfdc_train_part_8/fargvrswxy.mp4
Couldn't find image for ../data/dfdc_train_part_8/fargvrswxy.mp4
Couldn't find image for ../data/dfdc_train_part_8/fargvrswxy.mp4
Couldn't find image for ../data/dfdc_train_part_8/fargvrswxy.mp4
Couldn't find image for ../data/dfdc_train_part_8/fargvrswxy.mp4



HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))

Couldn't find image for ../data/dfdc_train_part_18/asifdtncje.mp4
Couldn't find image for ../data/dfdc_train_part_8/fargvrswxy.mp4
Couldn't find image for ../data/dfdc_train_part_8/fargvrswxy.mp4
Couldn't find image for ../data/dfdc_train_part_8/fargvrswxy.mp4



HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))




In [14]:
# Make validation files: FAKE crops first, then REAL crops, all in ../data/val
os.makedirs('../data/val', exist_ok=True)

for split_files, label in ((val_fake_files, "FAKE"), (val_real_files, "REAL")):
    create_images_from_videos(split_files, suffix=label, folder="val", num_frames=10)

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

Couldn't find image for ../data/dfdc_train_part_6/ahthfectji.mp4
Couldn't find image for ../data/dfdc_train_part_6/ahthfectji.mp4



HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Great, so now we have a dataset we can use!