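# Preprocessing v3

Sample every 17th frame from a batch of DFDC training videos, crop the detected face from each frame with MTCNN, and save the crops as JPEG files.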

In [2]:
import cv2
import numpy as np 
import os
from joblib import Parallel, delayed
from PIL import Image
import torch
from facenet_pytorch import MTCNN

# Use the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# MTCNN face detector configured for 224-pixel crops. post_process=False is
# passed so crops are returned without facenet-pytorch's standardisation step
# (per the library docs -- confirm against the installed version).
detector = MTCNN(
    image_size=224,
    device=device,
    post_process=False,
)

# list of videos
# os.listdir returns every directory entry, in arbitrary order. Keep only
# video files (so that e.g. a metadata file is never handed to
# cv2.VideoCapture) and sort them so that a slice such as videos[100:200]
# always selects the same subset.
video_path = '/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_0'
# Four-character suffixes only: grab_frames builds the output name by
# dropping the last four characters of the video path.
video_exts = ('.mp4', '.avi', '.mov', '.mkv')
video_fnames = sorted(
    fname for fname in os.listdir(video_path)
    if fname.lower().endswith(video_exts)
)
videos = [os.path.join(video_path, fname) for fname in video_fnames]


# Directory that receives the cropped face images; created when missing.
path = '/home/ec2-user/SageMaker/data/frames_17x'
os.makedirs(name=path, exist_ok=True)

# Frame positions to sample from each video: every 17th frame below 300,
# i.e. 17, 34, ..., 289.
frame_idxs = list(range(17, 300, 17))

In [3]:
def read_frames_at_indices(videos, frame_idxs):
    """Sequentially grab the requested frames from every video.

    Args:
        videos: Iterable of video file paths.
        frame_idxs: Iterable of frame numbers to read from each video.

    Returns:
        A single dict merging the per-frame dicts returned by ``grab_frames``
        for every (video, frame number) pair, visited video by video.
    """
    collected = {}
    pairs = ((clip, idx) for clip in videos for idx in frame_idxs)
    for clip, idx in pairs:
        single = grab_frames(clip, idx)
        collected.update(single)
    return collected


def grab_frames(video, frame_num):
    """Read one frame from a video file with OpenCV.

    Args:
        video: Path of the video file.
        frame_num: Frame position to seek to before reading.

    Returns:
        ``{filename: image}`` where ``filename`` is the video path with its
        extension replaced by ``_<frame_num>.jpg`` and ``image`` is the frame
        returned by ``cv2.VideoCapture.read``. An empty dict is returned when
        the frame cannot be read (unopenable file, position past the end),
        so callers never receive a ``None`` image.
    """
    # splitext instead of video[:-4] so the name is right for any extension.
    filename = os.path.splitext(video)[0] + '_' + str(frame_num) + '.jpg'
    reader = cv2.VideoCapture(video)
    try:
        # CAP_PROP_POS_FRAMES is the property id the original passed as 1.
        reader.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        success, image = reader.read()
    finally:
        # Always free the capture handle, even if seeking/reading raises.
        reader.release()
    if not success or image is None:
        return {}
    return {filename: image}


def multiprocess_read_frames_at_indices(videos, frame_idxs, job_num):
    """Grab the requested frames from every video using joblib workers.

    Args:
        videos: Iterable of video file paths.
        frame_idxs: Iterable of frame numbers to read from each video.
        job_num: Value passed to ``Parallel`` as ``n_jobs``.

    Returns:
        A single dict merging the dicts returned by ``grab_frames`` for every
        (video, frame number) pair.
    """
    pool = Parallel(n_jobs=job_num)
    # One delayed grab_frames call per (video, frame number) combination.
    tasks = (
        delayed(grab_frames)(clip, idx)
        for clip in videos
        for idx in frame_idxs
    )
    merged = {}
    for partial in pool(tasks):
        merged.update(partial)
    return merged


def detect_facenet_pytorch(detector, images):
    """Run the face detector on each frame and collect the results.

    Args:
        detector: Callable that takes a PIL image and returns the cropped
            face as a tensor, or ``None`` when no face is found (this
            notebook passes a facenet_pytorch ``MTCNN`` instance).
        images: Dict mapping an output filename to a frame array as read by
            OpenCV (BGR channel order), or ``None`` for a frame that could
            not be decoded.

    Returns:
        Dict mapping filename to the detector output. Frames that are
        ``None``, or on which detection raises, are left out (a warning is
        emitted for the latter). Face tensors keep the channel order of the
        input frame, so they can still be passed to ``cv2.imwrite`` as-is.
    """
    import warnings

    faces = {}
    for key, frame in images.items():
        if frame is None:
            # Undecodable frame: nothing to detect on.
            continue
        is_bgr = getattr(frame, 'ndim', 0) == 3 and frame.shape[-1] == 3
        try:
            # The detector expects RGB input, OpenCV delivers BGR.
            rgb = np.ascontiguousarray(frame[..., ::-1]) if is_bgr else frame
            face = detector(Image.fromarray(rgb))
        except Exception as err:
            # Best effort: skip this frame but say why instead of hiding it.
            warnings.warn('face detection failed for %s: %r' % (key, err))
            continue
        if is_bgr and torch.is_tensor(face) and face.dim() >= 3:
            # Flip the channel axis back to BGR so downstream code that
            # writes the crop with OpenCV keeps producing correct colours.
            face = face.flip(-3)
        faces[key] = face
    return faces


def write_images_to_disk(path, faces):
    """Save every face crop in ``faces`` as an image file under ``path``.

    Args:
        path: Output directory.
        faces: Dict mapping a frame filename (only its basename is used for
            the output file) to the detector output for that frame: a
            ``(C, H, W)`` tensor, or ``None`` when no face was found. The
            dict is updated in place: entries that hold no face, or that
            cannot be converted and written, are replaced by the string
            ``'no face detected'``.

    Returns:
        None.
    """
    import warnings

    # Iterate over a snapshot because values are reassigned inside the loop.
    for name, face in list(faces.items()):
        if face is None or isinstance(face, str):
            # No crop for this frame (or it was already marked on a
            # previous call): keep the marker, nothing to write.
            faces[name] = 'no face detected'
            continue
        try:
            # (C, H, W) -> (H, W, C), as explicit 8-bit pixels for OpenCV.
            image = (
                face.detach().cpu().permute(1, 2, 0).clamp(0, 255).byte().numpy()
            )
            filename = os.path.join(path, os.path.basename(name))
            written = cv2.imwrite(filename, image)
        except Exception as err:
            # Keep the historical marker, but surface the real cause.
            warnings.warn('could not write face for %s: %r' % (name, err))
            faces[name] = 'no face detected'
            continue
        if not written:
            warnings.warn('cv2.imwrite reported failure for %s' % filename)

In [4]:
# Single-process alternative over the full video list:
# images = read_frames_at_indices(videos, frame_idxs)

# Read the sampled frames of videos[100:200] with 4 joblib workers.
images = multiprocess_read_frames_at_indices(
    videos=videos[100:200],
    frame_idxs=frame_idxs,
    job_num=4,
)


In [5]:
# Run the detector on every frame that was read.
faces = detect_facenet_pytorch(detector=detector, images=images)

# Write the resulting crops into the output directory.
write_images_to_disk(path=path, faces=faces)

In [6]:

import boto3

# Publish a completion message to the SNS topic once the cells above finish.
sns = boto3.client('sns')
publish_args = {
    'TopicArn': 'arn:aws:sns:us-east-1:364430515305:deepfake',
    'Message': 'finished processing v3',
}
response = sns.publish(**publish_args)


"\nimport boto3\n\nsns = boto3.client('sns')\nresponse = sns.publish(\n    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',\n    Message='finished processing v3'\n)\n"