-------------------------------------------------------------------------------------------------  

### Preprocessing Video

Working on streamlining the preprocessing. Will attempt to pull frame 100 from the entire dataset. Plan is to use cv2 to grab the frame and save the frame to a list. Then use facenet to detect face and crop on face. And then save to disk. 

<img style="float: center;" src="deepfake.jpg">

-------------------------------------------------------------------------------------------------  

In [None]:
# from pypi https://pypi.org/project/facenet-pytorch/
#
# install facenet-pytorch on kaggle without internet
# !pip install ../facenet-pytorch/facenet_pytorch-2.0.0-py3-none-any.whl --user

In [4]:
!pip install facenet_pytorch

Collecting facenet_pytorch
  Using cached https://files.pythonhosted.org/packages/ce/60/5192979f70b14681c698f61aace14e906bc92abc0790b0002ebd017dd3d3/facenet_pytorch-2.0.1-py3-none-any.whl
Installing collected packages: facenet-pytorch
Successfully installed facenet-pytorch-2.0.1


In [1]:
import os
import json
import boto3
import pandas as pd
import cv2
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image
import torch
from tqdm import tqdm
import time
from joblib import Parallel, delayed
from facenet_pytorch import MTCNN

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# set deepfake directory chunk
chunk = 18
# frame number to grab from videos
frame_num = 99

In [None]:
video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

In [None]:
#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)



In [None]:
# single process cv2
'''
def grab_frames(sample, frame_num):
    video = os.path.join(video_dir, sample)
    reader = cv2.VideoCapture(video)
    reader.set(1, frame_num)
    _, image = reader.read()
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    reader.release()
    return image


images_dict= {}
images = []
for sample in metadata.index:
    filename = sample[:-3]+'jpg'
    frame = grab_frames(sample, frame_num)
    images_dict[filename] = frame
    images.append(frame)
    
'''

In [None]:
def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

In [None]:
# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

In [None]:
# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

In [None]:
for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass

In [None]:
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')

In [None]:
len(os.listdir(frame_dir))

In [None]:
len(os.listdir(video_dir))

In [None]:
sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 19

In [None]:
# set deepfake directory chunk
chunk = 19
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 20

In [None]:
# set deepfake directory chunk
chunk = 20
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 21

In [None]:
# set deepfake directory chunk
chunk = 21
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 22

In [None]:
# set deepfake directory chunk
chunk = 22
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 23

In [None]:
# set deepfake directory chunk
chunk = 23
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 24

In [None]:
# set deepfake directory chunk
chunk = 24
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 25

In [None]:
# set deepfake directory chunk
chunk = 25
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 26

In [None]:
# set deepfake directory chunk
chunk = 26
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 27

In [None]:
# set deepfake directory chunk
chunk = 27
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 28

In [None]:
# set deepfake directory chunk
chunk = 28
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 29

In [None]:
# set deepfake directory chunk
chunk = 29
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 30

In [None]:
# set deepfake directory chunk
chunk = 30
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 31

In [None]:
# set deepfake directory chunk
chunk = 31
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 32

In [None]:
# set deepfake directory chunk
chunk = 32
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 33

In [None]:
# set deepfake directory chunk
chunk = 33
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 34

In [None]:
# set deepfake directory chunk
chunk = 34
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 35

In [None]:
# set deepfake directory chunk
chunk = 35
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 36

In [None]:
# set deepfake directory chunk
chunk = 36
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 37

In [None]:
# set deepfake directory chunk
chunk = 37
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 38

In [None]:
# set deepfake directory chunk
chunk = 38
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 39

In [None]:
# set deepfake directory chunk
chunk = 39
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

----------------------------------------------------  
# New Chunk 40

In [2]:
# set deepfake directory chunk
chunk = 40
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

Detecting faces in frames, 146.491 seconds
3 frames without faces detected


----------------------------------------------------  
# New Chunk 41

In [4]:
# set deepfake directory chunk
chunk = 41
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

Detecting faces in frames, 257.697 seconds
1 frames without faces detected


----------------------------------------------------  
# New Chunk 42

In [5]:
# set deepfake directory chunk
chunk = 42
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

Detecting faces in frames, 157.785 seconds
1 frames without faces detected


----------------------------------------------------  
# New Chunk 43

In [6]:
# set deepfake directory chunk
chunk = 43
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

Detecting faces in frames, 156.675 seconds
19 frames without faces detected


----------------------------------------------------  
# New Chunk 44

In [7]:
# set deepfake directory chunk
chunk = 44
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

Detecting faces in frames, 144.249 seconds
2 frames without faces detected


----------------------------------------------------  
# New Chunk 45

In [8]:
# set deepfake directory chunk
chunk = 45
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

Detecting faces in frames, 154.130 seconds
0 frames without faces detected


----------------------------------------------------  
# New Chunk 46

In [3]:
# set deepfake directory chunk
chunk = 46
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

del(faces)
del(faces_dict)
del(images)
del(images_dict)
del(results)

Detecting faces in frames, 124.399 seconds
9 frames without faces detected


----------------------------------------------------  
# New Chunk 47

In [4]:
# set deepfake directory chunk
chunk = 47
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

del(faces)
del(faces_dict)
del(images)
del(images_dict)
del(results)

Detecting faces in frames, 147.767 seconds
30 frames without faces detected


----------------------------------------------------  
# New Chunk 48

In [2]:
# set deepfake directory chunk
chunk = 48
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


# parallel job 
# not iterating over all videos, iterating over a single video?
results = Parallel(n_jobs=3)(delayed(
                   grab_frames)(sample, frame_num)
                       for sample in metadata.index)

# unpack reults
images, results_dict = zip(*results)
images = list(images)
images_dict = {}
for i in results_dict:
    images_dict.update(i)

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        imgs_pil = [Image.fromarray(images_dict[key])]
        #print(type(imgs_pil[0]))
        try:
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

del(faces)
del(faces_dict)
del(images)
del(images_dict)
del(results)

Detecting faces in frames, 141.249 seconds
9 frames without faces detected


----------------------------------------------------  
# New Chunk 49

TypeError: zip argument #2688 must support iteration

In [None]:
# set deepfake directory chunk
chunk = 49
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        reader.release()
        return image
    except:
        pass


images_dict= {}
images = []
for sample in metadata.index:
    filename = sample[:-3]+'jpg'
    frame = grab_frames(sample, frame_num)
    images_dict[filename] = frame
    images.append(frame)
    

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        try:
            imgs_pil = [Image.fromarray(images_dict[key])]
            #print(type(imgs_pil[0]))
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

del(faces)
del(faces_dict)
del(images)
del(images_dict)


----------------------------------------------------  
# New Chunk 18

In [6]:
# set deepfake directory chunk
chunk = 18
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        reader.release()
        return image
    except:
        pass


images_dict= {}
images = []
for sample in metadata.index:
    filename = sample[:-3]+'jpg'
    frame = grab_frames(sample, frame_num)
    images_dict[filename] = frame
    images.append(frame)
    

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        try:
            imgs_pil = [Image.fromarray(images_dict[key])]
            #print(type(imgs_pil[0]))
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

del(faces)
del(faces_dict)
del(images)
del(images_dict)


Detecting faces in frames, 183.676 seconds
11 frames without faces detected


----------------------------------------------------  
# New Chunk 27

error: zip argument #9 must support iteration

In [5]:
# set deepfake directory chunk
chunk = 27
# frame number to grab from videos
frame_num = 99

video_dir = f'/home/ec2-user/SageMaker/data/deepfake/dfdc_train_part_{chunk}'
frame_dir = f'../data/frames/f{frame_num}'
meta_file = os.path.join(video_dir, 'metadata.json')

os.makedirs(frame_dir, exist_ok=True)

with open(meta_file) as f:
    metadata = pd.read_json(f).T

#parallel processing cv2

def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        filename = sample[:-3]+'jpg'
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        images_dict = {}
        images_dict[filename] = image
        reader.release()
        return image, images_dict
    except:
        pass


    
def grab_frames(sample, frame_num):
    try:
        video = os.path.join(video_dir, sample)
        reader = cv2.VideoCapture(video)
        reader.set(1, frame_num)
        _, image = reader.read()
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        reader.release()
        return image
    except:
        pass


images_dict= {}
images = []
for sample in metadata.index:
    filename = sample[:-3]+'jpg'
    frame = grab_frames(sample, frame_num)
    images_dict[filename] = frame
    images.append(frame)
    
    

def timer(detector, detect_fn, images, *args):
    start = time.time()
    faces, faces_dict = detect_fn(detector, images, *args)
    elapsed = time.time() - start
    print(f', {elapsed:.3f} seconds')
    return faces, elapsed, faces_dict

# my attempt at adding a dictionary to the for loop to keep track of filenames

detector = MTCNN(image_size=224, device=device, post_process=False)


def detect_facenet_pytorch(detector, images, batch_size):
    faces = []
    faces_dict = {}
    n = 0
    for key in images_dict.keys():
    #for lb in np.arange(0, len(images), batch_size):
        try:
            imgs_pil = [Image.fromarray(images_dict[key])]
            #print(type(imgs_pil[0]))
            faces.extend(detector(imgs_pil))
            #print(len(faces))
            faces_dict[key] = faces[n]
            n += 1
        except:
            pass
    return faces, faces_dict

times_facenet_pytorch_nb = [] # non-batched

# dtect faces from images
print('Detecting faces in frames', end='')
faces, elapsed, faces_dict = timer(detector, detect_facenet_pytorch, images, 1)
times_facenet_pytorch_nb.append(elapsed)

for face in faces_dict.keys():
    try:
        image = faces_dict[face].permute(1, 2, 0).int().numpy()
        filename = os.path.join(frame_dir, face)
        cv2.imwrite(filename, image)
    except:
        faces_dict[face] = 'no face detected'
        pass
    
no_faces_detected = 0
for face in faces_dict.keys():
    if isinstance(faces_dict[face], str):
        no_faces_detected += 1
        
print(f'{no_faces_detected} frames without faces detected')


sns = boto3.client('sns')
response = sns.publish(
    TopicArn='arn:aws:sns:us-east-1:364430515305:deepfake',
    Message=f' chunk #{chunk} frame#{frame_num} finished processing'
)

del(faces)
del(faces_dict)
del(images)
del(images_dict)


Detecting faces in frames, 162.733 seconds
14 frames without faces detected


NameError: name 'results' is not defined