In [None]:
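# Images come from the Labeled Faces in the Wild (LFW) dataset: https://vis-www.cs.umass.edu/lfw/
# This notebook expects the `lfw-deepfunneled/` image folder plus peopleDevTrain.txt and peopleDevTest.txt in the working directory.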

In [None]:
# example of face detection with mtcnn
from matplotlib import pyplot
from PIL import Image
from numpy import asarray
from mtcnn.mtcnn import MTCNN
 
# extract a single face from a given photograph
def extract_face(filename, required_size=(224, 224)):
    # load image from file
    pixels = pyplot.imread(filename)
    # create the detector, using default weights
    detector = MTCNN()
    # detect faces in the image
    results = detector.detect_faces(pixels)
    # extract the bounding box from the first face
    x1, y1, width, height = results[0]['box']
    x2, y2 = x1 + width, y1 + height
    # extract the face
    face = pixels[y1:y2, x1:x2]
    # resize pixels to the model size
    image = Image.fromarray(face)
    image = image.resize(required_size)
    face_array = asarray(image)
    return face_array

In [None]:
# read peopleDevTrain.txt and peopleDevTest.txt into df, one line per file, skipping first line, \t separated
import pandas as pd

train = pd.read_csv('peopleDevTrain.txt', sep='\t', header=None, names=['name', 'count'], skiprows=1)
test = pd.read_csv('peopleDevTest.txt', sep='\t', header=None, names=['name', 'count'], skiprows=1)

# turn count from float to int
train['count'] = train['count'].astype(int)
test['count'] = test['count'].astype(int)

# create filename column, for count more than 1, filename is name_0001.jpg, name_0002.jpg, split into different rows
train['filename'] = train.apply(lambda x: [f'{x["name"]}/{x["name"]}_{i:04d}.jpg' for i in range(1, x['count']+1)], axis=1)
train = train.explode('filename')

test['filename'] = test.apply(lambda x: [f'{x["name"]}/{x["name"]}_{i:04d}.jpg' for i in range(1, x['count']+1)], axis=1)
test = test.explode('filename')

# apply extract_face to each filename
train['face'] = train['filename'].apply(lambda x: extract_face(f'lfw-deepfunneled/{x}'))
test['face'] = test['filename'].apply(lambda x: extract_face(f'lfw-deepfunneled/{x}'))

In [None]:
# save train and test
train.to_pickle('train.pkl')
test.to_pickle('test.pkl')

In [None]:
# plot first few images of face
for i in range(9):
    pyplot.subplot(330 + 1 + i)
    pyplot.imshow(train['face'][i])