In [1]:
import os
# Set TF_CPP_MIN_LOG_LEVEL to '3' for this process.
# NOTE(review): presumably meant to quiet TensorFlow's native logging; no
# TensorFlow import is visible in these cells -- confirm it is still needed.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})

In [2]:
import cv2

# Load the Haar cascade used for frontal-face detection.
# Prefer a copy in the working directory; otherwise fall back to the cascade
# files shipped with the opencv-python wheels (exposed as cv2.data.haarcascades,
# which not every OpenCV build provides -- hence the hasattr guard).
CASCADE_FILE = 'haarcascade_frontalface_default.xml'
cascade_path = CASCADE_FILE
if not os.path.exists(cascade_path) and hasattr(cv2, 'data'):
    cascade_path = os.path.join(cv2.data.haarcascades, CASCADE_FILE)

face_cascade = cv2.CascadeClassifier(cascade_path)

# CascadeClassifier does not raise when the file is missing or invalid; it is
# simply left empty and detectMultiScale fails later. Fail early instead, with
# a message that names the path that was tried.
if face_cascade.empty():
    raise FileNotFoundError(f'Could not load Haar cascade from {cascade_path!r}')

def extract_face(filename):
    """Crop the first detected face from an image and save it resized to 224x224.

    The crop is written next to the input as ``<stem>_cropped<ext>``
    (``a/b_0001.jpg`` -> ``a/b_0001_cropped.jpg``). If that file already
    exists it is reused and no detection is run.

    Args:
        filename: Path of the input image.

    Returns:
        The path of the cropped image, or ``None`` when the image cannot be
        read, no face is detected, or the crop cannot be written.
    """
    # Derive the output path from the extension only. str.replace('.jpg', ...)
    # would also rewrite a '.jpg' inside a directory name, and for a path with
    # no '.jpg' in it (e.g. '.JPG', '.png') it returns the input path itself,
    # which then passes the "already cropped" check below.
    stem, ext = os.path.splitext(filename)
    cropped_path = f'{stem}_cropped{ext or ".jpg"}'

    # if the cropped file already exists, skip
    if os.path.exists(cropped_path):
        return cropped_path

    # cv2.imread signals a missing/unreadable file by returning None, not by raising
    img = cv2.imread(filename)
    if img is None:
        print(f'Could not read image {filename}')
        return None

    # Detection runs on the grayscale version of the image
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)

    if len(faces) == 0:
        print(f'No face detected in {filename}')
        return None

    # Use the first detection as returned by the cascade
    x, y, w, h = faces[0]
    face = cv2.resize(img[y:y+h, x:x+w], (224, 224))

    # cv2.imwrite reports failure through its boolean return value
    if not cv2.imwrite(cropped_path, face):
        print(f'Could not write {cropped_path}')
        return None

    return cropped_path


In [3]:
# Build one-row-per-image tables for the peopleDevTrain / peopleDevTest lists,
# crop a face from every image, and save both tables as CSV.
import pandas as pd

LFW_ROOT = 'lfw-deepfunneled'


def _load_people(path):
    """Read a tab-separated people list (`name`, `count`), skipping the first line."""
    people = pd.read_csv(path, sep='\t', header=None, names=['name', 'count'], skiprows=1)
    # turn count from float to int
    people['count'] = people['count'].astype(int)
    return people


def _expand_to_images(people):
    """Return one row per image: name/name_0001.jpg ... name/name_<count>.jpg."""
    # Build the per-person filename lists explicitly as an object Series so the
    # result does not depend on how DataFrame.apply(axis=1) infers the shape of
    # list-valued results.
    filenames = pd.Series(
        [
            [f'{name}/{name}_{i:04d}.jpg' for i in range(1, count + 1)]
            for name, count in zip(people['name'], people['count'])
        ],
        index=people.index,
        dtype=object,
    )
    return people.assign(filename=filenames).explode('filename')


def _crop_all(filenames):
    """Run extract_face on every image path (relative to LFW_ROOT)."""
    return filenames.apply(lambda x: extract_face(f'{LFW_ROOT}/{x}'))


# Same stage order as before: read both lists, expand both, crop both, save both.
train = _load_people('peopleDevTrain.txt')
test = _load_people('peopleDevTest.txt')

train = _expand_to_images(train)
test = _expand_to_images(test)

# 'cropped' holds the cropped-image path, or None where extract_face returned None
train['cropped'] = _crop_all(train['filename'])
test['cropped'] = _crop_all(test['filename'])

# save to csv
train.to_csv('peopleDevTrain.csv', index=False)
test.to_csv('peopleDevTest.csv', index=False)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


No face detected in lfw-deepfunneled/Andy_Roddick/Andy_Roddick_0010.jpg
No face detected in lfw-deepfunneled/Ariel_Sharon/Ariel_Sharon_0017.jpg
No face detected in lfw-deepfunneled/Arnold_Schwarzenegger/Arnold_Schwarzenegger_0036.jpg
No face detected in lfw-deepfunneled/Barbara_Boxer/Barbara_Boxer_0001.jpg
No face detected in lfw-deepfunneled/Ben_Curtis/Ben_Curtis_0001.jpg
No face detected in lfw-deepfunneled/Bill_Clinton/Bill_Clinton_0012.jpg
No face detected in lfw-deepfunneled/Brian_Lara/Brian_Lara_0001.jpg
No face detected in lfw-deepfunneled/Brian_Schneider/Brian_Schneider_0001.jpg
No face detected in lfw-deepfunneled/Budd_Schulberg/Budd_Schulberg_0001.jpg
No face detected in lfw-deepfunneled/Charles_Mathews/Charles_Mathews_0002.jpg
No face detected in lfw-deepfunneled/Christian_Fittipaldi/Christian_Fittipaldi_0002.jpg
No face detected in lfw-deepfunneled/Clive_Lloyd/Clive_Lloyd_0001.jpg
No face detected in lfw-deepfunneled/Colin_Powell/Colin_Powell_0227.jpg
No face detected in lf

In [4]:
# Close any OpenCV GUI windows that may be open.
# NOTE(review): none of the cells visible above opens a window (there is no
# cv2.imshow call), so this looks like leftover cleanup -- confirm it is needed.
cv2.destroyAllWindows()