In [None]:
import os
import pandas as pd
from mtcnn import MTCNN
import numpy as np
import cv2

# Function to process a single image
def process_image(image_path):
    try:
        detector = MTCNN()  # Initialize MTCNN here
        # Extract the image filename
        image_file = os.path.basename(image_path)
        # Extract video name (assumes the video name is in the parent directory of the image)
        video_name = os.path.basename(os.path.dirname(image_path)) + ".mp4"

        image = cv2.imread(image_path)
        if image is None:
            print(f"Could not read image: {image_path}")
            return None

        # Resize image for faster detection
        if image.shape[1] > 640:  # Resize if width is greater than 640
            image = cv2.resize(image, (640, int(640 * image.shape[0] / image.shape[1])))
        
        results = detector.detect_faces(image)
        boxes_mtcnn = []
        faces_mtcnn = len(results)
        faces_mtcnn_avg = 0
        faces_mtcnn_median = 0

        for result in results:
            box = result['box']
            confidence = result['confidence']
            if len(box) == 4:
                boxes_mtcnn.append((box, confidence))

        if faces_mtcnn > 0:
            areas = [box[2] * box[3] for box, _ in boxes_mtcnn]
            faces_mtcnn_avg = np.mean(areas)
            faces_mtcnn_median = np.median(areas)

        return {
            'filename': video_name,  # Video filename (e.g., videoname.mp4)
            'image': image_path,  # Full image path as in the original
            'boxes_mtcnn': boxes_mtcnn,
            'faces_mtcnn': faces_mtcnn,
            'faces_mtcnn_avg': faces_mtcnn_avg,
            'faces_mtcnn_median': faces_mtcnn_median
        }
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

# Function to get all image paths from the input folder
def get_image_paths(input_folder):
    image_paths = []
    for root, dirs, files in os.walk(input_folder):
        for file in files:
            if file.endswith('.jpg'):
                image_paths.append(os.path.join(root, file))
    return image_paths

# Main function
def main(input_folder, output_csv_path):
    # Get all valid image paths
    image_paths = get_image_paths(input_folder)
    print(f"Found {len(image_paths)} images.")
    
    results = []
    for image_path in image_paths:
        result = process_image(image_path)
        if result is not None:
            results.append(result)
    
    # Convert the results to a pandas DataFrame
    df = pd.DataFrame(results)

    # Save the data to CSV
    if not df.empty:
        df.to_csv(output_csv_path, index=False)
        print(f"CSV file saved at {output_csv_path}")
    else:
        print("No valid results to save.")

# Main block
if __name__ == "__main__":
    input_folder = 'dfdc/image'  # Folder containing all images
    output_csv_path = 'faces_mtcnn_data.csv'  # Output CSV path

    # Run the main function
    main(input_folder, output_csv_path)


Found 52834 images.


In [2]:
import pandas as pd
df = pd.read_csv("faces_mtcnn_data.csv")

In [3]:
df.head()

Unnamed: 0,filename,image,boxes_mtcnn,faces_mtcnn,faces_mtcnn_avg,faces_mtcnn_median
0,aatgqvvrta.mp4,dfdc/image\aatgqvvrta\frame_00000.jpg,"[([351, 73, 22, 28], 0.9943623542785645)]",1,616.0,616.0
1,aatgqvvrta.mp4,dfdc/image\aatgqvvrta\frame_00001.jpg,"[([352, 72, 22, 28], 0.9981972575187683)]",1,616.0,616.0
2,aatgqvvrta.mp4,dfdc/image\aatgqvvrta\frame_00002.jpg,"[([349, 72, 22, 28], 0.9976887702941895)]",1,616.0,616.0
3,aatgqvvrta.mp4,dfdc/image\aatgqvvrta\frame_00003.jpg,"[([348, 72, 23, 27], 0.998422384262085)]",1,621.0,621.0
4,aatgqvvrta.mp4,dfdc/image\aatgqvvrta\frame_00004.jpg,"[([350, 72, 22, 26], 0.9976015686988831)]",1,572.0,572.0


In [6]:
df.nunique

<bound method DataFrame.nunique of              filename                                  image  \
0      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00000.jpg   
1      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00001.jpg   
2      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00002.jpg   
3      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00003.jpg   
4      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00004.jpg   
...               ...                                    ...   
52829  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00025.jpg   
52830  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00026.jpg   
52831  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00027.jpg   
52832  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00028.jpg   
52833  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00029.jpg   

                                     boxes_mtcnn  faces_mtcnn  \
0      [([351, 73, 22, 28], 0.9943623542785645)]            1   
1      [([352, 72, 22, 28], 0.9981972575187683)]            1   
2

In [11]:
len(df)

52834

In [12]:
duplicates = df['filename'].duplicated(keep=False)  # `keep=False` marks all duplicates as True
repeating_rows = df[duplicates]  # Filter the rows with repeating filenames

print("Repeating rows:")
print(repeating_rows)

Repeating rows:
             filename                                  image  \
0      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00000.jpg   
1      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00001.jpg   
2      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00002.jpg   
3      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00003.jpg   
4      aatgqvvrta.mp4  dfdc/image\aatgqvvrta\frame_00004.jpg   
...               ...                                    ...   
52829  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00025.jpg   
52830  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00026.jpg   
52831  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00027.jpg   
52832  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00028.jpg   
52833  rhcifqslqe.mp4  dfdc/image\rhcifqslqe\frame_00029.jpg   

                                     boxes_mtcnn  faces_mtcnn  \
0      [([351, 73, 22, 28], 0.9943623542785645)]            1   
1      [([352, 72, 22, 28], 0.9981972575187683)]            1   
2      [([349, 72, 2

In [13]:

# Check if there are duplicate filenames
if df['filename'].duplicated().any():
    print("There are rows with the same filename.")
else:
    print("All filenames are unique.")

There are rows with the same filename.
