In [None]:
import os
import pandas as pd
from mtcnn import MTCNN
import numpy as np
import cv2
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# Initialize MTCNN detector globally for parallel workers.
# Built once, with default MTCNN() settings, when this module is executed;
# process_image reads it as a module-level name instead of taking it as an
# argument.
# NOTE(review): whether Pool workers inherit this object or rebuild it by
# re-importing the module depends on the multiprocessing start method —
# confirm on the target platform.
detector = MTCNN()

# Function to process a single image
def process_image(image_path):
    """Detect faces in one image file and summarise their bounding boxes.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A dict with the keys ``filename`` (base name of the file), ``image``
        (the path as given), ``boxes_mtcnn`` (list of ``(box, confidence)``
        tuples for every detection whose box has four elements),
        ``faces_mtcnn`` (number of detections reported by the detector),
        ``faces_mtcnn_avg`` and ``faces_mtcnn_median`` (mean / median of
        ``box[2] * box[3]`` over ``boxes_mtcnn``, or 0 when there are none).
        Returns ``None`` if the image cannot be read or any step fails; the
        failure is printed rather than raised so one bad file does not abort
        a whole batch.
    """
    try:
        image_file = os.path.basename(image_path)

        # cv2.imread reports an unreadable/missing file by returning None
        # instead of raising, so check for it explicitly.
        image = cv2.imread(image_path)
        if image is None:
            print(f"Error processing {image_path}: could not read image")
            return None

        # OpenCV decodes to BGR channel order; MTCNN's documented usage
        # converts to RGB before detection.
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        results = detector.detect_faces(rgb_image)

        # Keep only detections whose box is in the expected 4-element format
        boxes_mtcnn = [
            (result['box'], result['confidence'])
            for result in results
            if len(result['box']) == 4
        ]
        faces_mtcnn = len(results)
        faces_mtcnn_avg = 0
        faces_mtcnn_median = 0

        # Gate on the filtered list that the areas are actually built from,
        # so an empty list never reaches np.mean/np.median (which yield NaN).
        if boxes_mtcnn:
            areas = [box[2] * box[3] for box, _ in boxes_mtcnn]
            faces_mtcnn_avg = np.mean(areas)
            faces_mtcnn_median = np.median(areas)

        # Return the details for this image
        return {
            'filename': image_file,
            'image': image_path,
            'boxes_mtcnn': boxes_mtcnn,
            'faces_mtcnn': faces_mtcnn,
            'faces_mtcnn_avg': faces_mtcnn_avg,
            'faces_mtcnn_median': faces_mtcnn_median
        }
    except Exception as e:
        # Deliberate best-effort boundary: report and skip the image.
        print(f"Error processing {image_path}: {e}")
        return None

# Function to get all image paths from the input folder
def get_image_paths(input_folder, extensions=('.jpg',)):
    """Recursively collect image file paths under a folder.

    Args:
        input_folder: Root directory to walk with ``os.walk``.
        extensions: A filename suffix, or an iterable of suffixes, to accept.
            Matching is case-sensitive. Defaults to ``('.jpg',)``, the only
            suffix this function recognised before the parameter existed.

    Returns:
        A list of paths (each ``os.path.join(root, filename)``) for every
        file whose name ends with one of ``extensions``, in ``os.walk``
        order. Empty if the folder does not exist or nothing matches.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(extensions)  # str.endswith needs a tuple, not a list

    image_paths = []
    for root, _dirs, files in os.walk(input_folder):
        image_paths.extend(
            os.path.join(root, file) for file in files if file.endswith(suffixes)
        )
    return image_paths

# Wrapper to integrate tqdm with multiprocessing
def process_images_with_progress(image_paths, num_workers):
    """Run process_image over every path in a worker pool, showing a tqdm bar.

    Args:
        image_paths: Sequence of image paths; its length sets the bar total.
        num_workers: Number of processes handed to ``Pool``.

    Returns:
        A list with one entry per path, in input order (``Pool.imap`` yields
        results in order), each being whatever ``process_image`` returned.
    """
    with Pool(num_workers) as pool:
        # imap yields lazily, so the bar advances as each result arrives
        progress = tqdm(
            pool.imap(process_image, image_paths),
            total=len(image_paths),
            desc="Processing Images",
            unit="image",
        )
        # Drain the iterator while the pool is still alive
        collected = list(progress)
    return collected

# Main function to run the processing
def main(input_folder, output_csv_path):
    """Detect faces in every image under a folder and write the results to CSV.

    Args:
        input_folder: Directory passed to ``get_image_paths``.
        output_csv_path: Destination file for the resulting table.

    Prints the number of images found, the worker count used, and the path
    of the CSV once it has been written.
    """
    # Discover the work to do
    paths = get_image_paths(input_folder)
    print(f"Found {len(paths)} images.")

    # Keep one core free for the parent process, but never drop below one worker
    num_workers = cpu_count() - 1
    if num_workers < 1:
        num_workers = 1
    print(f"Using {num_workers} parallel workers.")

    # Fan the images out to the pool with a progress bar
    records = process_images_with_progress(paths, num_workers)

    # Images that failed come back as None; leave them out of the table
    table = pd.DataFrame([record for record in records if record is not None])

    # Persist without the DataFrame index column
    table.to_csv(output_csv_path, index=False)
    print(f"CSV file saved at {output_csv_path}")

# Define paths
# Both are relative paths, so they resolve against the current working
# directory of the process that runs this script.
input_folder = 'dfdc/image'  # Folder containing all images (searched recursively by get_image_paths)
output_csv_path = 'faces_mtcnn_data_parallel.csv'  # One CSV row per successfully processed image

# Run the main function
# The __main__ guard keeps main() from running when this module is imported
# rather than executed directly.
if __name__ == "__main__":
    main(input_folder, output_csv_path)


Found 52834 images.
Using 13 parallel workers.


Processing Images:   0%|                                                                  | 0/52834 [00:00<?, ?image/s]