In [None]:
#Balancing

import random
import shutil
from pathlib import Path

# Function to balance the dataset
def balance_dataset(original_videos_dir, manipulated_videos_dir, output_dir, target_count=150):
    """
    Build a class-balanced dataset by copying the same number of randomly
    sampled videos from the original and the manipulated video directories.

    Only ``*.mp4`` files located directly inside each source directory are
    candidates. Candidates are sorted before sampling and drawn with a private
    random generator seeded with 42, so the selection is the same on every run
    for the same set of file names and does not disturb the global ``random``
    state.

    Args:
        original_videos_dir (Path | str): Directory with original videos.
        manipulated_videos_dir (Path | str): Directory with manipulated videos.
        output_dir (Path | str): Directory that receives the ``original`` and
            ``manipulated`` sub-directories with the copied videos.
        target_count (int): Number of videos to sample for each class.

    Returns:
        tuple: ``((sampled_original, labels), (sampled_manipulated, labels))``
        where each ``sampled_*`` is a list of source ``Path`` objects and the
        labels are ``0`` for original and ``1`` for manipulated videos.

    Raises:
        ValueError: If either directory holds fewer than ``target_count``
            ``.mp4`` files. Nothing is created or copied in that case.
    """
    # Accept plain strings as well as Path objects.
    original_videos_dir = Path(original_videos_dir)
    manipulated_videos_dir = Path(manipulated_videos_dir)
    output_dir = Path(output_dir)

    # Directory listing order depends on the file system; sort so that the
    # seeded sampling below really is reproducible.
    original_videos = sorted(original_videos_dir.glob("*.mp4"))
    manipulated_videos = sorted(manipulated_videos_dir.glob("*.mp4"))

    # Validate before touching the output location so a failed call does not
    # leave empty directories behind.
    if len(original_videos) < target_count or len(manipulated_videos) < target_count:
        raise ValueError(f"Not enough videos to sample {target_count} videos from one or both classes.")

    # A private generator keeps the sampling reproducible without reseeding
    # the module-level RNG that other notebook cells may rely on.
    rng = random.Random(42)
    sampled_original = rng.sample(original_videos, target_count)
    sampled_manipulated = rng.sample(manipulated_videos, target_count)

    # Create output directories
    balanced_original_dir = output_dir / "original"
    balanced_manipulated_dir = output_dir / "manipulated"
    balanced_original_dir.mkdir(parents=True, exist_ok=True)
    balanced_manipulated_dir.mkdir(parents=True, exist_ok=True)

    # Copy sampled videos to output directories
    for file in sampled_original:
        shutil.copy(file, balanced_original_dir / file.name)
    for file in sampled_manipulated:
        shutil.copy(file, balanced_manipulated_dir / file.name)

    print(f"Balanced dataset created with {target_count} videos in each class.")
    return (sampled_original, [0] * len(sampled_original)), (sampled_manipulated, [1] * len(sampled_manipulated))


# Paths to the datasets
original_videos_dir = Path("M-DATA/Celeb-real")  # Adjust the path if needed
manipulated_videos_dir = Path("M-DATA/Celeb-synthesis")
output_dir = Path("./balanced_dataset3")

# Count the number of videos in the original directories
num_original_videos = len(list(original_videos_dir.glob("*.mp4")))
num_manipulated_videos = len(list(manipulated_videos_dir.glob("*.mp4")))

print(f"Original videos available: {num_original_videos}")
print(f"Manipulated videos available: {num_manipulated_videos}")

# Bind the result name before the attempt: if balancing fails, later cells see
# an explicit None instead of a NameError or a stale value left over from a
# previous run of this cell.
balanced_samples = None

# Create the balanced dataset
try:
    balanced_samples = balance_dataset(
        original_videos_dir,
        manipulated_videos_dir,
        output_dir,
        target_count=500
    )
except ValueError as e:
    # Not enough source videos: report the problem and keep the notebook running.
    print(e)


In [6]:
#EXTRACTING FRAMES

#MAIN WORKING CODE
import os
import cv2
from pathlib import Path

def crop_faces_from_videos(video_dir, output_dir, face_cascade_path, num_images=10, resize_dim=(224, 224)):
    """
    Extract cropped face images from every ``*.mp4`` video in a directory.

    For each video a sub-directory named after the video file (without
    extension) is created under ``output_dir``. Frames are read sequentially
    and every ``total_frames // num_images``-th frame (at least every frame)
    is run through a Haar cascade face detector. The first detected face of
    such a frame is cropped, resized to ``resize_dim`` and written as
    ``frame<k>_face.jpg``, where ``k`` counts the images saved so far for that
    video. Sampled frames without a detected face are skipped, so a video may
    yield fewer than ``num_images`` images.

    Args:
        video_dir (Path | str): Directory containing the videos.
        output_dir (Path | str): Directory where cropped images will be saved.
        face_cascade_path (str): Path to the Haar cascade XML file for face detection.
        num_images (int): Maximum number of cropped images to save per video;
            must be at least 1.
        resize_dim (tuple): Target size passed to ``cv2.resize`` (width, height).

    Returns:
        None

    Raises:
        ValueError: If ``num_images`` is smaller than 1, or if the Haar
            cascade file cannot be loaded.
    """
    # Reject unusable sample counts up front: 0 would otherwise fail with a
    # ZeroDivisionError when the frame interval is computed, and negative
    # values would silently produce nothing.
    if num_images < 1:
        raise ValueError(f"num_images must be at least 1, got {num_images!r}.")

    # Accept plain strings as well as Path objects.
    video_dir = Path(video_dir)
    output_dir = Path(output_dir)

    # Load Haar Cascade for face detection
    face_cascade = cv2.CascadeClassifier(str(face_cascade_path))
    if face_cascade.empty():
        raise ValueError("Error loading Haar cascade XML file.")

    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # Process each video in the directory
    for video_path in video_dir.glob("*.mp4"):
        video_output_dir = output_dir / video_path.stem
        video_output_dir.mkdir(parents=True, exist_ok=True)

        cap = cv2.VideoCapture(str(video_path))
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            # Spread the samples over the whole video; the floor of 1 keeps
            # the modulo below valid when the video has fewer frames than
            # num_images (or reports no frame count at all).
            frame_interval = max(total_frames // num_images, 1)

            frame_count = 0
            images_saved = 0

            while cap.isOpened() and images_saved < num_images:
                ret, frame = cap.read()
                if not ret:
                    break

                # Process every `frame_interval` frame
                if frame_count % frame_interval == 0:
                    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

                    # Save only one face per frame: the first detection.
                    if len(faces) > 0:
                        x, y, w, h = faces[0]
                        cropped_face = frame[y:y + h, x:x + w]

                        # Resize cropped face to the target dimensions
                        resized_face = cv2.resize(cropped_face, resize_dim)

                        # Count the image only when it was actually written,
                        # so a failed write does not use up one of the
                        # num_images slots.
                        output_image_path = video_output_dir / f"frame{images_saved}_face.jpg"
                        if cv2.imwrite(str(output_image_path), resized_face):
                            images_saved += 1

                frame_count += 1
        finally:
            # Always free the capture handle, even if frame processing raises.
            cap.release()

    print(f"Cropped and resized images saved in {output_dir}.")

# Input and output locations for the face-cropping pass
# original_videos_dir = Path("balanced_dataset3/original")  # Path to original videos
manipulated_videos_dir = Path("balanced_dataset3", "manipulated")  # Path to manipulated videos
output_dir = Path("./balanced_dataset_cropped_faces")  # Output directory for cropped images
face_cascade_path = f"{cv2.data.haarcascades}haarcascade_frontalface_default.xml"

# Only the manipulated class is processed here; the call for the original
# class is kept below, disabled.
# crop_faces_from_videos(original_videos_dir, output_dir / "original", face_cascade_path, resize_dim=(224, 224))
crop_faces_from_videos(
    video_dir=manipulated_videos_dir,
    output_dir=output_dir / "manipulated",
    face_cascade_path=face_cascade_path,
    resize_dim=(224, 224),
)


Cropped and resized images saved in balanced_dataset_cropped_faces\manipulated.
