In [39]:
BATCH_SIZE = 1
SCALE = 0.5
N_FRAMES = 50

In [40]:
import matplotlib.pyplot as plt
from torchvision import transforms
import torch

def display_faces_grid(faces_list):
    """
    Display a grid of face images from a list of tensors where each tensor shape is [1, 3, H, W].
    Args:
        faces_list (list): List of tensors, each tensor is of shape [1, 3, H, W].
    """
    n_images = len(faces_list)
    n_rows = int(torch.sqrt(torch.tensor(n_images)).ceil())
    n_cols = (n_images // n_rows) + (0 if n_images % n_rows == 0 else 1)
    
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols * 2, n_rows * 2))
    axs = axs.flatten() if n_images > 1 else [axs]
    
    for idx, ax in enumerate(axs):
        if idx < n_images:
            # Remove batch dimension and convert tensor image to a format suitable for displaying
            image = faces_list[idx].squeeze(0).cpu().detach()
            image = transforms.ToPILImage()(image)
            ax.imshow(image)
            ax.axis('off')
        else:
            ax.axis('off')  # Hide unused axes

    plt.tight_layout()
    plt.show()


In [41]:
import os
import glob
import json
import torch
import cv2
from PIL import Image
import numpy as np
import pandas as pd
import logging
from tqdm.notebook import tqdm
from facenet_pytorch import MTCNN, InceptionResnetV1

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f'Running on device: {device}')

class DetectionPipeline:
    """Pipeline class for detecting faces in the frames of a video file."""
    
    def __init__(self, detector, n_frames=None, batch_size=60, resize=None):
        """Constructor for DetectionPipeline class."""
        self.detector = detector
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.resize = resize
        logging.basicConfig(level=logging.INFO)
    
    def __call__(self, filename):
        """Load frames from an MP4 video and detect faces."""
        v_cap = cv2.VideoCapture(filename)
        if not v_cap.isOpened():
            logging.error(f"Failed to open video file {filename}")
            return []
        
        v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
        logging.info(f"Video length: {v_len} frames")

        sample = np.linspace(0, v_len - 1, self.n_frames).astype(int) if self.n_frames is not None else np.arange(v_len)

        faces = []
        frames = []
        for j in range(v_len):
            if not v_cap.grab():
                logging.warning(f"Frame {j} could not be grabbed")
                continue

            if j in sample:
               
                success, frame = v_cap.retrieve()
                if not success:
                    logging.warning(f"Frame {j} could not be retrieved")
                    continue
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
              
                frame = Image.fromarray(frame)
                if self.resize is not None:
                    frame = frame.resize([int(d * self.resize) for d in frame.size])
                    

                frames.append(frame)
              

                if len(frames) % self.batch_size == 0 or j == sample[-1]:
                    faces.extend(self.detector(frames))
                    frames = []

        v_cap.release()
        return faces

Running on device: cuda:0


In [42]:
import torch

def process_faces(faces, feature_extractor):
    """
    Process face images to calculate the centroid of their embeddings and the distances from the centroid.

    Args:
        faces (list of torch.Tensor): List of tensors representing detected faces.
        feature_extractor (torch.nn.Module): Pretrained neural network model to generate embeddings.

    Returns:
        numpy.ndarray or None: Array of distances of each face's embedding from the centroid, or None if no faces.
    """
    # Filter out None values that represent frames without detected faces
    
    try:
        # Concatenate all face tensors and move to the specified device
        faces = torch.cat(faces).to(device)

        # Generate embeddings for each face using the feature extractor
        embeddings = feature_extractor(faces)
        embeddings

        # Compute the centroid of all embeddings
        centroid = embeddings.mean(dim=0)

        # Calculate the Euclidean distance from each embedding to the centroid
        distances = (embeddings - centroid).norm(dim=1).cpu().numpy()

        return distances
    except Exception as e:
        # Log or handle exceptions that may occur during processing
        logging.error(f"An error occurred while processing faces: {str(e)}")
        return None


In [43]:
# Load face detector
face_detector = MTCNN(margin=40, keep_all=False, device=device, factor=0.5).eval()

# Load facial recognition model
feature_extractor = InceptionResnetV1(pretrained='vggface2', device=device).eval()

# Define face detection pipeline
detection_pipeline = DetectionPipeline(detector=face_detector, n_frames=N_FRAMES, batch_size=BATCH_SIZE, resize=SCALE)

In [44]:
import pandas as pd
import torch
from tqdm.notebook import tqdm


# Load the CSV containing full paths and labels
data = pd.read_csv('full_dataset.csv')

# Count the instances of each label
label_counts = data['label'].value_counts()
print("Initial label counts:", label_counts)

# Find the number of instances in the minority class
min_class_size = label_counts.min()

# Downsample the majority class
balanced_data = pd.concat([
    data[data['label'] == 0].sample(min_class_size, random_state=42),
    data[data['label'] == 1].sample(min_class_size, random_state=42)
])

# Shuffle the DataFrame
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced DataFrame to a new CSV file
balanced_csv_path = 'balanced_train_data.csv'
balanced_data.to_csv(balanced_csv_path, index=False)

print("Balanced dataset saved.")




Initial label counts: label
0    5979
1    1068
Name: count, dtype: int64
Balanced dataset saved.


In [45]:


# Initialize an empty DataFrame for storing results
df = pd.DataFrame(columns=['filename', 'distance', 'label'])
with torch.no_grad():
    data = []
    for idx, row in tqdm(balanced_data.iterrows(), total=balanced_data.shape[0]):
        video_path = row['path']
        file_name = os.path.basename(video_path)

        faces = detection_pipeline(video_path)  # Returns a list of [1, 3, 160, 160] tensors
          # Display the grid of faces

        distances = process_faces(faces, feature_extractor)
        if distances is not None:
            for distance in distances:
                data.append({
                    'filename': file_name,
                    'distance': distance,
                    'label': row['label']
                })

    df = pd.DataFrame(data)
    df.to_csv('processed_full_dataset.csv', index=False)


  0%|          | 0/2136 [00:00<?, ?it/s]

INFO:root:Video length: 611 frames
ERROR:root:An error occurred while processing faces: Given groups=1, weight of size [32, 3, 3, 3], expected input[1, 150, 160, 160] to have 3 channels, but got 150 channels instead
INFO:root:Video length: 409 frames
ERROR:root:An error occurred while processing faces: Given groups=1, weight of size [32, 3, 3, 3], expected input[1, 150, 160, 160] to have 3 channels, but got 150 channels instead
INFO:root:Video length: 333 frames
ERROR:root:An error occurred while processing faces: Given groups=1, weight of size [32, 3, 3, 3], expected input[1, 150, 160, 160] to have 3 channels, but got 150 channels instead
INFO:root:Video length: 307 frames
ERROR:root:An error occurred while processing faces: Given groups=1, weight of size [32, 3, 3, 3], expected input[1, 150, 160, 160] to have 3 channels, but got 150 channels instead
INFO:root:Video length: 342 frames
ERROR:root:An error occurred while processing faces: Given groups=1, weight of size [32, 3, 3, 3], ex

KeyboardInterrupt: 