In [1]:
import os
import cv2  # OpenCV library for image processing
import numpy as np
from tqdm import tqdm

# --- Configuration ---
# Root folder of the extracted dataset
BASE_DATA_DIR = '../data/fer2013/'

# Destination folder for the processed NumPy arrays
OUTPUT_DIR = '../data/fer2013_processed/'

# Width and height (in pixels) every image is resized to for the model
IMG_WIDTH, IMG_HEIGHT = 48, 48

# Emotion folder name -> integer label; a name's position in the tuple
# below is its label, and also the order in which folders are processed.
EMOTION_MAP = {
    name: index
    for index, name in enumerate(
        ('angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral')
    )
}

def load_data_from_directory(directory_path):
    """
    Loads images and labels from a directory structured with emotion subfolders.

    For every key of EMOTION_MAP, the subfolder of that name is scanned. Each
    entry that OpenCV can decode is read as grayscale, resized to
    (IMG_WIDTH, IMG_HEIGHT) and paired with the folder's integer label.
    Entries are visited in sorted name order so that repeated runs yield the
    same sample ordering. Missing subfolders and undecodable files are
    skipped, and a summary of what was skipped is printed at the end.

    Args:
        directory_path (str): Path to the directory (e.g., '../data/fer2013/train').

    Returns:
        tuple: A tuple containing two lists: images and labels.
    """
    images = []
    labels = []
    missing_folders = []
    unreadable_count = 0

    print(f"Loading data from: {directory_path}")
    # Loop through each emotion folder (e.g., 'happy', 'sad')
    for emotion_name, label in tqdm(EMOTION_MAP.items(), desc="Processing emotions"):
        emotion_folder_path = os.path.join(directory_path, emotion_name)

        # Remember missing folders so they can be reported instead of silently ignored
        if not os.path.isdir(emotion_folder_path):
            missing_folders.append(emotion_name)
            continue

        # os.listdir returns entries in arbitrary order; sort for reproducible output
        for image_name in sorted(os.listdir(emotion_folder_path)):
            image_path = os.path.join(emotion_folder_path, image_name)

            # Read the image in grayscale format
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

            # cv2.imread yields None when the entry cannot be decoded as an image
            if image is None:
                unreadable_count += 1
                continue

            # Resize to ensure uniform dimensions
            image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
            images.append(image)
            labels.append(label)

    # Report after the loop so the messages do not interleave with the progress bar
    if missing_folders:
        print(f"Warning: emotion folders not found in {directory_path}: "
              f"{', '.join(missing_folders)}")
    if unreadable_count:
        print(f"Warning: skipped {unreadable_count} file(s) that could not be read as images")

    return images, labels

def process_images_from_folders():
    """
    Main function to process training and testing images from folders,
    normalize them, and save as NumPy arrays.

    Images are loaded from the 'train' and 'test' subfolders of BASE_DATA_DIR,
    scaled to the 0-1 range as float32, given a trailing channel axis, and
    written to OUTPUT_DIR as X_train.npy, y_train.npy, X_val.npy and y_val.npy.

    Raises:
        ValueError: If no images could be loaded for either split.
    """
    # Load training data
    train_path = os.path.join(BASE_DATA_DIR, 'train')
    X_train, y_train = load_data_from_directory(train_path)

    # Load validation/testing data
    val_path = os.path.join(BASE_DATA_DIR, 'test')
    X_val, y_val = load_data_from_directory(val_path)

    # Fail early: an empty image list would otherwise be turned into a
    # meaningless array and saved as if preprocessing had succeeded.
    for split_path, split_images in ((train_path, X_train), (val_path, X_val)):
        if not split_images:
            raise ValueError(f"No images could be loaded from: {split_path}")

    print("\nConverting lists to NumPy arrays and normalizing data...")
    # Convert lists to NumPy arrays and normalize pixel values (0-1 range)
    X_train = np.array(X_train, 'float32') / 255.0
    y_train = np.array(y_train)

    X_val = np.array(X_val, 'float32') / 255.0
    y_val = np.array(y_val)

    # Add the channel dimension for the CNN (48, 48) -> (48, 48, 1)
    X_train = np.expand_dims(X_train, -1)
    X_val = np.expand_dims(X_val, -1)

    # Create the output directory if it doesn't exist. exist_ok=True avoids a
    # failure if the directory appears between the check and the creation.
    already_present = os.path.exists(OUTPUT_DIR)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    if not already_present:
        print(f"Created directory: {OUTPUT_DIR}")

    # Save the processed data
    print(f"Saving processed data to {OUTPUT_DIR}...")
    np.save(os.path.join(OUTPUT_DIR, 'X_train.npy'), X_train)
    np.save(os.path.join(OUTPUT_DIR, 'y_train.npy'), y_train)
    np.save(os.path.join(OUTPUT_DIR, 'X_val.npy'), X_val)
    np.save(os.path.join(OUTPUT_DIR, 'y_val.npy'), y_val)

    print("\n--- Preprocessing Complete ---")
    print(f"Training data shape: {X_train.shape}")
    print(f"Training labels shape: {y_train.shape}")
    print(f"Validation data shape: {X_val.shape}")
    print(f"Validation labels shape: {y_val.shape}")
    print("----------------------------")

# Run the preprocessing pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    process_images_from_folders()

Loading data from: ../data/fer2013/train


Processing emotions: 100%|███████████████████████████████████████████████████████████████| 7/7 [06:41<00:00, 57.34s/it]


Loading data from: ../data/fer2013/test


Processing emotions: 100%|███████████████████████████████████████████████████████████████| 7/7 [01:52<00:00, 16.00s/it]



Converting lists to NumPy arrays and normalizing data...
Saving processed data to ../data/fer2013_processed/...

--- Preprocessing Complete ---
Training data shape: (28709, 48, 48, 1)
Training labels shape: (28709,)
Validation data shape: (7178, 48, 48, 1)
Validation labels shape: (7178,)
----------------------------
