In [11]:
import os
import shutil
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image

In [12]:
# Name of the dataset variant; interpolated into the output directory path below.
dataset_name = 'mini_mnist'

In [13]:
# Input location: the current working directory.
# NOTE(review): input_dir is not referenced by any other visible cell — confirm it is still needed.
input_dir = '.' 
# Root folder for the exported images; a relative path, so it resolves against the working directory.
output_dir = f'./../../processed/{dataset_name}/'

# Process Mini MNIST Data

In [14]:
# Load MNIST data through Keras; the call returns an (images, labels) pair
# for the training split and another for the test split.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

In [15]:
# Sanity check: show the array shapes of both splits before exporting anything.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(60000, 28, 28) (60000,) (10000, 28, 28) (10000,)


In [16]:
def clear_data_folders(base_dir: str) -> None:
    """
    Resets the 'training' and 'testing' subdirectories of a base directory to empty folders.

    A subdirectory that already exists is deleted together with everything inside it and then
    recreated; a missing subdirectory is simply created. One status line is printed per
    subdirectory.

    Args:
        base_dir (str): Path of the directory that holds (or will hold) the 'training' and
                        'testing' subdirectories.

    Returns:
        None: The function only modifies the file system and prints status messages.
    """
    for dataset_type in ('training', 'testing'):
        target = os.path.join(base_dir, dataset_type)

        # Nothing to wipe yet: create the folder and move on to the next one
        if not os.path.exists(target):
            os.makedirs(target, exist_ok=True)
            print(f"Created {dataset_type} directory.")
            continue

        # Folder is present: drop it with all its contents, then put back an empty one
        shutil.rmtree(target)
        os.makedirs(target, exist_ok=True)
        print(f"Cleared {dataset_type} directory.")


# Start from empty 'training' and 'testing' folders so images from an earlier run do not linger.
# NOTE: this deletes everything currently stored under those two folders of output_dir.
clear_data_folders(output_dir)

Cleared training directory.
Cleared testing directory.


In [17]:
def save_images(
        images: np.ndarray,
        labels: np.ndarray,
        dataset_type: str,
        sampling_frac: float,
        seed: int = 42,
        base_dir: Optional[str] = None
    ) -> None:
    """
    Saves a fraction of images to disk as JPEG files, organized in directories corresponding
    to their labels, with stratified sampling to maintain class distribution.

    Each sampled image is written to ``<base_dir>/<dataset_type>/<label>/<idx>.jpg``, where
    ``idx`` is the position of the image in ``images``. For every label,
    ``int(count * sampling_frac)`` images are drawn without replacement; a label for which this
    rounds down to zero gets no directory.

    Args:
        images (np.ndarray): A numpy array of images, indexed along its first axis; each item is
                             handed to ``PIL.Image.fromarray``.
        labels (np.ndarray): A numpy array of labels, aligned position-by-position with ``images``.
        dataset_type (str): A string indicating the dataset type ('training' or 'testing') for
                            directory organization.
        sampling_frac (float): A float between 0 and 1 indicating the fraction of each class to save.
        seed (int, optional): An integer seed for reproducibility of the sampling. Defaults to 42.
        base_dir (str, optional): Root directory under which the images are written. Defaults to
                                  None, in which case the notebook-level ``output_dir`` is used.

    Returns:
        None: This function does not return a value but saves a fraction of images to the disk.

    Raises:
        ValueError: If ``sampling_frac`` is not within the range [0, 1].
    """
    # Fail early with a clear message instead of an opaque NumPy error after partial work
    if not 0 <= sampling_frac <= 1:
        raise ValueError(f"sampling_frac must be between 0 and 1, got {sampling_frac}")

    # Fall back to the notebook-level output_dir so existing calls keep working unchanged
    root_dir = output_dir if base_dir is None else base_dir

    # A private RandomState yields the same draws as seeding the global NumPy RNG with `seed`,
    # but leaves the global RNG state untouched for the rest of the notebook
    rng = np.random.RandomState(seed)
    unique_labels = np.unique(labels)

    print(f"Processing {sampling_frac*100}% of images from {dataset_type} set...")

    for label in unique_labels:
        # Positions of all images carrying the current label
        idxs = np.where(labels == label)[0]
        # Stratified sampling: select a fraction of indices for the current label.
        # The draw is made even when the size is 0 so the random stream stays aligned
        # across labels.
        sampled_idxs = rng.choice(idxs, size=int(len(idxs) * sampling_frac), replace=False)
        if sampled_idxs.size == 0:
            continue

        # The label directory is the same for every image of this label: create it once
        label_dir = os.path.join(root_dir, dataset_type, str(label))
        os.makedirs(label_dir, exist_ok=True)

        for idx in sampled_idxs:
            # File name is the image's position in the input array
            image_file = os.path.join(label_dir, f'{idx}.jpg')
            Image.fromarray(images[idx]).save(image_file, 'JPEG')

    print(f"Done processing images in {dataset_type} set.")

In [18]:
# Fraction of every class to export, shared by both splits
sampling_frac = 0.1

# Sample and save the training images first, then the testing images
for split_images, split_labels, split_name in (
        (x_train, y_train, 'training'),
        (x_test, y_test, 'testing'),
    ):
    save_images(split_images, split_labels, split_name, sampling_frac)

print("Images have been saved.")

Processing 10.0% of images from training set...
Done processing images in training set.
Processing 10.0% of images from testing set...
Done processing images in testing set.
Images have been saved.
