In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import shutil
from PIL import Image
from typing import Tuple, List

2024-04-02 14:50:49.897196: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Short dataset identifier; it is interpolated into the output directory path.
dataset_name = 'cifar100'

In [3]:
# NOTE(review): input_dir is not referenced by any of the visible cells —
# confirm whether it is still needed.
input_dir = '.' 
# Root folder for the exported images, relative to the notebook's working
# directory; the per-split subfolders are created beneath it.
output_dir = f'./../../processed/{dataset_name}/'

# Process CIFAR-100 Data

In [4]:
# Load the CIFAR-100 train/test splits via Keras (downloaded on first use).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar100.load_data()

# Collapse the label arrays to 1-D so each entry is a single integer class index.
y_train = y_train.flatten()
y_test = y_test.flatten()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz
[1m169001437/169001437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 0us/step


In [5]:
# Sanity check: show the image and label array shapes for both splits.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(50000, 32, 32, 3) (50000,) (10000, 32, 32, 3) (10000,)


In [6]:
# Peek at the flattened training labels (integer class indices).
y_train

array([19, 29,  0, ...,  3,  7, 73])

In [9]:
# Maps each integer class index (0-99) to its human-readable class name.
# The names are listed in index order, so a name's position in the list *is*
# its label index; the trailing comments give the index range of each row.
idx_to_label = dict(enumerate([
    'apple', 'aquarium fish', 'baby', 'bear', 'beaver',                   # 0-4
    'bed', 'bee', 'beetle', 'bicycle', 'bottle',                          # 5-9
    'bowl', 'boy', 'bridge', 'bus', 'butterfly',                          # 10-14
    'camel', 'can', 'castle', 'caterpillar', 'cattle',                    # 15-19
    'chair', 'chimpanzee', 'clock', 'cloud', 'cockroach',                 # 20-24
    'couch', 'crab', 'crocodile', 'cup', 'dinosaur',                      # 25-29
    'dolphin', 'elephant', 'flatfish', 'forest', 'fox',                   # 30-34
    'girl', 'hamster', 'house', 'kangaroo', 'keyboard',                   # 35-39
    'lamp', 'lawn mower', 'leopard', 'lion', 'lizard',                    # 40-44
    'lobster', 'man', 'maple tree', 'motorcycle', 'mountain',             # 45-49
    'mouse', 'mushroom', 'oak tree', 'orange', 'orchid',                  # 50-54
    'otter', 'palm tree', 'pear', 'pickup truck', 'pine tree',            # 55-59
    'plain', 'plate', 'poppy', 'porcupine', 'possum',                     # 60-64
    'rabbit', 'raccoon', 'ray', 'road', 'rocket',                         # 65-69
    'rose', 'sea', 'seal', 'shark', 'shrew',                              # 70-74
    'skunk', 'skyscraper', 'snail', 'snake', 'spider',                    # 75-79
    'squirrel', 'streetcar', 'sunflower', 'sweet pepper', 'table',        # 80-84
    'tank', 'telephone', 'television', 'tiger', 'tractor',                # 85-89
    'train', 'trout', 'tulip', 'turtle', 'wardrobe',                      # 90-94
    'whale', 'willow tree', 'wolf', 'woman', 'worm',                      # 95-99
]))


In [10]:
def clear_data_folders(base_dir: str) -> None:
    """
    Reset the 'training' and 'testing' subdirectories of ``base_dir`` to empty folders.

    A subdirectory that already exists is deleted together with everything in it
    and then recreated; one that is missing is simply created (including any
    missing parent directories). A one-line status message is printed per
    subdirectory.

    Args:
        base_dir (str): Directory under which 'training' and 'testing' live.

    Returns:
        None
    """
    for dataset_type in ('training', 'testing'):
        target = os.path.join(base_dir, dataset_type)
        # Remember whether the folder was there so the right message is printed
        # after the (shared) creation step.
        existed = os.path.exists(target)
        if existed:
            # Drop the old folder and all of its contents.
            shutil.rmtree(target)
        os.makedirs(target, exist_ok=True)
        print(
            f"Cleared {dataset_type} directory."
            if existed
            else f"Created {dataset_type} directory."
        )

# Start from empty training/ and testing/ folders under output_dir so that
# files left by an earlier run cannot mix with this export.
clear_data_folders(output_dir)

Created training directory.
Created testing directory.


In [11]:
# Function to save images
def save_images(images: np.ndarray, labels: np.ndarray, dataset_type: str) -> None:
    """
    Write each image to disk as a JPEG, grouped into one folder per class name.

    Files are written to ``<output_dir>/<dataset_type>/<class name>/<idx>.jpg``,
    where ``output_dir`` and ``idx_to_label`` are the notebook-level globals,
    ``<class name>`` is ``idx_to_label[label]`` and ``<idx>`` is the image's
    position within ``images``. Progress is printed every 5000 images.

    Args:
        images (np.ndarray): Array of images, one per entry along the first axis
            (in this notebook, the (N, 32, 32, 3) CIFAR-100 arrays). Each entry
            must be convertible by ``PIL.Image.fromarray``.
        labels (np.ndarray): 1-D array of integer class indices, aligned with
            ``images``; every value must be a key of ``idx_to_label``.
        dataset_type (str): Split name used as the first folder level, e.g.
            'training' or 'testing'.

    Returns:
        None: The function only writes files and prints progress.

    Raises:
        ValueError: If ``images`` and ``labels`` differ in length (``zip`` would
            otherwise silently drop the unmatched tail).
        KeyError: If a label has no entry in ``idx_to_label``.
    """
    if len(images) != len(labels):
        raise ValueError(
            f"images and labels must have the same length, "
            f"got {len(images)} and {len(labels)}"
        )

    print(f"Processing images from {dataset_type} set...")

    # Class folders already created during this call; avoids one makedirs
    # system call per image when there are only a handful of distinct classes.
    created_dirs = set()
    # Tracked separately from the loop variable so the summary line below is
    # valid even when `images` is empty (the loop variable would be unbound).
    num_saved = 0

    for idx, (image, label_idx) in enumerate(zip(images, labels)):
        # int() normalises NumPy integer scalars to a plain Python key.
        label = idx_to_label[int(label_idx)]
        label_dir = os.path.join(output_dir, dataset_type, str(label))
        if label_dir not in created_dirs:
            os.makedirs(label_dir, exist_ok=True)
            created_dirs.add(label_dir)

        # The running index keeps file names unique across classes in a split.
        image_file = os.path.join(label_dir, f'{idx}.jpg')
        Image.fromarray(image).save(image_file, 'JPEG')
        num_saved = idx + 1

        # idx is zero-based, so this fires just after image number idx + 1 has
        # been written; the message is kept unchanged to match existing logs.
        if idx > 0 and idx % 5000 == 0:
            print(f"Processed {idx} images.")

    print(f"Done processing {num_saved} images in {dataset_type} set")

In [12]:
# Export the training split to <output_dir>/training/<class name>/
save_images(x_train, y_train, 'training')
# Export the test split to <output_dir>/testing/<class name>/
save_images(x_test, y_test, 'testing')

print("Images have been saved.")

Processing images from training set...
Processed 5000 images.
Processed 10000 images.
Processed 15000 images.
Processed 20000 images.
Processed 25000 images.
Processed 30000 images.
Processed 35000 images.
Processed 40000 images.
Processed 45000 images.
Done processing 50000 images in training set
Processing images from testing set...
Processed 5000 images.
Done processing 10000 images in testing set
Images have been saved.
