In [2]:
import os
import cv2
import numpy as np
import random
import gc
import sys
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

### MobileNet Data Preprocessing: 224x224

In [7]:
# Define data augmentation parameters for training
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    rescale=1./255,  # Normalizing the images
    validation_split=0.2  # Set the validation split
)

# No augmentation for validation data, just rescaling
val_datagen = ImageDataGenerator(rescale=1./255)

# Base directory containing all images
base_directory = './data/train'

# Creating a DataFrame containing file paths and labels
data = []
for label in os.listdir(base_directory):
    label_dir = os.path.join(base_directory, label)
    if not os.path.isdir(label_dir) or label.startswith('.'):
        continue

    image_files = os.listdir(label_dir)  # Refresh the list of image files for the current label

    # If there are more than 500 images, randomly select 500
    if len(image_files) > 500:
        image_files = random.sample(image_files, 750)
    
    print(f"Processing label: {label} - {len(image_files)} images") 

    for img in image_files:
        data.append({'filename': os.path.join(label, img), 'class': label})

data_df = pd.DataFrame(data)


# Splitting the data into training and validation sets
train_df, val_df = train_test_split(data_df, test_size=0.2, stratify=data_df['class'])

# Flow images in batches from dataframe for training and validation
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=base_directory,
    x_col='filename',
    y_col='class',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=base_directory,
    x_col='filename',
    y_col='class',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)

def main():
    # Initialize lists to store data
    X_train, y_train = [], []
    X_val, y_val = [], []

    # Total number of batches
    total_train_batches = train_generator.n // train_generator.batch_size
    total_val_batches = val_generator.n // val_generator.batch_size

    # Iterate over the train generator and collect images and labels
    for i in range(total_train_batches):
        imgs, labels = next(train_generator)
        X_train.append(imgs)
        y_train.append(labels)
        progress = ((i + 1) / total_train_batches) * 100  # Calculate progress percentage
        print(f"Training progress: {progress:.2f}%", end="\r", flush=True)
        sys.stdout.flush()

    # Convert lists to numpy arrays
    X_train = np.concatenate(X_train, axis=0)
    y_train = np.concatenate(y_train, axis=0)

    # Repeat the process for the validation generator
    for i in range(total_val_batches):
        imgs, labels = next(val_generator)
        X_val.append(imgs)
        y_val.append(labels)
        progress = ((i + 1) / total_val_batches) * 100  # Calculate progress percentage
        print(f"Validation progress: {progress:.2f}%", end="\r", flush=True)
        sys.stdout.flush()


    X_val = np.concatenate(X_val, axis=0)
    y_val = np.concatenate(y_val, axis=0)

    # Save the numpy arrays to files
    np.save('X_train-s.npy', X_train)
    np.save('X_val-s.npy', X_val)
    np.save('y_train-s.npy', y_train)
    np.save('y_val-s.npy', y_val)
    print('Files saved')


if __name__ == '__main__':
    main()



Processing label: I - 750 images
Processing label: P - 750 images
Processing label: N - 750 images
Processing label: H - 750 images
Processing label: K - 750 images
Processing label: C - 750 images
Processing label: D - 750 images
Processing label: W - 750 images
Processing label: X - 750 images
Processing label: O - 750 images
Processing label: Q - 750 images
Processing label: F - 750 images
Processing label: J - 750 images
Processing label: V - 750 images
Processing label: G - 750 images
Processing label: U - 750 images
Processing label: A - 750 images
Processing label: L - 750 images
Processing label: B - 750 images
Processing label: Z - 750 images
Processing label: R - 750 images
Processing label: M - 750 images
Processing label: E - 750 images
Processing label: T - 750 images
Processing label: S - 750 images
Processing label: Y - 750 images
Found 15600 validated image filenames belonging to 26 classes.
Found 3900 validated image filenames belonging to 26 classes.
Files savedprogre

### GoogleNet Preprocessing: 299x299

In [None]:
# Define data augmentation parameters for training
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    rescale=1./255,  # Normalizing the images
    validation_split=0.2  # Set the validation split
)

# No augmentation for validation data, just rescaling
val_datagen = ImageDataGenerator(rescale=1./255)

# Base directory containing all images
base_directory = './data/train'

# Creating a DataFrame containing file paths and labels
data = []
for label in os.listdir(base_directory):
    label_dir = os.path.join(base_directory, label)
    if not os.path.isdir(label_dir) or label.startswith('.'):
        continue

    image_files = os.listdir(label_dir)  # Refresh the list of image files for the current label

    image_files = random.sample(image_files, 300)
    
    print(f"Processing label: {label} - {len(image_files)} images") 

    for img in image_files:
        data.append({'filename': os.path.join(label, img), 'class': label})

data_df = pd.DataFrame(data)


# Splitting the data into training and validation sets
train_df, val_df = train_test_split(data_df, test_size=0.2, stratify=data_df['class'])

# Flow images in batches from dataframe for training and validation
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=base_directory,
    x_col='filename',
    y_col='class',
    target_size=(299, 299),
    batch_size=32,
    class_mode='categorical'
)

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=base_directory,
    x_col='filename',
    y_col='class',
    target_size=(299, 299),
    batch_size=32,
    class_mode='categorical'
)

def main():
    # Initialize lists to store data
    X_train, y_train = [], []
    X_val, y_val = [], []

    # Total number of batches
    total_train_batches = train_generator.n // train_generator.batch_size
    total_val_batches = val_generator.n // val_generator.batch_size

    # Iterate over the train generator and collect images and labels
    for i in range(total_train_batches):
        imgs, labels = next(train_generator)
        X_train.append(imgs)
        y_train.append(labels)
        progress = ((i + 1) / total_train_batches) * 100  # Calculate progress percentage
        print(f"Training progress: {progress:.2f}%", end="\r", flush=True)
        sys.stdout.flush()

    # Convert lists to numpy arrays
    X_train = np.concatenate(X_train, axis=0)
    y_train = np.concatenate(y_train, axis=0)

    # Repeat the process for the validation generator
    for i in range(total_val_batches):
        imgs, labels = next(val_generator)
        X_val.append(imgs)
        y_val.append(labels)
        progress = ((i + 1) / total_val_batches) * 100  # Calculate progress percentage
        print(f"Validation progress: {progress:.2f}%", end="\r", flush=True)
        sys.stdout.flush()

    X_val = np.concatenate(X_val, axis=0)
    y_val = np.concatenate(y_val, axis=0)

    # Save the numpy arrays to files
    np.save('X_train-299.npy', X_train)
    np.save('X_val-299.npy', X_val)
    np.save('y_train-299.npy', y_train)
    np.save('y_val-299.npy', y_val)
    
    print('Files saved')


if __name__ == '__main__':
    main()



Processing label: I - 300 images
Processing label: P - 300 images
Processing label: N - 300 images
Processing label: H - 300 images
Processing label: K - 300 images
Processing label: C - 300 images
Processing label: D - 300 images
Processing label: W - 300 images
Processing label: X - 300 images
Processing label: O - 300 images
Processing label: Q - 300 images
Processing label: F - 300 images
Processing label: J - 300 images
Processing label: V - 300 images
Processing label: G - 300 images
Processing label: U - 300 images
Processing label: A - 300 images
Processing label: L - 300 images
Processing label: B - 300 images
Processing label: Z - 300 images
Processing label: R - 300 images
Processing label: M - 300 images
Processing label: E - 300 images
Processing label: T - 300 images
Processing label: S - 300 images
Processing label: Y - 300 images
Found 6240 validated image filenames belonging to 26 classes.
Found 1560 validated image filenames belonging to 26 classes.
Training progress: