In [None]:
import json
import os
import random
import shutil

from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Source dataset location and the three output split directories.
# All paths are relative to the notebook's working directory.
base_dir = 'data/dataset'
train_dir = 'data/train'
val_dir = 'data/validation'
test_dir = 'data/test'

# Make sure every split directory exists; exist_ok=True keeps this cell
# safe to re-run.
for split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)


In [None]:
# Split data (70/15/15)
#
# The split uses its own seeded RNG over *sorted* listings, so re-running
# this cell on an unchanged dataset copies every image to the same split
# again. With an unseeded shuffle a second run would draw a different split
# while the earlier copies stay on disk, leaking images between
# train/validation/test.
# NOTE: if the contents of base_dir change, delete the split directories
# before re-running, otherwise stale copies from the old split remain.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# Only sub-directories of base_dir are classes. Stray files (e.g. .DS_Store)
# are skipped; listing one as if it were a class directory would raise.
classes = sorted(
    entry for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
)
for cls in classes:
    cls_src_dir = os.path.join(base_dir, cls)
    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(os.path.join(split_dir, cls), exist_ok=True)

    # Get all images for this class (regular files only; shutil.copy cannot
    # copy a nested directory). Sorting first removes the dependence on the
    # arbitrary order returned by os.listdir.
    images = sorted(
        name for name in os.listdir(cls_src_dir)
        if os.path.isfile(os.path.join(cls_src_dir, name))
    )
    split_rng.shuffle(images)

    # Calculate split points: first 70% train, next 15% validation, rest test.
    train_split = int(0.7 * len(images))
    val_split = int(0.85 * len(images))

    # Split images into train/val/test
    train_images = images[:train_split]
    val_images = images[train_split:val_split]
    test_images = images[val_split:]

    # Copy images to respective folders
    for split_dir, split_images in (
        (train_dir, train_images),
        (val_dir, val_images),
        (test_dir, test_images),
    ):
        for img in split_images:
            shutil.copy(
                os.path.join(cls_src_dir, img),
                os.path.join(split_dir, cls, img),
            )

In [None]:
# Create data generators with augmentation
# Both generators share the same 1/255 rescale factor.
pixel_scale = 1. / 255

# Random augmentation settings; these are passed to the training
# generator only.
augmentation_options = {
    'rotation_range': 20,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'fill_mode': 'nearest',
}
train_datagen = ImageDataGenerator(rescale=pixel_scale, **augmentation_options)

# Validation/test generator: rescaling only, no augmentation.
val_test_datagen = ImageDataGenerator(rescale=pixel_scale)

In [None]:
# Create data generators
# Settings shared by all three splits, defined once so they cannot drift
# apart between train, validation and test.
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
flow_options = dict(
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)

# Training batches keep the Keras default (shuffle=True).
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)

# Evaluation generators must NOT shuffle: with the default shuffle=True the
# outputs of model.predict(generator) do not line up with generator.classes
# or generator.filenames, which silently corrupts confusion matrices and
# per-class reports.
validation_generator = val_test_datagen.flow_from_directory(
    val_dir,
    shuffle=False,
    **flow_options,
)

test_generator = val_test_datagen.flow_from_directory(
    test_dir,
    shuffle=False,
    **flow_options,
)

In [None]:
# Save class indices for later use
# `json` is imported in the notebook's top import cell.
class_indices_path = 'model/class_indices.json'

# open(..., 'w') does not create missing parent directories, so make sure
# `model/` exists first; otherwise a fresh checkout fails with
# FileNotFoundError.
os.makedirs(os.path.dirname(class_indices_path), exist_ok=True)
with open(class_indices_path, 'w') as f:
    json.dump(train_generator.class_indices, f)

print("Data preparation complete!")
print(f"Number of training samples: {train_generator.samples}")
print(f"Number of validation samples: {validation_generator.samples}")
print(f"Number of test samples: {test_generator.samples}")
print(f"Number of classes: {len(train_generator.class_indices)}")