In [7]:
# 02_data_preprocessing.ipynb

# ----------------------------
# Step 1: Imports and Setup
# ----------------------------
import os
import sys
from pathlib import Path
import tensorflow as tf
import matplotlib.pyplot as plt

# Add project root (the parent of the current working directory) to the
# import path so the `src` package resolves. The membership check keeps
# repeated executions of this cell from stacking duplicate sys.path entries.
_PROJECT_ROOT = os.path.abspath("..")
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

from src.model_config import ModelConfig
from src.data_preprocessing import DataLoader
from src.utils import (
    validate_data_structure,
    print_system_info,
    display_sample_images
)
from src.data_augmentation import get_augmentation_pipeline

# ----------------------------
# Step 2: Load Configurations
# ----------------------------
# Build the project configuration object and fetch its data parameters.
config = ModelConfig()
data_params = config.get_data_params()

# Directory settings, wrapped as Path objects.
DATASET_PATH, PROCESSED_PATH = (
    Path(data_params[key]) for key in ("dataset_path", "processed_path")
)

# The image size is coerced to a tuple; the remaining settings are used as-is.
IMAGE_SIZE = tuple(data_params["image_size"])
BATCH_SIZE, VALIDATION_SPLIT, TEST_SPLIT, SEED = (
    data_params[key]
    for key in ("batch_size", "validation_split", "test_split", "random_seed")
)

print_system_info()

# ----------------------------
# Step 3: Prepare Dataset
# ----------------------------
# The split is generated only when validate_data_structure() rejects the
# processed directory; otherwise the existing layout is reused as-is.
if validate_data_structure(PROCESSED_PATH):
    print("[INFO] Processed dataset structure found. Skipping splitting step.")
else:
    print("[INFO] Processed directory structure missing. Generating split datasets...")
    config.create_directories()

    loader = DataLoader(
        dataset_path=DATASET_PATH,
        processed_path=PROCESSED_PATH,
        image_size=IMAGE_SIZE,
        seed=SEED,
    )
    loader.prepare_and_split_data(val_split=VALIDATION_SPLIT, test_split=TEST_SPLIT)

# ----------------------------
# Step 4: Load TF Datasets
# ----------------------------
print("\n[INFO] Loading datasets...")

# One sub-directory of the processed path per split.
train_dir = PROCESSED_PATH / "train"
val_dir = PROCESSED_PATH / "validation"
test_dir = PROCESSED_PATH / "test"

# Keyword arguments shared by all three splits.
_shared_loader_kwargs = {
    "image_size": IMAGE_SIZE,
    "batch_size": BATCH_SIZE,
    "label_mode": "int",
}

# Only the training split is shuffled (seeded with SEED).
train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir, shuffle=True, seed=SEED, **_shared_loader_kwargs
)

# Validation and test data are loaded without shuffling.
val_ds = tf.keras.utils.image_dataset_from_directory(
    val_dir, shuffle=False, **_shared_loader_kwargs
)
test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir, shuffle=False, **_shared_loader_kwargs
)

# Capture the class names now, from the dataset object returned by the loader.
class_names = train_ds.class_names
print(f"\n[INFO] Classes: {class_names}")

# ----------------------------
# Step 5: Show Sample Images
# ----------------------------
# train_ds has not been augmented yet at this point (augmentation is mapped
# on in Step 6), so whatever display_sample_images shows is un-augmented data.
# NOTE(review): the helper lives in src.utils; its exact output (presumably a
# matplotlib grid) is not visible from this notebook. Confirm if it matters.
print("\n[INFO] Displaying sample training images...")
display_sample_images(train_ds, class_names)

# ----------------------------
# Step 6: Data Augmentation
# ----------------------------
print("\n[INFO] Applying augmentation to training data...")
# Build the augmentation layer from the project configuration.
augmentation_layer = get_augmentation_pipeline(config.augmentation_config)

# Augment the training split only; validation and test data are left untouched.
# The layer is called with training=True, and labels pass through unchanged.
# num_parallel_calls lets tf.data process batches concurrently instead of one
# at a time; element order is still preserved by default.
train_ds = train_ds.map(
    lambda x, y: (augmentation_layer(x, training=True), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)

# ----------------------------
# Step 7: Prefetch for Performance
# ----------------------------
AUTOTUNE = tf.data.AUTOTUNE

# Attach an auto-tuned prefetch stage to each of the three splits.
train_ds, val_ds, test_ds = (
    split.prefetch(buffer_size=AUTOTUNE) for split in (train_ds, val_ds, test_ds)
)

# ----------------------------
# Step 8: Print Summary
# ----------------------------
print("\n✅ Dataset pipeline ready!")
# len() of a batched dataset is its number of batches, not its number of images.
print(
    f"Training batches:   {len(train_ds)}",
    f"Validation batches: {len(val_ds)}",
    f"Test batches:       {len(test_ds)}",
    sep="\n",
)

ModuleNotFoundError: No module named 'seaborn'