In [2]:
# 02_data_preprocessing.ipynb

# Step 1: Imports and Configuration
import os
import sys
import shutil
from pathlib import Path

import tensorflow as tf
import matplotlib.pyplot as plt

# Add project root to path
sys.path.append("..")

from src.model_config import ModelConfig
from src.data_preprocessing import DataLoader
from src.utils import validate_data_structure, print_system_info, display_sample_images
from src.data_augmentation import get_augmentation_pipeline

# Step 2: Load Configuration
# Everything below is read from the data-parameter mapping returned by the
# project's ModelConfig; the keys are looked up in the same order as before.
config = ModelConfig()
data_params = config.get_data_params()

# Source dataset location and the location of the processed data.
DATASET_PATH, PROCESSED_PATH = (
    Path(data_params["dataset_path"]),
    Path(data_params["processed_path"]),
)

# Image loading settings (image size is normalised to a tuple).
IMAGE_SIZE, BATCH_SIZE = (
    tuple(data_params["image_size"]),
    data_params["batch_size"],
)

# Split settings and the random seed shared by the steps below.
VALIDATION_SPLIT, TEST_SPLIT, SEED = (
    data_params["validation_split"],
    data_params["test_split"],
    data_params["random_seed"],
)

# Step 3: Verify Dataset Structure
# The processed directory is checked first; the split is only (re)built when
# that check fails.
print("Validating dataset structure...")
if validate_data_structure(PROCESSED_PATH):
    print("Data directory structure is valid.")
else:
    print("Processed directory structure is missing. Creating splits...")
    # Create processed folders
    config.create_directories()

    # Step 4: Load and Split Dataset
    loader = DataLoader(
        dataset_path=DATASET_PATH,
        processed_path=PROCESSED_PATH,
        image_size=IMAGE_SIZE,
        seed=SEED,
    )
    loader.prepare_and_split_data(
        val_split=VALIDATION_SPLIT,
        test_split=TEST_SPLIT,
    )

# Step 5: Display Sample Images from Training Set
print("Displaying sample training images...")

# Load the "train" sub-folder of the processed data as a shuffled, batched
# dataset with integer labels; the seed fixes the shuffle order.
train_dir = PROCESSED_PATH.joinpath("train")
train_ds = tf.keras.utils.image_dataset_from_directory(
    directory=train_dir,
    label_mode="int",
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True,
    seed=SEED,
)

# Class names as reported by the loaded dataset, passed on to the project's
# sample-image display helper.
class_names = train_ds.class_names
display_sample_images(train_ds, class_names)

# Step 6: Augmentation Pipeline
print("Creating data augmentation pipeline...")
aug_pipeline = get_augmentation_pipeline(config.augmentation_config)

# Step 7: Apply Augmentations to Training Data
# Only the training split is augmented; labels pass through unchanged.
# `num_parallel_calls=tf.data.AUTOTUNE` lets tf.data process several batches
# concurrently instead of one at a time; element order is still preserved
# (the default for `map`).
# NOTE(review): assumes `aug_pipeline` is a Keras-style callable that accepts
# `training=True` and is safe to trace inside `Dataset.map` — confirm in
# src/data_augmentation.
train_ds = train_ds.map(
    lambda x, y: (aug_pipeline(x, training=True), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)

# Validation and test splits are loaded without augmentation and without
# shuffling, so batches come back in a fixed order on every pass.
val_ds = tf.keras.utils.image_dataset_from_directory(
    PROCESSED_PATH / "validation",
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="int",
    shuffle=False
)
test_ds = tf.keras.utils.image_dataset_from_directory(
    PROCESSED_PATH / "test",
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="int",
    shuffle=False
)

# Step 8: Prefetch for Performance
# Each split is rebound to a prefetching version of itself, with the buffer
# size left to tf.data's autotuning.
AUTOTUNE = tf.data.AUTOTUNE
train_ds, val_ds, test_ds = [
    split.prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
]

# Step 9: Print Dataset Sizes
# len() of a batched dataset is its number of batches, not its number of images.
print("\nDataset Summary:")
print("Training Batches:", len(train_ds))
print("Validation Batches:", len(val_ds))
print("Test Batches:", len(test_ds))

# Step 10: Optional - Save datasets for reuse (optional step)
# You can export TFRecord or cache datasets as needed

print("✅ Data preprocessing complete.")

ScannerError: mapping values are not allowed here
  in "c:\Users\HP\OneDrive\Desktop\leukemia_detection\notebooks\..\config\config.yaml", line 3, column 39