In [13]:
import os
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [None]:
! pip install tensorflow

In [15]:
# Configuration shared by the cells below
DATA_DIR = "data"  # root folder; each sub-folder inside it is treated as one class
IMAGE_SIZE = 128  # target edge length in pixels (images become IMAGE_SIZE x IMAGE_SIZE)

In [16]:
# Data holders (kept index-aligned: labels[i] is the class of images[i])
images = []  # resized, [0, 1]-normalised float32 image arrays
labels = []  # name of the class folder each image came from

# ----------------------------- Load Images -----------------------------
# Folders and files are visited in sorted order. os.listdir() returns entries
# in an arbitrary, filesystem-dependent order, which would otherwise make the
# seeded train/test split below differ from one machine to the next.
for label in sorted(os.listdir(DATA_DIR)):
    label_folder = os.path.join(DATA_DIR, label)  # Full path to the class folder

    # Only directories are class folders; skip stray files next to them
    if not os.path.isdir(label_folder):
        continue

    for img_file in sorted(os.listdir(label_folder)):
        img_path = os.path.join(label_folder, img_file)  # Full path to the image file
        try:
            # cv2.imread does not raise on an unreadable/non-image file, it
            # returns None. Note it also loads colour images in BGR channel
            # order, so inference code must read images the same way.
            img = cv2.imread(img_path)
            if img is None:
                print(f"Failed to process {img_path}: file could not be read as an image")
                continue

            img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE))
            # Scale to [0, 1] in float32; a plain `img / 255.0` would silently
            # produce float64 and double the memory and .npy file size.
            img = img.astype(np.float32) / 255.0
            images.append(img)
            labels.append(label)
        except Exception as e:
            # Best effort: report the bad file and keep going with the rest
            print(f"Failed to process {img_path}: {e}")

# Fail early with a clear message instead of an obscure error further down
if not images:
    raise ValueError(f"No readable images found under {DATA_DIR!r}")

# ----------------------------- Convert to Numpy Arrays -----------------------------
X = np.array(images)  # stacked images, first axis indexes the samples
y = np.array(labels)  # class names as strings, aligned with X

# ----------------------------- Encode Labels -----------------------------
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # class name -> integer index into label_encoder.classes_
y_categorical = to_categorical(y_encoded)  # integer index -> one-hot row

# ----------------------------- Train-Test Split -----------------------------
# 80% train / 20% test, with class proportions preserved in both parts.
# Stratification uses the 1-D integer labels rather than the one-hot matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

# ----------------------------- Save Data to .npy Files -----------------------------
np.save("X_train.npy", X_train)
np.save("X_test.npy", X_test)
np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)
np.save("classes.npy", label_encoder.classes_)  # class names, ordered like the one-hot columns

print("✅ Preprocessing complete. Data saved.")

✅ Preprocessing complete. Data saved.


In [17]:
# Read the training images, their one-hot labels and the class names back
# from the .npy files in the working directory.
X_train, y_train, classes = map(
    np.load, ("X_train.npy", "y_train.npy", "classes.npy")
)

In [18]:
# Sanity check: look at one training sample and the list of classes
first_image, first_label = X_train[0], y_train[0]
print(first_image.shape)  # shape of a single image, e.g. (128, 128, 3)
print(first_label)        # one-hot label vector, one entry per class
print(classes)            # array of class names loaded from classes.npy

(128, 128, 3)
[0. 1. 0. 0. 0. 0. 0. 0.]
['Pepper_bell_Bacterial_spot' 'Pepper_bell_healthy' 'Potato_Late_blight'
 'Potato_heatlhy' 'Tomato_Bacterial_spot' 'Tomato_Septorial_leaf_spot'
 'Tomato__Tomato_mosaic_virus' 'Tomato_healthy']
