# In [1]:
import os
import numpy as np
import cv2
from sklearn.model_selection import train_test_split

# Dataset location on disk; each class lives in its own sub-folder
data_dir = 'dataset'
# Class names: the uppercase letters 'A'..'Z' followed by a 'Space' class
categories = list(map(chr, range(ord('A'), ord('Z') + 1))) + ['Space']

# Function to load and preprocess data
def load_data():
    """Load the category images from disk and split them into train/test sets.

    For every entry in the module-level ``categories`` list, the folder
    ``data_dir/<category>`` is scanned. Each file is read as a grayscale
    image, resized to 64x64 and labelled with the category's position in
    ``categories``. Missing category folders and files OpenCV cannot decode
    are reported on stdout and skipped.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
        The image arrays are reshaped to ``(n, 64, 64, 1)``.

    Raises:
        ValueError: If no readable image was found in any category folder.
    """
    data = []
    labels = []

    # enumerate yields the numeric label directly (same value as
    # categories.index(category) for the unique names used here).
    for class_num, category in enumerate(categories):
        path = os.path.join(data_dir, category)

        # Check that the category directory exists (a plain file does not count)
        if not os.path.isdir(path):
            print(f"No images found in {category} folder. Add images to continue.")
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order, and therefore the seeded split, reproducible.
        for img_name in sorted(os.listdir(path)):
            img_path = os.path.join(path, img_name)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

            # cv2.imread signals failure by returning None instead of raising
            # (non-image files, sub-directories, corrupt images); passing that
            # on to cv2.resize would crash the whole load.
            if img is None:
                print(f"Skipping unreadable file: {img_path}")
                continue

            data.append(cv2.resize(img, (64, 64)))  # Resize to 64x64
            labels.append(class_num)

    if not data:
        raise ValueError(
            f"No readable images found under '{data_dir}'. "
            "Add images to the category folders to continue."
        )

    # Convert lists to numpy arrays
    data = np.array(data).reshape(-1, 64, 64, 1)  # Reshape for CNN input
    labels = np.array(labels)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.2, random_state=42
    )

    print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

    return X_train, X_test, y_train, y_test

# Script entry point: build the train/test splits when run directly
if __name__ == "__main__":
    (X_train, X_test, y_train, y_test) = load_data()


# Output:
# Training data shape: (2145, 64, 64, 1), Testing data shape: (537, 64, 64, 1)
