Test 2

In [None]:
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

Loading the train, test, and validation data into NumPy ndarrays

In [None]:
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
import time
import matplotlib.pyplot as plt

In [None]:
# Root folder that holds the 'train', 'test' and 'validation' subfolders.
# Set the THESIS_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset or empty, the original local path is used.
data_dir = os.environ.get("THESIS_DATA_DIR") or "F:/thesis/data"

In [None]:
# Derive the per-split folders from the shared data root
train_dir, test_dir, val_dir = (
    os.path.join(data_dir, split_folder)
    for split_folder in ('train', 'test', 'validation')
)

def load_data(dataset_dir):
    """
    Load every image under ``dataset_dir`` together with its class label.

    Each immediate subfolder of ``dataset_dir`` is treated as one class, and its
    name is converted with ``int()`` to obtain the label. Subfolders, and the
    files inside them, are visited in sorted order.

    Args:
    - dataset_dir: Path of the directory whose subfolders contain the images

    Returns:
    - Tuple ``(images, labels)`` of numpy arrays: the pixel data of every image
      divided by 255.0, and the matching integer labels

    Raises:
    - ValueError: If a class subfolder that contains files has a name that is
      not an integer
    """
    # Collect the files first so the progress-bar total matches exactly what the
    # loading loop visits (files outside class subfolders are never loaded).
    samples = []
    for class_folder in sorted(os.listdir(dataset_dir)):
        class_dir = os.path.join(dataset_dir, class_folder)
        if not os.path.isdir(class_dir):
            continue
        for image_file in sorted(os.listdir(class_dir)):
            samples.append((os.path.join(class_dir, image_file), class_folder))

    images = []
    labels = []

    # The context manager closes the progress bar even if loading fails midway.
    with tqdm(total=len(samples), desc='Loading images', unit='image') as progress_bar:
        for image_path, class_folder in samples:
            # Convert while the file is still open, then release the handle.
            with Image.open(image_path) as image:
                # Convert image to numpy array and normalize pixel values
                images.append(np.array(image) / 255.0)
            labels.append(int(class_folder))
            progress_bar.update(1)

    # NOTE(review): stacking into one regular array presumably requires all
    # images to share a single shape — confirm for this dataset.
    return np.array(images), np.array(labels)

def _timed_load(split_dir):
    """
    Load one dataset split with load_data() and measure how long it took.

    Args:
    - split_dir: Directory passed on to load_data()

    Returns:
    - Tuple (images, labels, elapsed_seconds)
    """
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval is not affected by system clock adjustments.
    started = time.perf_counter()
    images, labels = load_data(split_dir)
    return images, labels, time.perf_counter() - started

# Load data for training set
train_images, train_labels, elapsed_time = _timed_load(train_dir)
print(f"Training set loaded in {elapsed_time:.2f} seconds")

# Load data for test set
test_images, test_labels, elapsed_time = _timed_load(test_dir)
print(f"Test set loaded in {elapsed_time:.2f} seconds")

# Load data for validation set
val_images, val_labels, elapsed_time = _timed_load(val_dir)
print(f"Validation set loaded in {elapsed_time:.2f} seconds")

In [None]:
# Report the array shapes of every split: images first, labels second
for split_name, split_images, split_labels in (
    ("train", train_images, train_labels),
    ("test", test_images, test_labels),
    ("validation", val_images, val_labels),
):
    print(f"Shape of {split_name} images array:", split_images.shape)
    print(f"Shape of {split_name} labels array:", split_labels.shape)

# Peek at the training array itself and confirm its container type
print(train_images)
print(type(train_images))

Exploratory data analysis (EDA)

In [None]:
def plot_sample_images(images, labels, num_samples=5, dataset_name=""):
    """
    Plot a few sample images from the dataset.

    The first ``num_samples`` images are shown side by side in one row, each
    titled with its label. If the dataset holds fewer images than requested,
    only the available ones are plotted.

    Args:
    - images: Numpy array representing images
    - labels: Numpy array representing corresponding labels
    - num_samples: Number of samples to plot (default: 5)
    - dataset_name: Name of the dataset (e.g., "Train", "Test", "Validation")

    Raises:
    - ValueError: If there is nothing to plot (no images, no labels, or
      num_samples smaller than 1)
    """
    # Never index past the end of a small dataset.
    count = min(num_samples, len(images), len(labels))
    if count < 1:
        raise ValueError(
            "No samples to plot: need at least one image and label, and num_samples >= 1"
        )

    # squeeze=False keeps `axes` two-dimensional, so a single sample is indexed
    # the same way as several.
    fig, axes = plt.subplots(1, count, figsize=(15, 3), squeeze=False)
    for i, ax in enumerate(axes[0]):
        ax.imshow(images[i])
        ax.set_title(f"Class: {labels[i]}")
        ax.axis('off')
    fig.suptitle(f"{dataset_name} Sample Images", fontsize=16)
    plt.show()

# Show sample images for each split in turn: train, test, then validation
for sample_title, sample_images, sample_labels in (
    ("Train", train_images, train_labels),
    ("Test", test_images, test_labels),
    ("Validation", val_images, val_labels),
):
    plot_sample_images(sample_images, sample_labels, dataset_name=sample_title)



In [None]:
def plot_class_distribution(labels_list, dataset_names):
    """
    Plot the distribution of classes in multiple datasets.

    The occurrences of every class are counted exactly per dataset and drawn as
    grouped bars, one group per class and one bar per dataset.

    Args:
    - labels_list: List of numpy arrays representing corresponding labels for each dataset
    - dataset_names: List of dataset names (e.g., ["Train", "Test", "Validation"])
    """
    plt.figure(figsize=(10, 6))

    # Count each class explicitly: equal-width histogram bins can merge distinct
    # classes when the label values have gaps, and fail on an empty dataset.
    per_dataset = []
    for labels, dataset_name in zip(labels_list, dataset_names):
        classes, counts = np.unique(np.asarray(labels), return_counts=True)
        per_dataset.append((dataset_name, dict(zip(classes.tolist(), counts.tolist()))))

    # Use the union of classes so every dataset shares the same x positions.
    all_classes = sorted({cls for _, class_counts in per_dataset for cls in class_counts})
    positions = np.arange(len(all_classes))
    bar_width = 0.8 / max(len(per_dataset), 1)

    for index, (dataset_name, class_counts) in enumerate(per_dataset):
        heights = [class_counts.get(cls, 0) for cls in all_classes]
        # Centre the group of bars on the class position.
        offset = (index - (len(per_dataset) - 1) / 2) * bar_width
        plt.bar(positions + offset, heights, width=bar_width, label=dataset_name)

    plt.xticks(positions, [str(cls) for cls in all_classes])
    plt.xlabel('Class')
    plt.ylabel('Frequency')
    plt.title('Class Distribution')
    plt.legend()
    plt.show()

# Compare how the classes are spread across the train, test, and validation sets
plot_class_distribution(
    labels_list=[train_labels, test_labels, val_labels],
    dataset_names=["Train", "Test", "Validation"],
)

In [None]:
def plot_image_size_distribution(images_list, dataset_names):
    """
    Plot the distribution of image sizes (height x width) in multiple datasets.

    Every image contributes one scatter point, with its first array dimension on
    the x-axis (labelled 'Height') and its second on the y-axis (labelled 'Width').

    Args:
    - images_list: List of numpy arrays representing images for each dataset
    - dataset_names: List of dataset names (e.g., ["Train", "Test", "Validation"])
    """
    plt.figure(figsize=(10, 6))

    for images, dataset_name in zip(images_list, dataset_names):
        # reshape(-1, 2) keeps the array two-dimensional even when a dataset is
        # empty, so the column indexing below cannot fail.
        sizes = np.array([(img.shape[0], img.shape[1]) for img in images]).reshape(-1, 2)
        plt.scatter(sizes[:, 0], sizes[:, 1], alpha=0.5, label=dataset_name)

    plt.xlabel('Height')
    plt.ylabel('Width')
    plt.title('Image Size Distribution')
    plt.legend()
    plt.show()

# Compare image dimensions across the train, test, and validation sets
plot_image_size_distribution(
    images_list=[train_images, test_images, val_images],
    dataset_names=["Train", "Test", "Validation"],
)