# Data Preprocessing for Chest X-Ray Classification
This notebook loads and preprocesses the chest X-ray dataset, which is split into training, validation, and test sets. Each set contains images and their corresponding labels. The pixel values are divided by 255, the images are rearranged into channels-first layout, and everything is converted to PyTorch tensors for model training.

In [2]:
import numpy as np
import torch
from torchvision import transforms, models
from torch.utils.data import TensorDataset, DataLoader
from pathlib import Path

In [2]:
def _load_split(npz_path):
    """Read one ``.npz`` split and return it as a ``TensorDataset``."""
    # ``np.load`` on an .npz returns a lazy NpzFile that keeps the archive's
    # file handle open; read both arrays inside ``with`` so it gets closed.
    with np.load(npz_path) as archive:
        # Go straight to float32: a float16 intermediate cannot represent
        # every value above 2048 and is upcast to float32 right away anyway.
        images = np.asarray(archive["image"], dtype=np.float32)
        labels = np.asarray(archive["image_label"]).astype(np.int64)

    # Move the last axis to position 1 (channels-last -> channels-first) and
    # divide by 255.
    x = torch.from_numpy(images).permute(0, 3, 1, 2) / 255

    y = torch.from_numpy(labels).squeeze()
    if y.ndim == 0:
        # A split with a single sample squeezes down to a scalar; restore the
        # sample axis so the label count still matches the image count.
        y = y.unsqueeze(0)

    return TensorDataset(x, y)


def load_data(path):
    """
    Load the dataset from the given path and preprocess it.

    Reads ``Dataset5_raw_train.npz``, ``Dataset5_raw_val.npz`` and
    ``Dataset5_raw_test.npz`` from ``path``. Each archive must provide an
    ``"image"`` array and an ``"image_label"`` array. Images are expected to
    be 4-D with the channel axis last (presumably ``(N, H, W, C)`` with pixel
    values in 0-255 -- confirm against the data files); they are converted to
    float32, permuted to channels-first order and divided by 255. Labels are
    converted to int64 and squeezed to drop singleton axes.

    Args:
        path (Path or str): Directory containing the dataset files.

    Returns:
        train_ds (TensorDataset): Training dataset.
        val_ds (TensorDataset): Validation dataset.
        test_ds (TensorDataset): Test dataset.
    """
    # Accept plain strings as well as Path objects.
    root = Path(path)

    # Create TensorDatasets
    train_ds = _load_split(root / "Dataset5_raw_train.npz")
    val_ds = _load_split(root / "Dataset5_raw_val.npz")
    test_ds = _load_split(root / "Dataset5_raw_test.npz")

    return train_ds, val_ds, test_ds

In [3]:
def create_data_loaders(train_ds, val_ds, test_ds, batch_size=64):
    """
    Wrap the three dataset splits in DataLoader objects.

    Only the training loader is created with ``shuffle=True``; the validation
    and test loaders are built with DataLoader's default ordering.

    Args:
        train_ds (TensorDataset): Training dataset.
        val_ds (TensorDataset): Validation dataset.
        test_ds (TensorDataset): Test dataset.
        batch_size (int): Batch size shared by all three loaders.

    Returns:
        train_dl (DataLoader): DataLoader for training dataset.
        val_dl (DataLoader): DataLoader for validation dataset.
        test_dl (DataLoader): DataLoader for test dataset.
    """
    shuffled_train = DataLoader(train_ds, shuffle=True, batch_size=batch_size)

    # Validation and test share the same construction, so build them together.
    eval_loaders = [
        DataLoader(split, batch_size=batch_size) for split in (val_ds, test_ds)
    ]

    return (shuffled_train, *eval_loaders)