In [1]:
!pip install dask[complete] boto3 torch -q

In [2]:
import dask
from dask import delayed
!pip install opencv-python -q
import cv2
import numpy as np
import glob
import os

In [3]:
@delayed

def load_and_preprocess_pair(image_path, mask_path):
    img = cv2.imread(image_path)
    img = cv2.resize(img, (256, 256))
    img = img / 255.0

    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img = (img - mean) / std
    img = np.transpose(img, (2, 0, 1))

    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    mask = cv2.resize(mask, (256, 256), interpolation = cv2.INTER_NEAREST)
    
    return img, mask

# Training Dataset
# Load file paths
image_paths = sorted(glob.glob("./archive/train/img/*.jpg"))
mask_paths = sorted(glob.glob("./archive/train/masks/*.png"))

# Process images in parallel
pairs = [load_and_preprocess_pair(img, mask) for img, mask in zip(image_paths, mask_paths)]
results = dask.compute(*pairs)

# Unpack
images, masks = zip(*results)
images = np.array(images)
masks = np.array(masks)

# Save to disk
np.save("preprocessed_images.npy", images)
np.save("preprocessed_masks.npy", masks)

MemoryError: Unable to allocate 7.30 GiB for an array with shape (4983, 3, 256, 256) and data type float64

In [None]:
# Validation Dataset
# Load file paths
val_image_paths = sorted(glob.glob("./archive/val/img/*.jpg"))
val_mask_paths = sorted(glob.glob("./archive/val/masks/*.png"))

# Process images in parallel
val_pairs = [load_and_preprocess_pair(img, mask) for img, mask in zip(val_image_paths, val_mask_paths)]
val_results = dask.compute(*pairs)

# Unpack
val_images, val_masks = zip(*val_results)
val_images = np.array(val_images)
val_masks = np.array(val_masks)

# Save to disk
np.save("preprocessed_val_images.npy", val_images)
np.save("preprocessed_val_masks.npy", val_masks)

In [None]:
from torch.utils.data import Dataset

class SegmentationNPYDataset(Dataset):
    def __init__(self, img_path, masks_path):
        self.images = np.load(img_path)
        self.masks = np.load(masks_path)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = torch.tensor(self.images[idx], dtype = torch.float32)
        mask = torch.tensor(self.masks[idx], dtype = torch.long)

        return image, mask

In [None]:
train_dataset = SegmentationNPYDataset(
    img_path = "preprocessed_images.npy",
    masks_path = "preprocessed_masks.npy"
)

val_dataset = SegmentationNPYDataset(
    img_path = "preprocessed_val_images.npy",
    masks_path = "preprocessed_val_masks.npy"
)