In [None]:
!pip install torch torchvision

Load Data
- Traverse train/test folders
- Create Xtrain/ytrain & Xtest/ytest vectors
- Load into matrix

In [None]:
# NOTE: everything between the triple quotes below is a string literal, not
# live code. None of it runs when this cell is executed -- including the
# imports of `os`, `PIL.Image` and `numpy` -- so later cells cannot rely on
# names defined here.
# The disabled code walks the Train/Test folders, stacks every .jpg into
# X_train / X_test arrays and saves them as .npy files; it builds no label
# (y) arrays. It appears to be an earlier eager-loading approach -- TODO
# confirm whether it can be deleted.
# Encoding: 1 represents defective, 0 represents non-defective
"""
import os
from PIL import Image
import numpy as np

X_train = []
X_test = []
base_path = os.path.join("pepsico-lab-potato-quality-control", "Pepsico RnD Potato Lab Dataset")

for folder in ['Train', 'Test']:
    for category in ['Non-Defective', 'Defective']:
        folder_path = os.path.join(base_path, folder, category)
        for image_path in os.listdir(folder_path):
            if image_path.endswith('.jpg') is False:
                continue
            img = Image.open(os.path.join(folder_path, image_path))
            if folder == 'Train':
                X_train.append(np.asarray(img))
            else:
                X_test.append(np.asarray(img))

X_train = np.array(X_train)
X_test = np.array(X_test)

print(f'X_test Shape: {X_test.shape}')
print(f'X_train Shape: {X_train.shape}')

with open('X_train.npy', 'wb') as file:
    np.save(file, X_train)

with open('X_test.npy', 'wb') as file:
    np.save(file, X_test)
"""

In [None]:
import os

import numpy as np
import torch
from PIL import Image

class Dataset(torch.utils.data.Dataset):
    """
    Map-style dataset that loads labelled images from disk on demand.

    Each item is read from `<base_path>/<category>/<file name>`, where the
    category folder is "Defective" when the label equals 1 and
    "Non-Defective" for any other label. Nothing is read from disk until an
    item is requested.

    Relies on `os`, `np` (NumPy) and `Image` (PIL) being imported in the
    notebook before an item is fetched.
    """

    def __init__(self, file_ids, labels, base_path):
        """
        Constructor for the Dataset.

        Parameters
        ----------
        file_ids : list[str]
            List of file names (not paths)
        labels : list[int]
            List of class identifiers, corresponding to the list of file IDs
        base_path : str
            Path to the folder containing the "Defective" and "Non-Defective"
            sub-folders in which the files in `file_ids` live

        Raises
        ------
        ValueError
            If `file_ids` and `labels` do not have the same length.
        """
        # Fail early: a mismatch would otherwise only surface as an IndexError
        # part-way through training, or silently ignore surplus labels.
        if len(file_ids) != len(labels):
            raise ValueError(
                "file_ids and labels must have the same length, "
                f"got {len(file_ids)} and {len(labels)}"
            )
        self.file_ids = file_ids
        self.labels = labels
        self.base_path = base_path

    def __len__(self):
        """
        Return the size of the dataset.

        Returns
        -------
        int
            Number of entries in `file_ids`.
        """
        return len(self.file_ids)

    def __getitem__(self, index):
        """
        Get a feature and label tuple based on the index.

        Parameters
        ----------
        index : int
            Position of the sample. This function does not check the bounds
            of the Dataset; an invalid index raises whatever the underlying
            `file_ids` / `labels` containers raise.

        Returns
        -------
        (torch.Tensor, int)
            Tensor built from the decoded image's pixel array (its shape and
            dtype are whatever PIL/NumPy produce for that file) and the class
            label stored for `index`.
        """
        filename = self.file_ids[index]
        label = self.labels[index]
        category = "Defective" if label == 1 else "Non-Defective"
        image_path = os.path.join(self.base_path, category, filename)
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the tensor, so handles do not accumulate
        # while iterating over the whole dataset.
        with Image.open(image_path) as img:
            X = torch.tensor(np.asarray(img))
        return X, label


def generate_dataset(is_train, base_path=None):
    """
    Generate training and testing datasets for the PepsiCo folder structure.

    Lists the "Defective" and "Non-Defective" sub-folders of the chosen
    split and collects every file whose name ends in ".jpg". Files from
    "Defective" are labelled 1, files from "Non-Defective" are labelled 0.

    Parameters
    ----------
    is_train : bool
        True to read the "Train" split, False to read the "Test" split.
    base_path : str, optional
        Folder that contains the "Train" and "Test" directories. Defaults to
        "pepsico-lab-potato-quality-control/Pepsico RnD Potato Lab Dataset"
        relative to the current working directory.

    Returns
    -------
    Dataset
        Dataset over the collected file names and labels, rooted at the
        selected split folder.
    """
    if base_path is None:
        base_path = os.path.join("pepsico-lab-potato-quality-control", "Pepsico RnD Potato Lab Dataset")
    split_path = os.path.join(base_path, "Train" if is_train else "Test")
    file_ids = []
    labels = []
    for category, label in (("Defective", 1), ("Non-Defective", 0)):
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping identical across runs and machines.
        for filename in sorted(os.listdir(os.path.join(split_path, category))):
            if not filename.endswith(".jpg"):
                continue
            file_ids.append(filename)
            labels.append(label)
    return Dataset(file_ids, labels, split_path)

In [None]:
from torch.utils.data import DataLoader, Subset
from torch import randperm

# Tunable settings for the split and the loader.
TRAIN_FRACTION = 0.8  # share of the "Train" folder used to build `train`
BATCH_SIZE = 32
SPLIT_SEED = 42  # fixed so the split is identical after every kernel restart

raw_train = generate_dataset(is_train=True)
raw_test = generate_dataset(is_train=False)

raw_train_len = len(raw_train)
# A dedicated, seeded generator makes the permutation reproducible without
# touching the global RNG that the DataLoader uses for per-epoch shuffling.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
indices = randperm(raw_train_len, generator=split_generator)
# The first TRAIN_FRACTION of the shuffled indices form the training subset;
# the rest stay available in `indices` (presumably for a validation split --
# not created in this cell).
train = Subset(raw_train, indices[:int(raw_train_len * TRAIN_FRACTION)])
train_loader = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)