<a href="https://colab.research.google.com/github/shenghaoc/ee5907-ca2/blob/main/pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from PIL import Image
from numpy.random import default_rng
from pathlib import Path


In [2]:
# CONSTANTS
# Subject folders are numbered 1..NUM_SUBJECTS; NUM_CHOSEN of them are sampled.
NUM_SUBJECTS = 68
NUM_CHOSEN = 25
# Assumes every subject folder holds exactly this many .jpg files -- the later
# reshape calls rely on it. TODO confirm against the dataset on disk.
NUM_IMAGES_PER_SUBJECT = 170

TRAIN_RATIO = 0.7
NUM_IMAGES = NUM_CHOSEN * NUM_IMAGES_PER_SUBJECT
# round(), not int(): 0.7 * 170 evaluates to 118.99999999999999 in binary
# floating point, so truncating would yield 118 instead of the intended 119.
NUM_TRAIN_IMAGES_PER_SUBJECT = round(TRAIN_RATIO * NUM_IMAGES_PER_SUBJECT)
NUM_TRAIN_IMAGES = NUM_CHOSEN * NUM_TRAIN_IMAGES_PER_SUBJECT
NUM_TEST_IMAGES = NUM_IMAGES - NUM_TRAIN_IMAGES

# Selfies are split with the same ratio as the PIE images.
NUM_SELFIES = 10
NUM_TRAIN_SELFIES = round(TRAIN_RATIO * NUM_SELFIES)
NUM_TEST_SELFIES = NUM_SELFIES - NUM_TRAIN_SELFIES

# Seeds for the random number generators (SEED1 drives the subject selection).
SEED1 = 2021
SEED2 = 2022

# Image dimensions in pixels; each image is flattened to NUM_PIXELS values.
WIDTH = 32
HEIGHT = 32
NUM_PIXELS = WIDTH * HEIGHT


In [3]:
# Folder names are 1-based, so the candidate pool is 1..NUM_SUBJECTS inclusive.
# Draw NUM_CHOSEN distinct subject ids from that pool (no replacement); the
# generator is seeded with SEED1 so the same subjects are picked on every run.
chosen = default_rng(SEED1).choice(
    np.arange(NUM_SUBJECTS) + 1,
    size=NUM_CHOSEN,
    replace=False,
)


In [4]:
# Load images from disk
# Use lists for manual looping without use of numpy functions
# Result: images[k] / labels[k] hold the pixel arrays / labels of the k-th
# chosen subject, kept nested so each subject can be split into train and test.
images = []
labels = []

# Assume PIE is in pwd
directory = Path("PIE")
for i in chosen:
    # Do not flatten yet, need to split train and test for each subject
    subject_images = []
    subject_labels = []
    subdirectory = directory / str(i)
    # glob yields files in an arbitrary, filesystem-dependent order; sort the
    # paths so the positional train/test split done later is identical on
    # every machine and every run
    for filename in sorted(subdirectory.glob("*.jpg")):
        # PIL is slower but OpenCV is unnecessary
        # The context manager closes the file handle once the pixel data has
        # been copied into the numpy array
        with Image.open(filename) as im:
            subject_images.append(np.array(im))
        subject_labels.append(i)  # Use number in PIE for label
    images.append(subject_images)
    labels.append(subject_labels)


In [5]:
# Slightly altered code for selfies: a single flat folder, one shared label
selfie_images = []
selfie_labels = []

directory = Path("resized")
# Assume selfies have been resized and are in pwd
# glob order is arbitrary and filesystem-dependent; sort the paths so the
# positional train/test split done later is reproducible
for filename in sorted(directory.glob("*.jpg")):
    # The context manager closes the file handle once the pixel data has been
    # copied into the numpy array
    with Image.open(filename) as im:
        selfie_images.append(np.array(im))
    selfie_labels.append(NUM_SUBJECTS + 1)  # add 1 to max PIE number to avoid clashes


In [6]:
# Partition the in-memory data into train and test parts (no more disk access).

# Selfies are a flat sequence, so they are cut along the leading axis:
# the first NUM_TRAIN_SELFIES entries become train, the remainder test.
selfie_labels_train, selfie_labels_test = np.split(
    np.array(selfie_labels), indices_or_sections=[NUM_TRAIN_SELFIES], axis=0
)
selfie_images_train, selfie_images_test = np.split(
    np.array(selfie_images), indices_or_sections=[NUM_TRAIN_SELFIES], axis=0
)

# PIE data is nested per subject, so the cut is made along axis 1:
# each subject contributes its first NUM_TRAIN_IMAGES_PER_SUBJECT entries to
# train and the rest to test.
labels_train, labels_test = np.split(
    np.array(labels), indices_or_sections=[NUM_TRAIN_IMAGES_PER_SUBJECT], axis=1
)
images_train, images_test = np.split(
    np.array(images), indices_or_sections=[NUM_TRAIN_IMAGES_PER_SUBJECT], axis=1
)


In [7]:
# Flatten every image into a row of NUM_PIXELS values and stack the selfie rows
# underneath the PIE rows; labels are flattened and joined in the same order.
# Note: the split results are rebound under the same names here, so re-running
# this cell on its own fails at reshape -- re-run the split cell first.
images_test = np.concatenate(
    (
        images_test.reshape((NUM_TEST_IMAGES, NUM_PIXELS)),
        selfie_images_test.reshape((NUM_TEST_SELFIES, NUM_PIXELS)),
    ),
    axis=0,
)
labels_test = np.concatenate(
    (labels_test.reshape((NUM_TEST_IMAGES,)), selfie_labels_test.ravel())
)

images_train = np.concatenate(
    (
        images_train.reshape((NUM_TRAIN_IMAGES, NUM_PIXELS)),
        selfie_images_train.reshape((NUM_TRAIN_SELFIES, NUM_PIXELS)),
    ),
    axis=0,
)
labels_train = np.concatenate(
    (labels_train.reshape((NUM_TRAIN_IMAGES,)), selfie_labels_train.ravel())
)
