# Part A - Feature Extraction

## 1. Feature Extraction

This part of the coursework extracts feature embeddings for every image we use from the large image folder `celeba_selection`.

In [179]:
import numpy
import pandas
import torch
from torchvision import models, transforms
from PIL import Image
from tqdm.notebook import tqdm
import os

# File the extracted embeddings (and the matching image names) are cached in
EMBEDDINGS_FILE = "extracted_features.npz"
# Number of images randomly sampled from the attribute table and embedded
NUM_IMAGES_TO_PROCESS = 20000
# Folder the image files are read from during feature extraction
IMAGE_DIR = "img_align_celeba"
# Text file holding the per-image attribute table
ATTRIBUTE_FILE = "list_attr_celeba.txt"
# Base name (".csv" is appended at every use) of the sampled attribute table
SAMPLE_ATTRIBUTES_FILENAME = "celeba_sampled_attributes"
# Attribute columns kept from the full attribute table
SELECTED_ATTRIBUTES = ["Smiling", "Male", "Young", "Blond_Hair", "Wearing_Hat"]
# Attribute column used as the classification label
PREDICTION_ATTRIBUTE = "Smiling"

# Fractions of the sampled images used for training and calibration;
# whatever remains after these two becomes the test set
TRAINING_PERCENT = 0.7
CALIB_PERCENT = 0.15

# When True, extractFeatures() ignores any cached embeddings and re-extracts
RESET = False

First, we cut down the attribute dataset to only include the attributes we want.

In [180]:
# Parse the attribute file into a DataFrame indexed by image file name.
#
# Layout relied on here: the line at index 0 is ignored (presumably the image
# count - TODO confirm), the line at index 1 holds the attribute names, and every
# later line is "<image name> <value> <value> ...".

with open(ATTRIBUTE_FILE, "r") as file:
    lines = file.readlines()


# Attribute names come from the SECOND line of the file (index 1)

columns = lines[1].strip().split()


# Everything after the header line is data

dataRows = lines[2:]

data = []
imageNames = []
for row in dataRows:

    parts = row.split()

    # A blank line (e.g. stray trailing newlines) has no image name; skip it
    # instead of failing on parts[0]

    if not parts:
        continue

    # Separate the image name (first column) from the attribute values

    imageNames.append(parts[0])
    data.append([int(x) for x in parts[1:]])

attributeData = pandas.DataFrame(data, columns=columns, index=imageNames)


# Keep only the attributes this notebook works with

cutAttributes = attributeData[SELECTED_ATTRIBUTES]
print(cutAttributes.head())

            Smiling  Male  Young  Blond_Hair  Wearing_Hat
000001.jpg        1    -1      1          -1           -1
000002.jpg        1    -1      1          -1           -1
000003.jpg       -1     1      1          -1           -1
000004.jpg       -1    -1      1          -1           -1
000005.jpg       -1    -1      1          -1           -1


The attribute dataset is now sampled randomly based on the sample size.

In [181]:
# A fixed random_state makes the same images get sampled on every run,
# which keeps the cached embeddings valid between sessions

sampleCsvPath = SAMPLE_ATTRIBUTES_FILENAME + ".csv"

sampledAttributes = cutAttributes.sample(n=NUM_IMAGES_TO_PROCESS, random_state=42)
sampledImageNames = list(sampledAttributes.index)

# The index (image file names) is written out as the first CSV column
sampledAttributes.to_csv(sampleCsvPath, index=True)
print(f"Sampled attributes saved to: {SAMPLE_ATTRIBUTES_FILENAME}.csv")

Sampled attributes saved to: celeba_sampled_attributes.csv


Below is the code for executing the feature extraction process

In [182]:
def _cachedEmbeddingsMatch(currentImageNames):
    """Report whether EMBEDDINGS_FILE already holds embeddings for exactly these images.

    Prints the reason for the decision. Returns True only when the cache file
    exists and its stored image names equal ``currentImageNames`` element by
    element (same length, same order); otherwise returns False.
    """

    if not os.path.exists(EMBEDDINGS_FILE):
        print("Embeddings file missing — need to re-extract.")
        return False

    # Indexing the archive reads the arrays into memory, so it can be closed
    # as soon as both have been fetched
    with numpy.load(EMBEDDINGS_FILE) as cached:
        existing_embeddings = cached["embeddings"]
        savedImageNames = cached["image_names"]

    # Check 1 - same number of image names?
    if len(currentImageNames) != len(savedImageNames):
        print("Number of images changed — re-extracting.")
        return False

    # Check 2 - one embedding row per sampled image?
    if existing_embeddings.shape[0] != len(currentImageNames):
        print("Number of embeddings does not match sampled images: re-extracting.")
        return False

    # Check 3 - 1-by-1 filename match
    if not numpy.array_equal(currentImageNames, savedImageNames):
        print("Image order or content mismatch: re-extracting.")
        return False

    print("Embeddings match sampled image list. Skipping extraction.")
    return True


def extractFeatures():
    """Extract one ViT-B/16 embedding per sampled image and cache them on disk.

    The image list is read from the sampled attribute CSV. Unless RESET is
    set, extraction is skipped when EMBEDDINGS_FILE already matches that list.
    Otherwise every image is pushed through the pre-trained Vision Transformer
    encoder (classification head bypassed) and the class-token output is kept
    as the image's feature vector.

    Side effects: writes EMBEDDINGS_FILE with two arrays, "embeddings" (one row
    per image) and "image_names" (the file names in the same order).
    Returns None.
    """

    # List of images is prepared from the sampled attribute table

    sampledAttributes = pandas.read_csv(SAMPLE_ATTRIBUTES_FILENAME + ".csv", index_col=0)
    sampledImageNames = sampledAttributes.index.tolist()


    # First we skip this expensive step if a matching feature file already exists

    if not RESET and _cachedEmbeddingsMatch(sampledImageNames):
        return


    # We now load the pre-trained Vision Transformer and set it to evaluation mode.
    # Fall back to the CPU so the notebook still runs on machines without a GPU.

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vit = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1).to(device)
    vit.eval()


    # Preprocessing is defined; resizing, centre-cropping to 224x224, converted to tensors, and normalised to match ImageNet training

    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])


    allFeatures = []


    # Features are extracted for each image

    with torch.no_grad():
        for fname in tqdm(sampledImageNames, desc="Extracting ViT Feature Embeddings", colour="#ebbcba"):

            # The context manager releases the file handle after each image
            with Image.open(os.path.join(IMAGE_DIR, fname)) as imageFile:
                img = imageFile.convert("RGB")

            x = preprocess(img).unsqueeze(0).to(device)


            ## Forward pass to get feature embeddings without classification head

            x_processed = vit._process_input(x)
            n = x_processed.shape[0]

            # Prepend the class token; its encoder output is the embedding we keep
            batch_class_token = vit.class_token.expand(n, -1, -1)
            x_with_token = torch.cat([batch_class_token, x_processed], dim=1)

            encoded_features = vit.encoder(x_with_token)
            features = encoded_features[:, 0]

            allFeatures.append(features.cpu().numpy().flatten())


    all_features_np = numpy.array(allFeatures)
    print(f"Feature matrix shape: {all_features_np.shape}")


    # Save feature embeddings (with their image names, for the cache check) for future use

    numpy.savez(
        EMBEDDINGS_FILE,
        embeddings=all_features_np,
        image_names=numpy.array(sampledImageNames)
    )
    print(f"Embeddings + image names saved to '{EMBEDDINGS_FILE}'.")




In [183]:
# Run the extraction; it returns early when RESET is False and the cached embeddings match the sampled image list
extractFeatures()

Embeddings match sampled image list. Skipping extraction.


## 2. Training the Classifiers

First, we need to load the features and labels generated in part 1.

In [184]:
# Load the cached embeddings; the context manager closes the .npz archive
# once both arrays have been read into memory

with numpy.load(EMBEDDINGS_FILE) as data:
    features = data["embeddings"]
    embeddedImageNames = data["image_names"]

sampledAttributes = pandas.read_csv(SAMPLE_ATTRIBUTES_FILENAME + ".csv", index_col=0)


# Row i of `features` must describe the image in row i of the attribute table,
# otherwise every label below would be paired with the wrong embedding

if not numpy.array_equal(sampledAttributes.index.tolist(), embeddedImageNames):
    raise ValueError(
        f"Embeddings in '{EMBEDDINGS_FILE}' do not line up with "
        f"'{SAMPLE_ATTRIBUTES_FILENAME}.csv'; re-run the feature extraction."
    )


# We only want to predict the single attribute named by PREDICTION_ATTRIBUTE

labels = sampledAttributes[PREDICTION_ATTRIBUTE].values

# Since the attribute data records binary values as -1 and 1, we need to convert to 0 and 1 for Gaussian/Normal classifier

labels = ((labels + 1) // 2).astype(int)

Now, the split sizes need to be computed to ensure the correct number of cases for each data set.

In [185]:
# Train and calibration sizes are the configured fractions of the sample,
# truncated to whole images; the test set takes whatever is left over
trainSize, calibSize = (
    int(fraction * NUM_IMAGES_TO_PROCESS)
    for fraction in (TRAINING_PERCENT, CALIB_PERCENT)
)
testSize = NUM_IMAGES_TO_PROCESS - (trainSize + calibSize)

The datasets can now be produced.

In [186]:
# Shuffle every row position once, with a fixed seed so the split is reproducible

numpy.random.seed(42)
shuffledIndicies = numpy.random.permutation(NUM_IMAGES_TO_PROCESS)

# Cut points: [0, trainEnd) is training, [trainEnd, calibEnd) is calibration, the rest is testing

trainEnd = trainSize
calibEnd = trainSize + calibSize

trainIndex, calibIndex, testIndex = numpy.split(shuffledIndicies, [trainEnd, calibEnd])

# Training Set

XTrain, yTrain = features[trainIndex], labels[trainIndex]

# Calibration Set

XCalib, yCalib = features[calibIndex], labels[calibIndex]

# Testing Set

XTest, yTest = features[testIndex], labels[testIndex]

# Report the shape of each partition as a quick sanity check

for splitName, XSplit, ySplit in (
    ("Train:", XTrain, yTrain),
    ("Calibration:", XCalib, yCalib),
    ("Test:", XTest, yTest),
):
    print(splitName, XSplit.shape, ySplit.shape)


Train: (14000, 768) (14000,)
Calibration: (3000, 768) (3000,)
Test: (3000, 768) (3000,)


### Naive Bayes Classifier modelling each feature as an independent Gaussian (Normal) Distribution

We define a class called `GaussianNaiveBayes` that contains the methods for training the model and for making predictions.

In [187]:
class GaussianNaiveBayes:
    """Naive Bayes classifier that models every feature as an independent Gaussian per class.

    After ``train`` the fitted parameters are available as dictionaries keyed
    by label value: ``means`` and ``variances`` (one value per feature) and
    ``priors`` (fraction of training samples with that label).
    """

    def __init__(self, zeroVarianceReplacement=1e-6):
        """Create an untrained classifier.

        zeroVarianceReplacement: value substituted for any per-feature variance
        that is exactly 0, so the Gaussian density stays defined.
        """

        self.zeroVarianceReplacement = zeroVarianceReplacement

        self.labelValues = None
        self.means = None
        self.variances = None
        self.priors = None

    def train(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        X: array-like of shape (n_samples, n_features).
        y: array-like of shape (n_samples,) holding the label of each row of X.
        """

        X = numpy.asarray(X)
        y = numpy.asarray(y)

        # We need to first identify all the unique values our labels can hold

        self.labelValues = numpy.unique(y)

        self.means = {}
        self.variances = {}
        self.priors = {}

        # We loop through all possible label values and identify the training samples belonging to the label value

        for labelValue in self.labelValues:

            # Selects all samples in dataset X that have that label value

            XsWithLabelValue = X[y == labelValue]

            # Compute mean and variance of each feature for this class

            self.means[labelValue] = XsWithLabelValue.mean(axis=0)
            self.variances[labelValue] = XsWithLabelValue.var(axis=0)

            # Compute the prior probability that a random sample will have the same label value

            self.priors[labelValue] = len(XsWithLabelValue) / len(X)

    def gaussianPDF(self, x, mean, variance):
        """Return the Gaussian density of ``x`` for the given mean and variance (element-wise)."""

        # If every sample of a class has the exact same value for a feature then its variance will be 0,
        # so replace every 0 with a tiny value to keep the density defined

        variance = numpy.where(variance == 0, self.zeroVarianceReplacement, variance)

        normCoeff = 1.0 / numpy.sqrt(2.0 * numpy.pi * variance)

        exponent = numpy.exp(- ( (x - mean) ** 2 / (2 * variance) ))

        return normCoeff * exponent

    def logGaussianPDF(self, x, mean, variance):
        """Return the log of the Gaussian density of ``x`` (element-wise).

        Computed directly in log space: taking ``log(gaussianPDF(...))`` would
        underflow to ``log(0) = -inf`` for values far from the mean.
        """

        variance = numpy.where(variance == 0, self.zeroVarianceReplacement, variance)

        return -0.5 * numpy.log(2.0 * numpy.pi * variance) - (x - mean) ** 2 / (2.0 * variance)

    def predict(self, X):
        """Return the most probable label for every row of X.

        X: array-like of shape (n_samples, n_features).
        Returns a 1-D array of n_samples labels taken from the values seen in
        training. On a tie the smallest label value wins.
        Raises RuntimeError if called before ``train`` and ValueError if X is
        non-empty but not 2-D.
        """

        if self.labelValues is None:
            raise RuntimeError("train() must be called before predict().")

        X = numpy.asarray(X, dtype=numpy.float64)

        if X.shape[0] == 0:
            return numpy.array([], dtype=self.labelValues.dtype)

        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features).")

        # One row of log-posteriors per label value, one column per sample:
        # log P(label) + sum over features of log P(feature | label)

        logPosteriors = numpy.stack([
            numpy.log(self.priors[labelValue])
            + self.logGaussianPDF(X, self.means[labelValue], self.variances[labelValue]).sum(axis=1)
            for labelValue in self.labelValues
        ])

        # argmax returns the first maximum, i.e. the earliest label value on ties

        return self.labelValues[numpy.argmax(logPosteriors, axis=0)]

Now that the class has been defined, we can use it to see how effective the Naive Bayes classifier is for this dataset.

In [188]:
# Fit the classifier on the training split only
gnb = GaussianNaiveBayes()
gnb.train(XTrain, yTrain)

# Score it on the held-out test split: accuracy is the fraction of matching labels
yPred = gnb.predict(XTest)
accuracy = (yPred == yTest).mean()

print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.702
