In [95]:
import torch
from torchvision import models, transforms
from PIL import Image
import numpy as np
from tqdm import tqdm
import os

In [96]:
# Collecting a subset of 20,000 images to train, calibrate and test
# Shuffling the dataset to remove patterns/groups of related images
import random
random.seed(10) # comment out later
img_index = [i for i in range(20000)]
random.shuffle(img_index)


In [97]:
# Storing attributes Smiling, Male, Young.. for each image
import pandas as pd
df = pd.read_csv('list_attr_celeba.txt', skiprows=1, header=0, sep='\s+')

img_df = df[["Smiling", "Male", "Young", "Blond_Hair"]].head(20000)
print(img_df.iloc[10:15, :])

            Smiling  Male  Young  Blond_Hair
000011.jpg        1    -1      1          -1
000012.jpg        1     1      1          -1
000013.jpg        1     1      1           1
000014.jpg        1    -1      1          -1
000015.jpg       -1     1     -1          -1


In [98]:
# Feature extraction
def extract_features():
    """
    Extracts features from images using a pre-trained Vision Transformer (ViT)
    and saves them to a file.
    """
    EMBEDDINGS_FILE = 'embeddings.npy'
    IMAGE_DIR = 'celeba_selection'
    NUM_IMAGES_TO_PROCESS = 20000 #later change to SAMPLE_SIZE
    random.seed(10) # comment out later, for reproducibility

    if os.path.exists(EMBEDDINGS_FILE):
        print(f"Embeddings file ’{EMBEDDINGS_FILE}’ already exists. Skipping feature extraction.")
        return
    print("Starting feature extraction with Vision Transformer...")

    # 1. Load pre-trained Vision Transformer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    vit = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1).to(device)
    vit.eval() # Set model to evaluation mode

    # 2. Define preprocessing steps consistent with ImageNet training
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # 3. Get the list of images to process

    image_list = sorted(os.listdir(IMAGE_DIR))[:NUM_IMAGES_TO_PROCESS]
    all_features = []

    # 4. Extract embeddings for each image
    with torch.no_grad():
        for fname in tqdm(image_list, desc="Extracting ViT Embeddings"):
            img = Image.open(os.path.join(IMAGE_DIR, fname)).convert("RGB")
            x = preprocess(img).unsqueeze(0).to(device)
            # Manually replicate the forward pass to get the features before the
            # classification head, as the internal API (like .process_input)
            # can change.
            # 1. Process input using the private _process_input method
            x_processed = vit._process_input(x)
            n = x_processed.shape[0]
            # 2. Add the class token
            batch_class_token = vit.class_token.expand(n, -1, -1)
            x_with_token = torch.cat([batch_class_token, x_processed], dim=1)
            # 3. Pass through the encoder
            encoded_features = vit.encoder(x_with_token)
            # 4. Get the class token’s output (this is the feature vector)
            features = encoded_features[:, 0]
            all_features.append(features.cpu().numpy().flatten())
    all_features_np = np.array(all_features)
    print(f"Feature matrix shape: {all_features_np.shape}")
    
    # 5. Save embeddings for later use
    np.save(EMBEDDINGS_FILE, all_features_np)
    print(f"Embeddings saved to ’{EMBEDDINGS_FILE}’.")

extract_features()



Embeddings file ’embeddings.npy’ already exists. Skipping feature extraction.


In [None]:
# Partitioning the data into training/calibration/test data
embeddings = np.load("embeddings.npy")

SAMPLE_SIZE = 20000

TRAIN_RATIO = 0.7
CALIBRATION_RATIO = 0.15
TEST_RATIO = 0.15

train_index = int(TRAIN_RATIO * SAMPLE_SIZE)
calibration_index = train_index + int(CALIBRATION_RATIO * SAMPLE_SIZE)

# note - shuffle with labels later
train_embed, calibration_embed, test_embed = np.split(embeddings, [train_index, calibration_index]) 
train_labels = img_df.iloc[:train_index]
calibration_labels = img_df.iloc[train_index:calibration_index]
test_labels = img_df.iloc[calibration_index:]

In [185]:
# Training
from numpy import random
train_embed_df = pd.DataFrame(train_embed)


# We model each feature (conditioned on label) under a gaussian distribution
def pd_gaussian(x, mean, sd):  
    var = sd**2
    exponent = (-(x - mean)**2) / (2 * var)
    frac = 1 / (np.sqrt(2 * np.pi * var))
    return  frac * np.exp(exponent)


def naive_bayes_class(feature, label):
    # calc bayes prob for both classes of label
    # compare both
    print(naive_bayes(feature, label, 1))
    print(naive_bayes(feature, label, -1))
    classification = 1 if naive_bayes(feature, label, 1) > naive_bayes(feature, label, -1) else 0
    return classification


def naive_bayes(feature, label, label_class):
    # calc prob of feature conditioned on class (pos)
    # calc prob of label
    return likelihood(feature, label, label_class) * prior(label, label_class)


# Returns probability a sample is given a given label/class 
def prior(label, label_class):
    label_count = train_labels[label].value_counts()[label_class]
    train_size = SAMPLE_SIZE * TRAIN_RATIO
    return (label_count/train_size)


# Returns conditional probability a feature is present given a label and class
def likelihood(feature, label, label_class):  
    likelihood_total = 1

    #----may need to move this ----#
    means = []
    sds = []
    temp_df = train_embed_df.assign(label=train_labels[label].values)
    feature_class = temp_df[temp_df['label'] == label_class]

    for col in feature_class.iloc[:, :-1]:
        means.append(feature_class[col].mean())
        sds.append(feature_class[col].std())
    #------------------------------#

    for i in range(len(feature)):
        likelihood_total *= pd_gaussian(feature[i], means[i], sds[i])

    print("likelihood for label " + str(label) + " and class " + str(label_class) + " is " + str(likelihood_total))
    return likelihood_total
    
print(naive_bayes_class(embeddings[5], 'Smiling'))



likelihood for label Smiling and class 1 is 0.0
0.0
likelihood for label Smiling and class -1 is 0.0
0.0
likelihood for label Smiling and class 1 is 0.0
likelihood for label Smiling and class -1 is 0.0
0
