# Example of VGG16 in use to extract features and targets
Note: this example may need some modifications before it runs on your own data.

In [1]:
import torch
import numpy as np
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV  # Logistic Regression with cross-validation
from time import time  # For measuring performance time

In [None]:
# Load the pretrained VGG16 model, without the classifier layers.
# With the classifier replaced by an identity module, a forward pass returns
# whatever the layers before the classifier produce instead of class scores.
model = models.vgg16(pretrained=True)
model.classifier = torch.nn.Identity()  # Replace classifier with identity to get features
# Switch to evaluation mode. Note that eval() only changes the behaviour of
# train/eval-sensitive layers; it does NOT disable gradient tracking
# (torch.no_grad() is what does that).
model.eval()

In [7]:
# Preprocessing pipeline applied to every image before it reaches the model.
# Settings carried over from Lenet.ipynb.
img_size = 32
batch_size = 512

transform = transforms.Compose(
    [
        # Resize every image to a fixed img_size x img_size square.
        transforms.Resize(size=(img_size, img_size)),
        # Convert the PIL image into a tensor.
        transforms.ToTensor(),
        # Normalize each channel with the ImageNet mean and std.
        transforms.Normalize(
            [0.485, 0.456, 0.406],
            [0.229, 0.224, 0.225],
        ),
    ]
)

In [4]:
def extract_features(img_path, model):
    """Extract a flat feature vector from a single image file.

    The image is opened, converted to RGB, passed through the module-level
    ``transform`` pipeline and fed to ``model`` as a batch of one.

    Args:
        img_path: Path of the image file to read.
        model: Callable feature extractor (normally a ``torch.nn.Module``).

    Returns:
        A 1-D NumPy array holding the flattened model output.
    """
    # The context manager releases the file handle even if decoding fails;
    # convert() returns an independent in-memory image, so it stays usable
    # after the file has been closed.
    with Image.open(img_path) as raw_img:
        rgb_img = raw_img.convert('RGB')  # Ensure it's RGB

    batch = transform(rgb_img).unsqueeze(0)  # Add a batch dimension

    # Feed the input on the same device as the model's weights so the function
    # keeps working after the model has been moved (e.g. model.to('cuda')).
    params = getattr(model, "parameters", None)
    first_param = next(params(), None) if callable(params) else None
    if first_param is not None:
        batch = batch.to(first_param.device)

    with torch.no_grad():  # Inference only, no gradient bookkeeping needed
        features = model(batch)

    # Tensor.numpy() only works on CPU tensors, hence the explicit .cpu().
    return features.flatten().cpu().numpy()

In [None]:
# if we have a directory of all the images with different subfolders of the different categories
def process_image_directory(dataset_path, model):
    """
    Extract features and integer labels for a folder-per-class image dataset.

    Every sub-directory of ``dataset_path`` is treated as one class; plain
    files directly inside ``dataset_path`` are ignored. Inside each class
    folder only files ending in .png, .jpg or .jpeg (case-insensitive) are
    processed.

    Args:
        dataset_path: Directory containing one sub-folder per class.
        model: Feature extractor forwarded to ``extract_features``.

    Returns:
        A tuple ``(features, labels, class_names)`` where ``features`` is a
        NumPy array built from the per-image feature vectors, ``labels`` is a
        NumPy array of integer class ids, and ``class_names`` is the sorted
        list of class folder names, so that ``class_names[labels[i]]`` is the
        class of ``features[i]``.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')

    # os.listdir() returns entries in arbitrary order, so sort to keep the
    # label ids reproducible between runs and machines. Non-directories are
    # dropped up front so they neither consume a label id nor show up as a
    # bogus class name in the returned list.
    class_names = sorted(
        entry
        for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )

    features = []
    labels = []
    for label, class_name in enumerate(class_names):
        class_folder = os.path.join(dataset_path, class_name)

        # Sorted as well, so the row order of the feature matrix is stable.
        for img_file in sorted(os.listdir(class_folder)):
            if not img_file.lower().endswith(image_suffixes):
                continue  # Only process image files
            img_path = os.path.join(class_folder, img_file)
            features.append(extract_features(img_path, model))
            labels.append(label)  # Store class label as integer

    return np.array(features), np.array(labels), class_names

In [None]:
# Main code block: extract features for every image in the dataset and preview them.
# The path is built with os.path.join because a literal backslash in the string
# is an invalid escape sequence and only acts as a separator on Windows.
dataset_path = os.path.join('Data_Sets', 'Images')  # TODO: Replace with images directory
features, labels, class_names = process_image_directory(dataset_path, model)
print("Features shape: ", features.shape)

if len(features) > 0:
    # Slice the first 10 entries; indexing with [10] would print a single value.
    print("First feature vector (first 10 indicies):", features[0][:10])

    # Preview the first 5 feature vectors
    print(features[:5])
else:
    # Indexing features[0] on an empty result would raise an IndexError.
    print("No images found under", dataset_path)