In [4]:
import os 
# Example image paths and labels
image_paths_train, image_paths_test = [], []
labels_train, labels_test = [], []
path = "../../data/combined_dataset"
train_percentage = .5
    
for sub in os.listdir(path):
    imgs = os.listdir(f"{path}/{sub}/face")
    train_ix = int(len(imgs) * train_percentage)
    
    train_imgs = imgs[:train_ix]
    test_imgs = imgs[train_ix:]
    
    for img in train_imgs:
        image_paths_train.append(f"{path}/{sub}/face/{img}")
        labels_train.append(int(sub))
        
    for img in test_imgs:
        image_paths_test.append(f"{path}/{sub}/face/{img}")
        labels_test.append(int(sub))

In [5]:
len(image_paths_train), len(image_paths_test), len(labels_train), len(labels_test)

(5928, 6083, 5928, 6083)

In [6]:
from transformers import ViTModel, ViTForImageClassification, ViTImageProcessor, Trainer, TrainingArguments
from datasets import load_dataset
from PIL import Image
import torch
from torchvision.transforms import Compose, Resize, Normalize, ToTensor
from torch.utils.data import Dataset
from facenet_pytorch import mtcnn

# Load the pretrained ViT model for image classification
model_name = "google/vit-base-patch16-224-in21k"
model_name = "facebook/dino-vits16"
model = ViTForImageClassification.from_pretrained(model_name, num_labels=500)

# Load the feature extractor
processor = ViTImageProcessor.from_pretrained(model_name)

# Define custom dataset
class ImageDataset(Dataset):
    def __init__(self, image_paths, labels, processor):
        self.image_paths = image_paths
        self.labels = labels
        self.processor = processor

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load and preprocess the image
        image = Image.open(self.image_paths[idx]).convert("RGB")
        image = mtcnn(image)
        inputs = self.processor(images=image, return_tensors="pt")
        label = self.labels[idx]
        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }


# Create dataset
train_dataset = ImageDataset(image_paths_train, labels_train, processor)
eval_dataset = ImageDataset(image_paths_test, labels_test, processor)

# Define training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,  # Use processor for feature extraction
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_vit")
processor.save_pretrained("./fine_tuned_vit")


ModuleNotFoundError: No module named 'facenet_pytorch'

In [None]:
import evaluate

# Load the accuracy metric from the evaluate library
metric = evaluate.load("accuracy")

# Define the compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)  # Get the predicted class
    return metric.compute(predictions=predictions, references=labels)

# Evaluate the model
results = trainer.evaluate()

# Print the accuracy
print("Accuracy:", results["eval_accuracy"])


100%|██████████| 761/761 [01:04<00:00, 11.88it/s]

Accuracy: 0.0026302811112937697



