In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
import timm #good for image classification specific architectures
from transformers import AutoModelForImageClassification, AutoModel
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import seaborn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


In [2]:
#Hugging Face Hub checkpoint id; loaded further down as the pretrained backbone of ImageClassifier
MODEL = "trpakov/vit-face-expression"

In [3]:
class ImageDataset(Dataset):
  """Thin `Dataset` wrapper that delegates everything to an `ImageFolder`.

  Length, item lookup and the class list are all forwarded to the wrapped
  folder dataset stored in `self.data`.
  """

  def __init__(self, data_dir, transform=None):
    """Create the wrapped `ImageFolder`.

    Args:
      data_dir: Root directory handed to `ImageFolder`.
      transform: Optional transform forwarded to `ImageFolder`.
    """
    self.data = ImageFolder(data_dir, transform=transform)

  def __len__(self):
    """Return how many samples the wrapped dataset holds (needed by DataLoader)."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the sample the wrapped dataset stores at position `idx`."""
    sample = self.data[idx]
    return sample

  @property
  def classes(self):
    """Class list exposed by the wrapped `ImageFolder`."""
    return self.data.classes

In [4]:
#No transform is passed here; `dataset` is re-created with a transform in a later cell
dataset = ImageDataset(data_dir="/content/fane_data")

In [5]:
#len(dataset)

In [6]:
#image,label = dataset[0]
#image

In [7]:
#print(label) #angry

In [8]:
#Dictionary that maps each numeric target index back to its class name
data_dir = "/content/fane_data"
target_to_class = {
    class_idx: class_name
    for class_name, class_idx in ImageFolder(data_dir).class_to_idx.items()
}
print(target_to_class)

{0: 'angry', 1: 'confused', 2: 'disgust', 3: 'fear', 4: 'happy', 5: 'neutral', 6: 'sad', 7: 'shy', 8: 'surprise'}


In [9]:
#Preprocessing applied to every image: resize to the 224x224 input size the
#pretrained ViT expects, then convert to a tensor.
#NOTE(review): no Normalize step is applied; confirm whether the checkpoint's
#image processor expects normalized pixel values.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

#Rebuild the dataset so that every sample goes through `transform`
dataset = ImageDataset(data_dir, transform=transform)

DataLoaders

In [10]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True) #shuffled mini-batches of 32 over the full dataset; training below uses its own train/val/test loaders

PyTorch Model

In [11]:
class ImageClassifier(nn.Module):
  """Pretrained ViT backbone followed by a small MLP classification head.

  The backbone is loaded from the `MODEL` checkpoint. The head maps the
  backbone's CLS-token embedding to `num_classes` logits.
  """

  def __init__(self, num_classes=9): #Define all the parts of the model
    """Load the backbone and build the MLP head.

    Args:
      num_classes: Number of logits produced by the final linear layer.
    """
    super().__init__() #set up nn.Module bookkeeping before registering submodules
    self.model = AutoModel.from_pretrained(MODEL)
    #All backbone children except the last one. forward() never uses this
    #attribute; it is kept only so the module's attributes and state_dict keys
    #stay the same as before.
    #NOTE(review): AutoModel loads the bare backbone, so the dropped child is
    #presumably the pooler rather than a classification head; confirm.
    self.features = nn.Sequential(*list(self.model.children())[:-1])
    hidden_size = self.model.config.hidden_size #width of the backbone's token embeddings

    #MLP head applied to the CLS embedding produced by the backbone
    self.classifier = nn.Sequential(
        nn.Linear(hidden_size, 512),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, num_classes)
    )

  def forward(self,x): #Connect the defined parts and return the output
    """Return class logits for a batch of images.

    Args:
      x: Batch of image tensors, passed to the backbone as `pixel_values`
        (presumably shaped (batch, 3, 224, 224) given the notebook's
        transform; confirm).

    Returns:
      Output of the MLP head computed from the first token of the backbone's
      last hidden state.
    """
    output = self.model(pixel_values=x, return_dict=True)
    cls_token = output.last_hidden_state[:, 0]  # CLS token (first position in the sequence)
    return self.classifier(cls_token)

In [12]:
model = ImageClassifier() #default of 9 output classes; the training cell later re-creates `model`

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ViTModel were not initialized from the model checkpoint at trpakov/vit-face-expression and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
#print(model)

In [14]:
#Disabled smoke test, kept as a bare string literal: grab one batch and run it through the model
"""
for images, labels in dataloader:
  break

model(images)
"""

'\nfor images, labels in dataloader:\n  break\n\nmodel(images)\n'

Training

In [15]:
#Reproducibility: one seed shared by NumPy, PyTorch and the data split
SEED = 42
BATCH_SIZE = 32
torch.manual_seed(SEED)
np.random.seed(SEED)

#Objective and optimizer. The optimizer only tracks the parameters of the
#`model` object that exists when this cell runs.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [16]:
#Error: cannot identify image file (_io.BufferedReader name='/content/fane_data/happy/happy1283.jpg') in the dataset. The corrupt file is simply deleted.
#os.remove('/content/fane_data/happy/happy1283.jpg')

In [17]:
dataset = ImageFolder(data_dir, transform=transform) #Reload the data as a plain ImageFolder (instead of the ImageDataset wrapper) so the split below runs without errors

In [18]:
#Split sample *indices* rather than the dataset itself. Passing the dataset to
#train_test_split indexes every sample up front, which loads and transforms
#all images into in-memory lists. Splitting indices and wrapping them in
#Subset keeps loading lazy while producing the same 80/10/10 membership and
#order for the same random_state.
all_indices = list(range(len(dataset)))
train_idx, holdout_idx = train_test_split(all_indices, test_size=0.2, random_state=SEED)
val_idx, test_idx = train_test_split(holdout_idx, test_size=0.5, random_state=SEED)

train_ds = torch.utils.data.Subset(dataset, train_idx)
val_ds = torch.utils.data.Subset(dataset, val_idx)
test_ds = torch.utils.data.Subset(dataset, test_idx)


In [19]:
#Only the training loader shuffles; validation and test keep a fixed order
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
EPOCHS = 5
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

#Start from a fresh model so re-running this cell restarts training.
model = ImageClassifier(num_classes=9)
#The optimizer must hold the parameters of the model created just above. The
#optimizer from the setup cell was built from the previous `model` object, so
#reusing it would step that old model and leave this one untrained.
optimizer = optim.Adam(model.parameters(), lr=2e-5)

for epoch in range(EPOCHS):
    # ---- Training pass ----
    model.train()  # enable dropout
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # loss.item() is the batch mean; weight it by batch size so the epoch
        # average stays correct when the last batch is smaller.
        running_loss += loss.item() * images.size(0)

        _, predicted = torch.max(outputs, 1)  # index of the largest logit per sample
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    train_loss = running_loss / len(train_loader.dataset)
    train_accuracy = correct / total
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    # ---- Validation pass ----
    model.eval()  # disable dropout
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # no gradients needed for evaluation
        for images, labels in tqdm(val_loader):
            outputs = model(images)
            loss = loss_fn(outputs, labels)
            running_loss += loss.item() * images.size(0)

            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    val_loss = running_loss / len(val_loader.dataset)
    val_accuracy = correct / total
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    print(f"Epoch {epoch+1}/{EPOCHS} - "
          f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f} - "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")


Some weights of ViTModel were not initialized from the model checkpoint at trpakov/vit-face-expression and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/423 [00:00<?, ?it/s]

In [None]:
#Draw the per-epoch training and validation loss curves on one figure
for loss_history, curve_label in (
    (train_losses, "Training Loss"),
    (val_losses, "Validation Loss"),
):
    plt.plot(loss_history, label=curve_label)
plt.title("Loss Over Epochs")
plt.legend()
plt.show()

In [None]:
#Save weights only (the model's state_dict) to model_weights.pth
torch.save(model.state_dict(), "model_weights.pth")
print("Saved model weights to model_weights.pth")

In [None]:
#Save entire model (the whole module object, not just its state_dict)
#NOTE(review): whole-module saves rely on pickle, so loading presumably needs the
#ImageClassifier class to be defined/importable; confirm before relying on this file.
torch.save(model, "full_model.pth")
print("Saved full model to full_model.pth")