# Zero-shot webcam facial emotion classification with CLIP

In [1]:
import numpy as np
import torch
import clip
from tqdm.notebook import tqdm
from pkg_resources import packaging

import cv2
from PIL import Image

# Record the PyTorch version this notebook was run with.
print(f"Torch version: {torch.__version__}")


Torch version: 2.1.1


## Loading the model

Download and instantiate a CLIP model using the `clip` module.

In [2]:
# Display the model identifiers that the `clip` package reports as available.
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 checkpoint on that device, together with the image
# preprocessing callable that is applied to every frame later on.
model, preprocess = clip.load("ViT-B/32", device=device)

In [4]:
# Pull the basic size attributes off the loaded model.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Summarise the model: total parameter count plus the sizes read above.
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Input resolution: {input_resolution}")
print(f"Context length: {context_length}")
print(f"Vocab size: {vocab_size}")

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


## Preparing the labels and prompts

In [5]:
# Candidate emotion labels; the position in this list is the class index
# used when a prediction is mapped back to a label.
emotion_classes = [
    "happy",
    "sad",
    "angry",
    "surprised",
    "fearful",
    "disgusted",
    "serious",
    "scared",
]

In [6]:
# Prompt templates for the text encoder. Each contains exactly one `{}`
# placeholder, which is filled with an emotion label via `str.format`.
emotion_templates = [
    "a photo of a person who is feeling {}.",
    "a close-up of a {} face.",
    "a portrait of a {} individual.",
    "a webcam photo of a {} person.",
    "a picture of someone looking {}.",
    "a facial expression that is {}.",
    "a snapshot of a {} expression.",
    "an image of a person expressing {} emotion.",
    "a headshot of a {} person.",
    "a digital rendering of a {} facial expression.",
    "an artistic interpretation of a {} face.",
    "a cartoon representation of a {} expression.",
    "a black and white photo of a {} person.",
    "a photo of a person feeling very {}.",
    "a stylized drawing of a {} facial expression.",
    "a photo captured by a webcam of a {} individual.",
    "a high-resolution image of a {} expression.",
    "a pixelated photo of a {} face.",
    "a blurred photo of a {} expression.",
    "a graphic illustration of a {} emotion.",
    "a candid shot of a {} person.",
    "a facial study of {} emotion.",
    "a computer-generated face looking {}.",
    "a photo depicting {} emotion.",
]

In [7]:
# Sanity check: report how many labels and prompt templates were defined.
print("{} classes, {} templates".format(len(emotion_classes), len(emotion_templates)))

8 classes, 24 templates


## Ensembling embedded labels

In [8]:
def zeroshot_classifier(classnames, templates):
    """Build zero-shot classifier weights by ensembling prompt embeddings.

    For every class name, each template is formatted with that name,
    tokenized with `clip.tokenize`, and embedded with the text encoder of
    the notebook-level `model`. The per-template embeddings are
    L2-normalised and averaged, and the mean is normalised again, which
    yields one unit-length vector per class.

    Args:
        classnames: Iterable of class-name strings.
        templates: Iterable of format strings containing a `{}` placeholder
            for the class name.

    Returns:
        A tensor whose columns are the per-class embeddings (they are
        stacked along dim 1), located on the notebook-level `device`.

    Raises:
        ValueError: If `classnames` or `templates` is empty.
    """
    classnames = list(classnames)
    templates = list(templates)
    if not classnames:
        raise ValueError("classnames must contain at least one class name")
    if not templates:
        raise ValueError("templates must contain at least one prompt template")

    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            # Fill every template with the current class name.
            texts = [template.format(classname) for template in templates]
            # Follow the configured `device` rather than a hard-coded
            # `.cuda()`, which raises on machines without a usable GPU.
            tokens = clip.tokenize(texts).to(device)
            class_embeddings = model.encode_text(tokens)
            # Normalise each prompt embedding before averaging so every
            # template contributes equally to the ensemble.
            class_embeddings = class_embeddings / class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            # Re-normalise: the mean of unit vectors is generally not unit length.
            class_embedding = class_embedding / class_embedding.norm()
            zeroshot_weights.append(class_embedding)
        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)
    return zeroshot_weights


# Build the text-side classifier: one ensembled prompt embedding per emotion class.
zeroshot_weights = zeroshot_classifier(emotion_classes, emotion_templates)

  0%|          | 0/8 [00:00<?, ?it/s]

In [9]:
# Make sure the class weights live on `device`, the same device the image
# features are computed on before being multiplied with these weights.
with torch.no_grad():
    zeroshot_weights = zeroshot_weights.to(device)


## Emotion classifier using webcam feed

In [17]:
# Webcam: classify the emotion in each live frame until 'q' is pressed.
cap = cv2.VideoCapture(0)

try:
    # Fail loudly when the camera is unavailable; otherwise the loop below
    # would exit on the first failed read without any feedback.
    if not cap.isOpened():
        raise RuntimeError("Could not open webcam (device index 0).")

    while True:
        # Capture frame-by-frame; stop as soon as a read fails.
        ret, frame = cap.read()
        if not ret:
            break

        # Flip the frame horizontally
        frame = cv2.flip(frame, 1)

        # Image preprocessing: convert OpenCV's BGR array to an RGB PIL image,
        # then add a batch dimension and move the tensor to the model's device.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        image_input = preprocess(image).unsqueeze(0).to(device)

        # Image features computation and prediction
        with torch.no_grad():
            image_features = model.encode_image(image_input)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # Scale the similarities by 100 and cast to float32 before the softmax.
            logits = (100. * image_features @ zeroshot_weights).float()
            probabilities = logits.softmax(dim=-1)
            top_prob, top_label = probabilities.cpu().topk(1)

        # `topk(1)` on a single-frame batch holds exactly one element each,
        # so `.item()` yields plain Python scalars.
        predicted_emotion = emotion_classes[top_label.item()]
        confidence = top_prob.item()

        # Frame with the predicted emotion display
        cv2.rectangle(frame, (20, 20), (220, 130), (0, 0, 0), -1)
        cv2.putText(frame, predicted_emotion, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(frame, f"p={round(confidence, 2)}", (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.imshow('Webcam - Predicted Emotion', frame)

        # Break with 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the camera and close the preview window, even on error.
    cap.release()
    cv2.destroyAllWindows()

: 

## Emotion classifier using a prerecorded video

In [40]:

# Replace the path with the path to the video file
video_path = 'media/video.mp4'
cap = cv2.VideoCapture(video_path)

try:
    # Fail loudly when the file is missing or unreadable; otherwise the loop
    # below would be skipped and the cell would finish without any feedback.
    if not cap.isOpened():
        raise RuntimeError(f"Could not open video file: {video_path}")

    while cap.isOpened():
        # Capture frame-by-frame; a failed read ends the loop.
        ret, frame = cap.read()
        if not ret:
            break

        # Image preprocessing: convert OpenCV's BGR array to an RGB PIL image,
        # then add a batch dimension and move the tensor to the model's device.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        image_input = preprocess(image).unsqueeze(0).to(device)

        # Image features computation and prediction
        with torch.no_grad():
            image_features = model.encode_image(image_input)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # Scale the similarities by 100 and cast to float32 before the softmax.
            logits = (100. * image_features @ zeroshot_weights).float()
            probabilities = logits.softmax(dim=-1)
            top_prob, top_label = probabilities.cpu().topk(1)

        # `topk(1)` on a single-frame batch holds exactly one element each,
        # so `.item()` yields plain Python scalars.
        predicted_emotion = emotion_classes[top_label.item()]
        confidence = top_prob.item()

        # Frame with the predicted emotion display
        cv2.rectangle(frame, (20, 20), (220, 130), (0, 0, 0), -1)
        cv2.putText(frame, predicted_emotion, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.putText(frame, f"p={round(confidence, 2)}", (50, 100), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        # The source here is a file, so the window is not labelled "Webcam".
        cv2.imshow('Video - Predicted Emotion', frame)

        # Break with 'q'
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the capture and close the preview window, even on error.
    cap.release()
    cv2.destroyAllWindows()