In [None]:
import cv2
import torch
import torch.nn as nn
from torchvision import transforms
from torchvision.models import vgg16
import numpy as np
from PIL import Image
from Utils.expression_model import ExpressionRecognitionModel
import time
from yolov7.models.experimental import attempt_load
from yolov7.utils.datasets import LoadImages
from yolov7.utils.general import check_img_size, non_max_suppression, scale_coords, set_logging
from yolov7.utils.torch_utils import select_device, time_synchronized

## Load the emotion model and define the face-classification helper

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): the network is built with 5 output classes, while the label
# lists used elsewhere in this notebook contain 7 names -- confirm which of the
# two matches the checkpoint being loaded.
emotion_model = ExpressionRecognitionModel(num_classes=5)
emotion_model.to(device)

# NOTE(review): torch.load is pickle-based; only load checkpoints from a
# trusted source.
emotion_model.load_state_dict(torch.load("path/to/emotion_model_weights.pth", map_location=device))

# Switch to inference mode once, right after loading, so that any later use of
# the model does not depend on another cell remembering to call .eval().
emotion_model.eval()

def classify_emotion(face_image, emotion_model, device, emotion_list=None):
    """Predict the emotion label for a single face crop.

    Args:
        face_image: Image of one face, in a form accepted by
            ``transforms.Resize`` followed by ``transforms.ToTensor`` (the
            caller in this notebook passes a PIL image).
        emotion_model: Callable model. It is invoked on a batch holding the
            single preprocessed image and must return a tensor of class scores.
        device: Device the input tensor is moved to before inference.
        emotion_list: Optional sequence of class names, indexed by the position
            of the winning score. When omitted, the seven default labels below
            are used.

    Returns:
        The entry of ``emotion_list`` at the arg-max position of the model output.

    Raises:
        IndexError: If the arg-max position is not a valid index into
            ``emotion_list``.
    """
    if emotion_list is None:
        emotion_list = ['Cry', 'Surprise', 'angry', 'confuse', 'happy', 'neutral', 'sad']

    # Resize to the fixed 224x224 input and convert to a tensor.
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    # unsqueeze(0) adds the batch dimension the model call expects.
    batch = preprocess(face_image).unsqueeze(0).to(device)

    # torch.no_grad() only disables gradient tracking; eval() is what puts
    # mode-dependent layers into inference behaviour, so do not rely on the
    # caller having done it.
    emotion_model.eval()
    with torch.no_grad():
        scores = emotion_model(batch)

    # np.argmax works on the flattened scores, which for a batch of one is the
    # class position.
    best_index = int(np.argmax(scores.cpu().numpy()))
    return emotion_list[best_index]

## Helper functions: YOLOv7 bounding-box extraction and per-frame emotion annotation

In [None]:
def extract_bounding_boxes(video_path, weights='yolov7.pt', img_size=640, conf_thres=0.25, iou_thres=0.45):  # weights: location of the YOLOv7 weights file
    """Run a YOLOv7 detector over a video and collect the detections per frame.

    Args:
        video_path: Source handed to ``LoadImages``.
        weights: Path of the weights file passed to ``attempt_load``.
        img_size: Requested inference size; adjusted by ``check_img_size`` to
            fit the model stride.
        conf_thres: Confidence threshold passed to ``non_max_suppression``.
        iou_thres: IoU threshold passed to ``non_max_suppression``.

    Returns:
        A list with one entry per item yielded by ``LoadImages``. Each entry is
        a list of detections, and each detection is a list of plain Python
        floats ``[x1, y1, x2, y2, conf, cls]`` whose coordinates have been
        rescaled by ``scale_coords`` to the original frame shape and rounded.
        Frames without detections contribute an empty list.
    """
    set_logging()
    device = select_device('')
    half = device.type != 'cpu'  # half precision only supported on CUDA

    model = attempt_load(weights, map_location=device)
    stride = int(model.stride.max())
    img_size = check_img_size(img_size, s=stride)

    if half:
        model.half()

    dataset = LoadImages(video_path, img_size=img_size, stride=stride)
    bounding_boxes = []

    t0 = time.time()
    for _, img, im0s, _ in dataset:
        # Move the preprocessed frame to the device and normalise to [0, 1].
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()
        img /= 255.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)  # add the batch dimension

        with torch.no_grad():
            pred = model(img, augment=False)[0]

        pred = non_max_suppression(pred, conf_thres, iou_thres)

        frame_boxes = []
        for det in pred:
            if len(det):
                # Map box corners from the network input size back to the original frame.
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0s.shape).round()
                # tolist() turns every value into a plain Python float, so the
                # result holds no tensors (and therefore no device memory).
                # Rows keep the reversed order of the detection tensor.
                frame_boxes.extend(reversed(det.tolist()))

        bounding_boxes.append(frame_boxes)

    print(f'Done. ({time.time() - t0:.3f}s)')
    return bounding_boxes

def process_video_and_classify(video_path, bounding_boxes, emotion_model, emotion_list, output_path):
    """Annotate every detected box in a video with its predicted emotion.

    Reads ``video_path`` frame by frame, looks up the boxes for the frame in
    ``bounding_boxes``, shrinks each box by 10% on every side, classifies the
    crop with ``classify_emotion`` and draws the box and label onto the frame.
    Every frame (annotated or not) is written to ``output_path``.

    Args:
        video_path: Path of the input video.
        bounding_boxes: Per-frame list of boxes; each box starts with
            ``x1, y1, x2, y2`` (any further values are ignored).
        emotion_model: Model forwarded to ``classify_emotion``.
        emotion_list: Currently unused; kept so existing callers keep working.
        output_path: Path of the annotated video (written with the ``mp4v`` codec).

    Returns:
        None. Prints an error message and returns early when the input video
        or the output writer cannot be opened.

    Note:
        Relies on the module-level ``device`` variable when calling
        ``classify_emotion``.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error opening video file: {video_path}")
        return

    out = None
    try:
        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes
        # the duration of the written video.
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            # Without this check every write would be dropped silently.
            print(f"Error opening output video file: {output_path}")
            return

        # Inference mode only needs to be set once, not once per box.
        emotion_model.eval()

        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_idx < len(bounding_boxes):
                frame_h, frame_w = frame.shape[:2]

                for box in bounding_boxes[frame_idx]:
                    x1, y1, x2, y2 = (int(v) for v in box[:4])

                    # Clamp the box to the frame.
                    x1, y1 = max(x1, 0), max(y1, 0)
                    x2, y2 = min(x2, frame_w), min(y2, frame_h)

                    # Shrink by 10% of the clamped size on every side. The
                    # margins are computed once so both sides shrink equally.
                    dx = int(0.1 * (x2 - x1))
                    dy = int(0.1 * (y2 - y1))
                    x1, y1, x2, y2 = x1 + dx, y1 + dy, x2 - dx, y2 - dy

                    # Skip inverted or empty boxes before slicing; a negative
                    # end index would otherwise wrap around.
                    if x2 <= x1 or y2 <= y1:
                        continue

                    crop_img = frame[y1:y2, x1:x2]
                    if crop_img.size == 0:
                        continue

                    # OpenCV frames are BGR, while PIL interprets a 3-channel
                    # array as RGB. NOTE(review): this assumes the emotion
                    # model expects RGB input -- confirm against its training.
                    crop_img_pil = Image.fromarray(cv2.cvtColor(crop_img, cv2.COLOR_BGR2RGB))

                    emotion = classify_emotion(crop_img_pil, emotion_model, device)

                    label = f'{emotion}'
                    color = (255, 0, 0)
                    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 1)
                    cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

            out.write(frame)
            frame_idx += 1
    finally:
        # Release the capture and writer even if classification raises.
        cap.release()
        if out is not None:
            out.release()

    print(f'Output video saved to {output_path}')

## Main: detect boxes, classify emotions, and write the annotated video

In [None]:
# Input clip and destination of the annotated copy.
video_path = "path/to/input/video.mp4"
output_path = "path/to/output/video.mp4"

# Class names handed to the annotation step.
emotion_list = [
    'Cry', 'Surprise', 'angry', 'confuse',
    'happy', 'neutral', 'sad',
]

# Stage 1: collect the detections for every frame of the clip.
bounding_boxes = extract_bounding_boxes(video_path)

# Stage 2: classify each detection and write the annotated video.
process_video_and_classify(
    video_path,
    bounding_boxes,
    emotion_model,
    emotion_list,
    output_path,
)