In [None]:
# Import necessary libraries
!pip install retina-face

In [2]:
!git clone https://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@github.com/sanazgit/FER.git

Cloning into 'FER'...
remote: Enumerating objects: 344, done.[K
remote: Counting objects: 100% (65/65), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 344 (delta 53), reused 44 (delta 44), pack-reused 279 (from 1)[K
Receiving objects: 100% (344/344), 1.19 MiB | 9.41 MiB/s, done.
Resolving deltas: 100% (186/186), done.


In [2]:
import sys
sys.path.append('./FER')

In [3]:
import zipfile
with zipfile.ZipFile('/kaggle/working/FER/complexnn.zip', 'r') as z:
    z.extractall('/kaggle/working/')

In [4]:
import cv2
import numpy as np
import tensorflow as tf
from retinaface import RetinaFace
from LA_QVIT import create_qvit_classifier
from torchvision import transforms
import torch
import random
import shutil
import math
import resnet_pose_attention_v2 as resnet
import time
from PIL import Image

In [5]:
def preprocess_frame(frame):
    # Convert BGR (OpenCV format) to RGB
    if frame.shape[2] == 3:  # Check if the frame has 3 channels
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    mytransform = transforms.Compose([
        transforms.ToPILImage(),  # Convert ndarray to PIL Image
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Apply the transformations
    frame = mytransform(frame)
    frame = torch.unsqueeze(frame, 0)  # Add batch dimension
    return frame


def rotate(ps,m):
    pts = np.float32(ps).reshape([-1, 2])
    pts = np.hstack([pts, np.ones([len(pts), 1])]).T
    target_point = np.dot(m, pts)
    target_point = [[target_point[0][x],target_point[1][x]] for x in range(len(target_point[0]))]
    return target_point

def rotate_img_and_point(img,points,angle,center_x,center_y,resize_rate=1.0):
    h,w,c = img.shape
    M = cv2.getRotationMatrix2D((center_x,center_y), angle, resize_rate)
    res_img = cv2.warpAffine(img, M, (w, h))
    out_points = rotate(points,M)
    return res_img,out_points


def Feature_Orthogonal(image, rect, rect_local, model_cla):
    if not rect_local or not isinstance(rect_local[0], list) or len(rect_local[0]) < 2:
        print("Error: rect_local is not in the expected format.")
        return None

    with torch.no_grad():
        image = image.cuda()
        model_cla.module.set_rect(rect)
        model_cla.module.set_rect_local(rect_local)
        x_gf_1, x_gf_2, x_gf_3, _, _, _, _ = model_cla(image)

        # Convert PyTorch tensors to NumPy arrays
        x_gf_1 = x_gf_1.cpu().numpy()
        x_gf_2 = x_gf_2.cpu().numpy()
        x_gf_3 = x_gf_3.cpu().numpy()

        return x_gf_1, x_gf_2, x_gf_3

    

def load_emotin_model():
    model = create_qvit_classifier()
    model.load_weights('/kaggle/input/la-qvit-main4/tmp/RAFDB/model_018-0.8893.h5')
    return model  

def load_Orf_model():
    model = resnet.resnet50()  # Use pretrained=False if not using pretrained model
    model = torch.nn.DataParallel(model).cuda()
    checkpoint = torch.load('/kaggle/input/la-qvit-orfe0-1/checkpoint_cnn/[11-18]-[10-35]-model_best.pth.tar')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    return model

In [6]:
def pre_processing(frame, facial, p, pp, l):
    im = frame.squeeze(0).transpose((1, 2, 0))  # Remove batch dimension and transpose

    for face_landmarks in facial:
        single_face_landmarks = np.array(face_landmarks)

        if random.random() < p:  # Image rotation
            resize_rate = round(random.uniform(0.9, 1.1), 2)
            angle = random.randint(-15, 15)
            im, single_face_landmarks = rotate_img_and_point(im, single_face_landmarks, angle, 112, 112, resize_rate)

        if random.random() < p:  # Image translation
            horizon = random.randint(-20, 20)
            vertial = random.randint(-20, 20)
            mat_translation = np.float32([[1, 0, horizon], [0, 1, vertial]])
            im = cv2.warpAffine(im, mat_translation, (224, 224))
            
            for j in range(len(single_face_landmarks)):
                single_face_landmarks[j][0]= single_face_landmarks[j][0]+horizon
                single_face_landmarks[j][1] = single_face_landmarks[j][1] + vertial
                if single_face_landmarks[j][0]<0:
                    single_face_landmarks[j][0]=0
                if single_face_landmarks[j][0]>224:
                    single_face_landmarks[j][0]=224
                if single_face_landmarks[j][1] < 0:
                    single_face_landmarks[j][1] = 0
                if single_face_landmarks[j][1] > 224:
                    single_face_landmarks[j][1] = 224
    


        if random.random() < pp:  # Image flipping
            im = np.fliplr(im)
            landmark=single_face_landmarks
            single_face_landmarks[0]=  [224-landmark[1][0] ,  landmark[1][1]]
            single_face_landmarks[1] = [224 - landmark[0][0], landmark[0][1]]
            single_face_landmarks[2] = [224 - landmark[2][0], landmark[2][1]]
            single_face_landmarks[3] = [224 - landmark[4][0], landmark[4][1]]
            single_face_landmarks[4] = [224 - landmark[3][0], landmark[3][1]]
  

        frame = im.transpose((2, 0, 1))
    
    
    facial=np.array(facial)
    facial[facial<0]=0
    facial[facial > 224] = 224
        
    rect_all=[]
        
    for i in range(len(facial)):
        rect = []
        rect_local = []

        land_resize=np.around(facial[i]*(28/224)).astype(int)

        a_width = int((land_resize[0][0] + land_resize[1][0]) / 2)
        a_high = int(land_resize[2][1])
        min_length=min(a_high, a_width,28-a_width)
        if min_length>=28/3:
            rect.append([a_width - min_length, a_high - min_length, a_width, a_high])
            rect.append([a_width + min_length, a_high - min_length, a_width, a_high])
        if min_length<28/3:
            eyemin=np.array([a_high, a_width,28-a_width])
            a_width_ind=np.where(eyemin==eyemin.min())[0][0]
            if a_width_ind==0:
                rect.append([a_width - min_length, a_high - min_length, a_width, a_high])
                rect.append([a_width + min_length, a_high - min_length, a_width, a_high])
            if a_width_ind==1:
                rect.append([a_width - min_length, a_high - min_length, a_width, a_high])
                min_eye_length1 = min(a_high,28-a_width, 14)
                rect.append([a_width + min_eye_length1, a_high - min_eye_length1, a_width, a_high])
            if a_width_ind==2:
                min_eye_length2 = min(a_high,a_width, 14)
                rect.append([a_width - min_eye_length2, a_high - min_eye_length2, a_width, a_high])
                rect.append([a_width + min_length, a_high - min_length, a_width, a_high])


        b_width = int((land_resize[3][0] + land_resize[4][0]) / 2)
        min_mou_length=min(28-a_high, b_width,28-b_width)
        if min_mou_length>=28/3:
            rect.append([b_width - min_mou_length, a_high + min_mou_length, b_width, a_high])
            rect.append([b_width + min_mou_length, a_high + min_mou_length, b_width, a_high])
        if min_mou_length<28/3:
            moumin=np.array([28-a_high, b_width,28-b_width])
            moumin_ind=np.where(moumin==moumin.min())[0][0]
            if moumin_ind==0:
                rect.append([b_width - min_mou_length, a_high + min_mou_length, b_width, a_high])
                rect.append([b_width + min_mou_length, a_high + min_mou_length, b_width, a_high])
            if moumin_ind==1:
                rect.append([b_width - min_mou_length, a_high + min_mou_length, b_width, a_high])
                min_mou_length1 = min(28 - a_high, 28 - b_width,14)
                rect.append([b_width + min_mou_length1, a_high + min_mou_length1, b_width, a_high])
            if moumin_ind==2:
                min_mou_length2 = min(28 - a_high, b_width,14)
                rect.append([b_width - min_mou_length2, a_high + min_mou_length2, b_width, a_high])
                rect.append([b_width + min_mou_length, a_high + min_mou_length, b_width, a_high])


        land=land_resize
        eye1 = land[0]
        eye2 = land[1]
        eye_midle = (eye1 + eye2) / 2
        mouth1 = land[3]
        mouth2 = land[4]
        landmark = np.array([eye1, eye2, eye_midle, mouth1, mouth2]).astype(int)

        for j in range(len(landmark)):
            if landmark[j][0] < l:
                landmark[j][0] = l
            if landmark[j][0] + l > 28:
                landmark[j][0] = 28 - l
            if landmark[j][1] < l:
                landmark[j][1] = l
            if landmark[j][1] > 28 - l:
                landmark[j][1] = 28 - l

        rect.append(landmark)
        rect_all.append(rect)


    return rect_all


In [7]:
def pre_pro(frame, facial, p, pp, l):
    images = frame.numpy() if not isinstance(frame, np.ndarray) else frame
    
    # Call pre_processing for the single image and facial landmarks
    rect_all = pre_processing(images, facial, p, pp, l)
    
    rect = []
    for i in range(len(rect_all)):
        # Convert scalars to 1D arrays and then concatenate
        rect_part = [np.array([rect_all[i][j]]) for j in range(4)]
        concatenated = np.concatenate(rect_part, axis=0)
        rect.append(concatenated)

    # Process rect_local
    rect_local = []
    for i in range(len(rect_all)):
        if len(rect_all[i]) > 4:  # Ensure the 5th element (facial landmarks array) exists
            landmark_array = rect_all[i][4]
            local = []
            for landmark in landmark_array:
                # Check if landmark is an array or list with at least two elements
                if isinstance(landmark, (list, np.ndarray)) and len(landmark) > 1:
                    local.extend([landmark[0] - l, landmark[0] + l, landmark[1] - l, landmark[1] + l])
            rect_local.append(local)

    # Convert image to a PyTorch tensor
    image_tensor = torch.tensor(images)

    return image_tensor, rect, rect_local


In [8]:
def pre_pro_infer(frame_tensor_224, face_landmarks_224, l=5):

    image, rect, rect_local = pre_pro(frame_tensor_224, [face_landmarks_224], 0.0, 0.0, l)
    return image, rect, rect_local

In [9]:
def detect_faces_and_landmarks(frame, face_detector):
    detections = RetinaFace.detect_faces(frame, model=face_detector)
    if isinstance(detections, tuple) or detections is None:
        return [], []

    faces = []
    landmarks_per_face = []

    for k, face in detections.items():
        x1, y1, x2, y2 = face['facial_area']
        x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
        w, h = max(1, x2 - x1), max(1, y2 - y1)


        lm = np.array(list(face['landmarks'].values()), dtype=np.float32)  # (5, 2)

       
        lm_crop = np.stack([
            (lm[:, 0] - x1) * (224.0 / w),
            (lm[:, 1] - y1) * (224.0 / h)
        ], axis=1)

        faces.append((x1, y1, x2, y2))
        landmarks_per_face.append(lm_crop)

    return faces, landmarks_per_face

In [10]:
import cv2

video = cv2.VideoCapture('./your_video.mp4')

# Get the FPS of the video
fps = video.get(cv2.CAP_PROP_FPS)
print("Frames per second:", fps)
video.release()

Frames per second: 29.772408488303697


In [29]:
def draw_optimized(frame, x1, y1, x2, y2, box_color=(10, 200, 10), margin_size=10):
    # Adjust the coordinates for the margin
    x1_margin = max(x1 - margin_size, 0)  # Ensure x1_margin is not less than 0
    y1_margin = max(y1 - margin_size, 0)  # Ensure y1_margin is not less than 0
    x2_margin = min(x2 + margin_size, frame.shape[1])  # Ensure x2_margin does not exceed frame width
    y2_margin = min(y2 + margin_size, frame.shape[0])  # Ensure y2_margin does not exceed frame height

    # Draw a bounding box with margin
    cv2.rectangle(frame, (x1_margin, y1_margin), (x2_margin, y2_margin), box_color, 2)

    # Use pre-calculated text size

    # Prepare text background with standard size
    cv2.rectangle(frame, (x1_margin, y1_margin), (x1_margin, y1_margin), box_color, -1)

In [30]:
# --- helper: 5-point similarity alignment to 224x224
def align_face_5pt(img_crop_bgr, lm_224):
    dst = np.float32([
        [70, 96], [154, 96], [112, 128], [84, 164], [140, 164],
    ])
    src = np.float32(lm_224)
    H, _ = cv2.estimateAffinePartial2D(src.reshape(-1,1,2), dst.reshape(-1,1,2), method=cv2.LMEDS)
    crop_resized = cv2.resize(img_crop_bgr, (224,224))
    aligned = cv2.warpAffine(crop_resized, H, (224,224), flags=cv2.INTER_LINEAR)
    return aligned, H

def put_probs(frame, x, y, probs, labels, k=7):
    s = [f"{labels[i]}: {probs[0,i]:.2f}" for i in np.argsort(-probs[0])[:k]]
    y0 = y - 10 if y - 10 > 10 else y + 20
    for i, t in enumerate(s):
        cv2.putText(frame, t, (x+5, y0 + 18*i), cv2.FONT_HERSHEY_SIMPLEX, 0.55, (255,255,255), 2, cv2.LINE_AA)

def draw_lm_on_224(img_bgr_224, lm_224):
    out = img_bgr_224.copy()
    for (px,py) in lm_224.astype(int):
        cv2.circle(out, (int(px),int(py)), 3, (0,255,255), -1)
    return out


In [36]:
def main(video_path):

    # ---------------- Models ----------------
    model = load_emotin_model()   
    model_cla = load_Orf_model()  
    torch.set_grad_enabled(False)

    emotion_labels = ['anger','disgust','fear','happy','neutral','sad','surprise']

    # ---- Prior correction (RAF) + Temperature ----
    raf_priors = np.array([
        0.0575,  # anger
        0.0584,  # disgust
        0.0229,  # fear
        0.3889,  # happy
        0.2057,  # neutral
        0.1615,  # sad
        0.1051   # surprise
    ], dtype=np.float32)
    target_priors = np.ones_like(raf_priors) / len(raf_priors)  
    SURPRISE_BOOST = 1.05  

    class_thresholds = {
        'happy': 0.58,
        'neutral': 0.45,
        'surprise': 0.28,
        'anger': 0.35, 'disgust': 0.35, 'fear': 0.35, 'sad': 0.38
    }
    to_tensor_and_norm = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
    ])

    text_sizes = {
        f'Emotion: {lbl}': cv2.getTextSize('Emotion...', cv2.FONT_HERSHEY_SIMPLEX, 0.9, 2)[0]
        for lbl in emotion_labels + ['Uncertain']
    }

    # ---------------- Video IO ----------------
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    fps = cap.get(cv2.CAP_PROP_FPS) or 25
    W  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    H  = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter('result.mp4', fourcc, fps, (W, H))

 
    face_detector = RetinaFace.build_model()

    # EMA 
    ema_probs = None
    alpha = 0.6  

    frame_count, total_process_time = 0, 0.0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        t0 = time.time()

        faces, lm_list = detect_faces_and_landmarks(frame, face_detector)
        if len(faces) == 0:
            out.write(frame); frame_count += 1
            continue

        for (x1, y1, x2, y2), lm_224 in zip(faces, lm_list):
            face_bgr = frame[y1:y2, x1:x2]
            if face_bgr.size == 0:
                continue

            # ---------- Alignment 5-point ----------
            aligned_bgr_224, _ = align_face_5pt(face_bgr, lm_224)

            # ---------- Preprocess (RGB + Normalize) ----------
            aligned_rgb_224 = cv2.cvtColor(aligned_bgr_224, cv2.COLOR_BGR2RGB)
            tens = to_tensor_and_norm(aligned_rgb_224).unsqueeze(0)    # 1×3×224×224
            frame_prepro = tens

            # ---------- Pre-processing (augmentation) ----------
            image, rect, rect_local = pre_pro_infer(frame_prepro, lm_224, l=5)

            # ---------- Features ----------
            orth_f1, orth_f2, orth_f3 = Feature_Orthogonal(image, rect, rect_local, model_cla)

            C = orth_f1.shape[-1]
            q_test = np.zeros([orth_f1.shape[0], orth_f1.shape[1], orth_f1.shape[2], 4*C], dtype=orth_f1.dtype)
            temp_sum = (orth_f1 + orth_f2 + orth_f3) / 3.0
            q_test[:, :, :, 0*C:1*C] = temp_sum
            q_test[:, :, :, 1*C:2*C] = orth_f1
            q_test[:, :, :, 2*C:3*C] = orth_f2
            q_test[:, :, :, 3*C:4*C] = orth_f3

            test = np.transpose(q_test, (0, 3, 1, 2))                 # N, 512, 7, 7 (C=128)
            test = np.reshape(test, (orth_f1.shape[0], 128*4, 49)).astype(np.float32)

            # ---------- Predict ----------
            logits = model.predict(test, verbose=0)                    # (1,7)
            logits_adj = (logits / T) - np.log(raf_priors + 1e-8) + np.log(target_priors + 1e-8)

            idx_surprise = emotion_labels.index('surprise')
            logits_adj[0, idx_surprise] += np.log(SURPRISE_BOOST)

            probs = tf.nn.softmax(logits_adj).numpy()                  # 1×7

            # ---------- EMA ----------
            if ema_probs is None:
                ema_probs = probs.copy()
            else:
                ema_probs = alpha * ema_probs + (1 - alpha) * probs
            probs_use = ema_probs

            # ---------- top-2 logic + class thresholds ----------
            order = np.argsort(-probs_use[0])
            top1, top2 = int(order[0]), int(order[1])
            p1, p2 = float(probs_use[0, top1]), float(probs_use[0, top2])
            lab1, lab2 = emotion_labels[top1], emotion_labels[top2]

            MIN_FOR_TOP2 = 0.20   
            DELTA = 0.07          

            if (p1 >= MIN_FOR_TOP2) and (p2 >= MIN_FOR_TOP2) and ((p1 - p2) < DELTA):
                chosen_label, chosen_prob = lab1, p1
            else:
                chosen_label, chosen_prob = lab1, p1

            thr = class_thresholds.get(chosen_label, 0.40)
            label = chosen_label if chosen_prob >= thr else 'Uncertain'

            # ---------- Draw ----------
            draw_optimized(frame, x1, y1, x2, y2)
            put_probs(frame, x1, min(y2 + 20, H - 120), probs_use, emotion_labels)

        total_process_time += (time.time() - t0)
        out.write(frame)
        frame_count += 1

    avg_fps = frame_count / total_process_time if total_process_time > 0 else 0.0
    print(f"Average FPS: {avg_fps:.2f}\nTotal Processed Frames: {frame_count}")

    cap.release()
    out.release()

video_path = '/your_video.mp4'
main(video_path)

Average FPS: 3.11
Total Processed Frames: 1124
