In [None]:
dima806/facial_emotions_image_detection
CynthiaCR/emotions_classifier

In [12]:
#!conda create --yes --name myenv python=3.9
#!conda activate myenv
#!conda install -y -c conda-forge \
#    accelerate==0.23.0 \
#    validators==0.22.0 \
#    diffusers==0.18.2 \
#    transformers==4.32.1 \
#    pillow \
#    ipywidgets \
#    ipython
#import sys
#!{sys.executable} -m pip install invisible-watermark
#!conda list


Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 24.1.2
  latest version: 24.3.0

Please update conda by running

    $ conda update -n base -c conda-forge conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.3.0



## Package Plan ##

  environment location: /home/u2a67f3b0c6b1806f806f7159018a426/.conda/envs/myenv

  added / updated specs:
    - python=3.9


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-3.2.1              |       hd590300_1         2.7 MB  intel
    ------------------------------------------------------------
                                           Total:         2.7 MB

The following NEW packages will be INSTALLED:

  _libgcc_mutex      intel/linux-64::_libgcc_mutex-0.1-conda_forge 
  _openmp_mutex      intel/linux-64::_openmp_mutex-4.5-2_gnu 
  bzi

In [None]:
#Each frame decoded and given emotions output. WITHOUT IPEX (Look from next cell for intended purpose)
import torch
import cv2
from PIL import Image
from transformers import AutoModelForImageClassification, BasicTokenizer
from torchvision.transforms import functional as F
import time

# Load the model
model_name = "dima806/facial_emotions_image_detection"
model = AutoModelForImageClassification.from_pretrained(model_name)

# Use a basic tokenizer
tokenizer = BasicTokenizer()

# Function to preprocess image
def preprocess_image(image):
    image = image.convert("RGB")
    image = image.resize((224, 224))  # Resize image to match model input size
    image = F.to_tensor(image)  # Convert PIL image to PyTorch tensor
    image = F.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize image
    return image.unsqueeze(0)  # Add batch dimension

# Function to perform facial emotion detection on an image
def predict_emotion(image):
    # Preprocess the image
    image = preprocess_image(image)
    
    # Perform inference
    with torch.no_grad():
        outputs = model(image)
        predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()
    
    # Map class index to emotion label
    emotions = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
    predicted_emotion = emotions[predicted_class_idx]
    
    return predicted_emotion

# Function to extract frames from video and perform facial emotion detection
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    seconds_count = 0

    print("AI is processing the emotions of the feeded video. Please wait...")
    print(f"Processing file: {video_path}\n")
    time.sleep(2)
    try:
        while(cap.isOpened()):
            ret, frame = cap.read()
            if ret == False:
                break
        
            # Convert frame to PIL image
            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        
        
            # Print predicted emotion 
            if frame_count % 300 == 0: #(For every 300 frames for 30fps video)
                # Perform facial emotion detection
                predicted_emotion = predict_emotion(pil_image)
                print(f"Frame {frame_count}: Predicted emotion - {predicted_emotion}")
        
            frame_count += 1
    except KeyboardInterrupt:
                print(f"\nUser interrupted the process. Run again...")
    except Exception as e:
                print(f"An error occurred: {e}")
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Example usage
video_path = 'emotions.mp4'
process_video(video_path)

In [15]:
#For every 300 Frames. WITHOUT ipex optimization
import torch
import cv2
from PIL import Image
from transformers import AutoModelForImageClassification, BasicTokenizer
from torchvision.transforms import functional as F
import time

class EmotionDetector:
    def __init__(self, model_name):
        self.model = AutoModelForImageClassification.from_pretrained(model_name)
        self.tokenizer = BasicTokenizer()
    
    def preprocess_image(self, image):
        image = image.convert("RGB")
        image = image.resize((224, 224))  # Resize image to match model input size
        image = F.to_tensor(image)  # Convert PIL image to PyTorch tensor
        image = F.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize image
        return image.unsqueeze(0)  # Add batch dimension
    
    def predict_emotion(self, image):
        image = self.preprocess_image(image)
        with torch.no_grad():
            outputs = self.model(image)
            predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()
        emotions = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
        predicted_emotion = emotions[predicted_class_idx]
        return predicted_emotion

class VideoProcessor:
    def __init__(self, video_path, detector):
        self.video_path = video_path
        self.detector = detector
    
    def process_video(self):
        cap = cv2.VideoCapture(self.video_path)
        frame_count = 0
        seconds_count = 0
        print("❌ Without ipex optimization")
        print("AI is processing the emotions of the feeded video. Please wait...")
        print(f"Processing file: {self.video_path}\n")
        start_time = time.time()  # Start counting time
        try:
            while(cap.isOpened()):
                ret, frame = cap.read()
                if ret == False:
                    break
                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                if frame_count % 300 == 0:
                    predicted_emotion = self.detector.predict_emotion(pil_image)
                    print(f"{seconds_count}s | Frame {frame_count + 1}: Predicted emotion - {predicted_emotion}")
                    # Save the frame to disk
                    cv2.imwrite(f"emo/frame_{frame_count}.jpg", frame)
                    seconds_count += 10  # Increment seconds count by 10 (assuming 30 fps)
                frame_count += 1
        except KeyboardInterrupt:
            print(f"\nUser interrupted the process. Run again...")
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            end_time = time.time()  # Stop counting time
            elapsed_time = end_time - start_time
            print(f"Total execution time: {elapsed_time} seconds")
            cap.release()

# Example usage
model_name = "dima806/facial_emotions_image_detection"
detector = EmotionDetector(model_name)
video_path = 'emotions.mp4'
processor = VideoProcessor(video_path, detector)
processor.process_video()


❌ Without ipex optimization
AI is processing the emotions of the feeded video. Please wait...
Processing file: emotions.mp4

0s | Frame 1: Predicted emotion - Surprise
10s | Frame 301: Predicted emotion - Surprise
20s | Frame 601: Predicted emotion - Surprise
30s | Frame 901: Predicted emotion - Fear
40s | Frame 1201: Predicted emotion - Angry
50s | Frame 1501: Predicted emotion - Angry
60s | Frame 1801: Predicted emotion - Angry
70s | Frame 2101: Predicted emotion - Fear
Total execution time: 3.0263562202453613 seconds


In [17]:
#For every 300 frames. WITH IPEX OPTIMIZED!
import torch
import cv2
from PIL import Image
from transformers import AutoModelForImageClassification, BasicTokenizer
from torchvision.transforms import functional as F
import time
import intel_extension_for_pytorch as ipex  # Import Ipex

class EmotionDetector:
    def __init__(self, model_name):
        # Optimize the model with Ipex, handling potential warnings
        self.model = ipex.optimize(AutoModelForImageClassification.from_pretrained(model_name))
        self.tokenizer = BasicTokenizer()

    def preprocess_image(self, image):
        # Preprocess image on CPU (avoiding PIL's 'to' method)
        cpu_image = image.convert("RGB").resize((224, 224))
        cpu_image = F.to_tensor(cpu_image)  # Convert to tensor on CPU
        cpu_image = F.normalize(cpu_image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        return cpu_image.unsqueeze(0)  # Add batch dimension

    def predict_emotion(self, image):
        image = self.preprocess_image(image)
        with torch.no_grad():
            outputs = self.model(image)
            predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()
        emotions = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
        predicted_emotion = emotions[predicted_class_idx]
        return predicted_emotion

class VideoProcessor:
    def __init__(self, video_path, detector):
        self.video_path = video_path
        self.detector = detector

    def process_video(self):
        cap = cv2.VideoCapture(self.video_path)
        frame_count = 0
        seconds_count = 0
        print("✅ With ipex optimization")
        print("AI is processing the emotions of the feeded video. Please wait...")
        print(f"Processing file: {self.video_path}\n")
        start_time = time.time()  # Start counting time
        try:
            while cap.isOpened():
                ret, frame = cap.read()
                if ret == False:
                    break
                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                if frame_count % 300 == 0:
                    predicted_emotion = self.detector.predict_emotion(pil_image)  # Process on CPU
                    print(f"{seconds_count}s | Frame {frame_count + 1}: Predicted emotion - {predicted_emotion}")
                    # Save the frame to disk
                    cv2.imwrite(f"emo/frame_{frame_count}.jpg", frame)
                    seconds_count += 10  # Increment seconds count by 10 (assuming 30 fps)
                frame_count += 1
        except KeyboardInterrupt:
            print(f"\nUser interrupted the process. Run again...")
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            end_time = time.time()  # Stop counting time
            elapsed_time = end_time - start_time
            print(f"Total execution time: {elapsed_time} seconds")
            cap.release()

# Example usage
model_name = "dima806/facial_emotions_image_detection"
detector = EmotionDetector(model_name)
video_path = 'emotions.mp4'
processor = VideoProcessor(video_path, detector)
processor.process_video()


✅ With ipex optimization
AI is processing the emotions of the feeded video. Please wait...
Processing file: emotions.mp4

0s | Frame 1: Predicted emotion - Surprise
10s | Frame 301: Predicted emotion - Surprise
20s | Frame 601: Predicted emotion - Surprise
30s | Frame 901: Predicted emotion - Fear
40s | Frame 1201: Predicted emotion - Angry
50s | Frame 1501: Predicted emotion - Angry
60s | Frame 1801: Predicted emotion - Angry
70s | Frame 2101: Predicted emotion - Fear
Total execution time: 2.9871420860290527 seconds
