In [1]:
# Mount Google Drive at /content/drive; the checkpoint and test-video paths used later
# in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os

In [3]:
# -------------------------------------------------
# Device configuration
# -------------------------------------------------
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
# Every model and tensor in this notebook is moved to this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
class CrossAttention(nn.Module):
    """Multi-head cross-attention followed by a residual connection and LayerNorm.

    Args:
        dim: Embedding size shared by query, key and value tokens.
        num_heads: Number of parallel attention heads.
        dropout: Dropout probability handed to ``nn.MultiheadAttention``.
    """

    def __init__(self, dim=768, num_heads=8, dropout=0.3):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True,  # tensors are laid out as [batch, sequence, features]
        )
        # Normalises across the feature dimension after the residual sum.
        self.norm = nn.LayerNorm(dim)

    def forward(self, query, key_value):
        """Let ``query`` attend over ``key_value`` and return the enriched query.

        Args:
            query: Tensor ``[B, Q, D]``; this notebook always passes a single
                token (``Q == 1``).
            key_value: Tensor ``[B, N, D]`` used as both keys and values
                (tokens from the other modalities).

        Returns:
            Tensor with the same shape as ``query``:
            ``LayerNorm(attention_output + query)``.
        """
        # The second return value (the attention weights) is not needed here.
        attn_out, _ = self.attn(query, key_value, key_value)
        # Residual connection, then LayerNorm.
        return self.norm(attn_out + query)


class MultimodalEmotionModel(nn.Module):
    """Fuses one audio, one text and one vision embedding and classifies emotion.

    Each modality embedding attends over the other two through its own
    ``CrossAttention`` layer; the three fused vectors are concatenated and fed
    to an MLP classifier. A modality flagged as missing in ``masks`` is
    replaced by a learnable "null" embedding (initialised to zeros and updated
    by training like any other parameter).

    Args:
        embed_dim: Size of every modality embedding.
        num_heads: Attention heads per cross-attention layer.
        num_classes: Number of output logits.
        dropout: Dropout probability of the classifier head. The
            cross-attention layers are built without an explicit dropout
            argument and therefore use ``CrossAttention``'s own default.
    """

    def __init__(self, embed_dim=768, num_heads=8, num_classes=8, dropout=0.3):
        super().__init__()

        # Learned NULL embeddings, one per modality, each of shape [embed_dim].
        self.null_text   = nn.Parameter(torch.zeros(embed_dim))
        self.null_vision = nn.Parameter(torch.zeros(embed_dim))
        self.null_audio  = nn.Parameter(torch.zeros(embed_dim))

        # One cross-attention layer per querying modality.
        self.audio_to_tv = CrossAttention(embed_dim, num_heads)   # audio attends to text + vision
        self.text_to_av  = CrossAttention(embed_dim, num_heads)   # text attends to audio + vision
        self.vision_to_at = CrossAttention(embed_dim, num_heads)  # vision attends to audio + text

        # Classifier over the concatenation of the three fused vectors.
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim * 3, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, num_classes)
        )

    @staticmethod
    def _fill_missing(features, present, null_embedding):
        """Blend ``features`` [B, D] with ``null_embedding`` [D] using ``present`` [B, 1]."""
        return present * features + (1 - present) * null_embedding.unsqueeze(0)

    def forward(self, audio, text, vision, masks):
        """Compute emotion logits for a batch.

        Args:
            audio: ``[B, D]`` audio embeddings.
            text: ``[B, D]`` text embeddings.
            vision: ``[B, D]`` vision embeddings.
            masks: ``[B, 3]`` presence flags in the column order
                (text, vision, audio); 1 = present, 0 = missing. Any numeric or
                boolean dtype is accepted.

        Returns:
            ``[B, num_classes]`` logits.
        """
        # Cast once so bool/int masks work: `1 - mask` is not defined for bool
        # tensors, and integer masks would otherwise rely on implicit promotion.
        masks = masks.to(dtype=text.dtype)

        # -------- MASKS --------
        text_mask   = masks[:, 0].unsqueeze(1)   # [B,1]
        vision_mask = masks[:, 1].unsqueeze(1)   # [B,1]
        audio_mask  = masks[:, 2].unsqueeze(1)   # [B,1]

        # Replace missing modalities with their learned NULL embedding.
        text   = self._fill_missing(text, text_mask, self.null_text)
        vision = self._fill_missing(vision, vision_mask, self.null_vision)
        audio  = self._fill_missing(audio, audio_mask, self.null_audio)

        # -------- Attention preparation --------
        # Attention operates on sequences, so each vector becomes a
        # one-token sequence: [B, D] -> [B, 1, D].
        a = audio.unsqueeze(1)
        t = text.unsqueeze(1)
        v = vision.unsqueeze(1)

        # Context for each query = the two other modalities, stacked along the
        # sequence axis: [B, 2, D].
        tv = torch.cat([t, v], dim=1)  # context for audio
        av = torch.cat([a, v], dim=1)  # context for text
        at = torch.cat([a, t], dim=1)  # context for vision

        # Cross-attention fusion; squeeze drops the one-token axis again.
        a_fused = self.audio_to_tv(a, tv).squeeze(1)
        t_fused = self.text_to_av(t, av).squeeze(1)
        v_fused = self.vision_to_at(v, at).squeeze(1)

        # Final fused representation: [B, 3 * D].
        fused = torch.cat([a_fused, t_fused, v_fused], dim=1)

        return self.classifier(fused)

# Load audio extraction module

In [5]:
import os
import torch
import numpy as np
import soundfile as sf

from moviepy.editor import VideoFileClip
from transformers import Wav2Vec2Processor, Wav2Vec2Model

  IMAGEMAGICK_BINARY = r"C:\Program Files\ImageMagick-6.8.8-Q16\magick.exe"
  lines_video = [l for l in lines if ' Video: ' in l and re.search('\d+x\d+', l)]
  rotation_lines = [l for l in lines if 'rotate          :' in l and re.search('\d+$', l)]
  match = re.search('\d+$', rotation_line)
  if event.key is 'enter':



In [6]:
# -------------------------------------------------
# Load pretrained Wav2Vec2 model
# -------------------------------------------------
# Checkpoint id used for both the processor and the encoder.
AUDIO_MODEL_NAME = "facebook/wav2vec2-base"
# Processor that turns a raw waveform into model inputs (used in extract_audio_embedding).
processor_audio = Wav2Vec2Processor.from_pretrained(AUDIO_MODEL_NAME)
model_audio = Wav2Vec2Model.from_pretrained(AUDIO_MODEL_NAME).to(DEVICE)
# Inference only: switch to eval mode. As the last expression, its return value
# (the module repr) is what the cell displays.
model_audio.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]




vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [7]:
# -------------------------------------------------
# Extract audio from MP4 using MoviePy
# -------------------------------------------------
def extract_audio_mp4_to_wav(mp4_path, wav_path):
    """
    Extract the audio track of a video file and write it as a 16 kHz, 16-bit PCM WAV.

    The channel count is not changed here; callers average the channels
    themselves when the written file turns out to be multi-channel.

    Args:
        mp4_path: Path of the source video.
        wav_path: Destination path of the WAV file.

    Raises:
        ValueError: If the video has no audio stream.
    """
    video = VideoFileClip(mp4_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio stream found in {mp4_path}")

        video.audio.write_audiofile(
            wav_path,
            fps=16000,
            nbytes=2,
            codec="pcm_s16le",
            logger=None
        )
    finally:
        # Always release the clip, including when writing the audio fails.
        video.close()

# -------------------------------------------------
# Audio embedding extraction (embedding is returned, not saved)
# -------------------------------------------------
@torch.no_grad()
def extract_audio_embedding(
    video_path,
    temp_wav_dir="_temp_wav"
):
    """Return one L2-normalised Wav2Vec2 embedding for the audio track of a video.

    The audio is written to a temporary WAV file, read back, down-mixed to mono
    if needed, encoded with ``model_audio`` and mean-pooled over time.

    Args:
        video_path: Path of the video file.
        temp_wav_dir: Directory for the temporary WAV file (created if absent).

    Returns:
        1-D CPU tensor (the hidden size of the audio model, 768 for the
        checkpoint loaded in this notebook).

    Raises:
        ValueError: If the video has no audio stream or the stream is empty.
    """
    os.makedirs(temp_wav_dir, exist_ok=True)

    video_name = os.path.splitext(os.path.basename(video_path))[0]
    temp_wav_path = os.path.join(temp_wav_dir, f"{video_name}.wav")

    try:
        # 1. Extract WAV from MP4
        extract_audio_mp4_to_wav(video_path, temp_wav_path)

        # 2. Load WAV using soundfile
        waveform, sr = sf.read(temp_wav_path, dtype="float32")
    finally:
        # The samples are in memory now (or extraction failed); either way the
        # temporary file must not be left behind.
        if os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)

    # Convert to mono if the file has more than one channel
    if waveform.ndim == 2:
        waveform = waveform.mean(axis=1)

    if waveform.size == 0:
        raise ValueError(f"Audio stream in {video_path} is empty")

    waveform = torch.from_numpy(waveform)

    # 3. Prepare input for Wav2Vec2
    inputs = processor_audio(
        waveform,
        sampling_rate=sr,
        return_tensors="pt"
    ).to(DEVICE)

    # 4. Forward pass through Wav2Vec2
    outputs = model_audio(**inputs)

    # Frame-level embeddings: [time_steps, hidden_dim]
    hidden_states = outputs.last_hidden_state.squeeze(0)

    # ONE audio embedding per video (temporal mean)
    audio_embedding = hidden_states.mean(dim=0)

    # L2 normalisation; the clamp only matters for an all-zero vector, where it
    # prevents a division by zero (NaNs).
    audio_embedding = audio_embedding / audio_embedding.norm().clamp_min(1e-12)

    return audio_embedding.cpu()


In [8]:
import os
import cv2
import torch
import numpy as np
from transformers import CLIPModel, CLIPProcessor

In [9]:
# -------------------------------------------------
# Load CLIP Vision model
# -------------------------------------------------
# Checkpoint id used for both the processor and the model.
CLIP_NAME = "openai/clip-vit-base-patch32"
# Image preprocessor used by extract_visual_embedding.
processor = CLIPProcessor.from_pretrained(CLIP_NAME)
# The full CLIP model is loaded; only its `vision_model` sub-module is used later.
model_visual = CLIPModel.from_pretrained(CLIP_NAME).to(DEVICE)
# Inference only: switch to eval mode (the returned module repr is the cell output).
model_visual.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [10]:
# -------------------------------------------------
# Frame sampling using OpenCV
# -------------------------------------------------

def sample_frames_cv2(video_path, num_frames=8):
    """Sample evenly spaced RGB frames from a video, resized to 224x224.

    Args:
        video_path: Path of the video file.
        num_frames: Number of frames to sample (>= 1).

    Returns:
        List of ``num_frames`` uint8 arrays of shape (224, 224, 3) in RGB
        order. A frame that cannot be read is replaced by an all-black frame.

    Raises:
        ValueError: If ``num_frames`` < 1, the video cannot be opened, or it
            reports no frames.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError(f"Cannot open video: {video_path}")

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # `<= 0` also rejects a negative frame count, which would otherwise
        # produce negative sampling positions.
        if frame_count <= 0:
            raise ValueError(f"No frames found in video: {video_path}")

        indices = np.linspace(0, frame_count - 1, num_frames, dtype=int)
        frames = []

        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ok, frame_bgr = cap.read()

            if not ok:
                # Best effort: keep the frame count stable with a black frame.
                frame_rgb = np.zeros((224, 224, 3), dtype=np.uint8)
            else:
                frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                frame_rgb = cv2.resize(frame_rgb, (224, 224), interpolation=cv2.INTER_AREA)

            frames.append(frame_rgb)

        return frames
    finally:
        # Release the capture on every path, including errors.
        cap.release()

# -------------------------------------------------
# CLIP visual feature extraction (embedding is returned, not saved)
# -------------------------------------------------

@torch.no_grad()
def extract_visual_embedding(
    video_path,
    num_frames=8
):
    """Return one L2-normalised CLIP vision embedding for a whole video.

    Frames are sampled with ``sample_frames_cv2``, encoded by the CLIP vision
    tower, and the per-frame CLS tokens are averaged into a single vector.

    Args:
        video_path: Path of the video file.
        num_frames: Number of frames to sample.

    Returns:
        1-D CPU tensor (768 values for the checkpoint loaded in this notebook).
    """
    sampled = sample_frames_cv2(video_path, num_frames)

    # Preprocess all frames as one batch and move it to the model's device.
    batch = processor(images=sampled, return_tensors="pt").to(DEVICE)

    # Token 0 of every frame is the CLS token: [num_frames, hidden_dim].
    hidden = model_visual.vision_model(**batch).last_hidden_state
    cls_per_frame = hidden[:, 0, :]

    # Collapse the frame axis, then scale to unit L2 norm.
    pooled = cls_per_frame.mean(dim=0)
    unit = pooled / pooled.norm()

    return unit.cpu()



# Initialize text feature extraction module

In [11]:
!pip install openai-whisper


model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/803.2 kB[0m [31m30.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=d15f437e9579627abf87254656980ac251083f38bd5ed26e22b3625bd82c6405
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d126cba375b

In [12]:
#!pip install openai-whisper

import os
import torch
import numpy as np
import soundfile as sf
from moviepy.editor import VideoFileClip
import whisper
from transformers import AutoTokenizer, AutoModel

In [13]:
# -------------------------------------------------
# Load Whisper (Speech → Text)
# -------------------------------------------------
# Whisper checkpoint size; used by extract_text_embedding to transcribe speech.
WHISPER_MODEL_SIZE = "base"  # tiny | base | small | medium
whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, device=DEVICE)

100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 43.7MiB/s]


In [14]:
# -------------------------------------------------
# Load Text Embedding Model
# -------------------------------------------------
# Checkpoint id used for both the tokenizer and the encoder.
TEXT_MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME)
text_model = AutoModel.from_pretrained(TEXT_MODEL_NAME).to(DEVICE)
# Inference only: switch to eval mode (the returned module repr is the cell output).
text_model.eval()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [15]:
# -------------------------------------------------
# Extract audio from MP4 using MoviePy
# -------------------------------------------------
def extract_audio_mp4_to_wav(mp4_path, wav_path):
    """
    Extract the audio track of a video file and write it as a 16 kHz, 16-bit PCM WAV.

    NOTE: a function with this name is also defined earlier in the notebook;
    running this cell rebinds the name to this definition.

    Args:
        mp4_path: Path of the source video.
        wav_path: Destination path of the WAV file.

    Raises:
        ValueError: If the video has no audio stream.
    """
    video = VideoFileClip(mp4_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio stream found in {mp4_path}")

        video.audio.write_audiofile(
            wav_path,
            fps=16000,
            nbytes=2,
            codec="pcm_s16le",
            logger=None
        )
    finally:
        # Always release the clip, including when writing the audio fails.
        video.close()

# -------------------------------------------------
# Text embedding extraction
# -------------------------------------------------
@torch.no_grad()
def extract_text_embedding(
    video_path,
    temp_wav_dir="_temp_wav"
):
    """Transcribe a video's speech and return one L2-normalised text embedding.

    The audio is written to a temporary WAV file, transcribed with
    ``whisper_model``, and the transcript is encoded with ``text_model``; the
    embedding of the first (CLS) token is returned.

    Args:
        video_path: Path of the video file.
        temp_wav_dir: Directory for the temporary WAV file (created if absent).

    Returns:
        1-D CPU tensor, or ``None`` when the transcript is empty.

    Raises:
        ValueError: If the video has no audio stream.
    """
    os.makedirs(temp_wav_dir, exist_ok=True)

    video_name = os.path.splitext(os.path.basename(video_path))[0]
    temp_wav_path = os.path.join(temp_wav_dir, f"{video_name}.wav")

    try:
        # 1. Extract WAV from MP4
        extract_audio_mp4_to_wav(video_path, temp_wav_path)

        # 2. Load audio
        audio, sr = sf.read(temp_wav_path, dtype="float32")
    finally:
        # Remove the temporary file on every path: success, empty transcript
        # (early return below) and errors.
        if os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)

    if audio.ndim == 2:
        audio = audio.mean(axis=1)

    # Whisper expects 16 kHz. The WAV is written at 16 kHz above, so this is a
    # safety net only; whisper.audio exposes no resample helper, hence SciPy.
    if sr != 16000:
        from math import gcd
        from scipy.signal import resample_poly

        divisor = gcd(int(sr), 16000)
        audio = resample_poly(audio, 16000 // divisor, int(sr) // divisor).astype(np.float32)

    # 3. Speech → Text
    result = whisper_model.transcribe(audio, fp16=False)
    transcript = result["text"].strip()

    if not transcript:
        print(f"Warning: No transcript generated for {video_path}")
        return None

    # 4. Text → Embedding
    inputs = tokenizer(
        transcript,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(DEVICE)

    outputs = text_model(**inputs)

    # CLS token embedding
    text_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0)

    # L2 normalisation; the clamp only matters for an all-zero vector, where it
    # prevents a division by zero (NaNs).
    text_embedding = text_embedding / text_embedding.norm(p=2).clamp_min(1e-12)

    return text_embedding.cpu()

# Test one video for prediction

# Inference app

In [16]:
import gradio as gr

In [17]:
# Trained fusion-model weights on the mounted Drive.
MODEL_LOAD_PATH = "/content/drive/MyDrive/Dissertion/model/multimodal_emotion_model.pth"

# Class names, indexed by the position of the winning logit.
EMOTION_LABELS = [
    "Neutral", "Calm", "Happy", "Sad",
    "Angry", "Fearful", "Disgust", "Surprised",
]

In [18]:

# ===============================
# Load Emotion Model
# ===============================

print("Loading Multimodal Model...")
model_em = MultimodalEmotionModel()
# The checkpoint is a plain state dict (it is passed straight to load_state_dict), so
# weights_only=True is sufficient and prevents arbitrary objects from being unpickled.
model_em.load_state_dict(
    torch.load(MODEL_LOAD_PATH, map_location=DEVICE, weights_only=True)
)
model_em.to(DEVICE)
# Inference only: switch to eval mode (the returned module repr is the cell output).
model_em.eval()

Loading Multimodal Model...


MultimodalEmotionModel(
  (audio_to_tv): CrossAttention(
    (attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
    )
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (text_to_av): CrossAttention(
    (attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
    )
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (vision_to_at): CrossAttention(
    (attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
    )
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (classifier): Sequential(
    (0): Linear(in_features=2304, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=512, out_features=8, bias=True)
  )
)

In [19]:
import torch
import os

# ============================================
# Utilities
# ============================================

def count_parameters(model):
    """Count a model's parameters.

    Returns:
        ``(total, trainable)`` where ``trainable`` counts only the elements of
        parameters with ``requires_grad`` set.
    """
    total_params = 0
    trainable_params = 0
    for tensor in model.parameters():
        n_elements = tensor.numel()
        total_params += n_elements
        if tensor.requires_grad:
            trainable_params += n_elements
    return total_params, trainable_params


def model_size_in_mb(model):
    """Return the on-disk size, in MB, of the model's serialized state dict.

    The state dict is saved into a private temporary directory, so an existing
    ``temp_model.pth`` in the working directory is never overwritten or
    deleted, and the scratch file is removed even if saving fails.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        temp_file = os.path.join(scratch_dir, "temp_model.pth")
        torch.save(model.state_dict(), temp_file)
        return os.path.getsize(temp_file) / (1024 * 1024)


def layerwise_parameters(model):
    """Print one line per named parameter: name, shape and element count."""
    header = "\n✅ Layer-wise Parameter Distribution:\n"
    print(header)
    rows = (
        f"{label:60s} | Shape: {list(tensor.shape)} | Params: {tensor.numel()}"
        for label, tensor in model.named_parameters()
    )
    for row in rows:
        print(row)


def model_summary_report(model):
    """Print a summary of the model: class name, device, parameter counts and size.

    Relies on ``count_parameters`` and ``model_size_in_mb``. A model without
    parameters is reported with a placeholder device and 0.00% trainable
    instead of raising.
    """
    print("\n" + "="*60)
    print("✅ MULTIMODAL MODEL ATTRIBUTES REPORT")
    print("="*60)

    # Model Class Name
    print(f"Model Architecture     : {model.__class__.__name__}")

    # Device Info: taken from the first parameter, if there is one.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else "n/a (no parameters)"
    print(f"Device Loaded On       : {device}")

    # Parameter Counts
    total, trainable = count_parameters(model)
    print(f"Total Parameters       : {total:,}")
    print(f"Trainable Parameters   : {trainable:,}")

    # Trainability % (guarded against a zero total)
    trainable_pct = 100*trainable/total if total else 0.0
    print(f"Trainable Percentage   : {trainable_pct:.2f}%")

    # Model Size
    size_mb = model_size_in_mb(model)
    print(f"Model Size (Disk)      : {size_mb:.2f} MB")

    print("="*60)


# ============================================
# Summary of the loaded emotion model
# ============================================

# Headline report first, then the per-parameter breakdown.
model_summary_report(model_em)
layerwise_parameters(model_em)



✅ MULTIMODAL MODEL ATTRIBUTES REPORT
Model Architecture     : MultimodalEmotionModel
Device Loaded On       : cuda:0
Total Parameters       : 8,278,280
Trainable Parameters   : 8,278,280
Trainable Percentage   : 100.00%
Model Size (Disk)      : 31.59 MB

✅ Layer-wise Parameter Distribution:

null_text                                                    | Shape: [768] | Params: 768
null_vision                                                  | Shape: [768] | Params: 768
null_audio                                                   | Shape: [768] | Params: 768
audio_to_tv.attn.in_proj_weight                              | Shape: [2304, 768] | Params: 1769472
audio_to_tv.attn.in_proj_bias                                | Shape: [2304] | Params: 2304
audio_to_tv.attn.out_proj.weight                             | Shape: [768, 768] | Params: 589824
audio_to_tv.attn.out_proj.bias                               | Shape: [768] | Params: 768
audio_to_tv.norm.weight                                 

In [20]:
# ===============================
#  Main Prediction Function
# ===============================

def _extract_or_none(extractor, video_path, modality):
    """Run one feature extractor; a ValueError (e.g. no audio stream, unreadable video) becomes None."""
    try:
        return extractor(video_path)
    except ValueError as err:
        print(f"⚠️ {modality} feature extraction failed: {err}")
        return None


def _feature_and_mask(feature, missing_message):
    """Return (feature, 1), or (zero placeholder, 0) plus a log line when the feature is None."""
    if feature is None:
        print(missing_message)
        # Placeholder only: with mask 0 the model substitutes its learned NULL embedding.
        return torch.zeros(768), 0
    return feature, 1


def predict_emotion(video_file):
    """Predict the emotion shown in a video from its visual, audio and text features.

    Args:
        video_file: Path of the video, or ``None`` when nothing was uploaded.

    Returns:
        A user-facing message: ``"Predicted Emotion: **<label>**"`` on success,
        or an error message starting with "❌" when no video was given or no
        modality could be extracted.
    """
    if video_file is None:
        return "❌ No video uploaded!"

    video_path = video_file

    print("🎥 Video received:", video_path)

    # 1. Visual features
    visual_feat, visual_mask = _feature_and_mask(
        _extract_or_none(extract_visual_embedding, video_path, "Visual"),
        "ℹ️ No visuals detected → using NULL visual embedding",
    )

    # 2. Audio features
    audio_feat, audio_mask = _feature_and_mask(
        _extract_or_none(extract_audio_embedding, video_path, "Audio"),
        "ℹ️ No Audio detected → using NULL audio embedding",
    )

    # 3. Text features (None is also returned when the transcript is empty)
    text_feat, text_mask = _feature_and_mask(
        _extract_or_none(extract_text_embedding, video_path, "Text"),
        "ℹ️ No speech detected → using NULL text embedding",
    )

    # A prediction from three NULL embeddings would not depend on the video at all.
    if not (text_mask or visual_mask or audio_mask):
        return "❌ Could not extract any features from the video!"

    # Column order expected by the model: (text, vision, audio).
    masks = torch.tensor(
        [[text_mask, visual_mask, audio_mask]],
        dtype=torch.float32,
        device=DEVICE,
    )

    # 4. Model prediction (batch of one)
    with torch.no_grad():
        logits = model_em(
            audio_feat.to(DEVICE).unsqueeze(0),
            text_feat.to(DEVICE).unsqueeze(0),
            visual_feat.to(DEVICE).unsqueeze(0),
            masks
        )

    pred_class = torch.argmax(logits, dim=1).item()
    emotion = EMOTION_LABELS[pred_class]

    return  f"Predicted Emotion: **{emotion}**"

In [21]:
# Smoke test on a single clip: run each extractor on its own and print the embedding
# shapes, then run the full prediction pipeline on the same file.
# NOTE: extract_text_embedding returns None when no transcript is produced, in which
# case the chained .to(DEVICE) on that line would raise AttributeError.
TEST_VIDEO_PATH = "/content/drive/MyDrive/Dissertion/Data/RAVDESS/Actor_07/01-02-01-01-01-01-07.mp4"
a = extract_audio_embedding(TEST_VIDEO_PATH).to(DEVICE)
v = extract_visual_embedding(TEST_VIDEO_PATH).to(DEVICE)
t = extract_text_embedding(TEST_VIDEO_PATH).to(DEVICE)
print(a.shape)
print(v.shape)
print(t.shape)
em_op = predict_emotion(TEST_VIDEO_PATH)
print(em_op)

torch.Size([768])
torch.Size([768])
torch.Size([768])
🎥 Video received: /content/drive/MyDrive/Dissertion/Data/RAVDESS/Actor_07/01-02-01-01-01-01-07.mp4
Predicted Emotion: **Neutral**


In [22]:
import gradio as gr
# -------------------------------------------------
# CSS for the app (passed to gr.Blocks below)
# -------------------------------------------------
CUSTOM_CSS = """/* ---------- Page centering & outer spacing ---------- */
.center-wrapper {
    display: flex;
    justify-content: center;
    width: 100%;
    padding: 16px;                 /* equal top / bottom / left / right */
    box-sizing: border-box;
}

/* ---------- Main card ---------- */
.form-card {
    width: 100%;
    max-width: 640px;              /* fixed feel on desktop */
    padding: 20px;                 /* inner spacing */
    border-radius: 14px;
    background: white;
    box-shadow: 0 10px 25px rgba(0,0,0,0.08);
    box-sizing: border-box;
}

/* ---------- Title ---------- */
.form-title {
    font-size: 24px;
    font-weight: 700;
    margin-bottom: 6px;
    text-align: center;
}

/* ---------- Subtitle ---------- */
.form-subtitle {
    color: #555;
    margin-bottom: 18px;
    text-align: center;
}

/* ---------- Button row ---------- */
.button-row {
    display: flex;
    justify-content: center;
    gap: 10px;
    flex-wrap: wrap;
    margin-top: 12px;
    margin-bottom: 8px;
}

/* ---------- Buttons (smaller & clean) ---------- */
button {
    padding: 6px 16px !important;   /* controls height & width */
    font-size: 14px !important;
    border-radius: 8px !important;
}

/* Slight emphasis for primary button */
button.primary {
    font-weight: 600;
}

/* ---------- Result box ---------- */
.result-box {
    margin-top: 14px;
    padding: 14px;
    border-radius: 10px;
    background: #f0fdf4;
    border: 1px solid #22c55e;
    font-size: 17px;
    text-align: center;
}
"""


# -------------------------------------------------
# UI
# -------------------------------------------------
with gr.Blocks(title="Multimodal Emotion Detection", css=CUSTOM_CSS) as app:

    # ---- HEADER CARD ----
    gr.HTML("""
    <div class="center-wrapper">
        <div class="form-card">
            <div class="form-title">🎭 Multimodal Emotion Detection</div>
            <div class="form-subtitle">
                Upload a short video clip to predict emotion.
            </div>
        </div>
    </div>
    """)

    # ---- FORM CARD ----
    # NOTE(review): the opening and closing tags live in two separate gr.HTML components,
    # so they probably do not wrap the widgets created in between — verify in the browser;
    # a gr.Column(elem_classes=...) container may be what is wanted here.
    gr.HTML('<div class="center-wrapper"><div class="form-card">')

    video_input = gr.Video(
        label="Upload Video",
        format="mp4"
    )

    with gr.Row():
        clear_btn = gr.Button("Clear")
        submit_btn = gr.Button("Submit", variant="primary")

    result_html = gr.HTML()

    gr.HTML('</div></div>')

    # ---- LOGIC ----
    def run_inference(video):
        """Run the prediction and render its message inside the result box.

        predict_emotion already returns the full user-facing sentence
        ("Predicted Emotion: **<label>**" or an error message), so no second
        "Predicted Emotion:" label is added here; the Markdown-style bold
        markers are converted to <b> tags because the target is an HTML component.
        """
        import html
        import re

        message = html.escape(predict_emotion(video), quote=False)
        message = re.sub(r"\*\*(.+?)\*\*", r"<b>\1</b>", message)
        return f"""
        <div class="result-box">
            {message}
        </div>
        """

    submit_btn.click(
        fn=run_inference,
        inputs=video_input,
        outputs=result_html
    )

    # Reset both the result box and the uploaded video.
    clear_btn.click(
        fn=lambda: ("", None),
        outputs=[result_html, video_input]
    )


# -------------------------------------------------
# Launch (the CSS is supplied to gr.Blocks above, not here)
# -------------------------------------------------
app.launch()


  with gr.Blocks(title="Multimodal Emotion Detection", css=CUSTOM_CSS) as app:



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7fd3e88d4e0b445caa.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


