# Experiment 1: Late-fusion

### Text + Audio
- Best TER (text emotion recognition) model on MELD:
- Best SER (speech emotion recognition) model on MELD:

### Text + Audio + Image
- Best TER model on MELD:
- Best SER model on MELD:
- DeepFace

In [2]:
import os, sys
import pandas as pd
import librosa
import torch
import numpy as np
from collections import defaultdict, Counter
from tqdm import tqdm
# Enable tqdm's pandas integration (progress-bar variants of apply, e.g. progress_apply).
tqdm.pandas()
# Resolve the directory two levels above the current working directory and put
# it first on sys.path so that the `src` package below can be imported.
module_path = os.path.abspath(os.path.join('..', '..')) # or the path to your source code
sys.path.insert(0, module_path)
from src.data_loading import filter_emotions, load_meld
#from src.fusion import LateFusion

#meld_train = load_meld(split='train')
#meld_val = load_meld(split='dev')
# Only the MELD test split is loaded; the train/dev loads above are disabled.
# The result is passed through filter_emotions (presumably restricting rows to
# the emotion labels used in this experiment — confirm in src.data_loading).
meld_test = filter_emotions(load_meld(split='test'))



class LateFusion:
    """Weighted late fusion of per-modality emotion predictions.

    Each modality may report its prediction as a single label (str), a
    probability/confidence dict ({label: score}), or a list of frame-level
    predictions (labels or dicts). Every output is converted to a normalized
    distribution, scaled by the modality weight, summed and re-normalized.
    """

    def __init__(self, weights=None, neutral_label="neutral"):
        """
        weights: dict (e.g., {'text': 0.4, 'audio': 0.3, 'video': 0.3});
            when omitted or empty, text/audio/video each get 1/3.
        neutral_label: label used for neutral emotion (default='neutral')
        """
        self.weights = weights or {'text': 1/3, 'audio': 1/3, 'video': 1/3}
        self.modalities = list(self.weights.keys())
        self.neutral_label = neutral_label

    def _normalize_confidences(self, confidences):
        """Normalize scores to sum to 1.

        A zero total yields a uniform distribution over the given keys; an
        empty mapping yields an empty dict.
        """
        total = sum(confidences.values())
        if total == 0:
            return {k: 1/len(confidences) for k in confidences}
        return {k: v / total for k, v in confidences.items()}

    def _aggregate_video(self, video_outputs):
        """
        Aggregate multiple frame-level predictions (labels or probability dicts)
        into a single averaged emotion distribution.

        An empty list is treated as "no evidence" and maps to the neutral label
        with probability 1. The type of the first element decides whether the
        list is handled as dicts or as labels.
        """
        if not video_outputs:
            return {self.neutral_label: 1.0}

        # if is list of dicts (probabilities)
        if isinstance(video_outputs[0], dict):
            agg = defaultdict(float)
            for frame_pred in video_outputs:
                # normalize per frame so every frame contributes equally
                normed = self._normalize_confidences(frame_pred)
                for emotion, prob in normed.items():
                    agg[emotion] += prob
            return self._normalize_confidences({k: v / len(video_outputs) for k, v in agg.items()})

        # if is list of labels: relative label frequency
        counts = Counter(video_outputs)
        return self._normalize_confidences({k: v / len(video_outputs) for k, v in counts.items()})

    def _to_distribution(self, modality, output):
        """Convert one modality output (list, dict or str) to a distribution.

        Raises:
            ValueError: if the output is none of the supported types.
        """
        if isinstance(output, list):  # frame-wise
            return self._aggregate_video(output)
        if isinstance(output, dict):  # probability distribution
            return self._normalize_confidences(output)
        if isinstance(output, str):  # single label
            return {output: 1.0}
        raise ValueError(f"Unsupported output type for modality {modality}: {type(output)}")

    def _top_label(self, output):
        """Return the most likely label of one modality output, or None.

        None is returned when the output carries no prediction (empty dict,
        frames that aggregate to nothing) or has an unsupported type.
        """
        if isinstance(output, str):
            return output
        if isinstance(output, list):
            output = self._aggregate_video(output)
        if isinstance(output, dict) and output:
            return max(output, key=output.get)
        return None

    def fuse(self, predictions, top_k=3):
        """
        Fuse predictions from multiple modalities.

        predictions: dict
            {
              'text': {'happy': 0.8, 'sad': 0.1, 'neutral': 0.1},
              'audio': 'sad',
              'video': [ {'happy':0.4,'neutral':0.6}, {'happy':0.3,'neutral':0.7} ]
            }
        top_k: number of highest-scoring emotions to return (default=3)

        Modalities without an entry in ``self.weights`` get weight 0. If no
        modality contributes any emotion (empty ``predictions`` or only empty
        distributions), the result falls back to the neutral label with
        probability 1, mirroring how empty frame lists are handled.

        Returns:
            final_label (str)
            fused_probs (dict)
            top_k_list (list of tuples)

        Raises:
            ValueError: if a modality output is not a list, dict or str.
        """
        combined = defaultdict(float)

        for modality, output in predictions.items():
            weight = self.weights.get(modality, 0)
            confs = self._to_distribution(modality, output)

            # weighted accumulation
            for emotion, prob in confs.items():
                combined[emotion] += weight * prob

        # normalize final fused probabilities
        fused_probs = self._normalize_confidences(combined)
        if not fused_probs:
            # nothing to fuse: avoid max() on an empty mapping
            fused_probs = {self.neutral_label: 1.0}
        final_label = max(fused_probs, key=fused_probs.get)

        # compute top-k
        sorted_emotions = sorted(fused_probs.items(), key=lambda x: x[1], reverse=True)
        top_k_list = sorted_emotions[:top_k]

        return final_label, fused_probs, top_k_list

    # optional neutral rule
    def rule_based_neutral_override(self, predictions, fused_label):
        """
        Optional rule: if all modalities predict 'neutral', return neutral;
        otherwise keep the fused label.

        Frame-wise (list) outputs are aggregated first, so a video modality
        whose frames favour a non-neutral emotion blocks the override.
        Outputs without a usable prediction (empty dict, unsupported type)
        do not vote.
        """
        for output in predictions.values():
            top = self._top_label(output)
            if top is not None and top != self.neutral_label:
                return fused_label
        return self.neutral_label

# Demo: fuse one hand-written example that mixes the three supported output
# shapes (probability dict for text, single label for audio, per-frame dicts
# for video) with weights 0.4 / 0.3 / 0.3.
fusion = LateFusion(
    weights={'text': 0.4, 'audio': 0.3, 'video': 0.3},
)

preds = {
    'text': {'happy': 0.7, 'sad': 0.2, 'neutral': 0.1},
    'audio': 'neutral',
    'video': [{'happy': 0.4, 'neutral': 0.6}, {'happy': 0.3, 'neutral': 0.7}],
}

label, probs, top3 = fusion.fuse(preds)

# Report the winning label, the full fused distribution and the top-3 ranking.
for caption, value in (
    ("🎯 Final label:", label),
    ("📊 Probabilities:", probs),
    ("🏆 Top-3:", top3),
):
    print(caption, value)

  from .autonotebook import tqdm as notebook_tqdm


Subfolders in the dataset: ['JSON files', 'MELD.Raw']
Subfolders in the raw data: ['dia47_utt11.mp4', 'dia35_utt2.mp4', 'dia47_utt9.mp4', 'dia167_utt14.mp4', 'dia232_utt4.mp4', '._dia118_utt11.mp4', 'dia34_utt2.mp4', 'dia268_utt8.mp4', 'dia233_utt4.mp4', '._dia39_utt1.mp4', '._dia230_utt3.mp4', 'dia137_utt0.mp4', '._dia268_utt11.mp4', '._dia231_utt3.mp4', 'dia136_utt0.mp4', '._dia38_utt1.mp4', '._dia34_utt16.mp4', '._dia128_utt2.mp4', 'dia28_utt7.mp4', 'dia195_utt8.mp4', 'dia22_utt15.mp4', '._dia129_utt2.mp4', '._dia100_utt5.mp4', 'dia29_utt7.mp4', 'dia103_utt2.mp4', 'final_videos_testdia48_utt3.mp4', 'dia154_utt12.mp4', '._dia95_utt11.mp4', '._dia205_utt1.mp4', 'dia102_utt2.mp4', 'dia107_utt8.mp4', 'dia253_utt11.mp4', '._dia52_utt5.mp4', 'dia175_utt3.mp4', 'dia93_utt0.mp4', '._dia272_utt0.mp4', '._dia192_utt5.mp4', 'dia174_utt3.mp4', '._dia53_utt5.mp4', 'dia175_utt13.mp4', 'dia92_utt0.mp4', 'dia191_utt2.mp4', 'dia77_utt1.mp4', 'dia259_utt0.mp4', 'dia270_utt7.mp4', 'dia123_utt11.mp4', 

### Process on video file (6s < 10s)

In [5]:
from src.processor import FileProcessor
from src.recognizers_old import TextEmotionRecognizer, SpeechEmotionRecognizer, FaceEmotionRecognizer

# Build the file processor and one recognizer per modality (text, speech, face).
fp = FileProcessor()
# "qwen" is presumably a model identifier understood by TextEmotionRecognizer — confirm.
text_recognizer = TextEmotionRecognizer("qwen")
# NOTE(review): absolute, machine-specific model path; as written this cell
# only runs where that file exists.
audio_recognizer = SpeechEmotionRecognizer(model_path="/Users/krazmic/Documents/GitHub/Repos/EmoReA/emorea-backend/notebooks/speech/logreg_C.joblib")
vision_recognizer = FaceEmotionRecognizer()
# Process the file referenced by the 'filename' column of the first MELD test row.
file_data = fp.process_file(meld_test.iloc[0]['filename'])
# Bare expression: displays the processed structure as the cell output.
file_data

{'modality': 'video',
 'data': {'audio': {'transcript': ' Why do all your coffee mugs have numbers on the bottom?',
   'segments': [{'start': 0.0,
     'end': 2.0,
     'text': ' Why do all your coffee mugs have numbers on the bottom?'}],
   'audio_chunks': [array([0.00650966, 0.01585733, 0.01330644, ..., 0.001369  , 0.00129954,
           0.00085882], dtype=float32)],
   'raw_audio': array([ 0.00650966,  0.01585733,  0.01330644, ...,  0.00072808,
          -0.000457  , -0.00041247], dtype=float32)},
  'frames': [<PIL.Image.Image image mode=RGB size=92x92>]},
 'meta': {'audio_duration_s': 2.25,
  'num_frames': 1,
  'video_duration_s': 2.25,
  'fps': 23.976023976023978}}

In [7]:
# Run each recognizer on its part of the processed file.
# The second argument (16000) is presumably the audio sample rate in Hz — TODO confirm
# against SpeechEmotionRecognizer.analyze.
audio_emo = audio_recognizer.analyze(file_data["data"]["audio"]['raw_audio'], 16000)
# The whole list of extracted frames is passed, not a single image.
face_emo = vision_recognizer.analyze_image(file_data["data"]["frames"])
# Text emotion is computed from the transcript stored under the audio entry.
text_emo = text_recognizer.analyze(file_data["data"]["audio"]["transcript"])

: 

In [None]:
from src.processor import FileProcessor
from src.recognizers import TextEmotionRecognizer, SpeechEmotionRecognizer, FaceEmotionRecognizer

# Build the file processor and one recognizer per modality (text, speech, face).
fp = FileProcessor()
# "qwen" is presumably a model identifier understood by TextEmotionRecognizer — confirm.
text_recognizer = TextEmotionRecognizer("qwen")
# NOTE(review): absolute, machine-specific model path; as written this cell
# only runs where that file exists.
audio_recognizer = SpeechEmotionRecognizer(model_path="/Users/krazmic/Documents/GitHub/Repos/EmoReA/emorea-backend/notebooks/speech/logreg_C.joblib")
vision_recognizer = FaceEmotionRecognizer()

# Process the file referenced by the 'filename' column of the first MELD test
# row, then run each recognizer on its part of the result.
file_data = fp.process_file(meld_test.iloc[0]['filename'])
# Only the raw waveform is passed here (no explicit sample-rate argument).
audio_emo = audio_recognizer.analyze(file_data["data"]["audio"]['raw_audio'])
# The whole list of extracted frames is passed, not a single image.
face_emo = vision_recognizer.analyze_image(file_data["data"]["frames"])
# Text emotion is computed from the transcript stored under the audio entry.
text_emo = text_recognizer.analyze(file_data["data"]["audio"]["transcript"])

In [5]:
file_data

{'modality': 'video',
 'data': {'audio': {'transcript': ' Why do all your coffee mugs have numbers on the bottom?',
   'segments': [{'start': 0.0,
     'end': 2.0,
     'text': ' Why do all your coffee mugs have numbers on the bottom?'}],
   'audio_chunks': [array([0.00650966, 0.01585733, 0.01330644, ..., 0.001369  , 0.00129954,
           0.00085882], dtype=float32)],
   'raw_audio': array([ 0.00650966,  0.01585733,  0.01330644, ...,  0.00072808,
          -0.000457  , -0.00041247], dtype=float32)},
  'frames': [<PIL.Image.Image image mode=RGB size=92x92>]},
 'meta': {'audio_duration_s': 2.25,
  'num_frames': 1,
  'video_duration_s': 2.25,
  'fps': 23.976023976023978}}