In [1]:
import os
import sys

import numpy as np
import requests

from constants import SDK_PATH, DATA_PATH, WORD_EMB_PATH, CACHE_PATH

# The SDK location has to be on sys.path *before* mmsdk is imported; appending it
# after the import (as was done previously) cannot influence that import.
if SDK_PATH is None:
    # Abort with a non-zero status: a missing SDK path is a configuration error, not a success.
    sys.exit("SDK path is not specified! Please specify first in constants/paths.py")
if SDK_PATH not in sys.path:
    # guard keeps sys.path free of duplicates when the cell is re-run
    sys.path.append(SDK_PATH)

from mmsdk import mmdatasdk as md

# Create folder for storing data if it doesn't exist
# (exist_ok=True already makes this a no-op when the directory is present)
os.makedirs(DATA_PATH, exist_ok=True)

# Helper function to download a file from a URL
def download_file(url, dest, timeout=30):
    """Stream the resource at ``url`` into the file ``dest``.

    The payload is first written to ``dest + '.part'`` and only renamed to
    ``dest`` once the whole body has been received, so an interrupted download
    never leaves a truncated file at ``dest`` (which a later "file already
    exists" check would mistake for a finished download).

    Args:
        url: Address to download from.
        dest: Path of the file to create.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the notebook forever.

    Returns:
        None. Network/HTTP failures (``requests.exceptions.RequestException``)
        are reported with ``print`` and not re-raised; other errors such as
        ``OSError`` from writing the file propagate to the caller.
    """
    partial_path = f"{dest}.part"
    try:
        print(f"Downloading from {url}...")
        # context manager closes the streamed response and releases the connection
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # publish the file only after the body was written completely
        os.replace(partial_path, dest)
        print(f"Downloaded: {dest}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
    finally:
        # after a successful os.replace the partial file no longer exists; on any
        # failure this removes the incomplete data
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Function to get the filename from the URL (last segment)
def get_filename_from_url(url):
    """Return the file name found in the last path segment of ``url``.

    Only the path component of the URL is considered, so a query string
    (``?download=1``) or fragment (``#top``) does not end up in the file name,
    and a trailing slash does not produce an empty name.

    Args:
        url: URL (or plain slash-separated path) as a string.

    Returns:
        The last non-empty path segment, or ``''`` when the URL has no path.
    """
    import posixpath
    from urllib.parse import urlsplit

    # URL paths always use '/', hence posixpath rather than os.path
    url_path = urlsplit(url).path
    return posixpath.basename(url_path.rstrip('/'))

# Download function that iterates over dataset features
def download_dataset_features(feature_dict, feature_type):
    """Fetch every feature file listed in ``feature_dict`` into ``DATA_PATH``.

    Args:
        feature_dict: Mapping of feature name -> download URL.
        feature_type: Label used only in the progress messages.

    The local file name is taken from the URL (not from the dictionary key).
    Files that already exist under ``DATA_PATH`` are not downloaded again.
    """
    for feature_name, url in feature_dict.items():
        print(f"\nProcessing {feature_type} feature: {feature_name}")

        dest_path = os.path.join(DATA_PATH, get_filename_from_url(url))

        # nothing to do when the target file is already present
        if os.path.exists(dest_path):
            print(f"{feature_type} feature '{feature_name}' already downloaded at {dest_path}")
            continue

        download_file(url, dest_path)

# Dataset initialization
DATASET = md.cmu_mosei

# Process high-level, raw, and label features.
# Each entry is (attribute on DATASET holding the name -> URL mapping,
#                feature type shown per file, wording used in the status messages).
# One loop replaces three copy-pasted try/except blocks; the printed messages are unchanged.
for attribute_name, feature_type, description in (
    ("highlevel", "High-level", "high-level features"),
    ("raw", "Raw", "raw features"),
    ("labels", "Label", "labels"),
):
    try:
        print(f"\nStarting download of {description}...")
        # attribute lookup stays inside the try so a missing attribute is reported like any other failure
        download_dataset_features(getattr(DATASET, attribute_name), feature_type)
    except Exception as e:
        # best effort: report the failure and continue with the next feature group
        print(f"Error downloading {description}: {e}")


Starting download of high-level features...

Processing High-level feature: glove_vectors
High-level feature 'glove_vectors' already downloaded at ./data/CMU_MOSEI_TimestampedWordVectors.csd

Processing High-level feature: COVAREP
High-level feature 'COVAREP' already downloaded at ./data/CMU_MOSEI_COVAREP.csd

Processing High-level feature: OpenFace_2
High-level feature 'OpenFace_2' already downloaded at ./data/CMU_MOSEI_VisualOpenFace2.csd

Processing High-level feature: FACET 4.2
High-level feature 'FACET 4.2' already downloaded at ./data/CMU_MOSEI_VisualFacet42.csd

Starting download of raw features...

Processing Raw feature: words
Raw feature 'words' already downloaded at ./data/CMU_MOSEI_TimestampedWords.csd

Processing Raw feature: phones
Raw feature 'phones' already downloaded at ./data/CMU_MOSEI_TimestampedPhones.csd

Starting download of labels...

Processing Label feature: All Labels
Label feature 'All Labels' already downloaded at ./data/CMU_MOSEI_Labels.csd


In [2]:
# show which feature files are present in the data directory, one name per line
data_files = os.listdir(DATA_PATH)
print(*data_files, sep='\n')

CMU_MOSEI_COVAREP.csd
CMU_MOSEI_Labels.csd
CMU_MOSEI_TimestampedPhones.csd
CMU_MOSEI_TimestampedWords.csd
CMU_MOSEI_TimestampedWordVectors.csd
CMU_MOSEI_VisualFacet42.csd
CMU_MOSEI_VisualOpenFace2.csd


We have multiple files which can be broadly classified into three categories, highlevel, raw and labels. 

<strong>Highlevel</strong> contains the extracted features for each modality (e.g., OpenFace facial landmarks, openSMILE acoustic features), while <strong>raw</strong> contains the raw transcripts and phonemes.

We have multiple files with the .csd extension. This stands for <strong>computational sequences</strong>, which is the underlying data structure for all features in the SDK. 

<strong> Highlevel features: </strong>
- CMU_MOSEI_VisualFacet42.csd (Video modality)
- CMU_MOSEI_VisualOpenFace2.csd (Video Modality)
- CMU_MOSEI_COVAREP.csd (Audio Modality)
- CMU_MOSEI_TimestampedWordVectors.csd (Text Modality)

## Loading the data

In [3]:
# names of the computational sequences used for each modality
visual_field = 'CMU_MOSEI_VisualFacet42'
acoustic_field = 'CMU_MOSEI_COVAREP'
text_field = 'CMU_MOSEI_TimestampedWordVectors'

features = [text_field, visual_field, acoustic_field]

# the SDK expects a {feature_name: path_to_csd_file} mapping
recipe = dict(
    (name, os.path.join(DATA_PATH, name) + '.csd')
    for name in features
)
dataset = md.mmdataset(recipe)

[92m[1m[2024-10-31 13:31:01.470] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedWordVectors.csd ...
[94m[1m[2024-10-31 13:31:01.577] | Status  | [0mChecking the integrity of the <glove_vectors> computational sequence ...
[94m[1m[2024-10-31 13:31:01.577] | Status  | [0mChecking the format of the data in <glove_vectors> computational sequence ...


                                                                                  

[92m[1m[2024-10-31 13:31:02.695] | Success | [0m<glove_vectors> computational sequence data in correct format.
[94m[1m[2024-10-31 13:31:02.695] | Status  | [0mChecking the format of the metadata in <glove_vectors> computational sequence ...
[92m[1m[2024-10-31 13:31:02.696] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_VisualFacet42.csd ...
[94m[1m[2024-10-31 13:31:02.775] | Status  | [0mChecking the integrity of the <FACET 4.2> computational sequence ...
[94m[1m[2024-10-31 13:31:02.775] | Status  | [0mChecking the format of the data in <FACET 4.2> computational sequence ...


                                                                                  

[92m[1m[2024-10-31 13:31:03.897] | Success | [0m<FACET 4.2> computational sequence data in correct format.
[94m[1m[2024-10-31 13:31:03.897] | Status  | [0mChecking the format of the metadata in <FACET 4.2> computational sequence ...
[92m[1m[2024-10-31 13:31:03.898] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_COVAREP.csd ...
[94m[1m[2024-10-31 13:31:03.968] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2024-10-31 13:31:03.968] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                                  

[92m[1m[2024-10-31 13:31:05.092] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2024-10-31 13:31:05.092] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2024-10-31 13:31:05.093] | Success | [0mDataset initialized successfully ... 




To load the dataset, we need to tell the SDK which features we need and where they are stored. Thus, we construct a dictionary of the form {feature_name: csd_path} and pass it to the mmdataset object in the SDK.

From the highlevel features, VisualFacet is used for the video modality, since this file stores facial expression data extracted using the FACET tool. FACET analyzes microexpressions, including movements of facial muscles (like eyebrow raises or smiles) and emotional states (e.g., joy, anger). This makes it more suitable for emotion detection than OpenFace, which instead
tracks facial landmarks, head poses, and eye gaze, offering detailed spatial and motion-related facial features across video frames.

COVAREP is used for audio-related features, and TimestampedWordVectors provides pre-trained GloVe embeddings, which capture semantic relationships and contextual meaning between words.

In [4]:
separator = "=" * 80

# top-level keys of the dataset
print(list(dataset.keys()))
print(separator)

# first ten keys of the visual computational sequence
visual_keys = list(dataset[visual_field].keys())
print(visual_keys[:10])
print(separator)

# pick one entry and look at what it stores
some_id = visual_keys[15]
visual_entry = dataset[visual_field][some_id]
print(list(visual_entry.keys()))
print(separator)

print(list(visual_entry['intervals'].shape))
print(separator)

# feature shapes of the same entry in every modality
for modality_field in (visual_field, text_field, acoustic_field):
    print(list(dataset[modality_field][some_id]['features'].shape))
print("Different modalities have different number of time steps!")

['CMU_MOSEI_TimestampedWordVectors', 'CMU_MOSEI_VisualFacet42', 'CMU_MOSEI_COVAREP']
['--qXJuDtHPw', '-3g5yACwYnA', '-3nNcZdcdvU', '-571d8cVauQ', '-6rXp3zJ3kc', '-9YyBTjo1zo', '-9y-fZ3swSY', '-AUZQgSxyPQ', '-Alixo7euuU', '-Eqdz5y4pEY']
['features', 'intervals']
[3658, 2]
[3658, 35]
[321, 300]
[12209, 74]
Different modalities have different number of time steps!


In [5]:
# we define a simple averaging function that does not depend on intervals
def avg(intervals: np.ndarray, features: np.ndarray) -> np.ndarray:
    """Collapse ``features`` to their average along the first axis.

    Args:
        intervals: Unused; accepted so the function has the
            ``(intervals, features)`` signature it is called with.
        features: Array to average over axis 0.

    Returns:
        ``np.average(features, axis=0)``; if averaging raises (for example
        because the array holds non-numeric data), ``features`` is returned
        unchanged.
    """
    try:
        return np.average(features, axis=0)
    except Exception:
        # Deliberate best effort: hand back the input when it cannot be averaged.
        # `Exception` (instead of a bare `except`) lets KeyboardInterrupt and
        # SystemExit through, so a long-running alignment can still be interrupted.
        return features

# First we align all sequences to the text sequence, using averaging to collapse features.
# `collapse_functions` takes a list of functions; only `avg` (defined above) is passed here.
# NOTE: this is slow - in the recorded output below the call ran from 13:31 to 15:25.
dataset.align(text_field, collapse_functions=[avg])

[94m[1m[2024-10-31 13:31:39.088] | Status  | [0mUnify was called ...
[92m[1m[2024-10-31 13:31:39.094] | Success | [0mUnify completed ...
[94m[1m[2024-10-31 13:31:39.094] | Status  | [0mPre-alignment based on <CMU_MOSEI_TimestampedWordVectors> computational sequence started ...
[94m[1m[2024-10-31 13:38:03.268] | Status  | [0mPre-alignment done for <CMU_MOSEI_COVAREP> ...
[94m[1m[2024-10-31 13:40:58.221] | Status  | [0mPre-alignment done for <CMU_MOSEI_VisualFacet42> ...
[94m[1m[2024-10-31 13:43:30.298] | Status  | [0mAlignment starting ...


                                                                                                      

[92m[1m[2024-10-31 15:25:37.358] | Success | [0mAlignment to <CMU_MOSEI_TimestampedWordVectors> complete.
[94m[1m[2024-10-31 15:25:37.359] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-10-31 15:25:41.778] | Success | [0mInitialized empty <CMU_MOSEI_TimestampedWordVectors> computational sequence.
[94m[1m[2024-10-31 15:25:41.779] | Status  | [0mChecking the format of the data in <CMU_MOSEI_TimestampedWordVectors> computational sequence ...


                                                                                          

[92m[1m[2024-10-31 15:25:46.417] | Success | [0m<CMU_MOSEI_TimestampedWordVectors> computational sequence data in correct format.
[94m[1m[2024-10-31 15:25:46.417] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_TimestampedWordVectors> computational sequence ...
[92m[1m[2024-10-31 15:25:46.417] | Success | [0mInitialized empty <CMU_MOSEI_VisualFacet42> computational sequence.
[94m[1m[2024-10-31 15:25:46.417] | Status  | [0mChecking the format of the data in <CMU_MOSEI_VisualFacet42> computational sequence ...


                                                                                          

[92m[1m[2024-10-31 15:25:47.704] | Success | [0m<CMU_MOSEI_VisualFacet42> computational sequence data in correct format.
[94m[1m[2024-10-31 15:25:47.704] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_VisualFacet42> computational sequence ...
[92m[1m[2024-10-31 15:25:47.704] | Success | [0mInitialized empty <CMU_MOSEI_COVAREP> computational sequence.
[94m[1m[2024-10-31 15:25:47.704] | Status  | [0mChecking the format of the data in <CMU_MOSEI_COVAREP> computational sequence ...


                                                                                          

[92m[1m[2024-10-31 15:25:48.884] | Success | [0m<CMU_MOSEI_COVAREP> computational sequence data in correct format.
[94m[1m[2024-10-31 15:25:48.886] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_COVAREP> computational sequence ...


In [6]:
label_field = 'CMU_MOSEI_Labels'

# we add the label sequence and align to the labels to obtain labeled segments
# this time no collapse functions are passed, so that the temporal sequences are preserved
label_csd_path = os.path.join(DATA_PATH, label_field + '.csd')
label_recipe = {label_field: label_csd_path}
dataset.add_computational_sequences(label_recipe, destination=None)
dataset.align(label_field)

[92m[1m[2024-10-31 15:49:20.385] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_Labels.csd ...
[94m[1m[2024-10-31 15:49:21.093] | Status  | [0mChecking the integrity of the <All Labels> computational sequence ...
[94m[1m[2024-10-31 15:49:21.094] | Status  | [0mChecking the format of the data in <All Labels> computational sequence ...


                                                                                  

[92m[1m[2024-10-31 15:49:22.615] | Success | [0m<All Labels> computational sequence data in correct format.
[94m[1m[2024-10-31 15:49:22.615] | Status  | [0mChecking the format of the metadata in <All Labels> computational sequence ...
[94m[1m[2024-10-31 15:49:22.615] | Status  | [0mUnify was called ...
[92m[1m[2024-10-31 15:53:12.155] | Success | [0mUnify completed ...
[94m[1m[2024-10-31 15:53:12.195] | Status  | [0mPre-alignment based on <CMU_MOSEI_Labels> computational sequence started ...
[94m[1m[2024-10-31 15:53:22.411] | Status  | [0mPre-alignment done for <CMU_MOSEI_TimestampedWordVectors> ...
[94m[1m[2024-10-31 15:53:26.385] | Status  | [0mPre-alignment done for <CMU_MOSEI_COVAREP> ...
[94m[1m[2024-10-31 15:53:29.025] | Status  | [0mPre-alignment done for <CMU_MOSEI_VisualFacet42> ...
[94m[1m[2024-10-31 15:53:29.093] | Status  | [0mAlignment starting ...


                                                                                                   

[92m[1m[2024-10-31 15:54:29.433] | Success | [0mAlignment to <CMU_MOSEI_Labels> complete.
[94m[1m[2024-10-31 15:54:29.433] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-10-31 15:54:37.325] | Success | [0mInitialized empty <CMU_MOSEI_TimestampedWordVectors> computational sequence.
[94m[1m[2024-10-31 15:54:37.325] | Status  | [0mChecking the format of the data in <CMU_MOSEI_TimestampedWordVectors> computational sequence ...


                                                                      

[92m[1m[2024-10-31 15:54:37.356] | Success | [0m<CMU_MOSEI_TimestampedWordVectors> computational sequence data in correct format.
[94m[1m[2024-10-31 15:54:37.356] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_TimestampedWordVectors> computational sequence ...
[92m[1m[2024-10-31 15:54:37.356] | Success | [0mInitialized empty <CMU_MOSEI_VisualFacet42> computational sequence.
[94m[1m[2024-10-31 15:54:37.356] | Status  | [0mChecking the format of the data in <CMU_MOSEI_VisualFacet42> computational sequence ...


                                                                      

[92m[1m[2024-10-31 15:54:37.385] | Success | [0m<CMU_MOSEI_VisualFacet42> computational sequence data in correct format.
[94m[1m[2024-10-31 15:54:37.385] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_VisualFacet42> computational sequence ...
[92m[1m[2024-10-31 15:54:37.385] | Success | [0mInitialized empty <CMU_MOSEI_COVAREP> computational sequence.
[94m[1m[2024-10-31 15:54:37.385] | Status  | [0mChecking the format of the data in <CMU_MOSEI_COVAREP> computational sequence ...


                                                                      

[92m[1m[2024-10-31 15:54:37.417] | Success | [0m<CMU_MOSEI_COVAREP> computational sequence data in correct format.
[94m[1m[2024-10-31 15:54:37.417] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_COVAREP> computational sequence ...
[92m[1m[2024-10-31 15:54:37.417] | Success | [0mInitialized empty <CMU_MOSEI_Labels> computational sequence.
[94m[1m[2024-10-31 15:54:37.417] | Status  | [0mChecking the format of the data in <CMU_MOSEI_Labels> computational sequence ...


                                                                      

[92m[1m[2024-10-31 15:54:37.449] | Success | [0m<CMU_MOSEI_Labels> computational sequence data in correct format.
[94m[1m[2024-10-31 15:54:37.449] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_Labels> computational sequence ...


In [12]:
# Show the first few keys to see what the segmentation produced
segment_keys = list(dataset[label_field].keys())
print(segment_keys[:10])

# Use the first segment key for a closer look
some_segmented_key = segment_keys[0]

# Shapes of the label entry for this segment
label_entry = dataset[label_field][some_segmented_key]
print("Label intervals:", label_entry['intervals'].shape)
print("Label features:", label_entry['features'].shape)

# Shapes of the aligned features of every modality for the same segment
for title, modality_field in (
    ("Text features:", text_field),
    ("Visual features:", visual_field),
    ("Acoustic features:", acoustic_field),
):
    print(title, dataset[modality_field][some_segmented_key]['features'].shape)


['--qXJuDtHPw[0]', '-3g5yACwYnA[0]', '-3g5yACwYnA[1]', '-3g5yACwYnA[2]', '-3g5yACwYnA[3]', '-3g5yACwYnA[4]', '-3g5yACwYnA[5]', '-3nNcZdcdvU[0]', '-3nNcZdcdvU[1]', '-3nNcZdcdvU[2]']
Label intervals: (1, 2)
Label features: (1, 7)
Text features: (22, 300)
Visual features: (22, 35)
Acoustic features: (22, 74)


In [14]:
# look at one of the text-sequence keys to see the key format after alignment
text_segment_keys = list(dataset[text_field].keys())
print(text_segment_keys[55])

-HwX2H8Z4hY[4]


## Train Test Split

In [15]:
# obtain the train/dev/test splits - these splits are based on video IDs
standard_folds = DATASET.standard_folds
train_split = standard_folds.standard_train_fold
dev_split = standard_folds.standard_valid_fold
test_split = standard_folds.standard_test_fold

# inspect the splits: they only contain video IDs
print(test_split)

['7l3BNtSE0xc', 'dZFV0lyedX4', '286943', '126872', 'qgC8_emxSIU', 'kld9r0iFkWM', 'rC29Qub0U7A', '4YfyP0uIqw0', 'FMenDv3y8jc', '4wLP4elp1uM', 'KYQTwFVBzME', '27v7Blr0vjw', 'DnBHq5I52LM', 'HR18U0yAlTc', 'x266rUJQC_8', 'd1CDP6sMuLA', 'xSCvspXYU9k', '4EDblUpJieU', '4o4ilPK9rl8', '53609', 'SZ7HK5ns6mE', '243981', 'ySblgk7T7eQ', 'MYEyQUpMe3k', 'EujJ0SwiCRE', '3HyAaqre_Fk', 'iQDB_OkAQWs', 'gE7kUqMqQ9g', 'eFV7iFPYZB4', 'IRSxo_XXArg', '3hOlJf_JQDs', 'BRSyH6yfDLk', '1jogeKX0wGw', '3At-BKm9eYk', 'NVLPURuAVLU', 'pZye4zFzk3o', 'l1jW3OMXUzs', 'XKyumlBmix8', 'eKQKEi2-0Ws', 'WgI8IbJtXHw', 'tnWmVXZ87h0', 'YCEllKyaCrc', 'W1CWpktWtTs', '8wQhzezNcUY', '0bxhZ-LIfZY', 'lrjm6F3JJgg', 'Vdf1McvE9ao', 'eQc5uI7FKCU', '2QXHdu2zlQY', 'YCI-ZzclIPQ', '2Ky9DBSl49w', 'SKTyBOhDX6U', 'b86B3hP8ARM', '23656', 'kpS4BXif_Sw', 'dR68gbeOWOc', 'tC2KicUHB9Q', 'absh1hsZeF0', 'c5zxqITn3ZM', 'uogwnZGb-iE', '46495', 'Sq6DIhFxPqQ', 'PexNiFbPTYM', 'z441aDJvAcU', 'OORklkFql3k', 'WbtsuXkaGeg', 'grsV1YN1z5s', 'Gc_zIjqqUys', '424SXFTCFsA

In [None]:
# the segment keys are in the format 'video_id[segment_no]', but the splits are specified with video_id only,
# so a regular expression is used to recover the video ID from each key
import re

import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook
from collections import defaultdict

# epsilon added to the standard deviation before dividing; with 0, a division by a zero
# standard deviation yields nan/inf, which np.nan_to_num then replaces with a finite constant
EPS = 0

# construct a word2id mapping that automatically takes increment when new words are encountered
word2id = defaultdict(lambda: len(word2id))
UNK = word2id['<unk>']
PAD = word2id['<pad>']

# place holders for the final train/dev/test dataset
train = []
dev = []
test = []

# regular expression extracting the video ID out of the keys
# (raw string, so that `\[` reaches the regex engine instead of being an invalid string escape)
pattern = re.compile(r'(.*)\[.*\]')
num_drop = 0 # a counter to count how many data points went into some processing issues

# sets make the per-segment split lookup independent of the split size
train_ids = set(train_split)
dev_ids = set(dev_split)
test_ids = set(test_split)


def _z_normalize(features):
    """Z-normalize one segment per feature column and replace nan/inf by finite numbers."""
    # divide/invalid warnings are expected when EPS is 0 and a column has zero variance
    with np.errstate(divide='ignore', invalid='ignore'):
        normalized = (features - features.mean(0, keepdims=True)) / (EPS + np.std(features, axis=0, keepdims=True))
    return np.nan_to_num(normalized)


for segment in dataset[label_field].keys():

    # get the video ID and the features out of the aligned dataset
    match = pattern.search(segment)
    vid = match.group(1) if match else segment  # fall back to the raw key if it has no '[n]' suffix
    label = dataset[label_field][segment]['features']
    _words = np.asarray(dataset[text_field][segment]['features'])
    _visual = dataset[visual_field][segment]['features']
    _acoustic = dataset[acoustic_field][segment]['features']

    # if the sequences are not same length after alignment, there must be some problem with some modalities
    # we should drop it or inspect the data again
    if not _words.shape[0] == _visual.shape[0] == _acoustic.shape[0]:
        print(f"Encountered datapoint {vid} with text shape {_words.shape}, visual shape {_visual.shape}, acoustic shape {_acoustic.shape}")
        num_drop += 1
        continue

    # remove nan values
    label = np.nan_to_num(label)
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)

    if np.issubdtype(_words.dtype, np.number):
        # The text sequence holds numeric word vectors (this is the case for
        # 'CMU_MOSEI_TimestampedWordVectors'), so there are no tokens to look up in
        # word2id and the rows are used directly as the text features.
        # NOTE(review): speech pauses cannot be identified without the tokens; to filter
        # them, align with the raw words file (CMU_MOSEI_TimestampedWords) instead - confirm.
        words = np.nan_to_num(_words)
        visual = _visual
        acoustic = _acoustic
    else:
        # remove speech pause tokens - this is in general helpful
        # we should remove speech pauses and corresponding visual/acoustic features together
        # otherwise modalities would no longer be aligned
        words = []
        visual = []
        acoustic = []
        for i, word in enumerate(_words):
            token = word[0]
            if isinstance(token, bytes):
                token = token.decode('utf-8')  # SDK stores strings as bytes, decode into strings here
            if token != 'sp':
                words.append(word2id[token])
                visual.append(_visual[i, :])
                acoustic.append(_acoustic[i, :])

        words = np.asarray(words)
        visual = np.asarray(visual)
        acoustic = np.asarray(acoustic)

    # z-normalization per instance and remove nan/infs
    visual = _z_normalize(visual)
    acoustic = _z_normalize(acoustic)

    if vid in train_ids:
        train.append(((words, visual, acoustic), label, segment))
    elif vid in dev_ids:
        dev.append(((words, visual, acoustic), label, segment))
    elif vid in test_ids:
        test.append(((words, visual, acoustic), label, segment))
    else:
        print(f"Found video that doesn't belong to any splits: {vid}")

print(f"Total number of {num_drop} datapoints have been dropped.")

# turn off the word2id - define a named function here to allow for pickling
def return_unk():
    return UNK
word2id.default_factory = return_unk

KeyError: 0

In [20]:
# debugging aid: type of the first element of every row in `_words`
# (`_words` is whatever the preprocessing loop above assigned last)
first_element_types = [type(row[0]) for row in _words]
print("Data types in _words:", first_element_types)


Data types in _words: [<class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>, <class 'numpy.float64'>]
