Connect to the Jupyter kernel on the KCL CREATE HPC cluster.

Then `cd` into the Tutorials directory, which is where the data is stored.

In [10]:
%cd cmu-mosei-experiments/CMU-MultimodalSDK-Tutorials/

/cephfs/volumes/hpc_data_usr/k24083007/2070c87e-fe07-4f03-a6c4-cae0de8ce617/cmu-mosei-experiments/CMU-MultimodalSDK-Tutorials


In [11]:
import mmsdk
import os
import re
import numpy as np
from mmsdk import mmdatasdk as md

import sys
import requests
from constants.paths import SDK_PATH, DATA_PATH

In [12]:
!nvidia-smi

Thu Jun 26 19:26:59 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  | 00000000:B1:00.0 Off |                    0 |
| N/A   32C    P0              50W / 400W |      0MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [22]:
# Ensure SDK is in path. Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
if SDK_PATH not in sys.path:
    sys.path.append(SDK_PATH)

# Always define data_files, so cells that reference it later do not hit a
# NameError when DATA_PATH is missing.
data_files = []

# Make sure DATA_PATH exists
if not os.path.exists(DATA_PATH):
    print(f"Error: DATA_PATH does not exist: {DATA_PATH}")
    print("Please modify DATA_PATH to point to your .csd files directory")
    # os.makedirs(DATA_PATH, exist_ok=True)
else:
    data_files = os.listdir(DATA_PATH)
    print("Available data files:")
    print('\n'.join(data_files))

Available data files:
CMU_MOSEI_COVAREP.csd
CMU_MOSEI_TimestampedWords.csd
CMU_MOSEI_OpenFace2.csd
CMU_MOSEI_VisualFacet42.csd
CMU_MOSEI_TimestampedWordVectors.csd
CMU_MOSEI_TimestampedPhones.csd
CMU_MOSEI_Labels.csd


### Viewing the features
Loading the multimodal dataset

In [24]:
# Names of the computational sequences used for each modality
text_field = 'CMU_MOSEI_TimestampedWords'
visual_field = 'CMU_MOSEI_VisualFacet42'
acoustic_field = 'CMU_MOSEI_COVAREP'

# Define the features to load
features = [
    text_field,
    visual_field,
    acoustic_field
]

# Map each sequence name to its .csd file inside DATA_PATH
recipe = {feat: os.path.join(DATA_PATH, feat + '.csd') for feat in features}

# Load the dataset
try:
    dataset = md.mmdataset(recipe)
except Exception as e:
    print(f"Error loading dataset: {e}")
    # List the directory here rather than relying on a variable from another
    # cell, which may not exist if that cell failed or was skipped.
    available_files = os.listdir(DATA_PATH) if os.path.isdir(DATA_PATH) else []
    print("Available files:", available_files)
    # Re-raise: continuing would leave `dataset` undefined and make every
    # later cell fail with an unrelated NameError.
    raise
else:
    print("Dataset loaded successfully!")

[92m[1m[2025-06-26 18:47:39.518] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedWords.csd ...
[94m[1m[2025-06-26 18:47:39.589] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2025-06-26 18:47:39.589] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                                  

[92m[1m[2025-06-26 18:47:40.561] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2025-06-26 18:47:40.561] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[92m[1m[2025-06-26 18:47:40.625] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_VisualFacet42.csd ...
[94m[1m[2025-06-26 18:48:20.628] | Status  | [0mChecking the integrity of the <FACET 4.2> computational sequence ...
[94m[1m[2025-06-26 18:48:20.628] | Status  | [0mChecking the format of the data in <FACET 4.2> computational sequence ...


                                                                                  

[92m[1m[2025-06-26 18:48:21.692] | Success | [0m<FACET 4.2> computational sequence data in correct format.
[94m[1m[2025-06-26 18:48:21.692] | Status  | [0mChecking the format of the metadata in <FACET 4.2> computational sequence ...
[92m[1m[2025-06-26 18:48:21.753] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_COVAREP.csd ...
[94m[1m[2025-06-26 18:51:27.417] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2025-06-26 18:51:27.417] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                                 

[92m[1m[2025-06-26 18:51:59.119] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2025-06-26 18:51:59.120] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2025-06-26 18:51:59.120] | Success | [0mDataset initialized successfully ... 
Dataset loaded successfully!




In [26]:
# Inspect the loaded dataset: top-level keys, video IDs, and one sample video
separator = "=" * 80
print("Dataset keys:", list(dataset.keys()))
print(separator)

# Video IDs are read from the visual modality
video_ids = list(dataset[visual_field].keys())
print(f"Number of videos: {len(video_ids)}")
print("First 10 video IDs:", video_ids[:10])
print(separator)

# Use the entry at index 15 when there are enough videos, otherwise the first
some_id = video_ids[0] if len(video_ids) <= 15 else video_ids[15]
print(f"Examining video: {some_id}")
visual_entry = dataset[visual_field][some_id]
print("Available data for this video:", list(visual_entry.keys()))
print(separator)

# Array shapes of each modality for the sample video
print("Visual features shape:", visual_entry['features'].shape)
print("Visual intervals shape:", visual_entry['intervals'].shape)
text_features = dataset[text_field][some_id]['features']
print("Text features shape:", text_features.shape)
acoustic_features = dataset[acoustic_field][some_id]['features']
print("Acoustic features shape:", acoustic_features.shape)

Dataset keys: ['CMU_MOSEI_TimestampedWords', 'CMU_MOSEI_VisualFacet42', 'CMU_MOSEI_COVAREP']
Number of videos: 3837
First 10 video IDs: ['--qXJuDtHPw', '-3g5yACwYnA', '-3nNcZdcdvU', '-571d8cVauQ', '-6rXp3zJ3kc', '-9YyBTjo1zo', '-9y-fZ3swSY', '-AUZQgSxyPQ', '-Alixo7euuU', '-Eqdz5y4pEY']
Examining video: -IqSFQePnpU
Available data for this video: ['features', 'intervals']
Visual features shape: (3658, 35)
Visual intervals shape: (3658, 2)
Text features shape: (321, 1)
Acoustic features shape: (12209, 74)


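The modalities have different numbers of time steps for the same video (3658 visual, 321 text and 12209 acoustic rows above), so they must be aligned before they can be used together.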

In [27]:
# Align the modalities (following the tutorial approach)
def avg(intervals: np.ndarray, features: np.ndarray) -> np.ndarray:
    """Collapse a block of feature rows into one row by averaging over axis 0.

    Args:
        intervals: Accepted for signature compatibility with the collapse
            function interface; never read by this function.
        features: Array whose first axis is averaged away.

    Returns:
        ``np.average(features, axis=0)``. If averaging raises (for example
        because the array holds non-numeric data), ``features`` is returned
        unchanged.
    """
    try:
        return np.average(features, axis=0)
    except Exception:
        # Best-effort fallback for data that cannot be averaged. Catching
        # Exception rather than using a bare `except:` lets KeyboardInterrupt
        # and SystemExit propagate, so a long alignment can still be stopped.
        return features

# Align every sequence to the word-level text sequence, using `avg` as the
# collapse function
collapse_fns = [avg]
print("Aligning modalities to text...")
dataset.align(text_field, collapse_functions=collapse_fns)
print("Alignment completed!")

Aligning modalities to text...
[94m[1m[2025-06-26 18:54:03.875] | Status  | [0mUnify was called ...
[92m[1m[2025-06-26 18:54:03.881] | Success | [0mUnify completed ...
[94m[1m[2025-06-26 18:54:03.881] | Status  | [0mPre-alignment based on <CMU_MOSEI_TimestampedWords> computational sequence started ...
[94m[1m[2025-06-26 18:54:19.987] | Status  | [0mPre-alignment done for <CMU_MOSEI_VisualFacet42> ...
[94m[1m[2025-06-26 18:56:20.207] | Status  | [0mPre-alignment done for <CMU_MOSEI_COVAREP> ...
[94m[1m[2025-06-26 18:56:20.878] | Status  | [0mAlignment starting ...


                                                                                                   

[92m[1m[2025-06-26 19:31:02.359] | Success | [0mAlignment to <CMU_MOSEI_TimestampedWords> complete.
[94m[1m[2025-06-26 19:31:02.359] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2025-06-26 19:31:02.444] | Success | [0mInitialized empty <CMU_MOSEI_TimestampedWords> computational sequence.
[94m[1m[2025-06-26 19:31:02.445] | Status  | [0mChecking the format of the data in <CMU_MOSEI_TimestampedWords> computational sequence ...


                                                                                          

[92m[1m[2025-06-26 19:31:04.335] | Success | [0m<CMU_MOSEI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2025-06-26 19:31:04.366] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_TimestampedWords> computational sequence ...
[92m[1m[2025-06-26 19:31:04.372] | Success | [0mInitialized empty <CMU_MOSEI_VisualFacet42> computational sequence.
[94m[1m[2025-06-26 19:31:04.434] | Status  | [0mChecking the format of the data in <CMU_MOSEI_VisualFacet42> computational sequence ...


                                                                                          

[92m[1m[2025-06-26 19:31:06.034] | Success | [0m<CMU_MOSEI_VisualFacet42> computational sequence data in correct format.
[94m[1m[2025-06-26 19:31:06.034] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_VisualFacet42> computational sequence ...
[92m[1m[2025-06-26 19:31:06.034] | Success | [0mInitialized empty <CMU_MOSEI_COVAREP> computational sequence.
[94m[1m[2025-06-26 19:31:06.034] | Status  | [0mChecking the format of the data in <CMU_MOSEI_COVAREP> computational sequence ...


                                                                                          

[92m[1m[2025-06-26 19:31:07.413] | Success | [0m<CMU_MOSEI_COVAREP> computational sequence data in correct format.
[94m[1m[2025-06-26 19:31:07.414] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_COVAREP> computational sequence ...
Alignment completed!


In [None]:
# Add labels and align to them
label_field = 'CMU_MOSEI_Labels'

# Add labels to the dataset
label_recipe = {label_field: os.path.join(DATA_PATH, label_field + '.csd')}
dataset.add_computational_sequences(label_recipe, destination=None)

# Align to labels to get labeled segments
dataset.align(label_field)

# Check the new keys format
new_keys = list(dataset[text_field].keys())
if new_keys:
    # Show the key at index 55 when there are enough segments, otherwise the
    # first one, so a small subset of the data does not raise IndexError.
    example_key = new_keys[55] if len(new_keys) > 55 else new_keys[0]
    print(f"After alignment, keys changed to format: {example_key}")
else:
    print("After alignment, no segments are available.")
print(f"Total number of segments: {len(new_keys)}")

In [None]:
# Summarise the aligned dataset: segment counts, then a few sample segments
print("=== DATA ANALYSIS ===")

# Number of segment keys held by each sequence, in text/visual/acoustic/label order
text_segments, visual_segments, acoustic_segments, label_segments = (
    len(list(dataset[field].keys()))
    for field in (text_field, visual_field, acoustic_field, label_field)
)

print(f"Text segments: {text_segments}")
print(f"Visual segments: {visual_segments}")
print(f"Acoustic segments: {acoustic_segments}")
print(f"Label segments: {label_segments}")

# Look at the first five label segments and report the feature shapes
label_keys = list(dataset[label_field].keys())
sample_segments = label_keys[:5]
for segment in sample_segments:
    try:
        text_shape, visual_shape, acoustic_shape, label_shape = (
            dataset[field][segment]['features'].shape
            for field in (text_field, visual_field, acoustic_field, label_field)
        )

        print(f"\nSegment: {segment}")
        print(f"  Text: {text_shape}, Visual: {visual_shape}, Acoustic: {acoustic_shape}, Label: {label_shape}")
        print(f"  Label value: {dataset[label_field][segment]['features']}")
    except KeyError as e:
        # A segment missing from one of the sequences is reported, not fatal
        print(f"Missing data for segment {segment}: {e}")

## Directory Structure

The project directory is laid out as follows:

```
cmu-mosei-experiments/
├── .ipynb_checkpoints/
├── CMU-MultimodalSDK/
├── CMU-MultimodalSDK-Tutorials/
│   ├── .ipynb_checkpoints/
│   ├── constants/
│   └── data/
│       ├── CMU_MOSEI_COVAREP.csd
│       ├── CMU_MOSEI_Labels.csd
│       ├── CMU_MOSEI_OpenFace2.csd
│       ├── CMU_MOSEI_TimestampedPhones.csd
│       ├── CMU_MOSEI_TimestampedWords.csd
│       ├── CMU_MOSEI_TimestampedWordVectors.csd
│       └── CMU_MOSEI_VisualFacet42.csd
├── debug_mmsdk.py
├── README.md
├── text_LSTM.py
└── tutorial_interactive.ipynb
```

### Description of Key Components:

- **data/**: Contains the CMU-MOSEI dataset files in `.csd` format
  - `CMU_MOSEI_COVAREP.csd`: Acoustic features
  - `CMU_MOSEI_Labels.csd`: Emotion labels and annotations
  - `CMU_MOSEI_OpenFace2.csd`: Visual facial features
  - `CMU_MOSEI_TimestampedPhones.csd`: Phone-level timestamps
  - `CMU_MOSEI_TimestampedWords.csd`: Word-level text features
  - `CMU_MOSEI_TimestampedWordVectors.csd`: Word vector embeddings
  - `CMU_MOSEI_VisualFacet42.csd`: Visual features from FACET 4.2 (35 values per frame in the output above)

- **constants/**: Configuration and path constants
- **CMU-MultimodalSDK/**: Core SDK for multimodal data processing
- **CMU-MultimodalSDK-Tutorials/**: Tutorial notebooks and examples