In [64]:
from constants import SDK_PATH, DATA_PATH, WORD_EMB_PATH, CACHE_PATH
import sys
import os
import requests
import numpy as np
from mmsdk import mmdatasdk as md
from subprocess import check_call, CalledProcessError

# Append the SDK path
if SDK_PATH is None:
    print("SDK path is not specified! Please specify first in constants/paths.py")
    exit(0)
else:
    sys.path.append(SDK_PATH)

# Create folder for storing data if it doesn't exist
if not os.path.exists(DATA_PATH):
    os.makedirs(DATA_PATH, exist_ok=True)

# Helper function to download a file from a URL
def download_file(url, dest):
    try:
        print(f"Downloading from {url}...")
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded: {dest}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

# Function to get the filename from the URL (last segment)
def get_filename_from_url(url):
    return url.split('/')[-1]  # Extract the last part of the URL

# Download function that iterates over dataset features
def download_dataset_features(feature_dict, feature_type):
    for feature_name, url in feature_dict.items():
        print(f"\nProcessing {feature_type} feature: {feature_name}")
        
        # Use the filename derived from the URL instead of the dictionary key
        filename = get_filename_from_url(url)
        dest_path = os.path.join(DATA_PATH, filename)

        if not os.path.exists(dest_path):
            download_file(url, dest_path)
        else:
            print(f"{feature_type} feature '{feature_name}' already downloaded at {dest_path}")

# Dataset initialization
DATASET = md.cmu_mosei
SETUP = True
# Process high-level, raw, and label features
# try:
#     print("\nStarting download of high-level features...")
#     download_dataset_features(DATASET.highlevel, "High-level")
# except Exception as e:
#     print(f"Error downloading high-level features: {e}")

# try:
#     print("\nStarting download of raw features...")
#     download_dataset_features(DATASET.raw, "Raw")
# except Exception as e:
#     print(f"Error downloading raw features: {e}")

# try:
#     print("\nStarting download of labels...")
#     download_dataset_features(DATASET.labels, "Label")
# except Exception as e:
#     print(f"Error downloading labels: {e}")

In [4]:
# list the directory contents... let's see what features there are
data_files = os.listdir(DATA_PATH)
print('\n'.join(data_files))



CMU_MOSEI_COVAREP.csd
CMU_MOSEI_Labels.csd
CMU_MOSEI_TimestampedWords.csd
CMU_MOSEI_TimestampedWordVectors.csd
CMU_MOSEI_VisualFacet42.csd


We have multiple files which can be broadly classified into three categories, highlevel, raw and labels. 

<strong>Highlevel</strong> contains the extracted features for each modality (e.g OpenFace facial landmarks, openSMILE acoustic features) while <strong>raw</strong> contains the raw transctripts, phonemes. 

We have multiple files with the .csd extension. This stands for <strong>computational sequences</strong>, which is the underlying data structure for all features in the SDK. 

<strong> Highlevel features: </strong>
- CMU_MOSEI_VisualFacet42.csd (Video modality)
- CMU_MOSEI_VisualOpenFace2.csd (Video Modality)
- CMU_MOSEI_COVAREP.csd (Audio Modality)
- CMU_MOSEI_TimestampedWordVectors.csd (Text Modality)

## Loading the data

In [5]:
visual_field = 'CMU_MOSEI_VisualFacet42'
acoustic_field = 'CMU_MOSEI_COVAREP'
word_field = 'CMU_MOSEI_TimestampedWords'
text_field = 'CMU_MOSEI_TimestampedWordVectors'

features = [
    # text_field, 
    visual_field, 
    acoustic_field
]

raw_features = [word_field]

# recipe = {feat: os.path.join(DATA_PATH, feat) + '.csd' for feat in features}
# dataset = md.mmdataset(recipe)

recipe = {feat: os.path.join(DATA_PATH, feat) + '.csd' for feat in features}
recipe[word_field] = os.path.join(DATA_PATH, word_field) + '.csd'
print(recipe)
dataset = md.mmdataset(recipe)


{'CMU_MOSEI_VisualFacet42': './data/CMU_MOSEI_VisualFacet42.csd', 'CMU_MOSEI_COVAREP': './data/CMU_MOSEI_COVAREP.csd', 'CMU_MOSEI_TimestampedWords': './data/CMU_MOSEI_TimestampedWords.csd'}
[92m[1m[2024-11-04 14:24:35.659] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_VisualFacet42.csd ...
[94m[1m[2024-11-04 14:24:36.424] | Status  | [0mChecking the integrity of the <FACET 4.2> computational sequence ...
[94m[1m[2024-11-04 14:24:36.424] | Status  | [0mChecking the format of the data in <FACET 4.2> computational sequence ...


                                                                                  

[92m[1m[2024-11-04 14:24:38.993] | Success | [0m<FACET 4.2> computational sequence data in correct format.
[94m[1m[2024-11-04 14:24:38.993] | Status  | [0mChecking the format of the metadata in <FACET 4.2> computational sequence ...
[92m[1m[2024-11-04 14:24:39.003] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_COVAREP.csd ...
[94m[1m[2024-11-04 14:24:39.644] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2024-11-04 14:24:39.645] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                                  

[92m[1m[2024-11-04 14:24:41.791] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2024-11-04 14:24:41.791] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2024-11-04 14:24:41.800] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedWords.csd ...
[94m[1m[2024-11-04 14:24:42.617] | Status  | [0mChecking the integrity of the <words> computational sequence ...
[94m[1m[2024-11-04 14:24:42.617] | Status  | [0mChecking the format of the data in <words> computational sequence ...


                                                                                  

[92m[1m[2024-11-04 14:24:44.702] | Success | [0m<words> computational sequence data in correct format.
[94m[1m[2024-11-04 14:24:44.702] | Status  | [0mChecking the format of the metadata in <words> computational sequence ...
[92m[1m[2024-11-04 14:24:44.702] | Success | [0mDataset initialized successfully ... 




To load the dataset, we need to tell the SDK which features we need and where they exist. Thus, we construct a dictionary with format {feature_name: csd_path} and feed it to mmdataset object in the SDK.

From the highlevel features, VisualFacet is used for video modality since this file stores facial expression data extracted using the FACET tool. FACET analyzes microexpressions, including movements of facial muscles (like eyebrow raises or smiles) and emotional states (e.g., joy, anger). Thus, making it more suitable for emotion detection than openFace since it
tracks facial landmarks, head poses, and eye gaze, offering detailed spatial and motion-related facial features across video frames.

COVAREP is used for audio related features and TimeStampedWordVectors provides Pre-trained embeddings using GLoVe capture semantic relationships and contextual meaning between words.

In [6]:
print(list(dataset.keys()))
print("=" * 80)

print(list(dataset[visual_field].keys())[:10])
print("=" * 80)

some_id = list(dataset[visual_field].keys())[15]
print(list(dataset[visual_field][some_id].keys()))
print("=" * 80)

word_id = list(dataset[word_field].keys())[15]
print(list(dataset[word_field][word_id].keys()))
print("=" * 80)
print(dataset[word_field].keys())

print('Intervals')
print(list(dataset[visual_field][some_id]['intervals'].shape))
#print(list(dataset[text_field][some_id]['intervals'].shape))
print(list(dataset[word_field][word_id]['intervals'].shape))
print(list(dataset[acoustic_field][some_id]['intervals'].shape))
print("=" * 80)

print('Features')
print(list(dataset[visual_field][some_id]['features'].shape))
#print(list(dataset[text_field][some_id]['features'].shape))
print(list(dataset[word_field][word_id]['features'].shape))
print(list(dataset[acoustic_field][some_id]['features'].shape))
print("Different modalities have different number of time steps!")

['CMU_MOSEI_VisualFacet42', 'CMU_MOSEI_COVAREP', 'CMU_MOSEI_TimestampedWords']
['--qXJuDtHPw', '-3g5yACwYnA', '-3nNcZdcdvU', '-571d8cVauQ', '-6rXp3zJ3kc', '-9YyBTjo1zo', '-9y-fZ3swSY', '-AUZQgSxyPQ', '-Alixo7euuU', '-Eqdz5y4pEY']
['features', 'intervals']
['features', 'intervals']
dict_keys(['--qXJuDtHPw', '-3g5yACwYnA', '-3nNcZdcdvU', '-571d8cVauQ', '-6rXp3zJ3kc', '-9YyBTjo1zo', '-9y-fZ3swSY', '-AUZQgSxyPQ', '-Alixo7euuU', '-Eqdz5y4pEY', '-HeZS2-Prhc', '-HvKLjmsO5U', '-HwX2H8Z4hY', '-IUUR2yyNbw', '-I_e4mIh0yE', '-IqSFQePnpU', '-KCahx2qBOI', '-LnuDPiuuZw', '-MeTTeMJBNc', '-NFrJFQijFE', '-RfYyzHpjk4', '-RpZEe4w4fY', '-SYSVSQnTnA', '-THoVjtIkeU', '-UUCSKoHeMA', '-UacrmKiTn4', '-UuX1xuaiiE', '-VmheDA92mM', '-WXXTNIJcVM', '-ZgjBOA1Yhw', '-a55Q6RWvTA', '-aNfi7CP8vM', '-aqamKhZ1Ec', '-bl5PfNIYrk', '-cEhr0cQcDM', '-cmk6cfUeMs', '-dZ1TCboxcQ', '-dxfTGcXJoc', '-egA8-b7-3M', '-hPfPhUIzfA', '-hnBHBN8p5A', '-iRBcNs9oI8', '-l_53IwQoj0', '-lqc32Zpr7M', '-lzEya4AM_4', '-m9KtvCk_L8', '-mJ2ud6oKI8', '-

In [7]:
if SETUP:
# we define a simple averaging function that does not depend on intervals
    def avg(intervals: np.array, features: np.array) -> np.array:
        try:
            return np.average(features, axis=0)
        except:
            return features

    # first we align to words with averaging, collapse_function receives a list of functions
    dataset.align(word_field, collapse_functions=[avg])

[94m[1m[2024-11-04 14:24:53.898] | Status  | [0mUnify was called ...
[92m[1m[2024-11-04 14:24:53.902] | Success | [0mUnify completed ...
[94m[1m[2024-11-04 14:24:53.902] | Status  | [0mPre-alignment based on <CMU_MOSEI_TimestampedWords> computational sequence started ...
[94m[1m[2024-11-04 14:25:21.871] | Status  | [0mPre-alignment done for <CMU_MOSEI_VisualFacet42> ...
[94m[1m[2024-11-04 14:33:06.372] | Status  | [0mPre-alignment done for <CMU_MOSEI_COVAREP> ...
[94m[1m[2024-11-04 14:35:31.209] | Status  | [0mAlignment starting ...


                                                                                                      

[92m[1m[2024-11-04 15:58:21.970] | Success | [0mAlignment to <CMU_MOSEI_TimestampedWords> complete.
[94m[1m[2024-11-04 15:58:21.970] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-11-04 15:58:25.724] | Success | [0mInitialized empty <CMU_MOSEI_VisualFacet42> computational sequence.
[94m[1m[2024-11-04 15:58:25.725] | Status  | [0mChecking the format of the data in <CMU_MOSEI_VisualFacet42> computational sequence ...


                                                                                          

[92m[1m[2024-11-04 15:58:28.263] | Success | [0m<CMU_MOSEI_VisualFacet42> computational sequence data in correct format.
[94m[1m[2024-11-04 15:58:28.263] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_VisualFacet42> computational sequence ...
[92m[1m[2024-11-04 15:58:28.263] | Success | [0mInitialized empty <CMU_MOSEI_COVAREP> computational sequence.
[94m[1m[2024-11-04 15:58:28.263] | Status  | [0mChecking the format of the data in <CMU_MOSEI_COVAREP> computational sequence ...


                                                                                          

[92m[1m[2024-11-04 15:58:29.476] | Success | [0m<CMU_MOSEI_COVAREP> computational sequence data in correct format.
[94m[1m[2024-11-04 15:58:29.476] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_COVAREP> computational sequence ...
[92m[1m[2024-11-04 15:58:29.476] | Success | [0mInitialized empty <CMU_MOSEI_TimestampedWords> computational sequence.
[94m[1m[2024-11-04 15:58:29.476] | Status  | [0mChecking the format of the data in <CMU_MOSEI_TimestampedWords> computational sequence ...


                                                                                          

[92m[1m[2024-11-04 15:58:30.698] | Success | [0m<CMU_MOSEI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2024-11-04 15:58:30.699] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_TimestampedWords> computational sequence ...


In [None]:
# if SETUP:
#     deploy_files={x:x for x in dataset.keys()}
#     dataset.deploy("hl1",deploy_files)

NameError: name 'SETUP' is not defined

In [8]:
label_field = 'CMU_MOSEI_Labels'

# we add and align to lables to obtain labeled segments
# this time we don't apply collapse functions so that the temporal sequences are preserved
label_recipe = {label_field: os.path.join(DATA_PATH, label_field + '.csd')}
dataset.add_computational_sequences(label_recipe, destination=None)
dataset.align(label_field)

[92m[1m[2024-11-04 16:39:33.228] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_Labels.csd ...
[94m[1m[2024-11-04 16:39:33.949] | Status  | [0mChecking the integrity of the <All Labels> computational sequence ...
[94m[1m[2024-11-04 16:39:33.950] | Status  | [0mChecking the format of the data in <All Labels> computational sequence ...


                                                                                  

[92m[1m[2024-11-04 16:39:35.633] | Success | [0m<All Labels> computational sequence data in correct format.
[94m[1m[2024-11-04 16:39:35.633] | Status  | [0mChecking the format of the metadata in <All Labels> computational sequence ...
[94m[1m[2024-11-04 16:39:35.633] | Status  | [0mUnify was called ...
[92m[1m[2024-11-04 16:43:21.861] | Success | [0mUnify completed ...
[94m[1m[2024-11-04 16:43:21.892] | Status  | [0mPre-alignment based on <CMU_MOSEI_Labels> computational sequence started ...
[94m[1m[2024-11-04 16:43:24.335] | Status  | [0mPre-alignment done for <CMU_MOSEI_VisualFacet42> ...
[94m[1m[2024-11-04 16:43:26.565] | Status  | [0mPre-alignment done for <CMU_MOSEI_TimestampedWords> ...
[94m[1m[2024-11-04 16:43:28.880] | Status  | [0mPre-alignment done for <CMU_MOSEI_COVAREP> ...
[94m[1m[2024-11-04 16:43:28.950] | Status  | [0mAlignment starting ...


                                                                                                   

[92m[1m[2024-11-04 16:44:00.940] | Success | [0mAlignment to <CMU_MOSEI_Labels> complete.
[94m[1m[2024-11-04 16:44:00.940] | Status  | [0mReplacing dataset content with aligned computational sequences
[92m[1m[2024-11-04 16:44:03.762] | Success | [0mInitialized empty <CMU_MOSEI_VisualFacet42> computational sequence.
[94m[1m[2024-11-04 16:44:03.762] | Status  | [0mChecking the format of the data in <CMU_MOSEI_VisualFacet42> computational sequence ...


                                                                      

[92m[1m[2024-11-04 16:44:03.793] | Success | [0m<CMU_MOSEI_VisualFacet42> computational sequence data in correct format.
[94m[1m[2024-11-04 16:44:03.793] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_VisualFacet42> computational sequence ...
[92m[1m[2024-11-04 16:44:03.793] | Success | [0mInitialized empty <CMU_MOSEI_COVAREP> computational sequence.
[94m[1m[2024-11-04 16:44:03.793] | Status  | [0mChecking the format of the data in <CMU_MOSEI_COVAREP> computational sequence ...


                                                                      

[92m[1m[2024-11-04 16:44:03.821] | Success | [0m<CMU_MOSEI_COVAREP> computational sequence data in correct format.
[94m[1m[2024-11-04 16:44:03.822] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_COVAREP> computational sequence ...
[92m[1m[2024-11-04 16:44:03.822] | Success | [0mInitialized empty <CMU_MOSEI_TimestampedWords> computational sequence.
[94m[1m[2024-11-04 16:44:03.822] | Status  | [0mChecking the format of the data in <CMU_MOSEI_TimestampedWords> computational sequence ...


                                                                      

[92m[1m[2024-11-04 16:44:03.851] | Success | [0m<CMU_MOSEI_TimestampedWords> computational sequence data in correct format.
[94m[1m[2024-11-04 16:44:03.851] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_TimestampedWords> computational sequence ...
[92m[1m[2024-11-04 16:44:03.851] | Success | [0mInitialized empty <CMU_MOSEI_Labels> computational sequence.
[94m[1m[2024-11-04 16:44:03.851] | Status  | [0mChecking the format of the data in <CMU_MOSEI_Labels> computational sequence ...


                                                                      

[92m[1m[2024-11-04 16:44:03.886] | Success | [0m<CMU_MOSEI_Labels> computational sequence data in correct format.
[94m[1m[2024-11-04 16:44:03.886] | Status  | [0mChecking the format of the metadata in <CMU_MOSEI_Labels> computational sequence ...


In [11]:
# Print some keys to see the segmentation structure
print(list(dataset[label_field].keys())[:10])

# Pick a specific segmented key to explore further
some_segmented_key = list(dataset[label_field].keys())[0]

# Check the aligned features and intervals for this segment
print("Label intervals:", dataset[label_field][some_segmented_key]['intervals'].shape)
print("Label features:", dataset[label_field][some_segmented_key]['features'].shape)

print("Text features:", dataset[word_field][some_segmented_key]['features'].shape)
print("Visual features:", dataset[visual_field][some_segmented_key]['features'].shape)
print("Acoustic features:", dataset[acoustic_field][some_segmented_key]['features'].shape)


['--qXJuDtHPw[0]', '-3g5yACwYnA[0]', '-3g5yACwYnA[1]', '-3g5yACwYnA[2]', '-3g5yACwYnA[3]', '-3g5yACwYnA[4]', '-3g5yACwYnA[5]', '-3nNcZdcdvU[0]', '-3nNcZdcdvU[1]', '-3nNcZdcdvU[2]']
Label intervals: (1, 2)
Label features: (1, 7)
Text features: (22, 1)
Visual features: (22, 35)
Acoustic features: (22, 74)


In [12]:
# check out what the keys look like now
print(list(dataset[word_field].keys())[55])

-HwX2H8Z4hY[4]


In [77]:
# Define the label field where the emotions are stored
label_field = 'CMU_MOSEI_Labels'

# Emotion mapping (assuming order is [happy, sad, anger, surprise, disgust, fear])
emotions = ["happy", "sad", "anger", "surprise", "disgust", "fear"]

# Print emotion labels for each segment with additional checks
print("Emotion Labels for each segment:")

for segment in dataset[label_field].keys():
    # Access emotion labels for this segment
    emotion_data = dataset[label_field][segment]['features']
    
    # Print shape of emotion_data for debugging purposes
    print(f"Segment: {segment}, Emotion Data Shape: {emotion_data.shape}")

    # Flatten emotion data and ensure it aligns with the number of defined emotions
    emotion_data_flat = emotion_data.flatten()
    emotion_labels = []
    
    # Only iterate up to the length of `emotions` to prevent out-of-range errors
    for idx in range(min(len(emotion_data_flat), len(emotions))):
        if emotion_data_flat[idx] == 1:
            emotion_labels.append(emotions[idx])
    
    print(f"Segment: {segment} -> Emotions: {emotion_labels}")

# Count occurrences of each emotion
emotion_count = {emotion: 0 for emotion in emotions}
for segment in dataset[label_field].keys():
    emotion_data = dataset[label_field][segment]['features']
    emotion_data_flat = emotion_data.flatten()
    for idx in range(min(len(emotion_data_flat), len(emotions))):
        if emotion_data_flat[idx] == 1:
            emotion_count[emotions[idx]] += 1

print("\nEmotion Distribution in Dataset:", emotion_count)


Emotion Labels for each segment:
Segment: --qXJuDtHPw[0], Emotion Data Shape: (1, 7)
Segment: --qXJuDtHPw[0] -> Emotions: ['happy']
Segment: -3g5yACwYnA[0], Emotion Data Shape: (1, 7)
Segment: -3g5yACwYnA[0] -> Emotions: ['happy']
Segment: -3g5yACwYnA[1], Emotion Data Shape: (1, 7)
Segment: -3g5yACwYnA[1] -> Emotions: []
Segment: -3g5yACwYnA[2], Emotion Data Shape: (1, 7)
Segment: -3g5yACwYnA[2] -> Emotions: []
Segment: -3g5yACwYnA[3], Emotion Data Shape: (1, 7)
Segment: -3g5yACwYnA[3] -> Emotions: []
Segment: -3g5yACwYnA[4], Emotion Data Shape: (1, 7)
Segment: -3g5yACwYnA[4] -> Emotions: ['happy']
Segment: -3g5yACwYnA[5], Emotion Data Shape: (1, 7)
Segment: -3g5yACwYnA[5] -> Emotions: []
Segment: -3nNcZdcdvU[0], Emotion Data Shape: (1, 7)
Segment: -3nNcZdcdvU[0] -> Emotions: []
Segment: -3nNcZdcdvU[1], Emotion Data Shape: (1, 7)
Segment: -3nNcZdcdvU[1] -> Emotions: ['happy']
Segment: -3nNcZdcdvU[2], Emotion Data Shape: (1, 7)
Segment: -3nNcZdcdvU[2] -> Emotions: []
Segment: -571d8cVau

## Train Test Split

In [14]:
# obtain the train/dev/test splits - these splits are based on video IDs
train_split = DATASET.standard_folds.standard_train_fold
dev_split = DATASET.standard_folds.standard_valid_fold
test_split = DATASET.standard_folds.standard_test_fold

# inspect the splits: they only contain video IDs
print(test_split)

['7l3BNtSE0xc', 'dZFV0lyedX4', '286943', '126872', 'qgC8_emxSIU', 'kld9r0iFkWM', 'rC29Qub0U7A', '4YfyP0uIqw0', 'FMenDv3y8jc', '4wLP4elp1uM', 'KYQTwFVBzME', '27v7Blr0vjw', 'DnBHq5I52LM', 'HR18U0yAlTc', 'x266rUJQC_8', 'd1CDP6sMuLA', 'xSCvspXYU9k', '4EDblUpJieU', '4o4ilPK9rl8', '53609', 'SZ7HK5ns6mE', '243981', 'ySblgk7T7eQ', 'MYEyQUpMe3k', 'EujJ0SwiCRE', '3HyAaqre_Fk', 'iQDB_OkAQWs', 'gE7kUqMqQ9g', 'eFV7iFPYZB4', 'IRSxo_XXArg', '3hOlJf_JQDs', 'BRSyH6yfDLk', '1jogeKX0wGw', '3At-BKm9eYk', 'NVLPURuAVLU', 'pZye4zFzk3o', 'l1jW3OMXUzs', 'XKyumlBmix8', 'eKQKEi2-0Ws', 'WgI8IbJtXHw', 'tnWmVXZ87h0', 'YCEllKyaCrc', 'W1CWpktWtTs', '8wQhzezNcUY', '0bxhZ-LIfZY', 'lrjm6F3JJgg', 'Vdf1McvE9ao', 'eQc5uI7FKCU', '2QXHdu2zlQY', 'YCI-ZzclIPQ', '2Ky9DBSl49w', 'SKTyBOhDX6U', 'b86B3hP8ARM', '23656', 'kpS4BXif_Sw', 'dR68gbeOWOc', 'tC2KicUHB9Q', 'absh1hsZeF0', 'c5zxqITn3ZM', 'uogwnZGb-iE', '46495', 'Sq6DIhFxPqQ', 'PexNiFbPTYM', 'z441aDJvAcU', 'OORklkFql3k', 'WbtsuXkaGeg', 'grsV1YN1z5s', 'Gc_zIjqqUys', '424SXFTCFsA

In [34]:
# we can see they are in the format of 'video_id[segment_no]', but the splits was specified with video_id only
# we need to use regex or something to match the video IDs...
import re
import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook
from collections import defaultdict

# a sentinel epsilon for safe division, without it we will replace illegal values with a constant
EPS = 0

# construct a word2id mapping that automatically takes increment when new words are encountered
word2id = defaultdict(lambda: len(word2id))
UNK = word2id['<unk>']
PAD = word2id['<pad>']

# place holders for the final train/dev/test dataset
train = []
dev = []
test = []

# define a regular expression to extract the video ID out of the keys
pattern = re.compile('(.*)\[.*\]')
num_drop = 0 # a counter to count how many data points went into some processing issues

problematic_segments = {"HuIKyKkEL0Q[0]","JGEEA_JVriE[0]","JGEEA_JVriE[1]","JGEEA_JVriE[2]","JGEEA_JVriE[3]","aa0J1AXSseY[4]","aa0J1AXSseY[5]","aa0J1AXSseY[6]","zsRTbbKlsEg[0]"}

for segment in dataset[label_field].keys():
    
    if segment in problematic_segments:
        print(f"Skipping problematic segment: {segment}")
        num_drop += 1
        continue
    
    # get the video ID and the features out of the aligned dataset
    vid = re.search(pattern, segment).group(1)
    label = dataset[label_field][segment]['features']
    _words = dataset[word_field][segment]['features']
    _visual = dataset[visual_field][segment]['features']
    _acoustic = dataset[acoustic_field][segment]['features']

    # if the sequences are not same length after alignment, there must be some problem with some modalities
    # we should drop it or inspect the data again
    if not _words.shape[0] == _visual.shape[0] == _acoustic.shape[0]:
        print(f"Encountered datapoint {vid} with text shape {_words.shape}, visual shape {_visual.shape}, acoustic shape {_acoustic.shape}")
        num_drop += 1
        continue

    # remove nan values
    label = np.nan_to_num(label)
    _visual = np.nan_to_num(_visual)
    _acoustic = np.nan_to_num(_acoustic)

    # remove speech pause tokens - this is in general helpful
    # we should remove speech pauses and corresponding visual/acoustic features together
    # otherwise modalities would no longer be aligned
    words = []
    visual = []
    acoustic = []
    for i, word in enumerate(_words):
        if word[0] != b'sp':
            words.append(word2id[word[0].decode('utf-8')]) # SDK stores strings as bytes, decode into strings here
            visual.append(_visual[i, :])
            acoustic.append(_acoustic[i, :])

    words = np.asarray(words)
    visual = np.asarray(visual)
    acoustic = np.asarray(acoustic)

    # z-normalization per instance and remove nan/infs
    visual = np.nan_to_num((visual - visual.mean(0, keepdims=True)) / (EPS + np.std(visual, axis=0, keepdims=True)))
    acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))

    if vid in train_split:
        train.append(((words, visual, acoustic), label, segment))
    elif vid in dev_split:
        dev.append(((words, visual, acoustic), label, segment))
    elif vid in test_split:
        test.append(((words, visual, acoustic), label, segment))
    else:
        print(f"Found video that doesn't belong to any splits: {vid}")

print(f"Total number of {num_drop} datapoints have been dropped.")

# turn off the word2id - define a named function here to allow for pickling
def return_unk():
    return UNK
word2id.default_factory = return_unk

  acoustic = np.nan_to_num((acoustic - acoustic.mean(0, keepdims=True)) / (EPS + np.std(acoustic, axis=0, keepdims=True)))
  x = um.multiply(x, x, out=x)
  visual = np.nan_to_num((visual - visual.mean(0, keepdims=True)) / (EPS + np.std(visual, axis=0, keepdims=True)))
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)


Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -9YyBTjo1zo
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7euuU
Found video that doesn't belong to any splits: -Alixo7eu

In [58]:
# let's see the size of each set and shape of data
print(len(train))
print(len(dev))
print(len(test))

print(train[0][0][1].shape)
print(train[0][1].shape)
print(train[0][1])

print(f"Total vocab size: {len(word2id)}")

16315
1871
4654
(42, 35)
(1, 7)
[[1.        0.6666667 0.6666667 0.        0.        0.        0.6666667]]
Total vocab size: 16819



Collate function in PyTorch

Collate functions are functions used by PyTorch dataloader to gather batched data from dataset. It loads multiple data points from an iterable dataset object and put them in a certain format. Here we just use the lists we've constructed as the dataset and assume PyTorch dataloader will operate on that.


In [59]:
def multi_collate(batch):
    '''
    Collate functions assume batch = [Dataset[i] for i in index_set]
    '''
    # for later use we sort the batch in descending order of length
    batch = sorted(batch, key=lambda x: x[0][0].shape[0], reverse=True)
    
    # get the data out of the batch - use pad sequence util functions from PyTorch to pad things
    labels = torch.cat([torch.from_numpy(sample[1]) for sample in batch], dim=0)
    sentences = pad_sequence([torch.LongTensor(sample[0][0]) for sample in batch], padding_value=PAD)
    visual = pad_sequence([torch.FloatTensor(sample[0][1]) for sample in batch])
    acoustic = pad_sequence([torch.FloatTensor(sample[0][2]) for sample in batch])
    
    # lengths are useful later in using RNNs
    lengths = torch.LongTensor([sample[0][0].shape[0] for sample in batch])
    return sentences, visual, acoustic, labels, lengths

# construct dataloaders, dev and test could use around ~X3 times batch size since no_grad is used during eval
batch_sz = 56
train_loader = DataLoader(train, shuffle=True, batch_size=batch_sz, collate_fn=multi_collate)
dev_loader = DataLoader(dev, shuffle=False, batch_size=batch_sz*3, collate_fn=multi_collate)
test_loader = DataLoader(test, shuffle=False, batch_size=batch_sz*3, collate_fn=multi_collate)

# let's create a temporary dataloader just to see how the batch looks like
temp_loader = iter(DataLoader(test, shuffle=True, batch_size=8, collate_fn=multi_collate))
batch = next(temp_loader)

print(batch[0].shape) # word vectors, padded to maxlen
print(batch[1].shape) # visual features
print(batch[2].shape) # acoustic features
print(batch[3]) # labels
print(batch[4]) # lengths

torch.Size([40, 8])
torch.Size([40, 8, 35])
torch.Size([40, 8, 74])
tensor([[ 0.6667,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.6667,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.3333,  1.0000,  0.0000,  0.0000,  0.0000,  0.3333,  0.0000],
        [-0.3333,  0.0000,  0.3333,  0.3333,  0.0000,  0.0000,  0.0000],
        [-2.6667,  0.0000,  0.3333,  0.6667,  0.0000,  1.3333,  0.0000],
        [ 1.3333,  0.3333,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.6667,  1.6667,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 2.6667,  2.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000]])
tensor([40, 34, 34, 28, 24, 22, 19, 14])


In [60]:
# Let's actually inspect the transcripts to ensure it's correct
id2word = {v:k for k, v in word2id.items()}
examine_target = train
idx = np.random.randint(0, len(examine_target))
print(' '.join(list(map(lambda x: id2word[x], examine_target[idx][0][0].tolist()))))
# print(' '.join(examine_target[idx][0]))
print(examine_target[idx][1])
print(examine_target[idx][2])

behavior and any marketer that wants to be successful needs to consider these and be able to adapt their programs to the changing needs of the consumer consumer
[[0.33333334 0.33333334 0.         0.         0.         0.
  0.        ]]
Ha7DMd_iKyM[0]


## Defining Multimodal model

In [82]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LFLSTM(nn.Module):
    def __init__(self, input_sizes, hidden_sizes, fc1_size, output_size, dropout_rate):
        super(LFLSTM, self).__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.fc1_size = fc1_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        # Define the RNN layers for each modality
        self.embed = nn.Embedding(len(word2id), input_sizes[0])
        self.trnn1 = nn.LSTM(input_sizes[0], hidden_sizes[0], bidirectional=True)
        self.trnn2 = nn.LSTM(2 * hidden_sizes[0], hidden_sizes[0], bidirectional=True)
        
        self.vrnn1 = nn.LSTM(input_sizes[1], hidden_sizes[1], bidirectional=True)
        self.vrnn2 = nn.LSTM(2 * hidden_sizes[1], hidden_sizes[1], bidirectional=True)
        
        self.arnn1 = nn.LSTM(input_sizes[2], hidden_sizes[2], bidirectional=True)
        self.arnn2 = nn.LSTM(2 * hidden_sizes[2], hidden_sizes[2], bidirectional=True)

        # Define fully connected layers for fusion and classification
        self.fc1 = nn.Linear(sum(hidden_sizes) * 4, fc1_size)
        self.fc2 = nn.Linear(fc1_size, output_size)  # Output layer with 6 classes for MOSEI
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.tlayer_norm = nn.LayerNorm((hidden_sizes[0] * 2,))
        self.vlayer_norm = nn.LayerNorm((hidden_sizes[1] * 2,))
        self.alayer_norm = nn.LayerNorm((hidden_sizes[2] * 2,))
        self.bn = nn.BatchNorm1d(sum(hidden_sizes) * 4)

    def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm):
        packed_sequence = pack_padded_sequence(sequence, lengths, enforce_sorted=False)
        packed_h1, (final_h1, _) = rnn1(packed_sequence)
        padded_h1, _ = pad_packed_sequence(packed_h1)
        normed_h1 = layer_norm(padded_h1)
        packed_normed_h1 = pack_padded_sequence(normed_h1, lengths, enforce_sorted=False)
        _, (final_h2, _) = rnn2(packed_normed_h1)
        return final_h1, final_h2

    def fusion(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)
        sentences = self.embed(sentences)

        # Extract features from each modality
        final_h1t, final_h2t = self.extract_features(sentences, lengths, self.trnn1, self.trnn2, self.tlayer_norm)
        final_h1v, final_h2v = self.extract_features(visual, lengths, self.vrnn1, self.vrnn2, self.vlayer_norm)
        final_h1a, final_h2a = self.extract_features(acoustic, lengths, self.arnn1, self.arnn2, self.alayer_norm)

        # Late fusion by concatenating modality outputs
        h = torch.cat((final_h1t, final_h2t, final_h1v, final_h2v, final_h1a, final_h2a),
                      dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)
        return self.bn(h)

    def forward(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)
        h = self.fusion(sentences, visual, acoustic, lengths)
        h = self.fc1(h)
        h = self.dropout(h)
        h = self.relu(h)
        o = self.fc2(h)  # Final output layer with six classes
        return o



- Load pretrained embeddings

We define a function for loading pretrained word embeddings stored in GloVe-style file. Contextualized embeddings obviously cannot be stored and loaded this way, though.


In [62]:
def load_emb(w2i, path_to_embedding, embedding_size=300, embedding_vocab=2196017, init_emb=None):
    if init_emb is None:
        emb_mat = np.random.randn(len(w2i), embedding_size)
    else:
        emb_mat = init_emb
    f = open(path_to_embedding, 'r')
    found = 0
    for line in tqdm_notebook(f, total=embedding_vocab):
        content = line.strip().split()
        vector = np.asarray(list(map(lambda x: float(x), content[-300:])))
        word = ' '.join(content[:-300])
        if word in w2i:
            idx = w2i[word]
            emb_mat[idx, :] = vector
            found += 1
    print(f"Found {found} words in the embedding file.")
    return torch.tensor(emb_mat).float()

In [None]:
# from tqdm import tqdm_notebook
# from torch.optim import Adam, SGD
# from sklearn.metrics import accuracy_score

# torch.manual_seed(123)
# torch.cuda.manual_seed_all(123)

# CUDA = torch.cuda.is_available()
# MAX_EPOCH = 1000

# text_size = 300
# visual_size = 47
# acoustic_size = 74

# # define some model settings and hyper-parameters
# input_sizes = [text_size, visual_size, acoustic_size]
# hidden_sizes = [int(text_size * 1.5), int(visual_size * 1.5), int(acoustic_size * 1.5)]
# fc1_size = sum(hidden_sizes) // 2
# dropout = 0.25
# output_size = 1
# curr_patience = patience = 8
# num_trials = 3
# grad_clip_value = 1.0
# weight_decay = 0.1

# if os.path.exists(CACHE_PATH):
#     pretrained_emb, word2id = torch.load(CACHE_PATH)
# elif WORD_EMB_PATH is not None:
#     pretrained_emb = load_emb(word2id, WORD_EMB_PATH)
#     torch.save((pretrained_emb, word2id), CACHE_PATH)
# else:
#     pretrained_emb = None

# model = LFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout)
# if pretrained_emb is not None:
#     model.embed.weight.data = pretrained_emb
# model.embed.requires_grad = False
# optimizer = Adam([param for param in model.parameters() if param.requires_grad], weight_decay=weight_decay)

# if CUDA:
#     model.cuda()
# criterion = nn.L1Loss(reduction='sum')
# criterion_test = nn.L1Loss(reduction='sum')
# best_valid_loss = float('inf')
# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
# lr_scheduler.step() # for some reason it seems the StepLR needs to be stepped once first
# train_losses = []
# valid_losses = []
# for e in range(MAX_EPOCH):
#     model.train()
#     train_iter = tqdm_notebook(train_loader)
#     train_loss = 0.0
#     for batch in train_iter:
#         model.zero_grad()
#         t, v, a, y, l = batch
#         batch_size = t.size(0)
#         if CUDA:
#             t = t.cuda()
#             v = v.cuda()
#             a = a.cuda()
#             y = y.cuda()
#             l = l.cuda()
#         y_tilde = model(t, v, a, l)
#         loss = criterion(y_tilde, y)
#         loss.backward()
#         torch.nn.utils.clip_grad_value_([param for param in model.parameters() if param.requires_grad], grad_clip_value)
#         optimizer.step()
#         train_iter.set_description(f"Epoch {e}/{MAX_EPOCH}, current batch loss: {round(loss.item()/batch_size, 4)}")
#         train_loss += loss.item()
#     train_loss = train_loss / len(train)
#     train_losses.append(train_loss)
#     print(f"Training loss: {round(train_loss, 4)}")

#     model.eval()
#     with torch.no_grad():
#         valid_loss = 0.0
#         for batch in dev_loader:
#             model.zero_grad()
#             t, v, a, y, l = batch
#             if CUDA:
#                 t = t.cuda()
#                 v = v.cuda()
#                 a = a.cuda()
#                 y = y.cuda()
#                 l = l.cuda()
#             y_tilde = model(t, v, a, l)
#             loss = criterion(y_tilde, y)
#             valid_loss += loss.item()
    
#     valid_loss = valid_loss/len(dev)
#     valid_losses.append(valid_loss)
#     print(f"Validation loss: {round(valid_loss, 4)}")
#     print(f"Current patience: {curr_patience}, current trial: {num_trials}.")
#     if valid_loss <= best_valid_loss:
#         best_valid_loss = valid_loss
#         print("Found new best model on dev set!")
#         torch.save(model.state_dict(), 'model.std')
#         torch.save(optimizer.state_dict(), 'optim.std')
#         curr_patience = patience
#     else:
#         curr_patience -= 1
#         if curr_patience <= -1:
#             print("Running out of patience, loading previous best model.")
#             num_trials -= 1
#             curr_patience = patience
#             model.load_state_dict(torch.load('model.std'))
#             optimizer.load_state_dict(torch.load('optim.std'))
#             lr_scheduler.step()
#             print(f"Current learning rate: {optimizer.state_dict()['param_groups'][0]['lr']}")
    
#     if num_trials <= 0:
#         print("Running out of patience, early stopping.")
#         break

# model.load_state_dict(torch.load('model.std'))
# y_true = []
# y_pred = []
# model.eval()
# with torch.no_grad():
#     test_loss = 0.0
#     for batch in test_loader:
#         model.zero_grad()
#         t, v, a, y, l = batch
#         if CUDA:
#             t = t.cuda()
#             v = v.cuda()
#             a = a.cuda()
#             y = y.cuda()
#             l = l.cuda()
#         y_tilde = model(t, v, a, l)
#         loss = criterion_test(y_tilde, y)
#         y_true.append(y_tilde.detach().cpu().numpy())
#         y_pred.append(y.detach().cpu().numpy())
#         test_loss += loss.item()
# print(f"Test set performance: {test_loss/len(test)}")
# y_true = np.concatenate(y_true, axis=0)
# y_pred = np.concatenate(y_pred, axis=0)
                  
# y_true_bin = y_true >= 0
# y_pred_bin = y_pred >= 0
# bin_acc = accuracy_score(y_true_bin, y_pred_bin)
# print(f"Test set accuracy is {bin_acc}")

In [None]:
# from tqdm import tqdm_notebook
# from torch.optim import Adam, SGD
# from sklearn.metrics import accuracy_score
# import torch
# import torch.nn as nn
# import numpy as np
# import os

# torch.manual_seed(123)

# MAX_EPOCH = 1000
# text_size = 300
# visual_size = 35
# acoustic_size = 74

# # Define model settings and hyper-parameters
# input_sizes = [text_size, visual_size, acoustic_size]
# hidden_sizes = [int(text_size * 1.5), int(visual_size * 1.5), int(acoustic_size * 1.5)]
# fc1_size = sum(hidden_sizes) // 2
# dropout = 0.25
# output_size = 1
# curr_patience = patience = 8
# num_trials = 3
# grad_clip_value = 1.0
# weight_decay = 0.1

# # Load embeddings if available
# if os.path.exists(CACHE_PATH):
#     pretrained_emb, word2id = torch.load(CACHE_PATH)
# elif WORD_EMB_PATH is not None:
#     pretrained_emb = load_emb(word2id, WORD_EMB_PATH)
#     torch.save((pretrained_emb, word2id), CACHE_PATH)
# else:
#     pretrained_emb = None

# # Initialize model
# model = LFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout)
# if pretrained_emb is not None:
#     model.embed.weight.data = pretrained_emb
# model.embed.requires_grad = False
# optimizer = Adam([param for param in model.parameters() if param.requires_grad], weight_decay=weight_decay)

# criterion = nn.L1Loss(reduction='sum')
# criterion_test = nn.L1Loss(reduction='sum')
# best_valid_loss = float('inf')
# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
# lr_scheduler.step()  # Step once initially to set up scheduler
# train_losses = []
# valid_losses = []

# # Training loop
# for e in range(MAX_EPOCH):
#     model.train()
#     train_iter = tqdm_notebook(train_loader)
#     train_loss = 0.0
#     for batch in train_iter:
#         model.zero_grad()
#         t, v, a, y, l = batch
#         batch_size = t.size(0)

#         y_tilde = model(t, v, a, l)
#         loss = criterion(y_tilde, y)
#         loss.backward()
#         torch.nn.utils.clip_grad_value_([param for param in model.parameters() if param.requires_grad], grad_clip_value)
#         optimizer.step()
#         train_iter.set_description(f"Epoch {e}/{MAX_EPOCH}, current batch loss: {round(loss.item() / batch_size, 4)}")
#         train_loss += loss.item()

#     train_loss = train_loss / len(train_loader)
#     train_losses.append(train_loss)
#     print(f"Training loss: {round(train_loss, 4)}")

#     # Validation
#     model.eval()
#     with torch.no_grad():
#         valid_loss = 0.0
#         for batch in dev_loader:
#             t, v, a, y, l = batch
#             y_tilde = model(t, v, a, l)
#             loss = criterion(y_tilde, y)
#             valid_loss += loss.item()
    
#     valid_loss = valid_loss / len(dev_loader)
#     valid_losses.append(valid_loss)
#     print(f"Validation loss: {round(valid_loss, 4)}")
#     print(f"Current patience: {curr_patience}, current trial: {num_trials}.")
    
#     if valid_loss <= best_valid_loss:
#         best_valid_loss = valid_loss
#         print("Found new best model on dev set!")
#         torch.save(model.state_dict(), 'model.std')
#         torch.save(optimizer.state_dict(), 'optim.std')
#         curr_patience = patience
#     else:
#         curr_patience -= 1
#         if curr_patience <= -1:
#             print("Running out of patience, loading previous best model.")
#             num_trials -= 1
#             curr_patience = patience
#             model.load_state_dict(torch.load('model.std'))
#             optimizer.load_state_dict(torch.load('optim.std'))
#             lr_scheduler.step()
#             print(f"Current learning rate: {optimizer.state_dict()['param_groups'][0]['lr']}")

#     if num_trials <= 0:
#         print("Running out of patience, early stopping.")
#         break

# # Load best model for testing
# model.load_state_dict(torch.load('model.std'))
# y_true = []
# y_pred = []
# model.eval()
# with torch.no_grad():
#     test_loss = 0.0
#     for batch in test_loader:
#         t, v, a, y, l = batch
#         y_tilde = model(t, v, a, l)
#         loss = criterion_test(y_tilde, y)
#         y_true.append(y_tilde.detach().cpu().numpy())
#         y_pred.append(y.detach().cpu().numpy())
#         test_loss += loss.item()

# print(f"Test set performance: {test_loss / len(test_loader)}")
# y_true = np.concatenate(y_true, axis=0)
# y_pred = np.concatenate(y_pred, axis=0)

# y_true_bin = y_true >= 0
# y_pred_bin = y_pred >= 0
# bin_acc = accuracy_score(y_true_bin, y_pred_bin)
# print(f"Test set accuracy is {bin_acc}")


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  train_iter = tqdm_notebook(train_loader)


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 139.9862
Validation loss: 326.0772
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 131.0943
Validation loss: 306.0521
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 121.8259
Validation loss: 291.0132
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 114.6997
Validation loss: 278.8834
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 109.3622
Validation loss: 269.3297
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 105.7305
Validation loss: 264.6006
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.6286
Validation loss: 264.0867
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.5212
Validation loss: 263.8559
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.4826
Validation loss: 263.9465
Current patience: 8, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.4547
Validation loss: 263.7984
Current patience: 7, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.4314
Validation loss: 263.7466
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.4266
Validation loss: 263.7829
Current patience: 8, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.4131
Validation loss: 263.8425
Current patience: 7, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3979
Validation loss: 263.6789
Current patience: 6, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3967
Validation loss: 263.7393
Current patience: 8, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3764
Validation loss: 263.6493
Current patience: 7, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3641
Validation loss: 263.6223
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3571
Validation loss: 263.5821
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3454
Validation loss: 263.5686
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.335
Validation loss: 263.5586
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3291
Validation loss: 263.5496
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3246
Validation loss: 263.5305
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3158
Validation loss: 263.5481
Current patience: 8, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3127
Validation loss: 263.5044
Current patience: 7, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3077
Validation loss: 263.4877
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.3025
Validation loss: 263.5
Current patience: 8, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2969
Validation loss: 263.4667
Current patience: 7, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.293
Validation loss: 263.4631
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2902
Validation loss: 263.4557
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2888
Validation loss: 263.4474
Current patience: 8, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2878
Validation loss: 263.4483
Current patience: 8, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2877
Validation loss: 263.4554
Current patience: 7, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2881
Validation loss: 263.4721
Current patience: 6, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2891
Validation loss: 263.4534
Current patience: 5, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2889
Validation loss: 263.4461
Current patience: 4, current trial: 3.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2907
Validation loss: 263.4634
Current patience: 8, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2895
Validation loss: 263.459
Current patience: 7, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.291
Validation loss: 263.4535
Current patience: 6, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2883
Validation loss: 263.447
Current patience: 5, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2898
Validation loss: 263.4565
Current patience: 4, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2898
Validation loss: 263.4677
Current patience: 3, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2894
Validation loss: 263.4485
Current patience: 2, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2893
Validation loss: 263.4492
Current patience: 1, current trial: 3.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2897
Validation loss: 263.4488
Current patience: 0, current trial: 3.
Running out of patience, loading previous best model.
Current learning rate: 1e-05


  model.load_state_dict(torch.load('model.std'))
  optimizer.load_state_dict(torch.load('optim.std'))


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2864
Validation loss: 263.4464
Current patience: 8, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2857
Validation loss: 263.4455
Current patience: 7, current trial: 2.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2857
Validation loss: 263.4453
Current patience: 8, current trial: 2.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2858
Validation loss: 263.4454
Current patience: 8, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.286
Validation loss: 263.445
Current patience: 7, current trial: 2.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.286
Validation loss: 263.4446
Current patience: 8, current trial: 2.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.286
Validation loss: 263.4459
Current patience: 8, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2858
Validation loss: 263.4445
Current patience: 7, current trial: 2.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2858
Validation loss: 263.4455
Current patience: 8, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2859
Validation loss: 263.4453
Current patience: 7, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2859
Validation loss: 263.4469
Current patience: 6, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2859
Validation loss: 263.4456
Current patience: 5, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2857
Validation loss: 263.4453
Current patience: 4, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2861
Validation loss: 263.4447
Current patience: 3, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2861
Validation loss: 263.4451
Current patience: 2, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2859
Validation loss: 263.4462
Current patience: 1, current trial: 2.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2861
Validation loss: 263.4455
Current patience: 0, current trial: 2.
Running out of patience, loading previous best model.
Current learning rate: 1.0000000000000002e-06


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2856
Validation loss: 263.4445
Current patience: 8, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2855
Validation loss: 263.4445
Current patience: 7, current trial: 1.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2855
Validation loss: 263.4445
Current patience: 8, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2855
Validation loss: 263.4445
Current patience: 7, current trial: 1.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 8, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4446
Current patience: 7, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 6, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 5, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4447
Current patience: 4, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 3, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 2, current trial: 1.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 8, current trial: 1.
Found new best model on dev set!


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 8, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4447
Current patience: 7, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2855
Validation loss: 263.4447
Current patience: 6, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2855
Validation loss: 263.4445
Current patience: 5, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 4, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 3, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 2, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4445
Current patience: 1, current trial: 1.


  0%|          | 0/292 [00:00<?, ?it/s]

Training loss: 104.2854
Validation loss: 263.4446
Current patience: 0, current trial: 1.
Running out of patience, loading previous best model.
Current learning rate: 1.0000000000000002e-07
Running out of patience, early stopping.


  model.load_state_dict(torch.load('model.std'))
  return F.l1_loss(input, target, reduction=self.reduction)


Test set performance: 302.27976771763394


ValueError: Classification metrics can't handle a mix of binary and multilabel-indicator targets

In [None]:
# from torch.optim import Adam
# from sklearn.metrics import accuracy_score

# torch.manual_seed(123)
# torch.cuda.manual_seed_all(123)

# # Model settings and hyperparameters
# CUDA = torch.cuda.is_available()
# MAX_EPOCH = 1000
# output_size = 7  # Six classes for MOSEI
# model = LFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout)
# if CUDA:
#     model.cuda()

# # Define loss, optimizer, and scheduler
# criterion = nn.CrossEntropyLoss()
# optimizer = Adam(model.parameters(), weight_decay=weight_decay)
# best_valid_loss = float('inf')
# patience = 8
# num_trials = 3
# curr_patience = patience

# for e in range(MAX_EPOCH):
#     model.train()
#     train_loss = 0.0
#     for batch in tqdm_notebook(train_loader):
#         t, v, a, y, l = batch
#         if CUDA:
#             t, v, a, y, l = t.cuda(), v.cuda(), a.cuda(), y.cuda(), l.cuda()

#         optimizer.zero_grad()
#         y_pred = model(t, v, a, l)
#         loss = criterion(y_pred, y.squeeze())  # Cross-entropy expects target in the form [N]
#         loss.backward()
#         optimizer.step()
#         train_loss += loss.item()
#     print(f"Epoch {e}, Training loss: {train_loss / len(train_loader)}")

#     # Validation
#     model.eval()
#     valid_loss = 0.0
#     with torch.no_grad():
#         for batch in dev_loader:
#             t, v, a, y, l = batch
#             if CUDA:
#                 t, v, a, y, l = t.cuda(), v.cuda(), a.cuda(), y.cuda(), l.cuda()

#             y_pred = model(t, v, a, l)
#             y = torch.argmax(y, dim=1).long()
#             loss = criterion(y_pred, y.squeeze())
#             valid_loss += loss.item()
#     valid_loss /= len(dev_loader)
#     print(f"Validation loss: {valid_loss}")

#     # Early stopping
#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), 'model_best.pt')
#         curr_patience = patience
#     else:
#         curr_patience -= 1
#         if curr_patience <= 0:
#             num_trials -= 1
#             curr_patience = patience
#             model.load_state_dict(torch.load('model_best.pt'))

#     if num_trials <= 0:
#         print("Early stopping.")
#         break

# # Test Evaluation
# model.load_state_dict(torch.load('model_best.pt'))
# model.eval()
# y_true, y_pred = [], []
# with torch.no_grad():
#     for batch in test_loader:
#         t, v, a, y, l = batch
#         if CUDA:
#             t, v, a, y, l = t.cuda(), v.cuda(), a.cuda(), y.cuda(), l.cuda()

#         logits = model(t, v, a, l)
#         predictions = torch.argmax(logits, dim=1).cpu().numpy()  # Predicted class labels
#         y_pred.extend(predictions)
#         y_true.extend(y.cpu().numpy().squeeze())

# # Calculate accuracy
# accuracy = accuracy_score(y_true, y_pred)
# print(f"Test set accuracy: {accuracy}")

# # Print sample predictions
# labels = {0: "Happy", 1: "Sad", 2: "Anger", 3: "Surprise", 4: "Disgust", 5: "Fear"}
# for i, pred in enumerate(y_pred[:5]):  # Print a few sample predictions
#     print(f"Predicted Label: {pred}, Corresponding Class: {labels[pred]}")


In [88]:
from torch.optim import Adam
from sklearn.metrics import accuracy_score

torch.manual_seed(123)

# Model settings and hyperparameters
MAX_EPOCH = 1000
output_size = 7  
model = LFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout)

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), weight_decay=weight_decay)
best_valid_loss = float('inf')
patience = 8
num_trials = 3
curr_patience = patience

for e in range(MAX_EPOCH):
    model.train()
    train_loss = 0.0
    for batch in tqdm_notebook(train_loader):
        t, v, a, y, l = batch
        if CUDA:
            t, v, a, y, l = t.cuda(), v.cuda(), a.cuda(), y.cuda(), l.cuda()

        optimizer.zero_grad()
        y_pred = model(t, v, a, l)

        # Ensure `y` is a 1D tensor with integer class labels
        y = torch.argmax(y, dim=1).long()
        
        loss = criterion(y_pred, y)  # CrossEntropyLoss expects target as [batch_size]
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"Epoch {e}, Training loss: {train_loss / len(train_loader)}")


    # Validation
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for batch in dev_loader:
            t, v, a, y, l = batch

            y_pred = model(t, v, a, l)
            loss = criterion(y_pred, y.squeeze())
            valid_loss += loss.item()
    valid_loss /= len(dev_loader)
    print(f"Validation loss: {valid_loss}")

    # Early stopping
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model_best.pt')
        curr_patience = patience
    else:
        curr_patience -= 1
        if curr_patience <= 0:
            num_trials -= 1
            curr_patience = patience
            model.load_state_dict(torch.load('model_best.pt'))

    if num_trials <= 0:
        print("Early stopping.")
        break

# Test Evaluation
model.load_state_dict(torch.load('model_best.pt'))
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in test_loader:
        t, v, a, y, l = batch

        logits = model(t, v, a, l)
        predictions = torch.argmax(logits, dim=1).numpy()  # Predicted class labels
        y_pred.extend(predictions)
        y_true.extend(y.numpy().squeeze())

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Test set accuracy: {accuracy}")

# Print sample predictions
labels = {0: "Happy", 1: "Sad", 2: "Anger", 3: "Surprise", 4: "Disgust", 5: "Fear"}
for i, pred in enumerate(y_pred[:5]):  # Print a few sample predictions
    print(f"Predicted Label: {pred}, Corresponding Class: {labels[pred]}")


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(train_loader):


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 0, Training loss: 1.5417299005266738
Validation loss: 1.9466325640678406


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 1, Training loss: 1.5254446737570306
Validation loss: 1.9445278942584991


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 2, Training loss: 1.5261231146446645
Validation loss: 1.9785599410533905


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 3, Training loss: 1.5370414644888002
Validation loss: 1.9277793268362682


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 4, Training loss: 1.5420455430468467
Validation loss: 1.910531481107076


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 5, Training loss: 1.5411946291792882
Validation loss: 1.9197011987368267


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 6, Training loss: 1.5301286496528208
Validation loss: 1.9315890272458394


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 7, Training loss: 1.5385510223369077
Validation loss: 1.939244528611501


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 8, Training loss: 1.5495178711740938
Validation loss: 1.9653646349906921


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 9, Training loss: 1.5604674228250164
Validation loss: 1.9671567976474762


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 10, Training loss: 1.5626854569944617
Validation loss: 1.9730343619982402


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 11, Training loss: 1.5615409335044965
Validation loss: 1.9723345736662548


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 12, Training loss: 1.5625904516814506
Validation loss: 1.9648740788300831


  model.load_state_dict(torch.load('model_best.pt'))


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 13, Training loss: 1.5564843612174466
Validation loss: 1.9739822149276733


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 14, Training loss: 1.5615766252556893
Validation loss: 1.9720546404520671


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 15, Training loss: 1.5622159291620124
Validation loss: 1.9646567503611247


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 16, Training loss: 1.5614457697901007
Validation loss: 1.9769028723239899


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 17, Training loss: 1.562521610766241
Validation loss: 1.9622259140014648


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 18, Training loss: 1.5627142909454972
Validation loss: 1.9781807561715443


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 19, Training loss: 1.562657409334836
Validation loss: 1.9646568695704143


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 20, Training loss: 1.5630605257537267
Validation loss: 1.9689100881417592


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 21, Training loss: 1.5634722142186883
Validation loss: 1.9828787843386333


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 22, Training loss: 1.5616856809348276
Validation loss: 1.989474594593048


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 23, Training loss: 1.5627455952232832
Validation loss: 1.969591091076533


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 24, Training loss: 1.5618079394510347
Validation loss: 1.9742011725902557


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 25, Training loss: 1.5621826828342595
Validation loss: 1.9806284805138905


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 26, Training loss: 1.5630036714958817
Validation loss: 1.9829756617546082


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 27, Training loss: 1.5623362321559697
Validation loss: 1.9831279913584392


  0%|          | 0/292 [00:00<?, ?it/s]

Epoch 28, Training loss: 1.5624813992683202
Validation loss: 1.969900220632553
Early stopping.


  model.load_state_dict(torch.load('model_best.pt'))


ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

In [87]:
print(f"y_pred shape: {y_pred.shape}")  # Should be [batch_size, num_classes]
print(f"y shape after squeeze: {y.squeeze().shape}")  # Should be [batch_size]
print("Min label:", y.min().item(), "Max label:", y.max().item())


y_pred shape: torch.Size([56, 6])
y shape after squeeze: torch.Size([56])
Min label: 0 Max label: 6


In [90]:
# Test Evaluation
model.load_state_dict(torch.load('model_best.pt'))
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in test_loader:
        t, v, a, y, l = batch
        if CUDA:
            t, v, a, y, l = t.cuda(), v.cuda(), a.cuda(), y.cuda(), l.cuda()

        logits = model(t, v, a, l)
        predictions = torch.argmax(logits, dim=1).cpu().numpy()  # Predicted class labels

        # Convert `y` to 1D and ensure it's integer labels only
        y_true_labels = torch.argmax(y, dim=1).cpu().numpy() if y.dim() > 1 else y.cpu().numpy()

        y_pred.extend(predictions)
        y_true.extend(y_true_labels)

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Test set accuracy: {accuracy}")

# Print sample predictions with class mapping
labels = {0: "Happy", 1: "Sad", 2: "Anger", 3: "Surprise", 4: "Disgust", 5: "Fear", 6: "Neutral"}
for i, pred in enumerate(y_pred[:5]):  # Print a few sample predictions
    print(f"Predicted Label: {pred}, Corresponding Class: {labels[pred]}")


  model.load_state_dict(torch.load('model_best.pt'))


Test set accuracy: 0.4351095831542759
Predicted Label: 0, Corresponding Class: Happy
Predicted Label: 0, Corresponding Class: Happy
Predicted Label: 0, Corresponding Class: Happy
Predicted Label: 0, Corresponding Class: Happy
Predicted Label: 0, Corresponding Class: Happy


## Enhanced LSTM

In [107]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class EnhancedLFLSTM(nn.Module):
    def __init__(self, input_sizes, hidden_sizes, fc1_size, output_size, dropout_rate):
        super(EnhancedLFLSTM, self).__init__()
        self.input_size = input_sizes
        self.hidden_size = hidden_sizes
        self.fc1_size = fc1_size
        self.output_size = output_size
        self.dropout_rate = dropout_rate

        # Define the RNN layers for each modality
        self.embed = nn.Embedding(len(word2id), input_sizes[0])
        self.trnn1 = nn.LSTM(input_sizes[0], hidden_sizes[0], bidirectional=True)
        self.trnn2 = nn.LSTM(2 * hidden_sizes[0], hidden_sizes[0], bidirectional=True)
        
        self.vrnn1 = nn.LSTM(input_sizes[1], hidden_sizes[1], bidirectional=True)
        self.vrnn2 = nn.LSTM(2 * hidden_sizes[1], hidden_sizes[1], bidirectional=True)
        
        self.arnn1 = nn.LSTM(input_sizes[2], hidden_sizes[2], bidirectional=True)
        self.arnn2 = nn.LSTM(2 * hidden_sizes[2], hidden_sizes[2], bidirectional=True)

        # Define fully connected layers for fusion and classification
        self.fc1 = None  # Initialized dynamically based on fusion_size in fusion()
        self.fc2 = nn.Linear(fc1_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

        # Layer normalization for each modality
        self.tlayer_norm = nn.LayerNorm(hidden_sizes[0] * 2)
        self.vlayer_norm = nn.LayerNorm(hidden_sizes[1] * 2)
        self.alayer_norm = nn.LayerNorm(hidden_sizes[2] * 2)
        
        # Initialize BatchNorm1d to None - we'll set it dynamically in `fusion`
        self.bn = None

    def extract_features(self, sequence, lengths, rnn1, rnn2, layer_norm):
        packed_sequence = pack_padded_sequence(sequence, lengths, enforce_sorted=False)
        packed_h1, (final_h1, _) = rnn1(packed_sequence)
        padded_h1, _ = pad_packed_sequence(packed_h1)
        normed_h1 = layer_norm(padded_h1)
        packed_normed_h1 = pack_padded_sequence(normed_h1, lengths, enforce_sorted=False)
        _, (final_h2, _) = rnn2(packed_normed_h1)
        return final_h1, final_h2

    def fusion(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)
        sentences = self.embed(sentences)

        # Extract features from each modality
        final_h1t, final_h2t = self.extract_features(sentences, lengths, self.trnn1, self.trnn2, self.tlayer_norm)
        final_h1v, final_h2v = self.extract_features(visual, lengths, self.vrnn1, self.vrnn2, self.vlayer_norm)
        final_h1a, final_h2a = self.extract_features(acoustic, lengths, self.arnn1, self.arnn2, self.alayer_norm)

        # Concatenate modality outputs and print sizes to verify
        h = torch.cat((final_h1t, final_h2t, final_h1v, final_h2v, final_h1a, final_h2a), dim=2).permute(1, 0, 2).contiguous().view(batch_size, -1)
        fusion_size = h.size(1)  # Dynamically calculate fusion size
        # print(f"Concatenated feature size (batch_size, fusion_size): {h.size()}")  # Debugging print

        # Dynamically initialize BatchNorm1d based on fusion size, if not already initialized
        if self.bn is None or self.bn.num_features != fusion_size:
            self.bn = nn.BatchNorm1d(fusion_size).to(h.device)
            # print(f"BatchNorm initialized with num_features={fusion_size}")  # Debugging print

        # Dynamically initialize fc1 layer based on fusion_size
        if self.fc1 is None or self.fc1.in_features != fusion_size:
            self.fc1 = nn.Linear(fusion_size, self.fc1_size).to(h.device)
            # print(f"fc1 initialized with in_features={fusion_size}, out_features={self.fc1_size}")  # Debugging print

        h = self.bn(h)  # Apply batch normalization
        return h

    def forward(self, sentences, visual, acoustic, lengths):
        batch_size = lengths.size(0)
        h = self.fusion(sentences, visual, acoustic, lengths)
        h = self.fc1(h)
        h = self.dropout(h)
        h = self.relu(h)
        o = self.fc2(h)  # Final output layer with seven classes
        return o


In [None]:
from torch.optim import Adam
from sklearn.metrics import accuracy_score
import torch

torch.manual_seed(123)

text_size = 300
visual_size = 35
acoustic_size = 74

# Define model settings and hyper-parameters
input_sizes = [text_size, visual_size, acoustic_size]
hidden_sizes = [int(text_size * 1.5), int(visual_size * 1.5), int(acoustic_size * 1.5)]
fc1_size = sum(hidden_sizes) // 2
dropout = 0.25
curr_patience = patience = 8
num_trials = 3
grad_clip_value = 1.0
weight_decay = 0.1

# Load embeddings if available
if os.path.exists(CACHE_PATH):
    pretrained_emb, word2id = torch.load(CACHE_PATH)
elif WORD_EMB_PATH is not None:
    pretrained_emb = load_emb(word2id, WORD_EMB_PATH)
    torch.save((pretrained_emb, word2id), CACHE_PATH)
else:
    pretrained_emb = None

# Model settings and hyperparameters
MAX_EPOCH = 1000
output_size = 7  
model = EnhancedLFLSTM(input_sizes, hidden_sizes, fc1_size, output_size, dropout)

# Move model to CUDA if available
if CUDA:
    model.cuda()

# Define loss, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.0005, weight_decay=0.01)
best_valid_loss = float('inf')
patience = 8
num_trials = 3
curr_patience = patience

for epoch in range(MAX_EPOCH):
    model.train()
    train_loss = 0.0
    for batch in tqdm_notebook(train_loader, desc=f"Epoch {epoch+1}/{MAX_EPOCH}"):
        t, v, a, y, l = batch
        if CUDA:
            t, v, a, y, l = t.cuda(), v.cuda(), a.cuda(), y.cuda(), l.cuda()

        optimizer.zero_grad()
        y_pred = model(t, v, a, l)

        # Ensure `y` is a 1D tensor with integer class labels
        y = torch.argmax(y, dim=1).long()
        
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {train_loss:.4f}")

    # Validation
    model.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for batch in dev_loader:
            t, v, a, y, l = batch
            if CUDA:
                t, v, a, y, l = t.cuda(), v.cuda(), a.cuda(), y.cuda(), l.cuda()

            y_pred = model(t, v, a, l)
            y = torch.argmax(y, dim=1).long()  # Convert labels to 1D
            loss = criterion(y_pred, y)
            valid_loss += loss.item()
    valid_loss /= len(dev_loader)
    print(f"Validation Loss: {valid_loss:.4f}")
    print(f"Current Patience: {curr_patience}, Trials Left: {num_trials}")

    # Early stopping
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model_best_enhanced.pt')
        curr_patience = patience
        print("New best model found and saved.")
    else:
        curr_patience -= 1
        if curr_patience <= 0:
            num_trials -= 1
            curr_patience = patience
            model.load_state_dict(torch.load('model_best_enhanced.pt'))
            print("Patience exhausted. Loading best model and reducing learning rate.")
            for g in optimizer.param_groups:
                g['lr'] = g['lr'] * 0.5  # Halve learning rate

    if num_trials <= 0:
        print("Early stopping triggered. Ending training.")
        break

# Test Evaluation
model.load_state_dict(torch.load('model_best_enhanced.pt'))
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in test_loader:
        t, v, a, y, l = batch
        if CUDA:
            t, v, a, y, l = t.cuda(), v.cuda(), a.cuda(), y.cuda(), l.cuda()

        logits = model(t, v, a, l)
        predictions = torch.argmax(logits, dim=1).cpu().numpy()  # Predicted class labels
        y_pred.extend(predictions)
        y_true.extend(y.cpu().numpy().squeeze())

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Test Set Accuracy: {accuracy:.4f}")

# Print sample predictions
labels = {0: "Happy", 1: "Sad", 2: "Anger", 3: "Surprise", 4: "Disgust", 5: "Fear", 6: "Neutral"}
for i, pred in enumerate(y_pred[:5]):  # Print a few sample predictions
    print(f"Predicted Label: {pred}, Corresponding Class: {labels[pred]}")


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm_notebook(train_loader, desc=f"Epoch {epoch+1}/{MAX_EPOCH}"):


Epoch 1/1000:   0%|          | 0/292 [00:00<?, ?it/s]