## Loading Libraries and Data 

In [None]:
# Mount Google Drive at /content/drive so the project files are reachable (requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# UPDATE THE MAIN DIRECTORY LOCATION ACCORDINGLY
# Project root on the mounted Drive; the data pickle and model checkpoint paths in this
# notebook are built with os.path.join(main_dir, ...).
main_dir = '/content/drive/MyDrive/IITB_EE/CS772_project/'

In [None]:
from IPython.display import Audio, display
import os
import pandas as pd
import re
import os
# import opensmile
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchaudio
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import random
# Seed every random number generator the notebook relies on, not just Python's `random`:
# torch drives weight initialisation and DataLoader shuffling, so leaving it unseeded makes
# training runs irreproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# !unzip '/content/drive/MyDrive/IITB_EE/CS772_project/IEMOCAP.zip'
# # Change directory to the root folder
# os.chdir('/content/IEMOCAP')
# # List the contents of the root folder
# !ls

In [None]:
# # save the DataFrame to a pickle file
# data.to_pickle('data_processed.pickle')
# Load the pickled DataFrame from the project directory and show its shape.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you created yourself.
data_pickel_path = os.path.join(main_dir,'data_processed.pickle')
data = pd.read_pickle(data_pickel_path)
data.shape

In [None]:
# Drop utterances whose whole transcript is a garbage or breathing marker, then renumber rows.
_noise_only_transcripts = ['<s> ++GARBAGE++ </s>', '<s> ++BREATHING++ </s>']
_keep_mask = ~data['trans_words'].isin(_noise_only_transcripts)
data = data[_keep_mask].reset_index(drop=True)
data.shape

# **BLSTM Based Unimodal Models**

**Note: Same models will be used for the lexical only and audio only classification**



### **Model I - Output of the final BLSTM block**

In [None]:
# Classify from the BLSTM output at the last valid time step of each sequence.
class BLSTM_lastblock(nn.Module):
    """Bidirectional LSTM classifier that reads out each sequence's final valid step.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim,num_layers, num_classes):
        super().__init__()
        self.rnn = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, so the classifier sees 2 * hidden_dim features.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x,lengths):
        """Return class logits for a padded, batch-first input.

        Args:
            x: padded batch; the LSTM is configured with batch_first=True.
            lengths: tensor with the true length of every sequence in ``x``.
        """
        packed_in = torch.nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.rnn(packed_in)
        padded_out, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        # Pick, for every row, the output at index (length - 1), i.e. its last real step.
        batch_rows = torch.arange(padded_out.size(0))
        last_step = lengths - 1
        final_states = padded_out[batch_rows, last_step, :]
        return self.fc(final_states)

### **Model II - Using average pooling**

In [None]:
# Classify from the average of the BLSTM outputs over each sequence's valid time steps.
class BLSTM_avg_pooling(nn.Module):
    """Bidirectional LSTM classifier with length-aware average pooling over time.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes):
        super().__init__()
        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=num_layers,
                           batch_first=True, bidirectional=True)
        # Both directions are concatenated, so the classifier sees 2 * hidden_dim features.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x, lengths):
        """Return class logits for a padded, batch-first input.

        Args:
            x: padded batch; the LSTM is configured with batch_first=True.
            lengths: true length of every sequence in ``x`` (tensor or list).
        """
        # Pack the padded sequence so the LSTM never processes padding.
        packed_in = torch.nn.utils.rnn.pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.rnn(packed_in)
        # Unpacking re-pads with zeros up to the longest sequence in the batch.
        padded_out, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        # Average over the real steps only: padded positions are zero, so they add nothing to
        # the sum, and dividing by the true length (instead of the padded length, as a plain
        # mean would) keeps a sample's output independent of the other sequences in its batch.
        valid_steps = torch.as_tensor(lengths, device=padded_out.device)
        valid_steps = valid_steps.to(padded_out.dtype).unsqueeze(1)
        pooled = padded_out.sum(dim=1) / valid_steps
        # Pass the pooled representation through the linear layer for classification.
        return self.fc(pooled)

### **Model III - Using context based attention pooling**

In [None]:
class ContextBasedAttention(nn.Module):
    """Additive attention pooling over dim 1 (the time axis of a batch-first input).

    Every time step is scored as ``tanh(W h_t) @ v``; the scores are softmax-normalised over
    dim 1 and the weighted sum of the inputs is returned.

    Args:
        hidden_dim: size of the last dimension of the input.
        attention_dim: size of the intermediate attention projection.
    """

    def __init__(self, hidden_dim, attention_dim):
        super().__init__()
        self.wh = nn.Linear(hidden_dim, attention_dim)
        self.v = nn.Parameter(torch.rand(attention_dim, 1))
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, lengths=None):
        """Pool ``x`` over dim 1.

        Args:
            x: input indexed as (batch, step, feature).
            lengths: optional true length of each sequence. When given, steps at or beyond a
                sequence's length receive zero attention weight; when omitted every step
                takes part in the softmax.
        """
        hi = self.wh(x)
        ei = self.tanh(hi).matmul(self.v)
        if lengths is not None:
            # Padded steps would otherwise get a non-zero share of the softmax mass.
            steps = torch.arange(x.size(1), device=x.device).unsqueeze(0)
            valid = steps < torch.as_tensor(lengths, device=x.device).unsqueeze(1)
            ei = ei.masked_fill(~valid.unsqueeze(-1), float('-inf'))
        ai = self.softmax(ei)
        z = torch.sum(ai * x, dim=1)
        return z

class BLSTMWithContextBasedAttention(nn.Module):
    """Bidirectional LSTM classifier that pools its outputs with ContextBasedAttention.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        attention_dim: size of the attention projection.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, attention_dim, num_classes):
        super().__init__()
        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=num_layers,
                           batch_first=True, bidirectional=True)
        self.attention = ContextBasedAttention(hidden_dim * 2, attention_dim)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x, lengths):
        """Return class logits for a padded, batch-first input with true ``lengths``."""
        # Pack the padded sequence
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        # Pass the packed sequence through the LSTM
        x, _ = self.rnn(x)
        # Unpack the packed sequence (re-padded with zeros up to the longest sequence)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        # Attention-weighted average over the valid steps only, so the result does not depend
        # on how much padding the rest of the batch forces onto this sequence.
        x = self.attention(x, lengths)
        x = self.fc(x)
        return x

# Acoustic Modality

### Audio Dataset Class

In [None]:
class IEMOCAP_audio(Dataset):
    """Acoustic-only dataset read from the pickle at the module-level ``data_pickel_path``.

    Each item is ``(features, label, seq_size)``: the per-utterance feature array as a tensor
    (optionally standardised), the integer-encoded emotion label as a tensor, and the number
    of rows in the feature array.

    Args:
        mean: optional per-feature mean (array-like or tensor) used for standardisation.
        std: optional per-feature standard deviation (array-like or tensor).
            Standardisation is applied only when both ``mean`` and ``std`` are given.
    """

    def __init__(self, mean=None, std=None):
        # NOTE(review): unpickling can execute arbitrary code — only load trusted files.
        data = pd.read_pickle(data_pickel_path)
        self.x = [np.array(samp_feat) for samp_feat in data['features']]
        y = data['emotion'].values
        self.label_encoder = LabelEncoder()
        self.spk = data['spk']
        self.y = self.label_encoder.fit_transform(y)
        self.n_samples = data.shape[0]
        self.mean = mean
        self.std = std

    def __getitem__(self, index):
        x_ = torch.tensor(self.x[index])
        if self.mean is not None and self.std is not None:
            # The statistics may arrive as numpy arrays or tensors of a different precision;
            # coerce them to the feature dtype so normalisation neither fails on mixed types
            # nor silently promotes the features to another dtype.
            if not x_.is_floating_point():
                x_ = x_.to(torch.get_default_dtype())
            mean = torch.as_tensor(self.mean, dtype=x_.dtype)
            std = torch.as_tensor(self.std, dtype=x_.dtype)
            x_ = (x_ - mean) / std
        seq_size = x_.shape[0]
        return x_, torch.tensor(self.y[index]), seq_size

    def __len__(self):
        return self.n_samples

    def get_spk(self):
        """Return the ``spk`` column of the loaded table."""
        return self.spk

    def get_encoder(self):
        """Return the fitted LabelEncoder (maps emotion names to integer labels)."""
        return self.label_encoder

In [None]:
# Build the un-normalised acoustic dataset and inspect one sample.
iemocap_dataset  = IEMOCAP_audio()
index = 400
sample_feat, sample_label, sample_len = iemocap_dataset[index]
print('label = ',sample_label)
print('sample feature_shape = ',sample_feat.shape)
print('original sequence length = ',sample_len)
print('inverse label transform',iemocap_dataset.get_encoder().inverse_transform([0, 1, 2,3]))
iemocap_dataset[0]

label =  tensor(2)
sample feature_shape =  torch.Size([690, 65])
original sequence length =  690


**Custom Collate Function**

In [None]:
def custom_collate_fn(batch):
    """Collate ``(features, label, seq_length)`` samples into one padded batch.

    Samples are ordered by decreasing sequence length, the feature tensors are zero-padded
    to the longest one (batch-first), and labels and lengths are returned in the same order.

    Returns:
        (padded_features, labels, seq_lengths)
    """
    features, labels, seq_lengths = zip(*batch)

    # Longest sequence first.
    sorted_lengths, order = torch.sort(torch.tensor(seq_lengths), dim=0, descending=True)
    order = order.tolist()

    sorted_features = [features[pos] for pos in order]
    sorted_labels = torch.tensor([labels[pos] for pos in order])

    return pad_sequence(sorted_features, batch_first=True), sorted_labels, sorted_lengths

**Randomly splitting the dataset and storing the test (20%) and train (80%) indices**

In [None]:
# Reproducible random 80/20 split of the sample indices.
SPLIT_SEED = 42
num_samp = len(iemocap_dataset)
train_size = int(0.8 * num_samp)
test_size = num_samp - train_size
# A dedicated generator makes the split independent of whatever else has already consumed
# the global `random` state, so re-running this cell always yields the same indices.
train_indices = random.Random(SPLIT_SEED).sample(range(num_samp), train_size)
# Set membership keeps the complement O(n) instead of O(n^2) list scans.
_train_index_set = set(train_indices)
test_indices = [i for i in range(num_samp) if i not in _train_index_set]
assert len(train_indices) + len(test_indices) == num_samp
# Show sizes and a short preview rather than dumping every index into the output.
print("train_indices", len(train_indices), train_indices[:10])
print("test_indices", len(test_indices), test_indices[:10])

In [None]:
# Loader over the raw (un-normalised) training subset; used to estimate feature statistics.
batch_size = 32
train_dataset_unnormalized = torch.utils.data.Subset(iemocap_dataset, train_indices)
train_loader_unnormalized = DataLoader(
    dataset=train_dataset_unnormalized,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate_fn,
)

[5238, 912, 204, 2253, 2006, 1828, 1143, 839, 4467, 712, 4837, 3456, 260, 244, 767, 1791, 1905, 4139, 4931, 217, 4597, 1628, 5323, 4464, 3436, 1805, 3679, 4827, 2278, 53, 1307, 3462, 2787, 2276, 1273, 1763, 2757, 837, 759, 3112, 792, 2940, 2817, 4945, 2166, 355, 3763, 4392, 1022, 3100, 645, 4522, 2401, 5149, 5066, 2962, 4729, 1575, 569, 375, 5417, 1866, 2370, 653, 1907, 827, 3113, 2277, 3714, 5207, 2988, 1332, 3032, 2910, 1716, 2187, 5308, 584, 4990, 5201, 1401, 4375, 2005, 1338, 3786, 3108, 2211, 5242, 4562, 1799, 2656, 458, 1876, 262, 2584, 3286, 2193, 542, 1728, 4646, 2577, 1741, 5369, 4089, 3241, 5266, 3758, 1170, 2169, 5513, 2020, 4598, 4415, 2152, 4788, 3509, 4780, 3271, 2965, 1796, 1133, 4174, 4042, 744, 385, 898, 1252, 5140, 1310, 3458, 4885, 520, 3152, 3126, 4881, 3834, 4334, 2059, 4532, 94, 938, 4398, 2185, 5250, 2786, 913, 2404, 3561, 1295, 3716, 26, 2157, 4100, 1463, 4158, 871, 5122, 2444, 5234, 5365, 4988, 1629, 5393, 3063, 1323, 4418, 4344, 4, 4906, 2655, 4002, 159, 916, 

### **Normalizing the Audio Data**

In [None]:
def calculate_mean_std(train_loader):
    """Compute per-feature mean and standard deviation over all valid (unpadded) frames.

    Args:
        train_loader: iterable of ``(features_batch, labels_batch, seq_sizes_batch)`` where
            ``features_batch`` is a padded batch of CPU tensors and ``seq_sizes_batch`` holds
            the number of valid leading frames of each sample.

    Returns:
        ``(mean, std)`` as numpy arrays with one entry per feature, in the dtype of the input
        features when that dtype is floating point.

    Raises:
        ValueError: if the loader yields no frames at all.
    """
    running_sum = 0
    running_sum_sq = 0
    total_count = 0
    feature_dtype = None

    for features_batch, _labels_batch, seq_sizes_batch in train_loader:
        for features, seq_size in zip(features_batch, seq_sizes_batch):
            # Plain int: keeps the count (and everything derived from it) out of torch.
            seq_size = int(seq_size)
            valid = features[:seq_size].numpy()
            if feature_dtype is None:
                feature_dtype = valid.dtype
            # Accumulate in float64; sums of squares over many frames lose precision otherwise.
            valid = valid.astype(np.float64)
            running_sum = running_sum + valid.sum(axis=0)
            running_sum_sq = running_sum_sq + np.square(valid).sum(axis=0)
            total_count += seq_size

    if total_count == 0:
        raise ValueError("cannot compute mean/std: train_loader yielded no frames")

    mean = running_sum / total_count
    # E[x^2] - E[x]^2 can dip below zero through rounding (e.g. constant features);
    # clamp so the square root never produces NaN.
    variance = np.maximum(running_sum_sq / total_count - mean ** 2, 0.0)
    std = np.sqrt(variance)

    if np.issubdtype(feature_dtype, np.floating):
        mean = mean.astype(feature_dtype)
        std = std.astype(feature_dtype)
    return mean, std

In [None]:
# Estimate the normalisation statistics on the training subset and preview them.
mean, std = calculate_mean_std(train_loader_unnormalized)
for stat in (mean, std):
    print(stat.shape,stat[:10])

### Results for Acoustic only models:
**Train Results:**

| Model | Train Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Last Block (epochs=15) | 1.1084 | 49.95% | 51.57% | 52.19% | 37.83% | 48.80% | 67.46% |
| Avg. Pool (epochs=15) | 0.9579 | 61.41% | 62.69% | 65.69% | 51.56% | 60.72% | 72.79% |
| Attention based (attention layer size - 64) (e=14) | 0.6589 | 75.23% | 76.07% | 79.86% | 68.67% | 75.00% | 80.73% | 


**Test Results:**

| Model | Test Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Last Block (e=15) | 1.1296 | 49.82% | 50.03% | 38.79% | 38.44% | 57.91% | 64.97% |
| Avg. Pool (e=15) | 1.0815 | 54.98% | 54.45% | 37.38% | 49.38% | 63.00% | 68.02% |
| Attention based (attention layer size - 64) (e=14) | 1.0620 | 59.22% | 60.32% | 60.71% | 54.18% | 56.82% | 69.57% | 

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

# **Lexical Only Model**

In [None]:
!pip install transformers
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [None]:
from transformers import BertTokenizer, BertModel
import contractions

In [None]:
# Pretrained uncased BERT plus its tokenizer, extended with the transcript's marker tokens.
BERT_CHECKPOINT = 'bert-base-uncased'
EXTRA_SPECIAL_TOKENS = ['<sil>', '++laughter++', '++breathing++']

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
# Register the markers as special tokens so the tokenizer keeps each one as a single token.
tokenizer.add_special_tokens({'additional_special_tokens': EXTRA_SPECIAL_TOKENS})
# Grow the embedding table so the newly added token ids have rows.
# NOTE(review): the added rows are presumably freshly initialised rather than pretrained — confirm.
bert_model.resize_token_embeddings(len(tokenizer))

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Independent copy of the transcript column, used below to try out the text preprocessing.
trans_words = data['trans_words'].copy()

In [None]:
# Map the <s>, </s> markers to BERT's [CLS], [SEP] tokens before tokenizing
def preprocess_text(text,tokenizer):
    """Turn one raw transcript string into a list of tokenizer ids.

    The text is lower-cased, contractions are expanded, ``<s>`` / ``</s>`` are replaced by
    ``[CLS]`` / ``[SEP]`` and ``++garbage++`` markers are removed; the result is tokenized
    with ``tokenizer`` and converted to ids.
    """
    normalized = contractions.fix(text.lower())  # expand contractions on the lower-cased text

    # The markers are matched in lower case because the text was lower-cased above.
    marker_map = (('<s>', '[CLS]'), ('</s>', '[SEP]'), ('++garbage++', ''))
    for marker, replacement in marker_map:
        normalized = normalized.replace(marker, replacement)

    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(normalized))


In [None]:
# Sanity-check the preprocessing on one transcript.
idx = 18
trans_word = trans_words.iloc[idx]
lst = trans_word.split()
print(len(lst),lst)
print('transcribed words:',trans_word)
tokenised = preprocess_text(trans_word,tokenizer)
# This line prints token ids, so label it as such (it was previously labelled as the transcript).
print('token ids:',tokenised,len(tokenised))

32 ['<s>', '<sil>', 'AW', 'DO', 'YOU', 'KNOW', 'I', 'SHOULD', 'HAVE', 'BROUGHT', 'A', 'SIX', 'PACK', '<sil>', 'TOO', '<sil>', 'A', 'SIX', 'PACK', 'THAT', 'WOULD', 'BE', 'JUST', 'THE', 'TICKET', 'RIGHT', 'ABOUT', 'NOW', '<sil>', 'HUH', '<sil>', '</s>']
transcribed words: <s> <sil> AW DO YOU KNOW I SHOULD HAVE BROUGHT A SIX PACK <sil> TOO <sil> A SIX PACK THAT WOULD BE JUST THE TICKET RIGHT ABOUT NOW <sil> HUH <sil> </s>
[CLS] <sil> aw do you know i should have brought a six pack <sil> too <sil> a six pack that would be just the ticket right about now <sil> huh <sil> [SEP]
['[CLS]', '<sil>', 'aw', 'do', 'you', 'know', 'i', 'should', 'have', 'brought', 'a', 'six', 'pack', '<sil>', 'too', '<sil>', 'a', 'six', 'pack', 'that', 'would', 'be', 'just', 'the', 'ticket', 'right', 'about', 'now', '<sil>', 'huh', '<sil>', '[SEP]']
transcribed words: [101, 30522, 22091, 2079, 2017, 2113, 1045, 2323, 2031, 2716, 1037, 2416, 5308, 30522, 2205, 30522, 1037, 2416, 5308, 2008, 2052, 2022, 2074, 1996, 728

### **Results for Lexical Only:**
**Train Results:**

| Model | Train Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Last Block (e=8/10) | 0.6268 | 76.49% | 76.50% | 78.63% | 78.26% | 74.74% | 74.38% |
| Avg. Pool (e=9/10) | 0.7926 | 70.70% | 70.29% | 71.65% | 68.73% | 75.71% | 65.08% |
| Attention based (attention layer size - 64) (e=7/10) | 0.5780 | 77.65% | 77.60% | 79.53% | 76.74% | 78.86% | 75.28% | 

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

**Test Results:**

| Model | Test Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Last Block (e=8/10) | 0.9218 | 65.04% | 64.03% | 53.74% | 64.06% | 71.31% | 67.01% |
| Avg. Pool (e=9/10) | 1.0551 | 62.68% | 60.56% | 52.34% | 70.31% | 67.83% | 51.78% |
| Attention based (attention layer size - 64) (e=7/10) | 0.9797 | 63.59% | 61.59% | 54.67% | 70.00% | 68.90% | 52.79% |

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

1. Why did the last-block method perform better?
    1. The sequences are short, so the BLSTM can capture most of the utterance context in its last block, which therefore carries most of the information. Because the model is bidirectional, the outputs of the other blocks also see the entire utterance, so average pooling should work as well. However, focusing on a single block's output may make the weights easier to optimize, since the start and end tokens are the same across all samples. This was also evident when we did not use the start and end tokens: all three models performed worse, and the last-block method did not do as well.
    2. Something to do with BERT
    3. The input feature size is large, so overfitting is possible. The more complex attention model overfits more than the simpler non-attention models.

## MM Dataset

In [None]:
# data_pickel_path = ''

In [None]:
class IEMOCAP_mm(Dataset):
    """Multimodal dataset yielding acoustic features and BERT embeddings per utterance.

    Each item is ``(feat_a, seq_size_a, embeddings, seq_size_l, label)``: the (optionally
    standardised) acoustic feature tensor and its number of rows, the BERT last-hidden-state
    embeddings of the transcript tokens and the number of tokens, and the integer-encoded
    emotion label. Transcripts are tokenized with the module-level ``preprocess_text``.

    Args:
        tokenizer: tokenizer passed to ``preprocess_text``.
        bert_model: model whose ``last_hidden_state`` output provides the embeddings.
        mean: per-feature mean for the acoustic features, or None.
        std: per-feature standard deviation, or None (no standardisation unless both are set).
        data_pickel_path: path of the pickled table with ``trans_words``, ``features`` and
            ``emotion`` columns.
        max_len: maximum number of token ids kept per transcript.
    """

    def __init__(self, tokenizer, bert_model, mean, std, data_pickel_path, max_len=512):
        # NOTE(review): unpickling can execute arbitrary code — only load trusted files.
        data = pd.read_pickle(data_pickel_path)
        # Plain lists are indexed by position for both modalities; indexing the pandas
        # Series directly would look rows up by index label instead.
        self.x_l = data['trans_words'].tolist()
        self.x_a = [np.array(samp_feat) for samp_feat in data['features']]

        y = data['emotion'].values
        self.label_encoder = LabelEncoder()
        self.y = self.label_encoder.fit_transform(y)

        self.tokenizer = tokenizer
        self.bert_model = bert_model
        self.max_len = max_len
        self.mean = mean
        self.std = std
        self.n_samples = data.shape[0]
        # get_spk() needs this attribute; fall back to None when the column is absent.
        self.spk = data['spk'] if 'spk' in data.columns else None

    def __getitem__(self, idx):

        # lexical features
        text = self.x_l[idx]
        word_ids = preprocess_text(text,self.tokenizer)
        word_ids = self.truncate(word_ids)
        embeddings = self.extract_embeddings(word_ids)
        seq_size_l = len(word_ids)

        # acoustic features
        feat_a = torch.tensor(self.x_a[idx])
        if self.mean is not None and self.std is not None:
            # Coerce the statistics (numpy or tensor, any precision) to the feature dtype so
            # normalisation neither fails on mixed types nor changes the feature dtype.
            if not feat_a.is_floating_point():
                feat_a = feat_a.to(torch.get_default_dtype())
            mean = torch.as_tensor(self.mean, dtype=feat_a.dtype)
            std = torch.as_tensor(self.std, dtype=feat_a.dtype)
            feat_a = (feat_a - mean) / std
        seq_size_a = feat_a.shape[0]

        return feat_a,seq_size_a,torch.tensor(embeddings),seq_size_l,torch.tensor(self.y[idx])

    def truncate(self, sequence):
        """Return ``sequence`` cut to at most ``max_len`` items (no padding is added)."""
        if len(sequence) > self.max_len:
            return sequence[:self.max_len]
        else:
            return sequence

    def create_attention_mask(self, input_ids):
        """Return 1 for every positive token id and 0 otherwise."""
        return [1 if token_id > 0 else 0 for token_id in input_ids]

    def extract_embeddings(self, input_ids):
        """Run ``bert_model`` on one id sequence and return its last hidden state as numpy.

        The ids are placed on the model's device and the result is moved back to the CPU,
        so the method works whether or not the model lives on a GPU.
        """
        model_device = next(self.bert_model.parameters()).device
        with torch.no_grad():
            input_ids = torch.tensor(input_ids, device=model_device).unsqueeze(0)
            outputs = self.bert_model(input_ids)
            embeddings = outputs.last_hidden_state.squeeze(0).cpu().numpy()
        return embeddings

    def __len__(self):
        return self.n_samples

    def get_spk(self):
        """Return the ``spk`` column of the loaded table, or None if it has none."""
        return self.spk

    def get_encoder(self):
        """Return the fitted LabelEncoder (maps emotion names to integer labels)."""
        return self.label_encoder

**Collate function for MM**

In [None]:
def custom_collate_fn_mm(batch):
    """Collate multimodal samples into one padded batch.

    Each sample is ``(feat_a, seq_size_a, feat_l, seq_size_l, label)``. Acoustic and lexical
    feature tensors are zero-padded (batch-first) to the longest sequence of their modality;
    lengths and labels become tensors. Sample order is preserved.

    Returns:
        (padded_feat_a, seq_size_a, padded_feat_l, seq_size_l, labels)
    """
    acoustic, acoustic_lens, lexical, lexical_lens, targets = zip(*batch)
    return (
        pad_sequence(acoustic, batch_first=True),
        torch.tensor(acoustic_lens),
        pad_sequence(lexical, batch_first=True),
        torch.tensor(lexical_lens),
        torch.tensor(targets),
    )


In [None]:
# Build the multimodal dataset with the acoustic statistics and inspect the first sample.
dataset_mm = IEMOCAP_mm(tokenizer, bert_model,mean,std,data_pickel_path)
feat = dataset_mm[0]
acoustic_shape, lexical_shape = feat[0].shape, feat[2].shape
print(acoustic_shape,lexical_shape,feat)

In [None]:
# Reproducible random 80/20 split of the multimodal sample indices.
# Sampling from the shared global `random` state here would give a split that depends on
# how often that state was used earlier in the session, so the test set could overlap the
# data used for the normalisation statistics and the pretrained unimodal models.
# A dedicated generator seeded with the notebook's seed value (42) avoids that.
MM_SPLIT_SEED = 42
num_samp = len(dataset_mm)
train_size = int(0.8 * num_samp)
test_size = num_samp - train_size
train_indices = random.Random(MM_SPLIT_SEED).sample(range(num_samp), train_size)
# Set membership keeps the complement O(n) instead of O(n^2) list scans.
_mm_train_index_set = set(train_indices)
test_indices = [i for i in range(num_samp) if i not in _mm_train_index_set]
assert len(train_indices) + len(test_indices) == num_samp
# Show sizes and a short preview rather than dumping every index into the output.
print(len(train_indices), train_indices[:10])
print(len(test_indices), test_indices[:10])

In [None]:
# Batched loaders over the multimodal train / test subsets (only training data is shuffled).
batch_size = 32

train_dataset_mm = torch.utils.data.Subset(dataset_mm, train_indices)
train_loader_mm = DataLoader(
    dataset=train_dataset_mm,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate_fn_mm,
)

test_dataset_mm = torch.utils.data.Subset(dataset_mm, test_indices)
test_loader_mm = DataLoader(
    dataset=test_dataset_mm,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=custom_collate_fn_mm,
)

[5238, 912, 204, 2253, 2006, 1828, 1143, 839, 4467, 712, 4837, 3456, 260, 244, 767, 1791, 1905, 4139, 4931, 217, 4597, 1628, 5323, 4464, 3436, 1805, 3679, 4827, 2278, 53, 1307, 3462, 2787, 2276, 1273, 1763, 2757, 837, 759, 3112, 792, 2940, 2817, 4945, 2166, 355, 3763, 4392, 1022, 3100, 645, 4522, 2401, 5149, 5066, 2962, 4729, 1575, 569, 375, 5417, 1866, 2370, 653, 1907, 827, 3113, 2277, 3714, 5207, 2988, 1332, 3032, 2910, 1716, 2187, 5308, 584, 4990, 5201, 1401, 4375, 2005, 1338, 3786, 3108, 2211, 5242, 4562, 1799, 2656, 458, 1876, 262, 2584, 3286, 2193, 542, 1728, 4646, 2577, 1741, 5369, 4089, 3241, 5266, 3758, 1170, 2169, 5513, 2020, 4598, 4415, 2152, 4788, 3509, 4780, 3271, 2965, 1796, 1133, 4174, 4042, 744, 385, 898, 1252, 5140, 1310, 3458, 4885, 520, 3152, 3126, 4881, 3834, 4334, 2059, 4532, 94, 938, 4398, 2185, 5250, 2786, 913, 2404, 3561, 1295, 3716, 26, 2157, 4100, 1463, 4158, 871, 5122, 2444, 5234, 5365, 4988, 1629, 5393, 3063, 1323, 4418, 4344, 4, 4906, 2655, 4002, 159, 916, 

## Multimodal Baseline Model

### Architecture

In [None]:
class MultimodalClassifier_baseline(nn.Module):
    """Late-fusion baseline: two pretrained BLSTMs, average pooling, one joint linear layer.

    The acoustic BLSTM is rebuilt as a ``BLSTM_avg_pooling`` and loaded from a state dict; the
    lexical BLSTM is taken from an already-instantiated model. The joint classifier is
    initialised by concatenating both unimodal ``fc`` weight matrices and summing their
    biases, so its initial logits equal the sum of the two unimodal heads' logits.

    Args:
        pretrained_acoustic_dict: state dict of the pretrained acoustic ``BLSTM_avg_pooling``.
        pretr_model_lex: pretrained lexical model exposing ``.rnn`` and an ``fc`` layer.
        hidden_dim: hidden size of each LSTM direction (must match both pretrained models).
        num_classes: number of output logits.
        input_dim_a: acoustic feature size.
        input_dim_l: lexical feature size (kept for interface compatibility; not used here).
        num_layers: number of LSTM layers of the acoustic model.
        device: device on which the acoustic model is created before loading its weights.
    """

    def __init__(self, pretrained_acoustic_dict, pretr_model_lex, hidden_dim, num_classes, input_dim_a=65, input_dim_l=768, num_layers=2, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        super().__init__()

        model_acoustic = BLSTM_avg_pooling(input_dim_a, hidden_dim, num_layers, num_classes).to(device)
        model_acoustic.load_state_dict(pretrained_acoustic_dict)

        self.blstm_acoustic = model_acoustic.rnn
        self.blstm_lex = pretr_model_lex.rnn

        # Each BLSTM contributes 2 * hidden_dim features, hence 4 * hidden_dim inputs.
        self.fc_mm = torch.nn.Linear(hidden_dim * 4, num_classes)
        lex_state = pretr_model_lex.state_dict()
        self.fc_mm.weight.data = torch.cat((pretrained_acoustic_dict['fc.weight'], lex_state['fc.weight']), dim=1)
        self.fc_mm.bias.data = pretrained_acoustic_dict['fc.bias'] + lex_state['fc.bias']

    @staticmethod
    def _length_mean(padded, lengths):
        """Average ``padded`` over dim 1 using the true ``lengths`` as divisors.

        Positions past each length are zeros after unpacking, so they add nothing to the sum;
        dividing by the true length keeps the result independent of the batch's padding.
        """
        steps = torch.as_tensor(lengths, device=padded.device).to(padded.dtype).unsqueeze(1)
        return padded.sum(dim=1) / steps

    def forward(self, x_acoustic, lengths_acoustic, x_lex, lengths_lex):
        """Return class logits from padded, batch-first acoustic and lexical inputs."""
        # Pack the padded sequences so the LSTMs never process padding
        x_acoustic = torch.nn.utils.rnn.pack_padded_sequence(x_acoustic, lengths_acoustic,
                                                             batch_first=True, enforce_sorted=False)
        x_lex = torch.nn.utils.rnn.pack_padded_sequence(x_lex, lengths_lex,
                                                        batch_first=True, enforce_sorted=False)

        output_acoustic,_ = self.blstm_acoustic(x_acoustic)
        output_lex,_ = self.blstm_lex(x_lex)

        # Unpack the packed sequences
        output_acoustic, _ = torch.nn.utils.rnn.pad_packed_sequence(output_acoustic, batch_first=True)
        output_lex, _ = torch.nn.utils.rnn.pad_packed_sequence(output_lex, batch_first=True)

        # Average over the valid time steps of each sequence
        output_acoustic = self._length_mean(output_acoustic, lengths_acoustic)
        output_lex = self._length_mean(output_lex, lengths_lex)

        # Concatenate the two pooled representations
        output_concat = torch.cat((output_acoustic, output_lex), dim=1)

        # Pass the concatenated output through the linear layer for classification
        output = self.fc_mm(output_concat)
        return output

### Training and Testing

In [None]:
# Bare expressions: only the last one is rendered as the cell output.
# NOTE(review): the recorded output shows a DataLoader, whereas the loader cell above binds
# these names to Subset objects — the saved output likely comes from a different binding; confirm.
train_dataset_mm
test_dataset_mm

<torch.utils.data.dataloader.DataLoader at 0x7f3bceedae50>

In [None]:
# Training device: GPU when available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Optimisation
learning_rate = 1e-4

# Model dimensions
input_dim_a = 65
input_dim_l = 768 # dataset_lex[0][0].shape[-1] #  = 768 (the size of the word embeddings)
hidden_dim = 128
num_layers = 2
attention_dim = 64

# Number of distinct encoded labels in the acoustic dataset
num_classes = np.unique(iemocap_dataset.y).size

In [None]:
# Pretrained acoustic weights (loaded with load_state_dict when the baseline model is built)
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
acoustic_model_path = os.path.join(main_dir, "model_acoustic_II.pth")
pretrained_acoustic_dict = torch.load(acoustic_model_path, map_location=device)

# Pretrained lexical model (its .rnn and fc parameters are reused by the baseline model)
lex_model_path = os.path.join(main_dir, "model_lex_II.pth")
pretr_model_lex = torch.load(lex_model_path, map_location=device)

In [None]:
# Instantiate the baseline multimodal model from the pretrained unimodal parts.
model_bmm = MultimodalClassifier_baseline(
    pretrained_acoustic_dict,
    pretr_model_lex,
    hidden_dim,
    num_classes,
    input_dim_a=input_dim_a,
    input_dim_l=input_dim_l,
    num_layers=num_layers,
)
print(model_bmm)

# Cross-entropy objective, optimised with Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_bmm.parameters(), lr=learning_rate)

MultimodalClassifier_baseline(
  (blstm_acoustic): LSTM(65, 128, num_layers=2, batch_first=True, bidirectional=True)
  (blstm_lex): LSTM(768, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc_mm): Linear(in_features=512, out_features=4, bias=True)
)


In [None]:
# Checkpoint naming: files are written as <save_path>/<file_name><epoch>.pth by the training cell.
file_name = 'model_bmm'
save_path =os.path.join(main_dir,"model_mm/")

In [None]:
num_epochs = 5

from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score,balanced_accuracy_score
from collections import defaultdict

# Per-class accuracy helper
def class_wise_accuracy(true_labels, predicted_labels, num_classes):
    """Return, for each class index, the fraction of its true samples predicted correctly.

    Args:
        true_labels: iterable of integer class indices (ground truth).
        predicted_labels: iterable of predicted class indices, aligned with ``true_labels``.
        num_classes: number of classes; the result has this many entries.

    Returns:
        List of accuracies; a class with no true samples gets 0.
    """
    hits = [0] * num_classes
    seen = [0] * num_classes

    for truth, guess in zip(true_labels, predicted_labels):
        seen[truth] += 1
        hits[truth] += (truth == guess)

    accuracies = []
    for n_hit, n_seen in zip(hits, seen):
        accuracies.append(n_hit / n_seen if n_seen > 0 else 0)
    return accuracies

# Training loop
# Epochs are numbered from first_epoch + 1 up to last_epoch, matching the original range(2, 8).
# NOTE(review): the run starts at epoch index 2 without loading an earlier checkpoint in this
# cell — confirm whether resuming from a saved model was intended.
first_epoch, last_epoch = 2, 8

# torch.save does not create missing directories.
os.makedirs(save_path, exist_ok=True)

for epoch in range(first_epoch, last_epoch):

    model_bmm.train()

    train_loss = 0
    train_iter = 0
    train_labels = []
    train_preds = []

    # Iterate the DataLoader: it yields padded batches built by custom_collate_fn_mm.
    # Iterating the Subset directly would yield single, unbatched samples that the model's
    # pack_padded_sequence call cannot handle.
    loop = tqdm(enumerate(train_loader_mm), total=len(train_loader_mm), leave=True)
    for i, (feat_a,seq_size_a,feat_l,seq_size_l,labels) in loop:
        feat_a = feat_a.to(device)
        feat_l = feat_l.to(device)
        labels = labels.to(device)
        # The sequence lengths stay on the CPU, as pack_padded_sequence expects.

        # Forward pass
        outputs = model_bmm(feat_a,seq_size_a,feat_l,seq_size_l)

        loss = criterion(outputs, labels)
        train_loss += loss.item()
        train_iter += 1

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f"Epoch [{epoch + 1}/{last_epoch}]")
        loop.set_postfix(loss=loss.item())

        # Store labels and predictions for training accuracy
        predicted = outputs.argmax(dim=1)
        train_labels.extend(labels.cpu().numpy())
        train_preds.extend(predicted.cpu().numpy())

    # Evaluate the model on the test set
    model_bmm.eval()
    test_loss = 0
    test_iter = 0
    test_labels = []
    test_preds = []

    with torch.no_grad():
        for feat_a,seq_size_a,feat_l,seq_size_l,labels in test_loader_mm:
            feat_a = feat_a.to(device)
            feat_l = feat_l.to(device)
            labels = labels.to(device)

            outputs = model_bmm(feat_a,seq_size_a,feat_l,seq_size_l)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            test_iter += 1

            predicted = outputs.argmax(dim=1)
            test_labels.extend(labels.cpu().numpy())
            test_preds.extend(predicted.cpu().numpy())

    # Losses are averaged per batch; accuracies are computed over all samples of the epoch.
    avg_train_loss = train_loss / train_iter
    avg_test_loss = test_loss / test_iter
    weighted_train_accuracy = accuracy_score(train_labels, train_preds)
    unweighted_train_accuracy = balanced_accuracy_score(train_labels, train_preds)
    weighted_test_accuracy = accuracy_score(test_labels, test_preds)
    unweighted_test_accuracy = balanced_accuracy_score(test_labels, test_preds)
    train_class_accuracies = class_wise_accuracy(train_labels, train_preds, num_classes)
    test_class_accuracies = class_wise_accuracy(test_labels, test_preds, num_classes)

    print(f"Epoch[{epoch + 1}] Avg Train Loss: {avg_train_loss:.4f}, Weighted Train Accuracy: {100 * weighted_train_accuracy:.2f}%, Unweighted Train Accuracy: {100 * unweighted_train_accuracy:.2f}%")
    print(f" Avg Test Loss: {avg_test_loss:.4f}, Weighted Test Accuracy: {100 * weighted_test_accuracy:.2f}%, Unweighted Test Accuracy: {100 * unweighted_test_accuracy:.2f}%")
    print(f"Train Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(train_class_accuracies)])}")
    print(f"Test Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(test_class_accuracies)])}")
    # Save a full-model checkpoint after every epoch.
    file_path_epoch = os.path.join(save_path, file_name + str(epoch+1)+'.pth')
    torch.save(model_bmm, file_path_epoch)

### Baseline Multimodal Results
**Train Results:**

| Model | Train Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| B-MM (epochs: 7) | 0.4708 | 83.99% | 84.63% | 89.65% | 80.24% | 82.46% | 86.17% |

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

**Test Results:**

| Model | Test Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| B-MM (epochs: 7) | 0.8217 | 72.46% | 72.24% | 72.90% | 72.81% | 73.19% | 70.05% |

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

## MultiModal Classifier using GMU (Gated Multimodal Unit) Arevalo et al.
Modality-based attention is a mechanism that aims to prioritize one modality over another based on the relevance of the input features for a specific task, such as capturing emotions. The bimodal Gated Multimodal Unit (GMU) cell proposed by Arevalo et al. (2017) is used to achieve this behavior. The GMU cell consists of a set of equations that incorporate a complementary mechanism over the modalities, allowing the model to prioritize one modality when necessary.

The GMU equations can be explained as follows:

$h_a = \tanh(W_a x_a + b_a)$: This equation calculates the hidden acoustic vector ($h_a$) by multiplying the acoustic input vector ($x_a$) with a weight matrix ($W_a$) and adding a bias term ($b_a$). The result is passed through the hyperbolic tangent ($\tanh$) activation function.

$h_l = \tanh(W_l x_l + b_l)$: This equation calculates the hidden lexical vector ($h_l$) by multiplying the lexical input vector ($x_l$) with a weight matrix ($W_l$) and adding a bias term ($b_l$). The result is passed through the hyperbolic tangent ($\tanh$) activation function.

$z = \sigma(W_z[x_a, x_l] + b_z)$: This equation computes the gating mechanism ($z$) by concatenating the acoustic and lexical input vectors ($x_a$ and $x_l$) and multiplying them with a weight matrix ($W_z$) and adding a bias term ($b_z$). The result is passed through the sigmoid activation function ($\sigma$) to obtain values between 0 and 1.

$h = z * h_a + (1 - z) * h_l$: This final equation computes the output hidden vector ($h$) by taking the element-wise product of the gating mechanism ($z$) and the hidden acoustic vector ($h_a$) and adding it to the element-wise product of $(1 - z)$ and the hidden lexical vector ($h_l$). This process allows the model to prioritize one modality over the other based on the input features.

In summary, the modality-based attention mechanism enables a model to focus on either the acoustic or lexical input features when determining the output, depending on the relevance of each modality for the task at hand. The bimodal GMU cell achieves this by incorporating a complementary mechanism over the modalities, allowing the model to prioritize one modality over another when necessary.

$h_a$, $h_l$ and $z$ all have the same dimension $d_g$.

The sizes of the hidden acoustic and lexical vectors and the gating vector z depend on the design choices made during the implementation of the GMU cell. The number of hidden units in the model is a tunable hyperparameter, and the gating vector z has the same size as the hidden vectors to enable element-wise multiplication for combining the contributions from both modalities.

### Model Architecture

In [None]:
import torch
import torch.nn as nn

class ContextBasedAttention_mm(nn.Module):
    """Attention pooling over dimension 1 of a ``(batch, time, hidden_dim)`` tensor.

    Each time step is projected to ``attention_dim``, squashed with tanh and
    scored against a learned context vector ``v``. The scores are softmax-
    normalised over dimension 1 and used to form a weighted sum of the inputs.

    Args:
        hidden_dim: size of the last dimension of the input.
        attention_dim: size of the intermediate attention projection.

    ``forward`` returns ``(pooled, weights)`` where ``pooled`` has shape
    ``(batch, hidden_dim)`` and ``weights`` has shape ``(batch, time, 1)``.
    """

    def __init__(self, hidden_dim, attention_dim):
        super().__init__()
        # `wh` and `v` are the state_dict keys; weights are loaded into this
        # module by name, so these attribute names must not change.
        self.wh = nn.Linear(hidden_dim, attention_dim)
        self.v = nn.Parameter(torch.rand(attention_dim, 1))
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # One unnormalised score per time step: tanh(W_h x) . v
        scores = torch.matmul(self.tanh(self.wh(x)), self.v)
        # Normalise across dimension 1 so the weights of a sequence sum to 1.
        weights = self.softmax(scores)
        # Weighted sum of the inputs along dimension 1.
        pooled = (weights * x).sum(dim=1)
        return pooled, weights

class GMU(nn.Module):
    """Bimodal Gated Multimodal Unit that fuses an acoustic and a lexical vector.

    Computes::

        h_a = tanh(W_a x_a + b_a)
        h_l = tanh(W_l x_l + b_l)
        z   = sigmoid(W_z [x_a, x_l] + b_z)
        h   = z * h_a + (1 - z) * h_l

    Args:
        input_dim_a: feature size of the acoustic input.
        input_dim_l: feature size of the lexical input.
        hidden_out_dim: size of ``h_a``, ``h_l``, ``z`` and the fused output.

    ``forward`` returns ``(h, z)``: the fused representation and the gate.
    Both inputs must share all leading dimensions; features are on the last axis.
    """

    def __init__(self, input_dim_a, input_dim_l, hidden_out_dim):
        super().__init__()

        self.fc_acoustic = nn.Linear(input_dim_a, hidden_out_dim)
        self.fc_lex = nn.Linear(input_dim_l, hidden_out_dim)
        self.fc_gate = nn.Linear(input_dim_a + input_dim_l, hidden_out_dim)

    def forward(self, output_acoustic, output_lex):
        ha = torch.tanh(self.fc_acoustic(output_acoustic))
        hl = torch.tanh(self.fc_lex(output_lex))
        # Concatenate on the feature (last) axis. For the usual 2-D
        # (batch, features) inputs this equals dim=1, and it also works for
        # unbatched vectors and inputs with extra leading dimensions.
        gate_input = torch.cat((output_acoustic, output_lex), dim=-1)
        z = torch.sigmoid(self.fc_gate(gate_input))

        # z weights the acoustic branch, (1 - z) the lexical branch, element-wise.
        h = z * ha + (1 - z) * hl
        return h, z

class MultimodalClassifier_GMU(nn.Module):
    """Acoustic + lexical classifier: per-modality BLSTM and attention, fused by a GMU.

    Each modality is encoded by a pre-trained BLSTM and pooled over time by a
    ``ContextBasedAttention_mm`` module initialised from the pre-trained
    attention weights. The two pooled vectors are fused by a ``GMU`` and
    classified by a linear layer.

    Args:
        pretrained_acoustic_dict: state dict for a ``BLSTMWithContextBasedAttention``
            built with ``input_dim_a``, ``rnn_hidden_dim``, ``num_layers``,
            ``attention_dim`` and ``num_classes``.
        pretr_model_lex: pre-trained lexical model object exposing ``.rnn`` and
            ``.attention``. Its ``rnn`` module is reused directly (not copied),
            so training this classifier also updates that object's weights.
        rnn_hidden_dim: hidden size of each BLSTM direction.
        gmu_out_dim: size of the fused GMU representation.
        num_classes: number of output classes.
        attention_dim: projection size used by the attention modules.
        input_dim_a: acoustic feature size used to rebuild the acoustic model.
        input_dim_l: kept for interface compatibility; not used here because
            the lexical BLSTM is taken from ``pretr_model_lex``.
        num_layers: number of BLSTM layers of the acoustic model.
        device: device on which the acoustic model is rebuilt before loading.
    """

    def __init__(self, pretrained_acoustic_dict, pretr_model_lex, rnn_hidden_dim,gmu_out_dim, num_classes,
                 attention_dim, input_dim_a=65, input_dim_l=768, num_layers=2,
                 device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        super().__init__()
        # Rebuild the pre-trained acoustic model from its state dict and reuse its BLSTM.
        pretr_model_acoustic = BLSTMWithContextBasedAttention(input_dim_a, rnn_hidden_dim,num_layers,
                                                              attention_dim, num_classes).to(device)
        pretr_model_acoustic.load_state_dict(pretrained_acoustic_dict)
        self.blstm_acoustic = pretr_model_acoustic.rnn
        # Reuse the BLSTM of the pre-trained lexical model.
        self.blstm_lex = pretr_model_lex.rnn

        # Low-level (within-modality) attention, initialised from the pre-trained
        # attention weights. The modules are bidirectional, hence rnn_hidden_dim*2.
        self.LL_attention_a = ContextBasedAttention_mm(rnn_hidden_dim*2, attention_dim)
        # Fix: this call was missing `self.` and raised NameError on construction.
        self.LL_attention_a.load_state_dict(pretr_model_acoustic.attention.state_dict())
        self.LL_attention_l = ContextBasedAttention_mm(rnn_hidden_dim*2, attention_dim)
        self.LL_attention_l.load_state_dict(pretr_model_lex.attention.state_dict())

        # High-level (across-modality) fusion with a GMU.
        self.gmu = GMU(rnn_hidden_dim*2, rnn_hidden_dim*2, gmu_out_dim)
        # Linear classification layer producing the logits.
        self.fc_mm = torch.nn.Linear(gmu_out_dim, num_classes)

    def forward(self, x_acoustic, lengths_acoustic, x_lex, lengths_lex):
        """Classify a batch of padded acoustic/lexical sequences.

        Args:
            x_acoustic: padded acoustic batch, batch-first.
            lengths_acoustic: true length of each acoustic sequence.
            x_lex: padded lexical batch, batch-first.
            lengths_lex: true length of each lexical sequence.

        Returns:
            ``(output, acoustic_attention, lexical_attention, modality_attention)``:
            the class logits, the per-time-step attention weights of each
            modality, and the GMU gate ``z``.
        """
        # Pack the padded batches so the BLSTMs skip the padding positions.
        x_acoustic = torch.nn.utils.rnn.pack_padded_sequence(x_acoustic, lengths_acoustic,
                                                             batch_first=True, enforce_sorted=False)
        x_lex = torch.nn.utils.rnn.pack_padded_sequence(x_lex, lengths_lex,
                                                        batch_first=True, enforce_sorted=False)

        output_acoustic,_ = self.blstm_acoustic(x_acoustic)
        output_lex,_ = self.blstm_lex(x_lex)

        # Unpack back to padded batch-first tensors.
        output_acoustic, _ = torch.nn.utils.rnn.pad_packed_sequence(output_acoustic, batch_first=True)
        output_lex, _ = torch.nn.utils.rnn.pad_packed_sequence(output_lex, batch_first=True)

        # Attention-weighted pooling over the sequence dimension (dim=1).
        # The attention modules get no length mask, so padded steps also receive weights.
        output_acoustic, acoustic_attention = self.LL_attention_a(output_acoustic)
        output_lex, lexical_attention = self.LL_attention_l(output_lex)

        # Gated fusion of the two pooled modality vectors.
        h,modality_attention = self.gmu(output_acoustic, output_lex)

        # Linear layer mapping the fused vector to class logits.
        output = self.fc_mm(h)
        return output,acoustic_attention,lexical_attention,modality_attention


### Training MMMF-ER

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Both unimodal checkpoints live directly under `main_dir`.
acoustic_model_path = os.path.join(main_dir, "model_acoustic_III.pth")
lex_model_path = os.path.join(main_dir, "model_lex_III.pth")

# The acoustic checkpoint is consumed as a state dict (it is passed to
# `load_state_dict`), whereas the lexical checkpoint is used as a model object
# (its `.rnn` and `.attention` attributes are read).
# NOTE(review): `torch.load` unpickles the file; only load checkpoints you trust.
pretrained_acoustic_dict = torch.load(acoustic_model_path, map_location=device)
pretr_model_lex = torch.load(lex_model_path, map_location=device)
print(device)

cpu


In [None]:
# Training device: GPU when available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Optimisation
learning_rate = 1e-4

# Input feature sizes
input_dim_a = 65   # acoustic feature size
input_dim_l = 768  # lexical feature size (the size of the word embeddings)

# Architecture
rnn_hidden_dim = 128
num_layers = 2
attention_dim = 64
gmu_out_dim = 128

# One output unit per distinct label value in the dataset.
num_classes = np.unique(iemocap_dataset.y).size

In [None]:
# Build the multimodal model. The architecture hyperparameters from the config
# cell are passed explicitly so that editing them there actually takes effect,
# instead of silently falling back to the constructor defaults.
model_mm_mla = MultimodalClassifier_GMU(
    pretrained_acoustic_dict, pretr_model_lex,
    rnn_hidden_dim, gmu_out_dim, num_classes, attention_dim,
    input_dim_a=input_dim_a, input_dim_l=input_dim_l, num_layers=num_layers,
    device=device,
).to(device)

# Checkpoints are written to <main_dir>/model_mm/<file_name><epoch>.pth.
# Create the directory up front so the first save cannot fail after a full
# epoch of training.
save_path = os.path.join(main_dir,'model_mm/')
os.makedirs(save_path, exist_ok=True)
file_name = 'model_mm_mla'

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_mm_mla.parameters(), lr=learning_rate)

In [None]:
# Quick check that both loaders exist. Only the last expression's repr is
# shown as the cell output; the first line has no visible effect.
train_dataset_mm
test_dataset_mm

<torch.utils.data.dataloader.DataLoader at 0x7fa183bafb50>

In [None]:
# Number of passes over the training loader; the training loop saves one
# checkpoint per epoch.
num_epochs = 2

from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score,balanced_accuracy_score
from collections import defaultdict

# Define a function to compute class-wise accuracy
def class_wise_accuracy(true_labels, predicted_labels, num_classes):
    """Return the per-class accuracy (recall) as a list of length ``num_classes``.

    Args:
        true_labels: iterable of integer class ids in ``[0, num_classes)``.
        predicted_labels: iterable of predicted class ids, aligned with ``true_labels``.
        num_classes: total number of classes.

    Returns:
        A list whose ``k``-th entry is the fraction of samples with true label
        ``k`` that were predicted as ``k``; classes with no samples get ``0``.
    """
    hits = [0] * num_classes
    seen = [0] * num_classes

    for truth, guess in zip(true_labels, predicted_labels):
        seen[truth] += 1
        hits[truth] += (truth == guess)

    accuracies = []
    for n_hits, n_seen in zip(hits, seen):
        # Guard against classes that never occur in `true_labels`.
        accuracies.append(n_hits / n_seen if n_seen > 0 else 0)
    return accuracies

# Training loop: for every epoch, train on `train_dataset_mm`, evaluate on
# `test_dataset_mm`, print loss/accuracy summaries and save a checkpoint.
for epoch in range(num_epochs):

    model_mm_mla.train()   # training mode

    # Running totals and per-sample records for this epoch's training metrics.
    train_loss = 0
    train_iter = 0
    train_labels = []
    train_preds = []

    loop = tqdm(enumerate(train_dataset_mm), total=len(train_dataset_mm), leave=True)
    for i, (feat_a,seq_size_a,feat_l,seq_size_l,labels) in loop:
        # Only the features and labels are moved to the device; the sequence
        # lengths are passed to the model as yielded by the loader.
        feat_a = feat_a.to(device)
        feat_l = feat_l.to(device)
        labels = labels.to(device)

        # Forward pass

        # The model also returns three attention tensors, unused during training.
        outputs,_,_,_ = model_mm_mla(feat_a,seq_size_a,feat_l,seq_size_l)

        loss = criterion(outputs, labels)
        train_loss += loss.item()
        train_iter += 1

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f"Epoch [{epoch + 1}/{num_epochs}]")
        loop.set_postfix(loss=loss.item())

        # Store labels and predictions for training accuracy
        # (predictions come from the weights as they were before this step's update)
        _, predicted = torch.max(outputs.data, 1)
        train_labels.extend(labels.cpu().numpy())
        train_preds.extend(predicted.cpu().numpy())

    # Evaluate the model on the test set
    model_mm_mla.eval() # evaluation mode
    test_loss = 0
    test_iter = 0
    test_labels = []
    test_preds = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for feat_a,seq_size_a,feat_l,seq_size_l,labels in test_dataset_mm:
            feat_a = feat_a.to(device)
            feat_l = feat_l.to(device)
            labels = labels.to(device)

            outputs,_,_,_ = model_mm_mla(feat_a,seq_size_a,feat_l,seq_size_l)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            test_iter += 1

            _, predicted = torch.max(outputs.data, 1)
            test_labels.extend(labels.cpu().numpy())
            test_preds.extend(predicted.cpu().numpy())

    # Losses are averaged over batches, not over samples.
    avg_train_loss = train_loss / train_iter
    avg_test_loss = test_loss / test_iter
    # "Weighted" accuracy is plain sample accuracy; "unweighted" accuracy is
    # scikit-learn's balanced accuracy (the mean of the per-class recalls).
    weighted_train_accuracy = accuracy_score(train_labels, train_preds)
    unweighted_train_accuracy = balanced_accuracy_score(train_labels, train_preds)
    weighted_test_accuracy = accuracy_score(test_labels, test_preds)
    unweighted_test_accuracy = balanced_accuracy_score(test_labels, test_preds)
    train_class_accuracies = class_wise_accuracy(train_labels, train_preds, num_classes)
    test_class_accuracies = class_wise_accuracy(test_labels, test_preds, num_classes)

    print(f"Epoch[{epoch + 1}] Avg Train Loss: {avg_train_loss:.4f}, Weighted Train Accuracy: {100 * weighted_train_accuracy:.2f}%, Unweighted Train Accuracy: {100 * unweighted_train_accuracy:.2f}%")
    print(f" Avg Test Loss: {avg_test_loss:.4f}, Weighted Test Accuracy: {100 * weighted_test_accuracy:.2f}%, Unweighted Test Accuracy: {100 * unweighted_test_accuracy:.2f}%")
    print(f"Train Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(train_class_accuracies)])}")
    print(f"Test Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(test_class_accuracies)])}")
    # One checkpoint per epoch: <save_path>/<file_name><epoch>.pth. The whole
    # model object is saved, not just its state dict.
    file_path_epoch = os.path.join(save_path,file_name + str(epoch+1)+'.pth') 
    torch.save(model_mm_mla, file_path_epoch)

In [None]:
# Save the model object again under the checkpoint name of the last epoch run
# (`epoch` still holds the final value from the training loop).
file_path_epoch = os.path.join(save_path, f"{file_name}{epoch + 1}.pth")
torch.save(model_mm_mla, file_path_epoch)

### MMMA Model Results

**Train Results:**

| Model | Train Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Epoch[2] | 0.4706 | 84.10% | 84.72% | 87.51% | 83.52% | 79.76% | 88.10% |

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

**Test Results:**

| Model | Test Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Epoch[2] | 0.6848 | 73.10% | 74.36% | 76.64% | 74.38% | 66.22% | 80.20% |

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

# Saving the Test Results

In [None]:
# Load the trained multimodal model (saved as a whole model object).
# The path is built with os.path.join, like the other checkpoint paths, so it
# is correct whether or not `main_dir` ends with a slash.
# NOTE(review): the training loop writes checkpoints to
# <main_dir>/model_mm/model_mm_mla<epoch>.pth; confirm that
# <main_dir>/model_mm_mla.pth is the intended file.
# NOTE(review): `torch.load` unpickles the file; only load checkpoints you trust.
mm_ma_model_path = os.path.join(main_dir, "model_mm_mla.pth")
pretr_model_mm_mla = torch.load(mm_ma_model_path, map_location=device)

In [None]:
# pretr_model_mm_mla = model_mm_mla.to(device)

In [None]:
# Alias for the test loader; the evaluation cell iterates over it batch by batch.
test_loader = test_dataset_mm

In [None]:
# Put the model in evaluation mode before running inference.
pretr_model_mm_mla.eval()

# Per-sample accumulators, filled in the order the loader yields samples.
sample_indices, predicted_labels, true_labels = [], [], []
predicted_probs, z_values = [], []
ll_attention_a_values, ll_attention_l_values = [], []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch_idx, (feat_a, seq_size_a, feat_l, seq_size_l, labels) in enumerate(test_loader):
        feat_a, feat_l, labels = feat_a.to(device), feat_l.to(device), labels.to(device)

        # The model returns the logits, both low-level attention maps and the GMU gate.
        logits, ll_attention_a, ll_attention_l, z = pretr_model_mm_mla(feat_a, seq_size_a, feat_l, seq_size_l)

        # Class probabilities and the most likely class per sample.
        probabilities = torch.softmax(logits, dim=1)
        predictions = probabilities.argmax(dim=1)

        # Map positions in this batch back to rows of the original dataset.
        # NOTE(review): assumes the loader is unshuffled and that `batch_size`
        # equals the loader's batch size - confirm against the loader definition.
        batch_start = batch_idx * batch_size
        original_indices = test_indices[batch_start: batch_start + batch_size]

        # Record everything sample by sample.
        per_sample = zip(original_indices, labels, predictions, probabilities,
                         z, ll_attention_a, ll_attention_l)
        for sample_id, label, pred, prob, gate, att_a, att_l in per_sample:
            sample_indices.append(sample_id)
            true_labels.append(label.item())
            predicted_labels.append(pred.item())
            predicted_probs.append(prob.cpu().numpy())
            z_values.append(gate.cpu().numpy())
            ll_attention_a_values.append(att_a.cpu().numpy())
            ll_attention_l_values.append(att_l.cpu().numpy())


In [None]:
# Look up the audio path, transcript and start stamps of every evaluated
# sample in `data`, keeping the order of `sample_indices`.
audio_paths = [data.loc[row, 'audio_path'] for row in sample_indices]
trans_words = [data.loc[row, 'trans_words'] for row in sample_indices]
start_stamps = [data.loc[row, 'start_stamp'] for row in sample_indices]

In [None]:
# Map the integer labels back to emotion category names.
# NOTE(review): assumes `get_encoder()` returns the same fitted encoder on
# every call - confirm against the dataset class.
emotion_label_encoder = iemocap_dataset.get_encoder()

# Sanity check of the id -> category mapping; the result is discarded.
emotion_label_encoder.inverse_transform([0, 1, 2, 3])

predicted_labels_cat = emotion_label_encoder.inverse_transform(predicted_labels)
true_labels_cat = emotion_label_encoder.inverse_transform(true_labels)
print(predicted_labels_cat)
print(true_labels_cat)

['neu' 'neu' 'hap' ... 'neu' 'sad' 'hap']
['neu' 'neu' 'neu' ... 'neu' 'sad' 'neu']


In [None]:
# Collect the true sequence length of every test sample for both modalities.
# `test_dataset_mm` yields batches, so each batch's lengths are flattened into
# one integer per sample. `as_tensor(...).reshape(-1)` accepts a tensor, a list
# or a single int.
# NOTE(review): assumes the loader yields samples in the same order as the
# evaluation loop saw them (i.e. it is not shuffled) - confirm.
seq_len_a = []
seq_len_l = []
for _, seq_size_a, _, seq_size_l, _ in test_dataset_mm:
    seq_len_a.extend(torch.as_tensor(seq_size_a).reshape(-1).tolist())
    seq_len_l.extend(torch.as_tensor(seq_size_l).reshape(-1).tolist())

# Every stored attention map needs exactly one length.
assert len(seq_len_a) == len(ll_attention_a_values), "acoustic length/attention count mismatch"
assert len(seq_len_l) == len(ll_attention_l_values), "lexical length/attention count mismatch"

# Drop the attention weights that fall on padding positions.
# The loop runs over the collected lists themselves, because `analysis_data`
# does not exist yet at this point. Flattening with reshape(-1) turns a (T, 1)
# map into (T,) and leaves an already-trimmed 1-D array unchanged, so this
# cell can be re-run safely.
for idx in range(len(ll_attention_a_values)):
    ll_attention_a_values[idx] = np.asarray(ll_attention_a_values[idx]).reshape(-1)[:seq_len_a[idx]]
    ll_attention_l_values[idx] = np.asarray(ll_attention_l_values[idx]).reshape(-1)[:seq_len_l[idx]]

In [None]:
import pandas as pd
# Gather the per-sample results into one table: one row per test sample, one
# column per list. All lists must have the same length.
# (The last two entries were missing their separating commas - a SyntaxError.)
data_dict = {
    'sample_indices': sample_indices,
    'predicted_labels_cat': predicted_labels_cat,
    'true_labels_cat': true_labels_cat,
    'predicted_probs': predicted_probs,
    'z_values': z_values,
    'll_attention_a_values': ll_attention_a_values,
    'll_attention_l_values': ll_attention_l_values,
    'audio_paths': audio_paths,
    'trans_words': trans_words,
    'start_stamps': start_stamps,
    'seq_len_l': seq_len_l,
    'seq_len_a': seq_len_a,
}
# Create a pandas data frame from the dictionary
analysis_data = pd.DataFrame(data_dict)

In [None]:
# Persist the analysis table as a pickle in the current working directory.
with open('analysis_data.pkl', mode='wb') as pickle_file:
    pickle.dump(analysis_data, pickle_file)

In [None]:
# Inspect a single evaluated sample; `idx` is a position in the result lists,
# not a row index of `data`.
idx = 7
print("Sample Index", sample_indices[idx])
print("Predicted Label:", predicted_labels_cat[idx])
print("True Label:", true_labels_cat[idx])
print("trans_words:", trans_words[idx])
print(start_stamps[idx])
print("Prediction Probablities:", predicted_probs[idx])
# GMU gate values, followed by the acoustic attention weights.
print(z_values[idx].T)
print(ll_attention_a_values[idx].T)

Sample Index 31
Predicted Label: hap
True Label: hap
trans_words: <s> LOOK THERE SEE WHAT'S THAT <sil> NO THAT'S <sil> THAT'S JUST SEAWEED </s>
0 4 60 67 73 76 97 106 115 119 136 145
Prediction Probablities: [0.3119609  0.6486507  0.03298217 0.00640613]
[[7.6825818e-06 1.3120624e-05 2.2176970e-05 ... 5.7989169e-08
  5.7989169e-08 5.7989169e-08]]
[[1.7885396e-08 1.1439136e-06 5.8797043e-05 5.8951700e-04 2.1982733e-03
  5.7809399e-03 7.7269054e-03 8.4549207e-03 2.1782612e-02 4.6532586e-02
  9.6331462e-02 1.0548870e-01 2.1491514e-01 2.8426430e-01 1.6519560e-01
  3.7971765e-02 2.6908647e-03 1.6449820e-05 1.5342091e-14 1.5342091e-14
  1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14
  1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14
  1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14
  1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14
  1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14 1.5342091e-14
  1.

In [None]:
# Persist `df` as a pickle in the current working directory.
# NOTE(review): `df` is not defined in any of the cells visible here; confirm
# it exists, or whether `analysis_data` was meant.
with open('data_frame.pkl', mode='wb') as pickle_file:
    pickle.dump(df, pickle_file)

## END