## Loading Libraries and Data 

In [1]:
from IPython.display import Audio, display
import os
import pickle
import torch
import torchaudio
import pandas as pd
import numpy as np
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import random
# Set the random seed
random.seed(42)

In [3]:
cwd = os.getcwd()

'/Users/ltc/Documents/Python Scripts/CS772/Project'

In [5]:
data_pickel_path = os.path.join(cwd,'data_processed.pickle')
data = pd.read_pickle(data_pickel_path)
data.shape

(5520, 12)

# **BLSTM Based Unimodal Models**

**Note: Same models will be used for the lexical only and audio only classification**



### **Model I -output of the final BLSTM block**

In [None]:
# use the output of the final BLSTM  block for the classification
class BLSTM_lastblock(nn.Module):
    def __init__(self, input_dim, hidden_dim,num_layers, num_classes):
        super().__init__()
        self.rnn = nn.LSTM(input_size=input_dim,hidden_size=hidden_dim,num_layers=num_layers,
                           batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x,lengths):
        # Pack the padded sequence
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        # Pass the packed sequence through the LSTM
        x, _ = self.rnn(x)
        # Unpack the packed sequence
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        # Select the output of the final LSTM block
        x = x[torch.arange(x.size(0)), lengths - 1, :]
        # Pass the output through the linear layer for classification
        x = self.fc(x)
        return x

In [None]:
# use the output of the final BLSTM  block for the classification
class BLSTM_CLSblock(nn.Module):
    def __init__(self, input_dim, hidden_dim,num_layers, num_classes):
        super().__init__()
        self.rnn = nn.LSTM(input_size=input_dim,hidden_size=hidden_dim,num_layers=num_layers,
                           batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x,lengths):
        # Pack the padded sequence
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        # Pass the packed sequence through the LSTM
        x, _ = self.rnn(x)
        # Unpack the packed sequence
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        # Select the output of the final LSTM block
        x = x[torch.arange(x.size(0)), 0, :]
        # Pass the output through the linear layer for classification
        x = self.fc(x)
        return x

### **Model II -Using averaging pooling**

In [None]:
# use average pooling of the outputs of the BLSTM block for classification
class BLSTM_avg_pooling(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, num_classes):
        super().__init__()
        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=num_layers,
                           batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x, lengths):
        # Pack the padded sequence
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        # Pass the packed sequence through the LSTM
        x, _ = self.rnn(x)
        # Unpack the packed sequence
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        # Compute the average across the sequence dimension (axis=1)
        x = torch.mean(x, dim=1)
        # Pass the output through the linear layer for classification
        x = self.fc(x)
        return x

### **Model III - Using context based attention pooling**

In [None]:
class ContextBasedAttention(nn.Module):
    def __init__(self, hidden_dim, attention_dim):
        super().__init__()
        self.wh = nn.Linear(hidden_dim, attention_dim)
        self.v = nn.Parameter(torch.rand(attention_dim, 1))
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        hi = self.wh(x)
        ei = self.tanh(hi).matmul(self.v)
        ai = self.softmax(ei)
        z = torch.sum(ai * x, dim=1)
        return z

class BLSTMWithContextBasedAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, attention_dim, num_classes):
        super().__init__()
        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=num_layers,
                           batch_first=True, bidirectional=True)
        self.attention = ContextBasedAttention(hidden_dim * 2, attention_dim)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x, lengths):
        # Pack the padded sequence
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        # Pass the packed sequence through the LSTM
        x, _ = self.rnn(x)
        # Unpack the packed sequence
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        # Compute the attention based weighted average across the sequence dimension (axis=1)
        x = self.attention(x)
        x = self.fc(x)
        return x

# Acousitc Modality

In [None]:
# Prepare data for training
X = []
num_frames = []
for samp_feat in data['features']:
    samp_feat = np.array(samp_feat)
    num_frames.append(samp_feat.shape[0])
    X.append(samp_feat)

num_frames = pd.DataFrame(num_frames)
num_frames.describe()

Unnamed: 0,0
count,5520.0
mean,450.925725
std,323.165689
min,54.0
25%,229.0
50%,353.0
75%,573.0
max,3409.0


### Audio Dataset Class

In [None]:
class IEMOCAP_audio(Dataset):

    def __init__(self, mean=None, std=None):
        # Initialize data, download etc.
        data = pd.read_pickle(data_pickel_path)
        self.x = [np.array(samp_feat) for samp_feat in data['features']]
        y = data['emotion'].values
        self.label_encoder = LabelEncoder()
        self.spk = data['spk'] 
        self.y = self.label_encoder.fit_transform(y)
        self.n_samples = data.shape[0]
        self.mean = mean
        self.std = std

    def __getitem__(self, index):
        x_ = torch.tensor(self.x [index])
        if self.mean is not None and self.std is not None:
            x_ = (x_ - self.mean) / self.std
        seq_size = x_.shape[0]
        return x_, torch.tensor(self.y[index]), seq_size

    def __len__(self):
        return self.n_samples

    def get_spk(self):
        return self.spk
        
    def get_encoder(self):
        return self.label_encoder

In [None]:
iemocap_dataset  = IEMOCAP_audio()
index = 400
print('label = ',iemocap_dataset[index][1])
print('sample feature_shape = ',iemocap_dataset [index][0].shape)
print('original sequence length = ',iemocap_dataset [index][2])
print('inverse label transform',iemocap_dataset.get_encoder().inverse_transform([0, 1, 2,3]))
iemocap_dataset[index]

label =  tensor(2)
sample feature_shape =  torch.Size([690, 65])
original sequence length =  690
inverse label transform ['ang' 'hap' 'neu' 'sad']


(tensor([[ 0.0000,  0.6169,  0.0000,  ..., -9.2440, -5.5332, -8.9552],
         [ 0.0000,  0.6130,  0.0000,  ..., -4.1164, -1.0473, -9.4122],
         [ 0.0000,  0.5902,  0.0000,  ..., -1.9275,  3.9153, -8.6798],
         ...,
         [ 0.0000,  0.5658,  0.0000,  ...,  1.1384,  6.4713, -5.1534],
         [ 0.0000,  0.5709,  0.0000,  ..., -6.5104,  6.0755, -3.8702],
         [ 0.0000,  0.5872,  0.0000,  ..., -2.5860,  9.4837, -6.2946]]),
 tensor(2),
 203)

**Custom Collate Function**

In [None]:
def custom_collate_fn(batch):
    features, labels,seq_lengths = zip(*batch)
    # seq_lengths = [len(seq) for seq in features]

    # Sort sequences by length in descending order
    seq_lengths, perm_idx = torch.tensor(seq_lengths).sort(0, descending=True)
    features = [features[i] for i in perm_idx]
    labels = torch.tensor([labels[i] for i in perm_idx])

    # Pad the sequences
    padded_features = pad_sequence(features, batch_first=True)
    return padded_features, labels, seq_lengths

**Randomly splitting the dataset and storing the test (20%) and train (80%) indices**

In [None]:
num_samp = len(iemocap_dataset)
train_size = int(0.8 * len(iemocap_dataset))
test_size = len(iemocap_dataset) - train_size
train_indices = random.sample(range(num_samp), train_size)
test_indices = [i for i in range(num_samp) if i not in train_indices ]
print("train_indices",train_indices)
print("test_indices",test_indices)

train_indices [5238, 912, 204, 2253, 2006, 1828, 1143, 839, 4467, 712, 4837, 3456, 260, 244, 767, 1791, 1905, 4139, 4931, 217, 4597, 1628, 5323, 4464, 3436, 1805, 3679, 4827, 2278, 53, 1307, 3462, 2787, 2276, 1273, 1763, 2757, 837, 759, 3112, 792, 2940, 2817, 4945, 2166, 355, 3763, 4392, 1022, 3100, 645, 4522, 2401, 5149, 5066, 2962, 4729, 1575, 569, 375, 5417, 1866, 2370, 653, 1907, 827, 3113, 2277, 3714, 5207, 2988, 1332, 3032, 2910, 1716, 2187, 5308, 584, 4990, 5201, 1401, 4375, 2005, 1338, 3786, 3108, 2211, 5242, 4562, 1799, 2656, 458, 1876, 262, 2584, 3286, 2193, 542, 1728, 4646, 2577, 1741, 5369, 4089, 3241, 5266, 3758, 1170, 2169, 5513, 2020, 4598, 4415, 2152, 4788, 3509, 4780, 3271, 2965, 1796, 1133, 4174, 4042, 744, 385, 898, 1252, 5140, 1310, 3458, 4885, 520, 3152, 3126, 4881, 3834, 4334, 2059, 4532, 94, 938, 4398, 2185, 5250, 2786, 913, 2404, 3561, 1295, 3716, 26, 2157, 4100, 1463, 4158, 871, 5122, 2444, 5234, 5365, 4988, 1629, 5393, 3063, 1323, 4418, 4344, 4, 4906, 2655, 40

In [None]:
batch_size = 32
train_dataset_unnormalized = torch.utils.data.Subset(iemocap_dataset, train_indices)
train_loader_unnormalized = DataLoader(dataset=train_dataset_unnormalized, batch_size=batch_size, shuffle=True,collate_fn=custom_collate_fn)

### **Normalizing the Audio Data**

In [None]:
def calculate_mean_std(train_loader):
    running_sum = 0
    running_sum_sq = 0
    total_count = 0

    for i,batch in enumerate(train_loader):
        # if i <= 2:
        features_batch, labels_batch, seq_sizes_batch = batch
        for features, seq_size in zip(features_batch, seq_sizes_batch):
            features = features.numpy()
            # print(features[:seq_size].shape)
            # print(features[:seq_size].shape,features[:seq_size])
            running_sum += np.sum(features[:seq_size], axis=0)
            running_sum_sq += np.sum(features[:seq_size] ** 2, axis=0)
            total_count += seq_size

    mean = running_sum / total_count
    std = np.sqrt(running_sum_sq / total_count - mean ** 2)

    return mean, std

In [None]:
mean, std = calculate_mean_std(train_loader_unnormalized)
print(mean.shape,mean[:10])
print(std.shape,std[:10])

torch.Size([65]) tensor([ 8.5654e+01,  5.8080e-01,  1.0940e-02,  8.4625e-03,  6.1445e-02,
        -5.3501e+01,  3.6119e-01,  1.1703e+00,  2.2729e-02,  1.0051e-01])
torch.Size([65]) tensor([1.1668e+02, 2.7898e-01, 3.0493e-02, 2.7910e-02, 1.0592e-01, 5.2843e+01,
        5.1323e-01, 6.8625e-01, 5.4571e-02, 7.9502e-02])


In [None]:
# normalised dataset ,and test and train dataloaders
iemocap_dataset_norm  = IEMOCAP_audio(mean=mean,std=std)
# initilizing the train and test datasets using the stored test and train indices
train_dataset_norm = torch.utils.data.Subset(iemocap_dataset_norm, train_indices)
test_dataset_norm = torch.utils.data.Subset(iemocap_dataset_norm, test_indices)

In [None]:
batch_size = 32

# data loaaders
train_loader_norm = DataLoader(dataset=train_dataset_norm, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
test_loader_norm = DataLoader(dataset=test_dataset_norm, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)

### **Acoustic only classifiers:**

In [None]:
test_loader_norm
train_loader_norm

<torch.utils.data.dataloader.DataLoader at 0x7f8e8dddfe20>

In [None]:
# Set up the training settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Initialize the hyperparameters
learning_rate = 1e-4 
input_dim = 65 # iemocap_dataset[0][0].shape[-1] #  = 65 (the number of frame level features)
hidden_dim = 128
num_layers = 2
num_classes = len(np.unique(iemocap_dataset.y))
attention_dim = 64

cuda


In [None]:
# Initialize the model, loss function, and optimizer

# choose a model
# model_acoustic,model_name = BLSTM_lastblock(input_dim, hidden_dim,num_layers, num_classes).to(device), 'model_acoustic_I'
# model_acoustic,model_name = BLSTM_avg_pooling(input_dim, hidden_dim,num_layers, num_classes).to(device), 'model_acoustic_II'
model_acoustic,model_name  = BLSTMWithContextBasedAttention(input_dim, hidden_dim,num_layers, attention_dim, num_classes).to(device), 'model_acoustic_III'
print(model_acoustic)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_acoustic.parameters(), lr=learning_rate)

BLSTMWithContextBasedAttention(
  (rnn): LSTM(65, 128, num_layers=2, batch_first=True, bidirectional=True)
  (attention): ContextBasedAttention(
    (wh): Linear(in_features=256, out_features=64, bias=True)
    (tanh): Tanh()
    (softmax): Softmax(dim=1)
  )
  (fc): Linear(in_features=256, out_features=4, bias=True)
)


In [None]:
save_path = os.path.join(main_dir,'model_acoustic')
print(save_path)

/content/drive/MyDrive/IITB_EE/CS772_project/model_acoustic


In [None]:
num_epochs = 15
model_ = model_acoustic

from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score,balanced_accuracy_score
from collections import defaultdict

# Define a function to compute class-wise accuracy
def class_wise_accuracy(true_labels, predicted_labels, num_classes):
    class_correct = [0] * num_classes
    class_total = [0] * num_classes
    
    for t, p in zip(true_labels, predicted_labels):
        class_correct[t] += (t == p)
        class_total[t] += 1
        
    return [correct / total if total > 0 else 0 for correct, total in zip(class_correct, class_total)]

# Training loop
for epoch in range(num_epochs):
    model_.train() # <---------------
    train_loss = 0
    train_iter = 0
    train_labels = []
    train_preds = []
    
    loop = tqdm(enumerate(train_loader_norm), total=len(train_loader_norm), leave=True)
    for i, (features, labels, seq_lengths) in loop:
        features = features.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model_(features,seq_lengths) #<----------------
        loss = criterion(outputs, labels)
        train_loss += loss.item()
        train_iter += 1

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f"Epoch [{epoch + 1}/{num_epochs}]")
        loop.set_postfix(loss=loss.item())

        # Store labels and predictions for training accuracy
        _, predicted = torch.max(outputs.data, 1)
        train_labels.extend(labels.cpu().numpy())
        train_preds.extend(predicted.cpu().numpy())

    # Evaluate the model on the test set
    model_.eval()
    test_loss = 0
    test_iter = 0
    test_labels = []
    test_preds = []
    
    with torch.no_grad():
        for features, labels, seq_lengths in test_loader_norm:
            features = features.to(device)
            labels = labels.to(device)

            outputs = model_(features,seq_lengths) #<----------------------
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            test_iter += 1
            
            _, predicted = torch.max(outputs.data, 1)
            test_labels.extend(labels.cpu().numpy())
            test_preds.extend(predicted.cpu().numpy())

    avg_train_loss = train_loss / train_iter
    avg_test_loss = test_loss / test_iter
    weighted_train_accuracy = accuracy_score(train_labels, train_preds)
    unweighted_train_accuracy = balanced_accuracy_score(train_labels, train_preds)
    weighted_test_accuracy = accuracy_score(test_labels, test_preds)
    unweighted_test_accuracy = balanced_accuracy_score(test_labels, test_preds)
    train_class_accuracies = class_wise_accuracy(train_labels, train_preds, num_classes)
    test_class_accuracies = class_wise_accuracy(test_labels, test_preds, num_classes)

    print(f"Epoch[{epoch + 1}] Avg Train Loss: {avg_train_loss:.4f}, Weighted Train Accuracy: {100 * weighted_train_accuracy:.2f}%, Unweighted Train Accuracy: {100 * unweighted_train_accuracy:.2f}%")
    print(f" Avg Test Loss: {avg_test_loss:.4f}, Weighted Test Accuracy: {100 * weighted_test_accuracy:.2f}%, Unweighted Test Accuracy: {100 * unweighted_test_accuracy:.2f}%")
    print(f"Train Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(train_class_accuracies)])}")
    print(f"Test Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(test_class_accuracies)])}")
    # Save the model checkpoint
    if epoch >= 5:
        file_path_epoch = os.path.join(save_path, model_name + str(epoch+1)+'.pth')
        torch.save(model_, file_path_epoch)

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[1] Avg Train Loss: 1.2331, Weighted Train Accuracy: 43.32%, Unweighted Train Accuracy: 45.30%
 Avg Test Loss: 1.0763, Weighted Test Accuracy: 50.91%, Unweighted Test Accuracy: 51.56%
Train Class Accuracies: Class 0: 44.77%, Class 1: 55.38%, Class 2: 16.42%, Class 3: 64.63%
Test Class Accuracies: Class 0: 44.39%, Class 1: 45.62%, Class 2: 52.28%, Class 3: 63.96%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[2] Avg Train Loss: 1.0534, Weighted Train Accuracy: 52.31%, Unweighted Train Accuracy: 53.18%
 Avg Test Loss: 1.0525, Weighted Test Accuracy: 51.81%, Unweighted Test Accuracy: 54.02%
Train Class Accuracies: Class 0: 49.16%, Class 1: 39.13%, Class 2: 58.10%, Class 3: 66.33%
Test Class Accuracies: Class 0: 43.93%, Class 1: 38.44%, Class 2: 52.01%, Class 3: 81.73%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[3] Avg Train Loss: 1.0152, Weighted Train Accuracy: 54.64%, Unweighted Train Accuracy: 55.63%
 Avg Test Loss: 1.0128, Weighted Test Accuracy: 56.16%, Unweighted Test Accuracy: 55.89%
Train Class Accuracies: Class 0: 52.53%, Class 1: 40.73%, Class 2: 60.19%, Class 3: 69.05%
Test Class Accuracies: Class 0: 48.13%, Class 1: 53.44%, Class 2: 60.05%, Class 3: 61.93%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[4] Avg Train Loss: 0.9853, Weighted Train Accuracy: 56.97%, Unweighted Train Accuracy: 57.97%
 Avg Test Loss: 1.0023, Weighted Test Accuracy: 55.16%, Unweighted Test Accuracy: 55.51%
Train Class Accuracies: Class 0: 56.02%, Class 1: 44.55%, Class 2: 61.02%, Class 3: 70.29%
Test Class Accuracies: Class 0: 42.06%, Class 1: 54.69%, Class 2: 55.23%, Class 3: 70.05%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[5] Avg Train Loss: 0.9543, Weighted Train Accuracy: 58.61%, Unweighted Train Accuracy: 59.60%
 Avg Test Loss: 0.9863, Weighted Test Accuracy: 56.07%, Unweighted Test Accuracy: 57.89%
Train Class Accuracies: Class 0: 59.06%, Class 1: 47.22%, Class 2: 61.62%, Class 3: 70.52%
Test Class Accuracies: Class 0: 49.07%, Class 1: 53.12%, Class 2: 50.67%, Class 3: 78.68%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[6] Avg Train Loss: 0.9255, Weighted Train Accuracy: 60.21%, Unweighted Train Accuracy: 61.17%
 Avg Test Loss: 0.9717, Weighted Test Accuracy: 57.34%, Unweighted Test Accuracy: 59.55%
Train Class Accuracies: Class 0: 61.87%, Class 1: 50.42%, Class 2: 61.99%, Class 3: 70.41%
Test Class Accuracies: Class 0: 62.15%, Class 1: 44.38%, Class 2: 56.03%, Class 3: 75.63%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[7] Avg Train Loss: 0.9016, Weighted Train Accuracy: 61.98%, Unweighted Train Accuracy: 63.20%
 Avg Test Loss: 0.9701, Weighted Test Accuracy: 58.88%, Unweighted Test Accuracy: 59.64%
Train Class Accuracies: Class 0: 65.80%, Class 1: 52.33%, Class 2: 61.54%, Class 3: 73.13%
Test Class Accuracies: Class 0: 59.81%, Class 1: 52.19%, Class 2: 60.05%, Class 3: 66.50%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[8] Avg Train Loss: 0.8646, Weighted Train Accuracy: 64.27%, Unweighted Train Accuracy: 65.41%
 Avg Test Loss: 0.9668, Weighted Test Accuracy: 59.06%, Unweighted Test Accuracy: 59.10%
Train Class Accuracies: Class 0: 68.95%, Class 1: 54.39%, Class 2: 64.62%, Class 3: 73.70%
Test Class Accuracies: Class 0: 46.26%, Class 1: 53.75%, Class 2: 63.81%, Class 3: 72.59%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[9] Avg Train Loss: 0.8352, Weighted Train Accuracy: 65.31%, Unweighted Train Accuracy: 66.54%
 Avg Test Loss: 0.9590, Weighted Test Accuracy: 60.05%, Unweighted Test Accuracy: 60.89%
Train Class Accuracies: Class 0: 70.75%, Class 1: 57.51%, Class 2: 62.97%, Class 3: 74.94%
Test Class Accuracies: Class 0: 61.21%, Class 1: 46.88%, Class 2: 65.42%, Class 3: 70.05%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[10] Avg Train Loss: 0.8016, Weighted Train Accuracy: 67.46%, Unweighted Train Accuracy: 68.73%
 Avg Test Loss: 0.9651, Weighted Test Accuracy: 60.51%, Unweighted Test Accuracy: 59.59%
Train Class Accuracies: Class 0: 73.00%, Class 1: 58.43%, Class 2: 66.04%, Class 3: 77.44%
Test Class Accuracies: Class 0: 59.81%, Class 1: 50.62%, Class 2: 71.58%, Class 3: 56.35%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[11] Avg Train Loss: 0.7670, Weighted Train Accuracy: 68.91%, Unweighted Train Accuracy: 70.20%
 Avg Test Loss: 0.9890, Weighted Test Accuracy: 60.05%, Unweighted Test Accuracy: 60.80%
Train Class Accuracies: Class 0: 76.15%, Class 1: 59.88%, Class 2: 67.32%, Class 3: 77.44%
Test Class Accuracies: Class 0: 56.07%, Class 1: 59.69%, Class 2: 57.37%, Class 3: 70.05%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[12] Avg Train Loss: 0.7348, Weighted Train Accuracy: 70.61%, Unweighted Train Accuracy: 71.79%
 Avg Test Loss: 0.9966, Weighted Test Accuracy: 60.05%, Unweighted Test Accuracy: 62.00%
Train Class Accuracies: Class 0: 77.84%, Class 1: 62.17%, Class 2: 69.27%, Class 3: 77.89%
Test Class Accuracies: Class 0: 62.62%, Class 1: 53.75%, Class 2: 55.50%, Class 3: 76.14%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[13] Avg Train Loss: 0.6957, Weighted Train Accuracy: 72.40%, Unweighted Train Accuracy: 73.64%
 Avg Test Loss: 1.0274, Weighted Test Accuracy: 58.97%, Unweighted Test Accuracy: 61.07%
Train Class Accuracies: Class 0: 80.31%, Class 1: 64.38%, Class 2: 70.16%, Class 3: 79.71%
Test Class Accuracies: Class 0: 62.15%, Class 1: 51.56%, Class 2: 54.42%, Class 3: 76.14%


  0%|          | 0/138 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Define the file path for saving the model
file_path_epoch = os.path.join(save_path, model_name + str(epoch+1)+'.pth')
# Save the model state dictionary to a file
torch.save(model_, file_path_epoch)

### Results for Acoustic only models:
**Train Results:**

| Model | Train Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Last Block (epochs=15) | 1.1084 | 49.95% | 51.57% | 52.19% | 37.83% | 48.80% | 67.46% |
| Avg. Pool (epochs15) | 0.9579 | 61.41% | 62.69% | 65.69% | 51.56% | 60.72% | 72.79% |
| Attention based (attention layer size - 64) (e=14) | 0.8352 | 65.31% | 66.54% | 70.75% | 57.51% | 62.97% | 74.94% | 


**Test Results:**

| Model | Test Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Last Block (e=15) | 1.1296 | 49.82% | 50.03% | 38.79% | 38.44% | 57.91% | 64.97% |
| Avg. Pool (e=15) | 1.0815 | 54.98% | 54.45% | 37.38% | 49.38% | 63.00% | 68.02% |
| Attention based (attention layer size - 64) (e=9) | 0.959 | 60.05% | 60.89% | 61.21% | 46.88% | 65.42% | 70.05% | 

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

3. (32/9)*** Avg Train Loss: 0.8352, Weighted Train Accuracy: 65.31%, Unweighted Train Accuracy: 66.54%
 Avg Test Loss: 0.9590, Weighted Test Accuracy: 60.05%, Unweighted Test Accuracy: 60.89%
Train Class Accuracies: Class 0: 70.75%, Class 1: 57.51%, Class 2: 62.97%, Class 3: 74.94%
Test Class Accuracies: Class 0: 61.21%, Class 1: 46.88%, Class 2: 65.42%, Class 3: 70.05%

1. (64)Avg Train Loss: 0.8493, Weighted Train Accuracy: 64.92%, Unweighted Train Accuracy: 66.16%
 Avg Test Loss: 0.9746, Weighted Test Accuracy: 57.34%, Unweighted Test Accuracy: 59.20%
Train Class Accuracies: Class 0: 69.52%, Class 1: 54.39%, Class 2: 65.22%, Class 3: 75.51%
Test Class Accuracies: Class 0: 59.35%, Class 1: 44.69%, Class 2: 57.64%, Class 3: 75.13%

2. (128/14) Avg Train Loss: 0.8812, Weighted Train Accuracy: 63.36%, Unweighted Train Accuracy: 64.49%
 Avg Test Loss: 0.9619, Weighted Test Accuracy: 58.88%, Unweighted Test Accuracy: 60.11%
Train Class Accuracies: Class 0: 66.37%, Class 1: 53.55%, Class 2: 63.79%, Class 3: 74.26%
Test Class Accuracies: Class 0: 54.67%, Class 1: 50.62%, Class 2: 59.52%, Class 3: 75.63%



***saved

# **lexical Only Model**

In [None]:
!pip install transformers
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, h

In [None]:
from transformers import BertTokenizer, BertModel
import contractions

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer.add_special_tokens({'additional_special_tokens': ['<sil>', '++laughter++', '++breathing++']}) # Modify the tokenizer to add special tokens
bert_model.resize_token_embeddings(len(tokenizer))  # Update the BERT model to account for the new tokens

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(30525, 768)

To retain the <sil> (silence) token's information, we can map it to a special token in BERT's vocabulary. The idea is to replace the <sil> token with a token that exists in BERT's vocabulary but doesn't appear in regular text, like [unused0]. 

It should be noted that the [unused0] token may not have a meaningful representation in BERT's pre-trained embeddings. But fine-tuning BERT on your specific dataset might help the model learn a more meaningful representation for the silence token in the context of your task.

To expand contractions like "That's" to "That is" before feeding the text into BERT, you can use a library like contractions or write a custom function to handle specific contractions. Here's how to use the contractions library

++LAUGHTER++,++GARBAGE++,++BREATHING++ total 172 instance in total
1. remove garbage
2. keep laugh and breath



In [None]:
trans_words = data['trans_words'].copy()

In [None]:
#  remove the <s>, </s>  tokens from the text before tokenizing 
def preprocess_text(text,tokenizer):
    text = text.lower()
    text = contractions.fix(text)  # Expand contractions

    # note: since we have lowered the case, we are using garbage instead of GARBAGE 
    text = text.replace('<s>', '[CLS]').replace('</s>', '[SEP]').replace('++garbage++', '') #.replace('++breathing++', '').replace('++laughter++', '').replace('<sil>', '')
    # print(text)
    tokens = tokenizer.tokenize(text)
    # print(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    return input_ids


In [None]:
idx = 2
trans_word = trans_words.iloc[idx]
lst = trans_word.split()
print(len(lst),lst)
print('transcribed words:',trans_word)
tokenised = preprocess_text(trans_words.iloc[idx],tokenizer)
print('transcribed words:',tokenised,len(tokenised))
input_ids = torch.tensor(tokenised).unsqueeze(0)
outputs = bert_model(input_ids)
print(outputs.last_hidden_state.squeeze(0))

11 ['<s>', 'YOU', 'KEEP', 'SAYING', 'MY', 'FLASHLIGHT', 'LIKE', "IT'S", 'JUST', 'YOURS', '</s>']
transcribed words: <s> YOU KEEP SAYING MY FLASHLIGHT LIKE IT'S JUST YOURS </s>
[CLS] you keep saying my flashlight like it is just yours [SEP]
['[CLS]', 'you', 'keep', 'saying', 'my', 'flashlight', 'like', 'it', 'is', 'just', 'yours', '[SEP]']
transcribed words: [101, 2017, 2562, 3038, 2026, 15257, 2066, 2009, 2003, 2074, 6737, 102] 12
tensor([[ 0.4242,  0.3604, -0.0238,  ..., -0.3269,  0.2944,  0.4648],
        [ 0.5419,  0.2180,  0.0233,  ...,  0.2271,  1.1444,  0.3384],
        [ 0.7502,  0.1607,  0.7397,  ..., -0.4460, -0.0597,  0.5583],
        ...,
        [ 0.4529,  0.0954,  0.9105,  ...,  0.2409, -0.0369,  1.0204],
        [ 0.1697,  0.0588,  0.1713,  ...,  0.3682,  0.0328,  0.5527],
        [ 0.9737,  0.4401, -0.0175,  ...,  0.0230, -0.3806, -0.4799]],
       grad_fn=<SqueezeBackward1>)


In [None]:
idx = 2
trans_word = trans_words.iloc[idx]
lst = trans_word.split()
print(len(lst),lst)
print('transcribed words:',trans_word)
tokenised = preprocess_text(trans_words.iloc[idx],tokenizer)[1:]
print('transcribed words:',tokenised,len(tokenised))
input_ids = torch.tensor(tokenised).unsqueeze(0)
outputs = bert_model(input_ids)
print(outputs.last_hidden_state.squeeze(0))

11 ['<s>', 'YOU', 'KEEP', 'SAYING', 'MY', 'FLASHLIGHT', 'LIKE', "IT'S", 'JUST', 'YOURS', '</s>']
transcribed words: <s> YOU KEEP SAYING MY FLASHLIGHT LIKE IT'S JUST YOURS </s>
[CLS] you keep saying my flashlight like it is just yours [SEP]
['[CLS]', 'you', 'keep', 'saying', 'my', 'flashlight', 'like', 'it', 'is', 'just', 'yours', '[SEP]']
transcribed words: [2017, 2562, 3038, 2026, 15257, 2066, 2009, 2003, 2074, 6737, 102] 11
tensor([[ 0.2970,  0.2857,  0.0788,  ...,  0.0213, -0.0377,  0.1018],
        [ 0.7928,  0.5266,  1.0389,  ...,  0.2718, -0.2735,  0.2367],
        [ 0.6917,  0.5986,  0.3120,  ...,  0.0748,  0.0761,  0.2420],
        ...,
        [ 0.3667, -0.1594,  0.6207,  ...,  0.6107, -0.0207,  0.6009],
        [-0.2820, -0.3096, -0.0326,  ...,  0.9092,  0.0034,  0.0447],
        [ 1.2007,  0.4542,  0.0052,  ...,  0.0946, -0.6579, -0.3773]],
       grad_fn=<SqueezeBackward1>)


### Lexical Dataset Class

In [None]:
class IEMOCAP_lexical(Dataset):
    def __init__(self, data, tokenizer, bert_model, max_len=512):
        data = pd.read_pickle(data_pickel_path)
        self.data = data['trans_words']
        y = data['emotion'].values
        self.label_encoder = LabelEncoder()
        # self.spk = data['spk'] 
        self.y = self.label_encoder.fit_transform(y)
        self.tokenizer = tokenizer
        self.bert_model = bert_model
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        input_ids = preprocess_text(text,self.tokenizer)
        seq_size = len(input_ids)
        input_ids = self.truncate(input_ids)
        # attention_mask = self.create_attention_mask(input_ids)
        embeddings = self.extract_embeddings(input_ids)#, attention_mask)
        return torch.tensor(embeddings),torch.tensor(self.y[idx]), seq_size

    def truncate(self, sequence):
        if len(sequence) > self.max_len:
            return sequence[:self.max_len]
        else:
            return sequence # + [0] * (self.max_len - len(sequence))

    def create_attention_mask(self, input_ids):
        return [1 if token_id > 0 else 0 for token_id in input_ids]

    def extract_embeddings(self, input_ids):#, attention_mask):
        with torch.no_grad():
            input_ids = torch.tensor(input_ids).unsqueeze(0)
            # attention_mask = torch.tensor(attention_mask).unsqueeze(0)
            outputs = self.bert_model(input_ids)#, attention_mask=attention_mask)
            embeddings = outputs.last_hidden_state.squeeze(0).numpy()
        return embeddings


In [None]:
dataset_lex = IEMOCAP_lexical(data, tokenizer, bert_model)
feat = dataset_lex[0]
print(feat[0].shape,feat)

torch.Size([4, 768]) (tensor([[-7.6917e-02,  3.0443e-01, -3.8888e-02,  ..., -3.1553e-02,
          6.7862e-02,  3.1871e-01],
        [ 1.5550e+00, -3.1333e-01,  2.5032e-01,  ..., -2.2655e-01,
          2.3212e-01,  9.8409e-02],
        [ 5.7058e-02, -3.9955e-01, -1.1912e-01,  ...,  5.8026e-01,
          3.8527e-01, -7.7418e-02],
        [ 8.8470e-01, -3.3663e-05, -2.3655e-01,  ...,  1.8076e-01,
         -6.3129e-01, -1.5363e-01]]), tensor(2), 4)


**Custom Collate Function**

In [None]:
def custom_collate_fn(batch):
    features, labels,seq_lengths = zip(*batch)
    # seq_lengths = [len(seq) for seq in features]

    # Sort sequences by length in descending order
    seq_lengths, perm_idx = torch.tensor(seq_lengths).sort(0, descending=True)
    features = [features[i] for i in perm_idx]
    labels = torch.tensor([labels[i] for i in perm_idx])

    # Pad the sequences
    padded_features = pad_sequence(features, batch_first=True)
    return padded_features, labels, seq_lengths

**Randomly splitting the dataset and storing the test (20%) and train (80%) indices**

In [None]:
num_samp = len(dataset_lex)
train_size = int(0.8 * len(dataset_lex))
test_size = len(dataset_lex) - train_size
train_indices = random.sample(range(num_samp), train_size)
test_indices = [i for i in range(num_samp) if i not in train_indices ]
print("train_indices",train_indices)
print("test_indices",test_indices)

train_indices [5238, 912, 204, 2253, 2006, 1828, 1143, 839, 4467, 712, 4837, 3456, 260, 244, 767, 1791, 1905, 4139, 4931, 217, 4597, 1628, 5323, 4464, 3436, 1805, 3679, 4827, 2278, 53, 1307, 3462, 2787, 2276, 1273, 1763, 2757, 837, 759, 3112, 792, 2940, 2817, 4945, 2166, 355, 3763, 4392, 1022, 3100, 645, 4522, 2401, 5149, 5066, 2962, 4729, 1575, 569, 375, 5417, 1866, 2370, 653, 1907, 827, 3113, 2277, 3714, 5207, 2988, 1332, 3032, 2910, 1716, 2187, 5308, 584, 4990, 5201, 1401, 4375, 2005, 1338, 3786, 3108, 2211, 5242, 4562, 1799, 2656, 458, 1876, 262, 2584, 3286, 2193, 542, 1728, 4646, 2577, 1741, 5369, 4089, 3241, 5266, 3758, 1170, 2169, 5513, 2020, 4598, 4415, 2152, 4788, 3509, 4780, 3271, 2965, 1796, 1133, 4174, 4042, 744, 385, 898, 1252, 5140, 1310, 3458, 4885, 520, 3152, 3126, 4881, 3834, 4334, 2059, 4532, 94, 938, 4398, 2185, 5250, 2786, 913, 2404, 3561, 1295, 3716, 26, 2157, 4100, 1463, 4158, 871, 5122, 2444, 5234, 5365, 4988, 1629, 5393, 3063, 1323, 4418, 4344, 4, 4906, 2655, 40

In [None]:
batch_size = 32

train_dataset_lex = torch.utils.data.Subset(dataset_lex, train_indices)
train_loader_lex = DataLoader(dataset=train_dataset_lex, batch_size=batch_size, shuffle=True,collate_fn=custom_collate_fn)

test_dataset_lex = torch.utils.data.Subset(dataset_lex, test_indices)
test_loader_lex = DataLoader(dataset=test_dataset_lex, batch_size=batch_size, shuffle=False,collate_fn=custom_collate_fn)

In [None]:
train_dataset_lex[0][0].shape

torch.Size([29, 768])

## **BLSTM based Lexical only classifiers:**

### Training and Testing the Audio only Model

In [None]:
dataset_lex[0][0].shape[-1]

768

In [None]:
train_loader_lex
test_loader_lex

<torch.utils.data.dataloader.DataLoader at 0x7ff23e58fe20>

In [None]:
# Set up the training settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Initialize the hyperparameters
learning_rate = 1e-4 
input_dim = 768 # dataset_lex[0][0].shape[-1] #  = 768 (the size of the word embeddings)
hidden_dim = 128
num_layers = 2
num_classes = 4 # len(np.unique(dataset_lex.y))
attention_dim = 64

cuda


In [None]:
# Initialize the model, loss function, and optimizer

# choose a model
model_lex,model_name = BLSTM_lastblock(input_dim, hidden_dim,num_layers, num_classes).to(device), 'model_lex_I'
# model_lex,model_name = BLSTM_avg_pooling(input_dim, hidden_dim,num_layers, num_classes).to(device), 'model_lex_II'
# model_lex,model_name  = BLSTMWithContextBasedAttention(input_dim, hidden_dim,num_layers, attention_dim, num_classes).to(device), 'model_lex_III'
print(model_lex)

criterion = nn.CrossEntropyLoss()
weight_decay = 1e-5
optimizer = torch.optim.Adam(model_lex.parameters(), lr=learning_rate, weight_decay=weight_decay)

BLSTM_lastblock(
  (rnn): LSTM(768, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)


In [None]:
save_path = os.path.join(main_dir,'model_lexical')

In [None]:
num_epochs = 10
model_ = model_lex

from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score,balanced_accuracy_score
from collections import defaultdict
from google.colab import files

# Define a function to compute class-wise accuracy
def class_wise_accuracy(true_labels, predicted_labels, num_classes):
    class_correct = [0] * num_classes
    class_total = [0] * num_classes
    
    for t, p in zip(true_labels, predicted_labels):
        class_correct[t] += (t == p)
        class_total[t] += 1
        
    return [correct / total if total > 0 else 0 for correct, total in zip(class_correct, class_total)]

# Training loop

for epoch in range(num_epochs):
    model_.train() # <---------------------------
    train_loss = 0
    train_iter = 0
    train_labels = []
    train_preds = []
    
    loop = tqdm(enumerate(train_loader_lex), total=len(train_loader_lex), leave=True)
    for i, (features, labels, seq_lengths) in loop:
        features = features.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model_(features,seq_lengths) # <-----------------
        loss = criterion(outputs, labels)
        train_loss += loss.item()
        train_iter += 1

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_description(f"Epoch [{epoch + 1}/{num_epochs}]")
        loop.set_postfix(loss=loss.item())

        # Store labels and predictions for training accuracy
        _, predicted = torch.max(outputs.data, 1)
        train_labels.extend(labels.cpu().numpy())
        train_preds.extend(predicted.cpu().numpy())

    # Evaluate the model on the test set
    model_.eval() # <-----------------------------
    test_loss = 0
    test_iter = 0
    test_labels = []
    test_preds = []
    
    with torch.no_grad():
        for features, labels, seq_lengths in test_loader_lex:
            features = features.to(device)
            labels = labels.to(device)

            outputs = model_(features,seq_lengths) #<------------------
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            test_iter += 1
            
            _, predicted = torch.max(outputs.data, 1)
            test_labels.extend(labels.cpu().numpy())
            test_preds.extend(predicted.cpu().numpy())

    avg_train_loss = train_loss / train_iter
    avg_test_loss = test_loss / test_iter
    weighted_train_accuracy = accuracy_score(train_labels, train_preds)
    unweighted_train_accuracy = balanced_accuracy_score(train_labels, train_preds)
    weighted_test_accuracy = accuracy_score(test_labels, test_preds)
    unweighted_test_accuracy = balanced_accuracy_score(test_labels, test_preds)
    train_class_accuracies = class_wise_accuracy(train_labels, train_preds, num_classes)
    test_class_accuracies = class_wise_accuracy(test_labels, test_preds, num_classes)

    print(f"Epoch[{epoch + 1}] Avg Train Loss: {avg_train_loss:.4f}, Weighted Train Accuracy: {100 * weighted_train_accuracy:.2f}%, Unweighted Train Accuracy: {100 * unweighted_train_accuracy:.2f}%")
    print(f" Avg Test Loss: {avg_test_loss:.4f}, Weighted Test Accuracy: {100 * weighted_test_accuracy:.2f}%, Unweighted Test Accuracy: {100 * unweighted_test_accuracy:.2f}%")
    print(f"Train Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(train_class_accuracies)])}")
    print(f"Test Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(test_class_accuracies)])}")
    # Save the model checkpoint
    # if epoch >= 4 and avg_test_loss<= avg_test_loss_best:
    if epoch >= 2:
        file_path_epoch = os.path.join(save_path, model_name + str(epoch+1)+'.pth')
        torch.save(model_, file_path_epoch)

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[1] Avg Train Loss: 1.3251, Weighted Train Accuracy: 36.53%, Unweighted Train Accuracy: 31.30%
 Avg Test Loss: 1.1894, Weighted Test Accuracy: 48.01%, Unweighted Test Accuracy: 43.54%
Train Class Accuracies: Class 0: 3.15%, Class 1: 61.17%, Class 2: 54.42%, Class 3: 6.46%
Test Class Accuracies: Class 0: 22.43%, Class 1: 70.62%, Class 2: 54.69%, Class 3: 26.40%


  0%|          | 0/138 [00:00<?, ?it/s]

Epoch[2] Avg Train Loss: 1.0963, Weighted Train Accuracy: 53.51%, Unweighted Train Accuracy: 52.69%
 Avg Test Loss: 1.0351, Weighted Test Accuracy: 58.06%, Unweighted Test Accuracy: 57.66%
Train Class Accuracies: Class 0: 47.81%, Class 1: 52.71%, Class 2: 60.79%, Class 3: 49.43%
Test Class Accuracies: Class 0: 48.60%, Class 1: 62.19%, Class 2: 57.91%, Class 3: 61.93%


  0%|          | 0/138 [00:00<?, ?it/s]

In [None]:
model_ = torch.load(os.path.join(main_dir+'model_lex_I.pth'))
print(model_)

BLSTM_CLSblock(
  (rnn): LSTM(768, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)


In [None]:
# Evaluate the model on the test set
model_.eval() # <-----------------------------
test_loss = 0
test_iter = 0
test_labels = []
test_preds = []

with torch.no_grad():
    for features, labels, seq_lengths in test_loader_lex:
        features = features.to(device)
        labels = labels.to(device)

        outputs = model_(features,seq_lengths) #<------------------
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        test_iter += 1
        
        _, predicted = torch.max(outputs.data, 1)
        test_labels.extend(labels.cpu().numpy())
        test_preds.extend(predicted.cpu().numpy())

avg_train_loss = train_loss / train_iter
avg_test_loss = test_loss / test_iter
weighted_train_accuracy = accuracy_score(train_labels, train_preds)
unweighted_train_accuracy = balanced_accuracy_score(train_labels, train_preds)
weighted_test_accuracy = accuracy_score(test_labels, test_preds)
unweighted_test_accuracy = balanced_accuracy_score(test_labels, test_preds)
train_class_accuracies = class_wise_accuracy(train_labels, train_preds, num_classes)
test_class_accuracies = class_wise_accuracy(test_labels, test_preds, num_classes)

# print(f"Epoch[{epoch + 1}] Avg Train Loss: {avg_train_loss:.4f}, Weighted Train Accuracy: {100 * weighted_train_accuracy:.2f}%, Unweighted Train Accuracy: {100 * unweighted_train_accuracy:.2f}%")
print(f" Avg Test Loss: {avg_test_loss:.4f}, Weighted Test Accuracy: {100 * weighted_test_accuracy:.2f}%, Unweighted Test Accuracy: {100 * unweighted_test_accuracy:.2f}%")
# print(f"Train Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(train_class_accuracies)])}")
print(f"Test Class Accuracies: {', '.join([f'Class {i}: {100 * acc:.2f}%' for i, acc in enumerate(test_class_accuracies)])}")

 Avg Test Loss: 0.9369, Weighted Test Accuracy: 63.13%, Unweighted Test Accuracy: 63.18%
Test Class Accuracies: Class 0: 62.62%, Class 1: 65.00%, Class 2: 61.66%, Class 3: 63.45%


In [None]:
file_path_epoch = os.path.join(save_path, model_name + str(epoch+1)+'.pth')
torch.save(model_, file_path_epoch)

### **Reslut for Lexical Only:**
**Train Results:**

| Model | Train Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Last Block (e=8/10) | 0.6268 | 76.49% | 76.50% | 78.63% | 78.26% | 74.74% | 74.38% |
| Avg. Pool (e=9/10) | 0.7926 | 70.70% | 70.29% | 71.65% | 68.73% | 75.71% | 65.08% |
| Attention based (attention layer size - 64) (e=7/10) | 0.5780 | 77.65% | 77.60% | 79.53% | 76.74% | 78.86% | 75.28% | 

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

**Test Results:**

| Model | Test Loss | Weighted Accuracy | Unweighted Accuracy | Angry Acc. | Happy Acc. | Neutral Acc. | Sad Acc. |
|-------|-----------|------------------|---------------------|------------|------------|--------------|----------|
| Last Block (e=8/10) | 0.9218 | 65.04% | 64.03% | 53.74% | 64.06% | 71.31% | 67.01% |
| Avg. Pool (e=9/10) | 1.0551 | 62.68% | 60.56% | 52.34% | 70.31% | 67.83% | 51.78% |
| Attention based (attention layer size - 64) (e=7/10) | 0.9797 | 63.59% | 61.59% | 54.67% | 70.00% | 68.90% | 52.79% |

Note: The categorical names used are: Angry, Happy, Neutral, and Sad.

First Block :  
    1. Epoch[7] Avg Train Loss: 0.6517, Weighted Train Accuracy: 75.52%, Unweighted Train Accuracy: 75.71%
    2.  Avg Test Loss: 0.9369, Weighted Test Accuracy: 63.13%, Unweighted Test Accuracy: 63.18%
    3. Train Class Accuracies: Class 0: 81.44%, Class 1: 76.89%, Class 2: 72.64%, Class 3: 71.88%
    4. Test Class Accuracies: Class 0: 62.62%, Class 1: 65.00%, Class 2: 61.66%, Class 3: 63.45%

1. Why the last block method performed better?
    1. Shorter sequences show BLSTM able to capture most of the utterance context in the last block. So the last block has most of the information. But since we have a BLSTM model the other blocks output would also have the context to the entire utterance so averag pooling should also work. But maybe focusing on just one block's output makes it easier for the model to optimize weights sice we have a start and end token that are same across all the samples. It was also evident as when we didnt use the start and end tokens all the three models performed worst and the last block output method didnt do as good.
    2. Something to do with BERT
    3. Large feature input size so overfitting is possible. The more complex attention model overfits more compared to the simepler non attention based model.

# END

## Multimodal

1.  Use the pretrained acoustic and lexical model and fuse (by concatenation) there BLSTM pooled output  to get multimodal representation that can then be passed thorugth a linear layer for classification.
2. Fusion using GMU attention

1. Lexical and acoustic model with average pooling as input.
2. Concatenates the pooled output from the two modalities to get a multimodal representaiton that is then passed through a softmax classifier.

### Dataset

In [None]:
# data_pickel_path = ''

In [None]:
class IEMOCAP_mm(Dataset):
    def __init__(self, tokenizer, bert_model, mean, std, data_pickel_path, max_len=512):
        data = pd.read_pickle(data_pickel_path)
        self.x_l = data['trans_words']
        self.x_a = [np.array(samp_feat) for samp_feat in data['features']]

        y = data['emotion'].values
        self.label_encoder = LabelEncoder()
        self.y = self.label_encoder.fit_transform(y)

        self.tokenizer = tokenizer
        self.bert_model = bert_model
        self.max_len = max_len
        self.mean = mean
        self.std = std
        self.n_samples = data.shape[0]
        # self.spk = data['spk'] 

    def __getitem__(self, idx):

        # lexical features
        text = self.x_l[idx]
        word_ids = preprocess_text(text,self.tokenizer)
        word_ids = self.truncate(word_ids)
        embeddings = self.extract_embeddings(word_ids) 
        seq_size_l = len(word_ids)

        # acoustic features
        feat_a = torch.tensor(self.x_a[idx])
        if self.mean is not None and self.std is not None:
            feat_a = (feat_a - self.mean) / self.std
        seq_size_a = feat_a.shape[0]

        return feat_a,seq_size_a,torch.tensor(embeddings),seq_size_l,torch.tensor(self.y[idx])

    def truncate(self, sequence):
        if len(sequence) > self.max_len:
            return sequence[:self.max_len]
        else:
            return sequence # + [0] * (self.max_len - len(sequence))

    def create_attention_mask(self, input_ids):
        return [1 if token_id > 0 else 0 for token_id in input_ids]

    def extract_embeddings(self, input_ids):#, attention_mask):
        with torch.no_grad():
            input_ids = torch.tensor(input_ids).unsqueeze(0)
            # attention_mask = torch.tensor(attention_mask).unsqueeze(0)
            outputs = self.bert_model(input_ids)#, attention_mask=attention_mask)
            embeddings = outputs.last_hidden_state.squeeze(0).numpy()
        return embeddings

    def __len__(self):
        return self.n_samples

    def get_spk(self):
        return self.spk
        
    def get_encoder(self):
        return self.label_encoder

In [None]:
dataset_mm = IEMOCAP_mm(tokenizer, bert_model,mean,std,data_pickel_path)

In [None]:
feat = dataset_mm[0]
print(feat[0].shape,feat[2].shape,feat)

torch.Size([203, 65]) torch.Size([4, 768]) (tensor([[-0.7341,  0.1294, -0.3588,  ..., -0.0663, -0.8278, -0.3104],
        [-0.7341,  0.1156, -0.3588,  ...,  0.4177, -0.3671, -0.3634],
        [-0.7341,  0.0338, -0.3588,  ...,  0.6244,  0.1425, -0.2785],
        ...,
        [-0.7341, -0.0537, -0.3588,  ...,  0.9138,  0.4050,  0.1305],
        [-0.7341, -0.0355, -0.3588,  ...,  0.1917,  0.3644,  0.2793],
        [-0.7341,  0.0230, -0.3588,  ...,  0.5622,  0.7144, -0.0018]]), 203, tensor([[-0.1388,  0.2195, -0.0053,  ...,  0.0023,  0.0133,  0.3097],
        [ 1.5709, -0.4490,  0.4407,  ..., -0.3449,  0.0637, -0.0286],
        [-0.4176, -0.6967, -0.0282,  ...,  0.8575,  0.4115, -0.4271],
        [ 0.8579, -0.0590, -0.1324,  ...,  0.1821, -0.6168, -0.2024]]), 4, tensor(2))


In [None]:
def custom_collate_fn_mm(batch):
    feat_a, seq_size_a, feat_l, seq_size_l, labels = zip(*batch)

    # Pad the sequences
    padded_feat_a = pad_sequence(feat_a, batch_first=True)
    padded_feat_l = pad_sequence(feat_l, batch_first=True)

    # Convert sequence lengths and labels to tensors
    seq_size_a = torch.tensor(seq_size_a)
    seq_size_l = torch.tensor(seq_size_l)
    labels = torch.tensor(labels)

    return padded_feat_a, seq_size_a, padded_feat_l, seq_size_l, labels


In [None]:
# from torch.utils.data import random_split
import random
# Set the random seed
random.seed(42)
num_samp = len(dataset_mm)
train_size = int(0.8 * len(dataset_mm))
test_size = len(dataset_mm) - train_size
train_indices = random.sample(range(num_samp), train_size)
test_indices = [i for i in range(num_samp) if i not in train_indices ]
print(train_indices)
print(test_indices)
batch_size = 32
train_dataset_mm = torch.utils.data.Subset(dataset_mm, train_indices)
train_dataset_mm = DataLoader(dataset=train_dataset_mm, batch_size=batch_size, shuffle=True,collate_fn=custom_collate_fn_mm)

test_dataset_mm = torch.utils.data.Subset(dataset_mm, test_indices)
test_dataset_mm = DataLoader(dataset=test_dataset_mm, batch_size=batch_size, shuffle=False,collate_fn=custom_collate_fn_mm)

[5238, 912, 204, 2253, 2006, 1828, 1143, 839, 4467, 712, 4837, 3456, 260, 244, 767, 1791, 1905, 4139, 4931, 217, 4597, 1628, 5323, 4464, 3436, 1805, 3679, 4827, 2278, 53, 1307, 3462, 2787, 2276, 1273, 1763, 2757, 837, 759, 3112, 792, 2940, 2817, 4945, 2166, 355, 3763, 4392, 1022, 3100, 645, 4522, 2401, 5149, 5066, 2962, 4729, 1575, 569, 375, 5417, 1866, 2370, 653, 1907, 827, 3113, 2277, 3714, 5207, 2988, 1332, 3032, 2910, 1716, 2187, 5308, 584, 4990, 5201, 1401, 4375, 2005, 1338, 3786, 3108, 2211, 5242, 4562, 1799, 2656, 458, 1876, 262, 2584, 3286, 2193, 542, 1728, 4646, 2577, 1741, 5369, 4089, 3241, 5266, 3758, 1170, 2169, 5513, 2020, 4598, 4415, 2152, 4788, 3509, 4780, 3271, 2965, 1796, 1133, 4174, 4042, 744, 385, 898, 1252, 5140, 1310, 3458, 4885, 520, 3152, 3126, 4881, 3834, 4334, 2059, 4532, 94, 938, 4398, 2185, 5250, 2786, 913, 2404, 3561, 1295, 3716, 26, 2157, 4100, 1463, 4158, 871, 5122, 2444, 5234, 5365, 4988, 1629, 5393, 3063, 1323, 4418, 4344, 4, 4906, 2655, 4002, 159, 916, 

### Multimodal models

In [None]:
input_dim_a,input_dim_l, hidden_dim,num_layers, num_classes = 65,768,128,2,4

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_acoustic = BLSTM_avg_pooling(input_dim_a, hidden_dim,num_layers, num_classes).to(device)
model_lex = BLSTM_avg_pooling(input_dim_l, hidden_dim,num_layers, num_classes).to(device)

In [None]:
# Load the pre-trained model
acoustic_model_path = "/content/drive/MyDrive/IITB_EE/CS772_project/model_acoustic_II.pth"
ptmodel_acoustic = torch.load(acoustic_model_path,map_location=torch.device(device))

# Load the pre-trained model
lex_model_path = "/content/drive/MyDrive/IITB_EE/CS772_project/model_lex_II.pth"
ptmodel_lex = torch.load(lex_model_path,map_location=torch.device(device))

# Load the state_dict, assuming you have the 'acoustic_model' variable
model_acoustic.load_state_dict(ptmodel_acoustic)
model_lex.load_state_dict(ptmodel_lex)

<All keys matched successfully>

In [None]:
# Extract the BLSTM directly from the model
blstm_acoustic = model_acoustic.rnn
blstm_lex = model_lex.rnn

In [None]:
# blstm_acoustic = acoustic_model['rnn']
# blstm_lex = acoustic_model['rnn']
# fc_a = acoustic_model['fc']
# fc_l = lex_model['fc']
# fc_mm = torch.nn.Linear(hidden_dim * 4, num_classes) 
# fc_mm.weight.data = torch.cat((fc_a.weight, fc_l.weight), dim=1)
# fc_mm.bias.data = fc_a.bias + fc_l.bias