In [49]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [50]:
# ----------------------------
# 1. Load Speech Dataset
# ----------------------------
speech_dir = 'dataset/speech'
speech_data = []

for file in os.listdir(speech_dir):
    if file.endswith('.wav'):
        parts = file.split('_')
        if len(parts) == 3:
            word = parts[1]
            emotion = parts[2].replace('.wav', '')
            speech_data.append({
                'word': word,
                'emotion': emotion,
                'speech_path': os.path.join(speech_dir, file)
            })

speech_df = pd.DataFrame(speech_data)
speech_df.to_csv('speech_word_dataset.csv', index=False)
print(speech_df.head())
print(speech_df.__len__())
print(speech_df['emotion'].unique().__len__())
print(speech_df['emotion'].unique())

# ----------------------------
# 2. Load Text Dataset
# ----------------------------
def load_csvs_from_dir(directory):
    combined_df = pd.DataFrame()
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            df = pd.read_csv(os.path.join(directory, file))
            combined_df = pd.concat([combined_df, df], ignore_index=True)
    return combined_df

text_train_df = load_csvs_from_dir("dataset/text/train")
text_val_df = load_csvs_from_dir("dataset/text/validation")
text_test_df = load_csvs_from_dir("dataset/text/test")
text_df = pd.concat([text_train_df, text_val_df, text_test_df], ignore_index=True)
print(text_df.head())
print(text_df.__len__())
print(text_df['label'].unique().__len__())
print(text_df['label'].unique())

# remove labels from text_df that are not in speech_df
# neutral

   word  emotion                          speech_path
0  back    angry    dataset/speech\YAF_back_angry.wav
1  back  disgust  dataset/speech\YAF_back_disgust.wav
2  back     fear     dataset/speech\YAF_back_fear.wav
3  back    happy    dataset/speech\YAF_back_happy.wav
4  back       ps       dataset/speech\YAF_back_ps.wav
1200
6
['angry' 'disgust' 'fear' 'happy' 'ps' 'sad']
     label                                               text
0  neutral  dannycastillo we want to trade with someone wh...
1  neutral                                   cant fall asleep
2  neutral  no topic maps talks at the balisage markup con...
3  neutral                               cynthia i cant sleep
4  neutral                               i missed the bly bus
92005
13
['neutral' 'worry' 'happiness' 'sadness' 'love' 'surprise' 'fun' 'relief'
 'hate' 'empty' 'enthusiasm' 'boredom' 'anger']


In [51]:
# ----------------------------
# 3. Encode Labels (Shared)
# ----------------------------
label_encoder = LabelEncoder()
all_labels = pd.concat([speech_df['emotion'], text_df['label']], ignore_index=True)
label_encoder.fit(all_labels)

speech_df['label'] = label_encoder.transform(speech_df['emotion'])
text_df['label'] = label_encoder.transform(text_df['label'])


In [52]:
# ----------------------------
# 4. Tokenizer and BERT Model
# ----------------------------
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")

In [53]:
# ----------------------------
# 5. Feature Extraction Utils
# ----------------------------
def extract_mfcc(wav_path, max_len=100):
    y, sr = librosa.load(wav_path, sr=16000)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0,0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc.T
def extract_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=32)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


In [62]:
# ----------------------------
# 6. Early Fusion Dataset
# ----------------------------
class EarlyFusionDataset(Dataset):
    def __init__(self, speech_df, text_df):
        self.features = []
        self.labels = []

        # Matching based on emotion class
        min_samples = min(len(speech_df), len(text_df))
        for i in range(min_samples):
            speech_row = speech_df.iloc[i]
            # print(speech_row)
            text_row = text_df.iloc[i]
            # print(text_row)

            # Extract features
            mfcc = extract_mfcc(speech_row['speech_path'])  # shape: [time, 40]
            # print(f"MFCC length: {mfcc.__len__()}")
            # print(f"MFCC shape: {mfcc.shape}")
            bert = extract_bert_embedding(text_row['text'])  # shape: [768]
            # print(f"BERT length: {bert.__len__()}")
            # print(f"BERT shape: {bert.shape}")
            # print(f"BERT shape: {bert.shape[0]}")
            
            
            # Concatenate
            bert_repeated = np.repeat(bert[np.newaxis, :], mfcc.shape[0], axis=0)  # [time, 768]
            fused = np.concatenate((mfcc, bert_repeated), axis=1)  # [time, 808]
            

            self.features.append(torch.tensor(fused, dtype=torch.float32))
            self.labels.append(torch.tensor(speech_row['label'], dtype=torch.long))

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
    
dataset = EarlyFusionDataset(speech_df, text_df)

fused_tensor, label = dataset[0]  # Choose index 0 or any number < len(dataset)
print("Fused Tensor Shape:", fused_tensor.shape)
print("Fused Tensor Data:", fused_tensor)
print("Label:", label)

# Save the first fused tensor to CSV for visual inspection
np.savetxt("fused_sample.csv", dataset[0][0].numpy(), delimiter=",")
print("Saved to fused_sample.csv")


Fused Tensor Shape: torch.Size([100, 808])
Fused Tensor Data: tensor([[-4.5964e+02, -5.6997e+01,  4.3259e+01,  ..., -1.2901e-01,
          4.6350e-01,  2.4182e-01],
        [-4.0625e+02, -7.3839e+01,  4.9714e+01,  ..., -1.2901e-01,
          4.6350e-01,  2.4182e-01],
        [-4.1438e+02, -8.6455e+01,  4.9407e+01,  ..., -1.2901e-01,
          4.6350e-01,  2.4182e-01],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.2901e-01,
          4.6350e-01,  2.4182e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.2901e-01,
          4.6350e-01,  2.4182e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.2901e-01,
          4.6350e-01,  2.4182e-01]])
Label: tensor(1)
Saved to fused_sample.csv


In [55]:
# ----------------------------
# 7. Collate Function for Padding
# ----------------------------
def collate_fn(batch):
    sequences, labels = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    return padded, torch.tensor(labels)

In [56]:
# ----------------------------
# 8. LSTM Model
# ----------------------------
class EarlyFusionLSTM(nn.Module):
    def __init__(self, input_dim=808, hidden_dim=128, num_classes=6):
        super(EarlyFusionLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn.squeeze(0))

In [57]:
from sklearn.model_selection import train_test_split

# ----------------------------
# 9. Training and Evaluation
# ----------------------------

# Split dataset into train and validation
full_dataset = EarlyFusionDataset(speech_df, text_df)
print(full_dataset.__len__())
train_indices, val_indices = train_test_split(list(range(len(full_dataset))), test_size=0.2, random_state=42)

train_subset = torch.utils.data.Subset(full_dataset, train_indices)
val_subset = torch.utils.data.Subset(full_dataset, val_indices)

train_loader = DataLoader(train_subset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_subset, batch_size=16, shuffle=False, collate_fn=collate_fn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EarlyFusionLSTM(num_classes=len(label_encoder.classes_)).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

def evaluate(model, loader):
    model.eval()
    total_correct = 0
    total_samples = 0
    total_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            total_correct += (preds == batch_y).sum().item()
            total_samples += batch_y.size(0)
    accuracy = total_correct / total_samples
    avg_loss = total_loss / len(loader)
    return accuracy, avg_loss

# Training loop with metrics
for epoch in range(25):
    model.train()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        preds = torch.argmax(outputs, dim=1)
        total_correct += (preds == batch_y).sum().item()
        total_samples += batch_y.size(0)

    train_acc = total_correct / total_samples
    train_loss = total_loss / len(train_loader)

    val_acc, val_loss = evaluate(model, val_loader)

    print(f"Epoch {epoch+1}, "
          f"Train Loss: {train_loss:.4f}, "
          f"Train Acc: {train_acc*100:.2f}%, "
          f"Val Loss: {val_loss:.4f}, "
          f"Val Acc: {val_acc*100:.2f}%")


word                                        back
emotion                                    angry
speech_path    dataset/speech\YAF_back_angry.wav
label                                          1
Name: 0, dtype: object
label                                                   12
text     dannycastillo we want to trade with someone wh...
Name: 0, dtype: object
MFCC length: 100
MFCC shape: (100, 40)
BERT length: 768
BERT shape: (768,)
BERT shape: 768
Fused shape: (100, 808)
Fused length: 100
Fused dtype: float32
word                                          back
emotion                                    disgust
speech_path    dataset/speech\YAF_back_disgust.wav
label                                            3
Name: 1, dtype: object
label                  12
text     cant fall asleep
Name: 1, dtype: object
MFCC length: 100
MFCC shape: (100, 40)
BERT length: 768
BERT shape: (768,)
BERT shape: 768
Fused shape: (100, 808)
Fused length: 100
Fused dtype: float32
word                          