In [33]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import librosa
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [34]:
# ----------------------------
# 1. Load Speech Dataset
# ----------------------------
speech_dir = 'dataset/speech'
speech_data = []

# os.listdir() returns entries in arbitrary order. Sorting makes the row order of
# speech_df identical on every run, which matters to any code that later indexes
# the frame by position.
for file in sorted(os.listdir(speech_dir)):
    stem, ext = os.path.splitext(file)
    if ext != '.wav':
        continue
    # Only names made of exactly three '_'-separated parts are used; the second
    # part is taken as the word and the third as the emotion. Others are skipped.
    parts = stem.split('_')
    if len(parts) != 3:
        continue
    speech_data.append({
        'word': parts[1],
        'emotion': parts[2],
        'speech_path': os.path.join(speech_dir, file)
    })

# Naming the columns keeps the schema intact even when no file matched, so a later
# speech_df['emotion'] lookup does not fail with a KeyError on an empty frame.
speech_df = pd.DataFrame(speech_data, columns=['word', 'emotion', 'speech_path'])
speech_df.to_csv('speech_word_dataset.csv', index=False)

# ----------------------------
# 2. Load Text Dataset
# ----------------------------
def load_csvs_from_dir(directory):
    """Read every ``.csv`` file directly inside ``directory`` into one DataFrame.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        A DataFrame holding the rows of all CSV files, stacked in file-name order
        with a fresh 0..n-1 index. An empty DataFrame if the folder contains no
        CSV file.
    """
    # Sorted because os.listdir() order is arbitrary; without it the stacked row
    # order could differ between machines and runs.
    csv_names = sorted(name for name in os.listdir(directory) if name.endswith(".csv"))
    if not csv_names:
        return pd.DataFrame()
    # Concatenate once: growing a frame inside the loop re-copies all previously
    # read rows on every iteration.
    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

# Read the three text splits in turn, then stack them into one frame.
text_train_df, text_val_df, text_test_df = (
    load_csvs_from_dir(split_dir)
    for split_dir in ("dataset/text/train", "dataset/text/validation", "dataset/text/test")
)
text_df = pd.concat([text_train_df, text_val_df, text_test_df], ignore_index=True)


In [35]:
# ----------------------------
# 3. Encode Labels (Shared)
# ----------------------------
# Keep the original text labels in their own column the first time this cell runs.
# The 'label' column is overwritten with integer ids below; without this copy a
# second run of the cell would fit the encoder on a mix of those ids and the
# speech emotion strings.
if 'label_raw' not in text_df.columns:
    text_df['label_raw'] = text_df['label']

# One encoder fitted on the union of both label sets, so speech and text ids
# come from the same mapping.
label_encoder = LabelEncoder()
all_labels = pd.concat([speech_df['emotion'], text_df['label_raw']], ignore_index=True)
label_encoder.fit(all_labels)

speech_df['label'] = label_encoder.transform(speech_df['emotion'])
text_df['label'] = label_encoder.transform(text_df['label_raw'])


In [36]:
# ----------------------------
# 4. Tokenizer and BERT Model
# ----------------------------
# Checkpoint name shared by the tokenizer and the encoder model.
_bert_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(_bert_checkpoint)
bert_model = AutoModel.from_pretrained(_bert_checkpoint)

In [37]:
# ----------------------------
# 5. Feature Extraction Utils
# ----------------------------
def extract_mfcc(wav_path, max_len=100, sr=16000, n_mfcc=40):
    """Load an audio file and return a fixed-length MFCC matrix.

    Args:
        wav_path: Path of the audio file, passed to ``librosa.load``.
        max_len: Number of frames in the result. Shorter clips are zero-padded at
            the end; longer clips are cut off after ``max_len`` frames.
        sr: Sampling rate requested from ``librosa.load``.
        n_mfcc: Number of MFCC coefficients computed per frame.

    Returns:
        Array of shape ``(max_len, n_mfcc)``, one row per frame.
    """
    y, sr = librosa.load(wav_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)  # (n_mfcc, frames)
    n_frames = mfcc.shape[1]
    if n_frames < max_len:
        # Pad only the frame axis, and only at the end.
        mfcc = np.pad(mfcc, ((0, 0), (0, max_len - n_frames)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc.T
def extract_bert_embedding(text):
    """Embed text as the mean of its token vectors from the module-level BERT model.

    Uses the module-level ``tokenizer`` and ``bert_model``. Input is truncated to
    32 tokens. Padding positions are excluded from the mean, so passing a list of
    texts of different lengths gives each text the same embedding it would get
    on its own.

    Args:
        text: A string, or a list of strings, accepted by ``tokenizer``.

    Returns:
        NumPy array with size-1 dimensions squeezed out: 1-D for a single text,
        ``(n_texts, hidden)`` for a list of several texts.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=32)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    hidden = outputs.last_hidden_state  # (batch, tokens, hidden)
    # 1 for real tokens, 0 for padding; the trailing axis broadcasts over hidden.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp guards against division by zero for a row with no real tokens.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().cpu().numpy()


In [38]:
# ----------------------------
# 6. Early Fusion Dataset
# ----------------------------
class EarlyFusionDataset(Dataset):
    """Frame-level fusion of MFCC features with a text embedding.

    Each sample pairs one speech row with one text row. The text embedding is
    repeated along the time axis and concatenated to every MFCC frame; the target
    is the speech row's ``label``. All samples are built eagerly in ``__init__``
    and kept in memory.

    Args:
        speech_df: Frame with ``speech_path`` and ``label`` columns.
        text_df: Frame with a ``text`` column (and a ``label`` column when
            ``match_by_label`` is set).
        match_by_label: If False (default), row ``i`` of ``speech_df`` is paired
            with row ``i`` of ``text_df`` regardless of class, and only the first
            ``min(len(speech_df), len(text_df))`` rows are used. If True, each
            speech row is paired with the next unused text row that has the same
            ``label``; speech rows with no such text row left are skipped.
    """

    def __init__(self, speech_df, text_df, match_by_label=False):
        self.features = []
        self.labels = []

        for speech_row, text_row in self._iter_pairs(speech_df, text_df, match_by_label):
            mfcc = extract_mfcc(speech_row['speech_path'])   # [time, n_mfcc]
            bert = extract_bert_embedding(text_row['text'])  # [hidden]

            # Copy the text vector onto every frame, then join the two per frame.
            bert_repeated = np.repeat(bert[np.newaxis, :], mfcc.shape[0], axis=0)
            fused = np.concatenate((mfcc, bert_repeated), axis=1)  # [time, n_mfcc + hidden]

            self.features.append(torch.tensor(fused, dtype=torch.float32))
            self.labels.append(torch.tensor(speech_row['label'], dtype=torch.long))

    @staticmethod
    def _iter_pairs(speech_df, text_df, match_by_label):
        """Yield ``(speech_row, text_row)`` pairs for the chosen pairing mode."""
        if not match_by_label:
            # Positional pairing: the text row's own label plays no role here.
            for i in range(min(len(speech_df), len(text_df))):
                yield speech_df.iloc[i], text_df.iloc[i]
            return

        # One iterator of text row positions per label, in text_df order, so each
        # text row is handed out at most once.
        pools = {
            label: iter(positions)
            for label, positions in text_df.groupby('label').indices.items()
        }
        for i in range(len(speech_df)):
            speech_row = speech_df.iloc[i]
            pos = next(pools.get(speech_row['label'], iter(())), None)
            if pos is not None:
                yield speech_row, text_df.iloc[pos]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


In [39]:
# ----------------------------
# 7. Collate Function for Padding
# ----------------------------
def collate_fn(batch):
    """Merge ``(sequence, label)`` samples into one padded batch.

    Args:
        batch: Iterable of ``(sequence, label)`` pairs, where each sequence is a
            tensor whose first dimension is time.

    Returns:
        ``(padded, labels)``: the sequences zero-padded at the end to the longest
        one in the batch and stacked batch-first, and a 1-D tensor of the labels.
    """
    sequences, labels = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(list(sequences), batch_first=True)
    # Stack instead of re-wrapping with torch.tensor(): as_tensor leaves labels
    # that are already tensors untouched and converts plain ints.
    return padded, torch.stack([torch.as_tensor(label) for label in labels])

In [40]:
# ----------------------------
# 8. LSTM Model
# ----------------------------
class EarlyFusionLSTM(nn.Module):
    """LSTM classifier over a sequence of fused feature frames.

    The input sequence is run through a single LSTM and the final hidden state is
    mapped to ``num_classes`` logits by a linear layer.
    """

    def __init__(self, input_dim=808, hidden_dim=128, num_classes=6):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        """Return class logits for a batch-first input ``x``."""
        _output, (final_hidden, _final_cell) = self.lstm(x)
        # The hidden state carries one leading entry for the single LSTM layer;
        # index it away before the classifier.
        logits = self.fc(final_hidden[0])
        return logits

In [42]:
from sklearn.model_selection import train_test_split

# ----------------------------
# 9. Training and Evaluation
# ----------------------------

# Seed PyTorch so weight initialisation repeats across runs. The train/validation
# split is already fixed through random_state; the shuffle order of the training
# loader gets its own seeded generator below.
SEED = 42
torch.manual_seed(SEED)

# Split dataset into train and validation
full_dataset = EarlyFusionDataset(speech_df, text_df)
train_indices, val_indices = train_test_split(
    list(range(len(full_dataset))), test_size=0.2, random_state=SEED
)

train_subset = torch.utils.data.Subset(full_dataset, train_indices)
val_subset = torch.utils.data.Subset(full_dataset, val_indices)

train_loader = DataLoader(
    train_subset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
    generator=torch.Generator().manual_seed(SEED),
)
val_loader = DataLoader(val_subset, batch_size=16, shuffle=False, collate_fn=collate_fn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output unit per class known to the shared label encoder.
model = EarlyFusionLSTM(num_classes=len(label_encoder.classes_)).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

def evaluate(model, loader):
    """Compute accuracy and mean loss of ``model`` over ``loader``.

    Uses the module-level ``device`` and ``criterion``. The model is switched to
    eval mode and left in it; gradients are not tracked.

    Args:
        model: Module mapping a batch of inputs to class logits.
        loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        ``(accuracy, avg_loss)``: the fraction of correctly classified samples and
        the loss averaged per sample.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    total_correct = 0
    total_samples = 0
    total_loss = 0.0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            batch_size = batch_y.size(0)
            # Assumes criterion returns the batch mean (the nn.CrossEntropyLoss()
            # default). Weighting by batch size keeps a shorter final batch from
            # counting as much as a full one.
            total_loss += criterion(outputs, batch_y).item() * batch_size
            total_correct += (outputs.argmax(dim=1) == batch_y).sum().item()
            total_samples += batch_size
    if total_samples == 0:
        raise ValueError("evaluate() received a loader with no samples")
    accuracy = total_correct / total_samples
    avg_loss = total_loss / total_samples
    return accuracy, avg_loss

# Training loop with metrics
NUM_EPOCHS = 25  # number of full passes over train_loader

for epoch in range(NUM_EPOCHS):
    model.train()  # evaluate() leaves the model in eval mode, so reset every epoch
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        batch_size = batch_y.size(0)
        # loss is the batch mean (nn.CrossEntropyLoss() default); weight it by the
        # batch size so the epoch figure is a per-sample average even when the
        # last batch is shorter.
        total_loss += loss.item() * batch_size

        preds = torch.argmax(outputs, dim=1)
        total_correct += (preds == batch_y).sum().item()
        total_samples += batch_size

    train_acc = total_correct / total_samples
    train_loss = total_loss / total_samples

    val_acc, val_loss = evaluate(model, val_loader)

    print(f"Epoch {epoch+1}, "
          f"Train Loss: {train_loss:.4f}, "
          f"Train Acc: {train_acc*100:.2f}%, "
          f"Val Loss: {val_loss:.4f}, "
          f"Val Acc: {val_acc*100:.2f}%")


Epoch 1, Train Loss: 2.6080, Train Acc: 14.90%, Val Loss: 2.2680, Val Acc: 13.75%
Epoch 2, Train Loss: 1.9985, Train Acc: 23.75%, Val Loss: 1.8813, Val Acc: 25.42%
Epoch 3, Train Loss: 1.7655, Train Acc: 35.83%, Val Loss: 1.7273, Val Acc: 28.75%
Epoch 4, Train Loss: 1.5863, Train Acc: 40.52%, Val Loss: 1.5450, Val Acc: 32.92%
Epoch 5, Train Loss: 1.3947, Train Acc: 49.17%, Val Loss: 1.3769, Val Acc: 42.92%
Epoch 6, Train Loss: 1.2066, Train Acc: 60.10%, Val Loss: 1.2248, Val Acc: 54.58%
Epoch 7, Train Loss: 1.0489, Train Acc: 72.92%, Val Loss: 1.1236, Val Acc: 58.75%
Epoch 8, Train Loss: 0.9059, Train Acc: 79.06%, Val Loss: 0.9956, Val Acc: 66.67%
Epoch 9, Train Loss: 0.7615, Train Acc: 86.46%, Val Loss: 0.8629, Val Acc: 75.00%
Epoch 10, Train Loss: 0.6059, Train Acc: 91.77%, Val Loss: 0.7086, Val Acc: 82.08%
Epoch 11, Train Loss: 0.4448, Train Acc: 95.83%, Val Loss: 0.5274, Val Acc: 91.25%
Epoch 12, Train Loss: 0.2899, Train Acc: 98.54%, Val Loss: 0.3679, Val Acc: 94.58%
Epoch 13, Tra