In [7]:
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.optim as optim
import librosa
import numpy as np
import os

In [8]:
# Build an index of the speech recordings and save it as a CSV.
# File names are expected to look like "<speaker>_<word>_<emotion>.wav";
# anything that does not split into exactly three '_'-separated parts is skipped.

# Set your speech audio folder
audio_dir = 'dataset/speech'
data = []

# Go through each WAV file.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# resulting DataFrame/CSV identical from run to run and machine to machine.
for file in sorted(os.listdir(audio_dir)):
    stem, ext = os.path.splitext(file)
    if ext != '.wav':
        continue
    parts = stem.split('_')
    if len(parts) != 3:
        continue
    _, word, emotion = parts
    data.append({
        'word': word,
        'emotion': emotion,
        'speech_path': os.path.join(audio_dir, file)
    })

# Create and save the DataFrame.
# Passing `columns` keeps the header present even when no file matched.
df = pd.DataFrame(data, columns=['word', 'emotion', 'speech_path'])
df.to_csv('speech_word_dataset.csv', index=False)
print(df.head())


   word  emotion                          speech_path
0  back    angry    dataset/speech\YAF_back_angry.wav
1  back  disgust  dataset/speech\YAF_back_disgust.wav
2  back     fear     dataset/speech\YAF_back_fear.wav
3  back    happy    dataset/speech\YAF_back_happy.wav
4  back       ps       dataset/speech\YAF_back_ps.wav


In [9]:
def extract_mfcc(file_path, n_mfcc=40, max_len=100):
    """Load an audio file and return a fixed-length MFCC matrix.

    The file is read at its native sampling rate (``sr=None``), MFCCs are
    computed with librosa and transposed so that rows are time frames.

    Args:
        file_path: Path of the audio file passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested per frame.
        max_len: Number of frames in the returned matrix.

    Returns:
        Array of shape ``(max_len, n_mfcc)``: zero-padded at the end when the
        recording has fewer than ``max_len`` frames, otherwise cut to the
        first ``max_len`` frames.
    """
    signal, sample_rate = librosa.load(file_path, sr=None)
    # Transpose so the layout is (time, n_mfcc).
    frames = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc).T

    missing = max_len - frames.shape[0]
    if missing <= 0:
        # Long enough already: keep only the leading frames.
        return frames[:max_len, :]
    # Too short: append all-zero frames along the time axis only.
    return np.pad(frames, ((0, missing), (0, 0)), mode='constant')


In [10]:
class MultimodalDataset(Dataset):
    """Text + speech dataset built from CSV files with ``text`` and ``label`` columns.

    For every CSV row the matching recording is assumed to be named
    ``YAF_<text lower-cased>_<label>.wav`` inside ``speech_dir``.

    Attributes:
        samples: List of ``(text, speech_path, label_id)`` tuples.
        missing_paths: Reconstructed speech paths that do not exist on disk.
            Inspect this right after construction: a non-empty list means the
            text rows and the audio folder do not line up, which otherwise
            only shows up as a ``FileNotFoundError`` in the middle of training.
    """

    def __init__(self, text_paths, speech_dir, tokenizer, label2id, max_text_len=64, max_audio_len=100, n_mfcc=40, skip_missing=False):
        """Read the CSV files and build the sample list.

        Args:
            text_paths: Iterable of CSV paths, each with ``text`` and ``label`` columns.
            speech_dir: Folder containing the ``.wav`` recordings.
            tokenizer: Callable used in ``__getitem__`` to encode the text; it must
                accept the keyword arguments passed there and return a mapping
                with ``input_ids`` and ``attention_mask`` tensors.
            label2id: Mapping from normalised (stripped, lower-cased) label string
                to integer class id.
            max_text_len: Token length the text is padded/truncated to.
            max_audio_len: Number of MFCC frames per sample.
            n_mfcc: Number of MFCC coefficients per frame.
            skip_missing: When True, rows whose recording does not exist are left
                out of ``samples``. Defaults to False, which keeps every row
                (the original behaviour).

        Raises:
            KeyError: If a row's normalised label is not a key of ``label2id``.
        """
        self.samples = []
        self.missing_paths = []
        for path in text_paths:
            df = pd.read_csv(path)
            # Iterate the two columns directly: iterrows() builds a Series per
            # row and may upcast values across columns, besides being slow.
            for raw_text, raw_label in zip(df['text'], df['label']):
                text = str(raw_text).strip()
                label = str(raw_label).strip().lower()
                # Reconstruct filename
                filename = f"YAF_{text.lower()}_{label}.wav"
                speech_path = f"{speech_dir}/{filename}"
                label_id = label2id[label]
                if not os.path.isfile(speech_path):
                    self.missing_paths.append(speech_path)
                    if skip_missing:
                        continue
                self.samples.append((text, speech_path, label_id))
        self.tokenizer = tokenizer
        self.max_text_len = max_text_len
        self.max_audio_len = max_audio_len
        self.n_mfcc = n_mfcc

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, mfcc, label)`` for sample ``idx``.

        ``input_ids`` and ``attention_mask`` come from the tokenizer with the
        leading batch dimension squeezed away; ``mfcc`` is a float tensor built
        from ``extract_mfcc``; ``label`` is a scalar int64 tensor.
        """
        text, speech_file, label = self.samples[idx]

        # Text encoding
        encoded = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_text_len, return_tensors='pt')
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)

        # MFCC extraction
        mfcc = extract_mfcc(speech_file, n_mfcc=self.n_mfcc, max_len=self.max_audio_len)
        mfcc = torch.tensor(mfcc, dtype=torch.float)

        # Explicit int64: the dtype cross_entropy expects for class indices.
        return input_ids, attention_mask, mfcc, torch.tensor(label, dtype=torch.long)

In [11]:
class MultimodalLSTM(nn.Module):
    """Two-branch classifier: an LSTM over token embeddings and an LSTM over
    audio features, fused by concatenating their final hidden states."""

    def __init__(self, vocab_size, embed_dim, text_hidden, audio_feat_dim, audio_hidden, output_dim, pad_idx):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the token embedding table.
            embed_dim: Size of each token embedding.
            text_hidden: Hidden size of the text LSTM.
            audio_feat_dim: Number of features per audio frame.
            audio_hidden: Hidden size of the audio LSTM.
            output_dim: Number of output logits.
            pad_idx: Token id used as ``padding_idx`` of the embedding.
        """
        super().__init__()
        # Text branch
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.text_lstm = nn.LSTM(embed_dim, text_hidden, batch_first=True)
        # Audio branch
        self.audio_lstm = nn.LSTM(audio_feat_dim, audio_hidden, batch_first=True)
        # Fusion
        self.fc = nn.Linear(text_hidden + audio_hidden, output_dim)

    def forward(self, input_ids, attention_mask, mfcc):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            input_ids: Integer token ids, ``(batch, seq_len)``.
            attention_mask: Same shape as ``input_ids``; its row sums are used
                as the unpadded sequence lengths.
            mfcc: Float audio features, batch-first (the audio LSTM is created
                with ``batch_first=True``).
        """
        # Text
        embedded = self.embedding(input_ids)
        # pack_padded_sequence rejects zero lengths, so a row whose mask is all
        # zeros is treated as a length-1 sequence instead of crashing the batch.
        # Lengths must live on the CPU.
        lengths = attention_mask.sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        _, (text_hidden, _) = self.text_lstm(packed)
        # Audio
        _, (audio_hidden, _) = self.audio_lstm(mfcc)
        # Concatenate last hidden states
        fused = torch.cat([text_hidden[-1], audio_hidden[-1]], dim=1)
        return self.fc(fused)

In [12]:
# Shared settings: the audio feature size must be the same for the datasets
# (n_mfcc) and the model (audio_feat_dim), so both read it from one constant.
N_MFCC = 40
BATCH_SIZE = 32

# Prepare tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
pad_idx = tokenizer.pad_token_id

# Paths
speech_dir = "dataset/speech"
train_paths = ["dataset/text/train/training.csv", "dataset/text/train/training2.csv", "dataset/text/train/training3.csv"]
val_paths = ["dataset/text/validation/validation.csv","dataset/text/validation/validation2.csv","dataset/text/validation/validation3.csv"]
test_paths = ["dataset/text/test/test.csv","dataset/text/test/test2.csv","dataset/text/test/test3.csv","dataset/text/test/testing3.csv"]

# Label mapping
# MultimodalDataset looks labels up after str(...).strip().lower(), so the keys
# of label2id have to be normalised the same way or the lookup raises KeyError.
df_train = pd.concat([pd.read_csv(p) for p in train_paths])
df_train['label'] = df_train['label'].astype(str).str.strip().str.lower()
labels = sorted(df_train['label'].unique())
label2id = {label: idx for idx, label in enumerate(labels)}

# Datasets
train_dataset = MultimodalDataset(train_paths, speech_dir, tokenizer, label2id, n_mfcc=N_MFCC)
val_dataset = MultimodalDataset(val_paths, speech_dir, tokenizer, label2id, n_mfcc=N_MFCC)
test_dataset = MultimodalDataset(test_paths, speech_dir, tokenizer, label2id, n_mfcc=N_MFCC)

# Dataloaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Model
model = MultimodalLSTM(
    vocab_size=tokenizer.vocab_size,
    embed_dim=128,
    text_hidden=128,
    audio_feat_dim=N_MFCC,
    audio_hidden=128,
    output_dim=len(label2id),
    pad_idx=pad_idx
)


In [13]:
# Train for a fixed number of epochs, reporting the mean training loss and the
# validation accuracy after each one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    model.train()
    total_loss = 0
    # The batch variable is called `targets` so it does not overwrite the
    # `labels` list created alongside label2id.
    for input_ids, attention_mask, mfcc, targets in train_loader:
        input_ids, attention_mask, mfcc, targets = (
            input_ids.to(device),
            attention_mask.to(device),
            mfcc.to(device),
            targets.to(device)
        )
        # Clear gradients left over from the previous step; without this,
        # backward() keeps adding to them and every update uses the running sum.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, mfcc)

        loss = F.cross_entropy(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
    print(f"Epoch {epoch+1} | Loss: {total_loss / max(len(train_loader), 1):.4f}")

    # Validation loop
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        # The loader yields four tensors per batch; the audio features must be
        # the ones belonging to this validation batch, not the last training batch.
        for input_ids, attention_mask, mfcc, targets in val_loader:
            input_ids, attention_mask, mfcc, targets = (
                input_ids.to(device),
                attention_mask.to(device),
                mfcc.to(device),
                targets.to(device)
            )
            outputs = model(input_ids, attention_mask, mfcc)
            preds = outputs.argmax(dim=1)
            correct += (preds == targets).sum().item()
            total += targets.size(0)
    val_acc = correct / total if total > 0 else 0
    print(f"Validation Accuracy: {val_acc:.4f}")


  y, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'dataset/speech/YAF_i think guys who feel need to compensate do it by being obnoxious_3.wav'