In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import librosa
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import os

In [25]:
# # Set your speech audio folder
# audio_dir = 'dataset/speech'
# data = []

# # Go through each WAV file
# for file in os.listdir(audio_dir):
#     if file.endswith('.wav'):
#         parts = file.split('_')
#         if len(parts) == 3:
#             word = parts[1]
#             emotion = parts[2].replace('.wav', '')
#             data.append({
#                 'word': word,
#                 'emotion': emotion,
#                 'speech_path': os.path.join(audio_dir, file)
#             })

# # Create and save the DataFrame
# df = pd.DataFrame(data)
# df.to_csv('speech_word_dataset.csv', index=False)
# print(df.head())


In [26]:
# -----------------------
# 1. Preprocessing
# -----------------------

def extract_mfcc(file_path, n_mfcc=40, max_len=100):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    combined = np.vstack([mfcc, delta, delta2]).T  # (time, feature_dim)

    if combined.shape[0] < max_len:
        pad_width = max_len - combined.shape[0]
        combined = np.pad(combined, ((0, pad_width), (0, 0)), mode='constant')
    else:
        combined = combined[:max_len, :]
    return (combined - np.mean(combined)) / np.std(combined)

In [27]:
# # Load CSV
# df = pd.read_csv('speech_word_dataset.csv')

# # Encode emotions
# label_encoder = LabelEncoder()
# df['label'] = label_encoder.fit_transform(df['emotion'])

# # Create word2idx dictionary for simple word embeddings
# word_list = sorted(df['word'].unique())
# word2idx = {word: idx for idx, word in enumerate(word_list)}

In [28]:
# -----------------------
# 2. Dataset
# -----------------------

class MultimodalDataset(Dataset):
    def __init__(self, df, tokenizer, label_encoder, max_len=64):
        self.df = df
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Text (word)
        encoding = self.tokenizer(
            row['word'],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        # Audio (MFCC)
        mfcc = extract_mfcc(row['speech_path'])

        # Label
        label = self.label_encoder.transform([row['emotion']])[0]

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'mfcc': torch.tensor(mfcc, dtype=torch.float32),
            'label': torch.tensor(label)
        }

In [29]:
# -----------------------
# 3. Model
# -----------------------

class MultimodalEmotionModel(nn.Module):
    def __init__(self, text_model_name='bert-base-uncased', audio_feature_dim=120, num_classes=5):
        super().__init__()

        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_fc = nn.Linear(self.text_encoder.config.hidden_size, 128)

        self.audio_cnn = nn.Sequential(
            nn.Conv1d(audio_feature_dim, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten()
        )
        self.audio_fc = nn.Linear(128, 128)

        self.fusion = nn.Linear(256, 64)
        self.classifier = nn.Linear(64, num_classes)

    def forward(self, input_ids, attention_mask, mfcc):
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_embed = self.text_fc(text_out.last_hidden_state[:, 0, :])

        mfcc = mfcc.permute(0, 2, 1)
        audio_embed = self.audio_fc(self.audio_cnn(mfcc))

        combined = torch.cat((text_embed, audio_embed), dim=1)
        fusion_out = F.relu(self.fusion(combined))
        logits = self.classifier(fusion_out)
        return logits

In [30]:
# -----------------------
# 4. Training & Evaluation
# -----------------------

def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        mfcc = batch['mfcc'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, mfcc)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    total_correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            mfcc = batch['mfcc'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask, mfcc)
            preds = torch.argmax(outputs, dim=1)
            total_correct += (preds == labels).sum().item()
            total += labels.size(0)
    return total_correct / total


In [31]:
# -----------------------
# 5. Main Execution
# -----------------------

df = pd.read_csv("speech_word_dataset.csv")
label_encoder = LabelEncoder()
df['emotion'] = label_encoder.fit_transform(df['emotion'])

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

train_dataset = MultimodalDataset(train_df, tokenizer, label_encoder)
val_dataset = MultimodalDataset(val_df, tokenizer, label_encoder)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalEmotionModel(num_classes=len(label_encoder.classes_)).to(device)

optimizer = optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_acc = evaluate(model, val_loader, device)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Acc: {val_acc:.4f}")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ValueError: y contains previously unseen labels: np.int64(0)