In [40]:
import librosa
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

In [42]:
# Data augmentation
def augment_audio(audio, sr):
    noise = np.random.randn(len(audio))
    audio_noise = audio + 0.005 * noise
    audio_pitch = librosa.effects.pitch_shift(audio, sr=sr, n_steps=2)
    return [audio, audio_noise, audio_pitch]

In [44]:
# Feature extraction
def extract_features(file_path):
    audio, sample_rate = librosa.load(file_path, sr=22050)
    augmented_audios = augment_audio(audio, sample_rate)
    features = []
    for aug_audio in augmented_audios:
        mfcc = librosa.feature.mfcc(y=aug_audio, sr=sample_rate, n_mfcc=40)
        mfcc_scaled = np.mean(mfcc.T, axis=0)
        features.append(mfcc_scaled)
    return np.array(features)

In [46]:
# Load RAVDESS dataset
data_path = "./archive"  # Update this path after downloading RAVDESS
X, y = [], []
emotions = {"01": "neutral", "03": "happy", "04": "sad", "05": "angry"}

for root, _, files in os.walk(data_path):
    for file in files:
        if file.endswith(".wav"):
            emotion_code = file.split("-")[2]
            if emotion_code in emotions:
                file_path = os.path.join(root, file)
                features = extract_features(file_path)
                for feature in features:
                    X.append(feature)
                    y.append(emotions[emotion_code])

In [48]:
# Preprocess data
X = np.array(X)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_encoded = torch.tensor(y_encoded, dtype=torch.long)

In [50]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [52]:
# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

In [54]:
# Reshape for CNN-LSTM
X_train = X_train.unsqueeze(2)
X_test = X_test.unsqueeze(2)

In [56]:
# Create DataLoader
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [58]:
# Build hybrid CNN-LSTM model
class EmotionDetectionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionDetectionModel, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 5, padding=2)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(128, 64, 5, padding=2)
        self.bn2 = nn.BatchNorm1d(64)
        self.pool2 = nn.MaxPool1d(2)
        self.lstm1 = nn.LSTM(64, 128, batch_first=True)
        self.dropout1 = nn.Dropout(0.3)
        self.lstm2 = nn.LSTM(128, 64, batch_first=True)
        self.dropout2 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.pool1(torch.relu(self.bn1(self.conv1(x))))
        x = self.pool2(torch.relu(self.bn2(self.conv2(x))))
        x = x.permute(0, 2, 1)
        x, _ = self.lstm1(x)
        x = self.dropout1(x)
        x, _ = self.lstm2(x)
        x = self.dropout2(x[:, -1, :])
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = EmotionDetectionModel(num_classes=len(emotions))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [62]:
# Convert to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)

# Reshape for CNN-LSTM
X_train = X_train.permute(0, 2, 1)  # Change shape from (batch_size, 40, 1) to (batch_size, 1, 40)
X_test = X_test.permute(0, 2, 1)    # Change shape from (batch_size, 40, 1) to (batch_size, 1, 40)

# Create DataLoader
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Build hybrid CNN-LSTM model
class EmotionDetectionModel(nn.Module):
    def __init__(self, num_classes):
        super(EmotionDetectionModel, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 5, padding=2)  # Input channels = 1
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(128, 64, 5, padding=2)
        self.bn2 = nn.BatchNorm1d(64)
        self.pool2 = nn.MaxPool1d(2)
        self.lstm1 = nn.LSTM(64, 128, batch_first=True)
        self.dropout1 = nn.Dropout(0.3)
        self.lstm2 = nn.LSTM(128, 64, batch_first=True)
        self.dropout2 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.pool1(torch.relu(self.bn1(self.conv1(x))))
        x = self.pool2(torch.relu(self.bn2(self.conv2(x))))
        x = x.permute(0, 2, 1)  # Reshape for LSTM: (batch_size, seq_len, channels)
        x, _ = self.lstm1(x)
        x = self.dropout1(x)
        x, _ = self.lstm2(x)
        x = self.dropout2(x[:, -1, :])  # Use the last output of the LSTM
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = EmotionDetectionModel(num_classes=len(emotions))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

# Evaluate
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

  X_train = torch.tensor(X_train, dtype=torch.float32)
  X_test = torch.tensor(X_test, dtype=torch.float32)


Epoch 1/100, Loss: 1.356095283338339
Epoch 2/100, Loss: 1.348395859841073
Epoch 3/100, Loss: 1.261888832148939
Epoch 4/100, Loss: 1.1643539832370116
Epoch 5/100, Loss: 1.098966638640602
Epoch 6/100, Loss: 1.032539421969121
Epoch 7/100, Loss: 1.0084736778004335
Epoch 8/100, Loss: 0.9575578658887656
Epoch 9/100, Loss: 0.9114729301764233
Epoch 10/100, Loss: 0.9024640835157716
Epoch 11/100, Loss: 0.8618368223161981
Epoch 12/100, Loss: 0.8560073806507753
Epoch 13/100, Loss: 0.8365718745949244
Epoch 14/100, Loss: 0.7925548854440746
Epoch 15/100, Loss: 0.7740341720014515
Epoch 16/100, Loss: 0.7487800088849398
Epoch 17/100, Loss: 0.7371934445777742
Epoch 18/100, Loss: 0.7188749095000843
Epoch 19/100, Loss: 0.6900963550156886
Epoch 20/100, Loss: 0.6680868805044948
Epoch 21/100, Loss: 0.6416519656039701
Epoch 22/100, Loss: 0.6142152913726202
Epoch 23/100, Loss: 0.597919366442331
Epoch 24/100, Loss: 0.5969571179092521
Epoch 25/100, Loss: 0.5621621272351482
Epoch 26/100, Loss: 0.5384008244122609
E

In [64]:
# Evaluate
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 91.20%


In [66]:
# Save model and encoder
torch.save(model.state_dict(), "emotion_voice_model.pth")
np.save("label_encoder_classes.npy", le.classes_)