In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Define the emotions
emotion = {"Angry": 0, "Disgusted": 1, "Fearful": 2, "Happy": 3, "Neutral": 4, "Sad": 5, "Suprised": 6}

# MFCC extraction
def extract_mfcc(path, pad_len=100):
    audio, sample_rate = librosa.load(path, res_type='kaiser_fast')
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    if mfcc.shape[1] < pad_len:
        pad_width = pad_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :pad_len]
    return mfcc

# Additional features
def extract_add(path):
    audio, sr = librosa.load(path, res_type='kaiser_fast')
    pitch, _ = librosa.core.piptrack(y=audio, sr=sr)
    pitch_mean = np.mean(pitch[pitch > 0]) if np.any(pitch > 0) else 0
    energy = np.mean(librosa.feature.rms(y=audio))
    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sr))
    return np.array([pitch_mean, energy, spectral_centroid, spectral_bandwidth])

# Load the dataset
def load_data(data_path, emotions):
    features, add_feature, labels = [], [], []
    for emotion in emotions:
        emotion_folder = os.path.join(data_path, emotion)
        for file in os.listdir(emotion_folder):
            if file.endswith('.wav'):
                file_path = os.path.join(emotion_folder, file)
                mfcc = extract_mfcc(file_path, pad_len=100)
                features.append(mfcc)
                labels.append(emotions[emotion])
                add_feature1 = extract_add(file_path)
                add_feature.append(add_feature1)
    features = np.array(features)
    labels = np.array(labels)
    features = features.reshape(features.shape[0], features.shape[1], features.shape[2], 1)  # For CNN
    return features, labels, np.array(add_feature)

# CNN model


# Assuming data loading step
X, y, X1 = load_data("/kaggle/input/audio-emotions/Emotions", emotion)

# Prepare DataLoaders
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y, dtype=torch.long).to(device)
X_tensor1 = torch.tensor(X1, dtype=torch.float32).to(device)

dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize CNN model


RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [6]:
X_tensor.shape


torch.Size([12798, 40, 100, 1])

In [28]:
class EmotionCNN(nn.Module):
    def __init__(self):
        super(EmotionCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.fc1 = nn.Linear(64 * 10 * 25, 128)
        self.fc2 = nn.Linear(128, 7)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.reshape(x.size(0), -1)
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMModel, self).__init__()
        
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=0.3, bidirectional=True)
        self.fc1 = nn.Linear(hidden_size * 2, 128)  # Multiply by 2 for bidirectional
        self.dropout1 = nn.Dropout(0.5)
        self.batch_norm1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, num_classes)
        
    def forward(self, x):
        # LSTM forward pass
        lstm_out, _ = self.lstm(x)  # lstm_out shape: (batch_size, seq_len, hidden_size * 2)
        
        # Use the last hidden state for classification
        last_hidden_state = lstm_out[:, -1, :]  # Shape: (batch_size, hidden_size * 2)
        
        x = self.fc1(last_hidden_state)  # Shape: (batch_size, 128)
        x = self.dropout1(x)  # Apply dropout
        x = self.batch_norm1(x)  # Apply batch normalization
        x = torch.relu(x)  # Activation function
        
        x = self.fc2(x)  # Output layer
        return x

# Parameters
input_size = 4  # Number of features (from additional features extraction)
hidden_size = 64
num_layers = 2  # Number of LSTM layers
num_classes = 7  # Number of emotion classes

# Instantiate the model


In [13]:
model = EmotionCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop for CNN
for epoch in range(100):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        inputs = inputs.permute(0, 3, 1, 2)  # Shape [32, 1, 40, 100]
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch [{epoch + 1}/30], Loss: {running_loss / len(train_loader):.4f}')





Epoch [1/30], Loss: 1.9578
Epoch [2/30], Loss: 1.2110
Epoch [3/30], Loss: 1.1504
Epoch [4/30], Loss: 1.0953
Epoch [5/30], Loss: 1.0486
Epoch [6/30], Loss: 1.0121
Epoch [7/30], Loss: 0.9675
Epoch [8/30], Loss: 0.9283
Epoch [9/30], Loss: 0.8825
Epoch [10/30], Loss: 0.8525
Epoch [11/30], Loss: 0.8192
Epoch [12/30], Loss: 0.7661
Epoch [13/30], Loss: 0.7370
Epoch [14/30], Loss: 0.7152
Epoch [15/30], Loss: 0.6749
Epoch [16/30], Loss: 0.6415
Epoch [17/30], Loss: 0.6253
Epoch [18/30], Loss: 0.5860
Epoch [19/30], Loss: 0.5593
Epoch [20/30], Loss: 0.5521
Epoch [21/30], Loss: 0.5119
Epoch [22/30], Loss: 0.5123
Epoch [23/30], Loss: 0.4893
Epoch [24/30], Loss: 0.4723
Epoch [25/30], Loss: 0.4648
Epoch [26/30], Loss: 0.4370
Epoch [27/30], Loss: 0.4371
Epoch [28/30], Loss: 0.4245
Epoch [29/30], Loss: 0.4014
Epoch [30/30], Loss: 0.3847
Epoch [31/30], Loss: 0.3654
Epoch [32/30], Loss: 0.3891
Epoch [33/30], Loss: 0.3838
Epoch [34/30], Loss: 0.3708
Epoch [35/30], Loss: 0.3516
Epoch [36/30], Loss: 0.3428
E

In [25]:
# Evaluation loop

model.eval()  # Set the model to evaluation mode
correct = 0
total = 0

# Use no_grad() to disable gradient calculation
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.permute(0, 3, 1, 2)  # Ensure correct shape [batch_size, channels, height, width]
        
        # Forward pass through the model
        outputs = model(inputs)  # Get model predictions
        
        # Get the predicted class (the index of the maximum log-probability)
        _, predicted = torch.max(outputs.data, 1)
        
        # Update total and correct counts
        total += labels.size(0)
        correct += (predicted == labels).sum().item()  # Count correct predictions

# Print the final accuracy
accuracy = 100 * correct / total if total > 0 else 0
print(f'Accuracy: {accuracy:.2f}%')


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 2 is not equal to len(dims) = 4

In [17]:
# Save the model state dictionary
torch.save(model.state_dict(), 'emotion_cnn_model.pth')

In [18]:
# Save the entire model
torch.save(model, 'emotion_cnn_model_full.pth')


In [30]:
dataset = TensorDataset(X_tensor1, y_tensor)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize CNN model

In [32]:
model1 = LSTMModel(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop for CNN
for epoch in range(150):
    model1.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        inputs = inputs.unsqueeze(1)
        outputs = model1(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch [{epoch + 1}/30], Loss: {running_loss / len(train_loader):.4f}')




Epoch [1/30], Loss: 2.0168
Epoch [2/30], Loss: 2.0125
Epoch [3/30], Loss: 2.0097
Epoch [4/30], Loss: 2.0091
Epoch [5/30], Loss: 2.0087
Epoch [6/30], Loss: 2.0131
Epoch [7/30], Loss: 2.0135
Epoch [8/30], Loss: 2.0145
Epoch [9/30], Loss: 2.0109
Epoch [10/30], Loss: 2.0047
Epoch [11/30], Loss: 2.0115
Epoch [12/30], Loss: 2.0095
Epoch [13/30], Loss: 2.0110
Epoch [14/30], Loss: 2.0083
Epoch [15/30], Loss: 2.0208
Epoch [16/30], Loss: 2.0145
Epoch [17/30], Loss: 2.0110
Epoch [18/30], Loss: 2.0135
Epoch [19/30], Loss: 2.0152
Epoch [20/30], Loss: 2.0051
Epoch [21/30], Loss: 2.0158
Epoch [22/30], Loss: 2.0113
Epoch [23/30], Loss: 2.0075
Epoch [24/30], Loss: 2.0087
Epoch [25/30], Loss: 2.0130
Epoch [26/30], Loss: 2.0127
Epoch [27/30], Loss: 2.0104
Epoch [28/30], Loss: 2.0082
Epoch [29/30], Loss: 2.0105
Epoch [30/30], Loss: 2.0131
Epoch [31/30], Loss: 2.0058
Epoch [32/30], Loss: 2.0086
Epoch [33/30], Loss: 2.0099
Epoch [34/30], Loss: 2.0123
Epoch [35/30], Loss: 2.0088
Epoch [36/30], Loss: 2.0129
E

In [34]:
# Evaluation loop
model1.eval()  # Set the model to evaluation mode
correct = 0
total = 0

# Use no_grad() to disable gradient calculation
with torch.no_grad():
    for inputs, labels in test_loader:
      # Ensure correct shape [batch_size, channels, height, width]
        inputs = inputs.unsqueeze(1) 
        # Forward pass through the model
        outputs = model1(inputs)  # Get model predictions
        
        # Get the predicted class (the index of the maximum log-probability)
        _, predicted = torch.max(outputs.data, 1)
        
        # Update total and correct counts
        total += labels.size(0)
        correct += (predicted == labels).sum().item()  # Count correct predictions

# Print the final accuracy
accuracy = 100 * correct / total if total > 0 else 0
print(f'Accuracy: {accuracy:.2f}%')


Accuracy: 16.09%


In [35]:
X_tensor.shape

torch.Size([12798, 40, 100, 1])

In [36]:
dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [43]:
class ComplexModel(nn.Module):
    def __init__(self):
        super(ComplexModel, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=100, out_channels=2048, kernel_size=5, padding='same')
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.bn1 = nn.BatchNorm1d(2048)

        self.conv2 = nn.Conv1d(in_channels=2048, out_channels=1024, kernel_size=5, padding='same')
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.bn2 = nn.BatchNorm1d(1024)

        self.conv3 = nn.Conv1d(in_channels=1024, out_channels=512, kernel_size=5, padding='same')
        self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.bn3 = nn.BatchNorm1d(512)

        self.lstm1 = nn.LSTM(input_size=512, hidden_size=256, num_layers=1, batch_first=True, bidirectional=False, dropout=0.3)
        self.lstm2 = nn.LSTM(input_size=256, hidden_size=128, num_layers=1, batch_first=True, bidirectional=False)

        self.fc1 = nn.Linear(128, 128)
        self.dropout1 = nn.Dropout(0.5)

        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.5)

        self.fc3 = nn.Linear(64, 32)
        self.dropout3 = nn.Dropout(0.2)

        self.fc4 = nn.Linear(32, 16)  # Adjust the output dimension as needed

    def forward(self, x):
        # x shape: (batch_size, 40, 100, 1) -> reshape to (batch_size, 100, 40)
        x = x.squeeze(-1)  # Shape: (batch_size, 100, 40)
        x = x.permute(0, 2, 1)  # Shape: (batch_size, 40, 100)
        
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.pool1(x)
        
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool2(x)
        
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool3(x)

        # x shape after Conv1D and MaxPool: (batch_size, 512, new_length)
        # Prepare for LSTM: need (batch_size, seq_length, features)
        x = x.permute(0, 2, 1)  # Shape: (batch_size, new_length, 512)

        lstm_out, _ = self.lstm1(x)  # LSTM output
        lstm_out, _ = self.lstm2(lstm_out)  # LSTM output

        # Take the last output from LSTM
        x = lstm_out[:, -1, :]  # Shape: (batch_size, 128)

        x = self.fc1(x)
        x = self.dropout1(x)
        x = F.relu(x)

        x = self.fc2(x)
        x = self.dropout2(x)
        x = F.relu(x)

        x = self.fc3(x)
        x = self.dropout3(x)
        x = F.relu(x)

        x = self.fc4(x)  # Output layer
        return x

# Initialize model
model = ComplexModel().to(device)

In [41]:
import torch.nn.functional as F

In [45]:
for epoch in range(100):
        total_loss = 0
        total_correct = 0
        total_samples = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()  # Zero the gradients
            
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backward pass
            optimizer.step()  # Update weights
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)  # Get predictions
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        # Calculate average loss and accuracy
        avg_loss = total_loss / len(train_loader)
        accuracy = total_correct / total_samples * 100

        print(f'Epoch [{epoch + 1}/{100}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%')

Epoch [1/100], Loss: 2.7436, Accuracy: 15.15%
Epoch [2/100], Loss: 2.7429, Accuracy: 15.00%
Epoch [3/100], Loss: 2.7430, Accuracy: 15.48%
Epoch [4/100], Loss: 2.7434, Accuracy: 14.95%
Epoch [5/100], Loss: 2.7433, Accuracy: 14.89%
Epoch [6/100], Loss: 2.7431, Accuracy: 15.28%
Epoch [7/100], Loss: 2.7429, Accuracy: 15.64%
Epoch [8/100], Loss: 2.7429, Accuracy: 15.05%
Epoch [9/100], Loss: 2.7429, Accuracy: 15.46%
Epoch [10/100], Loss: 2.7433, Accuracy: 14.92%
Epoch [11/100], Loss: 2.7430, Accuracy: 15.48%
Epoch [12/100], Loss: 2.7429, Accuracy: 15.65%
Epoch [13/100], Loss: 2.7433, Accuracy: 15.19%
Epoch [14/100], Loss: 2.7432, Accuracy: 15.10%
Epoch [15/100], Loss: 2.7429, Accuracy: 15.14%
Epoch [16/100], Loss: 2.7433, Accuracy: 15.53%
Epoch [17/100], Loss: 2.7432, Accuracy: 15.59%
Epoch [18/100], Loss: 2.7431, Accuracy: 15.31%
Epoch [19/100], Loss: 2.7430, Accuracy: 15.47%
Epoch [20/100], Loss: 2.7429, Accuracy: 15.34%
Epoch [21/100], Loss: 2.7433, Accuracy: 14.85%
Epoch [22/100], Loss: 

KeyboardInterrupt: 