In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa, librosa.display
from IPython.display import Audio
from pydub import AudioSegment
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import torch.nn.functional as func

In [2]:
iemo_data = pd.read_csv('/kaggle/input/iemocap-transcriptions-english-french/iemocapTrans.csv')
iemo_data = iemo_data[['emotion', 'titre']]
iemo_data['filepath'] = '/kaggle/input/iemocap-transcriptions-english-french/Iemocap_audio/iemocap_audio/IEMOCAP_wav/' + iemo_data['titre'] + '.wav'

In [3]:
# ravdess dataset
# emotions -> 01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised
# third part of the name

emo_dict = {
    '01': 'neu',
    '02': 'neu',
    '03': 'hap',
    '04': 'sad',
    '05': 'ang',
    '06': 'fea',
    '07': 'dis',
    '08': 'sur'
}

ravdess_base = "/kaggle/input/ravdess-emotional-speech-audio/"

rav_data = pd.DataFrame(columns=['emotion', 'titre', 'filepath'])

for dirname, _, filenames in os.walk(ravdess_base):
    for filename in filenames:
        
        info_list = filename.split('-')
        emotion = emo_dict[info_list[2]]
                
        new_row = {
            'emotion': [emotion],
            'titre': [filename[:-4]],
            'filepath': [os.path.join(dirname, filename)]
        }
        rav_data = pd.concat([rav_data, pd.DataFrame(new_row)], ignore_index=True)
rav_data.head()

Unnamed: 0,emotion,titre,filepath
0,sur,03-01-08-01-01-01-02,/kaggle/input/ravdess-emotional-speech-audio/A...
1,neu,03-01-01-01-01-01-02,/kaggle/input/ravdess-emotional-speech-audio/A...
2,dis,03-01-07-02-01-02-02,/kaggle/input/ravdess-emotional-speech-audio/A...
3,dis,03-01-07-01-01-02-02,/kaggle/input/ravdess-emotional-speech-audio/A...
4,neu,03-01-01-01-02-01-02,/kaggle/input/ravdess-emotional-speech-audio/A...


In [4]:
data = pd.concat([iemo_data, rav_data], ignore_index=True)
data = data.sample(frac=1)

In [5]:
data = pd.get_dummies(data, columns=['emotion'], dtype='int')

In [6]:
X = []
y = []

for _, row in data.iterrows():
    
    arr_len = 48000
    
    signal, sr = librosa.load(row['filepath'], sr=16000)
    arr = np.array(signal)
    
    while (arr.shape[0] >= arr_len//3):
        
        
        if (arr.shape[0] < arr_len):
            
            pad_len = arr_len - arr.shape[0]
            arr = np.pad(arr, (0, pad_len), 'constant')
            assert arr_len == arr.shape[0]
            X.append(arr.reshape(300,160))
            y.append(row.drop(['titre', 'filepath']).to_numpy(dtype=np.float32))
            arr = np.zeros(0)
            
        else:
            
            segment = arr[: arr_len]
            assert arr_len == segment.shape[0]
            X.append(segment.reshape(300,160))
            y.append(row.drop(['titre', 'filepath']).to_numpy(dtype=np.float32))
            arr = arr[arr_len:]

In [7]:
X_np = np.array(X)
y_np = np.array(y)

In [8]:
X_np.shape

(20222, 300, 160)

In [9]:
Xtr, Xte, ytr, yte = train_test_split(X_np, y_np, test_size=0.2, random_state=42)

In [10]:
X_train = torch.tensor(Xtr)
X_test = torch.tensor(Xte)
y_train = torch.tensor(ytr)
y_test = torch.tensor(yte)

In [11]:
batch_size = 32
num_epochs = 5
input_size = 160
sequence_length = 300
lr = 1e-5

In [12]:
loaders = {
    'train': torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=False),
    'test': torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_test, y_test), batch_size=batch_size)
}

In [13]:
# rnn
class LSTM(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_layers, num_classes, sequence_length):
        
        super(LSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        # rnn layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        # fully connected layer
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, X):
        
        #initial hidden states
        h0 = torch.zeros(self.num_layers, self.sequence_length, self.hidden_size)
        c0 = torch.zeros(self.num_layers, self.sequence_length, self.hidden_size)
        # feed into rnn
        out, _ = self.lstm(X, (h0, c0))
        
        # feed into fc
        # need to decode the hidden state of last time
        out = out[:, -1, :]
        out = func.relu(self.fc(out))
        
        return func.softmax(out, dim=1)

In [14]:
def evaluate(model, loaders, loss_func):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for _, batch in enumerate(loaders['test']):
            src, label = batch  # You'll need to adjust this based on your data
            output = model(src)
            loss = loss_func(output, label)
            total_loss += loss.item()
    return total_loss / len(loaders['test'])

In [15]:
train_losses = []
val_losses = []

def train(model, loss_func, optim, loaders, epochs):
    
    for epoch in range(epochs):
        
        loss = 0
        
        for idx, (data, label) in enumerate(loaders['train']):
            
            optim.zero_grad()
            output = model(data)
            loss = loss_func(output, label)

            pred_label = (torch.argmax(output,dim =1).view(-1)).numpy()
            true_label = (torch.argmax(label,dim =1).view(-1)).numpy()
            acc = accuracy_score(pred_label, true_label)

            loss.backward()
            optim.step()
            
            if (idx%100==0):
                print(f'epoch: {epoch+1}/{epochs} -> loss: {loss.item()} ; accuracy: {acc}')
        
        train_losses.append(loss.item())
        
        val_loss = evaluate(model, loaders, loss_func)
        val_losses.append(val_loss)
        print(f'epoch: {epoch+1} -> loss: {val_loss}')
        
    return train_losses, val_losses

In [16]:
model = LSTM(input_size, 500, 2, 10, sequence_length)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
train_losses, val_losses = train(model, criterion, optimizer, loaders, num_epochs)

epoch: 1/5 -> loss: 2.302506446838379 ; accuracy: 0.0
epoch: 1/5 -> loss: 2.2993690967559814 ; accuracy: 0.3125
epoch: 1/5 -> loss: 2.2940189838409424 ; accuracy: 0.21875
epoch: 1/5 -> loss: 2.293480157852173 ; accuracy: 0.1875
epoch: 1/5 -> loss: 2.2375707626342773 ; accuracy: 0.25
epoch: 1/5 -> loss: 2.271878242492676 ; accuracy: 0.125
epoch: 1 -> loss: 2.228910564437626
epoch: 2/5 -> loss: 2.2522449493408203 ; accuracy: 0.1875
epoch: 2/5 -> loss: 2.1463804244995117 ; accuracy: 0.34375
epoch: 2/5 -> loss: 2.2714409828186035 ; accuracy: 0.21875
epoch: 2/5 -> loss: 2.2743020057678223 ; accuracy: 0.1875
epoch: 2/5 -> loss: 2.2256624698638916 ; accuracy: 0.25
epoch: 2/5 -> loss: 2.2751758098602295 ; accuracy: 0.125
epoch: 2 -> loss: 2.224353617570532
epoch: 3/5 -> loss: 2.257131576538086 ; accuracy: 0.1875
epoch: 3/5 -> loss: 2.1434574127197266 ; accuracy: 0.34375
epoch: 3/5 -> loss: 2.2672977447509766 ; accuracy: 0.21875
epoch: 3/5 -> loss: 2.2771358489990234 ; accuracy: 0.1875
epoch: 3

In [None]:
train_losses

In [None]:
val_losses