<a href="https://colab.research.google.com/github/shadfdz/speechEmotionRecognition/blob/master/Speech_emotion_recognition_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
from google.colab import drive
import librosa
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
drive.mount('/content/drive',force_remount=True)
from torch.utils.data import Dataset, DataLoader
import torch
import glob

Mounted at /content/drive


In [2]:
# get emotion categories (remove calm it has been grouped with neutral prior)
path = 'drive/MyDrive/processed_emotion_dataset/'
sub_folder = glob.glob(path + '*')
sub_folder.remove('drive/MyDrive/processed_emotion_dataset/calm')
emotions = [i.split('/')[3] for i in sub_folder]
emotions

['neutral', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

In [3]:
# print count of each file
for cat in emotions:
  f_list = glob.glob(path + cat + '/*')
  print('Emotion category: \'{}\' Count: {}'.format(cat,len(f_list)))

Emotion category: 'neutral' Count: 1619
Emotion category: 'happy' Count: 1739
Emotion category: 'sad' Count: 1739
Emotion category: 'angry' Count: 1739
Emotion category: 'fearful' Count: 1739
Emotion category: 'disgust' Count: 1739
Emotion category: 'surprised' Count: 469


# Create Melspect Dataset Class

In [4]:
class audiodata(Dataset):
  def __init__(self, file_paths, n_mels=128):
    self.mels = n_mels
    self.file_paths = file_paths
    self.data = []
    for f_name in file_paths:
      emotion_category = f_name.split('/')[-1].split('_')[0]
      self.data.append([f_name, emotion_category])
    self.class_dict = {"neutral": 0,
                    "happy": 1,
                    "sad": 2,
                    "angry": 3,
                    "fearful": 4,
                    "disgust": 5,
                    "surprised": 6 
                    }

  def __len__(self):
    return len(self.data)

  def __getitem__(self,index):
    audio_path, emotion_class = self.data[index]
    x, sr = librosa.load(audio_path)
    melspect = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=128, pad_mode='constant')
    pad_crop_length = 128 - melspect.shape[1]
    if pad_crop_length > 0:
      melspect = np.pad(melspect, [(0,0),(0,pad_crop_length)], mode='constant')
    if pad_crop_length < 0:
      melspect = melspect[:,0:128]
    melspect = melspect[np.newaxis,:,:]
    # get emotion class code
    class_code = self.class_dict[emotion_class]
    audio_tensor = torch.from_numpy(melspect)
    
    return audio_tensor, class_code

# Create Training, Validation, Test Split


In [14]:
import random
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import enumerate

ModuleNotFoundError: ignored

In [10]:
# get paths for each emotion category
sub_folder

['drive/MyDrive/processed_emotion_dataset/neutral',
 'drive/MyDrive/processed_emotion_dataset/happy',
 'drive/MyDrive/processed_emotion_dataset/sad',
 'drive/MyDrive/processed_emotion_dataset/angry',
 'drive/MyDrive/processed_emotion_dataset/fearful',
 'drive/MyDrive/processed_emotion_dataset/disgust',
 'drive/MyDrive/processed_emotion_dataset/surprised']

In [6]:
# get all files
file_list = []
for folder in sub_folder:
  file_list += glob.glob(folder + "/*")

In [7]:
# create dataset
dataset = audiodata(file_list)

In [8]:
def train_val_dataset(dataset, val_split=0.20):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=val_split)
    datasets = {}
    datasets['train'] = Subset(dataset, train_idx)
    datasets['val'] = Subset(dataset, val_idx)
    return datasets

In [9]:
datasets = train_val_dataset(dataset)
print(len(datasets['train']))
print(len(datasets['val']))

8626
2157


In [13]:
dataloaders = {x:DataLoader(datasets[x],32, shuffle=True, num_workers=2) for x in ['train','val']}
x,y = next(iter(dataloaders['train']))
print(x.shape, y.shape)

torch.Size([32, 1, 128, 128]) torch.Size([32])


In [11]:
# use cuda
torch.cuda.get_device_name(0)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

In [4]:
# create NN model

import torch.nn.functional as F
from torch.nn import init
import torch.nn as nn


class AudioEmotionClassifier(nn.Module):
  def __init__(self, debug=False):
    super(AudioEmotionClassifier, self).__init__()
    self.debug=debug

    # first convolutional layer
    self.conv1 = nn.Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    self.bn1 = nn.BatchNorm2d(8)

    # second convolutional layer
    self.conv2 = nn.Conv2d(8, 16, kernel_size=3, stride=(2, 2), padding=(1, 1))
    self.bn2 = nn.BatchNorm2d(18)
    self.pool2 = nn.MaxPool2d(2, stride=2)


    # third convolutional layer
    self.conv3 = nn.Conv2d(16, 32, kernel_size=3, stride=(2, 2), padding=(1, 1))
    self.bn3 = nn.BatchNorm2d(32)

    # fourth convolutional layer
    self.conv4 = nn.Conv2d(32, 64, kernel_size=3, stride=(2, 2), padding=(1, 1))
    self.bn4 = nn.BatchNorm2d(64)
    self.pool4 = nn.AvgPool2d(2)

    # fully connected layer
    self.flatten = nn.Flatten()
    self.fc = nn.linear(64,10)

    # Softmax layer
    self.output = nn.LogSoftmax(dim=1)


  def forward(self, x):
    # make this prettier
    # forward pass
    x = self.bn1(F.relu(self.conv1(x)))

    x = self.pool2(self.bn2(F.relu(self.conv2(x))))

    x = self.bn3(F.relu(self.conv3(x)))

    x = self.pool4(self.bn4(F.relu(self.conv4(x))))

    x = self.flatten(x)

    x = F.relu(self.fc(x))

    x = self.output(x)

    return x

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, verbose=True):
  for i, (X, y) in enumarate(dataloader):
    pred = model(X)
    loss = loss_fn(pred, y)

    # backprop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
