In [202]:
# initial data loading cell

# !wget http://emodb.bilderbar.info/download/download.zip
# !mkdir -p data/berlin && unzip download.zip -d data/berlin

# # !aria2c -x 16 https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
# !wget https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
# !mkdir -p data/ravdess && unzip Audio_Speech_Actors_01-24.zip -d data/ravdess
# !rm download.zip Audio_Speech_Actors_01-24.zip
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [203]:
import glob
import os
import re

import librosa
import numpy as np
from copy import deepcopy

# Root folders of the raw audio corpora; Dataset.fetch walks them recursively.
RAVDESS = "data/ravdess/"
BERLIN = "data/berlin/wav/"
# Cache files: Dataset.load reads them, Dataset.fetch writes them.
RAVDESS_CLEAN = "clean_data/ravdess_clean.npy"
BERLIN_CLEAN = "clean_data/berlin_clean.npy"
# Uncomment to delete the caches and force a rebuild from the raw audio:
# !rm {RAVDESS_CLEAN}
# !rm {BERLIN_CLEAN}
# With DEV_MODE on, Dataset.fetch caps the number of clips it loads (cap set by
# DEV_LIMIT) so the pipeline can be exercised quickly.
DEV_MODE = False
DEV_LIMIT = 20
# Mini-batch size handed to the DataLoaders.
BATCH_SIZE = 5

# AUDIO = 0
# SAMPLING_RATE = 44100//2
# EMOTION = 2
# ACTOR = 3
# GENDER = 4
# Number of MFCC coefficients per clip (passed to librosa as n_mfcc; also the
# default row count of pad()).
MFCC = 20

# Possible Emotions
#  - Neutral
#  - Calm
#  - Happy
#  - Sad
#  - Angry
#  - Fearful
#  - Disgust
#  - Surprised
#  - Boredom

# Emotion = {'01': neutral, '02': calm, '03': happy, '04': sad, '05': angry, '06': fearful, '07': disgust, '08': surprised}

# Possible genders
#  - male
#  - female

# 34 possible actors


class Dataset:
    """In-memory store for the RAVDESS and Berlin speech-emotion corpora.

    Each corpus is a 1-D object array of dicts with the keys ``audio``,
    ``sampling_rate``, ``emotion``, ``actor``, ``gender`` and ``mfcc``.
    Augmentation and normalization operate on the RAVDESS data only.

    Methods deliberately go through ``self`` / ``cls`` instead of the global
    name ``Dataset``: a later cell rebinds that name to
    ``torch.utils.data.Dataset``, which would otherwise break re-runs.
    """

    def __init__(self):
        self.berlin_data = None
        self.ravdess_data = None
        self.noisy_berlin_data = None
        # "noisey" (sic) is kept so existing attribute access keeps working.
        self.noisey_ravdess_data = None
        self.feature_names = ["audio", "sampling_rate", "emotion", "actor", "gender"]
        self._normalized = False
        self._augmented = False

    def load(self):
        """Populate both corpora, each from its cache or, failing that, from raw audio."""
        self.ravdess_data = self._load_or_fetch(RAVDESS, RAVDESS_CLEAN)
        self.berlin_data = self._load_or_fetch(BERLIN, BERLIN_CLEAN)

    def _load_or_fetch(self, path, save_path):
        """Return the cached array at ``save_path``; rebuild it from ``path`` on failure."""
        try:
            # NOTE(security): allow_pickle=True unpickles arbitrary objects; only
            # load cache files that this notebook produced itself.
            return np.load(save_path, allow_pickle=True)
        except Exception as exc:  # missing or unreadable cache -> rebuild
            print("Manually rebuilding the dataset!")
            print(f"  could not load {save_path}: {exc!r}")
            return self.fetch(path, save_path)

    def fetch(self, path, save_path):
        """Read every ``.wav`` under ``path``, extract features and cache the result.

        Args:
            path: corpus root; also selects the file-name scheme (RAVDESS or BERLIN).
            save_path: where the resulting object array is written with ``np.save``.

        Returns:
            1-D object array of per-clip dicts. An empty result is returned but
            not cached, so a wrong ``path`` does not poison later runs.
        """
        output = []
        for root, dirs, files in os.walk(path):
            dirs.sort()  # os.walk order is arbitrary; sort for reproducible output
            for file in sorted(files):
                if DEV_MODE and len(output) >= DEV_LIMIT:
                    break
                if not file.endswith(".wav"):
                    continue
                filepath = os.path.join(root, file)
                audio, sampling_rate = librosa.load(filepath)
                mfcc = librosa.feature.mfcc(y=audio, sr=sampling_rate, n_mfcc=MFCC)
                output.append(
                    {
                        "audio": audio,
                        "sampling_rate": sampling_rate,
                        "emotion": self._get_emotion(file, path),
                        "actor": self._get_actor(file, path),
                        "gender": self._get_gender(file, path),
                        "mfcc": mfcc,
                    }
                )
            if DEV_MODE and len(output) >= DEV_LIMIT:
                break

        output = np.array(output, dtype=object)
        if len(output) == 0:
            print(f"No .wav files found under {path}; nothing cached.")
            return output

        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        np.save(save_path, output)
        return output

    def normalize(self):
        """Standardize each RAVDESS clip's MFCC matrix in place (runs only once).

        NOTE(review): statistics are taken along axis 0 of the MFCC array,
        assumed to be librosa's coefficient axis -- confirm that per-frame
        rather than per-coefficient normalization is intended.
        """
        if not self._normalized:
            for elt in self.ravdess_data:
                mean = np.mean(elt["mfcc"], axis=0)
                std = np.std(elt["mfcc"], axis=0)
                # A constant column has std 0; divide by 1 instead of producing NaN/inf.
                std = np.where(std == 0, 1.0, std)
                elt["mfcc"] = (elt["mfcc"] - mean) / std
        self._normalized = True

    def augment_data(self, noisefactor=2, shiftmax=0.5, shiftdir="both"):
        """Append noisy and time-shifted copies of every RAVDESS clip.

        Guarded by ``_augmented`` so re-running the cell does not grow the data
        again; later calls are no-ops.
        """
        if self._augmented:
            return
        noisy_data = self.get_noisy_data(noisefactor)
        shifted_data = self.get_shifted_data(shiftmax, shiftdir)
        self.ravdess_data = np.concatenate(
            [self.ravdess_data, noisy_data, shifted_data]
        )
        self._augmented = True

    def get_noisy_data(self, noise_factor=1):
        """Return copies of the RAVDESS clips with Gaussian noise added to the MFCCs."""
        noisy = []
        for elt in self.ravdess_data:
            newelt = deepcopy(elt)
            noise = np.random.randn(*newelt["mfcc"].shape)
            newelt["mfcc"] = newelt["mfcc"] + noise * noise_factor
            noisy.append(newelt)  # list append avoids quadratic np.append
        return np.array(noisy, dtype=object)

    def get_shifted_data(self, shift_max, shift_direction):
        """Return copies of the RAVDESS clips shifted in time, with MFCCs recomputed.

        Args:
            shift_max: upper bound of the shift, as a multiple of the sampling rate.
            shift_direction: ``"left"`` always negates the shift, ``"both"``
                negates it at random, anything else keeps it positive.
        """
        shifted = []
        for elt in self.ravdess_data:
            newelt = deepcopy(elt)
            max_shift = int(newelt["sampling_rate"] * shift_max)
            # randint(0) raises, so a zero bound means "no shift".
            shift = np.random.randint(max_shift) if max_shift > 0 else 0
            if shift_direction == "left":
                shift = -shift
            elif shift_direction == "both":
                if np.random.randint(0, 2) == 1:
                    shift = -shift

            newelt["audio"] = self._shift_audio(elt["audio"], shift)
            newelt["mfcc"] = librosa.feature.mfcc(
                y=newelt["audio"], sr=newelt["sampling_rate"], n_mfcc=MFCC
            )
            shifted.append(newelt)
        return np.array(shifted, dtype=object)

    @staticmethod
    def _shift_audio(audio, shift):
        """Roll ``audio`` by ``shift`` samples and zero the samples that wrapped around."""
        shifted_audio = np.roll(audio, shift)
        if shift > 0:
            shifted_audio[:shift] = 0
        elif shift < 0:
            shifted_audio[shift:] = 0
        # shift == 0 leaves the clip untouched (slicing [0:] would silence all of it)
        return shifted_audio

    @staticmethod
    def _get_num_files(path):
        """Count all files (of any type) below ``path``."""
        total = 0
        for root, dirs, files in os.walk(path):
            total += len(files)
        return total

    @staticmethod
    def _get_emotion(filename, dataset_type):
        """Return the emotion code in ``filename``, or None if it cannot be found.

        RAVDESS: the third two-digit group. BERLIN: the first uppercase letter.
        """
        if dataset_type == RAVDESS:
            return re.findall("[0-9][0-9]", filename)[2]
        elif dataset_type == BERLIN:
            match = re.search("[A-Z]", filename)
            return match.group(0) if match else None
        return None

    @staticmethod
    def _get_actor(filename, dataset_type):
        """Return the actor id in ``filename``, or None if it cannot be found.

        RAVDESS: the seventh two-digit group. BERLIN: the first two-digit
        group, the same value ``_get_gender`` uses as the Berlin actor.
        """
        if dataset_type == RAVDESS:
            return re.findall("[0-9][0-9]", filename)[6]
        elif dataset_type == BERLIN:
            match = re.search("[0-9][0-9]", filename)
            return match.group(0) if match else None
        return None

    @classmethod
    def _get_gender(cls, filename, dataset_type):
        """Return ``"male"``/``"female"`` for the actor encoded in ``filename``."""
        if dataset_type == RAVDESS:
            actor = re.findall("[0-9][0-9]", filename)[6]
            # even-numbered RAVDESS actors are labelled female, odd ones male
            return "female" if int(actor) % 2 == 0 else "male"
        elif dataset_type == BERLIN:
            actor = re.findall("[0-9][0-9]", filename)[0]
            return cls._get_berlin_gender_from_actor(actor)
        return None

    @staticmethod
    def _get_berlin_gender_from_actor(actor):
        """Map a Berlin actor id to a gender; unknown ids give ``"Invalid actor!"``."""
        switch = {
            "03": "male",
            "08": "female",
            "09": "female",
            "10": "male",
            "11": "male",
            "12": "male",
            "13": "female",
            "14": "female",
            "15": "male",
            "16": "female",
        }
        return switch.get(actor, "Invalid actor!")


# Build the dataset object and fill it from the cached .npy files, falling back
# to re-reading the raw audio when the caches cannot be loaded.
dataset = Dataset()
dataset.load()

In [204]:
# Augment first, then normalize: augment_data() appends noisy and time-shifted
# copies to dataset.ravdess_data, and normalize() then standardizes the MFCCs of
# every clip in that array, originals and copies alike.
dataset.augment_data()
dataset.normalize()

In [205]:
import torch
import time
from torch import nn
from torchvision import models, transforms
from typing import Dict, Tuple, List
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm import tqdm

# Use the GPU when one is available; the model, the loss and every sample
# returned by PaddedDataset are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def pad(to_pad, r=MFCC, c=230):
    """Embed a 2-D array in the top-left corner of an ``r`` x ``c`` zero array.

    Args:
        to_pad: 2-D array no larger than ``(r, c)``.
        r: number of rows of the result (defaults to the MFCC count).
        c: number of columns of the result.

    Returns:
        A new float array of shape ``(r, c)``. Larger inputs are not
        truncated; the assignment fails with a NumPy broadcasting error.
    """
    n_rows, n_cols = to_pad.shape[0], to_pad.shape[1]
    padded = np.zeros((r, c))
    padded[:n_rows, :n_cols] = to_pad
    return padded


# X = (lambda x: pad(x['mfcc']))(dataset.ravdess_data)
def prep_Xy(data):
    """Turn the RAVDESS clips of ``data`` into model-ready tensors.

    Args:
        data: object exposing ``ravdess_data``, an iterable of dicts with an
            ``"mfcc"`` array and an ``"emotion"`` code convertible with ``int``.

    Returns:
        ``(X, y)``: ``X`` is a float32 tensor of the zero-padded MFCC matrices
        stacked along a new first axis; ``y`` holds the emotion codes shifted
        to start at 0 (``int(emotion) - 1``).
    """
    # Build one contiguous ndarray first: torch.tensor() on a Python list of
    # ndarrays converts element by element and is extremely slow.
    features = np.array([pad(clip["mfcc"]) for clip in data.ravdess_data])
    X = torch.tensor(features, dtype=torch.float32)
    y = torch.tensor([int(clip["emotion"]) - 1 for clip in data.ravdess_data])
    return X, y


class PaddedDataset(Dataset):
    """Map-style torch dataset over pre-padded feature tensors and their labels.

    ``Dataset`` here is ``torch.utils.data.Dataset`` (imported in this cell),
    not the notebook's own audio ``Dataset`` class that it shadows.
    """

    def __init__(self, X: torch.Tensor, y: torch.Tensor):
        """Store features ``X`` and labels ``y``; both are indexed along dim 0.

        Raises:
            ValueError: if ``X`` and ``y`` hold a different number of samples.
        """
        # super(PaddedDataset).__init__() built an unbound super object and
        # never reached the parent initializer; the zero-argument form does.
        super().__init__()
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same length, got {X.shape[0]} and {y.shape[0]}"
            )
        self.X = X
        self.y = y

    def __getitem__(self, index):
        """Return ``(features, label)`` on ``device``; features gain a leading channel dim."""
        return self.X[index].unsqueeze(0).to(device), self.y[index].to(device)

    def __len__(self):
        """Number of samples (size of ``X`` along dim 0)."""
        return self.X.shape[0]


paddedDataset = PaddedDataset(*prep_Xy(dataset))
# split 70% for training, 15% for validation, 15% for test
# NOTE(review): if augment_data() has already run, the split is drawn over the
# augmented array, so a clip and its noisy/shifted copies can land in different
# splits (train -> validation/test leakage). Consider splitting before augmenting.
train_size = int(0.7 * len(paddedDataset))
valid_size = int(0.15 * len(paddedDataset))
test_size = len(paddedDataset) - (train_size + valid_size)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    paddedDataset, [train_size, valid_size, test_size]
)

# Reshuffle the training samples every epoch so SGD does not see the same batch
# sequence each time; evaluation loaders keep a fixed order.
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_iterator = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_iterator = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# from collections import Counter
# Counter(prep_Xy(dataset)[1])

In [206]:
import torch.nn as nn
import torch.nn.functional as F


class Network(nn.Module):
    """Two conv/pool stages followed by three linear layers, producing 8 logits.

    Expects input of shape ``(batch, 1, 20, 230)``: 20 MFCC rows padded to 230
    columns. The flattened size ``12 * 2 * 54`` is tied to that input shape.
    """

    def __init__(self):
        super().__init__()
        # feature extractor
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)
        self.conv2 = nn.Conv2d(6, 12, kernel_size=5)

        # classifier head
        self.fc1 = nn.Linear(12 * 2 * 54, 120)
        self.fc2 = nn.Linear(120, 60)
        self.out = nn.Linear(60, 8)

    def forward(self, t):
        """Return raw class scores of shape ``(batch, 8)``; no softmax is applied."""
        # stage 1: 1x20x230 -> conv 6x16x226 -> pool 6x8x113
        stage1 = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)

        # stage 2: conv 12x4x109 -> pool 12x2x54
        stage2 = F.max_pool2d(F.relu(self.conv2(stage1)), kernel_size=2, stride=2)

        # flatten every sample to 1296 features, then 1296 -> 120 -> 60 -> 8
        flat = stage2.reshape(-1, 12 * 2 * 54)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))

        # logits only: the cross-entropy loss applies log-softmax itself
        return self.out(hidden)


def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: ``(batch, n_classes)`` tensor of class scores.
        y: ``(batch,)`` tensor of target class indices.

    Returns:
        A float tensor of shape ``(1,)`` on the same device as ``y``.
    """
    correct = preds.argmax(dim=1).eq(y)  # index of the highest score vs. target
    # Build the divisor on y's device: torch.FloatTensor([...]) always lives on
    # the CPU, which mixes devices (and can raise) when the batch is on CUDA.
    batch_size = torch.tensor([y.shape[0]], dtype=torch.float32, device=y.device)
    return correct.sum() / batch_size


def train(model, iterator, optimizer, criterion):
    """Run one optimization pass over ``iterator``.

    Args:
        model: network to train; switched to training mode.
        iterator: sized iterable of ``(X, y)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, y)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of batches.
    """
    model.train()
    running_loss, running_acc = 0, 0

    for inputs, targets in iterator:
        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, targets)
        batch_acc = categorical_accuracy(outputs, targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches


def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    Args:
        model: network to evaluate; switched to evaluation mode.
        iterator: sized iterable of ``(X, y)`` batches.
        criterion: loss called as ``criterion(predictions, y)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of batches.
    """
    model.eval()
    running_loss, running_acc = 0, 0

    # no gradients are needed for scoring
    with torch.no_grad():
        for inputs, targets in iterator:
            outputs = model(inputs)
            running_loss += criterion(outputs, targets).item()
            running_acc += categorical_accuracy(outputs, targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches


def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as ints, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


# def train(model, train_loader, optimizer, epochs=10, log_interval=100, device=device):
#     model.to(device).train()
#     for batch_idx, (X, y) in enumerate(train_loader):
#         optimizer.zero_grad()
#         output = model(X)
#         loss = F.cross_entropy(output, y)
#         loss.backward()
#         optimizer.step()
#         if batch_idx % log_interval == 0:
#             print(
#                 "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
#                     epochs,
#                     batch_idx * len(X),
#                     len(train_loader.dataset),
#                     100.0 * batch_idx / len(train_loader),
#                     loss.item(),
#                 )
#             )


# def test(model, test_loader, device=device):
#     model.eval()
#     test_loss = 0
#     correct = 0
#     with torch.no_grad():
#         for X, y in test_loader:
#             output = model(X)
#             test_loss += F.cross_entropy(output, y).item()  # sum up batch loss
#             pred = output.argmax(
#                 dim=1, keepdim=True
#             )  # get the index of the max log-probability
#             correct += pred.eq(y.view_as(pred)).sum().item()

#     test_loss /= len(test_loader.dataset)

#     print(
#         "\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
#             test_loss,
#             correct,
#             len(test_loader.dataset),
#             100.0 * correct / len(test_loader.dataset),
#         )
#     )


# Model, optimizer and loss for the emotion classifier; model and loss are
# placed on the same device the dataset returns its samples on.
model = Network().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
criterion = nn.CrossEntropyLoss().to(device)
# train(net, trainloader, optimizer)
# test(net, testloader)
N_EPOCHS = 40

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float("inf")

for epoch in range(N_EPOCHS):
    start_time = time.time()

    # one pass over the training split, then score the validation split
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # keep only the weights of the best epoch by validation loss
    # NOTE(review): the file name reads "classifer" (sic); any code that loads
    # the checkpoint must use the same spelling.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "classifer-model.pt")

    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%")

Epoch: 01 | Epoch Time: 0m 6s
	Train Loss: 2.054 | Train Acc: 16.14%
	 Val. Loss: 1.990 |  Val. Acc: 18.87%
Epoch: 02 | Epoch Time: 0m 8s
	Train Loss: 1.938 | Train Acc: 24.08%
	 Val. Loss: 1.836 |  Val. Acc: 26.62%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: 1.752 | Train Acc: 30.86%
	 Val. Loss: 1.682 |  Val. Acc: 33.18%
Epoch: 04 | Epoch Time: 0m 6s
	Train Loss: 1.669 | Train Acc: 34.76%
	 Val. Loss: 1.611 |  Val. Acc: 35.69%
Epoch: 05 | Epoch Time: 0m 6s
	Train Loss: 1.627 | Train Acc: 36.55%
	 Val. Loss: 1.582 |  Val. Acc: 36.46%
Epoch: 06 | Epoch Time: 0m 5s
	Train Loss: 1.593 | Train Acc: 37.02%
	 Val. Loss: 1.558 |  Val. Acc: 38.72%
Epoch: 07 | Epoch Time: 0m 4s
	Train Loss: 1.559 | Train Acc: 38.21%
	 Val. Loss: 1.544 |  Val. Acc: 39.79%
Epoch: 08 | Epoch Time: 0m 4s
	Train Loss: 1.528 | Train Acc: 40.26%
	 Val. Loss: 1.523 |  Val. Acc: 40.10%
Epoch: 09 | Epoch Time: 0m 6s
	Train Loss: 1.492 | Train Acc: 41.84%
	 Val. Loss: 1.497 |  Val. Acc: 40.87%
Epoch: 10 | Epoch Time: 0m 7