In [5]:
import torch
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import os

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
gen = torch.manual_seed(2024)

cpu


## Dataset
Data is expected to be located in a folder in separate files with .npy format. This is audio data encoded in Numpy array format which is loaded directly. The dataloader calculates MFCC coefficients for each of the audio files. The labels and file locations are read from a `.csv` datafile.

Additionally, these files are expected to be of a constant length and sample rate. This size is entered manually below; this was run with audio of 15 seconds length and 22050 sample rate.

In [7]:
class SoundDataset(Dataset):
    def __init__(self, datafile, data_dir, transform = None):
        self.data_dir = data_dir
        self.data_file = pd.read_csv(datafile)
        self.transform = transform
    
    def __len__(self):
        return len(self.data_file)
    
    def __getitem__(self, idx):
        dat_file = self.data_file.iloc[idx].filepath
        dat_file = os.path.join(self.data_dir, dat_file)
        label = self.data_file.iloc[idx].multi_classification
        audio_np = np.load(dat_file, allow_pickle=True)

        if self.transform:
            audio_np = self.transform(audio_np)
        return audio_np, label

In [8]:
class MFCC_Transform:
    def __init__(self, n_mfcc_coeffs):
        self.n_mfcc_coeffs = n_mfcc_coeffs
        self.sr = 22050

    def __call__(self, sample):
        return librosa.feature.mfcc(y=sample, sr=self.sr, n_mfcc=self.n_mfcc_coeffs)

In [9]:
mfcc_trans = MFCC_Transform(70)
dataset = SoundDataset("multi_data_description.csv", "new_np_data/", transform=mfcc_trans)
train, test, validate = torch.utils.data.random_split(dataset, [0.8, 0.1, 0.1], generator=gen)
train_dataloader = DataLoader(train, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test, batch_size=128, shuffle=True)
validation_dataloader = DataLoader(validate, batch_size=128, shuffle=True)

## Main Architecture
The architecture below uses a hybrid CNN / GRU architecture. First, CNN layers are applied with batch normalisation and dropout, to attempt to extract useful features from the audio data. 

The outputs from the CNN layers are fed to the GRU layer, in order to extract time-series patterns from the CNN layers.

Then, the GRU outputs are consolidated with a final set of Linear layers, to produce class output.

In [15]:
class CNN_GRU(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(CNN_GRU, self).__init__()

        if len(input_size) != 2:
            raise Exception("Input should have 2 axes")
        self.in_x, self.in_y = input_size
        self.leakyRelu = torch.nn.LeakyReLU(negative_slope=0.2)
        # self.maxpool = F.max_pool2d

        kernel1 = 16
        self.conv1 = torch.nn.Conv2d(1, 32, kernel_size = kernel1)
        self.bn1 = torch.nn.BatchNorm2d(32)
        self.dropout1 = torch.nn.Dropout2d(0.4)

        kernel2 = 8
        self.conv2 = torch.nn.Conv2d(32, 16, kernel_size = kernel2)
        self.bn2 = torch.nn.BatchNorm2d(16)
        self.dropout2 = torch.nn.Dropout2d(0.4)

        kernel3 = 4
        self.conv3 = torch.nn.Conv2d(16, 1, kernel_size = kernel3)
        self.bn3 = torch.nn.BatchNorm2d(1)
        self.dropout3 = torch.nn.Dropout2d(0.4)


        size_x = self.in_x - (kernel1 - 1) - (kernel2 - 1) - (kernel3 - 1)
        size_y = self.in_y - (kernel1 - 1) - (kernel2 - 1) - (kernel3 - 1)
        self.hidden_size = 64
        self.gru = torch.nn.GRU(input_size=size_x, hidden_size=self.hidden_size, batch_first=True)
        self.seq_len = size_y

        self.fc1 = torch.nn.Linear(in_features=self.seq_len*self.hidden_size, out_features=256)
        self.sig_activ = torch.nn.Sigmoid()
        self.fc2 = torch.nn.Linear(in_features=256, out_features=32)
        self.fc3 = torch.nn.Linear(in_features=32, out_features=output_size)
        
    def forward(self, x):
        x = self.conv1(x)
        if len(x.shape) == 3:
            x = x.unsqueeze(0)
        x = self.bn1(x)
        x = self.dropout1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.dropout2(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.dropout3(x)

        x = self.leakyRelu(x)
        x = x.view(x.size(0), x.size(3), -1)  # batch, seq_len, features
        x, _ = self.gru(x)
        x = x.reshape(-1, self.hidden_size*self.seq_len)
        # x = x.view(x.size(0), -1)  # batch, seq_len, features
        x = self.fc1(x)
        x = self.sig_activ(x)
        x = self.fc2(x)
        x = self.sig_activ(x)
        x = self.fc3(x)
        x = self.sig_activ(x)
        return x

In [16]:
def train(model, dataloader, testloader, num_epochs, lr):
    loss_function = torch.nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(model.parameters(), lr = lr)

    for epoch in range(num_epochs):
        model.train()
        loss = 0
        for inputs, labels in dataloader:
            labels = labels.to(device)
            inputs = inputs.unsqueeze(1).to(device)
            # print(inputs.shape)
            outputs = model(inputs).to(device)
            loss = loss_function(outputs, labels)
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
        # loss /= len(dataloader)
        # loss.backward()

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")
        
        # Test accuracy after each epoch
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in testloader:
                # calculate outputs by running images through the network
                inputs = inputs.unsqueeze(1).to(device)
                labels = labels.to(device)
                outputs = model(inputs).to(device)
                # the class with the highest energy is what we choose as prediction
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f'Accuracy: {100 * correct / total} %')

In [17]:
cnn_gru = CNN_GRU((70, 646), 7).to(device)
print(cnn_gru)

CNN_GRU(
  (leakyRelu): LeakyReLU(negative_slope=0.2)
  (conv1): Conv2d(1, 32, kernel_size=(16, 16), stride=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout2d(p=0.4, inplace=False)
  (conv2): Conv2d(32, 16, kernel_size=(8, 8), stride=(1, 1))
  (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout2d(p=0.4, inplace=False)
  (conv3): Conv2d(16, 1, kernel_size=(4, 4), stride=(1, 1))
  (bn3): BatchNorm2d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout3): Dropout2d(p=0.4, inplace=False)
  (gru): GRU(45, 64, batch_first=True)
  (fc1): Linear(in_features=39744, out_features=256, bias=True)
  (sig_activ): Sigmoid()
  (fc2): Linear(in_features=256, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=7, bias=True)
)


In [19]:
train(cnn_gru, train_dataloader, test_dataloader, 30, 0.01)

Epoch 1/30, Loss: 1.6096
Accuracy: 45.0 %
Epoch 2/30, Loss: 1.5531
Accuracy: 45.0 %
Epoch 3/30, Loss: 1.5061
Accuracy: 45.0 %
Epoch 4/30, Loss: 1.4781
Accuracy: 45.0 %
Epoch 5/30, Loss: 2.0201
Accuracy: 45.0 %
Epoch 6/30, Loss: 1.3927
Accuracy: 45.0 %
Epoch 7/30, Loss: 1.7257
Accuracy: 45.0 %
Epoch 8/30, Loss: 2.1213
Accuracy: 45.0 %
Epoch 9/30, Loss: 1.9582
Accuracy: 45.0 %
Epoch 10/30, Loss: 1.7509
Accuracy: 45.0 %
Epoch 11/30, Loss: 1.3844
Accuracy: 45.0 %
Epoch 12/30, Loss: 1.9433
Accuracy: 45.0 %
Epoch 13/30, Loss: 1.4883
Accuracy: 45.0 %
Epoch 14/30, Loss: 1.4574
Accuracy: 45.0 %
Epoch 15/30, Loss: 1.4522
Accuracy: 45.0 %
Epoch 16/30, Loss: 1.4459
Accuracy: 45.0 %
Epoch 17/30, Loss: 1.2255
Accuracy: 45.0 %
Epoch 18/30, Loss: 1.6408
Accuracy: 45.0 %
Epoch 19/30, Loss: 1.7224
Accuracy: 45.0 %
Epoch 20/30, Loss: 1.4744
Accuracy: 45.0 %
Epoch 21/30, Loss: 1.5367
Accuracy: 45.0 %
Epoch 22/30, Loss: 1.7397
Accuracy: 45.0 %
Epoch 23/30, Loss: 1.4902
Accuracy: 45.0 %
Epoch 24/30, Loss: 1

# Model failure
As can be observed, the model does not converge, and both the accuracy on the test set and the loss from the training set are roughly constant.

This is likely because the model is too complex, and suffers from over-fitting. As can be seen in the cell below, the model outputs the same prediction for any input, likely because it is just outputting the mostly common class from the test set.

In [20]:
for i in  range(32):
    dat, lab = validate[i]
    dat = torch.tensor(dat)
    dat = dat.unsqueeze(1).reshape([1, 1, 70, -1])
    print(cnn_gru(dat), lab)

tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]],
       grad_fn=<SigmoidBackward0>) 0
tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]],
       grad_fn=<SigmoidBackward0>) 3
tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]],
       grad_fn=<SigmoidBackward0>) 4
tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]],
       grad_fn=<SigmoidBackward0>) 3
tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]],
       grad_fn=<SigmoidBackward0>) 3
tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]],
       grad_fn=<SigmoidBackward0>) 3
tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]],
       grad_fn=<SigmoidBackward0>) 2
tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]],
       grad_fn=<SigmoidBackward0>) 0
tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]],
       grad_fn=<SigmoidBackward0>) 3
tensor([[0.9989, 0.0266, 0.0014, 0.6137, 0.0022, 0.0020, 0.0038]