# Audio Classification

## Basic steps

* Load model
* Classify test data

## Usage examples at the end


In [1]:
import sklearn.metrics as metrics
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as aT
import torchvision.models as models


In [2]:
class DenseNet(nn.Module):
    """DenseNet-201 wrapper taking single-channel input with a custom class count.

    The torchvision ``densenet201`` network is created with ``pretrained=True``;
    its first convolution and its classifier are then replaced by freshly
    constructed layers.
    """

    def __init__(self, num_classes):
        """Build the network.

        Args:
            num_classes: Number of output units of the replacement classifier.
        """
        super().__init__()

        backbone = models.densenet201(pretrained=True)

        # Replace the stem convolution with one that accepts a single input
        # channel while keeping the original output width and geometry.
        stem = backbone.features.conv0
        backbone.features.conv0 = nn.Conv2d(
            in_channels=1,
            out_channels=stem.out_channels,
            kernel_size=stem.kernel_size,
            stride=stem.stride,
            padding=stem.padding,
        )

        # New classification head: 1920 input features -> num_classes logits.
        backbone.classifier = nn.Linear(in_features=1920, out_features=num_classes)

        self.model = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its raw output."""
        return self.model(x)


In [29]:
# Use the first CUDA device when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Class labels; `predict` indexes this list with the model's arg-max output.
classes = [str(class_id) for class_id in range(1, 8)]


def audio_loader(path, max_length_in_seconds=4):
    """Load an audio file and turn it into the feature tensor fed to the model.

    The waveform is down-mixed to one channel if necessary, padded with zeros
    or truncated to exactly ``max_length_in_seconds`` seconds, resampled to
    22050 Hz, converted to 64 MFCC coefficients and finally passed through
    ``AmplitudeToDB``.

    Args:
        path: Location of the audio file, passed straight to
            ``torchaudio.load``.
        max_length_in_seconds: Duration every clip is padded/cut to before
            feature extraction.

    Returns:
        The transformed tensor produced by the Resample -> MFCC ->
        AmplitudeToDB pipeline.
    """
    target_sample_rate = 22050

    waveform, sample_rate = torchaudio.load(path)

    # Down-mix multi-channel recordings to mono. Without this, `predict`
    # receives one "batch item" per channel and cannot map the result to a
    # single class label. Single-channel input is left untouched.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    num_frames = waveform.shape[-1]
    # int() keeps the frame count usable by pad/narrow when a fractional
    # duration is requested.
    max_frames = int(sample_rate * max_length_in_seconds)

    # Pad audio with zeros if too short or cut audio if too long
    if num_frames < max_frames:
        waveform = torch.nn.functional.pad(waveform, (0, max_frames - num_frames))
    elif num_frames > max_frames:
        waveform = waveform.narrow(dim=1, start=0, length=max_frames)

    transforms = nn.Sequential(
        # Resample from the file's actual rate (previously hard-coded to
        # 44100, which resampled by the wrong ratio for any other rate).
        aT.Resample(sample_rate, target_sample_rate),
        # NOTE(review): MFCC is still configured with the file's original
        # rate although its input is now at 22050 Hz. Left unchanged because
        # the saved model was presumably trained on exactly these features
        # -- confirm before altering.
        aT.MFCC(sample_rate=sample_rate, n_mfcc=64),
        aT.AmplitudeToDB(),
    )

    return transforms(waveform)


def predict(model, input_tensor):
    """Classify one feature tensor and report the winning class.

    Args:
        model: Network to evaluate; it is switched to eval mode here.
        input_tensor: Features for one clip. A dimension of size one is
            inserted at position 1 before the tensor is moved to ``device``.

    Returns:
        A ``(label, confidence)`` pair: ``label`` is the entry of the global
        ``classes`` list selected by the arg-max of the soft-maxed output,
        and ``confidence`` is the corresponding probability multiplied by
        100 (a tensor).
    """
    model.eval()
    batch = input_tensor.unsqueeze(1).to(device)

    with torch.no_grad():
        logits = model(batch).squeeze()
        probabilities = F.softmax(logits, dim=-1)

        confidence, class_index = torch.max(probabilities.data, -1)
        confidence *= 100

        # Translate the arg-max index into its class label.
        return classes[class_index], confidence


def make_predictions(audio_paths, labels):
    """Classify a set of audio files and score the predictions.

    The serialized model is loaded from ``MODEL_PATH`` onto ``device`` on
    every call. Files are processed pairwise with ``labels`` (iteration stops
    at the shorter of the two sequences).

    Args:
        audio_paths: Audio file locations, each handed to ``audio_loader``.
        labels: Ground-truth class labels, aligned with ``audio_paths``.

    Returns:
        ``(accuracy, precision, recall, f1)``. Accuracy is a percentage
        (0-100); precision, recall and F1 are weighted averages as returned
        by scikit-learn, with ``zero_division=0``.
    """
    MODEL_PATH = './IE643_190020066_CHALLENGE_MODEL.pt'
    model = torch.load(MODEL_PATH, device)

    # Only the predicted label is kept; the per-sample confidence is dropped.
    predictions = [
        predict(model, audio_loader(audio_path))[0]
        for audio_path, _ in zip(audio_paths, labels)
    ]

    weighted = {'average': 'weighted', 'zero_division': 0}
    accuracy = metrics.accuracy_score(labels, predictions) * 100
    precision = metrics.precision_score(labels, predictions, **weighted)
    recall = metrics.recall_score(labels, predictions, **weighted)
    f1 = metrics.f1_score(labels, predictions, **weighted)

    return accuracy, precision, recall, f1


cuda:0


## Example 1

In [33]:
# Two hand-picked clips; each label is the name of the clip's parent folder.
labels = ['2', '6']
audio_paths = ['./dataset/2/430816.wav', './dataset/6/112564.wav']

# Accuracy comes back as a percentage, the other metrics as fractions.
scores = make_predictions(audio_paths, labels)
accuracy, precision, recall, f1 = scores
print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")




Accuracy: 0.000, Precision: 0.000, Recall: 0.000, F1: 0.000


## Example 2

In [31]:
from pathlib import Path

# Evaluate on every .wav file found anywhere below ./dataset.
dataset_root = Path('./dataset')
audio_paths = list(dataset_root.glob('**/*.wav'))

# The ground-truth label of a clip is the name of the folder containing it.
labels = [wav_path.parent.name for wav_path in audio_paths]

scores = make_predictions(audio_paths, labels)
accuracy, precision, recall, f1 = scores
print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")




Accuracy: 51.261, Precision: 0.542, Recall: 0.513, F1: 0.479
