# Audio Classification

* Load model
* Classify test data

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as aT
import torchvision.models as models


In [5]:
class DenseNet(nn.Module):
    """DenseNet-201 adapted to single-channel input and a custom class count.

    The torchvision DenseNet-201 is created with ``pretrained=True``. Its first
    convolution is then replaced by a newly created one that accepts a single
    input channel, and its classifier is replaced by a new linear layer with
    ``num_classes`` outputs.

    Args:
        num_classes: Number of output values produced by the classifier.
    """

    def __init__(self, num_classes):
        super().__init__()

        self.model = models.densenet201(pretrained=True)

        # Swap the stem convolution for a 1-channel version that keeps the
        # original output width, kernel size, stride and padding. The new
        # layer is freshly initialised; it does not reuse the old weights.
        conv0 = self.model.features.conv0
        self.model.features.conv0 = nn.Conv2d(
            1,
            conv0.out_channels,
            kernel_size=conv0.kernel_size,
            stride=conv0.stride,
            padding=conv0.padding,
        )

        # Take the feature width from the existing head instead of
        # hard-coding it (the value previously written here was 1920).
        in_features = self.model.classifier.in_features
        self.model.classifier = nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Run ``x`` through the wrapped model and return its output."""
        return self.model(x)


In [2]:
# Inference device: the first CUDA GPU when one is available, else the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Sampling rates in Hz: the rate the resampler assumes for incoming audio,
# and the rate it converts to (also the rate given to the MFCC transform).
original_rate, sample_rate = 44100, 22050

# Class labels, looked up by the index of the model's highest-scoring output.
classes = [str(label) for label in range(1, 8)]

def audio_loader(path, max_length_in_seconds=4):
    """Load an audio file and force it to a fixed duration.

    Args:
        path: Location of the audio file, passed to ``torchaudio.load``.
        max_length_in_seconds: Target duration. The frame budget is this value
            multiplied by the sample rate reported for the file.

    Returns:
        The waveform tensor, zero-padded at the end when the file is shorter
        than the target and cut to its leading frames when it is longer.
    """
    # The rate is read from the file itself, so it gets its own local name
    # rather than shadowing the module-level ``sample_rate``.
    waveform, file_rate = torchaudio.load(path)
    target_frames = file_rate * max_length_in_seconds
    shortfall = target_frames - waveform.shape[1]

    if shortfall > 0:
        # Too short: append zeros along the time axis.
        return F.pad(waveform, (0, shortfall))
    if shortfall < 0:
        # Too long: keep only the first ``target_frames`` frames.
        return waveform[:, :target_frames]
    return waveform

# Feature pipeline applied to a loaded waveform, in this order:
#   1. resample from ``original_rate`` to ``sample_rate``,
#   2. compute 64 MFCC coefficients at ``sample_rate``,
#   3. apply ``AmplitudeToDB`` to the result.
# NOTE(review): presumably this must mirror the preprocessing used when the
# model was trained -- confirm before changing any stage.
transforms = nn.Sequential(*(
    aT.Resample(orig_freq=original_rate, new_freq=sample_rate),
    aT.MFCC(sample_rate=sample_rate, n_mfcc=64),
    aT.AmplitudeToDB(),
))

def predict(audio_path, model=None):
    """Classify one audio file and return its label and the model's confidence.

    Args:
        audio_path: Path of the audio file to classify.
        model: Optional, already-loaded model. When omitted, the model is
            loaded from ``MODEL_PATH`` on every call (the original
            behaviour). Pass a loaded model to avoid re-reading the
            checkpoint when classifying many files. The model is moved to
            ``device`` and switched to eval mode.

    Returns:
        A ``(predicted, accuracy)`` tuple. ``predicted`` is the entry of
        ``classes`` with the highest softmax probability. ``accuracy`` is
        that probability as a percentage, returned as a 0-dim tensor.
        Despite its name it is the model's confidence in this single
        prediction, not a measured accuracy.
    """
    waveform = audio_loader(audio_path)

    # The network takes a single-channel input. Without this downmix each
    # channel of a multi-channel file would be treated as a separate batch
    # item, and the final label lookup would fail on a multi-element index.
    # NOTE(review): averaging channels is a conventional downmix; confirm it
    # matches how multi-channel files were handled during training.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    inputs = transforms(waveform)

    if model is None:
        MODEL_PATH = './IE643_190020066_CHALLENGE_MODEL.pt'
        # map_location lets a checkpoint saved on a GPU be restored on a
        # CPU-only machine, matching the fallback used to choose ``device``.
        # SECURITY: torch.load unpickles the file, which can execute
        # arbitrary code -- only load checkpoints from a trusted source.
        # NOTE: newer PyTorch releases default to ``weights_only=True``,
        # which rejects fully pickled modules; pass ``weights_only=False``
        # there if the file is trusted.
        model = torch.load(MODEL_PATH, map_location=device)
    model.to(device)
    model.eval()

    # Insert a channel axis after the leading axis, then move to the device.
    inputs = inputs.unsqueeze(1).to(device)

    with torch.no_grad():
        logits = model(inputs).squeeze(0)
        probabilities = F.softmax(logits, dim=-1)
        confidence, index = probabilities.max(dim=-1)

    return classes[index.item()], confidence * 100


cuda:0




In [8]:
# Example file to classify (relative to the notebook's working directory).
AUDIO_PATH = './dataset/2/430816.wav'

# predict() returns the predicted class label and a percentage score.
predicted, accuracy = predict(AUDIO_PATH)

# NOTE: the value reported as "accuracy" is the softmax probability of the
# predicted class (scaled to a percentage), i.e. the model's confidence for
# this one file -- it is not an accuracy measured over a dataset.
print(f"Predicted '{predicted}' with {accuracy:.2f}% accuracy")


Predicted '2' with 99.80% accuracy
