# Urban sounds classification

This notebook is a simple example of how to use the `fastai` library to classify urban sounds. The dataset used is the [UrbanSound8K](https://urbansounddataset.weebly.com/urbansound8k.html) which contains 8732 labeled sound excerpts (<=4s) of urban sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark, drilling, engine_idling, gun_shot, jackhammer, siren, and street_music.

We will start by importing the necessary libraries and loading the data.

In [69]:
import pandas as pd
import os
import torchaudio
import torch
from torch import nn, Tensor
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader

Make sure to enable the GPU in the settings to speed up the training process.

In [70]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

Now, let's define the dataset class. Besides the `__len__` and `__getitem__` methods, this dataset class makes sure that all samples are loaded with the same duration, resamples the audio if necessary, brings the audio to only one channel, and pads it if necessary. 

In [71]:
class UrbanSoundDataset(Dataset):
    def __init__(self, annotations_file: str, audio_dir: str, target_sample_rate: int, transformation: torchaudio.transforms, num_samples: int, device) -> None:
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.target_sample_rate = target_sample_rate
        self.device = device
        self.transformation = transformation.to(self.device)
        self.num_samples = num_samples


    def __getitem__(self, index: int) -> tuple[Tensor, str]:
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        return signal, label


    def __len__(self) -> int:
        return len(self.annotations)
    

    def _get_audio_sample_path(self, index: int) -> str:
        fold = f'fold{self.annotations.iloc[index, 5]}'
        file_name = self.annotations.iloc[index, 0]
        path = os.path.join(self.audio_dir, fold, file_name)
        return path
    
        
    def _get_audio_sample_label(self, index: int) -> int:
        return self.annotations.iloc[index, 6]
    

    def _resample_if_necessary(self, signal: Tensor, sr: int) -> tuple[Tensor, int]:
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(self.device)
            signal = resampler(signal)
        return signal  
        

    def _mix_down_if_necessary(self, signal: Tensor) -> Tensor:
        if signal.size()[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal
    

    def _cut_if_necessary(self, signal: Tensor):
        # signal -> Tensor -> (num_channels, num_samples)
        if signal.shape[1] > self.num_samples:
            signal = signal[: , :self.num_samples]
        return signal
    

    def _right_pad_if_necessary(self, signal: Tensor):
        length = signal.shape[1]
        if length < self.num_samples:
            missing_samples = self.num_samples - length
            last_dimension_padding = (0, missing_samples) # (1, 2) -> (left, right)
            # [1, 1, 1] -> [0, 1, 1, 1, 0 , 0]
            signal = torch.nn.functional.pad(signal, last_dimension_padding)
        return signal

The sample rate used is 22050 Hz, and the duration is 4 seconds. The audio is resampled to 22050 Hz if it is not already in this sample rate. If the audio is stereo, it is converted to mono. If the audio is shorter than 4 seconds, it is padded with zeros. If the audio is longer than 4 seconds, it is truncated.

In [79]:
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050
ANNOTATIONS_FILE = './data/UrbanSound8K.csv'
AUDIO_DIR = os.path.join(os.path.curdir, 'data')

class_mapping = {
    0: 'air_conditioner',
    1: 'car_horn',
    2: 'children_playing',
    3: 'dog_bark',
    4: 'drilling',
    5: 'enginge_idling',
    6: 'gun_shot',
    7: 'jackhammer',
    8: 'siren',
    9: 'street_music'
}

mel_spectogram = torchaudio.transforms.MelSpectrogram(
    SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)

usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, SAMPLE_RATE, mel_spectogram, NUM_SAMPLES, device)

Now, for audio classification, we will use a simple neural network with four convolutional layers and one fully connected layers. The model is defined in the `AudioCNN` class. The `forward` method defines the forward pass of the model.

In [73]:
class AudioCNN(nn.Module):
    def __init__(self) -> None:
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128 * 5 * 4, 10)
        self.softmax = nn.Softmax(dim=1)


    def forward(self, input_data: Tensor) -> Tensor:
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions

In [74]:
model = AudioCNN().to(device)
summary(model, (1, 64, 44))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
              ReLU-2           [-1, 16, 66, 46]               0
         MaxPool2d-3           [-1, 16, 33, 23]               0
            Conv2d-4           [-1, 32, 35, 25]           4,640
              ReLU-5           [-1, 32, 35, 25]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14                   

In [82]:
def train(model: nn.Module, data_loader: torch.utils.data.DataLoader, loss_fn: nn.Module, optimizer: torch.optim.Optimizer, num_epochs: int, device: str) -> None:
    model.train()
    for epoch in range(num_epochs):
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            
            output = model(data)
            loss = loss_fn(output, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch} - Loss: {loss.item()}')

def predict(model: nn.Module, input: Tensor, target: int, class_maping: dict) -> Tensor:
    predictions = []
    model.eval()
    with torch.no_grad():
        output = model(input)
        _, predicted_index = torch.max(output, 1)
        predicted_class = class_maping[predicted_index.item()]
        expected_class = class_maping[target]
        predictions.append((expected_class, predicted_class))

    return predictions

In [76]:
train_data_loader = DataLoader(usd, batch_size=128, shuffle=True)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train(model, train_data_loader, loss_fn, optimizer, 10, device)

Epoch 0 - Loss: 2.2592251300811768


In [84]:
predictions = predict(model, usd[0][0].unsqueeze(0), usd[0][1], class_mapping)
print(f'Expected: {predictions[0][0]}, Predicted: {predictions[0][1]}')

Expected: dog_bark, Predicted: dog_bark
