In [35]:
import numpy as np # linear algebra
import pandas as pd
import os
from torch.utils.data import Dataset
import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader


for dirname, folder_names, filenames in os.walk('/kaggle/input/urbansound8k'):
    for folder_name in folder_names:
        print(os.path.join(dirname, folder_name))


/kaggle/input/urbansound8k/fold7
/kaggle/input/urbansound8k/fold1
/kaggle/input/urbansound8k/fold3
/kaggle/input/urbansound8k/fold5
/kaggle/input/urbansound8k/fold10
/kaggle/input/urbansound8k/fold9
/kaggle/input/urbansound8k/fold8
/kaggle/input/urbansound8k/fold4
/kaggle/input/urbansound8k/fold2
/kaggle/input/urbansound8k/fold6


In [3]:
!pip install torchsummary
from torchsummary import summary

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


In [4]:
class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # 4 conv blocks / flatten / linear / softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.BatchNorm2d(16),  
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.BatchNorm2d(32),  # Batch Normalization
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.BatchNorm2d(64),  # Batch Normalization
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.BatchNorm2d(128), # Batch Normalization
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(128 * 5 * 4, 512)  # Adjusts size for fc1
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(p=0.4)  # Dropout before final layer
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = nn.ReLU()(x)  # ReLU activation function for hidden layers
        x = self.fc2(x)
        x = nn.ReLU()(x)  # ReLU activation function for hidden layers
        x = self.dropout(x)  # Applies Dropout
        logits = self.fc3(x)
        predictions = self.softmax(logits)
        return predictions

if __name__ == "__main__":
    cnn = CNNNetwork()
    summary(cnn, (1, 64, 44))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
       BatchNorm2d-2           [-1, 16, 66, 46]              32
              ReLU-3           [-1, 16, 66, 46]               0
         MaxPool2d-4           [-1, 16, 33, 23]               0
            Conv2d-5           [-1, 32, 35, 25]           4,640
       BatchNorm2d-6           [-1, 32, 35, 25]              64
              ReLU-7           [-1, 32, 35, 25]               0
         MaxPool2d-8           [-1, 32, 17, 12]               0
            Conv2d-9           [-1, 64, 19, 14]          18,496
      BatchNorm2d-10           [-1, 64, 19, 14]             128
             ReLU-11           [-1, 64, 19, 14]               0
        MaxPool2d-12             [-1, 64, 9, 7]               0
           Conv2d-13           [-1, 128, 11, 9]          73,856
      BatchNorm2d-14           [-1, 128

In [6]:
BATCH_SIZE = 128
EPOCHS = 250
LEARNING_RATE = 0.001

ANNOTATIONS_FILE = "/kaggle/input/urbansound8k/UrbanSound8K.csv"
AUDIO_DIR = "/kaggle/input/urbansound8k"
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050

In [30]:
class UrbanSoundDataset(Dataset):

    def __init__(self,
                 annotations_file,
                 audio_dir,
                 transformation,
                 target_sample_rate,
                 num_samples,
                 device):
        self.annotations = pd.read_csv(annotations_file)
        self.annotations = self.annotations.iloc[:6990]
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
         return len(self.annotations)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)  # Ensures transformation is on the same device
        return signal, label

    def _cut_if_necessary(self, signal):
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        fold = f"fold{self.annotations.iloc[index, 5]}"
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index, 0])
        return path

    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 6]

if __name__ == "__main__":
    ANNOTATIONS_FILE = "/kaggle/input/urbansound8k/UrbanSound8K.csv"
    AUDIO_DIR = "/kaggle/input/urbansound8k"
    SAMPLE_RATE = 22050
    NUM_SAMPLES = 22050

    device = ("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device {device}")

    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
        ).to(device)  
    
    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)
    print(f"There are {len(usd)} samples in the dataset.")

Using device cuda
There are 6990 samples in the dataset.


In [27]:
# Training block

def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader

def train_single_epoch(model, data_loader, loss_fn, optimizer, device):
    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    
    model.train()  # Sets the model to training mode

    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass: computes predicted outputs by passing inputs to the model
        predictions = model(inputs)
        loss = loss_fn(predictions, targets)

        # Backward pass: computes gradient of the loss with respect to model parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulates loss
        running_loss += loss.item()

        # accuracy
        _, predicted_labels = torch.max(predictions, 1)
        correct_predictions += (predicted_labels == targets).sum().item()
        total_samples += targets.size(0)

    # Calculates the average loss over the epoch
    average_loss = running_loss / len(data_loader)
    
    # Calculates accuracy
    accuracy = correct_predictions / total_samples * 100

    print(f"Loss: {average_loss:.4f}, Accuracy: {accuracy:.2f}%")

def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        if (i + 1) % 20 == 0:
            Data_Snapshot = f'/kaggle/working/cnnnet_epoch_{i+1}.pth'
            torch.save(model.state_dict(), Data_Snapshot)
            print(f" saved at {Data_Snapshot}")
        print("---------------------------")
    print("Finished training")

if __name__ == "__main__":
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using {device}")

    # instantiating our dataset object and create data loader
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)
    
    train_dataloader = create_data_loader(usd, BATCH_SIZE)

    # constructs model and assigns it to device
    cnn = CNNNetwork().to(device)
    print(cnn)


31.3s 8821 Epoch 1
241.6s 8822 Loss: 2.2735, Accuracy: 16.45%
241.6s 8823 ---------------------------
241.6s 8824 Epoch 2
350.1s 8825 Loss: 2.2567, Accuracy: 17.81%
350.1s 8826 ---------------------------
350.1s 8827 Epoch 3
458.7s 8828 Loss: 2.2384, Accuracy: 21.10%
458.7s 8829 ---------------------------
458.7s 8830 Epoch 4
567.2s 8831 Loss: 2.2286, Accuracy: 22.39%
567.2s 8832 ---------------------------
567.2s 8833 Epoch 5
675.9s 8834 Loss: 2.2156, Accuracy: 23.92%
675.9s 8835 ---------------------------
675.9s 8836 Epoch 6
784.8s 8837 Loss: 2.2109, Accuracy: 24.48%
784.8s 8838 ---------------------------
784.8s 8839 Epoch 7
894.1s 8840 Loss: 2.2001, Accuracy: 25.36%
894.1s 8841 ---------------------------
894.1s 8842 Epoch 8
1003.4s 8843 Loss: 2.2015, Accuracy: 24.28%
1003.4s 8844 ---------------------------
1003.4s 8845 Epoch 9
1113.0s 8846 Loss: 2.1853, Accuracy: 26.05%
1113.0s 8847 ---------------------------
1113.0s 8848 Epoch 10
1221.4s 8849 Loss: 2.1683, Accuracy: 28.51%
122

In [26]:
# Inference block

class_mapping = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music"
]


def predict(model, input, target, class_mapping):
    model.eval()
    with torch.no_grad():
        predictions = model(input)
        # Tensor (1, 10) -> [ [0.1, 0.01, ..., 0.6] ]
        predicted_index = predictions[0].argmax(0)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected


if __name__ == "__main__":
    # loads back the model
    cnn = CNNNetwork()
    state_dict = torch.load("/kaggle/input/trained-model-data/cnnnet_epoch_160.pth", map_location=torch.device('cpu'))
    cnn.load_state_dict(state_dict)

    # loads urban sound dataset dataset
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            "cpu")
    
    correct_predictions = 0
    total_samples = 0

    for i in range(len(usd) - 1742, len(usd)):
        input, target = usd[i][0], usd[i][1]
        input = input.unsqueeze(0)
        cnn.eval()
        with torch.no_grad():
            predictions = cnn(input)
            predicted_index = predictions[0].argmax(0)
            # Maps indices to class names
            predicted = class_mapping[predicted_index.item()]
            expected = class_mapping[target.item()]
            if predicted == expected:
                correct_predictions += 1

            total_samples += 1

    # Calculates overall accuracy
    accuracy = correct_predictions / total_samples
    print(f"Overall accuracy on the last 1742 samples: {accuracy*100:.4f}")


Overall accuracy on the last 1742 samples: 59.3571
