# Get the UrbanSound8K dataset

In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
# Tell the Kaggle tooling to look for its configuration in this Drive folder
# (presumably where the kaggle.json credentials file is kept -- TODO confirm).
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

In [3]:
# Change the working directory to the Kaggle folder on the mounted Drive so
# that the relative paths used by later cells resolve against it.
%cd /content/gdrive/My Drive/Kaggle

/content/gdrive/My Drive/Kaggle


In [7]:
!pip install opendatasets --upgrade --quiet

In [8]:
import opendatasets as od

# Download the UrbanSound8K archive from Kaggle. The recorded output shows the
# call prompting for Kaggle credentials and writing into ./urbansound8k.
dataset_url = 'https://www.kaggle.com/chrisfilo/urbansound8k'
od.download(dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: saimislam
Your Kaggle Key: ··········
Downloading urbansound8k.zip to ./urbansound8k


100%|██████████| 5.61G/5.61G [01:11<00:00, 84.4MB/s]





# Custom Dataset Class

In [57]:
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import os
import torch

In [58]:
class UrbanSoundDataset(Dataset):
    """Dataset that yields fixed-length, transformed audio clips with labels.

    Each item is loaded from disk, moved to ``device``, resampled to
    ``target_sample_rate`` if needed, mixed down to a single channel,
    truncated or zero-padded on the right to exactly ``num_samples`` samples,
    and finally passed through ``transformation``.

    The annotations CSV is addressed by column position: column 0 is used as
    the audio file name, column 5 as the fold number (files live under
    ``<audio_dir>/fold<N>/``) and column 6 as the label.

    Args:
        annotations_file: Path to the annotations CSV.
        audio_dir: Directory that contains the ``fold<N>`` sub-directories.
        transformation: Module applied to the prepared signal; it is moved
            to ``device`` once at construction time.
        target_sample_rate: Sample rate every clip is converted to.
        num_samples: Exact number of samples per clip after cut/pad.
        device: Device on which the signal processing is carried out.
    """

    def __init__(self,
                 annotations_file,
                 audio_dir,
                 transformation,
                 target_sample_rate,
                 num_samples,
                 device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        # One resampler per source sample rate, created lazily and kept on
        # ``device`` so it is not rebuilt for every item.
        self._resamplers = {}

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(transformed_signal, label)`` for the row at ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        # assumes torchaudio.load returns a (channels, samples) tensor -- the
        # helpers below index dim 0 as channels and dim 1 as time.
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        return signal, label

    def _cut_if_necessary(self, signal):
        """Drop samples beyond ``num_samples`` along the time axis."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad the end of the time axis up to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding for the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate``.

        The resampler is moved to ``self.device`` because the signal already
        lives there; a resampler left on the CPU cannot process a CUDA tensor.
        """
        if sr != self.target_sample_rate:
            resampler = self._resamplers.get(sr)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(
                    sr, self.target_sample_rate).to(self.device)
                self._resamplers[sr] = resampler
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average multi-channel audio into one channel, keeping the channel dim."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<col 5>/<col 0>`` for the row at ``index``."""
        fold = f"fold{self.annotations.iloc[index, 5]}"
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[
            index, 0])
        return path

    def _get_audio_sample_label(self, index):
        """Return the label stored in column 6 of the row at ``index``."""
        return self.annotations.iloc[index, 6]

In [6]:
# Print the current working directory to confirm where relative paths resolve.
!pwd

/content/gdrive/MyDrive/Kaggle


# Apply all data preprocessing

In [59]:
# Dataset locations (relative to the working directory) and audio settings.
ANNOTATIONS_FILE = "urbansound8k/UrbanSound8K.csv"
AUDIO_DIR = "urbansound8k"
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050

# Prefer the GPU when one is visible to PyTorch.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")

# Transform applied to every prepared clip.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64
)

usd = UrbanSoundDataset(
    annotations_file=ANNOTATIONS_FILE,
    audio_dir=AUDIO_DIR,
    transformation=mel_spectrogram,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)
print(f"There are {len(usd)} samples in the dataset.")

Using device cpu
There are 8732 samples in the dataset.


In [60]:
# Smoke test: fetch the first item so loading and preprocessing run end to end.
signal, label = usd[0]

# Create Model

In [61]:
from torch import nn 
from torchsummary import summary

class CNNNetwork(nn.Module):
  """Four-block convolutional classifier for single-channel spectrograms.

  Each block is Conv2d(kernel 3, stride 1, padding 2) -> ReLU -> MaxPool2d(2).
  For a (1, 64, 44) input the last block emits 128 x 5 x 4 features, which is
  the size the linear layer's input is hard-coded to.

  The output of ``forward`` depends on the module mode:
    * training mode: raw logits of shape (batch, 10). nn.CrossEntropyLoss
      applies log-softmax itself, so it must not be fed probabilities.
    * eval mode: softmax probabilities of shape (batch, 10).
  """

  @staticmethod
  def _conv_block(in_channels, out_channels):
    """Build one Conv2d -> ReLU -> MaxPool2d stage."""
    return nn.Sequential(
        nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                  kernel_size=3, stride=1, padding=2),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )

  def __init__(self):
    super().__init__()
    # 4 conv blocks -> flatten -> linear -> softmax (softmax only in eval mode)
    self.conv1 = self._conv_block(1, 16)
    self.conv2 = self._conv_block(16, 32)
    self.conv3 = self._conv_block(32, 64)
    self.conv4 = self._conv_block(64, 128)

    self.flatten = nn.Flatten()
    # 128 channels x 5 x 4 spatial positions after the fourth block.
    self.linear = nn.Linear(128*5*4, 10)
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_data):
    """Return logits while training and class probabilities in eval mode."""
    x = self.conv1(input_data)
    x = self.conv2(x)
    x = self.conv3(x)
    x = self.conv4(x)
    x = self.flatten(x)
    logits = self.linear(x)
    if self.training:
      # The loss function normalises the scores itself.
      return logits
    predictions = self.softmax(logits)
    return predictions


In [62]:
# Print a layer-by-layer summary for a single-channel input of size 64 x 44.
# NOTE(review): 64 matches n_mels; 44 is presumably the number of spectrogram
# frames for one clip -- confirm against usd[0][0].shape.
cnn = CNNNetwork()
summary(cnn.to(device), (1,64,44))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
              ReLU-2           [-1, 16, 66, 46]               0
         MaxPool2d-3           [-1, 16, 33, 23]               0
            Conv2d-4           [-1, 32, 35, 25]           4,640
              ReLU-5           [-1, 32, 35, 25]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14                   

# Train the model

In [63]:
import torch
from torch import nn 
from torch.utils.data import DataLoader

In [65]:
def create_dataloader(train_data, batch_size, shuffle=False):
  """Wrap ``train_data`` in a DataLoader.

  Args:
    train_data: Dataset (anything DataLoader accepts) to batch.
    batch_size: Number of items per batch.
    shuffle: Reshuffle the data every epoch when True. Defaults to False,
      which keeps the dataset order, so existing callers are unaffected.

  Returns:
    A DataLoader over ``train_data``.
  """
  return DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)


In [66]:
def train_single_epoch(model, data_loader, loss_fn, optimizer, device):
  """Run one optimisation pass over ``data_loader`` and print the last batch loss.

  Args:
    model: Module being trained; it is switched to training mode first.
    data_loader: Iterable of ``(inputs, targets)`` batches.
    loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar tensor.
    optimizer: Optimizer that owns the model parameters.
    device: Device the batches are moved to before the forward pass.
  """
  # Make sure training-only behaviour is active even if the model was
  # previously put in eval mode (e.g. after running inference).
  model.train()

  loss = None
  for inputs, targets in data_loader:
    inputs, targets = inputs.to(device), targets.to(device)

    # calculate loss
    prediction = model(inputs)
    loss = loss_fn(prediction, targets)

    # backpropagate error and update weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  if loss is None:
    # An empty loader would otherwise fail on the undefined ``loss``.
    print("loss: n/a (data loader yielded no batches)")
  else:
    print(f"loss: {loss.item()}")

In [67]:
def train(model, data_loader, loss_fn, optimizer, device, epochs):
  """Train ``model`` for ``epochs`` passes over ``data_loader``.

  Prints a header before each epoch, delegates the actual work to
  ``train_single_epoch``, prints a separator after it, and prints a final
  message once all epochs are done.
  """
  for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}/{epochs}")
    train_single_epoch(
        model,
        data_loader,
        loss_fn,
        optimizer,
        device,
    )
    print("--------------------------------")

  print("Finished training")

In [68]:
# Training hyper-parameters.
BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001

# Audio settings and the transform applied to every prepared clip.
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64
)

# Prefer the GPU when one is visible to PyTorch.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")

# Dataset locations, relative to the working directory.
ANNOTATIONS_FILE = "urbansound8k/UrbanSound8K.csv"
AUDIO_DIR = "urbansound8k"
usd = UrbanSoundDataset(
    annotations_file=ANNOTATIONS_FILE,
    audio_dir=AUDIO_DIR,
    transformation=mel_spectrogram,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device=device,
)
train_dataloader = create_dataloader(usd, BATCH_SIZE)

Using device cpu


In [71]:
# Number of batches per epoch (dataset size divided by BATCH_SIZE, rounded up).
len(train_dataloader)

69

In [72]:
# Fresh model on the selected device, plus the loss and optimizer for training.
cnn = CNNNetwork().to(device)
# Multi-class classification loss; targets are the labels yielded by the dataset.
loss_fn = nn.CrossEntropyLoss()
# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

In [75]:
# Change to the Kaggle folder on the mounted Drive and print the working
# directory, so the relative dataset and checkpoint paths resolve there.
%cd /content/gdrive/My Drive/Kaggle
!pwd

/content/gdrive/My Drive/Kaggle
/content/gdrive/My Drive/Kaggle


In [None]:
train(cnn, train_dataloader, loss_fn, optimizer, device, EPOCHS)
# The checkpoint is written relative to the current working directory, so
# report the resolved path rather than a hard-coded folder name that does not
# match where the file actually goes.
model_path = 'classifier.pth'
torch.save(cnn.state_dict(), model_path)
print(f"Trained Audio classifier and saved at {os.path.abspath(model_path)}")

# Predict

0 = air_conditioner
1 = car_horn
2 = children_playing
3 = dog_bark
4 = drilling
5 = engine_idling
6 = gun_shot
7 = jackhammer
8 = siren
9 = street_music

In [83]:
import torch 
# Class names indexed by integer label (position i is the name for label i).
class_mapping = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]

def predict(model, input, target, class_mapping):
  """Classify the first item of ``input`` and name the expected class.

  Puts ``model`` in eval mode, runs it without gradient tracking, and takes
  the arg-max over the scores of the first batch element.

  Returns:
    ``(predicted, expected)``: the entries of ``class_mapping`` at the
    predicted index and at ``target`` respectively.
  """
  model.eval()
  with torch.no_grad():
    scores = model(input)
    best_index = scores[0].argmax(0)
    predicted_name = class_mapping[best_index]
    expected_name = class_mapping[target]
  return predicted_name, expected_name

In [84]:
# load back the model
# Load back the model.
# map_location lets a checkpoint saved on a GPU runtime be restored on a
# CPU-only one; the model is then moved to the same device the dataset puts
# its tensors on, so inference does not hit a device mismatch.
cnn = CNNNetwork()
state_dict = torch.load('classifier.pth', map_location=device)
cnn.load_state_dict(state_dict)
cnn = cnn.to(device)

# load urban sound data
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate = SAMPLE_RATE, 
    n_fft = 1024,
    hop_length=512,
    n_mels=64
)

ANNOTATIONS_FILE = "urbansound8k/UrbanSound8K.csv"
AUDIO_DIR = "urbansound8k"
usd = UrbanSoundDataset(ANNOTATIONS_FILE, 
                        AUDIO_DIR, 
                        mel_spectrogram, 
                        SAMPLE_RATE,
                        NUM_SAMPLES,
                        device)


In [86]:
# Get a sample
# Get a sample. Index the dataset once: every usd[...] access reloads and
# re-processes the clip, so unpacking a single access avoids doing it twice.
input, target = usd[5]
# Add a leading batch dimension in place before passing the sample to the model.
input.unsqueeze_(0)
# inference
predicted, expected = predict(cnn, input, target, class_mapping)
print(f"Predicted: {predicted}, Expected: {expected}")

Predicted: children_playing, Expected: children_playing
