<a href="https://colab.research.google.com/github/shayanthrn/Acoustic_scene_classification/blob/main/Acoustic_scene_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Acoustic scene classification

Dataset: IEEE AASP CASA Challenge, available at: http://dcase.community/challenge2013/task-acoustic-scene-classification
<br/>The dataset is currently stored in my Google Drive.

In [51]:
!pip install torchaudio
!pip install torchsummary



## Import libraries

In [26]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchaudio
import os
from torchvision.transforms import ToTensor
import numpy as np
from torchsummary import summary

## Global variables and hyperparameters

In [50]:
# Scene names of the dataset; the integer label of a clip is the position of
# its scene name in this list.
class_map = [
    "bus",
    "busystreet",
    "office",
    "openairmarket",
    "park",
    "quietstreet",
    "restaurant",
    "supermarket",
    "tube",
    "tubestation",
]

# Training hyperparameters.
BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 1e-3

# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
  DEVICE = "cuda"
else:
  DEVICE = "cpu"

## Create a dataset class for the DCASE2013 dataset

In [28]:
class DCASE13(Dataset):
  """DCASE 2013 acoustic-scene clips served as (mel spectrogram, label) pairs.

  Every entry of the dataset directory is treated as an audio clip whose name
  is the scene name followed by a number, e.g. ``bus01.wav``. Each clip is
  resampled to 16 kHz, mixed down to mono, cut or zero-padded to 10 seconds
  and converted to a 64-band mel spectrogram.

  The returned spectrogram lives on ``self.device``.
  NOTE(review): because tensors are moved to the GPU inside the dataset, the
  DataLoader presumably has to keep ``num_workers=0`` when CUDA is used.
  """

  def __init__(self,path,class_map):
    """
    Args:
      path: directory that contains the audio files.
      class_map: list of scene names; the label of a clip is the index of its
        scene name in this list.
    """
    super().__init__()
    self.dataset_path = path
    self.class_map = class_map
    self.sample_rate = 16000
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.n_sample = 160000 # 10 sec of each audio
    self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=self.sample_rate,n_fft=1024,hop_length=512,n_mels=64).to(self.device)
    # List the directory once and sort it: os.listdir() returns entries in
    # arbitrary order, so re-listing per item is slow and could map the same
    # index to different files.
    self.file_names = sorted(os.listdir(self.dataset_path))
    # One resampler per source sample rate, built lazily and reused.
    self._resamplers = {}

  def __len__(self):
    """Number of files found in the dataset directory."""
    return len(self.file_names)

  def _label_from_file_name(self,file_name):
    """Map e.g. ``"bus01.wav"`` to the index of ``"bus"`` in ``self.class_map``.

    Raises:
      ValueError: if the scene name is not in ``self.class_map``.
    """
    stem = os.path.splitext(file_name)[0]
    class_name = stem.rstrip("0123456789") # drop the trailing clip number
    return self.class_map.index(class_name)

  def _get_resampler(self,orig_freq):
    """Return a cached resampler from ``orig_freq`` to ``self.sample_rate``."""
    if orig_freq not in self._resamplers:
      resampler = torchaudio.transforms.Resample(orig_freq,self.sample_rate)
      # Keep the transform on the same device as the signal it is applied to.
      self._resamplers[orig_freq] = resampler.to(self.device)
    return self._resamplers[orig_freq]

  def _fit_length(self,signal):
    """Cut or right-pad ``signal`` with zeros to ``self.n_sample`` samples on dim 1."""
    n_missing = self.n_sample - signal.shape[1]
    if n_missing < 0:
      return signal[:,:self.n_sample]
    if n_missing > 0:
      return nn.functional.pad(signal,(0,n_missing)) # right pad at last dim
    return signal

  def __getitem__(self,index):
    """Load clip ``index`` and return ``(mel_spectrogram, label)``.

    With the settings chosen in ``__init__`` the spectrogram has shape
    (1, 64, 313): one channel, 64 mel bands, 1 + 160000 // 512 frames.
    """
    file_name = self.file_names[index]
    label = self._label_from_file_name(file_name)
    file_path = os.path.join(self.dataset_path,file_name)
    # The path is passed positionally so the call does not depend on the name
    # of torchaudio.load's first parameter.
    signal, sample_rate = torchaudio.load(file_path)
    signal = signal.to(self.device)
    # resample if necessary
    if sample_rate != self.sample_rate:
      signal = self._get_resampler(sample_rate)(signal)
    # stereo to mono convert
    if signal.shape[0] > 1:
      signal = torch.mean(signal, dim=0, keepdim=True)
    # adjust length: cut or pad to exactly n_sample samples
    signal = self._fit_length(signal)

    signal = self.mel_spectrogram(signal)
    return signal,label




## Create our model, which is a CNN
<br />
Architecture --> 4 convolutional blocks (convolution + ReLU activation + max pooling), flatten, linear, softmax
<br /> Kernel for convolution: 3x3, kernel for max pooling: 2x2

In [45]:
class CNN(nn.Module):
  """Four conv blocks (3x3 conv -> ReLU -> 2x2 max pool), flatten, linear classifier.

  ``forward`` returns raw class scores (logits), which is what
  ``nn.CrossEntropyLoss`` expects; that loss applies log-softmax itself, so
  feeding it softmax outputs would normalise twice and flatten the gradients.
  For class probabilities call ``model.softmax(model(x))``.
  """

  def __init__(self, n_classes=10, input_shape=(64, 313)):
    """
    Args:
      n_classes: number of output classes.
      input_shape: (height, width) of the single-channel input, i.e.
        (mel bands, frames). The default matches ``summary(cnn, (1, 64, 313))``.
    """
    super().__init__()

    self.conv1 = self._conv_block(1, 16)
    self.conv2 = self._conv_block(16, 32)
    self.conv3 = self._conv_block(32, 64)
    self.conv4 = self._conv_block(64, 128)
    self.flat = nn.Flatten()
    self.linear = nn.Linear(
        # Derived from the input shape instead of hard-coding 128*5*21.
        in_features = self._count_flat_features(input_shape),
        out_features = n_classes
    )
    # Not applied in forward(); kept so callers can turn logits into probabilities.
    self.softmax = nn.Softmax(dim=1)

  @staticmethod
  def _conv_block(in_channels, out_channels):
    """Build one 3x3 conv (stride 1, padding 2) -> ReLU -> 2x2 max-pool block."""
    return nn.Sequential(
        nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=1,
            padding=2
        ),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )

  def _features(self, x):
    """Run the four conv blocks and flatten everything but the batch dimension."""
    x = self.conv1(x)
    x = self.conv2(x)
    x = self.conv3(x)
    x = self.conv4(x)
    return self.flat(x)

  def _count_flat_features(self, input_shape):
    """Size of the flattened conv output for one input of ``input_shape``."""
    with torch.no_grad():
      dummy = torch.zeros(1, 1, *input_shape)
      return self._features(dummy).shape[1]

  def forward(self,input):
    """Return logits of shape (batch, n_classes) for ``input`` of shape (batch, 1, H, W)."""
    x = self._features(input)
    return self.linear(x)

In [46]:
# Build the network and move its parameters to the selected device.
cnn = CNN().to(DEVICE)

In [47]:
# Print layer-by-layer output shapes for one single-channel 64 x 313 input.
summary(cnn, (1, 64, 313))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 16, 66, 315]             160
              ReLU-2          [-1, 16, 66, 315]               0
         MaxPool2d-3          [-1, 16, 33, 157]               0
            Conv2d-4          [-1, 32, 35, 159]           4,640
              ReLU-5          [-1, 32, 35, 159]               0
         MaxPool2d-6           [-1, 32, 17, 79]               0
            Conv2d-7           [-1, 64, 19, 81]          18,496
              ReLU-8           [-1, 64, 19, 81]               0
         MaxPool2d-9            [-1, 64, 9, 40]               0
           Conv2d-10          [-1, 128, 11, 42]          73,856
             ReLU-11          [-1, 128, 11, 42]               0
        MaxPool2d-12           [-1, 128, 5, 21]               0
          Flatten-13                [-1, 13440]               0
           Linear-14                   

## Train function
<br /> Optimizer: Adam, loss function: cross-entropy, batch size: 32, epochs: 20, learning rate: 0.001

In [None]:
def train_one_epoch(model,dataloader,optimizer,loss_f,device):
  """Run one optimisation pass over ``dataloader`` and report the mean loss.

  Args:
    model: network to optimise; called as ``model(signal)``.
    dataloader: iterable yielding ``(signal, label)`` tensor batches.
    optimizer: optimizer that updates the parameters of ``model``.
    loss_f: callable ``loss_f(output, label)`` returning a scalar tensor.
    device: device the batches are moved to before the forward pass.

  Returns:
    The loss averaged over all batches of the epoch as a float, or ``None``
    when the dataloader yielded no batches.
  """
  loss_sum = 0.0
  n_batches = 0

  for signal, label in dataloader:

    signal, label = signal.to(device), label.to(device)
    output = model(signal)
    loss = loss_f(output,label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_sum += loss.item()
    n_batches += 1

  # Without this guard an empty dataloader would fail on the undefined `loss`.
  if n_batches == 0:
    print("loss after this epoch: n/a (dataloader yielded no batches)")
    return None

  # Report the epoch average; the last batch alone is a noisy estimate.
  epoch_loss = loss_sum / n_batches
  print(f"loss after this epoch: {epoch_loss}")
  return epoch_loss

def train(model,dataloader,optimizer,loss_f,device,epochs):
  """Train ``model`` for ``epochs`` full passes over ``dataloader``.

  Switches the model to training mode once, then runs ``train_one_epoch``
  for every epoch, printing a 1-based epoch header before it and a separator
  line after it.
  """
  model.train()
  for epoch_number in range(1, epochs + 1):
    print(f" Epoch: {epoch_number}")
    train_one_epoch(model,dataloader,optimizer,loss_f,device)
    print("----------------------")

In [48]:
# Training data read from Google Drive.
# NOTE(review): no cell in this notebook mounts Drive at /content/drive — confirm it is mounted before running.
dcase13 = DCASE13("/content/drive/MyDrive/IEEE_AASP_CASA_Challenge/DCASE13_train",class_map)

In [49]:
# Serve the dataset in shuffled mini-batches of BATCH_SIZE clips.
dataloader = DataLoader(dcase13 , batch_size=BATCH_SIZE , shuffle=True)

In [53]:
# Adam over all parameters of the CNN, using the learning rate from the config cell.
optimizer = torch.optim.Adam(cnn.parameters(),lr=LEARNING_RATE)

In [54]:
# Multi-class classification loss; labels are integer class indices from class_map.
loss_f = nn.CrossEntropyLoss()

In [None]:
# Run the full training loop for EPOCHS passes over the training data.
train(cnn,dataloader,optimizer,loss_f,DEVICE,EPOCHS)

In [None]:
# Persist only the learned weights (state_dict), not the whole module object.
# NOTE(review): the file name says "feedforwardnet" although the saved model is the CNN — consider renaming.
torch.save(cnn.state_dict(), "feedforwardnet.pth")