<a href="https://colab.research.google.com/github/shayanthrn/Acoustic_scene_classification/blob/main/Acoustic_scene_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Acoustic scene classification

Dataset: IEEE AASP CASA Challenge, Available on: http://dcase.community/challenge2013/task-acoustic-scene-classification
<br/>Currently dataset is in my googe drive

In [None]:
!pip install torchaudio
!pip install torchsummary

##import libraries

In [111]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchaudio
import os
from torchvision.transforms import ToTensor
import numpy as np
from torchsummary import summary

##global variables and hyperparameters

In [116]:
class_map = ["bus","busystreet","office","openairmarket","park","quietstreet","restaurant","supermarket","tube","tubestation"]
BATCH_SIZE = 32
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 50
LEARNING_RATE = 0.0002

##create dataset class for DCASE2013 dataset

In [117]:
class DCASE13(Dataset):

  def __init__(self,path,class_map):
    super().__init__()
    self.dataset_path = path
    self.class_map = class_map
    self.sample_rate = 22050
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.n_sample = 661500 # 30 sec of each audio
    self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=self.sample_rate,n_fft=1024,hop_length=512,n_mels=96).to(self.device)
  
  def __len__(self):
    return len(os.listdir(self.dataset_path))
  
  def __getitem__(self,index):
    file_name = os.listdir(self.dataset_path)[index]
    class_name = file_name.split('0')[0].split('1')[0]
    label = class_map.index(class_name)
    file_path = os.path.join(self.dataset_path,file_name)
    signal, sample_rate = torchaudio.load(filepath=file_path)
    signal = signal.to(self.device)
    #resample if necessary
    if(sample_rate != self.sample_rate):
      resampler = torchaudio.transforms.Resample(sample_rate,self.sample_rate).to(self.device)
      signal = resampler(signal)
    # stereo to mono convert
    if(signal.shape[0]>1):
      signal = torch.mean(signal, dim=0, keepdim=True)
    #adjust lenght

    #cut if necessary
    if(signal.shape[1]>self.n_sample):
      signal = signal [:,:self.n_sample]
    #pad if necessary
    elif(signal.shape[1]<self.n_sample):
      signal = nn.functional.pad(signal,(0,self.n_sample-signal.shape[1])) #right pad at last dim
    else:
      pass
    
    signal = self.mel_spectrogram(signal)
    return signal,label




## create our model which is a CNN
<Br />
architecture --> 4 Convolutional layer + relu activation + max pooling, flatten,linear, soft max
<br /> kernel for Convolutional : 3x3,  kernel for max pooling: 2x2

In [120]:
class CNN(nn.Module):

  def __init__(self):
    super().__init__()

    self.conv1 = nn.Sequential(
        nn.Conv2d(
            in_channels=1,
            out_channels=16,
            kernel_size=3,
            stride=1,
            padding=2
        ),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )
    self.conv2 = nn.Sequential(
        nn.Conv2d(
            in_channels=16,
            out_channels=32,
            kernel_size=3,
            stride=1,
            padding=2
        ),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )
    self.conv3 = nn.Sequential(
        nn.Conv2d(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            padding=2
        ),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )
    self.conv4 = nn.Sequential(
        nn.Conv2d(
            in_channels=64,
            out_channels=128,
            kernel_size=3,
            stride=1,
            padding=2
        ),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )
    self.flat = nn.Flatten()
    self.linear = nn.Linear(
        in_features = 128*7*82,
        out_features = 10
    )
    self.softmax = nn.Softmax(dim=1)
  
  def forward(self,input):
    x = self.conv1(input)
    x = self.conv2(x)
    x = self.conv3(x)
    x = self.conv4(x)
    x = self.flat(x)
    x = self.linear(x)
    predict = self.softmax(x)
    return predict

In [121]:
cnn = CNN().to(DEVICE)
summary(cnn, (1, 96, 1292))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 98, 1294]             160
              ReLU-2         [-1, 16, 98, 1294]               0
         MaxPool2d-3          [-1, 16, 49, 647]               0
            Conv2d-4          [-1, 32, 51, 649]           4,640
              ReLU-5          [-1, 32, 51, 649]               0
         MaxPool2d-6          [-1, 32, 25, 324]               0
            Conv2d-7          [-1, 64, 27, 326]          18,496
              ReLU-8          [-1, 64, 27, 326]               0
         MaxPool2d-9          [-1, 64, 13, 163]               0
           Conv2d-10         [-1, 128, 15, 165]          73,856
             ReLU-11         [-1, 128, 15, 165]               0
        MaxPool2d-12           [-1, 128, 7, 82]               0
          Flatten-13                [-1, 73472]               0
           Linear-14                   

## Train Function
<br /> optimizer:adam, loss function: crossentropy, batch size: 32, epochs: 20, learning rate: 0.001

In [122]:
def train_one_epoch(model,dataloader,optimizer,loss_f,device):

  for signal, label in dataloader:

    signal, label = signal.to(device), label.to(device)
    output = model(signal)
    loss = loss_f(output,label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  print(f"loss after this epoch: {loss.item()}")

def train(model,dataloader,optimizer,loss_f,device,epochs):
  model.train()
  for i in range(epochs):
    print(f" Epoch: {i+1}")
    train_one_epoch(model,dataloader,optimizer,loss_f,device)
    print("----------------------")

##Final Result:
<br/>
model accuracy:<br/>
batch size:50,Lr:0.0001,epochs:20, n_mels:64, duration: 10sec --> 68%<br/> 
batch size:50,Lr:0.0001,epochs:100, n_mels:64, duration: 10sec --> 67%<br/> 
batch size:50,Lr:0.0001,epochs:130, n_mels:64, duration: 10sec --> 74%<br/> 
batch size:50,Lr:0.0002,epochs:50, n_mels:64, duration: 10sec --> 77%<br/> 
batch size:32,Lr:0.0002,epochs:50, n_mels:96, duration: 15sec --> 86%<br/> batch size:32,Lr:0.0002,epochs:50, n_mels:96, duration: 30sec --> 70%

##personal tests and getting result

In [123]:
dcase13 = DCASE13("/content/drive/MyDrive/IEEE_AASP_CASA_Challenge/DCASE13_train",class_map)

In [124]:
dataloader = DataLoader(dcase13 , batch_size=BATCH_SIZE , shuffle=True)

In [125]:
optimizer = torch.optim.Adam(cnn.parameters(),lr=LEARNING_RATE)

In [126]:
loss_f = nn.CrossEntropyLoss()

In [127]:
print(f"we are using {DEVICE}")
train(cnn,dataloader,optimizer,loss_f,DEVICE,EPOCHS)

we are using cuda
 Epoch: 1
loss after this epoch: 2.328824996948242
----------------------
 Epoch: 2
loss after this epoch: 2.3045663833618164
----------------------
 Epoch: 3
loss after this epoch: 2.010098934173584
----------------------
 Epoch: 4
loss after this epoch: 2.020603895187378
----------------------
 Epoch: 5
loss after this epoch: 1.8490681648254395
----------------------
 Epoch: 6
loss after this epoch: 2.0560996532440186
----------------------
 Epoch: 7
loss after this epoch: 2.091836452484131
----------------------
 Epoch: 8
loss after this epoch: 1.9683080911636353
----------------------
 Epoch: 9
loss after this epoch: 1.7442511320114136
----------------------
 Epoch: 10
loss after this epoch: 1.6076536178588867
----------------------
 Epoch: 11
loss after this epoch: 1.465901494026184
----------------------
 Epoch: 12
loss after this epoch: 1.4629019498825073
----------------------
 Epoch: 13
loss after this epoch: 1.8425918817520142
----------------------
 Epoch: 

In [105]:
torch.save(cnn.state_dict(), "model.pth")

In [128]:
dcase13_test = DCASE13("/content/drive/MyDrive/IEEE_AASP_CASA_Challenge/DCASE13_eval",class_map)
dataloader_test = DataLoader(dcase13 , batch_size=BATCH_SIZE , shuffle=True)

In [129]:
correct = 0
total = 100
for input, label in dataloader_test:
  batchlabel = label
  batchpredict = cnn(input).argmax(dim=1)
  for i in range(len(batchlabel)):
    if(batchpredict[i].item()==batchlabel[i].item()):
      correct+=1

print(f" Acurracy after {EPOCHS} epoch : {correct/total}")

 Acurracy after 50 epoch : 0.7
