In [1]:
import os
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from torch.utils.data import Dataset
import pandas as pd
from torch import nn
from torchsummary import summary
from torch.utils.data import DataLoader

In [2]:
annotation_dir = '/home/chahak/Desktop/audio_based_gan/UrbanSound8K/metadata/UrbanSound8K.csv'
df = pd.read_csv(annotation_dir)
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [12]:
class UrbanSoundDataset(Dataset):
    
    def __init__(self, annotations_file, audio_dir, transformations, sample_rate, num_samples, device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformations = transformations.to(self.device)
        self.sample_rate = sample_rate
        self.num_samples = num_samples
        
        
    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self,index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self.resample(signal, sr)
        signal = self.mix_down(signal)
        signal = self.cut(signal)
        signal = self.right_pad(signal)
        signal = self.transformations(signal)
        return signal, label
    
    def cut(self, signal):
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal
    
    def right_pad(self, signal):
        length = signal.shape[1]
        if length<self.num_samples:
            num_missing = -length+self.num_samples
            signal = torch.nn.functional.pad(signal, (0,num_missing))
        return signal
    
    def resample(self,signal, sr):
        if sr != self.sample_rate:
            signal = F.resample(signal, sr, self.sample_rate)
        return signal
    
    def mix_down(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal


    
    def _get_audio_sample_path(self, index):
        fold = f"fold{self.annotations.iloc[index, 5]}"
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index, 0])
        return path
    
    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 6]

In [13]:
if torch.cuda.is_available():
    device = "cuda"
    print("Running on GPU")
else:
    device = "cpu"
    print("Running on cpu")

Running on GPU


In [14]:
class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1,out_channels=16,kernel_size=3,stride=1,padding=2,bias=False),
        nn.BatchNorm2d(16),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2))

        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16,out_channels=2*16,kernel_size=3,stride=1,padding=2,bias=False),
        nn.BatchNorm2d(32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2))

        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=32,out_channels=4*16,kernel_size=3,stride=1,padding=2,bias=False),
        nn.BatchNorm2d(2*32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2))

        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=2*32,out_channels=8*16,kernel_size=3,stride=1,padding=2,bias=False),
        nn.BatchNorm2d(4*32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2))
        
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128*5*4,10) # 10 classes
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        preds = self.softmax(logits)

        return preds
        

In [15]:

BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001

In [16]:
sample_rate = 22050 #22.5 KHz
num_samples = 22050
dataset_path = "/home/chahak/Desktop/audio_based_gan/UrbanSound8K/audio"
mel_spectogram = T.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=1024, hop_length=512, n_mels=64
)

data = UrbanSoundDataset(annotation_dir, dataset_path, mel_spectogram,sample_rate, num_samples, device)

In [17]:
signal,label = data.__getitem__(54)
print(signal.shape,label)

torch.Size([1, 64, 44]) 3


In [18]:
model = CNNNetwork()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("using ", device)
model = model.to(device)
print(summary(model,(1,64,44)))

using  cuda
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             144
       BatchNorm2d-2           [-1, 16, 66, 46]              32
              ReLU-3           [-1, 16, 66, 46]               0
         MaxPool2d-4           [-1, 16, 33, 23]               0
            Conv2d-5           [-1, 32, 35, 25]           4,608
       BatchNorm2d-6           [-1, 32, 35, 25]              64
              ReLU-7           [-1, 32, 35, 25]               0
         MaxPool2d-8           [-1, 32, 17, 12]               0
            Conv2d-9           [-1, 64, 19, 14]          18,432
      BatchNorm2d-10           [-1, 64, 19, 14]             128
             ReLU-11           [-1, 64, 19, 14]               0
        MaxPool2d-12             [-1, 64, 9, 7]               0
           Conv2d-13           [-1, 128, 11, 9]          73,728
      BatchNorm2d-14       

In [19]:
batch_size=1
num_epochs=1
learning_rate = 1e-4

train_loader = DataLoader(data,batch_size=batch_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [20]:
for i in range(num_epochs):
    for step,data in enumerate(train_loader):
        sample,label = data
        sample = sample.to(device)
        label =  label.to(device)

        pred = model(sample)
        #calculate loss
        loss = criterion(pred,label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print("Epoch-{}/{},Step-{}/{}, Loss- {}".format(i,num_epochs,step,len(train_loader), loss.item()))

Epoch-0/1,Step-0/8732, Loss- 2.3615164756774902
Epoch-0/1,Step-1/8732, Loss- 2.3290860652923584
Epoch-0/1,Step-2/8732, Loss- 2.277190923690796
Epoch-0/1,Step-3/8732, Loss- 2.2317020893096924
Epoch-0/1,Step-4/8732, Loss- 2.1866326332092285
Epoch-0/1,Step-5/8732, Loss- 1.9399100542068481
Epoch-0/1,Step-6/8732, Loss- 1.744401454925537
Epoch-0/1,Step-7/8732, Loss- 1.7613694667816162
Epoch-0/1,Step-8/8732, Loss- 1.6018935441970825
Epoch-0/1,Step-9/8732, Loss- 2.31868839263916
Epoch-0/1,Step-10/8732, Loss- 2.3452892303466797
Epoch-0/1,Step-11/8732, Loss- 2.3653554916381836
Epoch-0/1,Step-12/8732, Loss- 2.3181896209716797
Epoch-0/1,Step-13/8732, Loss- 2.2487151622772217
Epoch-0/1,Step-14/8732, Loss- 2.328883171081543
Epoch-0/1,Step-15/8732, Loss- 2.247760772705078
Epoch-0/1,Step-16/8732, Loss- 2.2876460552215576
Epoch-0/1,Step-17/8732, Loss- 2.251765727996826
Epoch-0/1,Step-18/8732, Loss- 2.283036470413208
Epoch-0/1,Step-19/8732, Loss- 2.159482955932617
Epoch-0/1,Step-20/8732, Loss- 2.0777060

OutOfMemoryError: CUDA out of memory. Tried to allocate 234.00 MiB (GPU 0; 1.83 GiB total capacity; 251.05 MiB already allocated; 167.31 MiB free; 258.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [79]:


def create_data_loader(train_data, batch_size):
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader


def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        # calculate loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")

In [80]:
train_dataloader = create_data_loader(usd, BATCH_SIZE)

cnn = CNNNetwork().to(device)
print(cnn)

# initialise loss funtion + optimiser
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(cnn.parameters(),
                                lr=LEARNING_RATE)

# train model
train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS)

# save model
torch.save(cnn.state_dict(), "feedforwardnet.pth")
print("Trained feed forward net saved at feedforwardnet.pth")

CNNNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)
Epoch 1


RuntimeError: stack expects each tensor to be equal size, but got [1, 64, 14] at entry 0 and [1, 64, 173] at entry 1

In [1]:
class_mapping = ["air_conditioner",
                 "car_horn",
                 "children_playing",
                 "dog_bark",
                 "drilling",
                 "engine_idling",
                 "gun_shot",
                 "jackhammer",
                 "siren",
                 "street_music"]