In [1]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('fivethirtyeight')
%matplotlib inline
import seaborn as sns
import librosa

import os
import random
import sklearn

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary

from pathlib import Path
import torchaudio
print("Libraries imported - ready to use PyTorch", torch.__version__)

from tqdm.notebook import trange,tqdm

import warnings
warnings.filterwarnings('ignore')

Libraries imported - ready to use PyTorch 2.1.0+cu118


In [2]:
import random
SEED = 42
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(SEED)

In [4]:
!git clone https://github.com/karolpiczak/ESC-50.git

Cloning into 'ESC-50'...
remote: Enumerating objects: 4199, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 4199 (delta 40), reused 38 (delta 18), pack-reused 4136[K
Receiving objects: 100% (4199/4199), 878.79 MiB | 15.34 MiB/s, done.
Resolving deltas: 100% (287/287), done.
Updating files: 100% (2011/2011), done.


In [5]:
dataset = pd.read_csv('ESC-50/meta/esc50.csv')
dataset.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [10]:
audio_path = 'ESC-50/audio/'

In [9]:
def compute_log_mel_spect(audio,sample_rate):
    n_fft=1024
    hop_length=512
    window_type ='hann'
    mel_bins = 60
    normalized_y = librosa.util.normalize(audio)
    Mel_spectrogram = librosa.feature.melspectrogram(y=normalized_y, sr=sample_rate,hop_length=hop_length, win_length=n_fft, n_mels = mel_bins)
    mel_spectrogram_db = librosa.power_to_db(Mel_spectrogram)
    return  mel_spectrogram_db

In [11]:
data=[]

for i in tqdm(range(len(dataset))):
    wave_file = audio_path+dataset.iat[i,0]
    audio, sample_rate = librosa.load(wave_file)
    mel_spectrogram_db=(compute_log_mel_spect(audio,sample_rate))
    delta=librosa.feature.delta(mel_spectrogram_db)
    data.append( np.dstack( (mel_spectrogram_db,delta) ))

dataset['data']=data

  0%|          | 0/2000 [00:00<?, ?it/s]

In [12]:
from torchvision.transforms import Resize

import torchaudio.transforms as T

train_df = dataset[dataset['fold'] < 5]
test_df = dataset[dataset['fold'] == 5]

train_df = train_df[['data','target']]
test_df = test_df[['data','target']]

In [13]:
class escDataset(Dataset):
    def __init__(self,dataset, transformation, device):
        self.dataset = dataset
        self.device = device
        self.transformation = transformation
        self.length=len(self.dataset)

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        spectogram = torch.tensor(self._get_audio_spectogram(index))
        label = self._get_audio_sample_label(index)
        spectogram=spectogram.permute(2, 0, 1)
        resize_transform = Resize((60, 216))
        spectogram = resize_transform(spectogram)
        #spectogram  = spectogram.to(self.device)
        #spectogram  = self.transformation(spectogram)
        return spectogram, label
    def _get_audio_spectogram(self, index):
        y = self.dataset.iloc[index, 0].astype(np.float32)
        return y

    def _get_audio_sample_label(self, index):
        return self.dataset.iloc[index, 1]

In [14]:
class MyReshape(object):
    """Reshape the image array."""

    def __init__(self, output_size):
        assert isinstance(output_size, (tuple))

        self.output_size = output_size
    def __call__(self, image):
        return image.reshape(self.output_size)

In [15]:
# build transformation pipelines for data augmentation

train_transforms = transforms.Compose([ MyReshape(output_size=(2,60, 216) ) ])
test_transforms = transforms.Compose([MyReshape(output_size=(2,60, 216))])

In [16]:
batch_size = 64
device='cuda'

train_data = escDataset(train_df,train_transforms,device)
test_data = escDataset(test_df,train_transforms,device)

train_loader = DataLoader(train_data, batch_size=batch_size,shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [17]:
class CNN(nn.Module):

    def __init__(self):
        super().__init__()
        # 5 conv blocks / flatten / linear / softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=2, out_channels=24,kernel_size=(6,6),stride=1),
            nn.ReLU(),
            nn.BatchNorm2d(24)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=24,out_channels=24,kernel_size=(6,6),stride=1),
            nn.LeakyReLU(0.2),
            nn.BatchNorm2d(24)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=24,out_channels=48,kernel_size=(5,5),stride=(2,2)),
            nn.LeakyReLU(0.2),
            nn.BatchNorm2d(48)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=48,out_channels=48,kernel_size=(5,5),stride=(2,2)),
            nn.LeakyReLU(0.2),
            nn.BatchNorm2d(48)
        )
        self.conv5 = nn.Sequential(
            nn.Conv2d(in_channels=48,out_channels=64,kernel_size=(4,4),stride=1),
            nn.LeakyReLU(0.2),
            nn.BatchNorm2d(64)
        )
        self.conv6 = nn.Sequential(
            nn.Conv2d(in_channels=64,out_channels=64,kernel_size=(4,4),stride=1),
            nn.LeakyReLU(0.2),
            nn.BatchNorm2d(64)
        )
        self.connected_layer=nn.Sequential(
          nn.Flatten(),
          nn.Linear(64*4*43, 200),
          nn.Dropout(0.25),
          nn.Linear(200, 50), #200 unit
          nn.Softmax()
        )

    def forward(self, input_data):
        input_data = input_data.to('cuda')
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x= self.connected_layer(x)
        return x

In [18]:
cnn = CNN().cuda()
input_data = torch.randn(64, 2,60,216)

In [None]:
summary(cnn,(2,60,216))

In [21]:
loss_criteria = nn.CrossEntropyLoss()
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    train_loss = 0
    print("------------------------------- Epoch:", epoch,"-------------------------------")
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data.to(device))
        loss = loss_criteria(output, target)
        train_loss += loss.item()

        loss.backward(retain_graph=True)
        optimizer.step()
    avg_loss = train_loss / (batch_idx+1)
    print('Training set: Average loss: {:.6f}'.format(avg_loss))
    return avg_loss

In [22]:
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        batch_count = 0
        for data, target in test_loader:
            batch_count += 1
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += loss_criteria(output, target).item()
            _, predicted = torch.max(output.data, 1)
            correct += torch.sum(target==predicted).item()
    avg_loss = test_loss / batch_count
    print('Validation set: Average loss: {:.6f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return avg_loss

In [23]:
def training(model):
    # Use an "Adam" optimizer to adjust weights
    optimizer = optim.Adam(model.parameters(), lr=3e-4)

    # Specify the loss criteria
    loss_criteria = nn.CrossEntropyLoss()

    # Track metrics in these arrays
    epoch_nums = []
    training_loss = []
    validation_loss = []

    # Train over 10 epochs (We restrict to 10 for time issues)
    epochs = 50
    print('Training on', device)
    for epoch in tqdm(range(1, epochs + 1)):
        train_loss = train(model, device, train_loader, optimizer, epoch)
        test_loss = test(model, device, test_loader)
        epoch_nums.append(epoch)
        training_loss.append(train_loss)
        validation_loss.append(test_loss)


In [24]:
model = cnn
optimizer = optim.Adam(model.parameters(), lr=3e-4)

loss_criteria = nn.CrossEntropyLoss()

epoch_nums = []
training_loss = []
validation_loss = []

epochs = 50
print('Training on', device)
for epoch in tqdm(range(1, epochs + 1)):
    train_loss = train(model, device, train_loader, optimizer, epoch)
    test_loss = test(model, device, test_loader)
    epoch_nums.append(epoch)
    training_loss.append(train_loss)
    validation_loss.append(test_loss)

Training on cuda


  0%|          | 0/50 [00:00<?, ?it/s]

------------------------------- Epoch: 1 -------------------------------
Training set: Average loss: 3.868380
Validation set: Average loss: 3.838712, Accuracy: 35/400 (9%)

------------------------------- Epoch: 2 -------------------------------
Training set: Average loss: 3.796949
Validation set: Average loss: 3.809923, Accuracy: 51/400 (13%)

------------------------------- Epoch: 3 -------------------------------
Training set: Average loss: 3.748162
Validation set: Average loss: 3.761649, Accuracy: 60/400 (15%)

------------------------------- Epoch: 4 -------------------------------
Training set: Average loss: 3.726835
Validation set: Average loss: 3.744568, Accuracy: 84/400 (21%)

------------------------------- Epoch: 5 -------------------------------
Training set: Average loss: 3.695999
Validation set: Average loss: 3.725512, Accuracy: 91/400 (23%)

------------------------------- Epoch: 6 -------------------------------
Training set: Average loss: 3.674114
Validation set: Avera

In [25]:
from sklearn.metrics import precision_score, recall_score, f1_score
import torch
import torch.nn as nn

loss_criteria = nn.CrossEntropyLoss()

def test_upd(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    all_targets = []
    all_predictions = []

    with torch.no_grad():
        batch_count = 0
        for data, target in test_loader:
            batch_count += 1
            data, target = data.to(device), target.to(device)

            output = model(data)
            test_loss += loss_criteria(output, target).item()

            _, predicted = torch.max(output.data, 1)
            correct += torch.sum(target == predicted).item()

            all_targets.extend(target.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    avg_loss = test_loss / batch_count
    accuracy = 100. * correct / len(test_loader.dataset)

    # Calculate precision, recall and F1-score
    precision = precision_score(all_targets, all_predictions, average='weighted')
    recall = recall_score(all_targets, all_predictions, average='weighted')
    f1 = f1_score(all_targets, all_predictions, average='weighted')

    print('Validation set: Average loss: {:.6f}, Accuracy: {}/{} ({:.0f}%), Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}\n'.format(
        avg_loss, correct, len(test_loader.dataset), accuracy, precision, recall, f1))

    return avg_loss, accuracy, precision, recall, f1

In [26]:
t = test_upd(model, device, test_loader)

Validation set: Average loss: 3.631904, Accuracy: 136/400 (34%), Precision: 0.2522, Recall: 0.3400, F1 Score: 0.2801

