In [1]:
import random
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from tqdm import tqdm

from pathlib import Path
import torchaudio

In [2]:
!git clone https://github.com/karolpiczak/ESC-50.git

Cloning into 'ESC-50'...
remote: Enumerating objects: 4199, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 4199 (delta 40), reused 38 (delta 18), pack-reused 4136[K
Receiving objects: 100% (4199/4199), 878.79 MiB | 32.12 MiB/s, done.
Resolving deltas: 100% (287/287), done.
Updating files: 100% (2011/2011), done.


In [5]:
path_audio = 'ESC-50/audio'

In [16]:
from torchaudio.transforms import Resample
class DataGenerator(Dataset):
    def __init__(self, path, transform = None, kind='train'):

        if kind=='train':
            files = Path(path).glob('[1-4]-*')
            self.items = [(str(file), file.name.split('-')[-1].replace('.wav', '')) for file in files]
        if kind=='test':
            files = Path(path).glob('5-*')
            self.items = [(str(file), file.name.split('-')[-1].replace('.wav', '')) for file in files]

        self.length = len(self.items)
        # print(self.length)

    def __getitem__(self, index):
        filename, label = self.items[index]
        data_tensor, orig_rate = torchaudio.load(filename)
        if orig_rate != 16000:
          resampler = Resample(orig_freq=orig_rate, new_freq=16000)
          data_tensor = resampler(data_tensor)
        tmp = data_tensor[0,0:80000]
        return (tmp, int(label))

    def __len__(self):
        return self.length

In [17]:
batch_size = 64

train_data = DataGenerator(path_audio, kind='train')
test_data = DataGenerator(path_audio, kind='test')

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [18]:
class Net(nn.Module):

    def __init__(self, num_classes=50):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(1,8), stride=(1,1), padding="same")
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(1,8), stride=(1,1), padding="same")
        self.bn2 = nn.BatchNorm2d(16)

        self.pool_1 = nn.MaxPool2d(kernel_size=(1,128), stride = (1,128), padding=0)

        self.conv3 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3,3), stride=(1,1), padding=1)
        self.bn3 = nn.BatchNorm2d(32)
        self.conv4 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=(1,1), padding=1)
        self.bn4 = nn.BatchNorm2d(32)

        self.pool_2 = nn.MaxPool2d(kernel_size=4, padding=0)

        self.conv5 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3,3), stride=(2,2), padding=2)
        self.bn5 = nn.BatchNorm2d(64)
        self.conv6 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,3), stride=(2,2), padding=1)
        self.bn6 = nn.BatchNorm2d(64)

        self.pool_3 = nn.MaxPool2d(kernel_size=2, padding=0)

        self.conv7 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,3), stride=(2,2), padding=1)
        self.bn7 = nn.BatchNorm2d(128)
        self.conv8 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3,3), stride=(2,2), padding=1)
        self.bn8 = nn.BatchNorm2d(128)

        self.pool_4 = nn.MaxPool2d(kernel_size=(1,2), padding=0)
        self.dense = nn.Linear(in_features= 256, out_features=num_classes)
        self.dropout = nn.Dropout(0.2)


    def forward(self, x):
        x = x.unsqueeze(1).view(-1, 1, 1, 80000)
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool_1(x)
        x = x.view((-1,1,16, 625))
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = F.relu(self.bn4(self.conv4(x)))
        x = self.pool_2(x)
        x = F.relu(self.bn5(self.conv5(x)))
        x = self.dropout(x)
        x = F.relu(self.bn6(self.conv6(x)))
        x = self.pool_3(x)
        x = F.relu(self.bn7(self.conv7(x)))
        x = self.dropout(x)
        x = F.relu(self.bn8(self.conv8(x)))
        x = self.pool_4(x)
        x = x.view(x.size(0),-1)
        x = self.dense(x)
        x = self.dropout(x)
        return x

device = "cpu"
if (torch.cuda.is_available()):
    device = "cuda"

In [9]:
# summary(model,(1,80000))

  return F.conv2d(input, weight, bias, self.stride,


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 1, 80000]             144
       BatchNorm2d-2         [-1, 16, 1, 80000]              32
           Dropout-3         [-1, 16, 1, 80000]               0
            Conv2d-4         [-1, 16, 1, 80000]           2,064
       BatchNorm2d-5         [-1, 16, 1, 80000]              32
         MaxPool2d-6           [-1, 16, 1, 625]               0
            Conv2d-7          [-1, 32, 16, 625]             320
       BatchNorm2d-8          [-1, 32, 16, 625]              64
           Dropout-9          [-1, 32, 16, 625]               0
           Conv2d-10          [-1, 32, 16, 625]           9,248
      BatchNorm2d-11          [-1, 32, 16, 625]              64
        MaxPool2d-12           [-1, 32, 4, 156]               0
           Conv2d-13            [-1, 64, 3, 79]          18,496
      BatchNorm2d-14            [-1, 64

In [19]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    train_loss = 0
    print("------------------------------- Epoch:", epoch,"-------------------------------")
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()

        output = model(data.to(device))

        loss = loss_criteria(output, target)

        train_loss += loss.item()

        loss.backward(retain_graph=True)
        optimizer.step()

    avg_loss = train_loss / (batch_idx+1)
    print('Training set: Average loss: {:.6f}'.format(avg_loss))
    return avg_loss

In [20]:
loss_criteria = nn.CrossEntropyLoss()
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        batch_count = 0
        for data, target in test_loader:
            batch_count += 1
            data, target = data.to(device), target.to(device)

            output = model(data)

            test_loss += loss_criteria(output, target).item()

            _, predicted = torch.max(output.data, 1)
            correct += torch.sum(target==predicted).item()

    avg_loss = test_loss / batch_count
    print('Validation set: Average loss: {:.6f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    return avg_loss

In [21]:
model = Net(num_classes=50).to(device)
input_data = torch.randn(64, 1, 80000)

optimizer = optim.Adam(model.parameters(), lr=3e-4)

loss_criteria = nn.CrossEntropyLoss()

epoch_nums = []
training_loss = []
validation_loss = []

epochs = 50
print('Training on', device)
for epoch in tqdm(range(1, epochs + 1)):
    train_loss = train(model, device, train_loader, optimizer, epoch)
    test_loss = test(model, device, test_loader)
    epoch_nums.append(epoch)
    training_loss.append(train_loss)
    validation_loss.append(test_loss)

Training on cuda


  0%|          | 0/50 [00:00<?, ?it/s]

------------------------------- Epoch: 1 -------------------------------
Training set: Average loss: 3.943439


  2%|▏         | 1/50 [00:34<27:55, 34.20s/it]

Validation set: Average loss: 3.922614, Accuracy: 8/400 (2%)

------------------------------- Epoch: 2 -------------------------------
Training set: Average loss: 3.732102


  4%|▍         | 2/50 [01:09<27:53, 34.87s/it]

Validation set: Average loss: 3.868791, Accuracy: 14/400 (4%)

------------------------------- Epoch: 3 -------------------------------
Training set: Average loss: 3.597051


  6%|▌         | 3/50 [01:43<26:55, 34.38s/it]

Validation set: Average loss: 3.701922, Accuracy: 24/400 (6%)

------------------------------- Epoch: 4 -------------------------------
Training set: Average loss: 3.442448


  8%|▊         | 4/50 [02:19<26:45, 34.90s/it]

Validation set: Average loss: 3.628297, Accuracy: 24/400 (6%)

------------------------------- Epoch: 5 -------------------------------
Training set: Average loss: 3.314631


 10%|█         | 5/50 [02:53<26:02, 34.72s/it]

Validation set: Average loss: 3.537424, Accuracy: 40/400 (10%)

------------------------------- Epoch: 6 -------------------------------
Training set: Average loss: 3.211943


 12%|█▏        | 6/50 [03:28<25:36, 34.91s/it]

Validation set: Average loss: 3.507287, Accuracy: 37/400 (9%)

------------------------------- Epoch: 7 -------------------------------
Training set: Average loss: 3.102491


 14%|█▍        | 7/50 [04:03<25:02, 34.94s/it]

Validation set: Average loss: 3.481516, Accuracy: 54/400 (14%)

------------------------------- Epoch: 8 -------------------------------
Training set: Average loss: 3.025943


 16%|█▌        | 8/50 [04:38<24:22, 34.82s/it]

Validation set: Average loss: 3.388337, Accuracy: 56/400 (14%)

------------------------------- Epoch: 9 -------------------------------
Training set: Average loss: 2.957786


 18%|█▊        | 9/50 [05:13<23:55, 35.01s/it]

Validation set: Average loss: 3.277274, Accuracy: 75/400 (19%)

------------------------------- Epoch: 10 -------------------------------
Training set: Average loss: 2.870336


 20%|██        | 10/50 [05:47<23:10, 34.77s/it]

Validation set: Average loss: 3.192422, Accuracy: 66/400 (16%)

------------------------------- Epoch: 11 -------------------------------
Training set: Average loss: 2.789886


 22%|██▏       | 11/50 [06:23<22:47, 35.05s/it]

Validation set: Average loss: 3.106782, Accuracy: 75/400 (19%)

------------------------------- Epoch: 12 -------------------------------
Training set: Average loss: 2.710361


 24%|██▍       | 12/50 [06:58<22:04, 34.84s/it]

Validation set: Average loss: 3.143548, Accuracy: 76/400 (19%)

------------------------------- Epoch: 13 -------------------------------
Training set: Average loss: 2.611834


 26%|██▌       | 13/50 [07:33<21:40, 35.16s/it]

Validation set: Average loss: 3.182131, Accuracy: 73/400 (18%)

------------------------------- Epoch: 14 -------------------------------
Training set: Average loss: 2.546299


 28%|██▊       | 14/50 [08:07<20:53, 34.82s/it]

Validation set: Average loss: 2.957328, Accuracy: 90/400 (22%)

------------------------------- Epoch: 15 -------------------------------
Training set: Average loss: 2.487960


 30%|███       | 15/50 [08:43<20:26, 35.03s/it]

Validation set: Average loss: 2.927986, Accuracy: 88/400 (22%)

------------------------------- Epoch: 16 -------------------------------
Training set: Average loss: 2.433832


 32%|███▏      | 16/50 [09:17<19:43, 34.82s/it]

Validation set: Average loss: 2.750760, Accuracy: 103/400 (26%)

------------------------------- Epoch: 17 -------------------------------
Training set: Average loss: 2.336916


 34%|███▍      | 17/50 [09:54<19:24, 35.30s/it]

Validation set: Average loss: 2.750193, Accuracy: 111/400 (28%)

------------------------------- Epoch: 18 -------------------------------
Training set: Average loss: 2.288192


 36%|███▌      | 18/50 [10:29<18:46, 35.22s/it]

Validation set: Average loss: 2.721897, Accuracy: 115/400 (29%)

------------------------------- Epoch: 19 -------------------------------
Training set: Average loss: 2.214825


 38%|███▊      | 19/50 [11:04<18:08, 35.12s/it]

Validation set: Average loss: 2.614399, Accuracy: 129/400 (32%)

------------------------------- Epoch: 20 -------------------------------
Training set: Average loss: 2.189125


 40%|████      | 20/50 [11:39<17:35, 35.19s/it]

Validation set: Average loss: 2.595345, Accuracy: 121/400 (30%)

------------------------------- Epoch: 21 -------------------------------
Training set: Average loss: 2.136032


 42%|████▏     | 21/50 [12:13<16:53, 34.96s/it]

Validation set: Average loss: 2.602413, Accuracy: 119/400 (30%)

------------------------------- Epoch: 22 -------------------------------
Training set: Average loss: 2.021222


 44%|████▍     | 22/50 [12:49<16:25, 35.19s/it]

Validation set: Average loss: 2.545722, Accuracy: 129/400 (32%)

------------------------------- Epoch: 23 -------------------------------
Training set: Average loss: 2.017486


 46%|████▌     | 23/50 [13:23<15:41, 34.88s/it]

Validation set: Average loss: 2.516858, Accuracy: 121/400 (30%)

------------------------------- Epoch: 24 -------------------------------
Training set: Average loss: 1.989339


 48%|████▊     | 24/50 [13:59<15:10, 35.00s/it]

Validation set: Average loss: 2.433514, Accuracy: 139/400 (35%)

------------------------------- Epoch: 25 -------------------------------
Training set: Average loss: 1.965242


 50%|█████     | 25/50 [14:32<14:26, 34.68s/it]

Validation set: Average loss: 2.348858, Accuracy: 148/400 (37%)

------------------------------- Epoch: 26 -------------------------------
Training set: Average loss: 1.909331


 52%|█████▏    | 26/50 [15:08<13:58, 34.92s/it]

Validation set: Average loss: 2.495628, Accuracy: 138/400 (34%)

------------------------------- Epoch: 27 -------------------------------
Training set: Average loss: 1.885168


 54%|█████▍    | 27/50 [15:42<13:16, 34.65s/it]

Validation set: Average loss: 2.356762, Accuracy: 157/400 (39%)

------------------------------- Epoch: 28 -------------------------------
Training set: Average loss: 1.789555


 56%|█████▌    | 28/50 [16:18<12:48, 34.95s/it]

Validation set: Average loss: 2.342469, Accuracy: 162/400 (40%)

------------------------------- Epoch: 29 -------------------------------
Training set: Average loss: 1.776257


 58%|█████▊    | 29/50 [16:52<12:08, 34.70s/it]

Validation set: Average loss: 2.369518, Accuracy: 150/400 (38%)

------------------------------- Epoch: 30 -------------------------------
Training set: Average loss: 1.704145


 60%|██████    | 30/50 [17:27<11:37, 34.88s/it]

Validation set: Average loss: 2.250917, Accuracy: 160/400 (40%)

------------------------------- Epoch: 31 -------------------------------
Training set: Average loss: 1.697640


 62%|██████▏   | 31/50 [18:01<10:57, 34.62s/it]

Validation set: Average loss: 2.450459, Accuracy: 147/400 (37%)

------------------------------- Epoch: 32 -------------------------------
Training set: Average loss: 1.679692


 64%|██████▍   | 32/50 [18:36<10:26, 34.83s/it]

Validation set: Average loss: 2.344755, Accuracy: 156/400 (39%)

------------------------------- Epoch: 33 -------------------------------
Training set: Average loss: 1.635463


 66%|██████▌   | 33/50 [19:10<09:47, 34.53s/it]

Validation set: Average loss: 2.286819, Accuracy: 170/400 (42%)

------------------------------- Epoch: 34 -------------------------------
Training set: Average loss: 1.607316


 68%|██████▊   | 34/50 [19:46<09:19, 34.95s/it]

Validation set: Average loss: 2.224867, Accuracy: 158/400 (40%)

------------------------------- Epoch: 35 -------------------------------
Training set: Average loss: 1.585783


 70%|███████   | 35/50 [20:20<08:40, 34.72s/it]

Validation set: Average loss: 2.229199, Accuracy: 165/400 (41%)

------------------------------- Epoch: 36 -------------------------------
Training set: Average loss: 1.548303


 72%|███████▏  | 36/50 [20:56<08:08, 34.88s/it]

Validation set: Average loss: 2.285540, Accuracy: 162/400 (40%)

------------------------------- Epoch: 37 -------------------------------
Training set: Average loss: 1.486341


 74%|███████▍  | 37/50 [21:30<07:31, 34.76s/it]

Validation set: Average loss: 2.270438, Accuracy: 169/400 (42%)

------------------------------- Epoch: 38 -------------------------------
Training set: Average loss: 1.508499


 76%|███████▌  | 38/50 [22:04<06:55, 34.61s/it]

Validation set: Average loss: 2.243991, Accuracy: 156/400 (39%)

------------------------------- Epoch: 39 -------------------------------
Training set: Average loss: 1.461757


 78%|███████▊  | 39/50 [22:40<06:24, 34.94s/it]

Validation set: Average loss: 2.178950, Accuracy: 166/400 (42%)

------------------------------- Epoch: 40 -------------------------------
Training set: Average loss: 1.396416


 80%|████████  | 40/50 [23:14<05:46, 34.65s/it]

Validation set: Average loss: 2.134559, Accuracy: 172/400 (43%)

------------------------------- Epoch: 41 -------------------------------
Training set: Average loss: 1.375967


 82%|████████▏ | 41/50 [23:49<05:13, 34.85s/it]

Validation set: Average loss: 2.197079, Accuracy: 163/400 (41%)

------------------------------- Epoch: 42 -------------------------------
Training set: Average loss: 1.362275


 84%|████████▍ | 42/50 [24:23<04:36, 34.56s/it]

Validation set: Average loss: 2.125292, Accuracy: 159/400 (40%)

------------------------------- Epoch: 43 -------------------------------
Training set: Average loss: 1.372530


 86%|████████▌ | 43/50 [24:59<04:04, 34.87s/it]

Validation set: Average loss: 2.156828, Accuracy: 162/400 (40%)

------------------------------- Epoch: 44 -------------------------------
Training set: Average loss: 1.360384


 88%|████████▊ | 44/50 [25:33<03:27, 34.64s/it]

Validation set: Average loss: 2.106139, Accuracy: 176/400 (44%)

------------------------------- Epoch: 45 -------------------------------
Training set: Average loss: 1.347091


 90%|█████████ | 45/50 [26:08<02:54, 34.82s/it]

Validation set: Average loss: 2.062835, Accuracy: 178/400 (44%)

------------------------------- Epoch: 46 -------------------------------
Training set: Average loss: 1.287902


 92%|█████████▏| 46/50 [26:42<02:18, 34.57s/it]

Validation set: Average loss: 2.121573, Accuracy: 162/400 (40%)

------------------------------- Epoch: 47 -------------------------------
Training set: Average loss: 1.229381


 94%|█████████▍| 47/50 [27:17<01:44, 34.79s/it]

Validation set: Average loss: 2.066873, Accuracy: 166/400 (42%)

------------------------------- Epoch: 48 -------------------------------
Training set: Average loss: 1.309892


 96%|█████████▌| 48/50 [27:51<01:09, 34.55s/it]

Validation set: Average loss: 2.051270, Accuracy: 171/400 (43%)

------------------------------- Epoch: 49 -------------------------------
Training set: Average loss: 1.234426


 98%|█████████▊| 49/50 [28:27<00:34, 34.88s/it]

Validation set: Average loss: 2.109373, Accuracy: 165/400 (41%)

------------------------------- Epoch: 50 -------------------------------
Training set: Average loss: 1.264724


100%|██████████| 50/50 [29:01<00:00, 34.83s/it]

Validation set: Average loss: 2.020796, Accuracy: 175/400 (44%)






In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score
import torch
import torch.nn as nn

loss_criteria = nn.CrossEntropyLoss()

def test_upd(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    all_targets = []
    all_predictions = []

    with torch.no_grad():
        batch_count = 0
        for data, target in test_loader:
            batch_count += 1
            data, target = data.to(device), target.to(device)

            output = model(data)
            test_loss += loss_criteria(output, target).item()

            _, predicted = torch.max(output.data, 1)
            correct += torch.sum(target == predicted).item()

            all_targets.extend(target.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    avg_loss = test_loss / batch_count
    accuracy = 100. * correct / len(test_loader.dataset)

    # Calculate precision, recall and F1-score
    precision = precision_score(all_targets, all_predictions, average='weighted')
    recall = recall_score(all_targets, all_predictions, average='weighted')
    f1 = f1_score(all_targets, all_predictions, average='weighted')

    print('Validation set: Average loss: {:.6f}, Accuracy: {}/{} ({:.0f}%), Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}\n'.format(
        avg_loss, correct, len(test_loader.dataset), accuracy, precision, recall, f1))

    return avg_loss, accuracy, precision, recall, f1

In [24]:
t = test_upd(model, device, test_loader)

Validation set: Average loss: 2.105532, Accuracy: 175/400 (44%), Precision: 0.4823, Recall: 0.4375, F1 Score: 0.4316

