In [1]:
import torch
from torch import nn
from torch.nn import functional as F
import pytorch_lightning as pl
import torchaudio
import pandas as pd

import matplotlib.pyplot as plt

from pathlib import Path


In [2]:
dataset_path = '/home/andrea/lesson1/reprodl2021/ESC-50/audio/'

In [3]:
torch.cuda.is_available()

True

In [4]:
## Step1: load data

In [5]:
datapath =Path('../ESC-50')

In [6]:
datapath.exists()

True

In [7]:
csv=pd.read_csv(datapath/Path('meta/esc50.csv'))

In [8]:
csv.loc[csv.fold == 1].filename

0       1-100032-A-0.wav
1      1-100038-A-14.wav
2      1-100210-A-36.wav
3      1-100210-B-36.wav
4      1-101296-A-19.wav
             ...        
395      1-9841-A-13.wav
396      1-9886-A-49.wav
397      1-9887-A-49.wav
398      1-9887-B-49.wav
399     1-99958-A-31.wav
Name: filename, Length: 400, dtype: object

In [9]:
csv.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [10]:
datapath/Path('audio')/csv.loc[csv.fold == 1].filename

0       ../ESC-50/audio/1-100032-A-0.wav
1      ../ESC-50/audio/1-100038-A-14.wav
2      ../ESC-50/audio/1-100210-A-36.wav
3      ../ESC-50/audio/1-100210-B-36.wav
4      ../ESC-50/audio/1-101296-A-19.wav
                     ...                
395      ../ESC-50/audio/1-9841-A-13.wav
396      ../ESC-50/audio/1-9886-A-49.wav
397      ../ESC-50/audio/1-9887-A-49.wav
398      ../ESC-50/audio/1-9887-B-49.wav
399     ../ESC-50/audio/1-99958-A-31.wav
Name: filename, Length: 400, dtype: object

In [11]:
#x1=torchaudio.transforms.Resample(orig_freq=sr,new_freq=4096)(x)
#plt.plot(x1[0,::5])

In [12]:
#x1.shape

In [13]:
#h=torchaudio.transforms.MelSpectrogram(sample_rate=sr)(x)

In [14]:
#h=torchaudio.transforms.AmplitudeToDB()(h)

In [15]:
#plt.imshow(h[0])

In [16]:
class ESC50Dataset(torch.utils.data.Dataset):
    
    def __init__(self, path: Path = Path('../ESC-50'), 
                 sample_rate: int = 8000,
                 folds = [1]):
        # Load CSV & initialize all torchaudio.transforms
        # Resample --> MelSpectrogram --> AmplitudeToDB
        self.path = path
        self.csv = pd.read_csv(path / Path('meta/esc50.csv'))
        self.csv = self.csv[self.csv['fold'].isin(folds)]
        self.resample = torchaudio.transforms.Resample(
            orig_freq=44100, new_freq=sample_rate
        )
        self.melspec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate)
        self.db = torchaudio.transforms.AmplitudeToDB(top_db=80)
        
        
    def __getitem__(self, index):
        # Returns (xb, yb) pair, after applying all transformations on the audio file.
        row = self.csv.iloc[index]
        wav, _ = torchaudio.load(self.path / 'audio' / row['filename'])
        label = row['target']
        xb = self.db(
            self.melspec(
                self.resample(wav)
            )
        )
        return xb, label
        
    def __len__(self):
        # Returns length
        return len(self.csv)


In [17]:
train_data=ESC50Dataset(folds=[1,2,3])
val_data=ESC50Dataset(folds=[4])
test_data=ESC50Dataset(folds=[5])

In [18]:
train_loader = torch.utils.data.DataLoader(train_data,batch_size=16,shuffle=True)

In [19]:
val_loader = torch.utils.data.DataLoader(val_data,batch_size=8)

In [20]:
test_loader = torch.utils.data.DataLoader(test_data,batch_size=8)

In [21]:
## Model Building

In [22]:
class AudioNet(pl.LightningModule):
    
    def __init__(self, n_classes = 50, base_filters = 32):
        super().__init__()
        self.conv1 = nn.Conv2d(1, base_filters, 11, padding=5)
        self.bn1 = nn.BatchNorm2d(base_filters)
        self.conv2 = nn.Conv2d(base_filters, base_filters, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(base_filters)
        self.pool1 = nn.MaxPool2d(2)
        self.conv3 = nn.Conv2d(base_filters, base_filters * 2, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(base_filters * 2)
        self.conv4 = nn.Conv2d(base_filters * 2, base_filters * 4, 3, padding=1)
        self.bn4 = nn.BatchNorm2d(base_filters * 4)
        self.pool2 = nn.MaxPool2d(2)
        self.fc1 = nn.Linear(base_filters * 4, n_classes)
        
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool1(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool2(x)
        x = F.adaptive_avg_pool2d(x, (1, 1))
        x = self.fc1(x[:, :, 0, 0])
        return x
    
    def training_step(self, batch, batch_idx):
        # Very simple training loop
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        self.log('train_loss', loss, on_step=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y_hat = torch.argmax(y_hat, dim=1)
        acc = pl.metrics.functional.accuracy(y_hat, y)
        self.log('val_acc', acc, on_epoch=True, prog_bar=True)
        return acc
        
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [23]:
audionet = AudioNet()

In [24]:
xb,yb=next(iter(train_loader))

In [25]:
audionet(xb).shape

torch.Size([16, 50])

In [26]:
##https://pytorch-lightning.readthedocs.io/en/latest/starter/new-project.html#step-1-define-lightningmodule

In [27]:
##password =reprodl2021

In [28]:
pl.seed_everything(0)

Global seed set to 0


0

In [29]:
trainer = pl.Trainer(gpus=1, max_epochs=25)#fast_dev_run for a quick check

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [30]:
trainer.fit(audionet,train_loader,val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name  | Type        | Params
---------------------------------------
0  | conv1 | Conv2d      | 3.9 K 
1  | bn1   | BatchNorm2d | 64    
2  | conv2 | Conv2d      | 9.2 K 
3  | bn2   | BatchNorm2d | 64    
4  | pool1 | MaxPool2d   | 0     
5  | conv3 | Conv2d      | 18.5 K
6  | bn3   | BatchNorm2d | 128   
7  | conv4 | Conv2d      | 73.9 K
8  | bn4   | BatchNorm2d | 256   
9  | pool2 | MaxPool2d   | 0     
10 | fc1   | Linear      | 6.5 K 
---------------------------------------
112 K     Trainable params
0         Non-trainable params
112 K     Total params
0.450     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

1

In [31]:
trainer.test(audionet,test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


1