In [1]:
import torch
import torchaudio
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable
import torchvision
from torchvision import transforms
from torchvision.datasets import FashionMNIST
import torchvision.models as models
from PIL import Image
from tqdm.notebook import tqdm

import os
import math
import random
import pandas as pd
import numpy as np

# Training configuration shared by the cells below.
num_epochs = 30  # number of train/evaluate passes run by the training loop
batch_size = 32  # mini-batch size passed to both DataLoaders

# reproducibility
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds PyTorch (CPU and all CUDA devices), NumPy's global generator and
    Python's ``random`` module, and turns on the cuDNN ``deterministic`` flag.

    Args:
        seed: Integer seed handed unchanged to each generator.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    torch.backends.cudnn.deterministic = True

set_seed(13)

# Prefer the first CUDA device and fall back to CPU when CUDA is unavailable.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# cuDNN autotuning (benchmark=True) may select different kernels from run to
# run, which defeats the deterministic flag enabled in set_seed; keep it off
# so seeded runs are repeatable.
torch.backends.cudnn.benchmark = False

print(device)

  '"sox" backend is being deprecated. '


cuda:0


In [2]:
# Locations of the inputs, relative to the notebook's working directory.
data_path = 'dataset/'  # directory holding the .wav files
meta_path = 'metadata.csv'  # CSV with one row of labels per .wav file

In [3]:
import os

# Sanity check: number of entries in the audio directory.
len(os.listdir(data_path))

4000

In [4]:
# Load the label table from the configured metadata path (rather than a
# repeated literal) so this preview always matches what the dataset reads.
df = pd.read_csv(meta_path)
df.head()

Unnamed: 0,file,type,freq,detune,level
0,0000.wav,0,30.0,0.0,1.0
1,0001.wav,0,30.0,0.0,12.0
2,0002.wav,0,30.0,0.0,23.0
3,0003.wav,0,30.0,0.0,34.0
4,0004.wav,0,30.0,0.0,45.0


In [5]:
# Check how the samples are distributed over the distinct `freq` values.
df['freq'].value_counts()

53.333333     400
61.111111     400
92.222222     400
84.444444     400
37.777778     400
45.555556     400
68.888889     400
76.666667     400
100.000000    400
30.000000     400
Name: freq, dtype: int64

In [6]:
from torch.utils.data import Dataset, DataLoader, random_split
import os 
import pandas as pd

class AudioDateset(Dataset):
    """Dataset pairing each metadata row with the mel spectrogram of its WAV file.

    Each item is ``(mel_spectrogram, target)`` where ``target`` is a float
    tensor ``[type, freq, detune, level]`` taken from the metadata row.

    Args:
        data_dir: Directory containing the WAV files named in the metadata.
        metadata: Path to a CSV whose rows unpack to exactly five values, in
            order: file name, type, freq, detune, level.
    """

    def __init__(self, data_dir, metadata):
        self.data_dir = data_dir
        self.data_table = pd.read_csv(metadata)
        # The transform's parameters are fixed, so build it once here instead
        # of re-creating it for every item.
        self.wav_proc = torchaudio.transforms.MelSpectrogram(
            sample_rate=22050, n_mels=64, n_fft=1024, hop_length=256, f_max=8000
        )

    def __getitem__(self, idx):
        """Return ``(mel_spectrogram, target)`` for the ``idx``-th metadata row."""
        # Dataset indices are positional, hence iloc.
        wav_file, wtype, freq, detune, level = list(self.data_table.iloc[idx])
        path = os.path.join(self.data_dir, wav_file)
        # NOTE(review): the file's own sample rate is not checked against the
        # 22050 Hz the transform is configured with -- confirm they match.
        wav, _sample_rate = torchaudio.load(path)

        mel_spectrogram = self.wav_proc(wav)

        # squeeze(0) drops the leading channel axis only when it has size 1
        # (presumably mono audio -- TODO confirm).
        target = torch.tensor(
            [float(wtype), float(freq), float(detune), float(level)]
        )
        return mel_spectrogram.squeeze(0), target

    def __len__(self):
        """Number of samples, i.e. the number of metadata rows."""
        # Rows of the metadata table are the only valid indices; counting the
        # files of a notebook-global directory could disagree with them.
        return len(self.data_table)

dataset = AudioDateset(data_path, meta_path)

# Take the split sizes from the dataset itself: random_split requires the
# sizes to sum to len(dataset), so a separate directory listing is not needed.
n = len(dataset)
k = int(0.75 * n)  # 75% of the samples go to training, the rest to validation

train_dataset, valid_dataset = random_split(dataset, [k, n - k])

train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, num_workers=8, pin_memory=True)

valid_loader = DataLoader(valid_dataset, batch_size=batch_size,
                          shuffle=False, num_workers=8, pin_memory=True)

In [7]:
class PabloNet(nn.Module):
    """1-D convolutional network with a classification and a regression head.

    The convolutional stack takes input with 64 channels. Its output is
    flattened and must contain exactly 5120 features per sample, which both
    heads consume: ``wavetype_head`` yields 4 logits and ``regression_head``
    yields 3 values.
    """

    def __init__(self):
        super().__init__()

        def conv_bn_relu(c_in, c_out, kernel, stride=1):
            # One Conv1d -> BatchNorm1d -> ReLU unit; every conv uses padding=1.
            return [
                nn.Conv1d(c_in, c_out, kernel_size=kernel, stride=stride, padding=1),
                nn.BatchNorm1d(c_out),
                nn.ReLU(),
            ]

        # Layers are appended in the exact order they run, so the indices
        # inside the Sequential (and thus the state_dict keys) stay fixed.
        stack = []
        stack += conv_bn_relu(64, 64, 7, stride=2)
        stack += conv_bn_relu(64, 64, 7, stride=2)
        stack.append(nn.MaxPool1d(kernel_size=2))
        stack += conv_bn_relu(64, 128, 5)
        stack += conv_bn_relu(128, 128, 5)
        stack.append(nn.MaxPool1d(kernel_size=2))
        stack += conv_bn_relu(128, 256, 3)
        stack += conv_bn_relu(256, 256, 3)
        stack.append(nn.MaxPool1d(kernel_size=2))
        stack += conv_bn_relu(256, 256, 3)
        stack += conv_bn_relu(256, 256, 3)
        self.layers_conv = nn.Sequential(*stack)

        # Classification head: 4 logits.
        self.wavetype_head = nn.Sequential(
            nn.Linear(5120, 256),
            nn.ReLU(),
            nn.Linear(256, 4),
        )

        # Regression head: 3 continuous outputs.
        self.regression_head = nn.Sequential(
            nn.Linear(5120, 256),
            nn.ReLU(),
            nn.Linear(256, 3),
        )

    def forward(self, x):
        """Return ``(wavetype_logits, param_vec)`` for a batch ``x``."""
        features = self.layers_conv(x)
        # Collapse everything except the batch axis into one feature vector.
        flat = torch.flatten(features, start_dim=1)
        return self.wavetype_head(flat), self.regression_head(flat)
    
# Build the network and place its parameters on the selected device.
model = PabloNet().to(device)

In [8]:
import torch.optim as optim

# Classification loss for the wave-type logits.
wavetype_criterion = nn.CrossEntropyLoss()
# Regression loss for the remaining three targets.
# NOTE(review): the targets appear to be unscaled (values up to ~100 in the
# metadata), so this term likely dominates the summed loss -- consider
# normalizing the targets or weighting the two terms; confirm intent.
regression_criterion = nn.MSELoss()

# Adam over all model parameters with a fixed learning rate.
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [9]:
def train(epoch):
    """Run one training epoch over ``train_loader``, updating ``model`` in place.

    Relies on the notebook-level ``model``, ``optimizer``, ``device``,
    ``train_loader`` and both loss criteria. Every 50 batches it prints the
    mean loss of those 50 batches.

    Args:
        epoch: Epoch number; used only in the progress message.
    """
    model.train()  # don't forget to switch between train and eval!

    # Sum of batch losses since the last progress print; averaging over a
    # window is less noisy than reporting a single batch's loss.
    running_loss = 0.0

    for i, (wavs, labels) in enumerate(tqdm(train_loader)):
        wavs, labels = wavs.to(device), labels.to(device)

        # Column 0 holds the class index, the rest are regression targets.
        # Cast with .long() so the tensor stays on `device`;
        # .type(torch.LongTensor) would copy it to the CPU and back.
        gt_wavetype = labels[:, 0].long()
        gt_param_vec = labels[:, 1:]

        optimizer.zero_grad()

        out_wavetype, out_param_vec = model(wavs)

        # Total loss is the unweighted sum of both task losses.
        loss = wavetype_criterion(out_wavetype, gt_wavetype) + regression_criterion(out_param_vec, gt_param_vec)

        loss.backward()

        optimizer.step()

        running_loss += loss.item()

        if (i + 1) % 50 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, (i + 1) * len(wavs), len(train_loader.dataset),
                    100. * (i + 1) / len(train_loader), running_loss / 50))

            running_loss = 0.0
            
def evaluate(data_loader):
    """Evaluate ``model`` on ``data_loader`` and print loss and accuracy.

    Relies on the notebook-level ``model``, ``device`` and both loss criteria.
    The reported loss is the mean of the per-batch losses; accuracy is the
    share of samples whose arg-max wave-type logit equals the label.

    Args:
        data_loader: DataLoader yielding ``(wavs, labels)`` batches, where
            column 0 of ``labels`` is the class index and the remaining
            columns are the regression targets.
    """
    model.eval()

    running_loss = 0.0
    correct = 0  # plain Python int: number of correctly classified samples

    with torch.no_grad():
        for wavs, labels in data_loader:
            wavs, labels = wavs.to(device), labels.to(device)

            # .long() keeps the tensor on `device`; .type(torch.LongTensor)
            # would copy it to the CPU and back.
            gt_wavetype = labels[:, 0].long()
            gt_param_vec = labels[:, 1:]

            out_wavetype, out_param_vec = model(wavs)

            loss = wavetype_criterion(out_wavetype, gt_wavetype) + regression_criterion(out_param_vec, gt_param_vec)

            # Predicted class is the index of the largest logit.
            pred = out_wavetype.argmax(dim=1)
            correct += (pred == gt_wavetype).sum().item()

            running_loss += loss.item()

    running_loss /= len(data_loader)

    print('\nAverage loss: {:.4f}, Accuracy: {}/{} ({:.3f}%)\n'.format(
        running_loss, correct, len(data_loader.dataset),
        100. * correct / len(data_loader.dataset)))

In [10]:
import warnings
# Silence every warning from this point on so the training log stays readable.
# NOTE(review): this is a blanket filter; it also hides deprecation and
# numerical warnings -- consider narrowing it to specific categories.
warnings.simplefilter("ignore")

In [11]:
# Main loop: one training pass followed by a validation report per epoch.
for epoch in range(num_epochs): 
    train(epoch)
    evaluate(valid_loader)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 693.5876, Accuracy: 257/1000 (25.700%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 652.9857, Accuracy: 328/1000 (32.800%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 507.2728, Accuracy: 448/1000 (44.800%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 623.6809, Accuracy: 330/1000 (33.000%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 453.4178, Accuracy: 447/1000 (44.700%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 461.5745, Accuracy: 374/1000 (37.400%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 416.4287, Accuracy: 399/1000 (39.900%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 635.9422, Accuracy: 486/1000 (48.600%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 470.5255, Accuracy: 445/1000 (44.500%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 555.9962, Accuracy: 433/1000 (43.300%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 560.4314, Accuracy: 457/1000 (45.700%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 468.9714, Accuracy: 437/1000 (43.700%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 553.3388, Accuracy: 414/1000 (41.400%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 715.5912, Accuracy: 416/1000 (41.600%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 456.0444, Accuracy: 481/1000 (48.100%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 319.9091, Accuracy: 432/1000 (43.200%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 378.5396, Accuracy: 545/1000 (54.500%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 373.0092, Accuracy: 450/1000 (45.000%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 218.3077, Accuracy: 512/1000 (51.200%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 372.3362, Accuracy: 461/1000 (46.100%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 491.1683, Accuracy: 400/1000 (40.000%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 244.0831, Accuracy: 524/1000 (52.400%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 231.6392, Accuracy: 480/1000 (48.000%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 220.6114, Accuracy: 517/1000 (51.700%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 317.4529, Accuracy: 477/1000 (47.700%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 207.9203, Accuracy: 520/1000 (52.000%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 260.5199, Accuracy: 514/1000 (51.400%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 246.6582, Accuracy: 548/1000 (54.800%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 342.4559, Accuracy: 449/1000 (44.900%)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=94.0), HTML(value='')))



Average loss: 182.7814, Accuracy: 533/1000 (53.300%)



In [12]:
# Compare predictions with ground truth for the first five validation samples.
model.eval()  # set inference mode explicitly instead of relying on the previous cell
with torch.no_grad():
    for idx in range(5):
        # Fetch the sample once; every dataset access reloads and transforms the file.
        mel, target = valid_dataset[idx]
        # Use the configured device so this also runs on the CPU fallback.
        out_wavetype, out_param_vec = model(mel.to(device).unsqueeze(0))
        print(torch.argmax(out_wavetype).item(), out_param_vec)
        print(target[0].item(), target[1:])


3 tensor([[48.3045, 81.2175, 55.4030]], device='cuda:0')
3.0 tensor([ 45.5556, 100.0000,  12.0000])
1 tensor([[69.8770, 52.2266, -0.9066]], device='cuda:0')
2.0 tensor([76.6667, 33.3333,  1.0000])
2 tensor([[80.3099, 53.0009, 66.6117]], device='cuda:0')
2.0 tensor([84.4444, 66.6667, 89.0000])
3 tensor([[44.1474, 68.1140, 53.7822]], device='cuda:0')
3.0 tensor([45.5556, 66.6667, 23.0000])
0 tensor([[90.5629, 17.1914, 82.9355]], device='cuda:0')
1.0 tensor([100.0000,  22.2222, 100.0000])
