## Try torchaudio

In [None]:
import os

import torch
import torchaudio

# Sanity-check the first 100 training clips: every file must load as a 2-D
# tensor. Print its shape and sample rate, then the shape after averaging
# over dim 0.
for i in range(100):
    filename = f'mp3/train/train_{i:04d}.mp3'
    waveform, sample_rate = torchaudio.load(filename)
    # The failing clip index is the assertion message so it is easy to locate.
    assert waveform.dim() == 2, f'{i}'
    print("Shape of waveform: {}".format(waveform.size()))
    print("Sample rate of waveform: {}".format(sample_rate))
    # Average over dim 0 (the channel axis if torchaudio.load returns
    # channels-first, its documented default — confirm) to get one signal.
    # The tensor method is used instead of `torch.mean` so this cell does not
    # depend on `torch` having been imported.
    soundata = waveform.mean(dim=0)
    print("Shape of single: {}".format(soundata.size()))

Computing MFCCs is time-consuming. **If we do it on the fly, it becomes a bottleneck and heavily slows down processing.** So we compute the features once up front and later load them from memory using `TensorDataset`.

In [None]:
from extra.audio_dataset import AudioTagging, Common, myMFCC, myPermute
import torchvision.transforms as transforms

In [None]:
# Preprocessing applied to every clip, in order: `Common` (configured with a
# 3.0 duration and a 44100 resample rate), then `myMFCC`.
# NOTE(review): exact semantics of both transforms live in extra.audio_dataset.
pipeline_steps = [
    Common(duration=3.0, resample_rate=44100),
    myMFCC(),
]
tf = transforms.Compose(pipeline_steps)

# Only the training split is given a CSV file; the test split is built from
# its directory alone. Both share the same transform.
train_dataset = AudioTagging('mp3/train', 'mp3/train.csv', transform=tf)
test_dataset = AudioTagging('mp3/test', transform=tf)

In [None]:
# Run the transform pipeline once per clip and keep the results in memory.
# Each training item is unpacked into a (feature, label) pair; test items are
# kept as returned by the dataset.
train_items = [train_dataset[idx] for idx in range(len(train_dataset))]
my_mfcc_train, label_train = zip(*train_items)
my_mfcc_test = [test_dataset[idx] for idx in range(len(test_dataset))]

In [None]:
# Stack the per-clip results along a new leading dimension so each split
# becomes a single tensor.
my_mfcc_train_tensor = torch.stack(my_mfcc_train, dim=0)
my_mfcc_test_tensor = torch.stack(my_mfcc_test, dim=0)
label_train_tensor = torch.stack(label_train, dim=0)

# Report the resulting shapes: train features, train labels, test features.
for stacked in (my_mfcc_train_tensor, label_train_tensor, my_mfcc_test_tensor):
    print(stacked.shape)

In [None]:
# Persist the pre-computed features and labels so they can be reloaded
# without recomputing them. torch.save does not create missing parent
# directories, so make sure `ckpt/` exists first.
os.makedirs('ckpt', exist_ok=True)
torch.save({'mfcc_train': my_mfcc_train_tensor,
            'mfcc_test': my_mfcc_test_tensor,
            'label_train': label_train_tensor}, 'ckpt/mfccs.pt')

Then we wrap the precomputed tensors in `TensorDataset`s to train our model.

In [None]:
from torch.utils.data import TensorDataset

# Wrap the in-memory tensors as datasets. The test split has features only;
# the training split pairs each feature tensor with its label.
my_mfcc_test_dataset = TensorDataset(my_mfcc_test_tensor)
my_mfcc_train_dataset = TensorDataset(
    my_mfcc_train_tensor,
    label_train_tensor,
)

Now we are ready to train our model using [skorch](https://github.com/skorch-dev/skorch).

In [None]:
from skorch import NeuralNet
from extra.nn_model import GRUTagging
import torch.optim as optim
import torch.nn as nn
from sklearn.model_selection import GridSearchCV

In [None]:
# GRU-based tagger.
# NOTE(review): input_dim=13 presumably matches the number of MFCC
# coefficients produced by myMFCC — confirm in extra.audio_dataset.
model = GRUTagging(input_dim=13,
                   hidden_dim=128,
                   batch_first=True)

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): skorch generally recommends passing the module *class* plus
# `module__*` arguments. With an instantiated module, any later `module__*`
# override (e.g. from a grid search) presumably rebuilds the module from only
# the overridden arguments — confirm against the skorch docs.
net = NeuralNet(model,
                criterion=nn.BCEWithLogitsLoss,
                optimizer=optim.Adam,
                device=device,
                batch_size=32,
                max_epochs=50,
               )

In [None]:
# Train on the precomputed in-memory features. Fitting on `train_dataset`
# would rerun the MFCC transform for every sample on every epoch, which is
# the bottleneck the precomputation was meant to remove.
net.fit(my_mfcc_train_dataset)

In [None]:
# Hyper-parameter grid. `module__*` keys are forwarded to the GRUTagging
# constructor.
params = {
    'max_epochs': [10, 20, 30],
    'criterion': [nn.BCEWithLogitsLoss],
    # Pin the fixed constructor arguments too, so a module rebuilt from the
    # `module__*` overrides keeps the same input_dim / batch_first as `model`.
    'module__input_dim': [13],
    'module__batch_first': [True],
    'module__hidden_dim': [64, 128],
    'module__dropout': [0.2, 0.3],
}

# `scoring` is a GridSearchCV constructor argument. Passed to `fit()` it is
# forwarded to the estimator's own fit as an extra fit parameter instead of
# selecting the metric.
# NOTE(review): 'accuracy' compares `net.predict(X)` against `y`; confirm that
# predict yields discrete labels for this model, otherwise supply a custom
# scorer that thresholds the outputs.
gs = GridSearchCV(net, params, refit=False, cv=5, scoring='accuracy')
gs.fit(my_mfcc_train_tensor, label_train_tensor)

In [None]:
# Peek at the first item of the test TensorDataset (built from the test features only).
my_mfcc_test_dataset[0]

In [None]:
# Shape of element 0 of training sample 50 — the part collected as the feature when the training items are unpacked.
train_dataset[50][0].shape

In [None]:
# Inspect the dataset's `audio_names` attribute (presumably the clip file names — confirm in extra.audio_dataset).
train_dataset.audio_names