In [1]:
# imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torchaudio.transforms as aud_transforms
import numpy as np
import tabulate

torchaudio.set_audio_backend("sox_io")



In [2]:
# config
spectrogram_transform = (
    aud_transforms.MelSpectrogram(
        sample_rate=16000,
        n_mels=128,
        n_fft=1024,
        hop_length=256
    )
)

In [3]:
# setup data
genres = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
sample_paths = []
sample_targets = []
validation_paths = []
validation_targets = []

for g_ind, genre in enumerate(genres):
    for i in range(10):
        ind = "{0:0=2d}".format(i)
        sample_paths.append(f"data/WAV/{genre}/{genre}.000{ind}.wav")
        sample_targets.append(g_ind)
    for i in range(10, 20):
        ind = "{0:0=2d}".format(i)
        validation_paths.append(f"data/WAV/{genre}/{genre}.000{ind}.wav")
        validation_targets.append(g_ind)

class GtzanDataset(Dataset):
    def __init__(self, paths, labels, mel_spec_transform):
        self.X = paths
        self.targets = labels
        self.transform = mel_spec_transform
    def __getitem__(self, index):
        path = self.X[index]
        wave_data = self.load_audio(path)
        return self.transform(wave_data), self.targets[index] #self.get_patched_spectrograms(wave_data), self.targets[index]
    def __len__(self):
        return len(self.X)
    def load_audio(self, path):
        ''' Loads wave data from given path and resamples it to 16000Hz '''
        wd, sr = torchaudio.load(path, normalize=True)
        resampler = aud_transforms.Resample(sr, 16000)
        return resampler(wd).squeeze()
    def get_patched_spectrograms(self, wave_data):
        ''' Splits wave data into half overlapping windows and turns each into Melspectrograms '''
        patches = self.splitsongs(wave_data[:465984]) # <-- fix the number of patches
        mel_specs = [self.transform(patch) for patch in patches]
        return torch.stack(mel_specs)
    def splitsongs(self, wd, overlap = 0.0):
        temp_X = []
        xshape = wd.shape[0]
        chunk = 10000
        offset = int(chunk*(1.-overlap))
        spsong = [wd[i:i+chunk] for i in range(0, xshape - chunk + offset, offset)]
        for s in spsong:
            if s.shape[0] != chunk:
                continue
            temp_X.append(s)

        return np.array(temp_X)

In [16]:
# setup network
class SegmentedCNN(nn.Module):
    def __init__(self):
        super(SegmentedCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 64, kernel_size=5, stride=1, padding=(1, 1))
        self.conv2 = nn.Conv2d(64, 64, kernel_size=5, stride=1, padding=(1, 1))

        self.fc1 = nn.Linear(896640, 10)
        # nn.init.kaiming_normal_(self.fc1.weight, mode='fan_in')
        self.fc1.bias.data.fill_(0.01)

        self.pool = F.max_pool2d
        self.dropout = nn.Dropout(p=0.5)
        self.log_sm = nn.LogSoftmax(dim=1)
    def forward(self, x):
        x = x.unsqueeze(1)
        x = F.relu(self.pool(self.conv1(x), 2))
        x = F.relu(self.pool(self.conv2(x), 2))
        x = x.reshape(x.shape[0], -1)
        print(x.shape)
        x = self.dropout(F.relu(self.fc1(x)))

        return x



In [5]:
# attempt loading all samples and see which one is faulty
for path in sample_paths:
    try:
        temp, sr = torchaudio.load(path, normalize=True)
    except:
        print(path)
for path in validation_paths:
    try:
        temp, sr = torchaudio.load(path, normalize=True)
    except:
        print(path)

In [17]:
epochs = 5
eval_freq = 1

weight_decay = 5e-4
lr = 0.001

model = SegmentedCNN()
optimizer = optim.Adam(model.parameters(), weight_decay=weight_decay, lr=lr)
criterion = torch.nn.CrossEntropyLoss()

train_loader = DataLoader(GtzanDataset(sample_paths, sample_targets, spectrogram_transform), batch_size=8)
test_loader = DataLoader(GtzanDataset(validation_paths, validation_targets, spectrogram_transform), batch_size=8)

In [18]:
len(train_loader.dataset)

100

In [19]:
all_predictions = None
all_targets = None
results = []
columns = ['ep', 'tr_loss', 'tr_acc', 'te_loss', 'te_acc']
# training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1} started...")
    loss_sum = 0.0
    correct = 0.0
    # TRAIN
    model.train()
    for batch_idx, (data, targets) in enumerate(train_loader):
        # preds_sum = torch.from_numpy(np.zeros((data.shape[0], 10)))
        # # for each patch make a prediction and final prediction is the sum/avg
        # for i in range(27):
        #     patch_data = data[:,i,:,:]
        #     preds = model(patch_data)
        #     preds_sum += preds
        # final_prediction = preds_sum#.div(27)
        print(data.shape)
        final_prediction = model(data)
        print(final_prediction)
        loss = criterion(final_prediction, targets)
        # backward
        optimizer.zero_grad()
        loss.backward()

        # Adam step
        optimizer.step()
        
        loss_sum += loss.item() * data.size(0)
        pred = final_prediction.data.max(1, keepdim=True)[1]
        correct += pred.eq(targets.view_as(pred)).sum().item()
    train_res = {
        'loss': loss_sum / len(train_loader.dataset),
        'accuracy': (correct / len(train_loader.dataset)) * 100.0,    
    }
    print(f"Epoch {epoch + 1} training finished with loss: {train_res['loss']} and accuracy: {train_res['accuracy']}")
        
    loss_sum = 0.0
    correct = 0.0
    # # TEST
    # model.eval()
    # with torch.no_grad():
    #     for batch_idx, (data, targets) in enumerate(test_loader):
    #         preds_sum = torch.from_numpy(np.zeros((data.shape[0], 10)))
    #         # for each patch make a prediction and final prediction is the sum/avg
    #         for i in range(27):
    #             patch_data = data[:,i,:,:]
    #             preds = model(patch_data)
    #             preds_sum += preds
    #         final_prediction = preds_sum#.div(27)
    #         loss = criterion(final_prediction, targets)

    #         loss_sum += loss.item() * data.size(0)
    #         pred = final_prediction.data.max(1, keepdim=True)[1]
    #         correct += pred.eq(targets.view_as(pred)).sum().item()

    #         if epoch == epochs - 1:
    #             # record confusion matrix data
    #             if all_predictions is None:
    #                 all_predictions = final_prediction
    #             else:
    #                 all_predictions = torch.vstack((all_predictions, final_prediction))
    #             if all_targets is None:
    #                 all_targets = targets
    #             else:
    #                 all_targets = torch.vstack((all_targets, targets))

    # test_res = {
    #     'loss': loss_sum / len(test_loader.dataset),
    #     'accuracy': correct / len(test_loader.dataset) * 100.0,    
    # }
    # print(f"Epoch {epoch + 1} eval finished with loss: {test_res['loss']} and accuracy: {test_res['accuracy']}")
    
    
    # values = [epoch + 1, train_res['loss'], train_res['accuracy'], test_res['loss'], test_res['accuracy']]
    # table = tabulate.tabulate([values], columns, tablefmt='simple', floatfmt='8.4f')
    # results.append(table)
    if epoch % 5 == 0:
        table = table.split('\n')
        table = '\n'.join([table[1]] + table)
    else:
        table = table.split('\n')[2]
    print(table) 

Epoch 1 started...
torch.Size([8, 128, 1876])
torch.Size([8, 896640])
tensor([[  0.0000,  10.1179,  46.2915,   0.0000,  17.4552,  55.2144,  46.0332,
          43.7128,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.8851,   0.0000,  17.5415,   0.0000,   0.0000,
           3.5625,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,  31.0938,   0.0000,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,   0.0000,   0.0000,  58.7025,   5.3700,   0.0000,
          72.2399,   0.0000,   0.0000],
        [  0.0000,  35.4370,   0.0000,   0.0000,  43.4813,  29.1596,   0.0000,
           0.0000,   0.0000,   0.0000],
        [ 19.7875,   0.0000,   0.0000,  80.5428,  14.8424,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000],
        [  0.0000, 220.5316,   0.0000,  55.1512,   0.0000,   0.0000,   0.0000,
          21.6237,   0.0000,   0.0000],
        [  8.3944, 197.0452,   8.4101,  22.2783, 153.5576,   0.0000, 100.8699,
           0.0000,

RuntimeError: stack expects each tensor to be equal size, but got [128, 1876] at entry 0 and [128, 1881] at entry 4

In [9]:
for batch_idx, (data, targets) in enumerate(train_loader):
    preds_sum = torch.from_numpy(np.zeros((data.shape[0], 10)))
    # for each patch make a prediction and final prediction is the sum/avg
    for i in range(27):
        patch_data = data[:,i,:,:]
        preds = model(patch_data)
        preds_sum += preds
    final_prediction = preds_sum#.div(27)
    print(final_prediction)
    if batch_idx == 4: break

  return np.array(temp_X)
  return np.array(temp_X)
  return F.softmax(x)
tensor([[ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.]],
       dtype=torch.float64, grad_fn=<AddBackward0>)
tensor([[ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 27.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,