In [None]:
# Import the packages we'll use

import numpy as np
import os, glob, csv

# librosa is a widely-used audio processing library
import librosa

import sklearn

import torch
import torch.nn as nn
import torch.nn.functional as nnF

# for plotting
%matplotlib inline
import matplotlib.pyplot as plt

import math

# for accuracy and confusion matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# for data normalization
from sklearn.preprocessing import StandardScaler

In [None]:
# USER CONFIGURATION
# Edit these paths so that they point at your local copy of the data.
# CSV file with one row per audio clip; the loading cell reads its 'itemid' and 'hasbird' columns.
binarylabelcsv  = os.path.expanduser("~/shared_storage/ECS7013P/bird_audio_detection/warblrb10k_public_metadata_2018.csv")
# Folder holding the audio clips; each clip is looked up as "<itemid>.wav" inside this folder.
audiofilefolder = os.path.expanduser("~/shared_storage/ECS7013P/warblrb10k_public_wav")

# Upper bound on the number of clips that are loaded. We experiment with 100 files here
# (1000 is another value that has been tried); in practice, choose it according to your
# actual training, validation and test data. The feature arrays and the 80/10/10 split
# further down are both sized from this number.
maxfilestoload  = 100      # limit, because loading the whole dataset is very slow


In [None]:
# Load the metadata labels into a dict that maps itemid -> hasbird (stored as a float).
binarylabels = {}
# newline='' leaves newline handling to the csv module, as its documentation recommends
# (otherwise newlines embedded in quoted fields may be misread).
with open(binarylabelcsv, 'r', newline='') as infp:
    rdr = csv.DictReader(infp)
    for row in rdr:
        binarylabels[row['itemid']] = float(row['hasbird'])
        if len(binarylabels) == maxfilestoload:
            break  # note, here we are restricting the maximum number of rows.

fkeys = sorted(binarylabels.keys())
# inspect the first few (itemid, hasbird) pairs
for i, kv in enumerate(binarylabels.items()):
    print(kv)
    if i == 10:
        break

In [None]:
def extract_melspectrogram(filename, win_len=0.05, hop_len=0.025, n_mels=64):
    """Load one audio clip and convert it to a mel spectrogram.

    Args:
        filename: clip identifier; the file read is "<audiofilefolder>/<filename>.wav".
        win_len: analysis window length in seconds (default 0.05, i.e. 50 ms).
        hop_len: hop between successive windows in seconds (default 0.025, i.e. 25 ms).
        n_mels: number of mel bands.

    Returns:
        The mel spectrogram transposed to shape (time_len, n_mels).
    """
    # the audio is resampled to 22050 Hz on load, so the window sizes below are predictable
    audio, sr = librosa.load(os.path.join(audiofilefolder, "%s.wav" % filename), sr=22050)
    # convert the durations from seconds to a number of samples
    win_length = int(win_len * sr)
    hop_length = int(hop_len * sr)
    # `y` and `sr` are keyword-only in librosa >= 0.10 (passing them positionally raises a
    # TypeError there); the keyword form is accepted by older releases as well.
    spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels, n_fft=2048,
                                          win_length=win_length, hop_length=hop_length)
    # librosa returns (n_mels, time_len); return data format (time_len, n_mels)
    return spec.transpose((1, 0))
# - Load the data
# - Extract mel spectrograms
# - Annotation: one element corresponding to one audio file
#
# The arrays are sized from the number of labels that were actually read, so that a CSV
# with fewer rows than maxfilestoload does not leave all-zero "samples" at the end.
num_files = len(binarylabels)
data = np.zeros((num_files, 400, 64))  # mel spectrograms: (file, time frame, mel band)
label = np.zeros(num_files)  # the annotation, one value per file
for i, (itemid, hasbird) in enumerate(binarylabels.items()):
    print(itemid)
    # The number of time frames varies a bit (due to small differences in audio length).
    # For simplicity keep at most 400 frames; a clip that is shorter than that only fills
    # the first rows and the remaining frames keep their zero initialisation.
    spec = extract_melspectrogram(itemid)[:data.shape[1]]
    data[i, :len(spec)] = spec
    label[i] = hasbird

In [None]:
# Split the samples, in their stored order, into
#   training   (first 80%)
#   validation (next 10%)
#   test       (last 10%)
num_samples = len(data)
train_end = int(0.8 * num_samples)  # first index after the training portion
valid_end = int(0.9 * num_samples)  # first index after the validation portion

# np.split cuts at the two boundaries and returns the three consecutive pieces
train_data, valid_data, test_data = np.split(data, [train_end, valid_end])
train_label, valid_label, test_label = np.split(label, [train_end, valid_end])

# sanity check: shapes of the training, validation and test data
for subset in (train_data, valid_data, test_data):
    print(subset.shape)

In [None]:
# Data normalisation: the parameters are computed on the training data only and are then
# applied unchanged to the training, validation and test data.
scaler = StandardScaler()
# The scaler works on 2-D input (samples, features). Reshaping to (-1, 64) folds the file
# and time axes together, so each of the 64 mel bands is treated as one feature.
scaler.fit(train_data.reshape((-1, 64)))
print(scaler.mean_)


def _standardise(spectrograms):
    """Normalise with the fitted scaler and reverse back to the (-1, 400, 64) shape."""
    flat = spectrograms.reshape((-1, 64))
    return scaler.transform(flat).reshape((-1, 400, 64))


train_data = _standardise(train_data)
valid_data = _standardise(valid_data)
test_data = _standardise(test_data)

In [None]:
def init_layer(layer):
    """Initialize a Linear or Convolutional layer with He-uniform weights and a zero bias.

    The fan-in is the number of inputs feeding one output unit, i.e. the product of all
    weight dimensions except the first. This covers Linear (n_out, n_in) and Conv2d
    (n_out, n_in, height, width) as before, and also convolutions of other ranks.

    Ref: He, Kaiming, et al. "Delving deep into rectifiers: Surpassing
    human-level performance on imagenet classification." Proceedings of the
    IEEE international conference on computer vision. 2015.

    Args:
        layer: module with a `weight` of at least 2 dimensions and a `bias` attribute
            (which may be None).

    Raises:
        ValueError: if the weight has fewer than 2 dimensions, so no fan-in is defined.
    """
    weight = layer.weight
    if weight.ndimension() < 2:
        raise ValueError(
            'init_layer needs a weight with at least 2 dimensions, got %d'
            % weight.ndimension())

    # number of weights per output unit
    fan_in = weight.numel() // weight.size(0)

    std = math.sqrt(2. / fan_in)
    # a uniform distribution on [-scale, scale] has standard deviation scale / sqrt(3)
    scale = std * math.sqrt(3.)
    weight.data.uniform_(-scale, scale)

    if layer.bias is not None:
        layer.bias.data.fill_(0.)

In [None]:
def init_bn(bn):
    """Initialize a Batchnorm layer to the identity affine transform (scale 1, shift 0).

    Layers created with affine=False have no weight or bias (both are None); they are
    left untouched instead of raising an AttributeError.
    """
    if bn.weight is not None:
        bn.weight.data.fill_(1.)
    if bn.bias is not None:
        # reset the shift too, so the layer is fully re-initialised
        bn.bias.data.fill_(0.)

In [None]:
class CnnModel(nn.Module):
    """The CNN model: three conv + batch norm + ReLU + max-pool stages and two dense layers.

    Input:  (batch_size, time_len, mel_bins). The dense layers are sized for
            spectrograms of 400 time frames x 64 mel bins.
    Output: (batch_size, 1), the result of a sigmoid, i.e. values in [0, 1].

    Feature-map shapes for a (batch_size, 400, 64) input (all strides are defaults:
    1x1 for the convolutions, equal to the kernel size for the poolings):
        reshaped input           -> (batch_size,   1, 400, 64)
        conv1 5x5                -> (batch_size,  64, 396, 60)
        pool1 (8,4), pad (4,0)   -> (batch_size,  64,  50, 15)
        conv2 5x5                -> (batch_size, 128,  46, 11)
        pool2 (8,4), pad (2,1)   -> (batch_size, 128,   6,  3)
        conv3 3x3                -> (batch_size, 128,   4,  1)
        pool3 (2,1)              -> (batch_size, 128,   2,  1)
        flattened                -> (batch_size, 256)
    """
    def __init__(self):

        super(CnnModel, self).__init__()

        # 1st 2D conv layer: 64 kernels of size 5x5.
        # The spectrogram is a single-channel "image", hence in_channels=1.
        # The bias is disabled because the batch norm that follows has its own shift.
        # Ref: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        self.conv1 = nn.Conv2d(in_channels=1,
                               out_channels=64,
                               kernel_size=5,
                               bias=False)

        # 2nd 2D conv layer: 128 kernels of size 5x5.
        # in_channels equals the number of kernels of the previous conv layer.
        self.conv2 = nn.Conv2d(in_channels=64,
                               out_channels=128,
                               kernel_size=5,
                               bias=False)

        # 3rd 2D conv layer: 128 kernels of size 3x3.
        self.conv3 = nn.Conv2d(in_channels=128,
                               out_channels=128,
                               kernel_size=3,
                               bias=False)

        # Fully connected (dense) layer with 128 hidden units.
        # in_features is the size of the flattened feature map after the last pooling:
        # 128 channels x 2 x 1 = 256 (see the shape table in the class docstring).
        # Ref: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        self.fc1 = nn.Linear(128 * 2 * 1,
                             128,
                             bias=True)
        # output layer: one unit for the binary decision
        self.fc2 = nn.Linear(128, 1, bias=True)

        # batch normalisation layers, one per conv layer (sizes match out_channels)
        self.bn1 = nn.BatchNorm2d(64)
        self.bn2 = nn.BatchNorm2d(128)
        self.bn3 = nn.BatchNorm2d(128)

        # call to initialise the network's weights
        self.init_weights()

    def init_weights(self):
        """Initialise the conv layers, fc1 and the batch norm layers.

        fc2 is not touched here and keeps PyTorch's default initialisation.
        """
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_layer(self.conv3)
        init_layer(self.fc1)

        init_bn(self.bn1)
        init_bn(self.bn2)
        init_bn(self.bn3)

    def forward(self, x):
        """Map a (batch_size, time_len, mel_bins) tensor to (batch_size, 1) probabilities."""
        # add the channel axis: (batch_size, channels=1, time_len, frequency_len)
        x = x.unsqueeze(1)

        # 1st conv layer + batch norm + relu activation (default stride 1x1, no padding,
        # so each spatial dimension shrinks by kernel_size - 1)
        x = nnF.relu(self.bn1(self.conv1(x)))

        # max pooling with kernel size (8, 4); the stride defaults to the kernel size.
        # The padding of 4 along time lets the frames that do not fill a complete
        # window of 8 still contribute to an output.
        x = nnF.max_pool2d(x, kernel_size=(8, 4), padding=(4, 0))

        # 2nd conv layer + batch norm + relu activation
        x = nnF.relu(self.bn2(self.conv2(x)))

        # max pooling with kernel size (8, 4), padded on both axes
        x = nnF.max_pool2d(x, kernel_size=(8, 4), padding=(2, 1))

        # 3rd conv layer + batch norm + relu activation
        x = nnF.relu(self.bn3(self.conv3(x)))

        # max pooling with kernel size (2, 1)
        x = nnF.max_pool2d(x, kernel_size=(2, 1))

        # flatten the feature map into a vector
        x = x.view(-1, self.num_flat_features(x))
        # the first dense layer + relu activation
        x = nnF.relu(self.fc1(x))
        # the second dense layer + sigmoid activation: the sigmoid squashes the output
        # into [0, 1], which is what the binary cross-entropy loss (nn.BCELoss) expects
        x = torch.sigmoid(self.fc2(x))

        return x

    def forward_and_convert(self, x):
        "Handles the torch<--->numpy tensor conversion, for convenience"
        x_torch = torch.FloatTensor(x)
        # this helper only returns numpy values, so no autograd graph is needed
        with torch.no_grad():
            y_torch = self.forward(x_torch)
        return y_torch.detach().numpy()

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all dimensions of x except the batch dimension."""
        num_features = 1
        for s in x.size()[1:]:
            num_features *= s
        return num_features

In [None]:
# Fix the random seeds so that the weight initialisation (torch) and the shuffling of the
# training data (np.random.permutation in shuffle_data) are repeatable from run to run.
random_seed = 0
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# create a model instance
net = CnnModel()
print(net)

# Binary-cross entropy loss, closely related to logistic regression loss
criterion = nn.BCELoss()

# Adam Optimizer, learning rate 0.001
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.)

In [None]:
# minibatch size (remember stochastic gradient descent?)
batch_size = 4

# some helpful functions

def evaluate(model, data):
    """Evaluate a network "model" on the data "data".

    The data are processed in chunks of `batch_size` samples (the last chunk may be
    smaller). Each network output lies in [0, 1] after the sigmoid and is thresholded
    at 0.5 to obtain the discrete label.

    Args:
        model: object with a `forward_and_convert(batch)` method that returns one
            output value per sample as a numpy array.
        data: array-like whose first axis indexes the samples.

    Returns:
        1-D numpy array with one predicted class label (0. or 1.) per data sample.
    """
    pred = np.zeros(len(data))  # for storing predicted class labels, one for each data sample

    # Batch norm has to use its running statistics while predicting. Remember the
    # current mode so that a model that is being trained keeps training afterwards.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            # step through the data batch by batch; the slice is clipped at the end
            # of the data, so trailing samples are handled by the same code
            for start in range(0, len(data), batch_size):
                stop = start + batch_size
                temp = model.forward_and_convert(data[start:stop])
                pred[start:stop] = np.asarray(temp).reshape(-1)
    finally:
        if was_training:
            model.train()

    # each element in "pred" is the output after sigmoid function and has value in [0, 1].
    # to obtain the discrete label (0 or 1 in this case), we threshold the value by 0.5.
    pred[pred >= 0.5] = 1.
    pred[pred < 0.5] = 0.
    return pred

def shuffle_data(data, label):
    """Randomly shuffle the data, keeping every sample paired with its label.

    It is used to re-order the training data once per training epoch.

    Returns:
        A tuple (data, label), both re-ordered by the same random permutation.
    """
    # one random ordering of the sample indices, shared by both arrays
    order = np.random.permutation(len(data))
    shuffled_data = data[order]
    shuffled_label = label[order]
    return shuffled_data, shuffled_label

In [None]:
# The training loop

num_epochs = 100 # the number of training epoch (i.e. when you've gone through all samples of the training data, that's one epoch)
evaluate_every_epoch = 1 # how often you want to evaluate the network during training?
# best accuracy on the validation data so far; starts below any attainable accuracy so
# that the first evaluation always writes a checkpoint for the test cell to load
best_valid_acc = -1.0
saved_model = './best_model' # path for saving the best model during training

for epoch in range(num_epochs):
    # make sure batch norm is in training mode, whatever the evaluation step left behind
    net.train()

    # shuffle training data
    train_data, train_label = shuffle_data(train_data, train_label)

    # the number of (complete) minibatches in one epoch
    num_batch = len(train_data) // batch_size
    running_loss = 0.0
    for i in range(num_batch):
        # sample the i-th minibatch: samples [i*batch_size, (i+1)*batch_size)
        start = i * batch_size
        stop = start + batch_size
        batch_data = train_data[start:stop]
        label_data = train_label[start:stop]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(torch.FloatTensor(batch_data))
        # squeeze(1) turns (batch, 1) into (batch,) so it matches the labels,
        # also when a batch holds a single sample
        loss = criterion(outputs.squeeze(1), torch.FloatTensor(label_data))

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # print training loss, averaged over the minibatches of this epoch
    print('[%d] loss: %.8f' % (epoch, running_loss / max(num_batch, 1)))

    # evaluate the network on the validation data
    if (epoch + 1) % evaluate_every_epoch == 0:
        valid_pred = evaluate(net, valid_data)
        valid_acc = accuracy_score(valid_label, valid_pred)
        print('Validation accuracy: %g' % valid_acc)

        # if the best validation performance so far, save the network to file
        if best_valid_acc < valid_acc:
            print('Saving best model')
            best_valid_acc = valid_acc
            # save only the learned parameters (state_dict), the recommended format
            # Ref: https://pytorch.org/tutorials/beginner/saving_loading_models.html
            torch.save(net.state_dict(), saved_model)

In [None]:
# When you are here, we have the best model saved in file.
# Then, load the saved model, and evaluate it on the test data.
net = CnnModel()
# restore the parameters that were saved at the path in 'saved_model'
# Ref: https://pytorch.org/tutorials/beginner/saving_loading_models.html
net.load_state_dict(torch.load(saved_model))
# inference mode: batch norm uses its running statistics instead of batch statistics
net.eval()

# evaluate on the test data
test_pred = evaluate(net, test_data)
print(test_pred)

# test accuracy (scikit-learn's argument order is y_true, y_pred)
test_acc = accuracy_score(test_label, test_pred)
print('Test accuracy: %g' % test_acc)

# confusion matrix: rows are the true labels, columns the predicted labels
print('Confusion_matrix')
print(confusion_matrix(test_label, test_pred))