This is my first Kaggle competition.  I've started with the baseline PyTorch kernel <a href="https://www.kaggle.com/fffrrt/all-in-one-rfcx-baseline-for-beginners">All-in-one-rfcx-baseline-for-beginners</a> and tried to make improvements.  So far nothing I've tried has actually improved the score, so not great for confidence building, and a lot of the other notebooks are going over my head just because I'm a relative newcomer to OOP.   But I'm learning a lot for next time!

I've noticed that a lot of the same songs are repeated within a given recording.  So by slicing up recordings to isolate individual songs and training on the slices, those recording-level patterns get lost.  So in this version I'm going to try to create a second model: take the probability vectors from the first model, group them by recording ID, pass the combined vectors through a dense MLP or maybe an LSTM network, and train that model using a custom LWLRAP loss metric.  Then for the final predictions I would run the test set through both models.

In [None]:
# Housekeeping stuff

import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import librosa
import csv
import matplotlib.pyplot as plt
import seaborn as sns
from skimage.transform import resize
from PIL import Image
import random

# When False, the files in the working directory are removed before the
# submission is written; when True they are left on disk.
save_to_disk = False

initial_dir = os.getcwd()
print("Initial working directory path: {0}".format(initial_dir))
current_folder = os.path.basename(os.path.normpath(initial_dir))
print('Working directory name:', current_folder)

# The rest of the notebook reads and writes relative to the 'working' folder,
# so move there when the kernel was started somewhere else.
if current_folder != 'working':
    os.chdir('../working')
cwd = os.getcwd() + '/'
print("Current working directory: {0}".format(cwd))

# Query CUDA once; later cells only look at this flag.
gpu_available = torch.cuda.is_available()

if gpu_available:
    print('CUDA is available! Training can be done on GPU')
else:
    print('CUDA is not available :( Training must be done on CPU')

In [None]:
# Load the two annotation tables shipped with the competition:
# train_tp.csv (true positives) and train_fp.csv (false positives).
data_tp, data_fp = (
    pd.read_csv('../input/rfcx-species-audio-detection/train_tp.csv'),
    pd.read_csv('../input/rfcx-species-audio-detection/train_fp.csv'),
)


In [None]:
# True positive class balance: number of annotations per species_id.
plt.figure(figsize=(16, 8))
# The column is passed as `x=`; recent seaborn releases no longer accept the
# data vector as a positional argument.
sns.countplot(x=data_tp.species_id)
plt.title('True_positives from the training folder')

In [None]:
# False positive class balance: number of annotations per species_id.
plt.figure(figsize=(16, 8))
# The column is passed as `x=`; recent seaborn releases no longer accept the
# data vector as a positional argument.
sns.countplot(x=data_fp.species_id)
plt.title('False positives')

In [None]:
# Preview the first five rows of the true-positive annotations.
data_tp.head(5)

In [None]:
# Preview the first five rows of the false-positive annotations.
data_fp.head(5)

In [None]:
# Checking if there are true positive recordings with multiple labels.

# One row per recording, holding the list of species annotated in it.
data_tp_multi = (
    data_tp.groupby('recording_id')['species_id']
    .apply(list)
    .reset_index()
)
# 'labels' is the number of annotations in the recording; show the
# recordings with the most annotations first.
data_tp_multi['labels'] = data_tp_multi['species_id'].map(len)
data_tp_multi = data_tp_multi.sort_values(by='labels', ascending=False)

In [None]:
# Show the five recordings with the most true-positive annotations.
data_tp_multi.head(5)

In [None]:
# So it would be nice to show this with some graphics, 
# Anyway interesting that the labels are often the same, so there are patterns to be found at the recording level.

Generating Mel spectrograms for training from true positive data

In [None]:
fft = 2048
hop = 512
# Less rounding errors this way
sr = 48000
# Length (in samples) of the slice cut around each labelled song.
# Computed from the nominal 48 kHz rate above; `sr` itself is overwritten by
# librosa.load() inside the loop below.
length = 10 * sr

# newline='' is what the csv module expects when it is handed a file object.
with open('../input/rfcx-species-audio-detection/train_tp.csv', newline='') as f:
    data = list(csv.reader(f))

# The first row is the header
# (recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max); skip it.
annotations = data[1:]

# Check minimum/maximum frequencies for bird calls
# Not necessary, but there is usually plenty of noise in low frequencies, and removing it helps
fmin = 24000
fmax = 0
for row in annotations:
    fmin = min(fmin, float(row[4]))
    fmax = max(fmax, float(row[6]))
# Get some safety margin
fmin = int(fmin * 0.9)
fmax = int(fmax * 1.1)
print('Minimum frequency: ' + str(fmin) + ', maximum frequency: ' + str(fmax))

print('Starting spectrogram generation')
for i, row in enumerate(annotations, start=1):
    recording_id, species_id = row[0], row[1]

    # sr=None keeps the file's native sample rate, so no slow resampling
    wav, sr = librosa.load('../input/rfcx-species-audio-detection/train/' + recording_id + '.flac', sr=None)

    # Annotation times are in seconds; convert them to sample positions
    t_min = float(row[3]) * sr
    t_max = float(row[5]) * sr

    # Positioning sound slice: centre the window on the annotation, then
    # shift it back inside the recording if it sticks out on either side.
    center = np.round((t_min + t_max) / 2)
    beginning = max(center - length / 2, 0)

    ending = beginning + length
    if ending > len(wav):
        ending = len(wav)
        # Clamp at 0: for a recording shorter than `length` a negative start
        # would index from the END of the array and keep only its tail.
        beginning = max(ending - length, 0)

    # Named `clip` so the builtin `slice` is not shadowed
    clip = wav[int(beginning):int(ending)]

    # Mel spectrogram generation
    # Default settings were bad, parameters are adjusted to generate somewhat reasonable quality images
    # The better your images are, the better your neural net would perform
    # You can also use librosa.stft + librosa.amplitude_to_db instead
    # The audio is passed as `y=` because newer librosa releases only accept
    # it as a keyword argument.
    mel_spec = librosa.feature.melspectrogram(y=clip, n_fft=fft, hop_length=hop, sr=sr, fmin=fmin, fmax=fmax, power=1.5)
    mel_spec = resize(mel_spec, (224, 400))

    # Normalize to 0...1 - this is what goes into neural net
    mel_spec = mel_spec - np.min(mel_spec)
    peak = np.max(mel_spec)
    if peak > 0:
        # A constant spectrogram (peak == 0) is left as all zeros instead of
        # being turned into NaNs by a 0/0 division.
        mel_spec = mel_spec / peak

    # And this 0...255 is for the saving in bmp format
    mel_spec = np.round(mel_spec * 255).astype('uint8')

    # File name is <recording_id>_<species_id>_<center sample>.bmp; the
    # species id is parsed back out of the name when the dataset is built.
    bmp = Image.fromarray(mel_spec, 'L')
    bmp.save(cwd + recording_id + '_' + species_id + '_' + str(center) + '.bmp')

    if i % 200 == 0:
        print('Processed ' + str(i) + ' train examples from ' + str(len(data)))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show some spectrograms

Settings and random seeds initialization for reproducible results

In [None]:
# Number of species (output classes) the classifier predicts.
num_birds = 24
# 6GB GPU-friendly (~4 GB used by model); increase if necessary.
# Tried 32 but got no improvement.
batch_size = 16

# This is enough to exactly reproduce results on local machine (Windows / Turing GPU)
# Kaggle GPU kernels (Linux / Pascal GPU) are not deterministic even with random seeds set
# Your score might vary a lot (~up to 0.05) on different runs due to picking different epochs to submit
rng_seed = 1234
os.environ['PYTHONHASHSEED'] = str(rng_seed)
# Seed every random number generator the notebook touches with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(rng_seed)
# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

Model dataset class

In [None]:
import torch.utils.data as torchdata

class RainforestDataset(torchdata.Dataset):
    """In-memory dataset of spectrogram images with one-hot species labels.

    Every file in ``filelist`` is opened once in the constructor and kept in
    memory. The species id is taken from the file name, which is expected to
    look like ``<recording_id>_<species_id>_<...>.bmp`` (the second
    ``_``-separated field is parsed as an integer).

    Args:
        filelist: iterable of image paths, opened relative to the current
            working directory.

    Each item is a ``(spec, label)`` pair:
        spec:  float32 array of shape (3, H, W) with values in 0..1 — the
               single-channel image repeated three times for resnet.
        label: float32 one-hot array of length ``num_birds``.
    """

    def __init__(self, filelist):
        self.specs = []
        self.labels = []
        for f in filelist:
            # Easier to pass species in filename at the start; worth changing later to more capable method
            label = int(f.split('_')[1])
            label_array = np.zeros(num_birds, dtype=np.single)
            label_array[label] = 1.
            self.labels.append(label_array)

            # Open and save spectrogram to memory
            #
            # If you use more spectrograms (add train_fp, for example), then they would not all fit to memory
            # In this case you should load them on the fly in __getitem__
            #
            # The context manager closes the file even if decoding fails.
            # float32 halves the memory of the cached images compared to the
            # float64 a plain `array / 255` would give; the training and
            # prediction loops cast batches to float32 anyway.
            with Image.open(f) as img:
                mel_spec = np.asarray(img, dtype=np.single)

            # Transforming spectrogram from bmp to 0..1 array
            mel_spec = mel_spec / 255
            # Stacking for 3-channel image for resnet
            mel_spec = np.stack((mel_spec, mel_spec, mel_spec))

            self.specs.append(mel_spec)

    def __len__(self):
        """Return the number of cached spectrograms."""
        return len(self.specs)

    def __getitem__(self, item):
        """Return the ``(spec, label)`` pair stored at index ``item``."""
        # Augment here if you want
        return self.specs[item], self.labels[item]


Split the training set into training and validation subsets  
  
What StratifiedKFold does:  
![StratifiedKFold](https://scikit-learn.org/stable/_images/sphx_glr_plot_cv_indices_003.png)

In [None]:
# Collect the spectrogram images from the working directory.
# os.listdir() returns names in arbitrary order, so the list is sorted:
# otherwise the seeded split below could pick different files from one
# machine or run to the next.
file_list = sorted(name for name in os.listdir(cwd) if '.bmp' in name)
# The species id is the second '_'-separated field of the file name; it is
# used to stratify the folds.
label_list = [name.split('_')[1] for name in file_list]

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng_seed)

# Picking only first fold to train/val on
# This means loss of 20% training data
# To avoid this, you can train 5 different models on 5 folds and average predictions
train_index, val_index = next(skf.split(file_list, label_list))
train_files = np.take(file_list, train_index)
val_files = np.take(file_list, val_index)

print('Training on ' + str(len(train_files)) + ' examples')
print('Validating on ' + str(len(val_files)) + ' examples')

Preparing everything for training

In [None]:
!pip install resnest > /dev/null

In [None]:
from resnest.torch import resnest50

# Cache both splits in memory and wrap them in randomly-sampled loaders.
train_dataset = RainforestDataset(train_files)
val_dataset = RainforestDataset(val_files)

train_loader = torchdata.DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=torchdata.RandomSampler(train_dataset),
)
val_loader = torchdata.DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=torchdata.RandomSampler(val_dataset),
)

# ResNeSt: Split-Attention Networks
# https://arxiv.org/abs/2004.08955
# Significantly outperforms standard Resnet

model = resnest50(pretrained=True)

# Replace the stock classifier with a small MLP head that ends in one
# logit per species.
head_layers = [
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, num_birds),
]
model.fc = nn.Sequential(*head_layers)

# Picked for this notebook; pick new ones after major changes (such as adding train_fp to train data)
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0001,
)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.4)

# This loss function is not exactly suited for competition metric, which only cares about ranking of predictions
# Every class gets the same positive weight, equal to the number of classes.
pos_weights = torch.full((num_birds,), float(num_birds))
loss_function = nn.BCEWithLogitsLoss(pos_weight=pos_weights)
# nn.CrossEntropyLoss() was tried here instead but raised an error.

if gpu_available:
    model = model.cuda()
    loss_function = loss_function.cuda()

Training model on saved spectrograms

In [None]:
# Highest number of correctly classified validation samples seen so far;
# decides when the model is saved.
best_corrects = 0

# Train loop
print('Starting training loop')
for e in range(0, 2):    # 32 is fine Tried increasing to 64 but no improvement.  Look into this later when doing more to augment data and monitor losses.
    # Stats
    train_loss = []
    train_corr = []

    # Single epoch - train
    model.train()
    for batch, (data, target) in enumerate(train_loader):
        data = data.float()
        if gpu_available:
            data, target = data.cuda(), target.cuda()

        optimizer.zero_grad()

        output = model(data)
        loss = loss_function(output, target)

        loss.backward()
        optimizer.step()

        # Stats: a sample counts as correct when the highest logit is on the
        # class that is set in the one-hot target.
        answers = output.argmax(dim=1)
        targets = target.argmax(dim=1)
        train_corr.append((answers == targets).sum().item())

        train_loss.append(loss.item())

    # Stats (learning rate of the last parameter group)
    lr = optimizer.param_groups[-1]['lr']
    print('Epoch ' + str(e) + ' training end. LR: ' + str(lr) + ', Loss: ' + str(sum(train_loss) / len(train_loss)) +
          ', Correct answers: ' + str(sum(train_corr)) + '/' + str(len(train_dataset)))

    # Single epoch - validation
    with torch.no_grad():
        # Stats
        val_loss = []
        val_corr = []

        model.eval()
        for batch, (data, target) in enumerate(val_loader):
            data = data.float()
            # Same flag as the training branch, so both phases always agree
            # on where the tensors live.
            if gpu_available:
                data, target = data.cuda(), target.cuda()

            output = model(data)
            loss = loss_function(output, target)
            #loss = LWLRAP(output.cpu(), target.cpu()) # I don't think this achieves much, since we're training on 10 second clips around the labelled song.

            # Stats
            answers = output.argmax(dim=1)
            targets = target.argmax(dim=1)
            val_corr.append((answers == targets).sum().item())

            # Store a plain float: appending the tensor itself would keep it
            # on the device and make the epoch summary print a tensor repr.
            val_loss.append(loss.item())

    # Stats
    print('Epoch ' + str(e) + ' validation end. LR: ' + str(lr) + ', Loss: ' + str(sum(val_loss) / len(val_loss)) +
          ', Correct answers: ' + str(sum(val_corr)) + '/' + str(len(val_dataset)))

    # If this epoch is better than previous on validation, save model
    # Validation loss is the more common metric, but in this case our loss is misaligned with competition metric, making accuracy a better metric
    if sum(val_corr) > best_corrects:
        print('Saving new best model at epoch ' + str(e) + ' (' + str(sum(val_corr)) + '/' + str(len(val_dataset)) + ')')
        # Saves the whole module (not just a state_dict); the later cells
        # restore it with torch.load().
        torch.save(model, cwd + 'best_model.pt')
        best_corrects = sum(val_corr)

    # Call every epoch
    scheduler.step()

# Free memory
del model

Pseudocode:
* Take each of the previously prepared annotated slices
* Run them through the model to produce a probability vector
* Group the vectors and their labels by sound recording (up to five per recording, so 24 x n)
* Order by t_min so they are in time order
* Flatten into a single vector
* For each group with n > 1, train a dense MLP with about four or five layers: the input is the 24 x n vector, followed by hidden layers of 256, 256, 256, and the output is a 24 x n vector again
* Use the LWLRAP metric on both to give a training loss.  (It works on a 24 x n tensor)

In [None]:
# Custom loss function for LW-LRAP
# https://www.kaggle.com/c/rfcx-species-audio-detection/discussion/198418   
# label-level average
# Assume float preds [BxC], labels [BxC] of 0 or 1

def LWLRAP(preds, labels):
    """Label-weighted label-ranking average precision.

    Args:
        preds: float tensor [BxC] of prediction scores (higher = more likely).
        labels: tensor [BxC] of 0/1 ground-truth indicators, on the same
            device as ``preds``.

    Returns:
        The score as a Python float. It is NaN when ``labels`` contains no
        positive entry at all (0 / 0).
    """
    # Classes of every row ordered from highest to lowest prediction
    ranked_classes = torch.argsort(preds, dim=-1, descending=True)
    # class_ranks[i, c] is the 1-based rank of class c in row i. The argsort
    # of a permutation is its inverse, which replaces the explicit double
    # loop over rows and classes.
    class_ranks = torch.argsort(ranked_classes, dim=-1) + 1
    # Mask out to only use the ranks of relevant GT labels; every other
    # class is pushed to a huge rank so it sorts to the back.
    ground_truth_ranks = class_ranks * labels + (1e6) * (1 - labels)
    # All the GT ranks are in front now
    sorted_ground_truth_ranks, _ = torch.sort(ground_truth_ranks, dim=-1, descending=False)
    # Position k (1-based) among the sorted GT ranks; for the k-th best GT
    # label, k / rank is its precision. Built on the labels' device so the
    # function also works on GPU tensors.
    pos_matrix = torch.arange(1, labels.size(-1) + 1, device=labels.device).unsqueeze(0)
    score_matrix = pos_matrix / sorted_ground_truth_ranks
    # Keep only as many leading positions per row as the row has GT labels
    score_mask_matrix, _ = torch.sort(labels, dim=-1, descending=True)
    scores = score_matrix * score_mask_matrix
    # Average over every positive label in the batch
    score = scores.sum() / labels.sum()
    return score.item()

In [None]:
# Prepare the image files for classification by the trained model

# Empty frame that will receive one row per image: the file id followed by
# one probability column per species (s0 .. s23).
probability_df = pd.DataFrame(
    columns=['recording_id'] + ['s' + str(k) for k in range(24)]
)

# Every spectrogram image in the working directory, plus the species id
# encoded as the second '_'-separated field of its name.
file_list = [name for name in os.listdir(cwd) if '.bmp' in name]
label_list = [name.split('_')[1] for name in file_list]

print('A total of {} .bmp image files found'.format(len(file_list)))

# Default DataLoader settings: batch size 1, no shuffling, so batch index k
# corresponds to file_list[k].
train_dataset = RainforestDataset(file_list)
train_loader = torchdata.DataLoader(train_dataset)


In [None]:
sigmoid = torch.nn.Sigmoid()

# torch.save() stored the complete module, so torch.load() gives back a
# ready-to-use model; building a fresh pretrained resnest50 first would only
# be thrown away.
# NOTE(review): recent PyTorch releases may need weights_only=False to load a
# fully pickled module — confirm against the installed version.
model = torch.load(cwd + 'best_model.pt')
model.eval()

if gpu_available:
    model.cuda()

# Prediction loop
print('Starting prediction loop on whole training set', len(file_list))

# No gradients are needed for inference
with torch.no_grad():
    for batch, (data, target) in enumerate(train_loader):  #default batch size = 1
        data = data.float()
        if gpu_available:
            data = data.cuda()

        output = sigmoid(model(data)).cpu()  # converting output to a probability between 0 and 1

        # Append a line to the dataframe. The loader is unshuffled with batch
        # size 1, so `batch` is also the index into file_list.
        file_id = str.split(file_list[batch], '.')[0]
        probability_df.loc[batch] = [file_id] + output.flatten().tolist()

        # Progress is reported from this loop's own counter and file list
        if batch % 200 == 0 and batch > 0:
            print('Predicted for ' + str(batch) + ' of ' + str(len(file_list)) + ' files')

print('Prediction vectors ready')

In [None]:
# Preview the first five rows of predicted probabilities.
probability_df.head(5)

Function to split and load one test file

In [None]:
# Already defined above; for reference

# fft = 2048
# hop = 512
# sr = 48000
# length = 10 * sr

def load_test_file(f):
    """Load one test recording and return a spectrogram per segment.

    The audio is cut into consecutive windows of ``length`` samples. The
    last window is taken from the end of the recording, so it may overlap
    the previous one. Each window is converted with the same mel-spectrogram
    settings used for training.

    Args:
        f: file name inside the competition's ``test/`` folder.

    Returns:
        List of arrays of shape (3, 224, 400) with values normalised to
        0..1, one per segment.
    """
    wav, sr = librosa.load('../input/rfcx-species-audio-detection/test/' + f, sr=None)

    # Split for enough segments to not miss anything
    segments = int(np.ceil(len(wav) / length))

    mel_array = []

    for i in range(0, segments):
        start = i * length
        stop = start + length
        # Last segment going from the end
        if stop > len(wav):
            stop = len(wav)
            # Clamp at 0: for audio shorter than `length` a negative start
            # would index from the END of the array and keep only its tail.
            start = max(stop - length, 0)
        clip = wav[start:stop]

        # Same mel spectrogram as before; the audio is passed as `y=` because
        # newer librosa releases only accept it as a keyword argument.
        mel_spec = librosa.feature.melspectrogram(y=clip, n_fft=fft, hop_length=hop, sr=sr, fmin=fmin, fmax=fmax, power=1.5)
        mel_spec = resize(mel_spec, (224, 400))

        mel_spec = mel_spec - np.min(mel_spec)
        peak = np.max(mel_spec)
        if peak > 0:
            # A constant spectrogram stays all zeros instead of becoming NaN
            mel_spec = mel_spec / peak

        # Three identical channels, as the network expects an RGB-like input
        mel_spec = np.stack((mel_spec, mel_spec, mel_spec))

        mel_array.append(mel_spec)

    return mel_array

Submitting predictions with best model

In [None]:
# Loading model back
# torch.save() stored the complete module, so torch.load() gives back a
# ready-to-use model; building a fresh pretrained resnest50 first would only
# be thrown away.
# NOTE(review): recent PyTorch releases may need weights_only=False to load a
# fully pickled module — confirm against the installed version.
model = torch.load(cwd + 'best_model.pt')
model.eval()

# Scoring does not like many files:(
if not save_to_disk:
    for f in os.listdir():
        # os.remove() raises on directories, so only plain files are deleted
        if os.path.isfile(f):
            os.remove(f)

if gpu_available:
    model.cuda()

# Prediction loop
print('Starting prediction loop')
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    submission_writer.writerow(['recording_id','s0','s1','s2','s3','s4','s5','s6','s7','s8','s9','s10','s11',
                               's12','s13','s14','s15','s16','s17','s18','s19','s20','s21','s22','s23'])

    test_files = os.listdir('../input/rfcx-species-audio-detection/test/')
    print(len(test_files))

    # No gradients are needed for inference
    with torch.no_grad():
        # Every test file is split on several chunks and prediction is made for each chunk
        for i, test_file in enumerate(test_files):
            # Stack the chunks into one array first; building a tensor from
            # a Python list of arrays is much slower.
            data = torch.from_numpy(np.stack(load_test_file(test_file)))
            data = data.float()
            if gpu_available:
                data = data.cuda()

            output = model(data)

            # Taking max prediction from all slices per bird species
            # Usually you want Sigmoid layer here to convert output to probabilities
            # In this competition only relative ranking matters, and not the exact value of prediction, so we can use it directly
            maxed_output = torch.max(output, dim=0)[0].cpu()

            file_id = str.split(test_file, '.')[0]
            submission_writer.writerow([file_id] + maxed_output.tolist())

            if i % 200 == 0 and i > 0:
                # Total is len(test_files); the previous "+ 1" overstated it
                print('Predicted for ' + str(i) + ' of ' + str(len(test_files)) + ' files')

print('Submission generated')

In [None]:
# Free memory: drop the notebook's reference to the model, then, when a GPU
# is in use, ask PyTorch to empty its CUDA cache.
del model
if gpu_available:
    torch.cuda.empty_cache()