WARNING: The kernel fails to score automatically if more than one file is saved to disk. You can still download the prediction and submit it manually. To allow saving the model/spectrograms, change the setting below.

In [None]:
# Flag read by the prediction cell: when it equals 0, the spectrogram images
# under /kaggle/working/tp/ and /kaggle/working/fp/ are deleted before the
# submission file is written. Any other value leaves them on disk.
save_to_disk = 0

In [None]:
import csv
import librosa
import numpy as np
from skimage.transform import resize
from PIL import Image

In [None]:
import os
# Output folders for the generated spectrogram images ('tp' and 'fp' sets).
# exist_ok=True keeps this cell re-runnable: plain os.mkdir raises
# FileExistsError when the folder is left over from a previous run.
os.makedirs('/kaggle/working/tp/', exist_ok=True)
os.makedirs('/kaggle/working/fp/', exist_ok=True)

In [None]:
# Read both annotation tables into lists of string rows; the first line of
# each file (the header) is consumed and dropped before the rows are collected.
with open('/kaggle/input/rfcx-species-audio-detection/train_tp.csv') as tp_file:
    tp_rows = csv.reader(tp_file)
    next(tp_rows, None)
    tp_data = [row for row in tp_rows]

with open('/kaggle/input/rfcx-species-audio-detection/train_fp.csv') as fp_file:
    fp_rows = csv.reader(fp_file)
    next(fp_rows, None)
    fp_data = [row for row in fp_rows]

In [None]:
# FFT window size and hop length (in samples) handed to the mel spectrogram call
fft = 2048
hop = 512

# Sample rate the window length is derived from (fewer rounding errors this way)
sr = 48_000
# Number of samples in one 10-second analysis window
length = sr * 10

# How many false-positive rows are rendered to spectrograms and loaded later
fp_train_len = 500

In [None]:
def spectrogram_generation(data, data_type, max_len):
    """Render one mel-spectrogram bitmap per annotation row.

    For every processed row the recording is loaded, a window of `length`
    samples is positioned around the centre of the annotated call, converted
    to a mel spectrogram, resized to 224x400, min-max scaled to 0..255 and
    saved as an 8-bit greyscale BMP under /kaggle/working/<data_type>/.
    The file name encodes recording id, species id and the window centre.

    Args:
        data: rows of (recording_id, species_id, songtype_id, t_min, f_min,
            t_max, f_max), each field a string.
        data_type: sub-folder name under /kaggle/working/ (e.g. 'tp', 'fp').
        max_len: maximum number of rows to process; 0 or a negative value
            means "all rows". Values larger than len(data) are capped.

    Returns:
        (fmin, fmax): the frequency band, widened by a 10% margin, that
        covers the f_min/f_max of every row in `data` (not only the processed
        ones).
    """
    # Check minimum/maximum frequencies for bird calls.
    # Not necessary, but there is usually plenty of noise in low frequencies,
    # and removing it helps.
    fmin = 24000
    fmax = 0
    for row in data:
        fmin = min(fmin, float(row[4]))
        fmax = max(fmax, float(row[6]))
    # Get some safety margin
    fmin = int(fmin * 0.9)
    fmax = int(fmax * 1.1)
    print('Minimum frequency: ' + str(fmin) + ', maximum frequency: ' + str(fmax))

    len_data = len(data)
    if max_len > 0:
        # Cap at the number of available rows so data[i] never goes out of range
        len_data = min(max_len, len_data)
    print('Starting spectrogram generation')
    for i in range(0, len_data):
        row = data[i]
        # sr=None keeps the file's native sample rate (no resampling).
        # NOTE(review): `length` is a module-level constant derived from
        # 48000 Hz; files at another rate would get a window of a different
        # duration - confirm all inputs are 48 kHz.
        wav, wav_sr = librosa.load(
            '/kaggle/input/rfcx-species-audio-detection/train/' + row[0] + '.flac', sr=None)

        t_min = float(row[3]) * wav_sr
        t_max = float(row[5]) * wav_sr

        # Positioning sound slice: centre the window on the call, then shift
        # it back inside the recording if it sticks out on either side.
        center = np.round((t_min + t_max) / 2)
        beginning = center - length / 2
        if beginning < 0:
            beginning = 0

        ending = beginning + length
        if ending > len(wav):
            ending = len(wav)
            # Clamp at 0: for a recording shorter than the window the start
            # would go negative, which numpy reads as an offset from the end.
            beginning = max(0, ending - length)

        segment = wav[int(beginning):int(ending)]

        # Mel spectrogram generation.
        # `y` is passed by keyword: newer librosa releases no longer accept
        # the signal as a positional argument.
        mel_spec = librosa.feature.melspectrogram(
            y=segment, n_fft=fft, hop_length=hop, sr=wav_sr, fmin=fmin, fmax=fmax, power=1.5)
        mel_spec = resize(mel_spec, (224, 400))

        # Normalize to 0...1 - this is what goes into neural net
        mel_spec = mel_spec - np.min(mel_spec)
        peak = np.max(mel_spec)
        if peak > 0:
            # A constant (e.g. silent) spectrogram has peak 0; leave it all-zero
            # instead of dividing by zero and writing NaNs.
            mel_spec = mel_spec / peak

        # And this 0...255 is for the saving in bmp format
        mel_spec = np.round(mel_spec * 255).astype('uint8')

        bmp = Image.fromarray(mel_spec, 'L')
        bmp.save('/kaggle/working/' + data_type + '/' + row[0] + '_' + row[1] + '_' + str(center) + '.bmp')

        if i % 100 == 0:
            print('Processed ' + str(i) + ' train examples from ' + str(len_data))
    return fmin, fmax

In [None]:
# True-positive set: max_len=0 processes every row. The returned frequency
# band is kept in the globals fmin/fmax, which load_test_file reads later.
fmin, fmax = spectrogram_generation(data=tp_data, data_type='tp', max_len=0)
# False-positive set: only the first fp_train_len rows are rendered. The band
# returned here is computed from fp_data and is not stored.
spectrogram_generation(data=fp_data, data_type='fp', max_len=fp_train_len)

Settings and random seeds initialization for reproducible results

In [None]:
import os
import torch
import random

# Number of species classes (size of the one-hot label / model output)
num_birds = 24
# 6GB GPU-friendly (~4 GB used by model); increase if necessary
batch_size = 16

# Author's note: this is enough to exactly reproduce results on a local machine
# (Windows / Turing GPU). Kaggle GPU kernels (Linux / Pascal GPU) are not
# deterministic even with random seeds set, so the score might vary a lot
# (~up to 0.05) between runs due to picking different epochs to submit.
rng_seed = 1234
os.environ['PYTHONHASHSEED'] = str(rng_seed)
# Seed every random number generator used in the notebook with the same value
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(rng_seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
import torch.utils.data as torchdata

class RainforestDataset(torchdata.Dataset):
    """In-memory dataset of spectrogram bitmaps with one-hot species labels.

    File names must contain the species id as their second '_'-separated
    field (e.g. '<recording>_<species>_<center>.bmp').
    """

    def __init__(self, filelist, data_type):
        """Load every bitmap named in `filelist` into memory.

        Args:
            filelist: iterable of bitmap file names (not full paths).
            data_type: sub-folder under /kaggle/working/ that holds the
                files (e.g. 'tp' or 'fp').
        """
        self.specs = []
        self.labels = []
        for f in filelist:
            # Easier to pass species in filename at the start; worth changing later to more capable method
            label = int(f.split('_')[1])
            label_array = np.zeros(num_birds, dtype=np.single)
            label_array[label] = 1.
            self.labels.append(label_array)

            # Open and save spectrogram to memory.
            # If you use more spectrograms (add train_fp, for example), then they
            # would not all fit to memory; in that case load them on the fly in
            # __getitem__.
            # The context manager closes the file even if decoding raises.
            # float32 is used because the training loop casts batches to float
            # anyway; keeping float64 here would only double the memory held.
            with Image.open('/kaggle/working/' + data_type + '/' + f) as img:
                mel_spec = np.array(img, dtype=np.single)

            # Transforming spectrogram from bmp (0..255) to 0..1 array
            mel_spec = mel_spec / 255
            # Stacking for 3-channel image for resnet
            mel_spec = np.stack((mel_spec, mel_spec, mel_spec))

            self.specs.append(mel_spec)

    def __len__(self):
        """Return the number of loaded spectrograms."""
        return len(self.specs)

    def __getitem__(self, item):
        """Return the (spectrogram, one-hot label) pair at index `item`."""
        # Augment here if you want
        return self.specs[item], self.labels[item]

In [None]:
# Install the `resnest` package (imported in the next cell as resnest.torch);
# pip's standard output is redirected to /dev/null to keep the cell quiet.
!pip install resnest > /dev/null

In [None]:
import torch.nn as nn
from resnest.torch import resnest50

def initModel():
    """Create a fresh network together with its optimizer, LR schedule and loss.

    Returns:
        (model, optimizer, scheduler, loss_function). The model and the loss
        are moved to the GPU when CUDA is available.
    """
    # ResNeSt: Split-Attention Networks - https://arxiv.org/abs/2004.08955
    # (author's note: significantly outperforms standard Resnet)
    net = resnest50(pretrained=True)

    # Swap the final layer for a small MLP that ends in one logit per species
    head_layers = [
        nn.Linear(2048, 1024),
        nn.ReLU(),
        nn.Dropout(p=0.2),
        nn.Linear(1024, 1024),
        nn.ReLU(),
        nn.Dropout(p=0.2),
        nn.Linear(1024, num_birds),
    ]
    net.fc = nn.Sequential(*head_layers)

    # Picked for this notebook; pick new ones after major changes
    # (such as adding train_fp to train data)
    sgd = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0001)
    step_decay = torch.optim.lr_scheduler.StepLR(sgd, step_size=7, gamma=0.4)

    # This loss function is not exactly suited for the competition metric, which
    # only cares about ranking of predictions; exploring different loss
    # functions would be a good idea. Every class gets positive weight num_birds.
    class_weights = torch.full((num_birds,), float(num_birds))
    criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights)

    if not torch.cuda.is_available():
        return net, sgd, step_decay, criterion
    return net.cuda(), sgd, step_decay, criterion.cuda()

In [None]:
from sklearn.model_selection import StratifiedKFold

def _count_matches(output, target):
    """Count rows whose highest-scoring class equals the class marked in the one-hot target."""
    _, answers = torch.max(output, 1)
    _, targets = torch.max(target, 1)
    # One vectorised comparison instead of a Python loop over tensor elements
    return int((answers == targets).sum().item())


def training(tp_file_list, tp_label_list, fp_train_dataset):
    """Train one model per stratified fold and checkpoint its best epoch.

    The true-positive files are split into 5 stratified folds. For each fold a
    new model is created with initModel(), trained for 30 epochs on the train
    part and evaluated on the validation part after every epoch. The
    false-positive dataset is only scored (under no_grad) for monitoring; it
    never contributes to the loss.

    A checkpoint 'best_model_<fold_id>.pt' is written to the current working
    directory whenever validation accuracy strictly improves, or when
    validation accuracy is perfect and training accuracy beats its best so far.

    Args:
        tp_file_list: bitmap file names of the true-positive spectrograms.
        tp_label_list: species label per file, used for stratification.
        fp_train_dataset: dataset of false-positive spectrograms.
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng_seed)
    use_cuda = torch.cuda.is_available()

    for fold_id, (train_index, val_index) in enumerate(skf.split(tp_file_list, tp_label_list)):
        print('fold_id', fold_id)
        best_train_corrects = 0
        best_val_corrects = 0
        model, optimizer, scheduler, loss_function = initModel()
        train_files = np.take(tp_file_list, train_index)
        val_files = np.take(tp_file_list, val_index)

        train_dataset = RainforestDataset(train_files, 'tp')
        val_dataset = RainforestDataset(val_files, 'tp')

        train_loader = torchdata.DataLoader(train_dataset, batch_size=batch_size, sampler=torchdata.RandomSampler(train_dataset))
        val_loader = torchdata.DataLoader(val_dataset, batch_size=batch_size, sampler=torchdata.RandomSampler(val_dataset))
        fp_train_loader = torchdata.DataLoader(fp_train_dataset, batch_size=batch_size, sampler=torchdata.RandomSampler(fp_train_dataset))

        # Train loop
        print('Starting training loop')
        for e in range(0, 30):
            # Stats
            train_loss = []
            train_corr = []

            # Single epoch - train
            model.train()
            for data, target in train_loader:
                data = data.float()
                if use_cuda:
                    data, target = data.cuda(), target.cuda()

                optimizer.zero_grad()

                output = model(data)
                loss = loss_function(output, target)

                loss.backward()
                optimizer.step()

                # Stats
                train_corr.append(_count_matches(output, target))
                train_loss.append(loss.item())

            # Stats: report the learning rate of the last parameter group
            lr = optimizer.param_groups[-1]['lr']
            print('Epoch ' + str(e) + ' training end. LR: ' + str(lr) + ', Loss: ' + str(sum(train_loss) / len(train_loss)) +
                  ', Correct answers: ' + str(sum(train_corr)) + '/' + str(len(train_dataset)))

            # Single epoch - validation
            with torch.no_grad():
                # Stats
                val_loss = []
                val_corr = []

                model.eval()
                for data, target in val_loader:
                    data = data.float()
                    if use_cuda:
                        data, target = data.cuda(), target.cuda()

                    output = model(data)
                    loss = loss_function(output, target)

                    # Stats
                    val_corr.append(_count_matches(output, target))
                    val_loss.append(loss.item())

            # Stats
            print('Epoch ' + str(e) + ' validation end. LR: ' + str(lr) + ', Loss: ' + str(sum(val_loss) / len(val_loss)) +
                  ', Correct answers: ' + str(sum(val_corr)) + '/' + str(len(val_dataset)))

            # Single epoch - fp_train. A "match" here means the model's top
            # class equals the species named in the false-positive file, which
            # is counted as an incorrect answer.
            with torch.no_grad():
                # Stats
                fp_train_incorr = []
                model.eval()
                for data, target in fp_train_loader:
                    data = data.float()
                    if use_cuda:
                        data, target = data.cuda(), target.cuda()

                    output = model(data)

                    # Stats
                    fp_train_incorr.append(_count_matches(output, target))

            # Stats
            print('Epoch ' + str(e) + ' fp_train end. LR: ' + str(lr) +
                  ', Incorrect answers: ' + str(sum(fp_train_incorr)) + '/' + str(len(fp_train_dataset)))

            # If this epoch is better than previous on validation, save model.
            # Validation loss is the more common metric, but in this case our
            # loss is misaligned with competition metric, making accuracy a
            # better metric.
            percent_tp_train = sum(train_corr) / len(train_dataset)
            percent_tp_val = sum(val_corr) / len(val_dataset)
            if (percent_tp_val > best_val_corrects) or (percent_tp_val == 1 and percent_tp_train > best_train_corrects):
                print('Saving new best model at epoch ' + str(e) + ' (' + str(sum(val_corr)) + '/' + str(len(val_dataset)) + ')')
                torch.save(model, 'best_model_'+str(fold_id)+'.pt')
                best_val_corrects = percent_tp_val
                print('Best val corrects at epoch ' + str(e) + " (" + str(best_val_corrects) + ")" )
            if (percent_tp_train > best_train_corrects):
                best_train_corrects = percent_tp_train
                print('Best train corrects at epoch ' + str(e) + " (" + str(best_train_corrects) + ")" )

            # Call every epoch
            scheduler.step()

    # Free memory
    model = None
    del model

In [None]:
def getFileAndLabel(data_type):
    """List the spectrogram bitmaps of one kind together with their labels.

    Args:
        data_type: sub-folder under /kaggle/working/ to scan (e.g. 'tp', 'fp').

    Returns:
        (file_list, label_list): the '.bmp' file names in sorted order and,
        aligned with them, the species id taken from the second
        '_'-separated field of each name (kept as a string).
    """
    file_list = []
    label_list = []

    # os.listdir returns entries in arbitrary order; sorting makes the seeded
    # fold split and the "first N files" selection downstream reproducible.
    for f in sorted(os.listdir('/kaggle/working/'+data_type+'/')):
        # Match on the extension only, so names that merely contain '.bmp'
        # somewhere in the middle are not picked up.
        if f.endswith('.bmp'):
            file_list.append(f)
            label_list.append(f.split('_')[1])
    return file_list, label_list

In [None]:
tp_file_list, tp_label_list = getFileAndLabel("tp")
fp_file_list, fp_label_list = getFileAndLabel("fp")

# Use at most fp_train_len false-positive files. Fewer bitmaps than that may
# exist on disk (for example if two annotation rows produced the same file
# name), and indexing past the end of the list would raise IndexError.
fp_train_index = list(range(min(fp_train_len, len(fp_file_list))))
fp_train_files = [fp_file_list[index] for index in fp_train_index]

fp_train_dataset = RainforestDataset(fp_train_files, 'fp')
training(tp_file_list, tp_label_list, fp_train_dataset)

Function to split and load one test file

In [None]:
# Already defined above; for reference

# fft = 2048
# hop = 512
# sr = 48000
# length = 10 * sr

def load_test_file(f):
    """Split one test recording into fixed-length windows and return their spectrograms.

    The recording is cut into consecutive windows of `length` samples; the
    last window is anchored at the end of the file so nothing is missed. Each
    window goes through the same mel-spectrogram / resize / min-max scaling
    steps as the training images, using the module-level fmin/fmax band.

    Args:
        f: file name inside the competition's test folder.

    Returns:
        A list with one (3, 224, 400) array per window, values scaled to 0..1.
    """
    wav, wav_sr = librosa.load('/kaggle/input/rfcx-species-audio-detection/test/' + f, sr=None)

    # Split for enough segments to not miss anything
    segments = int(np.ceil(len(wav) / length))

    mel_array = []

    for i in range(0, segments):
        # Last segment going from the end
        if (i + 1) * length > len(wav):
            # Clamp at 0: for a recording shorter than one window the start
            # would be negative, which numpy reads as an offset from the end.
            start = max(0, len(wav) - length)
            segment = wav[start:len(wav)]
        else:
            segment = wav[i * length:(i + 1) * length]

        # Same mel spectrogram as before. `y` is passed by keyword: newer
        # librosa releases no longer accept the signal positionally.
        mel_spec = librosa.feature.melspectrogram(
            y=segment, n_fft=fft, hop_length=hop, sr=wav_sr, fmin=fmin, fmax=fmax, power=1.5)
        mel_spec = resize(mel_spec, (224, 400))

        mel_spec = mel_spec - np.min(mel_spec)
        peak = np.max(mel_spec)
        if peak > 0:
            # A constant (e.g. silent) window has peak 0; keep it all-zero
            # rather than dividing by zero and feeding NaNs to the model.
            mel_spec = mel_spec / peak

        # Three identical channels, as expected by the image model
        mel_spec = np.stack((mel_spec, mel_spec, mel_spec))

        mel_array.append(mel_spec)

    return mel_array

In [None]:
# Load the best checkpoint of each of the 5 folds into an ensemble.
# The training step saved the whole module with torch.save(model, ...), so
# torch.load already returns a ready-to-use model. Building a pretrained
# resnest50 and a new head first was wasted work: that instance was
# overwritten on the very next line.
# NOTE(review): these files are full-module pickles produced by this notebook.
# Recent PyTorch releases default torch.load to weights_only=True, which
# rejects such files - pass weights_only=False there (trusted files only).
members = []
for i in range(5):
    model = torch.load('/kaggle/working/best_model_'+str(i)+'.pt')
    # Inference mode: disables dropout in the classifier head
    model.eval()
    members.append(model)

Submitting predictions with the ensemble of the best models from each fold

In [None]:
# Scoring does not like many files:(
# With save_to_disk == 0 the generated spectrogram images are removed before
# the submission is written.
if save_to_disk == 0:
    for f in os.listdir('/kaggle/working/tp/'):
        os.remove('/kaggle/working/tp/' + f)
    for f in os.listdir('/kaggle/working/fp/'):
        os.remove('/kaggle/working/fp/' + f)

# Every ensemble member has to be on the same device as the input batch,
# not only the model that happened to be loaded last.
use_cuda = torch.cuda.is_available()
if use_cuda:
    for m in members:
        m.cuda()

# Prediction loop
print('Starting prediction loop')
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    # Header: recording_id followed by one score column per species (s0, s1, ...)
    submission_writer.writerow(['recording_id'] + ['s' + str(k) for k in range(num_birds)])

    test_files = os.listdir('/kaggle/input/rfcx-species-audio-detection/test/')
    print(len(test_files))

    # Inference only: no_grad avoids building autograd graphs for every batch
    with torch.no_grad():
        # Every test file is split on several chunks and prediction is made for each chunk
        for i in range(0, len(test_files)):
            data = load_test_file(test_files[i])
            # Stack into a single ndarray first; building a tensor straight
            # from a list of ndarrays is very slow.
            data = torch.tensor(np.array(data)).float()
            if use_cuda:
                data = data.cuda()

            # Taking max prediction from all slices per bird species, then
            # averaging over the ensemble members.
            # Usually you want a Sigmoid layer here to convert output to
            # probabilities. In this competition only relative ranking matters,
            # and not the exact value of prediction, so we can use it directly.
            output_list = []
            for m in members:
                output = m(data)
                maxed_output = torch.max(output, dim=0)[0]
                output_list.append(maxed_output.cpu().detach())
            avg_maxed_output = torch.mean(torch.stack(output_list), dim=0)

            file_id = str.split(test_files[i], '.')[0]
            write_array = [file_id]
            for out in avg_maxed_output:
                write_array.append(out.item())

            submission_writer.writerow(write_array)

            if i % 100 == 0 and i > 0:
                # Total is len(test_files); the previous "+ 1" over-reported it
                print('Predicted for ' + str(i) + ' of ' + str(len(test_files)) + ' files')

print('Submission generated')