In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.notebook import tqdm
from skimage.transform import resize
import PIL
from IPython.display import Image


In [None]:
# Directory holding the training recordings; the cell below lists the *.flac files in it.
TRAIN_DIR = Path('../input/rfcx-species-audio-detection/train')

In [None]:
# Load the first 10 seconds of one training recording as a quick sanity check.
# Path.glob yields files in arbitrary order, so sort the listing to make
# flacfiles[0] refer to the same recording on every run.
flacfiles = sorted(TRAIN_DIR.glob('*.flac'))
# No `sr` is given here, so librosa resamples to its own default rate.
y, sr = librosa.load(flacfiles[0], duration=10)
y, sr

In [None]:
# Plot the waveform of the loaded excerpt.
# librosa renamed `waveplot` to `waveshow` and later dropped the old name,
# so use whichever function the installed version provides.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plot_waveform(y, sr=sr)

In [None]:
# Magnitude of the short-time Fourier transform, converted to decibels
# relative to the loudest bin (ref=np.max).
stft_magnitude = np.abs(librosa.stft(y))
spec = librosa.amplitude_to_db(stft_magnitude, ref=np.max)

# Draw the spectrogram on a log-frequency axis and attach a dB colour scale.
spec_mesh = librosa.display.specshow(spec, sr=sr, x_axis='time', y_axis='log')
plt.colorbar(spec_mesh, format='%+2.0f dB')
plt.title('Spectrogram')

In [None]:
# Mel-scaled power spectrogram (128 bands) converted to decibels.
# The audio is passed as `y=` because newer librosa releases accept it by keyword only;
# the keyword form works on older releases as well.
melspec = librosa.power_to_db(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128))
# The rows are mel bands, so the frequency axis must use the mel scale;
# a log-frequency axis would label the bands with the wrong frequencies.
librosa.display.specshow(melspec, sr=sr, x_axis='time', y_axis='mel')
plt.title('Mel Spectrogram')
plt.colorbar(format='%+2.0f dB')

In [None]:
# Load the training annotation table (train_tp.csv) and preview its first rows.
# Later cells read the columns recording_id, species_id, t_min, t_max, f_min and f_max from it.
df_train = pd.read_csv('../input/rfcx-species-audio-detection/train_tp.csv')
df_train.head()

In [None]:
# Frequency band spanned by all annotations: the smallest f_min and the largest f_max.
# Both values are passed as fmin/fmax when the mel spectrograms are computed.
f_min = df_train['f_min'].min()
f_max = df_train['f_max'].max()

(f_min, f_max)

In [None]:
# Number of audio samples in one clip fed to the model.
SAMPLE_RATE = 48000  # presumably the sample rate of the recordings in Hz — TODO confirm
CLIP_SECONDS = 10    # clip duration in seconds
length = SAMPLE_RATE * CLIP_SECONDS

In [None]:
# For every annotated call: cut a 10-second clip centred on the call, compute a
# dB-scaled mel spectrogram, min-max scale it to 8 bits and save it as a greyscale
# image named <recording_id>_<species_id>_<t_center>_.bmp in /kaggle/working/.
for i in tqdm(range(df_train.shape[0])):
    recording_id = df_train.recording_id[i]
    data, sr = librosa.load('../input/rfcx-species-audio-detection/train/' + recording_id + '.flac', sr=None)
    t_center = (df_train.t_min[i] + df_train.t_max[i]) / 2

    # Select the window by integer sample index. The previous float mask over a
    # per-sample time array could pick one sample too many or too few at the
    # edges, and allocated an array as long as the whole recording for each row.
    window = int(round(10 * sr))
    start = int(round(t_center * sr)) - window // 2
    # Shift the window so that it stays inside the recording; if the recording
    # is shorter than the window, start at the beginning and take what exists.
    start = max(0, min(start, len(data) - window))
    data = data[start:start + window][:length]
    if len(data) < length:
        print('bad', len(data), start / sr, (start + len(data)) / sr)

    # `y=` is passed by keyword because newer librosa releases require it.
    mel_spec = librosa.feature.melspectrogram(y=data, sr=sr, n_fft=2048, hop_length=512,
                                              fmin=f_min, fmax=f_max, n_mels=512, power=1.5)
    mel_spec = librosa.power_to_db(mel_spec)
    mel_spec = resize(mel_spec, (224, 400))

    # Min-max scale to [0, 1]. A constant image has a zero range, so skip the
    # division in that case instead of producing NaNs.
    mel_spec = mel_spec - np.min(mel_spec)
    peak = np.max(mel_spec)
    if peak > 0:
        mel_spec = mel_spec / peak

    # Quantise to 8-bit greyscale.
    mel_spec = np.round(mel_spec * 255).astype('uint8')
    bmp = PIL.Image.fromarray(mel_spec, 'L')
    bmp.save('/kaggle/working/' + str(recording_id) + '_' + str(df_train.species_id[i]) + '_' + str(t_center) +  '_' + '.bmp')
    
    

In [None]:
# Display one of the saved spectrogram images as a visual check.
# NOTE(review): the file name is hard-coded, so it must match a file written by the preprocessing loop.
plt.imshow(PIL.Image.open('/kaggle/working/003bec244_14_44.83735_.bmp'))

In [None]:
import torch
import random

# Number of species classes (one model output per species) and mini-batch size.
num_species = 24
batch_size=16

# Seed every random number generator in use so that weight initialisation,
# batch shuffling and dropout start from the same state on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)



In [None]:
import torch.utils.data as torchdata

class RFCXDataset(torchdata.Dataset):
    """In-memory dataset of spectrogram images with one-hot species labels.

    All images are decoded once in the constructor and kept in memory.
    Each file name is expected to look like
    ``<recording_id>_<species_id>_<t_center>_.bmp``; the species id is the
    second underscore-separated field of the file name.

    Args:
        filelist: iterable of image file paths.

    Each item is a tuple ``(spec, label)``: ``spec`` is the image scaled to
    [0, 1] and repeated on three channels, ``label`` is a float32 one-hot
    vector of length ``num_species``.
    """

    @staticmethod
    def _species_from_path(path):
        """Return the integer species id encoded in the file name of ``path``."""
        # Parse the file name only: splitting the full path would shift the
        # field positions as soon as a directory name contains an underscore.
        return int(os.path.basename(str(path)).split('_')[1])

    def __init__(self, filelist):
        self.specs = []
        self.labels = []
        for f in filelist:
            label_arr = np.zeros(num_species, dtype=np.single)
            label_arr[self._species_from_path(f)] = 1
            self.labels.append(label_arr)

            # The context manager closes the file even if decoding fails.
            with PIL.Image.open(f) as img:
                mel_spec = np.array(img)
            # Scale 8-bit pixels to [0, 1] and replicate the single grey
            # channel three times along a new leading axis.
            mel_spec = mel_spec / 255
            mel_spec = np.stack((mel_spec, mel_spec, mel_spec))
            self.specs.append(mel_spec)

    def __len__(self):
        """Return the number of loaded spectrograms."""
        return len(self.specs)

    def __getitem__(self, item):
        """Return the ``(spec, label)`` pair stored at index ``item``."""
        return self.specs[item], self.labels[item]

In [None]:
import glob
from sklearn.model_selection import StratifiedKFold

# Collect the saved spectrogram images. glob returns paths in arbitrary order,
# so sort them: otherwise the split below differs between runs even though
# random_state is fixed.
file_list = sorted(glob.glob('/kaggle/working/*.bmp'))

# Species id of every image (second field of the file name), used to stratify.
# Only the file name is parsed so underscores in directory names cannot interfere.
label_list = [os.path.basename(f).split('_')[1] for f in file_list]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Only the first of the five folds is used for the train/validation split.
train_id, val_id = next(iter(skf.split(file_list, label_list)))
train_files = np.take(file_list, train_id)
val_files = np.take(file_list, val_id)

print('Training on ' + str(len(train_files)) + ' examples')
print('Validating on ' + str(len(val_files)) + ' examples')

In [None]:
# Install the resnest package (imported below as `resnest.torch`); standard output is discarded.
!pip install resnest > /dev/null

In [None]:
import torch.nn as nn
from resnest.torch import resnest50

# Decode the training and validation images into memory.
train_dataset = RFCXDataset(train_files)
val_dataset = RFCXDataset(val_files)

# Both loaders draw their batches in random order.
train_loader = torchdata.DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=torchdata.RandomSampler(train_dataset),
)
val_loader = torchdata.DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=torchdata.RandomSampler(val_dataset),
)

# ResNeSt-50 backbone loaded with pretrained weights.
model = resnest50(pretrained=True)

# Swap the final layer for a three-layer MLP head that emits one logit per species.
head_layers = [
    nn.Linear(2048, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, 1024),
    nn.ReLU(),
    nn.Dropout(p=0.2),
    nn.Linear(1024, num_species),
]
model.fc = nn.Sequential(*head_layers)

# SGD with momentum; the learning rate is multiplied by 0.4 every 7 scheduler steps.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=0.0001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.4)

# Every class gets the same positive weight, equal to the number of species.
pos_weights = torch.full((num_species,), float(num_species))
loss_function = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

# Move the model and the loss (including its pos_weight buffer) to the GPU when one is present.
if torch.cuda.is_available():
    model = model.cuda()
    loss_function = loss_function.cuda()


In [None]:
# Highest number of correctly classified validation samples seen so far.
best_corrects = 0

# Train loop: 32 epochs, each followed by a validation pass. The model with the
# most correct validation answers is saved to best_model.pt.
print('Starting training loop')
for e in range(0, 32):
    # Stats
    train_loss = []
    train_corr = []

    # Single epoch - train
    model.train()
    for batch, (data, target) in enumerate(train_loader):
        data = data.float()
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()

        optimizer.zero_grad()

        output = model(data)
        loss = loss_function(output, target)

        loss.backward()
        optimizer.step()

        # Stats: a sample is correct when its highest logit is at the position
        # of the 1 in the one-hot target. The comparison is done on whole
        # tensors; looping over elements would force one device sync per sample.
        _, answers = torch.max(output, 1)
        _, targets = torch.max(target, 1)
        train_corr.append((answers == targets).sum().item())

        train_loss.append(loss.item())

    # Stats: report the learning rate of the last parameter group.
    lr = optimizer.param_groups[-1]['lr']
    print('Epoch ' + str(e) + ' training end. LR: ' + str(lr) + ', Loss: ' + str(sum(train_loss) / len(train_loss)) +
          ', Correct answers: ' + str(sum(train_corr)) + '/' + str(len(train_dataset)))

    # Single epoch - validation (no gradients are needed)
    with torch.no_grad():
        # Stats
        val_loss = []
        val_corr = []

        model.eval()
        for batch, (data, target) in enumerate(val_loader):
            data = data.float()
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()

            output = model(data)
            loss = loss_function(output, target)

            # Stats: same accuracy count as in the training pass.
            _, answers = torch.max(output, 1)
            _, targets = torch.max(target, 1)
            val_corr.append((answers == targets).sum().item())

            val_loss.append(loss.item())

    # Stats
    print('Epoch ' + str(e) + ' validation end. LR: ' + str(lr) + ', Loss: ' + str(sum(val_loss) / len(val_loss)) +
          ', Correct answers: ' + str(sum(val_corr)) + '/' + str(len(val_dataset)))

    # If this epoch is better than previous on validation, save model
    # Validation loss is the more common metric, but in this case our loss is misaligned with competition metric, making accuracy a better metric
    if sum(val_corr) > best_corrects:
        print('Saving new best model at epoch ' + str(e) + ' (' + str(sum(val_corr)) + '/' + str(len(val_dataset)) + ')')
        # The whole model object is pickled, not only its state_dict.
        torch.save(model, 'best_model.pt')
        best_corrects = sum(val_corr)

    # Call every epoch
    scheduler.step()

# Free memory
del model

In [None]:

def load_test_file(f, test_dir='/kaggle/input/rfcx-species-audio-detection/test/'):
    """Load one test recording and convert it into model-ready spectrogram chunks.

    The recording is cut into consecutive chunks of ``length`` samples; the last
    chunk is taken from the end of the recording so it may overlap the previous
    one. Each chunk goes through the same steps as the training images:
    mel spectrogram (n_mels=512, power=1.5) -> dB -> resize to 224x400 ->
    min-max scaling -> 8-bit quantisation -> three identical channels.

    Uses the notebook-level values ``length``, ``f_min`` and ``f_max``.

    Args:
        f: file name of the recording inside ``test_dir``.
        test_dir: directory that contains the test recordings.

    Returns:
        list of arrays of shape (3, 224, 400) with values in [0, 1],
        one per chunk (empty if the recording has no samples).
    """
    wav, sr = librosa.load(os.path.join(test_dir, f), sr=None)

    # Split for enough segments to not miss anything
    segments = int(np.ceil(len(wav) / length))

    mel_array = []

    for i in range(0, segments):
        start = i * length
        if start + length > len(wav):
            # Last segment going from the end. Clamp at 0: for a recording
            # shorter than `length` a negative start would wrap around and
            # silently drop the beginning of the audio.
            start = max(len(wav) - length, 0)
        chunk = wav[start:start + length]

        # Same mel spectrogram as for the training images: 512 mel bands and
        # dB scaling. Without these the model would see features on a
        # different scale than the ones it was trained on.
        mel_spec = librosa.feature.melspectrogram(y=chunk, sr=sr, n_fft=2048, hop_length=512,
                                                  fmin=f_min, fmax=f_max, n_mels=512, power=1.5)
        mel_spec = librosa.power_to_db(mel_spec)
        mel_spec = resize(mel_spec, (224, 400))

        # Min-max scale to [0, 1]; skip the division for a constant image
        # to avoid dividing by zero.
        mel_spec = mel_spec - np.min(mel_spec)
        peak = np.max(mel_spec)
        if peak > 0:
            mel_spec = mel_spec / peak

        # Training images were stored as 8-bit pixels and read back as
        # pixel / 255, so apply the same quantisation here.
        mel_spec = np.round(mel_spec * 255) / 255

        mel_spec = np.stack((mel_spec, mel_spec, mel_spec))

        mel_array.append(mel_spec)

    return mel_array

In [None]:
import csv

# Restore the best model saved during training. The checkpoint was written with
# torch.save(model, ...), i.e. it contains the whole model object, so there is
# no need to build (and download pretrained weights for) a network first.
model = torch.load('/kaggle/working/best_model.pt')
model.eval()

# Clear the working directory (spectrogram images and checkpoint) before the
# submission is written. Only regular files are removed: os.remove raises on directories.
for f in os.listdir('/kaggle/working/'):
    working_path = '/kaggle/working/' + f
    if os.path.isfile(working_path):
        os.remove(working_path)

if torch.cuda.is_available():
    model.cuda()

# Prediction loop
print('Starting prediction loop')
with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    # Header: recording_id followed by one column per species (s0 ... s23).
    submission_writer.writerow(['recording_id'] + ['s' + str(k) for k in range(num_species)])

    test_files = sorted(os.listdir('/kaggle/input/rfcx-species-audio-detection/test/'))
    print(len(test_files))

    # Every test file is split on several chunks and prediction is made for each chunk
    for i in range(0, len(test_files)):
        data = load_test_file(test_files[i])
        # Stack into one ndarray first: building a tensor directly from a list
        # of ndarrays is very slow.
        data = torch.from_numpy(np.stack(data)).float()
        if torch.cuda.is_available():
            data = data.cuda()

        # Inference only: do not record the autograd graph.
        with torch.no_grad():
            output = model(data)

        # Taking max prediction from all slices per bird species
        # Usually you want Sigmoid layer here to convert output to probabilities
        # In this competition only relative ranking matters, and not the exact value of prediction, so we can use it directly
        maxed_output = torch.max(output, dim=0)[0]
        maxed_output = maxed_output.cpu().detach()

        file_id = str.split(test_files[i], '.')[0]
        submission_writer.writerow([file_id] + maxed_output.tolist())

        if i % 100 == 0 and i > 0:
            print('Predicted for ' + str(i) + ' of ' + str(len(test_files)) + ' files')

print('Submission generated')