In [None]:
from skimage.transform import resize
from skimage.util import random_noise
from skimage.filters import gaussian
from skimage import exposure
import cv2
import numpy as np
import random

def addNoisy(img):
    """Return a 3-channel copy of ``img`` with random noise added.

    ``random_noise`` converts its input to floating point and, for unsigned
    input, returns values in [0, 1].  The un-augmented spectrogram images in
    this notebook are uint8 in [0, 255], so for uint8 input the noisy image is
    scaled back to uint8; otherwise augmented samples would live on a
    different intensity scale (and dtype) than the originals.  Non-uint8
    input is passed through exactly as ``random_noise`` returns it.

    Args:
        img: 2-D single-channel image.

    Returns:
        The noisy image stacked into three channels by ``addChannels``.
    """
    src = np.asarray(img)
    noise_img = random_noise(src)
    if src.dtype == np.uint8:
        # Undo the implicit [0, 255] -> [0, 1] conversion.
        noise_img = np.clip(np.rint(noise_img * 255), 0, 255).astype(np.uint8)
    return addChannels(noise_img)

def contrast_stretching(img):
    """Stretch the contrast of ``img`` and return it as three stacked channels.

    The 2nd and 98th intensity percentiles of ``img`` are used as the input
    range handed to ``exposure.rescale_intensity``.
    """
    low, high = np.percentile(img, (2, 98))
    return addChannels(exposure.rescale_intensity(img, in_range=(low, high)))

def log_correction(img):
    """Apply ``exposure.adjust_log`` to ``img`` and stack the result into three channels."""
    return addChannels(exposure.adjust_log(img))

def randomGaussian(img):
    """Blur ``img`` with a Gaussian of random integer sigma in [0, 5].

    By default ``gaussian`` converts integer images to floats in [0, 1].
    The un-augmented spectrogram images in this notebook are uint8 in
    [0, 255], so for uint8 input the original value range is preserved and
    the result is cast back to uint8, keeping augmented and original samples
    on the same scale.  Other dtypes keep skimage's default conversion.

    Args:
        img: 2-D single-channel image.

    Returns:
        The blurred image stacked into three channels by ``addChannels``.
    """
    src = np.asarray(img)
    keep_range = src.dtype == np.uint8
    blurred = gaussian(src, sigma=random.randint(0, 5), preserve_range=keep_range)
    if keep_range:
        blurred = np.clip(np.rint(blurred), 0, 255).astype(np.uint8)
    return addChannels(blurred)

def addChannels(img):
    """Replicate ``img`` three times along a new leading axis (channels first)."""
    return np.stack([img] * 3, axis=0)

def spec_to_image(spec, size=(224, 400)):
    """Convert a spectrogram into a uint8 grayscale image.

    The spectrogram is resized to ``size``, standardised (zero mean, unit
    variance, with a small epsilon guarding against a zero standard
    deviation) and then min-max scaled to the range [0, 255].

    Args:
        spec: 2-D spectrogram array.
        size: Output ``(rows, cols)``; defaults to ``(224, 400)``.

    Returns:
        ``np.uint8`` array of shape ``size``.  A constant spectrogram (no
        dynamic range) yields an all-zero image instead of dividing 0 by 0.
    """
    spec = resize(spec, size)
    eps = 1e-6
    spec_norm = (spec - spec.mean()) / (spec.std() + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    value_range = spec_max - spec_min
    if value_range == 0:
        # Flat input: min-max scaling is undefined, so emit a blank image.
        return np.zeros(spec_norm.shape, dtype=np.uint8)
    spec_scaled = 255 * (spec_norm - spec_min) / value_range
    return np.asarray(spec_scaled.astype(np.uint8))

In [None]:
import librosa
from torch.utils.data import Dataset, DataLoader

class AudioData(Dataset):
    """In-memory dataset of 3-channel mel-spectrogram images with integer labels.

    Every input row yields one spectrogram image of a fixed-length window
    centred on the annotated call.  With ``data_type == "train"`` each image
    is additionally passed through the augmentation functions (noise,
    contrast stretching, Gaussian blur, log correction) and the results are
    stored as extra samples with the same label.

    The window length and spectrogram settings are read from the module-level
    names ``length``, ``fft``, ``hop``, ``fmin`` and ``fmax`` at construction
    time, so those must be defined before an instance is created.

    Args:
        _data: Sequence of CSV rows laid out as
            ``recording_id, species_id, songtype_id, t_min, f_min, t_max, f_max``.
        data_type: ``"train"`` enables augmentation; any other value disables it.
    """

    # Directory holding one ``<recording_id>.flac`` file per training recording.
    TRAIN_DIR = '/kaggle/input/rfcx-species-audio-detection/train/'

    def __init__(self, _data, data_type):
        self.data = []
        self.labels = []
        augmentation_functions = [
            addNoisy, contrast_stretching,
            randomGaussian, log_correction
        ]
        for row in _data:
            # sr=None keeps the file's native sample rate, avoiding a slow resample.
            wav, sr = librosa.load(self.TRAIN_DIR + row[0] + '.flac', sr=None)

            # Annotation times are in seconds; convert them to sample offsets.
            beginning, ending = self._window_bounds(
                float(row[3]) * sr, float(row[5]) * sr, len(wav), length
            )
            clip = wav[beginning:ending]

            # The audio is passed by keyword: recent librosa releases reject
            # it as a positional argument.
            spec = librosa.feature.melspectrogram(
                y=clip, sr=sr, n_fft=fft, hop_length=hop, fmin=fmin, fmax=fmax
            )
            spec_db = librosa.power_to_db(spec, top_db=80)

            img = spec_to_image(spec_db)
            label = int(row[1])
            self.data.append(addChannels(img))
            self.labels.append(label)

            if data_type == "train":
                for fun in augmentation_functions:
                    self.data.append(fun(img))
                    self.labels.append(label)

    @staticmethod
    def _window_bounds(t_min, t_max, n_samples, window):
        """Return ``(start, stop)`` sample indices of a ``window``-long slice.

        The slice is centred between ``t_min`` and ``t_max`` (both in
        samples) and shifted so it stays inside ``[0, n_samples]``.  When the
        recording is shorter than ``window`` the whole recording is returned
        rather than a negative start index.
        """
        center = np.round((t_min + t_max) / 2)
        beginning = max(center - window / 2, 0)
        ending = beginning + window
        if ending > n_samples:
            ending = n_samples
            beginning = max(ending - window, 0)
        return int(beginning), int(ending)

    def __len__(self):
        """Number of stored samples (originals plus augmentations)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair at position ``idx``."""
        return self.data[idx], self.labels[idx]

Generating Mel spectrograms for training from true positive data

In [None]:
import csv
import random

# Spectrogram settings
fft = 2048
hop = 512
# Fixed sample rate; the clip length is expressed in whole samples (10 s),
# which keeps slice arithmetic free of rounding errors.
sr = 48000
length = 10 * sr

# Seed for the train/validation split so a re-run reproduces the same split.
SEED = 42

with open('/kaggle/input/rfcx-species-audio-detection/train_tp.csv', newline='') as f:
    reader = csv.reader(f)
    # Drop the header row:
    # recording_id,species_id,songtype_id,t_min,f_min,t_max,f_max
    next(reader, None)
    data = list(reader)

# Find the lowest/highest annotated call frequency across all rows.
# Not necessary, but there is usually plenty of noise in the low frequencies,
# and removing it helps.
# 24000 and 0 are the starting bounds of the search (24000 = sr / 2).
fmin = min([24000.0] + [float(row[4]) for row in data])
fmax = max([0.0] + [float(row[6]) for row in data])
# Get some safety margin
fmin = int(fmin * 0.9)
fmax = int(fmax * 1.1)
print('Minimum frequency: ' + str(fmin) + ', maximum frequency: ' + str(fmax))

# Shuffle with a dedicated, seeded generator (the global ``random`` state is
# left untouched) and split the rows 90 / 10 into training and validation.
percentage_train = 90
random.Random(SEED).shuffle(data)
total = len(data)
train_data_amount = round(total / 100 * percentage_train)
train_audio = data[:train_data_amount]
val_audio = data[train_data_amount:]
train_data = AudioData(train_audio, "train")
valid_data = AudioData(val_audio, "valid")
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
# Evaluation does not benefit from shuffling; keep the order deterministic.
valid_loader = DataLoader(valid_data, batch_size=16, shuffle=False)

In [None]:
# Report how many samples (including augmentations) each split holds.
for split_name, split in (("train", train_data), ("valid", valid_data)):
    print(split_name, len(split))

In [None]:
from torchvision.models import resnet50
import torch
import torch.nn as nn
from torch.optim import lr_scheduler
import copy

In [None]:
# Number of output classes of the classifier head.
num_birds = 24

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Pretrained ResNet-50 whose final fully connected layer is replaced by a
# fresh ``num_birds``-way linear head, then moved to the chosen device.
resnet_model = resnet50(pretrained=True)
num_ftrs = resnet_model.fc.in_features
resnet_model.fc = nn.Linear(num_ftrs, num_birds)
resnet_model = resnet_model.to(device)

In [None]:
from tqdm import tqdm

# Training hyper-parameters.
epochs = 20
learning_rate = 2e-4  # base rate; lr_decay divides it by 10 every 10 epochs

# Multi-class classification objective and optimiser over all model weights.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet_model.parameters(), lr=learning_rate)

def setlr(optimizer, lr):
    """Set the learning rate of every parameter group to ``lr``.

    Returns the same ``optimizer`` object so calls can be chained.
    """
    for group in optimizer.param_groups:
        group.update(lr=lr)
    return optimizer

def lr_decay(optimizer, epoch):
    """Step-wise learning-rate schedule.

    On every epoch that is a multiple of 10 the learning rate is set to the
    module-level ``learning_rate`` divided by ``10 ** (epoch // 10)`` and the
    change is printed; on all other epochs the optimizer is returned as is.
    """
    if epoch % 10 != 0:
        return optimizer
    new_lr = learning_rate / (10**(epoch//10))
    optimizer = setlr(optimizer, new_lr)
    print(f'Changed learning rate to {new_lr}')
    return optimizer

def train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, change_lr=None):
    """Train ``model`` and return it loaded with its best-validation weights.

    Each epoch runs one optimisation pass over ``train_loader`` followed by an
    evaluation pass over ``valid_loader``.  The weights of the epoch with the
    highest validation accuracy are kept and restored before returning.
    Batches are moved to the module-level ``device``.

    Args:
        model: Network to train (modified in place).
        loss_fn: Callable ``(logits, targets) -> scalar loss tensor``.
        train_loader: Iterable of ``(x, y)`` training batches.
        valid_loader: Iterable of ``(x, y)`` validation batches.
        epochs: Number of epochs; they are numbered from 1.
        optimizer: Optimizer updating ``model``'s parameters.
        change_lr: Optional ``(optimizer, epoch) -> optimizer`` hook called at
            the start of every epoch (e.g. a learning-rate schedule).

    Returns:
        ``model`` with the best-scoring weights loaded.
    """
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    train_losses = []
    valid_losses = []

    for epoch in tqdm(range(1,epochs+1)):
        # ---- training pass ----
        model.train()
        batch_losses=[]
        if change_lr:
            optimizer = change_lr(optimizer, epoch)
        for x, y in train_loader:
            optimizer.zero_grad()
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)
            y_hat = model(x)
            loss = loss_fn(y_hat, y)
            loss.backward()
            batch_losses.append(loss.item())
            optimizer.step()

        train_losses.append(batch_losses)
        print(f'Epoch - {epoch} Train-Loss : {np.mean(train_losses[-1])}')

        # ---- validation pass ----
        model.eval()
        batch_losses=[]
        trace_y = []
        trace_yhat = []

        # No gradients are needed for evaluation; disabling autograd avoids
        # building (and holding in memory) a graph for every batch.
        with torch.no_grad():
            for x, y in valid_loader:
                x = x.to(device, dtype=torch.float32)
                y = y.to(device, dtype=torch.long)
                y_hat = model(x)
                loss = loss_fn(y_hat, y)
                trace_y.append(y.cpu().detach().numpy())
                trace_yhat.append(y_hat.cpu().detach().numpy())
                batch_losses.append(loss.item())
        valid_losses.append(batch_losses)
        trace_y = np.concatenate(trace_y)
        trace_yhat = np.concatenate(trace_yhat)
        accuracy = np.mean(trace_yhat.argmax(axis=1)==trace_y)
        print(f'Epoch - {epoch} Valid-Loss : {np.mean(valid_losses[-1])} Valid-Accuracy : {accuracy}')
        # Snapshot the weights whenever validation accuracy improves.
        if accuracy > best_acc:
            best_acc = accuracy
            best_model_wts = copy.deepcopy(model.state_dict())

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# Train for `epochs` epochs, applying `lr_decay` at the start of each epoch;
# `train` returns the model with its best-validation-accuracy weights loaded.
resnet_model = train(resnet_model, loss_fn, train_loader, valid_loader, epochs, optimizer, lr_decay)

In [None]:
# Already defined above; for reference

# fft = 2048
# hop = 512
# sr = 48000
# length = 10 * sr

def load_test_file(f):
    """Load a test recording and return its mel-spectrogram images.

    The recording is cut into consecutive windows of ``length`` samples.  The
    final window is taken from the end of the recording (overlapping the
    previous one) so no audio is missed; a recording shorter than ``length``
    yields a single window covering all of it.  Spectrogram settings come
    from the module-level ``fft``, ``hop``, ``fmin`` and ``fmax``.

    Args:
        f: File name inside the test directory (including extension).

    Returns:
        List of 3-channel images, one per window.
    """
    wav, sr = librosa.load('/kaggle/input/rfcx-species-audio-detection/test/' + f, sr=None)
    n_samples = len(wav)

    # Split for enough segments to not miss anything
    segments = int(np.ceil(n_samples / length))

    mel_array = []

    for i in range(segments):
        start = i * length
        stop = start + length
        if stop > n_samples:
            # Last segment: take it from the end.  The start is clamped to 0
            # because a negative start index would wrap and drop samples.
            stop = n_samples
            start = max(0, n_samples - length)
        clip = wav[start:stop]

        # Same mel spectrogram as for training.  The audio is passed by
        # keyword: recent librosa releases reject it as a positional argument.
        spec = librosa.feature.melspectrogram(
            y=clip, sr=sr, n_fft=fft, hop_length=hop, fmin=fmin, fmax=fmax
        )
        spec_db = librosa.power_to_db(spec, top_db=80)

        img = spec_to_image(spec_db)
        mel_array.append(addChannels(img))

    return mel_array

Submitting predictions with best model

In [None]:
import os
    
# Prediction loop: score every test recording and write one CSV row per file.
print('Starting prediction loop')
test_dir = '/kaggle/input/rfcx-species-audio-detection/test/'

# Make inference mode explicit instead of relying on the state `train` left behind.
resnet_model.eval()

with open('submission.csv', 'w', newline='') as csvfile:
    submission_writer = csv.writer(csvfile, delimiter=',')
    # Header: recording_id followed by one score column per species (s0..s23).
    submission_writer.writerow(['recording_id'] + ['s' + str(k) for k in range(num_birds)])

    test_files = os.listdir(test_dir)
    print(len(test_files))

    # Gradients are never needed here; disabling autograd saves memory and time.
    with torch.no_grad():
        # Every test file is split on several chunks and prediction is made for each chunk
        for i, test_file in enumerate(test_files):
            # Stack the chunks into one array first: building a tensor from a
            # Python list of arrays is much slower.
            segments = torch.from_numpy(np.stack(load_test_file(test_file)))
            segments = segments.float().to(device)

            output = resnet_model(segments)

            # Taking max prediction from all slices per bird species
            # Usually you want Sigmoid layer here to convert output to probabilities
            # In this competition only relative ranking matters, and not the exact value of prediction, so we can use it directly
            maxed_output = torch.max(output, dim=0)[0].cpu()

            file_id = str.split(test_file, '.')[0]
            submission_writer.writerow([file_id] + maxed_output.tolist())

            if i % 100 == 0 and i > 0:
                print('Predicted for ' + str(i) + ' of ' + str(len(test_files)) + ' files')

print('Submission generated')

### References
https://www.kaggle.com/fffrrt/all-in-one-rfcx-baseline-for-beginners

https://www.kaggle.com/tomahim/image-manipulation-augmentation-with-skimage

https://www.kaggle.com/safavieh/image-augmentation-using-skimage

https://medium.com/@hasithsura/audio-classification-d37a82d6715