# My Training Script
This is the training script I used to approach the competition. If you have any questions, feel free to reach out and ask.

In [None]:
import os

import pylab as pl
import numpy as np
import pandas as pd
import librosa as lb
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

from tqdm import tqdm
from random import shuffle, randrange, sample
from sklearn.metrics import label_ranking_average_precision_score

In [None]:
# Annotation tables. Judging by the file names these are the true-positive
# (train_tp) and false-positive (train_fp) label files -- TODO confirm.
annotations = pd.read_csv("../input/rfcx-species-audio-detection/train_tp.csv")
annotations_nl = pd.read_csv("../input/rfcx-species-audio-detection/train_fp.csv")

# Length (in seconds) of the window cropped out of each recording.
crop_length = 10

# Spectrogram settings handed to librosa in load_spec.
sample_rate = 48000
n_mels = 384
f_min = 50
f_max = sample_rate // 2

# Width of one mel bin on the scale used by frequency_to_mel_bin.
mels_per_bin = (2595 * tf.math.log(1. + f_max / 700.)) / n_mels

# Backbone constructors, keyed by the name accepted by get_model.
models = dict(
    inception=keras.applications.InceptionV3,
    efficientnet=keras.applications.EfficientNetB3,
    resnet=keras.applications.ResNet50,
)

# Data Preprocessing
Changing the input data from 1D sound signals to 2D Mel Spectrograms, cropped to where the labels are located along the time (x) axis, and label gathering.

The 1D sound signals are converted to Mel spectrograms because neural network architectures for image classification are well established, and the conversion keeps the time-frequency content needed to tell the classes apart, so it makes sense to use the better tooling on what is essentially the same data.

We crop the input images to the locations of the labels because feeding the model a full 60-second clip would provide too much information for it to learn properly what is what. For example, if the label for an entire clip is the multi-hot vector with classes [0, 3, 15, 23] set (the element-wise maximum of tf.one_hot([0, 3, 15, 23], 24)), then working out which part of the clip belongs to which class is much more difficult than feeding the model each labelled instance directly.

In [None]:
def get_labels_numpy(recording_id, start_time, end_time, annotations, num_classes=24):
    """Build the multi-hot species label for one time window of a recording.

    An annotation contributes its species when its [t_min, t_max] interval
    overlaps the open window (start_time, end_time).

    Args:
        recording_id: value matched against the ``recording_id`` column.
        start_time: start of the window, in the same unit as ``t_min``/``t_max``.
        end_time: end of the window, in the same unit as ``t_min``/``t_max``.
        annotations: DataFrame with ``recording_id``, ``species_id``,
            ``t_min`` and ``t_max`` columns.
        num_classes: length of the returned label vector.

    Returns:
        A float32 tensor of shape (num_classes,) holding 1.0 for every species
        annotated inside the window and 0.0 elsewhere (all zeros when the
        recording has no annotations).
    """
    rows = annotations.loc[annotations['recording_id'] == recording_id]

    labels = np.zeros((num_classes,), dtype=np.float32)
    if len(rows):
        # Columns are addressed by name so the result does not depend on the
        # column order of the CSV file.
        overlaps = ((rows['t_max'] > start_time) & (rows['t_min'] < end_time)).to_numpy()
        species = rows['species_id'].to_numpy()[overlaps].astype(int)
        # Ids outside [0, num_classes) contribute nothing, like tf.one_hot.
        species = species[(species >= 0) & (species < num_classes)]
        labels[species] = 1.0

    # Always hand back the same type, including on the empty path.
    return tf.constant(labels, dtype=tf.float32)

def get_labels_validation_numpy(recording_id, times, frequencies, annotations, num_classes=24):
    """Build the multi-hot species label for a time/frequency box of a recording.

    An annotation contributes its species when its time interval overlaps
    ``times`` AND its frequency band overlaps ``frequencies``.

    Args:
        recording_id: value matched against the ``recording_id`` column.
        times: ``(start_time, end_time)`` pair.
        frequencies: ``(lower_freq, upper_freq)`` pair.
        annotations: DataFrame with ``recording_id``, ``species_id``,
            ``t_min``, ``t_max``, ``f_min`` and ``f_max`` columns.
        num_classes: length of the returned label vector.

    Returns:
        A float32 tensor of shape (num_classes,) holding 1.0 for every species
        annotated inside the box and 0.0 elsewhere.
    """
    rows = annotations.loc[annotations['recording_id'] == recording_id]

    start_time, end_time = times
    lower_freq, upper_freq = frequencies

    labels = np.zeros((num_classes,), dtype=np.float32)
    if len(rows):
        # The frequency bounds are read from the f_min/f_max columns by name;
        # positional indices 2 and 4 do not line up with that pair when the
        # time bounds sit at positions 3 and 5.
        in_time = (rows['t_max'] > start_time) & (rows['t_min'] < end_time)
        in_freq = (rows['f_max'] > lower_freq) & (rows['f_min'] < upper_freq)
        hits = (in_time & in_freq).to_numpy()
        species = rows['species_id'].to_numpy()[hits].astype(int)
        # Ids outside [0, num_classes) contribute nothing, like tf.one_hot.
        species = species[(species >= 0) & (species < num_classes)]
        labels[species] = 1.0

    # Always hand back the same type, including on the empty path.
    return tf.constant(labels, dtype=tf.float32)

def frequency_to_mel_bin(frequency):
    """Return the (truncated) index of the mel bin that contains ``frequency``.

    The frequency is mapped with ``2595 * log(1 + f / 700)`` and divided by the
    module-level ``mels_per_bin``.
    NOTE(review): the spectrograms come from librosa, whose default mel layout
    may differ from this formula -- treat the bin index as approximate.
    """
    scaled = tf.math.log(1. + frequency / 700.) * 2595.
    bin_position = scaled / mels_per_bin

    return int(bin_position)

def mel_bin_to_frequency(mel_bin):
    """Return the frequency at the lower edge of ``mel_bin``.

    This inverts ``frequency_to_mel_bin`` (up to its integer truncation): the
    bin index is scaled back by ``mels_per_bin`` and then
    ``m = 2595 * log(1 + f / 700)`` is solved for ``f``.
    """
    mel = mel_bin * mels_per_bin
    # The inverse must divide by the same 2595 factor the forward mapping
    # multiplies by; dividing by 700 does not round-trip.
    freq = 700. * (tf.math.exp(mel / 2595.) - 1.)

    return freq


# load a signal and convert it to a mel-spectrogram
# NOTE: crop_length is given in seconds, all signals are 60 seconds long
def load_spec(filename, index=-1, crop_length=crop_length):
    """Load an audio file, crop it and return the crop as a dB mel-spectrogram.

    Args:
        filename: path of the audio file. For random crops the file stem is
            looked up in ``annotations['recording_id']``.
        index: ``-1`` selects a random crop around a randomly chosen
            annotation of the recording; any other value is the start of the
            crop in seconds.
        crop_length: length of the crop in seconds.

    Returns:
        ``(melspec, start, end)`` when ``index == -1`` (start/end of the crop
        in seconds), otherwise just ``melspec``. ``melspec`` is a float32
        array produced by ``librosa.power_to_db``.

    Raises:
        ValueError: when a random crop is requested for a recording that has
            no row in ``annotations``.
    """
    # signal.shape == [<signal_length>,]
    signal, _ = lb.load(filename, sr=sample_rate)
    m = signal.shape[0]

    # helper functions for converting to and from signal length to seconds
    def ratio(seconds):
        return int(seconds / 60 * m)

    def anti_ratio(samples):
        return samples / m * 60

    window = ratio(crop_length)
    random_crop = index == -1

    if random_crop:
        # Same id the other helpers derive with split('/')[-1][:-5], without
        # assuming a fixed-width file name.
        recording_id = os.path.splitext(os.path.basename(filename))[0]
        label = annotations.loc[annotations['recording_id'] == recording_id]
        if label.shape[0] == 0:
            raise ValueError(f"no annotations found for recording {recording_id!r}")

        idx = randrange(label.shape[0])
        label_start = int(label['t_min'].iloc[idx])

        # randomly crop each signal at a point where the label is to be found,
        # with a slight offset
        # TODO: is this offset helping?
        start_offset = randrange(ratio(crop_length - (crop_length/4)))
        start = max(ratio(label_start) - start_offset, 1)
        # Keep the whole window inside the signal so every random crop has
        # the full crop_length instead of being cut short at the end.
        start = max(min(start, m - window), 0)
    else:
        start = ratio(index)

    end = start + window
    signal = signal[start:end]

    # `y` is passed by keyword: recent librosa releases no longer accept the
    # audio as a positional argument.
    melspec = lb.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=n_mels,
                                        fmin=f_min, fmax=f_max)
    melspec = lb.power_to_db(melspec).astype(np.float32)

    if random_crop:
        # this function should honestly just return the labels themselves
        # instead of the times at which the labels are found
        return melspec, anti_ratio(start), anti_ratio(end)

    return melspec

# see https://arxiv.org/pdf/1710.09412.pdf
def mixup(inp, targ, alpha=0.1):
    """Blend every example of a batch with another, randomly chosen example.

    Args:
        inp: batch of inputs; reshaping the weights to [-1, 1, 1, 1] means a
            rank-4 batch is expected.
        targ: batch of targets, rank 2 (weights are reshaped to [-1, 1]).
        alpha: concentration of the symmetric Beta(alpha, alpha) distribution
            the per-example mixing weights are drawn from.

    Returns:
        ``(x, y)``: the mixed inputs and the identically mixed targets.
    """
    # tf.shape works for eager and symbolic tensors alike; len() on a symbolic
    # tensor only works when AutoGraph happens to rewrite it.
    batch = tf.shape(inp)[0]

    indice = tf.random.shuffle(tf.range(batch))
    sinp = tf.gather(inp, indice, axis=0)
    starg = tf.gather(targ, indice, axis=0)

    t = tf.compat.v1.distributions.Beta(alpha, alpha).sample([batch])
    tx = tf.reshape(t, [-1, 1, 1, 1])
    ty = tf.reshape(t, [-1, 1])
    x = inp * tx + sinp * (1-tx)
    y = targ * ty + starg * (1-ty)

    return x, y


# Dataset Generation
Actual making of the TensorFlow Datasets using generators to fetch the data. The created datasets are then divided into five different folds. If you're unfamiliar with k-fold validation, see https://en.wikipedia.org/wiki/Cross-validation_(statistics).

Something I tried, to improve the correlation between cross-validation and the leaderboard, was to make validation mirror inference: validation slices each recording's spectrogram into crops, stacks them, and takes the maximum prediction per class across the slices as the prediction for the entire recording.

In [None]:
BATCH_SIZE = 8
class_count = 24

train_path = "../input/rfcx-species-audio-detection/train/"
#train_path = "../input/rcfx-spectrograms-32-khz/train/"

# only training on the signals which contain true positive labels
# (one entry per annotation row, so a recording may appear more than once)
train_files = list(annotations['recording_id'])

def generate_dataset(batch_size, fold, train_files, validation_files):
    """Build the training and validation tf.data pipelines for one fold.

    Args:
        batch_size: number of random crops per training batch.
        fold: index of the fold (currently unused).
        train_files: audio file paths to draw random training crops from.
        validation_files: audio file paths evaluated slice by slice.

    Returns:
        ``(train_dataset, validation_dataset)``. The training dataset repeats
        forever, is batched and passed through ``mixup``. Each validation
        element is one recording: a stack of consecutive crops plus the
        matching stack of per-crop labels.
    """
    def train_generator():
        # Iterate the list passed to this call. Going through a helper that
        # reads the module-level `train_files` would make every fold train on
        # whatever list that global holds when the generator finally runs.
        for path in train_files:
            recording_id = path.split('/')[-1][:-5]
            image, start, end = load_spec(path)
            images = augment(tf.expand_dims(image, -1))

            labels = get_labels_numpy(recording_id, start, end, annotations)

            yield images, labels

    def validation_generator():
        for path in validation_files:
            recording_id = path.split('/')[-1][:-5]

            # whole recording as one spectrogram, channel axis appended
            image = tf.expand_dims(load_spec(path, 0, 60), axis=-1)
            width = image.shape[1]
            x_step = width // 60 * crop_length

            images = []
            labels = []
            for j in range(0, width - x_step, x_step):
                images.append(augment(image[:, j:j+x_step], training=False))

                # convert the slice's column range back to seconds
                labels.append(tf.cast(get_labels_numpy(recording_id,
                                                       j/width*60, (j+x_step)/width*60,
                                                       annotations), tf.float32))

            yield tf.stack(images), tf.stack(labels)

    train_dataset = tf.data.Dataset.from_generator(
        train_generator,
        output_types=(tf.float32, tf.float32)).repeat().batch(batch_size).map(mixup)

    validation_dataset = tf.data.Dataset.from_generator(
        validation_generator,
        output_types=(tf.float32, tf.float32))#.batch(batch_size)

    return train_dataset, validation_dataset

# Creating each fold
# Note that this is deterministic and could likely be improved by shuffling
# the lists containing each class before selecting recording_ids for the folds
#
# NOTE: `train_files` and `validation_files` are rebound at module level on
# every iteration, so after the loop they hold the LAST fold's file paths.
# Code further down (get_data_index, get_validation_data, steps_per_epoch)
# reads these globals.
folds = 5
datasets = []
for fold in range(folds):
    train_files = []
    validation_files = []
    for c in range(class_count):
        # recording ids of every annotation row of species c
        class_records = annotations.loc[annotations['species_id'] == c]['recording_id']
        # size of one validation slice; the remainder of the integer division
        # always ends up in the training part
        class_ratio = class_records.shape[0] // folds 
        class_records = list(class_records)
        # slice number `fold` is held out for validation, the rest is training
        train_files += class_records[:class_ratio*fold] + class_records[class_ratio*(fold+1):]
        validation_files += class_records[class_ratio*fold:class_ratio*(fold+1)]
        
    shuffle(train_files)
    shuffle(validation_files)
    
    # turn recording ids into file paths
    train_files = [train_path+s+".flac" for s in train_files]
    validation_files = [train_path+s+".flac" for s in validation_files]
        
    # each entry is a (train_dataset, validation_dataset) pair
    datasets.append(generate_dataset(BATCH_SIZE, fold, train_files, validation_files))

# Data Augmentations
Various augmentations to improve the generalization of the model. I wasn't able to find any success in using them in this notebook.

In [None]:
# size is the fraction of axis 2 removed by each cut
# images includes batch dimension
def time_dropout(images, cuts=3, size=0.1):
    """Zero out ``cuts`` randomly placed bands along axis 2 of ``images``.

    Args:
        images: tensor-like object exposing ``.shape`` and ``.numpy()`` with
            at least three axes.
        cuts: number of bands to blank (bands may overlap).
        size: width of each band as a fraction of ``images.shape[2]``.

    Returns:
        A NumPy array of the same shape with the selected bands set to 0.
    """
    width = images.shape[2]
    # a band can never be wider than the axis it is cut from
    cut_size = min(int(width * size), width)
    images = images.numpy()
    for _ in range(cuts):
        # +1 so a band may also end on the very last column
        begin = randrange(width - cut_size + 1)
        # assigning a scalar broadcasts over any trailing axes (e.g. channels)
        images[:, :, begin:begin + cut_size] = 0

    return images

def frequency_masking(mel_spectrogram):
    """Multiply ``mel_spectrogram`` by two random zero bands along axis 1.

    Each band is up to 79 entries wide. The mask has shape
    (shape[0], shape[1], 1), so a rank-3 input is expected.
    NOTE(review): the band is laid out along axis 1; confirm that this is the
    frequency axis for the spectrogram layout used by the caller.

    Returns:
        The masked spectrogram cast to float32.
    """
    max_band = 80
    num_masks = 2

    dims = tf.shape(mel_spectrogram)
    rows = dims[0]
    cols = tf.cast(dims[1], dtype=tf.int32)

    for _ in range(num_masks):
        band = tf.random.uniform([], minval=0, maxval=max_band, dtype=tf.int32)
        offset = tf.random.uniform([], minval=0, maxval=cols - band, dtype=tf.int32)

        # ones | zeros | ones along axis 1
        keep_head = tf.ones(shape=(rows, cols - offset - band, 1))
        dropped = tf.zeros(shape=(rows, band, 1))
        keep_tail = tf.ones(shape=(rows, offset, 1))
        mel_spectrogram = mel_spectrogram * tf.concat((keep_head, dropped, keep_tail), 1)

    return tf.cast(mel_spectrogram, dtype=tf.float32)


def time_masking(mel_spectrogram):
    """Multiply ``mel_spectrogram`` by one random zero band along axis 0.

    The band is up to 39 entries wide. The mask has shape
    (shape[0], shape[1], 1), so a rank-3 input is expected.
    NOTE(review): the band is laid out along axis 0; confirm that this is the
    time axis for the spectrogram layout used by the caller.

    Returns:
        The masked spectrogram cast to float32.
    """
    max_band = 40
    num_masks = 1

    dims = tf.shape(mel_spectrogram)
    rows, cols = dims[0], dims[1]

    for _ in range(num_masks):
        band = tf.random.uniform([], minval=0, maxval=max_band, dtype=tf.int32)
        offset = tf.random.uniform([], minval=0, maxval=rows - band, dtype=tf.int32)

        # ones | zeros | ones along axis 0
        keep_head = tf.ones(shape=(rows - offset - band, cols, 1))
        dropped = tf.zeros(shape=(band, cols, 1))
        keep_tail = tf.ones(shape=(offset, cols, 1))
        mel_spectrogram = mel_spectrogram * tf.concat((keep_head, dropped, keep_tail), 0)

    return tf.cast(mel_spectrogram, dtype=tf.float32)


def random_brightness(image):
    """Apply ``tf.image.random_brightness`` to ``image`` with max_delta 0.2."""
    max_delta = 0.2
    return tf.image.random_brightness(image, max_delta)

def random_gamma(image):
    """Randomly change the contrast of ``image`` by a factor in [0.1, 0.3].

    Despite its name this wraps ``tf.image.random_contrast``; no gamma
    adjustment is applied.
    """
    contrast_range = dict(lower=0.1, upper=0.3)
    return tf.image.random_contrast(image, **contrast_range)

def random_flip_right(image):
    """Randomly mirror ``image`` left-to-right."""
    flipped = tf.image.random_flip_left_right(image)
    return flipped

def random_flip_up_down(image):
    """Randomly flip ``image`` upside down (vertical flip)."""
    # Use the vertical flip; the left/right flip already has its own wrapper.
    return tf.image.random_flip_up_down(image)

# Augmentation ops apply_augmentation may pick from (all currently disabled).
available_ops = [
          #frequency_masking,
          #time_masking, 
          #random_brightness, 
          #random_flip_up_down,
          #random_flip_right 
         ]

def apply_augmentation(image):
    """Apply 0-2 randomly chosen ops from ``available_ops`` to ``image``.

    Args:
        image: input passed through the selected ops in sequence.

    Returns:
        The augmented image, or ``image`` unchanged when no op is enabled.
    """
    if not available_ops:
        # Nothing to choose from: drawing an index from an empty integer
        # range (maxval == 0) is not a valid request to tf.random.uniform.
        return image

    num_layers = int(np.random.uniform(low=0, high=3))

    for _ in range(num_layers):
        op_to_select = tf.random.uniform([], maxval=len(available_ops), dtype=tf.int32, seed=1)
        for i, op in enumerate(available_ops):
            # run only the op whose position matches the drawn index; the op
            # is bound as a default argument to avoid late binding
            image = tf.cond(
                tf.equal(i, op_to_select),
                lambda selected_func=op: selected_func(image),
                lambda: image)
    return image

def augment(images, training=True):
    """Convert single-channel spectrograms into standardized, resized RGB images.

    Args:
        images: tensor whose last axis is a single (grayscale) channel.
        training: currently unused; the random augmentation step that would
            depend on it (apply_augmentation) is disabled.

    Returns:
        The images as 3-channel tensors, standardized per image and resized to
        (n_mels // 2, n_mels).
    """
    target_size = (n_mels // 2, int(n_mels))

    rgb = tf.image.grayscale_to_rgb(images)
    standardized = tf.image.per_image_standardization(rgb)

    return tf.image.resize(standardized, target_size)

def get_data_index(index):
    """Return a random labelled crop of the ``index``-th module-level training file.

    Returns:
        ``(melspec, start, end, recording_id)`` where the first three values
        come from ``load_spec`` and ``recording_id`` is the file name without
        its last five characters (the extension).
    """
    path = train_files[index]
    recording_id = path.split('/')[-1][:-5]
    spec, start, end = load_spec(path)

    return spec, start, end, recording_id

def get_validation_data(index):
    """Return a random labelled crop of the ``index``-th module-level validation file.

    Returns:
        ``(melspec, start, end, recording_id)`` where the first three values
        come from ``load_spec`` and ``recording_id`` is the file name without
        its last five characters (the extension).
    """
    path = validation_files[index]
    spec, start, end = load_spec(path)
    recording_id = path.split('/')[-1][:-5]

    return spec, start, end, recording_id

# Model Definition
After being sent through the model, the output is averaged across the frequency-axis (y-axis) then sent through an attention mechanism before the final predictions are made.

In [None]:
# Pull a single batch from the first fold's training dataset and record the
# shape of its first image; this drives the model's input shape below.
for i, _ in datasets[0][0].take(1):
    spec_shape = i[0].shape
    print(spec_shape)

# Print the shape of the image stack of one validation element of the first fold.
for _ in datasets[0][1].take(1):
    print(_[0].shape)

# Height and width come from the data; the channel count is fixed at 3
# (augment converts the spectrograms to RGB).
input_shape = (spec_shape[0], spec_shape[1], 3)
def get_model(model='inception'):
    """Build a classifier: ImageNet backbone followed by an attention-pooling head.

    Args:
        model: key into the module-level ``models`` dict selecting the backbone.

    Returns:
        A ``tf.keras.Model`` mapping inputs of ``input_shape`` to
        ``class_count`` sigmoid scores.
    """
    backbone = models[model](include_top=False, weights='imagenet',
                             pooling=None, input_shape=input_shape)

    inputs = keras.Input(input_shape)

    # average the backbone's feature map over axis 1, leaving a sequence
    features = tf.math.reduce_mean(backbone(inputs), axis=1)
    features = layers.Dropout(rate=0.25)(features)

    # per-class attention weights, normalized along the remaining sequence axis
    attention = layers.Conv1D(class_count, 1)(features)
    attention = tf.nn.softmax(tf.clip_by_value(attention, -10, 10), axis=1)

    # per-position class scores, pooled with the attention weights
    scores = layers.Conv1D(class_count, 1)(features)
    pooled = tf.math.reduce_sum(attention * scores, axis=1)
    outputs = tf.nn.sigmoid(pooled)

    return tf.keras.Model(inputs, outputs)

# Competition metric
See https://stackoverflow.com/questions/55881642/how-to-interpret-label-ranking-average-precision-score.
 
Code here taken from https://www.kaggle.com/ashusma/training-rfcx-tensorflow-tpu-effnet-b2#Competition-Metric

In [None]:
def _one_sample_positive_class_precisions(example):
    """Compute the ranking precision of every positive class of one sample.

    Args:
        example: ``(y_true, y_pred)`` pair for a single sample.

    Returns:
        A vector with one entry per class: the precision at the rank of that
        class for positive classes, 0 for all others.
    """
    truth, scores = example
    truth = tf.reshape(truth, tf.shape(scores))

    # classes ordered by decreasing score, and the rank of every class
    order = tf.argsort(scores, direction='DESCENDING')
    ranks = tf.argsort(order)

    # number of positives found among the top-k classes, for every k
    hits_in_order = tf.gather(truth, order)
    running_hits = tf.math.cumsum(tf.cast(hits_in_order, tf.float32))

    positive_idx = tf.where(truth)[:, 0]
    positive_ranks = tf.boolean_mask(ranks, truth)
    hits_at_rank = tf.gather(running_hits, positive_ranks)
    precisions = hits_at_rank / (1 + tf.cast(positive_ranks, tf.float32))

    # scatter the precisions back to their class positions
    return tf.scatter_nd(positive_idx[:, None], precisions, [scores.shape[0]])

class LWLRAP(tf.keras.metrics.Metric):
    """Label-weighted label-ranking average precision over pooled batches.

    Every batch is first collapsed to a single sample by taking the maximum
    of the labels and of the predictions along the batch axis.
    """

    def __init__(self, num_classes, name='lwlrap'):
        super().__init__(name=name)

        # number of leading columns of y_true / y_pred that are scored
        self._num_classes = num_classes

        self._precisions = self.add_weight(
            name='per_class_cumulative_precision',
            shape=[num_classes],
            initializer='zeros',
        )

        self._counts = self.add_weight(
            name='per_class_cumulative_count',
            shape=[num_classes],
            initializer='zeros',
        )

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate per-class precisions for one batch (``sample_weight`` is ignored)."""
        # Slice with the size this metric was built with, so it always
        # matches the shape of the accumulator variables.
        y_true = y_true[:, :self._num_classes]
        y_pred = y_pred[:, :self._num_classes]

        # pool the batch into a single sample
        y_true = tf.math.reduce_max(y_true, axis=0, keepdims=True)
        y_pred = tf.math.reduce_max(y_pred, axis=0, keepdims=True)

        precisions = tf.map_fn(
            fn=_one_sample_positive_class_precisions,
            elems=(y_true, y_pred),
            dtype=(tf.float32),
        )

        increments = tf.cast(precisions > 0, tf.float32)
        total_increments = tf.reduce_sum(increments, axis=0)
        total_precisions = tf.reduce_sum(precisions, axis=0)

        self._precisions.assign_add(total_precisions)
        self._counts.assign_add(total_increments)

    def result(self):
        """Return the count-weighted mean of the per-class average precisions."""
        per_class_lwlrap = self._precisions / tf.maximum(self._counts, 1.0)
        # divide_no_nan yields 0 instead of NaN while nothing has been counted
        per_class_weight = tf.math.divide_no_nan(self._counts,
                                                 tf.reduce_sum(self._counts))
        overall_lwlrap = tf.reduce_sum(per_class_lwlrap * per_class_weight)
        return overall_lwlrap

    def reset_state(self):
        """Zero both accumulators."""
        self._precisions.assign(tf.zeros_like(self._precisions))
        self._counts.assign(tf.zeros_like(self._counts))

    def reset_states(self):
        """Older spelling of ``reset_state``, kept for callers that use it."""
        self.reset_state()


class Precision(keras.metrics.Metric):
    """Keras precision restricted to the first ``class_count`` output columns."""

    def __init__(self):
        super().__init__()

        # the wrapped metric does the actual bookkeeping
        self.met = keras.metrics.Precision()

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Forward the first ``class_count`` columns to the wrapped metric."""
        truth = y_true[:, :class_count]
        predicted = y_pred[:, :class_count]
        self.met.update_state(truth, predicted, sample_weight)

    def result(self):
        """Return the wrapped metric's current value."""
        return self.met.result()

    def reset_states(self):
        """Reset the wrapped metric."""
        self.met.reset_states()

# Learning Rate Schedule
A learning rate that changes over time typically performs better than a constant one. Here we use a linear warmup followed by cosine decay; the warmup idea is the same one used in the schedule that trained the [Transformer model](https://arxiv.org/pdf/1706.03762.pdf).

In [None]:
# Schedule length. NOTE: `train_files` is whatever the module-level name
# holds when this cell runs.
epochs = 30
steps_per_epoch = len(train_files) // BATCH_SIZE
total_steps = epochs * steps_per_epoch

# The first tenth of the epochs is spent warming up.
warmup_steps = steps_per_epoch * (epochs // 10)

# Learning rate at the start of warmup and at its peak.
warmup_learning_rate = 1e-5
learning_rate_base = 1.5e-3


@tf.function
def cosine_decay_with_warmup(global_step,
                             hold_base_rate_steps=0):
    """Learning rate for ``global_step``: linear warmup, optional hold, cosine decay.

    Reads the module-level ``total_steps``, ``warmup_steps``,
    ``learning_rate_base`` and ``warmup_learning_rate``.

    Args:
        global_step: current optimizer step.
        hold_base_rate_steps: number of steps after warmup during which the
            base rate is held before the decay starts.

    Returns:
        The learning rate as a tensor; 0.0 once ``global_step`` exceeds
        ``total_steps``.

    Raises:
        ValueError: if ``total_steps < warmup_steps`` or, when warmup is used,
            ``learning_rate_base < warmup_learning_rate``.
    """
    if total_steps < warmup_steps:
        raise ValueError('total_steps must be larger or equal to '
                     'warmup_steps.')

    step = tf.cast(global_step, tf.float32)
    decay_steps = float(total_steps - warmup_steps - hold_base_rate_steps)

    # half a cosine period, starting at the end of warmup + hold
    rate = 0.5 * learning_rate_base * (1 + tf.cos(
        np.pi * (step - warmup_steps - hold_base_rate_steps) / decay_steps))

    if hold_base_rate_steps > 0:
        # keep the base rate until the hold phase is over
        rate = tf.where(
            global_step > warmup_steps + hold_base_rate_steps,
            rate, learning_rate_base)

    if warmup_steps > 0:
        if learning_rate_base < warmup_learning_rate:
            raise ValueError('learning_rate_base must be larger or equal to '
                         'warmup_learning_rate.')
        # straight line from warmup_learning_rate up to learning_rate_base
        gradient = (learning_rate_base - warmup_learning_rate) / warmup_steps
        ramp = gradient * step + warmup_learning_rate
        rate = tf.where(global_step < warmup_steps, ramp, rate)

    return tf.where(global_step > total_steps, 0.0, rate,
                    name='learning_rate')

class LRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Keras schedule wrapper around ``cosine_decay_with_warmup``."""

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        return cosine_decay_with_warmup(step)

    def get_config(self):
        """Return the (empty) config so optimizers using this schedule can be serialized.

        All settings are read from module-level constants, so there is
        nothing to store.
        """
        return {}

# Training
Nested loop for training all five folds for each model. The loop is wrapped in a function to make sure this notebook can submit; otherwise it would take too long and time out.

In [None]:
# Per-sample binary cross-entropy with label smoothing; the reduction is left
# to compute_average_loss below.
lf = tf.keras.losses.BinaryCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE,
    label_smoothing=0.2,
)
# The second argument receives the model output (already passed through a
# sigmoid in get_model).
loss = lambda y_true, y_pred: tf.nn.compute_average_loss(lf(y_true, y_pred))

# One dict of per-class predictions for every trained fold.
submissions = []

def train():
    """Train every backbone on every fold and write the submission files.

    For each model in ``models`` and each fold in ``datasets`` this trains a
    fresh model, predicts all test recordings and writes
    ``submission_{model}_{fold}.csv``. After the last fold of a model the
    per-fold predictions are averaged into ``submission_{model}.csv``.
    Every fold's raw prediction dict is also appended to the module-level
    ``submissions`` list.
    """
    # NOTE(review): this relative path differs from the "../input/..." layout
    # used for the training data -- confirm it is right for the target setup.
    test_filepath = "data/raw/test/"
    # Listed once and sorted so every fold predicts the files in the same
    # order; the fold average below is computed row by row.
    test_files = [test_filepath+s for s in sorted(os.listdir(test_filepath))]
    class_columns = [f's{x}' for x in range(class_count)]

    for m in models.keys():
        # Per-model results. A function-local list avoids rebinding the
        # module-level `submissions` inside this function and keeps one
        # model's folds out of the next model's average.
        fold_frames = []

        for i, fold in enumerate(datasets):
            print(f"FOLD {i}")

            train_dataset, validation_dataset = fold

            model = get_model(m)
            model.compile(optimizer=keras.optimizers.Adam(learning_rate=LRSchedule()),
                          loss=loss,
                          metrics=[LWLRAP(24)])

            checkpointing = tf.keras.callbacks.ModelCheckpoint(
                                f"fold_{m}_{i}.h5", monitor='val_loss', verbose=0, save_best_only=False,
                                save_weights_only=True, mode='min', save_freq='epoch')

            # Train the model
            model.fit(train_dataset,
                      validation_data=validation_dataset,
                      epochs=epochs,
                      steps_per_epoch=steps_per_epoch,
                      callbacks=[checkpointing])

            # Use the trained model to create the submission for this fold
            submission = { 'recording_id': [] }
            for column in class_columns:
                submission[column] = []

            for filepath in tqdm(test_files):
                recording_id = filepath.split('/')[-1][:-5]
                inp = []

                # For inference, we evaluate the model on each {crop_length}-second segment of the signal
                # and take the maximum of each of its predictions 
                # for a given class to make the image-wide prediction.
                #
                # Each batch consists of a single image, sliced and stacked.
                image = tf.expand_dims(load_spec(filepath, 0, 60), axis=-1)
                x_step = (image.shape[1]//60*(crop_length))
                for j in range(0, image.shape[1]-(image.shape[1]//60*crop_length), x_step):
                    im = image[:,j:j+x_step]
                    im = tf.image.resize(im, (n_mels//2, int(n_mels)))
                    inp.append(im)

                inp = tf.stack(inp)
                inp = augment(inp, training=False)
                logitses = tf.squeeze(model(inp, training=False))
                logits = tf.math.reduce_max(logitses, axis=0).numpy()[:class_count]

                submission['recording_id'].append(recording_id)
                for x, species in enumerate(logits):
                    submission[f's{x}'].append(species)

            submissions.append(submission)

            fold_frame = pd.DataFrame.from_dict(submission)
            fold_frame.to_csv(f"submission_{m}_{i}.csv", index=False)
            fold_frames.append(fold_frame)

        if not fold_frames:
            # no folds were trained, so there is nothing to merge
            continue

        # Merge the submissions from each fold by using the average prediction for each recording_id
        submission_values = np.mean([np.array(frame.drop('recording_id', axis=1))
                                     for frame in fold_frames], axis=0)
        merged = fold_frames[0].copy()
        merged[class_columns] = submission_values

        merged.to_csv(f"submission_{m}.csv", index=False)