# Install and Load Packages

In [None]:
# Shell escape: enable the `widgetsnbextension` notebook extension for this Jupyter install.
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import os
from os import listdir
from os.path import isfile, join

from tqdm.auto import tqdm
import joblib
import gc
import time
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import scipy as sp

import librosa
import librosa.display

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.utils import Progbar

from IPython.core.display import display, HTML
import IPython.display as ipd
from functools import partial

from imblearn.over_sampling import RandomOverSampler

# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Raise the element count above which NumPy abbreviates printed arrays.
np.set_printoptions(threshold=100000)

In [None]:
# Show the librosa version in use.
librosa.__version__

In [None]:
# Show the TensorFlow version in use.
tf.__version__

In [None]:
# Show the working directory; the relative "../input/..." paths below are resolved against it.
os.getcwd()

# Define global variables

In [None]:
# True: use the Kaggle input paths and load saved weights instead of training (see the training cell).
ON_KAGGLE = True
# Sub-folder appended to the local checkpoint directory when not running on Kaggle.
MODEL_FOLDER_NAME = 'resnet50/'

In [None]:
# train folder
TRAIN_FOLDER = "../input/birdsong-recognition/train_audio/"

if ON_KAGGLE:
    # On Kaggle the trained model and its metric log come from an attached dataset.
    MODEL_PATH = '../input/birdcall-models/'
    # Pass the path rather than an open() handle so joblib opens *and closes* the file itself.
    # NOTE: joblib.load unpickles the file; only load metric logs you produced yourself.
    metric_log = joblib.load(MODEL_PATH + "metric_log.pkl")
    NOCALL_TRAIN_PATH = "../input/bird-backgrounds/"  # nocall audio
else:
    # Off Kaggle: checkpoints live in a local directory. The base directory can be
    # overridden with the BIRDCALL_CHECKPOINT_DIR environment variable; the default
    # keeps the original location.
    MODEL_PATH = os.path.join(
        os.environ.get("BIRDCALL_CHECKPOINT_DIR", '/Users/longy/Documents/checkpoints/birdcall/'),
        MODEL_FOLDER_NAME,
    )
    # If a metric log from an earlier run exists, load it and note how many epochs it covers
    # so training can pick up from the last epoch.
    if os.path.isfile(MODEL_PATH + "metric_log.pkl"):
        metric_log = joblib.load(MODEL_PATH + "metric_log.pkl")
        start_epoch = len(metric_log[0])
    NOCALL_TRAIN_PATH = "../input/nocall/"

In [None]:
# set default sampling rate to 32kHz (load_clip resamples every clip to this rate)
DEFAULT_SR = 32000

# frequency range in Hz for the mel filter bank; the upper bound is half the sampling rate
F_MIN, F_MAX = 20, DEFAULT_SR/2

# number of threads (passed as num_parallel_calls to the tf.data map stages)
NUM_THREADS = 8

# number of samples in each STFT window
N_FFT=2048

# step in samples between consecutive STFT windows
HOP_LENGTH = 512

# number of mel bins
N_MELS=128

# number of decibels below reference value to preserve in log-melspectrogram
TOP_DB=80

# clip length (in seconds) to train and predict on
NUM_SECONDS=5

# fraction of the data used for training; the remainder is used for validation
TRAIN_SIZE=0.8

# Explore Training Data

In [None]:
# Load the training metadata and drop exact duplicate rows.
train_info = pd.read_csv("../input/birdsong-recognition/train.csv").drop_duplicates()
print(train_info.shape)
# Number of distinct ebird codes in the metadata.
print(len(train_info.ebird_code.unique()))
print(train_info.columns)
train_info.head()

## create code for each bird, add code for nocall

In [None]:
# number of birds to make prediction for, adding 'nocall'
# (this is the width of the model's output layer)
num_birds = len(train_info.ebird_code.unique())+1

# categorize ebird_code: store the integer category code of each ebird_code string
train_info['ebird_code_cat'] = train_info.ebird_code.astype('category').cat.codes

## create lookup between bird name and bird code

In [None]:
# create lookup for abbreviated name: integer category code -> ebird_code string
nocall_label='nocall'
name_lookup = dict(zip(train_info.ebird_code.astype('category').cat.codes, 
                       train_info.ebird_code.astype('category')))

# 'nocall' gets the next free code after the largest bird code
nocall_code = np.max(train_info.ebird_code.astype('category').cat.codes.unique())+1
name_lookup[nocall_code]=nocall_label

# create reverse lookup for code (from abbreviated name)
code_lookup={v:k for k,v in name_lookup.items()}

# create lookup for sampling rate: filename -> value of the sampling_rate column
sr_lookup = dict(zip(train_info.filename, train_info.sampling_rate))

## Examine one audio file

In [None]:
# example: first row of the training metadata
example = train_info.iloc[0,:]

# filename
filename = example.filename

# ebird
bird = example.ebird_code

# sampling rate as recorded in the metadata
# (the name `sr` is rebound to load_clip's return value when the clip is loaded below)
sr = example.sampling_rate

# duration of clip
duration = example.duration

# Print the metadata fields of the example recording.
print("#ebird code: {}\n".format(bird))
print("#label: {}\n".format(example.primary_label))
print("#secondary labels: {}\n".format(example.secondary_labels))
print("#description:\n {}\n".format(example.description))
print("#type: {}\n".format(example.type))
print("#saw bird: {}\n".format(example.bird_seen))
print("#sampling rate: {} Hz\n".format(sr))
print("#recording length: {} seconds\n".format(duration))

### Load sound clip

In [None]:
# shared in discussion: https://www.kaggle.com/c/birdsong-recognition/discussion/179592
def load_clip(path):
    """Load an audio file as a mono float32 signal resampled to DEFAULT_SR.

    Args:
        path: location of the audio file, passed straight to librosa's audioread loader.

    Returns:
        (clip, sr): the mono waveform and the sampling rate it is expressed in,
        which is always DEFAULT_SR.
    """
    # NOTE(review): this relies on a private librosa helper (`__audioread_load`);
    # confirm it still exists in the installed librosa version.
    clip, sr_native = librosa.core.audio.__audioread_load(path, offset=0.0, duration=None, dtype=np.float32)
    clip = librosa.to_mono(clip)
    sr = DEFAULT_SR
    if sr_native > 0:
        # Keyword arguments work on every librosa version; newer releases reject
        # the sampling rates when they are passed positionally.
        clip = librosa.resample(clip, orig_sr=sr_native, target_sr=sr, res_type='kaiser_fast')
    return clip, sr

In [None]:
# Load the example recording; `sr` now holds the rate returned by load_clip.
sound_clip, sr = load_clip(TRAIN_FOLDER + bird + '/' + filename)

In [None]:
# Audio player for the original file on disk.
ipd.Audio(TRAIN_FOLDER + bird + '/' + filename)

### Show Timbre
Timbre is the quality of sound that distinguishes the tone of different instruments and voices even if the sounds have the same pitch and loudness.

**mel-spectrogram**

Definition: a short-time Fourier transform (STFT) is computed in each window, and the resulting frequencies are mapped onto the mel scale (a scale on which equal distances in pitch sound equally distant to the listener).

In [None]:
# Mel-scaled power spectrogram of the example clip. The signal is passed as `y=`
# because newer librosa releases only accept it as a keyword argument.
melspectrogram = librosa.feature.melspectrogram(y=sound_clip, sr=sr, fmin=F_MIN, fmax=F_MAX)
print("In this case, melspectrogram computed {} mel-frequency spectrogram coefficients over {} frames.".format(melspectrogram.shape[0], melspectrogram.shape[1]))
# Convert power to decibels for display.
melspectrogram = librosa.power_to_db(melspectrogram).astype(np.float32)
plt.figure(figsize=(20,3))
librosa.display.specshow(melspectrogram, sr=sr, x_axis='time', y_axis='mel', fmin=F_MIN, fmax=F_MAX)
plt.colorbar(format='%+2.0f dB')

print(melspectrogram.shape)
print("no white noise:")
ipd.Audio(sound_clip, rate=sr)

#### Decibel-Frequency chart

Definition: Plot x=Frequency in Hz and y=Decibel for the entire clip. Useful to show the effect of adding noise to clip.

In [None]:
def plot_db_freq(clip, sr=DEFAULT_SR, n_fft=N_FFT):
    """Plot the time-averaged magnitude spectrum (in dB) of a clip.

    Args:
        clip: 1-D audio signal.
        sr: sampling rate used to label the frequency axis. Defaults to DEFAULT_SR,
            the rate load_clip resamples to, so the plot no longer depends on
            whatever a notebook-global `sr` happens to hold.
        n_fft: STFT window size. It is passed to the STFT explicitly so the tick
            positions and the transform cannot drift apart.
    """
    S = librosa.stft(clip, n_fft=n_fft)
    D = librosa.amplitude_to_db(np.abs(S))
    # average over time frames -> one dB value per frequency bin
    D_AVG = np.mean(D, axis=1)

    # eight evenly spaced ticks over the lower half of the FFT bins, labelled in Hz
    x_ticks_positions = list(range(0, n_fft // 2, n_fft // 16))
    x_ticks_labels = [str(sr / n_fft * n) + 'Hz' for n in x_ticks_positions]

    plt.figure(figsize=(10,5))
    plt.plot(D_AVG)
    plt.xticks(x_ticks_positions, x_ticks_labels)
    plt.xlabel('Frequency')
    plt.ylabel('dB')
    plt.show()

In [None]:
# Baseline: spectrum and playback of the clip without added noise.
plot_db_freq(sound_clip)
ipd.Audio(sound_clip, rate=sr)

### add white noise

In [None]:
def add_white_noise(clip, wn_rate):
    """Return `clip` plus standard-normal noise scaled by `wn_rate`.

    One noise sample is drawn per element of `clip` from NumPy's global random state.
    """
    gaussian = np.random.randn(len(clip))  # randn: standard normal distribution
    scaled_noise = wn_rate * gaussian
    return clip + scaled_noise

In [None]:
# Spectrum of the example clip after adding white noise at rate 0.01.
wn_rate = 0.01
sound_clip_with_wn = add_white_noise(sound_clip, wn_rate)
plot_db_freq(sound_clip_with_wn)

### add pink noise

In [None]:
def add_pink_noise(clip, freq_floor,freq_ceil, n_freq):
    """Build a 1/f-weighted sum of sinusoids derived from `clip`.

    For each of `n_freq` frequencies evenly spaced between `freq_floor` and
    `freq_ceil`, a sinusoid with amplitude 1/f and a random phase offset is
    accumulated. The sinusoid's argument is `2*pi*f*clip`, i.e. it is driven by
    the clip's sample values, and only the accumulated sum is returned (the
    input clip itself is not added to it).
    """
    accumulated = np.zeros(len(clip))
    for hz in np.linspace(freq_floor, freq_ceil, n_freq):
        phase = np.random.rand(1)*2*np.pi  # rand: uniform distribution
        weight = 1/hz**1
        accumulated = accumulated + weight*np.sin(2*np.pi*hz*clip+phase)
    return accumulated

In [None]:
# Spectrum of the add_pink_noise output for 30 frequencies between 1 and 100.
sound_clip_with_pn = add_pink_noise(sound_clip,1,100,30)
plot_db_freq(sound_clip_with_pn)

### experiment with different augmentation parameters

#### useful functions

In [None]:
# get melspectrogram
def librosa_get_melspec(sound_clip, sr):
    """Compute the log-mel spectrogram (in dB, float32) of a clip with librosa.

    Args:
        sound_clip: 1-D audio signal.
        sr: sampling rate of `sound_clip`.

    Returns:
        float32 array of dB values produced by librosa.power_to_db.
    """
    # `y=` must be a keyword on newer librosa releases. hop_length, n_mels and
    # top_db are spelled out from the notebook constants (their values equal the
    # librosa defaults) so this path stays in step with the settings above.
    melspectrogram = librosa.feature.melspectrogram(y=sound_clip,
                                                    sr=sr,
                                                    n_fft=N_FFT,
                                                    hop_length=HOP_LENGTH,
                                                    win_length=N_FFT,
                                                    center=False,
                                                    n_mels=N_MELS,
                                                    fmin=F_MIN,
                                                    fmax=F_MAX)
    melspectrogram = librosa.power_to_db(melspectrogram, top_db=TOP_DB).astype(np.float32)
    return melspectrogram

# standardize 2D image, convert to grayscale: https://www.kaggle.com/daisukelab/cnn-2d-basic-solution-powered-by-fast-ai
def np_to_grayscale(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Standardize `X` to uint8 and repeat it into three identical channels.

    Args:
        X: array to convert.
        mean, std, norm_max, norm_min, eps: optional overrides, forwarded to
            `standardize` (previously they were accepted but silently ignored).

    Returns:
        The result of `stack(standardize(X, ...))`.
    """
    X = standardize(X, mean=mean, std=std, norm_max=norm_max, norm_min=norm_min, eps=eps)
    return stack(X)

def standardize(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Standardize `X` and rescale it to uint8 values in [0, 255].

    Args:
        X: numeric array.
        mean, std: statistics used for standardization; computed from `X` when None.
        norm_max, norm_min: clipping bounds applied to the standardized values;
            the observed max/min are used when None.
        eps: added to `std` to avoid division by zero, and used as the minimum
            value range below which the result is all zeros.

    Returns:
        uint8 array with the same shape as `X`.
    """
    # Compare against None explicitly: `mean or X.mean()` would discard a
    # legitimate explicit value of 0.0.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    Xstd = (X - mean) / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = _max if norm_max is None else norm_max
    norm_min = _min if norm_min is None else norm_min
    if (_max - _min) > eps:
        # clip to the normalization bounds, then map [norm_min, norm_max] onto [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # (near-)constant input: nothing to scale
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

def stack(X):
    """Repeat `X` three times along a new trailing axis (one copy per colour channel)."""
    channel = np.asarray(X)[..., np.newaxis]
    return np.concatenate((channel, channel, channel), axis=-1)

# crop 5 seconds
def crop_image(img, sr, random=True, num_seconds=NUM_SECONDS, hop_length=HOP_LENGTH):
    """Cut a window of `num_seconds` out of a spectrogram along its frame axis.

    Args:
        img: 2-D array whose second axis indexes frames.
        sr: sampling rate of the audio the spectrogram was computed from.
        random: if True pick a random window, otherwise take the first `num_seconds`.
        num_seconds: window length in seconds.
        hop_length: samples between consecutive frames.

    Returns:
        float32 array holding the selected frames; the whole image when it spans
        `num_seconds` or less.
    """
    _, width = img.shape
    duration = width*hop_length/sr

    if not duration>num_seconds:
        # nothing to crop: keep every frame
        start_second, end_second = 0, duration
    elif random:
        # draw the window's end uniformly, then step back num_seconds for its start
        end_second = np.random.uniform(low=num_seconds,high=duration, size=1)[0]
        start_second = end_second-num_seconds
    else:
        # first num_seconds of the image
        end_second = num_seconds
        start_second = end_second-num_seconds

    # convert the window bounds from seconds to frame indices
    first_frame = int(np.floor(start_second*sr/hop_length))
    last_frame = int(np.round(end_second*sr/hop_length))
    window = img[:, first_frame:last_frame]
    return window.astype(np.float32)

#### white noise

In [None]:
n_col = 3
n_img = 9

# Grid of spectrogram images, one per white-noise rate (0, 0.005, ..., 0.04).
fig, axs = plt.subplots(n_img//n_col,n_col,figsize=(5*n_col,5*(n_img//n_col-1)))
for i in range(n_img):
    # row / column of the i-th panel
    x, y = divmod(i, n_col)
    ax = axs[x][y]

    # noisy clip -> log-mel spectrogram -> first 5 seconds -> grayscale image
    sc = add_white_noise(sound_clip, i*0.005)
    melspec = librosa_get_melspec(sc, sr)
    cropped_melspec = crop_image(melspec, sr, random=False)
    cropped_img = np_to_grayscale(cropped_melspec)

    ax.imshow(cropped_img)
    ax.set_title("White Noise Rate: " + str(i*0.005))

#### Pink Noise

In [None]:
n_col = 3
n_img = 9

# Grid of spectrogram images: rows vary the frequency ceiling, columns the number of steps.
fig, axs = plt.subplots(n_img//n_col,n_col,figsize=(5*n_col,5*(n_img//n_col-1)))
for i in range(n_img):
    # row / column of the i-th panel
    x, y = divmod(i, n_col)
    ax = axs[x][y]

    # pink-noise output -> log-mel spectrogram -> first 5 seconds -> grayscale image
    sc = add_pink_noise(sound_clip, 1, 100*(x+1), 30*(y+1))
    melspec = librosa_get_melspec(sc, sr)
    cropped_melspec = crop_image(melspec, sr, random=False)
    cropped_img = np_to_grayscale(cropped_melspec)

    ax.imshow(cropped_img)
    ax.set_title("Pink Noise: {}-{} Hz in a total of {} steps".format(1,100*(x+1),30*(y+1)))

# Get nocall data

In [None]:
# Every regular file in the nocall folder becomes one (nocall_code, filepath) row.
nocall_files = filter(isfile, (join(NOCALL_TRAIN_PATH, f) for f in listdir(NOCALL_TRAIN_PATH)))
nocall_info = pd.DataFrame([(nocall_code, path) for path in nocall_files],
                           columns=['ebird_code_cat', 'filepath'])

In [None]:
# Load and play the first nocall recording.
nocall_audio,sr = load_clip(nocall_info.iloc[0].filepath)
ipd.Audio(nocall_audio,rate=sr)

In [None]:
# Log-mel spectrogram of the nocall recording, shown as a grayscale image.
nocall_melspec = librosa_get_melspec(nocall_audio, sr)
print(nocall_melspec.shape)
plt.figure(figsize=(20,3))
plt.imshow(np_to_grayscale(nocall_melspec))

# Build Pipeline

In [None]:
# Full path of each training recording: <TRAIN_FOLDER>/<ebird_code>/<filename>
train_info['filepath'] = TRAIN_FOLDER+train_info["ebird_code"]+"/"+train_info["filename"]
# Fix the dtypes of the two columns that feed the tf.data pipeline.
train_info.ebird_code_cat = train_info.ebird_code_cat.astype('int32')
train_info.filepath = train_info.filepath.astype('string')
train_info[['ebird_code_cat', 'filepath']].head(1)

In [None]:
# Same label dtype as train_info so the two tables can be concatenated later.
nocall_info.ebird_code_cat = nocall_info.ebird_code_cat.astype('int32')
nocall_info.head(1)

## Extract

In [None]:
def get_birdcall_audio(filepath, label):
    """Eager helper: load the clip stored at `filepath` (a tensor) via load_clip.

    Returns the waveform, its sampling rate cast to float32, and the unchanged label.
    """
    raw_path = filepath.numpy()
    audio, sr = load_clip(raw_path)
    sr = tf.cast(sr, tf.float32)
    return audio, sr, label


def extract(x, y):
    """tf.data map stage: run get_birdcall_audio through tf.py_function.

    Outputs are declared as (float32 audio, float32 sampling rate, int32 label).
    """
    return tf.py_function(get_birdcall_audio, [x, y], [tf.float32, tf.float32, tf.int32])

## Transform

1. randomly crop 5 seconds from the clip (clips shorter than 5 seconds are zero-padded to 5 seconds)

In [None]:
def crop_and_pad(sound_clip,sr,label):
    """Bring a clip to roughly NUM_SECONDS: random crop if long enough, zero-pad otherwise.

    Args:
        sound_clip: 1-D audio tensor.
        sr: sampling rate; it is divided into a float32 length below, so it is
            expected to be float32 (the extract stage casts it that way).
        label: passed through unchanged.

    Returns:
        (sound_clip, sr, label) with `sound_clip` cropped or padded.
    """
    # clip duration in seconds = number of samples / sampling rate
    length = tf.size(sound_clip,out_type=tf.int32)
    length = tf.cast(length,tf.float32)
    duration = length/sr
    
    if (duration>=NUM_SECONDS):
        # randomly select end second
        end_second = tf.random.uniform(shape=[],
                                       minval=NUM_SECONDS,
                                       maxval=duration,
                                       dtype=tf.float32)
        # transform second to sample
        cut_max = end_second*sr
        cut_min = (end_second-NUM_SECONDS)*sr

        # cast to integer
        # NOTE(review): both bounds are truncated independently, so the window may
        # differ from NUM_SECONDS*sr by a sample - confirm this is acceptable downstream.
        cut_min = tf.cast(cut_min, tf.int32)
        cut_max = tf.cast(cut_max, tf.int32)

        # cut clip
        sound_clip = sound_clip[cut_min:cut_max]
    else:
        # clip is shorter than NUM_SECONDS: append zeros for the missing time
        zero_padding_len = tf.cast((NUM_SECONDS-duration)*sr, tf.int32)
        sound_clip = tf.concat([sound_clip,tf.zeros(zero_padding_len)],axis=0)
    
    return sound_clip,sr,label

2. augment data by adding white noise or pink noise

- choice 1: no augmentation
- choice 2: white noise with a rate of up to 0.05
- choice 3: pink noise with a frequency ceiling of 200 or 300 and 30 frequency steps

In [None]:
# augmentation ids 0., 1., 2.; `augment` treats 1 as white noise and 2 as pink noise
augment_choices = tf.range(0,3,dtype=tf.float32)
# upper bound of the white-noise rate drawn in `augment`
wr_ceil = 0.05
# candidate upper frequencies for the pink-noise branch
freq_ceil_choices = 100. * tf.range(2,4,dtype=tf.float32) # 200, 300
# number of frequencies summed in the pink-noise branch
n_freq = 30

In [None]:
def augment(sound_clip,sr,label):
    """Randomly apply one of: nothing, white noise, or the pink-noise transform.

    Args:
        sound_clip: 1-D audio tensor.
        sr: sampling rate, passed through unchanged.
        label: passed through unchanged.

    Returns:
        (sound_clip, sr, label) with `sound_clip` possibly replaced.
    """
    # pick one augmentation id by shuffling the candidates and taking the first
    choice = tf.random.shuffle(augment_choices)[0]
    length = tf.size(sound_clip, out_type=tf.int32)
    
    if tf.math.equal(choice,1): # white noise
        # add standard-normal noise scaled by a rate drawn uniformly from [0, wr_ceil)
        wr_rate = tf.random.uniform([],0,wr_ceil)
        sound_clip = sound_clip + wr_rate * tf.random.normal([length])
    elif tf.math.equal(choice,2): # pink noise
        # n_freq frequencies evenly spaced between 1 and a randomly chosen ceiling
        freq_ceil = tf.random.shuffle(freq_ceil_choices)[0]
        freqs = tf.linspace(1.,freq_ceil,n_freq)

        # loop state: frequency index and the running sum (starts at zero)
        i0 = tf.constant(0)
        s0 = tf.zeros(shape=length)
        
        c = lambda i,s: i < n_freq
        # each step adds a 1/f-weighted sinusoid with a random phase
        # NOTE(review): the sinusoid argument is (1/freqs[i])*sound_clip, whereas the
        # NumPy add_pink_noise uses f*clip - confirm which form is intended.
        b = lambda i,s: [i+1, 
                         s + 1/freqs[i] * tf.math.sin(2*np.pi*(1/freqs[i])*sound_clip+tf.random.uniform(shape=[],minval=0,maxval=1,dtype=tf.float32)*2*np.pi)]
        
        # NOTE(review): the loop result replaces sound_clip; the original clip is not
        # added to the accumulated sum - confirm this is intended.
        _,sound_clip = tf.while_loop(c,b,loop_vars=[i0,s0],shape_invariants=[i0.get_shape(),sound_clip.get_shape()])
    
    return sound_clip,sr,label

3. get mel-spectrogram

In [None]:
def log10(x):
    """Element-wise base-10 logarithm, computed as ln(x) / ln(10)."""
    ln_x = tf.math.log(x)
    ln_10 = tf.math.log(tf.constant(10, dtype=ln_x.dtype))
    return ln_x / ln_10

def get_melspec(sound_clip,sr,label):
    """Compute a log-mel spectrogram (dB) of a clip with TensorFlow ops.

    Args:
        sound_clip: 1-D audio tensor.
        sr: sampling rate, used to build the mel filter bank.
        label: passed through unchanged.

    Returns:
        (log_melspec, label) where log_melspec has mel bins on the first axis and
        frames on the second, limited to TOP_DB below its maximum.
    """
    # magnitude STFT: (frames, N_FFT//2+1)
    spec = tf.signal.stft(sound_clip,frame_length=N_FFT,frame_step=HOP_LENGTH)
    spec = tf.abs(spec)
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(num_mel_bins=N_MELS, 
                                                                        num_spectrogram_bins=N_FFT//2+1, 
                                                                        sample_rate=sr, 
                                                                        lower_edge_hertz=F_MIN, 
                                                                        upper_edge_hertz=F_MAX,
                                                                        dtype=tf.dtypes.float32)
    melspec = tf.matmul(spec, linear_to_mel_weight_matrix)

    # Floor the power before taking the log: zero-padded or silent frames would
    # otherwise give log(0) = -inf, and a fully silent clip would turn into NaN
    # once standardized. 1e-10 is the floor librosa.power_to_db uses by default,
    # which is the conversion applied on the inference path.
    amin = 1e-10
    power = tf.maximum(tf.transpose(melspec)**2, amin)
    log_melspec = 10*log10(power)

    # keep at most TOP_DB decibels of dynamic range below the maximum
    ref = tf.math.reduce_max(log_melspec)-TOP_DB
    log_melspec = tf.maximum(log_melspec, ref)
    
    return log_melspec,label

4. standardize and stack to rgb(#,#,#)

In [None]:
# small constant: avoids division by zero and acts as the minimum usable value range
eps=1e-6
def to_grayscale(melspec, label):
    """Standardize a spectrogram, rescale it to uint8 [0, 255] and stack it into 3 channels.

    Args:
        melspec: spectrogram tensor; the result is stacked along axis 2, so a
            2-D input is expected.
        label: passed through unchanged.

    Returns:
        (image, label) where image is the uint8 tensor repeated three times on axis 2.
    """
    # zero-mean / unit-variance standardization
    mean = tf.math.reduce_mean(melspec)
    std = tf.math.reduce_std(melspec)
    xstd = (melspec-mean)/(std + eps)
    norm_min = tf.math.reduce_min(xstd)
    norm_max = tf.math.reduce_max(xstd)
    
    if (norm_max-norm_min>eps):
        # min-max scale onto [0, 255]
        v = 255*(xstd-norm_min)/(norm_max-norm_min)
        v = tf.cast(v, tf.uint8)
    else:
        # (near-)constant input: all-zero image
        v = tf.zeros_like(xstd, dtype=tf.uint8)
    return tf.stack([v,v,v],axis=2), label

5. create training and validation

In [None]:
# Fixed seed so the train/validation membership is the same on every run. Without it,
# resuming training after a kernel restart reshuffles the split and recordings that
# were previously trained on can end up in the validation set.
SPLIT_SEED = 42

# bird recordings: stratified so every species appears in both parts
birdcall_train, birdcall_val = train_test_split(train_info,
                                                stratify=train_info.ebird_code_cat, 
                                                train_size=TRAIN_SIZE,
                                                random_state=SPLIT_SEED)
# nocall recordings: single class, plain random split
nocall_train, nocall_val = train_test_split(nocall_info,
                                            train_size=TRAIN_SIZE,
                                            random_state=SPLIT_SEED)

# combine bird and nocall rows, keeping only the columns the pipeline needs
train, val = (pd.concat([birdcall_train[["filepath", "ebird_code_cat"]],nocall_train[["filepath", "ebird_code_cat"]]],axis=0), 
              pd.concat([birdcall_val[["filepath", "ebird_code_cat"]],nocall_val[["filepath", "ebird_code_cat"]]],axis=0))

Balance the dataset by oversampling the minority classes.

In [None]:
def overresample(df):
    """Return a class-balanced, shuffled sample of `df`.

    Every class is first oversampled up to the size of the largest class, then a
    stratified subsample brings each class back down to roughly the original
    mean class size.

    Args:
        df: DataFrame with an `ebird_code_cat` label column.

    Returns:
        DataFrame with (approximately) equally many rows per class.
    """
    # class sizes are needed twice; compute them once
    class_counts = df.ebird_code_cat.value_counts().values
    keep_fraction = int(np.mean(class_counts)) / int(np.max(class_counts))

    ros = RandomOverSampler()
    resampled_df, _ = ros.fit_resample(df, df.ebird_code_cat)

    if keep_fraction >= 1:
        # Classes were already the same size, so there is nothing to subsample
        # (train_test_split rejects train_size=1.0). Shuffle the rows, which the
        # stratified split below would otherwise have done.
        return resampled_df.sample(frac=1)

    resampled_df, _ = train_test_split(resampled_df, 
                                       stratify = resampled_df.ebird_code_cat, 
                                       train_size=keep_fraction)
    return resampled_df 

In [None]:
# Balance the training table and show the resulting number of rows per class.
resampled_train = overresample(train)
resampled_train.ebird_code_cat.value_counts()

### Pull things together

In [None]:
def stream_files(df, train=True):
    """Build the tf.data pipeline that turns file paths into log-mel spectrograms.

    Args:
        df: DataFrame with `filepath` and `ebird_code_cat` columns.
        train: when truthy, the `augment` stage is inserted after cropping/padding.

    Returns:
        Unbatched dataset of (log-mel spectrogram, label) pairs.
    """
    # the only difference between the two modes is the augmentation stage
    stages = [extract, crop_and_pad]
    if train:
        stages.append(augment)
    stages.append(get_melspec)

    ds = tf.data.Dataset.from_tensor_slices((df.filepath.values,df.ebird_code_cat.values))
    for stage in stages:
        ds = ds.map(stage, num_parallel_calls=NUM_THREADS)
    return ds

In [None]:
# Unbatched training pipeline (with augmentation) over the balanced training table.
train_files = (stream_files(resampled_train))

We can take a peek at the mel-spectrograms from training data:

In [None]:
n_col = 4
n_img = 16

# Show the first n_img training spectrograms in a grid, titled with their bird name.
fig, axs = plt.subplots(n_img//n_col,n_col,figsize=(5*n_col,5*(n_img//n_col-1)))
for i, (melspec,label) in enumerate(train_files.take(n_img)):
    # row / column of the i-th panel
    x, y = divmod(i, n_col)
    ax = axs[x][y]
    ax.imshow(melspec)
    ax.set_title(name_lookup[label.numpy()])

#### get image width and height

In [None]:
# Read the spectrogram dimensions off the first training example.
for melspec, label in train_files.take(1):
    img_height, img_width = melspec.numpy().shape[:2]

print("image width: {}, height: {}".format(img_width, img_height))

## Load

### set up params

In [None]:
# maximum number of training epochs
epochs = 100
# examples per batch
batch_size = 64
# learning rate for the Adam optimizer
lr=.001
# stop once the best validation F1 is more than this many epochs old
early_stopping=20
# probability above which a class is considered a candidate in postprocess
threshold=0.5

### compile model

resnet50

In [None]:
img_resize = [img_height, img_width]

# ResNet50 backbone without its classification head, randomly initialised
# (weights=None), with global average pooling on top.
model = tf.keras.Sequential()
resnet50 = tf.keras.applications.ResNet50(include_top=False, 
                                          pooling='avg', 
                                          weights=None,
                                          input_shape=(img_height, img_width, 3))

model.add(resnet50)
# one sigmoid output per class (all birds plus 'nocall')
model.add(tf.keras.layers.Dense(num_birds, activation='sigmoid'))

In [None]:
# Adam
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

# cross entropy loss function (summed over the batch)
loss_fn = tf.keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM)

# metrics
f1_train = tfa.metrics.F1Score(num_birds, 'micro')
f1_val = tfa.metrics.F1Score(num_birds, 'micro')

# for logging results
metric_name = ["loss", "f1", "val-f1"]

# compile
model.compile(optimizer=optimizer,loss=loss_fn)

# Resume from the previous local run when possible. The configuration cell loads
# `metric_log` when a log file exists; unconditionally resetting start_epoch and
# the logs here would silently restart from epoch 0 and overwrite that history.
# Resuming also requires the checkpoint written by model.save_weights to exist.
_saved_log = globals().get('metric_log')
resume_training = (not ON_KAGGLE
                   and _saved_log is not None
                   and len(_saved_log[0]) > 0
                   and os.path.isfile(MODEL_PATH + 'model.index'))

if resume_training:
    # metric_log layout (see the training cell): [epoch_num, val f1 per epoch, train f1 per epoch]
    epoch_num, metric_log_val_f1, metric_log_f1 = (list(series) for series in _saved_log)
    start_epoch = len(epoch_num)
    model.load_weights(MODEL_PATH + 'model')
    # restore the early-stopping reference point from the logged validation scores
    _best_idx = int(np.argmax(metric_log_val_f1))
    best_f1_val, best_epoch = metric_log_val_f1[_best_idx], epoch_num[_best_idx]
else:
    # fresh run
    start_epoch = 0
    epoch_num = []
    metric_log_val_f1 = []
    metric_log_f1 = []
    best_f1_val = None
    best_epoch = None

# summary
model.summary()

### train step

In [None]:
@tf.function
def train_step(x,y): 
    """Run one optimisation step on a batch and update the training F1 metric.

    Args:
        x: batch of log-mel spectrograms.
        y: batch of integer class labels.

    Returns:
        The batch loss.
    """
    # Image conversion and label encoding have no trainable parts, so they stay
    # outside the gradient tape.
    x,y = tf.vectorized_map(lambda a: to_grayscale(a[0],a[1]),(x,y))
    y_truth = tf.one_hot(y, num_birds)

    with tf.GradientTape() as tape:
        # training=True makes layers such as BatchNormalization run in training
        # mode (batch statistics, moving averages updated); without the flag a
        # custom loop runs them in inference mode.
        y_pred = model(x, training=True)

        # calculate loss
        loss = loss_fn(y_truth, y_pred)

    # calculate gradients
    grads = tape.gradient(loss,model.trainable_weights)
    optimizer.apply_gradients(zip(grads,model.trainable_weights))

    # compute metrics
    f1_train.update_state(y_truth,y_pred)
    
    return loss

### validation step

In [None]:
@tf.function
def val_step(x,y):
    """Score one validation batch and fold it into the validation F1 metric.

    Args:
        x: batch of log-mel spectrograms.
        y: batch of integer class labels.
    """
    def _as_image(pair):
        # pair = (spectrogram, label) for a single example
        return to_grayscale(pair[0], pair[1])

    images, labels = tf.vectorized_map(_as_image, (x, y))

    predictions = model(images)
    targets = tf.one_hot(labels, num_birds)

    f1_val.update_state(targets, predictions)

### start training

In [None]:
# Draw a balanced training table; its size determines steps_per_epoch below.
resampled_train = overresample(train)
print(resampled_train.shape)

In [None]:
# Validation pipeline: no augmentation, batched.
val_ds = stream_files(val, train=False).batch(batch_size)

In [None]:
# Number of batches per epoch (the last batch may be partial).
steps_per_epoch = int(np.ceil(len(resampled_train)/batch_size))
steps_per_epoch

In [None]:
if not ON_KAGGLE:
    # Train locally. NumPy floating-point problems are raised as errors instead of warnings.
    with np.errstate(all='raise'):
        # for each epoch
        for epoch in range(start_epoch, epochs):
            print("epoch %d" % (epoch,))

            # reset states
            f1_train.reset_states()
            f1_val.reset_states()
            
            # set up progress bar
            pb_i = Progbar(target=steps_per_epoch,stateful_metrics=metric_name)

            # draw a fresh balanced sample for this epoch, then batch and prefetch
            # (overresample returns the rows already shuffled)
            resampled_train = overresample(train)
            train_files = stream_files(resampled_train).batch(batch_size).prefetch(2)

            for (x_batch, y_batch) in train_files:
                # get loss
                loss = train_step(x_batch, y_batch)
                # update progress
                pb_i.add(1, values=[('loss', loss), ('f1', f1_train.result()), ('val-f1', f1_val.result())])

            # validate
            for (x_batch_val, y_batch_val) in val_ds:
                val_step(x_batch_val, y_batch_val)

            # update f1 for validation set
            pb_i.update(steps_per_epoch, 
                        values=[('loss', loss), ('f1', f1_train.result()),('val-f1', f1_val.result())],
                        finalize=True)

            # log results
            epoch_num.append(epoch)
            metric_log_f1.append(f1_train.result().numpy())
            metric_log_val_f1.append(f1_val.result().numpy())

            # save checkpoint after each epoch
            model.save_weights(MODEL_PATH + 'model')
            # Hand joblib the path so it opens, writes and closes the file itself;
            # an anonymous open() handle is never closed explicitly.
            joblib.dump([epoch_num,metric_log_val_f1,metric_log_f1], MODEL_PATH + "metric_log.pkl")

            # early stopping if f1 score hasn't improved on validation set
            if best_f1_val is None or f1_val.result()>best_f1_val:
                best_f1_val,best_epoch=f1_val.result(),epoch

            if best_epoch<epoch-early_stopping:
                model.stop_training=True
                print("training stopped early at epoch %d" % epoch)
                break
else:
    # On Kaggle: skip training and use the saved weights and metric history.
    model.load_weights(MODEL_PATH + 'model')
    metric_log_val_f1 = metric_log[1]
    metric_log_f1 = metric_log[2]

### Plot F1 Score

In [None]:
# Training vs. validation F1 per logged epoch.
plt.plot(metric_log_val_f1, label='validation f1')
plt.plot(metric_log_f1, label='training f1')
plt.xlabel('Epoch')
plt.ylabel('F1 Score')
plt.legend(loc='lower right')

# Submission to evaluate on test data

In [None]:
# Use the competition's test_audio folder when it exists; otherwise fall back to the
# '../input/birdcall-check' dataset.
BASE_TEST_DIR = '../input/birdsong-recognition' if os.path.exists('../input/birdsong-recognition/test_audio') else '../input/birdcall-check'
TEST_FOLDER = f'{BASE_TEST_DIR}/test_audio/'
TEST_FOLDER

In [None]:
# Test metadata; each row's audio file is <TEST_FOLDER>/<audio_id>.mp3
df_test = pd.read_csv(f'{BASE_TEST_DIR}/test.csv')
df_test["filepath"] = TEST_FOLDER + df_test.audio_id + '.mp3'
df_test.head()

In [None]:
def postprocess(y_prob, top = 3):
    """Turn the model output for one clip into a list of predicted names.

    Args:
        y_prob: class probabilities for a single clip, either 1-D or a batch
            containing one row (tensor or array).
        top: maximum number of names to return.

    Returns:
        Names of the classes whose probability exceeds `threshold`, ordered from
        most to least probable and limited to `top` entries; [] if none qualifies.
    """
    probs = y_prob.numpy() if hasattr(y_prob, 'numpy') else np.asarray(y_prob)
    if probs.ndim > 1:
        # callers pass a batch of exactly one clip
        probs = probs[0]

    # Rank the class *probabilities*. The previous version argsorted the index
    # matrix returned by tf.where, which yields column positions (0/1) rather
    # than class codes, so the selected names did not depend on the scores.
    ranked = np.argsort(-probs, kind='stable')[:top]
    return [name_lookup[int(code)] for code in ranked if probs[code] > threshold]

In [None]:
def _predict_window(clip, sr):
    """Predict names for one window, trimming/zero-padding it to exactly NUM_SECONDS."""
    window_len = int(NUM_SECONDS * sr)
    clip = clip[:window_len]
    missing = window_len - len(clip)
    if missing > 0:
        # The model was built for a fixed spectrogram width, so short windows
        # are padded with silence up to the full window length.
        clip = np.concatenate([clip, np.zeros(missing, dtype=clip.dtype)])
    melspec = librosa_get_melspec(clip, sr=sr)
    x = tf.expand_dims(np_to_grayscale(melspec).astype(np.uint8), axis=0)
    return postprocess(model(x))


def make_prediction(audio, site):
    """Predict bird names for a test row.

    Args:
        audio: (sound_clip, sr) tuple. For 'site_1'/'site_2' the clip is the
            window to score; for any other site it is the whole recording.
        site: site identifier of the test row.

    Returns:
        List of predicted names (may contain duplicates and the nocall label).
        For whole recordings the predictions of all windows are concatenated.
    """
    sound_clip, sr = audio

    if site == 'site_1' or site == 'site_2':
        # a single window; padded if the recording ended early
        return _predict_window(sound_clip, sr)

    window_len = int(NUM_SECONDS * sr)
    n_samples = len(sound_clip)

    if n_samples < window_len:
        # recording shorter than one window: score it once, padded
        return _predict_window(sound_clip, sr)

    y = []
    start = 0
    # predict for each full window of NUM_SECONDS
    while start + window_len <= n_samples:
        y.extend(_predict_window(sound_clip[start:start + window_len], sr))
        start += window_len

    # predict for the remaining audio if there is at least 1 second of it
    # (the earlier check compared the *padding* length, not the remaining audio,
    # so it dropped long tails and scored very short ones)
    if n_samples - start >= sr:
        y.extend(_predict_window(sound_clip[start:], sr))
    return y

In [None]:
# remove redundancy from memory: the training tables are not needed for inference
# (re-run the cells above before re-running any cell that uses them)
del train_info, birdcall_train, birdcall_val, nocall_info, nocall_train, nocall_val, resampled_train
gc.collect()

In [None]:
# decoded audio per file path, so a recording referenced by many rows is loaded once
cached_audio = dict()
start = time.time()

preds = []
# itertuples walks the rows in order regardless of what the index holds; the
# earlier `df_test.iloc[i] for i in df_test.index` mixed index labels with
# positions and only worked for a default 0..n-1 index.
for row in tqdm(df_test.itertuples(index=False), total=len(df_test)):
    row_id, site, filepath = row.row_id, row.site, row.filepath
    
    if (filepath in cached_audio):
        loaded_clip, sr = cached_audio[filepath]
    else:
        loaded_clip, sr = load_clip(filepath)
        cached_audio[filepath] = (loaded_clip, sr)
        
    if site=='site_1' or site=='site_2':
        # `seconds` marks the end of the window to score; clamp the start at 0 so a
        # negative value cannot wrap around to the end of the recording
        start_time = max(row.seconds-NUM_SECONDS, 0)
        audio = loaded_clip[int(start_time*sr):int((start_time+NUM_SECONDS)*sr)], sr
    else:
        # other sites: score the whole recording
        audio = loaded_clip, sr
        
    pred = make_prediction(audio, site)
    # unique names, without the nocall label; fall back to nocall if nothing is left
    pred = ' '.join([bird for bird in np.unique(pred) if bird!=nocall_label])
    if (len(pred)==0): pred=nocall_label
    preds.append([row_id, pred])
    
print("prediction finished in %d seconds" % ((time.time() - start)))
preds = pd.DataFrame(preds, columns=['row_id', 'birds'])
preds.head()

In [None]:
# Write the submission file (row_id, birds) without the DataFrame index.
preds.to_csv('submission.csv', index=False)