 # Overview
 Readable code for building a ResNet50 model in Keras with various data augmentations and added noise. Any suggestions for improvement are welcome. The models seem to have trouble getting past a val_lwlrap score of roughly 0.65.

In [None]:
pip install audiomentations

In [None]:
# import necessary libraries
import os
import sys
import pickle
import numpy as np
import pandas as pd
import plotly.express as px
import librosa, librosa.display
import tensorflow
import keras
import tensorflow as tf
from PIL import Image
from skimage.transform import resize
#from specAugment import spec_augment_tensorflow # doesn't work with tensorflow 2.0
#import tensorflow_io as tfio # tensorflow_io has dependency problem when using TPU
from audiomentations import Compose, SpecFrequencyMask, FrequencyMask, TimeMask, AddGaussianNoise
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras import models, layers
from tensorflow.keras.applications.resnet50 import ResNet50

# ---------------------------
from tensorflow.keras.layers import Dense,GlobalAveragePooling2D,Convolution2D,BatchNormalization
from tensorflow.keras.layers import Flatten,MaxPooling2D,Dropout
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.applications.densenet import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator,img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

import warnings
warnings.filterwarnings("ignore")
#-----------------------------

from sklearn.model_selection import train_test_split
import IPython.display as ipd
from sklearn.preprocessing import OneHotEncoder,LabelBinarizer
import matplotlib.pyplot as plt

# Process data:

## Windowing data

The audio files are processed in the following manner to give the most robust data set.  First, the minimum and maximum frequencies for the data set are found and reduced/increased by 25% to give some margin.  Next, the maximum time difference between t_min and t_max is found.  This maximum time difference is then used to calculate a step distance with margin.  With the step distance calculated, the t_min and t_max for each audio sample are used to find the center of the audio cut.  A random number between -3 and 3 is added to this center to randomly shift the labeled event left or right.  Extra logic is included for events that fall too close to the beginning or end of the audio file.  The final product is a dataframe with new ‘start’ and ‘end’ time stamps for each audio file.


In [None]:
# Read the true-positive annotations (one row per labelled event).
train_t = pd.read_csv('../input/rfcx-species-audio-detection/train_tp.csv')

df_in = train_t

# Frequency band for the mel spectrograms: the annotated extremes widened by
# 25% on each side to leave some margin.
f_min = round(df_in.f_min.min() * .75, 1)
f_max = round(df_in.f_max.max() * 1.25, 1)

# Duration of every labelled event.
df_in['t_diff'] = df_in['t_max'] - df_in['t_min']

# Longest event in the data set, rounded to whole seconds.
t_max = round(df_in['t_diff'].max(), 0)

# Margin in seconds added on each side of the longest event.
margin = 1

# Half-width of the window cut around each event.
step = (t_max / 2) + margin

# Full window length in whole seconds. Window bounds are stored as integers so
# that `start * sample_rate` / `end * sample_rate` are valid slice indices.
window = int(round(2 * step))

# The boundary handling assumes recordings are 60 s long (same constant as the
# original per-row logic) -- TODO confirm for any new data.
clip_len = 60

# Random shift of the labelled event inside the window, one draw per row.
noise = np.random.uniform(-3, 3, size=len(df_in))

# Centre of each event is t_min + t_diff / 2 (adding the full t_diff would
# centre the window on the END of the event), then shifted by the noise.
t_center = df_in['t_min'] + df_in['t_diff'] / 2 + noise

# Whole-second window start, pushed back inside [0, clip_len - window] for
# events too close to the beginning or end of the recording.
window_start = np.floor(t_center - window / 2)
window_start = window_start.clip(lower=0, upper=clip_len - window).astype(int)

# Assign whole columns at once instead of chained per-row assignment, which
# is unreliable (and a no-op under pandas copy-on-write).
df_in['start'] = window_start
df_in['end'] = window_start + window

# Sanity check: every window should have exactly the same length.
df_in['temp'] = df_in['end'] - df_in['start']
df_in.temp.describe()

## Split into train/test (add stratified k-fold in new version)
Split data into train (80%) and test (20%) and OneHot encode labels.


In [None]:
# Hold out 20% of the labelled events for validation (fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(df_in[['recording_id', 'start', 'end']],
                                                    df_in[['species_id', 'start', 'end']],
                                                    test_size=0.20, random_state=8)

# Work on copies so adding columns does not write into slices of df_in.
train_df = X_train.copy()
train_df['label'] = y_train['species_id']

test_df = X_test.copy()
test_df['label'] = y_test['species_id']

# One-hot encode the labels. The encoder is fitted ONCE, on every species id
# in the data set, and then only applied to each split. Re-fitting on the test
# split would produce a different column order/width whenever a class is
# missing from that split.
enc = LabelBinarizer()
enc.fit(df_in['species_id'].to_numpy())

train_onehot = enc.transform(train_df['label'].to_numpy())
test_onehot = enc.transform(test_df['label'].to_numpy())

# Store the one-hot vectors as Python lists, one per row.
train_df['onehot_label'] = train_onehot.tolist()
test_df['onehot_label'] = test_onehot.tolist()

train_df.head()

## View class distribution in train/test split

In [None]:
# Histogram options shared by both class-distribution charts.
hist_style = dict(x='label', histfunc='count', opacity=0.8, template='plotly_white')

# Training split: number of events per species label.
train_bar = px.histogram(train_df, **hist_style)
train_bar.update_layout(title=dict(text='Training Split Class Distribution', x=0.5))
train_bar.show()

# Testing split: same chart, with the bin count pinned to 24.
test_bar = px.histogram(test_df, nbins=24, **hist_style)
test_bar.update_layout(title=dict(text='Testing Split Class Distribution', x=0.5))
test_bar.show()

## Create spectrogram function
Spectrogram function creates spectrogram with option to add gaussian noise, time mask, and frequency mask.  


In [None]:
def create_spec(audio_path, start, end, shape, power=2.0, sr=None, third_dim=False, time_freq_mask=True):
    """Build a normalized mel spectrogram for one window of an audio file.

    Reads the module-level ``f_min`` and ``f_max`` for the mel frequency range.

    Parameters
    ----------
    audio_path : str
        Path of the audio file to load with ``librosa.load``.
    start, end : float or int
        Window bounds; they are multiplied by the sample rate to obtain sample
        indices, i.e. they are expressed in seconds.
    shape : sequence of two ints
        ``(n_mels, width)``. ``shape[0]`` is used as the number of mel bands
        and the spectrogram is resized to ``shape``.
    power : float
        Exponent passed to ``librosa.feature.melspectrogram``.
    sr : int or None
        Target sample rate for ``librosa.load`` (``None`` keeps the file's own).
    third_dim : bool
        If true, the 2-D spectrogram is stacked three times along a NEW FIRST
        axis, giving ``(3, n_mels, width)``. Move that axis with a transpose
        (not a reshape) if a channels-last layout is needed.
    time_freq_mask : bool
        If true, Gaussian noise, a time mask and a frequency mask are applied
        to the waveform before the spectrogram is computed.

    Returns
    -------
    numpy.ndarray
        Spectrogram scaled to ``[0, 1]`` (all zeros if it is constant).
    """
    # plain tuple of ints for n_mels and skimage.resize
    shape = tuple(int(dim) for dim in shape)

    # load audio and get sample rate
    loaded_audio, sample_rate = librosa.load(audio_path, sr=sr)

    # cut the window; slice indices must be integers even for float seconds,
    # and a negative start must not wrap around to the end of the signal
    first_sample = max(int(round(start * sample_rate)), 0)
    last_sample = max(int(round(end * sample_rate)), first_sample)
    loaded_audio = loaded_audio[first_sample:last_sample]

    # waveform-level augmentation: noise, time mask, frequency mask
    if time_freq_mask:
        augment = Compose([
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=1),
            TimeMask(min_band_part=0.005, max_band_part=0.10, p=1),
            FrequencyMask(min_frequency_band=0.005, max_frequency_band=0.10, p=1)
        ])

        loaded_audio = augment(loaded_audio, sample_rate=sample_rate)

    # mel spectrogram; the signal is passed as `y=` because newer librosa
    # releases made it keyword-only
    stft = librosa.feature.melspectrogram(y=loaded_audio, sr=sample_rate, power=power,
                                          fmin=f_min, fmax=f_max, n_mels=shape[0])
    stft_to_db = librosa.amplitude_to_db(np.abs(stft))

    # resize to the requested image size
    stft_to_db = resize(stft_to_db, shape)

    # min-max normalize; skip the division for a constant image to avoid NaNs
    stft_to_db = stft_to_db - np.min(stft_to_db)
    peak = np.max(stft_to_db)
    if peak > 0:
        stft_to_db = stft_to_db / peak

    # replicate into three identical planes along a new leading axis
    if third_dim:
        stft_to_db = np.stack((stft_to_db, stft_to_db, stft_to_db))

    return stft_to_db

## Create training and testing/validation dictionary with data

#### Training

In [None]:
# Build the training set: one spectrogram image plus metadata per event.

# initialize empty dictionary
data = {'index':[],
        'recording_id':[],
        'spec_data':[],
        'label':[]}

# set paths
root_path = '../input/rfcx-species-audio-detection/train/'
file_ext = '.flac'

# specify shape for spectrograms, important parameter for rest of code
shape = (256,512)
power = 1.5

# loop through each row in dataframe, counting processed files from 1
for n, (index, row) in enumerate(train_df.iterrows(), start=1):

    # create filepath
    f_path = root_path+row['recording_id']+file_ext

    # create_spec returns three stacked copies with the copies on axis 0
    temp_spec = create_spec(f_path, row['start'], row['end'], shape, power, third_dim=True, time_freq_mask=False)

    # Move the copy axis last -> (height, width, 3), the layout the model
    # input expects. This must be a transpose: a reshape only reinterprets
    # the flat buffer and scrambles the image. NOTE: inference-time
    # preprocessing has to produce the same layout.
    temp_spec = np.transpose(temp_spec, (1, 2, 0))

    # add data to data dictionary
    data['index'].append(index)
    data['recording_id'].append(row['recording_id'])
    data['spec_data'].append(temp_spec)
    data['label'].append(np.array(row['onehot_label']))

    # progress report
    if n%100 == 0:
        print(n,'files processed')

#### Test

In [None]:
# Build the validation set: one spectrogram image plus metadata per event.
# Uses root_path, file_ext, shape and power as set for the training data.

# initialize empty dictionary
val_data = {'index':[],
            'recording_id':[],
            'spec_data':[],
            'label':[]}

# loop through each row in dataframe, counting processed files from 1
for n, (index, row) in enumerate(test_df.iterrows(), start=1):

    # create filepath
    f_path = root_path+row['recording_id']+file_ext

    # create_spec returns three stacked copies with the copies on axis 0
    temp_spec = create_spec(f_path, row['start'], row['end'], shape,
                            power=power, third_dim=True, time_freq_mask=False)

    # Move the copy axis last -> (height, width, 3). This must be a
    # transpose: a reshape only reinterprets the flat buffer and scrambles
    # the image. NOTE: training and inference preprocessing have to produce
    # the same layout.
    temp_spec = np.transpose(temp_spec, (1, 2, 0))

    # add data to data dictionary
    val_data['index'].append(index)
    val_data['recording_id'].append(row['recording_id'])
    val_data['spec_data'].append(temp_spec)
    val_data['label'].append(np.array(row['onehot_label']))

    # progress report
    if n%100 == 0:
        print(n,'files processed')

#### See example spectrogram with preprocessing

In [None]:
# Location and extension of the training recordings.
root_path = '../input/rfcx-species-audio-detection/train/'
file_ext = '.flac'

# Row of df_in to visualise.
i=4

# Look up the recording and its window for that row.
name = root_path + df_in.loc[i, 'recording_id'] + file_ext
start = df_in.loc[i, 'start']
end = df_in.loc[i, 'end']

# Single-plane spectrogram with the default augmentation switched on.
spec = create_spec(name, start, end, shape=(256,512), power=1.5)

# Render it on a mel frequency axis.
librosa.display.specshow(spec, sr=48000, x_axis='time', y_axis='mel')

# Print the array shape and the metadata of the plotted row.
print(spec.shape)
print(df_in.loc[i])

# Create Model:

## Add keras custom scoring metric LWLRAP
credit: https://www.kaggle.com/carlthome/l-lrap-metric-for-tf-keras/comments

In [None]:
# credit: https://www.kaggle.com/carlthome/l-lrap-metric-for-tf-keras/comments

@tf.function
def _one_sample_positive_class_precisions(example):
    """Per-class precision-at-rank for the positive classes of one sample.

    `example` is a `(y_true, y_pred)` pair for a single sample. The result has
    one entry per class: the precision at the rank of that class for every
    class selected by `y_true`, and zero elsewhere.
    """
    labels, scores = example

    # Class indices ordered from highest to lowest score, and the inverse
    # mapping giving each class its position in that ordering.
    ranked_classes = tf.argsort(scores, direction='DESCENDING')
    rank_of_class = tf.argsort(ranked_classes)

    # Running count of true labels while walking down the ranking.
    hits_in_rank_order = tf.cast(tf.gather(labels, ranked_classes), tf.float32)
    cumulative_hits = tf.math.cumsum(hits_in_rank_order)

    # Indices and ranks of the classes flagged in the labels.
    positive_classes = tf.where(labels)[:, 0]
    positive_ranks = tf.boolean_mask(rank_of_class, labels)

    # Precision = hits found so far / (rank position + 1).
    hits_at_rank = tf.gather(cumulative_hits, positive_ranks)
    precisions = hits_at_rank / (1 + tf.cast(positive_ranks, tf.float32))

    # Scatter back into a vector with one slot per class.
    return tf.scatter_nd(positive_classes[:, None], precisions, [scores.shape[0]])


class LWLRAP(tf.keras.metrics.Metric):
    """Streaming label-weighted label-ranking average precision.

    Accumulates, per class, the sum of precisions and the number of positive
    occurrences seen in `update_state`, and combines them in `result`.

    Args:
        num_classes: Length of the per-class accumulators.
        name: Metric name reported by Keras.
        **kwargs: Forwarded to `tf.keras.metrics.Metric` (e.g. `dtype`).
    """

    def __init__(self, num_classes, name='lwlrap', **kwargs):
        super().__init__(name=name, **kwargs)

        # kept so the metric can be re-created from its config
        self.num_classes = num_classes

        self._precisions = self.add_weight(
            name='per_class_cumulative_precision',
            shape=[num_classes],
            initializer='zeros',
        )

        self._counts = self.add_weight(
            name='per_class_cumulative_count',
            shape=[num_classes],
            initializer='zeros',
        )

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate precisions for one batch. `sample_weight` is ignored."""
        precisions = tf.map_fn(
            fn=_one_sample_positive_class_precisions,
            elems=(y_true, y_pred),
            dtype=(tf.float32),
        )

        # a non-zero precision marks a positive (sample, class) pair
        increments = tf.cast(precisions > 0, tf.float32)
        total_increments = tf.reduce_sum(increments, axis=0)
        total_precisions = tf.reduce_sum(precisions, axis=0)

        self._precisions.assign_add(total_precisions)
        self._counts.assign_add(total_increments)

    def result(self):
        """Return the count-weighted mean of the per-class average precisions."""
        per_class_lwlrap = self._precisions / tf.maximum(self._counts, 1.0)
        # divide_no_nan yields 0 instead of NaN before any positive was seen
        per_class_weight = tf.math.divide_no_nan(
            self._counts, tf.reduce_sum(self._counts))
        overall_lwlrap = tf.reduce_sum(per_class_lwlrap * per_class_weight)
        return overall_lwlrap

    def reset_state(self):
        """Zero both accumulators (current Keras method name)."""
        self._precisions.assign(tf.zeros_like(self._precisions))
        self._counts.assign(tf.zeros_like(self._counts))

    def reset_states(self):
        """Alias of `reset_state` for Keras versions that call the old name."""
        self.reset_state()

    def get_config(self):
        """Return the constructor arguments so the metric can be serialized."""
        config = super().get_config()
        config['num_classes'] = self.num_classes
        return config

## Create Resnet50 model and train

#### GPU setup

In [None]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # nothing happens when no GPU is present
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # report the failure and carry on
    print(e)

#### Model

In [None]:
# Number of output classes (width of the one-hot labels / softmax layer).
n_classes = 24

# All layers come from tf.keras so they belong to the same Keras
# implementation as ResNet50 and Model (imported from tensorflow.keras);
# mixing in the standalone `keras` package can fail when the two differ.
image_input = tf.keras.layers.Input(shape=(shape[0], shape[1], 3))

# ImageNet-pretrained convolutional base without its classification head.
restnet = ResNet50(weights='imagenet', include_top=False, input_tensor=image_input)
restnet = Model(restnet.input, restnet.output)

# Freeze the pretrained base: only the new head below is trained.
for layer in restnet.layers:
    layer.trainable = False

# Classification head for the rainforest species classifier.
model = tf.keras.models.Sequential()
model.add(restnet)
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1024, activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(n_classes, activation='softmax'))

# set optimizer
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001)

# Categorical cross-entropy on the one-hot labels, tracked with LWLRAP.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=[LWLRAP(n_classes)])
model.summary()

#### Set up callback and train model

In [None]:
# Checkpoint file that always holds the best model seen so far.
# (per-epoch alternative: "saved-model-{epoch:02d}-{val_lwlrap:.2f}.hdf5")
filepath = "saved-model-best.hdf5"

# Overwrite the checkpoint only when the validation LWLRAP improves.
best_model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_lwlrap',
    mode='max',
    save_best_only=True,
    verbose=0,
)

my_callbacks = [best_model_checkpoint]

In [None]:
# Materialize the lists as arrays once. float32 is what the network computes
# in, and it halves the memory of the default float64 spectrograms.
train_x = np.array(data['spec_data'], dtype=np.float32)
train_y = np.array(data['label'])
val_x = np.array(val_data['spec_data'], dtype=np.float32)
val_y = np.array(val_data['label'])

# `my_callbacks` is already a list of callbacks, so it is passed as-is
# rather than wrapped in another list.
history = model.fit(train_x, train_y, epochs=100, batch_size=96,
                    validation_data=(val_x, val_y),
                    shuffle=True, callbacks=my_callbacks)

#### Show model plot

In [None]:
# Per-epoch metrics recorded by model.fit, one column per metric.
hist_df = pd.DataFrame(history.history)

# Training vs. validation LWLRAP over the epochs.
metric_columns = ['lwlrap', 'val_lwlrap']
plot_1 = px.line(hist_df, y=metric_columns)
plot_1.show()

# Generate Submission File
1. read in test data
2. process test data
3. run predictions
4. change to submission format

In [None]:
def create_test_spec(audio, shape, sr=48000, power=2.0, third_dim=True, n_segments=10):
    """Split a waveform into segments and build one spectrogram per segment.

    Reads the module-level ``f_min`` and ``f_max`` for the mel frequency range.

    Parameters
    ----------
    audio : numpy.ndarray
        1-D waveform of the whole recording.
    shape : sequence of two ints
        ``(n_mels, width)``. ``shape[0]`` is used as the number of mel bands
        and each spectrogram is resized to ``shape``.
    sr : int
        Sample rate of ``audio``.
    power : float
        Exponent passed to ``librosa.feature.melspectrogram``.
    third_dim : bool
        If true, each spectrogram is replicated into three identical planes
        on a trailing axis, giving ``(n_mels, width, 3)``; otherwise the 2-D
        spectrogram is returned.
    n_segments : int
        Number of (near-)equal pieces the waveform is cut into.

    Returns
    -------
    list of numpy.ndarray
        One spectrogram, scaled to ``[0, 1]``, per segment.
    """
    shape = tuple(int(dim) for dim in shape)

    final_list = []

    # array_split cuts into n_segments pieces even when the length is not an
    # exact multiple (np.split raises in that case); for exact multiples the
    # two are identical
    for sub_array in np.array_split(audio, n_segments):

        # the signal is passed as `y=` because newer librosa releases made it
        # keyword-only
        stft = librosa.feature.melspectrogram(y=sub_array, sr=sr, power=power,
                                              fmin=f_min, fmax=f_max, n_mels=shape[0])
        stft_to_db = librosa.amplitude_to_db(np.abs(stft))

        # resize array
        stft_to_db = resize(stft_to_db, shape)

        # min-max normalize; skip the division for a constant image
        stft_to_db = stft_to_db - np.min(stft_to_db)
        peak = np.max(stft_to_db)
        if peak > 0:
            stft_to_db = stft_to_db / peak

        # Replicate into three identical planes on the LAST axis. Stacking
        # on axis 0 and then reshaping to (h, w, 3) would only reinterpret
        # the flat buffer and scramble the image. NOTE: the training data
        # must be laid out the same way.
        if third_dim:
            stft_to_db = np.stack((stft_to_db, stft_to_db, stft_to_db), axis=-1)

        final_list.append(stft_to_db)

    return final_list

In [None]:
# Restore the best checkpoint; compile=False skips restoring the compile
# state (optimizer, loss, metrics) stored with the model.
best_model_path = './saved-model-best.hdf5'
model = keras.models.load_model(best_model_path, compile=False)

In [None]:
# Re-attach optimizer, loss and metric to the reloaded model.
loss_fn = tf.keras.losses.CategoricalCrossentropy()
eval_metrics = [LWLRAP(24)]

model.compile(optimizer=optimizer, loss=loss_fn, metrics=eval_metrics)

In [None]:
# Column layout of the submission: 'recording_id' followed by one column per
# class, taken from the sample submission.
temp = pd.read_csv('../input/rfcx-species-audio-detection/sample_submission.csv')
cols = temp.columns.tolist()

# Test recordings, listed once and in a fixed order so predictions and file
# names are guaranteed to line up (os.walk would also descend into
# sub-directories and leave `files` pointing at the last directory visited).
test_path = '../input/rfcx-species-audio-detection/test'
files = sorted(f for f in os.listdir(test_path) if f.endswith('.flac'))

# total number of files for the progress report
total = len(files)

# one mean probability vector per file
prediction_list = []

for n, file in enumerate(files):

    # load audio at its native sample rate
    loaded_audio, sample_rate = librosa.load(os.path.join(test_path, file), sr=None)

    # split into segments and build one spectrogram per segment, using the
    # sample rate the file was actually loaded with
    test_list = create_test_spec(loaded_audio, shape=shape, sr=sample_rate, power=power)

    # predict every segment, then average the class probabilities
    preds = model.predict(np.array(test_list))
    prediction_list.append(np.mean(preds, axis=0))

    if n%100 == 0:
        print(f'{n} of {total} files processed')

# Assemble the submission frame: class probabilities plus the recording id.
# The extension is stripped with os.path.splitext; a pattern such as
# r'.flac$' passed to str.replace is treated as a literal string by newer
# pandas and would leave the ids unchanged.
temp_df = pd.DataFrame(prediction_list, columns=cols[1:])
temp_df['recording_id'] = [os.path.splitext(f)[0] for f in files]

# reorder to the submission layout (copy so later edits do not hit a view)
sub_df = temp_df[cols].copy()

In [None]:
sub_df.head()

In [None]:
# Write the predictions to the submission file, without the index column.
submission_path = './submission.csv'
sub_df.to_csv(submission_path, index=False)