In [None]:
import os
import warnings
import shutil
warnings.filterwarnings(action='ignore')

import math
import pandas as pd
import librosa
import librosa.display
import numpy as np
import seaborn as sns; sns.set(style='whitegrid')
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from PIL import Image
from tqdm import tqdm,tnrange,tqdm_notebook
import tensorflow as tf
from tqdm.keras import TqdmCallback
from keras.callbacks import ReduceLROnPlateau,EarlyStopping,ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator 
from tensorflow.keras import applications as app
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten,AveragePooling2D
from tensorflow.keras.layers import Dense,BatchNormalization,Dropout 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.applications import EfficientNetB4, ResNet50,ResNet101, VGG16, MobileNet, InceptionV3

In [None]:
# Global coefficients: the single place to tune the notebook
class coefs:
    """Namespace of notebook-wide settings, read everywhere as ``coefs.<name>``."""

    # --- subset generation ---
    rat_id = 4          # keep only recordings with rating >= rat_id
    recs = 200          # a species needs at least this many recordings
    max_files = 1500    # final cap on the number of audio files (rows) kept
    thresh = 0.25       # probability the top label must exceed to be reported
    submission = True   # True: submission output only (less inference output)

    # --- audio / spectrogram / training ---
    seed = 1337
    sr = 32_000         # sample rate passed to librosa
    sl = 5              # segment length in seconds
    sshape = (48, 128)  # spectrogram image size, height x width
    fmin = 500          # lowest mel frequency
    fmax = 12_500       # highest mel frequency
    n_epoch = 100       # maximum number of training epochs
    cutoff = 15         # seconds of audio loaded per file (3 segments of `sl`);
                        # soundscape inference sets this to 600

# True selects the local Windows-style relative paths, False the Kaggle input paths
path_switch = False

# Helper Functions Stored Below

In [None]:
# Plot Keras Training History
def HistPlot(hist=None):
    """Plot the accuracy and loss curves of a training run side by side.

    Parameters
    ----------
    hist : object, optional
        Anything exposing a ``history`` mapping with the keys ``'accuracy'``,
        ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``. When omitted, the
        notebook-level variable ``history`` is used, so the existing
        ``HistPlot()`` call keeps working.

    Raises
    ------
    NameError
        If ``hist`` is omitted and no global ``history`` exists yet.
    """
    if hist is None:
        hist = history  # notebook global assigned by the model.fit cell

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    sns.despine(top=True, left=True, bottom=True)

    # Both panels share the same layout; only the metric differs.
    for axis, metric in zip(ax, ('accuracy', 'loss')):
        axis.plot(hist.history[metric])
        axis.plot(hist.history['val_' + metric])
        axis.set_title('model ' + metric)
        axis.set_ylabel(metric)
        axis.set_xlabel('epoch')
        # the legend used to be drawn on the loss panel only
        axis.legend(['train', 'test'], loc='upper left')
        axis.grid(True, linestyle='--', alpha=0.5)
    plt.show()

# Split the Input signal into segments
def split_signal(sig):
    """Cut ``sig`` into consecutive, non-overlapping, full-length segments.

    A segment holds ``int(coefs.sl * coefs.sr)`` samples. A trailing remainder
    shorter than that is dropped, so the result may be an empty list.
    """
    seg_len = int(coefs.sl * coefs.sr)
    # last start offset that still leaves room for a complete segment
    last_start = len(sig) - seg_len
    return [sig[start:start + seg_len]
            for start in range(0, last_start + 1, seg_len)]

# extracts spectrograms and saves them in a working directory
def get_spectrograms(filepath, primary_label, output_dir):
    """Turn one audio file into mel-spectrogram PNGs, one per full segment.

    The first ``coefs.cutoff`` seconds of ``filepath`` are loaded at
    ``coefs.sr``, cut into segments with ``split_signal`` and each segment is
    written as an 8-bit greyscale image to
    ``<output_dir>/<primary_label>/<file stem>_<segment index>.png``.

    Parameters
    ----------
    filepath : str
        Path of the audio file to read.
    primary_label : str
        Name of the sub-folder of ``output_dir`` the images are stored in.
    output_dir : str
        Root folder of the image dataset; created on demand.

    Returns
    -------
    list of str
        Paths of the images written, in segment order (empty when the audio
        is shorter than one segment; no folder is created in that case).
    """
    # duration is set from global variable
    sig, _ = librosa.load(filepath, sr=coefs.sr, offset=0.0, duration=coefs.cutoff)
    sig_splits = split_signal(sig)  # split the signal into parts

    # Loop-invariant values: hop chosen so a segment spans sshape[1] frames
    hop_length = int(coefs.sl * coefs.sr / (coefs.sshape[1] - 1))
    save_dir = os.path.join(output_dir, primary_label)
    file_stem = os.path.splitext(os.path.basename(filepath))[0]

    if sig_splits:
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(save_dir, exist_ok=True)

    # Extract mel spectrograms for each audio chunk
    saved_samples = []
    for s_cnt, chunk in enumerate(sig_splits):
        mel_spec = librosa.feature.melspectrogram(y=chunk,
                                                  sr=coefs.sr,
                                                  n_fft=1024,
                                                  hop_length=hop_length,
                                                  n_mels=coefs.sshape[0],
                                                  fmin=coefs.fmin,
                                                  fmax=coefs.fmax)

        # NOTE(review): melspectrogram already returns a power spectrogram by
        # default, so squaring it again stretches the dB range. Kept as-is so
        # newly generated images stay comparable with existing ones.
        mel_spec = librosa.power_to_db(mel_spec**2, ref=np.max)

        # Normalize to [0, 1]
        mel_spec -= mel_spec.min()
        peak = mel_spec.max()
        if peak > 0:
            mel_spec /= peak
        # else: a constant chunk (e.g. digital silence) stays all zeros
        # instead of becoming NaN through 0/0

        # Save as image file
        save_path = os.path.join(save_dir, file_stem + '_' + str(s_cnt) + '.png')
        Image.fromarray(mel_spec * 255.0).convert("L").save(save_path)
        saved_samples.append(save_path)

    return saved_samples

# https://stackoverflow.com/questions/1524126/how-to-print-a-list-more-nicely
def list_columns(obj, cols=4, columnwise=True, gap=4):
    """Print the items of ``obj`` as left-aligned text columns.

    Parameters
    ----------
    obj : iterable
        Items to print; each one is converted with ``str``.
    cols : int
        Requested number of columns (clamped to ``1 .. len(obj)``).
    columnwise : bool
        True fills the table top-to-bottom, column after column;
        False fills it left-to-right, row after row.
    gap : int
        Extra spaces added to the width of the longest item.

    Nothing is printed for an empty ``obj`` (this used to raise ``ValueError``).
    """
    sobj = [str(item) for item in obj]
    if not sobj:
        return
    cols = max(1, min(cols, len(sobj)))
    width = max(len(item) for item in sobj) + gap

    # Size of each slice: items per row, or items per column when columnwise
    chunk = int(math.ceil(len(sobj) / cols)) if columnwise else cols
    plist = [sobj[i:i + chunk] for i in range(0, len(sobj), chunk)]
    if columnwise:
        # pad the last column to full height, then transpose columns into rows
        plist[-1].extend([''] * (chunk - len(plist[-1])))
        plist = zip(*plist)
    print('\n'.join(''.join(cell.ljust(width) for cell in row) for row in plist))

# **KERAS BASED MODEL GENERATION**

<b>Notebook Aim & Modifications</b>

- The aim of this notebook is to slightly expand on the already very useful notebook posted by the host; [notebook](https://www.kaggle.com/stefankahl/birdclef2021-model-training).
- That notebook contains a <b>single layer (single-channel input) model approach</b>, which, as it turns out, cannot be used with more <b>sophisticated pretrained models</b>. I regrouped a few things, and overall there doesn't seem to be a big difference between the two approaches when it comes to training.

<b>Dataloaders & Augmentation</b>

- <b>Dataloaders</b> are used here (as opposed to the additional step of reloading the images into a numpy array) in order for one to use <b>image augmentations</b>, which help improve the model during training. The winning entry of the previous competition hosted by the same lab used <b>noise</b>, as an example.
- An example notebook which shows the <b>benefit of image augmentation</b> can be seen here; [Hummingbird Classification with CNN](https://www.kaggle.com/shtrausslearning/hummingbird-classification-with-cnn). If you are interested in <b>birds & their classification</b>, which I assumed a lot of you are, consider taking out the [Hummingbird Dataset](https://www.kaggle.com/akimball002/hummingbirds-at-my-feeders) dataset for a spin by [Amanda K Kimball](https://www.kaggle.com/akimball002/cnn-hummingbird-speciesgender-image-classification) and liking her work. Accurate bird classification most definitely requires the addition of video for accurate classification and not just sound, which is why I brought the above example up.

<b>By No Means Complete</b>

- The notebook, like the one posted by the host, is by no means complete, <b>subsets are created via parameter selection</b> (rating filter,recording number per specie,general limiter)
- These subsets are barely even able to include the species actually present in the <b>training soundscapes</b> to begin with (as you will see in the soundscape inference section), and that is before any training has been done.
- It's likely this is a critical step, not just for this competition, but for bird classification in general. Some ideas have already been put forward in this notebook, [At the right place in the right time?](https://www.kaggle.com/aramacus/at-the-right-place-in-the-right-time)

# <sub>1.</sub> <span style='color:#F7765E'><sub>SUBSET GENERATION</sub></span>
- As per the host's notebook, a simple subset of potential birds that will be present in the <b>soundscape</b> is chosen;
    - <b>rating limitation</b>; only high quality recordings (as per Xeno Laws) are used.
    - <b>recording per specie limitation</b>; only species with an appropriate amount of recordings in the dataset are kept.
    - <b>overall limiter of rows</b>; general final limiter.
    
    
- Having limited the dataset, <b>spectrograms</b> are generated; these arrays are exported as images and re-imported via a dataloader during training.
- <b>subset</b> is used to tell the dataloader which part of the data is the <b>training data</b> & which is the <b>validation data</b> when calling the <b>flow_from_directory</b> function/method.
- One slight concern I have with this approach of splitting the general non (train/vald) sorted folder data via <b>identical seed</b> & <b>validation_split</b> specification in the <b>ImageDataGenerator</b> input is the potential occurrence of image leakage.

In [None]:
''' CREATE A SUBSET OF THE DATA '''
print('STEP 1) CREATING A SUBSET OF DATASET:\n')

if path_switch:
    lpath = '.\\train_metadata.csv'
else:
    lpath = '../input/birdclef-2021/train_metadata.csv'
train = pd.read_csv(lpath)
print(f"[DATASET]: {train.shape} : LABELS {train.primary_label.nunique()}")

# subset filter 1 (rating)
train = train.query(f'rating>={coefs.rat_id}')
print('\nRATING LIMITER APPLIED:')
print(f'[SUBSET]: {train.shape} : LABELS {train.primary_label.nunique()}')

# subset filter 2 (number of recordings per specie)
# Labels and counts come from ONE grouped result. Zipping unique() (order of
# appearance) with groupby().count() (sorted order) only pairs them correctly
# when the metadata file happens to be sorted by label.
birds_count = train.groupby('primary_label').size().to_dict()
to_model_spec = [key for key, value in birds_count.items() if value >= coefs.recs]

print(f'\n {coefs.recs}+ RECORDINGS ONLY BIRDS LIMITED:')
TRAIN = train.query('primary_label in @to_model_spec')
LABELS = sorted(TRAIN.primary_label.unique())
print(f'[SUBSET]: {TRAIN.shape} : LABELS {len(LABELS)}')

print('\n BIRD LABELS AVAILABLE AFTER FILTER:')
if not to_model_spec:
    # fail with an actionable message instead of an opaque error further down
    raise ValueError(f'No species has {coefs.recs}+ recordings with rating>={coefs.rat_id}; '
                     'lower coefs.recs or coefs.rat_id.')
list_columns(to_model_spec, cols=4, columnwise=True, gap=4)

# subset filter 3 (max audio files)

# Shuffle the training data and limit the number of audio files to max_files
print('\nLIMITING AUDIO FILES ...')
TRAIN = shuffle(TRAIN, random_state=coefs.seed)[:coefs.max_files]
# the cap may drop whole species, so the label list is rebuilt from what is left
LABELS = sorted(TRAIN.primary_label.unique())
print(f'[SUBSET]: {TRAIN.shape} : LABELS {len(LABELS)}')

In [None]:
''' CREATE & OUTPUT SPECTOGRAMS FOR TRAINING'''
# we will use CNN approach

# Parse audio files and extract training samples
if path_switch:
    input_dir = '.\\train_short_audio\\'
    output_dir = '.\\working\\melspectrogram_dataset\\'
else:
    input_dir = '../input/birdclef-2021/train_short_audio/'
    output_dir = './/working/melspectrogram_dataset/'

# Start from an empty dataset folder. Class folders left over from a run with
# other subset settings would otherwise be picked up by flow_from_directory
# and no longer match LABELS / the size of the model's output layer.
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

wanted_labels = set(to_model_spec)  # O(1) membership test per recording
samples = []
with tqdm_notebook(total=len(TRAIN)) as pbar:
    for label, audio_name in zip(TRAIN.primary_label, TRAIN.filename):
        if label in wanted_labels:
            audio_file_path = os.path.join(input_dir, label, audio_name)
            samples.extend(get_spectrograms(audio_file_path, label, output_dir))
        pbar.update(1)  # count the file once it has actually been processed

TRAIN_SPECS = shuffle(samples, random_state=coefs.seed)
print('SUCCESSFULLY EXTRACTED {} SPECTROGRAMS'.format(len(TRAIN_SPECS)))

In [None]:
''' DATALOADERS '''
# Create Data Generators/Loader for Keras, images not to be deleted

train_folder = './working/melspectrogram_dataset/'

# Both loaders rescale pixels to [0, 1] and declare the same 80/20 split;
# only the training loader additionally augments (shear).
train_datagen = ImageDataGenerator(rescale=1./255,
                                   validation_split=0.2,
                                   shear_range=10,
                                   fill_mode='nearest')
valid_datagen = ImageDataGenerator(rescale=1./255,
                                   validation_split=0.2)

# Settings common to both flow_from_directory calls
flow_options = dict(target_size=(coefs.sshape[0], coefs.sshape[1]),
                    batch_size=32,
                    seed=42,
                    class_mode='categorical')

train_generator = train_datagen.flow_from_directory(train_folder,
                                                    subset="training",
                                                    **flow_options)
validation_generator = valid_datagen.flow_from_directory(train_folder,
                                                         subset="validation",
                                                         **flow_options)

# <sub>2.</sub> <span style='color:#F7765E'><sub>MODEL GENERATION</sub></span>

<b>Base Model</b>

- The same model is used from [notebook](https://www.kaggle.com/stefankahl/birdclef2021-model-training), with the exception of a three layer <b>input shape (X,X,3)</b>

<b>Pretrained Models</b>

- Pretrained Models all require 3 layer (3-channel) inputs in the input shape.
- Pretrained Models are also provided in the function, <code>pretrained_model</code>, which requires one to specify which <b>head model</b> is chosen. 
- The <b>tail end</b> Dense Layer is also fixed, by no means optimal and adjusted to be used for classification in this problem.
- <b>head weight coefficients</b> are often fixed to prevent overfitting, the same is done here.

<b>The Rest</b>

- Compilation settings, <b>optimiser</b>, <b>loss function</b> & <b>evaluation metric</b> are all identical to the previous notebook.
- <b>Callbacks</b> are all quite standard, <b>TqdmCallback</b> is used to reduce keras training output.
- <b>Train & Validation Generators</b> are used for training and evaluation during training, defined earlier.
- <b>Results</b> of the <b>evaluation metric (accuracy)</b> & <b>model loss</b> are plotted for each iteration of image dataset passes (epoch).

In [None]:
tf.random.set_seed(coefs.seed)

_kl = tf.keras.layers
_input_shape = (coefs.sshape[0], coefs.sshape[1], 3)

# Four conv blocks (16, 32, 64, 128 filters), each made of
# 3x3 convolution -> batch normalisation -> 2x2 max pooling
_stack = []
for _filters in (16, 32, 64, 128):
    # only the very first layer declares the input shape
    _first_layer_args = {} if _stack else {'input_shape': _input_shape}
    _stack += [_kl.Conv2D(_filters, (3, 3), activation='relu', **_first_layer_args),
               _kl.BatchNormalization(),
               _kl.MaxPooling2D((2, 2))]

# Global pooling instead of flatten()
_stack.append(_kl.GlobalAveragePooling2D())

# Dense block: two Dense(256) layers, each followed by 50% dropout
for _repeat in range(2):
    _stack += [_kl.Dense(256, activation='relu'), _kl.Dropout(0.5)]

# Classification layer: one softmax unit per label
_stack.append(_kl.Dense(len(LABELS), activation='softmax'))

model = tf.keras.Sequential(_stack)

In [None]:
tf.random.set_seed(coefs.seed)
def pretrained_model(head_id):
    """Build a classifier from an ImageNet-pretrained network plus a dense tail.

    Parameters
    ----------
    head_id : str
        One of ``'vgg'``, ``'resnet'``, ``'mobilenet'``, ``'inception'`` or
        ``'efficientnet'``.

    Returns
    -------
    Sequential
        The pretrained network (frozen) followed by
        Flatten -> Dense(1024) -> Dropout -> Dense(len(LABELS), softmax).
        The model summary is printed as a side effect.

    Raises
    ------
    ValueError
        If ``head_id`` is not one of the names above.
    """
    input_shape = (coefs.sshape[0], coefs.sshape[1], 3)

    # Define model with different applications
    model = Sequential()

    # --- Pretrained part ---
    # Names are compared with ==; `is` tests object identity and only matched
    # by accident of string interning.
    # NOTE(review): per the Keras applications docs `classes` is only used
    # when include_top=True; the values are kept as they were.
    if head_id == 'vgg':
        model.add(VGG16(input_shape=input_shape,
                        pooling='avg',
                        classes=1000,
                        include_top=False,
                        weights='imagenet'))

    elif head_id == 'resnet':
        model.add(ResNet101(include_top=False,
                            input_tensor=None,
                            input_shape=input_shape,
                            pooling='avg',
                            classes=100,
                            weights='imagenet'))

    elif head_id == 'mobilenet':
        model.add(MobileNet(alpha=1.0,
                            depth_multiplier=1,
                            dropout=0.001,
                            include_top=False,
                            weights="imagenet",
                            input_tensor=None,
                            input_shape=input_shape,
                            pooling=None,
                            classes=1000))

    elif head_id == 'inception':
        # NOTE(review): the original comment here read "75x75", presumably
        # the minimum input size of InceptionV3; confirm coefs.sshape is
        # large enough before selecting this option.
        model.add(InceptionV3(input_shape=input_shape,
                              include_top=False,
                              weights='imagenet'))

    elif head_id == 'efficientnet':
        model.add(EfficientNetB4(input_shape=input_shape,
                                 include_top=False,
                                 weights='imagenet'))

    else:
        # previously an unknown name silently produced a model without any
        # pretrained part
        raise ValueError("unknown head_id %r; expected 'vgg', 'resnet', 'mobilenet', "
                         "'inception' or 'efficientnet'" % (head_id,))

    # --- Tail: classification layers trained for this problem ---
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.01))
    model.add(Dense(len(LABELS), activation='softmax'))

    # freeze main model coefficients (layer 0 is the pretrained network)
    model.layers[0].trainable = False
    model.summary()

    return model

# Select & Comment out above cell if used
# model = pretrained_model('resnet') # define the model
# model = tf.keras.models.load_model('../input/birdclef-resnet101-1/best_model.h5') # Reload your model 

In [None]:
# Compile the model and specify optimizer, loss and metric
# (`learning_rate` replaces the deprecated `lr` alias of the optimizer)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.01),
              metrics=['accuracy'])

# Add callbacks to reduce the learning rate if needed, early stopping, and checkpoint saving.
# The callbacks are taken from tf.keras, the same package the model is built with.
# NOTE(review): EarlyStopping (patience=5) always ends training long before
# ReduceLROnPlateau (patience=50) can fire, so the learning rate is never
# reduced. The values are left unchanged so results stay comparable; lower the
# ReduceLROnPlateau patience below 5 if the reduction is actually wanted.
callbacks = [tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=50, verbose=1, factor=0.5),
             tf.keras.callbacks.EarlyStopping(monitor='val_loss', verbose=1, patience=5),
             tf.keras.callbacks.ModelCheckpoint(filepath='best_model.h5', monitor='val_loss',
                                                verbose=0, save_best_only=True),
             TqdmCallback(verbose=0)
            ]
model.summary()

In [None]:
# Train on the training loader and validate on the held-out split after each
# epoch. Keras' own log is silenced (verbose=0); the callbacks report instead.
history = model.fit(
    train_generator,
    epochs=coefs.n_epoch,
    validation_data=validation_generator,
    callbacks=callbacks,
    verbose=0,
)

In [None]:
# Plot the training curves; HistPlot reads the notebook-level `history`
# assigned by the model.fit cell, so that cell has to be run first
HistPlot() # plot accuracy metric & loss function

In [None]:
''' I. HELPER FUNCTIONS'''
# Define Path for Training & Test Soundscape Data
# If competition reruns notebook, test data folder will be loaded

# Print Available Soundscape Files
def print_available(verbose = False,
                    test_dir = '../input/birdclef-2021/test_soundscapes',
                    train_dir = '../input/birdclef-2021/train_soundscapes'):
    """Return the paths of the soundscape ``.ogg`` files to run inference on.

    Files are taken from ``test_dir``; when that folder is missing or holds
    no ``.ogg`` file, ``train_dir`` is used instead.

    Parameters
    ----------
    verbose : bool
        Also print the number of files and an indexed listing.
    test_dir, train_dir : str
        Folders to search; the defaults are the competition input folders.

    Returns
    -------
    list of str
        Paths sorted by file name (empty when neither folder has ``.ogg`` files).
    """
    def list_files(path):
        # a missing folder counts as empty so the fallback still applies
        if not os.path.isdir(path):
            return []
        # sorted: os.listdir returns entries in arbitrary order
        return [os.path.join(path, f) for f in sorted(os.listdir(path))
                if os.path.splitext(f)[1] == '.ogg']

    test_audio = list_files(test_dir) or list_files(train_dir)

    if verbose:
        print('AVAILABLE SOUNDSCAPES:')
        print('{} FILES IN TEST SET.'.format(len(test_audio)))
        print('')
        for position, audio_path in enumerate(test_audio):
            print(position, audio_path)

    return test_audio

# Print the labels that will be used to train the model
def print_label_species():
    """Print the notebook-level ``to_model_spec`` list via ``list_columns``.

    Takes no arguments and returns nothing; the layout is fixed to 4 columns,
    filled column-wise, with a gap of 2.
    """
    list_columns(to_model_spec, cols=4, columnwise=True, gap=2)

# <sub>3.</sub> <span style='color:#F7765E'><sub>SOUNDSCAPE INFERENCE (TRAIN/TEST)</sub></span>
- <b>Function <code>soundscape_records</code> requires:</b>
    - pathway to <b>soundscape</b> file.
    - model to be used for <b>evaluation (predict)</b>
    - submission option allows one to quickly switch from <b>soundscape investigations/confirmations</b> to <b>submission format</b>
    
    
- <b>The function does the following:</b>
    - Firstly, clears common/temporary <code>mel_soundscape</code> folder used for <b>spectogram</b> export for each individual soundscape file.
    - Reads soundscape audio using librosa (up to 600 seconds) & temporarily stores the resulting 5 second segment spectrograms (120 images for a full 600 second recording) in the folder <code>mel_soundscape</code>.
    - Keras <b>Dataloader/Data Generator is created</b> for this "test" set of spectrogram segments.
    - <b>Inference is conducted</b> with the <b>imported model</b> on the "test" data generator (pixel rescaling only, no augmentation); <b>results (probability)</b> are stored in a common array and extracted individually.
    - For each segment, if a probability exceeds <b>a threshold</b>, the label corresponding to that probability is stored in dictionary, <b>data</b>. If none of the probabilites exceeds this threshold, a no bird call result is stored <b>nocall</b>.
    - Individual soundscape results are stored in local DataFrame and passed via return, the results for all possible soundscape files are then stored in a unified DataFrame <b>(df_allres)</b>, which is used for submission.
   
   
- <b>Unified submission option (submission=False)</b>:
    - <b>False</b> is used for different training soundscape investigations, eg. comparison of birds correctly predicted / all available birds ..., 
    - <b>True</b> option is used for creating a submission...

In [None]:
pd.set_option('display.max_rows', None)
# Function to evaluate inference on given recording
def soundscape_records(path,model,submission=False):
    """Predict one label per 5-second window of a soundscape recording.

    The recording is cut into segment spectrograms with ``get_spectrograms``
    (written to a scratch folder that is wiped first), every segment is scored
    by ``model``, and the top label is kept when its probability exceeds
    ``coefs.thresh`` (otherwise ``'nocall'``).

    Parameters
    ----------
    path : str
        Path of the soundscape audio file.
    model :
        Model whose ``predict`` yields one probability row per image, with
        columns ordered like ``LABELS``.
    submission : bool
        False: compare the predictions with ``train_soundscape_labels.csv``,
        print a short report and return 0.
        True: return the predictions.

    Returns
    -------
    pandas.DataFrame or int
        With ``submission`` true, a frame with the columns ``row_id`` and
        ``birds`` ordered by time; otherwise ``0``.
    """
    soundscape_folder = './/working/mel_soundscape/'

    # before prediction clear read folder if it exists
    if os.path.exists(soundscape_folder):
        shutil.rmtree(soundscape_folder)

    print('*** READING NEW FILE & STARTING PREDICTION... ***')
    print(f'Reading File: {path}')

    # 1. CREATE SOUNDSCAPE SPECTROGRAMS AND SAVE THEM
    # Load up to 600 s so the whole soundscape is cut into segments (120 files
    # for a 600 s recording). The training value of coefs.cutoff is restored
    # afterwards so re-running the training cells is not silently affected.
    training_cutoff = coefs.cutoff
    coefs.cutoff = 600
    try:
        saved = get_spectrograms(path, 'soundscape', soundscape_folder)
    finally:
        coefs.cutoff = training_cutoff

    # 2. LOAD IMAGE FILES & 3. MAKE MODEL PREDICTION
    if saved:
        gen_datagen = ImageDataGenerator(rescale=1./255)
        # shuffle=False keeps the prediction rows aligned with gen_test.filenames;
        # the default (shuffle=True) returns them in random order.
        gen_test = gen_datagen.flow_from_directory(soundscape_folder,
                            target_size=(coefs.sshape[0],coefs.sshape[1]),
                            batch_size=32,
                            shuffle=False,
                            class_mode='categorical')
        # for each class predict probability for all images simultaneously
        scores = model.predict(gen_test, verbose=1)
        image_names = gen_test.filenames
    else:
        # recording shorter than one segment: nothing was written, nothing to score
        scores, image_names = [], []

    # 4. ONE RESULT ROW PER SEGMENT
    file_id = os.path.basename(path).rsplit('_', 1)[0]
    rows = []
    for image_name, probs in zip(image_names, scores):
        # The segment index is the trailing "_<n>" of the image name. It is
        # read back from the name because the loader lists files in
        # lexicographic order ("_10" before "_2"), not in time order.
        stem = os.path.splitext(os.path.basename(image_name))[0]
        end_time = int((int(stem.rsplit('_', 1)[-1]) + 1) * coefs.sl)

        idx = probs.argmax()      # possibly not best choice
        score = probs[idx]
        # the prediction only counts when its probability exceeds the threshold
        label = LABELS[idx] if score > coefs.thresh else 'nocall'
        rows.append((end_time, file_id + '_' + str(end_time), label, score))
    rows.sort(key=lambda row: row[0])  # chronological order

    row_ids = [row[1] for row in rows]
    labels = [row[2] for row in rows]

    if submission:
        # return one soundscape inference
        return pd.DataFrame({'row_id': row_ids, 'birds': labels},
                            columns = ['row_id', 'birds'])

    # 5. COMPARE WITH THE GROUND TRUTH & SHOW RESULTS
    results = pd.DataFrame({'row_id': row_ids,
                            'prediction': labels,
                            'score': [row[3] for row in rows]},  # confidence score
                           columns = ['row_id', 'prediction', 'score'])
    gt = pd.read_csv('../input/birdclef-2021/train_soundscape_labels.csv')
    results = pd.merge(gt, results, on='row_id') # merge only at available rows
    results['outcome'] = results.birds == results.prediction
    present = results.birds.unique()
    intersection_set = list(set(LABELS) & set(results.birds.to_list()))

    print('1A. Before Prediction:')
    list_columns(LABELS, cols=8, columnwise=True, gap=4)
    print('1B. Birds Present')
    if len(present):
        list_columns(present)
    print(f'bird overlap: {len(intersection_set)}/{len(present)} are even present')

    print('\n 2. All Predictions:')
    print(results.outcome.value_counts())
    print('')

    print('3. Bird Predictions Only:')
    df_bird = results[results.birds!='nocall']
    print(df_bird.outcome.value_counts())
    print('\n\n')
    return 0
    
''' MAIN INFERENCE OPTIONS '''
# Use model to evaluate on soundscape segments in one or many files

# A. Get Pathways to Soundscapes
test_audio = print_available()

# B. Inference on all Soundscape, tlist stores all soundscape individual results
# model = tf.keras.models.load_model('best_model.h5') # load external model
tlist = []
for soundscape_path in test_audio:
    df_infer = soundscape_records(soundscape_path, model, coefs.submission)
    if coefs.submission:
        tlist.append(df_infer)

# combine all soundscape inference results
if coefs.submission:
    if tlist:
        df_allres = pd.concat(tlist)
    else:
        # no soundscape found: still write a header-only file instead of
        # failing inside pd.concat
        df_allres = pd.DataFrame(columns=['row_id', 'birds'])
    df_allres.to_csv("submission.csv", index=False)

# C. Inference for one soundscape
# model = tf.keras.models.load_model('best_model.h5')
# soundscape_records(test_audio[0],model,coefs.submission)

# Remove Training Spectrums to not show them in output
# (ignore_errors: re-running the cell after the folder is gone must not fail)
shutil.rmtree('.//working/melspectrogram_dataset/', ignore_errors=True)
os.listdir('.//working//')