In [1]:
''' GLOBAL CONFIGURATION CLASS '''

class cfg:
    """Notebook-wide settings, read as plain class attributes (never instantiated)."""

    # --- environment -------------------------------------------------------
    local = False        # False: paths use forward slashes and the cwd is moved up two levels
    seed = 1337          # random seed id

    # --- training ----------------------------------------------------------
    kfold = 4            # number of kfolds to be used
    n_epoch = 100        # training epochs

    # --- audio loading -----------------------------------------------------
    sr = 32000           # sample rate passed to librosa
    sl = 5               # segment length in seconds
    offset = None        # offset of the input signal (passed to librosa.load)
    cutoff = 15          # seconds of audio loaded per file, i.e. 3 segments of `sl`

    # --- spectogram --------------------------------------------------------
    pw = 2               # exponent applied to the mel spectogram before dB conversion
    sshape = (96, 256)   # height x width of spectogram images
    fmin = 500           # spectrum min frequency
    fmax = 12500         # spectrum max frequency
    nfft = 1024          # FFT window size
    # hop length that spreads one segment over `sshape[1]` spectogram columns
    hop_len = int(sr * sl / (sshape[1] - 1))

    # --- broadband model ---------------------------------------------------
    model_bins = 20      # split signal into bins

In [2]:
''' IMPORT MODULES & HELPER FUNCTIONS '''

import os, random
import numpy as np
import math
from PIL import Image
import copy
import shutil
from tqdm import tqdm,tqdm_notebook
import pandas as pd
import librosa
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import librosa.display
import IPython.display as ipd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from scipy.signal import find_peaks
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
    
''' HELPER FUNCTION '''
lst_col = ['#B1D784','#2E8486','#004379','#032B52','#EAEA8A']

''' Display List '''
# display list neatly
# https://stackoverflow.com/questions/1524126/how-to-print-a-list-more-nicely
def list_columns(obj, cols=4, columnwise=True, gap=4):
    """Print the items of ``obj`` as a left-aligned, fixed-width grid.

    Parameters
    ----------
    obj : iterable
        Items to print; each one is converted with ``str``.
    cols : int
        Maximum number of columns (capped at the number of items).
    columnwise : bool
        If True, consecutive items run down each column; otherwise they
        run along each row.
    gap : int
        Number of spaces added after the widest item.

    Returns
    -------
    None. The grid is written to stdout. An empty ``obj`` prints nothing.
    """
    cells = [str(item) for item in obj]
    if not cells:
        # max() below has nothing to measure for an empty input
        return

    cols = min(cols, len(cells))
    width = max(len(cell) for cell in cells) + gap

    if columnwise:
        # from here on ``cols`` is the number of items per printed column
        cols = int(math.ceil(float(len(cells)) / float(cols)))

    groups = [cells[i:i + cols] for i in range(0, len(cells), cols)]
    if columnwise:
        # pad the short last group so zip() does not drop trailing items
        groups[-1].extend([''] * (cols - len(groups[-1])))
        groups = zip(*groups)

    print('\n'.join(''.join(cell.ljust(width) for cell in group)
                    for group in groups))

''' Split Signal into Segments'''
# split audio signal into chunks
def split_signal(sig):
    """Cut ``sig`` into consecutive, non-overlapping segments of full length.

    The segment length is ``int(cfg.sl * cfg.sr)`` samples. A trailing
    remainder shorter than one segment is dropped, so a signal shorter than
    one segment yields an empty list.
    """
    seg_len = int(cfg.sl * cfg.sr)
    # last valid start is len(sig) - seg_len, hence the +1 on the stop value
    starts = range(0, len(sig) - seg_len + 1, seg_len)
    return [sig[start:start + seg_len] for start in starts]

''' Pixel to Frequency '''
# get spectogram frequency value
def pxtohz(y_mel_index, sr=cfg.sr, n_fft = 1024, printSummary = True):
    """Return the frequency (Hz) of a mel-spectogram row.

    The mel axis is rebuilt with ``cfg.sshape[0]`` bands between ``cfg.fmin``
    and ``cfg.fmax`` (Slaney scale, ``htk=False``) and the entry at
    ``y_mel_index`` is returned. The value is not snapped to an FFT bin.

    Parameters
    ----------
    y_mel_index : int or float
        Row (pixel) index on the mel axis; truncated with ``int``.
    sr, n_fft, printSummary
        Unused. The original built an FFT frequency grid from ``sr`` and
        ``n_fft`` and searched it for the nearest bin, but discarded the
        result. The parameters are kept so existing calls stay valid.

    Raises
    ------
    IndexError
        If ``y_mel_index`` lies outside the mel axis.
    """
    mel_scale = librosa.core.mel_frequencies(n_mels=cfg.sshape[0],
                                             fmin=cfg.fmin, fmax=cfg.fmax, htk=False)
    return mel_scale[int(y_mel_index)]

''' Split DataFrame into Parts by index '''
# split dataframe into chunks 
def split_dataframe(df, chunk_size = 10000):
    """Split ``df`` into consecutive positional slices of ``chunk_size`` rows.

    Parameters
    ----------
    df : sliceable with ``len`` (e.g. pandas.DataFrame)
        Object to split; chunks are taken with ``df[start:stop]``.
    chunk_size : int
        Maximum number of rows per chunk; must be at least 1.

    Returns
    -------
    list
        Chunks in original order. Only the last chunk may be shorter than
        ``chunk_size``. No empty chunk is appended when ``len(df)`` is an
        exact multiple of ``chunk_size``, and an empty ``df`` gives ``[]``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size}')
    # stepping over start offsets avoids the off-by-one of ``len // size + 1``
    return [df[start:start + chunk_size]
            for start in range(0, len(df), chunk_size)]

''' LOCAL DIRECTORIES '''

# All locations are written Windows-style and relative to the working
# directory; they are converted to forward slashes further down when
# cfg.local is False.

# inputs
BASE_DIR = '.\\kaggle\\input\\birdclef-2021\\'
DIR_SPEC_IN = '.\\kaggle\\input\\birdclef-2021\\train_short_audio\\'
CSV_IN_TRAIN = '.\\kaggle\\input\\birdclef-2021\\train_metadata.csv'

# spectogram options
DIR_SPEC_OUT = '.\\kaggle\\working\\train_96_256_rat4\\'
# DIR_SPEC_OUT = '.\\kaggle\\working\\train_96_256\\'
# DIR_SPEC_OUT = '.\\kaggle\\working\\train_48_128\\'

# outputs
DIR_WEIGHTS = '.\\kaggle\\working\\weights\\'
DIR_KFOLDS = '.\\kaggle\\working\\kfolds\\'
INITIAL_CONDITION = '.\\kaggle\\working\\model_t0.h5'

if cfg.local is False:
    # move two levels up so the relative './kaggle/...' paths resolve
    os.chdir('../..')
    (BASE_DIR, DIR_SPEC_IN, DIR_SPEC_OUT, CSV_IN_TRAIN,
     DIR_WEIGHTS, DIR_KFOLDS, INITIAL_CONDITION) = [
        location.replace('\\', '/')
        for location in (BASE_DIR, DIR_SPEC_IN, DIR_SPEC_OUT, CSV_IN_TRAIN,
                         DIR_WEIGHTS, DIR_KFOLDS, INITIAL_CONDITION)
    ]

print(f'cwd: {os.getcwd()}')

cwd: /


![](https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/8cc1eeaa-4046-4c4a-ae93-93d656f68688/deogxkz-d6664c42-119d-4f85-9ba0-ecdc685c3de6.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7InBhdGgiOiJcL2ZcLzhjYzFlZWFhLTQwNDYtNGM0YS1hZTkzLTkzZDY1NmY2ODY4OFwvZGVvZ3hrei1kNjY2NGM0Mi0xMTlkLTRmODUtOWJhMC1lY2RjNjg1YzNkZTYuanBnIn1dXSwiYXVkIjpbInVybjpzZXJ2aWNlOmZpbGUuZG93bmxvYWQiXX0.o1HrEp0sm9MdhZ0j1jADchdL2uGCZPEb9LwyGD4TfT0)

<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#FFC300;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>1 |</b> INTRODUCTION
    </p>
</div>

### <b><span style='color:#FFC300'>1.1</span> | TONAL PEAK FREQUENCY IDENTIFICATION (TPFI)</b>
- In this notebook, given a <b>set/group</b> of labeled recordings, we'll be looking at a <b>model based approach</b> to determine the frequencies at which a bird species of interest (the primary species of the recordings) tends to call.
- The approach requires the identification of noise within the spectogram, to do that, we will be utilising maximum tonal sound peaks to create a broadband natured model.
- Noise generally can be divided into <b>broadband noise</b> (general noise level) & <b>tonal noises</b> (peaks at specific frequency bins). They don't have precise definitions, but <b>broadband</b> noise can be abstractly defined as the general noise level in an environment, coming from various locations and spread over a broad frequency range. <b>Tonal</b> noise sources tend to be associated with very clearly distinguishable noise peaks at specific frequencies (or over a small frequency range).
- When we look at a spectogram, each bird specie tends to create quite a repetitive collection of freq vs time structures, usually across a specific frequency range, usually it's a combination of tonal peaks that make up an entire bird call. 
- In this approach, the two terms are used even looser, since there is a time element to this model from the STFT, which can be useful in a variety of scenarios.
- The tonal peak frequency identification approach relies on the assumption that the more data is fed into the system, the more precise the result should get, as occasional secondary birds & other noises should eventually start to show more dissipative distribution in the entire subset that is analysed.

### <b><span style='color:#FFC300'>1.2</span> | SOUND EVENT DETECTION (SED)</b>
- There are various ways to detect the presence of tonal peaks (usually what we are interested in) within a spectogram; in fact you can simply do it by dividing the audio clip into segments & doing an FFT for each segment, followed by a comparison of the correlation between the segment FFTs. STFT conversion already introduces this time component, which is very handy and exactly what we'll use to create a model that will be used in SED.
- These spectograms ( obtained via STFT ) go much further and can usually contain quite a lot information (relevant and irrelevant). They capture various noise sources not even associated with the primary specie which it was weakly labeled.
- With a time domain component (as opposed to a standard FFT), creating <b>a model containing a time element</b> can be quite handy compared to a single overall FFT, since call time is a critical component in a call noise structure.
- When attempting Sound Event Detection (SED) in the spectogram, we'll probably run into some logistical issues of how to actually identify these tonal noise sources; determining peak locations was the first thing that came to mind.
- In this problem, the peak cut off threshold and the tonal peak's relativeness will depend on a constructed model, which will act to recreate the general sound level noise curve, having a time dependency as well, which can be more useful than a similar one constructed from FFT as opposed to STFT.

### <b><span style='color:#FFC300'>1.3</span> | (ENSEMBLED) BROADBAND MODEL</b>
- The STFT based broadband model has applications which go slightly outside the scope of the current notebook, its purpose in this problem is outlined below.
- Using data from STFT, the constructed model serves a few purpose;
  - (1) It is used for creating <b>an ensemble model from all individual time bin models</b> that oscillates less, thus creating fewer peaks when combined with scipy's peak identification module
  - (2) It is also useful for identifying how cluttered a particular frequency is with tonal peaks; this way we can easily identify constantly occurring noises, such as insects or bird groups, & build a collection of unique bird calls, when combined with a simple correlation evaluation for all functions
 
### <b><span style='color:#FFC300'>1.4</span> | DETERMINING FREQUENCY CUTOFF FOR SPECTOGRAM</b>
- When we want to use spectograms as CNN inputs, ultimately several key variables come into play: <b>minimum</b> & <b>maximum</b> frequencies determine the cutoff frequency points in the spectogram.
- We need to determine which frequency range combination & figure sizes to set in the created spectogram, before feeding it into the CNN model.
- As birds don't tend to call in the entire frequency range, we could utilise a specie specific tonal peak occurence library in order to determine the cutoff frequencies, since they can contain more specific information about the call freq/time structure, as opposed to a zoomed out one, which will naturally lose some detail.

### <b><span style='color:#FFC300'>1.5</span> | NOTEBOOK WORKFLOW</b>
- We aim to utilise a broadband model, which exhibits minor fluctuation tendencies, to recreate tonal peaks. From this model, we utilise scipy's peak detection module and save the pixel index at which each peak was found. We repeat this process for every short audio recording, saving all the freq/pixel indices at which model peaks were found, thus creating a one dimensional peak map for each species, which tells us at which frequencies tonal peaks tend to occur for a given species.

### <b><span style='color:#FFC300'>1.6</span> | SOME APPLICATIONS FOR BOTH BROADBAND MODEL & TPFI</b>
- <b>For the creation of a detailed spectrum that will capture the frequency/time domain call signature (our focus here)</b>
- For pre-process short audio & soundscape detail investigation, 
- For image augmentation including time & frequency filtering. 

<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#FFC300;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>2 |</b> METHOD
    </p>
</div>

- The STFT based broadband model is quite straightforward, and there probably is nothing unique about it that I'm aware of.
- I just thought I'd share the model I couple with the CNN approach for the analyses of both short audio & soundscape data. 
- I simply will share one of its applications (which is just to count the peak index occurence count) outlined in the introduction. I tend to analyse the whole batch of weakly labeled subset data belonging to one specie.
- Looping over all desired audio files of a subset of interest to us (a particular primary label subset):
> - First, we load an audio recording that we wish to convert into spectograms, to be used as inputs for CNN models. 
> - The audio is then split into segments that will define the spectogram time domain limits. Usually we would start with the entire frequency range [0,12.5kHz] and split the recording into a 5 second chunks, creating a time & frequency domain relation.
> - For reference, we find the maximum dB value in the entire frequency range, <b>this model will define the peaks of the tonal noises and will always be the maximum.</b>
> - The spectogram is then divided into <b>time bins</b>, <b>cfg.model_bins</b> & for each time bin, the maximum value for each frequency is determined.
> - A <b>model for each time bin</b> is created and a simple <b>ensemble of all time segments is constructed</b>; this should always create a model that is lower in dB level than the global peak model mentioned earlier. There are certain cases where this is not the case, usually an indicator that there exists an anomaly in the structure of the curve (as shown in the example below).
> - The <b>peaks of the model</b> are then found using <b>scipy's find_peaks module</b>, stored into a global list & the <b>Counter</b> module counts all list entries.
> - The results are subsequently plotted for each pixel value. The corresponding frequency values can be extracted using the function <b>pxtohz</b>.

<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#FFC300;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>3 |</b> MAIN CLASS
    </p>
</div>

### <b><span style='color:#FFC300'>3.1</span> OVERVIEW</b>
- It is always useful to keep code clean and work with classes as much as possible when working on any project.
- I've created a simplified class, with operations associated with getting data from <b>birdclef-2021</b> data.
- I'll be making continuous references to this class and its instantiation is required to load relevant data.

### <b><span style='color:#FFC300'>3.2</span> | CLASS CONTENTS</b>
- Reading of the training <b>short audio</b> & <b>training soundscape</b> related CSV files is required to get the relevant subset of data used here; both are read straight after instantiation.
- <b>get_short_labels</b> is useful to get all possible <b>primary_labels</b>, which is an important data feature in this competition.

### <b><span style='color:#FFC300'>3.3</span> | CREATING A SUBSET</b>
- I'm quite fond of using the primary label <b>rugdov (Ruddy Ground Dove)</b> for the example, for the earlier outlined reason.
- In this dataset, there are only 66 recordings of this primary specie.

In [3]:
class get_subset:
    """Loads the BirdCLEF-2021 metadata tables and offers simple row filters.

    Instantiation reads ``train_metadata.csv`` and
    ``train_soundscape_labels.csv`` from ``BASE_DIR`` and lists the training
    soundscape directory, so those files and that directory must exist.
    """

    def __init__(self):

        # 1. SHORT TRAINING FILES
        self.__SHORTAUDIO__ = DIR_SPEC_IN
        # main short audio info CSV file
        self.pd_short_audio = pd.read_csv(BASE_DIR+'train_metadata.csv')
        # full path of every recording: <short audio dir><sep><label><sep><file>
        sep = "/" if cfg.local is False else "\\"
        meta = self.pd_short_audio
        meta['path'] = self.__SHORTAUDIO__ + sep + meta['primary_label'] + sep + meta['filename']

        # 2. TRAINING SOUNDSCAPE FILES
        train_dir = BASE_DIR+'.\\train_soundscapes\\'  # path to train soundscape files
        test_dir = BASE_DIR+'.\\test_soundscapes\\'    # path to test soundscape files
        if cfg.local is False:
            train_dir = train_dir.replace('\\','/')
            test_dir = test_dir.replace('\\','/')
        self.__SO_PATH_TR__ = train_dir
        self.__SO_PATH_TE__ = test_dir

        # main soundscape info CSV file (shows interval labels)
        self.pd_scape = pd.read_csv(BASE_DIR+'train_soundscape_labels.csv')

        # sorted list of paths to the training soundscape files
        self.PATH_SCAPE = sorted(self.__SO_PATH_TR__ + fname
                                 for fname in os.listdir(self.__SO_PATH_TR__))

    # GET ALL LABELS AVAILABLE
    def get_short_labels(self):
        """Return the sorted unique ``primary_label`` values."""
        labels = self.pd_short_audio.primary_label.unique()
        labels.sort()
        return labels

    def prim_lookup(self,prim_id):
        """Return one sample row of ``prim_id`` (fixed ``random_state=24``)."""
        meta = self.pd_short_audio
        matches = meta[meta['primary_label'] == prim_id]
        return matches.sample(1,random_state=24)

    # 3. GET VARIOUS SUBSETS OF DATA
    def get_bird_subset(self,name='acafly'):
        """Return a re-indexed copy of the rows whose primary label is ``name``."""
        meta = self.pd_short_audio
        is_bird = meta['primary_label'] == name
        return meta[is_bird].copy().reset_index()

    def get_rating_subset(self,rating=2):
        """Return a re-indexed copy of the rows whose rating equals ``rating``."""
        meta = self.pd_short_audio
        has_rating = meta['rating'] == rating
        return meta[has_rating].copy().reset_index()

    def get_bird_rating(self,name='acafly',rating=4):
        """Return a re-indexed copy of the rows matching both ``name`` and ``rating``."""
        meta = self.pd_short_audio
        is_bird = meta['primary_label'] == name
        has_rating = meta['rating'] == rating
        return meta[is_bird & has_rating].copy().reset_index()

    def primary_to_common(self,primary='cangoo'):
        """Return one randomly sampled row for the primary label ``primary``."""
        meta = self.pd_short_audio
        return meta[meta['primary_label'] == primary].sample(1)

    # 4. FIND DATA VIA XENO IDENTIFIER
    # You might want to quickly find data based on unique Xeno identifier eg. XC544318

    def id_lookup(self,record_id):
        """Display the metadata row(s) of recording ``record_id`` (no extension)."""
        meta = self.pd_short_audio
        display(meta[meta['filename'] == record_id + '.ogg'])

    def id_path(self,record_id):
        """Return the path of recording ``record_id`` (no extension).

        Raises IndexError when no row has that filename.
        """
        meta = self.pd_short_audio
        matches = meta[meta['filename'] == record_id + '.ogg']
        return matches['path'].values[0]

# Load the metadata tables once; `data` is the shared dataset accessor
data = get_subset() # Instantiate Main Dataset Class
# data.get_short_labels() # Show all available classes
subset = data.get_bird_subset('rugdov') # Pick 1 Class (rows of one primary label)
display(subset.head(1)) # preview the first row of the subset
subset_path = subset['path'].tolist() # file paths of the subset as a plain list
print(f'Remaining Subset: {subset.shape}') # (rows, columns) of the subset

Unnamed: 0,index,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url,path
0,46785,rugdov,[],"['female', 'male', 'song']",-3.6668,-45.8465,Columbina talpacoti,Ruddy Ground Dove,GABRIEL LEITE,2011-07-08,XC118662.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,09:30,https://www.xeno-canto.org/118662,./kaggle/input/birdclef-2021/train_short_audio...


Remaining Subset: (66, 16)


<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#FFC300;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>4 |</b> BROADBAND MODEL - UNIVERSAL KRIGING
    </p>
</div>

### <b><span style='color:#FFC300'>4.1</span> | OVERVIEW</b>

- Kriging, popular in geospatial interpolation & optimisation is an ensemble model consisting of <b>Gaussian Process Regression</b> and typically <b>Polynomial Regression</b>, which is quite suitable for the current problem since we want to have a model that doesn't overfit, nor underfit.
- Kriging is a very <b>accurate interpolation approach</b> & has a tendency to overfit (defeating the purpose of making a broadband model) when its hyperparameters are optimised, so we will be using <b>self defined hyperparameters</b> & focusing only on the variation of only one (<b>theta</b>) when needed, whilst keeping the other two fixed.
- If you are interested in other applications of Kriging, I've also used it to estimate temperature in defined regions in another notebook; [Geospatial Data Visualisation](https://www.kaggle.com/shtrausslearning/geospatial-data-visualisation).
- The class is written utilising sklearn's structure for custom estimators, so that it can be integrated with sklearn tooling.

### <b><span style='color:#FFC300'>4.2</span> | USE OF THE MODEL</b>

- In our problem, we will use this model in an attempt to differentiate between <b>broadband</b> & <b>tonal</b> noise sources in audio recordings.
- We will be creating a kriging model <b>for every time bin segment</b> we defined in our 5 second cut segment using the <b>global cfg.model_bins parameter</b>.
- The assumption is that <b>by creating an ensemble of all individual time segments</b>, the <b>model should mainly capture the broadband noise level</b>, variations relative to the peak values is an indicator of how much activity occurs in each timebin segment, we can easily use this to  detect event/non event cases.
- There are quite a lot of things such a model can tell us not covered in the scope of this notebook. I can just name one on the top of my head; for example if capturing the entire frequency and time range, highly similar model and global peaks indicate the presence of constantly occuring noise, which are usually non bird related, such as insect etc and you may want to be aware of it.

### <b><span style='color:#FFC300'>4.3</span> | MODEL THEORY</b>

<p>The Kriging model is built on the assumption that the data <em>Y</em> obey a Gaussian process with an assumed form for the mean function and the covariance between data points: </p>
<p class="formulaDsp">
\[ Y = N(m(\vec{x}), K(\vec{x},\vec{x}))\]
</p>
<p> where \(m(\vec{x})\) is the mean function and \(K(\vec{x},\vec{x})\) represents the covariance between function values. For this work, a regression mean function is assumed. Using this form, the mean function has the following form: </p>
<p class="formulaDsp">
\[ m(x) = h^{T}(\vec{x}) \beta \]
</p>
<p> where \(h^{T}(\vec{x})\) represents a vector containing the regression basis functions evaluated at the points \(\vec{x}\). The regression parameters \(\beta\) are treated as part of the Kriging model and are determined while constructing the model. Using this form of the mean function yields a Universal Kriging model. The case of a (polyorder=0) regression (where the vector \(h(\vec{x})\) reduces to unity) is referred to as <b>Ordinary Kriging</b> and is also covered by this functional form. The assumption of a vague prior on the regression parameters gives the following closed form for the parameters: </p>
<p class="formulaDsp">
\[ \beta=(H K^{-1} H^{T})^{-1} H K^{-1} Y = A^{-1} H K^{-1} Y, \qquad A = H K^{-1} H^{T} \]
</p>
<p> where \(K\) is the covariance matrix between the training data. For a Kriging model, the covariance between function values is assumed to be only a function of the distance between points. The multi-dimension covariance function is constructed using a tensor product of one dimension functions. The multi-dimension covariance is calculated in <b>fit()</b>, which calls the static method <b>covfn</b>. The elements in ths covariance matrix are given as: </p>
<p class="formulaDsp">
\[ K_{i,j} = cov(y_{i},y_{j}) = \sigma^{2} k(\vec{X}_{i},\vec{X}_{j}; \theta) + \sigma^{2}_{n} \delta_{i,j} \]
</p>
<p> The parameters \(\sigma\) and \(\theta\) (and in some cases \(\sigma_{n}\)) are denoted as hyperparameters and are determined maximizing the likelihood equation for the Kriging model. This likelihood gives the probability that a Gaussian process with specified hyperparamters describes the training data <em>X</em> and <em>Y</em>. By picking the hyperparameters that maximize this probability, a Kriging model that best describes the data can be constructed. The hyperparameters are determined based on the <b>likelihood equation</b> for a gaussian process with a vague prior on the regression parameters. This likelihood is computed in function <b>llhobj</b> which uses the Scipy Module, <b>minimize</b>. Using this equation, optimization is used to determine all of the parameters, including the covariance magnitude \(\sigma\) and noise \(\sigma_{n}\). This way of determining hyperparameters should be used when the noise level of the function needs to be fitted. <br/>
 With the regression and covariance parameters determined, the final processed data can be constructed using the inverse of the covariance matrix. To make predictions from the Kriging model, the following vector is required: </p>
<p class="formulaDsp">
\[ V = K^{-1} (Y - H^{T} \beta) \]
</p>
<p> where \( K \) is the covariance matrix, the product \(H^{T} \beta\) represents the mean function evaluated at the training points and \(Y\) represents the function values at the training points. Using this processed data together with the regression and covariance parameters, predictions can be made. </p>

<br>
    
<p> Model predictions throughout the domain are determined by sampling from the conditional distribution \(y_* | \vec{X},Y\) using the covariance between points in the domain where \(\vec{X},Y\) are the input and output training data. The posterior mean predictions for an explicit mean are given by the formula: </p>
<p class="formulaDsp">
\[ y(\vec{x}_{*}) | \vec{X},Y,m(x) = m(\vec{x}_{*}) + k_*^T K^{-1} (Y-m(\vec{X})) \]
</p>
<p> where \(k_{*}^{T}\) represents the covariance between the test point, \(\vec{x}_{*}\), and the training points \(\vec{X}\) (a row vector of length ntot). <br/>
 For a regression mean function, the function predictions take the form of: </p>
<p class="formulaDsp">
\[ y(\vec{x}_{*}) | \vec{X},Y,\beta = h^{T}(\vec{x}_{*}) \beta + k_*^T K^{-1} (Y-H^{T} \beta) \]
</p>
<p> The regression parameters \(\beta\) and the hyperparamters in the covariance function are supplied by <b>fit()</b>. Using only this data, function predictions can be made; however, the construction and inverse of the covariance matrix can make the function predictions expensive. Because this matrix is inverted during the construction of the Kriging model, this work can be re-used for function predictions. Defining the processed data \(V\) as: </p>
<p class="formulaDsp">
\[ V = K^{-1} (Y - H^{T} \beta) \]
</p>
<p> the function predictions are given by: </p>
<p class="formulaDsp">
\[ y(\vec{x}_{*}) | \vec{X},Y,\beta = h^{T}(\vec{x}_{*}) \beta + k_*^T V \]
</p>

In [4]:
from sklearn.base import BaseEstimator,RegressorMixin
from numpy.linalg import cholesky, det, lstsq, inv, pinv
from scipy.optimize import minimize
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
pi = 4.0*np.arctan(1.0)
import warnings
warnings.filterwarnings("ignore")

# Universal Kriging Model (Polynomial Regression + Full Gaussian Process Regression Model)
# Commonly used ensemble approach for geospatial interpolation and optimisation

class Kriging(BaseEstimator,RegressorMixin):
    """Universal Kriging: polynomial regression mean plus a Gaussian-process term.

    Parameters
    ----------
    kernel : {'rbf', 'matern'}
        Covariance function. Stored on the instance, so estimators with
        different kernels no longer overwrite each other.
    theta, sigma : float
        Covariance length scale and magnitude. Overwritten by ``fit`` when
        ``opt`` is truthy.
    sigma_n : float
        Noise level; ``sigma_n**2`` is added to the covariance diagonal.
    opt : bool
        If truthy, ``fit`` searches ``theta`` and ``sigma`` with Nelder-Mead
        on the likelihood objective, starting from ``[1, 1]``.
    polyorder : int
        Degree of the polynomial basis used for the regression mean.
    """

    # Fallback for direct ``Kriging.covfn(...)`` calls that pass no kernel.
    kernel = 'rbf'

    def __init__(self,kernel='rbf',theta=10.0,sigma=10.0,sigma_n=1,opt=True,polyorder=2):
        self.kernel = kernel
        self.theta = theta
        self.sigma = sigma
        self.sigma_n = sigma_n
        self.opt = opt
        self.polyorder = polyorder

    @staticmethod
    def covfn(X0,X1,theta=1.0,sigma=1.0,kernel=None):
        """Covariance matrix between the rows of ``X0`` and the rows of ``X1``.

        ``kernel`` defaults to the class-level ``Kriging.kernel``.
        Raises ValueError for an unknown kernel name; the original printed a
        message and returned None, which only failed later in ``fit``.
        """
        if kernel is None:
            kernel = Kriging.kernel

        # pairwise squared Euclidean distance, shape (len(X0), len(X1))
        r = np.sum(X0**2,1).reshape(-1,1) + np.sum(X1**2,1) - 2 * np.dot(X0,X1.T)

        # Radial Basis Covariance Function
        if kernel == 'rbf':
            return sigma**2 * np.exp(-0.5/theta**2*r)

        # Matern Covariance Class of Functions
        if kernel == 'matern':
            # NOTE(review): ``r`` is a squared distance here - confirm that
            # is intended for these formulas.
            lid = 1  # hard-coded variant switch kept from the original; only 1 is active
            if lid == 1:
                return sigma**2 * np.exp(-r/theta)
            elif lid == 2:
                ratio = r/theta
                v1 = (1.0+np.sqrt(3)*ratio)
                v2 = np.exp(-np.sqrt(3)*ratio)
                return sigma**2*v1*v2
            elif lid == 3:
                ratio = r/theta
                v1 = (1.0+np.sqrt(5)*ratio+(5.0/3.0)*ratio**2)
                v2 = np.exp(-np.sqrt(5)*ratio)
                return sigma**2*v1*v2

        raise ValueError(f'Covariance Function not defined: {kernel!r}')

    def fit(self,X,y):
        """Fit the regression coefficients and the processed data vector ``V``.

        ``X`` must be 2-D (samples x features). ``X`` and ``y`` may be numpy
        arrays or pandas objects, in any combination. Returns ``self``.
        """
        # np.asarray accepts ndarray, DataFrame/Series and mixed pairs alike
        self.X = np.asarray(X)
        self.y = np.asarray(y)
        self.ntot,ndim = self.X.shape

        # Collocation Matrix
        self.poly = PolynomialFeatures(self.polyorder)
        self.H = self.poly.fit_transform(self.X)

        # Optimisation of hyperparameters via the objective function
        def llhobj(X,y,noise):

            # Simplified Variant
            def llh_dir(hypers):
                K = self.covfn(X,X,theta=hypers[0],sigma=hypers[1],kernel=self.kernel) \
                  + noise**2 * np.eye(self.ntot)
                return 0.5 * np.log(det(K)) + \
                    0.5 * y.T.dot(inv(K).dot(y)).ravel()[0] + 0.5 * self.ntot * np.log(2*np.pi)

            # Full Likelihood Equation
            def nll_full(hypers):
                K = self.covfn(X,X,theta=hypers[0],sigma=hypers[1],kernel=self.kernel) \
                  + noise**2 * np.eye(self.ntot)
                L = cholesky(K)
                return np.sum(np.log(np.diagonal(L))) + \
                    0.5 * y.T.dot(lstsq(L.T, lstsq(L,y)[0])[0]) + \
                    0.5 * self.ntot * np.log(2*np.pi)

            return llh_dir # return one of the two, simplified variant doesn't always work well

        # Update hyperparameters based on set objective function
        if self.opt:
            objfn = llhobj(self.X,self.y,self.sigma_n)
            res = minimize(fun=objfn,x0=[1,1],
                           method='Nelder-Mead',tol=1e-6)
            self.theta,self.sigma = res.x

        self.HT = self.H.T
        self.Kmat = self.covfn(self.X,self.X,self.theta,self.sigma,kernel=self.kernel) \
                  + self.sigma_n**2 * np.eye(self.ntot) # Covariance Matrix (Train/Train)
        self.IKmat = pinv(self.Kmat) # Pseudo Matrix Inversion (More Stable)

        self.HK = np.dot(self.HT,self.IKmat) # H^T K^-1 (H holds one row per sample)
        HKH = np.dot(self.HK,self.H)         # H^T K^-1 H
        self.A = inv(HKH)                    # Variance-Covariance Weighted LS Matrix

        self.W = np.dot(self.IKmat,self.y)   # K^-1 y
        Q = np.dot(self.HT,self.W)
        self.beta = np.dot(self.A,Q)         # Regression coefficients
        self.V = self.W - np.dot(self.IKmat,self.H).dot(self.beta) # K^-1 (y - H beta)

        return self  # return class & use w/ predict()

    def predict(self,Xm):
        """Posterior mean prediction at the rows of ``Xm`` (2-D).

        The train/test covariance is stored in ``self.Kstar``. The fitted
        train/train ``self.Kmat`` is left untouched, where the original
        overwrote it.
        """
        self.Xm = np.asarray(Xm)
        self.mtot,ndim = self.Xm.shape

        # reuse the basis fitted in fit() instead of re-fitting it on test data
        self.Hm = self.poly.transform(self.Xm) # Collocation Matrix
        self.Kstar = self.covfn(self.X,self.Xm,self.theta,self.sigma,
                                kernel=self.kernel) # Covariance Matrix (Train/Test)
        yreg = np.dot(self.Hm,self.beta)     # Mean Prediction based on Regression
        ykr = np.dot(self.Kstar.T,self.V)    # posterior mean correction k_*^T V

        return yreg + ykr

<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#FFC300;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>5 |</b> CREATING SPECTOGRAMS
    </p>
</div>

### <b><span style='color:#FFC300'>5.1</span> | OVERVIEW</b>

- <b>get_spectrograms</b> is an extended version of the function posted in another notebook, [[Keras, Inference] BirdCLEF2021 starter](https://www.kaggle.com/shtrausslearning/keras-inference-birdclef2021-starter); the function is used for creating a spectogram for each audio chunk of the split input audio signal.
- I've included several things in the function that can be useful to visualise during general EDA & we will look through one recording that is split into several segments (all contents are for one case only).

> **filepath** : string containg path to audio <br>
> **primary_label** : desired output name <br>
> **output_dir** : directory in which the spectograms are saved <br>
> **save_id** : Save spectogram output or not <br>
> **audio_id** : display segment audio recording <br>
> **select_id** : training / soundscape data is looked at ( soundscape just adds label ) <br>
> **plot_id** : output general plots ( those displayed in the next example ) <br>
> **store_id** : save peak values to the global list ( when actually looping through all audios ) <br>
> **height** : plotly figure height

In [5]:
def get_spectrograms(filepath=None,primary_label=None, output_dir=None,save_id=False,
                     audio_id=False,select_id='train',plot_id=False,store_id=False,height=400):
    """Split one recording into chunks and analyse the mel spectrogram of each chunk.

    For every chunk returned by ``split_signal`` the function
      1. computes a mel spectrogram in dB,
      2. splits it into time bins (``split_dataframe`` with ``cfg.model_bins``),
         fits a ``Kriging`` model to the per-frequency maxima of every bin and
         averages the bin predictions into one ensemble curve,
      3. locates the peaks of that curve with ``scipy.signal.find_peaks``,
      4. optionally plots, stores the peaks and/or saves the spectrogram image.

    Reads ``cfg.sr``, ``cfg.offset``, ``cfg.cutoff``, ``cfg.sl``, ``cfg.fmin``,
    ``cfg.fmax``, ``cfg.sshape``, ``cfg.nfft``, ``cfg.pw`` and ``cfg.model_bins``.

    Parameters
    ----------
    filepath : str
        Path to the audio file.
    primary_label : str
        Name of the sub-directory of ``output_dir`` used when saving images.
    output_dir : str
        Root directory in which spectrogram images are saved.
    save_id : bool
        Save each chunk spectrogram as a greyscale PNG.
    audio_id : bool
        Display an audio player for each chunk.
    select_id : str
        ``'soundscape'`` additionally looks up the annotated birds of each chunk.
    plot_id : bool
        Build and show the plotly figure (spectrogram / time maxima / frequency maxima).
    store_id : bool
        Append the peak locations to the module-level list ``glst_peak_model``
        (which must already exist).
    height : int
        Height of the plotly figure.

    Returns
    -------
    list of str
        Paths of the PNG files written (empty unless ``save_id`` is True).
    """
    # cfg.offset may be None ("start of file"); use 0 so the time bookkeeping works
    start_s = 0 if cfg.offset is None else cfg.offset

    # duration is set from global variable
    sig, _ = librosa.load(filepath, sr=cfg.sr, offset=start_s, duration=cfg.cutoff) # read audio data
    sig_splits = split_signal(sig) # split the signal into parts
    if len(sig_splits) == 0:
        return []  # nothing to analyse (make_subplots would also reject cols=0)

    # the figure is only needed (and only built) when plotting is requested
    fig = make_subplots(rows=3, cols=len(sig_splits)) if plot_id else None

    # soundscape annotations of this recording (loop invariant, so looked up once)
    scape_rows = None
    if select_id == 'soundscape':
        data = get_subset()
        record_id = filepath.split('_')[1].split('/')[1]
        scape_rows = data.pd_scape[data.pd_scape['audio_id']==int(record_id)][['seconds','birds']]

    path_all = []
    # Extract mel spectrograms for each audio chunk; idx doubles as plotly column (1-based)
    for idx, chunk in enumerate(sig_splits, start=1):

        # end time (seconds) of this chunk
        chunk_end = start_s + idx * cfg.sl

        # Play Audio
        if audio_id:
            print(f'Audio ID: {chunk_end}')
            ipd.display(ipd.Audio(data=chunk, rate=cfg.sr))

        # NOTE(review): cfg.hop_len is not passed here, so the image width follows
        # librosa's default hop length rather than cfg.sshape[1] - confirm intent.
        mel = librosa.feature.melspectrogram(y=chunk, sr=cfg.sr,
                                             fmin=cfg.fmin,
                                             fmax=cfg.fmax,
                                             n_mels=cfg.sshape[0],
                                             n_fft=cfg.nfft)
        mel_spec = librosa.power_to_db(mel**cfg.pw, ref=np.min)
        n_freq = mel_spec.shape[0]  # number of mel bands (rows of the spectrogram)

        if plot_id:
            fig.add_trace(go.Heatmap(z=mel_spec,colorscale='viridis',showscale=False),1,idx)

        # soundscape annotation for this chunk
        # NOTE(review): the looked-up value is not consumed anywhere yet.
        if scape_rows is not None:
            birds = scape_rows[scape_rows['seconds'] == chunk_end]['birds'].values
            get_birds_v = birds[0] if len(birds) else None

        if plot_id:
            # maximum dB value at each time frame
            fig.add_trace(go.Scatter(y=mel_spec.max(axis=0),line=dict(color=lst_col[0])),2,idx)
            # maximum dB value at each frequency band
            fig.add_trace(go.Scatter(y=mel_spec.max(axis=1),line=dict(color=lst_col[1]),
                                     name='spect-max'),3,idx)

        ''' TIME BIN MODEL PREDICTION  '''
        # Split the data into time bins, splitting data into bins -> broadband model attempt
        ldf = pd.DataFrame(mel_spec.T)  # rows: time frames, columns: frequency bands
        lst_df = split_dataframe(ldf,chunk_size=cfg.model_bins)
        Xm = np.arange(0,n_freq,1)[:,None]  # prediction locations, same for every bin
        ym = 0; n_bins = 0
        for tdf in lst_df:
            # maximum value in each frequency band of this bin
            # (float64, as describe().loc['max'] used to return)
            maxs = tdf.max().astype(float)
            model = Kriging(opt=False,theta=4)
            # to_numpy() first: multi-dimensional indexing of a pandas Index is unsupported
            model.fit(X=tdf.columns.to_numpy()[:,None],y=maxs)
            ym = ym + model.predict(Xm)
            n_bins += 1

        # ensemble model = mean of the bin models; skipped when there were no bins
        if n_bins:
            y_ens = ym/float(n_bins)
            if plot_id:
                fig.add_trace(go.Scatter(y=y_ens,line=dict(color=lst_col[0]),
                                         name='spect-bin-model'),3,idx)

            ''' Find Peaks in Data '''
            peaks, _ = find_peaks(y_ens, height=0) # find peaks in ensemble model
            if store_id:
                glst_peak_model.extend(peaks)
            if plot_id:
                fig.add_trace(go.Scatter(x=peaks,y=y_ens[peaks],mode='markers',
                                         marker=dict(color='black')),3,idx)

        ''' (SAVE) MELSPECTOGRAM '''
        if save_id:
            # scale to [0,1]; a constant spectrogram stays all-zero instead of dividing by 0
            img = mel_spec - mel_spec.min()
            img_max = img.max()
            if img_max > 0:
                img = img / img_max
            save_dir = os.path.join(output_dir, primary_label)
            os.makedirs(save_dir, exist_ok=True)
            # files are numbered by the count of images saved so far for this recording
            fname = filepath.rsplit(os.sep, 1)[-1].rsplit('.', 1)[0] + '_' + str(len(path_all)) + '.png'
            save_path = os.path.join(save_dir,fname)
            im = Image.fromarray(img * 255.0).convert("L")
            im.save(save_path)

            # add filepath to list
            path_all.append(save_path)

    if plot_id:
        fig.update_layout(margin=dict(l=0, r=0, t=30, b=0),height=height,showlegend=False)
        fig.update_layout(template='plotly_white',font=dict(family='sans-serif',size=14))
        fig.show()

    return path_all # return list of pathways to created spectograms

### <b><span style='color:#FFC300'>5.2</span> | SAMPLE VISUALISATION CASE</b>

- Let's choose one particular audio recording that illustrates some interesting things; <b>subset_path[5]</b>
- We'll limit ourselves to one recording that is longer than 20 seconds, and we'll use <b>cfg.cutoff = 20</b> to limit ourselves to four chunks of spectrogram data.
- This particular one is voted to be a rather clean recording (<b>rating = 4</b>) by [Xeno Canto](https://www.xeno-canto.org/) members.
- For the spectrogram image output, we'll use <b>(96,256) px</b> & a <b>frequency range of (0,12.5) kHz</b>, which is what you might start off with.

### <b><span style='color:#FFC300'>5.3</span> | OUTPUT IMAGES</b>

- We'll output, in order:
  - 5 second chunk mel spectrogram images (the horizontal & vertical axes represent the <b>px equivalent</b> of the <b>time</b> & <b>frequency</b> domains.)
  - <b>Time Domain</b> | Maximum signal chunk dB value at each time segment (<b>vertical (dB)</b> & <b>horizontal (time)</b>)
  - <b>Frequency Domain</b> | Maximum signal chunk dB value at each frequency (<b>vertical (dB)</b> & <b>horizontal (frequency)</b>)
    - <b>dark green</b> : global spectogram maximum value at each frequency 
    - <b>light green</b> : local bin maximum value at each frequency 
    - <b>dots</b> : scipy module evaluated model peak values
  
### <b><span style='color:#FFC300'>5.4</span> | SOME OBSERVATIONS</b>

- We can see some <b>repetitive tonal peaks</b> in the bottom half of the spectrogram. This is likely our <b>primary_label</b>, since most chunks contain similar patterns.
- The more such tonal peaks there are, the more the model moves towards the maximum spectrogram value, as seen from images (1, 2 & 4), which have different numbers of calls.
- <b>Frequency masking</b> is present in some recordings, which have already been tampered with to filter out high frequency noise, such as insect sounds.
- Such constant line sounds are all over the place, not only at constant frequency but at constant times as well; as a result it's useful to study the spectrogram as much as possible.
- The scipy module finds <b>four peaks in total at around 14 px</b>; you can use the function <b>pxtohz</b> to get the frequency equivalent.

### <b><span style='color:#FFC300'>5.5</span> | MODELS OVER FASTER METHODS</b>
- From the results below, you can see that the model generates a much lower number of peaks than the maximum peak value curve (dark green).
- With the faster maximum-value approach you will inevitably start to contribute more peaks to the overall count and potentially have multiple options instead of one (say, if you were using a threshold to obtain the most common bird frequency), which reduces the overall confidence because there are more peaks to choose from.
- There is a slight issue surrounding model hyperparameters: how do you know which ones to choose? I think this can come down to simply understanding how each hyperparameter affects the model, and going off that knowledge. Ultimately there are scenarios in which we will get false results, so coming up with a method that will tune them will definitely be useful!

In [6]:
# Show the metadata row (position 5) of the recording analysed in the next cells
sample_row = subset[5:6]
display(sample_row)

Unnamed: 0,index,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url,path
5,46790,rugdov,[],['song'],-8.9244,-62.0836,Columbina talpacoti,Ruddy Ground Dove,GABRIEL LEITE,2014-01-31,XC167299.ogg,Creative Commons Attribution-NonCommercial-Sha...,4.0,15:00,https://www.xeno-canto.org/167299,./kaggle/input/birdclef-2021/train_short_audio...


In [7]:
# Full-band example: 96 x 256 px spectrogram (height x width)
cfg.sshape = (96, 256)
# spectrogram power, number of seconds read and read starting location
cfg.pw = 1
cfg.cutoff = 20
cfg.offset = 0
# minimum and maximum frequency of the spectrogram
cfg.fmin = 0
cfg.fmax = 12500
out = get_spectrograms(
    filepath=subset_path[5],
    primary_label='temp',
    output_dir=DIR_SPEC_OUT,
    save_id=False,
    audio_id=False,
    select_id='short',
    plot_id=True,
    store_id=False,
    height=500,
)

### <b><span style='color:#FFC300'>5.6</span> | MISSING POTENTIAL BIRD CALLS</b>

- 57,55 px show another peak; perhaps this is the <b>primary label?</b> We clearly have a case of unspecified <b>secondary labels</b>, and the CNN model might start to pay attention to such secondary calls in the recordings.
- Some segments have <b>considerably higher broadband noise levels</b> & less noticeable peaks (img. 3), which from my observations is similar to the soundscape recording environments.
- A realistic scenario would be that the peak would be completely missed in a soundscape environment when using the previously defined frequency range, because the <b>bird may be quite far away</b> from the microphone and the call would be extremely faint on the spectrogram that we are using.
- One option to counter this is to redefine the frequency range and reduce the image size, whilst focusing on one part of the spectrogram, which we know contains the primary label.
- If we had the knowledge that this bird does call in such a small frequency range, we can redefine the window to (0,1) kHz, & we can note that the peak stands out a little more compared to the previous example.

In [8]:
# Narrow-band example: smaller 48 x 128 px spectrogram (height x width)
cfg.sshape = (48, 128)
# spectrogram power, number of seconds read and read starting location
cfg.pw = 1
cfg.cutoff = 20
cfg.offset = 0
# minimum and maximum frequency of the spectrogram
cfg.fmin = 0
cfg.fmax = 1000
out = get_spectrograms(
    filepath=subset_path[5],
    primary_label='temp',
    output_dir=DIR_SPEC_OUT,
    save_id=False,
    audio_id=False,
    select_id='short',
    plot_id=True,
    store_id=False,
    height=500,
)

<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#FFC300;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>6 |</b> FINDING THE MOST COMMON FREQUENCY
    </p>
</div>

- As outlined in the introduction, we are most interested in inspecting all of the data available to us for a particular bird species.
- We'll be investigating the audio recordings of the <b>rugdov (Ruddy Ground Dove)</b>, setting the cutoff to 60 seconds per audio recording; <b>cfg.cutoff = 60</b>
- We have about 60 entries, so let's loop through all audio recordings & store the peak location results in the global list <b>glst_peak_model</b> & then simply use the <b>Counter</b> class to count all the unique entries.

In [9]:
''' Plot Counter Values'''
def pxcounter(Counter):
    """Show a bar chart of peak counts per pixel/frequency value.

    Counter : mapping whose keys are placed on the x axis and whose values give
              both the bar height and the bar colour (viridis scale).
              Note that the parameter name hides the imported ``Counter`` class
              inside this function.
    """
    counts = Counter.values()
    fig = px.bar(x=Counter.keys(), y=counts, color=counts,
                 color_continuous_scale='viridis')
    # single layout update; the 12 pt font is the one that ends up applied
    fig.update_layout(
        margin=dict(l=30, r=30, t=70, b=30),
        height=300,
        coloraxis_showscale=False,
        showlegend=False,
        template='plotly_white',
        title='<b>MODEL PEAK COUNT</b> | FOR DIFFERENT PIXEL/FREQUENCY VALUES',
        font=dict(family='sans-serif', size=12),
    )
    fig.show()

In [10]:
# Spectrogram settings used while scanning all recordings
cfg.sshape = (96, 256)
cfg.pw = 1
cfg.cutoff = 60
cfg.offset = 0
cfg.fmin = 0
cfg.fmax = 12500
cfg.model_bins = 20

# get_spectrograms(store_id=True) appends every detected peak to this global list
glst_peak_model = []
n_recordings = 60
with tqdm_notebook(total=n_recordings) as pbar:
    for recording in range(n_recordings):
        pbar.update(1)
        out = get_spectrograms(
            filepath=subset_path[recording],
            primary_label='temp',
            output_dir=DIR_SPEC_OUT,
            save_id=False,
            audio_id=False,
            select_id='short',
            plot_id=False,
            store_id=True,
            height=450,
        )

  0%|          | 0/60 [00:00<?, ?it/s]

In [11]:
# Tally how often each pixel value was detected as a peak, then plot the tally:
# y axis represents the peak occurrence count, x axis the vertical pixel value
glo_count = Counter()
glo_count.update(glst_peak_model)
pxcounter(glo_count)

<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#FFC300;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>7 |</b> SOME PRACTICAL APPLICATIONS
    </p>
</div>

#### <b>FREQUENCY & TIME MASKING</b>
- There are quite a lot of applications for such a model-based broadband estimation approach; I will not outline them all since they aren't directly related to neural network applications.
- I'll instead focus my attention on audio <b>augmentations</b> which are commonly used in audio CNN applications: <b>frequency</b> & <b>time masking</b>. They can be quite useful for creating more generalised CNN models, and PyTorch is quite popular. An example implementation of PyTorch augmentations can be found in the notebook [Histopathologic Cancer Detection w/ Pytorch](https://www.kaggle.com/shtrausslearning/binary-cancer-image-classification-w-pytorch) in Section <b>4. Transforming the Data</b>.
- By creating a library that indicates at which frequency birds tend to call, you can utilise the functions below & modify them to prevent the augmentation from randomly cutting out the frequency bin which we identified to be the most common for that bird.
- I'm sure you can think of some other applications for frequency & time masking as well; I've added the functions below, which should get you started.

In [12]:
''' FREQUENCY MASKING '''
# transform.Compose([hzmask(max_width=5, 
#                           use_mean=False)])
class hzmask(object):
    """Frequency masking augmentation for a 3-D spectrogram tensor.

    A band of between 1 and ``max_width`` consecutive indices along
    dimension 1 is overwritten, across all of dimensions 0 and 2.
    (Presumably the tensor is laid out as (channel, frequency, time);
    confirm against the caller.)

    Parameters
    ----------
    max_width : int
        Largest band width that can be drawn (inclusive, must be >= 1).
    use_mean : bool
        Fill the band with the tensor mean (True) or with 0 (False).
    """

    def __init__(self, max_width, use_mean=True):
        self.max_width = max_width
        self.use_mean = use_mean

    def __call__(self, tensor):
        """Mask ``tensor`` in place and return the same object."""
        # draw the start along the axis that is actually masked (dim 1)
        start = random.randrange(0, tensor.shape[1])
        # width in [1, max_width]; slicing clips a band that overruns the axis
        end = start + random.randint(1, self.max_width)
        # the mean is evaluated before any element is overwritten
        fill = tensor.mean() if self.use_mean else 0
        tensor[:, start:end, :] = fill
        return tensor

    def __repr__(self):
        return (f"{self.__class__.__name__}"
                f"(max_width={self.max_width}, use_mean={self.use_mean})")
    
''' TIME MASKING '''
# transform.Compose([tmask(max_width=5, 
#                          use_mean=False)])
class tmask(object):
    """Time masking augmentation for a 3-D spectrogram tensor.

    A band of between 0 and ``max_width`` consecutive indices along
    dimension 2 is overwritten, across all of dimensions 0 and 1; a drawn
    width of 0 leaves the tensor untouched.
    (Presumably the tensor is laid out as (channel, frequency, time);
    confirm against the caller.)

    Parameters
    ----------
    max_width : int
        Largest band width that can be drawn (inclusive, must be >= 0).
    use_mean : bool
        Fill the band with the tensor mean (True) or with 0 (False).
    """

    def __init__(self, max_width, use_mean=True):
        self.max_width = max_width
        self.use_mean = use_mean

    def __call__(self, tensor):
        """Mask ``tensor`` in place and return the same object."""
        # draw the start along the axis that is actually masked (dim 2)
        start = random.randrange(0, tensor.shape[2])
        # width in [0, max_width]; slicing clips a band that overruns the axis
        end = start + random.randint(0, self.max_width)
        # the mean is evaluated before any element is overwritten
        fill = tensor.mean() if self.use_mean else 0
        tensor[:, :, start:end] = fill
        return tensor

    def __repr__(self):
        return (f"{self.__class__.__name__}"
                f"(max_width={self.max_width}, use_mean={self.use_mean})")

<div style="color:white;
       display:fill;
       border-radius:5px;
       background-color:#FFC300;
       font-size:220%;
       font-family:Nexa;
       letter-spacing:0.5px">
    <p style="padding: 20px;
          color:white;">
        <b>8 |</b> OTHER ALTERNATIVES
    </p>
</div>

- The use of a model is mainly down to me wanting something that will differentiate <b>broadband</b> & <b>tonal</b> noise sources in the spectrogram & using it for the analysis of spectrogram data.
- It could well be that simply using the maximum value in each bin is more beneficial from the point of view of execution time. The fact that I'm using my own model class also doesn't help since it's not optimised for speed, although for some reason I didn't notice any difference compared with catboost, which is probably down to the fact that I manually select hyperparameters.
- If you have any suggestions for improvements or questions let me know.