<h1 style='background:#A23C4D; border:0; color:white'><center> Import Libraries and Packages</center></h1> 

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import soundfile as sf
import scipy.signal as signal
import matplotlib.pyplot as plt # to support librosa display
import seaborn as sns
import IPython.display as ipd # for playing audio
from collections import Counter
from tqdm import tqdm
import gc
import os
import sys
import shutil
import random

from joblib import Parallel, delayed
from functools import partial

import librosa # audio proccessing
import librosa.display # cool audio visuals
from librosa import feature as lf

import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam

<h1 style='background:#1CC788; border:0; color:black'><center> Importing Data</center></h1> 

In [None]:
os.listdir("/kaggle")

In [None]:
os.listdir("/kaggle/working")

In [None]:
data_tp=pd.read_csv('/kaggle/input/rfcx-species-audio-detection/train_tp.csv')
data_fp=pd.read_csv('/kaggle/input/rfcx-species-audio-detection/train_fp.csv')

<h1 style='background:#3FBBBB; border:0; color:black'><center> Column Description </center></h1> 

- recording_id - unique identifier for recording
- species_id - unique identifier for species
- songtype_id - unique identifier for songtype
- t_min - start second of annotated signal
- f_min - lower frequency of annotated signal
- t_max - end second of annotated signal
- f_max - upper frequency of annotated signal
- is_tp - [tfrecords only] an indicator of whether the label is from the train_tp (1) or train_fp (0) file.

<h1 style='background:#07D700; border:0; color:black'><center> Data Visualization</center></h1> 

In [None]:
data_tp.head(10)

In [None]:
len(data_tp['recording_id'].unique())
#print("Number of rows in train data:'l_train'") # length of unique recording IDs

In [None]:
len(data_tp['recording_id']) # Total number of recording IDs

In [None]:
len(data_tp), len(data_fp)

### 84 recording ids are repeated in this data.

### Viewing the recording ids with the repeated ones dropped (the result is not assigned, so `data_tp` itself is unchanged):

In [None]:
data_tp['recording_id'].drop_duplicates()

In [None]:
data_tp.info()

In [None]:
data_fp.head()

In [None]:
data_fp.info()

In [None]:
data_fp['recording_id'].drop_duplicates()

<h1 style='background:#E3C6AD; border:0; color:black'><center> Distribution of f_min and f_max </center></h1> 

In [None]:

# Overlay the distributions of the lower (f_min) and upper (f_max) annotated
# frequencies from the true-positive table on one high-resolution figure.
plt.figure(figsize=(10, 5), dpi=300)
plt.style.use('ggplot')
for column, colour in (('f_min', 'red'), ('f_max', 'Green')):
    sns.distplot(data_tp[column], color=colour)
plt.title('Min and Max frequencies')
plt.legend(['Min_Frequency', 'Max_Frequency']);

In [None]:
def plot_count(feature, title, df, size=1):
    '''
    Plot the count of each class of a feature and annotate every bar with
    its percentage of all rows in df. Only the 30 most frequent classes
    are drawn, ordered from most to least frequent.
    param: feature - name of the column in df to analyze
    param: title - text appended to "Number and percentage of " in the graph title
    param: df - dataframe from which we plot feature's classes distribution
    param: size - width multiplier (figure is 4*size x 4 inches), default 1.
                  x tick labels are rotated and shrunk when size > 2.
    '''
    f, ax = plt.subplots(1, 1, figsize=(4*size, 4))
    total = float(len(df))
    most_frequent = df[feature].value_counts().index[:30]
    # Pass the column by keyword: newer seaborn releases interpret the first
    # positional argument as `data`, not `x`. Drawing on `ax` explicitly
    # keeps the annotations below on the same axes as the bars.
    g = sns.countplot(x=feature, data=df, order=most_frequent, palette='Set1', ax=ax)
    g.set_title("Number and percentage of {}".format(title))
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Place the percentage label slightly above the centre of each bar.
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{:1.2f}%'.format(100 * height / total),
                ha="center")
    plt.show()

In [None]:
plot_count('species_id', 'TP: Species ID', data_tp, size=4)

In [None]:
plot_count('songtype_id', 'TP: Songtype ID', data_tp, size=2)

In [None]:
plot_count('species_id', 'FP: Species ID', data_fp, size=4)

In [None]:
plot_count('songtype_id', 'FP: Songtype ID', data_fp, size=2)

In [None]:
def plot_feature_distribution(data_df, feature, feature2, title, kde_mode=False, hist_mode=True):
    """
    Overlay the distribution of one column for every distinct value of another.

    param: data_df - dataframe holding both columns
    param: feature - name of the column whose distribution is drawn
    param: feature2 - name of the grouping column; one distribution per unique value
    param: title - title of the figure
    param: kde_mode - draw a kernel density estimate for each group (default False)
    param: hist_mode - draw a histogram for each group (default True)

    A group that cannot be plotted is skipped with a printed message
    instead of aborting the whole figure.
    """
    f, ax = plt.subplots(1, 1, figsize=(12, 6))
    for item in data_df[feature2].unique():
        group_values = data_df.loc[data_df[feature2] == item, feature]
        try:
            sns.distplot(group_values, kde=kde_mode, hist=hist_mode, label=item, ax=ax)
        except Exception as exc:
            # Best effort: keep plotting the other groups, but say what was
            # skipped (printed because warnings are silenced in this notebook).
            print(f"Skipping {feature2}={item}: could not plot {feature} ({exc})")
    # Build the legend from the label attached to each plotted group. Passing
    # a bare list of labels would pair them with the first artists on the
    # axes, which is wrong for histograms and whenever a group was skipped.
    ax.legend(bbox_to_anchor=(1, 1), loc='upper right', ncol=2)
    ax.set_title(title)
    plt.show()

In [None]:
# The frame plotted here is the true-positive table, so the title says TP.
plot_feature_distribution(data_tp, 'f_max', 'species_id', "Maximum frequency distribution, TP data, grouped by species id")

In [None]:
plot_feature_distribution(data_tp, 'f_max', 'songtype_id', "Maximum frequency distribution, TP data, grouped by songtype id")

In [None]:
plot_feature_distribution(data_tp, 't_min', 'species_id', 
                          "Minimum time distribution, TP data, grouped by species id", kde_mode=True, hist_mode=False)

In [None]:
plot_feature_distribution(data_tp, 't_max', 'species_id', 
                          "Max time distribution, TP data, grouped by species id", kde_mode=True, hist_mode=False)

In [None]:
plot_feature_distribution(data_tp, 't_min', 'songtype_id', 
                          "Minimum time distribution, TP data, grouped by songtype id", kde_mode=True, hist_mode=False)

In [None]:
plot_feature_distribution(data_tp, 't_max', 'songtype_id', 
                          "Max time distribution, TP data, grouped by songtype id", kde_mode=True, hist_mode=False)

In [None]:
plot_feature_distribution(data_fp, 't_min', 'species_id', 
                          "Minimum time distribution, FP data, grouped by species id", kde_mode=True, hist_mode=False)

In [None]:
plot_feature_distribution(data_fp, 't_max', 'species_id', 
                          "Max time distribution, FP data, grouped by species id", kde_mode=True, hist_mode=False)

In [None]:
plot_feature_distribution(data_fp, 't_min', 'songtype_id', 
                          "Minimum time distribution, FP data, grouped by songtype id", kde_mode=True, hist_mode=False)

In [None]:
plot_feature_distribution(data_fp, 't_max', 'songtype_id', 
                          "Max time distribution, FP data, grouped by songtype id", kde_mode=True, hist_mode=False)

<h1 style='background:#E3C6AD; border:0; color:black'><center>Sounds of the rainforest</center></h1> 

# Understanding the audio data
- https://towardsdatascience.com/understanding-audio-data-fourier-transform-fft-spectrogram-and-speech-recognition-a4072d228520
- https://medium.com/x8-the-ai-community/audio-classification-using-cnn-coding-example-f9cbd272269e

Here we define functions for display of:
- Waveplots
- Spectrograms  
- Mel spectrograms  
- Chroma feature
- Harmonic and percussive sound wave components

And for playing the sound.

We then show waveplots, spectrograms, Mel spectrograms, chroma features and combined harmonic/percussive graphs for a few of the recordings, from both the TP and FP train sets.

In [None]:
def plot_audio_file(data_df, idx):
    """
    Plot the waveform of one training recording.

    param: data_df - dataframe with a `recording_id` column
    param: idx - index label of the row whose recording is plotted
    """
    recording_id = data_df.recording_id[idx]
    audio_file_path = '/kaggle/input/rfcx-species-audio-detection/train/' + recording_id + '.flac'
    plt.figure(figsize=(12, 6))
    x, sr = librosa.load(audio_file_path)
    # Recent librosa releases replace `waveplot` with `waveshow`; use
    # whichever one the installed version provides.
    draw_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
    draw_wave(x, sr=sr)
    plt.gca().set_title(f"Waveplot - file: {recording_id}")
    plt.show()

In [None]:
def plot_spectrogram(data_df, idx):
    """
    Draw the STFT magnitude spectrogram (in dB) of one training recording,
    with time on the x axis, frequency in Hz on the y axis and a colorbar.

    param: data_df - dataframe with a `recording_id` column
    param: idx - index label of the row whose recording is plotted
    """
    flac_path = '/kaggle/input/rfcx-species-audio-detection/train/' + data_df.recording_id[idx] + '.flac'
    plt.figure(figsize=(12, 6))
    samples, rate = librosa.load(flac_path)
    # Magnitude of the complex STFT, converted to decibels.
    magnitude_db = librosa.amplitude_to_db(abs(librosa.stft(samples)))
    librosa.display.specshow(magnitude_db, sr=rate, x_axis='time', y_axis='hz')
    plt.gca().set_title(f"Spectrogram - file: {data_df.recording_id[idx]}")
    plt.colorbar()

In [None]:
def plot_mel_spectrogram(data_df, idx):
    """
    Draw the Mel spectrogram (in dB) of one training recording with a colorbar.

    param: data_df - dataframe with a `recording_id` column
    param: idx - index label of the row whose recording is plotted
    """
    recording_id = data_df.recording_id[idx]
    audio_file_path = '/kaggle/input/rfcx-species-audio-detection/train/' + recording_id + '.flac'
    plt.figure(figsize=(12, 6))
    x, sr = librosa.load(audio_file_path)
    # Pass the signal and its sample rate by keyword so the mel filter bank
    # is built for the rate the audio was actually loaded at.
    mel_power = librosa.feature.melspectrogram(y=x, sr=sr)
    # melspectrogram returns a power spectrogram, so convert with
    # power_to_db rather than amplitude_to_db.
    mel_db = librosa.power_to_db(mel_power)
    # The rows are mel bands, so label the frequency axis on the mel scale.
    librosa.display.specshow(mel_db, sr=sr, x_axis='time', y_axis='mel')
    plt.gca().set_title(f"Mel spectrogram - file: {recording_id}")
    plt.colorbar()

In [None]:
def plot_harmonics_and_perceptual(data_df, idx):
    """
    Plot the harmonic and percussive components of one training recording.

    The components come from librosa's harmonic-percussive source
    separation (`librosa.effects.hpss`). The function name keeps its
    original spelling so existing calls continue to work.

    param: data_df - dataframe with a `recording_id` column
    param: idx - index label of the row whose recording is plotted
    """
    recording_id = data_df.recording_id[idx]
    audio_file_path = '/kaggle/input/rfcx-species-audio-detection/train/' + recording_id + '.flac'
    plt.figure(figsize=(12, 6))
    x, sr = librosa.load(audio_file_path)
    y_harmonic, y_percussive = librosa.effects.hpss(x)
    # Percussive is drawn first; the legend entries follow plotting order.
    plt.plot(y_percussive, color='#BBAA12')
    plt.plot(y_harmonic, color='#12AABB')
    plt.legend(("Percussive", "Harmonic"))
    plt.title(f"Harmonic and Percussive - file: {recording_id}")

In [None]:
def plot_chroma_feature(data_df, idx):
    """
    Draw the chromagram (STFT-based chroma features) of one training recording.

    param: data_df - dataframe with a `recording_id` column
    param: idx - index label of the row whose recording is plotted
    """
    # One hop length for both computing and displaying the chromagram;
    # if the two differ, the time axis of the plot is scaled incorrectly.
    hop_length = 512
    recording_id = data_df.recording_id[idx]
    audio_file_path = '/kaggle/input/rfcx-species-audio-detection/train/' + recording_id + '.flac'
    plt.figure(figsize=(12, 6))
    x, sr = librosa.load(audio_file_path)
    chromagram = librosa.feature.chroma_stft(y=x, sr=sr, hop_length=hop_length)
    librosa.display.specshow(chromagram, sr=sr, x_axis='time', y_axis='chroma',
                             hop_length=hop_length, cmap='coolwarm')
    plt.title(f"Chroma feature - file: {recording_id}")

In [None]:
def play_sound(data_df, idx):
    """
    Return an IPython audio player for one training recording.

    param: data_df - dataframe with a `recording_id` column
    param: idx - index label of the row whose recording is played
    """
    recording_id = data_df.recording_id[idx]
    return ipd.Audio('/kaggle/input/rfcx-species-audio-detection/train/' + recording_id + '.flac')

<h2 style='background:#B65EA3; border:0; color:black'><center>Sound samples from TP set</center></h2> 

# Displaying a Waveform

In [None]:
plot_audio_file(data_tp, 20)

In [None]:
plot_audio_file(data_tp, 13)

In [None]:
plot_audio_file(data_tp, 1)

In [None]:
plot_audio_file(data_tp, 5)

Remember that this is the magnitude of the frequencies throughout the whole duration of the audio. A more useful graph would show us which frequencies are present along a time axis. One way to implement this is to make a series of these frequency-domain graphs for short durations of the audio, and then combine them to form a time axis. This is what Short-Time Fourier Transforms (STFT) do, and the visuals we can produce with them are known as spectrograms.

# Chroma
A chroma vector (Wikipedia) (FMP, p. 123) is typically a 12-element feature vector indicating how much energy of each pitch class, {C, C#, D, D#, E, ..., B}, is present in the signal.

Chroma energy normalized statistics (CENS) (FMP, p. 375). The main idea of CENS features is that taking statistics over large windows smooths local deviations in tempo, articulation, and musical ornaments such as trills and arpeggiated chords. CENS are best used for tasks such as audio matching and similarity.

In [None]:
plot_chroma_feature(data_tp, 20)

In [None]:
plot_chroma_feature(data_tp, 12)

In [None]:
plot_harmonics_and_perceptual(data_tp, 20)

In [None]:
plot_harmonics_and_perceptual(data_tp, 2)

In [None]:
plot_harmonics_and_perceptual(data_tp, 17)

In [None]:
play_sound(data_tp, 20)

<h2 style='background:#B67B65; border:0; color:black'><center>Sound samples from FP set</center></h2> 

In [None]:
play_sound(data_tp, 12)

In [None]:
plot_audio_file(data_fp, 10)

In [None]:
plot_spectrogram(data_fp, 10)

In [None]:
fig= plot_mel_spectrogram(data_fp, 10)


In [None]:
fig= plot_mel_spectrogram(data_fp, 13)

In [None]:
plot_chroma_feature(data_fp, 10)

In [None]:
play_sound(data_fp, 10)

In [None]:
plot_audio_file(data_fp, 70)

<h1 style='background:#A3C6AD; border:0; color:black'><center>Mel Frequency Cepstral Coefficients (MFCCs)</center></h1> 

- The first step in any automatic speech recognition system is to extract features i.e. identify the components of the audio signal that are good for identifying the linguistic content and discarding all the other stuff which carries information like background noise, emotion etc.

- The feature that is useful to extract is the Mel Frequency Cepstral Coefficients (MFCCs). This can give you information about the timbral/textural aspects of the audio, and approximate how the human auditory system interprets sound. This is especially useful in speech recognition, but could prove very important for this competition as well!
- http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/


<h1 style='background:#B3C6ED; border:0; color:black'><center>MFCC Coefficients</center></h1> 

In [None]:
# parameters for data saving as .npy files

class MFCC:
    """Settings used when computing MFCC features."""
    n_mfcc = 32       # number of MFCC coefficients
    hop_length = 512  # hop length, in samples

class FFT:
    """Empty placeholder; defines no settings."""


class STFT:
    """Empty placeholder; defines no settings."""


class MelSpec:
    """Empty placeholder; defines no settings."""

In [None]:
# Locations of the competition audio and of the generated feature files.
# (`os` is already imported in the imports cell at the top of the notebook.)
train = "../input/rfcx-species-audio-detection/train"
test = "../input/rfcx-species-audio-detection/test"
# os.listdir returns entries in arbitrary order; sort them so that slices
# of these lists select the same files on every run.
train_files = sorted(os.listdir(train))
test_files = sorted(os.listdir(test))
output_dir = "./"

In [None]:
%%time

# Take one randomly chosen true-positive recording as a worked example.
# (`lf` is already imported in the imports cell at the top of the notebook.)
sample_path = os.path.join(train, data_tp.loc[np.random.randint(0, len(data_tp)), "recording_id"] + ".flac")
sample, sr = librosa.load(sample_path)

# target features
# mfcc with librosa's default number of coefficients.
# The signal is passed as y=... because newer librosa releases no longer
# accept it positionally.
mfcc_sample_20 = lf.mfcc(y=sample, sr=sr)
print("shape of the mfcc_sample with 20 Coeff: ", mfcc_sample_20.shape)
plt.figure(figsize=(15, 7))
librosa.display.specshow(mfcc_sample_20, sr=sr, x_axis='time')
plt.show()

# mfcc with the number of coefficients configured on the MFCC class
mfcc_sample = lf.mfcc(y=sample, sr=sr, n_mfcc=MFCC.n_mfcc)
print(f"shape of the mfcc_sample with {MFCC.n_mfcc} Coeff: ", mfcc_sample.shape)
plt.figure(figsize=(15, 7))
librosa.display.specshow(mfcc_sample, sr=sr, x_axis='time')
plt.show()

<h1 style='background:#B3C6ED; border:0; color:black'><center>MFCC Feature Engineering</center></h1> 

### 1.  Separate harmonics and percussives into two waveforms
When listening to our environment, there exists a wide variety of different sounds. However, on a
very coarse level, many sounds can be categorized to belong in either one of two classes: harmonic
or percussive sounds. Harmonic sounds are the ones which we perceive to have a certain pitch
such that we could for example sing along to them. The sound of a violin is a good example of a
harmonic sound. Percussive sounds often stem from two colliding objects like for example the two
shells of castanets. An important characteristic of percussive sounds is that they do not have a
pitch but a very clear localization in time. Many real-world sounds are mixtures of harmonic and
percussive components. For example, a note played on a piano has a percussive onset (resulting
from the hammer hitting the strings) preceding the harmonic tone (resulting from the vibrating
string).

### 2.  Beat track on the percussive signal 
What is Beat tracking?
Audio beat tracking is commonly defined as determining the time instances in an audio recording, where a human listener is likely to tap his/her foot to the music. Audio beat tracking enables the “beat-synchronous” analysis of music.

In [None]:
# source : Librosa documentations
# Set the hop length; at 22050 Hz, 512 samples ~= 23ms
# The same hop length is passed to every frame-based call below so that the
# beat frames, MFCC frames and chroma frames all refer to the same time grid.
hop_length = 512

# Separate harmonics and percussives into two waveforms
y_harmonic, y_percussive = librosa.effects.hpss(sample)

# Beat track on the percussive signal
tempo, beat_frames = librosa.beat.beat_track(y=y_percussive,
                                             sr=sr,
                                             hop_length=hop_length)

# Compute MFCC features from the raw signal
mfcc = librosa.feature.mfcc(y=sample, sr=sr, hop_length=hop_length, n_mfcc=32)

# And the first-order differences (delta features)
mfcc_delta = librosa.feature.delta(mfcc)

# Stack and synchronize between beat events
# This time, we'll use the mean value (default) instead of median
beat_mfcc_delta = librosa.util.sync(np.vstack([mfcc, mfcc_delta]),
                                    beat_frames)

# Compute chroma features from the harmonic signal
chromagram = librosa.feature.chroma_cqt(y=y_harmonic,
                                        sr=sr,
                                        hop_length=hop_length)

# Aggregate chroma features between beat events
# We'll use the median value of each feature between beat frames
beat_chroma = librosa.util.sync(chromagram,
                                beat_frames,
                                aggregate=np.median)

# Finally, stack all beat-synchronous features together
beat_features = np.vstack([beat_chroma, beat_mfcc_delta])

In [None]:
beat_features.shape

In [None]:
def process_mfcc(idx, data, target_dir=None):
    """
    Compute beat-synchronous chroma + MFCC + delta-MFCC features for one
    audio file and save them as a .npy file.

    param: idx - file name of the recording inside `data` (e.g. "<id>.flac")
    param: data - directory that contains the audio file
    param: target_dir - directory to write the .npy file to; when None, the
                        module-level `target_sub_dir` set by save_mfcc is used

    Raises AssertionError if the audio file is missing, if the feature array
    is not 3-dimensional, or if the output file was not written.
    """
    file_path = os.path.join(data, idx)
    assert os.path.exists(file_path), file_path

    out_dir = target_sub_dir if target_dir is None else target_dir

    # load the pcm audio and sr
    data_wav, data_sr = librosa.load(file_path)
    # separate harmonics and percussives of THIS file (not of the global
    # example clip `sample`)
    data_harmonic, data_percussive = librosa.effects.hpss(data_wav)
    # beat track on the percussive signal, on the same frame grid as the MFCCs
    _, beat_frames = librosa.beat.beat_track(y=data_percussive, sr=data_sr,
                                             hop_length=MFCC.hop_length)
    # compute mfcc
    data_mfcc = lf.mfcc(y=data_wav, sr=data_sr, n_mfcc=MFCC.n_mfcc, hop_length=MFCC.hop_length)

    # dynamic mfcc features
    # delta-mfcc -> first order derivative
    mfcc_delta = librosa.feature.delta(data_mfcc)
    # stack and synchronize between beat events (mean aggregation by default)
    beat_mfcc_delta = librosa.util.sync(np.vstack([data_mfcc, mfcc_delta]), beat_frames)
    # chroma features from the harmonic component
    chroma = librosa.feature.chroma_cqt(y=data_harmonic, sr=data_sr,
                                        hop_length=MFCC.hop_length)
    # aggregate chroma features between beat events using the median
    beat_chroma = librosa.util.sync(chroma,
                                    beat_frames,
                                    aggregate=np.median)
    # stack all the features and append a trailing axis of size 1
    beat_features = np.vstack([beat_chroma, beat_mfcc_delta])
    beat_features = np.expand_dims(beat_features, axis=2)
    assert beat_features.ndim == 3, beat_features.shape

    # strip only the final extension so names containing dots stay intact
    fn = os.path.splitext(idx)[0]
    out_path = os.path.join(out_dir, fn + ".npy")
    np.save(out_path, beat_features)
    assert os.path.exists(out_path), out_path




In [None]:
def save_mfcc(folder="MFCC", tag="train", start=0, count=1000, overwrite=True):
    """
    Compute and save features for a window of train or test audio files,
    running process_mfcc on them in parallel.

    param: folder - sub-folder of `output_dir` that receives the features
    param: tag - "train" processes `train_files`; any other value processes
                 `test_files`. Also used as the name of the sub-directory.
    param: start - index of the first file of the window (default 0)
    param: count - number of files in the window (default 1000);
                   None processes every file from `start` onwards
    param: overwrite - when True (default) an existing output directory is
                       deleted first; pass False to add another window of
                       files to features that were saved earlier
    """
    # process_mfcc reads this module-level name to know where to save.
    global target_sub_dir
    target_sub_dir = os.path.join(output_dir, folder, tag)

    if overwrite and os.path.exists(target_sub_dir):
        print("Deleting existing folder...")
        shutil.rmtree(target_sub_dir)
    os.makedirs(target_sub_dir, exist_ok=True)

    if tag == "train":
        source_dir, files, description = train, train_files, "train files"
    else:
        source_dir, files, description = test, test_files, "test audio files"

    stop = None if count is None else start + count

    print("\n", "="*50)
    print(f"Creating MFCC features from {description}")
    worker = partial(process_mfcc, data=source_dir)
    Parallel(n_jobs=-1, verbose=1)(delayed(worker)(name) for name in files[start:stop])

# n_jobs (the first argument of Parallel) sets how many jobs joblib runs in parallel:
#   -1 uses all CPUs, -2 uses all CPUs but one, and 1 disables parallel computing entirely (useful for debugging).
# Note: running in parallel with n_jobs=-1 does not always work on Windows.


In [None]:
%%time 

# Generate the feature files for both splits, then report how many
# files each output directory contains.
print("[INFO]Creating training MFCCs...")
save_mfcc()
print("[INFO]Creating test MFCCs...")
save_mfcc(tag="test")

print("="*50)
for caption, feature_dir in (("Total Train files (TP+FP): ", "./MFCC/train"),
                             ("Total Test files: ", "./MFCC/test")):
    print(caption, len(os.listdir(feature_dir)))

In [None]:
#def save_in_batches(start, end, df = data,segments = 6):
    
   # batch_segments = {}
    
   # for i in range(segments):
      #  batch_segments[i] = []
        
   # for j in tqdm(range(start,end)):
      #  process_mfcc(idx, data)
       # save_mfcc(folder="MFCC", tag="train")
    #return batch_segments

- Total Train files (TP+FP):  4727
- Total Test files:  1992
- CPU times: user 1min 34 sec s, sys: 2.84 s, total: 59.5 s
- Wall time: 5h 54min 35s

<h1 style='background:#E3C6AD; border:0; color:black'><center>Training and Test Data set</center></h1> 

# Define the model layers
As seen above, one audio recording has two kinds of images associated with it:
- Audio signal: amplitude vs. time
- Spectrogram: frequency content vs. time

Logically, both of them can be used to train our CNN. We tried doing that and observed that the pure audio signal yields a lower test accuracy than the spectrograms. Here we will use our beat-frame features, saved as .npy files.

The CNN we use has the following layers:
1. Convolution layer with kernel size : 3x3
2. Convolution layer with kernel size : 3x3
3. Max Pooling layer with pool size : 2x2
4. Dropout layer
5. Flattening layer
6. Two dense (fully connected) layers at the end