In [None]:
# Copied and modified from: https://www.kaggle.com/code/duythanhng/birdclef-2022-keras-simple-tutorial
# Followed this article for LSTM setup for audio files: https://towardsdatascience.com/recurrent-neural-nets-for-audio-classification-81cb62327990

import os
import json 
import librosa
import librosa.display
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras import optimizers
from tensorflow.keras.utils import plot_model, Sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Input, Concatenate

In [None]:
# Read the competition metadata and discard every row that has a missing value.
train_meta = pd.read_csv("/kaggle/input/birdclef-2022/train_metadata.csv").dropna()
train_meta

In [None]:
from numpy import array
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Distinct species codes in order of first appearance. The position of a code
# in this list is the class index used by the data generator, the model's
# output layer and the submission cell.
labels = list(train_meta['primary_label'].unique())
label_encoder = LabelEncoder()

# From: https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/#:~:text=A%20one%20hot%20encoding%20is,is%20marked%20with%20a%201.
def encode(labels):
    """One-hot encode a sequence of labels.

    Fits the module-level ``label_encoder`` on ``labels`` (side effect) and
    returns one row per input label; column ``j`` corresponds to
    ``label_encoder.classes_[j]``.

    :param labels: sequence of label values
    :return: float array of shape (len(labels), number of distinct labels)
    """
    values = array(labels)
    integer_encoded = label_encoder.fit_transform(values)
    # Index an identity matrix instead of using OneHotEncoder(sparse=False):
    # the ``sparse`` keyword was renamed to ``sparse_output`` and later removed
    # from scikit-learn, so the NumPy form works on every version and yields
    # the same matrix for 0..k-1 integer codes.
    n_classes = len(label_encoder.classes_)
    return np.eye(n_classes)[integer_encoded]

encoded_labels = encode(labels)
encoded_labels

## Split audio into 5-second chunks

In [None]:
import soundfile as sf
import os

def cutAudio(file_path, is_save):
    """Load one training recording and slice it into consecutive 5-second chunks.

    The recording is read from the competition's ``train_audio`` folder with
    librosa's default loading settings. The last chunk holds whatever samples
    remain and may therefore be shorter than 5 seconds.

    :param file_path: path of the recording relative to the train_audio folder
    :param is_save: when True, each chunk is also written under
        /kaggle/working/each5s and its path is recorded
    :return: (list of chunk arrays, sample rate, list of written file paths);
        the path list stays empty unless ``is_save`` is True
    """
    # Flatten the relative path so it can be used as a single file name.
    flat_name = file_path.replace("/", "_")
    audio, sr = librosa.load("/kaggle/input/birdclef-2022/train_audio/" + file_path)

    # Number of samples in 5 seconds; replace 5 by any number
    chunk_len = 5 * sr

    audio_split = []
    audio_filenames = []
    # Chunk numbering starts at 1; slicing clamps the final chunk to the end of the audio.
    for counter, start in enumerate(range(0, len(audio), chunk_len), start=1):
        block = audio[start : start + chunk_len]
        audio_split.append(block)

        # Write 5 second segment
        if is_save == True:
            out_filename = "/kaggle/working/each5s/split_" + str(counter) + "_" + flat_name
            audio_filenames.append(out_filename)
            sf.write(out_filename, block, sr)
    return audio_split, sr, audio_filenames

In [None]:
def splitTrainAudio(_df):
    """Split every recording listed in ``_df`` into 5-second chunks and index them.

    Each row's recording is split and saved through ``cutAudio``; one index row
    per written chunk is collected and the index is saved to
    /kaggle/working/data_df.csv.

    :param _df: DataFrame with at least ``filename`` and ``primary_label`` columns
    :return: None (the result is the CSV file and the chunk files on disk)
    """
    data = []
    for _, row in _df.iterrows():
        # Split (and save) each recording exactly once. The previous version
        # called cutAudio twice per row, decoding and writing every file twice.
        _, _, filenames = cutAudio(row["filename"], True)
        data.extend(
            [row["primary_label"], row["filename"], chunk_path]
            for chunk_path in filenames
        )

    data_df = pd.DataFrame(data, columns=['primary_label', 'original_filename', 'filename'])
    data_df.to_csv("/kaggle/working/data_df.csv", index=False)

In [None]:
# Sample Data
# Draw one random recording per species, then stack the draws into one frame.
data_frames = [
    train_meta[train_meta["primary_label"] == label]
    .sample(n=1, replace=True)
    .reset_index(drop=True)
    for label in labels
]
sample_df = pd.concat(data_frames).reset_index(drop=True)
sample_df

# Uncomment for debugging.
#sample_df = sample_df[:30]

In [None]:
# Create the output folder for the 5-second chunks (no error if it already exists),
# then split the sampled recordings; this also writes /kaggle/working/data_df.csv.
!mkdir -p "/kaggle/working/each5s"
splitTrainAudio(sample_df)

In [None]:
# Reload the chunk index written by splitTrainAudio: one row per saved chunk with
# its species label, the original recording name and the chunk's file path.
data_df = pd.read_csv("/kaggle/working/data_df.csv")
data_df

In [None]:
#### The full data can be found [here](https://www.kaggle.com/duythanhng/birdclef-2022-audio-per-5-second)

## Training

In [None]:
from os import listdir

# Padding numpy array from: https://stackoverflow.com/questions/59241216/padding-numpy-arrays-to-a-specific-size
def padding(array, xx, yy):
    """
    Centre a 2-D array on an (xx, yy) canvas.

    An axis shorter than its target is zero-padded on both sides (the extra
    element, if any, goes to the bottom/right). An axis longer than its target
    is centre-cropped, so the result always has shape (xx, yy) instead of
    failing inside ``np.pad`` with a negative pad width.

    :param array: 2-D numpy array
    :param xx: desired height
    :param yy: desired width
    :return: array of shape (xx, yy)
    """
    h = array.shape[0]
    w = array.shape[1]

    # Crop oversized axes first so every pad width computed below is >= 0.
    if h > xx:
        top = (h - xx) // 2
        array = array[top:top + xx, :]
        h = xx
    if w > yy:
        left = (w - yy) // 2
        array = array[:, left:left + yy]
        w = yy

    a = (xx - h) // 2
    aa = xx - a - h

    b = (yy - w) // 2
    bb = yy - b - w

    return np.pad(array, pad_width=((a, aa), (b, bb)), mode='constant')

def get_bird_data():
    """Collect chunk file paths and their labels from the global ``data_df``.

    :return: two parallel numpy arrays, (filename values, primary_label values),
        in the order of ``data_df``'s index
    """
    row_ids = data_df.index
    chunk_paths = [data_df['filename'][row_id] for row_id in row_ids]
    chunk_labels = [data_df['primary_label'][row_id] for row_id in row_ids]
    return np.array(chunk_paths), np.array(chunk_labels)

# X holds the chunk file paths, y the matching species codes (parallel arrays).
X, y = get_bird_data()

# Sanity check: number of labelled chunks available.
print(len(y))

In [None]:
from tensorflow.keras.utils import plot_model, Sequence
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import librosa.display
from sklearn.preprocessing import normalize
import warnings
import math

hop_length = 512 # spacing between successive frames, in samples (the default)
n_fft = 255 # FFT window length, in samples, passed to librosa.feature.mfcc
n_mfcc = 255 # requested MFCC count; NOTE(review): gen_audio pads the result to 128 rows, so presumably at most 128 come back - confirm
sr = 28000 # sampling rate (Hz) the generator resamples every chunk to when loading
columns = math.ceil(sr * 5 / hop_length) # width every 5-second feature matrix is padded to

# Adapted from: https://medium.datadriveninvestor.com/keras-training-on-large-datasets-3e9d9dbc09d4

def gen_audio(block, sr):
    """Convert one audio chunk into a fixed-size MFCC input for the network.

    The MFCC matrix is computed with the module-level ``n_fft``, ``hop_length``
    and ``n_mfcc`` settings, NaNs are zeroed, the matrix is centred on a
    (128, columns) canvas by ``padding`` and then passed through sklearn's
    ``normalize`` with its default settings.

    :param block: 1-D array of audio samples
    :param sr: sampling rate of ``block`` in Hz
    :return: array of shape (128, columns, 1)
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # Pass the audio by keyword: recent librosa releases no longer accept
        # it positionally. Also forward ``sr``, which used to be ignored so the
        # features were always computed as if the audio had librosa's default rate.
        mfcc = librosa.feature.mfcc(y=block, sr=sr,
                                    n_fft=n_fft, hop_length=hop_length, n_mfcc=n_mfcc)
    mfcc = np.nan_to_num(mfcc)

    padded_mfcc = padding(mfcc, 128, columns)
    padded_mfcc = normalize(padded_mfcc)

    # Append the single channel axis expected by the Conv2D input layer.
    return padded_mfcc.reshape(128, columns, 1)

class DataGenerator(Sequence):
    """Keras ``Sequence`` that loads audio chunks lazily and yields MFCC batches.

    Each batch is a pair ``(batch_x, batch_y)`` where ``batch_x`` has shape
    (n, 128, columns, 1) and ``batch_y`` holds integer class indices, i.e. the
    position of each label in the module-level ``labels`` list.
    """

    def __init__(self, paths, labels, batch_size) :
        """
        :param paths: array of chunk file paths
        :param labels: array of label values, parallel to ``paths``
        :param batch_size: number of chunks per batch
        """
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.paths = paths
        self.labels = labels
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        # Plain int(): the ``np.int`` alias used before was removed from NumPy.
        return int(np.ceil(len(self.paths) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Load, featurise and return batch number ``idx``."""
        start = idx * self.batch_size
        stop = (idx + 1) * self.batch_size
        batch_paths = self.paths[start:stop]

        batch_x = np.empty((len(batch_paths), 128, columns, 1))
        for i, path in enumerate(batch_paths):
            # Resample every chunk to the module-level rate while loading.
            block, _ = librosa.load(path, sr=sr)
            batch_x[i] = gen_audio(block, sr)

        # Map each label to its index in the module-level ``labels`` list
        # (not ``self.labels``, which holds the per-sample label values).
        batch_labels = [labels.index(name) for name in self.labels[start:stop]]

        return batch_x, np.array(batch_labels)

In [None]:
# TODO: Check shuffle parameter.
# Earlier three-way (train/test/validation) split, kept for reference:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123, shuffle=True)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=123)

#Print the shapes
#X_train.shape, X_test.shape, X_val.shape, len(y_train), len(y_test), len(y_val)

# Hold out 30% of the chunks for validation (shuffled, fixed seed).
# NOTE(review): rows are 5-second chunks, so chunks cut from the same recording
# can end up on both sides of this split - confirm that is acceptable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=123, shuffle=True)
X_train.shape, X_val.shape, len(y_train), len(y_val)

In [None]:
# Fighting overfitting: https://towardsdatascience.com/dont-overfit-how-to-prevent-overfitting-in-your-deep-learning-models-63274e552323

import tensorflow
import tensorflow.keras as keras
from keras.layers import Conv2D, LSTM, Dense, Dropout, Flatten, BatchNormalization, Activation, ConvLSTM2D, Masking, Bidirectional, MaxPooling2D
from keras import backend as K
from keras.regularizers import L2

# TODO: Look into https://stackoverflow.com/questions/48140989/keras-lstm-input-dimension-setting
# Hyper-parameters shared by the network below and by the training cell.
batch_size = 4
droput_rate = 0.7
reg = 0.0001
strides = (2, 2)

model = keras.Sequential()
'''
model.add(ConvLSTM2D(32, kernel_size=(3, 3), 
                     input_shape=(batch_size,128,columns,1),
                     #strides=(3,3),
                     dropout=0.7,
                     kernel_regularizer=L2(0.00001), recurrent_regularizer=L2(0.00001)))
'''
# Three identical stages (strided conv -> max-pool -> dropout) that differ only
# in filter count and kernel size.
conv_stages = [(16, (7, 7)), (32, (5, 5)), (64, (3, 3))]
for stage_idx, (n_filters, kernel_size) in enumerate(conv_stages):
    # Only the very first layer declares the input shape.
    first_layer_kwargs = {'input_shape': (128, columns, 1)} if stage_idx == 0 else {}
    model.add(Conv2D(n_filters, kernel_size,
                     activation='relu', padding='same', strides=strides,
                     kernel_regularizer=L2(reg), activity_regularizer=L2(reg),
                     **first_layer_kwargs))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(droput_rate))

# Classifier head: one softmax unit per species.
model.add(Flatten())
model.add(Dense(len(labels), activation='softmax', kernel_regularizer=L2(reg)))
model.summary()

In [None]:
learning_rate = 0.001

# Cross entropies explained:
# https://stackoverflow.com/questions/49161174/tensorflow-logits-and-labels-must-have-the-same-first-dimension
# https://stackoverflow.com/questions/61742556/valueerror-shapes-none-1-and-none-2-are-incompatible
# The generator yields integer class indices, hence the sparse loss variant.
adam_optimizer = Adam(learning_rate=learning_rate)  # clipnorm=1.0 intentionally left off
model.compile(
    optimizer=adam_optimizer,
    loss='SparseCategoricalCrossentropy',
    metrics=['acc'],
)

In [None]:
from keras.callbacks import EarlyStopping

# Upper bound on training epochs; early stopping may end the run sooner.
epochs = 100

# Watch the validation loss with a patience of 10 epochs.
es_callback = EarlyStopping(monitor='val_loss', patience=10)

# TODO: Remove warning suppression.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    train_batches = DataGenerator(X_train, y_train, batch_size)
    val_batches = DataGenerator(X_val, y_val, batch_size)
    history = model.fit(
        train_batches,
        epochs=epochs,
        validation_data=val_batches,
        callbacks=[es_callback],
        #shuffle=True
    )

# Record number of epochs in case of early stopping.
# https://stackoverflow.com/questions/49852241/return-number-of-epochs-for-earlystopping-callback-in-keras
epochs = len(history.history['loss'])

In [None]:
import matplotlib.pyplot as plt

#Adapted from Deep Learning with Python by Francois Chollet, 2018
# Plot training vs. validation loss (left) and accuracy (right) per epoch.
history_dict = history.history
loss_values = history_dict['loss']
acc_values = history_dict['acc']
val_loss_values = history_dict['val_loss']
val_acc_values = history_dict['val_acc']

# Take the x-axis from the recorded history itself rather than from the global
# ``epochs``: that variable is reassigned by the training cell, and a length
# mismatch would make the plot calls fail.
epochs_range = range(1, len(loss_values) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

ax1.plot(epochs_range, loss_values, 'co', label='Training Loss')
ax1.plot(epochs_range, val_loss_values, 'm', label='Validation Loss')
ax1.set_title('Training and validation loss')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')
ax1.legend()

ax2.plot(epochs_range, acc_values, 'co', label='Training accuracy')
ax2.plot(epochs_range, val_acc_values, 'm', label='Validation accuracy')
ax2.set_title('Training and validation accuracy')
ax2.set_xlabel('Epochs')
ax2.set_ylabel('Accuracy')
ax2.legend()

plt.show()

## Submission

In [None]:
test_path = "/kaggle/input/birdclef-2022/test_soundscapes/"
# Soundscape ids are the .ogg file names without their extension. splitext keeps
# ids that contain dots intact, and non-audio files are skipped.
files = [os.path.splitext(f)[0] for f in sorted(os.listdir(test_path)) if f.endswith('.ogg')]

# Species that have to be reported for every 5-second window.
birds_path = "/kaggle/input/birdclef-2022/scored_birds.json"
with open(birds_path) as bf:
    birds = json.load(bf)

data = []
for f in files:
    file_path = test_path + f + '.ogg'
    # Load at the same sampling rate the training generator uses. The previous
    # call used librosa's default rate, which also overwrote the global ``sr``
    # and fed the model chunks sampled differently from the training data.
    audio, _ = librosa.load(file_path, sr=sr)

    # Get number of samples for 5 seconds; replace 5 by any number
    chunk_len = 5 * sr

    # Windows are numbered from 1; the last one may be shorter than 5 seconds.
    for counter, start in enumerate(range(0, len(audio), chunk_len), start=1):
        block = audio[start : start + chunk_len]
        x = gen_audio(block, sr).reshape(1, 128, columns, 1)

        pred = model.predict(x)
        predicted_label = labels[np.argmax(pred, axis=1)[0]]

        # Row ids carry the window's end time in seconds.
        segment_end = counter * 5
        for b in birds:
            row_id = f + '_' + b + '_' + str(segment_end)
            # True only for the single species the model ranked highest.
            data.append([row_id, predicted_label == b])

submission_df = pd.DataFrame(data, columns=['row_id', 'target'])
submission_df

In [None]:
# Write the predictions to the working directory, without the DataFrame index column.
submission_df.to_csv("/kaggle/working/submission.csv", index=False)