# A brief Introduction

Using Stefan's great introduction to data here -> https://www.kaggle.com/stefankahl/birdclef2021-exploring-the-data
We already have the following details:

1.     Dataset has 397 different bird species
2.     There are fewer than 500 samples per bird species, and some species have fewer than 100, so the dataset is highly imbalanced
3.     Each short audio recording is about X mins long. It can contain multiple bird calls: one of them is prominent, and others can sometimes be heard in the background. The primary bird species is given in primary_label in train_metadata.csv. The secondary labels indicate the other birds heard in the background.
4.     Data about bird calls - Bird species (primary) and secondary ones heard in background and also type of call
5.     Data other than bird species - Date & time when it was spotted, location (given by latitude and longitude)
6.     Data - Rating indicating quality of the calls, Author or the contributor who recorded

# Libraries

In [None]:
# Code adapted from https://www.kaggle.com/drcapa/birdclef-2021-starter

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import soundfile as sf
import librosa
import librosa.display
import IPython.display as display

from sklearn.model_selection import train_test_split

from keras.utils import Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool1D, BatchNormalization
from keras.optimizers import RMSprop,Adam
from keras.applications import VGG19, VGG16, ResNet50

import warnings
warnings.filterwarnings("ignore")

# Path

In [None]:
path = '/kaggle/input/birdclef-2021/'
os.listdir(path)

# Helper Functions



In [None]:
def read_ogg_file(file,path=None):
    """Read an ogg audio file and return its samples and sample rate.

    When `path` is truthy it is prepended to `file` by plain string
    concatenation (no separator is inserted); otherwise `file` is used as is.
    Returns the `(data, samplerate)` pair produced by `sf.read`.
    """
    location = path + file if path else file
    samples, rate = sf.read(location)
    return samples, rate


def plot_audio_file(data, samplerate,label=None):
    """Plot the audio samples against their sample index.

    Args:
        data: Sequence of audio samples to draw.
        samplerate: Sample rate of `data`. It is accepted so the call matches
            `plot_spectrogram`, but the x axis is the sample index, so it is
            not used here.
        label: Optional text used as the figure title and the legend entry.
    """
    plt.figure(figsize=(8, 4))
    positions = range(len(data))
    if label:
        plt.plot(positions, data, color='red', label=label)
        # A legend is only requested when there is a labelled line to show;
        # calling it without one just triggers a matplotlib warning.
        plt.legend(loc='upper center')
        plt.title(label)
    else:
        # The waveform is drawn once; a second identical line underneath
        # the red one would never be visible.
        plt.plot(positions, data, color='red')
    plt.grid()
    plt.show()
    
def plot_spectrogram(data, samplerate,label=None):
    """Plot a mel-scaled spectrogram of the audio in decibels.

    Args:
        data: Audio samples handed to librosa.
        samplerate: Sample rate of `data`.
        label: Optional text used as the figure title.
    """
    # The audio is passed by keyword: recent librosa releases no longer accept
    # it positionally, while older ones accept `y=` as well.
    mel_power = librosa.feature.melspectrogram(y=data, sr=samplerate)
    # Convert power to dB relative to the loudest bin.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    librosa.display.specshow(mel_db, sr=samplerate, x_axis='time', y_axis='mel')
    if label:
        plt.title(label)
    plt.show()

# Load Data

In [None]:
# Read the four competition tables that live directly under `path`.
train_soundscape_labels = pd.read_csv(os.path.join(path, 'train_soundscape_labels.csv'))
train_meta_short_audio = pd.read_csv(os.path.join(path, 'train_metadata.csv'))
test_data = pd.read_csv(os.path.join(path, 'test.csv'))
samp_subm = pd.read_csv(os.path.join(path, 'sample_submission.csv'))

# Overview

An overview of the data loaded from the CSV files and of the audio folders

In [None]:
# Row counts of the tables and entry counts of the two training audio folders.
print(f'Number train label samples: {len(train_soundscape_labels)}')
print(f'Number train meta samples: {len(train_meta_short_audio)}')
print(f"Number train short audio folders: {len(os.listdir(path+'train_short_audio'))}")
print(f"Number train audios: {len(os.listdir(path+'train_soundscapes'))}")
print(f'Number test samples: {len(test_data)}')

There are 397 short audio folders ie. 397 birds. There are 62874 short audio files present inside these folders which correspond to the entries in train_meta file

In [None]:
# Total number of files found anywhere below the short-audio folder.
cpt = sum(len(found) for _root, _dirs, found in os.walk(path+'train_short_audio'))
print(cpt)

In [None]:
os.listdir(path+'train_short_audio/caltow')[:2]

In [None]:
train_soundscape_labels.head()

In [None]:
train_meta_short_audio.head()

In [None]:
# Code adapted from https://www.kaggle.com/shahules/bird-watch-complete-eda-fe

import plotly.graph_objects as go

# Recordings per primary label, most frequent first
species = train_meta_short_audio['primary_label'].value_counts()

# One bar per species, drawn with tight margins
fig = go.Figure(
    data=[go.Bar(x=species.index, y=species.values)],
    layout=go.Layout(margin=go.layout.Margin(l=0, r=0, b=10, t=50)),
)
fig.update_layout(title='Number of traning samples per species')

# Render the chart
fig.show()

# Visualizing Spectrograms of different species
Let's compare different species to see how their spectrograms differ

In [None]:
species = train_meta_short_audio['primary_label'].unique()[:5]
print(species)

In [None]:
# For every selected species take its first recording that has no secondary labels.
clean_recordings = train_meta_short_audio[train_meta_short_audio.secondary_labels=='[]']
species_sample_filenames = [
    clean_recordings.loc[clean_recordings.primary_label==each, 'filename'].iloc[0]
    for each in species
]
print(species_sample_filenames)

Load the data and samplerate and compare spectrograms:

In [None]:
# for label,filename in zip(species,species_sample_filenames):
#     print(label)
#     data, samplerate = sf.read(path+'train_short_audio/'+label+'/'+filename)
#     print(data[:8])
#     print(samplerate)
#     plot_audio_file(data, samplerate,label)
#     plot_spectrogram(data, samplerate,label)

Display the audio of the file:

In [None]:
# display.Audio(path+'train_short_audio/'+label+'/'+filename)

# Visualizing Spectrograms of bird calls from same species
Let's check the variation between bird calls from the same species to see how much their spectrograms differ

In [None]:
species = 'acafly'
# First five recordings of this species that carry no secondary labels.
is_species = train_meta_short_audio.primary_label==species
no_background = train_meta_short_audio.secondary_labels=='[]'
species_sample_filenames = train_meta_short_audio.loc[is_species & no_background, 'filename'].head(5).tolist()
print(species_sample_filenames)

In [None]:
# label = species
# for filename in species_sample_filenames:
#     print(label)
#     data, samplerate = sf.read(path+'train_short_audio/'+label+'/'+filename)
#     print(data[:8])
#     print(samplerate)
#     plot_audio_file(data, samplerate,label)
#     plot_spectrogram(data, samplerate,label)

# Analysis of Long Recordings in Train Soundscapes
Our challenge is to identify which birds are calling in **long** recordings.

There are 20 long audio files in the folder train_soundscapes. And there are also 20 unique audio ids: 

In [None]:
train_soundscape_labels['audio_id'].unique()

Each audio file is labelled as 120 segments, each with a length of 5 seconds.

In [None]:
train_soundscape_labels.groupby(by=['audio_id']).count()['birds'][:4]

So we have to split each long audio file into 120 short segments.

## EDA on Train Soundscapes
The target label `birds` is a space-delimited list of any bird songs present in the 5 second window, so we have to encode the labels. Therefore we look at an example with 3 different birds:

In [None]:
print('original label:', train_soundscape_labels.loc[458, 'birds'])
print('split into list:', train_soundscape_labels.loc[458, 'birds'].split(' '))

We extract all labels from the train data:

In [None]:
# Collect every distinct label that occurs in the space-separated `birds` column.
unique_birds = set()
for birds_in_window in train_soundscape_labels['birds']:
    unique_birds.update(birds_in_window.split(' '))
labels = list(unique_birds)

print('Number of unique bird labels:', len(labels))

We focus on an example. The first audio file is named by

In [None]:
file = os.listdir(path+'train_soundscapes')[0]
file

We load the data and samplerate:

In [None]:
data, samplerate = read_ogg_file(file,path+'train_soundscapes/')
data.shape


The numpy array has a length of 19,200,000, and we know each file has 120 labelled segments, so every segment consists of 160,000 values. These 160,000 values describe 5 seconds of the audio file.

We split the file name into the audio_id and site:

In [None]:
# The file name starts with "<audio_id>_<site>_".
name_parts = file.split('_')
audio_id, site = name_parts[0], name_parts[1]
print('audio_id:', audio_id, ', site:', site)

We focus on the samples whose `birds` label is not `nocall`. There are 4 such samples.

In [None]:
train_soundscape_labels[(train_soundscape_labels['audio_id']==int(audio_id)) & (train_soundscape_labels['site']==site) & (train_soundscape_labels['birds']!='nocall')]

We want to extract the first example with the id 1771. This bird can be heard from 455 seconds to 460 seconds.

In [None]:
# Window indices of the 455 s - 460 s span; every window holds 160000 samples.
first_window, last_window = 455 // 5, 460 // 5
sub_data = data[first_window*160000:last_window*160000]

Plot the audio array:

In [None]:
plt.figure(figsize=(14, 5))
# Newer librosa releases provide `waveshow` in place of `waveplot`;
# use whichever one the installed version offers.
draw_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
draw_wave(sub_data, sr=samplerate)
plt.grid()
plt.show()

Listen to the bird:

In [None]:
display.Audio(sub_data, rate=samplerate)

# Analyzing the short Audio Files

In [None]:
# Pick one species folder and load the first recording os.listdir reports for it.
species = 'caltow'
filename = os.listdir(path+ '/train_short_audio/' + species + '/')[0]
print(filename)
# read_ogg_file concatenates the folder prefix and the file name.
data, samplerate = read_ogg_file(filename,path+'train_short_audio/'+species+'/')
print(data.shape)
# Last expression of the cell: an IPython audio object for the loaded clip.
display.Audio(data, rate=samplerate)

In [None]:
train_meta_short_audio[train_meta_short_audio['filename']==filename]

In [None]:
# Keep only the short recordings without secondary labels and describe each of
# them in the same row layout as the long soundscape labels.
no_secondary = train_meta_short_audio['secondary_labels']=='[]'
primary_labels = train_meta_short_audio.loc[no_secondary, 'primary_label']
filenames = train_meta_short_audio.loc[no_secondary, 'filename']

all_seconds = []
all_file_names = []
all_labels = []
cnt = 0
for cnt, (primary_label, filename) in enumerate(zip(primary_labels, filenames), start=1):
    # NOTE: to keep the notebook fast the audio is not opened here, so only the
    # first 5-second window (seconds == 5) of every recording is listed.
    # Whether that window actually contains a bird call is not checked.
    all_seconds.append(5)
    all_file_names.append('/train_short_audio/' + primary_label + '/' + filename)
    all_labels.append(primary_label)
    if cnt % 10000 == 0:
        print(f"{cnt} done")

# row_id and site are unknown for the short recordings and stay empty.
split_audio_ids = pd.DataFrame({'row_id':[""]*len(all_file_names),'site':[""]*len(all_file_names),'filename':all_file_names,'seconds':all_seconds,'birds':all_labels})
print(len(split_audio_ids))
split_audio_ids.head()



# Data Consolidation across long and short Audios

In [None]:
# Attach the matching soundscape file to every label row.
# The folder is listed once and each "<audio_id>_<site>_" prefix is resolved
# once, instead of listing the folder again for every row.
soundscape_files = os.listdir(path+"/train_soundscapes/")
file_by_prefix = {}
train_sounscapes_filenames = []

for audio_id, site in zip(train_soundscape_labels['audio_id'], train_soundscape_labels['site']):
    prefix = str(audio_id)+'_'+ str(site)+'_'
    if prefix not in file_by_prefix:
        # Match on the start of the name so that e.g. id 12 cannot pick up a
        # file that belongs to id 112.
        matches = [s for s in soundscape_files if s.startswith(prefix)]
        # Rows without an audio file get an empty name so that the filter
        # below can really drop them.
        file_by_prefix[prefix] = ('/train_soundscapes/'+matches[0]) if matches else ""
    train_sounscapes_filenames.append(file_by_prefix[prefix])

# Work on a copy so the original label table is not modified as a side effect.
train_soundscape_cleaned = train_soundscape_labels.copy()
train_soundscape_cleaned['filename']=train_sounscapes_filenames
train_soundscape_cleaned = train_soundscape_cleaned[train_soundscape_cleaned['filename'] != ""]
train_soundscape_cleaned.head()

In [None]:
# Stack the long-soundscape rows and the short-recording rows into one table.
train_soundscapes_all  = pd.concat([train_soundscape_cleaned,split_audio_ids])
# Count the whitespace-separated labels per row and keep single-label rows only.
train_soundscapes_all['birds_split'] = train_soundscapes_all['birds'].str.split().str.len()
train_soundscapes_all = train_soundscapes_all[train_soundscapes_all['birds_split']==1]
# One indicator column per remaining label value.
labels = pd.get_dummies(train_soundscapes_all['birds'])

# Append the indicator columns to the right of the metadata columns.
train_soundscapes_all = pd.concat([train_soundscapes_all,labels],axis=1)
# reset_index() keeps the old index as an extra leading column. DataGenerator
# reads the label values from `columns[8:]`, so the number and order of the
# columns in front of the indicator columns must not change.
train_soundscapes_all = train_soundscapes_all.reset_index()
# Fixed-seed split of the row labels: 67% training, 33% validation.
list_IDs_train, list_IDs_val = train_test_split(list(train_soundscapes_all.index), test_size=0.33, random_state=2021)
# NOTE(review): list_IDs_test is assigned again in the prediction section from
# the segmented test files.
list_IDs_test = list(samp_subm.index)
train_soundscapes_all.head()

# Parameter
Based on the EDA we define some parameters:

In [None]:
# Number of audio values per 5-second window (see the EDA of the long recordings).
data_lenght = 160000
# Length of one window in seconds.
audio_lenght = 5
# Number of model outputs: one per column of the one-hot `labels` frame.
num_labels = len(list(labels))


For the Data Generator we want to define in the next step we need additional parameters:

In [None]:
batch_size = 16
len(train_soundscapes_all)

# Train, Val And Test Data

In [None]:
class DataGenerator(Sequence):
    """Keras `Sequence` that builds batches of FFT-magnitude features on demand.

    Every ID in `list_IDs` is a row label of `data`. For that row the audio
    file named in the `filename` column is read, the window that ends at
    `seconds` is cut out, and the standardised magnitude of the first half of
    its FFT becomes the feature vector.

    The class relies on the module-level names `data_lenght`, `audio_lenght`,
    `num_labels` and `read_ogg_file`, and reads the label values from column
    position 8 onwards of `data`.

    Args:
        path: Prefix handed to `read_ogg_file` together with every file name
            (None when the file names are already complete paths).
        list_IDs: Row labels of `data` to iterate over.
        data: DataFrame with at least the columns `filename` and `seconds`.
        batch_size: Maximum number of samples per batch.
        test: When True no labels are read and `y` stays all zeros.
    """

    def __init__(self, path, list_IDs, data, batch_size,test=False):
        super().__init__()
        self.path = path
        self.list_IDs = list_IDs
        self.data = data
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.list_IDs))
        self.test = test

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        full_batches, leftover = divmod(len(self.list_IDs), self.batch_size)
        return full_batches + (1 if leftover else 0)

    def __getitem__(self, index):
        """Return `(X, y)` for batch `index`; X has shape (n, 100, 800)."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)
        # -1 keeps the true size of the last batch, which may be smaller
        # than batch_size.
        X = X.reshape((-1, 100, 1600//2))
        return X, y

    def __data_generation(self, list_IDs_temp):
        """Build the feature and label arrays for the given row labels."""
        # Size the arrays by the number of IDs so that a short final batch is
        # not filled up with artificial all-zero samples.
        n_samples = len(list_IDs_temp)
        X = np.zeros((n_samples, data_lenght//2))
        y = np.zeros((n_samples, num_labels))
        label_columns = self.data.columns[8:]
        for i, ID in enumerate(list_IDs_temp):
            file = str(self.data.loc[ID, 'filename'])
            if file == "":
                # Dummy for missing test audio files
                audio_file_fft = np.zeros((data_lenght//2))
            else:
                audio_file, _ = read_ogg_file(file,self.path)
                seconds = self.data.loc[ID, 'seconds']
                start = int((seconds-5)/audio_lenght)*data_lenght
                stop = int(seconds/audio_lenght)*data_lenght
                segment = audio_file[start:stop]
                if len(segment) < data_lenght:
                    # Recordings shorter than one window are zero-padded so
                    # that the feature vector always has the expected length.
                    padded = np.zeros(data_lenght)
                    padded[:len(segment)] = segment
                    segment = padded
                audio_file_fft = np.abs(np.fft.fft(segment)[: len(segment)//2])
                # scale data; a constant (e.g. silent) window has zero spread
                # and is only centred, which avoids NaN features
                spread = audio_file_fft.std()
                audio_file_fft = audio_file_fft-audio_file_fft.mean()
                if spread > 0:
                    audio_file_fft = audio_file_fft/spread
            X[i, ] = audio_file_fft
            if not self.test:
                y[i, ] = self.data.loc[ID,label_columns].values
        return X, y

# Audio Data Generator
We use a Data Generator to load the data on demand.

Test the Data Generator

In [None]:
import random
train_sample_size = 100000
val_sample_size = 10000
# Sub-sampling with random.sample(list_IDs_train, train_sample_size) and
# random.sample(list_IDs_val, val_sample_size) is switched off: all IDs are used.
list_IDs_train_sample = list_IDs_train
list_IDs_val_sample = list_IDs_val
train_generator = DataGenerator(
    path=path,
    list_IDs=list_IDs_train_sample,
    data=train_soundscapes_all,
    batch_size=batch_size,
)
val_generator = DataGenerator(
    path=path,
    list_IDs=list_IDs_val_sample,
    data=train_soundscapes_all,
    batch_size=batch_size,
)




# Define Model

In [None]:
epochs = 1
lernrate = 2e-3

In [None]:
# Two Conv1D stages over the (100, 800) reshaped FFT features, followed by a
# dense head with one sigmoid output per label.
model = Sequential([
    Conv1D(64, input_shape=(100, 1600//2,), kernel_size=5, strides=4, activation='relu'),
    BatchNormalization(),
    MaxPool1D(pool_size=(4)),
    Conv1D(64, kernel_size=3, activation='relu'),
    BatchNormalization(),
    Flatten(),
    Dense(256, activation='relu'),
    Dense(num_labels, activation='sigmoid'),
])

In [None]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(optimizer = Adam(learning_rate=lernrate),
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

In [None]:
model.summary()

In [None]:
# `fit` accepts a Sequence directly; `fit_generator` is deprecated.
history = model.fit(train_generator, validation_data=val_generator, epochs = epochs, workers=4)

# Analyse Training

In [None]:
# Plot loss (left) and binary accuracy (right) for training and validation.
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
fig.subplots_adjust(hspace = .2, wspace=.2)
axs = axs.ravel()
loss = history.history['loss']
loss_val = history.history['val_loss']
# The x axis gets its own name: rebinding `epochs` here would replace the
# integer hyper-parameter that the training cell passes to the model.
epoch_axis = range(1, len(loss)+1)
axs[0].plot(epoch_axis, loss, 'bo', label='loss_train')
axs[0].plot(epoch_axis, loss_val, 'ro', label='loss_val')
axs[0].set_title('Value of the loss function')
axs[0].set_xlabel('epochs')
axs[0].set_ylabel('value of the loss function')
axs[0].legend()
axs[0].grid()
acc = history.history['binary_accuracy']
acc_val = history.history['val_binary_accuracy']
axs[1].plot(epoch_axis, acc, 'bo', label='accuracy_train')
axs[1].plot(epoch_axis, acc_val, 'ro', label='accuracy_val')
axs[1].set_title('Accuracy')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Value of accuracy')
axs[1].legend()
axs[1].grid()
plt.show()

# Predict Test Data

In [None]:
# Code adapted from https://www.kaggle.com/stefankahl/birdclef2021-sample-submission

def list_files(path):
    """Return the sorted full paths of the `.ogg` files directly inside `path`.

    Only names that end in `.ogg` are kept, so an entry that is literally
    called `ogg` (no extension) is not mistaken for an audio file. The result
    is sorted because `os.listdir` returns entries in arbitrary order.
    """
    return sorted(os.path.join(path, f) for f in os.listdir(path) if f.endswith('.ogg'))
test_audio = list_files(path + 'test_soundscapes')

# Fall back to the training soundscapes when the test folder holds no audio.
if not test_audio:
    print("Using train files...")
    test_audio = list_files(path + 'train_soundscapes')

print(f'{len(test_audio)} FILES IN TEST SET.')

In [None]:
# Split into 5 second segments

# Values per 5-second window, the same window size the DataGenerator slices by.
segment_samples = 160000

test_seconds = []
test_file_names = []

cnt = 0

for fname in test_audio:
    audio_file, audio_sr = read_ogg_file(fname)
    # Count the complete windows with floor division. Counting range() steps
    # minus one loses the last window whenever the file length is an exact
    # multiple of the window size (e.g. 19,200,000 values -> 119 instead of 120).
    n_segments = len(audio_file) // segment_samples
    # `seconds` is the end time of each window: 5, 10, 15, ...
    test_seconds.extend(range(5, 5 * n_segments + 1, 5))
    test_file_names.extend([fname] * n_segments)
    cnt = cnt + 1
    if cnt % 1000 == 0:
        print(f"{cnt} done")

# Same column layout as the training table; fields unknown for test rows stay empty.
split_test_audio_ids = pd.DataFrame({'row_id':[""]*len(test_file_names),'site':[""]*len(test_file_names),'audio_id':[""]*len(test_file_names),'filename':test_file_names,'seconds':test_seconds,'birds':[""]*len(test_file_names),'birds_split':[1]*len(test_file_names)})
print(len(split_test_audio_ids))

split_test_audio_ids = split_test_audio_ids.reset_index()
split_test_audio_ids.head()

In [None]:
# Same window parameters as for training.
audio_lenght = 5
data_lenght = 160000
num_labels = len(labels.columns)

# One ID per 5-second test segment; file names are complete paths, so no prefix.
list_IDs_test = list(split_test_audio_ids.index)
test_generator = DataGenerator(None, list_IDs_test, split_test_audio_ids, batch_size, test=True)


In [None]:
# `predict` accepts a Sequence directly; `predict_generator` is deprecated.
y_pred = model.predict(test_generator)

Set all values greater than 0.5 to 1 and all other values to 0:

In [None]:
# 1 where the predicted probability exceeds 0.5, otherwise 0.
y_test = (y_pred > 0.5).astype(int)

Generate target label string:

In [None]:
print(y_test[:10])
label_names = list(labels)


In [None]:
# Build one submission row per predicted 5-second segment.
# The bird strings get their own list name so that the one-hot `labels`
# frame used for `num_labels` is not overwritten.
row_ids = []
bird_strings = []
# zip stops at the shorter input, so extra prediction rows are ignored.
for test_file, second, predicted in zip(split_test_audio_ids['filename'], split_test_audio_ids['seconds'], y_test):
    # File names look like "<audio_id>_<site>_...": the row id is
    # "<audio_id>_<site>_<seconds>".
    name_parts = test_file.split("/")[-1].split("_")
    row_ids.append(str(name_parts[0])+"_"+str(name_parts[1])+"_"+str(second))
    predicted_birds = [name for name, flag in zip(label_names, predicted) if flag == 1]
    # Segments without any predicted label are reported as 'nocall'.
    bird_strings.append(' '.join(predicted_birds) if predicted_birds else 'nocall')

sample_submission = pd.DataFrame({'row_id':row_ids,'birds':bird_strings})

# sample_submission['birds'].value_counts()

# Export

In [None]:
output = sample_submission
output.to_csv('submission.csv', index=False)

In [None]:
output[:3]