In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import soundfile as sf
import librosa
import librosa.display
import IPython.display as display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import IPython.display as ipd
import librosa
import librosa.display
import os
from tqdm import tqdm
import sklearn
import seaborn as sns
import plotly.express as px


import geopandas as gpd
from shapely.geometry import Point, Polygon

from sklearn.model_selection import train_test_split

from keras.utils import Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool1D, BatchNormalization
from keras.optimizers import RMSprop,Adam
from keras.applications import VGG19, VGG16, ResNet50

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Root folder of the competition data. Later cells append relative names to it
# by plain string concatenation, so the trailing slash is required.
path = '/kaggle/input/birdclef-2021/'
# Show the entries of the data folder.
os.listdir(path)

In [None]:
def read_ogg_file(path, file):
    """Read an ogg audio file and return its samples and sample rate.

    path: directory that contains the audio file.
    file: name of the audio file inside `path`.

    Returns the tuple (data, samplerate) produced by `soundfile.read`.
    """
    # os.path.join works whether or not `path` ends with a separator; plain
    # concatenation silently produced a wrong file name without one.
    data, samplerate = sf.read(os.path.join(path, file))
    return data, samplerate


def plot_audio_file(data, samplerate):
    """Plot the audio samples against their sample index.

    data: sequence of audio samples to draw.
    samplerate: kept for interface compatibility; the x axis is the sample
        index, so the value is not used.
    """
    plt.figure(figsize=(8, 4))
    # Draw the signal once and give it a label: the legend call below needs at
    # least one labelled artist, otherwise matplotlib warns and shows nothing.
    plt.plot(range(len(data)), data, color='red', label='audio signal')
    plt.legend(loc='upper center')
    plt.grid()
    
    
def plot_spectrogram(data, samplerate):
    """Plot a mel-scaled spectrogram of the audio, in decibels.

    data: audio samples.
    samplerate: sample rate of `data`, used for the transform and the axes.
    """
    # The audio must be passed as `y=`: recent librosa releases accept it by
    # keyword only, and older releases accept the keyword as well.
    spectrogram = librosa.feature.melspectrogram(y=data, sr=samplerate)
    # Decibels relative to the strongest bin of the spectrogram.
    log_spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    librosa.display.specshow(log_spectrogram, sr=samplerate, x_axis='time', y_axis='mel')

In [None]:
# Load the four competition tables from the data folder in one pass.
train_labels, train_meta, test_data, samp_subm = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'train_soundscape_labels.csv',
        'train_metadata.csv',
        'test.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Row counts of the tables and entry counts of the two audio folders.
dataset_sizes = [
    ('Number train label samples:', len(train_labels)),
    ('Number train meta samples:', len(train_meta)),
    ('Number train short folder:', len(os.listdir(path+'train_short_audio'))),
    ('Number train audios:', len(os.listdir(path+'train_soundscapes'))),
    ('Number test samples:', len(test_data)),
]
for caption, count in dataset_sizes:
    print(caption, count)

In [None]:
# Show two entries of the 'caltow' folder inside train_short_audio.
os.listdir(path + 'train_short_audio/caltow')[:2]

In [None]:
# Preview the first rows of the soundscape label table.
train_labels.head()

In [None]:
# Preview the first rows of the training metadata table.
train_meta.head()

Data Size

In [None]:
# (rows, columns) of the metadata table and of the soundscape label table.
print(f"Training Dataset Shape: {train_meta.shape}")
print(f"Training Dataset Labels Shape: {train_labels.shape}")

Column-wise unique values

In [None]:
# Print, for both tables, how many distinct values every column holds.
for header, frame in (("Data: train", train_meta), ("\nData: train_labels", train_labels)):
    print(header)
    print("-----------")
    for col in frame.columns:
        n_distinct = len(frame[col].unique())
        print(col + ":" + str(n_distinct))

Time of the Recording

In [None]:
# Derive 'year' and 'month' from the first two '-'-separated fields of 'date'.
date_parts = train_meta['date'].str.split("-")
train_meta['year'] = date_parts.str[0]
train_meta['month'] = date_parts.str[1]

plt.figure(figsize=(16, 6))
# Pass the series as `x=`: newer seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
ax = sns.countplot(x=train_meta['year'].sort_values(ascending=False), palette="hls")

plt.title("Audio Files Registration per Year Made", fontsize=16)
plt.xticks(rotation=70, fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("Frequency", fontsize=14)
plt.xlabel("");

In [None]:
plt.figure(figsize=(16, 6))
# Pass the series as `x=`: newer seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
ax = sns.countplot(x=train_meta['month'].sort_values(ascending=False), palette="hls")

plt.title("Audio Files Registration per Month Made", fontsize=16)
plt.xticks(fontsize=13)
plt.yticks(fontsize=13)
plt.ylabel("Frequency", fontsize=14)
plt.xlabel("");

In [None]:
# Position of the metadata row used as the running example in the next cells.
row = 1000
train_meta.iloc[row]

In [None]:
# Species code and recording name of the example row.
label, filename = train_meta.loc[row, ['primary_label', 'filename']]

# Check if the file is in the folder
species_files = os.listdir(path+'train_short_audio/' + label)
filename in species_files

Visualizing the audio

In [None]:
# Build the full path of the example recording from the metadata row and the
# shared `path` root. Reading the bare name from train_meta (instead of reusing
# `filename`) keeps this cell idempotent: re-running it no longer nests the
# directory prefix inside an already-prefixed path.
filename = os.path.join(path, 'train_short_audio', label, train_meta.loc[row, 'filename'])
filename

In [None]:
plt.figure(figsize=(18, 5))

# By default librosa.load resamples to 22050 Hz and mixes the input down to
# mono, hence `data` is always a one-dimensional signal here.
data, sample_rate = librosa.load(filename)
# librosa 0.9 renamed waveplot to waveshow and 0.10 removed waveplot; use
# whichever function the installed version provides.
waveshow = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
waveshow(data, sr=sample_rate)
print("Sample Rate: ", sample_rate)
ipd.Audio(filename)

## Spectrogram
A spectrogram is a visual representation of the spectrum of frequencies of a signal as it varies with time.

In [None]:
plt.figure(figsize=(18, 5))
# Short-time Fourier transform of the example signal.
X = librosa.stft(data)
# Keep only the magnitude and express it in decibels.
magnitude = np.abs(X)
Xdb = librosa.amplitude_to_db(magnitude)
librosa.display.specshow(Xdb, sr=sample_rate, x_axis='time', y_axis='hz')
plt.colorbar();

## Spectral Centroid
The spectral centroid is a measure used in digital signal processing to characterise a spectrum. It indicates where the center of mass of the spectrum is located. 

In [None]:
# Spectral centroid of every analysis frame. The audio is passed as `y=`
# because recent librosa releases accept it by keyword only.
spectral_centroids = librosa.feature.spectral_centroid(y=data, sr=sample_rate)[0]
plt.figure(figsize=(25, 9))
frames = range(len(spectral_centroids))
# Convert frame indices to seconds with the actual sample rate instead of
# relying on the function default.
t = librosa.frames_to_time(frames, sr=sample_rate)


# Normalising the spectral centroid for visualisation
def normalize(x, axis=0):
    """Rescale `x` along `axis` with sklearn's min-max scaling."""
    # Import the submodule explicitly: `import sklearn` alone does not
    # guarantee that `sklearn.preprocessing` has been loaded.
    from sklearn.preprocessing import minmax_scale
    return minmax_scale(x, axis=axis)


# Plotting the spectral centroid along the waveform.
# librosa 0.9 renamed waveplot to waveshow and 0.10 removed waveplot; use
# whichever function the installed version provides.
waveshow = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
waveshow(data, sr=sample_rate, alpha=0.4)
plt.plot(t, normalize(spectral_centroids), color='b');

## Spectral Rolloff
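Spectral rolloff is a measure of the shape of the signal's spectrum. It is the frequency below which a fixed share of the total spectral energy lies (85% by default in librosa), i.e. the point above which the high-frequency content declines towards 0.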

In [None]:
plt.figure(figsize=(25, 9))
# Roll-off frequency of every frame. The audio is passed as `y=` because recent
# librosa releases accept it by keyword only. The 0.01 offset is kept from the
# original analysis.
spectral_rolloff = librosa.feature.spectral_rolloff(y=data+0.01, sr=sample_rate)[0]
# librosa 0.9 renamed waveplot to waveshow and 0.10 removed waveplot; use
# whichever function the installed version provides.
waveshow = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
waveshow(data, sr=sample_rate, alpha=0.4)
# `t` and `normalize` come from the spectral centroid cell.
plt.plot(t, normalize(spectral_rolloff), color='r');

## Spectral bandwidth
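The spectral bandwidth describes how widely the spectrum of a frame is spread around its spectral centroid. librosa computes the order-$p$ bandwidth $\left(\sum_k S(k)\,\lvert f(k) - f_c\rvert^{p}\right)^{1/p}$, where $S(k)$ is the normalised spectral magnitude of frequency bin $k$, $f(k)$ is the frequency of that bin and $f_c$ is the spectral centroid. (In optical spectroscopy the same term denotes the width of a band of light at one-half the peak maximum, i.e. the full width at half maximum [FWHM].)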

In [None]:
# Spectral bandwidth of every frame for three orders p (the first call uses
# the library default). The audio is passed as `y=` because recent librosa
# releases accept it by keyword only.
shifted = data+0.01
spectral_bandwidth_2 = librosa.feature.spectral_bandwidth(y=shifted, sr=sample_rate)[0]
spectral_bandwidth_3 = librosa.feature.spectral_bandwidth(y=shifted, sr=sample_rate, p=3)[0]
spectral_bandwidth_4 = librosa.feature.spectral_bandwidth(y=shifted, sr=sample_rate, p=4)[0]
plt.figure(figsize=(25, 9))
# librosa 0.9 renamed waveplot to waveshow and 0.10 removed waveplot; use
# whichever function the installed version provides.
waveshow = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
waveshow(data, sr=sample_rate, alpha=0.4)
# `t` and `normalize` come from the spectral centroid cell.
plt.plot(t, normalize(spectral_bandwidth_2), color='r')
plt.plot(t, normalize(spectral_bandwidth_3), color='g')
plt.plot(t, normalize(spectral_bandwidth_4), color='y')
plt.legend(('p = 2', 'p = 3', 'p = 4'));  # p: order of spectral bandwidth

## Zero-Crossing Rate
The zero-crossing rate (ZCR) is the rate at which a signal changes from positive to zero to negative or from negative to zero to positive.

In [None]:
# Plot a short window of the waveform so that single zero crossings are visible.
#Plot the signal:
plt.figure(figsize=(25, 9))
# librosa.display.waveplot(data, sr=sample_rate)
# Zooming in
n0 = 9000  # first sample index of the window
n1 = 9100  # end index of the window (exclusive)

plt.plot(data[n0:n1])
plt.grid()

In [None]:
# Boolean mask marking the samples of the zoomed window where the sign changes.
zero_crossings = librosa.zero_crossings(data[n0:n1], pad=False)
# Number of marked samples in the window.
print(zero_crossings.sum())

## Mel-Frequency Cepstral Coefficients (MFCCs)
The Mel frequency cepstral coefficients (MFCCs) of a signal are a small set of features (usually about 10–20) which concisely describe the overall shape of a spectral envelope.

In [None]:
# MFCC matrix of the example signal. The audio is passed as `y=` because recent
# librosa releases accept it by keyword only.
mfccs = librosa.feature.mfcc(y=data, sr=sample_rate)

# Displaying the MFCCs:
plt.figure(figsize=(15, 7))
librosa.display.specshow(mfccs, sr=sample_rate, x_axis='time')
plt.colorbar();

## Chroma features
A chroma feature or vector is typically a 12-element feature vector indicating how much energy of each pitch class, {C, C#, D, D#, E, …, B}, is present in the signal.

In [None]:
# Number of samples between successive analysis frames.
hop_length = 512
# The audio is passed as `y=` because recent librosa releases accept it by
# keyword only.
chromagram = librosa.feature.chroma_stft(y=data, sr=sample_rate, hop_length=hop_length)
plt.figure(figsize=(20, 8))
# Hand the sample rate to specshow as well, so the time axis matches the
# signal instead of the function default.
librosa.display.specshow(chromagram, sr=sample_rate, x_axis='time', y_axis='chroma',
                         hop_length=hop_length, cmap='coolwarm')
plt.colorbar();

In [None]:
# Number of distinct values in the 'primary_label' column of the metadata.
print("There are total {} species".format(train_meta['primary_label'].nunique()))

## Top 100

In [None]:
def plotbar(series, pal):
    """Draw `series` as a bar chart with its index on the x axis.

    series: pandas Series; the index supplies the bar labels, the values the
        bar heights.
    pal: seaborn palette forwarded to `sns.barplot`.
    """
    plt.figure(figsize=(20, 9))
    bar_style = dict(edgecolor=(0,0,0), linewidth=2, palette=(pal))
    axes = sns.barplot(x=series.index, y=series.values, **bar_style)
    # Re-apply the existing tick labels, rotated so that long names stay readable.
    tick_labels = axes.get_xticklabels()
    axes.set_xticklabels(tick_labels, rotation=45)
    
    
# The 100 most frequent primary labels and how often each one occurs.
species = train_meta['primary_label'].value_counts().head(100)
plotbar(species, "Blues_r") # series, palette

In [None]:
# Set the default figure size for the seaborn plots that follow.
sns.set(rc={'figure.figsize':(20,6)})
# Count how many recordings carry each value of the 'rating' column.
sns.countplot(x='rating', data=train_meta, edgecolor=(0,0,0), linewidth=2, palette=('cubehelix'));

There are only a few files with low ratings.

In [None]:
# The ten authors with the most recordings.
authors = train_meta['author'].value_counts().head(10)
plotbar(authors, "YlOrBr_r") # series, palette

Richard E. Webster is the author with the largest number of file entries.

## Top 100 training samples per species

In [None]:
print("Common Name")
# The 100 most frequent common names and how often each one occurs.
common = train_meta['common_name'].value_counts()[:100]
# Plot the series computed above; the cell previously re-plotted `authors`.
plotbar(common, "light:b_r") # series, palette

In [None]:
print("Scientific Name- Top-50")
# The 50 most frequent scientific names and how often each one occurs.
scien = train_meta['scientific_name'].value_counts().head(50)
plotbar(scien, "Greens_r") # series, palette

In [None]:
plt.figure(figsize=(18, 5))

# By default librosa.load resamples to 22050 Hz and mixes the input down to
# mono, hence `sig` is always a one-dimensional signal here.
sig, sample_rate = librosa.load(filename)
# librosa 0.9 renamed waveplot to waveshow and 0.10 removed waveplot; use
# whichever function the installed version provides.
waveshow = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
# Plot the signal that was just loaded (the cell used to plot the older `data`).
waveshow(sig, sr=sample_rate)
print("Sample Rate: ", sample_rate)
ipd.Audio(filename)

In the past, convolutional neural networks (CNNs) have been shown to perform particularly well for sound classification. However, CNNs need 2D inputs. Luckily, we can transform an audio signal into a 2D representation: a so-called spectrogram.

In [None]:
# First, compute the spectrogram using the "short-time Fourier transform" (stft)
spec = librosa.stft(sig)

# Scale the amplitudes according to the decibel scale. The STFT is complex, so
# take the magnitude explicitly instead of letting amplitude_to_db discard the
# phase with a warning.
spec_db = librosa.amplitude_to_db(np.abs(spec), ref=np.max)

# Plot the spectrogram. Use the rate `sig` was actually loaded with, so that
# the time and frequency axes are scaled correctly.
plt.figure(figsize=(15, 5))
librosa.display.specshow(spec_db,
                         sr=sample_rate,
                         x_axis='time',
                         y_axis='hz',
                         cmap=plt.get_cmap('viridis'));

In [None]:
# Shape of the decibel-scaled spectrogram array.
print('SPEC SHAPE:', spec_db.shape)

## EDA

In [None]:
# Distinct values of the 'audio_id' column in the soundscape label table.
train_labels['audio_id'].unique()

In [None]:
# Number of non-missing 'birds' entries per audio_id, first four groups only.
train_labels.groupby('audio_id')['birds'].count().head(4)

In [None]:
# Show one 'birds' entry as stored and after splitting it on spaces.
example_birds = train_labels.loc[458, 'birds']
print('original label:', example_birds)
print('split into list:', example_birds.split(' '))

In [None]:
# Collect every distinct label that occurs in the space-separated 'birds'
# strings. The result is sorted: the iteration order of a set of strings can
# differ between kernel sessions, and the label order defines the order of the
# one-hot columns, which must stay the same for a model saved in one session
# and loaded in another.
labels = sorted({bird for birds in train_labels['birds'] for bird in birds.split(' ')})

print('Number of unique bird labels:', len(labels))

In [None]:
def encode_birds(birds, label_names):
    """One-hot encode space-separated label strings.

    birds: pandas Series of strings such as 'labelA labelB'.
    label_names: labels that become the columns of the result, in this order.

    Returns an integer DataFrame with the index of `birds`, holding 1 where
    the label occurs in the row's string and 0 elsewhere. Labels that are not
    listed in `label_names` are dropped.
    """
    indicators = birds.str.get_dummies(sep=' ')
    # Force the shared column set and order; labels absent from `birds` get 0.
    return indicators.reindex(columns=label_names, fill_value=0)


df_labels_train = encode_birds(train_labels['birds'], labels)

# We set a dummy value for the target label in the test data because we will need for the Data Generator
test_data['birds'] = 'nocall'

df_labels_test = encode_birds(test_data['birds'], labels)

In [None]:
# The ten labels with the largest column sums in the one-hot training frame.
df_labels_train.sum().sort_values(ascending=False)[:10]

In [None]:
# Append the one-hot label columns to the right of each table.
# NOTE: running this cell a second time appends the columns again.
train_parts = [train_labels, df_labels_train]
test_parts = [test_data, df_labels_test]
train_labels = pd.concat(train_parts, axis='columns')
test_data = pd.concat(test_parts, axis='columns')

In [None]:
# Take the first entry that os.listdir returns for the soundscape folder.
# NOTE(review): os.listdir makes no ordering guarantee, so which file this is
# may differ between environments.
file = os.listdir(path + 'train_soundscapes')[0]
file

In [None]:
# Load the selected soundscape; this replaces the `data` of the earlier example.
data, samplerate = read_ogg_file(path + 'train_soundscapes/', file)

In [None]:
# The first two '_'-separated parts of the file name are used as audio id and site.
name_parts = file.split('_')
audio_id, site = name_parts[0], name_parts[1]
print('audio_id:', audio_id, ', site:', site)

In [None]:
# Rows of the selected recording whose 'birds' entry is not 'nocall'.
same_recording = (train_labels['audio_id']==int(audio_id)) & (train_labels['site']==site)
has_call = train_labels['birds']!='nocall'
train_labels[same_recording & has_call]

In [None]:
# Cut the window from second 455 to second 460 out of the soundscape. The
# sample positions are derived from the file's own sample rate rather than
# from a hard-coded 160000 samples per 5 seconds (which assumes 32 kHz).
segment_start_s, segment_end_s = 455, 460
sub_data = data[segment_start_s * samplerate:segment_end_s * samplerate]

In [None]:
plt.figure(figsize=(14, 5))
# librosa 0.9 renamed waveplot to waveshow and 0.10 removed waveplot; use
# whichever function the installed version provides.
waveshow = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
waveshow(sub_data, sr=samplerate)
plt.grid()
plt.show()

In [None]:
# Audio player for the extracted window.
display.Audio(sub_data, rate=samplerate)

In [None]:
# Notebook-level constants read by DataGenerator (the names are referenced
# there, so the spelling is kept as is).
data_lenght = 160000  # samples per labelled segment
audio_lenght = 5  # segment length in seconds
num_labels = len(labels)  # width of the target vector

In [None]:
# Number of samples per batch produced by the data generators.
batch_size = 16

In [None]:
# Hold out 33% of the labelled soundscape rows for validation; the fixed
# random_state makes the split repeatable.
list_IDs_train, list_IDs_val = train_test_split(list(train_labels.index), test_size=0.33, random_state=2021)
# Every row of the sample submission is one test ID.
list_IDs_test = list(samp_subm.index)

In [None]:
class DataGenerator(Sequence):
    """Keras Sequence that yields batches of FFT magnitudes and label vectors.

    Every ID in `list_IDs` is a row label of `data`. For each ID the matching
    soundscape file is looked up in `path`, the segment that ends at the row's
    'seconds' value is cut out, and the standardised magnitude of the first
    half of its FFT becomes the feature vector.

    The class reads the notebook-level constants `data_lenght` (samples per
    segment), `audio_lenght` (seconds per segment) and `num_labels`.

    Parameters
    ----------
    path : str
        Folder that contains the audio files.
    list_IDs : list
        Row labels of `data` to serve.
    data : pandas.DataFrame
        Table with 'audio_id', 'site' and 'seconds' columns; every column
        from position 5 onwards is used as a target column.
    batch_size : int
        Maximum number of samples per batch.
    """

    def __init__(self, path, list_IDs, data, batch_size):
        # Newer Keras versions expect the base class initialiser to run.
        super().__init__()
        self.path = path
        self.list_IDs = list_IDs
        self.data = data
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.list_IDs))
        # Directory listing, filled on first use (see _list_files).
        self._file_names = None

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return -(-len(self.list_IDs) // self.batch_size)

    def __getitem__(self, index):
        """Return batch number `index` as (X, y).

        X has shape (n, 100, 800) and y has shape (n, num_labels), where n is
        the number of IDs in the batch. Only the last batch can be smaller
        than `batch_size`; it is no longer padded with all-zero fake samples.
        """
        indexes = self.indexes[index * self.batch_size: (index+1) * self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)
        X = X.reshape((len(list_IDs_temp), 100, 1600//2))
        return X, y

    def _list_files(self):
        """Return the file names in `path`, listing the folder only once."""
        if self._file_names is None:
            self._file_names = os.listdir(self.path)
        return self._file_names

    def __data_generation(self, list_IDs_temp):
        """Build the feature matrix and the target matrix for the given IDs."""
        n_bins = data_lenght//2
        X = np.zeros((len(list_IDs_temp), n_bins))
        y = np.zeros((len(list_IDs_temp), num_labels))
        for i, ID in enumerate(list_IDs_temp):
            prefix = str(self.data.loc[ID, 'audio_id']) + '_' + self.data.loc[ID, 'site']
            # Match at the start of the name only: a substring test would also
            # accept a file whose audio id merely ends with the wanted digits.
            file_list = [s for s in self._list_files() if s.startswith(prefix)]
            if file_list:
                audio_file, _ = read_ogg_file(self.path, file_list[0])
                end_second = self.data.loc[ID, 'seconds']
                audio_file = audio_file[
                    int((end_second-audio_lenght)/audio_lenght)*data_lenght:
                    int(end_second/audio_lenght)*data_lenght
                ]
                audio_file_fft = np.abs(np.fft.fft(audio_file)[: len(audio_file)//2])
                spread = audio_file_fft.std() if audio_file_fft.size else 0
                if spread > 0:
                    # scale data
                    audio_file_fft = (audio_file_fft-audio_file_fft.mean())/spread
                    # A segment cut short by the end of the file fills only the
                    # leading bins; the remaining bins stay zero.
                    X[i, :len(audio_file_fft)] = audio_file_fft
                # An empty or constant spectrum cannot be standardised (it would
                # produce NaN); its row stays zero.
            # A missing audio file (e.g. a hidden test file) leaves the row at zero.
            # Target columns: presumably the first five columns are metadata and
            # the one-hot label columns follow - verify against the input frame.
            y[i, ] = self.data.loc[ID, self.data.columns[5:]].values
        return X, y

In [None]:
# Training and validation share the labelled soundscapes; the test generator
# reads from the test folder.
train_dir = path+'train_soundscapes/'
test_dir = path+'test_soundscapes/'
train_generator = DataGenerator(train_dir, list_IDs_train, train_labels, batch_size)
val_generator = DataGenerator(train_dir, list_IDs_val, train_labels, batch_size)
test_generator = DataGenerator(test_dir, list_IDs_test, test_data, batch_size)

In [None]:
# Training hyperparameters: number of passes over the data and learning rate.
epochs = 2
lr = 1e-3

In [None]:
# Number of label columns, i.e. the size of the model output.
num_labels

In [None]:
# Look at the first batch only: shape of one feature row and of one target row.
for batch in train_generator:
    features, targets = batch[0], batch[1]
    print(features[0][0].shape)
    print(targets[0].shape)
    break

## Model 1 - 1D CNN Neural Network

In [None]:
model = Sequential()
# One sample is a (100, 800) array, matching the reshape in
# DataGenerator.__getitem__. The original line had unbalanced parentheses and
# did not parse.
model.add(Conv1D(128, input_shape=(100, 1600//2), kernel_size=5, strides=4, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPool1D(pool_size=4))
model.add(Conv1D(64, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(256, activation='relu'))
# One sigmoid unit per label: the labels are predicted independently.
model.add(Dense(num_labels, activation='sigmoid'))

In [None]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras versions reject.
model.compile(optimizer=Adam(learning_rate=lr),
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# history = model.fit_generator(generator=train_generator,
#                               validation_data=val_generator,
#                               epochs=epochs,
#                               workers=4)

In [None]:
from keras.models import load_model

# Replace the freshly built model with the weights stored in the attached dataset.
# model.save('model_1d_conv.h5')
pretrained_path = '../input/bird-model-conv1d/model_1d_conv.h5'
model = load_model(pretrained_path)
model.summary()

In [None]:
# Model.predict accepts a Sequence directly; predict_generator is deprecated
# and has been removed from newer Keras versions.
y_pred = model.predict(test_generator, verbose=1)

In [None]:
# Binarise the predicted probabilities at 0.5.
y_test = (y_pred > 0.5).astype(int)

In [None]:
# Turn every binary prediction row into a space-separated label string;
# rows without any predicted label become 'nocall'.
for row in samp_subm.index:
    predicted = [labels[col] for col, flag in enumerate(y_test[row]) if flag == 1]
    samp_subm.loc[row, 'birds'] = ' '.join(predicted) or 'nocall'

In [None]:
# Write the filled-in submission table to submission.csv without the index column.
output = samp_subm
output.to_csv('submission.csv', index=False)