In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

import librosa
import librosa.display
import soundfile as sf
from sklearn.model_selection import train_test_split
import plotly.express as px
from plotly.offline import iplot
import cufflinks as cf
import tensorflow as tf
import gc
import wave
from scipy.io import wavfile
from IPython.display import Audio
import IPython.display as display
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_palette('Set3')
%matplotlib inline


In [None]:
# Input data files are available in the read-only "../input/" directory
import os
# Root folder of the competition data. The trailing slash matters: every path
# below is built by plain string concatenation (PATH + 'name').
PATH = '/kaggle/input/birdclef-2021/'
# Show what is available directly under the dataset root.
os.listdir(PATH)

In [None]:
# Per-recording metadata table for the training clips
train_data = pd.read_csv(f'{PATH}train_metadata.csv')
print(f'Len train data: {train_data.shape[0]}')
train_data.head(n=3)

In [None]:
# Per-window bird annotations for the training soundscapes
train_labels = pd.read_csv(f'{PATH}train_soundscape_labels.csv')
print(f'Len train labels: {train_labels.shape[0]}')
train_labels.head(n=3)

In [None]:
# Rows that have to be predicted for the submission
test_data = pd.read_csv(f'{PATH}test.csv')
print(f'Len test data: {test_data.shape[0]}')
test_data.head(n=3)

In [None]:
# Peek at a single metadata row and at the overall table sizes
num = 0
label = train_data.loc[num, 'primary_label']
fn = train_data.loc[num, 'filename']

print(f'{train_data.iloc[num]}')
print(f'Train Data shape={train_data.shape}')
print(f'Train Labels shape={train_labels.shape}')
print('----------------------------')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
train_data.info()

In [None]:
# Load sample submission
sample_submission = pd.read_csv(PATH + 'sample_submission.csv')
# The message used to say "Len test data", which mislabeled this table.
print(f'Len sample submission: {len(sample_submission)}')
# Each sub-folder of train_short_audio holds the clips of one species.
print(f"Birds in train_short_audio: {len(os.listdir(PATH + 'train_short_audio/'))}")
print(f"Audio files in train_soundscapes: {len(os.listdir(PATH + 'train_soundscapes/'))}")
sample_submission.head(3)

In [None]:
# Recordings Count by Year
# Split the 'YYYY-MM-DD' strings once and reuse the pieces for all three columns.
date_parts = train_data['date'].str.split('-', expand=True)
train_data['year'] = date_parts[0]
train_data['month'] = date_parts[1]
train_data['day'] = date_parts[2]

# Keep only years that start with 19 or 20; anything else becomes missing ...
plausible_year = train_data['year'].str[:2].isin(['19', '20'])
train_data['year'] = train_data['year'].where(plausible_year)
# ... and is replaced by the most frequent year. The result is assigned back
# instead of using fillna(inplace=True) on the selected column: that chained
# in-place form is deprecated and does not update the frame under pandas
# Copy-on-Write.
most_common_year = train_data['year'].value_counts().index[0]
train_data['year'] = train_data['year'].fillna(most_common_year)

In [None]:
# Number of recordings per (cleaned) year, most frequent year first.
temp = train_data['year'].value_counts()
# Bar chart: one bar per year, bar height = number of recordings.
px.bar(x = temp.index, y = temp.values, title = 'Recordings by Year', 
       labels = {'x': 'Year', 'y': 'Count'})

In [None]:
# Load Audio File
sample_audio = PATH + 'train_short_audio/rucwar/XC133150.ogg'
# No `sr` is passed, so librosa resamples to its default rate; the "Rate"
# printed below is that rate, not necessarily the rate stored in the file.
signal, sr = librosa.load(sample_audio)

print(f'Rate: {sr}')
print(f'Signal: {signal}')
# Fixed the "Lenght" typo in the printed label.
print(f'Length: {len(signal)}')
print(f'Duration signal: {round(len(signal)/sr, 3)}')

In [None]:
# Reload the first 20 s of the clip at a fixed rate. The same rate is passed to
# every plotting call below; without it librosa.display assumes 22050 Hz and
# draws wrong time / frequency axes for this 44100 Hz signal.
sample_rate = 44100
signal, _ = librosa.load(sample_audio, sr = sample_rate, duration = 20)

# Signal
# `waveplot` was replaced by `waveshow` in newer librosa releases; use whichever
# one the installed version provides.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plt.figure(figsize=(10, 6))
plot_waveform(signal, sr = sample_rate)
plt.xlabel('Time')
plt.ylabel('Amplitude')
plt.show()

# Melspectrogram
plt.figure(figsize=(10, 6))
mels = librosa.feature.melspectrogram(y = signal, sr = sample_rate, n_mels = 256, fmax = 8000)
# `fmax` must match the value used to build the mel filter bank, otherwise the
# frequency ticks are drawn for the wrong range.
librosa.display.specshow(librosa.power_to_db(mels, ref = np.max), sr = sample_rate, fmax = 8000,
                         x_axis = 'time', y_axis = 'mel')
plt.title('Melspectrogram')
plt.colorbar()
plt.show()

# Fourier Transform
plt.figure(figsize = (10, 6))
stft = librosa.stft(y = signal)
# amplitude_to_db expects magnitudes; take |STFT| explicitly instead of handing
# it the complex matrix.
stft_db = librosa.amplitude_to_db(np.abs(stft))
librosa.display.specshow(stft_db, sr = sample_rate, x_axis = 'time', y_axis = 'hz')
plt.title('Spectrogram - STFT')
plt.colorbar()
plt.show()

#Log Frequency Axis
plt.figure(figsize = (10, 6))
librosa.display.specshow(stft_db, sr = sample_rate, x_axis = 'time', y_axis = 'log')
plt.colorbar()
plt.title('Log Frequency Axis')
plt.show()

In [None]:
# Inline audio player for the sample clip.
# NOTE(review): IPython documents `rate` for raw sample arrays; when a file path
# is given it is presumably ignored -- confirm before relying on it.
Audio(sample_audio, rate = 44100)

In [None]:
# Count the recordings available for every species folder
audio_path = PATH + 'train_short_audio/'
birds_audio = {
    species: len(os.listdir(audio_path + species))
    for species in os.listdir(audio_path)
}
birds_df = pd.DataFrame(birds_audio.items())
birds_df.columns = ['Birds', 'Num_Audio']
# Largest classes first so the bar chart reads from most to least recorded
birds_df = birds_df.sort_values('Num_Audio', ascending = False)
px.bar(birds_df, x = 'Birds', y = 'Num_Audio')

In [None]:
# Bird Recording Location on World Map
import geopandas as gpd
# Attach a point geometry (x = longitude, y = latitude) to every metadata row.
gdf = gpd.GeoDataFrame(train_data, geometry=gpd.points_from_xy(train_data.longitude,train_data.latitude)) 

# Country outlines used as the background layer.
# NOTE(review): `gpd.datasets` may not exist in newer geopandas releases -- confirm
# against the installed version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
fig,ax = plt.subplots(figsize=(24,12))
# Fill and border are both black, so countries render as solid silhouettes.
world.plot(ax=ax, color='black', edgecolor='black')
# One small red marker per recording, drawn on top of the map.
gdf.plot(ax=ax, color='red', markersize=2)
plt.show()

In [None]:
# Extract all label of the train data
# Every 'birds' cell is a space-separated list of label codes. Collect the
# distinct codes and SORT them: the order of a plain set of strings changes
# between kernel restarts, which would silently change which model output /
# target column belongs to which bird (and break reuse of a saved model).
labels = sorted({
    bird
    for birds in train_labels['birds']
    for bird in birds.split(' ')
})

print(f'Len of unique bird labels: {len(labels)}')

In [None]:
# Encode the labels
def _one_hot_birds(birds_column, classes):
    """Return a 0/1 DataFrame with one column per entry of ``classes``.

    ``birds_column`` is a Series of space-separated label strings; the result
    keeps its index and has integer dtype. A label that is not in ``classes``
    raises ``KeyError`` instead of silently adding an extra column, because the
    data generator selects the target columns by position.
    """
    column_of = {name: j for j, name in enumerate(classes)}
    onehot = np.zeros((len(birds_column), len(classes)), dtype=np.int64)
    for i, birds in enumerate(birds_column):
        for bird in birds.split(' '):
            onehot[i, column_of[bird]] = 1
    return pd.DataFrame(onehot, index=birds_column.index, columns=classes)

df_encode = _one_hot_birds(train_labels['birds'], labels)

#  Set a dummy value for the target label in the test data
test_data['birds'] = 'nocall'
df_labels_test = _one_hot_birds(test_data['birds'], labels)

# Drop label columns left over from a previous run of this cell before
# appending, so re-running it does not duplicate the one-hot columns.
train_labels = pd.concat([train_labels.drop(columns=labels, errors='ignore'), df_encode], axis=1)
test_data = pd.concat([test_data.drop(columns=labels, errors='ignore'), df_labels_test], axis=1)

In [None]:
# Take one soundscape file and recover the ids encoded in its name
# (first two '_'-separated fields: audio id, recording site)
file = os.listdir(PATH+'train_soundscapes')[0]
name_parts = file.split('_')
audio_id = name_parts[0]
site = name_parts[1]
print('audio_id:', audio_id, ', site:', site)
# Annotated windows of that recording that contain at least one bird
same_recording = (train_labels['audio_id']==int(audio_id)) & (train_labels['site']==site)
has_call = train_labels['birds']!='nocall'
train_labels[same_recording & has_call]

In [None]:
from tensorflow.keras.utils import Sequence, to_categorical, plot_model
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool1D, BatchNormalization
from keras.optimizers import RMSprop, Adam
from keras.applications import VGG19, VGG16, ResNet50

In [None]:
# Parameters
data_lenght = 160000  # samples per window; also the stride used to slice the decoded audio
audio_lenght = 5  # window duration in seconds (160000 samples / 5 s implies 32 kHz audio -- TODO confirm)
num_labels = len(labels)  # width of the multi-hot target vector
batch_size = 16  # rows per generator batch

In [None]:
# Split data to train, val, test
# 70 % / 30 % split over the row index of train_labels, with a fixed seed.
train, val = train_test_split(list(train_labels.index), test_size=0.3, random_state=2021)
# NOTE(review): these ids come from sample_submission but are later looked up in
# test_data by the test generator -- confirm both frames share the same index.
test = list(sample_submission.index)

In [None]:
# Read file and return np.array
def get_read_file(PATH, fn):
    """Decode the audio file located at ``PATH + fn`` with soundfile.

    The two arguments are joined by plain concatenation, so ``PATH`` has to
    end with a path separator. Returns the ``(data, samplerate)`` pair exactly
    as produced by ``sf.read``.
    """
    samples, sample_rate = sf.read(PATH + fn)
    return samples, sample_rate

# Built Audio Generator
class AudioDataGen(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields (FFT features, multi-hot labels) batches.

    Each id in ``id_`` is a row label of ``data``. The row's 'audio_id' and
    'site' select a file in ``PATH`` and 'seconds' selects one window of that
    file; columns from position 5 onwards are used as the target vector.

    Relies on the module-level names ``data_lenght``, ``audio_lenght``,
    ``num_labels`` and ``get_read_file``.

    Parameters
    ----------
    PATH : str
        Directory holding the audio files (with trailing separator).
    id_ : sequence
        Row labels of ``data`` served by this generator.
    data : pandas.DataFrame
        Table with 'audio_id', 'site', 'seconds' and the label columns.
    batch_size : int
        Maximum number of rows per batch; the final batch may be smaller.
    """

    def __init__(self, PATH, id_, data, batch_size):
        super().__init__()
        self.data = data
        self.id_ = id_
        self.path = PATH
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.id_))
        # Directory listing, read once on first use instead of once per sample.
        self._file_names = None

    def __len__(self):
        """Number of batches (ceiling of ``len(id_) / batch_size``)."""
        return -(-len(self.id_) // self.batch_size)

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, y)`` with ``X`` shaped (n, 100, features // 100)."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        id_temp = [self.id_[k] for k in indexes]
        X, y = self.__data_generation(id_temp)
        # Use the real batch size: the last batch is no longer padded with
        # all-zero fake samples that would be trained on / predicted.
        X = X.reshape((len(id_temp), 100, -1))
        return X, y

    @staticmethod
    def _spectrum(window, n_samples):
        """Standardised FFT magnitude of ``window`` with ``n_samples // 2`` bins.

        Windows shorter than ``n_samples`` (end of file, or an empty slice) are
        zero-padded so every sample has the same width. A constant spectrum
        yields zeros instead of NaN.
        """
        window = np.asarray(window, dtype=float)
        if window.ndim > 1:
            # soundfile returns (frames, channels) for multi-channel audio; downmix.
            window = window.mean(axis=1)
        if len(window) < n_samples:
            window = np.pad(window, (0, n_samples - len(window)), mode='constant')
        spectrum = np.abs(np.fft.fft(window)[: n_samples // 2])
        spread = spectrum.std()
        if spread == 0:
            return np.zeros_like(spectrum)
        return (spectrum - spectrum.mean()) / spread

    def __data_generation(self, id_temp):
        """Build the feature matrix and target matrix for the given row labels."""
        X = np.zeros((len(id_temp), data_lenght//2))
        y = np.zeros((len(id_temp), num_labels))
        if self._file_names is None:
            self._file_names = os.listdir(self.path)
        for i, ID in enumerate(id_temp):
            prefix = str(self.data.loc[ID, 'audio_id'])+'_'+self.data.loc[ID, 'site']
            # Match on the start of the name: a substring test would also accept
            # a different recording whose id merely ends with these digits.
            file_list = [s for s in self._file_names if s.startswith(prefix)]
            if file_list:
                audio_file, audio_sr = get_read_file(self.path, file_list[0])
                seconds = self.data.loc[ID, 'seconds']
                # 'seconds' marks the END of the window; step back one window length.
                start = int((seconds - audio_lenght) / audio_lenght) * data_lenght
                stop = int(seconds / audio_lenght) * data_lenght
                X[i, ] = self._spectrum(audio_file[start:stop], data_lenght)
            # Rows without a matching file keep their all-zero features.
            y[i, ] = self.data.loc[ID, self.data.columns[5:]].values
        return X,y

In [None]:
# Get data from AudioDataGen
# Train and validation read the same soundscape folder and label table; they
# differ only in which row ids they serve.
train_gen = AudioDataGen(PATH+'train_soundscapes/', train, train_labels, batch_size)
val_gen = AudioDataGen(PATH+'train_soundscapes/', val, train_labels, batch_size)
# The test generator reads test_soundscapes and takes its rows from test_data.
test_gen = AudioDataGen(PATH+'test_soundscapes/', test, test_data, batch_size)

In [None]:
# Create model
def create_model_first(input_shape=(100, 1600//2), n_outputs=None, learning_rate=2e-3):
    """Build and compile the baseline Conv1D multi-label classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample as (steps, channels); the default matches the
        (100, 800) reshape done by the data generator.
    n_outputs : int, optional
        Number of sigmoid outputs. Defaults to ``len(labels)`` (module-level).
    learning_rate : float
        Adam step size.

    Returns
    -------
    A compiled ``Sequential`` model with binary cross-entropy loss and the
    ``binary_accuracy`` metric.
    """
    if n_outputs is None:
        n_outputs = len(labels)
    model = Sequential()
    model.add(Conv1D(64, input_shape=tuple(input_shape), kernel_size=5, strides=4, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPool1D(pool_size=4))
    model.add(Conv1D(64, kernel_size=3, activation='relu'))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    # One independent sigmoid per label: several birds can be present at once.
    model.add(Dense(n_outputs, activation='sigmoid'))
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['binary_accuracy'])
    return model

In [None]:
# Build the network, print its layer table and write a diagram to model.png.
model = create_model_first()
model.summary()
plot_model(model, to_file='model.png')

In [None]:
# Model.fit accepts Sequence generators directly; fit_generator is deprecated
# and absent from newer Keras releases.
# NOTE(review): `workers` is kept for the TF 2.x API this notebook targets --
# confirm it is still accepted by the installed version.
history = model.fit(train_gen, validation_data=val_gen, epochs=5, workers=4)

In [None]:
# Training vs. validation accuracy per epoch
sns.set()  # apply the seaborn theme before drawing so it affects this figure
fig, ax = plt.subplots()
ax.plot(history.history["binary_accuracy"], label='train')
ax.plot(history.history["val_binary_accuracy"], label='validation')
# Without a legend and axis labels the two curves cannot be told apart.
ax.set(title='Binary accuracy per epoch', xlabel='Epoch', ylabel='Binary accuracy')
ax.legend()
plt.show()

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
model.save('./best_model.h5')

In [None]:
# Model.predict accepts Sequence generators directly; predict_generator is
# deprecated and absent from newer Keras releases.
y_pred = model.predict(test_gen, verbose=1)

In [None]:
# Test model
# Turn the sigmoid scores into hard 0/1 decisions at a 0.5 threshold
y_test = np.where(y_pred > 0.5, 1, 0)

# One submission string per row: the predicted label codes separated by
# spaces, or 'nocall' when nothing crossed the threshold
for row in sample_submission.index:
    predicted = [labels[col] for col, flag in enumerate(y_test[row]) if flag == 1]
    string = ' '.join(predicted)
    if string == '':
        string = 'nocall'
    sample_submission.loc[row, 'birds'] = string

In [None]:
# `output` is another name for sample_submission (no copy is made).
output = sample_submission
# Write the predictions without the index column.
output.to_csv('submission.csv', index=False)

In [None]:
output