In [None]:
import pandas as pd
import numpy as np
import librosa
import glob 
from IPython.display import Audio
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from keras.layers import Dense , Dropout ,Input , LSTM
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.optimizers import SGD
from keras.utils import plot_model
from keras.models import Model

In [None]:
# List the contents of the dataset folder to see which files are available.
!ls '../input/xenocanto-avian-vocalizations-canv-usa'

In [None]:
# Point at the dataset folder and read its index CSV into a DataFrame.
root = '../input/xenocanto-avian-vocalizations-canv-usa/'
index_csv = root + 'xeno-canto_ca-nv_index.csv'
meta = pd.read_csv(index_csv)

# Preview the first rows of the metadata.
meta.head()

In [None]:
# Show the (rows, columns) dimensions of the metadata frame.
meta.shape

In [None]:
# Count how many rows there are for each value of the 'species' column.
meta['species'].value_counts()

In [None]:
# Keep only the rows whose species is one of the three classes to classify.
# .copy() makes the result an independent frame, so later column assignments
# on `meta` do not write into a slice of the original frame (which would
# raise pandas' SettingWithCopyWarning).
target_species = ['californica', 'nuttallii', 'occidentalis']
meta = meta[meta['species'].isin(target_species)].copy()
meta.head()

In [None]:
# Encode the species names as integer class ids.
# `assign` returns a new frame instead of setting a column on a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
le = LabelEncoder()
meta = meta.assign(species=le.fit_transform(meta['species']))

In [None]:
# Pull the audio file names and the encoded species ids out of the frame
# as two parallel lists (same row order).
sound_id = [*meta['file_name'].values]
labels = [*meta['species'].values]

# Report how many entries each list holds.
print(f"shape sound id : {len(sound_id)}")
print(f"shape labels : {len(labels)}")

In [None]:
# One-hot encode the integer class ids.
# The ids are read from meta['species'] (same values and row order as the
# `labels` list built from it) rather than from `labels` itself, so this
# cell is idempotent: re-running it does not one-hot encode an already
# one-hot matrix.
labels = to_categorical(meta['species'].values)
print(f"shape labels : {labels.shape}")

In [None]:
# Rebind `root` to the sub-folder the audio files are loaded from, then embed
# an audio player for one sample recording (index 50) as a sanity check.
# NOTE(review): `Audio` is already imported from IPython.display in the first
# cell, so this mid-notebook import is redundant.
import IPython
root = '../input/xenocanto-avian-vocalizations-canv-usa/xeno-canto-ca-nv/'
IPython.display.Audio(root + sound_id[50])

In [None]:
# More feature extractors are listed at https://librosa.org/doc/main/feature.html

def extract_mfcc(file_name, n_mfcc=36,
                 root='../input/xenocanto-avian-vocalizations-canv-usa/xeno-canto-ca-nv/'):
    """Return the time-averaged MFCC vector of one recording.

    Parameters
    ----------
    file_name : str
        Audio file name, relative to ``root``.
    n_mfcc : int, default 36
        Number of MFCC coefficients to compute.
    root : str
        Folder that contains the audio files.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfcc`` holding the mean of each coefficient
        over all frames of the recording.
    """
    import os

    # os.path.join works whether or not `root` ends with a path separator.
    path = os.path.join(root, file_name)
    y, sr = librosa.load(path)
    # librosa returns (n_mfcc, n_frames); average over the frame axis.
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc), axis=1)

    return mfccs

In [None]:
def extract_CENS(file_name, n_chroma=36,
                 root='../input/xenocanto-avian-vocalizations-canv-usa/xeno-canto-ca-nv/'):
    """Return the time-averaged CENS chroma vector of one recording.

    Parameters
    ----------
    file_name : str
        Audio file name, relative to ``root``.
    n_chroma : int, default 36
        Number of chroma bins to compute.
    root : str
        Folder that contains the audio files.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_chroma`` holding the mean of each chroma bin
        over all frames of the recording.
    """
    import os

    # os.path.join works whether or not `root` ends with a path separator.
    path = os.path.join(root, file_name)
    y, sr = librosa.load(path)
    # librosa returns (n_chroma, n_frames); average over the frame axis.
    cens = np.mean(librosa.feature.chroma_cens(y=y, sr=sr, n_chroma=n_chroma), axis=1)

    return cens

In [None]:
# Compute both feature vectors for every recording, keeping `sound_id` order
# so that index i of each list belongs to the same file.
sound_features_mfcc = [extract_mfcc(name) for name in sound_id]
sound_features_cens = [extract_CENS(name) for name in sound_id]

In [None]:
# Inspect the MFCC feature vector computed for the first recording.
sound_features_mfcc[0]

In [None]:
# Number of MFCC vectors collected, and the shape of a single vector.
print(len(sound_features_mfcc))
print(sound_features_mfcc[0].shape)

In [None]:
# Inspect the CENS feature vector computed for the first recording.
sound_features_cens[0]

In [None]:
# Number of CENS vectors collected, and the shape of a single vector.
print(len(sound_features_cens))
print(sound_features_cens[0].shape)

In [None]:
# Turn every MFCC vector into a (36, 1) column and stack the columns into a
# single array with one entry per recording.
sound_features = np.asarray([
    np.array([sound_features_mfcc[idx]]).reshape(36, 1)
    for idx in range(len(sound_id))
])

In [None]:
# LSTM input -> (samples, timesteps, features)
# Each recording is fed as a sequence of 36 steps with 1 feature per step
# (one value of its MFCC vector per step).
input_layer = Input(shape=(36, 1), name="input")
lstm_layer = LSTM(128, return_sequences=False)(input_layer)
hidden_layer1 = Dense(32, activation='relu', name="layer1")(lstm_layer)
hidden_layer2 = Dense(16, activation='relu', name="layer2")(hidden_layer1)
droupout_layer = Dropout(0.5)(hidden_layer2)
hidden_layer3 = Dense(8, activation='tanh', name="layer3")(droupout_layer)
# The classifier must consume hidden_layer3; wiring it to the dropout output
# would leave layer3 outside the model graph (never trained, never used).
output_layer = Dense(3, activation='softmax', name="output")(hidden_layer3)

model = Model(inputs=input_layer, outputs=output_layer, name="model")
model.summary()

In [None]:
# Draw the model graph, annotating each layer with its tensor shapes.
plot_model(model, show_shapes=True)

In [None]:
# SGD with momentum. `learning_rate` is the supported keyword; the old `lr`
# alias is deprecated and rejected by newer Keras releases.
sgd = SGD(learning_rate=0.0001, momentum=0.9)
# Multi-class, one-hot targets -> categorical cross-entropy; track accuracy.
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 50 epochs with mini-batches of 32 and keep the per-epoch history.
# NOTE(review): no validation_split / validation_data is passed, so the
# history only contains training metrics and says nothing about generalization.
history = model.fit(sound_features,labels ,epochs=50,batch_size = 32 ,verbose=1)

In [None]:
# Training accuracy per epoch, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(history.history['accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train'], loc='lower right')
plt.show()

In [None]:
# Training loss per epoch, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(history.history['loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train'], loc='upper right')
plt.show()

In [None]:
# Pair the i-th MFCC value with the i-th CENS value so that each of the 36
# steps carries 2 features: column 0 = MFCC, column 1 = CENS.
# np.stack(..., axis=-1) produces that (36, 2) layout directly. Reshaping a
# (2, 36) array to (36, 2) does NOT transpose it: it would put two consecutive
# MFCC values in the same row and never pair an MFCC value with a CENS value.
sound_features = np.asarray([
    np.stack([mfcc, cens], axis=-1)
    for mfcc, cens in zip(sound_features_mfcc, sound_features_cens)
])

In [None]:
# Shape of the combined feature array for the first recording.
sound_features[0].shape

In [None]:
# LSTM input -> (samples, timesteps, features)
# Each recording is fed as a sequence of 36 steps with 2 features per step
# (the MFCC-based and CENS-based values built in the feature array).
input_layer = Input(shape=(36, 2), name="input")
lstm_layer = LSTM(128, return_sequences=False)(input_layer)
hidden_layer1 = Dense(32, activation='relu', name="layer1")(lstm_layer)
hidden_layer2 = Dense(16, activation='relu', name="layer2")(hidden_layer1)
droupout_layer = Dropout(0.5)(hidden_layer2)
hidden_layer3 = Dense(8, activation='tanh', name="layer3")(droupout_layer)
# The classifier must consume hidden_layer3; wiring it to the dropout output
# would leave layer3 outside the model graph (never trained, never used).
output_layer = Dense(3, activation='softmax', name="output")(hidden_layer3)

model = Model(inputs=input_layer, outputs=output_layer, name="model")
model.summary()

In [None]:
# Draw the model graph, annotating each layer with its tensor shapes.
plot_model(model, show_shapes=True)

In [None]:
# SGD with momentum. `learning_rate` is the supported keyword; the old `lr`
# alias is deprecated and rejected by newer Keras releases.
sgd = SGD(learning_rate=0.0001, momentum=0.9)
# Multi-class, one-hot targets -> categorical cross-entropy; track accuracy.
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 50 epochs with mini-batches of 32 and keep the per-epoch history.
# NOTE(review): no validation_split / validation_data is passed, so the
# history only contains training metrics and says nothing about generalization.
history = model.fit(sound_features,labels ,epochs=50,batch_size = 32 ,verbose=1)

In [None]:
# Training accuracy per epoch, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(history.history['accuracy'])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train'], loc='lower right')
plt.show()

In [None]:
# Training loss per epoch, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(history.history['loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train'], loc='upper right')
plt.show()