Uses the prepared [melspectrograms](https://librosa.github.io/librosa/generated/librosa.feature.melspectrogram.html) and [Mel-frequency cepstral coefficients (MFCCs)](https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html) to train and evaluate a CNN-based classifier.

This version includes MFCCs in the feature space. Instead of adding this array as a separate input, or stacking it on top, it simply overwrites the bottom 20 frequency bands of the spectrogram. Since they share the same `time` axis, this should be a good approach. This is also a convenient means of filtering out the lower frequencies.

This version also includes `Dropout(rate=0.2)` after each MaxPooling layer.

This version does **not** filter out quiet frames.

In [None]:
import pandas as pd
import numpy as np
import librosa as lr
from librosa.display import specshow
from glob import glob
import os
from IPython.display import Audio
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import keras
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential

## Data Loading

In [None]:
# Directory holding the original mp3 recordings.
sounds_dir = "../input/xenocanto-avian-vocalizations-canv-usa/xeno-canto-ca-nv/"
# sounds_dir = "../input/xeno-canto-ca-nv/"


def get_full_path(sample):
    """Return the path of a sample's audio file inside ``sounds_dir``.

    ``sample`` is any mapping-like object with a ``'file_name'`` entry.
    """
    file_name = sample['file_name']
    return os.path.join(sounds_dir, file_name)

# Where the log-scaled, normalized melspectrogram feature files live.
melspec_dir = "../input/avian-vocalizations-melspectrograms-log-norm/"
melspec_features_dir = melspec_dir + "/melspectrograms_logscaled_normalized/features"

# Recording index, keyed by file id.
df = pd.read_csv(
    "../input/xenocanto-avian-vocalizations-canv-usa/xeno-canto_ca-nv_index.csv",
    index_col='file_id',
)
display(df.head(2))
# df = pd.read_csv("../input/xeno-canto_ca-nv_index.csv")

# Sanity check: number of audio files on disk vs. rows in the index.
mp3_pattern = os.path.join(sounds_dir, "*.mp3")
files_list = glob(mp3_pattern)
print("%i mp3 files in %s"%(len(files_list), sounds_dir))
print("%i samples in index."%len(df))


In [None]:
# Table of stored feature-array shapes; the raw .dat files are reopened
# with np.memmap using the shapes recorded here.
shapes_df = pd.read_csv(
    "../input/avian-vocalizations-spectrograms-and-mfccs/feature_shapes.csv",
    index_col=0,
)
display(shapes_df.head(n=2))

In [None]:
# File ids (index) of the training partition.
train_df = pd.read_csv(
    "../input/avian-vocalizations-partitioned-data/train_file_ids.csv",
    index_col=0,
)
display("Training data:", train_df.head(n=2))

In [None]:
# File ids (index) of the held-out test partition.
test_df = pd.read_csv(
    "../input/avian-vocalizations-partitioned-data/test_file_ids.csv",
    index_col=0,
)
display("Test data:", test_df.head(n=2))

In [None]:
import re
def parse_shape(shape_str):
    """Parse a 2-D array shape that was stored as text, e.g. ``"(128, 431)"``.

    The feature-shape table saved each shape as a string, so the two integers
    have to be recovered before the shape can be handed to ``np.memmap``.

    Args:
        shape_str: Text containing a parenthesised pair of non-negative
            integers.

    Returns:
        A ``(rows, cols)`` tuple of ints.

    Raises:
        ValueError: If no ``(int, int)`` pair can be found in ``shape_str``.
    """
    # Raw string: '\(' and '\d' are invalid escapes in a normal string literal.
    match = re.search(r'\((\d+),\s*(\d+)\)', str(shape_str))
    if match is None:
        raise ValueError("Cannot parse a 2-D shape from %r" % (shape_str,))
    rows, cols = match.groups()
    return int(rows), int(cols)

def log_clipped(a, floor=1e-7):
    """Return the natural log of ``a`` after raising every value to at least ``floor``.

    Only a lower bound is applied. The earlier ``np.clip(a, floor, a.max())``
    form produced non-positive values (and so ``-inf``/``nan`` logs) whenever
    the whole input was below ``floor``, because the upper bound then fell
    under the lower one.

    Args:
        a: Array-like of values.
        floor: Smallest value allowed before taking the log; must be positive
            for the result to be finite.

    Returns:
        Array of ``log(max(a, floor))`` computed element-wise.
    """
    return np.log(np.maximum(a, floor))

In [None]:
# def get_ids(generator, batch_number, batch_index):
#     idxs = generator.indexes[
#         batch_number*generator.batch_size+batch_index:(batch_number+1)*generator.batch_size]
#     return [generator.list_IDs[k] for k in idxs]
# def show_batch_item(generator, batch_number, batch_index):
#     X_batch, y_batch = generator.__getitem__(batch_number)
#     ids = get_ids(generator, batch_number, batch_index)
#     file_id = X_train[ids[batch_index]]
#     sg = X_batch[batch_index].reshape(generator.dim)
# #     mm_scaled_log_sg = np.memmap('features/XC%s_log_scaled_melspectrogram.dat'%file_id))
# #     specshow(mm_scaled_log_sg, x_axis='time', y_axis='mel' )
# #     specshow(np.log(np.clip(sg,.0000001,sg.max())), x_axis='time', y_axis='mel' )
#     specshow(sg, x_axis='time', y_axis='mel' )
# #     specshow(mfcc, x_axis='time', y_axis='mel' )
#     sample = df[df.file_id==file_id].to_dict(orient='records')[0]
#     print(file_id,label_encoder.classes_[y_train[ids][batch_index]],
#           "contributed by",sample['recordist'], sample['recordist_url'])
#     species_name = sample['english_cname']
#     plt.title(species_name+" - "+sample['file_name'])
#     plt.colorbar()
#     plt.show()
#     wav, sr = lr.load(get_full_path(sample))
#     return Audio(wav, rate=sr)

In [None]:
# Fit the encoder on every species name in the index so class ids are
# stable regardless of how the data is partitioned.
y_english_labels_entire_dataset = df['english_cname'].tolist()
label_encoder = LabelEncoder()
label_encoder.fit(y_english_labels_entire_dataset)
n_classes = len(label_encoder.classes_)

# X_train, X_test, y_train, y_test = train_test_split(
#     np.array([s['file_id'] for i,s in df.iterrows()]), 
#     y_encoded_entire_dataset, 
#     test_size=1/3, 
#     stratify=y_encoded_entire_dataset, 
#     shuffle=True, 
#     random_state=37,
# )
# File ids come from the index of each partition table, labels from its
# `label` column.
X_train, y_train = list(train_df.index), list(train_df['label'])
X_test, y_test = list(test_df.index), list(test_df['label'])
print("Training data len:",len(X_train), len(y_train))
print("Test data len:    ",len(X_test), len(y_test))

In [None]:
# print('Training File IDs: \n['+', '.join([str(x) for x in X_train])+']')

In [None]:
# print('Test File IDs: \n['+', '.join([str(x) for x in X_test])+']')

#### Create a Data Generator to generate fixed-length samples from random windows within clips

In [None]:
# import pickle
# with open('../input/avian-vocalizations-pickled-spectrograms-and-mfcc/features.pickle','rb') as f:
#     features = pickle.load(f)

In [None]:
# mfccs_mean_px = np.mean([features[file_id]['mfcc'].mean() for file_id in features])
# mfccs_std_px = np.mean([features[file_id]['mfcc'].std() for file_id in features])
# mfccs_mean_px, mfccs_std_px

## Data Generator

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit one global mean/variance over every MFCC value, feeding the scaler
# one file at a time.
mfcc_scaler = StandardScaler()
for file_id in shapes_df.file_id:
    shape_text = shapes_df.loc[shapes_df.file_id == file_id, 'mfcc_shapes'].values[0]
    mfcc = np.memmap(
        '../input/avian-vocalizations-spectrograms-and-mfccs/mfccs/features/XC%s_mfcc.dat'%file_id,
        shape=parse_shape(shape_text),
        dtype='float32',
        mode='readonly',
    )
    # Single column: all coefficients are treated as samples of one feature.
    mfcc_scaler.partial_fit(mfcc.flatten()[:, np.newaxis])
print("MFCC scaler:",mfcc_scaler.mean_, mfcc_scaler.var_, np.sqrt(mfcc_scaler.var_))

In [None]:
# Show the shape-table row for the last file id visited by the loop above.
shapes_df.loc[shapes_df.file_id == file_id]

In [None]:
import keras
from keras.utils import to_categorical

# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class AudioFeatureGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of fixed-width feature windows.

    Each sample is a window of ``n_frames`` time steps taken at a random
    offset from a recording's log-scaled, normalized melspectrogram. The
    lowest rows of the window are then overwritten with the standardized
    MFCCs of the same frames. Recordings shorter than the window are
    zero-padded at a random offset instead.

    Relies on the module-level ``shapes_df``, ``melspec_features_dir``,
    ``mfcc_scaler`` and ``parse_shape`` to locate and decode feature files.

    Args:
        list_IDs: File ids of the recordings to draw from.
        labels: Class indices aligned with ``list_IDs`` (passed to
            ``to_categorical``).
        batch_size: Number of samples per batch.
        n_frames: Width, in time steps, of each window.
        n_channels: Size of the trailing channel axis of ``X``.
        n_classes: Number of classes used for the one-hot labels.
        shuffle: When true, reshuffle the sample order at every epoch end.
        seed: Seed applied to NumPy's global RNG before each shuffle.
    """

    def __init__(self, list_IDs, labels, batch_size, n_frames=128, n_channels=1,
                 n_classes=10, shuffle=False, seed=37):
        """Store the configuration and build the initial sample order."""
        # Harmless on Keras versions whose Sequence has no __init__; newer
        # versions warn when it is skipped.
        super().__init__()
        self.n_frames = n_frames
        self.dim = (128, self.n_frames)
        self.batch_size = batch_size
        self.labels = dict(zip(list_IDs, labels))
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.seed = seed
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch (a trailing partial batch is dropped)."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair for batch number ``index``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            # NOTE: this reseeds NumPy's *global* RNG, which also drives the
            # random window offsets drawn in __data_generation.
            np.random.seed(self.seed)
            self.seed = self.seed+1 # increment the seed so we get a different batch.
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Build one batch.

        Returns:
            ``X`` of shape ``(batch_size, *dim, n_channels)`` and one-hot
            ``y`` of shape ``(batch_size, n_classes)``.
        """
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size, self.n_classes), dtype=int) # one-hot encoded labels
        window = self.dim[1]

        for i, ID in enumerate(list_IDs_temp):
            # Look the shape row up once; both features need it.
            shapes_row = shapes_df[shapes_df.file_id==ID]
            sg_lognorm = np.memmap(
                os.path.join(melspec_features_dir,'XC%s_melspectrogram_logscaled_normalized.dat'%ID),
                shape=parse_shape(shapes_row['melspectrogram_shapes'].values[0]),
                dtype='float32', mode='readonly')
            mfcc = np.memmap(
                '../input/avian-vocalizations-spectrograms-and-mfccs/mfccs/features/XC%s_mfcc.dat'%ID,
                shape=parse_shape(shapes_row['mfcc_shapes'].values[0]),
                dtype='float32', mode='readonly')

            # Normalize MFCCs. The scaler was fitted on a single flattened
            # column, so present the data in that layout and restore the
            # (n_mfcc, time) shape afterwards; passing the 2-D array directly
            # would be read as `time` separate features.
            mfcc = mfcc_scaler.transform(
                np.asarray(mfcc).reshape(-1, 1)).reshape(mfcc.shape)

            # NOTE: quiet frames are intentionally not filtered in this version.

            d_len = mfcc.shape[1] - window
            if d_len<0: # Clip is shorter than window, so pad with zeros.
                n = int(np.random.uniform(0, -d_len))
                pad_range = (n, -d_len-n) # n columns on the left, the remainder on the right
                sg_lognorm_cropped = np.pad(sg_lognorm, ((0,0), pad_range), 'constant', constant_values=0)
                mfcc_cropped = np.pad(mfcc, ((0,0), pad_range), 'constant', constant_values=0)
            else: # Clip is at least as long as the window, so take a random slice.
                n = int(np.random.uniform(0, d_len))
                sg_lognorm_cropped = sg_lognorm[:, n:(n+window)]
                mfcc_cropped = mfcc[:, n:(n+window)]

            # The trailing singleton axis broadcasts across the channel axis.
            X[i] = sg_lognorm_cropped[:, :, np.newaxis]
            # Overwrite the bottom of X with MFCCs (we don't need the low frequency bands anyway)
            X[i, :mfcc_cropped.shape[0]] = mfcc_cropped[:, :, np.newaxis]
            y[i,] = to_categorical(self.labels[ID], num_classes=self.n_classes)

        return X, y

## Sample Visualizations

In [None]:
from itertools import islice
# Preview a few generated windows, one figure per sample, titled with the
# decoded species name.
generator = AudioFeatureGenerator(
    X_test, y_test,
    batch_size=1, shuffle=True, seed=37,
    n_frames=128, n_classes=n_classes,
)
for g in islice(generator, 4): # show a few examples
    batch_specs, batch_onehot = g[0], g[1]
    for i, spec in enumerate(batch_specs):
        plt.figure(figsize=(10,4))
        spec_ax = specshow(spec.squeeze(), x_axis='time', y_axis='mel')
        class_id = np.argmax(batch_onehot[i])
        plt.title(label_encoder.classes_[class_id])
        plt.colorbar()
        plt.show()

In [None]:
def vis_learning_curve(learning):
    """Plot per-epoch accuracy and loss, marking the best epoch in red.

    Validation curves are drawn when the history contains them. The best
    epoch is the one with the lowest validation loss, or the lowest training
    loss when there is no validation loss.

    Args:
        learning: Object whose ``history`` attribute is a dict mapping metric
            names to per-epoch lists (e.g. the value returned by
            ``model.fit_generator``).
    """
    history = learning.history
    train_loss = history['loss']
    # Newer Keras releases report 'accuracy' / 'val_accuracy' instead of
    # 'acc' / 'val_acc'; accept either spelling.
    train_acc = history['acc'] if 'acc' in history else history['accuracy']
    # `history` is a dict, so membership (not hasattr) decides what exists.
    val_loss = history.get('val_loss')
    val_acc = history.get('val_acc', history.get('val_accuracy'))

    _, axes = plt.subplots(1, 2, figsize=(20,4), subplot_kw={'xlabel':'epoch'} )
    acc_ax, loss_ax = axes

    acc_ax.set_title("Accuracy")
    acc_ax.plot(train_acc, label='training')
    if val_acc is not None:
        acc_ax.plot(val_acc, label='validation')
    acc_ax.legend()

    loss_ax.set_title("Loss")
    loss_ax.plot(train_loss, label='training')
    if val_loss is not None:
        loss_ax.plot(val_loss, label='validation')
    loss_ax.legend()

    # Plot a line to indicate the best epoch
    best_training_epoc = np.argmin(val_loss if val_loss is not None else train_loss)
    for ax in (acc_ax, loss_ax):
        ax.axvline(x=best_training_epoc, color='red')

## Model Training

In [None]:
# Baseline: accuracy of picking one of the species uniformly at random.
n_species = len(df['english_cname'].unique())
print("Accuracy by random guess: %.4f"%(1/n_species))

In [None]:
n_epochs = 150
# Window width, class count and channel count shared by the generators.
params = {'n_frames': 128,
          'n_classes': n_classes,
          'n_channels': 1}
training_generator = AudioFeatureGenerator(X_train, y_train, batch_size=64, shuffle=True, seed=37, **params)
# No validation generator is used in this version.
dim = training_generator.dim

snapshot_filename = "weights.best.hdf5"

# save_best_only is not enabled here, so the snapshot file is not restricted
# to the best-scoring epoch.
checkpointer = ModelCheckpoint(filepath=snapshot_filename, verbose=1)

# From My Udacity Dog Project Image Classifier:
# three Conv -> MaxPool -> Dropout stages, then global average pooling and a
# softmax over the species.
model = Sequential()
model.add(Conv2D(64,3,input_shape=(dim[0], dim[1], 1),padding='valid',activation="relu"))
model.add(MaxPooling2D(pool_size=3))
model.add(Dropout(rate=.2))
for _ in range(2):
    model.add(Conv2D(64,3,padding='valid',activation="relu"))
    model.add(MaxPooling2D(pool_size=3))
    model.add(Dropout(rate=.2))
model.add(GlobalAveragePooling2D())
model.add(Dense(n_classes, activation="softmax"))
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
learning = model.fit_generator(
            training_generator,
            epochs=n_epochs,
            callbacks=[checkpointer],
            verbose=0, )
pd.DataFrame(learning.history).to_csv('training_history.csv', index_label='epoch')
vis_learning_curve(learning)
plt.savefig("learning_curve.png")
plt.show()

# The accuracy metric is recorded as 'acc' by older Keras releases and as
# 'accuracy' by newer ones; accept either so the summary does not KeyError.
history_acc = learning.history['acc'] if 'acc' in learning.history else learning.history['accuracy']
acc_at_min_loss = history_acc[np.argmin(learning.history['loss'])]
print("Min training loss: %.5f, Training accuracy at min loss: %.5f"%(np.min(learning.history['loss']), acc_at_min_loss ))

## Model Testing

In [None]:
# Restore the weights that the ModelCheckpoint callback wrote to
# `snapshot_filename` during training.
model.load_weights(snapshot_filename)

In [None]:
# Evaluate on the whole test partition as one batch (one random window per
# recording), comparing arg-max predictions against the one-hot targets.
test_generator = AudioFeatureGenerator(X_test, y_test, batch_size=len(X_test), **params)
X_batch, y_batch = test_generator[0]
predictions = model.predict(X_batch)
y_predicted = list(np.argmax(predictions, axis=1))
y_true = list(np.argmax(y_batch, axis=1))
test_score = accuracy_score(y_true, y_predicted)
print("Test accuracy score: %.5f"%test_score)

In [None]:
# Confusion matrix over all classes (rows: true species, columns: predicted),
# with both axes labelled by species name.
class_ids = range(n_classes)
species_names = label_encoder.classes_
conf_matrix = confusion_matrix(y_true, y_predicted, labels=class_ids)
plt.figure(figsize=(20,20))
plt.imshow(conf_matrix)
plt.xticks(class_ids, species_names, rotation='vertical')
plt.yticks(class_ids, species_names)
plt.colorbar(shrink=.25);

In [None]:
# Show which species are correctly classified: for each true species, the
# share of its test samples that were predicted correctly.
plt.figure(figsize=(15,4))
plt.title("Percent Correctly Classified, by Species")
pct_correct_by_class = np.zeros(n_classes)
counts = np.sum(conf_matrix,axis=1)  # samples per true class
# Classes with no test samples keep their initial 0 instead of dividing by zero.
np.divide(np.diag(conf_matrix), counts, out=pct_correct_by_class, where=counts!=0)
# Scale in place: multiplying the return value of np.divide would be
# discarded and leave fractions in the plotted array.
pct_correct_by_class *= 100
plt.bar(range(n_classes), pct_correct_by_class, .75)
plt.xlim(-1, n_classes)
plt.xticks(range(n_classes), label_encoder.classes_, rotation='vertical');

In [None]:
# Show how trustworthy each prediction is: for each predicted species, the
# share of those predictions that were correct.
plt.figure(figsize=(15,4))
plt.title("Percent Correct, by Predicted Class")
pct_correct_by_predicted_class = np.zeros(n_classes)
counts = np.sum(conf_matrix,axis=0)  # samples per predicted class
# Classes that were never predicted keep their initial 0 instead of dividing by zero.
np.divide(np.diag(conf_matrix), counts, out=pct_correct_by_predicted_class, where=counts!=0)
# Scale in place: multiplying the return value of np.divide would be
# discarded and leave fractions in the plotted array.
pct_correct_by_predicted_class *= 100
plt.bar(range(n_classes), pct_correct_by_predicted_class, .75)
plt.xlim(-1, n_classes)
plt.xticks(range(n_classes), label_encoder.classes_, rotation='vertical');