Uses the prepared [melspectrograms](https://librosa.github.io/librosa/generated/librosa.feature.melspectrogram.html) and [Mel-frequency cepstral coefficients (MFCCs)](https://librosa.github.io/librosa/generated/librosa.feature.mfcc.html) to train and evaluate a CNN-based classifier.

This version adds MFCCs to the feature space. Instead of being fed in as a separate input, they are simply concatenated onto the top of the spectrogram along the frequency axis. Since both share the same `time` axis, this should be a good approach.



This version does not filter out quiet frames.

In [None]:
import pandas as pd
import numpy as np
import librosa as lr
from librosa.display import specshow
from glob import glob
import os
from IPython.display import Audio
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [None]:
def get_full_path(sample):
    """Return the path of the audio file belonging to an index record.

    `sample` must support `sample['file_name']`; that name is joined onto the
    module-level `sounds_dir`.
    """
    file_name = sample['file_name']
    return os.path.join(sounds_dir, file_name)
# Directory holding the mp3 recordings.
sounds_dir = "../input/xenocanto-avian-vocalizations-canv-usa/xeno-canto-ca-nv/"
# sounds_dir = "../input/xeno-canto-ca-nv/"

# Recording index; the 'Unnamed: 0' column is dropped right after loading.
df = pd.read_csv(
    "../input/xenocanto-avian-vocalizations-canv-usa/xeno-canto_ca-nv_index.csv"
).drop(columns='Unnamed: 0')
display(df.head(2))
# df = pd.read_csv("../input/xeno-canto_ca-nv_index.csv")

# Sanity check: compare the number of mp3 files on disk with the index size.
files_list = glob(os.path.join(sounds_dir, "*.mp3"))
print("{} mp3 files in {}".format(len(files_list), sounds_dir))
print("{} samples in index.".format(len(df)))

# Location of the log-scaled, normalized mel spectrogram feature files.
melspec_dir = "../input/avian-vocalizations-melspectrograms-log-norm/"
melspec_features_dir = "{}/melspectrograms_logscaled_normalized/features".format(melspec_dir)

# Table of per-file feature shapes (stored as strings, see parse_shape).
# shapes_df = pd.read_csv(melspec_dir+"/feature_shapes.csv")
shapes_df = pd.read_csv(
    "../input/avian-vocalizations-spectrograms-and-mfccs/feature_shapes.csv"
).drop(columns='Unnamed: 0')
shapes_df.head(2)

In [None]:
import re
def parse_shape(shape_str):
    """Parse a 2-D shape that was saved as text, e.g. ``"(128, 431)"``.

    The shapes in feature_shapes were saved as strings, so the two integers
    have to be extracted again before they can be passed to ``np.memmap``.

    Args:
        shape_str: text containing a parenthesised pair of non-negative
            integers; whitespace around the numbers is optional.

    Returns:
        A ``(rows, cols)`` tuple of ints.

    Raises:
        ValueError: if no ``(int, int)`` pair can be found in `shape_str`.
    """
    # Raw string: '\(' and '\d' are regex escapes, not Python string escapes.
    match = re.search(r'\(\s*(\d+)\s*,\s*(\d+)\s*\)', shape_str)
    if match is None:
        raise ValueError("Cannot parse a 2-D shape from %r" % (shape_str,))
    rows, cols = match.groups()
    return int(rows), int(cols)

def log_clipped(a):
    """Return the natural log of `a` after flooring its values at 1e-7.

    Flooring keeps the logarithm finite for zero or negative entries.  Only a
    lower bound is applied: using ``a.max()`` as an upper clip bound (as an
    earlier version did) breaks when the maximum itself is below the floor,
    because the clip then returns the non-positive maximum.

    Args:
        a: array-like of numbers.

    Returns:
        ndarray with the element-wise ``log(max(a, 1e-7))``.
    """
    return np.log(np.maximum(a, .0000001))

In [None]:
def get_ids(generator, batch_number, batch_index):
    """Return the IDs of batch `batch_number`, starting at `batch_index`.

    The slice of `generator.indexes` runs from position `batch_index` inside
    the batch up to the end of that batch, so element 0 of the result is the
    ID of the item at `batch_index`.  Each index is mapped through
    `generator.list_IDs`.
    """
    size = generator.batch_size
    first = batch_number * size + batch_index
    last = (batch_number + 1) * size
    ids = []
    for position in generator.indexes[first:last]:
        ids.append(generator.list_IDs[position])
    return ids
def show_batch_item(generator, batch_number, batch_index):
    """Plot one item of a generated batch, print its metadata, return its audio.

    Relies on the module-level `X_train`, `y_train`, `df` and `label_encoder`.
    The generator in this notebook draws a random window on every
    `__getitem__` call, so the plotted window is a fresh crop of the clip.

    Args:
        generator: batch generator exposing `indexes`, `list_IDs`,
            `batch_size` and `__getitem__`.
        batch_number: index of the batch to generate.
        batch_index: position of the item inside that batch.

    Returns:
        An IPython `Audio` object for the full recording of the item.

    Raises:
        IndexError: if `batch_index` lies outside the batch.
    """
    X_batch, _ = generator[batch_number]
    # get_ids() already starts its slice at `batch_index`, so the requested
    # item is element 0 of the result (indexing with `batch_index` again
    # would skip ahead and pick the wrong item).
    item_id = get_ids(generator, batch_number, batch_index)[0]
    file_id = X_train[item_id]
    # Batches are (item, rows, frames, channels); plot channel 0 only, which
    # works for any channel count (the generator in this notebook copies the
    # same image into every channel).
    sg = X_batch[batch_index][:, :, 0]
    specshow(sg, x_axis='time', y_axis='mel')
    sample = df[df.file_id == file_id].to_dict(orient='records')[0]
    print(file_id, label_encoder.classes_[y_train[item_id]],
          "contributed by", sample['recordist'], sample['recordist_url'])
    species_name = sample['english_cname']
    plt.title(species_name + " - " + sample['file_name'])
    plt.colorbar()
    plt.show()
    wav, sr = lr.load(get_full_path(sample))
    return Audio(wav, rate=sr)

### Create a Train/Test Split

In [None]:
# Integer-encode the English common name of every recording in the index.
y_english_labels_entire_dataset = df['english_cname'].tolist()
label_encoder = LabelEncoder()
label_encoder.fit(y_english_labels_entire_dataset)
y_encoded_entire_dataset = np.array(label_encoder.transform(y_english_labels_entire_dataset))

n_classes = len(label_encoder.classes_)

# The "X" arrays hold file ids rather than features; one third of the
# recordings is held out as a stratified test set.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(df['file_id'].tolist()),
    y_encoded_entire_dataset,
    test_size=1/3,
    stratify=y_encoded_entire_dataset,
    shuffle=True,
    random_state=37,
)
print("Training data shape:", X_train.shape, y_train.shape)
print("Test data shape:    ", X_test.shape, y_test.shape)

#### Create a Data Generator to generate fixed-length samples from random windows within clips

In [None]:
# import pickle
# with open('../input/avian-vocalizations-pickled-spectrograms-and-mfcc/features.pickle','rb') as f:
#     features = pickle.load(f)

In [None]:
# mfccs_mean_px = np.mean([features[file_id]['mfcc'].mean() for file_id in features])
# mfccs_std_px = np.mean([features[file_id]['mfcc'].std() for file_id in features])
# mfccs_mean_px, mfccs_std_px

In [None]:
# Memory-map the raw mel spectrogram of the second row of the shapes table
# and plot its logarithm.
file_id = shapes_df.iloc[1].file_id
sg = np.memmap(
    '../input/avian-vocalizations-spectrograms-and-mfccs/melspectrograms/features/XC%s_melspectrogram.dat' % file_id,
    dtype='float32',
    mode='readonly',
    shape=parse_shape(
        shapes_df.loc[shapes_df.file_id == file_id, 'melspectrogram_shapes'].values[0]),
)
# Mean value of the frame with the highest mean; the result is not stored.
sg[:, np.argmax(sg.mean(axis=0))].mean()
specshow(np.log(sg), x_axis='time', y_axis='mel')
plt.colorbar();

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit a single mean/variance over every MFCC value of every file.  Each
# matrix is fed in as one long column, one file at a time.
mfcc_scaler = StandardScaler()
for file_id in shapes_df.file_id:
    mfcc = np.memmap(
        '../input/avian-vocalizations-spectrograms-and-mfccs/mfccs/features/XC%s_mfcc.dat' % file_id,
        dtype='float32',
        mode='readonly',
        shape=parse_shape(
            shapes_df.loc[shapes_df.file_id == file_id, 'mfcc_shapes'].values[0]),
    )
    mfcc_scaler.partial_fit(np.reshape(mfcc, (-1, 1)))
# Show the fitted mean, variance and standard deviation.
mfcc_scaler.mean_, mfcc_scaler.var_, np.sqrt(mfcc_scaler.var_)

In [None]:
# Scratch check: concatenating two (2, 3) arrays along axis 0 stacks their rows.
a1 = np.arange(1, 7).reshape(2, 3)
a2 = np.arange(7, 13).reshape(2, 3)
np.concatenate((a1, a2), axis=0)

In [None]:
import keras
from keras.utils import to_categorical

# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
class AudioFeatureGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` yielding fixed-width windows of stacked features.

    Every item is the log-scaled, normalized mel spectrogram of a clip with
    the standardized MFCC matrix stacked underneath it (row-wise), cropped or
    padded at a random offset to ``n_frames`` columns and copied into
    ``n_channels`` identical channels.

    The generator reads several module-level names: ``X_train`` and
    ``y_train`` (indexed by the entries of ``list_IDs``), ``shapes_df``,
    ``melspec_features_dir``, ``mfcc_scaler`` and ``parse_shape``.

    Args:
        list_IDs: positions into ``X_train`` / ``y_train`` (not file ids).
        labels: stored on the instance but not consulted; labels are looked
            up in the module-level ``y_train``.
        batch_size: number of items per batch; a trailing partial batch is
            dropped (see ``__len__``).
        n_frames: width of the generated windows.
        n_channels: number of identical channels in the output.
        n_classes: width of the one-hot label vectors.
        shuffle: whether to reshuffle the item order in ``on_epoch_end``.
        seed: seed used for the first shuffle; incremented on every shuffle.
    """

    # Row counts of the two stacked feature matrices (assumed: 128 mel bands
    # followed by 20 MFCCs, matching the prepared feature files).
    N_MELS = 128
    N_MFCC = 20

    def __init__(self, list_IDs, labels, batch_size, n_frames=128, n_channels=1,
                 n_classes=10, shuffle=False, seed=37):
        """Store the configuration and build the initial item order."""
        super().__init__()
        self.n_frames = n_frames
        self.dim = (self.N_MELS + self.N_MFCC, self.n_frames)
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.seed = seed
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Generate batch number `index` as an ``(X, y)`` pair."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the item order and, if shuffling, draw a new permutation.

        Note that this reseeds NumPy's global random generator, which also
        determines the random windows drawn afterwards in this process.
        """
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.seed(self.seed)
            self.seed = self.seed + 1  # increment the seed so we get a different order.
            np.random.shuffle(self.indexes)

    def _random_window(self, sg_lognorm, mfcc):
        """Crop or pad both matrices to ``n_frames`` columns at one random offset."""
        d_len = sg_lognorm.shape[1] - self.n_frames
        if d_len < 0:
            # Clip is shorter than the window: split the missing columns
            # randomly between the left and the right side.
            n = np.random.randint(0, -d_len + 1)
            pad_range = (n, -d_len - n)
            # Pad each matrix with its own neutral value: the mean of the
            # log-normalized spectrogram, and 0 for the MFCCs (their mean
            # after standardization).
            sg_window = np.pad(sg_lognorm, ((0, 0), pad_range), 'constant',
                               constant_values=sg_lognorm.mean())
            mfcc_window = np.pad(mfcc, ((0, 0), pad_range), 'constant',
                                 constant_values=0)
        else:
            # Clip is at least as long as the window: pick any valid start,
            # including the last one (upper bound of randint is exclusive).
            n = np.random.randint(0, d_len + 1)
            sg_window = sg_lognorm[:, n:n + self.n_frames]
            mfcc_window = mfcc[:, n:n + self.n_frames]
        return sg_window, mfcc_window

    def __data_generation(self, list_IDs_temp):
        """Build the arrays for one batch.

        Returns:
            X: float array of shape (n_items, *dim, n_channels).
            y: int array of shape (n_items, n_classes) with one-hot labels.
        """
        n_items = len(list_IDs_temp)
        X = np.empty((n_items, *self.dim, self.n_channels))
        y = np.empty((n_items, self.n_classes), dtype=int)  # one-hot encoded labels

        for i, ID in enumerate(list_IDs_temp):
            file_id = X_train[ID]
            shapes = shapes_df[shapes_df.file_id == file_id]
            sg_lognorm = np.memmap(
                os.path.join(melspec_features_dir,
                             'XC%s_melspectrogram_logscaled_normalized.dat' % file_id),
                shape=parse_shape(shapes['melspectrogram_shapes'].values[0]),
                dtype='float32', mode='readonly')
            mfcc_raw = np.memmap(
                '../input/avian-vocalizations-spectrograms-and-mfccs/mfccs/features/XC%s_mfcc.dat' % file_id,
                shape=parse_shape(shapes['mfcc_shapes'].values[0]),
                dtype='float32', mode='readonly')
            # Normalize MFCCs.  The scaler was fitted on a single column of
            # all MFCC values, so present the matrix in that layout and
            # restore its shape afterwards.
            mfcc = mfcc_scaler.transform(
                np.asarray(mfcc_raw).reshape(-1, 1)).reshape(mfcc_raw.shape)

            # NOTE: quiet frames are intentionally not filtered out in this version.
            sg_window, mfcc_window = self._random_window(sg_lognorm, mfcc)

            features = np.vstack([sg_window, mfcc_window])
            # Replicate the single feature image into every requested channel.
            X[i] = np.stack((features,) * self.n_channels, axis=-1)
            y[i] = to_categorical(y_train[ID], num_classes=self.n_classes)

        return X, y

In [None]:
# Preview: for each of two stratified splits, plot the items of the first
# generated batch (batch_size=1, so one image per split).
for cv_train_index, cv_val_index in StratifiedShuffleSplit(
        n_splits=2, test_size=1/4, random_state=37).split(X_train, y_train):
    generator = AudioFeatureGenerator(
        cv_train_index, y_train[cv_train_index],
        batch_size=1, shuffle=True, seed=37,
        n_frames=128, n_classes=n_classes, n_channels=3)
    for g in generator:
        # g is an (X, y) pair; walk images and one-hot labels together.
        for spec, onehot in zip(g[0], g[1]):
            specshow(spec[:, :, 0], x_axis='time', y_axis='mel')
            plt.title(label_encoder.classes_[np.argmax(onehot)])
            plt.colorbar()
            plt.show()
#             specshow(spec[128:].squeeze(), x_axis='time', y_axis='mel')
#             plt.title("MFCC")
#             plt.colorbar()
#             plt.show()
        break  # only the first batch is shown

In [None]:
def vis_learning_curve(learning):
    """Plot training/validation accuracy and loss per epoch.

    A red vertical line on both plots marks the epoch with the lowest
    validation loss.

    Args:
        learning: object with a `history` dict holding per-epoch lists under
            'loss', 'val_loss' and the accuracy keys ('acc'/'val_acc' or
            'accuracy'/'val_accuracy').

    Raises:
        KeyError: if a required key is missing from `learning.history`.
    """
    history = learning.history
    # Keras before 2.3 reports metrics=['accuracy'] as 'acc'; later releases
    # report it under the name that was passed in, i.e. 'accuracy'.
    train_acc = history['acc'] if 'acc' in history else history['accuracy']
    val_acc = history['val_acc'] if 'val_acc' in history else history['val_accuracy']
    train_loss = history['loss']
    val_loss = history['val_loss']

    fig, axes = plt.subplots(1, 2, figsize=(20,4), subplot_kw={'xlabel':'epoch'} )
    curves = (("Accuracy", train_acc, val_acc), ("Loss", train_loss, val_loss))
    for ax, (title, train_curve, val_curve) in zip(axes, curves):
        ax.set_title(title)
        ax.plot(train_curve)
        ax.plot(val_curve)
        ax.legend(['training','validation'])

    # First epoch at which the validation loss reaches its minimum.
    best_training_epoc = int(np.argmin(val_loss))
    for ax in axes:
        ax.axvline(x=best_training_epoc, color='red')


In [None]:
# Baseline: chance level when guessing uniformly among the distinct species names.
print("Accuracy by random guess: {:.4f}".format(1 / df['english_cname'].nunique(dropna=False)))

In [None]:
import keras
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential
from keras.applications.xception import Xception, preprocess_input

# Cross-validated training: for every stratified split of the training set a
# fresh classifier head is trained on top of a frozen, ImageNet-initialised
# Xception base, and the validation accuracy at the epoch with the lowest
# validation loss is recorded as that split's score.
n_splits = 5
n_epochs = 25
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=1/4, random_state=37)
scores = []
params = {#'dim': (128,256),
          'n_frames': 128,
          'n_classes': n_classes,
          'n_channels': 3}
for cv_train_index, cv_val_index in sss.split(X_train, y_train):
    y_cv_train = y_train[cv_train_index]
    y_cv_val = y_train[cv_val_index]
    training_generator = AudioFeatureGenerator(cv_train_index, y_cv_train, batch_size=64, shuffle=True, seed=37, **params)
    # One batch containing the whole validation split (hence validation_steps=1).
    validation_generator = AudioFeatureGenerator(cv_val_index, y_cv_val, batch_size=len(cv_val_index), **params)
    dim = training_generator.dim

    # Keeps only the weights of the best epoch seen so far.
    checkpointer = ModelCheckpoint(filepath='weights.best.cnn.16.32.64.batch%i.hdf5'%len(scores), verbose=1, save_best_only=True)
    # From my kernel https://www.kaggle.com/samhiatt/udacity-dog-project-transfer-learning-data-aug

    xception_model = Xception(weights='imagenet', include_top=False, input_shape=(training_generator.dim[0],training_generator.dim[1],params['n_channels']))
    # Freeze the pretrained base; only the new head is trained.
    for layer in xception_model.layers:
        layer.trainable=False
    model = Sequential()
    model.add(xception_model)
    model.add(Dropout(.3))
    model.add(GlobalAveragePooling2D())
    # model.add(Flatten())
    # model.add(Dense(256, activation='relu'))
    model.add(Dense(training_generator.n_classes, activation='softmax'))
    model.summary()
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    learning = model.fit_generator(
                training_generator,
                validation_data=validation_generator,
                epochs=n_epochs,
                # The generator drops the trailing partial batch, so an epoch
                # is exactly len(training_generator) integer steps; a ceil()
                # based count would run past the end of the sequence.
                steps_per_epoch=len(training_generator),
                validation_steps=1,
                callbacks=[checkpointer],
                workers=4,
                use_multiprocessing=True,
                verbose=1, )
    vis_learning_curve(learning)
#     plt.show()
    history = learning.history
    # Keras before 2.3 names the metric 'val_acc', later releases 'val_accuracy'.
    val_acc_history = history['val_acc'] if 'val_acc' in history else history['val_accuracy']
    best_epoch = int(np.argmin(history['val_loss']))
    acc_at_min_loss = val_acc_history[best_epoch]
    scores.append(acc_at_min_loss)
    print("Split %i: min loss: %.5f, accuracy at min loss: %.5f"%(
        len(scores), np.min(history['val_loss']), acc_at_min_loss ))
print("Cross Validation Accuracy: %.4f"%(np.mean(scores)))