In [None]:
import os
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import librosa
import numpy as np

from sklearn.utils import shuffle
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
import librosa.display

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import matplotlib.pyplot as pp

# Global vars
# Seed reused for sklearn's shuffle / train_test_split and tf.random.set_seed.
RANDOM_SEED = 1337
# Sampling rate (Hz) requested from librosa.load for every recording.
SAMPLE_RATE = 32000
SIGNAL_LENGTH = 5 # seconds
SPEC_SHAPE = (48, 128) # height x width
# Frequency bounds passed to librosa.feature.melspectrogram as fmin / fmax.
FMIN = 500
FMAX = 12500
# NOTE(review): not referenced by any visible cell in this notebook — confirm whether it is still needed.
MAX_AUDIO_FILES = 1500

# data loading

In [None]:
# Code adapted from:
# https://www.kaggle.com/frlemarchand/bird-song-classification-using-an-efficientnet
# Make sure to check out the entire notebook.
# Limit the number of training samples and classes.

# A species needs at least this many recordings (after the rating filter)
# to be considered common.
MIN_RECORDINGS_PER_SPECIES = 200

# First, only use high quality samples (rating of 4 or higher)
train = pd.read_csv('../input/birdclef-2021/train_metadata.csv').query('rating>=4')

# Second, assume that birds with the most training samples are also the most common.
# Count recordings per species with a single groupby so that every count is
# paired with its own label; sort=False keeps the labels in order of first
# appearance, i.e. the same order primary_label.unique() would give.
birds_count = train.groupby('primary_label', sort=False).size().to_dict()
most_represented_birds = [
    species for species, n_recordings in birds_count.items()
    if n_recordings >= MIN_RECORDINGS_PER_SPECIES
]

# .copy() makes TRAIN an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
TRAIN = train[train.primary_label.isin(most_represented_birds)].copy()
LABELS = sorted(TRAIN.primary_label.unique())
# # Let's see how many species and samples we have left
print('NUMBER OF SPECIES IN TRAIN DATA:', len(LABELS))
print('NUMBER OF SAMPLES IN TRAIN DATA:', len(TRAIN))
print('LABELS:', most_represented_birds)

# data visualization

In [None]:
# # visualize the dataset after cleaning to determine the number of clusters
# fig,ax=plt.subplots(figsize=(15,10))
# sns.scatterplot('longitude', 'latitude', data=TRAIN, hue='primary_label',ax=ax)
# TRAIN_pos = TRAIN[['latitude','longitude']]
# TRAIN_pos_norm = StandardScaler().fit_transform(TRAIN_pos)
# print("The shape of TRAIN_pos_norm is",TRAIN_pos_norm.shape)

# clustering

In [None]:
# # perform DBSCAN
# db = DBSCAN(eps = 0.6,
#              min_samples = 20,
#              algorithm = 'ball_tree',
#              metric = 'euclidean').fit(TRAIN_pos_norm)
# cluster_labels = db.labels_
# num_clusters = len(set(cluster_labels))
# print("The number of cluster is: ", num_clusters)
# TRAIN['DBSCAN_labels'] = cluster_labels
# TRAIN['DBSCAN_labels'].value_counts().sort_index()
# # visualize the cluster
# fig,ax=plt.subplots(figsize=(15,10))
# sns.scatterplot('longitude', 'latitude', data=TRAIN, hue='DBSCAN_labels',ax=ax)

In [None]:
# # k-means
# model = KMeans(n_clusters=2)
# model.fit(TRAIN_pos_norm)
# kmeans_cluster_labels = model.labels_
# TRAIN['Kmeans_labels'] = kmeans_cluster_labels
# print("Number of clusters and its count: \n", TRAIN['Kmeans_labels'].value_counts().sort_index())
# # visualize the cluster
# fig,ax=plt.subplots(figsize=(15,10))
# x = [-75.85,-84.51,-119.95,-76.45]
# y = [5.57,10.12,38.49,42.47]
# location = ["Jardín, Departamento de Antioquia Colombia", "Alajuela, San Ramón Costa Rica", "Sierra Nevada, California USA", "Ithaca, New York USA"]
# ax.scatter(x, y)
# # train.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,ax=ax)
# sns.scatterplot('longitude', 'latitude', data=TRAIN, hue='Kmeans_labels',ax=ax)

# plt.scatter(x, y, color="red")
# for i, txt in enumerate(location):
#     ax.annotate(txt, (x[i], y[i]))

# plt.legend([],[], frameon=False)
# plt.show()

In [None]:
# fig,ax=plt.subplots(figsize=(15,10))
# sns.scatterplot('longitude', 'latitude', hue="Kmeans_labels",data=TRAIN,ax=ax)

In [None]:
# TRAIN_ = TRAIN[TRAIN.Kmeans_labels == 1]

In [None]:
# TRAIN_ = TRAIN[TRAIN.Kmeans_labels == 0]

# extract audio features

In [None]:
# features = np.empty((0, 60))
# #sig, rate = librosa.load("../input/birdclef-2021/train_short_audio/acafly/XC11209.ogg", sr=SAMPLE_RATE, offset=None)
# sig, rate = librosa.load("../input/birdclef-2021/train_short_audio/acafly/XC109605.ogg",sr=SAMPLE_RATE, offset=None)
# chunk = sig
# hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
        
# # zero crossing rate
# zero_crossing_rate = librosa.feature.zero_crossing_rate(y=chunk, frame_length=2048, hop_length=hop_length)
# print("zero crossing rate shape: ")
# print(zero_crossing_rate.shape)
        
    
# # librosa.feature.spectral_flatness
# spec_flat = librosa.feature.spectral_flatness(y=chunk, S=None,n_fft=1024, hop_length=hop_length, win_length=1024)
# print("spectral flatness shape: ")
# print(spec_flat.shape)
        
# # spectral centroids
# spectral_centroids = librosa.feature.spectral_centroid(y=chunk, sr=SAMPLE_RATE,n_fft=1024,hop_length=hop_length)
# print("spectral centroids shape: ")
# print(spectral_centroids.shape)
        
# # spectral rolloff
# spectral_rolloff = librosa.feature.spectral_rolloff(y=chunk, sr=SAMPLE_RATE,n_fft=1024,hop_length=hop_length)
# print("spectral rolloff shape: ")
# print(spectral_rolloff.shape)

# # spectral contrast
# spectral_contrast = librosa.feature.spectral_contrast(y=chunk, sr=SAMPLE_RATE,n_fft=1024, hop_length=hop_length)
# print("spectral contrast shape: ")
# print(spectral_contrast.shape)
        
# # melspectralgram
# mel_spec = librosa.feature.melspectrogram(y=chunk,sr=SAMPLE_RATE, n_fft=1024, hop_length=hop_length, n_mels=SPEC_SHAPE[0],fmin=FMIN, fmax=FMAX)

# mel_spec = librosa.power_to_db(mel_spec, ref=np.max) 
# # Normalize
# mel_spec -= mel_spec.min()
# mel_spec /= mel_spec.max()
# print("melspectralgram shape: ")
# print(mel_spec.shape)

# chroma_stft = librosa.feature.chroma_stft(y=chunk, 
#                                           sr=SAMPLE_RATE,
#                                           n_fft=1024, 
#                                           hop_length=hop_length)
# # print("chroma_stft shape: ")
# # print(chroma_stft.shape)
# labels = np.full((1,mel_spec.shape[1]), "label")
# print("labels")
# print(labels.shape)
# f=np.concatenate((np.transpose(zero_crossing_rate),
#                   np.transpose(spec_flat),
#                   np.transpose(spectral_centroids),
#                   np.transpose(spectral_rolloff),
#                   np.transpose(spectral_contrast),
#                   np.transpose(mel_spec),
#                   np.transpose(labels)),
#                   axis=1)
# print("f shape: ")
# print(f.shape)    
# features = np.concatenate((features,f),axis=0)
# print("features shape: ")
# print(features.shape)

In [None]:
# Smoke test of get_features on a single recording; "xz" is presumably just a placeholder label.
# NOTE(review): get_features is defined in the cell *after* this one, so on a fresh
# kernel (Restart & Run All) this call raises NameError until that cell has been run.
get_features("../input/birdclef-2021/train_short_audio/acafly/XC109605.ogg", "xz")

In [None]:
# get spectral features
def get_features(filepath, primary_label):
    """Extract per-frame spectral features from the start of an audio file.

    At most the first 15 seconds of the file are loaded (resampled to
    SAMPLE_RATE) and cut into consecutive, non-overlapping chunks of
    SIGNAL_LENGTH seconds; a trailing partial chunk is dropped. For every
    chunk the following librosa features are computed and stacked column-wise,
    one row per analysis frame:

        zero crossing rate | spectral flatness | spectral centroid |
        spectral rolloff | spectral contrast bands | normalized mel bins | label

    Parameters
    ----------
    filepath : str
        Path of the audio file to read with librosa.
    primary_label : str
        Label written into the last column of every returned row.

    Returns
    -------
    numpy.ndarray
        2-D array with the rows of all chunks concatenated. Because the label
        column is concatenated with the numeric columns, NumPy promotes the
        whole array to a string dtype; callers must convert the feature
        columns back to float. If the file yields no full chunk, an empty
        float array of shape (0, 60) is returned.
    """
    chunk_len = int(SIGNAL_LENGTH * SAMPLE_RATE)
    # Loop-invariant: the hop size is derived from the target spectrogram
    # width (SPEC_SHAPE[1]), so compute it once instead of once per chunk.
    hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))

    # Open the file with librosa (the returned sample rate is not needed)
    sig, _ = librosa.load(filepath, sr=SAMPLE_RATE, offset=None, duration=15)

    # Collect one block per chunk and concatenate once at the end, instead of
    # re-copying the growing result array on every iteration.
    chunk_blocks = []

    # Only start positions that leave room for a full chunk are visited.
    for start in range(0, len(sig) - chunk_len + 1, chunk_len):
        chunk = sig[start:start + chunk_len]

        # zero crossing rate
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y=chunk, frame_length=2048, hop_length=hop_length)

        # spectral flatness
        spec_flat = librosa.feature.spectral_flatness(y=chunk, S=None, n_fft=1024, hop_length=hop_length, win_length=1024)

        # spectral centroids
        spectral_centroids = librosa.feature.spectral_centroid(y=chunk, sr=SAMPLE_RATE, n_fft=1024, hop_length=hop_length)

        # spectral rolloff
        spectral_rolloff = librosa.feature.spectral_rolloff(y=chunk, sr=SAMPLE_RATE, n_fft=1024, hop_length=hop_length)

        # spectral contrast
        spectral_contrast = librosa.feature.spectral_contrast(y=chunk, sr=SAMPLE_RATE, n_fft=1024, hop_length=hop_length)

        # mel spectrogram (in dB relative to the chunk's maximum)
        mel_spec = librosa.feature.melspectrogram(y=chunk,
                                                  sr=SAMPLE_RATE,
                                                  n_fft=1024,
                                                  hop_length=hop_length,
                                                  n_mels=SPEC_SHAPE[0],
                                                  fmin=FMIN,
                                                  fmax=FMAX)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize to [0, 1]. A constant spectrogram (e.g. a silent chunk)
        # has a zero peak after the shift; skip the division in that case so
        # the features stay 0 instead of turning into NaN (0/0).
        mel_spec -= mel_spec.min()
        peak = mel_spec.max()
        if peak > 0:
            mel_spec /= peak

        # One label entry per frame so it can be stacked as the last column
        labels = np.full((1, mel_spec.shape[1]), primary_label)

        chunk_blocks.append(
            np.concatenate((np.transpose(zero_crossing_rate),
                            np.transpose(spec_flat),
                            np.transpose(spectral_centroids),
                            np.transpose(spectral_rolloff),
                            np.transpose(spectral_contrast),
                            np.transpose(mel_spec),
                            np.transpose(labels)),
                           axis=1))

    if not chunk_blocks:
        # Same empty result as before: 60 columns matches the feature layout
        # under the current settings (other cells rely on this width).
        return np.empty((0, 60))
    return np.concatenate(chunk_blocks, axis=0)

In [None]:
# # Define a function that splits an audio file, 
# # extracts spectrograms and saves them in a working directory
# def get_spectrograms(filepath, primary_label, output_dir):
    
#     # Open the file with librosa (limited to the first 60 seconds)
#     sig, rate = librosa.load(filepath, sr=SAMPLE_RATE, offset=None)
    
#     # Split signal into five second chunks
#     sig_splits = []
#     for i in range(0, len(sig), int(SIGNAL_LENGTH * SAMPLE_RATE)):
#         split = sig[i:i + int(SIGNAL_LENGTH * SAMPLE_RATE)]

#         # End of signal?
#         if len(split) < int(SIGNAL_LENGTH * SAMPLE_RATE):
#             break
        
#         sig_splits.append(split)
        
#     # Extract mel spectrograms for each audio chunk
#     s_cnt = 0
#     saved_samples = []
#     for chunk in sig_splits:
#         hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
#         # librosa.feature.spectral_flatness
        
# #         spec_flat = librosa.feature.spectral_flatness(y=chunk, 
# #                                                       S=None, 
# #                                                       n_fft=1024, 
# #                                                       hop_length=hop_length, 
# #                                                       win_length=1024, 
# #                                                       window='hann', 
# #                                                       center=True, 
# #                                                       pad_mode='reflect', 
# #                                                       amin=1e-10, 
# #                                                       power=2.0)
        
        
# #         spec_flat_mean = np.mean(spec_flat)
# #         a = spec_flat[0]
# #         #sns.scatterplot(a, list(range(len(a))))
# #         spec_flat_bag.append(spec_flat[0])
        
#         mel_spec = librosa.feature.melspectrogram(y=chunk, 
#                                                   sr=SAMPLE_RATE, 
#                                                   n_fft=1024, 
#                                                   hop_length=hop_length, 
#                                                   n_mels=SPEC_SHAPE[0], 
#                                                   fmin=FMIN, 
#                                                   fmax=FMAX)
    
#         mel_spec = librosa.power_to_db(mel_spec, ref=np.max) 
        
#         # Normalize
#         mel_spec -= mel_spec.min()
#         mel_spec /= mel_spec.max()
            
#         # display spectrum
# #         plt.figure(figsize=(15, 5))
# #         librosa.display.specshow(mel_spec, 
# #                         sr=32000, 
# #                         hop_length=hop_length, 
# #                         x_axis='time', 
# #                         y_axis='mel',
# #                         fmin=FMIN, 
# #                         fmax=FMAX, 
# #                         cmap=plt.get_cmap('viridis'))
         
            
#         # Save as image file
#         save_dir = os.path.join(output_dir, primary_label)
#         if not os.path.exists(save_dir):
#             os.makedirs(save_dir)
#         save_path = os.path.join(save_dir, filepath.rsplit(os.sep, 1)[-1].rsplit('.', 1)[0] + 
#                                 '_' + str(s_cnt) + '.png')
#         im = Image.fromarray(mel_spec * 255.0).convert("L")
#         im.save(save_path)
        
#         saved_samples.append(save_path)
#         s_cnt += 1
        
# #         save_path_bag.append(save_path)
        
#     return saved_samples


In [None]:
# Root folder of the short training recordings
input_dir = '../input/birdclef-2021/train_short_audio/'
# output_dir = '../working/melspectrogram_dataset/cluster0/'

# Shuffled copy of the filtered metadata (fixed seed => same order on every run)
TRAIN_1 = shuffle(TRAIN, random_state=RANDOM_SEED)

In [None]:
import os

# Start from a clean slate: the feature CSV is opened in append mode further
# down, so a file left over from a previous run would be extended rather than
# replaced. A missing file (fresh session) is not an error.
try:
    os.remove("../working/features.csv")
except FileNotFoundError:
    pass

In [None]:
import numpy as np

# Scratch experiment: append four random 10x10 arrays to one file through a
# single binary-mode handle using np.savetxt. The `with` block guarantees the
# handle is closed even if a write fails.
with open('asd.dat', 'ab') as scratch_file:
    for _ in range(4):
        np.savetxt(scratch_file, np.random.rand(10, 10))

In [None]:
import csv

# Stream the features of every recording to disk, one block of rows per file.
# The file is opened in append mode, so rows accumulate across recordings
# (and across runs, unless the file was removed beforehand).
# Using `with` ensures the handle is actually flushed and closed; the previous
# bare `f.close` only referenced the method without calling it.
with open('../working/features.csv', 'ab') as f:
    for idx, row in TRAIN_1.iterrows():
        audio_file_path = os.path.join(input_dir, row.primary_label, row.filename)
        features = get_features(audio_file_path, row.primary_label)
        # fmt='%s' because the array holds strings (numeric columns + label)
        np.savetxt(f, features, delimiter=",", fmt='%s')

In [None]:
# Show (rows, columns) of the feature table.
# NOTE(review): pd_df is only created in the next cell, so on a fresh kernel
# (Restart & Run All) this line raises NameError.
pd_df.shape

In [None]:
# Read the feature CSV in pieces of one million rows and stitch them together.
# The file is written with np.savetxt and therefore has no header row;
# header=None stops pandas from consuming the first data row as column names.
chunk = pd.read_csv('../working/features.csv', chunksize=1000000, header=None)
pd_df = pd.concat(chunk)

In [None]:
# Peek at the second row (position 1) of the feature table
pd_df.iloc[1]

In [None]:
import time  # was missing: time.time() below raised NameError

from dask import dataframe as dd

# Time how long dask needs for read_csv.
# NOTE(review): 'huge_data.csv' is not created by any visible cell of this
# notebook — confirm the intended path (possibly the features CSV written above).
# NOTE(review): dask reads lazily, so this presumably measures only the setup
# of the read, not a full pass over the data — confirm.
start = time.time()
dask_df = dd.read_csv('huge_data.csv')
end = time.time()
print("Read csv with dask: ",(end-start),"sec")

In [None]:
# Extract the features of every recording in TRAIN_1 and combine them into
# one array. The per-file blocks are collected in a list and concatenated
# once at the end; concatenating inside the loop re-copies the whole result
# for every file (quadratic in the number of rows).
# The leading empty (0, 60) block keeps the result well-defined (and 60 columns
# wide) even when no recording yields any rows.
feature_blocks = [np.empty((0, 60))]
for idx, row in tqdm(TRAIN_1.iterrows(), total=len(TRAIN_1)):
    audio_file_path = os.path.join(input_dir, row.primary_label, row.filename)
    feature_blocks.append(get_features(audio_file_path, row.primary_label))
features = np.concatenate(feature_blocks, axis=0)
            
# TRAIN_SPECS_1 = shuffle(samples_1, random_state=RANDOM_SEED)
# print('SUCCESSFULLY EXTRACTED {} SPECTROGRAMS'.format(len(TRAIN_SPECS_1)))

In [None]:
# (number of frames, number of columns incl. the label column)
print(np.shape(features))

In [None]:
from sklearn.model_selection import train_test_split

# Last column holds the species label, everything before it the features.
y = features[:, -1]
# The label column makes `features` a string array, so the numeric columns
# must be converted back to floats explicitly before scaling / fitting.
X = features[:, :-1].astype(np.float64)

# NOTE(review): rows are individual frames, so frames of the same recording can
# land in both splits — consider splitting per recording to avoid leakage.
X_train, X_test, Y_train_label, Y_test_label = train_test_split(X, y, test_size=0.25, random_state=RANDOM_SEED)

In [None]:
# Transforming non numerical labels into numerical labels
from sklearn import preprocessing

# Fit the encoder exactly once, on the labels of both splits, so that train
# and test share one label -> integer mapping. Re-fitting on the test labels
# (as before) silently produces a different mapping whenever the two splits
# do not contain the identical set of species.
encoder = preprocessing.LabelEncoder()
encoder.fit(np.concatenate([Y_train_label, Y_test_label]))

# encoding train labels
Y_train = encoder.transform(Y_train_label)

# encoding test labels
Y_test = encoder.transform(Y_test_label)

In [None]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to both the training and the test split.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

In [None]:
# Libraries to tune a support vector classifier (these were never imported,
# so the cell failed with NameError on GridSearchCV / SVC).
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Parameter grid: RBF kernel over gamma x C, linear kernel over C only
params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
# Performing 5-fold CV to tune parameters for best SVM fit
svm_model = GridSearchCV(SVC(), params_grid, cv=5)
svm_model.fit(X_train_scaled, Y_train)

In [None]:
# Estimator refitted with the best parameter combination found by the grid search
final_model = svm_model.best_estimator_

# Mean cross-validated score of that combination
print('Best score for training data:', svm_model.best_score_,"\n")

# Report the winning hyper-parameters
for caption, value in (('Best C:', final_model.C),
                       ('Best Kernel:', final_model.kernel),
                       ('Best Gamma:', final_model.gamma)):
    print(caption, value, "\n")

# Predict the held-out split and map the integer classes back to species labels
Y_pred = final_model.predict(X_test_scaled)
Y_pred_label = list(encoder.inverse_transform(Y_pred))

In [None]:
# Metrics helpers (these were never imported, so the cell failed with NameError)
from sklearn.metrics import classification_report, confusion_matrix

# Making the Confusion Matrix
#print(pd.crosstab(Y_test_label, Y_pred_label, rownames=['Actual Activity'], colnames=['Predicted Activity']))
print(confusion_matrix(Y_test_label,Y_pred_label))
print("\n")
print(classification_report(Y_test_label,Y_pred_label))

print("Training set score for SVM: %f" % final_model.score(X_train_scaled , Y_train))
print("Testing  set score for SVM: %f" % final_model.score(X_test_scaled  , Y_test ))

# Score of the tuned search object on the test split. The bare attribute
# reference `svm_model.score` only displayed the bound method without calling it.
svm_model.score(X_test_scaled, Y_test)

In [None]:
# samples_1 = []
# with tqdm(total=len(TRAIN_1)) as pbar:
#     for idx, row in TRAIN_1.iterrows():
#         pbar.update(1)
#         audio_file_path = os.path.join(input_dir, row.primary_label, row.filename)
#         samples_1 += get_spectrograms(audio_file_path, row.primary_label, output_dir)
            
# TRAIN_SPECS_1 = shuffle(samples_1, random_state=RANDOM_SEED)
# print('SUCCESSFULLY EXTRACTED {} SPECTROGRAMS'.format(len(TRAIN_SPECS_1)))

Nice! These are good samples. Notice how some of them only contain a fraction of a bird call? That's an issue we won't deal with in this tutorial. We will simply ignore the fact that samples might not contain any bird sounds.

# 4. Load training samples

For now, our spectrograms reside in a working directory. If we want to train a model, we have to load them into memory. Yet, with potentially hundreds of thousands of extracted spectrograms, an in-memory dataset is not a good idea. But for now, loading samples from disk and combining them into a large NumPy array is fine. It’s the easiest way to use these data for training with Keras.

In [None]:
# Parse all samples and add spectrograms into train data, primary_labels into label data.
# NOTE(review): in the visible notebook TRAIN_SPECS_1 is only assigned inside
# commented-out cells; it must exist before this cell can run.
#
# Samples are collected in lists and stacked once at the end: stacking inside
# the loop re-copies the whole dataset for every image, and left the labels
# one-dimensional when only a single sample was loaded.
spec_list, label_list = [], []
for path in tqdm(TRAIN_SPECS_1, total=len(TRAIN_SPECS_1)):
    # Open image and convert to numpy array (the file handle is closed right away)
    with Image.open(path) as img:
        spec = np.array(img, dtype='float32')

    # Normalize between 0.0 and 1.0 and exclude samples with nan
    spec -= spec.min()
    spec /= spec.max()
    if not spec.max() == 1.0 or not spec.min() == 0.0:
        continue

    # Add channel axis to 2D array; the batch axis is created by np.stack below
    spec_list.append(np.expand_dims(spec, -1))

    # One-hot target: the species is the name of the image's parent folder
    target = np.zeros((len(LABELS)), dtype='float32')
    bird = path.split(os.sep)[-2]
    target[LABELS.index(bird)] = 1.0
    label_list.append(target)

if spec_list:
    train_specs_1 = np.stack(spec_list)     # (samples, height, width, 1)
    train_labels_1 = np.stack(label_list)   # (samples, len(LABELS))
else:
    # No usable sample: keep well-formed, empty arrays
    train_specs_1 = np.empty((0, SPEC_SHAPE[0], SPEC_SHAPE[1], 1), dtype='float32')
    train_labels_1 = np.empty((0, len(LABELS)), dtype='float32')

In [None]:
# Shape of the stacked spectrogram array built in the previous cell
# (each sample was given a trailing channel axis and a leading batch axis there).
train_specs_1.shape

# 5. Build a simple model

Alright, our dataset is ready, now we need to define a model architecture. In this tutorial, we’ll use a very simplistic, AlexNet-like design with four convolutional layers and three dense layers. It might make sense to choose an off-the-shelf TF model that was pre-trained on audio data, but we would need to adjust the inputs (i.e., the resolution of our spectrograms) to fit the external model. So we keep it simple and build our own model.

In [None]:
# Make sure your experiments are reproducible
tf.random.set_seed(RANDOM_SEED)

# The network is assembled as a flat list of layers:
#   * four convolutional blocks, each CONV(3x3, ReLU) --> BNORM --> MAXPOOL(2x2),
#     with 16, 32, 64 and 128 filters respectively
#   * global average pooling (instead of flatten())
#   * two DENSE(256, ReLU) --> DROPOUT(0.5) blocks
#   * a softmax classification layer with one unit per label
# (Well it's a multi-label task so sigmoid might actually be a better choice)
conv_filters = (16, 32, 64, 128)
layer_stack = []

for block_idx, n_filters in enumerate(conv_filters):
    # Only the very first layer declares the input shape (height, width, 1 channel)
    extra_kwargs = {'input_shape': (SPEC_SHAPE[0], SPEC_SHAPE[1], 1)} if block_idx == 0 else {}
    layer_stack.append(tf.keras.layers.Conv2D(n_filters, (3, 3), activation='relu', **extra_kwargs))
    layer_stack.append(tf.keras.layers.BatchNormalization())
    layer_stack.append(tf.keras.layers.MaxPooling2D((2, 2)))

# Global pooling instead of flatten()
layer_stack.append(tf.keras.layers.GlobalAveragePooling2D())

# Dense block
for _ in range(2):
    layer_stack.append(tf.keras.layers.Dense(256, activation='relu'))
    layer_stack.append(tf.keras.layers.Dropout(0.5))

# Classification layer
layer_stack.append(tf.keras.layers.Dense(len(LABELS), activation='softmax'))

model = tf.keras.Sequential(layer_stack)
print('MODEL HAS {} PARAMETERS.'.format(model.count_params()))

This is not a huge CNN, it only has ~200,000 parameters. Yet, we also only have a very small dataset with just 27 classes.

Next, we need to specify an optimizer, initial learning rate, a loss function and a metric.

In [None]:
# Compile the model and specify optimizer, loss and metric.
# `learning_rate` is the supported keyword for the initial step size; the old
# `lr` alias is deprecated and rejected by newer Keras releases.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.01),
              metrics=['accuracy'])

Callbacks make our life easier, the three that we're adding will take care of saving the best checkpoint, they will reduce the learning rate whenever the training process stalls, and they will stop the training if the model is overfitting.

In [None]:
# All three callbacks watch the validation loss.

# Halve the learning rate after 2 epochs without improvement
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                   patience=2,
                                                   verbose=1,
                                                   factor=0.5)

# Abort training after 5 epochs without improvement
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              verbose=1,
                                              patience=5)

# Keep only the best checkpoint on disk
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='best_model.h5',
                                                monitor='val_loss',
                                                verbose=0,
                                                save_best_only=True)

callbacks = [lr_schedule, early_stop, checkpoint]

Here we go, everything is in place, let's train a model. We'll use 20% of our training data for validation and we'll train for at most 50 epochs (early stopping may end the training sooner).

In [None]:
# Training configuration: mini-batches of 32, 20% of the samples held out for
# validation, the callbacks defined above, and an upper limit of 50 epochs.
fit_options = dict(batch_size=32,
                   validation_split=0.2,
                   callbacks=callbacks,
                   epochs=50)

# Let's train the model for a few epochs
model.fit(train_specs_1, train_labels_1, **fit_options)


# 6. Soundscape analysis

In this tutorial, we will simply pick a soundscape from the training data, but the overall process can easily be automated and then applied to all soundscape files. And again, we have to load a file with Librosa, extract spectrograms for 5-second chunks, pass each chunk through the model and eventually assign a label to the 5-second audio chunk.

Let's use a soundscape that actually contains some of the species that we trained our model for. The file "28933_SSW_20170408.ogg" seems to contain a lot of Song Sparrow (sonspa) vocalizations, let's try this one then.

In [None]:
# Load the best checkpoint
model = tf.keras.models.load_model('best_model.h5')

# Pick a soundscape
soundscape_path = '../input/birdclef-2021/train_soundscapes/28933_SSW_20170408.ogg'
#soundscape_path = '../input/birdclef-2021/train_soundscapes/11254_COR_20190904.ogg'

# Open it with librosa
sig, rate = librosa.load(soundscape_path, sr=SAMPLE_RATE)

# Store results so that we can analyze them later
data = {'row_id': [], 'prediction': [], 'score': []}

# Split signal into chunks of SIGNAL_LENGTH seconds, dropping a trailing
# partial chunk (well, this could actually be a separate function)
chunk_len = int(SIGNAL_LENGTH * SAMPLE_RATE)
sig_splits = [sig[start:start + chunk_len]
              for start in range(0, len(sig) - chunk_len + 1, chunk_len)]

# Values that do not change from chunk to chunk are computed once up front
hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
# File name without its last '_'-separated part, used as row_id prefix
row_id_prefix = soundscape_path.split(os.sep)[-1].rsplit('_', 1)[0]
# Minimum confidence for reporting a species instead of "nocall"
score_threshold = 0.25

# Get the spectrograms and run inference on each of them
# This should be the exact same process as we used to
# generate training samples!
seconds, scnt = 0, 0
for chunk in sig_splits:

    # Keep track of the end time of each chunk. Advance by the configured
    # chunk length rather than a hard-coded 5, so the row ids stay correct
    # if SIGNAL_LENGTH is changed.
    seconds += SIGNAL_LENGTH

    # Get the spectrogram
    mel_spec = librosa.feature.melspectrogram(y=chunk,
                                              sr=SAMPLE_RATE,
                                              n_fft=1024,
                                              hop_length=hop_length,
                                              n_mels=SPEC_SHAPE[0],
                                              fmin=FMIN,
                                              fmax=FMAX)

    mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Normalize to match the value range we used during training.
    # That's something you should always double check!
    mel_spec -= mel_spec.min()
    mel_spec /= mel_spec.max()

    # Add channel axis to 2D array
    mel_spec = np.expand_dims(mel_spec, -1)

    # Add new dimension for batch size
    mel_spec = np.expand_dims(mel_spec, 0)

    # Predict
    p = model.predict(mel_spec)[0]

    # Get highest scoring species
    idx = p.argmax()
    species = LABELS[idx]
    score = p[idx]

    # Prepare submission entry
    data['row_id'].append(row_id_prefix + '_' + str(seconds))

    # Decide if it's a "nocall" or a species by applying a threshold
    if score > score_threshold:
        data['prediction'].append(species)
        scnt += 1
    else:
        data['prediction'].append('nocall')

    # Add the confidence score as well
    data['score'].append(score)

print('SOUNSCAPE ANALYSIS DONE. FOUND {} BIRDS.'.format(scnt))

Ok, we found a few bird species with a score above the threshold. Let's look at the results and see how well we're actually doing.

In [None]:
# Ground truth annotations of the training soundscapes
gt = pd.read_csv('../input/birdclef-2021/train_soundscape_labels.csv')

# Turn the collected predictions into a data frame and attach them to the
# ground truth rows that share the same row_id, so we can inspect both side by side
results = gt.merge(pd.DataFrame(data, columns=['row_id', 'prediction', 'score']),
                   on='row_id')

# Let's look at the first 50 entries
results.head(50)

Ok, that's not too bad. We actually got some of these Song Sparrow (sonspa) vocalizations. Well, and we missed others... We also didn't detect the Northern Cardinal (norcar) and Red-winged Blackbird (rewbla) even though we had them in our training data.

This is a good example for the difficulties we're facing when analyzing soundscapes. Focal recordings as training data can be misleading and soundscapes have much higher noise levels (and also contain very faint bird calls).

Now it's your turn to find better strategies to cope with this shift in acoustic domains. Please don't hesitate to leave a comment or start a new forum thread if you have any questions.