In [None]:
import os

import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
import librosa
import numpy as np

import seaborn as sns 

from sklearn.utils import shuffle
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt

import tensorflow as tf

# Global configuration shared by the spectrogram extraction, training and inference cells.
RANDOM_SEED = 1337  # seed passed to sklearn's shuffle and tf.random.set_seed
SAMPLE_RATE = 32000  # sampling rate (Hz) requested from librosa.load
SIGNAL_LENGTH = 5 # length in seconds of each audio chunk turned into one spectrogram
SPEC_SHAPE = (48, 128) # height x width; height is used as n_mels, width to derive hop_length
FMIN = 500  # fmin passed to librosa.feature.melspectrogram
FMAX = 12500  # fmax passed to librosa.feature.melspectrogram
MAX_AUDIO_FILES = 5000  # TRAIN is truncated to this many recordings after shuffling

In [None]:
# Recording-level metadata; later cells read its `primary_label`, `common_name`,
# `rating` and `filename` columns.
train = pd.read_csv('../input/birdclef-2021/train_metadata.csv')

In [None]:
n_species = train.primary_label.nunique()
print(f'Number of species in the data {n_species}')

# Distribution of the recording quality ratings.
fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(x='rating', data=train, ax=ax)
ax.set_title('Recordings Ratings')

In [None]:
# Number of recordings per common name, reused for both rankings.
name_counts = train['common_name'].value_counts()
highest_recorded = name_counts.sort_values(ascending=False)[:20]
least_recorded = name_counts.sort_values()[:20]

# One panel per ranking, stacked vertically.
fig, (ax_top, ax_bottom) = plt.subplots(2, 1, figsize=(16, 16))
panels = (
    (ax_top, highest_recorded, 'Most Recorded birds'),
    (ax_bottom, least_recorded, 'Least Recorded birds'),
)
for ax, counts, title in panels:
    ax.bar(x=counts.index, height=counts.values)
    ax.tick_params(axis='x', labelrotation=45)
    ax.set_ylabel('Count')
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
train = train.query('rating>=3') # High quality samples

# Count recordings per species with a label-indexed lookup. Pairing
# `unique()` (first-appearance order) with `groupby(...).count().values`
# (sorted-key order) only lines up when the CSV happens to be sorted by label.
label_counts = train.primary_label.value_counts()
birds_count = {bird_species: label_counts[bird_species]
               for bird_species in train.primary_label.unique()}

# Keep only species with enough recordings to learn from.
MIN_RECORDINGS_PER_SPECIES = 30
most_represented_birds = [key for key, value in birds_count.items()
                          if value >= MIN_RECORDINGS_PER_SPECIES]

TRAIN = train.query('primary_label in @most_represented_birds')
# Sorted label list; its order defines the index of each class in the one-hot targets.
LABELS = sorted(TRAIN.primary_label.unique())

print('NUMBER OF SPECIES IN TRAIN DATA:', len(LABELS))
print('NUMBER OF SAMPLES IN TRAIN DATA:', len(TRAIN))
print('LABELS:', most_represented_birds)

Helper that splits an audio signal into consecutive fixed-length chunks (an incomplete trailing chunk is dropped):

In [None]:
def split_sig(sig):
    """Cut `sig` into consecutive, non-overlapping chunks of equal length.

    Each chunk holds `int(SIGNAL_LENGTH * SAMPLE_RATE)` samples. A trailing
    remainder shorter than one full chunk is discarded.

    Args:
        sig: sliceable sequence of samples (anything supporting `len` and slicing).

    Returns:
        List of full-length chunks, in order; empty if `sig` is shorter than
        one chunk.
    """
    chunk_len = int(SIGNAL_LENGTH * SAMPLE_RATE)
    # Start offsets are limited to positions where a complete chunk still fits.
    starts = range(0, len(sig) - chunk_len + 1, chunk_len)
    return [sig[start:start + chunk_len] for start in starts]

In [None]:
# Shuffle reproducibly, then cap the number of recordings to process.
shuffled_train = shuffle(TRAIN, random_state=RANDOM_SEED)
TRAIN = shuffled_train.iloc[:MAX_AUDIO_FILES]

def get_spectrograms(filepath, primary_label, output_dir):
    """Convert one audio file into mel-spectrogram PNG images, one per chunk.

    At most the first 15 seconds of the file are loaded at `SAMPLE_RATE`,
    split with `split_sig`, and each full chunk is turned into a
    dB-scaled mel spectrogram normalized to [0, 1] and saved as an 8-bit
    grayscale image under `output_dir/primary_label/`.

    Args:
        filepath: path of the audio file to read.
        primary_label: label used as the name of the output sub-directory.
        output_dir: root directory for the generated images.

    Returns:
        List of paths of the images that were written. Chunks whose
        spectrogram is constant (no dynamic range) are skipped, because
        normalizing them would divide by zero and yield a NaN image.
    """
    sig, _ = librosa.load(filepath, sr=SAMPLE_RATE, offset=None, duration=15)

    sig_splits = split_sig(sig)

    # Loop-invariant values: hop length chosen so a chunk yields SPEC_SHAPE[1] frames,
    # plus the output location and file name stem.
    hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
    save_dir = os.path.join(output_dir, primary_label)
    file_stem = os.path.splitext(os.path.basename(filepath))[0]

    saved_samples = []
    for s_cnt, chunk in enumerate(sig_splits):

        mel_spec = librosa.feature.melspectrogram(y=chunk,
                                                  sr=SAMPLE_RATE,
                                                  n_fft=1024,
                                                  hop_length=hop_length,
                                                  n_mels=SPEC_SHAPE[0],
                                                  fmin=FMIN,
                                                  fmax=FMAX)

        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize to [0, 1]; a constant spectrogram cannot be scaled.
        mel_spec -= mel_spec.min()
        peak = mel_spec.max()
        if peak == 0:
            continue
        mel_spec /= peak

        # Save as image file (exist_ok avoids the check-then-create race).
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, file_stem + '_' + str(s_cnt) + '.png')
        im = Image.fromarray(mel_spec * 255.0).convert("L")
        im.save(save_path)

        saved_samples.append(save_path)

    return saved_samples

# Report how many recordings remain after shuffling and truncation.
print(f'FINAL NUMBER OF AUDIO FILES IN TRAINING DATA: {len(TRAIN)}')

In [None]:
input_dir = '../input/birdclef-2021/train_short_audio/'
output_dir = '../working/melspectrogram_dataset/'

# Extract spectrogram images for every selected recording and collect their paths.
samples = []
with tqdm(total=len(TRAIN)) as progress:
    for _, recording in TRAIN.iterrows():
        progress.update(1)

        label = recording.primary_label
        if label not in most_represented_birds:
            continue

        source_path = os.path.join(input_dir, label, recording.filename)
        samples.extend(get_spectrograms(source_path, label, output_dir))

# Shuffle so that consecutive images do not come from the same recording.
TRAIN_SPECS = shuffle(samples, random_state=RANDOM_SEED)
print('SUCCESSFULLY EXTRACTED {} SPECTROGRAMS'.format(len(TRAIN_SPECS)))

In [None]:
# Preview up to 24 spectrograms in a 6 x 4 grid; cap at the number actually
# extracted so a small dataset does not raise IndexError.
n_preview = min(24, len(TRAIN_SPECS))
plt.figure(figsize=(20, 10))
for i in range(n_preview):
    plt.subplot(6, 4, i + 1)
    plt.axis('off')
    plt.title(os.path.basename(TRAIN_SPECS[i]))
    # Close the file handle as soon as the pixels have been handed to matplotlib.
    with Image.open(TRAIN_SPECS[i]) as spec:
        plt.imshow(spec, origin='lower')

In [None]:
# Load every spectrogram image into memory and build the matching one-hot targets.
# Samples are gathered in Python lists and stacked once at the end; calling
# np.vstack inside the loop copies the whole array on every iteration.
label_to_index = {label: i for i, label in enumerate(LABELS)}
spec_list, label_list = [], []
with tqdm(total=len(TRAIN_SPECS)) as pbar:
    for path in TRAIN_SPECS:
        pbar.update(1)

        # Context manager releases the file handle right after decoding.
        with Image.open(path) as img:
            spec = np.array(img, dtype='float32')

        # Normalize to [0, 1]; a constant image cannot be scaled and is skipped.
        spec -= spec.min()
        peak = spec.max()
        if peak == 0:
            continue
        spec /= peak

        # Add the trailing channel axis expected by Conv2D.
        spec_list.append(np.expand_dims(spec, -1))

        # The species label is the name of the image's parent directory.
        target = np.zeros((len(LABELS)), dtype='float32')
        bird = path.split(os.sep)[-2]
        target[label_to_index[bird]] = 1.0
        label_list.append(target)

if spec_list:
    train_specs = np.stack(spec_list)    # (n_samples, height, width, 1)
    train_labels = np.stack(label_list)  # (n_samples, n_labels), also for one sample
else:
    # Keep well-formed, empty arrays instead of leaving Python lists behind.
    train_specs = np.empty((0, SPEC_SHAPE[0], SPEC_SHAPE[1], 1), dtype='float32')
    train_labels = np.empty((0, len(LABELS)), dtype='float32')

In [None]:
# Sanity check: display the shape of the label array built in the previous cell.
train_labels.shape

In [None]:
tf.random.set_seed(RANDOM_SEED)


def _conv_block(filters, **conv_kwargs):
    """Return the layers of one CONV --> RELU --> BNORM --> MAXPOOL stage.

    Extra keyword arguments (e.g. `input_shape`) are forwarded to Conv2D.
    """
    return [
        tf.keras.layers.Conv2D(filters, (3, 3), activation='relu', **conv_kwargs),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2, 2)),
    ]


# Tuning ideas noted by the author (translated):
#   - number of conv blocks
#   - kernel (window) size
#   - MaxPooling -> AveragePooling
#   - Dense block 256 -> 64
model = tf.keras.Sequential([
    # First conv block; also declares the single-channel spectrogram input
    *_conv_block(16, input_shape=(SPEC_SHAPE[0], SPEC_SHAPE[1], 1)),
    tf.keras.layers.Dropout(0.08),

    # Second conv block
    *_conv_block(32),
    tf.keras.layers.Dropout(0.08),

    # Third conv block
    *_conv_block(64),
    tf.keras.layers.Dropout(0.08),

    # Fourth conv block
    *_conv_block(128),

    # Global pooling instead of flatten()
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dropout(0.05),

    # Classification layer: one softmax output per label
    tf.keras.layers.Dense(len(LABELS), activation='softmax'),
])
print('MODEL HAS {} PARAMETERS.'.format(model.count_params()))

In [None]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias that
# newer Keras releases ignore or reject. The value itself is task-dependent.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.008),
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.01),
              metrics=['accuracy'])

In [None]:
# Halve the learning rate after 2 epochs without val_loss improvement.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)

# Stop training after 10 epochs without val_loss improvement.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
)

# Keep only the checkpoint with the best val_loss on disk.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=0,
)

callbacks = [reduce_lr, early_stopping, checkpoint]

In [None]:
# Train
history = model.fit(
    x=train_specs,
    y=train_labels,
    epochs=50,  # no more than 100, but enough for the loss to become roughly constant
    batch_size=32,
    validation_split=0.2,  # TODO: try 0.1
    callbacks=callbacks,
)

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='val')
ax.set_title('model loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='val')
ax.set_title('model accuracy')
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Reload the checkpoint that ModelCheckpoint saved for the best val_loss.
model = tf.keras.models.load_model('best_model.h5')

# Soundscape recording to analyse.
soundscape_path = '../input/birdclef-2021/train_soundscapes/28933_SSW_20170408.ogg'

sig, rate = librosa.load(soundscape_path, sr=SAMPLE_RATE)

# Store results
data = {'row_id': [], 'prediction': [], 'score': [], 'prediction2': [], 'score2': [], 'p': []}

# Split the whole recording into SIGNAL_LENGTH-second chunks.
sig_splits = split_sig(sig)

# Loop-invariant values: same hop length as used for the training spectrograms,
# and the row_id prefix (file name without its trailing '_<suffix>' part).
hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
row_id_prefix = soundscape_path.split(os.sep)[-1].rsplit('_', 1)[0]

seconds, scnt = 0, 0
for chunk in sig_splits:
    # End time of the current chunk; tied to the chunk length used by split_sig.
    seconds += SIGNAL_LENGTH

    # Get the spectrogram
    mel_spec = librosa.feature.melspectrogram(y=chunk,
                                              sr=SAMPLE_RATE,
                                              n_fft=1024,
                                              hop_length=hop_length,
                                              n_mels=SPEC_SHAPE[0],
                                              fmin=FMIN,
                                              fmax=FMAX)

    mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Normalize to [0, 1]
    mel_spec -= mel_spec.min()
    mel_spec /= mel_spec.max()

    # Add channel and batch axes: (height, width) -> (1, height, width, 1)
    mel_spec = np.expand_dims(mel_spec, -1)
    mel_spec = np.expand_dims(mel_spec, 0)

    # Predict
    p = model.predict(mel_spec)[0]

    data['p'].append(p)

    # Best prediction
    idx = p.argmax()
    species = LABELS[idx]
    score = p[idx]

    data['row_id'].append(row_id_prefix + '_' + str(seconds))

    # Below the threshold the chunk is labelled "nocall"
    if score > 0.25:
        data['prediction'].append(species)
        scnt += 1
    else:
        data['prediction'].append('nocall')

    data['score'].append(score)

    # Second-best prediction: mask the winner on a COPY, so the probability
    # vector stored in data['p'] is not modified after the fact.
    runner_up = p.copy()
    runner_up[idx] = 0

    idx = runner_up.argmax()
    species = LABELS[idx]
    score = runner_up[idx]

    if score > 0.15:
        data['prediction2'].append(species)
        scnt += 1
    else:
        data['prediction2'].append('')

    data['score2'].append(score)


print('SOUNSCAPE ANALYSIS DONE. FOUND {} BIRDS.'.format(scnt))

In [None]:
# Tabulate the per-chunk predictions (the raw probability vectors in 'p' are left out).
result_columns = ['row_id', 'prediction', 'score', 'prediction2', 'score2']
predictions = pd.DataFrame(data, columns=result_columns)

# Join with the ground-truth labels on row_id so predictions can be inspected side by side.
gt = pd.read_csv('../input/birdclef-2021/train_soundscape_labels.csv')
results = gt.merge(predictions, on='row_id')