In [None]:
import os
import warnings
import shutil
warnings.filterwarnings(action='ignore')

import math
import pandas as pd
import librosa
import librosa.display
import numpy as np
import seaborn as sns; sns.set(style='whitegrid')
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from PIL import Image
from tqdm import tqdm,tnrange,tqdm_notebook
import tensorflow as tf
from tqdm.keras import TqdmCallback
from keras.callbacks import ReduceLROnPlateau,EarlyStopping,ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator 
from tensorflow.keras import applications as app
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten,AveragePooling2D
from tensorflow.keras.layers import Dense,BatchNormalization,Dropout
from tensorflow.keras.models import Sequential 
from tensorflow.keras.applications import EfficientNetB4, ResNet50,ResNet101, VGG16, MobileNet, InceptionV3
import pickle

In [None]:
# Global configuration shared by the spectrogram extraction and inference cells.
RANDOM_SEED = 1337  # random_state handed to sklearn.utils.shuffle
SAMPLE_RATE = 32000  # `sr` passed to librosa.load and librosa.feature.melspectrogram
SIGNAL_LENGTH = 5 # seconds
SPEC_SHAPE = (224, 224) # height x width
FMIN = 500  # `fmin` passed to the mel spectrogram
FMAX = 12500  # `fmax` passed to the mel spectrogram
# MAX_AUDIO_FILES = 10000
EPOCHS=50  # NOTE(review): the visible model.fit call uses `num_epochs`, not this value

In [None]:
# Code adapted from:
# https://www.kaggle.com/frlemarchand/bird-song-classification-using-an-efficientnet
# Make sure to check out the entire notebook.

# Load metadata file
train = pd.read_csv('../input/birdclef-2022/train_metadata.csv')

# Limit the number of training samples and classes.
# First, only use high quality samples (rating of 4 or better).
train = train.query('rating>=4')

# Second, assume that birds with the most training samples are also the most common.
# A species is kept when it has at least this many high quality recordings.
MIN_RECORDINGS_PER_SPECIES = 175

# value_counts() keeps every species paired with its own count. Zipping
# `unique()` (order of first appearance) with `groupby(...).count().values`
# (sorted by label) attaches the counts to the wrong species.
birds_count = train.primary_label.value_counts().to_dict()
most_represented_birds = [species for species, count in birds_count.items()
                          if count >= MIN_RECORDINGS_PER_SPECIES]

TRAIN = train.query('primary_label in @most_represented_birds')
# Sorted so that the position of a label is stable between runs.
LABELS = sorted(TRAIN.primary_label.unique())

# Let's see how many species and samples we have left
print('NUMBER OF SPECIES IN TRAIN DATA:', len(LABELS))
print('NUMBER OF SAMPLES IN TRAIN DATA:', len(TRAIN))
print('LABELS:', LABELS)

In [None]:
# Persist the label list to disk with pickle
with open('LABELS.pkl', mode='wb') as f:
    pickle.dump(LABELS, file=f)

In [None]:
# Shuffle the training metadata rows with a fixed random_state.
# NOTE(review): no MAX_AUDIO_FILES limit is applied here; that constant is commented out above.
TRAIN = shuffle(TRAIN, random_state=RANDOM_SEED)

# Define a function that splits an audio file,
# extracts spectrograms and saves them in a working directory
def get_spectrograms(filepath, primary_label, output_dir):
    """Cut a recording into fixed-length chunks and save one mel spectrogram PNG per chunk.

    Args:
        filepath: Path of the audio file to load.
        primary_label: Name of the sub-directory of ``output_dir`` that receives the images.
        output_dir: Root directory of the spectrogram dataset.

    Returns:
        List with the paths of the PNG files written, in chunk order. The list
        is empty when the loaded audio is shorter than one full chunk.
    """
    # Open the file with librosa (limited to the first 15 seconds).
    # `offset` is left at its default instead of passing None.
    sig, _ = librosa.load(filepath, sr=SAMPLE_RATE, duration=15)

    # Split signal into SIGNAL_LENGTH second chunks; a trailing partial chunk is dropped.
    chunk_len = int(SIGNAL_LENGTH * SAMPLE_RATE)
    sig_splits = [sig[start:start + chunk_len]
                  for start in range(0, len(sig) - chunk_len + 1, chunk_len)]
    if not sig_splits:
        return []

    # Everything below is identical for all chunks of this file, so compute it once.
    save_dir = os.path.join(output_dir, primary_label)
    os.makedirs(save_dir, exist_ok=True)
    file_stem = os.path.splitext(os.path.basename(filepath))[0]
    hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))

    # Extract mel spectrograms for each audio chunk
    saved_samples = []
    for s_cnt, chunk in enumerate(sig_splits):
        mel_spec = librosa.feature.melspectrogram(y=chunk,
                                                  sr=SAMPLE_RATE,
                                                  n_fft=1024,
                                                  hop_length=hop_length,
                                                  n_mels=SPEC_SHAPE[0],
                                                  fmin=FMIN,
                                                  fmax=FMAX)

        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize to [0, 1]. A constant spectrogram (e.g. digital silence)
        # has a peak of 0 after the shift; skip the division so the image is
        # all zeros instead of NaN.
        mel_spec -= mel_spec.min()
        peak = mel_spec.max()
        if peak > 0:
            mel_spec /= peak

        # Save as 8-bit grayscale image file
        save_path = os.path.join(save_dir, file_stem + '_' + str(s_cnt) + '.png')
        im = Image.fromarray(mel_spec * 255.0).convert("L")
        im.save(save_path)

        saved_samples.append(save_path)

    return saved_samples

# Report how many metadata rows (one per audio file) remain in TRAIN.
print('FINAL NUMBER OF AUDIO FILES IN TRAINING DATA:', len(TRAIN))

In [None]:
# Walk over every selected recording and turn it into spectrogram images
input_dir = '../input/birdclef-2022/train_audio'
output_dir = '../working/melspectrogram_dataset/'
samples = []
pbar = tqdm(TRAIN.iterrows(), total=len(TRAIN))
for idx, row in pbar:
    # Skip rows whose species was not selected
    if row.primary_label not in most_represented_birds:
        continue
    audio_file_path = os.path.join(input_dir, row.filename)
    samples.extend(get_spectrograms(audio_file_path, row.primary_label, output_dir))

# Shuffle the collected image paths with the shared seed
TRAIN_SPECS = shuffle(samples, random_state=RANDOM_SEED)
print('SUCCESSFULLY EXTRACTED {} SPECTROGRAMS'.format(len(TRAIN_SPECS)))

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import DenseNet201, EfficientNetB0
from tensorflow.keras.layers import GlobalAveragePooling2D, MaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np
import argparse
import os

In [None]:

train_folder = './melspectrogram_dataset'

# Random augmentation is applied to the training subset only.
datagen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.15,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode="nearest",
    preprocessing_function=preprocess_input,
    validation_split=0.2)

# The validation subset uses the same preprocessing and split fraction but no
# augmentation, so validation metrics are measured on unmodified spectrograms.
# NOTE(review): this relies on the training/validation split depending only on
# `validation_split` and the directory listing — confirm against the Keras docs.
valid_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=0.2)

train_generator = datagen.flow_from_directory(train_folder,
                        target_size=(224,224),
                        batch_size=64,
                        seed=2022,
                        shuffle=True,
                        subset = "training",
                        class_mode='categorical')    # one-hot targets
validation_generator = valid_datagen.flow_from_directory(train_folder,
                        target_size=(224,224),
                        batch_size=64,
                        seed=2022,
                        shuffle=True,
                        subset = "validation",
                        class_mode='categorical')    # one-hot targets

In [None]:
import cv2

# Sanity check: load one generated spectrogram and show its array shape.
sample_path = './melspectrogram_dataset/brnowl/XC635289_2.png'
img = cv2.imread(sample_path)
# cv2.imread signals an unreadable/missing file by returning None rather than
# raising, which would otherwise surface as an AttributeError on `.shape`.
if img is None:
    raise FileNotFoundError('Could not read spectrogram image: ' + sample_path)

img.shape

In [None]:
# Training hyperparameters.
LEARNING_RATE = 0.001  # NOTE(review): not referenced by the visible compile/fit calls
num_epochs = 80  # passed as `epochs` to model.fit and used to preview the LR schedule
BATCH_SIZE = 64  # NOTE(review): the generators hard-code batch_size=64 instead of reading this
IMG_SIZE = 224  # NOTE(review): image sizes are written as literals in the visible cells

In [None]:
from datetime import datetime, timedelta

# Wall-clock bookkeeping for this run
start_time = datetime.now()
print('Time now is', start_time)

# Timestamp prefix derived from the start time
this_run_file_prefix = format(start_time, '%Y%m%d_%H%M_')
# Time budget expressed as a timedelta
end_training_by_tdelta = timedelta(seconds=8400)

num_epochs = 80
# EPOCHS = 25
# STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE



# Learning-rate schedule for fine-tuning a pre-trained model.
# The rate ramps up linearly from LR_START to LR_MAX, optionally holds at
# LR_MAX, then decays exponentially towards LR_MIN. Starting with a high rate
# would break the pre-trained weights.

LR_START = 0.00001
LR_MAX = 0.00005 * 8
LR_MIN = 0.00001
LR_RAMPUP_EPOCHS = 5
LR_SUSTAIN_EPOCHS = 0
LR_EXP_DECAY = .8

def lrfn(epoch):
    """Return the learning rate to use for the given epoch index."""
    # Phase 1: linear ramp-up
    if epoch < LR_RAMPUP_EPOCHS:
        return LR_START + (epoch * (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS)

    # Phase 2: hold at the maximum (empty while LR_SUSTAIN_EPOCHS is 0)
    hold_until = LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS
    if epoch < hold_until:
        return LR_MAX

    # Phase 3: exponential decay towards LR_MIN
    return (LR_MAX - LR_MIN) * LR_EXP_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN
    
# Hook the schedule into Keras and preview it over the planned number of epochs
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)
rng = list(range(num_epochs))
y = list(map(lrfn, rng))
plt.plot(rng, y)
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))

In [None]:
 
# EfficientNetB0 backbone with ImageNet weights and without its classification top
base_model = EfficientNetB0(weights='imagenet',include_top=False, input_shape=(224,224,3))

# The classifier width must equal the number of class folders found by the
# training generator; a hard-coded value breaks as soon as the species
# selection changes.
num_classes = train_generator.num_classes

# Construct the head of the model that will be placed on top of the base model
head_model = base_model.output
head_model = GlobalAveragePooling2D()(head_model)
head_model = Flatten(name="flatten")(head_model)
head_model = Dense(num_classes, activation="softmax")(head_model)
model = Model(inputs=base_model.input, outputs=head_model)

# Fine-tune the whole network: every backbone layer stays trainable
for layer in base_model.layers:
    layer.trainable = True
model.summary()
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=["accuracy"])

# Stop when validation accuracy has not improved for 5 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0.0001,
    patience=5,
    verbose=1,
    mode='max',
    baseline=None,
    restore_best_weights=True)

# Floor division yields 0 steps when a subset holds fewer samples than one
# batch, so always run at least one step.
H = model.fit(
    train_generator,
    steps_per_epoch=max(1, train_generator.samples // train_generator.batch_size),
    callbacks=[early_stopping, lr_callback],
    validation_data=validation_generator,
    validation_steps=max(1, validation_generator.samples // validation_generator.batch_size),
    epochs=num_epochs)

In [None]:
# Draw the per-epoch curves recorded in the fit history on one figure
plt.style.use("ggplot")
plt.figure()
curves = [
    ("loss", "train_loss"),
    ("val_loss", "val_loss"),
    ("accuracy", "train_acc"),
    ("val_accuracy", "val_acc"),
]
for history_key, legend_label in curves:
    values = H.history[history_key]
    plt.plot(np.arange(0, len(values)), values, label=legend_label)
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")
plt.show()

In [None]:
# Save the trained model to 'bird_model.h5' in the current working directory.
model.save('bird_model.h5')

In [None]:
test_audio_dir = '../input/birdclef-2022/test_soundscapes/'

# `predict` rebuilds each path as <name> + '.ogg', so keep only .ogg files and
# strip just the extension (a plain split('.') would truncate names that
# contain additional dots).
file_list = [os.path.splitext(f)[0]
             for f in sorted(os.listdir(test_audio_dir))
             if f.endswith('.ogg')]

print('Number of test soundscapes:', len(file_list))

In [None]:
import json

# Load the scored species (iterated in `predict` to build the row ids)
with open('../input/birdclef-2022/scored_birds.json') as sbfile:
    scored_birds = json.loads(sbfile.read())

In [None]:
def _signal_to_chunks(sig):
    """Split a signal into consecutive full-length chunks, dropping a short tail."""
    chunk_len = int(SIGNAL_LENGTH * SAMPLE_RATE)
    return [sig[start:start + chunk_len]
            for start in range(0, len(sig) - chunk_len + 1, chunk_len)]


def _chunk_to_model_input(chunk):
    """Convert one audio chunk into a (height, width, 3) float32 image array."""
    hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))
    mel_spec = librosa.feature.melspectrogram(y=chunk,
                                              sr=SAMPLE_RATE,
                                              n_fft=1024,
                                              hop_length=hop_length,
                                              n_mels=SPEC_SHAPE[0],
                                              fmin=FMIN,
                                              fmax=FMAX)
    mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

    # Normalize to [0, 1]; a constant spectrogram stays all zeros instead of
    # becoming NaN through a 0/0 division.
    mel_spec -= mel_spec.min()
    peak = mel_spec.max()
    if peak > 0:
        mel_spec /= peak

    # The training images were written as 8-bit grayscale PNGs (values 0-255)
    # and the model input has three channels, so reproduce both here instead
    # of feeding a single-channel 0-1 array.
    pixels = np.clip(mel_spec * 255.0, 0, 255).astype(np.uint8).astype(np.float32)
    return np.repeat(pixels[..., np.newaxis], 3, axis=-1)


def predict(threshold, score_scale=0.5):
    """Build the submission table for every soundscape in ``file_list``.

    Each file is cut into SIGNAL_LENGTH second chunks. For every chunk and
    every species in ``scored_birds`` one row is emitted whose ``target`` is
    True when the scaled model score of that species exceeds ``threshold``.
    Species the model was not trained on (absent from ``LABELS``) are always
    False.

    Args:
        threshold: Decision threshold compared against the scaled scores.
        score_scale: Multiplier applied to the model outputs before
            thresholding. Defaults to 0.5, the factor this function
            previously hard-coded.

    Returns:
        pandas.DataFrame with the columns ``row_id``
        (``<file>_<species>_<chunk end time in seconds>``) and ``target``.
    """
    pred = {'row_id': [], 'target': []}
    # Position of each trained species in the model output vector
    label_index = {label: i for i, label in enumerate(LABELS)}

    for afile in file_list:
        # Open it with librosa
        path = test_audio_dir + afile + '.ogg'
        sig, _ = librosa.load(path, sr=SAMPLE_RATE)

        chunks = _signal_to_chunks(sig)
        if not chunks:
            continue

        # Score all chunks of the file in one call; apply the same
        # preprocessing function the training generator used.
        batch = preprocess_input(np.stack([_chunk_to_model_input(c) for c in chunks]))
        probs = score_scale * model.predict(batch, verbose=0)

        for chunk_no, chunk_probs in enumerate(probs):
            chunk_end_time = (chunk_no + 1) * SIGNAL_LENGTH
            for bird in scored_birds:
                class_idx = label_index.get(bird)
                score = chunk_probs[class_idx] if class_idx is not None else 0.0

                # Prepare submission entry
                pred['row_id'].append(afile + '_' + bird + '_' + str(chunk_end_time))
                pred['target'].append(bool(score > threshold))

    result = pd.DataFrame(pred, columns = ['row_id', 'target'])
    return result

In [None]:
# Run inference with a decision threshold of 0.3 and display the resulting frame
result = predict(threshold=0.3)
result

In [None]:
# Write the submission file. DataFrame.to_csv returns None when it is given a
# path, so its return value is not kept.
result.to_csv('submission_bird.csv', index=False)

In [None]:
# Read the written CSV back from disk and display it to check what was saved.
results = pd.read_csv('./submission_bird.csv')
results