In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
from tqdm import tqdm
from PIL import Image
import tensorflow as tf
import cv2
import skimage.io

#to play audio
import librosa
import librosa.display
import sklearn.model_selection as sk
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

In [None]:
# Input/output locations, all relative to the notebook's working directory.
# Directory listed by the clip-enumeration cells: one sub-folder per bird, each holding clips.
train_datadir = "../input/birdclef-2021/train_short_audio"
# Directory of soundscape recordings that are later turned into spectrogram PNGs.
soundscapes = '../input/birdclef-2021/train_soundscapes'
train_csv = "../input/birdclef-2021/train_metadata.csv"
test_csv = "../input/birdclef-2021/test.csv"
train_soundscape = "../input/birdclef-2021/train_soundscape_labels.csv"
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
output_dir = "../output/kaggle/working"

In [None]:
# Audio loading and spectrogram settings shared by the extraction functions.
sample_rate = 32000 # passed to librosa.load as `sr`
duration = 10.0 # passed to librosa.load as `duration`
offset = 1.0 # passed to librosa.load as `offset`
hop_length = 512 # number of samples per time-step in spectrogram
n_mels = 128 # number of bins in spectrogram. Height of image
time_steps = 384 # number of time-steps. Width of image
threshold = 0.01 # top-class scores below this are written as 'nocall' in the submission

In [None]:
# Read the three CSV files into DataFrames.
df_train_metadata = pd.read_csv(train_csv)
df_train_soundscape = pd.read_csv(train_soundscape)
df_test = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the training metadata.
df_train_metadata.head()

In [None]:
# Histogram of the 'rating' column.
df_train_metadata.hist(column='rating', figsize=(10, 5))

In [None]:
# Bar chart of row counts for the 100 most frequent 'primary_label' values.
df_train_metadata['primary_label'].value_counts()[:100].plot(kind="bar", figsize=(20, 10), rot=90)

In [None]:
# Preview the first rows of the soundscape label table.
df_train_soundscape.head()

In [None]:
def scale_minmax(x, min=0.0, max=1.0):
    """Linearly rescale ``x`` so that its values span ``[min, max]``.

    Parameters
    ----------
    x : array-like exposing ``.min()`` / ``.max()`` (e.g. a NumPy array)
        Values to rescale.
    min, max : float
        Lower and upper bound of the output range. The names shadow the
        built-ins inside this function; they are kept for caller compatibility.

    Returns
    -------
    Array of the same shape as ``x``. When every element of ``x`` is equal,
    all outputs are ``min`` instead of NaN.
    """
    lowest = x.min()
    span = x.max() - lowest
    if span == 0:
        # Constant input: (x - lowest) is all zeros, so dividing by 1 maps
        # every value to `min` and avoids the 0/0 -> NaN of a zero span.
        span = 1
    x_std = (x - lowest) / span
    x_scaled = x_std * (max - min) + min
    return x_scaled

In [None]:
def spectrogram_image(y, sr, out, hop_length, n_mels):
    """Render a log-mel spectrogram of ``y`` and save it as an 8-bit image.

    Parameters
    ----------
    y : audio samples handed to ``librosa.feature.melspectrogram``.
    sr : sample rate of ``y``.
    out : destination path passed to ``skimage.io.imsave``.
    hop_length : hop between frames; the FFT size is twice this value.
    n_mels : number of mel bins (image height).
    """
    mel_power = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=n_mels,
        n_fft=hop_length * 2,
        hop_length=hop_length,
    )
    # The small constant keeps log() finite where the power is exactly zero.
    log_mel = np.log(mel_power + 1e-9)

    # Stretch to 0..255 and truncate to bytes.
    pixels = scale_minmax(log_mel, 0, 255).astype(np.uint8)
    # Flip vertically (low frequencies at the bottom), then invert so that
    # darker pixels mean more energy.
    pixels = 255 - np.flip(pixels, axis=0)

    skimage.io.imsave(out, pixels)

In [None]:
def extract_links_and_labels(data_dir):
    """List every clip under ``data_dir``, which holds one sub-folder per bird.

    Returns four parallel lists with one entry per clip:
    ``folder_names`` (bird folder), ``file_names`` (clip name up to the first
    ".ogg"), ``audio_clips`` (``data_dir/bird/clip`` path) and ``labels``
    (bird folder again, one per clip).
    """
    # (bird, clip) pairs in directory-listing order.
    pairs = [
        (species, entry)
        for species in os.listdir(data_dir)
        for entry in os.listdir(data_dir + "/" + species)
    ]

    folder_names = [species for species, _ in pairs]
    file_names = [entry.split(".ogg")[0] for _, entry in pairs]
    audio_clips = [data_dir + "/" + species + "/" + entry for species, entry in pairs]
    labels = list(folder_names)

    return folder_names, file_names, audio_clips, labels

In [None]:
# Enumerate every training clip: parallel lists of bird folder, clip stem, clip path and per-clip label.
folder_names, file_names, train_audio_clips, labels = extract_links_and_labels(train_datadir)

In [None]:
# Rebuild the clip listing inline. Unlike extract_links_and_labels(), `labels`
# ends up holding each bird exactly once (the class vocabulary).
train_audio_clips = []
folder_names = []
file_names = []
birds = os.listdir(train_datadir)

for bird in birds:
    for clip in os.listdir(train_datadir + "/" + bird):
        folder_names.append(bird)
        file_names.append(clip.split(".ogg")[0])
        train_audio_clips.append(train_datadir + "/" + bird + "/" + clip)

# Unique birds that have at least one clip, in sorted order.
# os.listdir() order is arbitrary, but `labels` is later indexed with the
# model's predicted class index. Assuming the one-hot targets were produced by
# pd.get_dummies (as in the commented-out training-data cell), class i is the
# i-th label in sorted order — TODO confirm against the saved model.
# Using a set also avoids a linear `in` scan of the list for every clip.
labels = sorted(set(folder_names))

In [None]:
# Remove the relative directory kaggle/working (note: this is not the same path as `output_dir`).
!rm -rf kaggle/working

In [None]:
!mkdir train_images

In [None]:
def extract_spectogram(path, folder_name, file_name):
    """Cut one training clip into fixed-length windows and save each as a PNG.

    The clip is loaded with the module-level ``offset``, ``duration`` and
    ``sample_rate``. Consecutive, non-overlapping windows of
    ``time_steps * hop_length`` samples are written to
    ``train_images/<folder_name>/<file_name>_<k>.png`` with k = 0, 1, ...
    A trailing remainder shorter than one window is dropped.

    Parameters
    ----------
    path : path of the audio file to load.
    folder_name : sub-folder of ``train_images`` to write into.
    file_name : stem used for the output PNG names.
    """
    length_samples = time_steps * hop_length
    y, sr = librosa.load(path, offset=offset, duration=duration, sr=sample_rate)

    out_dir = 'train_images/' + folder_name
    start_sample = 0  # starting at beginning
    counter = 0
    while start_sample + length_samples <= len(y):
        # extract a fixed length window
        window = y[start_sample: start_sample + length_samples]
        # Created lazily so clips shorter than one window leave no empty folder.
        os.makedirs(out_dir, exist_ok=True)
        # convert to PNG
        spectrogram_image(window,
                          sr=sr,
                          out=out_dir + "/" + file_name + "_" + str(counter) + ".png",
                          hop_length=hop_length,
                          n_mels=n_mels)
        # Advance to the next window; without this the loop never terminates.
        start_sample = start_sample + length_samples
        counter += 1

In [None]:
# extract_spectogram(train_audio_clips[0], folder_names[0], file_names[0])

In [None]:
# for i in tqdm(range(len(train_audio_clips))):
#     extract_spectogram(train_audio_clips[i], folder_names[i], file_names[i])

In [None]:
def imagetensor(imagedir):
    """Load every image in ``imagedir`` as grayscale into one float array.

    Files are visited in ``os.listdir`` order. Each one is read with OpenCV in
    grayscale mode, cut to 32x96 with ``ndarray.resize`` and divided by 255.

    Parameters
    ----------
    imagedir : directory whose entries are all image files.

    Returns
    -------
    numpy.ndarray of dtype float and shape ``(n_images, 32, 96)``;
    ``(0, 32, 96)`` when the directory has no entries.

    Raises
    ------
    ValueError
        If an entry cannot be decoded as an image.
    """
    frames = []
    for name in os.listdir(imagedir):
        image_path = os.path.join(imagedir, name)
        image = cv2.imread(image_path, 0)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise ValueError("Could not read image: " + image_path)

        # NOTE(review): ndarray.resize changes the buffer in place and keeps
        # only the first 32*96 values; it does not rescale the picture the way
        # cv2.resize would. Left as is because the saved model was presumably
        # trained on arrays produced this way — confirm before changing.
        image.resize(32, 96)
        frames.append(np.array(image, dtype=float) / 255)

    if not frames:
        return np.empty((0, 32, 96), dtype=float)
    # One stack at the end instead of np.append per image (which copies the
    # whole accumulated array each time).
    return np.stack(frames, axis=0)

In [None]:
# labels = []
# for label in os.listdir('./train_images/'):
#     if (not '.ipynb' in label and 
#         not 'h5'in label and 
#         not '.npy' in label and 
#         not '.txt' in label and 
#         not 'out' in label):
#         labels.append(label)

In [None]:
# Number of entries in `labels`.
len(labels)

In [None]:
# x_train = np.array([])
# y_train = []
# for idx, label in tqdm(enumerate(tqdm(labels))):
#     if idx == 0:
#         x_train= imagetensor('./train_images/' + label)
#         y_train.extend([label] * len(x_train))
#     else:
#         images = imagetensor('./train_images/' + label)
#         y_train.extend([label] * len(images))
#         x_train = np.vstack((x_train, images))

In [None]:
# print(len(x_train), len(y_train))

In [None]:
# inputs = x_train.reshape((61440, 32, 96, 1))
# outputs = pd.get_dummies(pd.Series(y_train)).to_numpy()

In [None]:
# print(inputs.shape)
# print(outputs.shape)

In [None]:
# np.savez_compressed('batch_reduced_32_96', x=inputs, y=outputs)

In [None]:
# b = os.path.getsize("batch_reduced_32_96.npz")
# print(b)

In [None]:
# Open the pre-computed .npz archive; its 'x' and 'y' entries are read in the following cell.
batch_1 = np.load('../input/bird-outputs/batch_reduced_32_96.npz')

In [None]:
# Model inputs and targets from the archive.
# NOTE(review): presumably spectrogram images and one-hot labels, matching the
# commented-out cell that writes this file — confirm.
inputs = batch_1['x']
outputs = batch_1['y']

In [None]:
# x_train, x_val, y_train, y_val = sk.train_test_split(inputs, outputs, test_size=0.2, random_state = 42)

In [None]:
# print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

In [None]:
SPEC_SHAPE = inputs[0].shape  # shape of a single input sample
OUTPUT_SHAPE = outputs[0].shape  # shape of a single target row
SEED = 8000  # passed to tf.random.set_seed before the model is built

In [None]:
from sklearn.utils import shuffle
# Shuffle inputs and targets together. Seeded with SEED so the row order is
# the same on every run; this also fixes which rows land in any split taken
# from the end of the arrays.
inputs, outputs = shuffle(inputs, outputs, random_state=SEED)

In [None]:
# Small CNN classifier: three Conv -> BatchNorm -> MaxPool blocks, global
# average pooling, two Dense+Dropout layers and a softmax output with
# OUTPUT_SHAPE[0] units. The global TF seed is set first.
tf.random.set_seed(SEED)
model = tf.keras.Sequential([
    
    # First conv block
    # Input is declared as (SPEC_SHAPE[0], SPEC_SHAPE[1], 1), i.e. one channel.
    tf.keras.layers.Conv2D(16, (3, 3), 
                           activation='relu', 
                           input_shape=(SPEC_SHAPE[0], SPEC_SHAPE[1], 1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2, 2)),
    
    # Second conv block
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2, 2)), 
    
    # Third conv block
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2, 2)), 

    # Collapse each feature map to a single value.
    tf.keras.layers.GlobalAveragePooling2D(), 
    
    # Dense block
    tf.keras.layers.Dense(128, activation='relu'),   
    tf.keras.layers.Dropout(0.5),  
    tf.keras.layers.Dense(128, activation='relu'),   
    tf.keras.layers.Dropout(0.5),
    
    # Classification layer
    tf.keras.layers.Dense(OUTPUT_SHAPE[0], activation='softmax')
])
print('MODEL HAS {} PARAMETERS.'.format(model.count_params()))

In [None]:
# Compile the model and specify optimizer, loss and metric
# NOTE(review): `learning_rate_fn` is built here but never passed to the
# optimizer; Adam below uses a constant learning rate of 0.0015.
initial_learning_rate = 0.1
decay_steps = 1.0
decay_rate = 0.5
learning_rate_fn = tf.keras.optimizers.schedules.InverseTimeDecay(
  initial_learning_rate, decay_steps, decay_rate
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0015),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy', tf.keras.metrics.AUC()]
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Training callbacks (used by the commented-out model.fit call).
# Stop after 30 epochs without val_accuracy improvement and restore the best weights.
early_stopping_monitor = EarlyStopping(
    monitor='val_accuracy',
    patience=30,
    verbose=1,
    mode='auto',
    restore_best_weights=True
)

# Save the model to ./best_model.h5 only when val_accuracy improves.
mcp_save = ModelCheckpoint(
    './best_model.h5', 
    save_best_only=True, 
    monitor='val_accuracy',
    verbose=1
)

In [None]:
# model.fit(
#     inputs, 
#     outputs, 
#     epochs=100,
#     batch_size=256, 
#     validation_split=0.2, 
#     callbacks=[early_stopping_monitor, mcp_save]
# )

In [None]:
# !rm rf train_soundscapes

In [None]:
!mkdir train_soundscapes_2

In [None]:
# Parallel lists for every entry in the soundscape directory:
# full path, and the file name up to its first '.'.
soundscape_entries = os.listdir(soundscapes)
soundscape_links = ['/'.join((soundscapes, entry)) for entry in soundscape_entries]
filenames = [entry.split('.')[0] for entry in soundscape_entries]

In [None]:
# Sanity-check the first soundscape path and its stem.
print(soundscape_links[0])
print(filenames[0])
# Module-level accumulator: extract_soundscape_spectogram() appends the name of each PNG it writes.
output_links = []

In [None]:
def extract_soundscape_spectogram(path, file_name):
    """Cut one soundscape into fixed-length windows and save each as a PNG.

    The recording is loaded with the module-level ``offset``, ``duration`` and
    ``sample_rate``. Consecutive, non-overlapping windows of
    ``time_steps * hop_length`` samples are written to
    ``train_soundscapes_2/<file_name>_<tag>.png`` with tag = 5, 10, 15, ...
    Each written PNG name is also appended to the global ``output_links``.

    NOTE(review): the tag advances by 5 per window, presumably meant as a
    seconds marker for the row id; the window length is set by
    ``time_steps * hop_length``, not by 5 seconds — confirm.
    """
    window_len = time_steps * hop_length
    y, sr = librosa.load(path, offset=offset, duration=duration, sr=sample_rate)

    # Start offsets of every window that fits entirely inside the signal.
    window_starts = range(0, len(y) - window_len + 1, window_len)
    for index, begin in enumerate(window_starts):
        tag = 5 * (index + 1)
        png_name = file_name + "_" + str(tag) + ".png"
        spectrogram_image(
            y[begin: begin + window_len],
            sr=sr,
            out='train_soundscapes_2/' + png_name,
            hop_length=hop_length,
            n_mels=n_mels,
        )
        output_links.append(png_name)

In [None]:
# extract_soundscape_spectogram() appends to the global `output_links`.
# Empty it first so re-running this cell does not leave duplicate entries that
# would no longer line up with the prediction rows.
output_links.clear()
for i in tqdm(range(len(soundscape_links))):
    extract_soundscape_spectogram(soundscape_links[i], filenames[i])

In [None]:
# Number of soundscape files vs. number of PNG windows written.
len(soundscape_links), len(output_links)

In [None]:
# Number of entries in `labels`.
len(labels)

In [None]:
# Show the PNG names recorded during extraction.
output_links

In [None]:
# Load the soundscape PNGs in `output_links` order. imagetensor() walks
# os.listdir() order, which need not match the order in which the files were
# written; the submission cell pairs prediction row i with output_links[i].
test_frames = []
for png_name in output_links:
    png_path = os.path.join('./train_soundscapes_2/', png_name)
    frame = cv2.imread(png_path, 0)
    if frame is None:
        raise ValueError("Could not read image: " + png_path)
    # Same preprocessing as imagetensor(): in-place ndarray.resize to 32x96
    # (keeps the first 32*96 values), then scale to 0..1.
    frame.resize(32, 96)
    test_frames.append(np.array(frame, dtype=float) / 255)
x_test = np.stack(test_frames, axis=0)

In [None]:
# Shape of the stacked test images.
x_test.shape

In [None]:
# Add the trailing channel axis. The image count comes from the array itself
# rather than a hard-coded 20, so this works for any number of test images.
x_test = x_test.reshape((len(x_test), 32, 96, 1))
np.savez_compressed('test_reduced_32_96', x=x_test)

In [None]:
from tensorflow import keras
# Rebinds `model` to a previously saved model loaded from disk, replacing the one built earlier in the notebook.
model = keras.models.load_model('../input/bird-outputs/best_model (1).h5')

In [None]:
# p: model output for each test image (one row per image).
p = model.predict(x_test)
# Despite its name, y_prob holds the index of the highest-scoring class per row, not a probability.
y_prob = np.argmax(p, axis=1)

In [None]:
# One submission record per prediction row.
# row_id = first two '_'-separated parts of the PNG name + its last part
# without ".png"; birds = 'nocall' when the top score is below `threshold`,
# otherwise the label at the predicted class index.
submission = []
for i in range(len(p)):
    name_parts = output_links[i].split('_')
    row_id = '_'.join((name_parts[0], name_parts[1], name_parts[-1].split('.png')[0]))
    best_class = y_prob[i]
    birds_value = 'nocall' if p[i][best_class] < threshold else labels[best_class]
    submission.append({'row_id': row_id, 'birds': birds_value})

In [None]:
# Turn the list of {'row_id', 'birds'} records into a DataFrame and preview it.
df_submission = pd.DataFrame(submission)
df_submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
df_submission.to_csv("submission.csv", index=False)