## Libraries

In [None]:
!pip install -U tensorflow==2.8

In [None]:
import numpy as np
import pandas as pd
import json
from sklearn import preprocessing
import os
from tqdm import tqdm
import librosa
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
# Show which TensorFlow version is actually loaded in this session
# (the notebook pins 2.8 in its install cell).
print(tf.__version__)

## Config

In [None]:
# Data:
# Directory prefix prepended to each metadata filename when loading training audio.
TRAIN_DIR = '../input/birdclef-2022/train_audio/'
# Root output directory; spectrogram PNGs are written into per-class subfolders of it.
IMAGES_DIR = 'images/'
# Sampling rate (Hz) requested from librosa when audio is loaded.
SAMPLE_RATE = 32000
# Fraction of the spectrogram images held out as the validation subset.
VAL_SIZE = 0.2

# Data processing:
# FFT size; the same value is passed as the window length of the mel spectrogram.
N_FFT = 2048
# Hop between successive STFT frames, in samples.
HOP_LEN = 512
# Name of the STFT window function.
WIN_FUNC = 'hann'
# Number of mel bands requested from the mel spectrogram.
N_MELS = 224
# Lowest / highest mel filter frequency in Hz; F_MAX is half the sampling rate.
F_MIN = 0
F_MAX = SAMPLE_RATE / 2

# Learning process:
# Output file names: saved model, architecture diagram, weight checkpoint.
NAME_MODEL_0 = "model_0_inst.h5"
NAME_MODEL_0_PIC = 'model_0_pic.png'
NAME_MODEL_0_CHECKPOINT = 'model_0_cp.ckpt'
# Size the images are resized to when the datasets are built; also the model input size.
IMAGE_HEIGHT = 256
IMAGE_WIDTH = 256
# Number of images per dataset batch.
BATCH_SIZE = 32
# Channel count of the model input (the datasets are loaded with color_mode='rgb').
N_CHANNELS = 3
# Number of training epochs passed to model.fit.
EPOCHS = 5
# Callbacks handed to model.fit: checkpoint the weights only (not the full
# model) to NAME_MODEL_0_CHECKPOINT.
CALL_BACKS = [tf.keras.callbacks.ModelCheckpoint(
    filepath=NAME_MODEL_0_CHECKPOINT,
    save_weights_only=True,
    verbose=0
)]

## Load data

In [None]:
# Read the competition's training metadata; its `primary_label` and `filename`
# columns are used further down to encode classes and to locate audio files.
train_metadata = pd.read_csv('/kaggle/input/birdclef-2022/train_metadata.csv')
# Preview the first rows (rendered as the cell output).
train_metadata.head()

In [None]:
# Names listed in scored_birds.json, read into `valid_classes`:
with open('/kaggle/input/birdclef-2022/scored_birds.json', 'r') as f:
    valid_classes = json.load(f)

# One primary label (species code) per metadata row:
primary_labels = train_metadata.primary_label

# Map every species code to an integer id and store the ids as uint8:
encoder = preprocessing.LabelEncoder()
labels = encoder.fit_transform(primary_labels).astype(np.uint8)

# Number of distinct encoded ids:
NUM_CLASSES = np.unique(labels).size

## Convert audio files into spectrograms

In [None]:
# Load audio:
def loadAudio(filename: str) -> np.ndarray:
    """Read an audio file as a mono float32 signal at SAMPLE_RATE.

    Args:
        filename: Path of the audio file to read.

    Returns:
        The samples returned by `librosa.load`; the sampling rate that
        librosa returns alongside them is discarded.
    """
    load_options = dict(sr=SAMPLE_RATE, mono=True, dtype=np.float32)
    samples, _unused_rate = librosa.load(filename, **load_options)
    return samples


# Cut the signal into fixed-length frames (5 sec in this notebook):
def framing(sig: np.ndarray, sample_rate: int, frame_len: int, duration_time: float) -> np.ndarray:
    """Split a 1-D signal into consecutive frames, zero-padding the last one.

    Args:
        sig: 1-D array of audio samples.
        sample_rate: Number of samples per second in `sig`.
        frame_len: Length of one frame in seconds.
        duration_time: Duration of `sig` in seconds. Kept so existing callers
            keep working; the number of frames is derived from the samples
            themselves, so it can never disagree with the data.

    Returns:
        A float64 array of shape (num_frames, int(frame_len * sample_rate)).
        Frames hold consecutive, non-overlapping slices of `sig`; whatever is
        left unfilled in the last frame stays zero. An empty signal yields an
        array with zero frames.

    Raises:
        ValueError: If `frame_len * sample_rate` is shorter than one sample.
    """
    frame_size = int(frame_len * sample_rate)
    if frame_size <= 0:
        raise ValueError("frame_len * sample_rate must be at least one sample")

    num_samples = sig.shape[0]
    # Integer ceiling division: a trailing partial frame still gets its own row.
    num_frames = -(-num_samples // frame_size)

    framed_sig = np.zeros((num_frames, frame_size))
    # The freshly allocated array is contiguous, so reshape(-1) is a view and
    # a single flat copy fills every frame while leaving the tail zero-padded.
    framed_sig.reshape(-1)[:num_samples] = sig

    return framed_sig


# Convert audio frame into spectrogram:
def createSpectrogram(frame: np.ndarray) -> np.ndarray:
    """Compute a mel spectrogram of one audio frame, expressed in decibels.

    Args:
        frame: 1-D array of audio samples sampled at SAMPLE_RATE.

    Returns:
        The mel spectrogram in dB relative to its own maximum (`ref=np.max`),
        so the loudest bin is 0 dB and every other value is negative.
    """
    specgram = librosa.feature.melspectrogram(
        y=frame,
        sr=SAMPLE_RATE,
        n_fft=N_FFT,
        hop_length=HOP_LEN,
        win_length=N_FFT,
        window=WIN_FUNC,
        center=True,
        pad_mode='reflect',
        power=2.0,
        n_mels=N_MELS,
        fmin=F_MIN,
        fmax=F_MAX,
        dtype=np.float32
    )
    # power=2.0 yields a *power* spectrogram, so it must be converted with
    # power_to_db (10 * log10). amplitude_to_db (20 * log10) would double every
    # dB value and waste half of the dynamic range kept after clipping.
    specgram = librosa.power_to_db(specgram, ref=np.max)

    return specgram


# Save spectrogram as png:
def saveSpectogram(specgram: np.ndarray, filename: str, label: np.uint8, ind: int) -> None:
    """Write one dB spectrogram to a PNG inside its class folder.

    The image is stored as
    `IMAGES_DIR/class_<label>/<bird>_<file id>_<end second>.png`, where
    `<end second>` is `(ind + 1) * 5`.

    Args:
        specgram: Spectrogram in dB; values are expected in [-80, 0].
        filename: Metadata path of the source recording, `<bird>/<file>.<ext>`.
        label: Encoded class id of the recording; selects the class folder.
        ind: Zero-based index of the frame within the recording.
    """
    bird_name = os.path.basename(os.path.dirname(filename))
    file_id = os.path.splitext(os.path.basename(filename))[0]

    # Use the `label` argument rather than module-level state, so the result
    # does not depend on whatever loop variable the caller happens to use.
    class_dir = IMAGES_DIR + 'class_' + str(label) + '/'
    os.makedirs(class_dir, exist_ok=True)
    file_name = class_dir + bird_name + '_' + file_id + '_' + str((ind + 1) * 5) + '.png'

    # Shift -80 dB to 0, then clip so that out-of-range values cannot wrap
    # around when cast to uint8 pixel values.
    pixels = np.clip(specgram + 80, 0, 255).astype(np.uint8)

    # imsave writes the array directly; no pyplot figure or axes is needed.
    plt.imsave(file_name, pixels)
    

# Common function:
def convertAudio(filename: str, label: np.uint8) -> None:
    """Convert one training recording into per-frame spectrogram images.

    The recording is loaded from TRAIN_DIR, cut into 5-second frames, and
    every frame is turned into a spectrogram and saved.

    Args:
        filename: Path of the recording relative to TRAIN_DIR.
        label: Encoded class id forwarded to `saveSpectogram`.
    """
    samples = loadAudio(filename=TRAIN_DIR + filename)
    clip_seconds = librosa.get_duration(y=samples, sr=SAMPLE_RATE)
    chunks = framing(
        sig=samples,
        sample_rate=SAMPLE_RATE,
        frame_len=5,
        duration_time=clip_seconds
    )

    for chunk_index, chunk in enumerate(chunks):
        saveSpectogram(
            specgram=createSpectrogram(frame=chunk),
            filename=filename,
            label=label,
            ind=chunk_index
        )

In [None]:
# Create dirs for each class (exist_ok so that the cell can be re-run):
os.makedirs(IMAGES_DIR, exist_ok=True)
for class_id in np.unique(labels):
    os.makedirs(IMAGES_DIR + 'class_' + str(class_id) + '/', exist_ok=True)

# Convert audio into spectrograms:
# NOTE: the index is deliberately still called `i`; check the helper functions
# for reads of a global `i` before renaming it.
for i in tqdm(range(train_metadata.shape[0])):
    convertAudio(
        filename=train_metadata.filename.iloc[i],
        label=labels[i]
    )

## Preparing data

In [None]:
# Options common to both subsets. The training and validation calls share the
# same seed and validation_split.
_split_options = dict(
    directory=IMAGES_DIR,
    image_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
    color_mode='rgb',
    label_mode='categorical',
    batch_size=BATCH_SIZE,
    shuffle=True,
    seed=42,
    validation_split=VAL_SIZE,
)

# Dataset with the training spectrograms
train_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    subset="training",
    **_split_options
)

# Dataset with the validation spectrograms
valid_dataset = tf.keras.preprocessing.image_dataset_from_directory(
    subset="validation",
    **_split_options
)

In [None]:
# Function to prepare our datasets for modelling
def prepare(ds, augment=False):
    """Rescale a batched image dataset and optionally augment it.

    Args:
        ds: Dataset yielding `(images, labels)` pairs.
        augment: When True, a random horizontal/vertical flip and a random
            rotation (factor 0.2) are applied after rescaling.

    Returns:
        The dataset with pixel values multiplied by 1/255 and, if requested,
        augmented. Labels pass through unchanged.
    """
    preprocessing_layers = tf.keras.layers.experimental.preprocessing
    rescale = tf.keras.Sequential([preprocessing_layers.Rescaling(1./255)])
    flip_and_rotate = tf.keras.Sequential([
        preprocessing_layers.RandomFlip("horizontal_and_vertical"),
        preprocessing_layers.RandomRotation(0.2),
    ])

    def _rescale(images, targets):
        return rescale(images, training=True), targets

    def _augment(images, targets):
        return flip_and_rotate(images, training=True), targets

    scaled = ds.map(_rescale)
    if not augment:
        return scaled
    return scaled.map(_augment)


# Both subsets are only rescaled here; augmentation is switched off for each.
train_dataset = prepare(train_dataset, augment=False)
valid_dataset = prepare(valid_dataset, augment=False)

## Build and fit the model

In [None]:
def getModel():
    """Build and compile the CNN classifier.

    The network has three convolution stages (32, 64 and 128 filters), each
    being Conv2D -> BatchNormalization -> MaxPooling2D -> BatchNormalization,
    followed by a 256-unit dense layer with batch normalisation and dropout,
    and a softmax output with one unit per distinct value in `labels`.

    Returns:
        The compiled `tf.keras` Sequential model (categorical cross-entropy,
        RMSprop, tracking accuracy, precision and recall).
    """
    layers = tf.keras.layers

    stack = [layers.Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, N_CHANNELS))]

    # (filters, strides) of every convolution stage; only the first one strides.
    for filters, strides in ((32, 2), (64, 1), (128, 1)):
        stack += [
            layers.Conv2D(filters, 3, strides=strides, padding='same', activation='relu'),
            layers.BatchNormalization(),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.BatchNormalization(),
        ]

    # Classification head
    stack += [
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(np.unique(labels).shape[0], activation='softmax'),
    ]

    model = tf.keras.models.Sequential()
    for layer in stack:
        model.add(layer)

    tracked_metrics = [
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ]
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.RMSprop(),
        metrics=tracked_metrics,
    )

    return model


def plotMetrics(history):
    """Plot every recorded training metric, one figure per metric.

    Each key of `history.history` that does not start with `val_` gets its
    own figure. When a matching `val_<name>` series exists it is drawn on the
    same axes, so the function also works for runs without validation data or
    with extra logged values.

    Args:
        history: Object with a `history` dict mapping metric names to
            per-epoch value lists, such as the one returned by `model.fit`.
    """
    curves = history.history
    # Pair curves by name instead of assuming that the first half of the keys
    # are training metrics and the second half their validation twins.
    train_names = [name for name in curves if not name.startswith('val_')]

    for name in train_names:
        plt.figure(figsize=(24, 6))
        plt.plot(curves[name], c="darkblue")
        legend = ["Train"]

        val_name = 'val_' + name
        if val_name in curves:
            plt.plot(curves[val_name], c="crimson")
            legend.append("Validation")

        plt.legend(legend)
        plt.title("Model " + name)
        plt.xlabel("Epoch")
        plt.ylabel(name)
        plt.grid(True, alpha=0.2)
        plt.show()

In [None]:
# Build the compiled CNN and write a diagram of its layers (with tensor
# shapes) to NAME_MODEL_0_PIC.
model = getModel()
tf.keras.utils.plot_model(model, NAME_MODEL_0_PIC, show_shapes=True)
# Train for EPOCHS epochs, validating on valid_dataset; CALL_BACKS holds the
# weights-only ModelCheckpoint defined in the config cell.
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=valid_dataset,
    callbacks=CALL_BACKS
)
# Save the trained model to NAME_MODEL_0.
model.save(NAME_MODEL_0)

In [None]:
# Draw the curves recorded in `history` by the training run above.
plotMetrics(history)

In [None]:
#model.save(NAME_MODEL_0)

## Make prediction

In [None]:
# Disabled draft of the inference / submission loop. It is wrapped in a string
# literal, so the cell does not execute it. Before re-enabling, check at least:
#   * `model.predict(specgram)` receives the raw 2-D uint8 spectrogram, while
#     the model is built for (IMAGE_HEIGHT, IMAGE_WIDTH, N_CHANNELS) inputs and
#     was trained on PNG images rescaled by 1/255.
#   * NOTE(review): the image datasets infer class indices from the
#     `class_<n>` folder names, presumably in sorted-name order, which may not
#     match the encoder's integer ids - confirm before passing the argmax to
#     `encoder.inverse_transform`.
"""pred = {
    'row_id': list(),
    'target': list()
}

test_path = '/kaggle/input/birdclef-2022/test_soundscapes/'
files = [f.split('.')[0] for f in sorted(os.listdir(test_path))]

birds_path = '/kaggle/input/birdclef-2022/scored_birds.json'
with open(birds_path) as bf:
    birds = json.load(bf)
    
for f in files:
    p = test_path + f + '.ogg'
    
    sig = loadAudio(filename=p)
    duration_sig = librosa.get_duration(y=sig, sr=SAMPLE_RATE)
    sig_framed = framing(sig=sig, sample_rate=SAMPLE_RATE, frame_len=5, duration_time=duration_sig)
    
    for i in range(sig_framed.shape[0]):
        # Get prediction:
        specgram = createSpectrogram(frame=sig_framed[i])
        specgram = specgram + 80 # -80 dB -> Min
        specgram = specgram.astype(np.uint8) # 0 - 255 the pixel value

        y_pred = model.predict(specgram)
        predicted_class = np.argmax(y_pred)

        for b in birds:  
            segment_end = (i + 1) * 5
            row_id = f + '_' + b + '_' + str(segment_end)
            pred['row_id'].append(row_id)
            label_inv = encoder.inverse_transform([predicted_class])
            if label_inv[0] == b:
                pred['target'].append(True)
            else:
                pred['target'].append(False)"""

In [None]:
#cols = ['row_id','target']
#df_sub = pd.DataFrame(pred, columns=cols)

In [None]:
#df_sub.to_csv('/kaggle/working/submission.csv', index=False)