# Check GPU

In [None]:
# List the GPUs visible to this kernel (`-L` prints one line per device).
!nvidia-smi -L

# Data Extraction

In [None]:
!unzip ../input/freesound-audio-tagging-2019/train_curated.zip -d /kaggle/working/train_curated | sh &> /dev/null
!unzip ../input/freesound-audio-tagging-2019/test.zip -d /kaggle/working/test | sh &> /dev/null

# Libraries

In [None]:
import os
import IPython
import librosa
import warnings
import librosa.display
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from skmultilearn.model_selection import iterative_train_test_split

# Global plot style and log-noise reduction for the rest of the notebook.
plt.style.use("ggplot")
warnings.filterwarnings("ignore", category=FutureWarning)
# NOTE(review): this is set after `import tensorflow`, so it presumably only
# affects messages logged from this point on — confirm, or set it before the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# When True, the noisy training set is concatenated with the curated one.
FULL_TRAINING = False

# Competition Metrics

In [None]:
class LWLRAP(tf.keras.metrics.Metric):
    """Streaming label-weighted label-ranking average precision (lwlrap).

    For every positive label of every sample, the metric records the precision
    of the ranked prediction list at that label's rank. Precisions and positive
    counts are accumulated per class; `result()` averages the per-class mean
    precision weighted by each class's share of all positive labels.

    Args:
        num_classes: Number of classes (size of the last axis of `y_true` and
            `y_pred`).
        name: Metric name; also the key under which Keras reports it.
    """

    def __init__(self, num_classes, name="weighted_label_ranking_average_precision"):
        super().__init__(name=name)

        # Running sum of precision-at-rank for the positives of each class.
        self._precisions = self.add_weight(
            name="per_class_cumulative_precision",
            shape=[num_classes],
            initializer="zeros",
        )

        # Running number of positives seen for each class.
        self._counts = self.add_weight(
            name="per_class_cumulative_count",
            shape=[num_classes],
            initializer="zeros",
        )

    @staticmethod
    def _one_sample_positive_class_precisions(example):
        """Return a dense per-class vector of precision-at-rank for one sample.

        Args:
            example: Tuple `(y_true, y_pred)` of 1-D tensors for a single
                sample; non-zero entries of `y_true` mark the positive classes.

        Returns:
            A 1-D float tensor of length `y_pred.shape[0]`. Positive classes
            hold their precision-at-rank; every other class holds 0.
        """
        y_true, y_pred = example

        # Classes ordered from highest to lowest score, and the inverse
        # permutation giving the 0-based rank of every class.
        retrieved_classes = tf.argsort(y_pred, direction="DESCENDING")
        class_rankings = tf.argsort(retrieved_classes)

        # Number of positives found at or above each rank.
        retrieved_class_true = tf.gather(y_true, retrieved_classes)
        retrieved_cumulative_hits = tf.math.cumsum(
            tf.cast(retrieved_class_true, tf.float32)
        )

        idx = tf.where(y_true)[:, 0]                  # indices of positive classes
        i = tf.boolean_mask(class_rankings, y_true)   # their 0-based ranks
        r = tf.gather(retrieved_cumulative_hits, i)   # hits up to each rank
        c = 1 + tf.cast(i, tf.float32)                # items retrieved up to each rank
        precisions = r / c

        # Scatter back so the result is aligned with the class axis.
        dense = tf.scatter_nd(idx[:, None], precisions, [y_pred.shape[0]])
        return dense

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate per-class precisions and positive counts for one batch.

        `sample_weight` is accepted for API compatibility but is not used.
        """
        precisions = tf.map_fn(
            fn=self._one_sample_positive_class_precisions,
            elems=(y_true, y_pred),
            fn_output_signature=(tf.float32),
        )

        # A positive label always has precision > 0 (it counts itself as a
        # hit), so the non-zero entries are exactly the positive labels.
        increments = tf.cast(precisions > 0, tf.float32)
        total_increments = tf.reduce_sum(increments, axis=0)
        total_precisions = tf.reduce_sum(precisions, axis=0)

        self._precisions.assign_add(total_precisions)
        self._counts.assign_add(total_increments)

    def result(self):
        """Return the overall lwlrap, or 0 if no positive label has been seen."""
        per_class_lwlrap = self._precisions / tf.maximum(self._counts, 1.0)
        # divide_no_nan yields 0 instead of NaN while all counts are still zero.
        per_class_weight = tf.math.divide_no_nan(
            self._counts, tf.reduce_sum(self._counts)
        )
        overall_lwlrap = tf.reduce_sum(per_class_lwlrap * per_class_weight)
        return overall_lwlrap

    def reset_state(self):
        """Zero both accumulators."""
        # Assign explicit zeros: multiplying by 0 would let a NaN/inf persist.
        self._precisions.assign(tf.zeros_like(self._precisions))
        self._counts.assign(tf.zeros_like(self._counts))

# Data Preparation

## Load Dataset CSV

In [None]:
# Load the competition CSVs. The training frames provide the `fname` and
# `labels` columns used below; the sample submission provides `fname` followed
# by the class columns.
df_train_curated = pd.read_csv('/kaggle/input/freesound-audio-tagging-2019/train_curated.csv')
df_train_noisy = pd.read_csv('/kaggle/input/freesound-audio-tagging-2019/train_noisy.csv')
df_sample_submission = pd.read_csv('/kaggle/input/freesound-audio-tagging-2019/sample_submission.csv')

## Add Directory Column

In [None]:
# Attach the extracted-file path of every clip as a `dir` column by formatting
# each file name into the path template of its extraction directory.
df_train_curated['dir'] = df_train_curated.fname.map('/kaggle/working/train_curated/{}'.format)
df_train_noisy['dir'] = df_train_noisy.fname.map('/kaggle/working/train_noisy/{}'.format)
df_sample_submission['dir'] = df_sample_submission.fname.map('/kaggle/working/test/{}'.format)

In [None]:
# Train on curated + noisy clips when FULL_TRAINING is set, otherwise on the
# curated clips only (in that case df_train is the same object as
# df_train_curated, not a copy).
df_train = (
    pd.concat([df_train_curated, df_train_noisy], axis=0).reset_index(drop=True)
    if FULL_TRAINING
    else df_train_curated
)
df_train

## Label Encoding

In [None]:
# Give every distinct value of the raw `labels` string (i.e. every label
# combination) an integer id, in order of first appearance.
LABELS = df_train.labels.unique().tolist()
label_idx = dict(zip(LABELS, range(len(LABELS))))
df_train["labels_idx"] = df_train.labels.map(label_idx)

In [None]:
# Class names are the submission columns after the leading file-name column,
# minus the helper `dir` path column. Selecting by name rather than slicing off
# the last column keeps this cell correct even when `dir` is absent.
labels = [col for col in df_sample_submission.columns[1:] if col != 'dir']
label_to_column = {name: j for j, name in enumerate(labels)}

# Multi-hot encode the comma-separated label strings: one row per clip, one
# column per class.
y_train = np.zeros((len(df_train), len(labels)), dtype=np.uint8)

for i, y in enumerate(df_train.labels.tolist()):
    for y_i in y.split(','):
        y_train[i, label_to_column[y_i]] = 1

y_train = pd.DataFrame(y_train, columns=labels)

In [None]:
# Put the clip metadata and the multi-hot label columns side by side.
# pd.concat(axis=1) aligns rows on the index, so both frames are assumed to
# carry the same default 0..n-1 index.
df = pd.concat([df_train, y_train], axis=1)
df

## Data Generator

In [None]:
class AudioDataset(tf.keras.utils.Sequence):
    """Keras `Sequence` that turns audio file paths into batches of log-mel images.

    Each clip is loaded at `sampling_rate`, stripped of leading/trailing
    silence, cut or zero-padded to `samples` samples, converted to a log-mel
    spectrogram, optionally replicated to three channels and rescaled to
    0..255, resized to `size`, and finally divided by 255.

    Args:
        dirs: Indexable, sliceable collection of audio file paths.
        labels: Array of targets aligned with `dirs`, or None for inference.
        batch_size: Number of clips per batch.
    """

    sampling_rate = 44100
    duration = 1  # seconds of audio kept per clip
    # With librosa's default centered framing, 44100 samples at this hop give
    # 44100 // 347 + 1 = 128 frames, i.e. as many frames as mel bands.
    hop_length = 347*duration
    fmin = 20
    fmax = sampling_rate // 2
    n_mels = 128
    n_fft = n_mels * 20
    samples = sampling_rate * duration
    use_three_channels = True
    size = (n_mels, n_mels)  # (height, width) of the images fed to the model

    def __init__(self, dirs, labels, batch_size=32):
        super().__init__()
        self.dirs = dirs
        self.labels = labels
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        # Plain int(): `np.int` no longer exists in current NumPy releases.
        return int(np.ceil(len(self.dirs) / self.batch_size))

    @classmethod
    def get_audio(cls, dir, do_trim=True):
        """Load one clip and bring it to at least `cls.samples` samples.

        Args:
            dir: Path of the audio file.
            do_trim: If True, clips longer than `cls.samples` are cut to their
                first `cls.samples` samples; if False they are returned at
                full length.

        Returns:
            `(audio, sr)`: a float32 waveform and its sampling rate. Clips
            shorter than `cls.samples` are zero-padded symmetrically.
        """
        audio, sr = librosa.load(dir, sr=cls.sampling_rate)
        if 0 < len(audio):
            # Strip leading and trailing silence.
            audio, _ = librosa.effects.trim(audio)
        if len(audio) > cls.samples:
            if do_trim:
                audio = audio[:cls.samples]
        else:
            # Centre the clip in a zero-padded window. Padding only happens
            # here, so the pad width can never be negative.
            padding = cls.samples - len(audio)
            offset = padding // 2
            audio = np.pad(audio, (offset, padding - offset), 'constant')
        return audio.astype(np.float32), sr

    @classmethod
    def audio_to_spectrogram(cls, audio, sr):
        """Convert a waveform into a float32 log-mel spectrogram (dB).

        Args:
            audio: 1-D waveform. It is not modified.
            sr: Sampling rate; falls back to `cls.sampling_rate` when falsy.

        Returns:
            2-D float32 array with `cls.n_mels` rows (mel bands) and one
            column per frame.
        """
        # Not in place, so the caller's array is left untouched.
        # NOTE(review): librosa.load already returns floating-point samples;
        # this int16-style rescale looks inherited from a PCM pipeline and
        # lowers every power value before power_to_db — confirm it is wanted.
        audio = audio / 32768
        spectrogram = librosa.feature.melspectrogram(
            y=audio,
            sr=sr or cls.sampling_rate,
            n_mels=cls.n_mels,
            hop_length=cls.hop_length,
            n_fft=cls.n_fft,
            fmin=cls.fmin,
            fmax=cls.fmax
        )
        spectrogram = librosa.power_to_db(spectrogram)
        spectrogram = spectrogram.astype(np.float32)
        return spectrogram

    @staticmethod
    def mono_to_color(x, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
        """Replicate a 2-D array to three channels and rescale it to 0..255.

        Args:
            x: 2-D array.
            mean: Scalar or per-channel mean to subtract; defaults to the mean
                of the stacked array.
            std: Scalar or per-channel std to divide by; defaults to the std
                of the stacked array.
            norm_max: Upper clipping bound applied after standardization;
                defaults to the standardized maximum.
            norm_min: Lower clipping bound applied after standardization;
                defaults to the standardized minimum.
            eps: Guards the division by `std` and is the threshold below which
                the standardized image is treated as flat.

        Returns:
            Float array of shape `x.shape + (3,)` with values in 0..255, or
            all zeros if the standardized image is flat.
        """
        x = np.stack([x, x, x], -1)
        # `is None` rather than `or`: an explicit 0 must be honoured, and
        # ndarray arguments have no unambiguous truth value.
        mean = np.asarray(x.mean() if mean is None else mean)
        std = np.asarray(x.std() if std is None else std)
        x_std = (x - mean) / (std + eps)
        x_std_min = x_std.min()
        x_std_max = x_std.max()
        norm_min = x_std_min if norm_min is None else norm_min
        norm_max = x_std_max if norm_max is None else norm_max

        if (x_std_max - x_std_min) > eps:
            v = np.clip(x_std, norm_min, norm_max)
            v = 255 * (v - norm_min) / (norm_max - norm_min)
        else:
            # Flat image: return zeros with the same dtype as the normal branch.
            v = np.zeros_like(x_std)
        return v

    def __getitem__(self, idx):
        """Return batch `idx` as `(images, labels)`.

        `images` is the stacked, resized batch divided by 255; `labels` is the
        matching slice of `self.labels`, or None when no labels were given.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size
        dirs = self.dirs[start:stop]
        labels = None if self.labels is None else self.labels[start:stop]

        audios = [self.get_audio(path) for path in dirs]
        audios = [self.audio_to_spectrogram(audio, sr) for audio, sr in audios]
        if self.use_three_channels:
            audios = [
                self.mono_to_color(x, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
                for x in audios
            ]
        # NOTE(review): with use_three_channels disabled the spectrograms stay
        # 2-D here — confirm tf.image.resize accepts that before turning it off.
        audios = [tf.image.resize(x, self.size) for x in audios]
        audios = np.stack(audios) / 255
        return audios, labels

In [None]:
# Features are the file paths as an (n, 1) column, which is the 2-D shape
# passed to the splitter; targets are the multi-hot label columns that remain
# after removing the metadata columns.
x = df.dir.to_numpy()[:, np.newaxis]
y = df.drop(columns=["fname", "labels", "dir", "labels_idx"]).to_numpy()

# Multi-label stratified 80/20 split into train+val / test, then another
# 80/20 split of the remainder into train / val.
x_train, y_train, x_test, y_test = iterative_train_test_split(x, y, test_size=0.2)
x_train, y_train, x_val, y_val = iterative_train_test_split(x_train, y_train, test_size=0.2)

# Back to flat 1-D arrays of paths.
x_train, x_val, x_test = (paths.ravel() for paths in (x_train, x_val, x_test))

In [None]:
# One batch generator per split, all with the default batch size.
train_dataset, val_dataset, test_dataset = (
    AudioDataset(paths, targets)
    for paths, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

# Model

## Definition

In [None]:
# Backbone: ResNet50V2 without its classification top, randomly initialised,
# taking three-channel images of the size produced by AudioDataset.
resnet = tf.keras.applications.ResNet50V2(
    weights=None,
    include_top=False,
    input_shape=AudioDataset.size + (3,),
)

# Head: flatten the backbone features, two ELU dense layers with dropout, and
# an 80-unit sigmoid output (independent per-class probabilities).
classifier_head = tf.keras.Sequential(
    layers=[
        tf.keras.layers.Flatten(name='flatten'),
        tf.keras.layers.Dense(1024, activation='elu', name='dense_1'),
        tf.keras.layers.Dropout(0.25, name='dropout_1'),
        tf.keras.layers.Dense(1024, activation='elu', name='dense_2'),
        tf.keras.layers.Dropout(0.25, name='dropout_2'),
        tf.keras.layers.Dense(80, activation='sigmoid'),
    ],
    name='classifier',
)
# `classifier` is the head's output tensor, not the Sequential itself.
classifier = classifier_head(resnet.output)

model = tf.keras.Model(inputs=resnet.inputs, outputs=classifier, name='resnet')

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, decay=1e-4/500)
model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=[LWLRAP(80)],
)
model.summary()

## Training

In [None]:
# Both callbacks watch the validation loss (their default monitor): stop after
# 5 epochs without improvement and restore the best weights; lower the
# learning rate after 2 epochs without improvement.
es = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)
lr_sched = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", patience=2)

# Keep only the per-epoch history dict from the fit result.
train_history = model.fit(
    x=train_dataset,
    epochs=500,
    validation_data=val_dataset,
    callbacks=[es, lr_sched],
).history

# Final score on the held-out test split.
test_history = model.evaluate(x=test_dataset)

# Export Prediction to CSV

In [None]:
# Predict the test clips and write the submission file.
# The submission is built as a new frame instead of mutating
# df_sample_submission, so the cell can be re-run and cells that read
# df_sample_submission keep seeing its `dir` column. Building the score
# columns directly from `prediction` also avoids writing floats into the
# template's existing columns.
prediction = model.predict(AudioDataset(df_sample_submission.dir.to_numpy(), None))

id_column = df_sample_submission.columns[0]
label_columns = [col for col in df_sample_submission.columns[1:] if col != 'dir']
assert prediction.shape == (len(df_sample_submission), len(label_columns)), (
    "prediction shape does not match the submission template"
)

df_submission = pd.concat(
    [
        df_sample_submission[[id_column]].reset_index(drop=True),
        pd.DataFrame(prediction, columns=label_columns),
    ],
    axis=1,
)
df_submission.to_csv('submission.csv', index=False)

# Training Plot

In [None]:
# Per-epoch series recorded during training, pulled out of the history dict.
loss, val_loss = (train_history[key] for key in ('loss', 'val_loss'))

lwrap, val_lwrap = (
    train_history[key]
    for key in (
        'weighted_label_ranking_average_precision',
        'val_weighted_label_ranking_average_precision',
    )
)

lr = train_history['lr']

## Loss

In [None]:
# Training vs. validation loss per epoch (epochs numbered from 1).
fig, ax = plt.subplots(figsize=(16, 4))
epoch_axis = range(1, len(loss) + 1)
ax.plot(epoch_axis, loss, label='Training')
ax.plot(epoch_axis, val_loss, 'o--', label='Validation')
ax.set(
    title='BCE Loss',
    xlabel="Epoch",
    ylabel="Loss",
    xlim=(1, len(loss)),
)
ax.legend()
fig.savefig('loss.png')

## Weighted Label Ranking Average Precision

In [None]:
# Training vs. validation LWLRAP per epoch (epochs numbered from 1).
# The x range comes from the plotted series itself, and the axis label uses
# the metric's actual abbreviation.
fig, ax = plt.subplots(figsize=(16, 4))
ax.set_title('Weighted Label Ranking Average Precision')
ax.plot(range(1, len(lwrap) + 1), lwrap, label='Training')
ax.plot(range(1, len(val_lwrap) + 1), val_lwrap, 'o--', label='Validation')
ax.set_xlim(1, len(lwrap))
ax.set_xlabel("Epoch")
ax.set_ylabel("LWLRAP")
ax.legend()
fig.savefig('lwrap.png')

## Scheduler Learning Rate

In [None]:
# Learning rate recorded at each epoch (epochs numbered from 1).
fig, ax = plt.subplots(figsize=(16, 4))
ax.plot(range(1, len(lr) + 1), lr, 'o--')
ax.set(
    title='Scheduler Learning Rate',
    xlabel="Epoch",
    ylabel="Learning Rate",
    xlim=(1, len(lr)),
)
fig.savefig('lr.png')