# Bibliography

Arai, H. (2020). 6th place solution and some thoughts. Retrieved
from https://www.kaggle.com/competitions/birdsong-recognition/discussion/183204

Henkel, C., Pfeiffer, P., & Singer, P. (2021). Recognizing bird species in diverse
soundscapes under weak supervision. 

Arai, H. (2020). Introduction to Sound Event Detection. Retrieved from https://kaggle.com/hidehisaarai1213/introduction-to-sound-event-detection

Kong, Q., Cao, Y., Iqbal, T., Wang, Y., Wang, W., & Plumbley, M. D. (2020). PANNs: Large-scale pretrained audio neural networks for audio pattern recognition. arXiv:1912.10211 [cs, eess]. Retrieved from http://arxiv.org/abs/1912.10211. doi: 10.48550/arXiv.1912.10211


In [None]:
import ast
import importlib.util
import json
import os
import sys
import typing
from hashlib import sha1
from types import SimpleNamespace

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import sklearn.metrics as metrics
import torch
import torchaudio as ta
from librosa import display
from torch import Tensor, nn
from torch.utils import data

In [None]:
# Notebook-wide containers; later cells attach their state to these.
resources = SimpleNamespace()  # file and directory paths
dataset = SimpleNamespace()  # data indexes, datasets, data loaders and label mappings
cfg = SimpleNamespace()  # hyper-parameters

### Paths

In [None]:
# Base path for kaggle data
resources.base = "/kaggle"
# Directory with the model code, pre-trained weights and checkpoints
resources.model = f"{resources.base}/input/pannscnn14att"

# Pre-trained weights from Kong et al. (2020) https://zenodo.org/record/3987831. Trained on AudioSet
# I use the "Cnn14_DecisionLevelAtt_mAP" variant
# My checkpoints are fine tuned on-top of this
resources.panns_cnn14_att_weights = f"{resources.model}/Cnn14_DecisionLevelAtt_mAP0.425.pth"

# pytorch lightning checkpoint for training
resources.last_checkpoint = f"{resources.model}/PANNsCNN14Att_BirdClef2022.ckpt"

# CSV file describing the dataset
resources.data_index   = f"{resources.base}/input/birdclef-2022/train_metadata.csv"
# JSON file containing birds used for scoring
resources.scored_birds = f"{resources.base}/input/birdclef-2022/scored_birds.json"

# Base path for training data
resources.train_audio  = f"{resources.base}/input/birdclef-2022/train_audio"
# Base path for test data
resources.test_soundscapes = f"{resources.base}/input/birdclef-2022/test_soundscapes"

# Where to save for submission
resources.save_submission  = f"{resources.base}/working/submission.csv"

In [None]:
# Import model architecture.
# Kong et al. (2020)'s Cnn14_DecisionLevelAtt with minor modification
# Available https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/models.py
# Guard the append so re-running this cell does not add duplicate entries to sys.path.
if resources.model not in sys.path:
    sys.path.append(resources.model)
from PANNsCNN14Att import PANNsCNN14Att

### Dataset Setup

In [None]:
# Read the metadata CSV; every distinct primary label becomes one class
dataset.full_data_index = pd.read_csv(resources.data_index)
dataset.labels = list(dataset.full_data_index["primary_label"].unique())

# Bird name -> numeric label (the position of the name in dataset.labels)
dataset.class_mappings = {label: numeric_label for numeric_label, label in enumerate(dataset.labels)}

# Names of the birds used for scoring
with open(resources.scored_birds) as f:
    dataset.scored_birds = json.load(f)

# Numeric label -> scored bird name
dataset.scored_birds_mapping = {dataset.class_mappings[bird]: bird for bird in dataset.scored_birds}


In [None]:
# Define datasets

def class_balanced_sample(data_index: pd.DataFrame, n: int, random_state=None) -> pd.DataFrame:
    """
    Resample a data index so that every 'primary_label' has exactly n rows.

    Labels with more than n rows are under sampled (without replacement, so
    their rows never repeat). Labels with n rows or fewer are over sampled
    (with replacement, so their rows may repeat).

    :param data_index: Data frame defining a dataset
    :param n: Number of instances that should exist per class
    :param random_state: Random state to ensure deterministic sampling
    :return: A new, re-indexed data index balanced by 'primary_label', with
        the under sampled classes placed before the over sampled ones
    """
    class_sizes = data_index.value_counts("primary_label", sort=False)
    is_majority = class_sizes > n

    def rows_for(labels) -> pd.DataFrame:
        # All rows of data_index whose primary label is one of `labels`
        return data_index[data_index["primary_label"].isin(labels)]

    # Under sample (sample without replacement)
    under_sampled = rows_for(class_sizes.index[is_majority]).groupby("primary_label").sample(
        n=n, random_state=random_state)

    # Over sample (sample with replacement)
    over_sampled = rows_for(class_sizes.index[~is_majority]).groupby("primary_label").sample(
        n=n, replace=True, random_state=random_state)

    return pd.concat((under_sampled, over_sampled)).reset_index(drop=True)
    

def create_test_train_split(data_index: pd.DataFrame) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Deterministically split a data index into class balanced train and test sets.

    Every class ends up in both splits: a class with a single recording is
    copied into both, and every other class contributes at least one
    recording to each side before the remainder is divided 80/20.

    :param data_index: Data frame defining a dataset
    :return: The train data index followed by the test data index
    :raises AssertionError: If either split no longer matches its recorded SHA-1 digest
    """
    # Classes with one recording ("unicorns") cannot be divided, so set them
    # aside and reuse them in both splits
    label_counts = data_index.value_counts("primary_label", sort=False)
    unicorn_labels = label_counts.index[label_counts == 1]
    unicorn_rows = data_index[data_index["primary_label"].isin(unicorn_labels)]
    remaining = data_index.drop(unicorn_rows.index)

    # Reserve one recording per class for each split
    reserved_train = remaining.groupby("primary_label").sample(n=1, random_state=1)
    remaining = remaining.drop(reserved_train.index)
    reserved_test = remaining.groupby("primary_label").sample(n=1, random_state=2)
    remaining = remaining.drop(reserved_test.index)

    # Divide what is left 80/20 within each class
    train_share = remaining.groupby("primary_label").sample(frac=0.8, random_state=3)
    test_share = remaining.drop(train_share.index)

    train_index = pd.concat((unicorn_rows, reserved_train, train_share), ignore_index=True)
    test_index = pd.concat((unicorn_rows, reserved_test, test_share), ignore_index=True)

    # Balance the classes: 200 rows per class for training, 40 for testing
    train_index = class_balanced_sample(train_index, 200, random_state=4)
    test_index = class_balanced_sample(test_index, 40, random_state=5)

    # Fingerprint the CSV form of each split to detect any change from the
    # split used before
    train_digest = sha1(train_index.to_csv().encode("utf-8")).hexdigest()
    test_digest = sha1(test_index.to_csv().encode("utf-8")).hexdigest()
    print(f"{train_digest} {test_digest}")
    assert "8701e71c6cf468e7fbf761a195213b24446d045c" == train_digest, "Something changed with the training dataset"
    assert "b69eacc9f5204eca6663a4c4ebf11aad8d201184" == test_digest, "Something changed with the test dataset"

    return train_index, test_index

def only_scored(data_index: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows whose primary label is one of the scored birds.

    :param data_index: Data frame defining a dataset
    :return: A re-indexed data frame containing just the scored birds
    """
    is_scored = data_index["primary_label"].isin(dataset.scored_birds)
    return data_index.loc[is_scored].reset_index(drop=True)

# Build the resampled train/test split, then derive variants restricted to the scored birds
dataset.train_index, dataset.test_index = create_test_train_split(dataset.full_data_index)
dataset.scored_test = only_scored(dataset.test_index)
dataset.scored_train_index = only_scored(dataset.train_index)

### Model Configuration

In [None]:
# Configure hyper-parameters

# https://www.kaggle.com/competitions/birdsong-recognition/discussion/183204
# People reckon that 30s gives us a very good chance that our sample contains the data
# we want
# Duration in seconds handed to RandomWaveformCrop for every dataset
cfg.duration = 30

cfg.batch_size = 16 # Batch size has to be small to fit things into memory
cfg.threshold = 0.5 # Cut-off used to binarise targets and predictions for the F1/accuracy metrics
cfg.lr = 0.001 # Learning rate passed to Adam
cfg.epochs = 1 # Passed to the trainer as max_epochs

# Arguments of the PANNsCNN14Att constructor; sample_rate and hop_size are
# also used to convert between frame indices and seconds
cfg.sample_rate = 32000
cfg.window_size = 1024
cfg.hop_size = 320
cfg.mel_bins = 64
cfg.fmin = 50
cfg.fmax = 14000
# Number of bird classes; the model outputs are truncated to this many columns
cfg.num_classes = 152

# Length in seconds of one chunk in the chunkwise predictions
cfg.chunk_duration = 5.0

# The pre-trained model has 527 classes
cfg.pre_trained_num_classes = 527

# Submission Config
# Clipwise score above which a scored bird is reported as present
cfg.submission_threshold = 0.01

### Data Loaders

In [None]:
class BIRDClefWaveformDataSet(data.Dataset):
    """Map-style dataset yielding (waveform, target) pairs for the rows of a data index.

    Audio files are read from ``resources.train_audio``; targets are built
    from the ``primary_label`` and ``secondary_labels`` columns.
    """

    def __init__(self, data_index: pd.DataFrame, waveform_transform=None):
        """
        :param data_index: Data frame with 'filename', 'primary_label' and
            'secondary_labels' columns
        :param waveform_transform: Optional callable taking (waveform, sr) and
            returning a (waveform, sr) pair
        """
        super().__init__()
        self.data_index = data_index
        self.waveform_transform = waveform_transform

    def get_random_example(self) -> typing.Tuple[Tensor, Tensor, str]:
        """Load a uniformly random example without applying the transform.

        :return: The waveform, its target vector and the example's filename
        """
        idx = np.random.randint(len(self.data_index))
        waveform, labels = self.load_example(idx, False)
        return waveform, labels, self.data_index.iloc[idx]["filename"]

    def __len__(self):
        return len(self.data_index)

    @staticmethod
    def _build_target(primary_label: str, secondary_labels: str,
                      class_mappings: typing.Mapping[str, int], num_classes: int) -> Tensor:
        """Build the target vector for one example.

        :param primary_label: Name of the primary bird of the recording
        :param secondary_labels: String holding a Python list literal of other bird names
        :param class_mappings: Mapping from bird name to numeric label
        :param num_classes: Length of the target vector
        :return: Tensor with 1.0 at the primary label, 0.99 at every known
            secondary label and 0.0 elsewhere
        """
        # literal_eval only parses literals, whereas eval would execute any
        # expression found in the CSV
        secondary_names = ast.literal_eval(secondary_labels)
        # Skip names without a class: a plain dict.get would yield None,
        # which is not a valid class index
        secondary_ids = [class_mappings[name] for name in secondary_names if name in class_mappings]

        target = torch.zeros(num_classes)
        target[class_mappings[primary_label]] = 1.0
        if secondary_ids:
            target[secondary_ids] = 0.99
        return target

    def load_example(self, idx: int, with_transform: bool) -> typing.Tuple[Tensor, Tensor]:
        """Load an example from the data index at index idx

        :param idx: An index into the data index
        :param with_transform: Whether to apply transform or not
        :return: The mono waveform and its target vector
        """
        row = self.data_index.iloc[idx]
        filename = os.path.join(resources.train_audio, row["filename"])

        # Get the waveform samples and the sample rate from a file
        waveform, sr = ta.load(filename)

        # Ensure the audio is mono channel
        waveform = torch.mean(waveform, dim=0)

        if with_transform and self.waveform_transform is not None:
            waveform, _ = self.waveform_transform(waveform, sr)

        # Convert text labels into a target distribution
        target = self._build_target(
            row["primary_label"], row["secondary_labels"], dataset.class_mappings, cfg.num_classes)

        return waveform, target

    def __getitem__(self, idx: int):
        return self.load_example(idx, True)

class RandomWaveformCrop(nn.Module):
    """
    Randomly crop a waveform to be a certain duration. If it is too small
    we will pad it with zeros, placing the audio at a random offset.
    """

    def __init__(self, duration: float):
        """
        :param duration: Target duration in seconds
        """
        super().__init__()
        self.duration = duration

    def forward(self, waveform: Tensor, sr: int):
        """
        :param waveform: Samples to crop or pad; expected to be 1-D (samples,)
        :param sr: Sample rate of the waveform in samples per second
        :return: A waveform of round(sr * duration) samples, and sr unchanged
        """
        length = waveform.shape[0]
        # Must be an int: it is used as a slice bound and a tensor size, and
        # sr * duration is a float whenever duration is
        new_length = int(round(sr * self.duration))

        if new_length < length:
            # Crop the waveform because it is too big. randint's upper bound
            # is exclusive, so +1 lets the crop end on the final sample
            offset = np.random.randint(length - new_length + 1)
            waveform = waveform[offset:offset + new_length]
        elif length < new_length:
            # Pad the waveform because it is too small; new_zeros keeps the
            # dtype and device of the input
            padded = waveform.new_zeros(new_length)
            offset = np.random.randint(new_length - length + 1)
            padded[offset:offset + length] = waveform
            waveform = padded

        return waveform, sr

In [None]:
# Setup data loaders
# Every dataset uses RandomWaveformCrop(cfg.duration), which crops or pads each clip to a fixed duration
dataset.train_dataset       = BIRDClefWaveformDataSet(dataset.train_index, RandomWaveformCrop(cfg.duration))
dataset.test_dataset        = BIRDClefWaveformDataSet(dataset.test_index, RandomWaveformCrop(cfg.duration))
dataset.scored_test_dataset = BIRDClefWaveformDataSet(dataset.scored_test, RandomWaveformCrop(cfg.duration))
dataset.scored_train_dataset = BIRDClefWaveformDataSet(dataset.scored_train_index, RandomWaveformCrop(cfg.duration))


# Only the two training loaders shuffle; the test loaders keep the data index order
dataset.train_loader = data.DataLoader(dataset.train_dataset, batch_size=cfg.batch_size, shuffle=True, num_workers=2)
dataset.test_loader = data.DataLoader(dataset.test_dataset, batch_size=cfg.batch_size, shuffle=False, num_workers=2)
dataset.scored_test_loader = data.DataLoader(dataset.scored_test_dataset, batch_size=cfg.batch_size, shuffle=False, num_workers=2)
dataset.scored_train_loader = data.DataLoader(dataset.scored_train_dataset, batch_size=cfg.batch_size, shuffle=True, num_workers=2)

## Model

In [None]:
class MyLoss(nn.Module):
    """Summed binary cross entropy between the clipwise predictions and the targets."""

    def __init__(self):
        super().__init__()
        self.bce = nn.BCELoss(reduction="sum")

    def forward(self, output: dict, target: Tensor):
        """
        :param output: Model output; only its "clipwise_output" entry is used
        :param target: Target values matching the clipwise output
        :return: The binary cross entropy summed over every element
        """
        clipwise_output = output["clipwise_output"]
        return self.bce(clipwise_output, target)

def filter_bad_numbers(x: Tensor):
    """Replace NaNs and infinities with zero, then clamp everything to [0, 1].

    This is an ugly hack. I'm not sure why it is necessary but I'm not the
    only one to need it:
    https://www.kaggle.com/hidehisaarai1213/introduction-to-sound-event-detection

    :param x: A tensor to clean
    :return: A clean tensor
    """
    # isfinite is False exactly for NaN, +inf and -inf
    finite_only = torch.where(torch.isfinite(x), x, torch.zeros_like(x))
    return finite_only.clamp(0, 1)

class MyBirdSED(pl.LightningModule):
    """Lightning module wrapping PANNsCNN14Att for bird sound event detection.

    The wrapped network is built with cfg.pre_trained_num_classes outputs;
    forward() only exposes the first cfg.num_classes of them.
    """

    def __init__(self):
        super().__init__()
        self.PANNs = PANNsCNN14Att(
            cfg.sample_rate,
            cfg.window_size,
            cfg.hop_size,
            cfg.mel_bins,
            cfg.fmin,
            cfg.fmax,
            cfg.pre_trained_num_classes
        )

        self.criterion = MyLoss()
        # Targets and clipwise predictions gathered batch by batch during the
        # current epoch; read by the *_epoch_end hooks
        self.eval_y_true = []
        self.eval_y_pred = []
        self.train_y_true = []
        self.train_y_pred = []

    @classmethod
    def load_pre_trained(cls):
        """Create a model initialised from the pre-trained PANNs weights.

        :return: A new model with the weights stored at
            resources.panns_cnn14_att_weights loaded into its PANNs network
        """
        model = cls()
        # Load onto the CPU so the file can be read on machines without a GPU
        weights = torch.load(resources.panns_cnn14_att_weights, map_location="cpu")
        # strict=False tolerates keys that differ between the file and the network
        model.PANNs.load_state_dict(weights["model"], strict=False)
        return model

    def forward(self, waveforms, mixup_lambda=None):
        """Run the network and clean its outputs.

        :param waveforms: Batch of waveforms, passed straight to the PANNs network
        :param mixup_lambda: Unused; kept so existing callers keep working
        :return: The network's output dict with 'clipwise_output' and
            'framewise_output' truncated to cfg.num_classes and passed
            through filter_bad_numbers
        """
        # Call the module rather than .forward() so registered hooks run
        output = self.PANNs(waveforms)
        output['clipwise_output'] = filter_bad_numbers(output['clipwise_output'][:,:cfg.num_classes])
        output['framewise_output'] = filter_bad_numbers(output['framewise_output'][:,:,:cfg.num_classes])
        return output

    def _log_epoch_metrics(self, prefix: str, y_true: list, y_pred: list):
        """Binarise the gathered batches at cfg.threshold and log macro F1 and accuracy.

        :param prefix: Prefix of the logged metric names ('train' or 'eval')
        :param y_true: List of target tensors, one per batch
        :param y_pred: List of clipwise prediction tensors, one per batch
        """
        true_bin = torch.concat(y_true, dim=0).cpu().numpy() > cfg.threshold
        pred_bin = torch.concat(y_pred, dim=0).cpu().numpy() > cfg.threshold

        self.log(f"{prefix}_f1", metrics.f1_score(true_bin, pred_bin, average="macro", zero_division=1))
        self.log(f"{prefix}_accuracy", metrics.accuracy_score(true_bin, pred_bin))

    def training_step(self, batch, batch_idx):
        x, target = batch

        output = self(x)
        loss = self.criterion(output, target)

        # Store information for statistics
        self.train_y_true.append(target.detach())
        self.train_y_pred.append(output["clipwise_output"].detach())

        # Log with the real batch size so a smaller final batch is weighted correctly
        self.log("train_loss", loss, on_epoch=True, batch_size=x.shape[0])
        return loss

    def on_train_epoch_start(self):
        """Lightning hook: drop the statistics gathered in the previous training epoch."""
        self.train_y_true = []
        self.train_y_pred = []

    # Previous name of the hook above. Lightning never calls it; kept only so
    # explicit callers keep working
    training_epoch_start = on_train_epoch_start

    def training_epoch_end(self, step_outputs):
        self._log_epoch_metrics("train", self.train_y_true, self.train_y_pred)

    def validation_step(self, batch: typing.Tuple[Tensor, Tensor], batch_idx):
        x, target = batch

        output = self(x)
        loss = self.criterion(output, target)

        # Store information for statistics
        self.eval_y_true.append(target.detach())
        self.eval_y_pred.append(output["clipwise_output"].detach())

        # Log with the real batch size so a smaller final batch is weighted correctly
        self.log("test_loss", loss, on_epoch=True, batch_size=x.shape[0])
        return loss

    def on_validation_epoch_start(self):
        """Lightning hook: drop the statistics gathered in the previous validation epoch."""
        self.eval_y_true = []
        self.eval_y_pred = []

    # Previous name of the hook above. Lightning never calls it; kept only so
    # explicit callers keep working
    validation_epoch_start = on_validation_epoch_start

    def validation_epoch_end(self, validation_step_outputs):
        self._log_epoch_metrics("eval", self.eval_y_true, self.eval_y_pred)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), cfg.lr)


In [None]:
# Option 1: save time by restoring the fine tuned checkpoint prepared earlier
# model = MyBirdSED.load_from_checkpoint(resources.last_checkpoint)

# Option 2 (active): start from the pre-trained weights and fine tune the model
model = MyBirdSED.load_pre_trained()

In [None]:
# Create a trainer that uses one GPU and runs for at most cfg.epochs epochs
trainer = pl.Trainer(
    gpus=1,
    max_epochs=cfg.epochs,
)

# Train on the train loader; the test loader is passed as the validation data
trainer.fit(model, dataset.train_loader, dataset.test_loader)

In [None]:
# Plot CM

def highlight_cell(x: int, y: int, ax=None, **kwargs):
    """
    Outline the unit square centred on (x, y).

    :param x: Horizontal coordinate of the cell to highlight
    :param y: Vertical coordinate of the cell to highlight
    :param ax: pyplot axis to draw on, defaults to the current axis
    :param kwargs: Extra keyword arguments forwarded to plt.Rectangle
    """
    lower_left = (x-.5, y-.5)
    outline = plt.Rectangle(lower_left, 1, 1, fill=False, **kwargs)
    target_ax = ax or plt.gca()
    target_ax.add_patch(outline)

def plot_cm(cm: Tensor):
    """
    Draw a confusion matrix as an image, label both axes with the bird names
    and outline the diagonal cells of the scored birds.

    :param cm: A tensor representing the confusion matrix
    """
    tick_positions = range(0, cfg.num_classes)

    plt.figure(figsize=(40, 40))
    plt.imshow(cm)
    plt.xticks(tick_positions, dataset.labels, rotation=90)
    plt.yticks(tick_positions, dataset.labels)

    # Highlight scored birds
    for numeric_label in dataset.scored_birds_mapping:
        highlight_cell(numeric_label, numeric_label, color="limegreen", linewidth=3)

    # Minor ticks every five cells carry a grey grid that eases reading
    ax = plt.gca()
    grid_positions = np.arange(-.5, cfg.num_classes, 5)
    ax.set_xticks(grid_positions, minor=True)
    ax.set_yticks(grid_positions, minor=True)
    ax.grid(which='minor', color='grey', linestyle='-', linewidth=1)

# Reduce every stored target / prediction vector to its highest scoring class
y_true = torch.concat(model.eval_y_true).argmax(1).cpu().numpy()
y_pred = torch.concat(model.eval_y_pred).argmax(1).cpu().numpy()

# Plot the confusion matrix of those classes
plot_cm(metrics.confusion_matrix(y_true, y_pred))

In [None]:
def frame_to_seconds(frame: int) -> float:
    """
    Convert a PANNs14Att frame index into a time offset. The model expresses
    time as frame indices, consecutive frames being cfg.hop_size samples apart.

    :param frame: A frame index
    :return: Offset of the frame index in seconds
    """
    sample_offset = frame*cfg.hop_size
    return sample_offset/cfg.sample_rate


def seconds_to_frames(duration: float) -> int:
    """
    Convert a duration in seconds into a number of frames.

    :param duration: duration time measured in seconds
    :return: The number of whole frames in the duration (fractions are truncated)
    """
    return int((duration*cfg.sample_rate)/cfg.hop_size)

In [None]:
def pretty_prediction(y_hat: Tensor, true_bin_vector: typing.Optional[Tensor] = None):
    """Print the five highest scoring labels of every item in a batch.

    When true_bin_vector is given, the rank and score of every label that is
    non-zero in it are printed as well.

    :param y_hat: Scores, indexed as y_hat[batch][class]
    :param true_bin_vector: Optional targets, indexed the same way
    """
    ranked_batch = y_hat.argsort(dim=1, descending=True)

    for batch_id, ranked in enumerate(ranked_batch):
        print(f"{batch_id:03d}")
        for rank, x in enumerate(ranked[:5]):
            print(f"  {rank:d} {dataset.labels[x]:10} {y_hat[batch_id][x]*100:.2f}%")

        # Identity check: `tensor != None` is not a dependable test for None
        if true_bin_vector is not None:
            # Plain ints make the list lookup and the comparisons below explicit
            for numeric_label in true_bin_vector[batch_id].nonzero().flatten().tolist():
                label = dataset.labels[numeric_label]
                # Position of the label in the descending ranking (0 = best)
                rank = ranked.eq(numeric_label).nonzero(as_tuple=True)[0]
                print(f"  Actual Class: {int(rank)} {label:8} {float(y_hat[batch_id][numeric_label]*100):.2f}%")

def plot_framewise_output(waveform: Tensor, framewise_output: Tensor, chunkwise_output: Tensor, top_k_labels: Tensor):
    """
    Draw three stacked plots sharing one x axis: a spectrogram of the
    waveform, the framewise predictions and the chunkwise predictions.

    :param waveform: Waveform the spectrogram is computed from
    :param framewise_output: Framewise model output; squeezed, then indexed as [frame, class]
    :param chunkwise_output: Chunkwise output; its leading axis is squeezed, then indexed as [chunk, class]
    :param top_k_labels: Numeric labels of the classes to plot, one row per label
    """
    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(20, 10), sharex=True)
    spectrogram_ax, framewise_ax, chunkwise_ax = ax

    # Axes 0 is a spectrogram
    stft = librosa.stft(waveform.cpu().numpy())
    magnitude_db = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
    librosa.display.specshow(magnitude_db, sr=cfg.sample_rate, x_axis='time', ax=spectrogram_ax)

    # Keep only the requested classes in both prediction grids
    framewise = framewise_output.squeeze()[:,top_k_labels].cpu()
    chunkwise = chunkwise_output.squeeze(dim=0)[:,top_k_labels].cpu()
    k = top_k_labels.shape[0]
    duration = frame_to_seconds(len(framewise))

    # Both grids label their rows identically: one tick in the middle of each row
    tick_positions = np.arange(k, 0, -1)-0.5
    tick_labels = list(map(dataset.labels.__getitem__, top_k_labels))

    # Axes 1 is the framewise prediction
    framewise_ax.set_title("Framewise Prediction")
    framewise_ax.imshow(framewise.T.numpy(),
            interpolation='nearest',
            vmin=0, vmax=1,
            extent=[0, duration, 0, k],
            aspect='auto')
    framewise_ax.set_yticks(tick_positions, tick_labels)

    # Axes 2 is the chunkwise prediction; its extent stops at the last whole 5s
    chunkwise_ax.set_title("Chunkwise (5s) Prediction")
    chunkwise_ax.imshow(chunkwise.T.numpy(),
            interpolation='nearest',
            vmin=0, vmax=1,
            extent=[0, (duration//5)*5, 0, k],
            aspect='auto')
    chunkwise_ax.set_yticks(tick_positions, tick_labels)

def chunkwise_predictions(framewise_output: Tensor):
    """Average the framewise predictions of PANNsCNN14Att into coarser chunks.

    The frame axis is reduced with AdaptiveAvgPool1d to
    n_frames // frames_per_chunk chunks, where frames_per_chunk is the number
    of frames in cfg.chunk_duration seconds.

    :param framewise_output: output of model of shape (batch_size, frames, classes)
    :return: coarser chunk wise predictions of shape (batch_size, chunks, classes)
    """
    batch_size, n_frames = framewise_output.shape[0], framewise_output.shape[1]
    frames_per_chunk = seconds_to_frames(cfg.chunk_duration)
    n_chunks = n_frames//frames_per_chunk

    avg_pool = nn.AdaptiveAvgPool1d(n_chunks)

    # The pool reduces the last axis, so move frames last and swap back afterwards
    pooled = avg_pool(framewise_output.transpose(1, 2)).transpose(1, 2)

    return pooled.reshape((batch_size, -1, cfg.num_classes))

# Demo: run the model on one random scored test example and visualise the result
model.eval()
model = model.cuda()

with torch.no_grad():
    labels = None

    # The third value is the example's filename (see get_random_example)
    x, labels, metadata = dataset.scored_test_dataset.get_random_example()
    # Add a batch axis of size one to the target
    labels = labels.unsqueeze(0).cuda()

    # Add a batch axis of size one to the waveform
    x = x.cuda().unsqueeze(0)
    output = model(x)
    pretty_prediction(output["clipwise_output"], labels)
    # Plot the 20 classes with the highest clipwise score
    top_k = output["clipwise_output"].argsort(dim=1, descending=True)[0,:20]
    plot_framewise_output(x[0], output["framewise_output"], chunkwise_predictions(output["framewise_output"]), top_k)
    print(metadata)

# Switch back to training mode
model.train()
# NOTE(review): presumably here so the notebook does not echo the value of model.train() - confirm
print()

# Submission

In [None]:
class WaveformIterator():
    """Iterate over an audio file in consecutive, fixed-length windows.

    Every step yields (window, start): window holds window_duration seconds
    of mono audio (the final window is padded with zeros) and start is the
    window's start time in whole seconds.
    """

    def __init__(self, filename:str, window_duration: float):
        """
        :param filename: Path of the audio file to load
        :param window_duration: Length of one window in seconds
        :raises ValueError: If the window would contain no samples
        """
        self.waveform, self.sr = ta.load(filename)
        # Down-mix to mono by averaging the channels
        self.waveform = torch.mean(self.waveform, dim=0)
        self.offset = 0
        self.waveform_size = self.waveform.shape[0]
        # Must be an int: it is used as a tensor size and a slice bound, and
        # sr * window_duration is a float whenever window_duration is
        self.window_size = int(round(self.sr * window_duration))
        if self.window_size <= 0:
            # A zero-length window would never advance the offset
            raise ValueError("window_duration must cover at least one sample")
        self.duration = self.waveform_size/self.sr

    def __iter__(self):
        self.offset = 0
        return self

    def __next__(self):
        if self.offset >= self.waveform_size:
            raise StopIteration()

        chunk_start = self.offset
        chunk_end = min(chunk_start + self.window_size, self.waveform_size)
        self.offset = chunk_start + self.window_size

        # Copy over some of the original waveform. The rest is zeros
        window = self.waveform.new_zeros(self.window_size)
        window[:chunk_end - chunk_start] = self.waveform[chunk_start:chunk_end]

        return window, chunk_start//self.sr

class SubmissionDataset(data.IterableDataset):
    """Stream fixed-length windows from every audio file in a directory.

    Iterating yields ((window, start_second), file_prefix, chunks_in_window)
    where the first element comes from WaveformIterator, file_prefix is the
    file name without its extension and chunks_in_window is the duration of
    the whole file floor-divided by cfg.chunk_duration.
    """

    def __init__(self, directory, window_duration):
        """
        :param directory: Directory containing the audio files
        :param window_duration: Length in seconds of the windows to yield
        """
        super().__init__()
        self.directory = directory
        # Sorted because os.listdir returns entries in arbitrary order
        self.test_soundscapes = sorted(os.listdir(directory))
        self.waveform_iter = None
        self.file_iter = None
        self.file_prefix = None
        self.chunks_in_window = None
        self.window_duration = window_duration

    def __iter__(self):
        self.file_iter = iter(self.test_soundscapes)
        # Start with an empty iterator so the first __next__ opens the first file
        self.waveform_iter = iter(range(0))
        return self

    def __next__(self):
        while True:
            try:
                return next(self.waveform_iter), self.file_prefix, self.chunks_in_window
            except StopIteration:
                # The current file is exhausted; fall through to open the next one
                pass

            # Raises StopIteration once every file is consumed, which ends the iteration
            file_name = next(self.file_iter)
            path = os.path.join(self.directory, file_name)

            self.waveform_iter = iter(WaveformIterator(path, self.window_duration))
            # Drop the extension from the file name
            self.file_prefix = os.path.splitext(file_name)[0]

            self.chunks_in_window = self.waveform_iter.duration // cfg.chunk_duration
             

In [None]:
@torch.no_grad()
def run_model(x: Tensor, row_prefix: str, chunk_id_offset: int, chunks_in_window):
    """Predict which scored birds are present in one window of audio.

    :param x: Batch of waveforms; only the output of its first item is used
    :param row_prefix: Prefix of every generated row id
    :param chunk_id_offset: Offset of the window, added to 5 to form the row id suffix
    :param chunks_in_window: Unused; kept so existing callers keep working
    :return: One (row_id, is_present) tuple per scored bird, where is_present
        tells whether the clipwise score exceeds cfg.submission_threshold
    """
    was_training = model.training
    model.eval()
    try:
        clipwise_output = model(x)["clipwise_output"][0]
    finally:
        # Put the model back in the mode it was in, even if inference failed
        model.train(was_training)

    # Threshold every class at once: a single transfer instead of one per bird
    is_present = (clipwise_output > cfg.submission_threshold).tolist()

    # NOTE(review): the +5 presumably turns the window start into its end
    # time, given the 5 second windows the caller uses - confirm
    return [
        (f"{row_prefix}_{bird}_{chunk_id_offset+5}", is_present[label])
        for label, bird in dataset.scored_birds_mapping.items()
    ]
        
model = model.cuda()
# (row_id, target) tuples gathered over every window of every test soundscape
submission = []
# Walk the test soundscapes in windows of 5 seconds
for (x, offset), row_prefix, chunks_in_window in SubmissionDataset(resources.test_soundscapes, 5):
    # Add a batch axis of size one before running the model
    x = x.cuda().unsqueeze(0)
    submission.extend(run_model(x, row_prefix, offset, chunks_in_window))

In [None]:
# Write the predictions to the submission CSV, without the index column
df_submission = pd.DataFrame(submission, columns=['row_id','target'])
df_submission.to_csv(resources.save_submission, index=False)
# Bare expression so the notebook displays the frame
df_submission