This notebook uses [panns-inference](https://github.com/qiuqiangkong/audioset_tagging_cnn) to predict "no calls".

panns-inference ships a trained audio segmentation (sound event detection) model, and the label at index 111 of its output is the birdcall label.

This notebook is based on [Hidehisa's notebook](https://www.kaggle.com/hidehisaarai1213/inference-pytorch-birdcall-resnet-baseline).

Thank you Hidehisa Arai.

In [None]:
import cv2
import audioread
import logging
import os
import random
import time
import warnings

import IPython
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data

from torchvision.models import resnet18, resnet50, densenet121, densenet161

from contextlib import contextmanager
from pathlib import Path
from typing import Optional

from fastprogress import progress_bar
from sklearn.metrics import f1_score
from torchvision import models
from matplotlib import pyplot as plt

from torchvision.transforms.functional import to_tensor
from torchvision.transforms import Normalize

import time
from datetime import timedelta as td
from scipy.ndimage import maximum_filter1d
import scipy

# Run on the GPU when one is available; fall back to the CPU so the notebook
# can still be executed (slowly) on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Never truncate DataFrames when they are displayed in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Set the configuration and load the dataset.

In [None]:
class config:
    """Static settings shared by the inference cells of this notebook."""

    # --- classifier ---------------------------------------------------
    SEED = 416
    # Number of outputs of the classifier head.
    N_LABEL = 264
    # Forwarded to densenet161(pretrained=...); the trained weights are
    # loaded from WEIGHTS_PATH instead.
    PRETRAINED = False
    WEIGHTS_PATH = "../input/birdcall-densenet161/birdcallnet_f0_densenet161.bin"

    # --- audio front end ----------------------------------------------
    # Sample rate every test clip is resampled to.
    TARGET_SR = 32000
    # Extra keyword arguments for librosa.feature.melspectrogram.
    MELSPECTROGRAM_PARAMETERS = dict(n_mels=128, fmin=20, fmax=16000)

    # --- decision thresholds ------------------------------------------
    # Minimum sigmoid probability for a species to be reported.
    THRESHOLD = 0.5
    # Minimum framewise SED score for a segment to count as containing a call.
    SED_THRESHOLD = 0.05
    

# Get Test Set
# Use the real competition data when its test audio is mounted; otherwise fall
# back to the stand-in dataset created by @shonenkov, thanks!
TEST = Path("../input/birdsong-recognition/test_audio").exists()
DATA_DIR = Path("../input/birdsong-recognition/") if TEST else Path("../input/birdcall-check/")
test = pd.read_csv(DATA_DIR / "test.csv")
test_audio = DATA_DIR / "test_audio"

# Write the sample submission immediately (presumably so that a submission file
# exists even before the real predictions are produced).
sub = pd.read_csv("../input/birdsong-recognition/sample_submission.csv")
sub.to_csv("submission.csv", index=False)


# Get BIRD_CODE dict
# Species codes are numbered in alphabetical order; INV_BIRD_CODE maps a class
# index back to its ebird code.
train_df = pd.read_csv('../input/birdsong-recognition/train.csv')
keys = set(train_df.ebird_code)
values = np.arange(0, len(keys))
code_dict = {code: index for code, index in zip(sorted(keys), values)}
n_labels = len(code_dict)
INV_BIRD_CODE = dict(zip(code_dict.values(), code_dict.keys()))

My model is DenseNet161. Its local fold-0 F1 score is 0.685494403 and its LB score is 0.471.

In [None]:
class BirdcallNet(nn.Module):
    """DenseNet-161 classifier with ``config.N_LABEL`` outputs.

    The backbone's stock classification layer is replaced by a fresh linear
    layer. ``forward`` returns that layer's raw outputs; no activation is
    applied inside the model.
    """

    def __init__(self):
        super().__init__()
        self.densenet = densenet161(pretrained=config.PRETRAINED)
        # Take the feature width from the layer being replaced rather than
        # hard-coding it, so the new head always matches the backbone.
        in_features = self.densenet.classifier.in_features
        self.densenet.classifier = nn.Linear(in_features, config.N_LABEL)

    def forward(self, x):
        """Return the backbone's raw outputs for the image batch ``x``."""
        return self.densenet(x)

Install and load the panns-inference audio segmentation model.

In [None]:
# Copy the SED checkpoint and the class-label file into /root/panns_data/
# (presumably the directory panns_inference reads them from — verify against
# the installed package version).
!mkdir -p /root/panns_data/
!cp /kaggle/input/panns-inference/Cnn14_DecisionLevelMax_mAP0.385.pth /root/panns_data/Cnn14_DecisionLevelMax
!cp /kaggle/input/panns-inference/class_labels_indices.csv /root/panns_data/class_labels_indices.csv

# Install torchlibrosa and panns_inference from the wheel files stored in the
# Kaggle input datasets.
!pip install /kaggle/input/torchlibrosa/torchlibrosa-0.0.4-py3-none-any.whl
!pip install /kaggle/input/panns-inference/panns_inference-0.0.6-py3-none-any.whl

In [None]:
from panns_inference import SoundEventDetection

# Sound event detection model used to decide whether a segment contains a bird
# call at all. Use the GPU when one is available and the CPU otherwise, instead
# of requesting CUDA unconditionally.
sed = SoundEventDetection(device='cuda' if torch.cuda.is_available() else 'cpu')


Let's try the audio segmentation model on a sample recording and check where it detects bird calls.

In [None]:
# Pick one training recording for the demo (fixed seed, so the same one on every run).
ebird_code, filename = train_df.sample(1, random_state=123)[["ebird_code", "filename"]].values[0]
path = f"../input/birdsong-recognition/train_audio/{ebird_code}/{filename}"

# Load at config.TARGET_SR, the rate used for every clip that is passed to the
# SED model in the test pipeline; librosa's default would resample to 22050 Hz
# and the demo below would not reflect what happens at inference time.
x, sr = librosa.load(path, sr=config.TARGET_SR, mono=True, res_type="kaiser_fast")

print("Sampling Rate:", sr)
plt.plot(x);

The label at index 111 is birdcall, so its framewise prediction is plotted here.

In [None]:
# The SED model takes a batch of waveforms, so give the clip a leading batch axis.
sed_pred = sed.inference(x[None, :])
# Framewise scores of class 111 for the first (only) clip in the batch.
birdcall_preds = sed_pred[0][:, 111]
plt.plot(birdcall_preds);

We use thresholds from the audio segmentation results to predict "no calls".

In [None]:
# Plot, frame by frame, whether the class-111 score exceeds the SED threshold.
plt.plot(birdcall_preds>config.SED_THRESHOLD);

Let's listen to the clip to check the bird calls.

In [None]:
# Embed an audio player for the sampled clip so it can be compared with the plots above.
IPython.display.Audio(data=x, rate=sr)

A lower threshold might work better, but let's continue with this one for now.

### Predict test data.

We predict "no call" by panns-inference.

If no class probability from the classifier reaches its threshold but the panns-inference score is over its threshold (i.e. a bird call is detected), we use the class with the maximum predicted probability.

Dataset Class

In [None]:
def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Convert a single-channel array into a 3-channel ``uint8`` image.

    The input is replicated three times along a new last axis, standardized,
    clipped to ``[norm_min, norm_max]`` and rescaled linearly to ``[0, 255]``.

    Args:
        X: array to convert (here: a mel spectrogram in dB).
        mean: value subtracted from the data; defaults to the data mean.
        std: value the centred data is divided by (plus ``eps``); defaults to
            the standard deviation of the centred data.
        norm_max: upper clip bound in standardized units; defaults to the
            maximum of the standardized data.
        norm_min: lower clip bound in standardized units; defaults to the
            minimum of the standardized data.
        eps: guards the division by ``std`` and is the smallest value range
            that is still rescaled.

    Returns:
        ``uint8`` array of shape ``X.shape + (3,)``. It is all zeros when the
        standardized data spans a range of at most ``eps``.
    """
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize. The defaults are selected with ``is None`` because 0 is a
    # legitimate value for every one of these statistics and must not be
    # silently replaced by the data-derived default.
    if mean is None:
        mean = X.mean()
    X = X - mean
    if std is None:
        std = X.std()
    Xstd = X / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) <= eps:
        # (Nearly) constant input: there is nothing to rescale.
        return np.zeros_like(Xstd, dtype=np.uint8)

    # Normalize to [0, 255]
    V = np.clip(Xstd, norm_min, norm_max)
    V = 255 * (V - norm_min) / (norm_max - norm_min)
    return V.astype(np.uint8)


class TestDataset(data.Dataset):
    """Turn the test rows of one audio clip into classifier-ready images.

    Each item corresponds to one row of ``df``. Before a spectrogram is built,
    the sound event detection model ``sed`` is asked whether the audio
    contains a bird call; audio without one is never turned into a real image.

    Args:
        df: test rows belonging to ``clip``; the columns ``site`` and
            ``row_id`` are read for every row and ``seconds`` for rows that
            are not ``site_3``.
        clip: 1-D waveform of the recording, sampled at ``config.TARGET_SR``.
    """

    # Index of the SED output class that is treated as "bird call".
    BIRD_CLASS_INDEX = 111
    # Length, in seconds, of every segment that is scored.
    SEGMENT_SECONDS = 5

    def __init__(self, df, clip):
        self.df = df
        self.clip = clip

    def __len__(self):
        return len(self.df)

    def _has_birdcall(self, segment):
        """Return True when any SED frame of ``segment`` exceeds ``config.SED_THRESHOLD``."""
        framewise_output = sed.inference(np.expand_dims(segment, 0))
        scores = framewise_output[0, :, self.BIRD_CLASS_INDEX]
        return bool((scores > config.SED_THRESHOLD).any())

    def _to_image(self, segment):
        """Convert a waveform segment into a normalized ``(3, H, W)`` float32 array."""
        # ``y`` is passed by keyword: newer librosa releases no longer accept
        # the waveform as a positional argument.
        melspec = librosa.feature.melspectrogram(y=segment,
                                                 sr=config.TARGET_SR,
                                                 **config.MELSPECTROGRAM_PARAMETERS)
        melspec = librosa.power_to_db(melspec).astype(np.float32)
        image = to_tensor(mono_to_color(melspec))
        image = Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))(image)
        return image.numpy()

    def __getitem__(self, idx: int):
        """Return ``(image_or_images, row_id, site)`` for row number ``idx``.

        * ``site_3`` rows cover the whole clip: the result is the stacked
          images of every complete 5-second segment in which a bird call was
          detected (an empty array when there is none).
        * All other rows cover the 5 seconds ending at ``seconds``: the result
          is one image, or an all-zero array when no bird call was detected.
        """
        SR = config.TARGET_SR
        # Positional lookup: ``idx`` is a row number, not an index label.
        sample = self.df.iloc[idx]
        site = sample.site
        row_id = sample.row_id
        segment_len = SR * self.SEGMENT_SECONDS

        if site == "site_3":
            y = self.clip.astype(np.float32)
            images = []
            # Walk over complete segments only; a shorter tail is dropped.
            for start in range(0, len(y) - segment_len + 1, segment_len):
                y_batch = y[start:start + segment_len]
                if self._has_birdcall(y_batch):
                    images.append(self._to_image(y_batch))
            return np.asarray(images), row_id, site

        end_seconds = int(sample.seconds)
        start_seconds = end_seconds - self.SEGMENT_SECONDS
        y = self.clip[SR * start_seconds:SR * end_seconds].astype(np.float32)

        if not self._has_birdcall(y):
            # All-zero placeholder that the prediction loop recognizes as
            # "no call". float32 like the real images; the shape is presumably
            # that of a 5 s spectrogram with the current settings — verify if
            # the configuration changes.
            return np.zeros((3, 128, 313), dtype=np.float32), row_id, site

        return self._to_image(y), row_id, site

Predict Function

In [None]:
def _labels_for_segment(image, model):
    """Return the class indices predicted for one 5-second segment."""
    # The dataset hands over an all-zero image when the SED model found no
    # bird call; skip the classifier (and the device transfer) in that case.
    if image.sum() == 0:
        return []

    with torch.no_grad():
        prediction = model(image.to(device))
    proba = prediction.detach().cpu().sigmoid().numpy().reshape(-1)

    labels = np.flatnonzero(proba >= config.THRESHOLD).tolist()
    # A bird call was detected, so when no class clears the threshold fall
    # back to the single most probable one instead of answering "nocall".
    return labels or [int(proba.argmax())]


def _labels_for_whole_clip(images, model, batch_size=16):
    """Return the class indices predicted in any segment of a whole clip."""
    all_events = set()
    for start in range(0, images.size(0), batch_size):
        batch = images[start:start + batch_size]
        if batch.ndim == 3:
            # A single image rather than a stack of segments (this happens for
            # rows whose site is none of site_1/2/3): add the batch axis.
            batch = batch.unsqueeze(0)

        with torch.no_grad():
            prediction = model(batch.to(device))
        proba = prediction.detach().cpu().sigmoid().numpy()

        # Column 1 of argwhere holds the class index of every
        # (segment, class) pair that reaches the threshold.
        all_events.update(np.argwhere(proba >= config.THRESHOLD)[:, 1].tolist())
    return list(all_events)


def prediction_for_clip(test_df, clip, model):
    """Predict the bird species for every test row of one audio clip.

    Args:
        test_df: test rows that belong to ``clip`` (see ``TestDataset``).
        clip: waveform of the recording.
        model: classifier that maps an image batch to one raw score per class
            of ``INV_BIRD_CODE``.

    Returns:
        Dict mapping each ``row_id`` to a space-separated string of ebird
        codes, or to ``"nocall"`` when no species was predicted.
    """
    dataset = TestDataset(df=test_df, clip=clip)
    loader = data.DataLoader(dataset, batch_size=1, shuffle=False)

    model.eval()
    prediction_dict = {}
    for image, row_id, site in progress_bar(loader):
        # The loader wraps every field in a batch of size one.
        site = site[0]
        row_id = row_id[0]

        if site in {"site_1", "site_2"}:
            labels = _labels_for_segment(image, model)
        else:
            labels = _labels_for_whole_clip(image.squeeze(0), model)

        if labels:
            prediction_dict[row_id] = " ".join(INV_BIRD_CODE[label] for label in labels)
        else:
            prediction_dict[row_id] = "nocall"
    return prediction_dict

def prediction(test_df, test_audio):
    """Predict the birds for every row of the test table.

    Args:
        test_df: test table; ``audio_id`` selects the recording each row
            belongs to.
        test_audio: directory (``pathlib.Path``) holding ``<audio_id>.mp3``.

    Returns:
        DataFrame with the columns ``row_id`` and ``birds``, one row per test
        row; empty when ``test_df`` has no rows.
    """
    model = BirdcallNet()
    # map_location keeps loading independent of the device the checkpoint was
    # saved from.
    model.load_state_dict(torch.load(config.WEIGHTS_PATH, map_location=device))
    model.to(device)
    model.eval()

    prediction_dfs = []
    for audio_id in test_df.audio_id.unique():
        # Each recording is decoded once and shared by all of its rows.
        clip, _ = librosa.load(test_audio / f"{audio_id}.mp3",
                               sr=config.TARGET_SR,
                               mono=True,
                               res_type="kaiser_fast")

        # Boolean mask instead of a string-built query, so ids containing
        # quotes or other special characters cannot break the selection.
        test_df_for_audio_id = test_df[test_df.audio_id == audio_id].reset_index(drop=True)
        prediction_dict = prediction_for_clip(test_df_for_audio_id, clip=clip, model=model)

        prediction_dfs.append(pd.DataFrame({
            "row_id": list(prediction_dict.keys()),
            "birds": list(prediction_dict.values()),
        }))

    if not prediction_dfs:
        # pd.concat raises on an empty list; an empty table is the right answer.
        return pd.DataFrame(columns=["row_id", "birds"])

    return pd.concat(prediction_dfs, axis=0, sort=False).reset_index(drop=True)

In [None]:
# Predict every test row, then overwrite the placeholder submission file.
submission = prediction(test, test_audio)
submission.to_csv("submission.csv", index=False)

In [None]:
# Show the final submission table in the notebook output.
display(submission)

Submitting this notebook gave an LB score of 0.541.