In [None]:
from pathlib import Path
import os
import random

import numpy as np
import librosa

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchaudio

from torch.utils.data import DataLoader, TensorDataset

import pandas as pd

# Reproducibility: seed every RNG this notebook draws from. torch is seeded
# as well because SNRSegmenter pads short clips with torch.randn noise, which
# would otherwise differ from run to run.
seed = 42

np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
# Exported for child processes; the hash seed of the already-running
# interpreter is fixed at start-up and is not changed by this assignment.
os.environ['PYTHONHASHSEED'] = str(seed)

# Root folder that holds all attached input datasets.
BASE_DIR = Path('../input')

# Use the competition dataset when it ships a test_audio folder; otherwise
# fall back to the 'birdcall-check' dataset.
DATA_DIR = BASE_DIR / 'birdsong-recognition'
if not (DATA_DIR / 'test_audio').exists():
    DATA_DIR = BASE_DIR / 'birdcall-check'

In [None]:
class ConvBlock(nn.Module):
    """Two 3x3 conv -> batch-norm -> ReLU stages followed by 2x2 average pooling.

    Args:
        in_channels: number of channels of the incoming feature map.
        out_channels: number of channels produced by both conv stages.

    The sub-modules are registered as ``conv1`` and ``conv2`` (each an
    ``nn.Sequential``), which fixes the parameter names in the state dict.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()

        self.conv1 = self._conv_bn_relu(in_channels, out_channels)
        self.conv2 = self._conv_bn_relu(out_channels, out_channels)

        self._init_weights()

    @staticmethod
    def _conv_bn_relu(n_in, n_out):
        """Build one conv(3x3, stride 1, padding 1) -> BatchNorm -> ReLU stage."""
        return nn.Sequential(
            nn.Conv2d(n_in, n_out, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(n_out),
            nn.ReLU(),
        )

    def _init_weights(self):
        """Kaiming-normal conv weights, zero biases, unit batch-norm scale."""
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                nn.init.kaiming_normal_(layer.weight)
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias)
                continue
            if isinstance(layer, nn.BatchNorm2d):
                nn.init.ones_(layer.weight)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Apply both conv stages, then halve the two trailing dimensions."""
        features = self.conv2(self.conv1(x))
        return F.avg_pool2d(features, 2)


class CNN(nn.Module):
    """Waveform classifier: log-mel front end, five ConvBlocks, small MLP head.

    Args:
        num_classes: size of the output logit vector.
        sample_rate: sampling rate handed to the mel-spectrogram transform;
            it also sets the upper mel frequency (``sample_rate / 2``).

    Sub-modules are registered as ``preprocess``, ``conv`` and ``fc`` in that
    order, which fixes the parameter names in the state dict.
    """

    def __init__(self, num_classes, sample_rate):
        super().__init__()

        mel_settings = dict(
            sample_rate=sample_rate,
            n_fft=2048,
            hop_length=256,
            f_min=20,
            f_max=sample_rate / 2,
        )

        # NOTE(review): MelSpectrogram is left at its default `power`, while
        # AmplitudeToDB is told the input is a 'magnitude'. This looks like a
        # scale mismatch, but it is kept as-is because the pretrained weights
        # presumably expect exactly this scaling -- confirm before changing.
        self.preprocess = nn.Sequential(
            torchaudio.transforms.MelSpectrogram(**mel_settings),
            torchaudio.transforms.AmplitudeToDB(stype='magnitude'),
        )

        # Channel widths of the conv stack: 1 -> 16 -> 32 -> 64 -> 128 -> 256.
        widths = (1, 16, 32, 64, 128, 256)
        self.conv = nn.Sequential(*[
            ConvBlock(in_channels=n_in, out_channels=n_out)
            for n_in, n_out in zip(widths[:-1], widths[1:])
        ])

        self.fc = nn.Sequential(
            nn.Dropout(p=0.4),
            nn.Linear(in_features=256, out_features=64),
            nn.PReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(p=0.4),
            nn.Linear(in_features=64, out_features=num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch of waveforms.

        Assumes ``x`` is (batch, samples) so that the spectrogram is
        (batch, n_mels, frames) -- TODO confirm against callers.
        """
        spectrogram = self.preprocess(x)
        # Insert a singleton channel axis for the 2-D convolutions.
        features = self.conv(spectrogram.unsqueeze(1))
        # Average over the last axis, then take the maximum over the
        # remaining spatial axis, leaving one value per channel.
        pooled = features.mean(dim=3).max(dim=2).values
        return self.fc(pooled)

In [None]:
class SNRSegmenter(object):
    """Cut a 1-D waveform into fixed-length windows and keep only the loud ones.

    A window is kept when its peak absolute amplitude divided by a supplied
    noise level exceeds ``min_snr``. Kept windows are mean-centred and divided
    by their (pre-centring) peak.

    Args:
        sample_rate: samples per second of the waveforms that will be passed in.
        segment_len_ms: length of each returned segment, in milliseconds.
        hop_len_ms: step between consecutive segment starts, in milliseconds.
        noise_len_ms: window length used by ``get_noise_level``, in milliseconds.
        min_snr: minimum peak / noise-level ratio for a segment to be kept.

    Raises:
        ValueError: if any of the three lengths rounds down to zero samples
            (a zero step would otherwise never advance through the waveform).
    """

    def __init__(self, sample_rate, segment_len_ms, hop_len_ms, noise_len_ms, min_snr):
        self.segment_len_samples = int(sample_rate * segment_len_ms / 1000)
        self.hop_len_samples = int(sample_rate * hop_len_ms / 1000)
        self.noise_len_samples = int(sample_rate * noise_len_ms / 1000)

        for attr in ('segment_len_samples', 'hop_len_samples', 'noise_len_samples'):
            if getattr(self, attr) <= 0:
                raise ValueError(f'{attr} must be positive, got {getattr(self, attr)}')

        self.min_snr = min_snr

    def get_noise_level(self, sample):
        """Estimate the noise floor as the smallest per-window peak amplitude.

        The waveform is split into back-to-back windows of
        ``noise_len_samples``; a trailing partial window is ignored. A
        waveform no longer than one window is treated as a single window.

        Returns:
            A 0-dim tensor; zero for an empty waveform.
        """
        total = len(sample)
        if total == 0:
            # torch.max raises on empty input; report "no signal" instead.
            return sample.new_zeros(())

        window = self.noise_len_samples
        if total > window:
            # `total - window + 1` makes the last *full* window inclusive.
            peaks = [
                sample[start:start + window].abs().max()
                for start in range(0, total - window + 1, window)
            ]
        else:
            peaks = [sample.abs().max()]

        return min(peaks)

    def _passes_snr(self, peak, noise_level):
        """True when the window has signal and its peak/noise ratio beats min_snr."""
        return bool(peak > 0) and bool(peak / noise_level > self.min_snr)

    @staticmethod
    def _normalise(segment, peak):
        """Centre ``segment`` in place and scale it by its pre-centring peak."""
        segment -= segment.mean()
        segment /= peak
        return segment

    def __call__(self, sample, noise_level):
        """Return the list of normalised segments of ``sample`` that pass the SNR gate.

        Args:
            sample: 1-D waveform tensor; it is not modified.
            noise_level: positive noise reference (float or 0-dim tensor).

        Returns:
            A list of 1-D tensors, each ``segment_len_samples`` long. Empty
            when ``sample`` is empty or no window is loud enough. A waveform
            shorter than one segment is padded with low-level Gaussian noise
            (1% of its peak) up to the segment length.
        """
        total = len(sample)
        if total == 0:
            return []

        seg_len = self.segment_len_samples
        call_segments = []

        if total > seg_len:
            # `total - seg_len + 1` keeps the window that ends exactly at the
            # end of the waveform instead of silently dropping it.
            for start in range(0, total - seg_len + 1, self.hop_len_samples):
                segment = sample[start:start + seg_len].clone()
                peak = segment.abs().max()
                if self._passes_snr(peak, noise_level):
                    call_segments.append(self._normalise(segment, peak))
        else:
            peak = sample.abs().max()
            if self._passes_snr(peak, noise_level):
                segment = torch.randn(seg_len) * (0.01 * peak)
                segment[:total] = sample
                call_segments.append(self._normalise(segment, peak))

        return call_segments

In [None]:
def get_audio(fpath, sample_rate):
    """Load an audio file as a 1-D tensor at ``sample_rate``.

    torchaudio is tried first: the first channel is taken and resampled when
    the file's native rate differs. If anything in that path fails, librosa is
    used as a fallback, loading at ``sample_rate`` with ``mono=True``.

    NOTE(review): the two paths are not equivalent for multi-channel files
    (first channel vs. librosa's mono conversion) -- confirm this is intended.

    Args:
        fpath: path of the audio file.
        sample_rate: target sampling rate.

    Returns:
        1-D ``torch.Tensor`` holding the waveform.
    """
    try:
        audio, orig_sr = torchaudio.load(fpath)
        audio = audio[0, :]
        if orig_sr != sample_rate:
            audio = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=sample_rate)(audio)
    except Exception:
        # Deliberate best-effort fallback for files torchaudio cannot handle.
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop a long-running inference loop.
        audio, _ = librosa.core.load(fpath, sr=sample_rate, mono=True)
        audio = torch.from_numpy(audio)
    return audio

# def get_all_segments(audio, sample_rate, segment_len_ms, hop_len_ms):
    
#     segment_len_samples = int(sample_rate * segment_len_ms / 1000)
#     hop_len_samples = int(sample_rate * hop_len_ms / 1000)

#     segments = []
#     idx = 0

#     while idx + segment_len_samples < len(audio):
#         segment = audio[int(idx):int(idx+segment_len_samples)].clone()
        
#         segment -= torch.mean(segment)
#         segment /= torch.max(torch.abs(segment))
        
#         segments.append(segment)
        
#         idx += hop_len_samples

#     return segments

def _inference_device(model):
    """Pick the device inputs must live on: the model's own, else CUDA if present, else CPU."""
    try:
        return next(model.parameters()).device
    except (AttributeError, StopIteration):
        # Plain callables or parameter-free modules carry no device hint.
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def run_inference(model, data, batch_size):
    """Run ``model`` over ``data`` in mini-batches and return the outputs on CPU.

    Args:
        model: callable (typically an ``nn.Module``) mapping a batch tensor to
            an output tensor. Its train/eval mode is left untouched.
        data: tensor whose first dimension indexes the examples.
        batch_size: number of examples per forward pass.

    Returns:
        CPU tensor with the per-batch outputs concatenated along dim 0, in the
        same order as ``data``.
    """
    device = _inference_device(model)
    test_dl = DataLoader(TensorDataset(data), batch_size=batch_size, shuffle=False)

    nn_out = []
    # No autograd graph is needed for inference; skipping it saves memory.
    with torch.no_grad():
        for (x_batch,) in test_dl:
            nn_out.append(model(x_batch.to(device)).detach().cpu())

    return torch.cat(nn_out, 0)

def _vote_birds(probabilities, num2label, prob_threshold, min_votes):
    """Return labels whose probability exceeds the threshold in at least ``min_votes`` segments."""
    hit_classes = np.argwhere(probabilities > prob_threshold)[:, 1]
    classes, votes = np.unique(hit_classes, return_counts=True)
    return [num2label[int(cls)] for cls in classes[votes >= min_votes]]


def get_preds(test_data, test_df, model, num2label, label2num, sample_rate, segment_len_ms, hop_len_ms, noise_len_ms, min_snr,
              prob_threshold=0.5, min_votes=2, batch_size=1024, clip_len_s=5, noise_floor=1e-2):
    """Predict the birds present for every row of ``test_df``.

    Each audio file is loaded once. For rows whose ``site`` is ``site_1`` or
    ``site_2`` only the ``clip_len_s`` seconds ending at ``row.seconds`` are
    analysed; for any other site the whole recording is used. The clip is cut
    into segments by ``SNRSegmenter``, the model scores each segment, and a
    bird is reported when its softmax probability exceeds ``prob_threshold``
    in at least ``min_votes`` segments.

    Args:
        test_data: directory (``Path``) containing ``<audio_id>.mp3`` files.
        test_df: DataFrame with ``audio_id``, ``site``, ``seconds`` and
            ``row_id`` columns.
        model: classifier mapping a (segments, samples) batch to logits. It is
            switched to eval mode and moved to the GPU when one is available.
        num2label: mapping from class index to label string.
        label2num: unused; kept for call compatibility.
        sample_rate, segment_len_ms, hop_len_ms, noise_len_ms, min_snr:
            forwarded to ``SNRSegmenter`` / ``get_audio``.
        prob_threshold: per-segment probability a class must exceed.
        min_votes: number of segments that must agree on a class.
        batch_size: segments per forward pass.
        clip_len_s: length in seconds of the window analysed for site_1/site_2 rows.
        noise_floor: lower bound applied to the estimated noise level.

    Returns:
        List of ``[row_id, birds]`` pairs, where ``birds`` is a space-separated
        label string or ``'nocall'``. Rows are grouped by audio file in order
        of first appearance, keeping the DataFrame order within each file.
    """
    segmenter = SNRSegmenter(sample_rate, segment_len_ms, hop_len_ms, noise_len_ms, min_snr)

    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    preds = []

    # groupby(sort=False) visits every row exactly once, instead of rescanning
    # the whole frame for each audio file.
    for audio_id, audio_rows in test_df.groupby('audio_id', sort=False):
        audio = get_audio(test_data / (audio_id + '.mp3'), sample_rate)

        noise_level = noise_floor
        if len(audio) > 0:
            noise_level = segmenter.get_noise_level(audio)
            if noise_level < noise_floor:
                noise_level = noise_floor

        for row in audio_rows.itertuples(index=False):
            if row.site in ('site_1', 'site_2'):
                end_idx = int(row.seconds * sample_rate)
                # Clamp at 0: a negative start would wrap around and slice
                # from the end of the recording.
                start_idx = max(0, int((row.seconds - clip_len_s) * sample_rate))
                sample_audio = audio[start_idx:end_idx].clone()
            else:
                sample_audio = audio.clone()

            birds = []
            # An empty clip (recording shorter than the requested window)
            # cannot contain a call, so skip segmentation entirely.
            if len(sample_audio) > 0:
                segments = segmenter(sample_audio, noise_level)
                if segments:
                    with torch.no_grad():
                        nn_out = run_inference(model, torch.stack(segments), batch_size=batch_size)
                    probabilities = F.softmax(nn_out, dim=1).numpy()
                    birds = _vote_birds(probabilities, num2label, prob_threshold, min_votes)

            preds.append([row.row_id, ' '.join(birds) if birds else 'nocall'])

    return preds

In [None]:
# Label vocabulary: the sorted distinct ebird codes of the training metadata.
# 'nocall' is not a model class; get_preds emits it when nothing is detected.
train_df = pd.read_csv(BASE_DIR / 'birdsong-recognition' / 'train.csv')
all_birds = sorted(train_df['ebird_code'].unique())

num2label = dict(enumerate(all_birds))
label2num = {label: idx for idx, label in num2label.items()}

test_df = pd.read_csv(DATA_DIR / 'test.csv')
test_data = DATA_DIR / 'test_audio'

# Segmentation settings (milliseconds unless noted).
sample_rate = 22050  # Hz
segment_len_ms = 2500
hop_len_ms = 500
noise_len_ms = 500
min_snr = 5.

model = CNN(num_classes=len(all_birds), sample_rate=sample_rate)
# map_location='cpu' lets a checkpoint saved from a GPU load on any host;
# get_preds is responsible for moving the model to its inference device.
# torch.load unpickles the file, so only point it at trusted checkpoints.
model.load_state_dict(torch.load('../input/bciweights200908/cnn_weights_1.pt', map_location='cpu'))

preds = get_preds(test_data, test_df, model, num2label, label2num, sample_rate, segment_len_ms, hop_len_ms, noise_len_ms, min_snr)

In [None]:
# Turn the [row_id, birds] pairs into the two-column submission table.
preds = pd.DataFrame(data=preds, columns=['row_id', 'birds'])

In [None]:
# Write the submission file without the DataFrame index column.
preds.to_csv(path_or_buf='submission.csv', index=False)