## Imports

In [None]:
# Change this flag to train model, else load it from private database
TRAIN = False

import cv2
import audioread
import logging
import os
import random
import time
import warnings
import pickle
import librosa
import librosa.display as display
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data

from contextlib import contextmanager
from IPython.display import Audio
from pathlib import Path
from typing import Optional, List

from fastprogress import progress_bar
from sklearn.metrics import f1_score, average_precision_score


!pip install ../input/bird-panns/torchlibrosa-master/torchlibrosa-master/

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Paths and configuration

In [None]:
# Root of the mounted input datasets; the dataset paths below are built from it.
INPUT_ROOT = "../input"
TRAIN_CSV_DIR = INPUT_ROOT + '/birdsong-recognition/train.csv'
# Five resampled-audio folders, suffixed 00 through 04.
TRAIN_DIRS = [
  INPUT_ROOT + "/birdsong-resampled-train-audio-{:0>2}".format(i)  for i in range(5)
]
TEST_DIR = INPUT_ROOT + '/birdsong-recognition/test_audio'
TEST_CSV_DIR = INPUT_ROOT + '/birdsong-recognition/test.csv'

# Maps the first letter of an ebird code to an index into TRAIN_DIRS.
# NOTE(review): there is no entry for 'z', so a code starting with 'z' raises
# KeyError when the audio path is built.
FIRST_LETTER_TO_FOLDER_MAP = {
    'a': 0, 'b': 0, 'c': 1, 'd': 1, 'e': 1, 
    'f': 1, 'g': 2, 'h': 2, 'i': 2, 'j': 2, 
    'k': 2, 'l': 2, 'm': 2, 'n': 3, 'o': 3,
    'p': 3, 'q': 3, 'r': 3, 's': 4, 't': 4,
    'u': 4, 'v': 4, 'w': 4, 'x': 4, 'y': 4
}

# Number of training epochs; also used as T_max of the cosine LR schedule.
n_epochs = 23


# When True, previously saved splits and model weights are loaded instead of
# being rebuilt from the CSV / the AudioSet-pretrained checkpoint.
LOAD_BEST_WEIGTHS = False

best_weights_path = '../input/backtoorigins/fold0/checkpoints/last.pth'
# NOTE(review): not referenced in the visible code (the training cell
# hard-codes the same path) -- confirm whether it is used elsewhere.
best_species_path = '../input/backtoorigins/species.pickle'

In [None]:
torch.__version__

## Csv Analyzer

In [None]:
# df = BirdcallCsv().df
# df.head()
# df[df['duration'] >= 5][df['duration'] < 60].groupby('ebird_code').agg('sum').describe()#['duration'].describe()

In [None]:
import ast 
from collections import Counter
    
class BirdcallCsv:
    """Training metadata table with audio paths, class maps and target vectors.

    Reads ``TRAIN_CSV_DIR``, keeps rows whose ``duration`` is at least 5,
    resolves each row to a resampled ``.wav`` path, prepends rows for the
    additional sound folders, and attaches a one-hot ``primary`` vector and a
    many-hot ``target`` vector to every row of ``self.df``.

    Args:
        min_n_species: minimum number of rows (counting primary and secondary
            labels) a species must appear in to receive a class index. Not
            used when ``species_pickle_path`` is given.
        secondary_target_contribution: value written into ``target`` for
            species that appear only as secondary labels of a row.
        species_pickle_path: optional path to a pickled ``{int: species}``
            map that is reused instead of deriving the map from the data.
    """

    def __init__(self, min_n_species=9, secondary_target_contribution=1, species_pickle_path=None):
        self.min_n_occurrences = min_n_species
        self.secondary_target_contribution = secondary_target_contribution
        self.species_pickle_path = species_pickle_path

        self.df = pd.read_csv(TRAIN_CSV_DIR)
        # Copy after filtering so later column assignments do not write into
        # a view of the original frame.
        self.df = self.df[self.df['duration'] >= 5].copy()
        print(self.df.shape)
        # Paths are derived from ebird codes, so this must run before the
        # extra folders (which have an empty ebird code) are added.
        self._create_audio_paths()
        self._add_additional_folders()
        self._count_species_occurrences()
        self._create_maps()
        self._create_targets()

        self.df = self.df[['ebird_code', 'species', 'secondary_labels', 'audio_path', 'target', 'primary']]

    def train_val_split_df(self, val_ratio=0.1):
        """Return ``(train_df, val_df)`` split, stratified on ``ebird_code``."""
        # Imported here so the class does not depend on a later notebook cell
        # having been executed first.
        from sklearn.model_selection import train_test_split

        train_df, val_df, _, _ = train_test_split(
            self.df,
            self.df["ebird_code"],
            stratify=self.df["ebird_code"],
            test_size=val_ratio,
        )
        return train_df, val_df

    def _add_additional_folders(self):
        """Add one row per file for every extra (non-CSV) sound class."""
        folders_dict = {
            'squirrel': '../input/pannrelatedmodules/squirrels'
        }
        for class_name, path in folders_dict.items():
            self._add_folder_to_csv(class_name, path)

    def _add_folder_to_csv(self, class_name, folder_path):
        """Prepend a row for each file in ``folder_path`` labelled ``class_name``."""
        rows = [
            {
                'ebird_code': '',
                'audio_path': os.path.join(folder_path, file_name),
                'species': class_name,
                'secondary_labels': '[]',
            }
            for file_name in os.listdir(folder_path)
        ]
        self.df = pd.concat([pd.DataFrame(rows), self.df], ignore_index=True)

    def _create_audio_paths(self):
        """Fill ``audio_path`` with the resampled wav location of each row."""
        def _create_audio_path(row):
            first_letter = row['ebird_code'][0]
            folder_path = TRAIN_DIRS[FIRST_LETTER_TO_FOLDER_MAP[first_letter]]
            return os.path.join(
                folder_path,
                row['ebird_code'],
                row['filename'].replace('mp3', 'wav')
            )

        self.df['audio_path'] = self.df.apply(_create_audio_path, axis=1)

    def _create_maps(self):
        """Build the species/int/code lookup maps and save them as pickles.

        Rows whose species has no class index are dropped from ``self.df`` in
        both modes, so that target creation never meets an unknown species.
        Writes ``classes.pickle`` and ``species.pickle`` to the working
        directory.
        """
        def _create_species_to_int_map():
            bool_mask = self.occurrences_df[0] >= self.min_n_occurrences
            species_filtered = self.occurrences_df[bool_mask]

            print("Before filtering:", self.df.shape)
            self.df = self.df[self.df['species'].isin(species_filtered.index)].copy()
            print('after filtering:', self.df.shape)
            self.species_to_int_map = dict(
                zip(species_filtered.index, range(species_filtered.shape[0]))
            )

        def _create_int_to_species_map():
            self.int_to_species_map = {
                v: k for k, v in self.species_to_int_map.items()
            }

        def _create_species_to_code_map():
            first_rows = self.df.groupby('species').first()
            self.species_to_code_map = {
                species: row['ebird_code']
                for species, row in first_rows.iterrows()
            }

        def _create_int_to_code_map():
            self.int_to_code_map = {
                i: self.species_to_code_map.get(s, '')
                for i, s in self.int_to_species_map.items()
            }

        if self.species_pickle_path is not None:
            # NOTE: pickle executes arbitrary code on load; only point this at
            # files you created yourself.
            with open(self.species_pickle_path, 'rb') as pickle_in:
                self.int_to_species_map = pickle.load(pickle_in)
            self.species_to_int_map = {v: k for k, v in self.int_to_species_map.items()}
            # The loaded map may not cover every species in the CSV; without
            # this filter _create_targets would raise KeyError on those rows.
            known_species = list(self.species_to_int_map)
            self.df = self.df[self.df['species'].isin(known_species)].copy()
        else:
            _create_species_to_int_map()
            _create_int_to_species_map()
        _create_species_to_code_map()
        _create_int_to_code_map()

        with open("classes.pickle", "wb") as pickle_out:
            pickle.dump(self.int_to_code_map, pickle_out)

        with open("species.pickle", "wb") as pickle_out:
            pickle.dump(self.int_to_species_map, pickle_out)
        print("species saved at species.pickle")

    def _create_targets(self):
        """Attach the ``primary`` (one-hot) and ``target`` (many-hot) columns."""
        n_classes = len(self.species_to_int_map)

        def _create_primary_target(row):
            one_hot_encoding = np.zeros(n_classes)
            one_hot_encoding[self.species_to_int_map[row['species']]] = 1
            return one_hot_encoding

        def _create_global_target(row):
            many_hot_encoding = np.zeros(n_classes)
            for species in self._get_species_in_row(row):
                if species in self.species_to_int_map:
                    ind = self.species_to_int_map[species]
                    many_hot_encoding[ind] = self.secondary_target_contribution
            # Written last so the primary species is always exactly 1, even
            # when it also appears among the secondary labels.
            many_hot_encoding[self.species_to_int_map[row['species']]] = 1
            return many_hot_encoding

        self.df['primary'] = self.df.apply(_create_primary_target, axis=1)
        self.df['target'] = self.df.apply(_create_global_target, axis=1)

    def _get_species_in_row(self, row):
        """Return the set of species names of the primary + secondary labels.

        Each secondary label is split on ``'_'`` and its second piece is taken
        as the species name; a label without an underscore raises IndexError.
        """
        species = {
            label.split('_')[1]
            for label in ast.literal_eval(row['secondary_labels'])
        }
        species.add(row['species'])
        return species

    def _count_species_occurrences(self):
        """Count, per species, the rows that mention it as primary or secondary.

        The result is stored in ``self.occurrences_df`` (index: species,
        column ``0``: count).
        """
        species_count = Counter()
        for _, row in self.df.iterrows():
            # Each row contributes at most one count per species (it is a set).
            species_count.update(self._get_species_in_row(row))
        self.occurrences_df = pd.DataFrame.from_dict(species_count, orient='index')

    def label_to_code(self, label):
        """Return the ebird code for class index ``label`` ('' if unknown)."""
        species = self.int_to_species_map[label]
        return self.species_to_code_map.get(species, '')

## Dataset

In [None]:
import torch
from sklearn.model_selection import train_test_split
import librosa
import soundfile as sf
from copy import deepcopy

def wave_to_frames(wave, frame_size):
    """Cut a 1-D wave into consecutive frames of ``frame_size`` samples.

    Returns a matrix of shape (n, frame_size), where n is the number of whole
    frames that fit in the wave; up to ``frame_size - 1`` trailing samples are
    discarded. A wave no longer than one frame is instead zero-padded on both
    sides (centred) and returned as a single-row matrix.
    """
    total = wave.shape[0]

    if total > frame_size:
        whole_frames = total // frame_size
        usable = wave[:whole_frames * frame_size]
        return np.reshape(usable, (whole_frames, frame_size))

    # Short input: centre it inside one zero-filled frame.
    padded = np.zeros(frame_size)
    offset = (frame_size - total) // 2
    padded[offset:offset + total] = wave
    return padded[np.newaxis, :]
 
    
class BirdcallDataset(torch.utils.data.Dataset):
    """Map-style dataset returning one random fixed-length clip per row.

    Args:
        df: frame with ``audio_path``, ``target`` and ``primary`` columns;
            ``target`` entries must expose ``.shape[0]`` (number of classes).
        sr: sample rate passed to ``librosa.load``.
        batch_sizes_in_seconds: clip lengths (in seconds) to choose from; the
            first one is active until ``on_new_batch`` is called.
    """

    def __init__(
        self,
        df,
        sr = 32000,
        batch_sizes_in_seconds=(5,),#[5,10,15],
    ):
        self.df = df
        self.sr = sr

        # Copied into a fresh list so instances never share (or mutate) the
        # default argument or the caller's sequence.
        self.batch_sizes = list(batch_sizes_in_seconds)
        self.cur_batch_size = self.batch_sizes[0]
        self.cur_frame_size = int(self.cur_batch_size*self.sr)

        self.num_classes = self.df.iloc[0]['target'].shape[0]
        print('Num classes: {}'.format(self.num_classes))

    def __getitem__(self, index):
        """Load row ``index`` and return its waveform clip and label vectors."""
        row = self.df.iloc[index]

        y, sr = librosa.load(row["audio_path"], sr=self.sr,
                               mono=True,
                               res_type="kaiser_fast")
        y = self._fit_to_frame(y)

        return {
            "waveform": y.astype(np.float32),
            "targets": row['target'],
            "primary_target": row['primary']
        }

    def _fit_to_frame(self, y):
        """Return ``y`` padded or cropped to ``self.cur_frame_size`` samples.

        Shorter signals are placed at a random offset inside a zero buffer;
        longer ones are cropped at a random offset; exact fits are returned
        unchanged.
        """
        frame_size = self.cur_frame_size
        len_y = len(y)

        if len_y < frame_size:
            padded = np.zeros(frame_size, dtype=y.dtype)
            start = np.random.randint(frame_size - len_y)
            padded[start:start + len_y] = y
            return padded
        if len_y > frame_size:
            start = np.random.randint(len_y - frame_size)
            return y[start:start + frame_size]
        return y

    def __len__(self):
        return self.df.shape[0]

    def on_new_batch(self):
        """Pick a new random clip length for subsequent items."""
        self.cur_batch_size = random.choice(self.batch_sizes)
        self.cur_frame_size = int(self.cur_batch_size*self.sr)


class TrainingIterDataset(torch.utils.data.IterableDataset):
    """Iterable dataset that yields every fixed-size frame of each audio file.

    Each file is read with ``soundfile``, optionally cropped to a random
    window of at most ``max_mins_per_sample`` minutes, and cut into frames of
    ``frame_size_in_secs`` seconds. When used with DataLoader workers the rows
    are split into contiguous shards, one per worker.

    Args:
        df: frame with ``audio_path``, ``target`` and ``primary`` columns.
        frame_size_in_secs: length of each yielded frame in seconds.
        max_mins_per_sample: cap, in minutes, on audio used from one file.
        sr: sample rate used to convert seconds to samples (files are not
            resampled here).
    """

    def __init__(self, df, frame_size_in_secs=5, max_mins_per_sample=.5, sr=32000):

        self.sr = sr

        self.frame_size_in_secs = frame_size_in_secs
        self.frame_size = int(frame_size_in_secs*sr)

        self.max_mins_per_sample = max_mins_per_sample
        self.max_samples = int(max_mins_per_sample*60*sr)
        self.max_frames = int(self.max_samples/self.frame_size)
        self.n_original = df.shape[0]

        # single-process data loading, return the full iterator
        self.iter_start = 0
        self.iter_end = self.n_original

        # Label arrays are deep-copied so the dataset does not alias df cells.
        self.rows = [
            {
                'audio_path': row['audio_path'],
                'targets': deepcopy(row['target']),
                'primary_target': deepcopy(row['primary'])
            }
            for _, row in df.iterrows()
        ]

        self._compute_len()

    @staticmethod
    def _worker_bounds(n_rows, num_workers, worker_id):
        """Return the ``(start, end)`` row slice owned by ``worker_id``.

        Uses ceiling division so that the shards of all workers together
        cover every row; trailing workers may receive an empty slice.
        """
        per_worker = -(-n_rows // num_workers)  # ceil(n_rows / num_workers)
        start = min(worker_id * per_worker, n_rows)
        end = min(start + per_worker, n_rows)
        return start, end

    def _create_worker_props(self):
        """Restrict this copy of the dataset to its worker's shard, if any."""
        worker_info = torch.utils.data.get_worker_info()

        if worker_info is not None:  # in a worker process
            self.iter_start, self.iter_end = self._worker_bounds(
                self.n_original, worker_info.num_workers, worker_info.id
            )

        self._compute_len()

    def __iter__(self):
        self._create_worker_props()

        for row in self.rows[self.iter_start:self.iter_end]:
            y, sr = sf.read(row['audio_path'])

            for frame in self._create_frames(y):
                yield {
                    "waveform": frame,
                    "targets": row['targets'],
                    "primary_target": row['primary_target']
                }

    def __len__(self):
        return self.n_samples

    def _compute_len(self):
        """Recount the frames in the current row slice (opens every file)."""
        rows = self.rows[self.iter_start:self.iter_end]
        self.n_samples = sum(self._compute_len_sample(r) for r in rows)

    def _compute_len_sample(self, row):
        """Return how many frames iteration yields for ``row``'s file."""
        with sf.SoundFile(row['audio_path']) as f:
            samples = len(f)

        frames = min(samples // self.frame_size, self.max_frames)
        # A clip shorter than one frame is zero-padded into a single frame by
        # wave_to_frames, so it still contributes one item.
        return max(1, frames)

    def _create_frames(self, signal):
        """Crop ``signal`` to a random window if too long, then frame it."""
        if signal.shape[0] > self.max_samples:
            start = np.random.randint(signal.shape[0] - self.max_samples)
            signal = signal[start:start + self.max_samples]
        return wave_to_frames(signal, self.frame_size).astype(np.float32)
        
    
class TrainingIterDatasetSample(torch.utils.data.IterableDataset):
    """Iterable dataset over the fixed-size frames of a single audio file.

    Args:
        row: mapping with ``audio_path``, ``target`` and ``primary`` entries;
            ``target`` must expose ``.shape[0]`` (number of classes).
        frame_size_in_secs: length of each yielded frame in seconds.
        max_mins: cap, in minutes, on the audio used from the file.
        sr: sample rate used to convert seconds to samples (the file is not
            resampled here).
    """

    def __init__(self, row, frame_size_in_secs=5, max_mins=.5, sr=32000):
        self.audio_path = row['audio_path']
        self.targets = row['target']
        self.primary_target = row['primary']
        self.sr = sr

        self.frame_size_in_secs = frame_size_in_secs
        self.frame_size = int(frame_size_in_secs*self.sr)

        self.max_mins = max_mins
        self.max_samples = int(max_mins*60*self.sr)
        self.max_frames = int(self.max_samples/self.frame_size)

        self.n_frames = self.create_len()

        self.num_classes = self.targets.shape[0]

    def __iter__(self):
        y, sr = sf.read(self.audio_path)

        for frame in self.create_frames(y):
            yield {
                "waveform": frame,
                "targets": self.targets,
                "primary_target": self.primary_target
            }

    def __len__(self):
        return self.n_frames

    def create_len(self, row=None):
        """Return how many frames iteration yields for this file.

        ``row`` is unused and kept only for backward compatibility; it is
        optional because ``__init__`` calls this method without arguments.
        """
        with sf.SoundFile(self.audio_path) as f:
            samples = len(f)

        frames = min(samples // self.frame_size, self.max_frames)
        # A clip shorter than one frame is zero-padded into a single frame by
        # wave_to_frames, so it still contributes one item.
        return max(1, frames)

    def create_frames(self, signal):
        """Crop ``signal`` to a random window if too long, then frame it."""
        if signal.shape[0] > self.max_samples:
            start = np.random.randint(signal.shape[0] - self.max_samples)
            signal = signal[start:start + self.max_samples]
        return wave_to_frames(signal, self.frame_size).astype(np.float32)
        

## Model

In [None]:
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F


def do_mixup(x, mixup_lambda):
    """Blend each even-indexed sample with the odd-indexed sample after it.

    Args:
      x: (batch_size * 2, ...)
      mixup_lambda: (batch_size * 2,) per-sample weights
    Returns:
      out: (batch_size, ...), where
        out[i] = x[2i] * mixup_lambda[2i] + x[2i + 1] * mixup_lambda[2i + 1]
    """
    # The batch axis is moved last so the 1-D weights broadcast over it.
    even_part = x[0::2].transpose(0, -1) * mixup_lambda[0::2]
    odd_part = x[1::2].transpose(0, -1) * mixup_lambda[1::2]
    blended = even_part + odd_part
    return blended.transpose(0, -1)
    

def append_to_dict(dict, key, value):
    """Append ``value`` to the list stored under ``key``, creating it if absent."""
    dict.setdefault(key, []).append(value)


def forward(model, generator, return_input=False, 
    return_target=False):
    """Run a model over all mini-batches of a generator and gather outputs.

    Args: 
      model: object
      generator: iterable of batch dicts with 'waveform' and 'audio_name'
        (and optionally 'target') entries
      return_input: bool, also collect the input waveforms
      return_target: bool, also collect 'target' when a batch provides it
    Returns:
      dict of arrays, each concatenated over batches along axis 0:
        audio_name: (audios_num,)
        clipwise_output: (audios_num, classes_num)
        (optional) waveform: (audios_num, segment_samples)
        (optional) target: (audios_num, classes_num)
    """
    collected = {}
    device = next(model.parameters()).device

    def _collect(name, item):
        collected.setdefault(name, []).append(item)

    # Forward data to the model one mini-batch at a time
    for step, batch in enumerate(generator):
        print(step)
        waveform = move_data_to_device(batch['waveform'], device)

        with torch.no_grad():
            model.eval()
            prediction = model(waveform)

        _collect('audio_name', batch['audio_name'])
        _collect('clipwise_output',
            prediction['clipwise_output'].data.cpu().numpy())

        if return_input:
            _collect('waveform', batch['waveform'])

        if return_target and 'target' in batch.keys():
            _collect('target', batch['target'])

    # Stitch the per-batch pieces of every field into a single array
    for name in collected.keys():
        collected[name] = np.concatenate(collected[name], axis=0)

    return collected


def interpolate(x, ratio):
    """Upsample along the time axis by repeating every step `ratio` times.

    Used to undo the time-resolution reduction caused by the downsampling
    layers of a CNN.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, ratio to interpolate
    Returns:
      upsampled: (batch_size, time_steps * ratio, classes_num)
    """
    batch_size, time_steps, classes_num = x.shape
    # Insert a repeat axis after time, tile it, then fold it back into time.
    tiled = x.unsqueeze(2).repeat(1, 1, ratio, 1)
    return tiled.reshape(batch_size, time_steps * ratio, classes_num)


def pad_framewise_output(framewise_output, frames_num):
    """Extend framewise_output along time to frames_num frames.

    The added frames are copies of the last existing frame.
    Args:
      framewise_output: (batch_size, frames, classes_num)
      frames_num: int, total number of frames wanted after padding
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    missing = frames_num - framewise_output.shape[1]
    last_frame = framewise_output[:, -1 :, :]

    # tensor for padding: the last frame repeated for every missing step
    padding = last_frame.repeat(1, missing, 1)

    # (batch_size, frames_num, classes_num)
    return torch.cat((framewise_output, padding), dim=1)

In [None]:
from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation
import torch
import torch.nn as nn
import torch.nn.functional as F


def init_layer(layer):
    """Xavier-initialize the weight of a Linear/Conv layer and zero its bias."""
    nn.init.xavier_uniform_(layer.weight)

    # Layers built with bias=False carry a None bias; leave those alone.
    bias = getattr(layer, 'bias', None)
    if bias is not None:
        bias.data.fill_(0.)
            
    
def init_bn(bn):
    """Reset a Batchnorm layer to the identity affine map (scale 1, shift 0)."""
    bn.weight.data.fill_(1.)
    bn.bias.data.fill_(0.)


class ConvBlock(nn.Module):
    """Two 3x3 conv + batchnorm + ReLU stages followed by 2-D pooling."""

    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()

        # Both convolutions keep the spatial size (stride 1, padding 1) and
        # have no bias, since a batchnorm follows each of them.
        shared = dict(kernel_size=(3, 3), stride=(1, 1),
                      padding=(1, 1), bias=False)

        self.conv1 = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels, **shared)
        self.conv2 = nn.Conv2d(in_channels=out_channels,
                               out_channels=out_channels, **shared)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        """Apply the shared conv / batchnorm initializers to this block."""
        for conv in (self.conv1, self.conv2):
            init_layer(conv)
        for bn in (self.bn1, self.bn2):
            init_bn(bn)

    def forward(self, input, pool_size=(2, 2), pool_type='avg'):
        """Run both conv stages, then pool with 'avg', 'max' or 'avg+max'.

        Raises Exception for any other ``pool_type`` (after the conv stages
        have already been evaluated).
        """
        features = F.relu_(self.bn1(self.conv1(input)))
        features = F.relu_(self.bn2(self.conv2(features)))

        if pool_type == 'avg':
            return F.avg_pool2d(features, kernel_size=pool_size)
        if pool_type == 'max':
            return F.max_pool2d(features, kernel_size=pool_size)
        if pool_type == 'avg+max':
            averaged = F.avg_pool2d(features, kernel_size=pool_size)
            peaked = F.max_pool2d(features, kernel_size=pool_size)
            return averaged + peaked

        raise Exception('Incorrect argument!')


class Cnn14(nn.Module):
    """Log-mel front end followed by six ConvBlocks and two linear layers.

    The forward pass returns a dict with a sigmoid 'clipwise_output' of size
    ``classes_num`` and a 2048-dimensional 'embedding'.
    """

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num):
        
        super(Cnn14, self).__init__()

        # Fixed front-end settings
        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Waveform -> spectrogram (parameters frozen)
        self.spectrogram_extractor = Spectrogram(
            n_fft=window_size, hop_length=hop_size, win_length=window_size,
            window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Spectrogram -> log-mel (parameters frozen)
        self.logmel_extractor = LogmelFilterBank(
            sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin,
            fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Time/frequency masking, applied in training mode only
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        # Batchnorm over the frequency axis (64 features)
        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)
        
        self.init_weight()

    def init_weight(self):
        """Initialize the input batchnorm and both linear layers."""
        init_bn(self.bn0)
        for linear in (self.fc1, self.fc_audioset):
            init_layer(linear)

    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)"""

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        # bn0 normalizes per mel bin, so that axis is swapped into the
        # channel position for the call and swapped back afterwards.
        x = self.bn0(x.transpose(1, 3)).transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

            # Mixup on spectrogram
            if mixup_lambda is not None:
                x = do_mixup(x, mixup_lambda)

        # The first five blocks halve both axes; the last keeps the size.
        stages = (
            (self.conv_block1, (2, 2)),
            (self.conv_block2, (2, 2)),
            (self.conv_block3, (2, 2)),
            (self.conv_block4, (2, 2)),
            (self.conv_block5, (2, 2)),
            (self.conv_block6, (1, 1)),
        )
        for block, pool_size in stages:
            x = block(x, pool_size=pool_size, pool_type='avg')
            x = F.dropout(x, p=0.2, training=self.training)

        # Collapse the last (frequency) axis, then pool dim 2 as max + mean.
        x = torch.mean(x, dim=3)
        peak, _ = torch.max(x, dim=2)
        x = peak + torch.mean(x, dim=2)

        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu_(self.fc1(x))
        embedding = F.dropout(x, p=0.5, training=self.training)
        # The classifier reads the pre-dropout activations; only the returned
        # embedding has dropout applied.
        clipwise_output = torch.sigmoid(self.fc_audioset(x))

        return {'clipwise_output': clipwise_output, 'embedding': embedding}


class Transfer_Cnn14(nn.Module):
    """Cnn14 backbone with a new linear head for a different label set."""

    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num, freeze_base=True):
        """Classifier for a new task using pretrained Cnn14 as a sub module.

        When ``freeze_base`` is true, gradients are disabled for every
        parameter of the backbone.
        """
        super().__init__()

        # The backbone keeps its 527-way output layer so that pretrained
        # checkpoints load into it without key or shape mismatches.
        audioset_classes_num = 527
        self.base = Cnn14(sample_rate, window_size, hop_size, mel_bins, fmin,
                          fmax, audioset_classes_num)

        # Transfer to another task layer
        self.fc_transfer = nn.Linear(2048, classes_num, bias=True)

        if freeze_base:
            # Freeze AudioSet pretrained layers
            for backbone_param in self.base.parameters():
                backbone_param.requires_grad = False

        self.init_weights()

    def init_weights(self):
        """Initialize the transfer head only; the backbone is left as built."""
        init_layer(self.fc_transfer)

    def load_from_pretrain(self, pretrained_checkpoint_path):
        """Load backbone weights from the checkpoint's 'model' entry."""
        state = torch.load(pretrained_checkpoint_path,
                           map_location=torch.device('cpu'))
        self.base.load_state_dict(state['model'])

    def load_weights(self, weights_checkpoint_path):
        """Load the whole model from the checkpoint's 'model_state_dict' entry."""
        state = torch.load(weights_checkpoint_path,
                           map_location=torch.device('cpu'))
        self.load_state_dict(state["model_state_dict"])

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, data_length)

        Returns the backbone's output dict with 'clipwise_output' replaced by
        the transfer head's raw (un-squashed) scores.
        """
        output_dict = self.base(input, mixup_lambda)
        output_dict['clipwise_output'] = self.fc_transfer(output_dict['embedding'])
        return output_dict

## Criterion

In [None]:
class PANNsLoss(nn.Module):
    """BCE-with-logits on 'clipwise_output', with NaN/inf scores zeroed first."""

    def __init__(self):
        super().__init__()

        self.bce = nn.BCEWithLogitsLoss()

    def forward(self, input, target):
        """Return the mean BCE-with-logits between the scores and ``target``.

        ``input`` is the model's output dict; ``target`` is cast to float.
        """
        scores = input["clipwise_output"]

        # Non-finite scores are replaced by 0 so they cannot poison the loss.
        invalid = torch.isnan(scores) | torch.isinf(scores)
        scores = torch.where(invalid, torch.zeros_like(scores), scores)

        return self.bce(scores, target.float())
    

def loss_func(output_dict, target_dict):
    """Return minus the mean of target * clipwise_output over all elements."""
    weighted = target_dict['target'] * output_dict['clipwise_output']
    return - torch.mean(weighted)

## Callbacks and Logging

In [None]:
from catalyst.dl import SupervisedRunner, State, CallbackOrder, Callback, CheckpointCallback

def set_seed(seed: int = 42):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Also exports ``PYTHONHASHSEED`` (which only affects processes started
    afterwards).
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Benchmark mode auto-tunes convolution algorithms per run, which can pick
    # different kernels between runs and so defeats the determinism requested
    # on the line above.
    torch.backends.cudnn.benchmark = False  # type: ignore
    
    
def get_logger(out_file=None):
    """Reset the root logger to INFO with a console handler and return it.

    Any handlers already on the root logger are discarded. When ``out_file``
    is given, a file handler writing to it is attached as well.
    """
    root = logging.getLogger()
    line_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    root.handlers = []
    root.setLevel(logging.INFO)

    def _attach(handler):
        handler.setFormatter(line_format)
        handler.setLevel(logging.INFO)
        root.addHandler(handler)

    _attach(logging.StreamHandler())
    if out_file is not None:
        _attach(logging.FileHandler(out_file))

    root.info("logger set up")
    return root
    
    
@contextmanager
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Context manager that reports when a named step starts and how long it took.

    Messages go to ``logger.info`` when a logger is given, otherwise to
    ``print``. The closing message is only emitted if the body finishes
    without raising.
    """
    started = time.time()
    report = print if logger is None else logger.info

    report(f"[{name}] start")
    yield
    report(f"[{name}] done in {time.time() - started:.2f} s")
    
    
set_seed(1213)

class F1Callback(Callback):
    """Metric callback reporting macro F1 of arg-max predictions.

    The score is written per batch, and once more per loader over all
    batches accumulated since the loader started.
    """

    def __init__(self,
                 input_key: str = 'primary_target',#"targets",
                 output_key: str = "logits",
                 model_output_key: str = "clipwise_output",
                 prefix: str = "f1"):
        super().__init__(CallbackOrder.Metric)

        self.prefix = prefix
        self.input_key = input_key
        self.output_key = output_key
        self.model_output_key = model_output_key

    def on_loader_start(self, state: State):
        # Start a fresh accumulation for this loader.
        self.prediction: List[np.ndarray] = []
        self.target: List[np.ndarray] = []

    def on_batch_end(self, state: State):
        batch_targets = state.input[self.input_key].detach().cpu().numpy()
        model_out = state.output[self.output_key]
        batch_scores = model_out[self.model_output_key].detach().cpu().numpy()

        self.prediction.append(batch_scores)
        self.target.append(batch_targets)

        # Both sides are reduced to a single class index per example.
        state.batch_metrics[self.prefix] = f1_score(
            batch_targets.argmax(axis=1),
            batch_scores.argmax(axis=1),
            average="macro",
        )

    def on_loader_end(self, state: State):
        predicted = np.concatenate(self.prediction, axis=0).argmax(axis=1)
        actual = np.concatenate(self.target, axis=0).argmax(axis=1)
        score = f1_score(actual, predicted, average="macro")

        state.loader_metrics[self.prefix] = score
        # Epoch metrics are keyed by the validation loader's name, or "train".
        loader_name = state.valid_loader if state.is_valid_loader else "train"
        state.epoch_metrics[loader_name + "_epoch_" + self.prefix] = score


class mAPCallback(Callback):
    """Metric callback reporting the mean of per-class average precision.

    Classes whose average precision is NaN count as 0 in the mean. The score
    is written per batch, and once more per loader over all batches
    accumulated since the loader started.
    """

    def __init__(self,
                 input_key: str = 'primary_target',#"targets",
                 output_key: str = "logits",
                 model_output_key: str = "clipwise_output",
                 prefix: str = "mAP"):
        super().__init__(CallbackOrder.Metric)
        self.prefix = prefix
        self.input_key = input_key
        self.output_key = output_key
        self.model_output_key = model_output_key

    def on_loader_start(self, state: State):
        # Start a fresh accumulation for this loader.
        self.prediction: List[np.ndarray] = []
        self.target: List[np.ndarray] = []

    def on_batch_end(self, state: State):
        batch_targets = state.input[self.input_key].detach().cpu().numpy()
        model_out = state.output[self.output_key]
        batch_scores = model_out[self.model_output_key].detach().cpu().numpy()

        self.prediction.append(batch_scores)
        self.target.append(batch_targets)

        per_class = average_precision_score(batch_targets, batch_scores, average=None)
        state.batch_metrics[self.prefix] = np.nan_to_num(per_class).mean()

    def on_loader_end(self, state: State):
        all_scores = np.concatenate(self.prediction, axis=0)
        all_targets = np.concatenate(self.target, axis=0)
        per_class = average_precision_score(all_targets, all_scores, average=None)
        score = np.nan_to_num(per_class).mean()

        state.loader_metrics[self.prefix] = score
        # Epoch metrics are keyed by the validation loader's name, or "train".
        loader_name = state.valid_loader if state.is_valid_loader else "train"
        state.epoch_metrics[loader_name + "_epoch_" + self.prefix] = score
            
            

## Train

In [None]:
# Build (or reload) the train/validation frames, persist them next to the
# notebook, and wrap them in datasets.
if LOAD_BEST_WEIGTHS:
    # Reuse the class map and the splits saved by an earlier run.
    # NOTE(review): BirdcallCsv writes its int -> species map to
    # species.pickle, so despite the variable name this is presumably that
    # map; only its length is used here.
    with open('../input/backtoorigins/species.pickle', 'rb') as f:   
        species_to_int_map = pickle.load(f)
        num_classes = len(species_to_int_map)
    with open('../input/backtoorigins/train_csv.pickle', 'rb') as f:   
        df_train = pickle.load(f)  
    # NOTE(review): this path ends in .pickle but is parsed as CSV, while the
    # commented-out lines below load it as a pickle -- confirm the real format.
    df_val = pd.read_csv('../input/backtoorigins/val_csv.pickle')
    # The vector columns come back from CSV as text; strip the dots and turn
    # spaces into commas so literal_eval can parse each one as a list.
    # NOTE(review): assumes single-space-separated values such as "0. 1. 0."
    # -- TODO confirm.
    df_val['target'] = df_val.apply(lambda x: np.array(ast.literal_eval(x['target'].replace('.', '').replace(' ',','))), axis=1)
    df_val['primary'] = df_val.apply(lambda x: np.array(ast.literal_eval(x['primary'].replace('.', '').replace(' ',','))), axis=1)
    #with open('../input/backtoorigins/val_csv.pickle', 'rb') as f:   
    #    df_val = pickle.load(f)  
else:        
    # Fresh run: derive classes and targets from the CSV, hold out 10%.
    birdcall_csv = BirdcallCsv()
    num_classes = len(birdcall_csv.species_to_int_map)
    df_train, df_val = birdcall_csv.train_val_split_df(0.1)
# Saved in both branches so this run's exact split can be reloaded later.
df_train.to_pickle('train_csv.pickle')
df_val.to_pickle('val_csv.pickle')
print("Saved training and validation csvs")

# dataset_train = df_train.apply(lambda row: TrainingIterDataset(row), axis=1).tolist()
# dataset_val = df_val.apply(lambda row: TrainingIterDataset(row), axis=1).tolist()
#dataset_train = TrainingIterDataset(df_train)
#dataset_val = TrainingIterDataset(df_val)

#dataset_train, dataset_val = torch.utils.data.ChainDataset(dataset_train), torch.utils.data.ChainDataset(dataset_val)


# Map-style datasets: one random clip per row on each access.
dataset_train, dataset_val = BirdcallDataset(df_train), BirdcallDataset(df_val)


print("Training samples: {}\nValidation samples: {}".format(len(dataset_train), len(dataset_val)))



In [None]:
if TRAIN:

    # Loaders: both share batch size, worker count and pinned memory; only
    # the training loader drops the last incomplete batch. `shuffle` is not
    # passed to either loader.
    loader_kwargs = dict(batch_size=32, num_workers=4, pin_memory=True)
    loaders = {
        "train": torch.utils.data.DataLoader(
            dataset_train, drop_last=True, **loader_kwargs),
        "valid": torch.utils.data.DataLoader(
            dataset_val, drop_last=False, **loader_kwargs),
    }

    # Model: spectrogram front-end settings plus the classifier size.
    model_config = dict(
        sample_rate=32000,
        window_size=1024,
        hop_size=320,
        mel_bins=64,
        fmin=1000,
        fmax=14000,
        classes_num=num_classes,
        freeze_base=False,
    )
    model = Transfer_Cnn14(**model_config)

    # Start either from our own checkpoint or from the pretrained backbone.
    if not LOAD_BEST_WEIGTHS:
        model.load_from_pretrain(
            "../input/panns-pretrained-cnn14model/Cnn14_mAP_0.431.pth")
    else:
        model.load_weights(best_weights_path)

    model.to(device)
    print('Using GPU: ', torch.cuda.device_count() > 0)

    # Optimizer, cosine schedule spanning the whole run, and loss.
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=n_epochs)
    criterion = PANNsLoss().to(device)

    # Callbacks: F1 and mAP both score "logits" against "primary_target".
    metric_io = dict(input_key="primary_target", output_key="logits")
    callbacks = [
        F1Callback(prefix="f1", **metric_io),
        mAPCallback(prefix="mAP", **metric_io),
        CheckpointCallback(save_n_best=0),
    ]
    warnings.simplefilter("ignore")

    runner = SupervisedRunner(
        device=device,
        input_key="waveform",
        input_target_key="targets",
    )

    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        num_epochs=n_epochs,
        logdir="fold0",
        main_metric="epoch_f1",
        minimize_metric=False,
        verbose=True,
    )

## Bird Detector

In [None]:
def wave_to_frames(wave, frame_size):
    """Split a 1-D wave into non-overlapping frames of ``frame_size`` samples.

    Args:
        wave: 1-D NumPy array of samples.
        frame_size: Number of samples per frame.

    Returns:
        A 2-D array of shape ``(n_frames, frame_size)`` with the same dtype
        as ``wave``.

        * If ``wave`` has at most ``frame_size`` samples, a single frame is
          returned with the wave centred in it and zeros on both sides.
        * Otherwise the wave is cut into ``len(wave) // frame_size`` full
          frames; up to ``frame_size - 1`` trailing samples are dropped.
    """
    n = wave.shape[0]
    if n <= frame_size:
        # Keep the input dtype so both branches return the same kind of
        # array (np.zeros would otherwise upcast the frame to float64).
        padded = np.zeros(frame_size, dtype=wave.dtype)
        offset = (frame_size - n) // 2
        padded[offset:offset + n] = wave
        return padded[np.newaxis, :]

    n_frames = n // frame_size
    return np.reshape(wave[:n_frames * frame_size], (n_frames, frame_size))
 
def get_model(config: dict, weights_path: str):
    """Build a ``Transfer_Cnn14`` ready for inference.

    The network is constructed from ``config`` (passed as keyword
    arguments), its weights are loaded from ``weights_path``, and it is moved
    to the module-level ``device`` and switched to evaluation mode before
    being returned.
    """
    net = Transfer_Cnn14(**config)
    net.load_weights(weights_path)

    # Place the network on the target device, then switch to eval mode.
    net.to(device)
    net.eval()
    return net

class BirdDetector:
    """Run a trained ``Transfer_Cnn14`` over audio and report detected birds.

    The detector cuts a signal into fixed-size frames, scores every frame
    with the model, and reports each class whose probability reaches
    ``threshold`` in at least one frame.

    Args:
        model_config: Keyword arguments for the model; must contain
            ``"sample_rate"``.
        weights_path: Checkpoint path handed to ``get_model``.
        classes_dict: Maps a class index to its ebird code. Falsy entries
            are skipped when labels are translated.
        threshold: Minimum probability for a class to count as detected.
        frame_size_in_secs: Length of one model input frame, in seconds.
    """

    def __init__(self, model_config,
                 weights_path,
                 classes_dict,
                 threshold=0.5, frame_size_in_secs=5):
        self.model_config = model_config
        self.threshold = threshold
        self.sr = model_config["sample_rate"]
        self.frame_size_in_secs = frame_size_in_secs
        # Frame length expressed in samples.
        self.frame_size = int(self.frame_size_in_secs * self.sr)
        self.model = get_model(model_config, weights_path)

        # Number of frames sent through the model per forward pass.
        self.batch_size = 32

        self.classes_dict = classes_dict

    def birds_on_file(self, path_to_audio_file):
        """Load an audio file as mono at ``self.sr`` and detect birds in it."""
        signal, _ = librosa.load(path_to_audio_file,
                                 sr=self.sr,
                                 mono=True,
                                 res_type="kaiser_fast")

        return self.birds_on_signal(signal)

    def chunkify(self, arr):
        """Split ``arr`` into consecutive slices of at most ``batch_size``."""
        return [arr[i:i + self.batch_size]
                for i in range(0, len(arr), self.batch_size)]

    def birds_on_signal(self, signal):
        """Return the space-separated ebird codes detected in ``signal``.

        Returns ``"nocall"`` when no class reaches the threshold.
        """
        frames = wave_to_frames(signal, self.frame_size)

        detected = set()
        # Pure inference: no autograd graph is needed, so do not build one.
        with torch.no_grad():
            for batch in self.chunkify(frames):
                batch = torch.from_numpy(batch).float().to(device)

                prediction = torch.sigmoid(
                    self.model(batch)['clipwise_output'])
                proba = prediction.detach().cpu().numpy()
                events = proba >= self.threshold
                # Column 1 of argwhere holds the class index of each hit.
                detected.update(np.argwhere(events)[:, 1].tolist())

        return self._translate_labels(detected)

    def birds_on_segments(self, full_signal, end_seconds_list):
        """Detect birds around each 5-second segment of ``full_signal``.

        Each segment is identified by its end time in seconds. The analysed
        window is ``frame_size_in_secs`` long and centred on the segment:
        the surplus over 5 seconds is split evenly before and after it. The
        window is clipped to the signal boundaries.

        Returns:
            A list with one label string per entry of ``end_seconds_list``.
        """
        original_duration = 5
        # Half of the additional context goes on each side of the segment.
        extra_duration = (self.frame_size_in_secs - original_duration) / 2
        n_samples = full_signal.shape[0]

        results = []
        for end_seconds in end_seconds_list:
            end = min(int((end_seconds + extra_duration) * self.sr),
                      n_samples)
            # Step back a full frame so the context covers both sides
            # (subtracting only `original + extra` dropped the left half).
            start = max(end - self.frame_size, 0)
            results.append(self.birds_on_signal(full_signal[start:end]))
        return results

    def _translate_labels(self, labels):
        """Turn class indices into a space-separated string of ebird codes.

        Indices are processed in ascending order so the output is
        deterministic. Falsy ``classes_dict`` entries are skipped, and
        ``"nocall"`` is returned when nothing remains.
        """
        codes = (self.classes_dict[label] for label in sorted(labels))
        labels_str_list = [code for code in codes if code]  # Ignores secondaries
        if labels_str_list:
            return " ".join(labels_str_list)
        return "nocall"
        

In [None]:
def create_competition_output(detector):
    """Run ``detector`` over the test set and write ``submission.csv``.

    The real test set (``TEST_DIR`` / ``TEST_CSV_DIR``) is used when
    ``TEST_DIR`` exists; otherwise the ``birdcall-check`` stand-in under
    ``INPUT_ROOT`` is used. Rows are grouped by ``audio_id`` so each mp3 is
    decoded once. A file with exactly one row is scored over the whole
    recording with ``birds_on_signal``; otherwise every row's ``seconds``
    value is passed to ``birds_on_segments``.

    Args:
        detector: Object exposing ``sr``, ``birds_on_signal`` and
            ``birds_on_segments``.

    Returns:
        DataFrame with the ``row_id`` and ``birds`` columns that was written
        to ``submission.csv``.
    """
    # Real test or mock test
    if os.path.exists(TEST_DIR):
        test_dir = TEST_DIR
        test_df = pd.read_csv(TEST_CSV_DIR)
    else:
        test_dir = INPUT_ROOT + "/birdcall-check/test_audio"
        test_df = pd.read_csv(INPUT_ROOT + "/birdcall-check/test.csv")

    warnings.filterwarnings("ignore")
    output = []
    # sort=False keeps the files in order of first appearance in the csv.
    for audio_id, segments in test_df.groupby("audio_id", sort=False):
        signal, _ = librosa.load(os.path.join(test_dir, audio_id + ".mp3"),
                                 sr=detector.sr,
                                 mono=True,
                                 res_type="kaiser_fast")

        if len(segments) == 1:
            predictions = [detector.birds_on_signal(signal)]
        else:
            predictions = detector.birds_on_segments(signal,
                                                     segments['seconds'])

        # Pair predictions with row ids directly rather than writing a new
        # column into a filtered slice of the test frame.
        output.extend(
            {'row_id': row_id, 'birds': birds}
            for row_id, birds in zip(segments['row_id'], predictions)
        )

    df = pd.DataFrame(output)

    df.to_csv('submission.csv', index=False)
    print("saved submission.csv")

    return df


In [None]:
# Choose the checkpoint and class mapping: the one just trained in this
# session, or a stored one from the input datasets.
if TRAIN:
    weights_path = 'fold0/checkpoints/last.pth'
    classes_dict_path = 'classes.pickle'
else:
    # Previously used alternatives:
    #   weights_path = "../input/backtoorigins/fold0/checkpoints/last.pth"
    #   classes_dict_path = "../input/backtoorigins/classes.pickle"
    #   classes_dict_path = "../input/bestweightsyet/classes.pickle"
    weights_path = "../input/605-10-secs/last.pth"
    classes_dict_path = "../input/605-10-secs/classes.pickle"

with open(classes_dict_path, 'rb') as f:
    classes_dict = pickle.load(f)

# Model settings for inference; the classifier size follows the class map.
model_config = dict(
    sample_rate=32000,
    window_size=1024,
    hop_size=320,
    mel_bins=64,
    fmin=1000,
    fmax=14000,
    classes_num=len(classes_dict),
)

detector = BirdDetector(
    model_config,
    weights_path,
    classes_dict,
    threshold=0.55,
    frame_size_in_secs=10,
)
print('Loaded detector...')
create_competition_output(detector)

## Validate with test

In [None]:

# Example test recordings read by load_test(); the order matters because
# load_test() addresses the loaded signals by position.
paths = [
    '../input/birdsong-recognition/example_test_audio/BLKFR-10-CPL_20190611_093000.pt540.mp3',
    '../input/birdsong-recognition/example_test_audio/ORANGE-7-CAP_20190606_093000.pt623.mp3'
]

def load_test():
    """Load the example recordings listed in ``paths`` at 32 kHz.

    Returns:
        Dict mapping the recording id (``'BLKFR-10-CPL'``,
        ``'ORANGE-7-CAP'``) to its waveform; the first entry of ``paths`` is
        stored under the first id and the second entry under the second.
    """
    waveforms = []
    for audio_path in paths:
        # librosa.load returns (samples, sample_rate); only samples are kept.
        samples = librosa.load(audio_path, sr=32000)[0]
        waveforms.append(samples)

    print('Loaded validation signals.')

    return {
        'BLKFR-10-CPL': waveforms[0],
        'ORANGE-7-CAP': waveforms[1],
    }

# Decode the example recordings once; create_validation_output() looks the
# signals up in this dict by filename.
tests = load_test()

In [None]:
def create_validation_output():
    """Score the example test audio and write ``validation.csv``.

    Reads ``example_test_audio_summary.csv``, groups its rows by
    ``filename``, looks each recording up in the module-level ``tests`` dict
    and runs the module-level ``detector`` on every row's ``seconds`` value.

    Returns:
        DataFrame with ``filename``, ``seconds``, the reference ``birds``
        column and the ``predicted`` labels; the same frame is written to
        ``validation.csv``.

    Raises:
        KeyError: If a filename in the summary csv is missing from ``tests``.
    """
    input_df_path = '../input/birdsong-recognition/example_test_audio_summary.csv'

    df = pd.read_csv(input_df_path)

    warnings.filterwarnings("ignore")
    output = []
    # sort=False keeps the files in order of first appearance in the csv.
    for audio_id, segments in df.groupby("filename", sort=False):
        signal = tests[audio_id]

        results = detector.birds_on_segments(signal, segments['seconds'])

        # Pair predictions with their rows directly rather than writing a
        # new column into a filtered slice of the summary frame.
        rows = zip(segments['filename'], segments['seconds'],
                   segments['birds'], results)
        output.extend(
            {
                'filename': filename,
                'seconds': seconds,
                'birds': birds,
                'predicted': predicted,
            }
            for filename, seconds, birds, predicted in rows
        )

    df = pd.DataFrame(output)

    df.to_csv('validation.csv', index=False)
    print("saved validation.csv")

    return df

# Run the check; as the last expression of the cell, the returned frame is
# also displayed by the notebook.
create_validation_output()