In [None]:
# Install timm 0.4.5 from the wheel file stored under ../input (a local path, not a package index).
!pip install ../input/birdthirdlibs/timm-0.4.5-py3-none-any.whl

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import argparse
import random
import numpy as np
# Expose only GPU 0 to CUDA for this process.
# NOTE(review): this runs after `import torch`; it presumably still takes effect
# because CUDA has not been initialised yet at this point — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Central configuration read by the dataset, model-loading and inference cells.
params = {
    # Encoder name -> list of checkpoint files to load for that encoder.
    "encoder": {
        "tf_efficientnet_b0_ns": ["../input/birdeb2baseline/fold_0_epoch_25.pth"],
    },
    # Data root; the class folders and the soundscape folders are looked up under it.
    "root_dir": "../input/birdclef-2021",
    # Audio / spectrogram settings.
    "sample_rate": 32000,
    "window_size": 2048,
    "hop_size": 512,
    "mel_bins": 128,
    "fmin": 20,
    "fmax": 16000,
    # Width of the model output.
    "classes_num": 397,
    # Clip length in seconds (multiplied by sample_rate to get samples per clip).
    "period": 5,
    # Number of inference passes per model and file.
    "tta": 1,
    # DataLoader settings.
    "batch_size": 16,
    "workers": 16,
}

In [None]:
# Class names are the sorted sub-directory names of train_short_audio;
# classes_maps turns a position in that sorted list back into its name.
classes = sorted(os.listdir(f"{params['root_dir']}/train_short_audio/"))
classes_maps = dict(enumerate(classes))
def pred2label(preds):
    """
    Turns rows of per-class flags into space-separated label strings.

    Arguments:
        preds {iterable of 1D arrays} -- One row per clip; non-zero entries mark predicted classes

    Returns:
        list of str -- For each row, the names looked up in classes_maps joined by spaces,
                       or "nocall" when the row has no non-zero entry
    """
    labels = []
    for row in preds:
        active = np.argwhere(row).ravel().tolist()
        if active:
            labels.append(" ".join(classes_maps[i] for i in active))
        else:
            labels.append("nocall")
    return labels

In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """
    Converts a one channel array to a 3 channel one in [0, 255]

    Arguments:
        X {numpy array [H x W]} -- 2D array to convert

    Keyword Arguments:
        eps {float} -- To avoid dividing by 0 (default: {1e-6})
        mean {None, float or np array} -- Mean for normalization; an array must broadcast
                                          against [H x W x 3]. None uses the mean of X (default: {None})
        std {None, float or np array} -- Std for normalization; an array must broadcast
                                         against [H x W x 3]. None uses the std of X (default: {None})

    Returns:
        numpy array [H x W x 3] -- uint8 array with the input repeated on the last (channel) axis
    """
    X = np.stack([X, X, X], axis=-1)

    # Standardize. Test against None explicitly: `mean or X.mean()` raises on
    # array arguments (ambiguous truth value) and silently discards an explicit 0.
    mean = X.mean() if mean is None else mean
    std = X.std() if std is None else std
    X = (X - mean) / (std + eps)

    # Normalize to [0, 255]
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # (Near-)constant input: there is no range to stretch, so return all zeros.
        V = np.zeros_like(X, dtype=np.uint8)

    return V
    
def normalize(image, mean=None, std=None):
    """
    Rescales an array in [0, 255] to [0, 1] and moves the channel axis to the front

    Arguments:
        image {np array [H x W x 3]} -- Channel-last image with values in [0, 255]

    Keyword Arguments:
        mean {None or np array} -- Mean for normalization, expected of size 3 (default: {None})
        std {None or np array} -- Std for normalization, expected of size 3 (default: {None})

    Returns:
        np array [3 x H x W] -- float32 array; mean/std are applied only when both are given
    """
    scaled = image / 255.0
    stats_missing = mean is None or std is None
    if not stats_missing:
        scaled = (scaled - mean) / std
    channels_first = np.moveaxis(scaled, 2, 0)
    return channels_first.astype(np.float32)

In [None]:
import librosa
import soundfile
from torch.utils.data import Dataset, DataLoader
class TestDataset(Dataset):
    """
    Dataset over the fixed-length clips of a single soundscape file.

    The file is read once in __init__, downmixed to mono and resampled to
    params['sample_rate'] if needed, then cut into consecutive clips of
    params['period'] seconds. A trailing remainder shorter than one clip is dropped.

    Arguments:
        filepath {str} -- Audio file whose base name looks like "<id>_<site>_<rest>"
        params {dict} -- Needs 'sample_rate' and 'period'; __getitem__ also reads
                         'mel_bins', 'fmin', 'fmax' and, when present, 'window_size' / 'hop_size'

    Keyword Arguments:
        transforms {None or callable} -- Called as transforms(samples=..., sample_rate=...)
                                         on each clip before the spectrogram (default: {None})
    """

    def __init__(self, filepath, params, transforms=None):
        target_sr = params['sample_rate']
        audio, sr = soundfile.read(filepath)
        if audio.ndim > 1:
            # soundfile returns (frames, channels) for multi-channel files; downmix to mono
            # so that slicing and resampling act on the time axis.
            audio = audio.mean(axis=1)
        if sr != target_sr:
            # params is a dict, so the target rate must be read by key. Keyword arguments
            # keep the call valid on librosa versions where the rates are keyword-only.
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr, res_type="kaiser_fast")

        # Only the first two underscore-separated fields of the file name go into the row id.
        file_id, site = os.path.basename(filepath).split('_')[:2]

        period = params['period']
        step = period * target_sr
        n_clips = len(audio) // step

        self.params = params
        self.audios = [audio[k * step:(k + 1) * step] for k in range(n_clips)]
        # Row ids end with the clip's end time in seconds: period, 2 * period, ...
        self.row_ids = [f"{file_id}_{site}_{(k + 1) * period}" for k in range(n_clips)]
        self.transforms = transforms

    def __len__(self):
        return len(self.row_ids)

    def __getitem__(self, idx):
        """Returns (image, row_id): the clip's normalized mel-spectrogram image and its row id."""
        audio = self.audios[idx]
        row_id = self.row_ids[idx]
        sample_rate = self.params['sample_rate']
        if self.transforms:
            audio = self.transforms(samples=audio, sample_rate=sample_rate)

        # The fallbacks for n_fft / hop_length are librosa's own defaults, so a params
        # dict without these keys produces the same spectrogram as before.
        melspec = librosa.feature.melspectrogram(
            y=audio,
            sr=sample_rate,
            n_fft=self.params.get('window_size', 2048),
            hop_length=self.params.get('hop_size', 512),
            n_mels=self.params['mel_bins'],
            fmin=self.params['fmin'],
            fmax=self.params['fmax'],
        )
        melspec = librosa.power_to_db(melspec).astype(np.float32)

        image = mono_to_color(melspec)
        image = normalize(image, mean=None, std=None)
        return image, row_id

In [None]:
import torch.nn as nn
from timm import create_model
class BirdAudioClassifier(nn.Module):
    """Wraps a network built by create_model behind a plain nn.Module interface."""

    def __init__(self, encoder, classes_num):
        """
        Arguments:
            encoder {str} -- Model name handed to create_model
            classes_num {int} -- Number of outputs requested from create_model
        """
        super().__init__()
        # The attribute must stay named `encoder`: state-dict keys are prefixed with it.
        backbone_options = dict(model_name=encoder, num_classes=classes_num, in_chans=3)
        self.encoder = create_model(**backbone_options)

    def forward(self, input):
        """Returns the encoder's output for `input` unchanged (no activation is applied here)."""
        return self.encoder(input)

In [None]:
def createModels(params):
    """
    Builds one BirdAudioClassifier per checkpoint listed in params['encoder'].

    Arguments:
        params {dict} -- Needs 'encoder' (encoder name -> list of checkpoint paths)
                         and 'classes_num'

    Returns:
        list -- One model per checkpoint, in listing order, placed on the GPU when
                CUDA is available and on the CPU otherwise
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    models = []
    for modeName, modelPaths in params['encoder'].items():
        for modelPath in modelPaths:
            # Build a fresh network for every checkpoint: reusing one instance would make
            # all list entries alias the same object, holding only the last-loaded weights.
            model = BirdAudioClassifier(modeName, params['classes_num'])
            # Load onto the CPU first so a checkpoint saved from another device still loads.
            model.load_state_dict(torch.load(modelPath, map_location='cpu'))
            model.to(device)
            models.append(model)
            print(modeName + ' ' + modelPath + 'Load Done!!!')
    return models

In [None]:
def modelpredict(model, input):
    """
    Runs one forward pass without gradients and returns sigmoid scores.

    Arguments:
        model {nn.Module} -- Network to evaluate; its train/eval mode is left as the caller set it
        input {torch.Tensor} -- Batch to feed to the model

    Returns:
        np array -- sigmoid(model(input)) copied to the CPU
    """
    # Send the batch to wherever the model lives instead of assuming CUDA, so the same
    # code works for a CPU model or a model on a non-default GPU.
    first_param = next(iter(model.parameters()), None)
    if first_param is not None:
        input = input.to(first_param.device)
    with torch.no_grad():
        pred = torch.sigmoid(model(input)).cpu().numpy()
    return pred

In [None]:
def infetenceOneFile(filepath, model, params):
    """
    Scores every clip of one audio file with one model.

    Arguments:
        filepath {str} -- Audio file handed to TestDataset
        model {nn.Module} -- Model handed to modelpredict for each batch
        params {dict} -- Needs 'classes_num', 'batch_size' and 'workers', plus whatever
                         TestDataset reads

    Returns:
        (np array [clips x classes_num], list of str) -- Scores and the matching row ids,
        in file order; an empty (0, classes_num) array and [] when the file yields no clip
    """
    testDataset = TestDataset(filepath, params)
    tst_loader = DataLoader(
        testDataset,
        batch_size=params['batch_size'],
        shuffle=False,
        num_workers=params['workers'],
        # Pinned host memory only helps host-to-GPU copies; skip it when there is no GPU.
        pin_memory=torch.cuda.is_available(),
    )

    row_ids = []
    # Seeding with an empty float64 block keeps the result's shape and dtype identical to
    # growing an array batch by batch, while concatenating only once at the end.
    chunks = [np.empty((0, params['classes_num']))]
    for img, row_id in tst_loader:
        row_ids.extend(row_id)
        chunks.append(modelpredict(model, img))
    return np.concatenate(chunks), row_ids

In [None]:
import glob
from tqdm.notebook import tqdm
# Columns of the final submission, grown as plain lists (extending a list is cheap,
# whereas np.append copies the whole array for every file).
submission = {"row_id": [], "birds": []}

models = createModels(params)
for model in models:
    # Switch to inference mode once up front rather than once per file.
    model.eval()

# Use the test soundscapes when any are present, otherwise fall back to the train soundscapes.
root_dir = params['root_dir']
testpath = f'{root_dir}/test_soundscapes/'
if not glob.glob(os.path.join(testpath, '*.ogg')):
    testpath = f'{root_dir}/train_soundscapes/'

for filepath in tqdm(sorted(glob.glob(os.path.join(testpath, '*.ogg')))):
    # One score matrix per (model, TTA pass); row_ids are the same for every pass over a file.
    preds = []
    for model in models:
        for _ in range(params['tta']):
            pred, row_ids = infetenceOneFile(filepath, model, params)
            preds.append(pred)
    # Average over all passes, then keep the classes whose mean score exceeds 0.5.
    preds = np.mean(preds, axis=0) > 0.5
    submission["birds"].extend(pred2label(preds))
    submission["row_id"].extend(row_ids)

In [None]:
submission = pd.DataFrame(submission)
submission.to_csv("submission.csv", index=False)
# Preview goes last: a notebook cell only renders the value of its final expression.
submission.head()