In [None]:
# Put the timm source checkout under ../input on the module search path first,
# so that the `import timm` below can resolve it.
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm

In [None]:
import warnings
# Silence every Python warning for the rest of the session. Note that this also
# hides deprecation warnings raised by the libraries imported below.
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import albumentations as albu
from albumentations.pytorch import ToTensorV2
import soundfile as sf
import librosa

In [None]:
# Directory containing the soundscape recordings to run inference on.
test_audio_dir = '../input/birdclef-2022/test_soundscapes/'

# File stems (names without extension) of the .ogg recordings, in sorted
# filename order. Only '.ogg' entries are kept because the inference loop
# re-appends '.ogg' to build the path; splitext (rather than split('.')[0])
# keeps stems intact even when they contain dots themselves.
file_list = [
    os.path.splitext(f)[0]
    for f in sorted(os.listdir(test_audio_dir))
    if f.endswith('.ogg')
]

print('Number of test soundscapes:', len(file_list))

In [None]:
# Show only the first few stems; the full list can hold thousands of entries.
file_list[:5]

In [None]:
# Species codes to report. The order matters: entries are paired positionally
# with the model's output columns when the submission rows are built.
scored_bird = [
    "akiapo", "aniani", "apapan", "barpet", "crehon", "elepai", "ercfra",
    "hawama", "hawcre", "hawgoo", "hawhaw", "hawpet1", "houfin", "iiwi",
    "jabwar", "maupar", "omao", "puaioh", "skylar", "warwhe1", "yefcan",
]
len(scored_bird)

In [None]:
class Model(nn.Module):
    """Single-channel ``tf_efficientnet_b0`` backbone with a dropout + linear head.

    The backbone's own classifier is replaced by ``nn.Identity`` so it returns
    pooled features; ``self.fc`` maps those features to the output logits.

    Args:
        num_classes: Number of output logits. Defaults to 21, the value the
            original hard-coded head used.
        dropout: Dropout probability applied before the linear layer
            (default 0.2, as before).
    """

    def __init__(self, num_classes=21, dropout=0.2):
        super().__init__()
        # pretrained=False: weights are expected to come from a checkpoint.
        self.backbone = timm.create_model('tf_efficientnet_b0', in_chans=1, pretrained=False)
        # Remember the feature width before discarding the stock classifier.
        in_features = self.backbone.classifier.in_features
        self.backbone.classifier = nn.Identity()
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(in_features, num_classes),
        )

    def forward(self, x):
        """Return raw (pre-sigmoid) logits for the input batch ``x``."""
        feature = self.backbone(x)
        logits = self.fc(feature)
        return logits

In [None]:
# Checkpoint paths for fold0 .. fold4 of experiment exp001 (one file per fold).
modelTanks = [
    '../input/exp-b0-5s-5folds/exp001_tf_efficientnet_b0_5clip_fold{}.ckpt'.format(fold)
    for fold in range(5)
]

In [None]:
def load_model(pth):
    """Instantiate ``Model`` and load the weights stored in checkpoint ``pth``.

    Args:
        pth: Path to a checkpoint whose ``'state_dict'`` entry maps parameter
            names to tensors.

    Returns:
        A ``Model`` on the CPU with the checkpoint weights loaded.

    Raises:
        KeyError: If the checkpoint has no ``'state_dict'`` entry.
        RuntimeError: If the (renamed) keys do not match ``Model``'s parameters.
    """
    # map_location='cpu' lets checkpoints that were saved from GPU tensors load
    # on machines without CUDA; the caller moves the model to its device later.
    checkpoint = torch.load(pth, map_location='cpu')
    # Drop the first four characters of every key. Presumably this removes a
    # wrapper prefix such as 'net.' added at training time -- TODO confirm
    # against the training code.
    weights = {key[4:]: value for key, value in checkpoint['state_dict'].items()}
    net = Model()
    net.load_state_dict(weights)
    return net

In [None]:
# Preprocessing pipeline applied to each spectrogram image: it contains a
# single step, the conversion of the array to a torch tensor.
transforms = albu.Compose([ToTensorV2()])

In [None]:
def readAudio(pth, target_sr=32000):
    """Read an audio file and return its first channel at ``target_sr`` Hz.

    Args:
        pth: Path of the audio file to read.
        target_sr: Sample rate of the returned signal (default 32000).

    Returns:
        Tuple ``(y, target_sr)`` where ``y`` is a 1-D array holding the first
        channel, resampled if the file's rate differs from ``target_sr``.
    """
    y, sr = sf.read(pth, always_2d=True)
    # always_2d=True yields (frames, channels). Pick the channel BEFORE
    # resampling: librosa resamples along the last axis, which on the 2-D
    # array would be the channel axis rather than time.
    y = y[:, 0]
    if sr != target_sr:
        # Keyword arguments: newer librosa releases reject positional rates.
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    return y, target_sr

In [None]:
def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale array ``X`` so its values span ``[min, max]``.

    Args:
        X: NumPy array to rescale.
        min: Value the smallest element is mapped to.
        max: Value the largest element is mapped to.

    Returns:
        Array of the same shape as ``X``. If every element of ``X`` is equal
        (e.g. the spectrogram of a silent clip) the result is filled with
        ``min`` instead of the NaNs a 0/0 division would produce.
    """
    lowest = X.min()
    span = X.max() - lowest
    if span == 0:
        # Constant input: no range to stretch, so map everything to `min`.
        X_std = np.zeros(np.shape(X), dtype=np.result_type(X.dtype, np.float32))
    else:
        X_std = (X - lowest) / span
    X_scaled = X_std * (max - min) + min
    return X_scaled

In [None]:
def make_mel(y, sr):
    """Turn a waveform into a normalized log-mel spectrogram tensor.

    Args:
        y: 1-D audio signal.
        sr: Sample rate of ``y`` in Hz.

    Returns:
        Tensor produced by ``transforms`` with values in ``[0, 1]``.
        Presumably shaped (1, n_mels, frames), since ToTensorV2 adds a channel
        axis to 2-D input -- TODO confirm.
    """
    # `y` is passed by keyword: newer librosa releases make every argument of
    # melspectrogram keyword-only and reject a positional signal.
    melspec = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=256, n_fft=2048, hop_length=512,
        pad_mode='reflect', power=2.0,
    )
    image = librosa.power_to_db(melspec).astype(np.float32)
    # Stretch to 0..255 first, then divide by 255 after the tensor conversion,
    # so the final values lie in 0..1.
    image = scale_minmax(image, 0, 255)
    image = transforms(image=image)['image']
    return image / 255.
In [None]:
def make_clips(path, n_clips=12, clip_seconds=5):
    """Split a recording into fixed-length clips and return their spectrograms.

    Args:
        path: Path of the audio file, read through ``readAudio``.
        n_clips: Number of consecutive clips to produce (default 12).
        clip_seconds: Length of each clip in seconds (default 5).

    Returns:
        Tensor stacking the ``make_mel`` output of every clip along a new
        leading axis of size ``n_clips``. Clips that extend past the end of the
        recording are zero-padded; clips that start past the end are all zeros.
    """
    y, sr = readAudio(path)
    clip_len = sr * clip_seconds
    mels = []
    for i in range(n_clips):
        start = i * clip_len
        # Slicing past the end of `y` yields a shorter (possibly empty)
        # segment instead of raising, so short recordings are handled.
        segment = y[start:start + clip_len]
        if len(segment) < clip_len:
            clip = np.zeros(clip_len, dtype=y.dtype)
            clip[:len(segment)] = segment
        else:
            clip = segment
        mels.append(make_mel(clip, sr))
    return torch.stack(mels)

In [None]:
def predict(models, mels, device=None):
    """Average the logits of several models and return sigmoid probabilities.

    Args:
        models: Iterable of ``nn.Module`` instances to ensemble.
        mels: Input batch tensor fed unchanged to every model.
        device: Device to run on. Defaults to CUDA when available, else CPU.

    Returns:
        NumPy array with the sigmoid of the mean logits across all models.

    Raises:
        ValueError: If ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("predict() requires at least one model")
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    mels = mels.to(device)
    logit_sum = None
    with torch.no_grad():
        for model in models:
            model.eval()
            # Models stay on `device` afterwards; the caller's list keeps them
            # alive anyway, so later calls skip the transfer.
            model.to(device)
            logits = model(mels)
            logit_sum = logits if logit_sum is None else logit_sum + logits
    # The ensemble averages logits (not probabilities) before the sigmoid.
    mean_logits = logit_sum / len(models)
    return mean_logits.sigmoid().cpu().numpy()

In [None]:
# Submission rows, filled with one entry per (soundscape, clip, species).
pred = {'row_id': [], 'target': []}
# A species counts as present when its probability exceeds this threshold.
binary_th = 0.50

models = [load_model(pth) for pth in modelTanks]
for afile in file_list:
    path = ''.join([test_audio_dir, afile, '.ogg'])

    mels = make_clips(path)
    outputs = predict(models, mels)  # one row per clip, one column per species (12x21)
    detections = (outputs > binary_th).astype(bool)
    for i, p in enumerate(detections):
        # Rows are keyed by the end time of the clip: 5, 10, 15, ... seconds.
        chunk_end_time = 5 * (i + 1)
        for bird, t in zip(scored_bird, p):
            pred['row_id'].append('_'.join([afile, bird, str(chunk_end_time)]))
            pred['target'].append(t)

In [None]:
# Turn the collected rows into a two-column table and write it to disk
# without the index column.
results = pd.DataFrame(data=pred, columns=['row_id', 'target'])
results.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission table as a quick sanity check.
results.head()