In [None]:
import cv2 as cv
import audioread
import logging
import os
import random
import time
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
# from colored import fg, bg, attr

import librosa
import IPython.display
import numpy as np
import pandas as pd
import soundfile as sf
from pydub import AudioSegment as AS
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from torch.optim import Adam
from torch import FloatTensor, LongTensor, DoubleTensor
from torchvision.models import resnet34
# from facenet_pytorch import MTCNN, InceptionResnetV1
import tensorflow as tf

from contextlib import contextmanager
from pathlib import Path
from typing import Optional



from fastprogress import progress_bar
from sklearn.metrics import f1_score
from torchvision import models, transforms
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences as pad



In [None]:
#functions for utilities

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not only the current one; harmless without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick algorithms by timing them, which can change
    # from run to run; it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False
    
def get_logger(out_file=None):
    """Reset the root logger and optionally attach a file handler.

    Handlers left over from an earlier call are removed *and closed*, so
    re-running the cell does not keep stale log files open.

    Args:
        out_file: Path of the log file. When ``None`` no handler is attached.

    Returns:
        The root ``logging.Logger`` set to INFO level.
    """
    logger = logging.getLogger()
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    logger.setLevel(logging.INFO)

    if out_file is not None:
        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        fh = logging.FileHandler(out_file)
        fh.setFormatter(formatter)
        fh.setLevel(logging.INFO)
        logger.addHandler(fh)
    logger.info("logger set up")
    return logger

@contextmanager  # lets the generator below be used as `with timer(...):`
def timer(name: str, logger: Optional[logging.Logger] = None):
    """Report when a ``with`` block starts and how long it took.

    Args:
        name: Label placed in square brackets in both messages.
        logger: Logger whose ``info`` receives the messages; when ``None``
            the messages are printed instead.
    """
    emit = print if logger is None else logger.info
    began = time.time()
    emit(f"[{name}] start")
    yield
    emit(f"[{name}] done in {time.time() - began:.2f} s")


In [None]:
# Send INFO-level log records to main.log and seed the Python, NumPy and PyTorch RNGs.
logger = get_logger("main.log")
set_seed(1213)

In [None]:
# NOTE(review): this only builds the context manager; without a `with` block nothing is timed or printed.
timer("time")

In [None]:
# Competition metadata tables and the folder holding the training recordings.
train = pd.read_csv('../input/birdsong-recognition/train.csv')
test = pd.read_csv('../input/birdsong-recognition/test.csv')
audio_path = "../input/birdsong-recognition/train_audio"

# Use the competition's own test_audio folder when it exists; otherwise fall
# back to the stand-in dataset created by @shonenkov, thanks!
TEST = Path("../input/birdsong-recognition/test_audio").exists()
DATA_DIR = Path("../input/birdsong-recognition/" if TEST else "../input/birdcall-check/")
test_audio = DATA_DIR / "test_audio"

# Extended Xeno-Canto metadata.
train_extend = pd.read_csv("../input/xeno-canto-bird-recordings-extended-a-m/train_extended.csv")


In [None]:
# Preview the first five rows of the test metadata.
test.head(5)

In [None]:
# Preview the first ten rows of the training metadata.
train.head(10)

In [None]:
SR = 44100  # sample rate handed to librosa when loading audio and building mel spectrograms
EPOCHS = 30  # number of passes made by the training loop
MAXLEN = 100000  # get_signal pads/truncates every signal to this many samples
CHUNK_SIZE = 10000  # number of samples in one chunk cut by get_idx/get_chunk
CHUNKS = 3  # number of chunks get_signal draws from each signal
N_MELS = 256  # number of mel-spectrogram features per time step (melspec_params carries its own n_mels)
MEL_LEN = 1954  # total number of time steps in each mel spectrogram
DROP = 0.2  # dropout probability used inside BirdNet
Fr = 512  # output features of resnet34; width of BirdNet's flattened features

In [None]:
# Number of rows in the training metadata.
print(len(train))

In [None]:
# Label encoding: every distinct ebird_code gets an integer class index,
# assigned in alphabetical order of the codes.
keys = set(train["ebird_code"])
values = np.arange(len(keys))
code_dict = {code: index for code, index in zip(sorted(keys), values)}
print(code_dict)

In [None]:
# Reverse lookup: integer class index -> ebird_code.
INV_BIRD_CODE = dict(zip(code_dict.values(), code_dict.keys()))

In [None]:
# Sanity check: report every training recording whose file is 0 bytes long.
dirname = train["ebird_code"]
filename = train["filename"]
base_path = "../input/birdsong-recognition/train_audio"

# Dedicated loop names keep the `dirname`/`filename` Series above from being
# overwritten by the last row's strings.
for bird_dir, audio_file in zip(dirname, filename):
    path = os.path.join(base_path, bird_dir, audio_file)
    size_file = os.path.getsize(path)
    if size_file == 0:
        print('Empty label file:', path)

In [None]:
# Shuffle the metadata once, keep the first 80 % of rows for training and the
# remaining 20 % for validation.
train = shuffle(train).reset_index(drop=True)
split = int(0.8 * len(train))
val = train.iloc[split:].reset_index(drop=True)
train = train.iloc[:split].reset_index(drop=True)
# train, val = train_test_split(train, test_size = 0.2)
print(len(train))
print(len(val))

In [None]:
# Mel-spectrogram settings forwarded to librosa:
#   n_mels - number of mel bands to generate
#   fmin   - minimum frequency
#   fmax   - maximum frequency
melspec_params = dict(n_mels=128, fmin=20, fmax=1600)

# Keyword settings describing the model.
model_config = dict(base_model_name="resnet50", pretrained=False, num_classes=264)

In [None]:
#defining some utility funs

In [None]:
def normalize(x):
    """Scale 16-bit PCM sample values into the range [-1.0, 1.0).

    Args:
        x: A scalar or array of integer sample values.

    Returns:
        The values divided by 2**15, as float64 (array in, array out).
    """
    # `np.float` was only an alias of the builtin float: it has been removed
    # from NumPy and never worked on arrays with more than one element.
    return np.asarray(x, dtype=np.float64) / 2**15

def read(file, norm = False):
    """Decode an mp3 file with pydub.

    Args:
        file: Path of the mp3 file.
        norm: When true the samples are passed through ``normalize``;
            otherwise they are only cast to float32.

    Returns:
        ``(frame_rate, samples)``; stereo audio is reshaped to two columns.
        NOTE(review): when decoding fails a bare ``np.zeros(MAXLEN)`` array is
        returned instead of a tuple - callers must handle both shapes.
    """
    try:
        a = AS.from_mp3(file)
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still propagate.
        return np.zeros(MAXLEN)

    y = np.array(a.get_array_of_samples())
    if a.channels == 2:
        y = y.reshape((-1, 2))

    samples = normalize(y) if norm else np.float32(y)
    return a.frame_rate, samples

In [None]:
# This cell repeats the parameter definitions made in an earlier cell.
#
# melspec_params (passed to librosa):
#   n_mels - number of mel bands to generate
#   fmin   - minimum frequency
#   fmax   - maximum frequency
melspec_params = {
    "n_mels": 128,
    "fmin": 20,
    "fmax": 1600,
}

# model_config: keyword settings describing the model.
model_config = {
    "base_model_name": "resnet50",
    "pretrained": False,
    "num_classes": 264,
}

In [None]:
def get_len(length):
    """Cap a signal length at MAXLEN; shorter lengths are returned unchanged."""
    return MAXLEN if length > MAXLEN else length

In [None]:
def get_idx(length):
    '''Pick the start and end index of one random chunk of CHUNK_SIZE samples.

    Args:
        length: Number of samples in the signal (capped at MAXLEN).

    Returns:
        ``(start, start + CHUNK_SIZE)``.
    '''
    length = get_len(length)
    # Last start position for which the whole window still lies inside the
    # first `length` samples. Drawing the start from 0..length, as before, let
    # the window run past the end and produced shorter chunks.
    last_start = max(length - CHUNK_SIZE, 0)
    chunk_idx = np.random.randint(last_start + 1)
    return (chunk_idx, chunk_idx + CHUNK_SIZE)

In [None]:
def get_chunk(data, length):
    """Slice one randomly positioned chunk out of `data`.

    The slice bounds come from ``get_idx(length)``.
    """
    start, stop = get_idx(length)
    return data[start:stop]


In [None]:
def get_signal(data):
    """Pad or truncate a signal to MAXLEN samples and draw CHUNKS random chunks.

    Args:
        data: NumPy array of audio samples; it is flattened before padding.

    Returns:
        A list of CHUNKS float32 arrays cut by ``get_chunk``.
    """
    length = max(data.shape)
    data = data.flatten().reshape(1, -1)
    # pad_sequences defaults to dtype='int32', which truncates float audio in
    # [-1, 1] to zeros, and to *pre*-padding, which moves the audio to the end
    # of the buffer while get_idx counts positions from the start.
    data = pad(data, maxlen=MAXLEN, dtype="float32",
               padding="post", truncating="post").reshape(-1)
    return [get_chunk(data, length) for _ in range(CHUNKS)]

In [None]:
def to_tensor(data):
    """Convert each element of `data` into a ``FloatTensor``; returns them as a list."""
    tensors = []
    for point in data:
        tensors.append(FloatTensor(point))
    return tensors

In [None]:
# Write the sample submission as a placeholder submission.csv. It is meant to be
# overwritten when inference succeeds (no later write is visible in this notebook).
sub = pd.read_csv("../input/birdsong-recognition/sample_submission.csv")
sub.to_csv("submission.csv", index = False)

In [None]:
#building a model
class ResNet(nn.Module):
    """A torchvision backbone followed by a new classification head.

    The backbone named by `base_model_name` is looked up in
    ``torchvision.models``; its last two children are dropped and replaced by
    global max pooling. The head maps the pooled features to `num_classes`
    logits.
    """
    def __init__(self, base_model_name: str,  pretrained = False, num_classes=264):
        super().__init__()
        backbone = getattr(models, base_model_name)(pretrained=pretrained)

        trunk = list(backbone.children())[:-2]
        trunk.append(nn.AdaptiveMaxPool2d(1))
        self.encoder = nn.Sequential(*trunk)

        head = [
            nn.Linear(backbone.fc.in_features, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(1024, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return the logits plus their softmax and sigmoid activations."""
        features = self.encoder(x)
        # One row per sample; the pooled feature map is flattened.
        logits = self.classifier(features.view(x.size(0), -1))
        return {
            "logits": logits,
            "multiclass_prob": F.softmax(logits, dim=1),
            "multilabel_prob": F.sigmoid(logits),
        }
            
                                        
            

In [None]:
class BirdNet(nn.Module):
    """Pretrained resnet34 feature extractor with a dropout + linear classifier.

    Args:
        f: Number of features produced by the backbone (input width of the
            linear layer).
        o: Number of output classes.
    """
    def __init__(self, f, o):
        super(BirdNet, self).__init__()
        self.dropout = nn.Dropout(p=DROP)
        self.dense_output = nn.Linear(f, o)
        self.resnet = resnet34(pretrained=True)
        # Every resnet child except the final fully connected layer.
        backbone_layers = list(self.resnet.children())[:-1]
        self.resnet_head = nn.Sequential(*backbone_layers)

    def forward(self, x):
        """Return the logits plus their softmax and sigmoid activations."""
        x = self.resnet_head(x)
        # Flatten per sample instead of reshaping with the global `Fr`: a
        # mismatch between `f` and the backbone width now raises in the linear
        # layer rather than silently re-grouping the batch.
        x = self.dense_output(self.dropout(torch.flatten(x, 1)))
        multiclass_prob = F.softmax(x, dim=1)  # probabilities that sum to 1 over the classes
        multilabel_prob = torch.sigmoid(x)  # independent per-class probabilities
        return {"logits": x, "multiclass_prob": multiclass_prob, "multilabel_prob": multilabel_prob}

In [None]:
class newmodel(nn.Module):
    """Small CNN: four convolutional stages followed by two linear layers.

    Stages 1-3 are Conv-Conv-MaxPool-ReLU-Dropout; stage 4 has no pooling.
    The first linear layer expects 2304 flattened features.
    """
    def __init__(self, num_classes = 264, drpout=0.5):
        super().__init__()
        # (in channels, middle channels, out channels, pooling kernel or None)
        stage_specs = (
            (3, 8, 16, 2),
            (16, 32, 32, 4),
            (32, 64, 64, 2),
            (64, 128, 256, None),
        )
        for number, (c_in, c_mid, c_out, pool) in enumerate(stage_specs, start=1):
            modules = [nn.Conv2d(c_in, c_mid, 3), nn.Conv2d(c_mid, c_out, 3)]
            if pool is not None:
                modules.append(nn.MaxPool2d(pool))
            modules += [nn.ReLU(), nn.Dropout(drpout)]
            # Registered as Convlayer1 ... Convlayer4.
            setattr(self, f"Convlayer{number}", nn.Sequential(*modules))

        self.lin1 = nn.Linear(2304, 1500)
        self.lin2 = nn.Linear(1500, num_classes)

    def forward(self, x):
        """Return ``{"logits": tensor}`` for a batch of images."""
        n_samples = x.size(0)
        for stage in (self.Convlayer1, self.Convlayer2, self.Convlayer3, self.Convlayer4):
            x = stage(x)
        x = self.lin2(self.lin1(x.view(n_samples, -1)))
        return {"logits": x}
    

In [None]:
# Build the small CNN with its default settings and print its layer structure.
model = newmodel()
print(model)

In [None]:
# Number of entries in the training-audio folder.
len(os.listdir("../input/birdsong-recognition/train_audio"))

In [None]:
def mono_to_color(X: np.ndarray, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Convert a single-channel array into a 3-channel uint8 image.

    The input is copied into three identical channels (stacked on a new last
    axis), standardized, clipped to ``[norm_min, norm_max]`` and rescaled to
    0..255.

    Args:
        X: Input array, e.g. a spectrogram.
        mean, std: Statistics used for standardization; computed from the data
            when ``None``.
        norm_max, norm_min: Clipping bounds in standardized units; the data's
            own max/min are used when ``None``.
        eps: Added to `std` to avoid division by zero; also the minimum value
            range below which an all-zero image is returned.

    Returns:
        uint8 array with the shape of `X` plus a trailing axis of size 3.
    """
    X = np.stack([X, X, X], axis=-1)

    # `is None` checks (not `value or default`) so explicit zeros are honoured.
    mean = X.mean() if mean is None else mean
    X = X - mean
    std = X.std() if std is None else std
    Xstd = X / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = _max if norm_max is None else norm_max
    norm_min = _min if norm_min is None else norm_min

    if (_max - _min) > eps:
        # Normalize to [0, 255]
        V = np.clip(Xstd, norm_min, norm_max)
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Constant input: return only zeros
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

In [None]:
# Keep only the two columns needed to locate a recording and know its label.
path = "../input/birdsong-recognition/train_audio"
filename = train["filename"]
ebird_code = train["ebird_code"]
df_two = train.filter(['filename','ebird_code'])
df_two.head()
# for row in df_two.itertuples():
#     path1 = path + '/'+ ebird_code + '/'+ filename 
#     print(path1)
#     print(path1)
#     arry = load_audio(path1)
#     print(arry)

In [None]:
def load_audio(path):
    """Load an audio file with librosa at sample rate SR.

    On any loading error the problem is printed and an all-zero array of
    MAXLEN samples is returned instead.
    """
    try:
        samples, _ = librosa.load(path, sr=SR)
    except Exception as e:
        print("Error encountered while parsing file: ", path, e)
        return np.zeros(MAXLEN)
    else:
        return samples

In [None]:
def to_melspectogram(audio_ts):
    """Compute a mel spectrogram in decibels for one audio time series.

    Uses the sample rate SR and the settings in ``melspec_params``.

    Returns:
        float32 array returned by ``librosa.power_to_db``.
    """
    # The audio must be passed as `y=`: librosa >= 0.10 rejects it as a
    # positional argument, and older versions accept the keyword as well.
    melspec = librosa.feature.melspectrogram(y=audio_ts, sr=SR, **melspec_params)
    melspc = librosa.power_to_db(melspec).astype(np.float32)

    return melspc

def get_melsp_img(data):
    """Cut `data` into chunks with get_signal and stack one mel spectrogram per chunk."""
    chunks = get_signal(data)
    spectrograms = list(map(to_melspectogram, chunks))
    return np.stack(spectrograms)

In [None]:
# Transform pipeline handed to the datasets below; it contains only ToTensor.
data_transform = transforms.Compose([transforms.ToTensor()])

In [None]:
# os.remove("melspecs.npy")

In [None]:
    class BirdDataset(data.Dataset):
        '''Dataset of precomputed arrays (.npy files) and one-hot bird labels.

        Args:
            df: Metadata frame with `filename` and `ebird_code` columns.
            path: Root folder containing `<ebird_code>/<file stem>.npy` files.
            img_size: Stored on the instance; not used by `__getitem__`.
            transform: Stored on the instance; not applied by `__getitem__`.
        '''

        def __init__(self, df, path, img_size=255, transform=None):
            self.code_dict = code_dict
            self.classes = len(code_dict)
            self.df, self.path = df, path
            self.dataset_length = len(df)
            self.img_size = img_size
            self.transform = transform

        def __len__(self):
            return self.dataset_length

        def __getitem__(self, i):
            '''Return `[array_tensor, one_hot_label_tensor]` for row position `i`.'''
            # Positional lookup: works even when `df` does not carry a fresh
            # 0..n-1 index (label lookup with `df.filename[i]` would not).
            row = self.df.iloc[i]
            ebird_code = row["ebird_code"]
            stem = os.path.splitext(row["filename"])[0]
            num_code = self.code_dict[ebird_code]
            arrayvals = np.load(os.path.join(self.path, ebird_code, stem + ".npy"))
            code = to_categorical([num_code], num_classes=self.classes)
            return to_tensor([arrayvals, code])

In [None]:
# Dataset over the training rows, reading arrays from the melspecs folder.
bird_dfs = BirdDataset(train , "../input/melspecs/melspecs", transform = data_transform)

In [None]:
# print(bird_dfs[0])

In [None]:
# Batches of 16 samples, in dataset order (no shuffling is requested).
bird_df = data.DataLoader(bird_dfs, batch_size = 16)

In [None]:
# for step, (x, y) in enumerate(bird_df):
#     print(x.shape)
#     print(y.shape)

In [None]:
# Datasets for the training and validation splits.
bird_train_df = BirdDataset(train, "../input/melspecs/melspecs", transform = data_transform)
bird_val_df = BirdDataset(val, "../input/melspecs/melspecs", transform = data_transform)

# shuffle=True draws a new sample order every epoch. Without it each epoch saw
# the same batches, and permuting the samples inside a batch (as the training
# loop does) does not change which samples share a batch.
bird_loader_train = tqdm(data.DataLoader(bird_train_df, batch_size = 16, shuffle = True, num_workers = 4 ))
# Validation order does not affect the metrics, so it stays sequential.
bird_loader_val = tqdm(data.DataLoader(bird_val_df, batch_size = 16, num_workers = 4))

In [None]:
# Number of batches per epoch in each loader.
print(len(bird_loader_train))
print(len(bird_loader_val))

In [None]:
# for step, (x, y) in enumerate(bird_loader_val):
#     print(x.shape)
# #     print(y.shape)

In [None]:
# '''init the model'''
# model1 = newmodel().to(device)

# '''init optimizer'''
# optimizer = Adam(model1.parameters(), lr = 0.001)
# criterion = nn.CrossEntropyLoss()



In [None]:
# One output unit per bird code in code_dict.
O = len(code_dict)
# F = 512
model1 = BirdNet(f=Fr, o=O).to(device)
# Two parameter groups (resnet backbone and linear head), both with lr 0.001.
optimizer = Adam([{'params': model1.resnet.parameters(), 'lr': 0.001},
                  {'params': model1.dense_output.parameters(), 'lr': 0.001}])

In [None]:
'''define cross entropy loss and accuracy'''
def cel(y_true, y_pred):
    """Cross-entropy loss between one-hot targets and logits.

    Args:
        y_true: One-hot targets; the class axis is the last one.
        y_pred: Logits of shape (batch, classes).

    Returns:
        Scalar loss tensor.
    """
    # reshape(-1) keeps a 1-D target even for a batch of one; squeeze() turned
    # it into a 0-d tensor, which CrossEntropyLoss rejects for 2-D logits.
    target = torch.argmax(y_true, dim=-1).reshape(-1)
    loss = nn.CrossEntropyLoss()
    return loss(y_pred, target)

def accuracy(y_true, y_soft_pred):
    """Fraction of samples whose highest-scoring class matches the one-hot target.

    Args:
        y_true: One-hot targets; the class axis is the last one.
        y_soft_pred: Scores (logits or probabilities), class axis last.

    Returns:
        Scalar tensor in [0, 1].
    """
    # reshape(-1) instead of squeeze(): a batch of one must stay 1-D so that
    # len() below still works.
    target = torch.argmax(y_true, dim=-1).reshape(-1)
    predicted = torch.argmax(y_soft_pred, dim=-1).reshape(-1)
    acc = (target == predicted).float().sum() / len(target)
    return acc


In [None]:
# Show the current working directory.
os.getcwd()

In [None]:
def get_shuffle_idx(tensor):
    """Return the positions 0..len(tensor)-1 in shuffled order."""
    positions = np.arange(len(tensor))
    return shuffle(positions)

In [None]:
def print_metric(data, batch, epoch, start, end, metric, typ):
    """Print one metric value together with the elapsed time.

    Args:
        data: Metric value to show.
        batch: Batch counter; ``batch - 1`` is printed for ``typ == "train"``.
        epoch: Zero-based epoch; ``epoch + 1`` is printed for ``typ == "val"``.
        start, end: Timestamps in seconds; their difference is reported.
        metric: Name of the metric, e.g. "Acc".
        typ: "train" or "val"; any other value prints without a prefix.
    """
    if typ == "train":
        pre = "BATCH " + str(batch - 1) + " "
    elif typ == "val":
        pre = "\nEPOCH " + str(epoch + 1) + " "
    else:
        # Previously `pre` stayed unbound here and the print raised.
        pre = ""
    elapsed = "Time: {} s".format(np.round(end - start, 1))
    # The old template wrapped the value in literal "%s" markers left over
    # from removed colour codes; they are gone now.
    print(pre + "{} {} : {}".format(typ, metric, data) + " " + elapsed)
    

In [None]:
# Per-epoch history, filled in by the training loop: one value per epoch for
# each of the training and validation splits.
acc_stats = {
    'train': [],
    "val": []
}
loss_stats = {
    'train': [],
    "val": []
}

In [None]:
# List what is currently in the Kaggle working directory (absolute, Kaggle-specific path).
files = os.listdir('/kaggle/working')
print(files)

In [None]:
# os.remove("weightsmd1.pth")

In [None]:
# NOTE(review): D is not used in this cell; presumably the (channels, mel bands,
# frames) shape of one input - confirm before relying on it.
D = (3, 128, 255)
PATH = "weightsmd1.pth"

# Train model1 for EPOCHS epochs, validating and checkpointing after each one.
start = time.time()
# print(f"start time is.{start}")

for epoch in range(EPOCHS):

    train_epoch_loss = 0
    train_epoch_acc = 0

    model1.train()

    batch = 1

    for train_x, train_y in bird_loader_train:

        # Permute the samples inside the batch (inputs and labels alike).
        idx = get_shuffle_idx(train_x)
        train_x = train_x[idx].to(device)
        train_y = train_y[idx].to(device)

        optimizer.zero_grad()

        train_preds = model1(train_x)
        train_outputs = train_preds["logits"]
        train_loss = cel(train_y, train_outputs)
#         train_softmax = train_preds["multiclass_prob"]
        train_acc = accuracy(train_y, train_outputs)

        train_loss.backward()
        optimizer.step()

        tr_acc = np.round(train_acc.item(), 3)
        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()

        end = time.time()
        batch = batch + 1

        # Report the current batch accuracy every 100 batches.
        is_print = batch % 100 == 1
        if is_print: print_metric(tr_acc, batch, epoch, start, end, "Acc", "train")

    # Averages over the batches of this epoch.
    epoch_loss = train_epoch_loss / len(bird_loader_train)
    epoch_acc = train_epoch_acc / len(bird_loader_train)

    print('Loss {:.4f} Acc: {:.4f} '.format(epoch_loss, epoch_acc))

    valid_epoch_loss = 0
    valid_epoch_acc = 0

    model1.eval()

    with torch.no_grad():
        for valid_x, valid_y in bird_loader_val:
            idx = get_shuffle_idx(valid_x)

            valid_x = valid_x[idx].to(device)
            valid_y = valid_y[idx].to(device)

            valid_preds = model1(valid_x)
            valid_outputs = valid_preds['logits']
#             valid_softmax = valid_preds['multiclass_prob']
            valid_loss = cel(valid_y, valid_outputs)
            valid_acc = accuracy(valid_y, valid_outputs)

            val_acc = np.round(valid_acc.item(), 3)
            valid_epoch_loss += valid_loss.item()
            valid_epoch_acc += valid_acc.item()

        epoch_valid_loss = valid_epoch_loss / len(bird_loader_val)
        epoch_valid_acc = valid_epoch_acc / len(bird_loader_val)

        print('Loss {:.4f} Acc: {:.4f} '.format(epoch_valid_loss, epoch_valid_acc))

    end = time.time()
    # NOTE: val_acc is the accuracy of the last validation batch only.
    print_metric(val_acc, batch, epoch, start, end, "Acc", "val")

    print('ENDING TRAINING...', epoch)

    loss_stats['train'].append(epoch_loss)
    loss_stats['val'].append(epoch_valid_loss)

    acc_stats['train'].append(epoch_acc)
    acc_stats['val'].append(epoch_valid_acc)

    # Checkpoint AFTER the epoch has trained. Saving at the top of the loop, as
    # before, always stored the previous epoch's weights and never the final ones.
    state = {
        "epoch": epoch,
        "state_dict": model1.state_dict(),
        "optimizer": optimizer.state_dict()
    }
    torch.save(state, PATH)


In [None]:
# def train_model(model, dataloaders, criterion, optimizer, num_epochs=25):
#     since = time.time()
    
#     for epoch in range(num_epochs):
        

In [None]:
# Number of epochs recorded in the loss history.
print(len(loss_stats['train']))

In [None]:
# Training and validation loss per epoch. The x axis follows the number of
# epochs actually recorded, so the plot also works after an interrupted run.
epochs_run = range(len(loss_stats['train']))
plt.title("train and val loss vs no of Epochs")
plt.xlabel("Training epochs")
plt.ylabel("Loss")
plt.plot(epochs_run, loss_stats['val'], label="val loss")
plt.plot(epochs_run, loss_stats['train'], label="train_loss")
# One tick per epoch; np.arange(EPOCHS, 1) was an empty range and removed every tick.
plt.xticks(np.arange(len(epochs_run)))
plt.legend()
plt.show()


In [None]:
# Training and validation accuracy per epoch. The x axis follows the number of
# epochs actually recorded, so the plot also works after an interrupted run.
epochs_run = range(len(acc_stats['train']))
plt.title("train and val accuracy vs no of Epochs")
plt.xlabel("Training epochs")
plt.ylabel("Accuracy")
plt.plot(epochs_run, acc_stats['val'], label="val_acc")
plt.plot(epochs_run, acc_stats['train'], label="train_acc")
# One tick per epoch; np.arange(EPOCHS, 1) was an empty range and removed every tick.
plt.xticks(np.arange(len(epochs_run)))
plt.legend()
plt.show()

In [None]:
class TestDataset(data.Dataset):
    '''
    Test dataset: turns 5-second windows of one audio clip into model inputs.

    Args:
        dfs: Test rows for the clip, with `site`, `row_id` and (for sites
            other than "site_3") `seconds` columns.
        clip: The audio samples of the clip.
        img_size: Height in pixels every spectrogram image is resized to.
        melspect_params: Keyword arguments for librosa's mel spectrogram; the
            notebook-level `melspec_params` is used when this is empty.
    '''
    def __init__(self, dfs: pd.DataFrame, clip: np.ndarray, img_size =224, melspect_params={}):
        self.dfs = dfs
        self.clip = clip
        self.img_size = img_size
        self.melspect_params = melspect_params

    def __len__(self):
        return len(self.dfs)

    def _to_image(self, y, sr):
        '''Convert one audio window into a (3, H, W) float32 image scaled to 0..1.'''
        params = self.melspect_params or melspec_params
        melspec = librosa.feature.melspectrogram(y=y, sr=sr, **params)
        melspec = librosa.power_to_db(melspec).astype(np.float32)

        image = mono_to_color(melspec)
        height, width, _ = image.shape
        # Resize to the target height while keeping the aspect ratio.
        image = cv.resize(image, (int(width * self.img_size / height), self.img_size))
        image = np.moveaxis(image, 2, 0)
        return (image / 255.0).astype(np.float32)

    def __getitem__(self, idx: int):
        '''Return `(image or stack of images, row_id, site)` for row `idx`.'''
        # NOTE(review): windows are cut assuming 32 kHz audio, while prediction()
        # in this notebook loads clips at SR - confirm which rate is intended.
        sr = 32000
        window = sr * 5
        sample = self.dfs.loc[idx, :]
        site = sample['site']
        row_id = sample['row_id']
        if site == "site_3":
            # The whole clip is scored: one image per complete 5-second window.
            y = self.clip.astype(np.float32)
            # The dB spectrogram is what gets converted to an image here, as in
            # the other branch; previously a misspelt variable meant the raw
            # power spectrogram was used instead.
            images = [
                self._to_image(y[begin:begin + window], sr)
                for begin in range(0, len(y) - window + 1, window)
            ]
            images = np.asarray(images)
            return images, row_id, site
        else:
            # `seconds` marks the end of the 5-second window to score.
            end_seconds = int(sample['seconds'])
            start_seconds = int(end_seconds - 5)

            start_index = sr * start_seconds
            end_index = sr * end_seconds

            y = self.clip[start_index:end_index].astype(np.float32)
            image = self._to_image(y, sr)

            return image, row_id, site
                
    

In [None]:
# clip, _ = librosa.load("../input/birdsong-recognition/train_audio/ameavo/XC133080.mp3")

In [None]:
# test_df = TestDataset(test, clip )
# for i in range(5):
#     print(test_df[i])

In [None]:
#extracting features from audio
# def feature extraction():
#     dataset = 

In [None]:
def get_model(weights_path: str):
    """Rebuild the trained network and load its weights for inference.

    Args:
        weights_path: Checkpoint file written by the training cell.

    Returns:
        A BirdNet on `device`, in eval mode.
    """
    # The checkpoint is saved from a BirdNet (model1), and inference reads the
    # "multilabel_prob" output that only BirdNet/ResNet provide - `newmodel`
    # returns logits only and has different parameter names.
    model = BirdNet(f=Fr, o=len(code_dict))
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    checkpoint = torch.load(weights_path, map_location=device)
    # The training cell stores the weights under "state_dict"; the former
    # "model_state_dict" key is still accepted for older checkpoints.
    if "state_dict" in checkpoint:
        weights = checkpoint["state_dict"]
    else:
        weights = checkpoint["model_state_dict"]
    model.load_state_dict(weights)
    model.to(device)
    model.eval()
    return model
    


In [None]:
def prediction_for_clip(test_df:pd.DataFrame, clip:np.ndarray, model, mel_params:dict, threshold = 0.5):
    """Predict the bird labels for every test row that belongs to one clip.

    Args:
        test_df: Test rows for this clip.
        clip: Audio samples of the clip.
        model: Network returning a dict with a "multilabel_prob" entry.
        mel_params: Mel-spectrogram settings handed to TestDataset.
        threshold: Minimum probability for a class to count as present.

    Returns:
        Dict mapping row_id to a space-separated string of bird codes, or
        "nocall" when no class reaches the threshold.
    """
    # Use the rows passed in, not the notebook-level `test` frame.
    dataset = TestDataset(test_df, clip=clip, img_size=224, melspect_params = mel_params)
    loader = data.DataLoader(dataset, batch_size=1, shuffle=False)

    model.eval()
    prediction_dict = {}

    for image, row_id, site in progress_bar(loader):
        site = site[0]
        row_id = row_id[0]

        if site in {"site_1", "site_2"}:
            # One 5-second window per row.
            image = image.to(device)

            with torch.no_grad():
                prediction = model(image)
            proba = prediction["multilabel_prob"].detach().cpu().numpy().reshape(-1)
            labels = np.argwhere(proba >= threshold).reshape(-1).tolist()

        else:
            # The row covers the whole clip: score its windows in batches of 16
            # to avoid one very large forward pass.
            image = image.squeeze(0)
            batch_size = 16

            all_events = set()
            for first in range(0, image.size(0), batch_size):
                batch = image[first:first + batch_size]

                if batch.ndim == 3:
                    batch = batch.unsqueeze(0)

                batch = batch.to(device)
                with torch.no_grad():
                    prediction = model(batch)
                proba = prediction["multilabel_prob"].detach().cpu().numpy()

                # Column 1 of argwhere holds the class index of every
                # (window, class) pair at or above the threshold.
                all_events.update(np.argwhere(proba >= threshold)[:, 1].tolist())
            labels = list(all_events)

        if len(labels) == 0:
            prediction_dict[row_id] = "nocall"
        else:
            prediction_dict[row_id] = " ".join(INV_BIRD_CODE[label] for label in labels)

    return prediction_dict
                
                

In [None]:
def prediction(test_df : pd.DataFrame, test_audio: Path, mel_params: dict, weights_path: str, threshold=0.5):
    """Run inference for every audio clip referenced in `test_df`.

    Args:
        test_df: Test metadata with `audio_id`, `row_id`, `site` columns.
        test_audio: Folder containing the test recordings.
        mel_params: Mel-spectrogram settings.
        weights_path: Checkpoint to load with get_model.
        threshold: Minimum probability for a class to count as present.

    Returns:
        DataFrame with `row_id` and `birds` columns covering all clips found.
    """
    model = get_model(weights_path)

    # Index the recordings by file name without extension so they can be
    # matched against `audio_id` whatever the extension is.
    audio_files = {Path(name).stem: name for name in sorted(os.listdir(test_audio))}

    predictions_dfs = []
    for audio_id in test_df["audio_id"].unique():
        if audio_id not in audio_files:
            logger.warning(f"No audio file found for {audio_id}; skipping")
            continue

        # NOTE(review): clips are loaded at SR while TestDataset slices windows
        # with its own fixed rate - confirm the two are meant to agree.
        clip, _ = librosa.load(Path(test_audio) / audio_files[audio_id],
                               sr=SR,
                               mono=True,
                               res_type="kaiser_fast")
        # Only the rows of this clip are scored against it.
        test_df_for_audio_id = test_df[test_df["audio_id"] == audio_id].reset_index(drop=True)

        with timer(f"Prediction on {audio_id}", logger):
            prediction_dict = prediction_for_clip(test_df_for_audio_id, clip=clip, model=model,
                                                  mel_params=mel_params, threshold=threshold)

        predictions_dfs.append(pd.DataFrame({"row_id": list(prediction_dict.keys()),
                                             "birds": list(prediction_dict.values())}))

    # Combine once all clips are done; the early return inside the loop used to
    # stop after the first clip.
    if not predictions_dfs:
        return pd.DataFrame({"row_id": [], "birds": []})
    prediction_df = pd.concat(predictions_dfs, axis=0, sort=False).reset_index(drop=True)
    return prediction_df

In [None]:
# Run inference over the test set with the trained weights.
# (A comma was missing after the weights_path argument, which made this cell a SyntaxError.)
submission = prediction(test_df=test,
                        test_audio=test_audio,
                        weights_path="weightsmd1.pth",
                        mel_params=melspec_params,
                        threshold=0.6)

In [None]:
# Preview the first 20 predicted rows.
submission.head(20)