In [None]:
#This kernel is based on SimCLR unsupervised learning (link below) to learn the birds' audio representation,
#with some adjustments for audio data.

algorithm blocks:
1. Get path to data
2. Algorithm hyperparameter set and bird encoding
3. Explore audio example
4. Set dataloaders
5. Define simclr unsupervised model with lstm and transformer backbone
6. Define unsupervised and supervised training datasets and nets
7. Unsupervised training loop
8. Freeze weights for supervised linear NN based on unsupervised embedding 
9. Supervised helper functions
10. Supervised training loop
11. Prediction on test set
12. Submit
#https://arxiv.org/abs/2002.05709

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns
import soundfile as sf
import math
import torchaudio
from torchaudio import transforms

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn import preprocessing

from tqdm import tqdm
from collections import defaultdict
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# 1. Get path to data
import json

root_dir = '../input/birdclef-2022'
train_df = pd.read_csv(root_dir + '/train_metadata.csv')

# Test soundscapes: one entry per directory item, keeping only the text
# before the first '.' (the trailing '/' is required because the prediction
# cell builds paths by plain string concatenation).
test_audio_dir = root_dir + '/test_soundscapes/'
file_list = [entry.split('.')[0] for entry in sorted(os.listdir(test_audio_dir))]

# Load scored birds
with open(root_dir + '/scored_birds.json') as sbfile:
    scored_birds = json.load(sbfile)

# Metadata rows whose primary label is one of the scored species.
train_meta = train_df.loc[train_df['primary_label'].isin(scored_birds)]


In [None]:
# 2. Algorithm hyperparameter set and bird encoding
n_epochs_unsuper = 2
n_epochs_super = 10
train_on_gpu = torch.cuda.is_available()

# Seed every RNG the notebook draws from (example selection, random crops,
# weight initialisation) so that Restart & Run All gives repeatable results
# as far as the CPU-side randomness is concerned.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# The encoder is fitted on every species in the metadata, not only the scored ones.
le = preprocessing.LabelEncoder().fit(train_df['primary_label'])

# Encoded ids of the scored birds, plus both direction lookups:
#   Scored_bird_index_dict     : position in scored_birds -> encoded id
#   Scored_bird_index_dict_inv : encoded id -> position in scored_birds
Scored_bird_index = le.transform(scored_birds)
Scored_bird_index_dict = dict(enumerate(Scored_bird_index))
Scored_bird_index_dict_inv = {j:i for i,j in enumerate(Scored_bird_index)}


In [None]:
# test_df = pd.read_csv('../input/birdclef-2022/test.csv')
# print(os.listdir('../input/birdclef-2022/test_soundscapes/'))



In [None]:
# 3. Explore audio example
train_audio_dir = '../input/birdclef-2022/train_audio'

# random.randint is inclusive at BOTH ends, so randint(0, len(train_df)) could
# return len(train_df) and make the .loc lookup fail; randrange excludes the
# upper bound.
idx = random.randrange(len(train_df))
filename = train_df.loc[idx,'filename']

# sig and sr stay module-level on purpose: later cells read them.
sig, sr = torchaudio.load(os.path.join(train_audio_dir, filename))
audio_sample = (sig, sr)

# Mel spectrogram of the first channel, log-scaled and standardised for display.
MelSpec = torchaudio.transforms.MelSpectrogram(sample_rate=sr,n_fft=800)(sig[0,:])
meldata = torch.log(MelSpec-MelSpec.min()+1e-5)
meldata = (meldata-meldata.mean())/meldata.std()

plt.figure(figsize=(12,6))
plt.imshow(meldata, aspect='auto')
plt.colorbar()
print(sig.shape[1]/sr,sr)   # duration in seconds, sample rate
print(meldata.shape)

In [None]:
# 4. set dataloaders

def normRec(Mel_All):
    """Log-scale a spectrogram tensor and standardise it.

    The tensor is shifted so its minimum becomes 1e-8, passed through a
    natural log, then centred on its global mean and divided by its global
    standard deviation (``Tensor.std`` default). The input is not modified.

    Args:
        Mel_All: tensor of spectrogram values.

    Returns:
        A tensor of the same shape as ``Mel_All``.
    """
    log_mel = torch.log(Mel_All - Mel_All.min() + 1e-8)
    return (log_mel - log_mel.mean()) / log_mel.std()

class Dataset(Dataset):
    """Yields two random mel-spectrogram crops of one recording plus its label.

    Each item is ``(Rec1, Rec2, label)``: two crops of ``Rec_mel`` frames taken
    at independent random positions of the same recording (each normalised
    with ``normRec``), and a multi-hot vector over the scored birds. The label
    is all zeros when the recording's primary label is not a scored bird.

    Args:
        train_df: metadata frame with ``filename`` and ``primary_label`` columns.
        le: fitted label encoder used to encode ``primary_label``.
        train_audio_dir: directory that ``filename`` is relative to.
        Scored_bird_index_dict_inv: encoded label -> position in the label vector.
        Rec_mel: number of spectrogram frames per crop.
        min_mel_size: recordings with fewer frames are tiled until they reach it.
        zero_rand_low_freq: when True, zero a random number of the lowest and
            of the highest mel rows of each crop (each count is below half the
            number of rows).
        sample_rate: sample rate handed to ``MelSpectrogram``. ``None`` keeps
            the previous behaviour of reading the notebook-level ``sr``.
    """

    def __init__(self,train_df,le,train_audio_dir,Scored_bird_index_dict_inv,Rec_mel = 500,min_mel_size = 800,zero_rand_low_freq=False,sample_rate=None):
        # The DataLoader indexes 0..len-1, so make the frame's index match
        # that regardless of how the caller filtered or split it.
        # reset_index returns a new frame, so the caller's frame is untouched.
        self.train_df = train_df.reset_index(drop=True)
        self.DataPath = train_audio_dir
        if sample_rate is None:
            sample_rate = sr  # notebook-level value set by the audio exploration cell
        self.MelSpec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,n_fft=800)
        self.Rec_mel = Rec_mel
        self.min_mel_size = min_mel_size
        self.train_audio_dir = train_audio_dir
        self.le = le
        self.Scored_bird_index_dict_inv = Scored_bird_index_dict_inv
        self.zero_rand_low_freq = zero_rand_low_freq

    def __len__(self):
        """Number of recordings in the metadata frame."""
        return len(self.train_df)

    def _crop_two_views(self, Mel_All):
        """Return two independent random crops of ``Rec_mel`` frames (not normalised).

        Randomness comes from torch's RNG rather than numpy's: DataLoader
        workers get distinct torch seeds, whereas numpy's state may be
        duplicated across workers and make them produce identical crops.
        """
        if Mel_All.shape[1] == 0:
            raise ValueError('cannot crop an empty spectrogram')

        # Duplicate short signals; also guarantee at least one valid start position.
        min_frames = max(self.min_mel_size, self.Rec_mel + 1)
        while Mel_All.shape[1] < min_frames:
            Mel_All = torch.cat((Mel_All,Mel_All),dim=1)

        StartRange = Mel_All.shape[1] - self.Rec_mel
        starts = torch.randint(0, StartRange, (2,)).tolist()
        # clone(): the crops may overlap, and they are edited in place below;
        # without a copy, masking one crop would also alter the other.
        views = [Mel_All[:, start:start + self.Rec_mel].clone() for start in starts]

        if self.zero_rand_low_freq:
            half = views[0].shape[0] // 2
            n_low = torch.randint(0, half, (2,)).tolist()
            n_high = torch.randint(0, half, (2,)).tolist()
            for view, low, high in zip(views, n_low, n_high):
                view[:low, :] = 0
                # view[-0:] would select EVERY row, so a drawn count of zero
                # must be skipped instead of sliced.
                if high > 0:
                    view[-high:, :] = 0
        return views[0], views[1]

    def __getitem__(self, index):
        """Load recording ``index`` and return ``(Rec1, Rec2, label)``."""
        filename = self.train_df.loc[index,'filename']

        lebal_all_birds = self.le.transform([self.train_df.loc[index,'primary_label']])[0]
        label = torch.zeros(len(self.Scored_bird_index_dict_inv))
        if lebal_all_birds in self.Scored_bird_index_dict_inv:
            label[self.Scored_bird_index_dict_inv[lebal_all_birds]] = 1

        sig, _ = torchaudio.load(os.path.join(self.train_audio_dir, filename))

        # random channel select
        channel = torch.randint(0, sig.shape[0], (1,)).item()
        Mel_All = self.MelSpec(sig[channel,:])

        Rec1, Rec2 = self._crop_two_views(Mel_All)
        return normRec(Rec1), normRec(Rec2), label

In [None]:
# 5. define simclr unsupervised with lstm and transformer
class SimClr_bird(nn.Module):
    """Contrastive encoder: 2-layer LSTM followed by a residual transformer encoder.

    Both views are concatenated along the batch axis, run through the LSTM,
    layer-normalised, and passed through a transformer encoder whose output is
    added back scaled by the learnable scalar ``rezero`` (initialised to 0).
    The last time step, layer-normalised again, is the embedding of each view.

    The LSTM input size is ``EmbeddingSize``, so the last dimension of the
    inputs must equal ``EmbeddingSize`` (inputs are batch-first:
    ``(batch, time, EmbeddingSize)``).

    Args:
        N_heads: attention heads per transformer layer.
        EmbeddingSize: feature size used for the LSTM, transformer and norms.
        dim_ff: transformer feed-forward width.
        N_transormer_L: number of transformer encoder layers.
        temp: stored on the module; not used by ``forward``.
        device: fallback device for ``Init_hidden`` when none is given.
    """

    def __init__(self,N_heads,EmbeddingSize =128,dim_ff=2048,N_transormer_L=6,temp = 64,device= 'cuda'):
        super(SimClr_bird,self).__init__()

        self.EmbeddingSize = EmbeddingSize
        self.encoderL = nn.TransformerEncoderLayer(EmbeddingSize,N_heads,dim_feedforward=dim_ff,batch_first=True)
        self.Transformer = nn.TransformerEncoder(self.encoderL,N_transormer_L)
        self.lstm = nn.LSTM(EmbeddingSize,EmbeddingSize,2,batch_first=True)
        # Bilinear form used to score a pair of embeddings.
        self.weight = nn.Parameter(torch.randn(EmbeddingSize, EmbeddingSize))
        # Not used by forward; kept so existing state dicts keep loading.
        self.Lin = nn.Linear(EmbeddingSize, EmbeddingSize)
        self.temp = temp
        self.rezero = nn.Parameter(torch.zeros(1))
        self.device = device
        # Was hard-coded to 128, which broke any other EmbeddingSize.
        self.bn = nn.LayerNorm(EmbeddingSize)
        self.bn2 = nn.LayerNorm(EmbeddingSize)

    def Init_hidden(self,bs,device=None,dtype=None):
        """Zero (h0, c0) for the 2-layer LSTM, each of shape (2, bs, EmbeddingSize).

        ``device`` defaults to the device given at construction time.
        """
        if device is None:
            device = self.device
        shape = (2, bs, self.EmbeddingSize)
        h0 = torch.zeros(shape, device=device, dtype=dtype)
        c0 = torch.zeros(shape, device=device, dtype=dtype)
        return h0,c0

    def forward(self,x1,x2,return_features = False):
        """Embed both views.

        Returns:
            ``(x1_t, x2_t)`` embeddings when ``return_features`` is True,
            otherwise the ``(batch, batch)`` matrix ``x2_t @ weight @ x1_t.T``.
        """
        s = x1.shape[0]
        x = torch.cat((x1,x2),dim=0)

        # Create the initial state where the input lives, so the module works
        # on whatever device it was moved to (the constructor default is 'cuda').
        h0,c0 = self.Init_hidden(2*s, device=x.device, dtype=x.dtype)
        x,_ = self.lstm(x,(h0,c0))
        x = self.bn(x)
        x = x+self.Transformer(x)*self.rezero
        x = self.bn2(x[:,-1,:])   # keep the last time step only

        x1_t = x[:s,:]
        x2_t = x[s:,:]
        if return_features:
            return x1_t,x2_t

        return torch.matmul(x2_t, torch.matmul(self.weight, x1_t.transpose(0,1)))


class Pred_head(nn.Module):
    """Classification head on top of a ``SimClr_bird`` backbone.

    Both views are embedded by the backbone; each embedding goes through
    ReLU -> BatchNorm -> Dropout -> Linear, and the two sets of class scores
    are summed.

    Args:
        N_classes: number of output classes.
        EmbeddingSize: backbone embedding size (now also forwarded to the
            backbone, so head and backbone always agree).
        device: device name handed to the backbone.
    """

    def __init__(self,N_classes,EmbeddingSize =128,device= 'cuda'):
        super(Pred_head,self).__init__()
        self.device = device
        self.featureBack = SimClr_bird(4,EmbeddingSize=EmbeddingSize,dim_ff=2048,N_transormer_L=4,device=self.device)
        # fc and bn2 are not used by forward; they are kept so the module's
        # parameter set and state dict stay unchanged.
        self.fc = nn.Linear(EmbeddingSize,EmbeddingSize)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(EmbeddingSize)
        self.bn2 = nn.BatchNorm1d(EmbeddingSize)
        self.classier = nn.Linear(EmbeddingSize,N_classes)
        self.drop = nn.Dropout()

    def _logits(self, features):
        """Class scores for one batch of backbone embeddings."""
        return self.classier(self.drop(self.bn(self.relu(features))))

    def forward(self,x1,x2):
        """Return the summed class scores of the two views, shape (batch, N_classes)."""
        x1,x2 = self.featureBack(x1,x2,return_features = True)
        # Left operand is evaluated first, so x1 still passes through
        # BatchNorm/Dropout before x2, as before.
        return self._logits(x1) + self._logits(x2)
# simclr loss        
def Diag_loss(output):
    """Contrastive loss over a square score matrix.

    Scores are squashed with a sigmoid. For every row the loss term is
    ``-log(diagonal / sum(off-diagonal entries of that row) + 1e-15)``;
    the terms are summed over rows.

    Args:
        output: square ``(n, n)`` tensor of pair scores.

    Returns:
        Scalar tensor with the summed loss.
    """
    probs = torch.sigmoid(output)
    n = probs.shape[0]

    # Row-major selection of everything except the diagonal -> (n, n-1).
    off_diagonal = ~torch.eye(n, dtype=torch.bool, device=probs.device)
    negatives = probs[off_diagonal].reshape(n, n - 1)
    positives = probs.diag()

    per_row = -torch.log(positives / negatives.sum(1) + 1e-15)
    return torch.sum(per_row)
        
def Cos_loss(out1,out2):
    """Contrastive loss on the cosine similarity between two embedding batches.

    Builds the ``(n, n)`` matrix of cosine similarities between every row of
    ``out1`` and every row of ``out2``, clamps negatives to zero, and sums
    ``-log(diagonal / sum(off-diagonal entries of that row) + 1e-15)`` over rows.

    Args:
        out1: ``(n, d)`` embeddings of the first view.
        out2: ``(n, d)`` embeddings of the second view.

    Returns:
        Scalar tensor with the summed loss.
    """
    similarity = nn.functional.cosine_similarity(out1.unsqueeze(1), out2.unsqueeze(0), dim=2)
    similarity = torch.relu(similarity)
    n = similarity.shape[0]

    # Row-major selection of everything except the diagonal -> (n, n-1).
    off_diagonal = ~torch.eye(n, dtype=torch.bool, device=similarity.device)
    negatives = similarity[off_diagonal].reshape(n, n - 1)
    positives = similarity.diag()

    return torch.sum(-torch.log(positives / negatives.sum(1) + 1e-15))
                

In [None]:
# 6. define unsupervised and supervised training datasets and nets
device = 'cuda' if train_on_gpu else 'cpu'

# Contrastive model for the unsupervised stage.
model = SimClr_bird(4,dim_ff=2048,N_transormer_L=4,device=device)
model = model.to(device)
#print(model)

criterion = nn.BCEWithLogitsLoss()
batch_size = 48
batch_size_super = 48
params = {'batch_size': batch_size,'shuffle': True, 'num_workers': 5}
params_super = {'batch_size': batch_size_super,'shuffle': True, 'num_workers': 3}
params_super_val = {'batch_size': batch_size_super,'shuffle': False, 'num_workers': 3}

# Unsupervised stage: every recording, with random frequency masking enabled.
training_set = Dataset(train_df, le , train_audio_dir,Scored_bird_index_dict_inv,zero_rand_low_freq=True)
training_generator = DataLoader(training_set, **params)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.005)

# Supervised stage: scored birds only, 80/20 train/validation split.
# random_state pins the split so validation numbers are comparable across runs.
train_df_super,val_df_super = train_test_split(train_meta,test_size=0.2,random_state=42)
train_df_super = train_df_super.reset_index(drop=True)
val_df_super = val_df_super.reset_index(drop=True)
training_set_super = Dataset(train_df_super, le,train_audio_dir,Scored_bird_index_dict_inv)
training_generator_super = DataLoader(training_set_super, **params_super)
val_set_super = Dataset(val_df_super, le,train_audio_dir,Scored_bird_index_dict_inv)
val_generator_super = DataLoader(val_set_super, **params_super_val)

# Pass the detected device explicitly: the Pred_head default is 'cuda',
# which is wrong on a CPU-only session.
model_P = Pred_head(len(Scored_bird_index_dict_inv),device=device)



In [None]:
# 7. Unsupervised training loop:
print_every = 30
for epoch in range(1, n_epochs_unsuper+1):

    # running sum of batch losses and number of batches seen this epoch
    train_loss = 0.0
    loss_itm = 0
    model.train()

    for data1,data2,_ in tqdm(training_generator):

        # A contrastive batch needs at least one negative pair; with a single
        # sample the off-diagonal sum in Diag_loss is 0 and the loss is -inf.
        if data1.shape[0] < 2:
            continue

        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data1, data2 = data1.cuda(), data2.cuda()

        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass; the dataset yields (batch, mel, time), the model wants (batch, time, mel)
        output = model(data1.transpose(1,2),data2.transpose(1,2))
        # calculate the batch loss
        loss = Diag_loss(output)
        # backward pass, then a single optimization step
        loss.backward()
        optimizer.step()

        # update training loss
        train_loss += loss.item()
        loss_itm +=1
        if loss_itm % print_every == 0:
            print(train_loss/loss_itm)

    # Report the mean batch loss; the previous version printed the running
    # SUM under the same "batch loss" label.
    print('Epoch: {} \t  Train batch loss: {:.6f} '.format(epoch,train_loss/max(loss_itm,1)))

In [None]:
# 8. Freeze weights for supervised linear NN based on unsupervised embedding
model_P = Pred_head(len(Scored_bird_index_dict_inv),device=device)

# Copy the weights learned in the unsupervised stage into the head's backbone,
# then exclude every backbone parameter from gradient computation.
backbone = model_P.featureBack
backbone.load_state_dict(model.state_dict())
for frozen_param in backbone.parameters():
    frozen_param.requires_grad_(False)
    
# model_p.fc.weight.requires_grad = True
# model_p.classifier.weight.requires_grad = True
# model_p.bn.weight.requires_grad = True
# model_p.bn2.weight.requires_grad = True

In [None]:
#data1,data2,labels = next(iter(training_generator_super))
#out = model_P(data1.transpose(1,2).cuda(),data2.transpose(1,2).cuda())


In [None]:
# 9. Supervised helper functions

def BCE_weighted_loss(pred,label,w=3):
    """Summed binary cross-entropy on logits with positives weighted by ``w``.

    Args:
        pred: tensor of logits.
        label: tensor of 0/1 targets, same shape as ``pred``.
        w: multiplier applied to the positive-class term.

    Returns:
        Scalar tensor: ``-sum(w * label * log(p + 1e-10)) - sum((1 - label) * log(1 - p + 1e-10))``
        with ``p = sigmoid(pred)``.
    """
    prob = torch.sigmoid(pred)
    positive_term = torch.sum(torch.log(prob + 1e-10) * label * w)
    negative_term = torch.sum((1 - label) * torch.log(1 - prob + 1e-10))
    return -1 * positive_term - negative_term
    
    
def get_stats(labels,pred,thr=0):
    """Count confusion-matrix cells for thresholded scores.

    A score is a positive prediction when ``pred > thr`` and a negative one
    otherwise. Previously negatives required ``pred < thr``, so a score
    exactly equal to the threshold was counted in none of the four cells.

    Args:
        labels: tensor of 0/1 targets.
        pred: tensor of scores, same shape as ``labels``.
        thr: decision threshold.

    Returns:
        ``(TP, FP, TN, FN)`` as 0-dim tensors.
    """
    predicted_pos = pred > thr
    predicted_neg = ~predicted_pos
    actual_pos = labels == 1
    actual_neg = labels == 0

    TP = torch.sum(actual_pos & predicted_pos)
    FP = torch.sum(actual_neg & predicted_pos)
    TN = torch.sum(actual_neg & predicted_neg)
    FN = torch.sum(actual_pos & predicted_neg)

    return TP,FP,TN,FN

def Get_pre_rec_f1(TP,FP,TN,FN):
    """Precision, recall and F1 from confusion-matrix counts.

    A ratio whose denominator is zero is reported as 0 instead of raising
    ``ZeroDivisionError`` (numbers) or producing NaN (tensors).

    Args:
        TP, FP, TN, FN: counts, as Python numbers or tensors. ``TN`` is
            accepted for symmetry with ``get_stats`` but not used.

    Returns:
        ``(precision, recall, f1)``.
    """
    def _ratio(num, den):
        # Tensor denominators are handled element-wise so that per-class
        # count vectors keep working.
        if torch.is_tensor(den):
            is_zero = den == 0
            quotient = num / torch.where(is_zero, torch.ones_like(den), den)
            return torch.where(is_zero, torch.zeros_like(quotient), quotient)
        return num / den if den != 0 else 0.0

    precision = _ratio(TP, TP+FP)
    recall = _ratio(TP, TP+FN)
    f1 = _ratio(2*precision*recall, precision+recall)
    return precision, recall,f1


In [None]:
# Supervised stage: single-label cross-entropy over the scored birds,
# optimised with Adam over the prediction model's parameters.
Criterion_Super = nn.CrossEntropyLoss()
optimizer_super = torch.optim.Adam(model_P.parameters(), lr=0.01)


In [None]:
# 10. Supervised training loop
print_every = 10
if train_on_gpu:
    model_P = model_P.cuda()
for epoch in range(1, n_epochs_super+1):
    # confusion counts accumulate over the whole epoch;
    # loss and mean scores are reset after every report
    TP_tr,FP_tr,TN_tr,FN_tr = 0,0,0,0
    train_loss = 0.0
    score_pos,score_neg = 0,0
    loss_itm = 0

    model_P.train()
    # The backbone is frozen, so keep it in eval mode: otherwise the dropout
    # inside its transformer layers stays active and the "fixed" features
    # change randomly from step to step.
    model_P.featureBack.eval()

    for data1,data2,labels in tqdm(training_generator_super):

        # move tensors to GPU if CUDA is available
        if train_on_gpu:
            data1, data2 ,labels = data1.cuda(), data2.cuda(),labels.cuda()
        # clear the gradients of all optimized variables
        optimizer_super.zero_grad()
        # forward pass; the dataset yields (batch, mel, time), the model wants (batch, time, mel)
        output = model_P(data1.transpose(1,2),data2.transpose(1,2))
        # calculate the batch loss (labels are one-hot, the criterion wants class indices)
        loss = Criterion_Super(output,torch.argmax(labels,dim=1))
        # backward pass, then a single optimization step
        loss.backward()
        optimizer_super.step()

        # bookkeeping only, no gradients needed
        with torch.no_grad():
            train_loss += float(loss)
            probs = torch.sigmoid(output)
            # these two were printed but never updated before, so they always showed 0
            score_pos += float(torch.mean(probs[labels==1]))
            score_neg += float(torch.mean(probs[labels==0]))
            TP,FP,TN,FN = get_stats(labels,output)
            TP_tr,FP_tr,TN_tr,FN_tr = TP_tr+TP,FP_tr+FP,TN_tr+TN,FN_tr+FN

        loss_itm +=1
        if loss_itm % print_every == 0:
            print("train loss loss: {:.3f}  mean pos score: {:.3f}  mean neg score {:.3f}".format(train_loss/loss_itm,score_pos/loss_itm,score_neg/loss_itm))
            # precision = TP/(TP+FP), recall = TP/(TP+FN); the two values were
            # previously printed under each other's label
            print("train precision: {:.3f} recall: {:.3f} F1: {:.3f} ".format(TP_tr/(TP_tr+FP_tr),TP_tr/(TP_tr+FN_tr),TP_tr/(TP_tr+0.5*FN_tr+0.5*FP_tr)))
            loss_itm,train_loss,score_pos,score_neg = 0,0,0,0

    # ---- validation ----
    val_loss ,score_val_pos,score_val_neg,loss_itm = 0,0,0,0
    model_P.eval()
    with torch.no_grad():
        TP_ts,FP_ts,TN_ts,FN_ts = 0,0,0,0
        for data1,data2,labels in tqdm(val_generator_super):
            if train_on_gpu:
                data1, data2 ,labels = data1.cuda(), data2.cuda(),labels.cuda()

            output = model_P(data1.transpose(1,2),data2.transpose(1,2))
            TP,FP,TN,FN = get_stats(labels,output)
            TP_ts,FP_ts,TN_ts,FN_ts = TP_ts+TP,FP_ts+FP,TN_ts+TN,FN_ts+FN

            # calculate the batch loss
            loss = Criterion_Super(output,torch.argmax(labels,dim=1))
            val_loss += float(loss)
            probs = torch.sigmoid(output)
            score_val_pos += float(torch.mean(probs[labels==1]))
            score_val_neg += float(torch.mean(probs[labels==0]))
            loss_itm +=1
        print("test loss: {:.3f}  test pos {:.3f} test neg {:.3f}".format(val_loss/loss_itm,score_val_pos/loss_itm,score_val_neg/loss_itm))
        # same precision/recall label fix as in the training report
        print("test precision: {:.3f} recall: {:.3f} F1 {:.3f}: ".format(TP_ts/(TP_ts+FP_ts),TP_ts/(TP_ts+FN_ts),TP_ts/(TP_ts+0.5*FN_ts+0.5*FP_ts)))


In [None]:
# for i in range(20):
#     data1, data2 ,labels  = next(iter(training_generator_super))
#     data1, data2 ,labels = data1.cuda(), data2.cuda(),labels.cuda()
#     output = model_P(data1.transpose(1,2),data2.transpose(1,2))
#     outputs = torch.sigmoid(output)
#     print(outputs[labels==1],outputs.mean())

#
# calculate the batch loss
#loss = criterion(output,torch.diag(torch.ones(output.shape[0])))    


In [None]:
#TP,FP,TN,FN
# output = model_P(data1.transpose(1,2),data2.transpose(1,2))
# get_stats(labels,output,0)


In [None]:
# 11. Prediction:
pred = {'row_id': [], 'target': []}

# Scores are sigmoid outputs in (0, 1). With the previous threshold of 0
# every score passed and every row was predicted True; 0.5 corresponds to a
# logit of 0, the threshold used for the training metrics.
thr_score = 0.5
SEG_SECONDS = 5          # length of one scored segment
N_SEGMENTS = 12          # 1 min / 5 s
N_FFT = 800

# Uses the sample rate of the most recently loaded audio (module-level sr).
MelSpec = torchaudio.transforms.MelSpectrogram(sample_rate=sr,n_fft=N_FFT)

# Output column of every scored bird, resolved once instead of once per
# bird per segment.
bird_columns = [(bird, Scored_bird_index_dict_inv[code])
                for bird, code in zip(scored_birds, le.transform(scored_birds))]

model_P.eval()
# Inference only: no autograd graph needed.
with torch.no_grad():
    # Process audio files and make predictions
    for afile in file_list:

        # Complete file path
        path = test_audio_dir + afile + '.ogg'
        sig, sr = torchaudio.load(path)
        seg_len = SEG_SECONDS * sr

        for i in range(N_SEGMENTS):
            chunk_end_time = (i + 1) * SEG_SECONDS
            audio_seg = sig[0, i*seg_len:(i+1)*seg_len]   # first channel only

            if audio_seg.shape[0] < N_FFT:
                # Recording ends before this segment (or leaves too few
                # samples for one FFT window): nothing to score, so every
                # bird is reported absent instead of crashing.
                scores = None
            else:
                mel_seg = normRec(MelSpec(audio_seg.unsqueeze(0)))
                if train_on_gpu:
                    mel_seg = mel_seg.cuda()
                # The model takes two views; at inference both are the same segment.
                output = model_P(mel_seg.transpose(1,2),mel_seg.transpose(1,2))
                scores = torch.sigmoid(output[0]).cpu()

            for bird, column in bird_columns:
                # Assemble the row_id which we need to do for each scored bird
                row_id = afile + '_' + bird + '_' + str(chunk_end_time)

                pred['row_id'].append(row_id)
                pred['target'].append(bool(scores is not None and scores[column] > thr_score))

In [None]:
# 12. Submit
submission_columns = ['row_id', 'target']
results = pd.DataFrame(pred, columns = submission_columns)

# Show the first rows as a quick sanity check before writing the file.
print(results.head())

# Write the submission without the index column.
results.to_csv("submission.csv", index=False)

In [None]:
# Preview only the first few predictions; displaying the whole dict would
# print one entry per (file, bird, segment) row.
{key: values[:5] for key, values in pred.items()}