In [1]:


!export OMP_NUM_THREADS=N

!export OMP_SCHEDULE=STATIC
!export OMP_PROC_BIND=CLOSE
!export GOMP_CPU_AFFINITY="N-M"

In [2]:
import numpy as np
import pandas as pd
import random
import glob
import os, shutil
from tqdm import tqdm
tqdm.pandas()
import copy
import joblib
from collections import defaultdict
import gc
import math
import cv2
import time
import sys
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm
# PyTorch 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.cuda.amp import autocast, GradScaler
import librosa
import librosa as lb
import torchaudio.transforms as T
import soundfile as sf
import albumentations as A
from  soundfile import SoundFile
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")
import concurrent.futures

In [3]:
class CFG:
    seed = 1
    
    # Audio duration, sample rate, and length
    duration = 5 # second
    sample_rate = 32000
    audio_len = duration*sample_rate
    
    # STFT parameters
    nfft = 768
    n_mels = 128
    fmin = 20
    fmax = 16000
#     model_name = "eca_nfnet_l0"
    model_name = "tf_efficientnetv2_b1"
    train_bs = 64
    valid_bs = train_bs * 4
    num_fold = 5
    epoch_warm_up = 0
    total_epoch = 100
    learning_rate = 4e-4
    weight_decay = 0.01
    thr_upsample = 50
    mix_up = 0.1
    hop_length = 256
    train_with_mixup=True
    num_channels = 1
    use_drop_path=True
    
    num_classes = 264
    device = 'cpu'
    target_columns = "abethr1 abhori1 abythr1 afbfly1 afdfly1 afecuc1 affeag1 afgfly1 afghor1 afmdov1 afpfly1 afpkin1 afpwag1 afrgos1 afrgrp1 afrjac1 afrthr1 amesun2 augbuz1 bagwea1 barswa bawhor2 bawman1 bcbeat1 beasun2 bkctch1 bkfruw1 blacra1 blacuc1 blakit1 blaplo1 blbpuf2 blcapa2 blfbus1 blhgon1 blhher1 blksaw1 blnmou1 blnwea1 bltapa1 bltbar1 bltori1 blwlap1 brcale1 brcsta1 brctch1 brcwea1 brican1 brobab1 broman1 brosun1 brrwhe3 brtcha1 brubru1 brwwar1 bswdov1 btweye2 bubwar2 butapa1 cabgre1 carcha1 carwoo1 categr ccbeat1 chespa1 chewea1 chibat1 chtapa3 chucis1 cibwar1 cohmar1 colsun2 combul2 combuz1 comsan crefra2 crheag1 crohor1 darbar1 darter3 didcuc1 dotbar1 dutdov1 easmog1 eaywag1 edcsun3 egygoo equaka1 eswdov1 eubeat1 fatrav1 fatwid1 fislov1 fotdro5 gabgos2 gargan gbesta1 gnbcam2 gnhsun1 gobbun1 gobsta5 gobwea1 golher1 grbcam1 grccra1 grecor greegr grewoo2 grwpyt1 gryapa1 grywrw1 gybfis1 gycwar3 gyhbus1 gyhkin1 gyhneg1 gyhspa1 gytbar1 hadibi1 hamerk1 hartur1 helgui hipbab1 hoopoe huncis1 hunsun2 joygre1 kerspa2 klacuc1 kvbsun1 laudov1 lawgol lesmaw1 lessts1 libeat1 litegr litswi1 litwea1 loceag1 lotcor1 lotlap1 luebus1 mabeat1 macshr1 malkin1 marsto1 marsun2 mcptit1 meypar1 moccha1 mouwag1 ndcsun2 nobfly1 norbro1 norcro1 norfis1 norpuf1 nubwoo1 pabspa1 palfly2 palpri1 piecro1 piekin1 pitwhy purgre2 pygbat1 quailf1 ratcis1 raybar1 rbsrob1 rebfir2 rebhor1 reboxp1 reccor reccuc1 reedov1 refbar2 refcro1 reftin1 refwar2 rehblu1 rehwea1 reisee2 rerswa1 rewsta1 rindov rocmar2 rostur1 ruegls1 rufcha2 sacibi2 sccsun2 scrcha1 scthon1 shesta1 sichor1 sincis1 slbgre1 slcbou1 sltnig1 sobfly1 somgre1 somtit4 soucit1 soufis1 spemou2 spepig1 spewea1 spfbar1 spfwea1 spmthr1 spwlap1 squher1 strher strsee1 stusta1 subbus1 supsta1 tacsun1 tafpri1 tamdov1 thrnig1 trobou1 varsun2 vibsta2 vilwea1 vimwea1 walsta1 wbgbir1 wbrcha2 wbswea1 wfbeat1 whbcan1 whbcou1 whbcro2 whbtit5 whbwea1 whbwhe3 whcpri2 whctur2 wheslf1 whhsaw1 whihel1 whrshr1 witswa1 wlwwar wookin1 woosan wtbeat1 yebapa1 yebbar1 yebduc1 yebere1 yebgre1 yebsto1 yeccan1 yefcan yelbis1 yenspu1 yertin1 yesbar1 yespet1 yetgre1 yewgre1".split()

#     # Class Labels for BirdCLEF 23
#     class_names = sorted(os.listdir('birdclef-2023/train_audio/'))
#     num_classes = len(class_names)
#     class_labels = list(range(num_classes))
#     label2name = dict(zip(class_labels, class_names))
#     name2label = {v:k for k,v in label2name.items()}



In [4]:
def compute_melspec(y, sr, n_mels, fmin, fmax):
    """
    Computes a mel-spectrogram and puts it at decibel scale
    Arguments:
        y {np array} -- signal
        params {AudioParams} -- Parameters to use for the spectrogram. Expected to have the attributes sr, n_mels, f_min, f_max
    Returns:
        np array -- Mel-spectrogram
    """
    melspec = lb.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels, fmin=fmin, fmax=fmax,n_fft = CFG.nfft
    )

    melspec = lb.power_to_db(melspec, ref=1.0).astype(np.float32)
    return melspec

def mono_to_color(X, eps=1e-6, mean=None, std=None):
    mean = mean or X.mean()
    std = std or X.std()
    X = (X - mean) / (std + eps)
    
    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        V = np.clip(X, _min, _max)
        V = 255 * (V - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        V = np.zeros_like(X, dtype=np.uint8)

    return V

def crop_or_pad(y, length, is_train=True, start=None):
    if len(y) < length:
        y = np.concatenate([y, np.zeros(length - len(y))])
        
        n_repeats = length // len(y)
        epsilon = length % len(y)
        
        y = np.concatenate([y]*n_repeats + [y[:epsilon]])
        
    elif len(y) > length:
        if not is_train:
            start = start or 0
        else:
            start = start or np.random.randint(len(y) - length)

        y = y[start:start + length]

    return y

mean = (0.485) # R only for RGB
std = (0.229) # R only for RGB
img_transforms = {
    'valid' : A.Compose([
            A.Normalize(mean, std),
    ], p=1.0),
}

In [5]:
def gem(x, p=3, eps=1e-6):
    return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1.0 / p)


class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        ret = gem(x, p=self.p, eps=self.eps)
        return ret

    def __repr__(self):
        return (
                self.__class__.__name__
                + "("
                + "p="
                + "{:.4f}".format(self.p.data.tolist()[0])
                + ", "
                + "eps="
                + str(self.eps)
                + ")"
        )
class TimmClassifier(nn.Module):
    def __init__(self, base_model_name, pretrained=True, num_classes=CFG.num_classes, in_channels=CFG.num_channels):
        super().__init__()

        self.bn0 = nn.BatchNorm2d(CFG.n_mels)
        if CFG.use_drop_path:
            self.base_model = timm.create_model(
                base_model_name, pretrained=pretrained, in_chans=in_channels, drop_rate=0.5, drop_path_rate = 0.2)
        else:
            self.base_model = timm.create_model(
                base_model_name, pretrained=pretrained, in_chans=in_channels)
        self.gem = GeM(p=3, eps=1e-6)
        if 'efficientnet' in base_model_name:
            in_features = self.base_model.classifier.in_features
        elif 'nfnet' in base_model_name:
            in_features = 2304
        self.head1 = nn.Linear(in_features, num_classes)
        

    def forward(self, x):
        x = self.base_model.forward_features(x)
        x = self.gem(x)
        x = x[:, :, 0, 0]
        logit = self.head1(x)

        output_dict = {
            'logit': logit,
        }

        return output_dict
    
class FineTuneTimmClassifier(nn.Module):
    def __init__(self, base_model_name = CFG.model_name, num_classes=CFG.num_classes):
        super().__init__()
        self.backbone = TimmClassifier(base_model_name=base_model_name, num_classes = 572, pretrained=False)
        if 'v2_b0' in base_model_name or 'v2_b1' in base_model_name or 'v2_s' in base_model_name:
            in_features = 1280
        elif 'v2_b2' in base_model_name:
            in_features = 1408
        elif 'nfnet' in base_model_name:
            in_features = 2304
        self.backbone.head1 = nn.Linear(in_features, num_classes)
    
    def forward(self, x):
        output_dict = self.backbone(x)
        return output_dict
model = FineTuneTimmClassifier().to(CFG.device)

checkpoint = torch.load("/kaggle/input/birdclef2023final/swa_tf_efficientnetv2_b1_fold_1_model_0.820418.pth",
                        map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'])  
# model1 = FineTuneTimmClassifier().to(CFG.device)

# checkpoint1 = torch.load("/kaggle/input/birdclef2023final/finetune_tf_efficientnetv2_b1_fold_4_model_epoch_176_0.810505.pth",
#                         map_location=torch.device('cpu'))
# model1.load_state_dict(checkpoint1['state_dict'])

model2 = FineTuneTimmClassifier(base_model_name = 'tf_efficientnetv2_b2').to(CFG.device)

checkpoint2 = torch.load("/kaggle/input/birdclef2023final/swa_tf_efficientnetv2_b2_fold_1_model_0.819913.pth",
                        map_location=torch.device('cpu'))
model2.load_state_dict(checkpoint2['state_dict'])

# model4 = FineTuneTimmClassifier(base_model_name = 'tf_efficientnetv2_b0').to(CFG.device)

# checkpoint4 = torch.load("/kaggle/input/birdclef2023final/finetune_tf_efficientnetv2_b0_fold_1_model_epoch_173_0.813764.pth",
#                         map_location=torch.device('cpu'))
# model4.load_state_dict(checkpoint4['state_dict'])

model3 = FineTuneTimmClassifier(base_model_name = 'tf_efficientnetv2_s').to(CFG.device)

checkpoint3 = torch.load("/kaggle/input/birdclef2023final/swa_tf_efficientnetv2_s_fold_1_model_0.816482.pth",
                        map_location=torch.device('cpu'))
model3.load_state_dict(checkpoint3['state_dict'])


model5 = FineTuneTimmClassifier(base_model_name = 'tf_efficientnetv2_b2')

checkpoint5 = torch.load("/kaggle/input/birdclef2023final/swa_tf_efficientnetv2_b2_fold_4_model_0.813245.pth",
                        map_location=torch.device('cpu'))
model5.load_state_dict(checkpoint5['state_dict'])

# model2 = TimmSED(base_model_name='tf_efficientnetv2_b1', pretrained=False)
# checkpoint2 = torch.load("/kaggle/input/birdclef2023/tf_efficientnetv2_b1_fold_1_model_epoch_68_0.8750.pth",map_location=torch.device('cpu'))

# model2.load_state_dict(checkpoint2['state_dict'])

<All keys matched successfully>

In [6]:
class TestDataset(Dataset):
    def __init__(self, 
                 df: pd.DataFrame, 
                 clip: np.ndarray
                ):
        
        self.df = df
        self.clip = clip
        

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):

        sample = self.df.loc[idx, :]
        row_id = sample.row_id

        end_seconds = int(sample.seconds)
        start_seconds = int(end_seconds - 5)
#         print(f"start: {start_seconds}")
        y = self.clip[CFG.sample_rate * start_seconds : CFG.sample_rate  * end_seconds].astype(np.float32)
#         print(f"Audio now: {idx}, {y}")
        waveform=torch.Tensor(y)
        torchaudio_melspec = T.MelSpectrogram(
            sample_rate=CFG.sample_rate,
            n_fft=CFG.nfft,
            win_length=None,
            hop_length=CFG.hop_length,
            center=True,
            pad_mode="constant",
            power=2.0,
            norm='slaney',
            mel_scale='slaney',
            n_mels=CFG.n_mels,
            f_min = CFG.fmin,
            f_max = CFG.fmax,
        )(waveform)
        torchaudio_melspec = T.AmplitudeToDB(stype="power",top_db=80.00)(torchaudio_melspec)
        image = mono_to_color(torchaudio_melspec.numpy())
        image = image.astype(np.uint8)
        image = img_transforms['valid'](image=image)['image']
#         print(f"Image: {idx}{image}")
        image = np.stack([image])
        image = torch.tensor(image).float()
            
        return {
            "image": image,
            "row_id": row_id,
        }
    
# class TestDatasetB1(Dataset):
#     def __init__(self, 
#                  df: pd.DataFrame, 
#                  clip: np.ndarray
#                 ):
        
#         self.df = df
#         self.clip = clip
        

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx: int):

#         sample = self.df.loc[idx, :]
#         row_id = sample.row_id

#         end_seconds = int(sample.seconds)
#         start_seconds = int(end_seconds - 5)
# #         print(f"start: {start_seconds}")
#         y = self.clip[CFG.sample_rate * start_seconds : CFG.sample_rate  * end_seconds].astype(np.float32)
# #         print(f"Audio now: {idx}, {y}")
#         waveform=torch.Tensor(y)
#         torchaudio_melspec = T.MelSpectrogram(
#             sample_rate=CFG.sample_rate,
#             n_fft=CFG.nfft,
#             win_length=None,
#             hop_length=CFG.hop_length,
#             center=True,
#             pad_mode="constant",
#             power=2.0,
#             norm='slaney',
#             mel_scale='slaney',
#             n_mels=CFG.n_mels,
#             f_min = CFG.fmin,
#             f_max = CFG.fmax,
# #             normalized=True,
#         )(waveform)
#         torchaudio_melspec = T.AmplitudeToDB(stype="power",top_db=80.00)(torchaudio_melspec)
#         image = mono_to_color(torchaudio_melspec.numpy())
#         image = image.astype(np.uint8)
# #         image = img_transforms['valid'](image=image)['image']
# #         print(f"Image: {idx}{image}")
#         image = np.stack([image])
#         image = torch.tensor(image).float()
            
#         return {
#             "image": image,
#             "row_id": row_id,
#         }

In [7]:
all_audios = list(Path("../input/birdclef-2023/test_soundscapes/").glob("*.ogg"))
print(all_audios)

[PosixPath('../input/birdclef-2023/test_soundscapes/soundscape_29201.ogg')]


In [8]:
def set_seed(seed = 42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    print('> SEEDING DONE')
set_seed(1)   
seconds = [i for i in range(5, 605, 5)]
models_ensemble = []
# os.environ["OMP_NUM_THREADS"]="2"
# os.environ["OMP_SCHEDULE"]="STATIC"
# os.environ["OMP_PROC_BIND"]="CLOSE"
# torch.set_num_threads(4)
model.eval()
# model1.eval()
model2.eval()
model3.eval()
# model4.eval()
model5.eval()
# models_ensemble.append(model)
models_ensemble.append(model)
# models_ensemble.append(model1)
models_ensemble.append(model2)
models_ensemble.append(model3)
# models_ensemble.append(model4)
models_ensemble.append(model5)
# models_ensemble.append(model4)

# models_ensemble.append(model2)
# models_ensemble.append(model3)
weights = np.array([0.35, 0.35, 0.15, 0.15])
def prediction_for_clip(
    audio_path
):
    
    device = torch.device("cpu")
    
    # inference
    prediction_dict = {}

    clip, _ = librosa.load(audio_path, sr=32000)
    name_ = "_".join(audio_path.name.split(".")[:-1])
    row_ids = [name_+f"_{second}" for second in seconds]

    test_df = pd.DataFrame({
        "row_id": row_ids,
        "seconds": seconds
    })
    
    for i in range(len(models_ensemble)):
#         model.eval()
#         if i==0 or i==1:
        dataset = TestDataset(
            df=test_df, 
            clip=clip,
        ) 
#         else:
       
#             dataset = TestDataset1(
#                 df=test_df, 
#                 clip=clip,
#             ) 
#         dataset = TestDatasetB1(
#             df=test_df, 
#             clip=clip,
#         ) 
        
        loader = DataLoader(
            dataset,
            batch_size=4, 
            num_workers=4,
            drop_last=False,
            shuffle=False,
            pin_memory=True
        )
        
        for data in loader:
            
            row_ids = data['row_id']
            
            for row_id in row_ids:
                if row_id not in prediction_dict:
                    prediction_dict[str(row_id)] = []
            
            image = data['image'].to(device)
                
            probas = []
            

            with torch.no_grad():
                with autocast(enabled=True):
                    output = models_ensemble[i](image)
#                 print(output['logit'])
                
#                     
            for row_id_idx, row_id in enumerate(row_ids):
                prediction_dict[str(row_id)].append(F.softmax(output['logit'][[row_id_idx]], dim=1).numpy().reshape(-1))
        gc.collect()                                                
    for row_id in list(prediction_dict.keys()):
#         print("Shape here",np.array(prediction_dict[row_id]).shape)
#         print("Here",prediction_dict[row_id])        
#         weighted_avg = np.average(np.array(prediction_dict[row_id])[:3], axis=0, weights=weights)
#         print("After weights avg: ", weighted_avg.shape)
#         print("Result: ", weighted_avg)
#         logits = np.average((weighted_avg, np.array(prediction_dict[row_id])[3]), axis = 0)
#         logits = np.array(prediction_dict[row_id]).mean(0)
        logits = np.average(np.array(prediction_dict[row_id]), axis=0, weights=weights)
#         print("Logits here: ", logits)
        prediction_dict[row_id] = {}
        for label in range(len(CFG.target_columns)):
            prediction_dict[row_id][CFG.target_columns[label]] = logits[label]

    return prediction_dict

> SEEDING DONE


In [9]:
# os.environ["OMP_NUM_THREADS"]="2"
# os.environ["OMP_SCHEDULE"]="STATIC"
# os.environ["OMP_PROC_BIND"]="CLOSE"
# torch.set_num_threads(4)

start = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    dicts = list(executor.map(prediction_for_clip, all_audios))
print(f"With concurrent ThreadPoolExecutor, time cost reduced to {time.time()-start} for processing 1 audios")

With concurrent ThreadPoolExecutor, time cost reduced to 61.79345464706421 for processing 1 audios


In [10]:
prediction_dicts = {}
for d in dicts:
    prediction_dicts.update(d)

In [11]:
submission = pd.DataFrame.from_dict(prediction_dicts, "index").rename_axis("row_id").reset_index()
submission.to_csv("submission.csv", index=False)

In [12]:
# submission.head()