In [None]:
# Install timm from a local copy of its source tree rather than from PyPI
# (presumably because the notebook runs without internet access -- TODO confirm).
#!pip install timm
import os
#os.chdir('/kaggle/input/timmmaster/')
# Copy the attached timm source dataset into the working directory, then
# pip-install the package from that local path.
!cp -r ../input/timm-pytorch-image-models /kaggle/working/
!pip install /kaggle/working/timm-pytorch-image-models/pytorch-image-models-master/

In [None]:
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

from pathlib import Path
# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Audio 
import torchaudio
from torchaudio.transforms import MelSpectrogram, Resample,AmplitudeToDB

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# For Image Models
import timm

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

Testing Configuration

In [None]:
class CONFIG:
    """Static settings shared by the model, the dataset and the prediction loop."""
    # Output width of the classifier head; equals len(target_columns) below.
    num_class = 152
    # First CUDA device when one is available, otherwise the CPU.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # timm architecture name handed to timm.create_model.
    model_name = 'tf_efficientnet_b0_ns'
    # Width of the linear embedding layer that sits between pooling and the head.
    embedding_size = 768
    # Audio-specific settings
    sample_rate = 32000  # samples per second
    max_time = 5         # window length in seconds (multiplied by the sample rate in test_dataset)
    n_mels = 224         # number of mel bins passed to MelSpectrogram
    n_fft = 1024         # FFT size passed to MelSpectrogram
    # Not referenced anywhere in the visible notebook code -- presumably a
    # leftover from training; TODO confirm before removing.
    period = 30
    
    # Class labels. The position of a label in this list is its class index
    # (see bird2id / id2bird), so the order must not be changed.
    target_columns = [
        "afrsil1",
        "akekee",
        "akepa1",
        "akiapo",
        "akikik",
        "amewig",
        "aniani",
        "apapan",
        "arcter",
        "barpet",
        "bcnher",
        "belkin1",
        "bkbplo",
        "bknsti",
        "bkwpet",
        "blkfra",
        "blknod",
        "bongul",
        "brant",
        "brnboo",
        "brnnod",
        "brnowl",
        "brtcur",
        "bubsan",
        "buffle",
        "bulpet",
        "burpar",
        "buwtea",
        "cacgoo1",
        "calqua",
        "cangoo",
        "canvas",
        "caster1",
        "categr",
        "chbsan",
        "chemun",
        "chukar",
        "cintea",
        "comgal1",
        "commyn",
        "compea",
        "comsan",
        "comwax",
        "coopet",
        "crehon",
        "dunlin",
        "elepai",
        "ercfra",
        "eurwig",
        "fragul",
        "gadwal",
        "gamqua",
        "glwgul",
        "gnwtea",
        "golphe",
        "grbher3",
        "grefri",
        "gresca",
        "gryfra",
        "gwfgoo",
        "hawama",
        "hawcoo",
        "hawcre",
        "hawgoo",
        "hawhaw",
        "hawpet1",
        "hoomer",
        "houfin",
        "houspa",
        "hudgod",
        "iiwi",
        "incter1",
        "jabwar",
        "japqua",
        "kalphe",
        "kauama",
        "laugul",
        "layalb",
        "lcspet",
        "leasan",
        "leater1",
        "lessca",
        "lesyel",
        "lobdow",
        "lotjae",
        "madpet",
        "magpet1",
        "mallar3",
        "masboo",
        "mauala",
        "maupar",
        "merlin",
        "mitpar",
        "moudov",
        "norcar",
        "norhar2",
        "normoc",
        "norpin",
        "norsho",
        "nutman",
        "oahama",
        "omao",
        "osprey",
        "pagplo",
        "palila",
        "parjae",
        "pecsan",
        "peflov",
        "perfal",
        "pibgre",
        "pomjae",
        "puaioh",
        "reccar",
        "redava",
        "redjun",
        "redpha1",
        "refboo",
        "rempar",
        "rettro",
        "ribgul",
        "rinduc",
        "rinphe",
        "rocpig",
        "rorpar",
        "rudtur",
        "ruff",
        "saffin",
        "sander",
        "semplo",
        "sheowl",
        "shtsan",
        "skylar",
        "snogoo",
        "sooshe",
        "sooter1",
        "sopsku1",
        "sora",
        "spodov",
        "sposan",
        "towsol",
        "wantat1",
        "warwhe1",
        "wesmea",
        "wessan",
        "wetshe",
        "whfibi",
        "whiter",
        "whttro",
        "wiltur",
        "yebcar",
        "yefcan",
        "zebdov",
        ]
    # Label -> class index and class index -> label lookups built from the list order.
    bird2id = {b:i for i,b in enumerate(target_columns)}
    id2bird = {i:b for i,b in enumerate(target_columns)}
    # Subset of labels for which the prediction loop emits one row per window.
    scored_birds = ["akiapo", "aniani", "apapan", "barpet", "crehon", "elepai", "ercfra", "hawama", "hawcre", "hawgoo", "hawhaw", "hawpet1", "houfin", "iiwi", "jabwar", "maupar", "omao", "puaioh", "skylar", "warwhe1", "yefcan"]

In [None]:
#model used for prediction

#GeM pooling
class GeM(nn.Module):
    """Generalized-mean (GeM) pooling over the last two dimensions.

    Each value is clamped to at least ``eps``, raised to the power ``p``,
    averaged over the full spatial extent, and the mean is raised to ``1/p``.
    ``p`` is a learnable one-element parameter initialised to the given value.
    """

    def __init__(self, p=3, eps=1e-6):
        super().__init__()
        # Registered under the name ``p`` so it is saved/loaded with the state dict.
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        """Pool ``x`` with the module's current exponent and epsilon."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Return the generalized mean of ``x`` over its last two dimensions."""
        # The kernel spans the whole map, so every channel collapses to 1x1.
        full_extent = (x.size(-2), x.size(-1))
        powered = x.clamp(min=eps).pow(p)
        return F.avg_pool2d(powered, full_extent).pow(1. / p)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return f"{self.__class__.__name__}(p={exponent:.4f}, eps={self.eps!s})"
#model
class BirdCLEFModel(nn.Module):
    """timm backbone followed by GeM pooling, a linear embedding and a linear head.

    The backbone's own classifier and global pooling are replaced with
    ``nn.Identity`` so that pooling is done by ``GeM`` instead. The head has
    ``CONFIG.num_class`` outputs and returns raw scores (no activation).
    """

    def __init__(self, model_name, embedding_size, pretrained=False):
        super().__init__()
        backbone = timm.create_model(model_name, pretrained=pretrained)
        # Read the classifier's input width before the classifier is removed.
        feature_dim = backbone.classifier.in_features
        backbone.classifier = nn.Identity()
        backbone.global_pool = nn.Identity()
        self.model = backbone
        self.pooling = GeM()
        self.embedding = nn.Linear(feature_dim, embedding_size)
        self.fc = nn.Linear(embedding_size, CONFIG.num_class)

    def forward(self, images):
        """Map a batch of images to per-class scores."""
        # assumes the stripped backbone returns a (batch, channels, H, W) map -- TODO confirm
        feature_map = self.model(images)
        pooled = self.pooling(feature_map).flatten(1)
        return self.fc(self.embedding(pooled))
    
# Build the network without pretrained weights (``pretrained`` defaults to False).
model = BirdCLEFModel(CONFIG.model_name,CONFIG.embedding_size)
# Move the parameters to the configured device; as the cell's last expression
# this also displays the module structure.
model.to(CONFIG.device)

In [None]:
#create dataset for test
class test_dataset(Dataset):
    """Serve fixed-length mel-spectrogram windows cut from a single audio clip.

    Args:
        df: DataFrame with one row per window; must provide the columns
            ``row_id`` and ``seconds`` (the window's end time in seconds).
        clip: waveform tensor; it is averaged over axis 0 to obtain a mono
            signal (assumes a (channels, samples) layout -- TODO confirm).
        target_sample_rate: sample rate the clip is assumed to have already;
            no resampling happens here.

    Each item is ``(image, row_id, end)`` where ``image`` is the mel
    spectrogram stacked three times along a new leading axis and scaled by
    its maximum absolute value.
    """

    def __init__(self,df,clip,target_sample_rate = 32000):
        self.df = df
        self.clip = torch.mean(clip,axis = 0)
        self.SR = target_sample_rate
        self.num_samples = CONFIG.max_time*self.SR
        # Built once and reused: the transform depends only on fixed settings,
        # so re-creating it for every item was wasted work.
        self.mel_spectrogram = MelSpectrogram(sample_rate=self.SR,
                                              n_mels = CONFIG.n_mels,
                                              n_fft = CONFIG.n_fft)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx: int):
        # Positional lookup, so the frame's index labels do not matter.
        row = self.df.iloc[idx]
        row_id = row.row_id

        end = int(row.seconds)
        end_index = int(self.SR*end)
        # The window is exactly num_samples long and ends at `end`. Clamping at
        # zero stops a negative start from wrapping around to the clip's tail.
        start_index = max(end_index - self.num_samples, 0)

        sample = self.clip[start_index:end_index]

        if sample.shape[0] > self.num_samples:
            sample = self.crop_audio(sample)
        if sample.shape[0] < self.num_samples:
            sample = self.pad_audio(sample)

        sample = torch.nan_to_num(sample)
        mel = self.mel_spectrogram(sample)
        image = torch.stack([mel,mel,mel])
        max_val = torch.abs(image).max()
        # A silent or fully padded window has max_val == 0; dividing would turn
        # the whole image into NaN, so such windows are left as all zeros.
        if max_val > 0:
            image = image / max_val
        return image,row_id,end

    def pad_audio(self, audio):
        """Right-pad ``audio`` with zeros up to ``num_samples``."""
        pad_length = self.num_samples - audio.shape[0]
        # (left, right) padding of the last dimension: zeros are appended only.
        last_dim_padding = (0, pad_length)
        return F.pad(audio, last_dim_padding)

    def crop_audio(self, audio):
        """Keep only the first ``num_samples`` samples."""
        return audio[:self.num_samples]

# 
def prediction_for_clip(test_df,clip,model,threshold = 0.05):
    """Score every window of one clip and threshold the scored birds.

    Args:
        test_df: DataFrame with ``row_id`` and ``seconds`` columns, one row
            per window.
        clip: waveform tensor of the whole recording.
        model: network returning one raw score per class; it is used as is
            (this function does not switch it to eval mode).
        threshold: sigmoid probability at or above which a bird counts as
            present. Defaults to the previously hard-coded 0.05.

    Returns:
        dict with two parallel lists: ``row_id`` holds
        ``"<row_id>_<bird>_<seconds>"`` strings and ``target`` holds Python
        bools, one entry per (window, scored bird) pair.
    """
    dataset = test_dataset(df = test_df,clip = clip)
    loader = DataLoader(dataset,batch_size = 1,shuffle = False)

    # Run on whatever device the model already lives on; fall back to the
    # original CUDA-if-available choice for a model without parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Resolve the class index of each scored bird once, outside the loop.
    scored = [(bird, int(CONFIG.bird2id[bird])) for bird in CONFIG.scored_birds]

    prediction_dict = {'row_id':[],'target':[]}
    # No gradients are needed for inference; skipping them saves memory.
    with torch.no_grad():
        for image, row_id, seconds in tqdm(loader):
            outputs = model(image.to(device))
            # batch_size is 1: take the single row and move it to the CPU once
            # instead of synchronising with the GPU for every bird.
            probs = torch.sigmoid(outputs)[0].cpu()
            clip_id = row_id[0]
            end_second = seconds.item()
            for bird, class_idx in scored:
                prediction_dict['row_id'].append(clip_id + '_' + bird + '_' + str(end_second))
                prediction_dict['target'].append(bool(probs[class_idx] >= threshold))

    return prediction_dict

Prediction part

In [None]:
from torch.nn.modules.batchnorm import _BatchNorm

def prepare_model_for_inference(model,path):
    """Load a saved state dict into ``model`` and switch it to eval mode.

    Args:
        model: module whose parameter names match the checkpoint.
        path: location of a file written with ``torch.save(state_dict, ...)``.

    Returns:
        The same ``model`` object, with weights loaded and ``eval()`` applied.
    """
    # Always deserialize onto the CPU. load_state_dict copies the values into
    # the model's existing parameters, so the model stays on its current
    # device, and the load no longer depends on which GPU index the
    # checkpoint happened to be saved from.
    # NOTE(security): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    ckpt = torch.load(path,map_location = 'cpu')
    model.load_state_dict(ckpt)
    model.eval()

    return model

def prediction(test_audios,model,threshold = 0.05, threshold_long = None):
    """Run window-level prediction over a collection of audio files.

    Args:
        test_audios: iterable of audio file paths (``Path`` or ``str``).
        model: network with its weights already loaded.
        threshold: accepted for interface compatibility; it is NOT forwarded,
            the cut-off actually applied is the one in ``prediction_for_clip``.
        threshold_long: accepted for interface compatibility; unused.

    Returns:
        dict with parallel ``row_id`` and ``target`` lists, concatenated over
        all files in iteration order. Empty lists when no files are given.
    """
    # End-of-window timestamps in seconds: 5, 10, ..., 60.
    window_ends = list(range(5,65,5))

    prediction_dicts = {'row_id':[],'target':[]}
    for audio_path in test_audios:
        audio_path = Path(audio_path)
        clip, clip_sr = torchaudio.load(audio_path)
        # Windows are cut by sample index at CONFIG.sample_rate, so a file
        # recorded at another rate must be resampled first or the windows
        # would cover the wrong time spans.
        if clip_sr != CONFIG.sample_rate:
            clip = Resample(orig_freq=clip_sr, new_freq=CONFIG.sample_rate)(clip)

        # File name without its extension; identical for every window of the file.
        row_id = audio_path.stem

        test_df = pd.DataFrame(
            {
                "row_id":[row_id] * len(window_ends),
                "seconds":window_ends
            }
        )
        prediction_dict = prediction_for_clip(test_df,clip,model)
        prediction_dicts['row_id'].extend(prediction_dict['row_id'])
        prediction_dicts['target'].extend(prediction_dict['target'])

    return prediction_dicts

In [None]:
# Release cached GPU memory before inference starts.
torch.cuda.empty_cache()
test_audio_dir = Path('../input/birdclef-2022/test_soundscapes/')
# Sorted so the files (and therefore the submission rows) come out in the same
# order on every run; the raw listing order depends on the filesystem.
all_audios = sorted(test_audio_dir.glob("*.ogg"))

# Load the trained weights into the model built earlier and switch to eval mode.
model = prepare_model_for_inference(model,'../input/trained-model1/F10.4293_epoch120.bin')
pred = prediction(all_audios,model)
result = pd.DataFrame(pred,columns = ['row_id','target'])
print(result.head())
result.to_csv("submission.csv",index = False)
torch.cuda.empty_cache()
