# RegNet Inference (submission generation)

In [1]:
import os
import sys
from pathlib import Path
import joblib
from joblib import Parallel, delayed
from tqdm import tqdm
import glob
import pandas as pd
import torch
import torch.nn as nn
import torchaudio
import numpy as np
from torchvision.models import regnet_y_800mf, RegNet_Y_800MF_Weights
import timm
import re
from torchaudio import functional as F_audio

In [2]:
## REUSE IN INFERENCE NOTEBOOK

# Make the shared `utils` module importable: use the Kaggle dataset directory
# when it contains utils.py, otherwise fall back to the parent directory.
custom_dataset_path = '/kaggle/input/birdclef2023-inference'
sys.path.append(
    custom_dataset_path
    if os.path.exists(os.path.join(custom_dataset_path, 'utils.py'))
    else '..'
)
import utils

IS_IN_KAGGLE_ENV = utils.get_is_in_kaggle_env()

# Input data and saved artifacts are read from different locations depending
# on whether the notebook runs on Kaggle or locally.
if IS_IN_KAGGLE_ENV:
    DATA_PATH = '/kaggle/input/birdclef-2023'
    JOBLIB_PATH = custom_dataset_path
else:
    DATA_PATH = '../data'
    JOBLIB_PATH = './'

# Device passed to the model loader.
DEVICE = 'cpu'

# Length of one analysis window (seconds) and the sample rate (Hz) that all
# audio is resampled to.
AUDIO_LENGTH_S = 5
SAMPLE_RATE = 32_000

We are running code on Localhost


In [3]:
## REUSE IN INFERENCE NOTEBOOK

class BirdMelspecClf(nn.Module):
    """RegNetY-800MF classifier adapted to two-channel input.

    The torchvision backbone's stem is swapped for one whose first convolution
    takes 2 input channels instead of 3, and its final fully connected layer is
    swapped for one that emits `out_features` scores.
    """

    def __init__(self, out_features, pretrained):
        """Build the network.

        Args:
            out_features: Size of the classifier output.
            pretrained: If truthy, initialise the backbone from
                `RegNet_Y_800MF_Weights.DEFAULT`; otherwise no pretrained
                weights are requested.
        """
        super().__init__()

        # https://pytorch.org/vision/stable/models.html
        backbone_weights = RegNet_Y_800MF_Weights.DEFAULT if pretrained else None
        self.regnet = regnet_y_800mf(weights=backbone_weights)

        # Original stem, for reference:
        #   RegnetCNN(
        #   (regnet): RegNet(
        #       (stem): SimpleStemIN(
        #       (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        #       (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        #       (2): ReLU(inplace=True)
        #   )
        # The replacement keeps that layout but reads 2 input channels.
        stem_conv = nn.Conv2d(2, 32, kernel_size=3, stride=2, padding=1, bias=False)
        stem_norm = nn.BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        stem_act = nn.ReLU(inplace=True)
        self.regnet.stem = nn.Sequential(stem_conv, stem_norm, stem_act)

        # Fine-tune the regnet classifier: new head sized to `out_features`.
        head_in_features = self.regnet.fc.in_features
        self.regnet.fc = nn.Linear(head_in_features, out_features)

        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return `(logits, probas)`, where probas is the softmax of logits over dim 1."""
        logits = self.regnet(x)
        return logits, self.softmax(logits)


def get_model(out_features, device, pretrained=False, load_state_dict=True, state_dict_starts_with=f"{AUDIO_LENGTH_S}s_regnetY800MF_"):
    """Build a `BirdMelspecClf` and optionally load the newest saved weights.

    Args:
        out_features: Size of the classifier output.
        device: Device the model is moved to and the checkpoint is mapped onto.
        pretrained: Forwarded to `BirdMelspecClf`.
        load_state_dict: If falsy, return the freshly built model without
            looking for a checkpoint.
        state_dict_starts_with: Filename prefix a checkpoint in `JOBLIB_PATH`
            must have (it must also end with '.pt').

    Returns:
        The model on `device`. If no matching checkpoint file exists, a message
        is printed and the model is returned without loaded weights.
    """
    model = BirdMelspecClf(out_features=out_features, pretrained=pretrained)
    print(f"Loaded model {model.__class__.__name__} with {sum(p.numel() for p in model.parameters())} parameters, pretained={pretrained}")
    model.to(device)

    if not load_state_dict:
        return model

    model_files = [f for f in os.listdir(JOBLIB_PATH) if f.startswith(state_dict_starts_with) and f.endswith('.pt')]
    if not model_files:
        print(f"No model starting with {state_dict_starts_with} found in {JOBLIB_PATH}")
        return model

    def _trailing_number(filename):
        # The last run of digits in the filename is treated as the timestamp;
        # names without any digits sort first.
        numbers = re.findall(r'\d+', filename)
        return int(numbers[-1]) if numbers else -1

    # Extract timestamp from the filenames and sort based on it
    model_files.sort(key=_trailing_number)

    # The latest model file is the last one in the sorted list
    latest_model_file = model_files[-1]
    model_path = os.path.join(JOBLIB_PATH, latest_model_file)
    # map_location makes a checkpoint saved from another device (e.g. a GPU)
    # loadable on the requested device instead of failing or landing on the
    # device it was saved from.
    model.load_state_dict(torch.load(model_path, map_location=device))
    print(f"Loaded model weights from {model_path}")
    model.to(device)

    return model


def get_label_encoder():
    """Load and return the object stored in `label_encoder.joblib` under `JOBLIB_PATH`."""
    encoder_file = os.path.join(JOBLIB_PATH, 'label_encoder.joblib')
    encoder = joblib.load(encoder_file)
    print(f"Loaded label encoder from {encoder_file}")
    return encoder

In [4]:
## REUSE IN INFERENCE NOTEBOOK

def resample(audio, current_sample_rate, desired_sample_rate=SAMPLE_RATE):
    """Return `audio` converted from `current_sample_rate` to `desired_sample_rate`.

    A new `torchaudio.transforms.Resample` is constructed on every call.
    """
    return torchaudio.transforms.Resample(
        orig_freq=current_sample_rate,
        new_freq=desired_sample_rate,
    )(audio)

def load_audio(audio_path, sample_rate=SAMPLE_RATE):
    """Load an audio file and return its waveform at `sample_rate`.

    The waveform is returned as loaded when the file already has the requested
    rate; otherwise it is passed through `resample`.
    """
    waveform, native_rate = torchaudio.load(audio_path)
    if native_rate == sample_rate:
        return waveform
    return resample(waveform, native_rate, sample_rate)

# n_fft and hop_length default to the librosa defaults
def get_melspec_transform(sample_rate=SAMPLE_RATE, n_fft=2048, hop_length=512, n_mels=128):
    """Return a `torchaudio.transforms.MelSpectrogram` configured with the given parameters."""
    melspec_kwargs = {
        "sample_rate": sample_rate,
        "n_fft": n_fft,
        "hop_length": hop_length,
        "n_mels": n_mels,
    }
    return torchaudio.transforms.MelSpectrogram(**melspec_kwargs)

# top_db defaults to the librosa default
def get_melspec_db_transform(stype='power', top_db=80):
    """Return a `torchaudio.transforms.AmplitudeToDB` configured with `stype` and `top_db`."""
    to_db = torchaudio.transforms.AmplitudeToDB(stype=stype, top_db=top_db)
    return to_db

# Copied from torchaudio/transforms/_transforms.py (to avoid converting to melspec twice).
# DCT matrix created once at cell execution with arguments (128, 128, "ortho").
dct_mat = F_audio.create_dct(128, 128, "ortho")

def get_mfcc_from_melspec(melspec):
    """Project an already-computed mel spectrogram through `dct_mat`.

    The last two axes of `melspec` are swapped, the result is multiplied by
    `dct_mat`, and the axes are swapped back. The second-to-last axis of
    `melspec` therefore has to have 128 entries to match `dct_mat`.
    """
    swapped = melspec.transpose(-1, -2)
    projected = torch.matmul(swapped, dct_mat)
    return projected.transpose(-1, -2)

def normalize_tensor(tensor):
    """Min-max scale `tensor` into [0, 1] using its global minimum and maximum.

    A tensor whose minimum equals its maximum is returned unchanged, which
    avoids dividing by zero.
    """
    lowest, highest = torch.min(tensor), torch.max(tensor)
    value_range = highest - lowest
    if value_range != 0:
        return (tensor - lowest) / value_range
    return tensor

In [5]:
# glob returns matches in arbitrary order; sorting makes the processing order
# (and therefore the row order of the submission) reproducible across runs.
filepaths = sorted(glob.glob(f"{DATA_PATH}/test_soundscapes/*.ogg"))
print(f"filepaths length: {len(filepaths)} (amount of test soundscapes)")

filepaths length: 1 (amount of test soundscapes)


# Inference

In [6]:
# debug: run files sequentially with verbose prints instead of in parallel.
# simulate_200_files: repeat the first soundscape 200 times to mimic the size
# of a real submission run.
debug = False
simulate_200_files = False

if simulate_200_files:
    filepaths = [filepaths[0]] * 200 # simulate submission
    print(f"filepaths length: {len(filepaths)} after simulation additions")

# The label encoder provides the class list, which also fixes the size of the
# model's output layer.
label_encoder = get_label_encoder()
model = get_model(
    out_features=len(label_encoder.classes_),
    device=DEVICE,
    pretrained=False,
    load_state_dict=True,
)
model.eval()

# Number of samples in one analysis window.
MIN_WINDOW = SAMPLE_RATE * AUDIO_LENGTH_S
melspec_transform = get_melspec_transform(n_mels=128)
melspec_db_transform = get_melspec_db_transform()

def infer(filepath):
    """Predict class probabilities for every full window of one audio file.

    The audio is cut into consecutive, non-overlapping windows of
    `MIN_WINDOW` samples; trailing samples that do not fill a whole window are
    ignored. Each window is turned into a normalized dB mel spectrogram plus a
    normalized DCT projection of it, stacked along dim 0 and fed to `model`.

    Args:
        filepath: Path of the audio file to process.

    Returns:
        A list with one dict per window: "row_id" is
        `<file stem>_<end time of the window in seconds>` and "predictions" is
        the probability tensor returned by the model (batch dimension of 1
        included).
    """
    all_predictions = []
    name = Path(filepath).stem
    audio = load_audio(filepath)
    # NOTE(review): the feature stacking below yields 2 channels only for
    # single-channel audio — confirm that all inputs are mono.
    n_samples = audio.shape[1]
    if debug:
        print(f"Infering file {filepath} with length {n_samples / SAMPLE_RATE} s")
    # Whole windows only; integer arithmetic on sample counts keeps this in
    # step with the slicing below for any AUDIO_LENGTH_S.
    n_crops = n_samples // MIN_WINDOW
    for i in range(n_crops):
        start = i * MIN_WINDOW
        crop = audio[:, start:start + MIN_WINDOW]
        if debug:
            print(f"Crop {i} / {n_crops}")
            # Report the number of samples (len(audio) would be the channel count).
            print(f"Audio length: {n_samples}")
            print(f"Crop dimensions: {crop.shape}")
        melspec = melspec_db_transform(melspec_transform(crop))
        norm_melspec = normalize_tensor(melspec)
        mfcc = get_mfcc_from_melspec(norm_melspec)
        norm_mfcc = normalize_tensor(mfcc)
        features = torch.cat((norm_melspec, norm_mfcc), dim=0)
        if debug:
            print(f"features shape: {features.shape}") # [2, 128, 313]
        features = features.unsqueeze(0) # add batch dimension (1)
        if debug:
            print(f"features unsqueezed shape: {features.shape}") # [1, 2, 128, 313]
        with torch.no_grad():
            _, proba = model(features)
        # End time of this window in seconds, derived from the same constant
        # that defines the window length.
        t = (i + 1) * AUDIO_LENGTH_S
        all_predictions.append({"row_id": f'{name}_{t}',"predictions": proba})
        if debug:
            print('---')
    return all_predictions

if not debug:
    # Normal mode: one joblib task per file, spread over all CPU cores.
    parallel_task = (delayed(infer)(filepath) for filepath in tqdm(filepaths, desc='Infering files'))
    all_preds = Parallel(n_jobs=os.cpu_count())(parallel_task)
else:
    # Debug mode: process the files one by one in this process so the debug
    # prints inside infer() appear in order.
    all_preds = []
    for filepath in tqdm(filepaths, desc='Infering files'):
        all_preds.append(infer(filepath))

# all_preds holds one list of predictions per file; flatten it to one list of
# per-window predictions.
all_preds_flat = [
    window_pred
    for file_preds in all_preds
    for window_pred in file_preds
]

print(f"all_preds length: {len(all_preds)}, all_preds_flat length: {len(all_preds_flat)}")

Loaded label encoder from ./label_encoder.joblib
Loaded model BirdMelspecClf with 5854464 parameters, pretained=False
Loaded model weights from ./5s_regnetY800MF_e60_valacc61_traacc73_1684594828060.pt


Infering files: 100%|██████████| 1/1 [00:00<00:00, 108.06it/s]


all_preds length: 1, all_preds_flat length: 120


In [7]:
# Spot-check: show the probability vector stored for the window at index 100
# ([0] drops the batch dimension). Requires at least 101 predicted windows.
all_preds_flat[100]['predictions'][0]

tensor([0.0018, 0.0075, 0.0016, 0.0033, 0.0029, 0.0004, 0.0624, 0.0014, 0.0026,
        0.0007, 0.0020, 0.0011, 0.0011, 0.0133, 0.0048, 0.0008, 0.0006, 0.0033,
        0.0028, 0.0009, 0.0026, 0.0018, 0.0006, 0.0014, 0.0008, 0.0016, 0.0023,
        0.0017, 0.0650, 0.0064, 0.0012, 0.0047, 0.0007, 0.0062, 0.0005, 0.0002,
        0.0026, 0.0007, 0.0040, 0.0005, 0.0004, 0.0020, 0.0011, 0.0031, 0.0013,
        0.0015, 0.0022, 0.0005, 0.0003, 0.0011, 0.0018, 0.0023, 0.0031, 0.0008,
        0.0026, 0.0024, 0.0133, 0.0012, 0.0033, 0.0011, 0.0010, 0.0012, 0.0008,
        0.0043, 0.0003, 0.0004, 0.0163, 0.0009, 0.0004, 0.0045, 0.0009, 0.0055,
        0.0030, 0.0080, 0.0021, 0.0006, 0.0060, 0.0030, 0.0010, 0.0004, 0.0014,
        0.0012, 0.0008, 0.0009, 0.0012, 0.0012, 0.0024, 0.0645, 0.0039, 0.0012,
        0.0005, 0.0022, 0.0010, 0.0028, 0.0020, 0.0021, 0.0005, 0.0048, 0.0016,
        0.0014, 0.0005, 0.0013, 0.0004, 0.0103, 0.0074, 0.0008, 0.0011, 0.0013,
        0.0179, 0.0019, 0.0018, 0.0008, 

In [8]:
# Left part of the submission: one row id per predicted window.
submission_row_ids = pd.DataFrame({'row_id': [pred['row_id'] for pred in all_preds_flat]})

# Right part: one probability column per class, in label-encoder class order.
# [0] drops the batch dimension of each stored prediction before stacking.
submission_probas = pd.DataFrame(
    torch.stack([pred['predictions'][0] for pred in all_preds_flat]).numpy(),
    columns=label_encoder.classes_,
)

df = pd.concat([submission_row_ids, submission_probas], axis=1)

df

Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0.011397,0.004769,0.008008,0.000223,0.000958,0.003737,0.001577,0.000470,0.026544,...,0.000419,0.000177,0.000639,0.000304,0.000485,0.000578,0.000195,0.000456,0.000427,0.000440
1,soundscape_29201_10,0.003133,0.002866,0.002608,0.006139,0.002921,0.001124,0.003597,0.002656,0.002320,...,0.001047,0.003776,0.006944,0.003522,0.001577,0.002360,0.002141,0.001568,0.001389,0.001485
2,soundscape_29201_15,0.006521,0.004927,0.005424,0.004603,0.001327,0.004686,0.002124,0.001542,0.005893,...,0.000915,0.004804,0.010225,0.001352,0.000865,0.001025,0.000559,0.002269,0.000655,0.001292
3,soundscape_29201_20,0.000912,0.000782,0.002464,0.009869,0.001237,0.000500,0.000831,0.001633,0.006215,...,0.000571,0.013774,0.004372,0.001278,0.000450,0.000597,0.000920,0.000993,0.000736,0.000811
4,soundscape_29201_25,0.000617,0.000829,0.001992,0.004701,0.001217,0.000133,0.001134,0.002584,0.003391,...,0.000359,0.008839,0.003036,0.001204,0.000282,0.001065,0.000618,0.001979,0.000324,0.000377
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,soundscape_29201_580,0.000247,0.000680,0.000343,0.006701,0.000491,0.000048,0.001225,0.000632,0.001205,...,0.000074,0.012279,0.001680,0.000269,0.000196,0.000549,0.000200,0.000589,0.000071,0.000212
116,soundscape_29201_585,0.004495,0.004055,0.000725,0.000462,0.001091,0.002042,0.074998,0.000240,0.001703,...,0.000707,0.000479,0.000356,0.000126,0.001034,0.008404,0.002985,0.000111,0.000303,0.000361
117,soundscape_29201_590,0.002068,0.002330,0.001470,0.007751,0.004330,0.000435,0.014511,0.003275,0.002927,...,0.000575,0.008562,0.008585,0.001423,0.000733,0.003321,0.004059,0.001229,0.000651,0.000508
118,soundscape_29201_595,0.000113,0.000308,0.000205,0.002616,0.000271,0.000017,0.000400,0.000721,0.001438,...,0.000038,0.005889,0.001188,0.000261,0.000088,0.000426,0.000200,0.000355,0.000042,0.000059


In [9]:
# Write the submission table to submission.csv in the working directory,
# leaving out the DataFrame index.
df.to_csv('submission.csv', index=False)

In [10]:
# Confirm that the submission file exists in the working directory.
!ls submission.csv

submission.csv
