In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import torchaudio
import soundfile as sf
import seaborn as sns

from pathlib import Path
import plotly.express as px
import matplotlib.pyplot as plt
from IPython.display import Audio

from birdnetlib import Recording
from birdnetlib.analyzer import Analyzer
from birdnetlib.batch import DirectoryMultiProcessingAnalyzer

In [12]:
# Root of the local BirdCLEF data. The literal is a raw string so the Windows
# backslashes stay literal: in a plain string '\d' and '\B' are invalid escape
# sequences that only work by accident and trigger warnings on newer Pythons.
train_dir = Path(r'E:\data\BirdCLEF')

class CFG:
    """Notebook-wide configuration: the random seed and every path derived from ``train_dir``."""

    random_seed = 42

    # Directories below the data root.
    ROOT_FOLDER = train_dir
    AUDIO_FOLDER = train_dir / 'train_audio'
    DATA_DIR = train_dir / 'spectros'
    TRAIN_CSV = train_dir / 'train_metadata.csv'
    RESULTS_DIR = train_dir / 'results'
    CKPT_DIR = RESULTS_DIR / 'ckpt'
    # NOTE(review): the attribute name looks like a typo for "bird2023"; it is
    # kept unchanged because other cells read ``CFG.bird20223``.
    bird20223 = train_dir / 'bird2023.csv'
    UNLABELED_FOLDER = train_dir / 'unlabeled_soundscapes'

    # Prediction tables and the eBird taxonomy table, all stored as CSV.
    bird_preds_csv = train_dir / 'bird_preds.csv'
    unlabeled_preds_csv = train_dir / 'unlabeled_preds.csv'
    taxonomy_csv = train_dir / 'eBird_Taxonomy_v2021.csv'

In [13]:
# Hand-picked species codes (presumably treated as secondary labels later — TODO confirm).
sec_labels = ['lotshr1', 'orhthr1', 'magrob', 'indwhe1', 'bltmun1', 'asfblu1']

sample_submission = pd.read_csv(train_dir / 'sample_submission.csv')

# The class labels are all submission columns except the first one.
CFG.LABELS = sample_submission.columns.tolist()[1:]
# Label -> integer position within CFG.LABELS.
bird2id = dict(zip(CFG.LABELS, range(len(CFG.LABELS))))

len(CFG.LABELS)

182

In [19]:
# Read every CSV referenced by CFG in one pass: training metadata, the 2023
# table, the two prediction tables and the taxonomy table.
meta_df, df_23, bird_preds_df, unlabeled_preds_df, taxonomy_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        CFG.TRAIN_CSV,
        CFG.bird20223,
        CFG.bird_preds_csv,
        CFG.unlabeled_preds_csv,
        CFG.taxonomy_csv,
    )
)

# Show the dimensions of the four auxiliary tables (meta_df is not included).
tuple(frame.shape for frame in (df_23, bird_preds_df, unlabeled_preds_df, taxonomy_df))

((16941, 12), (123023, 6), (27203, 5), (16753, 9))

In [16]:
# Preview the first two taxonomy rows to see which columns are available.
taxonomy_df.head(2)

Unnamed: 0,TAXON_ORDER,CATEGORY,SPECIES_CODE,PRIMARY_COM_NAME,SCI_NAME,ORDER1,FAMILY,SPECIES_GROUP,REPORT_AS
0,1,species,ostric2,Common Ostrich,Struthio camelus,Struthioniformes,Struthionidae (Ostriches),Ostriches,
1,6,species,ostric3,Somali Ostrich,Struthio molybdophanes,Struthioniformes,Struthionidae (Ostriches),,


In [33]:
# Distinct scientific names and species codes, each in order of first appearance.
all_birds = taxonomy_df.SCI_NAME.unique().tolist()
bird_codes = taxonomy_df.SPECIES_CODE.unique().tolist()

# Scientific name -> species code, taken from the two columns of the SAME row.
# Zipping the two independently de-duplicated lists above is only correct when
# neither column repeats a value; a single repeat would shift every following
# pair and silently attach wrong codes. When a name occurs more than once, the
# first row wins.
sci2code = (
    taxonomy_df
    .drop_duplicates(subset='SCI_NAME')
    .set_index('SCI_NAME')['SPECIES_CODE']
    .to_dict()
)


In [39]:
# Spot-check the lookup with one scientific name from the taxonomy preview.
sci2code['Struthio camelus']

'ostric2'

### BirdNET train predictions

In [7]:
# Show four random prediction rows (unseeded, so the rows differ on each run).
bird_preds_df.sample(4)

Unnamed: 0,filename,label,name,start,end,confidence
71133,XC564937.ogg,grywag,Motacilla cinerea,39.0,42.0,0.831604
52907,XC381631.ogg,eucdov,Streptopelia decaocto,12.0,15.0,0.98324
50351,XC489936.ogg,eaywag1,Motacilla flava,6.0,9.0,0.95397
43256,XC484737.ogg,comros,Carpodacus erythrinus,0.0,3.0,0.999471


In [49]:
# Translate each predicted scientific name into a species code. Names that are
# missing from the lookup become '' (Series.map yields NaN for unknown keys).
# Vectorized replacement for a row-wise DataFrame.apply with a Python lambda.
bird_preds_df['pred_code'] = bird_preds_df['name'].map(sci2code).fillna('')
# Flag rows as out-of-distribution when the predicted code is not one of the
# labels in bird2id; this includes the '' placeholder for unmatched names.
bird_preds_df['ood'] = ~bird_preds_df['pred_code'].isin(list(bird2id))

In [53]:
# Overall table size next to the size of the subset whose name found no species code.
bird_preds_df.shape, bird_preds_df.loc[bird_preds_df['pred_code'].eq('')].shape

((123023, 8), (49, 8))

In [52]:
# Size of the out-of-distribution subset; 'ood' is boolean, so it indexes directly.
bird_preds_df[bird_preds_df['ood']].shape

(8687, 8)

In [54]:
# Show two random rows to eyeball the new 'pred_code' and 'ood' columns.
bird_preds_df.sample(2)

Unnamed: 0,filename,label,name,start,end,confidence,pred_code,ood
35941,XC687397.ogg,comgre,Anthus pratensis,21.0,24.0,0.857173,meapip1,True
46044,XC571110.ogg,comsan,Actitis hypoleucos,9.0,12.0,0.970478,comsan,False


In [55]:
# The 15 labels with the most prediction rows (value_counts sorts descending).
bird_preds_df['label'].value_counts().head(15)

label
blrwar1    13281
hoopoe      6506
comros      4259
grewar3     4117
bkwsti      3751
eucdov      3539
barswa      3340
zitcis1     2713
comgre      2400
rorpar      2357
comsan      2335
woosan      2311
graher1     2231
grywag      2230
lirplo      2111
Name: count, dtype: int64

### Low-frequency classes

In [59]:
# The 10 labels with the fewest prediction rows.
bird_preds_df['label'].value_counts().tail(10)

label
nilfly2    18
junmyn1    14
blaeag1    10
wynlau1    10
wbbfly1     9
indtit1     8
brfowl1     6
bncwoo3     5
niwpig1     2
asiope1     1
Name: count, dtype: int64

In [58]:
# The 10 rarest primary labels among metadata rows whose secondary_labels field is '[]'.
meta_df.loc[meta_df['secondary_labels'].eq('[]'), 'primary_label'].value_counts().tail(10)

primary_label
bncwoo3    7
wbbfly1    6
blaeag1    6
darter2    6
paisto1    6
wynlau1    6
malwoo1    6
integr     5
asiope1    5
niwpig1    4
Name: count, dtype: int64