In [49]:
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

class CFG:
    seed = 42
    debug = False
    print_freq = 100
    num_workers = 1

    # Set your project root directory
    PROJECT_ROOT_DIR = Path.cwd().parents[2]  # Adjust as needed
    
    OUTPUT_DIR = PROJECT_ROOT_DIR.joinpath('data/working/')
    SPRCTROGRAMS_DIR = OUTPUT_DIR.joinpath('birdclef25-mel-spectrograms/')
    
    train_datadir = PROJECT_ROOT_DIR.joinpath('data/raw/train_audio')
    train_csv = PROJECT_ROOT_DIR.joinpath('data/raw/train.csv')
    taxonomy_csv = PROJECT_ROOT_DIR.joinpath('data/raw/taxonomy.csv')

    spectrogram_npy = SPRCTROGRAMS_DIR.joinpath('birdclef2025_melspec_5sec_256_256.npy')
    working_df = SPRCTROGRAMS_DIR.joinpath('working_df.csv')
    
cfg = CFG()

In [50]:
# ── Load and encode labels only once ────────────────────────────────────────────
labels = pd.read_csv(cfg.working_df)        # has primary_label & class

# Integer-encode the 'class' column. Note that, despite its name, the
# 'y_species_encoded' column holds one code per value of 'class'.
le = LabelEncoder()
labels['y_species_encoded'] = le.fit_transform(labels['class'])
class_names = le.classes_                   # class labels, ordered by their integer code

# ── Create a split column so every size uses the SAME split ────────────────────
# Both calls take their seed from cfg.seed so the configured seed actually
# controls the split (it was previously hard-coded here).
train_idx, test_idx = train_test_split(
    labels.index,
    test_size=0.20,
    random_state=cfg.seed,
    stratify=labels['y_species_encoded'],
)

train_idx, val_idx = train_test_split(
    train_idx,
    test_size=0.25,      # 0.25 of 0.80 ⇒ 0.20 of total ⇒ 60/20/20 split
    random_state=cfg.seed,
    stratify=labels.loc[train_idx, 'y_species_encoded'],
)

# Default every row to 'train', then overwrite the held-out rows.
labels['split'] = 'train'
labels.loc[val_idx,  'split'] = 'val'
labels.loc[test_idx, 'split'] = 'test'

# Persist the table with its split assignment. The target directory is created
# first because to_csv does not create missing parent directories.
split_csv_path = cfg.PROJECT_ROOT_DIR.joinpath('configs/work_df_w_split_info.csv')
split_csv_path.parent.mkdir(parents=True, exist_ok=True)
labels.to_csv(split_csv_path, index=False)
labels

Unnamed: 0,primary_label,rating,filename,target,filepath,samplename,class,y_species_encoded,split
0,1139490,0.0,1139490/CSA36385.ogg,0,/pub/ddlin/projects/mids/DATASCI207_Bird_Sound...,1139490-CSA36385,Insecta,2,test
1,1139490,0.0,1139490/CSA36389.ogg,0,/pub/ddlin/projects/mids/DATASCI207_Bird_Sound...,1139490-CSA36389,Insecta,2,train
2,1192948,0.0,1192948/CSA36358.ogg,1,/pub/ddlin/projects/mids/DATASCI207_Bird_Sound...,1192948-CSA36358,Insecta,2,train
3,1192948,0.0,1192948/CSA36366.ogg,1,/pub/ddlin/projects/mids/DATASCI207_Bird_Sound...,1192948-CSA36366,Insecta,2,train
4,1192948,0.0,1192948/CSA36373.ogg,1,/pub/ddlin/projects/mids/DATASCI207_Bird_Sound...,1192948-CSA36373,Insecta,2,val
...,...,...,...,...,...,...,...,...,...
28559,ywcpar,0.0,ywcpar/iNat77392.ogg,205,/pub/ddlin/projects/mids/DATASCI207_Bird_Sound...,ywcpar-iNat77392,Aves,1,val
28560,ywcpar,0.0,ywcpar/iNat78624.ogg,205,/pub/ddlin/projects/mids/DATASCI207_Bird_Sound...,ywcpar-iNat78624,Aves,1,train
28561,ywcpar,0.0,ywcpar/iNat789234.ogg,205,/pub/ddlin/projects/mids/DATASCI207_Bird_Sound...,ywcpar-iNat789234,Aves,1,train
28562,ywcpar,0.0,ywcpar/iNat819873.ogg,205,/pub/ddlin/projects/mids/DATASCI207_Bird_Sound...,ywcpar-iNat819873,Aves,1,train


In [51]:
# Sanity check for distribution: number of rows per (class, split) pair.
# groupby sorts its keys by default, so the rows come out ordered by class and
# then split; reset_index turns the group sizes into a 'count' column with a
# fresh 0..n-1 row index.
count_df = (
    labels
    .groupby(['class', 'split'])
    .size()
    .reset_index(name='count')
)

count_df

Unnamed: 0,class,split,count
0,Amphibia,test,117
1,Amphibia,train,350
2,Amphibia,val,116
3,Aves,test,5530
4,Aves,train,16588
5,Aves,val,5530
6,Insecta,test,31
7,Insecta,train,93
8,Insecta,val,31
9,Mammalia,test,35


In [55]:
# What is the percentage of each class in the train set
# Filter the per-(class, split) counts down to the train rows once and reuse them.
train_rows = count_df.loc[count_df['split'] == 'train', ['class', 'count']]

train_counts = train_rows['count'].values
total_train = train_counts.sum()
class_percentages = (train_counts / total_train) * 100

# One row per class, largest share first.
class_percentages_df = (
    pd.DataFrame({
        'class': train_rows['class'].values,
        'percentage': class_percentages,
    })
    .sort_values(by='percentage', ascending=False)
    .reset_index(drop=True)
)

class_percentages_df

Unnamed: 0,class,percentage
0,Aves,96.790757
1,Amphibia,2.042245
2,Mammalia,0.624344
3,Insecta,0.542654
