In [None]:
from pathlib import Path
import gc
import numpy as np
import pandas as pd
from keras.utils import set_random_seed
from utils import *

In [None]:
# One seed drives every source of randomness in this notebook.
random_seed = 13

# Dedicated NumPy generator for explicit sampling; it is seeded directly and
# therefore independent of the global seeding done below.
rng = np.random.default_rng(random_seed)

# keras.utils.set_random_seed seeds Python, NumPy and the backend framework
# in one call (per the Keras documentation).
set_random_seed(random_seed)

In [None]:
# --- Experiment configuration -------------------------------------------------
# NOTE: these are only the defaults for a fresh run; a params.npy already stored
# in the results directory of this dataset version takes precedence over them.

dataset_version = 3           # which dataset variant to build (selects input dirs and output dir)
balancing = True              # forwarded to create_dataset_annotations
bee_fraction_threshold = 1    # forwarded to create_dataset_annotations

# True only for dataset version 2.
# NOTE(review): the data-directory comments describe V2 as "initial files +
# NoisyBee" — confirm that excluding NoisyBee for V2 is the intended polarity.
exclude_noisybee = dataset_version == 2

sr = 22050                          # presumably the audio sample rate in Hz — TODO confirm
frame_length = sr                   # frame size, equal to sr
hop_length = frame_length // 5      # hop used for the training annotations
test_hop_length = frame_length // 2  # hop used for the validation and test annotations


In [None]:
# Every dataset version gets its own results directory. The parameters of the
# first run are cached there so that later runs keep using the same settings.
root_path = Path('results') / f'dataset_V{dataset_version}'
root_path.mkdir(exist_ok=True, parents=True)
params_path = root_path / 'params.npy'

if params_path.exists():
    # NOTE: allow_pickle=True unpickles arbitrary Python objects. Only load a
    # params file that was written by this notebook.
    params = np.load(params_path, allow_pickle=True).item()

    # The cached values replace the ones set in the configuration cell. Report
    # every difference so that an edited config is not ignored silently.
    for name, cached_value in params.items():
        configured_value = globals().get(name, cached_value)
        if configured_value != cached_value:
            print(f'{params_path}: using cached {name}={cached_value!r} '
                  f'(notebook value {configured_value!r} is ignored)')

    # Explicitly target the module namespace; locals().update() only works at
    # module level because locals() happens to be globals() there.
    globals().update(params)
else:
    params = {
        'dataset_version': dataset_version,
        'sr': sr,
        'exclude_noisybee': exclude_noisybee,
        'balancing': balancing,
        'frame_length': frame_length,
        'hop_length': hop_length,
        'bee_fraction_threshold': bee_fraction_threshold,
        'test_hop_length': test_hop_length
    }
    np.save(params_path, params, allow_pickle=True)

In [None]:
training_data_root_path = Path('all_training_data')

# Which sub-directories are used depends on the dataset version:
#   V1: only the initial files
#   V2: initial files + NoisyBee
#   V3: initial files + new files + new files 2
training_data_subdirs = ['training_data']
if dataset_version >= 3:
    # Only version 3 and later add the two newer recording batches.
    training_data_subdirs += ['training_data_new', 'training_data_new_2']

In [None]:
# Split the recording files into train, validation and test sets by bee
# duration ratio. With test_size=0.3 followed by test_size=0.2 on the remainder,
# the nominal shares are: train ~56%, validation ~14%, test ~30%
# (assuming test_size is the held-out fraction — confirm in utils).
# The split is cached so that every later run works on the same files.

datasets_path = root_path / 'datasets.npy'
if datasets_path.exists():
    # NOTE: allow_pickle=True unpickles arbitrary Python objects. Only load a
    # datasets file that was written by this notebook.
    datasets = np.load(datasets_path, allow_pickle=True).item()
    # Defines train_files, val_files and test_files in the module namespace.
    # globals() is used explicitly; locals().update() only works at module
    # level because locals() happens to be globals() there.
    globals().update(datasets)
else:
    # Sorted so that the split does not depend on the file system's listing order.
    rec_paths = sorted(
        wav_path
        for subdir in training_data_subdirs
        for wav_path in (training_data_root_path / subdir).glob('*.wav')
    )
    if not rec_paths:
        # Fail early instead of caching a split of nothing.
        raise FileNotFoundError(
            f'No .wav files found under {training_data_root_path} in {training_data_subdirs}'
        )

    # First hold out the test files, then carve the validation files out of the rest.
    train_files, test_files = train_test_split_by_bee(rec_paths, test_size=0.3, random_state=random_seed, exclude_noisy_bee=exclude_noisybee)
    train_files, val_files = train_test_split_by_bee(train_files, test_size=0.2, random_state=random_seed, exclude_noisy_bee=exclude_noisybee)
    datasets = {
        'train_files': train_files,
        'val_files': val_files,
        'test_files': test_files,
    }
    np.save(datasets_path, datasets, allow_pickle=True)

In [None]:
# Per-split annotation tables, cached as CSV files next to the other results.
train_annotations_path = root_path / 'train_annotations.csv'
val_annotations_path = root_path / 'val_annotations.csv'
test_annotations_path = root_path / 'test_annotations.csv'

# Arguments shared by all three splits; only the file list and the hop length differ.
annotation_kwargs = dict(
    sr=sr,
    frame_length=frame_length,
    balancing=balancing,
    bee_fraction_threshold=bee_fraction_threshold,
    random_seed=random_seed,
    exclude_noisybee=exclude_noisybee,
)

try:
    train_annotations = pd.read_csv(train_annotations_path)
    val_annotations = pd.read_csv(val_annotations_path)
    test_annotations = pd.read_csv(test_annotations_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Rebuild all three tables when any cached file is missing or empty.
    # Other errors (interrupts, permission problems, malformed CSV) are left to
    # propagate instead of silently triggering an expensive rebuild.
    train_annotations = create_dataset_annotations(train_files, hop_length=hop_length, **annotation_kwargs)
    # Validation and test use the coarser test_hop_length.
    val_annotations = create_dataset_annotations(val_files, hop_length=test_hop_length, **annotation_kwargs)
    test_annotations = create_dataset_annotations(test_files, hop_length=test_hop_length, **annotation_kwargs)
    train_annotations.to_csv(train_annotations_path, index=False)
    val_annotations.to_csv(val_annotations_path, index=False)
    test_annotations.to_csv(test_annotations_path, index=False)