In [None]:
from pathlib import Path
import gc
import numpy as np
import pandas as pd
from utils import *

In [None]:
import tensorflow as tf
# Switch Keras to the 'mixed_float16' global dtype policy before any model is built.
tf.keras.mixed_precision.set_global_policy('mixed_float16')
from keras.utils import set_random_seed
from keras.callbacks import EarlyStopping
from keras.optimizers.optimizer_v2.adadelta import Adadelta
from keras.losses import mean_squared_logarithmic_error

In [None]:
# One seed shared by Keras (set_random_seed), the NumPy generator below, and the
# dataset-building / tf.data shuffling calls further down in the notebook.
random_seed = 13
set_random_seed(random_seed)
rng = np.random.default_rng(random_seed)

In [None]:
# Load the stored configuration and file lists of the selected dataset version
# and publish their entries as notebook-level variables.
dataset_version = 3
root_path = Path('results') / f'dataset_V{dataset_version}'

# SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects. Only load
# .npy files that were produced by this project; never point this at untrusted data.
params_path = root_path / 'params.npy'
params = np.load(params_path, allow_pickle=True).item()
# Every key of the stored dict becomes a global variable. globals() is used
# (instead of locals()) because it is the documented, writable module namespace;
# at notebook top level both refer to the same dict.
globals().update(params)

datasets_path = root_path / 'datasets.npy'
datasets = np.load(datasets_path, allow_pickle=True).item()
globals().update(datasets)

# Fail fast, with a readable message, if a name the rest of the notebook relies
# on was provided neither by the stored dicts nor by the imports above.
_required_names = (
    'sr', 'frame_length', 'hop_length', 'test_hop_length',
    'balancing', 'bee_fraction_threshold', 'exclude_noisybee', 'train_files',
)
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        f'Expected variables not found after loading {params_path} and {datasets_path}: {_missing_names}'
    )

# Keep the complete file list: `train_files` is re-assigned per run later on.
all_train_files = train_files.copy()

# Annotation tables stored next to params.npy.
train_all_annotations = pd.read_csv(params_path.with_name('train_annotations.csv'))
val_annotations = pd.read_csv(params_path.with_name('val_annotations.csv'))
test_annotations = pd.read_csv(params_path.with_name('test_annotations.csv'))

In [None]:
# Frame and hop sizes divided by `sr` (presumably samples / sample rate in Hz,
# giving seconds — confirm against the stored params).
frame_length_seconds = frame_length / sr
hop_length_seconds = hop_length / sr

# Hyper-parameters consumed by the tf.data batching and model.fit calls below.
batch_size = 100
n_epochs = 100

In [None]:
# Root directory for everything this data-size-impact experiment writes or reads.
save_path_root = root_path.joinpath('data_size_impact')
save_path_root.mkdir(exist_ok=True, parents=True)

In [None]:
# One experiment run per entry: None keeps the stored file order, an int seeds a
# shuffle of the file order, and 'sorted_by_effectiveness' orders files by the
# effectiveness measured in earlier runs.
seeds = [None, 33897, 24971723, 'sorted_by_effectiveness']
save_paths = [save_path_root / f'{seed}/gammatone_frozen' for seed in seeds]

# Validation data is only needed while at least one run still has models to train.
# The check counts saved models (the same criterion the training loop uses to
# resume) instead of testing whether the run directory exists: an interrupted run
# leaves its directory behind, and a directory-only check would then skip this
# load and make the training loop fail on an undefined x_val.
# NOTE(review): assumes no run trains on more files than len(all_train_files).
n_expected_models = len(all_train_files)
all_runs_complete = all(
    len(list(save_path.rglob('model.h5'))) >= n_expected_models
    for save_path in save_paths
)
if not all_runs_complete:
    x_val, y_val, _ = create_dataset_from_annotations(annotations=val_annotations, sr=sr, frame_length=frame_length, hop_length=test_hop_length)
    # Add a trailing axis to the inputs and cast both arrays to Python float (float64).
    x_val = x_val[..., None].astype(float) #safe_cast_to_f16(x_val[..., None])
    y_val = y_val.astype(float) #safe_cast_to_f16(y_val.astype(float))

In [None]:
# Collect the result table of every run directory under save_path_root.
# For each directory the first CSV found (recursively) is used; directories
# without a CSV are ignored. `all_results` ends up as one concatenated
# DataFrame, or None when no run has produced results yet.
all_tries = list(save_path_root.glob('*'))
per_run_results = []
for try_path in all_tries:
    csv_files = list(try_path.rglob('*.csv'))
    if not csv_files:
        continue
    result_path = csv_files[0]
    result = pd.read_csv(result_path)
    # Label rows with the last '_'-separated token of the directory name.
    result['seed'] = try_path.name.split('_')[-1]
    # Effectiveness: change in macro-avg F1 between consecutive rows divided by
    # the relative change in training size.
    f1_gain = result['macro avg_f1-score'].diff()
    relative_size_gain = result['training_size'].diff() / result['training_size']
    result['effectiveness'] = f1_gain / relative_size_gain
    per_run_results.append(result)

all_results = pd.concat(per_run_results, ignore_index=True) if len(per_run_results) > 0 else None

In [None]:
# Data-size-impact experiment.
# For every file ordering in `seeds`, train one model per growing prefix of the
# training files (first file, first two files, ...). Each round is stored under
#   <save_path_root>/<seed>/gammatone_frozen/round_<i>/<n_selected>/model.h5
# together with its training history (model.npy). Rounds whose model.h5 already
# exists are skipped, so the cell can be re-run to resume an interrupted run.
# Expects x_val / y_val to have been created by an earlier cell.
for seed_ in seeds:
    seed = seed_
    train_files = all_train_files.copy()
    if seed is None:
        # Keep the stored file order; 'None' is only the directory label.
        seed = 'None'
    elif isinstance(seed, int):
        # Deterministic shuffle of the file order for this run.
        rng_1 = np.random.default_rng(seed)
        rng_1.shuffle(train_files)
    elif seed == 'sorted_by_effectiveness':
        # This ordering needs results of earlier runs; skip it until some exist.
        if all_results is None:
            continue
        train_files = [Path(p) for p in all_results.groupby('new_rec')['effectiveness'].mean().sort_values(ascending=False).index.values]

    save_path = save_path_root / f'{seed}/gammatone_frozen'
    models_paths = list(save_path.rglob('model.h5'))
    # Run already complete: one saved model per training-file prefix.
    if len(models_paths) >= len(train_files):
        continue

    for i in range(len(train_files)):
        train_files_subset = train_files[:i+1]
        train_files_subset_annotations = create_dataset_annotations(train_files_subset, sr=sr, frame_length=frame_length, hop_length=hop_length, balancing=balancing, bee_fraction_threshold=bee_fraction_threshold, random_seed=random_seed, exclude_noisybee=exclude_noisybee)

        # The number of selected annotation rows is part of the output path.
        model_save_path = save_path / f'round_{i}/{int(train_files_subset_annotations.is_selected.sum())}/model.h5'
        if model_save_path.exists():
            continue

        x_train, y_train, _ = create_dataset_from_annotations(annotations=train_files_subset_annotations, sr=sr, frame_length=frame_length, hop_length=hop_length)
        # Add a trailing axis to the inputs and cast both arrays to Python float (float64).
        x_train = x_train[..., None].astype(float) #safe_cast_to_f16(x_train[..., None])
        y_train = y_train.astype(float) #safe_cast_to_f16(y_train.astype(float))

        def idx_to_sample(idx):
            """Return the (inputs, targets) rows of the current training arrays selected by `idx`."""
            idx = np.array(idx)
            return x_train[idx], y_train[idx]

        # Only sample indices travel through the tf.data pipeline; each batch of
        # indices is turned into arrays by idx_to_sample via tf.py_function.
        train_data = tf.data.Dataset.from_tensor_slices(range(len(x_train)))
        train_data = train_data.shuffle(buffer_size=len(x_train), seed=random_seed)
        # Drop the incomplete last batch only when at least one full batch exists.
        # With fewer than batch_size samples, dropping the remainder would leave
        # an empty dataset and model.fit would fail.
        train_data = train_data.batch(batch_size, drop_remainder=len(x_train) >= batch_size)
        train_data = train_data.map(lambda idx: tf.py_function(func=idx_to_sample, inp=[idx], Tout=(tf.float16, tf.float16)), num_parallel_calls=tf.data.experimental.AUTOTUNE)
        train_data = train_data.prefetch(1)

        model = create_classification_model_gammatone(frame_length)
        # Freeze layer 1 (presumably the gammatone front end, going by the
        # 'gammatone_frozen' directory name — confirm against the model definition).
        model.layers[1].trainable = False
        model.compile(
            loss=mean_squared_logarithmic_error,
            optimizer=Adadelta(learning_rate=1e-2),
            metrics=[tf.keras.metrics.BinaryAccuracy()]
        )
        # NOTE(review): train_data is already batched and shuffled; per the Keras
        # docs batch_size/shuffle are not applied to Dataset inputs, but batch_size
        # is still the default batch size for the (x_val, y_val) arrays.
        history = model.fit(
            train_data,
            batch_size=batch_size,
            epochs=n_epochs,
            shuffle=True,
            validation_data=(x_val, y_val),
            callbacks=[EarlyStopping(patience=10, restore_best_weights=True)]
        )

        model_save_path.parent.mkdir(exist_ok=True, parents=True)
        model.save(model_save_path)
        # Training history is stored next to the model as model.npy (pickled dict).
        np.save(model_save_path.with_suffix('.npy'), history.history, allow_pickle=True)

        # Free the training arrays and Keras global state before the next round.
        del x_train, y_train
        gc.collect()
        tf.keras.backend.clear_session()