In [None]:
import platform
import pathlib
# On non-Windows hosts, alias WindowsPath to PosixPath.
# NOTE(review): presumably needed so pickled files that contain WindowsPath
# objects (see the np.load(..., allow_pickle=True) calls below) can be loaded
# on Linux/macOS — confirm.
if platform.system() != 'Windows':
    pathlib.WindowsPath = pathlib.PosixPath

In [None]:
from pathlib import Path
import gc
import numpy as np
import pandas as pd
from utils import *

In [None]:
import tensorflow as tf
# Set the global Keras dtype policy to mixed_float16; this runs before any
# model is created or loaded in this notebook.
tf.keras.mixed_precision.set_global_policy('mixed_float16')
from keras.utils import set_random_seed
from keras.callbacks import EarlyStopping
from keras.optimizers import Adadelta
from keras.losses import mean_squared_logarithmic_error

In [None]:
from sklearn.metrics import classification_report

In [None]:
# One seed drives both the Keras-level seeding and the notebook's NumPy generator.
random_seed = 13
set_random_seed(seed=random_seed)
rng = np.random.default_rng(seed=random_seed)

In [None]:
# Locate the artefacts of the chosen dataset version.
dataset_version = 3
root_path = Path('results') / f'dataset_V{dataset_version}'

# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files
# that were produced by this project.
params_path = root_path / 'params.npy'
params = np.load(params_path, allow_pickle=True).item()
# Inject every saved parameter as a notebook-level variable. globals() is used
# instead of locals() because writing through locals() is only guaranteed to
# work at module scope, where it happens to be the same dict.
# NOTE(review): later cells read `sr`, `frame_length`, `hop_length`,
# `test_hop_length`, `balancing`, `bee_fraction_threshold` and
# `exclude_noisybee` without assigning them — presumably they come from here.
globals().update(params)

datasets_path = root_path / 'datasets.npy'
datasets = np.load(datasets_path, allow_pickle=True).item()
# NOTE(review): `train_files` (used on the next line) presumably comes from here.
globals().update(datasets)
all_train_files = train_files.copy()

# Annotation tables are stored next to params.npy.
train_all_annotations = pd.read_csv(params_path.with_name('train_annotations.csv'))
val_annotations = pd.read_csv(params_path.with_name('val_annotations.csv'))
test_annotations = pd.read_csv(params_path.with_name('test_annotations.csv'))

# Validation / test arrays are built lazily by later cells, only when needed.
x_val, y_val = None, None
x_test, y_test = None, None

In [None]:
# Frame and hop durations in seconds, derived from the sample-based settings.
frame_length_seconds = frame_length / sr
hop_length_seconds = hop_length / sr

# Training hyper-parameters shared by every run below.
batch_size = 100
n_epochs = 100

# Each entry defines one ordering in which recordings are added to the training set:
#   None                      -> original order of `all_train_files`
#   <int>                     -> order shuffled with that seed
#   'sorted_by_effectiveness' -> filled in by a later cell once results exist
seeds = [None, 33897, 24971723, 'sorted_by_effectiveness']
seeds_to_train_files = {}
for seed in seeds:
    if seed is None:
        seeds_to_train_files['None'] = all_train_files.copy()
    elif seed == 'sorted_by_effectiveness':
        seeds_to_train_files[seed] = None
    else:
        # Use loop-local names so the notebook-level `rng` (seeded with
        # `random_seed`) and `train_files` are not silently overwritten.
        shuffled_train_files = all_train_files.copy()
        np.random.default_rng(seed).shuffle(shuffled_train_files)
        seeds_to_train_files[str(seed)] = shuffled_train_files

In [None]:
# Directory that holds every model of this experiment plus the aggregated results table.
save_path_root = root_path.joinpath('data_size_impact/gammatone_frozen')
save_path_root.mkdir(exist_ok=True, parents=True)
result_path = save_path_root.joinpath('results.csv')

In [None]:
def _normalize_seed(value):
    """Return the string key used in `seeds_to_train_files` for a seed read from the CSV.

    The seed column can come back in different forms: a float column when only
    numeric seeds and missing values are present, or an object column mixing
    strings (e.g. 'sorted_by_effectiveness', '33897') with NaN. Depending on
    the pandas version, the literal text 'None' may be parsed as NaN.
    """
    if isinstance(value, str):
        return value
    if pd.isna(value):
        return 'None'
    return str(int(value))


def _blank_unnamed(label):
    """Map the 'Unnamed...' placeholder labels of a two-row header back to ''."""
    return '' if label.startswith('Unnamed') else label


# Resume from previously stored evaluation results, if there are any.
if result_path.exists():
    results = pd.read_csv(result_path, index_col=0, header=[0, 1])
    results.rename(_blank_unnamed, axis=1, level=0, inplace=True)
    results.rename(_blank_unnamed, axis=1, level=1, inplace=True)
    results.seed = results.seed.apply(_normalize_seed)
else:
    results = None

In [None]:
# Evaluate every saved model that has no row in `results` yet and persist the
# table after each model, so an interrupted run loses at most one evaluation.

# Integer class index -> label string used in the classification report.
class_labels = {0: 'NoBee', 1: 'Bee'}
to_class_label = np.vectorize(class_labels.get)
# Seeds whose models were skipped because their recording order is not known yet.
skipped_seeds = set()

for model_path in save_path_root.rglob('model.h5'):
    # Layout below save_path_root: <seed>/round_<i>/<n_train_samples>/model.h5
    seed, round_i, n_train_samples, _ = model_path.relative_to(save_path_root).parts
    n_train_samples = int(n_train_samples)
    round_i = int(round_i.split('_')[-1])
    model_id = f'{seed}_{round_i}_{n_train_samples}'
    if results is not None and model_id in results['model_id'].values:
        continue  # already evaluated

    # The recording added in this round is only known once the file order for
    # the seed exists; 'sorted_by_effectiveness' is None until a later cell
    # derives it from the results. Skip such models instead of crashing.
    seed_train_files = seeds_to_train_files.get(seed)
    if seed_train_files is None:
        skipped_seeds.add(seed)
        continue
    new_recording = seed_train_files[round_i]
    new_recording = f'{new_recording.parent.name}_{new_recording.stem}'

    model = tf.keras.models.load_model(model_path)
    if x_test is None:
        # Build the test set once, on first use.
        x_test, y_test, _ = create_dataset_from_annotations(annotations=test_annotations, sr=sr, frame_length=frame_length, hop_length=test_hop_length)
        x_test = x_test[..., None].astype(float)
        y_test = to_class_label(y_test.astype(int))
    y_pred = model.predict(x_test, batch_size=256)
    gc.collect()
    tf.keras.backend.clear_session()
    # Threshold the model output at 0.5 and map to the same label strings as y_test.
    y_pred = to_class_label((y_pred > 0.5).astype(int))

    classification_report_results = classification_report(y_test, y_pred, output_dict=True)
    classification_report_results = convert_classification_report_to_df(classification_report_results)
    classification_report_results['model_id'] = model_id
    classification_report_results['n_train_samples'] = n_train_samples
    classification_report_results['seed'] = seed
    classification_report_results['round_i'] = round_i
    classification_report_results['new_recording'] = new_recording

    results = pd.concat([results, classification_report_results], axis=0, ignore_index=True) if results is not None else classification_report_results
    results.to_csv(result_path)
    results.to_json(result_path.with_suffix('.json'))

if skipped_seeds:
    print(f'Skipped models for seeds without a known recording order: {sorted(skipped_seeds)}')

In [None]:
# Derive per-round "effectiveness": macro-F1 gain relative to the share of new
# training samples, scaled per seed by its largest absolute value.
# `results` is None when no model has been evaluated yet (fresh project); in
# that case there is nothing to compute and the notebook must continue to the
# training cell.
if results is not None:
    results.sort_values('round_i', inplace=True)
    # Samples added in each round; the first round of a seed has no predecessor,
    # so its count is the full training-set size.
    results['new_samples_count'] = results.groupby('seed').n_train_samples.diff()
    results['new_samples_count'] = results['new_samples_count'].fillna(results['n_train_samples'])
    results['new_samples_count_rel'] = results['new_samples_count'] / results['n_train_samples']
    # Macro-F1 change versus the previous round of the same seed; the first
    # round keeps its absolute macro-F1.
    results['f1_score_diff'] = results.groupby('seed')[[('macro avg', 'f1-score')]].diff().fillna(results[[('macro avg', 'f1-score')]]).values
    # NOTE(review): a round that adds zero samples makes the denominator 0 — confirm this cannot happen.
    results['effectiveness'] = results['f1_score_diff'] / results['new_samples_count_rel']
    results['effectiveness'] = results.groupby('seed').effectiveness.transform(lambda x: x / np.abs(x).max())

In [None]:
# Once every fixed/random ordering has been fully evaluated, derive the
# 'sorted_by_effectiveness' ordering: recordings ranked by their mean
# effectiveness across those orderings (highest first).
# `results` is None while no model has been evaluated; skip in that case.
if results is not None:
    random_seed_results = results[results.seed != 'sorted_by_effectiveness']
    # One row per (ordering, recording) is expected for all orderings except
    # 'sorted_by_effectiveness' itself.
    if random_seed_results.shape[0] == (len(seeds) - 1) * len(all_train_files):
        recordings_by_effectiveness = random_seed_results.groupby('new_recording').effectiveness.mean().sort_values(ascending=False).index.to_list()
        # Rank lookup avoids a linear list.index() scan for every sort key.
        effectiveness_rank = {name: position for position, name in enumerate(recordings_by_effectiveness)}
        train_files = sorted(all_train_files, key=lambda x: effectiveness_rank[f'{x.parent.name}_{x.stem}'])
        seeds_to_train_files['sorted_by_effectiveness'] = train_files

In [None]:
# For every recording order, train one model per prefix of the order
# (1 recording, 2 recordings, ...). Existing models are skipped, so the cell
# can be re-run to resume an interrupted experiment.
for seed, train_files in seeds_to_train_files.items():
    if train_files is None:
        continue  # order not available yet (e.g. 'sorted_by_effectiveness')
    save_path = save_path_root / f'{seed}'
    models_paths = list(save_path.rglob('model.h5'))
    if len(models_paths) >= len(train_files):
        continue  # every round of this order already has a model

    for i in range(len(train_files)):
        train_files_subset = train_files[:i+1]
        train_files_subset_annotations = create_dataset_annotations(train_files_subset, sr=sr, frame_length=frame_length, hop_length=hop_length, balancing=balancing, bee_fraction_threshold=bee_fraction_threshold, random_seed=random_seed, exclude_noisybee=exclude_noisybee)

        # The number of selected samples is part of the path, so the
        # annotations have to be built before the existence check.
        model_save_path = save_path / f'round_{i}/{int(train_files_subset_annotations.is_selected.sum())}/model.h5'
        if model_save_path.exists():
            continue

        if x_val is None:
            # Build the validation set once, on first use.
            x_val, y_val, _ = create_dataset_from_annotations(annotations=val_annotations, sr=sr, frame_length=frame_length, hop_length=test_hop_length)
            x_val = x_val[..., None].astype(float)
            y_val = y_val.astype(float)
        x_train, y_train, _ = create_dataset_from_annotations(annotations=train_files_subset_annotations, sr=sr, frame_length=frame_length, hop_length=hop_length)
        x_train = x_train[..., None].astype(float)
        y_train = y_train.astype(float)

        def idx_to_sample(idx):
            """Return the (x, y) training samples for a batch of indices."""
            idx = np.array(idx)
            return x_train[idx], y_train[idx]

        # Pipeline over sample indices: shuffle, batch, then gather the actual
        # arrays through py_function, emitting float16 tensors.
        train_data = tf.data.Dataset.from_tensor_slices(range(len(x_train)))
        train_data = train_data.shuffle(buffer_size=len(x_train), seed=random_seed)
        train_data = train_data.batch(batch_size, drop_remainder=True)
        train_data = train_data.map(lambda idx: tf.py_function(func=idx_to_sample, inp=[idx], Tout=(tf.float16, tf.float16)), num_parallel_calls=tf.data.experimental.AUTOTUNE)
        train_data = train_data.prefetch(1)

        model = create_classification_model_gammatone(frame_length)
        # Freeze the layer at index 1.
        # NOTE(review): presumably the gammatone front-end, given the
        # 'gammatone_frozen' output directory — confirm.
        model.layers[1].trainable = False
        model.compile(
            loss=mean_squared_logarithmic_error,
            optimizer=Adadelta(learning_rate=1e-2),
            metrics=[tf.keras.metrics.BinaryAccuracy()]
        )
        # `train_data` is already shuffled and batched, so `batch_size` and
        # `shuffle` are not passed for it. The validation arrays keep the
        # batch size they previously inherited from `batch_size`.
        history = model.fit(
            train_data,
            epochs=n_epochs,
            validation_data=(x_val, y_val),
            validation_batch_size=batch_size,
            callbacks=[EarlyStopping(patience=10, restore_best_weights=True)]
        )

        model_save_path.parent.mkdir(exist_ok=True, parents=True)
        model.save(model_save_path)
        # The training history is stored next to the model as model.npy.
        np.save(model_save_path.with_suffix('.npy'), history.history, allow_pickle=True)

        # Release the training arrays and the Keras session state before the next round.
        del x_train, y_train
        gc.collect()
        tf.keras.backend.clear_session()