In [None]:
import pathlib
import platform

# On any non-Windows host, make `pathlib.WindowsPath` resolve to `PosixPath`.
# Presumably this lets the pickled .npy files loaded below (allow_pickle=True)
# be read even if they contain Path objects saved on Windows — TODO confirm.
if platform.system() != 'Windows':
    pathlib.WindowsPath = pathlib.PosixPath

from pathlib import Path
import gc
import numpy as np
import pandas as pd
from keras.utils import set_random_seed
from utils import *

In [None]:
import tensorflow as tf
# Set the global Keras dtype policy to 'mixed_float16'. This runs before any
# model is created in the cells below, so those models are built under it.
tf.keras.mixed_precision.set_global_policy('mixed_float16')
from keras.utils import set_random_seed
from keras.callbacks import EarlyStopping
from keras.optimizers import Adadelta
from keras.losses import mean_squared_logarithmic_error

In [None]:
from sklearn.metrics import classification_report

In [None]:
# One seed for the whole notebook: it is handed to Keras here and, further
# down, to the file-splitting and annotation helpers via `random_state` /
# `random_seed`.
random_seed = 13

# Independent NumPy generator seeded with the same value.
rng = np.random.default_rng(seed=random_seed)

# Keras helper that seeds the framework-level random state.
set_random_seed(random_seed)

In [None]:
# Locate the artefacts of the selected dataset version.
dataset_version = 3
root_path = Path('results', f'dataset_V{dataset_version}')

# SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects; only
# load .npy files produced by trusted runs of this project.
# Each file holds a 0-d object array wrapping a dict (hence `.item()`); its
# keys are injected into the notebook namespace. At module scope `locals()`
# is the global namespace, so names used later (e.g. `sr`, `frame_length`,
# `train_files`) come from these dicts rather than from visible assignments.
params_path = root_path.joinpath('params.npy')
params = np.load(params_path, allow_pickle=True).item()
locals().update(params)

datasets_path = root_path.joinpath('datasets.npy')
datasets = np.load(datasets_path, allow_pickle=True).item()
locals().update(datasets)

# Pool every recording of the original split; the K-fold cell re-partitions it.
rec_paths = train_files + test_files + val_files

# All K-fold outputs live in a dedicated sub-directory.
root_path = root_path / 'KFold'

In [None]:
# Frame and hop sizes divided by the sampling rate `sr`
# (i.e. durations in seconds, assuming the params are in samples — TODO confirm).
frame_length_seconds, hop_length_seconds = frame_length / sr, hop_length / sr

# Hyper-parameters consumed by the K-fold training loop.
n_epochs = 100
batch_size = 100

In [None]:
# --- Partition all recordings into k disjoint folds --------------------------
k = 5
val_ratio = 0.2
folds_files = []

# Each fold is carved out of the files that are still unassigned, so the share
# requested from the *remaining* pool grows every step:
#   val_ratio / (1 - i/k)  ->  0.2, 0.25, 0.333..., 0.5, 1.0   (k=5, val_ratio=0.2)
# NOTE(review): the last ratio only reaches 1 (i.e. the folds only cover every
# file) when val_ratio == 1/k; keep the two values in sync.
split_ratios = np.linspace(1, 0, k+1, endpoint=True)[:-1]
split_ratios = val_ratio / split_ratios

remaining_files = rec_paths.copy()
for r in split_ratios:
    # The final fold simply takes whatever is left. Compare with a tolerance:
    # linspace and the division are floating-point operations, so the last
    # ratio can come out one ULP away from 1 (e.g. 1.0000000000000002), in
    # which case an exact `r == 1` test would be skipped and the splitter
    # would be called with an out-of-range test_size.
    if np.isclose(r, 1.0):
        folds_files.append(remaining_files.copy())
        break
    # The helper returns (kept part, split-off part); the split-off part,
    # sized by `test_size=r`, becomes the next fold.
    remaining_files, fold_files = train_test_split_by_bee(remaining_files, test_size=r, random_state=random_seed, exclude_noisy_bee=exclude_noisybee)
    folds_files.append(fold_files)

# --- Train one model per fold -------------------------------------------------
# Fold i is held out as the test set; the other folds are pooled and re-split
# into train/validation. The file split, the annotations and the trained model
# are all cached under fold_path, so an interrupted run resumes where it stopped.
for i in range(k):
    fold_path = root_path / f'fold_{i}'
    fold_path.mkdir(parents=True, exist_ok=True)

    # A saved model marks this fold as finished.
    model_save_path = fold_path / 'model.h5'
    if model_save_path.exists():
        continue

    datasets_path = fold_path / 'datasets.npy'
    if datasets_path.exists():
        # SECURITY NOTE: allow_pickle=True unpickles arbitrary objects; only
        # load files written by this notebook.
        datasets = np.load(datasets_path, allow_pickle=True).item()
        # At module scope locals() is the notebook namespace, so this rebinds
        # the names stored in the dict (train_files / val_files / test_files
        # when the file was written by the else-branch below).
        locals().update(datasets)
    else:
        fold_files_ = folds_files.copy()
        test_files = fold_files_.pop(i)
        # Flatten the remaining folds into one list (linear-time replacement
        # for the quadratic `sum(list_of_lists, [])`).
        remaining_files = [path for fold in fold_files_ for path in fold]
        train_files, val_files = train_test_split_by_bee(remaining_files, test_size=val_ratio, random_state=random_seed, exclude_noisy_bee=exclude_noisybee)
        datasets = {
            'train_files': train_files,
            'val_files': val_files,
            'test_files': test_files
        }
        np.save(datasets_path, datasets, allow_pickle=True)

    train_annotations_path = fold_path / 'train_annotations.csv'
    val_annotations_path = fold_path / 'val_annotations.csv'
    # Not used in this cell; the evaluation cell derives its own path.
    test_annotations_path = fold_path / 'test_annotations.csv'
    try:
        train_annotations = pd.read_csv(train_annotations_path)
        val_annotations = pd.read_csv(val_annotations_path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Cache missing, unreadable or corrupt: rebuild both annotation tables.
        # Only cache-related errors are caught, so Ctrl-C and genuine bugs
        # (e.g. a NameError) are no longer silently swallowed.
        # Training frames use hop_length; validation frames use test_hop_length.
        train_annotations = create_dataset_annotations(train_files, sr=sr, frame_length=frame_length, hop_length=hop_length, balancing=balancing, bee_fraction_threshold=bee_fraction_threshold, random_seed=random_seed, exclude_noisybee=exclude_noisybee)
        val_annotations = create_dataset_annotations(val_files, sr=sr, frame_length=frame_length, hop_length=test_hop_length, balancing=balancing, bee_fraction_threshold=bee_fraction_threshold, random_seed=random_seed, exclude_noisybee=exclude_noisybee)
        train_annotations.to_csv(train_annotations_path, index=False)
        val_annotations.to_csv(val_annotations_path, index=False)

    # Materialise the arrays; `[..., None]` appends a trailing axis to the inputs.
    x_val, y_val, _ = create_dataset_from_annotations(annotations=val_annotations, sr=sr, frame_length=frame_length, hop_length=test_hop_length)
    x_val = x_val[..., None].astype(float)
    y_val = y_val.astype(float)

    x_train, y_train, _ = create_dataset_from_annotations(annotations=train_annotations, sr=sr, frame_length=frame_length, hop_length=hop_length)
    x_train = x_train[..., None].astype(float)
    y_train = y_train.astype(float)

    def idx_to_sample(idx):
        """Return the (inputs, targets) rows of the current fold's training arrays selected by `idx`."""
        idx = np.array(idx)
        return x_train[idx], y_train[idx]

    # The pipeline shuffles and batches *indices* only; the actual samples are
    # gathered per batch through tf.py_function, so the training arrays are not
    # copied into the dataset up front.
    train_data = tf.data.Dataset.from_tensor_slices(range(len(x_train)))
    train_data = train_data.shuffle(buffer_size=len(x_train), seed=random_seed)
    train_data = train_data.batch(batch_size, drop_remainder=True)
    train_data = train_data.map(lambda idx: tf.py_function(func=idx_to_sample, inp=[idx], Tout=(tf.float16, tf.float16)), num_parallel_calls=tf.data.experimental.AUTOTUNE)
    train_data = train_data.prefetch(1)

    model = create_classification_model_gammatone(frame_length)
    # Freeze the layer at index 1 (presumably the gammatone front-end — TODO confirm).
    model.layers[1].trainable = False
    model.compile(
        loss=mean_squared_logarithmic_error,
        optimizer=Adadelta(learning_rate=1e-2),
        metrics=[tf.keras.metrics.BinaryAccuracy()]
    )
    # NOTE(review): `train_data` is already batched and shuffled, so per the
    # Keras docs `batch_size`/`shuffle` do not affect the training batches.
    # `batch_size` is kept because it is still the fallback batch size for the
    # array-based validation data.
    history = model.fit(
        train_data,
        batch_size=batch_size,
        epochs=n_epochs,
        shuffle=True,
        validation_data=(x_val, y_val),
        callbacks=[EarlyStopping(patience=10, restore_best_weights=True)]
    )

    # Persist the full model, a weights-only copy and the training curves.
    model.save(model_save_path)
    model.save_weights(model_save_path.with_name('model_weights.h5'))
    np.save(model_save_path.with_name('history.npy'), history.history, allow_pickle=True)

    # Release the fold's arrays and reset Keras state before the next fold.
    del x_train, y_train, x_val, y_val
    gc.collect()
    tf.keras.backend.clear_session()

In [None]:
# Resume from a previous evaluation run if its CSV is present.
result_path = root_path.joinpath('results.csv')
results = None
if result_path.exists():
    # The CSV has a two-row column header; blank header cells are read back by
    # pandas as 'Unnamed: ...' labels, so map those to '' on both levels.
    results = (
        pd.read_csv(result_path, header=[0, 1], index_col=0)
        .rename(columns=lambda label: '' if label.startswith('Unnamed') else label, level=0)
        .rename(columns=lambda label: '' if label.startswith('Unnamed') else label, level=1)
    )

# --- Evaluate every trained fold that is not yet in `results` -----------------
# Maps the integer class ids to the label strings used in the report.
to_label = np.vectorize({0: 'NoBee', 1: 'Bee'}.get)

# Match only the full-model file. The previous '*.h5' pattern also yielded each
# fold's model_weights.h5 (a weights-only file); whether that was harmlessly
# skipped or passed to load_model() depended on directory iteration order.
# sorted() makes the fold order, and hence the row order, deterministic.
for model_path in sorted(root_path.rglob('model.h5')):
    fold = model_path.parent.name
    # Skip folds already present in the cached results.
    if results is not None and fold in results.fold.values:
        continue

    model = tf.keras.models.load_model(model_path)
    model.load_weights(model_path.with_name('model_weights.h5'))

    # SECURITY NOTE: allow_pickle=True unpickles arbitrary objects; only load
    # files written by this notebook.
    datasets_path = model_path.with_name('datasets.npy')
    datasets = np.load(datasets_path, allow_pickle=True).item()
    # At module scope locals() is the notebook namespace; this rebinds the
    # names stored in the dict (test_files is read below).
    locals().update(datasets)

    test_annotations_path = model_path.with_name('test_annotations.csv')
    try:
        test_annotations = pd.read_csv(test_annotations_path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
        # Cache missing, unreadable or corrupt: rebuild it. Only cache-related
        # errors are caught, so Ctrl-C and genuine bugs are not swallowed.
        test_annotations = create_dataset_annotations(test_files, sr=sr, frame_length=frame_length, hop_length=test_hop_length, balancing=balancing, bee_fraction_threshold=bee_fraction_threshold, random_seed=random_seed, exclude_noisybee=exclude_noisybee)
        test_annotations.to_csv(test_annotations_path, index=False)

    x_test, y_test, _ = create_dataset_from_annotations(annotations=test_annotations, sr=sr, frame_length=frame_length, hop_length=test_hop_length)
    x_test = x_test[..., None].astype(float)
    y_test = to_label(y_test.astype(int))

    y_pred = model.predict(x_test)
    gc.collect()
    tf.keras.backend.clear_session()
    # Threshold the model output at 0.5 to obtain hard class ids.
    y_pred = to_label((y_pred > 0.5).astype(int))

    classification_report_results = classification_report(y_test, y_pred, output_dict=True)
    classification_report_results = convert_classification_report_to_df(classification_report_results)
    classification_report_results['fold'] = fold
    results = pd.concat([results, classification_report_results], axis=0, ignore_index=True) if results is not None else classification_report_results

    # Write after every fold so progress survives an interruption
    # (the CSV is rounded to 3 decimals, the JSON is not).
    results.round(3).to_csv(result_path)
    results.to_json(result_path.with_suffix('.json'))