In [None]:
from pathlib import Path
import gc
import numpy as np
import pandas as pd
import librosa
from utils import *
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import tensorflow as tf
from keras.utils import set_random_seed

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Single seed shared by every stochastic component of the notebook.
random_seed = 13
# Dedicated NumPy generator built from that seed.
rng = np.random.default_rng(random_seed)
# Keras helper (imported from keras.utils) that applies the seed globally.
set_random_seed(random_seed)

In [None]:
# Locate the artefacts produced for the selected dataset version.
dataset_version = 3
root_path = Path('results') / f'dataset_V{dataset_version}'

# NOTE(security): allow_pickle=True unpickles arbitrary Python objects. Only load
# .npy files that were written by this project.
params_path = root_path / 'params.npy'
params = np.load(params_path, allow_pickle=True).item()
# Expose every stored entry as a notebook-level variable. globals() is the
# writable module namespace; updating locals() only works by accident of being
# at module scope and silently does nothing inside a function.
globals().update(params)

datasets_path = root_path / 'datasets.npy'
datasets = np.load(datasets_path, allow_pickle=True).item()
globals().update(datasets)

# Per-split annotation tables.
train_annotations_path = root_path / 'train_annotations.csv'
val_annotations_path = root_path / 'val_annotations.csv'
test_annotations_path = root_path / 'test_annotations.csv'
train_annotations = pd.read_csv(train_annotations_path, low_memory=False)
val_annotations = pd.read_csv(val_annotations_path, low_memory=False)
test_annotations = pd.read_csv(test_annotations_path, low_memory=False)

# From here on root_path points at the output folder of this notebook.
root_path /= 'baseline_methods'
root_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Build the feature matrices for the train and test splits. The test split is
# framed with test_hop_length instead of hop_length.
train_data, train_labels, feat_indices = create_features_dataset_from_annotations(
    train_annotations, sr=sr, frame_length=frame_length, hop_length=hop_length
)
test_data, test_labels, _ = create_features_dataset_from_annotations(
    test_annotations, sr=sr, frame_length=frame_length, hop_length=test_hop_length
)

# Replace the numeric labels by readable class names (index 0 -> 'NoBee', 1 -> 'Bee').
class_names = np.array(['NoBee', 'Bee'])
train_labels = class_names[train_labels.astype(int)]
test_labels = class_names[test_labels.astype(int)]

## Feature Selection

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Names of the feature groups returned alongside the training data.
features = list(feat_indices)
# Candidate subsets ordered by size; the first (smallest) one is dropped --
# presumably the empty subset, confirm against utils.generate_subsets.
features_subsets = sorted(generate_subsets(features), key=len)[1:]
# One-row template of boolean flags, one ('features', <name>) column per feature group.
features_columns = pd.MultiIndex.from_tuples([('features', name) for name in features])
features_df = pd.DataFrame(np.zeros((1, len(features)), dtype=bool), columns=features_columns)

In [None]:
# Search for the best subset of features with a 5-nearest-neighbours classifier.
# Results are written to disk after every subset, so an interrupted search
# resumes where it stopped instead of starting over.
results_path = root_path / 'knn_classification_report/feature_selection_results.csv'
if results_path.exists():
    results_features = pd.read_csv(results_path, index_col=0, header=[0, 1])
    # Blank out the placeholder names pandas gives to empty top-level header cells.
    results_features = results_features.rename(
        lambda x: '' if x.startswith('Unnamed') else x, axis=1, level=0
    )
else:
    # Create only the parent folder. Calling mkdir() on results_path itself would
    # create a *directory* named like the CSV file and make to_csv() below fail.
    results_path.parent.mkdir(parents=True, exist_ok=True)
    results_features = None

# Subsets already evaluated, keyed by their sorted feature names joined with '+'.
# A set keeps the membership test in the loop constant-time.
seen_combinations = (
    set(results_features.apply(lambda x: '+'.join(sorted(x.features[x.features].index)), axis=1).tolist())
    if results_features is not None
    else set()
)

for features_subset in features_subsets:
    combination = '+'.join(sorted(features_subset))
    if combination in seen_combinations:
        continue
    print(f'Processing features subset: {combination}')
    selected_features_indices = np.concatenate([feat_indices[feat_name] for feat_name in features_subset])

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_data[:, selected_features_indices], train_labels)
    x_pred = knn.predict(test_data[:, selected_features_indices])

    classification_report_results = classification_report(test_labels, x_pred, output_dict=True)
    classification_report_results = convert_classification_report_to_df(classification_report_results)
    # Flag which features took part in this run. The subset is passed as a list
    # so .loc treats it as a selection of second-level labels whatever sequence
    # type generate_subsets returned.
    features_ = features_df.copy()
    features_.loc[0, ('features', list(features_subset))] = True
    classification_report_results = pd.concat([classification_report_results, features_], axis=1)

    results_features = (
        pd.concat([results_features, classification_report_results], axis=0, ignore_index=True)
        if results_features is not None
        else classification_report_results
    )
    # Checkpoint: rounded CSV for reading/resuming, full-precision JSON next to it.
    results_features.round(3).to_csv(results_path)
    results_features.to_json(results_path.with_suffix('.json'), orient='records')

In [None]:
# Per-class F1 of every evaluated feature subset: 'Bee' on x, 'NoBee' on y.
f1_axes = plt.figure(figsize=(10, 10)).add_subplot()
sns.scatterplot(
    data=results_features,
    x=('Bee', 'f1-score'),
    y=('NoBee', 'f1-score'),
    ax=f1_axes,
)

In [None]:
fig, ax = plt.subplots(figsize=(20, 10))

# Precision vs. recall of every evaluated subset, one marker style per report row.
results_features.Bee.plot.scatter(x='precision', y='recall', marker='.', ax=ax, color='blue', label='Bee')
results_features.NoBee.plot.scatter(x='precision', y='recall', marker='+', ax=ax, color='orange', label='NoBee')
results_features['macro avg'].plot.scatter(x='precision', y='recall', marker='x', ax=ax, color='green', label='Macro avg')

# Rank the subsets by macro-averaged F1, best first. Re-binding the name instead
# of sorting in place keeps the operation explicit.
results_features = results_features.sort_values(by=('macro avg', 'f1-score'), ascending=False)

# iloc[[0]] returns a one-row DataFrame that keeps each column's dtype.
# iloc[0].to_frame().T would first squeeze the mixed-dtype row into an object
# Series, turning metrics and feature flags into object columns.
best_subset_results = results_features.iloc[[0]]

# Highlight the best subset in red on top of the three clouds.
for report_row in ('Bee', 'NoBee', 'macro avg'):
    best_subset_results[report_row].plot.scatter(x='precision', y='recall', marker='o', ax=ax, color='red', s=50)

## Methods Comparison

In [None]:
# Reload the per-method results of a previous run when they exist, so methods
# already evaluated are not trained again.
results_save_path = root_path / 'results_methods.csv'
results_methods = None
if results_save_path.exists():
    results_methods = pd.read_csv(results_save_path, index_col=0, header=[0, 1])
    # Empty header cells come back as 'Unnamed...' on both column levels; blank them out.
    for header_level in (0, 1):
        results_methods = results_methods.rename(
            lambda label: '' if label.startswith('Unnamed') else label,
            axis=1,
            level=header_level,
        )

In [None]:
# Feature names whose flag is set in the best-scoring row.
best_feature_flags = best_subset_results.features.values[0]
best_subset = best_subset_results.features.columns[best_feature_flags].tolist()
print(f'Best subset of features: {best_subset}')

# Keep only the columns belonging to those features.
# NOTE(review): train_data / test_data are overwritten here, so running this cell
# a second time without re-creating them indexes the already-reduced arrays.
selected_features_indices = np.concatenate([feat_indices[name] for name in best_subset])
train_data, test_data = train_data[:, selected_features_indices], test_data[:, selected_features_indices]

### KNN

In [None]:
if results_methods is None or 'KNN' not in results_methods.method.values:
    # 5-nearest-neighbours baseline on the selected features.
    knn = KNeighborsClassifier(n_neighbors=5)
    x_pred = knn.fit(train_data, train_labels).predict(test_data)

    classification_report_results = convert_classification_report_to_df(
        classification_report(test_labels, x_pred, output_dict=True)
    )
    classification_report_results['method'] = 'KNN'

    # First method evaluated starts the table; later ones are appended.
    if results_methods is None:
        results_methods = classification_report_results
    else:
        results_methods = pd.concat([results_methods, classification_report_results], axis=0, ignore_index=True)

    # Persist: rounded CSV plus full-precision JSON.
    results_methods.round(3).to_csv(results_save_path)
    results_methods.to_json(results_save_path.with_suffix('.json'), orient='records')

### SVC

In [None]:
if 'SVC' not in results_methods.method.values:
    from sklearn.svm import SVC

    # Support-vector classifier with default hyper-parameters.
    svc = SVC()
    x_pred = svc.fit(train_data, train_labels).predict(test_data)

    classification_report_results = convert_classification_report_to_df(
        classification_report(test_labels, x_pred, output_dict=True)
    )
    classification_report_results['method'] = 'SVC'

    # Append to the comparison table and persist (rounded CSV, full-precision JSON).
    results_methods = pd.concat([results_methods, classification_report_results], axis=0, ignore_index=True)
    results_methods.round(3).to_csv(results_save_path)
    results_methods.to_json(results_save_path.with_suffix('.json'), orient='records')

### Random Forest

In [None]:
if 'RandomForest' not in results_methods.method.values:
    from sklearn.ensemble import RandomForestClassifier

    # Random forest with default hyper-parameters.
    dec_tree = RandomForestClassifier()
    x_pred = dec_tree.fit(train_data, train_labels).predict(test_data)

    classification_report_results = convert_classification_report_to_df(
        classification_report(test_labels, x_pred, output_dict=True)
    )
    classification_report_results['method'] = 'RandomForest'

    # Append to the comparison table and persist (rounded CSV, full-precision JSON).
    results_methods = pd.concat([results_methods, classification_report_results], axis=0, ignore_index=True)
    results_methods.round(3).to_csv(results_save_path)
    results_methods.to_json(results_save_path.with_suffix('.json'), orient='records')

### Extra Trees

In [None]:
if 'ExtraTrees' not in results_methods.method.values:
    from sklearn.ensemble import ExtraTreesClassifier

    # Extremely randomized trees with default hyper-parameters.
    ert = ExtraTreesClassifier()
    x_pred = ert.fit(train_data, train_labels).predict(test_data)

    classification_report_results = convert_classification_report_to_df(
        classification_report(test_labels, x_pred, output_dict=True)
    )
    classification_report_results['method'] = 'ExtraTrees'

    # Append to the comparison table and persist (rounded CSV, full-precision JSON).
    results_methods = pd.concat([results_methods, classification_report_results], axis=0, ignore_index=True)
    results_methods.round(3).to_csv(results_save_path)
    results_methods.to_json(results_save_path.with_suffix('.json'), orient='records')

### Gradient Boosting

In [None]:
if 'GradientBoosting' not in results_methods.method.values:
    from sklearn.ensemble import GradientBoostingClassifier

    # scikit-learn gradient boosting with default hyper-parameters.
    gb = GradientBoostingClassifier()
    x_pred = gb.fit(train_data, train_labels).predict(test_data)

    classification_report_results = convert_classification_report_to_df(
        classification_report(test_labels, x_pred, output_dict=True)
    )
    classification_report_results['method'] = 'GradientBoosting'

    # Append to the comparison table and persist (rounded CSV, full-precision JSON).
    results_methods = pd.concat([results_methods, classification_report_results], axis=0, ignore_index=True)
    results_methods.round(3).to_csv(results_save_path)
    results_methods.to_json(results_save_path.with_suffix('.json'), orient='records')

### Logistic Regression

In [None]:
if 'LogisticRegression' not in results_methods.method.values:
    from sklearn.linear_model import LogisticRegression

    # Logistic regression with default hyper-parameters.
    lr = LogisticRegression()
    x_pred = lr.fit(train_data, train_labels).predict(test_data)

    classification_report_results = convert_classification_report_to_df(
        classification_report(test_labels, x_pred, output_dict=True)
    )
    classification_report_results['method'] = 'LogisticRegression'

    # Append to the comparison table and persist (rounded CSV, full-precision JSON).
    results_methods = pd.concat([results_methods, classification_report_results], axis=0, ignore_index=True)
    results_methods.round(3).to_csv(results_save_path)
    results_methods.to_json(results_save_path.with_suffix('.json'), orient='records')

### KMeans

In [None]:
if 'KMeans' not in results_methods.method.values:
    from sklearn.cluster import KMeans

    # Unsupervised baseline: two clusters fitted on the training features only.
    clustering = KMeans(n_clusters=2, n_init='auto', random_state=random_seed)
    clustering.fit(train_data)

    # Cluster ids carry no class meaning, so score both possible id -> class
    # assignments on the training labels.
    labels_map_0 = {0: 'NoBee', 1: 'Bee'}
    labels_map_1 = {1: 'NoBee', 0: 'Bee'}
    accuracy_0, accuracy_1 = (
        classification_report(
            train_labels,
            np.vectorize(candidate_map.get)(clustering.labels_),
            output_dict=True,
        )['accuracy']
        for candidate_map in (labels_map_0, labels_map_1)
    )
    # Keep the better assignment; a tie falls back to labels_map_1.
    labels_map = labels_map_0 if accuracy_0 > accuracy_1 else labels_map_1

    # Assign test samples to clusters and translate the ids to class names.
    x_pred = np.vectorize(labels_map.get)(clustering.predict(test_data))

    classification_report_results = convert_classification_report_to_df(
        classification_report(test_labels, x_pred, output_dict=True)
    )
    classification_report_results['method'] = 'KMeans'

    # Append to the comparison table and persist (rounded CSV, full-precision JSON).
    results_methods = pd.concat([results_methods, classification_report_results], axis=0, ignore_index=True)
    results_methods.round(3).to_csv(results_save_path)
    results_methods.to_json(results_save_path.with_suffix('.json'), orient='records')

### Neural Network

In [None]:
# Small fully-connected binary classifier. A previously trained model is reused
# when it is found on disk; otherwise it is trained and saved.
model_path = root_path / 'NN_model/model.h5'
if model_path.exists():
    model = tf.keras.models.load_model(model_path)
    # NOTE(review): in this branch `history` is a DataFrame read from CSV, while
    # the training branch leaves the object returned by model.fit().
    history = pd.read_csv(model_path.with_name('history.csv'))
else:
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, BatchNormalization
    from keras.callbacks import EarlyStopping
    from keras.optimizers import Adadelta
    from keras.losses import mean_squared_logarithmic_error

    # Make sure the output folder exists before spending time on training.
    model_path.parent.mkdir(parents=True, exist_ok=True)

    # Encode the string labels as 0./1. in a *separate* array. Overwriting
    # train_labels would break a re-run of this cell (the mapping has no float
    # keys) and would leave train_labels with different contents depending on
    # whether the model was trained or loaded.
    train_labels_binary = np.vectorize({'NoBee': 0., 'Bee': 1.}.get)(train_labels)

    # Validation split, reduced to the same feature columns as the training data.
    val_data, val_labels, _ = create_features_dataset_from_annotations(val_annotations, sr=sr, frame_length=frame_length, hop_length=test_hop_length)
    val_labels = val_labels.astype(float)
    val_data = val_data[:, selected_features_indices]

    batch_size = 100
    n_epochs = 100

    # Re-seed right before building the model so weight initialisation does not
    # depend on what ran earlier in the notebook.
    set_random_seed(random_seed)

    # Three hidden blocks (8 -> 16 -> 8 units), each Dense + BatchNorm + Dropout,
    # followed by a single sigmoid output.
    model = Sequential()
    model.add(Dense(8, activation='relu', input_shape=(train_data.shape[1],)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(16, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(8, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        loss=mean_squared_logarithmic_error,
        optimizer=Adadelta(learning_rate=1e-2),
        metrics=[tf.keras.metrics.BinaryAccuracy()]
    )
    # Early stopping with a patience of 10 epochs; the best weights are restored.
    history = model.fit(
        train_data, train_labels_binary,
        batch_size=batch_size,
        epochs=n_epochs,
        shuffle=True,
        validation_data=(val_data, val_labels),
        callbacks=[EarlyStopping(patience=10, restore_best_weights=True)]
    )
    # Persist the model and its training curves side by side.
    model.save(model_path)
    pd.DataFrame(history.history).to_csv(model_path.with_name('history.csv'))

In [None]:
if 'NN' not in results_methods.method.values:
    # Threshold the sigmoid outputs at 0.5 and translate 0/1 to class names.
    predicted_scores = model.predict(test_data)
    predicted_ids = (predicted_scores > 0.5).astype(int)
    x_pred = np.vectorize({0: 'NoBee', 1: 'Bee'}.get)(predicted_ids)

    classification_report_results = convert_classification_report_to_df(
        classification_report(test_labels, x_pred, output_dict=True)
    )
    classification_report_results['method'] = 'NN'

    # Append to the comparison table and persist (rounded CSV, full-precision JSON).
    results_methods = pd.concat([results_methods, classification_report_results], axis=0, ignore_index=True)
    results_methods.round(3).to_csv(results_save_path)
    results_methods.to_json(results_save_path.with_suffix('.json'), orient='records')

In [None]:
# Per-class F1 of every evaluated method ('Bee' on x, 'NoBee' on y), coloured by method.
methods_axes = plt.figure(figsize=(10, 10)).add_subplot()
sns.scatterplot(
    data=results_methods,
    x=('Bee', 'f1-score'),
    y=('NoBee', 'f1-score'),
    hue='method',
    ax=methods_axes,
)