# Generate a Labeled Dataset from Clustering

In [1]:
import numpy as np

from joblib import dump, load

from sklearn.neighbors import KNeighborsClassifier

from bring_features_and_file_paths import bring_features_and_file_paths, transform_features
from generate_labelled_dataset import generate_labelled_dataset

## Load all the features and file paths

In [2]:
# Root directory passed to the feature loader.
features_dir = '/grand/projects/BirdAudio/Soundscapes/Features'

# sub_sample=None presumably means "use every feature vector" -- TODO confirm
# against bring_features_and_file_paths.
features, file_paths, indices = bring_features_and_file_paths(
    features_dir,
    sub_sample=None,
)

We have 21362 feature vectors.


Next, we transform the features by scaling them and reducing their dimensionality.

In [3]:
# Persisted models used to scale the features and reduce their dimensionality.
# NOTE(review): joblib.load unpickles the file, so these artifacts must come
# from a trusted source.
scale_model = load('/grand/projects/BirdAudio/Soundscapes/Clusters/scale_model')
dim_red_model = load('/grand/projects/BirdAudio/Soundscapes/Clusters/dim_red_model')

In [4]:
# Apply the loaded scaling and dimensionality-reduction models to the raw
# features (the exact order of application is defined inside transform_features).
samples = transform_features(features, scale_model, dim_red_model)
# Display the resulting shape as a sanity check.
samples.shape

(21362, 383)

Load the clusters generated from the features.

We will now use them to classify our feature samples.

In [5]:
# allow_pickle=True lets np.load read a pickled Python object from the .npy file.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
clusters = np.load(
    '/grand/projects/BirdAudio/Soundscapes/Clusters/clusters.npy',
    allow_pickle=True,
)

# Indexing with an empty tuple unwraps the stored object (presumably a dict
# saved as a 0-d object array -- TODO confirm). 'x' holds the clustered
# vectors and 'y' their cluster labels.
cluster_data = clusters[()]
vectors = cluster_data['x']
labels = cluster_data['y']

In [6]:
# The label -1 is treated as "noise" here and is excluded from the cluster count.
noise_label = -1

# Distinct labels with the noise label removed (a no-op when it is absent).
cluster_labels = set(labels) - {noise_label}
n_clusters_ = len(cluster_labels)

# Number of points carrying the noise label.
n_noise_ = sum(1 for label in labels if label == noise_label)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 100
Estimated number of noise points: 0


Train a k-nearest neighbors classifier on the clustered vectors, then use it to predict a cluster label for each sample.

In [7]:
# Number of neighbours consulted when classifying a sample.
N_NEIGHBORS = 10

# Fit the classifier on the clustered vectors and their cluster labels.
neigh = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
neigh.fit(X=vectors, y=labels)

In [8]:
# Assign a cluster label to every transformed sample using the fitted classifier.
label_predictions = neigh.predict(X=samples)

In [9]:
# Sanity check: inspect the shape of the predicted label array.
label_predictions.shape

(21362,)

In [10]:
# Base path of the spectrograms and destination folder for the labelled dataset.
spectrograms_base_path = '/grand/projects/BirdAudio/Soundscapes/Spectrograms/'
output_dir = '/grand/projects/BirdAudio/Soundscapes/Labeled_Dataset'

# Presumably writes the dataset into output_dir using the predicted label of
# each file -- TODO confirm against generate_labelled_dataset.
generate_labelled_dataset(
    label_predictions,
    file_paths,
    spectrograms_base_path,
    output_dir,
)