# Generate a Labeled Dataset from Clustering

In [1]:
import os
import numpy as np

from pathlib import Path
from joblib import dump, load

from sklearn.neighbors import KNeighborsClassifier

from bring_features_and_file_paths import bring_features_and_file_paths, transform_features
from generate_labelled_dataset import generate_labelled_dataset

## Load all the features and file paths

In [2]:
# Root directory holding the extracted feature files.
features_dir = '/grand/projects/BirdAudio/Soundscapes/Features'

# Full run: sub_sample=None, so no sub-sampling fraction is requested.
features, file_paths = bring_features_and_file_paths(features_dir, sub_sample=None)

# Quick-iteration alternative (note that this variant unpacks a third value, `indices`):
# features, file_paths, indices = bring_features_and_file_paths(features_dir, sub_sample=0.002)

We have 101076 feature vectors.


Now we transform the features: we scale them and reduce their dimensionality, using the previously saved scaling and dimensionality-reduction models loaded below.

In [3]:
# Load the persisted scaling and dimensionality-reduction models (joblib files).
# NOTE(review): joblib.load unpickles its input, so only load files from a trusted location.
scale_model = load('/grand/projects/BirdAudio/Soundscapes/Clusters/ThirdClusters/MLE/scale_model')
dim_red_model = load('/grand/projects/BirdAudio/Soundscapes/Clusters/ThirdClusters/MLE/dim_red_model')

In [4]:
# Apply the loaded scaling and dimensionality-reduction models to the raw features.
samples = transform_features(
    features,
    scale_model,
    dim_red_model,
)
# Display the shape of the transformed samples as the cell output.
np.shape(samples)

(101076, 383)

Load the clusters generated from the features.

We will now use them to classify our feature samples.

In [5]:
# The clusters file stores a pickled container with entries 'x' and 'y'.
# NOTE(review): allow_pickle=True executes pickle on load, so this file must come from a trusted source.
clusters = np.load(
    '/grand/projects/BirdAudio/Soundscapes/Clusters/ThirdClusters/MLE/clusters.npy',
    allow_pickle=True,
)
# Indexing with an empty tuple unwraps the stored object once; both fields are read from it.
cluster_payload = clusters[()]
vectors = cluster_payload['x']  # clustered vectors (used below to fit the classifier)
labels = cluster_payload['y']   # cluster label of each vector

In [6]:
# Summarise the cluster assignment: label -1 is counted as noise and is
# excluded from the number of clusters.
label_array = np.asarray(labels)
distinct_labels = np.unique(label_array)
noise_present = bool(np.any(distinct_labels == -1))

n_clusters_ = len(distinct_labels) - (1 if noise_present else 0)
n_noise_ = int(np.count_nonzero(label_array == -1))

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

Estimated number of clusters: 100
Estimated number of noise points: 0


If no predicted labels have been saved yet, train a k-nearest-neighbors classifier on the clustered vectors and use it to predict a label for every sample; otherwise, load the saved predictions.

In [7]:
# Directory that receives the cached predictions (and is reused later as the
# output directory of the labelled dataset).
output_dir = '/grand/projects/BirdAudio/Soundscapes/Labeled_Dataset'
# Build the cache path once so the existence check, the save and the load
# always refer to the same file.
predictions_path = os.path.join(output_dir, 'label_predictions.npy')

if not os.path.isfile(predictions_path):
    # No cached predictions: fit a 10-nearest-neighbour classifier on the
    # clustered vectors and predict a cluster label for every sample.
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    print('Predicting samples ...')
    neigh = KNeighborsClassifier(n_neighbors=10)
    neigh.fit(vectors, labels)
    label_predictions = neigh.predict(samples)
    np.save(predictions_path, label_predictions)
    print('DONE!')
else:
    print('Bringing saved predictions ...')
    label_predictions = np.load(predictions_path)
    # A cache written for a different feature set (e.g. a sub-sampled run)
    # would silently misalign labels with file paths, so fail loudly instead.
    if label_predictions.shape[0] != samples.shape[0]:
        raise ValueError(
            'Cached predictions in %s hold %d labels but there are %d samples; '
            'delete the stale cache file and re-run this cell.'
            % (predictions_path, label_predictions.shape[0], samples.shape[0])
        )
    print('DONE!')

Predicting samples ...
DONE!


In [8]:
# Sanity check: there should be one predicted label per sample.
np.shape(label_predictions)

(101076,)

In [9]:
# Base directory of the filtered spectrograms passed to the dataset generator.
spectrograms_base_path = '/grand/projects/BirdAudio/Soundscapes/Second_Filtered_Spectrograms/'

# Build the labelled dataset under `output_dir` from the predicted labels.
# NOTE(review): the recorded output of this cell shows repeated
# "Something went wrong" messages with a blank file path for label 43 --
# confirm that `file_paths` contains no empty/padded entries.
generate_labelled_dataset(
    label_predictions,
    file_paths,
    spectrograms_base_path,
    output_dir,
    verbose=False,
)

No checkpoint. Starting from 0.
Checkpointing at every 1000 samples
Something went wrong with the following data path
Label is  43
File path is                                                                                   
To output directory in  /grand/projects/BirdAudio/Soundscapes/Labeled_Dataset/Class_43/
Something went wrong with the following data path
Label is  43
File path is                                                                                   
To output directory in  /grand/projects/BirdAudio/Soundscapes/Labeled_Dataset/Class_43/
Something went wrong with the following data path
Label is  43
File path is                                                                                   
To output directory in  /grand/projects/BirdAudio/Soundscapes/Labeled_Dataset/Class_43/
Something went wrong with the following data path
Label is  43
File path is                                                                                   
To output directory in  /grand/p