# Multi-Layer Perceptron Validation

In [41]:
import os
import numpy as np
import torch

from tqdm import tqdm

from datetime import datetime

from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn import preprocessing

from bring_features_and_file_paths import bring_features_and_file_paths
from bring_features_and_file_paths import from_spectrogram_path_to_BirdNET_output_path
from bring_features_and_file_paths import get_BirdNET_detections
from bring_features_and_file_paths import get_spectrogram_time_mark_in_file

In [42]:
# Both inputs live under the same VICReg output directory.
vicreg_output_root = '/projects/MultiActiveAI/Samir_VICReg_Outputs'

# Directory holding the second-pass feature vectors.
features_path = f'{vicreg_output_root}/features_second_pass'
# Saved collection of sample indices to drop, produced by the first pass.
indices_path = f'{vicreg_output_root}/indices_first_pass/exclude_indices.pth'

In [43]:
# Fixed seed so the shuffle below (and everything derived from row order:
# the truncated labelling loop and the K-fold partitions) is reproducible
# across kernel restarts.
SHUFFLE_SEED = 0

feats, fpaths = bring_features_and_file_paths(features_path, sub_sample=None)

# Drop the samples flagged during the first pass. Entries are coerced to plain
# ints so the membership test works whether the file stores Python ints,
# NumPy integers or 0-d tensors (tensors hash by identity and would never match).
exclude_indices = {int(idx) for idx in torch.load(indices_path)}
# exclude_indices = set(torch.load(indices_path)) | altered_gain_indices
include_indices = [idx for idx in range(len(feats)) if idx not in exclude_indices]
feats = feats[include_indices]
fpaths = fpaths[include_indices]

# Apply one seeded permutation to both arrays so features and paths stay aligned.
shuffle_rng = torch.Generator().manual_seed(SHUFFLE_SEED)
indices = torch.randperm(feats.size()[0], generator=shuffle_rng)
feats = feats[indices]
fpaths = fpaths[indices]

# Reduce to 384 dimensions. random_state pins the result when scikit-learn
# selects a randomized SVD solver.
# NOTE(review): PCA is fitted on all samples, including rows that later land in
# cross-validation test folds — confirm this is acceptable for the evaluation.
feats = PCA(n_components=384, random_state=SHUFFLE_SEED).fit_transform(feats)

We have 2136286 feature vectors.


In [44]:
# Maps a recorder device id to its site name. A tuple value means the device
# has two site names: detections dated before the timestamp get the first name
# in the list, all others get the second.
# NOTE(review): the timestamp is built with hour and second but no minute
# (minute defaults to 0) — confirm that 14:00:55 is the intended cut-over time.
device_to_location = dict([
    (4862, "Grassland 1"),
    (4879, "Grassland 2"),
    (23734, "Forest 1"),
    (23764, "Forest 2"),
    (23795, "Forest 3"),
    (23771, "Forest 4"),
    (23757, (datetime(2021, 7, 14, 14, 0, 55), ["Forest 3", "Forest 5"])),
    (23700, "Forest 6"),
])

In [52]:
# Field of each BirdNET detection record that is used as the class label.
label='location'

# The loop stops after processing the spectrogram whose enumerate index first
# exceeds this value; the rest of the shuffled set is left unused.
MAX_SPECTROGRAM_INDEX = 100000

vectors=[]
labels=[]
BirdNET_BASE_PATH = '/grand/projects/BirdAudio/Soundscapes/BirdNET_Output'
Spectrograms_BASE_PATH = '/grand/projects/BirdAudio/Soundscapes/Second_Filtered_Spectrograms/'
# Spectrograms_BASE_PATH = '/grand/projects/BirdAudio/Soundscapes/First_Filtered_Spectrograms'
spectrogram_duration = 9.0
for k, (fpath, feat) in enumerate(tqdm(zip(fpaths,feats), total=len(feats))):
    # Paths are stored as sequences of character codes; '~' characters are
    # stripped (presumably padding). Only used for the error report below.
    file_path = ''.join([chr(int(x)) for x in fpath]).replace('~','')
    BirdNET_PATH = from_spectrogram_path_to_BirdNET_output_path(fpath)

    try:
        start_time = get_spectrogram_time_mark_in_file(fpath, spectrogram_duration)

        interval = (start_time, start_time + spectrogram_duration)
        data_elements = get_BirdNET_detections(os.path.join(BirdNET_BASE_PATH, BirdNET_PATH), interval, confidence_threshold = 0.0)
        for data_element in data_elements:
            device=int(data_element['device'])
            # Device 23788 has no entry in device_to_location and is skipped.
            if device == 23788:
                continue
            if label=='location':
                location = device_to_location[device]
                if isinstance(location, tuple):
                    # Device with two site names: choose by detection time.
                    # NOTE(review): no minute is passed here, matching the
                    # cut-over timestamp in device_to_location — confirm that
                    # ignoring minutes is intended.
                    date = datetime(month=data_element['month'], day=data_element['day'], year=data_element['year'], hour=data_element['hour'], second=data_element['second'])
                    if date < location[0]:
                        data_element['location'] = location[1][0]
                    else:
                        data_element['location'] = location[1][1]
                else:
                    data_element['location'] = location
            # One row per detection: a spectrogram with several detections
            # contributes the same feature vector several times.
            vectors.append(np.array(feat))
            labels.append(data_element[label])

    except Exception as exc:
        # Best-effort: report the failing file and carry on. Catching Exception
        # rather than using a bare except keeps Ctrl-C / kernel interrupt working.
        print("Something was wrong with")
        print(file_path)
        print(repr(exc))

    if k > MAX_SPECTROGRAM_INDEX:
        break

vectors=np.array(vectors)
# Encode the label strings as integer positions into the sorted unique_labels.
# Done in one call so the `label` setting above is not overwritten by a loop variable.
unique_labels, labels = np.unique(np.array(labels), return_inverse=True)

  5%|▌         | 100001/1978138 [03:59<1:14:58, 417.47it/s]


In [53]:
# One row per retained BirdNET detection, one column per PCA component.
vectors.shape

(114777, 384)

In [54]:
# One integer class index per row of vectors.
labels.shape

(114777,)

In [55]:
# Class names; an entry's position in this array is the integer stored in labels.
unique_labels

array(['Forest 1', 'Forest 2', 'Forest 3', 'Forest 4', 'Forest 5',
       'Forest 6', 'Grassland 1', 'Grassland 2'], dtype='<U11')

In [56]:
# One hidden layer of 20 units with identity activation, optimised with L-BFGS.
# A previously tried variant used solver='adam' and hidden_layer_sizes=(100,)
# with the remaining settings unchanged.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(20,),
    random_state=1,
    activation='identity',
    max_iter=2000,
)

In [57]:
# Standardise every feature column and fit the classifier on the complete
# labelled set; the fitted estimator is the cell's displayed value.
scaler = preprocessing.StandardScaler()
clf.fit(scaler.fit_transform(vectors), labels)

MLPClassifier(activation='identity', alpha=1e-05, hidden_layer_sizes=(20,),
              max_iter=2000, random_state=1, solver='lbfgs')

In [58]:
# 10-fold splitter. KFold does not shuffle by default, so folds are contiguous
# row ranges; the row order comes from the permutation applied when loading.
kf = KFold(n_splits=10)

In [59]:
# Accuracy on each held-out fold, in fold order.
performances=[]
# The progress-bar total is read from the splitter so it stays correct if
# n_splits is changed.
for train_indices, test_indices in tqdm(kf.split(vectors), total=kf.get_n_splits()):
    # The scaler is fitted on the training fold only and then applied to both parts.
    scaler = preprocessing.StandardScaler().fit(vectors[train_indices])
    clf.fit(scaler.transform(vectors[train_indices]), labels[train_indices])
    performances.append(clf.score(scaler.transform(vectors[test_indices]), labels[test_indices]))

100%|██████████| 10/10 [05:45<00:00, 34.51s/it]


In [60]:
# Mean accuracy across the folds.
np.mean(performances)

0.9690617892679076

In [None]:
# Class names to keep for the restricted experiment.
# NOTE(review): these are species names, but the labelling cell above sets
# label='location', so none of them occur in unique_labels and chosen_labels
# comes out empty — presumably this cell expects a species label field; confirm.
selected_names = [
    'Blue Jay',
    'Eastern Wood-Pewee',
    'Indigo Bunting',
    # 'Ovenbird',
    # 'Scarlet Tanager',
    'No detection',
]

# Boolean row vector of shape (1, n_classes): True where a class name is selected.
mask = np.isin(unique_labels, selected_names)[np.newaxis, :]

# Integer class indices of the selected names.
chosen_labels = np.flatnonzero(mask[0])
chosen_labels

In [None]:
def find_indices(A, B):
    """Locate the elements of A whose value occurs in B.

    Returns the tuple of index arrays (one array per dimension of A) marking
    the positions where membership holds.
    """
    is_member = np.isin(A, B)
    return np.nonzero(is_member)


In [None]:
# Row positions in labels whose class index is one of chosen_labels.
# Note: this rebinds `indices`, which earlier held the shuffle permutation.
indices=find_indices(labels, chosen_labels)[0]
indices

In [None]:
# Restrict the data set to the rows selected in `indices`.
f_vectors = vectors[indices]
f_labels = labels[indices]

# Accuracy on each held-out fold of the restricted set, in fold order.
f_performances=[]
# The progress-bar total is read from the splitter so it stays correct if
# n_splits is changed.
for train_indices, test_indices in tqdm(kf.split(f_vectors), total=kf.get_n_splits()):
    # The scaler is fitted on the training fold only and then applied to both parts.
    scaler = preprocessing.StandardScaler().fit(f_vectors[train_indices])
    clf.fit(scaler.transform(f_vectors[train_indices]), f_labels[train_indices])
    f_performances.append(clf.score(scaler.transform(f_vectors[test_indices]), f_labels[test_indices]))

In [None]:
# Mean accuracy across the folds of the restricted set.
np.mean(f_performances)

In [None]:
# Shape of the second weight matrix of the most recently fitted clf; with a
# single hidden layer this is the hidden-to-output matrix.
clf.coefs_[1].shape