# Evaluate clusters for balanced UMAP

Here, we compare the clusters in the balanced UMAP to our labelled predictors. We use the silhouette score and the Kruskal-Wallis H test (as performed in previous notebooks). 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import umap
import pandas as pd

In [2]:
import avgn

In [3]:
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir, FIGURE_DIR
from avgn.signalprocessing.create_spectrogram_dataset import flatten_spectrograms
from avgn.visualization.spectrogram import draw_spec_set
from avgn.visualization.projections import scatter_spec
from avgn.utils.general import save_fig

In [4]:
from scipy.stats import kruskal
from sklearn.metrics import silhouette_score, silhouette_samples

In [5]:
# Name of the dataset sub-folder used under DATA_DIR and FIGURE_DIR below.
DATASET_ID = 'git_repos'

In [6]:
# Timestamp-style identifier; no cell visible in this notebook reads it.
DT_ID = "2022-03-04_18-41-29"

In [7]:
# Load the balanced-UMAP segment table from the dataset folder.
# NOTE: unpickling can execute code stored in the file, so only load pickles you trust.
balanced_pickle = DATA_DIR / DATASET_ID / 'UMAP_balanced_df.pickle'
seg_df = pd.read_pickle(balanced_pickle)
seg_df.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp,umap
3,0.338289,0.439778,NL,26,0.338289,0.439778,NLSHDS,0.338289,0,11,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Excitement 2 170519 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",NL,NL-SH-DS,DS-SH-DS NL-SH-DS,"[7.156554, 7.9771276]"
0,0.020744,0.165861,NL,29,0.020744,0.165861,NLDS,0.020744,0,12,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alarm Alert series 1 100717,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",NL,NL-DS,NL-DS DS-SH-DS SH-LH,"[8.166741, 8.955216]"
3,0.349618,0.443789,NL,51,0.349618,0.443789,NLSHDS,0.349618,0,20,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alert 2 070817,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",NL,NL-SH-DS,DS-SH-DS NL-SH-DS SH-DS LH,"[6.8037395, 9.888577]"


In [10]:
# Number of rows (segments) in the loaded table.
seg_df.shape[0]

692

### Get specs

In [11]:
def norm(x):
    """Min-max scale ``x`` to the range [0, 1].

    Parameters
    ----------
    x : array-like
        Numeric values (here: one spectrogram).

    Returns
    -------
    numpy.ndarray
        ``(x - min(x)) / (max(x) - min(x))`` with the same shape as ``x``.
        When all values of ``x`` are equal the range is zero; an array of
        zeros is returned in that case instead of dividing by zero (which
        would yield NaNs and a RuntimeWarning).

    Raises
    ------
    ValueError
        If ``x`` is empty (raised by ``np.min``).
    """
    x = np.asarray(x)
    lowest = np.min(x)
    span = np.max(x) - lowest
    if span == 0:
        # Constant input: nothing to scale, avoid 0/0.
        return np.zeros_like(x, dtype=float)
    return (x - lowest) / span

In [13]:
# Min-max normalise every spectrogram with norm(); tqdm shows progress.
# NOTE(review): the table preview above lists a `specs` column; `spectrogram`
# is presumably among the columns elided by "..." - confirm.
specs = [norm(spec) for spec in tqdm(seg_df.spectrogram.values)]

  0%|          | 0/692 [00:00<?, ?it/s]

### Project UMAP

In [None]:
# UMAP scatter (avgn's scatter_spec) with points labelled by `comb_labels`,
# then saved under FIGURE_DIR / DATASET_ID / 'Figures'.
colors = ["darkslategrey", "seagreen", "olivedrab", "yellowgreen"]
nex = -1  # not read by any visible cell; kept unchanged

umap_coords = np.array(list(seg_df['umap'].values))

comb_scatter_kwargs = {
    'labels': seg_df.comb_labels.values,
    'alpha': 1,
    's': 3,
    'show_legend': True,
    "color_palette": colors,
}
comb_matshow_kwargs = {'cmap': plt.cm.Greys}
comb_line_kwargs = {'lw': 0.5, 'ls': "dashed", 'alpha': 0.25}

scatter_spec(
    umap_coords,
    specs,
    column_size=12,
    x_range=[0, 13],
    y_range=[-1, 12],
    pal_color="hls",
    color_points=False,
    enlarge_points=0,
    figsize=(10, 10),
    range_pad=0.15,
    scatter_kwargs=comb_scatter_kwargs,
    matshow_kwargs=comb_matshow_kwargs,
    line_kwargs=comb_line_kwargs,
    draw_lines=True,
    n_subset=1000,
    border_line_width=3,
);

# NOTE(review): save_loc already ends in ".jpeg" and save_jpg=True is passed;
# check whether save_fig appends its own extension to the path.
save_loc = FIGURE_DIR / DATASET_ID / 'Figures' / 'Balanced Segment UMAP.jpeg'
ensure_dir(save_loc.as_posix())
save_fig(save_loc, dpi=600, save_jpg=True)

In [None]:
# Same UMAP scatter as before, but with points labelled by `labels` and the
# "deep" colour palette. This figure is not saved.
nex = -1  # not read by any visible cell; kept unchanged

umap_points = np.array(list(seg_df['umap'].values))

label_scatter_kwargs = {
    'labels': seg_df.labels.values,
    'alpha': 1,
    's': 3,
    'show_legend': True,
    "color_palette": "deep",
}
label_matshow_kwargs = {'cmap': plt.cm.Greys}
label_line_kwargs = {'lw': 0.5, 'ls': "dashed", 'alpha': 0.25}

scatter_spec(
    umap_points,
    specs,
    column_size=12,
    x_range=[0, 13],
    y_range=[-1, 12],
    pal_color="hls",
    color_points=False,
    enlarge_points=0,
    figsize=(10, 10),
    range_pad=0.15,
    scatter_kwargs=label_scatter_kwargs,
    matshow_kwargs=label_matshow_kwargs,
    line_kwargs=label_line_kwargs,
    draw_lines=True,
    n_subset=1000,
    border_line_width=3,
);

#### Calculate Silhouette Score

In [67]:
# Mean silhouette score of the UMAP embedding for each candidate predictor.
# The embedding is converted to a 2-D array once and reused for every call.
umap_xy = np.array(list(seg_df['umap'].values))

labelscore = silhouette_score(umap_xy, labels=seg_df.labels.values)
comblabelscore = silhouette_score(umap_xy, labels=seg_df.comb_labels.values)
indvscore = silhouette_score(umap_xy, labels=seg_df.indv.values)
groupscore = silhouette_score(umap_xy, labels=seg_df.group.values)
locscore = silhouette_score(umap_xy, labels=seg_df.location.values)
sexscore = silhouette_score(umap_xy, labels=seg_df.sex.values)
callscore = silhouette_score(umap_xy, labels=seg_df.call_lab_simp.values)

In [None]:
# One-row table of the silhouette score (row "S") for each predictor.
score_by_column = [
    ('Label', labelscore),
    ('Comb_Label', comblabelscore),
    ('Indv', indvscore),
    ('Group', groupscore),
    ('Study Site', locscore),
    ('Sex', sexscore),
    ('Call', callscore),
]
d = {'Test Result': ['S']}
d.update((column, [value]) for column, value in score_by_column)
df = pd.DataFrame(d)
df

#### Calculate Kruskal-Wallis H test score

In [27]:
# Per-sample silhouette values for the true `labels` and for a shuffled-label
# chance baseline. The shuffle is seeded so the baseline (and the
# Kruskal-Wallis result computed from it) is identical on every run.
umap_xy = np.array(list(seg_df['umap'].values))
shuffle_rng = np.random.RandomState(0)
samples = silhouette_samples(umap_xy, labels=seg_df.labels.values)
chance_samples = silhouette_samples(
    umap_xy, labels=shuffle_rng.permutation(seg_df.labels.values)
)

In [None]:
# Kruskal-Wallis H test: observed vs. shuffled-label silhouette values for `labels`.
# `samples` / `chance_samples` are reassigned for every predictor, so run this
# directly after the cell that computes them for `labels`.
KWlabels = kruskal(samples, chance_samples)
KWlabels

In [30]:
# Per-sample silhouette values for the true `comb_labels` and for a
# shuffled-label chance baseline. The shuffle is seeded so the baseline (and
# the Kruskal-Wallis result computed from it) is identical on every run.
umap_xy = np.array(list(seg_df['umap'].values))
shuffle_rng = np.random.RandomState(0)
samples = silhouette_samples(umap_xy, labels=seg_df.comb_labels.values)
chance_samples = silhouette_samples(
    umap_xy, labels=shuffle_rng.permutation(seg_df.comb_labels.values)
)

In [None]:
# Kruskal-Wallis H test: observed vs. shuffled-label silhouette values for `comb_labels`.
# Uses the `samples` / `chance_samples` left by the preceding cell; run in order.
KWcomblab = kruskal(samples, chance_samples)
KWcomblab

In [32]:
# Per-sample silhouette values for the true `indv` labels and for a
# shuffled-label chance baseline. The shuffle is seeded so the baseline (and
# the Kruskal-Wallis result computed from it) is identical on every run.
umap_xy = np.array(list(seg_df['umap'].values))
shuffle_rng = np.random.RandomState(0)
samples = silhouette_samples(umap_xy, labels=seg_df.indv.values)
chance_samples = silhouette_samples(
    umap_xy, labels=shuffle_rng.permutation(seg_df.indv.values)
)

In [None]:
# Kruskal-Wallis H test: observed vs. shuffled-label silhouette values for `indv`.
# Uses the `samples` / `chance_samples` left by the preceding cell; run in order.
KWindv = kruskal(samples, chance_samples)
KWindv

In [36]:
# Per-sample silhouette values for the true `group` labels and for a
# shuffled-label chance baseline. The shuffle is seeded so the baseline (and
# the Kruskal-Wallis result computed from it) is identical on every run.
umap_xy = np.array(list(seg_df['umap'].values))
shuffle_rng = np.random.RandomState(0)
samples = silhouette_samples(umap_xy, labels=seg_df.group.values)
chance_samples = silhouette_samples(
    umap_xy, labels=shuffle_rng.permutation(seg_df.group.values)
)

In [None]:
# Kruskal-Wallis H test: observed vs. shuffled-label silhouette values for `group`.
# Uses the `samples` / `chance_samples` left by the preceding cell; run in order.
KWgroup = kruskal(samples, chance_samples)
KWgroup

In [38]:
# Per-sample silhouette values for the true `location` labels and for a
# shuffled-label chance baseline. The shuffle is seeded so the baseline (and
# the Kruskal-Wallis result computed from it) is identical on every run.
umap_xy = np.array(list(seg_df['umap'].values))
shuffle_rng = np.random.RandomState(0)
samples = silhouette_samples(umap_xy, labels=seg_df.location.values)
chance_samples = silhouette_samples(
    umap_xy, labels=shuffle_rng.permutation(seg_df.location.values)
)

In [None]:
# Kruskal-Wallis H test: observed vs. shuffled-label silhouette values for `location`.
# Uses the `samples` / `chance_samples` left by the preceding cell; run in order.
KWloc = kruskal(samples, chance_samples)
KWloc

In [40]:
# Per-sample silhouette values for the true `sex` labels and for a
# shuffled-label chance baseline. The shuffle is seeded so the baseline (and
# the Kruskal-Wallis result computed from it) is identical on every run.
umap_xy = np.array(list(seg_df['umap'].values))
shuffle_rng = np.random.RandomState(0)
samples = silhouette_samples(umap_xy, labels=seg_df.sex.values)
chance_samples = silhouette_samples(
    umap_xy, labels=shuffle_rng.permutation(seg_df.sex.values)
)

In [None]:
# Kruskal-Wallis H test: observed vs. shuffled-label silhouette values for `sex`.
# Uses the `samples` / `chance_samples` left by the preceding cell; run in order.
KWsex = kruskal(samples, chance_samples)
KWsex

In [54]:
# Per-sample silhouette values for the true `call_lab_simp` labels and for a
# shuffled-label chance baseline. The shuffle is seeded so the baseline (and
# the Kruskal-Wallis result computed from it) is identical on every run.
umap_xy = np.array(list(seg_df['umap'].values))
shuffle_rng = np.random.RandomState(0)
samples = silhouette_samples(umap_xy, labels=seg_df.call_lab_simp.values)
chance_samples = silhouette_samples(
    umap_xy, labels=shuffle_rng.permutation(seg_df.call_lab_simp.values)
)

In [None]:
# Kruskal-Wallis H test: observed vs. shuffled-label silhouette values for `call_lab_simp`.
# Uses the `samples` / `chance_samples` left by the preceding cell; run in order.
KWcall = kruskal(samples, chance_samples)
KWcall

In [None]:
# Final table: silhouette score (S) plus Kruskal-Wallis statistic and p-value
# for each predictor, one column per predictor.
per_predictor = {
    'Labels': (labelscore, KWlabels),
    'Comb_Labels': (comblabelscore, KWcomblab),
    'Indv': (indvscore, KWindv),
    'Group': (groupscore, KWgroup),
    'Study Site': (locscore, KWloc),
    'Sex': (sexscore, KWsex),
    'Call': (callscore, KWcall),
}
d = {'Test_Result': ['S', 'KW_stat', 'KW_pval']}
d.update(
    (column, [sil_score, kw_result.statistic, kw_result.pvalue])
    for column, (sil_score, kw_result) in per_predictor.items()
)
df = pd.DataFrame(d)
df

## Closer look into silhouette score

In [71]:
from avgn.clusterability.silhouette import nn, sil, plot_within_without

In [78]:
# Class labels used by the nearest-neighbour, pairwise-distance and silhouette
# cells below: the `comb_labels` column (kept as a pandas Series).
labels = seg_df['comb_labels']

In [79]:
# Stack the per-row UMAP coordinates into a single array.
embedding = np.asarray(seg_df['umap'].tolist())

In [80]:
# Nearest-neighbour statistics on the embedding using k = 5 neighbours.
knn = 5
label_array = np.asarray(labels)
nn_stats = nn(embedding, label_array, k=knn)

In [None]:
# Print both summary scores from the nearest-neighbour statistics, rounded to 3 decimals.
s_score = round(nn_stats.get_S(), 3)
print("Evaluation score S (unweighted average of same-class probability P for all classes):", s_score)
snorm_score = round(nn_stats.get_Snorm(), 3)
print("Evaluation score Snorm (unweighted average of normalized same-class probability Pnorm for all classes)::", snorm_score)

In [82]:
import seaborn as sns

## 2.3. Plot nearest neighbor metrics

In [None]:
# Heatmap of the S scores; colour scale runs 0-100 and is centred at 50.
greens_cmap = sns.color_palette("Greens", as_cmap=True)
nn_stats.plot_heat_S(
    vmin=0,            # lower end of the colour scale
    vmax=100,          # upper end of the colour scale
    center=50,         # centre of the colour scale
    cmap=greens_cmap,  # colour scheme
    cbar=None,         # colorbar is shown only if True
    outname=None,      # output filename (with path); None -> figure not saved
)

### 2.3.2. Normalized score

In [None]:
# Heatmap of the normalized score; diverging colour scale centred at 1.
fold_cmap = sns.diverging_palette(20, 145, as_cmap=True)
nn_stats.plot_heat_fold(
    center=1,        # centre of the colour scale
    cmap=fold_cmap,  # colour scheme
    cbar=None,       # colorbar is shown only if True
    outname=None,    # output filename (with path); None -> figure not saved
)

### 2.3.3. Normalized, log-transformed score

In [None]:
# Heatmap of the normalized, log-transformed score on a -13..13 colour scale.
# NOTE(review): center=1 matches the fold-change plot; for a symmetric
# -13..13 scale a centre of 0 may have been intended - confirm.
snorm_cmap = sns.diverging_palette(20, 145, as_cmap=True)
nn_stats.plot_heat_Snorm(
    vmin=-13,         # lower end of the colour scale
    vmax=13,          # upper end of the colour scale
    center=1,         # centre of the colour scale
    cmap=snorm_cmap,  # colour scheme
    cbar=None,        # colorbar is shown only if True
    outname=None,     # output filename (with path); None -> figure not saved
)

## 3. Evaluation based on pairwise distances

In [None]:
# Pairwise-distance evaluation on the embedding (Euclidean distances),
# drawn as density histograms on a 4 x 2 grid of subplots.
axis_limits = dict(xmin=0, xmax=12, ymax=0.5)    # x-axis min/max and y-axis max
grid_layout = dict(nbins=50, nrows=4, ncols=2)   # histogram bins and subplot grid
plot_within_without(
    embedding=embedding,          # latent space coordinates (2D numpy array)
    labels=labels,                # class labels (here: comb_labels)
    distance_metric='euclidean',  # any scipy distance metric is valid
    outname=None,                 # output filename (with path); None -> figure not saved
    **axis_limits,
    **grid_layout,
    density=True,                 # plot density if True, else frequency
)

## 4. Silhouette Plot

In [87]:
# Build avgn's silhouette helper from the UMAP embedding and the `comb_labels` Series.
sil_stats = sil(embedding, labels)

In [None]:
# Draw the silhouette plot; outname=None means the figure is not written to disk.
sil_stats.plot_sil(outname=None) # filename (with path) where figure will be saved. Default: None -> figure not saved

In [None]:
# Display the value returned by get_avrg_score() (presumably the mean
# silhouette score - confirm in avgn.clusterability.silhouette).
sil_stats.get_avrg_score()