In this notebook, we view the UMAP projection and examine how predictor labels are distributed across clusters, using the silhouette score and the Kruskal-Wallis H test.

We calculate the silhouette score (S) and the Kruskal-Wallis H test statistic (H) per predictor to determine:
- the extent to which the predictor describes the distribution of data across clusters (S)
- whether this distribution differs significantly from what would be expected if the same projection were labelled at random (H)

# View UMAP Projection

In [1]:
from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from joblib import Parallel, delayed
import umap
import pandas as pd

In [2]:
import avgn

In [3]:
import pandas as pd
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir, FIGURE_DIR, ensure_dir
from avgn.signalprocessing.create_spectrogram_dataset import flatten_spectrograms
from avgn.visualization.spectrogram import draw_spec_set
from avgn.visualization.projections import scatter_spec
from avgn.utils.general import save_fig

In [4]:
from scipy.stats import kruskal
from sklearn.metrics import silhouette_score, silhouette_samples

In [5]:
# Dataset folder name; joined onto DATA_DIR when the segment table is loaded below.
DATASET_ID = "git_repos"

In [6]:
# Timestamp-named subfolder of DATA_DIR / DATASET_ID that holds the pickle loaded below.
DT_ID = '2022-03-04_18-41-29'

In [7]:
# Load the segment table (includes the `specs` and `umap` columns used throughout).
# NOTE: pickle files can execute arbitrary code on load; only open trusted files.
seg_df = pd.read_pickle(
    DATA_DIR / DATASET_ID / DT_ID / 'segment_df_umap_combinedtidied.pickle'
)
seg_df.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,location,sex,wav_loc,key,rate,specs,umap,comb_labels,call_lab_simp,combi_lab_simp
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0.753604,0,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[14.15081, 3.406464]",DS,DS-SH-DS,DS-SH-DS SH-LH
1,0.786865,0.835165,SH,1,0.786865,0.835165,DSSHDS,0.753604,1,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[5.388335, 10.057652]",SH,DS-SH-DS,DS-SH-DS SH-LH
2,0.855941,0.92116,DS,2,0.855941,0.92116,DSSHDS,0.753604,2,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[12.421062, 0.6991728]",DS,DS-SH-DS,DS-SH-DS SH-LH


## View projection

In [8]:
def norm(x):
    """Min-max scale ``x`` so its values span [0, 1].

    Parameters
    ----------
    x : array-like
        Numeric values, e.g. a spectrogram array.

    Returns
    -------
    Same container type as ``x - np.min(x)``
        ``(x - min) / (max - min)``. When every value in ``x`` is identical
        the span is zero; an all-zero result is returned instead of dividing
        by zero (which would yield NaN and a RuntimeWarning).
    """
    lowest = np.min(x)
    span = np.max(x) - lowest
    if span == 0:
        # Constant input: (x - lowest) is already all zeros; multiplying by
        # 0.0 promotes integer input to float while keeping shape and type.
        return (x - lowest) * 0.0
    return (x - lowest) / span

In [9]:
# Keep only the segments whose spectrogram values sum to more than zero.
has_signal = np.array([np.sum(spec) > 0.0 for spec in seg_df.specs.values])
seg_df = seg_df[has_signal]

In [10]:
# Min-max scale every spectrogram, then pass the scaled set to flatten_spectrograms.
specs = [norm(spec) for spec in tqdm(list(seg_df.specs.values))]
specs_flattened = flatten_spectrograms(specs)
np.shape(specs_flattened)

  0%|          | 0/1333 [00:00<?, ?it/s]

(1333, 4096)

### Hand Labels

In [None]:
nex = -1

# One colour per entry, handed to scatter_spec as the label palette.
color = ["midnightblue", "tab:blue", "cornflowerblue", "powderblue"]

# UMAP scatter labelled by the hand-assigned `comb_labels` column.
scatter_spec(
    np.array(seg_df['umap'].tolist()),
    specs,
    column_size=12,
    x_range=[-2, 15.25],
    y_range=[-4, 13.25],
    pal_color="hls",
    color_points=False,
    enlarge_points=0,
    figsize=(10, 10),
    range_pad=0.15,
    scatter_kwargs=dict(
        labels=seg_df.comb_labels.values,
        alpha=1,
        s=5,
        show_legend=True,
        color_palette=color,
    ),
    matshow_kwargs=dict(cmap=plt.cm.Greys),
    line_kwargs=dict(lw=0.4, ls="dashed", alpha=0.25),
    draw_lines=True,
    n_subset=1000,
    border_line_width=3,
);

# Make sure the target folder exists, then save the current figure.
# NOTE(review): save_jpg=False is passed and no other format flag is enabled;
# confirm save_fig still writes a file with these arguments.
save_loc = FIGURE_DIR / 'MS' / 'Segments UMAP' / 'UMAP_Proj_CombLabels.jpeg'
ensure_dir(save_loc.as_posix())
save_fig(save_loc, dpi=600, save_jpg=False)

In [None]:
# Silhouette score (S) of the UMAP coordinates grouped by hand label.
comblabelscore = silhouette_score(
    np.array(seg_df['umap'].tolist()),
    labels=seg_df.comb_labels.values,
)
comblabelscore

In [13]:
## K-W H test inputs (hand labels)
# Per-point silhouette coefficients under the real labels, and under a
# shuffled copy of the same labels (the chance baseline). The shuffle uses a
# fixed seed so the resulting H statistic and p-value are reproducible.
umap_coords = np.array(seg_df['umap'].tolist())
samples = silhouette_samples(umap_coords, labels=seg_df.comb_labels.values)
chance_samples = silhouette_samples(
    umap_coords,
    labels=np.random.RandomState(0).permutation(seg_df.comb_labels.values),
)

In [None]:
# Kruskal-Wallis H test comparing the two silhouette-coefficient samples.
# `samples` / `chance_samples` are overwritten in every section of this
# notebook, so run the hand-label sample cell directly before this one.
KWlabels = kruskal(samples, chance_samples)
KWlabels

In [None]:
# Start the summary table: row 'S' holds the silhouette score and row 'H' holds
# the object returned by kruskal() for the hand labels.
segmentdata = {'Test': ['S', 'H'], 'Hand Labels': [comblabelscore, KWlabels]}
df = pd.DataFrame(segmentdata)
df

### Individual
To see if caller ID has a significant effect on the data distribution (i.e. whether the data cluster by individual).

In [None]:
nex = -1

# UMAP scatter labelled by the `indv` column (caller identity).
scatter_spec(
    np.array(seg_df['umap'].tolist()),
    specs,
    column_size=12,
    x_range=[-2, 15.25],
    y_range=[-4, 13.25],
    pal_color="hls",
    color_points=False,
    enlarge_points=0,
    figsize=(10, 10),
    range_pad=0.15,
    scatter_kwargs=dict(
        labels=seg_df.indv.values,
        alpha=1,
        s=5,
        show_legend=False,
        color_palette="magma",
    ),
    matshow_kwargs=dict(cmap=plt.cm.Greys),
    line_kwargs=dict(lw=0.4, ls="dashed", alpha=0.25),
    draw_lines=True,
    n_subset=1000,
    border_line_width=3,
);

# Make sure the target folder exists, then save the current figure.
# NOTE(review): save_jpg=False is passed and no other format flag is enabled;
# confirm save_fig still writes a file with these arguments.
save_loc = FIGURE_DIR / 'MS' / 'Segments UMAP' / 'UMAP_Proj_Indv.jpeg'
ensure_dir(save_loc.as_posix())
save_fig(save_loc, dpi=600, save_jpg=False)

In [None]:
# Silhouette score (S) of the UMAP coordinates grouped by individual.
indvscore = silhouette_score(
    np.array(seg_df['umap'].tolist()),
    labels=seg_df.indv.values,
)
indvscore

In [18]:
## K-W H test inputs (individual)
# Per-point silhouette coefficients under the real labels, and under a
# shuffled copy of the same labels (the chance baseline). The shuffle uses a
# fixed seed so the resulting H statistic and p-value are reproducible.
umap_coords = np.array(seg_df['umap'].tolist())
samples = silhouette_samples(umap_coords, labels=seg_df.indv.values)
chance_samples = silhouette_samples(
    umap_coords,
    labels=np.random.RandomState(0).permutation(seg_df.indv.values),
)

In [None]:
# Kruskal-Wallis H test comparing the two silhouette-coefficient samples.
# `samples` / `chance_samples` are overwritten in every section of this
# notebook, so run the individual sample cell directly before this one.
KWindv = kruskal(samples, chance_samples)
KWindv

In [None]:
# Add the individual column to the summary table, ordered to match the table's
# 'S' (silhouette score) and 'H' (kruskal() result) rows.
df["indv"] = [indvscore, KWindv]
df

### Group

In [None]:
nex = -1

# UMAP scatter labelled by the `group` column.
scatter_spec(
    np.array(seg_df['umap'].tolist()),
    specs,
    column_size=12,
    x_range=[-2, 15.25],
    y_range=[-4, 13.25],
    pal_color="hls",
    color_points=False,
    enlarge_points=0,
    figsize=(10, 10),
    range_pad=0.15,
    scatter_kwargs=dict(
        labels=seg_df.group.values,
        alpha=1,
        s=5,
        show_legend=False,
        color_palette="viridis",
    ),
    matshow_kwargs=dict(cmap=plt.cm.Greys),
    line_kwargs=dict(lw=0.4, ls="dashed", alpha=0.25),
    draw_lines=True,
    n_subset=1000,
    border_line_width=3,
);

# Make sure the target folder exists, then save the current figure.
# NOTE(review): save_jpg=False is passed and no other format flag is enabled;
# confirm save_fig still writes a file with these arguments.
save_loc = FIGURE_DIR / 'MS' / 'Segments UMAP' / 'UMAP_Proj_Group.jpeg'
ensure_dir(save_loc.as_posix())
save_fig(save_loc, dpi=600, save_jpg=False)

In [None]:
# Silhouette score (S) of the UMAP coordinates grouped by social group.
groupscore = silhouette_score(
    np.array(seg_df['umap'].tolist()),
    labels=seg_df.group.values,
)
groupscore

In [56]:
## K-W H test inputs (group)
# Per-point silhouette coefficients under the real labels, and under a
# shuffled copy of the same labels (the chance baseline). The shuffle uses a
# fixed seed so the resulting H statistic and p-value are reproducible.
umap_coords = np.array(seg_df['umap'].tolist())
samples = silhouette_samples(umap_coords, labels=seg_df.group.values)
chance_samples = silhouette_samples(
    umap_coords,
    labels=np.random.RandomState(0).permutation(seg_df.group.values),
)

In [None]:
# Kruskal-Wallis H test comparing the two silhouette-coefficient samples.
# `samples` / `chance_samples` are overwritten in every section of this
# notebook, so run the group sample cell directly before this one.
KWgroup = kruskal(samples, chance_samples)
KWgroup

In [None]:
# Add the group column to the summary table, ordered to match the table's
# 'S' (silhouette score) and 'H' (kruskal() result) rows.
df["group"] = [groupscore, KWgroup]
df

### Study Site

In [None]:
nex = -1

# UMAP scatter labelled by the `location` column (study site).
scatter_spec(
    np.array(seg_df['umap'].tolist()),
    specs,
    column_size=12,
    x_range=[-2, 15.25],
    y_range=[-4, 13.25],
    pal_color="hls",
    color_points=False,
    enlarge_points=0,
    figsize=(10, 10),
    range_pad=0.15,
    scatter_kwargs=dict(
        labels=seg_df.location.values,
        alpha=1,
        s=5,
        show_legend=True,
        color_palette="Greens",
    ),
    matshow_kwargs=dict(cmap=plt.cm.Greys),
    line_kwargs=dict(lw=0.4, ls="dashed", alpha=0.25),
    draw_lines=True,
    n_subset=1000,
    border_line_width=3,
);

# Save under a study-site-specific name. This cell previously reused
# 'UMAP_Proj_Group.jpeg', which overwrote the figure saved by the Group section.
# NOTE(review): save_jpg=False is passed and no other format flag is enabled;
# confirm save_fig still writes a file with these arguments.
save_loc = FIGURE_DIR / 'MS' / 'Segments UMAP' / 'UMAP_Proj_Location.jpeg'
ensure_dir(save_loc.as_posix())
save_fig(save_loc, dpi=600, save_jpg=False)

In [None]:
# Silhouette score (S) of the UMAP coordinates grouped by study site.
locscore = silhouette_score(
    np.array(seg_df['umap'].tolist()),
    labels=seg_df.location.values,
)
locscore

In [62]:
## K-W H test inputs (study site)
# Per-point silhouette coefficients under the real labels, and under a
# shuffled copy of the same labels (the chance baseline). The shuffle uses a
# fixed seed so the resulting H statistic and p-value are reproducible.
umap_coords = np.array(seg_df['umap'].tolist())
samples = silhouette_samples(umap_coords, labels=seg_df.location.values)
chance_samples = silhouette_samples(
    umap_coords,
    labels=np.random.RandomState(0).permutation(seg_df.location.values),
)

In [None]:
# Kruskal-Wallis H test comparing the two silhouette-coefficient samples.
# `samples` / `chance_samples` are overwritten in every section of this
# notebook, so run the study-site sample cell directly before this one.
KWloc = kruskal(samples, chance_samples)
KWloc

In [None]:
# Add the study-site column to the summary table, ordered to match the table's
# 'S' (silhouette score) and 'H' (kruskal() result) rows.
df["study site"] = [locscore, KWloc]
df

### Sex

In [None]:
nex = -1

# UMAP scatter labelled by the `sex` column.
scatter_spec(
    np.array(seg_df['umap'].tolist()),
    specs,
    column_size=12,
    x_range=[-2, 15.25],
    y_range=[-4, 13.25],
    pal_color="hls",
    color_points=False,
    enlarge_points=0,
    figsize=(10, 10),
    range_pad=0.15,
    scatter_kwargs=dict(
        labels=seg_df.sex.values,
        alpha=1,
        s=5,
        show_legend=True,
        color_palette="Blues",
    ),
    matshow_kwargs=dict(cmap=plt.cm.Greys),
    line_kwargs=dict(lw=0.4, ls="dashed", alpha=0.25),
    draw_lines=True,
    n_subset=1000,
    border_line_width=3,
);

# Save under a sex-specific name. This cell previously reused
# 'UMAP_Proj_Group.jpeg', which overwrote the figure saved by the Group section.
# NOTE(review): save_jpg=False is passed and no other format flag is enabled;
# confirm save_fig still writes a file with these arguments.
save_loc = FIGURE_DIR / 'MS' / 'Segments UMAP' / 'UMAP_Proj_Sex.jpeg'
ensure_dir(save_loc.as_posix())
save_fig(save_loc, dpi=600, save_jpg=False)

In [None]:
# Silhouette score (S) of the UMAP coordinates grouped by sex.
sexscore = silhouette_score(
    np.array(seg_df['umap'].tolist()),
    labels=seg_df.sex.values,
)
sexscore

In [67]:
## K-W H test inputs (sex)
# Per-point silhouette coefficients under the real labels, and under a
# shuffled copy of the same labels (the chance baseline). The shuffle uses a
# fixed seed so the resulting H statistic and p-value are reproducible.
umap_coords = np.array(seg_df['umap'].tolist())
samples = silhouette_samples(umap_coords, labels=seg_df.sex.values)
chance_samples = silhouette_samples(
    umap_coords,
    labels=np.random.RandomState(0).permutation(seg_df.sex.values),
)

In [None]:
# Kruskal-Wallis H test comparing the two silhouette-coefficient samples.
# `samples` / `chance_samples` are overwritten in every section of this
# notebook, so run the sex sample cell directly before this one.
KWsex = kruskal(samples, chance_samples)
KWsex

In [None]:
# Add the sex column to the summary table, ordered to match the table's
# 'S' (silhouette score) and 'H' (kruskal() result) rows.
df["sex"] = [sexscore, KWsex]
df