### Analysis on Individual Combinatorial Repertoire
Here we analyse the combinatorial repertoire of OMGO, one of the individuals with the largest number of spectrograms. In the next step we will view the UMAP projection and examine the segment transition probabilities.

In [1]:
import avgn

  from tqdm.autonotebook import tqdm


In [2]:
import pandas as pd
from avgn.utils.paths import DATA_DIR

In [3]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed
import umap
import pandas as pd

In [4]:
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir
from avgn.signalprocessing.create_spectrogram_dataset import flatten_spectrograms
from avgn.visualization.spectrogram import draw_spec_set
from avgn.utils.paths import DATA_DIR, ensure_dir

In [12]:
# Dataset folder name; joined onto DATA_DIR when building the load and save paths below.
DATASET_ID = "git_repos"

In [13]:
# Sub-folder of the dataset that holds the pickles used here
# (looks like the timestamp of the run that produced them -- TODO confirm).
DT_ID = '2022-03-04_18-41-29'

In [15]:
# Load the pickled segment table for this dataset/run and preview the first rows.
# NOTE: unpickling executes arbitrary code -- only load files from a trusted source.
segments_path = DATA_DIR / DATASET_ID / DT_ID / 'walsh_magpie_gitrepos.pickle'
seg_df = pd.read_pickle(segments_path)
seg_df.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,unit_label,note_pos_unit,unit_unique_num,unit_pos_seq,...,indv,indvi,filename,group,location,sex,wav_loc,key,rate,specs
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0,0,0,...,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,0.786865,0.835165,SH,1,0.786865,0.835165,DSSHDS,1,0,0,...,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,0.855941,0.92116,DS,2,0.855941,0.92116,DSSHDS,2,0,0,...,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [16]:
# Total number of segments (rows) in the full table.
seg_df.shape[0]

1643

In [17]:
# Keep only the segments belonging to individual OMGO, then preview the first rows.
seg_df = seg_df[seg_df["indv"].eq("OMGO")]
seg_df.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,unit_label,note_pos_unit,unit_unique_num,unit_pos_seq,...,indv,indvi,filename,group,location,sex,wav_loc,key,rate,specs
0,12.167358,12.337176,NL,724,12.167358,12.337176,NLDS,0,300,0,...,OMGO,0,MBG OMGO Alert and Combo Series 1 200220 AM b,MBG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG OMGO Alert and Combo Series 1 200220 AM b,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,12.346987,12.395739,DS,725,12.346987,12.395739,NLDS,1,300,0,...,OMGO,0,MBG OMGO Alert and Combo Series 1 200220 AM b,MBG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG OMGO Alert and Combo Series 1 200220 AM b,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,12.607988,12.631858,SH,726,12.607988,12.631858,SHSHNLDS,0,301,1,...,OMGO,0,MBG OMGO Alert and Combo Series 1 200220 AM b,MBG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG OMGO Alert and Combo Series 1 200220 AM b,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [18]:
# Order segments by their ID. Reassign instead of sorting in place: seg_df is a
# filtered selection of the loaded table, and in-place mutation of such a selection
# can raise SettingWithCopyWarning and makes the cell harder to reason about on re-run.
seg_df = seg_df.sort_values(by=['ID'])

In [19]:
# Number of OMGO segments labelled NL.
int(seg_df["labels"].eq("NL").sum())

11

In [20]:
# Number of OMGO segments labelled SH.
int(seg_df["labels"].eq("SH").sum())

146

In [21]:
# Number of OMGO segments labelled DS.
int(seg_df["labels"].eq("DS").sum())

62

In [22]:
# Number of OMGO segments labelled LH.
int(seg_df["labels"].eq("LH").sum())

33

In [23]:
# Every OMGO segment except those labelled NL.
seg_df2 = seg_df[seg_df["labels"].ne("NL")]

The NL segment class was removed because of its very small sample size (N=11).

## Balance counts between segment classes
- We will reduce the number of SH segments (N=146) to equal the count of DS segments (N=62), rather than that of the smallest class (LH; N=33), to boost the sample size (too small a sample size hinders the effectiveness of UMAP).

In [24]:
# All SH-labelled segments (the class that will be down-sampled).
SHdf = seg_df2[seg_df2["labels"].eq("SH")]

In [25]:
# Number of SH segments before down-sampling.
SHdf.shape[0]

146

In [26]:
# Take a reproducible random sample of unique SH segment IDs, sized to the DS class.
SH = list(SHdf.ID.unique())
df = pd.DataFrame(SH).astype(int)  # one-column frame (column 0) of unique SH IDs, as integers
# Balance against the DS class: derive the target from the data instead of a
# hand-tuned fraction, and never ask for more IDs than are available.
n_target = int((seg_df2["labels"] == "DS").sum())
n_sample = min(n_target, len(df))
# Fixed seed so the same SH subset (and therefore the same downstream input) is drawn on every run.
samps = df.sample(n=n_sample, random_state=42)
ds = list(samps[0])
len(ds)

62

In [27]:
# Keep only the SH segments whose ID was drawn in the random sample, then preview.
in_sample = SHdf.ID.isin(ds)
SH_df = SHdf[in_sample]
SH_df.head(2)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,unit_label,note_pos_unit,unit_unique_num,unit_pos_seq,...,indv,indvi,filename,group,location,sex,wav_loc,key,rate,specs
11,13.294838,13.322906,SH,735,13.294838,13.322906,SHSHSHLH,2,303,3,...,OMGO,0,MBG OMGO Alert and Combo Series 1 200220 AM b,MBG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG OMGO Alert and Combo Series 1 200220 AM b,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
13,20.008098,20.031107,SH,737,20.008098,20.031107,SHNLDS,0,304,0,...,OMGO,0,MBG OMGO Alert and Combo Series 1 200220 AM b,MBG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG OMGO Alert and Combo Series 1 200220 AM b,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [28]:
# All remaining (non-SH) segments, kept in full.
noSHdf = seg_df2[seg_df2["labels"].ne("SH")]

In [29]:
#attach to dataframe with other OMGO segments
frames = [SH_df, noSHdf]
OMGO_df = pd.concat(frames)
OMGO_df[:2]

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,unit_label,note_pos_unit,unit_unique_num,unit_pos_seq,...,indv,indvi,filename,group,location,sex,wav_loc,key,rate,specs
11,13.294838,13.322906,SH,735,13.294838,13.322906,SHSHSHLH,2,303,3,...,OMGO,0,MBG OMGO Alert and Combo Series 1 200220 AM b,MBG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG OMGO Alert and Combo Series 1 200220 AM b,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
13,20.008098,20.031107,SH,737,20.008098,20.031107,SHNLDS,0,304,0,...,OMGO,0,MBG OMGO Alert and Combo Series 1 200220 AM b,MBG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG OMGO Alert and Combo Series 1 200220 AM b,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


Check that this dataframe contains the expected count of segments per class (SH = 62, DS = 62, LH = 33; 157 in total).

In [30]:
# Total number of segments in the balanced table.
OMGO_df.shape[0]

157

In [31]:
# Number of SH segments in the balanced table.
int(OMGO_df["labels"].eq("SH").sum())

62

In [32]:
# Number of DS segments in the balanced table.
int(OMGO_df["labels"].eq("DS").sum())

62

In [33]:
# Number of LH segments in the balanced table.
int(OMGO_df["labels"].eq("LH").sum())

33

## UMAP

In [34]:
# Keep only the segments whose spectrogram has a strictly positive sum.
has_signal = np.array([np.sum(spec) > 0.0 for spec in OMGO_df.specs.values])
OMGO_df = OMGO_df[has_signal]

In [36]:
def norm(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    x : array-like of numbers
        Values to rescale (e.g. one spectrogram).

    Returns
    -------
    numpy.ndarray
        ``(x - min(x)) / (max(x) - min(x))``. When every element of ``x`` is
        equal the range is zero; in that case an all-zero array is returned
        instead of dividing by zero (which would yield NaNs and a warning).
    """
    lo = np.min(x)
    span = np.max(x) - lo
    if span == 0:
        # Constant input: (x - lo) is already all zeros, so divide by 1 to keep
        # the same output dtype/shape as the regular path without producing NaN.
        span = 1
    return (x - lo) / span

In [37]:
# Min-max normalise every spectrogram on its own, then flatten the set for UMAP.
raw_specs = list(OMGO_df.specs.values)
specs = [norm(spec) for spec in tqdm(raw_specs)]
specs_flattened = flatten_spectrograms(specs)
# Display the shape of the flattened result as a sanity check.
np.shape(specs_flattened)

  0%|          | 0/157 [00:00<?, ?it/s]

(157, 4096)

In [38]:
# Embed the flattened spectrograms with UMAP.
# random_state is fixed because UMAP is stochastic: without a seed the embedding
# (and everything saved from it) changes on every run of the notebook.
fit = umap.UMAP(min_dist=0.0, verbose=True, random_state=42)
# One embedding vector per segment, in the same order as the rows of specs_flattened.
z = list(fit.fit_transform(specs_flattened))

UMAP(min_dist=0.0, verbose=True)
Construct fuzzy simplicial set
Wed Nov 16 16:58:55 2022 Finding Nearest Neighbors
Wed Nov 16 16:58:56 2022 Finished Nearest Neighbor Search
Wed Nov 16 16:58:58 2022 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Nov 16 16:58:59 2022 Finished embedding


In [43]:
# Attach each segment's UMAP coordinates as a new 'umap' column.
# .assign builds a new frame rather than writing into OMGO_df, which was produced
# by boolean indexing; setting a column on it directly can raise SettingWithCopyWarning.
OMGO_df = OMGO_df.assign(umap=list(z))

In [44]:
# Save the OMGO table to 'OMGO_UMAP.pickle' in the same
# DATA_DIR / DATASET_ID / DT_ID folder the input pickle was read from.
save_loc = DATA_DIR / DATASET_ID / DT_ID /  'OMGO_UMAP.pickle'
# ensure_dir comes from avgn.utils.paths; presumably it creates the target's
# parent directory if missing -- TODO confirm against the helper's source.
ensure_dir(save_loc.as_posix())
OMGO_df.to_pickle(save_loc)