### Analysis on Individual Combinatorial Repertoire
Here we analyse the combinatorial repertoire for VVVM, who is one of the individuals with the largest count of spectrograms. In the next step we will view the UMAP, and have a look at the probabilities for segment transitions.

In [1]:
import avgn

  from tqdm.autonotebook import tqdm


In [2]:
import pandas as pd
from avgn.utils.paths import DATA_DIR

In [3]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import umap
import pandas as pd

In [4]:
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir
from avgn.signalprocessing.create_spectrogram_dataset import flatten_spectrograms
from avgn.visualization.spectrogram import draw_spec_set
from avgn.utils.paths import DATA_DIR, ensure_dir

In [5]:
# Dataset folder name; joined onto DATA_DIR when loading and saving the pickles below.
DATASET_ID = "git_repos"

In [6]:
# Sub-folder under DATA_DIR / DATASET_ID that holds the pickled dataframes.
# NOTE(review): looks like the timestamp of a dataset-creation run — confirm.
DT_ID = '2022-03-04_18-41-29'

In [7]:
# Load the segment-level dataframe for every individual and preview the first rows.
segments_pickle = DATA_DIR / DATASET_ID / DT_ID / 'walsh_magpie_gitrepos.pickle'
seg_df = pd.read_pickle(segments_pickle)
seg_df.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,indv,indvi,filename,group,location,sex,wav_loc,key,rate,specs
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0.753604,0,0,...,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,0.786865,0.835165,SH,1,0.786865,0.835165,DSSHDS,0.753604,1,0,...,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,0.855941,0.92116,DS,2,0.855941,0.92116,DSSHDS,0.753604,2,0,...,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [8]:
# Number of segments across all individuals
seg_df.shape[0]

1645

In [9]:
# Keep only the segments produced by individual VVVM and preview them.
is_vvvm = seg_df["indv"] == "VVVM"
seg_df = seg_df.loc[is_vvvm]
seg_df.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,indv,indvi,filename,group,location,sex,wav_loc,key,rate,specs
0,5.081665,5.161625,NL,1095,5.081665,5.161625,NLDS,5.081665,0,439,...,VVVM,0,RVD VVVM Aerial Predator Call series 1 051120 AM,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Aerial Predator Call series 1 051120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,5.168242,5.257137,DS,1096,5.168242,5.257137,NLDS,5.081665,1,439,...,VVVM,0,RVD VVVM Aerial Predator Call series 1 051120 AM,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Aerial Predator Call series 1 051120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,5.272444,5.380619,LH,1097,5.272444,5.380619,LHDS,5.272444,0,440,...,VVVM,0,RVD VVVM Aerial Predator Call series 1 051120 AM,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Aerial Predator Call series 1 051120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [10]:
len(seg_df)

350

In [11]:
# Order the segments by their ID. Rebinding the name (rather than `inplace=True`)
# avoids mutating a filtered slice of the full dataframe, which raises pandas'
# SettingWithCopyWarning, and gives a fresh frame for the column assignments below.
seg_df = seg_df.sort_values(by=['ID'])

### Need to create the combined segment labels:
- This follows the same approach as the UMAP on all individuals; however, in that analysis the labels were combined after the dataset had been reduced
- Here, we want all of VVVM's segments

In [12]:
## Duplicate the segment labels into a separate column so the merged label
## classes can be edited without touching the original `labels` column
seg_df['comb_labels'] = seg_df['labels'].copy()

In [13]:
## Boolean masks for the US, C and HL labels (these rows are relabelled in the next step)
current_labels = seg_df['comb_labels']
cond1 = current_labels.eq('US')
cond2 = current_labels.eq('C')
cond3 = current_labels.eq('HL')

In [14]:
## Merge label classes, based on the results of the segment UMAP analysis:
## rows matching cond1 become SH; rows matching cond2 or cond3 become LH
for mask, merged_label in ((cond1, 'SH'), (cond2, 'LH'), (cond3, 'LH')):
    seg_df.loc[mask, 'comb_labels'] = merged_label

## Balance Segment Classes
- Reduce the counts in SH, DS and LH to match the count in the smallest class (NL).
- However, we first need to remove two NL segment spectrograms that are still affected by some background noise and are therefore of lower quality. (These were not randomly selected for the dataset containing all individuals, so the issue was not found before this analysis.)

In [15]:
# Number of SH segments before balancing
seg_df.loc[seg_df['comb_labels'].eq("SH")].shape[0]

96

In [16]:
# Number of NL segments before balancing
seg_df.loc[seg_df['comb_labels'].eq("NL")].shape[0]

63

In [17]:
# Number of DS segments before balancing
seg_df.loc[seg_df['comb_labels'].eq("DS")].shape[0]

128

In [18]:
# Number of LH segments before balancing
seg_df.loc[seg_df['comb_labels'].eq("LH")].shape[0]

63

Locate the NL segments that need to be removed

In [19]:
# Subset containing only the NL segments
is_nl = seg_df["comb_labels"].eq("NL")
NLdf = seg_df.loc[is_nl]

In [20]:
# Look up the NL segment(s) from this recording to find the ID to remove
NLdf.loc[NLdf['filename'].eq("RVD VVVM Call combo 1 170220 AM")]

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,indvi,filename,group,location,sex,wav_loc,key,rate,specs,comb_labels
0,0.604888,0.646259,NL,1119,0.604888,0.646259,NLSHDS,0.604888,0,451,...,0,RVD VVVM Call combo 1 170220 AM,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Call combo 1 170220 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",NL


In [24]:
# Drop the NL segment with ID 1119, then report how many NL segments remain
keep_rows = NLdf["ID"].ne(1119)
NLdf = NLdf.loc[keep_rows]
NLdf.shape[0]

62

In [25]:
# Sanity check: this should display an empty dataframe now that ID 1119 is gone
NLdf.loc[NLdf['ID'].eq(1119)]

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,indvi,filename,group,location,sex,wav_loc,key,rate,specs,comb_labels


In [27]:
# Look up the NL segment(s) belonging to call 564 to find the ID to remove
NLdf.loc[NLdf['call_unique_num'].eq(564)]

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,indvi,filename,group,location,sex,wav_loc,key,rate,specs,comb_labels
12,43.466584,43.988484,NL,1330,43.466584,43.988484,NL,43.466584,0,564,...,0,RVD VVVM Combo Series 2 151119 AM,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Combo Series 2 151119 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",NL


In [28]:
# Drop the NL segment with ID 1330, then report how many NL segments remain
keep_rows = NLdf["ID"].ne(1330)
NLdf = NLdf.loc[keep_rows]
NLdf.shape[0]

61

Now reduce counts in the other classes

In [29]:
# Subset containing only the SH segments
is_sh = seg_df["comb_labels"].eq("SH")
SHdf = seg_df.loc[is_sh]

In [30]:
# Take a random sample of SH segment IDs, sized to match the cleaned NL class.
SH = SHdf.ID.unique().tolist()
df = pd.DataFrame(SH).astype(int)  # single column (0) holding the unique integer IDs
# Request an explicit row count instead of a hand-tuned fraction so the classes
# stay balanced if the data change, and fix the seed so that Restart & Run All
# reproduces the same subset. min() guards against asking for more rows than exist.
samps = df.sample(n=min(len(NLdf), len(df)), random_state=42)
ds = list(samps[0])
len(ds)

61

In [31]:
# Keep only the SH segments whose ID was drawn in the random sample
in_sample = SHdf['ID'].isin(ds)
SH_df = SHdf.loc[in_sample]

In [33]:
# Subset containing only the DS segments
is_ds = seg_df["comb_labels"].eq("DS")
DSdf = seg_df.loc[is_ds]

In [34]:
# Take a random sample of DS segment IDs, sized to match the cleaned NL class.
DSsegs = DSdf.ID.unique().tolist()
df = pd.DataFrame(DSsegs).astype(int)  # single column (0) holding the unique integer IDs
# Request an explicit row count instead of a hand-tuned fraction so the classes
# stay balanced if the data change, and fix the seed so that Restart & Run All
# reproduces the same subset. min() guards against asking for more rows than exist.
samps = df.sample(n=min(len(NLdf), len(df)), random_state=42)
ds = list(samps[0])
len(ds)

61

In [35]:
# Keep only the DS segments whose ID was drawn in the random sample
in_sample = DSdf['ID'].isin(ds)
DS_df = DSdf.loc[in_sample]

In [37]:
# Subset containing only the LH segments
is_lh = seg_df["comb_labels"].eq("LH")
LHdf = seg_df.loc[is_lh]

In [38]:
# Take a random sample of LH segment IDs, sized to match the cleaned NL class.
LHsegs = LHdf.ID.unique().tolist()
df = pd.DataFrame(LHsegs).astype(int)  # single column (0) holding the unique integer IDs
# Request an explicit row count instead of a hand-tuned fraction so the classes
# stay balanced if the data change, and fix the seed so that Restart & Run All
# reproduces the same subset. min() guards against asking for more rows than exist.
samps = df.sample(n=min(len(NLdf), len(df)), random_state=42)
ds = list(samps[0])
len(ds)

61

In [39]:
# Keep only the LH segments whose ID was drawn in the random sample
in_sample = LHdf['ID'].isin(ds)
LH_df = LHdf.loc[in_sample]

In [40]:
# Stack the four per-class subsets (SH, NL, DS, LH) into one dataframe for VVVM
frames = [SH_df, NLdf, DS_df, LH_df]
VVVM_df = pd.concat(frames)
VVVM_df.head(2)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,indvi,filename,group,location,sex,wav_loc,key,rate,specs,comb_labels
16,87.377496,87.420472,SH,1111,87.377496,87.420472,SHDS,87.377496,0,447,...,0,RVD VVVM Aerial Predator Call series 1 051120 AM,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Aerial Predator Call series 1 051120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH
1,0.646259,0.666844,SH,1120,0.646259,0.666844,NLSHDS,0.604888,1,451,...,0,RVD VVVM Call combo 1 170220 AM,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Call combo 1 170220 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH


Check to make sure the count is balanced

In [41]:
# SH count after balancing
VVVM_df.loc[VVVM_df['comb_labels'].eq("SH")].shape[0]

61

In [42]:
# DS count after balancing
VVVM_df.loc[VVVM_df['comb_labels'].eq("DS")].shape[0]

61

In [43]:
# NL count after balancing
VVVM_df.loc[VVVM_df['comb_labels'].eq("NL")].shape[0]

61

In [44]:
# LH count after balancing
VVVM_df.loc[VVVM_df['comb_labels'].eq("LH")].shape[0]

61

In [45]:
# Total number of segments in the balanced VVVM dataframe
VVVM_df.shape[0]

244

### Spectrograms and UMAP projection for VVVM

In [46]:
def norm(x):
    """Min-max scale an array to the range [0, 1].

    Parameters
    ----------
    x : numpy.ndarray
        Array of numeric values (e.g. a spectrogram).

    Returns
    -------
    numpy.ndarray
        ``(x - min(x)) / (max(x) - min(x))``. When every element of ``x`` is
        identical the range is zero, so an all-zero array of the same shape is
        returned rather than dividing by zero and filling the result with NaN.
    """
    lowest = np.min(x)
    value_range = np.max(x) - lowest
    shifted = x - lowest
    if value_range == 0:
        # Constant input: `shifted` is already all zeros; multiplying by 0.0
        # only promotes integer input to float, matching the normal return path.
        return shifted * 0.0
    return shifted / value_range

In [47]:
# Discard rows whose spectrogram sums to zero or less (no signal to embed)
has_signal = np.array([np.sum(spec) > 0.0 for spec in VVVM_df.specs.values])
VVVM_df = VVVM_df[has_signal]

In [48]:
# Normalise each spectrogram with norm(), flatten them with flatten_spectrograms(),
# and display the shape of the result
raw_specs = list(VVVM_df.specs.values)
specs = [norm(spec) for spec in tqdm(raw_specs)]
specs_flattened = flatten_spectrograms(specs)
np.shape(specs_flattened)

  0%|          | 0/244 [00:00<?, ?it/s]

(244, 4096)

In [49]:
# Project the flattened spectrograms with UMAP. UMAP is stochastic, so the seed
# is fixed to make the embedding (and the pickle saved below) reproducible on a
# fresh kernel run.
fit = umap.UMAP(min_dist=0.0, random_state=42, verbose=True)
# One embedding vector per spectrogram, stored as a list for use as a dataframe column
z = list(fit.fit_transform(specs_flattened))

UMAP(min_dist=0.0, verbose=True)
Construct fuzzy simplicial set
Wed Nov 16 17:42:02 2022 Finding Nearest Neighbors
Wed Nov 16 17:42:04 2022 Finished Nearest Neighbor Search
Wed Nov 16 17:42:06 2022 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Wed Nov 16 17:42:07 2022 Finished embedding


In [50]:
# Store each segment's UMAP embedding alongside its metadata
umap_coords = list(z)
VVVM_df['umap'] = umap_coords

In [51]:
# Pickle the VVVM dataframe (now including the UMAP column) into the dataset folder
output_dir = DATA_DIR / DATASET_ID / DT_ID
save_loc = output_dir / 'VVVM_UMAP.pickle'
ensure_dir(save_loc.as_posix())
VVVM_df.to_pickle(save_loc)