# Make dataframe for balanced UMAP

Here, we create a dataframe with spectrogram count balanced across segment classes. As the NL segment class contains the lowest count (N = 173), the three other classes will be reduced to this amount. 

First, we create a separate dataframe for each segment class and, within each class, determine the spectrogram count per individual. For the three larger classes (SH, DS, and LH), a smaller portion of spectrograms is then randomly selected for each individual so that the total spectrogram count of the class is reduced to 173. These dataframes are then combined to create a single dataset, which will be projected into latent space using UMAP in the next step.  

In [1]:
import numpy as np
from tqdm.auto import tqdm
import pandas as pd

In [2]:
import avgn

In [3]:
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir

In [4]:
# Dataset folder name; joined onto DATA_DIR to locate the input and output pickles
DATASET_ID = "git_repos"

In [5]:
# Subfolder (under DATA_DIR / DATASET_ID) from which the input pickle is read
DT_ID = '2022-03-04_18-41-29'

In [6]:
# Load the tidied segment dataframe.
# NOTE: unpickling executes arbitrary code - only load pickles from a trusted source.
seg_df = pd.read_pickle(
    DATA_DIR / DATASET_ID / DT_ID / 'segment_df_umap_combinedtidied.pickle'
)
seg_df.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,location,sex,wav_loc,key,rate,specs,umap,comb_labels,call_lab_simp,combi_lab_simp
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0.753604,0,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[14.15081, 3.406464]",DS,DS-SH-DS,DS-SH-DS SH-LH
1,0.786865,0.835165,SH,1,0.786865,0.835165,DSSHDS,0.753604,1,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[5.388335, 10.057652]",SH,DS-SH-DS,DS-SH-DS SH-LH
2,0.855941,0.92116,DS,2,0.855941,0.92116,DSSHDS,0.753604,2,0,...,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[12.421062, 0.6991728]",DS,DS-SH-DS,DS-SH-DS SH-LH


In [7]:
# Total number of segments (rows) in the dataframe
seg_df.shape[0]

1333

In [8]:
# Show the column names available in the loaded segment dataframe
seg_df.columns

Index(['start_time', 'end_time', 'labels', 'ID', 'start_times', 'end_times',
       'call_label', 'call_start', 'seg_pos_call', 'call_unique_num',
       'call_pos_combi', 'combi_label', 'combi_start', 'seg_pos_combi',
       'combi_unique_num', 'indv', 'indvi', 'filename', 'group', 'location',
       'sex', 'wav_loc', 'key', 'rate', 'specs', 'umap', 'comb_labels',
       'call_lab_simp', 'combi_lab_simp'],
      dtype='object')

In [9]:
#drop umap column from this dataframe as we won't be using the previous UMAP projection for this dataset
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (drop("umap", 1)) is deprecated and raises a TypeError in pandas >= 2.0
seg_df = seg_df.drop(columns="umap")

What is the spectrogram count for each segment class?

In [10]:
# Number of segments labelled SH
int((seg_df["comb_labels"] == "SH").sum())

489

In [11]:
# Number of segments labelled NL
int((seg_df["comb_labels"] == "NL").sum())

173

In [12]:
# Number of segments labelled LH
int((seg_df["comb_labels"] == "LH").sum())

203

In [13]:
# Number of segments labelled DS
int((seg_df["comb_labels"] == "DS").sum())

468

Thus we need to reduce to a random selection of spectrograms for SH, LH and DS segment classes.

### Create Dataframe for SH segment class

In [14]:
# Keep only the rows whose combined label is SH, then preview the first two
SH_df = seg_df[seg_df["comb_labels"].eq("SH")]
SH_df.head(2)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,group,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp
1,0.786865,0.835165,SH,1,0.786865,0.835165,DSSHDS,0.753604,1,0,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,DS-SH-DS,DS-SH-DS SH-LH
3,0.932017,0.942875,SH,3,0.932017,0.942875,SHSHLH,0.932017,0,1,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,SH-LH,DS-SH-DS SH-LH


In [15]:
# Number of SH segments
SH_df.shape[0]

489

In [16]:
# Spectrogram count per individual for the SH class, as a one-row table
# with individuals as columns, ordered from the highest to the lowest count
top_indvs = pd.DataFrame(
    {name: [(SH_df["indv"].values == name).sum()] for name in SH_df["indv"].unique()}
)
top_indvs = top_indvs.T.sort_values(by=0, ascending=False).T
top_indvs

Unnamed: 0,OMGO,MRGO,VVOM,MOGO,MXXGRY,VVVM,MGGY,GBYM,GON XF 19-21,BMYO,RMXGMX,XF 20-21,RMYG,PMXRMX,MOOO,SCL XF 20-21,MXWW,SCL XM 20-21,GON XM 20-21
0,87,62,46,41,37,36,30,30,19,18,15,14,13,10,9,9,8,4,1


In [17]:
# Individuals with more than 10 SH segments (in order of first appearance);
# these are the ones that will be subsampled
indv_reduce = [
    name for name in SH_df["indv"].unique() if (SH_df["indv"] == name).sum() > 10
]
reduce = np.asarray(indv_reduce)
reduce

array(['MGGY', 'MRGO', 'VVOM', 'BMYO', 'GON XF 19-21', 'GBYM', 'MOGO',
       'XF 20-21', 'OMGO', 'RMYG', 'RMXGMX', 'VVVM', 'MXXGRY'],
      dtype='<U12')

In [20]:
#Randomly select 2 individuals to have a count of 11 (other top indvs will have count of 10)
# replace=False guarantees two *distinct* individuals; the default (replace=True)
# can return the same individual twice, which would leave the class one segment short.
# This also matches the draw used for the DS class.
np.random.choice(reduce, 2, replace=False)

array(['GBYM', 'MGGY'], dtype='<U4')

In [21]:
# Individuals (count > 10) that will be subsampled to 10 SH segments each.
# NOTE(review): both lists are typed in by hand; `eleven` corresponds to the draw
# displayed by the previous cell and must be updated manually if that draw changes.
ten = ['MRGO', 'VVOM', 'BMYO', 'GON XF 19-21', 'MOGO',
       'XF 20-21', 'OMGO', 'RMYG', 'RMXGMX', 'VVVM', 'MXXGRY']
# The two individuals that will keep 11 SH segments each
eleven = ['MGGY', 'GBYM']

In [22]:
# Split the SH segments by target quota: individuals reduced to 10 vs. to 11
SH_10_df = SH_df.loc[SH_df["indv"].isin(ten)]
SH_11_df = SH_df.loc[SH_df["indv"].isin(eleven)]

In [23]:
# SH segments belonging to the "reduce to 10" individuals (before subsampling)
SH_10_df.shape[0]

388

In [24]:
# SH segments belonging to the "reduce to 11" individuals (before subsampling)
SH_11_df.shape[0]

60

In [25]:
#reduce all individuals in one group to a count of 10
# A seeded generator makes the selection reproducible under Restart & Run All
# (the seed value is arbitrary). One generator is shared by all individuals so
# that every individual gets an independent draw.
rng = np.random.RandomState(0)
SH_ten_dfs = []
for indv in np.sort(SH_10_df.indv.unique()): #one subset dataframe per individual, in sorted order
    indv_df = SH_10_df[SH_10_df.indv == indv]
    seg_ids = pd.Series(indv_df.ID.unique()).astype(int) #unique segment IDs of this individual
    keep_ids = seg_ids.sample(n=10, random_state=rng) #raises ValueError if fewer than 10 IDs are available
    #keep the rows whose ID was drawn (original row order is preserved)
    SH_ten_dfs.append(indv_df.loc[indv_df.ID.isin(keep_ids.tolist())])
SH_ten = pd.concat(SH_ten_dfs) #combine subset dataframes for all indvs

In [26]:
# Number of SH segments kept for the "10" group
SH_ten.shape[0]

110

In [27]:
# Sanity check: one column per individual with its segment count (each should be 10)
pd.DataFrame(
    {name: [(SH_ten["indv"].values == name).sum()] for name in SH_ten["indv"].unique()}
)

Unnamed: 0,BMYO,GON XF 19-21,MOGO,MRGO,MXXGRY,OMGO,RMXGMX,RMYG,VVOM,VVVM,XF 20-21
0,10,10,10,10,10,10,10,10,10,10,10


In [28]:
#reduce the two randomly selected individuals to a count of 11
# A seeded generator makes the selection reproducible under Restart & Run All
# (the seed value is arbitrary). One generator is shared by all individuals so
# that every individual gets an independent draw.
rng = np.random.RandomState(0)
SH_11_dfs = []
for indv in np.sort(SH_11_df.indv.unique()): #one subset dataframe per individual, in sorted order
    indv_df = SH_11_df[SH_11_df.indv == indv]
    seg_ids = pd.Series(indv_df.ID.unique()).astype(int) #unique segment IDs of this individual
    keep_ids = seg_ids.sample(n=11, random_state=rng) #raises ValueError if fewer than 11 IDs are available
    #keep the rows whose ID was drawn (original row order is preserved)
    SH_11_dfs.append(indv_df.loc[indv_df.ID.isin(keep_ids.tolist())])
SH_11 = pd.concat(SH_11_dfs) #combine subset dataframes for all indvs

In [29]:
# Number of SH segments kept for the "11" group
SH_11.shape[0]

22

In [30]:
# Sanity check: one column per individual with its segment count (each should be 11)
pd.DataFrame(
    {name: [(SH_11["indv"].values == name).sum()] for name in SH_11["indv"].unique()}
)

Unnamed: 0,GBYM,MGGY
0,11,11


In [31]:
#create dataframe containing individuals whose spec count was not changed
##i.e. every individual that was NOT subsampled above (count of 10 or fewer)
# The exclusion list is built from `ten` and `eleven` rather than re-typed, so the
# three parts (other / ten / eleven) cannot drift apart and double-count or drop
# an individual.
other_df = SH_df[~SH_df["indv"].isin(ten + eleven)]
other_df.indv.unique()

array(['GON XM 20-21', 'MOOO', 'PMXRMX', 'MXWW', 'SCL XF 20-21',
       'SCL XM 20-21'], dtype=object)

In [32]:
# Stack the untouched individuals with the two subsampled groups into the
# reduced SH dataframe
frames = [other_df, SH_ten, SH_11]
SH_df2 = pd.concat(frames, axis=0)
SH_df2

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,group,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp
2,0.199272,0.223605,SH,451,0.199272,0.223605,SHLH,0.199272,0,187,...,GON,CRAWLEY,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,GON XFL Beg Series 1 XM Discrete Combo Series ...,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,SH-LH,NL-DS SH-LH
2,0.964125,1.061754,SH,644,0.964125,1.061754,SHDS,0.964125,0,266,...,KMO,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,KMO XXXXF Call Combo 5 180619 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,SH-DS,NL-DS SH-DS
0,0.701188,0.746621,SH,646,0.701188,0.746621,SHDS,0.701188,0,267,...,KMO,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,KMO XXXXF Call combo 6 180619 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,SH-DS,SH-DS LH-DS
0,0.561553,0.594878,SH,650,0.561553,0.594878,SHDS,0.561553,0,269,...,KMO,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,KMO XXXXF Call combo 7 180619 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,SH-DS,SH-DS SH-LH-DS
2,0.730400,0.748173,US,652,0.730400,0.748173,USLHDS,0.730400,0,270,...,KMO,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,KMO XXXXF Call combo 7 180619 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,SH-LH-DS,SH-DS SH-LH-DS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,0.790223,0.815939,SH,62,0.790223,0.815939,NLSHDS,0.695085,1,24,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alert 3 070817,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,NL-SH-DS,NL-DS-SH-DS NL-SH-DS SH-DS LH
2,0.614766,0.657158,SH,69,0.614766,0.657158,SHDS,0.614766,0,28,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alert 8 070817,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,SH-DS,NL-DS SH-DS NL-SH-DS SH-DS SH-LH
7,1.064164,1.082719,SH,74,1.064164,1.082719,SHSHDS,1.064164,0,30,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alert 8 070817,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,SH-DS,NL-DS SH-DS NL-SH-DS SH-DS SH-LH
8,1.088034,1.131887,SH,75,1.088034,1.131887,SHSHDS,1.064164,1,30,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alert 8 070817,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",SH,SH-DS,NL-DS SH-DS NL-SH-DS SH-DS SH-LH


In [33]:
# Size of the reduced SH class
SH_df2.shape[0]

173

In [34]:
# Sanity check: segment count per individual in the reduced SH dataframe
pd.DataFrame(
    {name: [(SH_df2["indv"].values == name).sum()] for name in SH_df2["indv"].unique()}
)

Unnamed: 0,GON XM 20-21,MOOO,PMXRMX,MXWW,SCL XF 20-21,SCL XM 20-21,BMYO,GON XF 19-21,MOGO,MRGO,MXXGRY,OMGO,RMXGMX,RMYG,VVOM,VVVM,XF 20-21,GBYM,MGGY
0,1,9,10,8,9,4,10,10,10,10,10,10,10,10,10,10,10,11,11


### Create dataframe for DS segments

In [65]:
# Keep only the rows whose combined label is DS, then preview the first two
DS_df = seg_df[seg_df["comb_labels"].eq("DS")]
DS_df.head(2)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,group,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0.753604,0,0,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS-SH-DS,DS-SH-DS SH-LH
2,0.855941,0.92116,DS,2,0.855941,0.92116,DSSHDS,0.753604,2,0,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS-SH-DS,DS-SH-DS SH-LH


In [66]:
# Number of DS segments
DS_df.shape[0]

468

In [67]:
# DS segment count per individual - first 15 individuals only, to keep the table readable
pd.DataFrame(
    {name: [(DS_df["indv"].values == name).sum()] for name in DS_df["indv"].unique()[:15]}
)

Unnamed: 0,MGGY,MRGO,MOVY,VVOM,BMYO,RYMR,MORP,GON XF 19-21,GON XM 20-21,GBYM,MOGO,MOOO,MXXBYX,PMXRMX,XF 20-21
0,31,41,10,16,43,7,9,9,5,18,36,13,1,6,10


In [68]:
# DS segment count per individual - the individuals after the first 15
pd.DataFrame(
    {name: [(DS_df["indv"].values == name).sum()] for name in DS_df["indv"].unique()[15:]}
)

Unnamed: 0,OMGO,RMYG,RMXGMX,VVVM,MXWW,SCL XF 20-21,SCL XM 20-21,MXXGRY
0,38,16,30,50,22,15,6,36


In [69]:
# Individuals with more than 8 DS segments (in order of first appearance) need reducing:
# 14 of them to a count of 8 and 4 of them to a count of 9
indv_reduce = [
    name for name in DS_df["indv"].unique() if (DS_df["indv"] == name).sum() > 8
]
reduce = np.asarray(indv_reduce)
reduce

array(['MGGY', 'MRGO', 'MOVY', 'VVOM', 'BMYO', 'MORP', 'GON XF 19-21',
       'GBYM', 'MOGO', 'MOOO', 'XF 20-21', 'OMGO', 'RMYG', 'RMXGMX',
       'VVVM', 'MXWW', 'SCL XF 20-21', 'MXXGRY'], dtype='<U12')

In [74]:
# Draw, without replacement, the 4 individuals that keep 9 DS segments
# (every other individual in `reduce` keeps 8)
np.random.choice(reduce, size=4, replace=False)

array(['MOVY', 'MXWW', 'MGGY', 'RMYG'], dtype='<U4')

In [76]:
# Individuals (count > 8) that will be subsampled to 8 DS segments each.
# NOTE(review): both lists are typed in by hand; `nine` contains the same four names
# as the draw displayed by the previous cell and must be updated manually if that draw changes.
eight = ['MRGO', 'VVOM', 'BMYO', 'MORP', 'GON XF 19-21',
       'GBYM', 'MOGO', 'MOOO', 'XF 20-21', 'OMGO', 'RMXGMX',
       'VVVM', 'SCL XF 20-21', 'MXXGRY']
# The four individuals that will keep 9 DS segments each
nine = ['MOVY', 'MGGY', 'RMYG', 'MXWW']

In [77]:
# Split the DS segments by target quota: individuals reduced to 8 vs. to 9
DS_8_df = DS_df.loc[DS_df["indv"].isin(eight)]
DS_9_df = DS_df.loc[DS_df["indv"].isin(nine)]

In [78]:
# DS segments belonging to the "reduce to 8" individuals (before subsampling)
DS_8_df.shape[0]

364

In [79]:
# DS segments belonging to the "reduce to 9" individuals (before subsampling)
DS_9_df.shape[0]

79

In [80]:
#reduce all individuals in one group to count of 8
# A seeded generator makes the selection reproducible under Restart & Run All
# (the seed value is arbitrary). One generator is shared by all individuals so
# that every individual gets an independent draw.
rng = np.random.RandomState(0)
DS_8_dfs = []
for indv in np.sort(DS_8_df.indv.unique()): #one subset dataframe per individual, in sorted order
    indv_df = DS_8_df[DS_8_df.indv == indv]
    seg_ids = pd.Series(indv_df.ID.unique()).astype(int) #unique segment IDs of this individual
    keep_ids = seg_ids.sample(n=8, random_state=rng) #raises ValueError if fewer than 8 IDs are available
    #keep the rows whose ID was drawn (original row order is preserved)
    DS_8_dfs.append(indv_df.loc[indv_df.ID.isin(keep_ids.tolist())])
DS_8 = pd.concat(DS_8_dfs) #combine subset dataframes
DS_8

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,group,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp
2,0.540375,0.593466,DS,304,0.540375,0.593466,DS,0.540375,0,117,...,FMRa,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMRa BMYO Call Combo 1 180120 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS,NL-DS DS
1,0.262831,0.294904,DS,309,0.262831,0.294904,NLDS,0.158913,1,120,...,FMRa,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMRa BMYO Call Combo 1 271019 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,NL-DS,NL-DS DS LH-DS
6,1.586613,1.652426,DS,319,1.586613,1.652426,SHSHDSSHLH,1.470229,2,125,...,FMRa,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMRa BMYO Call Combo Series 1 210120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS-SH-LH,NL-DS DS SH-DS-SH-LH
12,6.096868,6.206625,DS,325,6.096868,6.206625,DSDS,6.011257,1,127,...,FMRa,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMRa BMYO Call Combo Series 1 210120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS,NL-DS DS SH-DS-SH-LH
1,0.409920,0.439621,DS,372,0.409920,0.439621,SHDS,0.345914,1,145,...,FMRa,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMRa BMYO Discrete Series 1 Warble 1 Carol 1 C...,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS,SH-DS SH-DS-LH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,3.441890,3.510501,DS,707,3.441890,3.510501,SHDS,3.362798,1,292,...,LHP,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,LHP XF Discrete 1 Combo 1 021220 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS,SH-DS SH-DS-SH-LH
1,10.313193,10.382307,DS,713,10.313193,10.382307,SHDS,10.261358,1,294,...,LHP,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,LHP XF Discrete Series 1 Combo 1 161220 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS,SH-DS LH
3,10.602861,10.653680,DS,715,10.602861,10.653680,SHDS,10.497158,1,295,...,LHP,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,LHP XF Discrete Series 1 Combo 1 161220 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS,SH-DS LH
3,22.838995,22.906088,DS,720,22.838995,22.906088,NLSHDS,22.739076,2,298,...,LHP,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,LHP XF Discrete Series 2 Combo 2 161220 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,NL-SH-DS,NL NL-SH-DS SH-DS-LH


In [81]:
# Number of DS segments kept for the "8" group
DS_8.shape[0]

112

In [82]:
# Sanity check: one column per individual with its segment count (each should be 8)
pd.DataFrame(
    {name: [(DS_8["indv"].values == name).sum()] for name in DS_8["indv"].unique()}
)

Unnamed: 0,BMYO,GBYM,GON XF 19-21,MOGO,MOOO,MORP,MRGO,MXXGRY,OMGO,RMXGMX,SCL XF 20-21,VVOM,VVVM,XF 20-21
0,8,8,8,8,8,8,8,8,8,8,8,8,8,8


In [83]:
#reduce 4 individuals in other group to count of 9
# A seeded generator makes the selection reproducible under Restart & Run All
# (the seed value is arbitrary). One generator is shared by all individuals so
# that every individual gets an independent draw.
rng = np.random.RandomState(0)
DS_9_dfs = []
for indv in np.sort(DS_9_df.indv.unique()): #one subset dataframe per individual, in sorted order
    indv_df = DS_9_df[DS_9_df.indv == indv]
    seg_ids = pd.Series(indv_df.ID.unique()).astype(int) #unique segment IDs of this individual
    keep_ids = seg_ids.sample(n=9, random_state=rng) #raises ValueError if fewer than 9 IDs are available
    #keep the rows whose ID was drawn (original row order is preserved)
    DS_9_dfs.append(indv_df.loc[indv_df.ID.isin(keep_ids.tolist())])
DS_9 = pd.concat(DS_9_dfs) #combine subset dataframes
DS_9

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,group,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp
0,0.753604,0.776773,DS,0,0.753604,0.776773,DSSHDS,0.753604,0,0,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS-SH-DS,DS-SH-DS SH-LH
2,1.354234,1.420183,DS,11,1.354234,1.420183,DSSHDS,1.244022,2,4,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Discrete 1 300719 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS-SH-DS,DS-SH-DS LH
2,0.399467,0.469851,DS,15,0.399467,0.469851,DSSHDS,0.30633,2,6,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Discrete 4 210519 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS-SH-DS,DS-SH-DS SH-LH
1,0.170349,0.239915,DS,30,0.170349,0.239915,NLDS,0.020744,1,12,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alarm Alert series 1 100717,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,NL-DS,NL-DS DS-SH-DS SH-LH
2,0.735856,0.758296,DS,31,0.735856,0.758296,DSSHDS,0.735856,0,13,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alarm Alert series 1 100717,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS-SH-DS,NL-DS DS-SH-DS SH-LH
0,0.2508,0.300041,DS,36,0.2508,0.300041,DSSHDS,0.2508,0,15,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alert 1 070817,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS-SH-DS,DS-SH-DS SH-LH
1,0.119913,0.151197,DS,42,0.119913,0.151197,SHDSSHDS,0.095932,1,17,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alert 1 250617,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS-SH-DS,SH-DS-SH-DS SH-LH
3,0.20735,0.2786,DS,44,0.20735,0.2786,SHDSSHDS,0.095932,3,17,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alert 1 250617,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS-SH-DS,SH-DS-SH-DS SH-LH
8,1.036989,1.096309,DS,65,1.036989,1.096309,SHDS,0.988646,1,25,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MGGY Alert 3 070817,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS,NL-DS-SH-DS NL-SH-DS SH-DS LH
1,0.289444,0.380955,DS,205,0.289444,0.380955,NLDS,0.185134,1,76,...,CPC,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,CPC MOVY Call Combo 2 230120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,NL-DS,NL-DS LH-DS


In [84]:
# Number of DS segments kept for the "9" group
DS_9.shape[0]

36

In [85]:
# Sanity check: one column per individual with its segment count (each should be 9)
pd.DataFrame(
    {name: [(DS_9["indv"].values == name).sum()] for name in DS_9["indv"].unique()}
)

Unnamed: 0,MGGY,MOVY,MXWW,RMYG
0,9,9,9,9


In [86]:
##combine dataframes
#individuals whose spec count was not changed, i.e. every individual that was
#NOT subsampled above (count of 8 or fewer)
# The exclusion list is built from `eight` and `nine` rather than re-typed, so the
# three parts (other / eight / nine) cannot drift apart and double-count or drop
# an individual.
other_df = DS_df[~DS_df["indv"].isin(eight + nine)]
other_df.indv.unique()

array(['RYMR', 'GON XM 20-21', 'MXXBYX', 'PMXRMX', 'SCL XM 20-21'],
      dtype=object)

In [87]:
# Stack the untouched individuals with the two subsampled groups into the
# reduced DS dataframe
frames = [other_df, DS_8, DS_9]
DS_df2 = pd.concat(frames, axis=0)
DS_df2

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,group,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp
1,0.289004,0.317107,DS,387,0.289004,0.317107,NLDS,0.233286,1,153,...,FMRa,GUILDFORD,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMR RYMR Call Combo 1 210519,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,NL-DS,NL-DS DS LH-DS
2,0.404882,0.490733,DS,388,0.404882,0.490733,DS,0.404882,0,154,...,FMRa,GUILDFORD,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMR RYMR Call Combo 1 210519,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS,NL-DS DS LH-DS
4,0.800484,0.860156,DS,390,0.800484,0.860156,LHDS,0.545014,1,155,...,FMRa,GUILDFORD,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMR RYMR Call Combo 1 210519,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,LH-DS,NL-DS DS LH-DS
0,0.100229,0.157829,DS,391,0.100229,0.157829,DS,0.100229,0,156,...,FMRa,GUILDFORD,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMRa RYMR Call Combo 1 Warble 1 111120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,DS,DS LH-DS
2,0.428486,0.451712,DS,393,0.428486,0.451712,LHDS,0.247945,1,157,...,FMRa,GUILDFORD,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,FMRa RYMR Call Combo 1 Warble 1 111120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,LH-DS,DS LH-DS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,118.715823,118.775991,DS,999,118.715823,118.775991,SHDSLH,118.635227,1,390,...,MBG,CRAWLEY,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG RMYG MOBX Discrete Combo Series 1 121120,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS-LH,SH-DS SH-DS-LH
19,210.701917,210.801211,DS,1005,210.701917,210.801211,NLDS,210.661271,1,393,...,MBG,CRAWLEY,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG RMYG MOBX Discrete Combo Series 1 121120,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,NL-DS,NL-DS LH
24,269.138047,269.222738,DS,1010,269.138047,269.222738,SHDSSHSHSHLH,269.097615,1,396,...,MBG,CRAWLEY,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG RMYG MOBX Discrete Combo Series 1 121120,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,SH-DS-SH-LH,NL-DS SH-DS-SH-LH
30,315.641010,315.733162,DS,1016,315.641010,315.733162,NLDS,315.391817,1,397,...,MBG,CRAWLEY,M,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,MBG RMYG MOBX Discrete Combo Series 1 121120,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",DS,NL-DS,NL-DS SH-DS-LH


In [88]:
# Sanity check: segment count per individual in the reduced DS dataframe
pd.DataFrame(
    {name: [(DS_df2["indv"].values == name).sum()] for name in DS_df2["indv"].unique()}
)

Unnamed: 0,RYMR,GON XM 20-21,MXXBYX,PMXRMX,SCL XM 20-21,BMYO,GBYM,GON XF 19-21,MOGO,MOOO,...,OMGO,RMXGMX,SCL XF 20-21,VVOM,VVVM,XF 20-21,MGGY,MOVY,MXWW,RMYG
0,7,5,1,6,6,8,8,8,8,8,...,8,8,8,8,8,8,9,9,9,9


In [89]:
# Size of the reduced DS class
DS_df2.shape[0]

173

### Create dataframe of LH segments

In [91]:
# Keep only the rows whose combined label is LH, then preview the first two
LH_df = seg_df[seg_df["comb_labels"].eq("LH")]
LH_df.head(2)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,group,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp
5,0.980101,1.36713,LH,5,0.980101,1.36713,SHSHLH,0.932017,2,1,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,SH-LH,DS-SH-DS SH-LH
2,1.48208,1.984961,HL,8,1.48208,1.984961,USHL,1.442686,1,3,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 300719 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,SH-LH,DS SH-LH


In [92]:
# Number of LH segments
LH_df.shape[0]

203

In [93]:
# LH segment count per individual - first 15 individuals only
pd.DataFrame(
    {name: [(LH_df["indv"].values == name).sum()] for name in LH_df["indv"].unique()[:15]}
)

Unnamed: 0,MGGY,MRGO,MOVY,VVOM,BMYO,RYMR,MORP,GON XF 19-21,GON XM 20-21,GBYM,MOGO,MOOO,PMXRMX,XF 20-21,OMGO
0,11,12,5,15,13,3,4,6,1,16,9,6,3,5,21


In [94]:
# LH segment count per individual - the individuals after the first 15
pd.DataFrame(
    {name: [(LH_df["indv"].values == name).sum()] for name in LH_df["indv"].unique()[15:]}
)

Unnamed: 0,RMYG,RMXGMX,VVVM,MXWW,SCL XF 20-21,SCL XM 20-21,MXXGRY
0,8,11,25,4,5,2,18


In [95]:
# Individuals with more than 13 LH segments (in order of first appearance);
# each of them will be reduced to 13
indv_reduce = [
    name for name in LH_df["indv"].unique() if (LH_df["indv"] == name).sum() > 13
]
reduce = np.asarray(indv_reduce)
reduce

array(['VVOM', 'GBYM', 'OMGO', 'VVVM', 'MXXGRY'], dtype='<U6')

In [96]:
# LH segments of the individuals that have to be reduced
LH_red_df = LH_df.loc[LH_df["indv"].isin(reduce)]

In [97]:
# LH segments belonging to the individuals to be reduced (before subsampling)
LH_red_df.shape[0]

95

In [98]:
#reduce all individuals in one group to count of 13
# A seeded generator makes the selection reproducible under Restart & Run All
# (the seed value is arbitrary). One generator is shared by all individuals so
# that every individual gets an independent draw.
rng = np.random.RandomState(0)
LH_red_dfs = []
for indv in np.sort(LH_red_df.indv.unique()): #one subset dataframe per individual, in sorted order
    indv_df = LH_red_df[LH_red_df.indv == indv]
    seg_ids = pd.Series(indv_df.ID.unique()).astype(int) #unique segment IDs of this individual
    keep_ids = seg_ids.sample(n=13, random_state=rng) #raises ValueError if fewer than 13 IDs are available
    #keep the rows whose ID was drawn (original row order is preserved)
    LH_red_dfs.append(indv_df.loc[indv_df.ID.isin(keep_ids.tolist())])
LH_red = pd.concat(LH_red_dfs) #combine subset dataframes
LH_red

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,group,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp
2,3.912987,4.447504,HL,461,3.912987,4.447504,HL,3.912987,0,193,...,JOG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,JOG GBYM Combo Discrete Series 1 261120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,LH,NL-DS LH
5,10.235944,10.700218,HL,464,10.235944,10.700218,HL,10.235944,0,195,...,JOG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,JOG GBYM Combo Discrete Series 1 261120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,LH,NL-DS LH
11,50.337574,50.881422,HL,470,50.337574,50.881422,HL,50.337574,0,199,...,JOG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,JOG GBYM Combo Discrete Series 1 261120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,LH,NL-DS LH
14,61.496020,62.039867,HL,473,61.496020,62.039867,HL,61.496020,0,201,...,JOG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,JOG GBYM Combo Discrete Series 1 261120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,LH,NL-DS LH
17,66.176934,66.706513,HL,476,66.176934,66.706513,HL,66.176934,0,203,...,JOG,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,JOG GBYM Combo Discrete Series 1 261120 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,LH,NL-DS LH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8,109.519684,109.703527,LH,1283,109.519684,109.703527,LHDS,109.519684,0,539,...,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Combo Series 1 151119 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,LH-DS,NL SH-DS LH-DS
33,162.512947,162.656729,LH,1308,162.512947,162.656729,LHDS,162.512947,0,553,...,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Combo Series 1 151119 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,LH-DS,NL SH-DS LH-DS
38,166.516193,166.679976,LH,1313,166.516193,166.679976,LHDS,166.516193,0,556,...,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Combo Series 1 151119 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,LH-DS,NL SH-DS LH-DS
41,76.918871,77.243977,LH,1361,76.918871,77.243977,SHDSSHLH,76.741670,3,577,...,RVD,GUILDFORD,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,RVD VVVM Combo Series 2 151119 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,SH-DS-SH-LH,NL-SH-DS SH-DS-SH-LH


In [99]:
# Number of LH segments kept for the reduced individuals
LH_red.shape[0]

65

In [100]:
# Sanity check: one column per individual with its segment count (each should be 13)
pd.DataFrame(
    {name: [(LH_red["indv"].values == name).sum()] for name in LH_red["indv"].unique()}
)

Unnamed: 0,GBYM,MXXGRY,OMGO,VVOM,VVVM
0,13,13,13,13,13


In [101]:
##combine dataframes
#individuals whose spec count was not changed, i.e. without any individuals with a count over 13
# The excluded individuals are read from LH_red_df (the frame that was actually
# subsampled) rather than from a re-typed list, so the two parts cannot drift apart.
other_df = LH_df[~LH_df["indv"].isin(LH_red_df["indv"].unique())]
other_df.indv.unique()

array(['MGGY', 'MRGO', 'MOVY', 'BMYO', 'RYMR', 'MORP', 'GON XF 19-21',
       'GON XM 20-21', 'MOGO', 'MOOO', 'PMXRMX', 'XF 20-21', 'RMYG',
       'RMXGMX', 'MXWW', 'SCL XF 20-21', 'SCL XM 20-21'], dtype=object)

In [102]:
# Stack the untouched individuals with the subsampled group into the
# reduced LH dataframe and preview the first three rows
frames = [other_df, LH_red]
LH_df2 = pd.concat(frames, axis=0)
LH_df2.head(3)

Unnamed: 0,start_time,end_time,labels,ID,start_times,end_times,call_label,call_start,seg_pos_call,call_unique_num,...,group,location,sex,wav_loc,key,rate,specs,comb_labels,call_lab_simp,combi_lab_simp
5,0.980101,1.36713,LH,5,0.980101,1.36713,SHSHLH,0.932017,2,1,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,SH-LH,DS-SH-DS SH-LH
2,1.48208,1.984961,HL,8,1.48208,1.984961,USHL,1.442686,1,3,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 300719 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,SH-LH,DS SH-LH
3,1.471596,1.897371,LH,12,1.471596,1.897371,LH,1.471596,0,5,...,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Discrete 1 300719 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",LH,LH,DS-SH-DS LH


In [103]:
# Size of the reduced LH class
LH_df2.shape[0]

173

In [104]:
# Sanity check: segment count per individual in the reduced LH dataframe
pd.DataFrame(
    {name: [(LH_df2["indv"].values == name).sum()] for name in LH_df2["indv"].unique()}
)

Unnamed: 0,MGGY,MRGO,MOVY,BMYO,RYMR,MORP,GON XF 19-21,GON XM 20-21,MOGO,MOOO,...,RMYG,RMXGMX,MXWW,SCL XF 20-21,SCL XM 20-21,GBYM,MXXGRY,OMGO,VVOM,VVVM
0,11,12,5,13,3,4,6,1,9,6,...,8,11,4,5,2,13,13,13,13,13


#### Combine frames

In [106]:
# Everything that is not an SH, DS or LH segment (the class that is left unreduced)
is_reduced_class = seg_df["comb_labels"].isin(["SH", "DS", "LH"])
other_df = seg_df[~is_reduced_class]
other_df.comb_labels.unique()

array(['NL'], dtype=object)

In [107]:
# Stack the unreduced segments with the three down-sampled classes
frames = [other_df, SH_df2, DS_df2, LH_df2]
seg_df2 = pd.concat(frames)

# Guard before export: the per-individual quotas above are hard-coded, so verify
# that every segment class really ends up with the same number of spectrograms.
class_counts = seg_df2["comb_labels"].value_counts()
assert class_counts.nunique() == 1, "segment classes are not balanced:\n{}".format(class_counts)

#### Export to pickle

In [108]:
# Write the balanced dataframe to disk as a pickle
# NOTE(review): the output goes directly under DATASET_ID, whereas the input was
# read from the DT_ID subfolder - confirm this location is intended
save_loc = DATA_DIR.joinpath(DATASET_ID, 'segmentdf_for_UMAP_balanced.pickle')
ensure_dir(save_loc.as_posix())
seg_df2.to_pickle(save_loc)