# Tidy, Combine Labels and Reduce Dataset

- Need to tidy up the labels for ease of viewing (including changing labels to exclude repeated elements, as was done in the segment analysis)
- Need to update these labels to reflect the combined segment classes (i.e. US to SH, HL to LH)
- As some individuals have much higher counts of combinations compared to all other individuals, we need to reduce the number of combinations for these individuals that will be used in the analysis. This will be done according to the unique combinations that were randomly selected in the segment analysis. 

In [1]:
from tqdm.auto import tqdm

In [2]:
import avgn

In [3]:
import pandas as pd
from avgn.utils.paths import DATA_DIR, ensure_dir

In [4]:
# Name of the dataset folder under DATA_DIR that this notebook reads from and writes to
DATASET_ID = "git_repos_call"

In [5]:
# Timestamp-named sub-folder (inside DATA_DIR / DATASET_ID) holding the pickles used below
DT_ID = '2022-03-12_17-46-00' 

In [6]:
# Load the pickled call dataframe for this dataset/run, then preview its first three rows
call_df = pd.read_pickle(DATA_DIR / DATASET_ID / DT_ID /  'walsh_magpie_call_gitrepos.pickle')
call_df[:3]

Unnamed: 0,start_time,end_time,ID,labels,start_times,end_times,call_unique_num,call_pos_combi,combi_label,combi_unique_num,...,indv,indvi,filename,group,location,sex,wav_loc,key,rate,specs
0,0.753604,0.92116,0,DSSHDS,0.753604,0.92116,0,0,DSSHDS SHSHLH,0,...,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,0.932017,1.36713,1,SHSHLH,0.932017,1.36713,1,1,DSSHDS SHSHLH,0,...,MGGY,0,BWY MGGY Call Combo 1 290719 PM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
0,1.218085,1.308841,2,DS,1.218085,1.308841,2,0,DS USC,1,...,MGGY,0,BWY MGGY Call Combo 1 300719 AM,BWYa,CRAWLEY,F,C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-...,BWY MGGY Call Combo 1 300719 AM,44100,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [8]:
# Number of distinct combination IDs in the loaded dataframe
len(call_df['combi_unique_num'].unique())

273

In [9]:
# Number of distinct call IDs in the loaded dataframe
len(call_df['call_unique_num'].unique())

697

### Make sure files are only those that were looked at in segment analysis

In [11]:
import numpy as np

In [14]:
# combi_unique_num values to keep (222 hard-coded IDs). Consecutive runs are
# written as ranges; the isolated IDs between them are listed explicitly.
combis = np.array(
    [
        *range(0, 120),      # 0 .. 119
        121, 122, 123, 125, 129, 131, 133, 134, 135, 136,
        138, 139, 140, 141, 143, 145, 147, 148, 149,
        *range(151, 176),    # 151 .. 175
        177, 178, 180, 185, 187, 188, 191, 195, 197, 199,
        206, 207, 210, 211, 216, 217, 218, 220, 224, 225,
        228, 229, 230, 235, 239,
        *range(241, 274),    # 241 .. 273
    ]
)

In [15]:
# How many combination IDs are on the keep-list
combis.size

222

In [16]:
# Keep only the rows whose combination ID is listed in `combis`. The explicit
# .copy() makes the result an independent frame, so the new columns and .loc
# assignments made on call_df further down do not raise SettingWithCopyWarning.
call_df = call_df.loc[call_df.combi_unique_num.isin(combis)].copy()

In [17]:
# Distinct combination IDs remaining after the filter
len(call_df['combi_unique_num'].unique())

222

In [18]:
# Distinct combination (sequence) labels remaining after the filter
len(call_df['combi_label'].unique())

98

## Call Labels

In [19]:
### List the distinct raw call labels present in the data
list(call_df['labels'].unique())

['DSSHDS',
 'SHSHLH',
 'DS',
 'USC',
 'LH',
 'SHLH',
 'NLSHDS',
 'NLDS',
 'SHDSSHDS',
 'SHDS',
 'NLDSSHDS',
 'SHSHDS',
 'DSSHSHDS',
 'SHSHDSSHDS',
 'SHSHSHLH',
 'LHDS',
 'SHC',
 'SHDSSHSHSHSHLH',
 'DSDS',
 'SHSHDSSHLH',
 'SHDSLH',
 'SHSHDSLH',
 'USLHDS',
 'C',
 'SHSHSHSHDS',
 'SHSHSHDSSHSHLH',
 'USDSSHDS',
 'USSHDS',
 'NL',
 'SHDSSHLH',
 'SHDSSHSHLH',
 'SHSHNLDS',
 'SHNLDS',
 'SHSHSHDS',
 'DSSHDSSHSHSHLH',
 'SHSHSHSHLH',
 'SHDSSH',
 'SHDSSHSHSHLH',
 'SH']

In [20]:
## Copy the raw call labels into a new column, call_lab_simp; only this copy is
## rewritten below, so the original 'labels' column stays as loaded
call_df['call_lab_simp'] = call_df['labels']

In [22]:
## Boolean masks, one per raw call label that gets rewritten in the following cell.
## Every mask is built here, before the column is modified, so the rewrites
## cannot feed into one another.
cond1 = call_df['call_lab_simp'].eq('DSSHDS')
cond2 = call_df['call_lab_simp'].eq('SHSHLH')
cond3 = call_df['call_lab_simp'].eq('USC')
cond4 = call_df['call_lab_simp'].eq('SHLH')
cond5 = call_df['call_lab_simp'].eq('NLSHDS')
cond6 = call_df['call_lab_simp'].eq('NLDS')
cond7 = call_df['call_lab_simp'].eq('SHDSSHDS')
cond8 = call_df['call_lab_simp'].eq('SHDS')
cond9 = call_df['call_lab_simp'].eq('NLDSSHDS')
cond10 = call_df['call_lab_simp'].eq('SHSHDS')
cond11 = call_df['call_lab_simp'].eq('DSSHSHDS')
cond12 = call_df['call_lab_simp'].eq('SHSHDSSHDS')
cond13 = call_df['call_lab_simp'].eq('SHSHSHLH')
cond14 = call_df['call_lab_simp'].eq('LHDS')
cond15 = call_df['call_lab_simp'].eq('SHC')
cond16 = call_df['call_lab_simp'].eq('SHDSSHSHSHSHLH')
cond17 = call_df['call_lab_simp'].eq('DSDS')
cond18 = call_df['call_lab_simp'].eq('SHSHDSSHLH')
cond19 = call_df['call_lab_simp'].eq('SHDSLH')
cond20 = call_df['call_lab_simp'].eq('SHSHDSLH')
cond21 = call_df['call_lab_simp'].eq('USLHDS')
cond22 = call_df['call_lab_simp'].eq('C')
cond23 = call_df['call_lab_simp'].eq('SHSHSHSHDS')
cond24 = call_df['call_lab_simp'].eq('SHSHSHDSSHSHLH')
cond25 = call_df['call_lab_simp'].eq('USDSSHDS')
cond26 = call_df['call_lab_simp'].eq('USSHDS')
cond27 = call_df['call_lab_simp'].eq('SHDSSHLH')
cond28 = call_df['call_lab_simp'].eq('SHDSSHSHLH')
cond29 = call_df['call_lab_simp'].eq('SHSHNLDS')
cond30 = call_df['call_lab_simp'].eq('SHNLDS')
cond31 = call_df['call_lab_simp'].eq('SHSHSHDS')
cond32 = call_df['call_lab_simp'].eq('DSSHDSSHSHSHLH')
cond33 = call_df['call_lab_simp'].eq('SHSHSHSHLH')
cond34 = call_df['call_lab_simp'].eq('SHDSSH')
cond35 = call_df['call_lab_simp'].eq('SHDSSHSHSHLH')

In [23]:
### Simplify
## Each (mask, label) pair writes the tidied call label into the rows the mask selects
for mask, simplified in (
    (cond1, 'DS-SH-DS'),
    (cond2, 'SH-LH'),
    (cond3, 'SH-LH'),
    (cond4, 'SH-LH'),
    (cond5, 'NL-SH-DS'),
    (cond6, 'NL-DS'),
    (cond7, 'SH-DS-SH-DS'),
    (cond8, 'SH-DS'),
    (cond9, 'NL-DS-SH-DS'),
    (cond10, 'SH-DS'),
    (cond11, 'DS-SH-DS'),
    (cond12, 'SH-DS-SH-DS'),
    (cond13, 'SH-LH'),
    (cond14, 'LH-DS'),
    (cond15, 'SH-LH'),
    (cond16, 'SH-DS-SH-LH'),
    (cond17, 'DS'),
    (cond18, 'SH-DS-SH-LH'),
    (cond19, 'SH-DS-LH'),
    (cond20, 'SH-DS-LH'),
    (cond21, 'SH-LH-DS'),
    (cond22, 'LH'),
    (cond23, 'SH-DS'),
    (cond24, 'SH-DS-SH-LH'),
    (cond25, 'SH-DS-SH-DS'),
    (cond26, 'SH-DS'),
    (cond27, 'SH-DS-SH-LH'),
    (cond28, 'SH-DS-SH-LH'),
    (cond29, 'SH-NL-DS'),
    (cond30, 'SH-NL-DS'),
    (cond31, 'SH-DS'),
    (cond32, 'DS-SH-DS-SH-LH'),
    (cond33, 'SH-LH'),
    (cond34, 'SH-DS-SH'),
    (cond35, 'SH-DS-SH-LH'),
):
    call_df.loc[mask, 'call_lab_simp'] = simplified

In [24]:
# Check: distinct call labels after simplification
list(call_df['call_lab_simp'].unique())

['DS-SH-DS',
 'SH-LH',
 'DS',
 'LH',
 'NL-SH-DS',
 'NL-DS',
 'SH-DS-SH-DS',
 'SH-DS',
 'NL-DS-SH-DS',
 'LH-DS',
 'SH-DS-SH-LH',
 'SH-DS-LH',
 'SH-LH-DS',
 'NL',
 'SH-NL-DS',
 'DS-SH-DS-SH-LH',
 'SH-DS-SH',
 'SH']

## Simplify Sequence Labels

In [25]:
## List the distinct raw sequence (combination) labels present in the data
list(call_df['combi_label'].unique())

['DSSHDS SHSHLH',
 'DS USC',
 'DSSHDS LH',
 'DSSHDS SHLH',
 'DSSHDS NLSHDS',
 'NLDS DSSHDS SHLH',
 'SHDSSHDS SHSHLH',
 'DSSHDS NLSHDS SHDS LH',
 'NLDSSHDS NLSHDS SHDS LH',
 'NLDS SHDS NLSHDS SHSHDS SHLH',
 'NLDS SHDS',
 'DSSHDS NLSHDS SHSHDS SHSHLH',
 'DSSHSHDS SHSHLH',
 'NLDS SHSHDS SHSHLH',
 'SHDSSHDS NLSHDS SHSHDS SHSHLH',
 'SHDS SHDS SHSHLH',
 'SHSHDSSHDS NLSHDS SHDS SHSHLH',
 'DSSHDS NLSHDS SHDS SHSHLH',
 'SHDSSHDS NLSHDS SHDS SHSHLH',
 'SHDS NLDS SHDS SHSHLH',
 'DSSHDS SHSHSHLH',
 'NLDS LHDS',
 'SHSHDS SHC',
 'SHSHDS SHC NLSHDS SHDSSHSHSHSHLH',
 'SHDS SHC',
 'NLDS DS',
 'DS LHDS',
 'NLDS DS LHDS',
 'NLDS DSDS SHSHDSSHLH',
 'NLDS DSDS SHDSLH',
 'NLSHDS DSDS SHSHDSLH',
 'NLDS DSDS SHSHDSLH',
 'DS USLHDS',
 'SHDS SHDSLH',
 'NLDS DS LH',
 'NLDS SHDS SHSHLH',
 'SHDS SHSHLH',
 'NLDS SHDS SHDS SHSHLH',
 'NLDS SHLH',
 'DS NLDS',
 'NLDS C',
 'NLDS SHSHLH',
 'NLDS SHSHSHLH',
 'NLSHDS SHSHSHLH',
 'NLSHDS SHSHLH',
 'SHDS SHSHSHSHDS SHSHLH',
 'NLDS SHSHSHDSSHSHLH',
 'NLSHDS DSSHDS SHSHLH',
 '

In [36]:
## Copy the raw sequence labels into a new column, combi_lab_simp; only this copy is
## rewritten below, so the original 'combi_label' column stays as loaded
call_df['combi_lab_simp'] = call_df['combi_label']

In [37]:
## Boolean masks (cond1-cond30), one per raw sequence label that gets rewritten later.
## All masks are built before combi_lab_simp is modified, so the rewrites
## cannot feed into one another.
cond1 = call_df['combi_lab_simp'].eq('DSSHDS SHSHLH')
cond2 = call_df['combi_lab_simp'].eq('DS USC')
cond3 = call_df['combi_lab_simp'].eq('DSSHDS LH')
cond4 = call_df['combi_lab_simp'].eq('DSSHDS SHLH')
cond5 = call_df['combi_lab_simp'].eq('DSSHDS NLSHDS')
cond6 = call_df['combi_lab_simp'].eq('NLDS SHDS')
cond7 = call_df['combi_lab_simp'].eq('DSSHDS NLSHDS SHSHDS SHSHLH')
cond8 = call_df['combi_lab_simp'].eq('DSSHSHDS SHSHLH')
cond9 = call_df['combi_lab_simp'].eq('NLDS SHSHDS SHSHLH')
cond10 = call_df['combi_lab_simp'].eq('SHDSSHDS NLSHDS SHSHDS SHSHLH')
cond11 = call_df['combi_lab_simp'].eq('SHDS SHDS SHSHLH')
cond12 = call_df['combi_lab_simp'].eq('SHSHDSSHDS NLSHDS SHDS SHSHLH')
cond13 = call_df['combi_lab_simp'].eq('DSSHDS NLSHDS SHDS SHSHLH')
cond14 = call_df['combi_lab_simp'].eq('SHDSSHDS NLSHDS SHDS SHSHLH')
cond15 = call_df['combi_lab_simp'].eq('SHDS NLDS SHDS SHSHLH')
cond16 = call_df['combi_lab_simp'].eq('DSSHDS SHSHSHLH')
cond17 = call_df['combi_lab_simp'].eq('NLDS LHDS')
cond18 = call_df['combi_lab_simp'].eq('NLDS DS LHDS')
cond19 = call_df['combi_lab_simp'].eq('NLDS DS')
cond20 = call_df['combi_lab_simp'].eq('DS LHDS')
cond21 = call_df['combi_lab_simp'].eq('NLDS DSDS SHSHDSSHLH')
cond22 = call_df['combi_lab_simp'].eq('NLDS DSDS SHDSLH')
cond23 = call_df['combi_lab_simp'].eq('NLSHDS DSDS SHSHDSLH')
cond24 = call_df['combi_lab_simp'].eq('NLDS DSDS SHSHDSLH')
cond25 = call_df['combi_lab_simp'].eq('DS USLHDS')
cond26 = call_df['combi_lab_simp'].eq('SHDS SHDSLH')
cond27 = call_df['combi_lab_simp'].eq('NLDS DS LH')
cond28 = call_df['combi_lab_simp'].eq('NLDS SHDS SHSHLH')
cond29 = call_df['combi_lab_simp'].eq('SHDS SHSHLH')
cond30 = call_df['combi_lab_simp'].eq('NLDS SHDS SHDS SHSHLH')

In [38]:
## Boolean masks cond31-cond60 for raw sequence labels (built before any rewrite)
cond31 = call_df['combi_lab_simp'].eq('NLDS SHLH')
cond32 = call_df['combi_lab_simp'].eq('DS NLDS')
cond33 = call_df['combi_lab_simp'].eq('NLDS C')
cond34 = call_df['combi_lab_simp'].eq('NLDS SHSHLH')
cond35 = call_df['combi_lab_simp'].eq('NLDS SHSHSHLH')
cond36 = call_df['combi_lab_simp'].eq('NLSHDS SHSHSHLH')
cond37 = call_df['combi_lab_simp'].eq('NLSHDS SHSHLH')
cond38 = call_df['combi_lab_simp'].eq('SHDS SHSHSHSHDS SHSHLH')
cond39 = call_df['combi_lab_simp'].eq('NLDS SHSHSHDSSHSHLH')
cond40 = call_df['combi_lab_simp'].eq('NLSHDS DSSHDS SHSHLH')
cond41 = call_df['combi_lab_simp'].eq('NLSHDS SHDSSHDS SHLH')
cond42 = call_df['combi_lab_simp'].eq('NLDS NLDS NLDS SHSHDS')
cond43 = call_df['combi_lab_simp'].eq('NLDS NLDS NLSHDS SHDS SHSHLH')
cond44 = call_df['combi_lab_simp'].eq('NLDS NLSHDS SHDS SHSHLH')
cond45 = call_df['combi_lab_simp'].eq('NLDS USDSSHDS SHSHLH')
cond46 = call_df['combi_lab_simp'].eq('NLDS SHDSSHDS')
cond47 = call_df['combi_lab_simp'].eq('NLDS NLDS DSSHDS')
cond48 = call_df['combi_lab_simp'].eq('SHDS LHDS')
cond49 = call_df['combi_lab_simp'].eq('SHDS USLHDS')
cond50 = call_df['combi_lab_simp'].eq('USSHDS USLHDS')
cond51 = call_df['combi_lab_simp'].eq('SHDS LHDS LHDS')
cond52 = call_df['combi_lab_simp'].eq('SHDS SHDSSHSHLH')
cond53 = call_df['combi_lab_simp'].eq('SHDS SHDS SHLH')
cond54 = call_df['combi_lab_simp'].eq('SHDS SHDSSHLH')
cond55 = call_df['combi_lab_simp'].eq('SHDS SHDS LH')
cond56 = call_df['combi_lab_simp'].eq('NL NLSHDS SHDSLH')
cond57 = call_df['combi_lab_simp'].eq('NLDS SHSHNLDS DSSHDS SHSHSHLH')
cond58 = call_df['combi_lab_simp'].eq('SHNLDS DSSHDS SHLH')
cond59 = call_df['combi_lab_simp'].eq('NLDS SHDS SHSHSHLH')
cond60 = call_df['combi_lab_simp'].eq('SHSHNLDS DSSHDS SHSHSHLH')

In [39]:
## Boolean masks cond61-cond98 for raw sequence labels (built before any rewrite)
cond61 = call_df['combi_lab_simp'].eq('NLSHDS SHDS SHSHSHLH')
cond62 = call_df['combi_lab_simp'].eq('NLDS SHDS LH')
cond63 = call_df['combi_lab_simp'].eq('SHSHSHDS DSSHDSSHLH')
cond64 = call_df['combi_lab_simp'].eq('SHSHSHSHDS DSSHDSSHSHSHLH')
cond65 = call_df['combi_lab_simp'].eq('SHDSSH LHDS')
cond66 = call_df['combi_lab_simp'].eq('NLDS SHDSLH')
cond67 = call_df['combi_lab_simp'].eq('NL NLDS')
cond68 = call_df['combi_lab_simp'].eq('NLDS LH')
cond69 = call_df['combi_lab_simp'].eq('NLDS SHDSSHSHSHLH')
cond70 = call_df['combi_lab_simp'].eq('NLDS DSSHDS SHLH')
cond71 = call_df['combi_lab_simp'].eq('SHDSSHDS SHSHLH')
cond72 = call_df['combi_lab_simp'].eq('DSSHDS NLSHDS SHDS LH')
cond73 = call_df['combi_lab_simp'].eq('NLDSSHDS NLSHDS SHDS LH')
cond74 = call_df['combi_lab_simp'].eq('NLDS SHDS NLSHDS SHSHDS SHLH')
cond75 = call_df['combi_lab_simp'].eq('NLDS NL')
cond76 = call_df['combi_lab_simp'].eq('NLDS SHDSSHSHLH')
cond77 = call_df['combi_lab_simp'].eq('NLDS SHDSSHLH')
cond78 = call_df['combi_lab_simp'].eq('NLSHDS SHDS SHSHSHSHLH')
cond79 = call_df['combi_lab_simp'].eq('NLSHDS SHDS SHSHLH')
cond80 = call_df['combi_lab_simp'].eq('SHDS SHSHSHLH')
cond81 = call_df['combi_lab_simp'].eq('NL NLDS SHDS LH')
cond82 = call_df['combi_lab_simp'].eq('NLDS SHDS SH')
cond83 = call_df['combi_lab_simp'].eq('NLDS SHDS SHLH')
cond84 = call_df['combi_lab_simp'].eq('SHDS LHDS NLSHDS DS')
cond85 = call_df['combi_lab_simp'].eq('NL SHDS')
cond86 = call_df['combi_lab_simp'].eq('NLDS DS DS')
cond87 = call_df['combi_lab_simp'].eq('NL SHDS LHDS')
cond88 = call_df['combi_lab_simp'].eq('NLSHDS SHDSSHSHLH')
cond89 = call_df['combi_lab_simp'].eq('NL SHDS SHDSSHLH')
cond90 = call_df['combi_lab_simp'].eq('NLSHDS SHDSSHLH')
cond91 = call_df['combi_lab_simp'].eq('NL NLDS NLDSSHDS LH')
cond92 = call_df['combi_lab_simp'].eq('NLDS NLDS NLDS LHDS SHDS')
cond93 = call_df['combi_lab_simp'].eq('NLDS NLDS NLDSSHDS SHSHSHLH')
cond94 = call_df['combi_lab_simp'].eq('NLDS LHDS SHSHSHSHDS')
cond95 = call_df['combi_lab_simp'].eq('NLDS NLDS SHDSSHDS SHLH')
cond96 = call_df['combi_lab_simp'].eq('SHSHDS SHC')
cond97 = call_df['combi_lab_simp'].eq('SHSHDS SHC NLSHDS SHDSSHSHSHSHLH')
cond98 = call_df['combi_lab_simp'].eq('SHDS SHC')

In [40]:
### Modify
## Each (mask, label) pair writes the tidied sequence label into the rows the mask selects
for mask, simplified in (
    (cond1, 'DS-SH-DS SH-LH'),
    (cond2, 'DS SH-LH'),
    (cond3, 'DS-SH-DS LH'),
    (cond4, 'DS-SH-DS SH-LH'),
    (cond5, 'DS-SH-DS NL-SH-DS'),
    (cond6, 'NL-DS SH-DS'),
    (cond7, 'DS-SH-DS NL-SH-DS SH-DS SH-LH'),
    (cond8, 'DS-SH-DS SH-LH'),
    (cond9, 'NL-DS SH-DS SH-LH'),
    (cond10, 'SH-DS-SH-DS NL-SH-DS SH-DS SH-LH'),
    (cond11, 'SH-DS SH-LH'),
    (cond12, 'SH-DS-SH-DS NL-SH-DS SH-DS SH-LH'),
    (cond13, 'DS-SH-DS NL-SH-DS SH-DS SH-LH'),
    (cond14, 'SH-DS-SH-DS NL-SH-DS SH-DS SH-LH'),
    (cond15, 'SH-DS NL-DS SH-DS SH-LH'),
    (cond16, 'DS-SH-DS SH-LH'),
    (cond17, 'NL-DS LH-DS'),
    (cond18, 'NL-DS DS LH-DS'),
    (cond19, 'NL-DS DS'),
    (cond20, 'DS LH-DS'),
    (cond21, 'NL-DS DS SH-DS-SH-LH'),
    (cond22, 'NL-DS DS SH-DS-LH'),
    (cond23, 'NL-SH-DS DS SH-DS-LH'),
    (cond24, 'NL-DS DS SH-DS-LH'),
    (cond25, 'DS SH-LH-DS'),
    (cond26, 'SH-DS SH-DS-LH'),
    (cond27, 'NL-DS DS LH'),
    (cond28, 'NL-DS SH-DS SH-LH'),
    (cond29, 'SH-DS SH-LH'),
    (cond30, 'NL-DS SH-DS SH-LH'),
):
    call_df.loc[mask, 'combi_lab_simp'] = simplified

In [41]:
## Rewrites for cond31-cond60: tidied sequence label written into the rows each mask selects
for mask, simplified in (
    (cond31, 'NL-DS SH-LH'),
    (cond32, 'DS NL-DS'),
    (cond33, 'NL-DS LH'),
    (cond34, 'NL-DS SH-LH'),
    (cond35, 'NL-DS SH-LH'),
    (cond36, 'NL-SH-DS SH-LH'),
    (cond37, 'NL-SH-DS SH-LH'),
    # NOTE(review): this label keeps two adjacent 'SH-DS' calls, whereas e.g.
    # cond53 ('SHDS SHDS SHLH') is collapsed to 'SH-DS SH-LH' — confirm intended.
    (cond38, 'SH-DS SH-DS SH-LH'),
    (cond39, 'NL-DS SH-DS-SH-LH'),
    (cond40, 'NL-SH-DS DS-SH-DS SH-LH'),
    (cond41, 'NL-SH-DS SH-DS-SH-DS SH-LH'),
    (cond42, 'NL-DS SH-DS'),
    (cond43, 'NL-DS NL-SH-DS SH-DS SH-LH'),
    (cond44, 'NL-DS NL-SH-DS SH-DS SH-LH'),
    (cond45, 'NL-DS SH-DS-SH-DS SH-LH'),
    (cond46, 'NL-DS SH-DS-SH-DS'),
    (cond47, 'NL-DS DS-SH-DS'),
    (cond48, 'SH-DS LH-DS'),
    (cond49, 'SH-DS SH-LH-DS'),
    (cond50, 'SH-DS SH-LH-DS'),
    (cond51, 'SH-DS LH-DS'),
    (cond52, 'SH-DS SH-DS-SH-LH'),
    (cond53, 'SH-DS SH-LH'),
    (cond54, 'SH-DS SH-DS-SH-LH'),
    (cond55, 'SH-DS LH'),
    (cond56, 'NL NL-SH-DS SH-DS-LH'),
    (cond57, 'NL-DS SH-NL-DS DS-SH-DS SH-LH'),
    (cond58, 'SH-NL-DS DS-SH-DS SH-LH'),
    (cond59, 'NL-DS SH-DS SH-LH'),
    (cond60, 'SH-NL-DS DS-SH-DS SH-LH'),
):
    call_df.loc[mask, 'combi_lab_simp'] = simplified

In [42]:
## Rewrites for cond61-cond98: tidied sequence label written into the rows each mask selects
for mask, simplified in (
    (cond61, 'NL-SH-DS SH-DS SH-LH'),
    (cond62, 'NL-DS SH-DS LH'),
    (cond63, 'SH-DS DS-SH-DS-SH-LH'),
    (cond64, 'SH-DS DS-SH-DS-SH-LH'),
    (cond65, 'SH-DS-SH LH-DS'),
    (cond66, 'NL-DS SH-DS-LH'),
    (cond67, 'NL NL-DS'),
    (cond68, 'NL-DS LH'),
    (cond69, 'NL-DS SH-DS-SH-LH'),
    (cond70, 'NL-DS DS-SH-DS SH-LH'),
    (cond71, 'SH-DS-SH-DS SH-LH'),
    (cond72, 'DS-SH-DS NL-SH-DS SH-DS LH'),
    (cond73, 'NL-DS-SH-DS NL-SH-DS SH-DS LH'),
    (cond74, 'NL-DS SH-DS NL-SH-DS SH-DS SH-LH'),
    (cond75, 'NL-DS NL'),
    (cond76, 'NL-DS SH-DS-SH-LH'),
    (cond77, 'NL-DS SH-DS-SH-LH'),
    (cond78, 'NL-SH-DS SH-DS SH-LH'),
    (cond79, 'NL-SH-DS SH-DS SH-LH'),
    (cond80, 'SH-DS SH-LH'),
    (cond81, 'NL NL-DS SH-DS LH'),
    (cond82, 'NL-DS SH-DS SH'),
    (cond83, 'NL-DS SH-DS SH-LH'),
    (cond84, 'SH-DS LH-DS NL-SH-DS DS'),
    (cond85, 'NL SH-DS'),
    (cond86, 'NL-DS DS'),
    (cond87, 'NL SH-DS LH-DS'),
    (cond88, 'NL-SH-DS SH-DS-SH-LH'),
    (cond89, 'NL SH-DS SH-DS-SH-LH'),
    (cond90, 'NL-SH-DS SH-DS-SH-LH'),
    (cond91, 'NL NL-DS NL-DS-SH-DS LH'),
    (cond92, 'NL-DS LH-DS SH-DS'),
    (cond93, 'NL-DS NL-DS-SH-DS SH-LH'),
    (cond94, 'NL-DS LH-DS SH-DS'),
    (cond95, 'NL-DS SH-DS-SH-DS SH-LH'),
    (cond96, 'SH-DS SH-LH'),
    (cond97, 'SH-DS SH-LH NL-SH-DS SH-DS-SH-LH'),
    (cond98, 'SH-DS SH-LH'),
):
    call_df.loc[mask, 'combi_lab_simp'] = simplified

In [43]:
## Check: distinct sequence labels after simplification
list(call_df['combi_lab_simp'].unique())

['DS-SH-DS SH-LH',
 'DS SH-LH',
 'DS-SH-DS LH',
 'DS-SH-DS NL-SH-DS',
 'NL-DS DS-SH-DS SH-LH',
 'SH-DS-SH-DS SH-LH',
 'DS-SH-DS NL-SH-DS SH-DS LH',
 'NL-DS-SH-DS NL-SH-DS SH-DS LH',
 'NL-DS SH-DS NL-SH-DS SH-DS SH-LH',
 'NL-DS SH-DS',
 'DS-SH-DS NL-SH-DS SH-DS SH-LH',
 'NL-DS SH-DS SH-LH',
 'SH-DS-SH-DS NL-SH-DS SH-DS SH-LH',
 'SH-DS SH-LH',
 'SH-DS NL-DS SH-DS SH-LH',
 'NL-DS LH-DS',
 'SH-DS SH-LH NL-SH-DS SH-DS-SH-LH',
 'NL-DS DS',
 'DS LH-DS',
 'NL-DS DS LH-DS',
 'NL-DS DS SH-DS-SH-LH',
 'NL-DS DS SH-DS-LH',
 'NL-SH-DS DS SH-DS-LH',
 'DS SH-LH-DS',
 'SH-DS SH-DS-LH',
 'NL-DS DS LH',
 'NL-DS SH-LH',
 'DS NL-DS',
 'NL-DS LH',
 'NL-SH-DS SH-LH',
 'SH-DS SH-DS SH-LH',
 'NL-DS SH-DS-SH-LH',
 'NL-SH-DS DS-SH-DS SH-LH',
 'NL-SH-DS SH-DS-SH-DS SH-LH',
 'NL-DS NL-SH-DS SH-DS SH-LH',
 'NL-DS SH-DS-SH-DS SH-LH',
 'NL-DS SH-DS-SH-DS',
 'NL-DS DS-SH-DS',
 'SH-DS LH-DS',
 'SH-DS SH-LH-DS',
 'NL-DS NL',
 'SH-DS SH-DS-SH-LH',
 'SH-DS LH',
 'NL NL-SH-DS SH-DS-LH',
 'NL-DS SH-NL-DS DS-SH-DS SH-LH',
 

In [44]:
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir

In [46]:
#save df
# Write the relabelled dataframe into the same DATASET_ID / DT_ID folder the input pickle was read from
save_loc = DATA_DIR / DATASET_ID / DT_ID /  'call_df_for_umap.pickle'
# ensure_dir receives the path as a string — presumably it creates any missing
# parent folders; confirm against avgn.utils.paths
ensure_dir(save_loc.as_posix())
call_df.to_pickle(save_loc)