# Multi-level combinatoriality in magpie non-song vocalizations

This repository follows the tutorial outlined in [Sainburg et al. (2020)](https://doi.org/10.1371/journal.pcbi.1008228), with some modifications to account for acoustic parameters specific to our study species: the Western Australian magpie (*Gymnorhina tibicen dorsalis*). Further detail on the method, including code for much of the analysis used in this paper, can be accessed through the tutorial's repository at [github.com/timsainb/avgn](https://github.com/timsainb/avgn). 

Data for this study can be accessed [here](https://doi.org/10.26182/s77t-hw04). 

In this notebook I create dataframes for segment, call and combination data using information in the annotation (Praat textgrid) and audio (WAV) files. 

In [1]:
#Set Dataset ID
##Used further down as the sub-folder of DATA_DIR in which this notebook's output files are saved
DATASET_ID = "git_repos"

In [2]:
import pdb
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import pandas as pd
import librosa
from datetime import datetime
import json
import numpy as np
from pathlib2 import Path

In [3]:
import avgn

In [4]:
from avgn.utils.paths import DATA_DIR, ensure_dir

## Load Data in Original Format

In [5]:
#Timestamp (YYYY-MM-DD_HH-MM-SS) taken when this cell runs, meant as a unique identifier for the files output by this notebook
##NOTE(review): none of the save paths in the cells of this notebook reference DT_ID - confirm whether it is still needed
run_time = datetime.now()
DT_ID = run_time.strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2022-09-10_09-36-50'

In [6]:
#Location of the raw dataset: DATA_DIR/raw/magpie/individuals
RAW_DATASET_LOC = DATA_DIR.joinpath("raw", "magpie", "individuals")
RAW_DATASET_LOC

WindowsPath('C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-2/data/raw/magpie/individuals')

In [7]:
#Collect every TextGrid file found one folder below RAW_DATASET_LOC (one per WAV file)
##Textgrid files contain annotations with start and end time boundaries
###as well as labels for combinations and their comprising calls and segments
##Files whose name starts with '.' (hidden files) are left out
TGLIST = np.array([
    tg_path
    for tg_path in RAW_DATASET_LOC.expanduser().glob('*/*.TextGrid')
    if not tg_path.name.startswith('.')
])
TGLIST[0], len(TGLIST)

(WindowsPath('C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-2/data/raw/magpie/individuals/BWYa_F_MGGY/BWY MGGY Call Combo 1 290719 PM.TextGrid'),
 153)

In [8]:
#Collect every WAV file found one folder below RAW_DATASET_LOC
wav_paths = RAW_DATASET_LOC.expanduser().glob('*/*.wav')
WAVLIST = np.array(list(wav_paths))
WAVLIST[0], len(WAVLIST)

(WindowsPath('C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-2/data/raw/magpie/individuals/BWYa_F_MGGY/BWY MGGY Call Combo 1 290719 PM.wav'),
 153)

## Create dataframes for Segments, Calls & Combinations


In [9]:
#File name (without extension) of every WAV file, in the same order as WAVLIST
wav_stems = np.array([wav_path.stem for wav_path in WAVLIST])
wav_stems[0]

'BWY MGGY Call Combo 1 290719 PM'

In [10]:
from avgn.custom_parsing.magpie_for_repos import (
    get_segments,
    get_calls,
    get_combis
)

### Segment Data

In [11]:
#Build the segment dataframe: run get_segments on every TextGrid in parallel
##(n_jobs=-1 uses all available workers), then stack the per-file results into one dataframe
with Parallel(n_jobs=-1, verbose=10) as parallel:
    seg_frames = parallel(
        delayed(get_segments)(textgrid, WAVLIST, wav_stems)
        for textgrid in tqdm(TGLIST)
    )
seg_df = pd.concat(seg_frames)

  0%|          | 0/153 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1968s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1636s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 126 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 130 out of 153 | elapsed:    5.3s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done 146 out of 153 | elapsed:   

In [12]:
#preview the first three rows of the segment dataframe
seg_df.head(3)

Unnamed: 0,indv,sex,gro,filename,wavloc,tgloc,seg_pos_combi,seg_start,seg_end,seg_label
0,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,0.753604,0.776773,DS
1,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,1,0.786865,0.835165,SH
2,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,2,0.855941,0.92116,DS


In [13]:
#how many segments are there? (number of rows in the segment dataframe)
seg_df.shape[0]

1647

In [14]:
#Give every segment a unique identifier, New_ID, running from 0 to len(seg_df) - 1
##DataFrame.insert raises a ValueError when the column already exists, so the column is
##only added when it is missing - this lets the cell be re-run without error
if 'New_ID' not in seg_df.columns:
    seg_df.insert(0, 'New_ID', range(len(seg_df)))

In [15]:
#confirm that the New_ID column is now the first column
seg_df.head(3)

Unnamed: 0,New_ID,indv,sex,gro,filename,wavloc,tgloc,seg_pos_combi,seg_start,seg_end,seg_label
0,0,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,0.753604,0.776773,DS
1,1,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,1,0.786865,0.835165,SH
2,2,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,2,0.855941,0.92116,DS


In [16]:
#write the segment dataframe to DATA_DIR/<DATASET_ID>/ as a pickle file
save_loc = DATA_DIR.joinpath(DATASET_ID, 'walsh_magpie_segment_dataframe.pickle')
ensure_dir(save_loc)
seg_df.to_pickle(save_loc)

### Call Data

In [17]:
#Build the call dataframe: run get_calls on every TextGrid in parallel
##(n_jobs=-1 uses all available workers), then stack the per-file results into one dataframe
with Parallel(n_jobs=-1, verbose=10) as parallel:
    call_frames = parallel(
        delayed(get_calls)(textgrid, WAVLIST, wav_stems)
        for textgrid in tqdm(TGLIST)
    )
call_df = pd.concat(call_frames)

  0%|          | 0/153 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0319s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0758s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1526s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done 103 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 130 out of 153 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 153 out of 153 | elapsed:    0.5s finished


In [18]:
#preview the first three rows of the call dataframe
call_df.head(3)

Unnamed: 0,indv,sex,gro,filename,wavloc,tgloc,call_pos_combi,call_start,call_end,call_label
0,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,0.753604,0.92116,DSSHDS
1,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,1,0.932017,1.36713,SHSHLH
0,MGGY,F,BWYa,BWY MGGY Call Combo 1 300719 AM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,1.218085,1.308841,DS


### Combination Data

In [19]:
#Build the combination dataframe: run get_combis on every TextGrid in parallel
##(n_jobs=-1 uses all available workers), then stack the per-file results into one dataframe
with Parallel(n_jobs=-1, verbose=10) as parallel:
    combi_frames = parallel(
        delayed(get_combis)(textgrid, WAVLIST, wav_stems)
        for textgrid in tqdm(TGLIST)
    )
combi_df = pd.concat(combi_frames)

  0%|          | 0/153 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0236s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0715s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0993s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 130 out of 153 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 146 out of 153 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 153 out of 153 | elapsed:    0.3s finished


In [20]:
#preview the first six rows of the combination dataframe
combi_df.head(6)

Unnamed: 0,indv,sex,gro,filename,wavloc,tgloc,combi_num,combi_start,combi_end,combi_label
0,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 P,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,0.753604,1.36713,DSSHDS SHSHLH
0,MGGY,F,BWYa,BWY MGGY Call Combo 1 300719 A,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,1.218085,1.984961,DS USC
0,MGGY,F,BWYa,BWY MGGY Discrete 1 300719 AM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,1.244022,1.897371,DSSHDS LH
0,MGGY,F,BWYa,BWY MGGY Discrete 4 210519 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,0.30633,0.927264,DSSHDS SHLH
0,MGGY,F,BWYa,BWY MGGY Discrete 6 210519 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,0.158063,0.503686,DSSHDS SHLH
0,MGGY,F,BWYa,BWY MGGY Excitement 2 170519 P,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,0.05759,0.535485,DSSHDS NLSHDS


## Put all data into SQL database
The segment, call and combination dataframes are loaded into an SQL database so that the segment and call data can be combined for each combination.

In [21]:
import sqlite3 as sql

In [22]:
#Open (or create, if it does not exist yet) the SQLite database file 'magpie_gitrepos.db'
##The path is relative, so the file is located in the current working directory
con = sql.connect('magpie_gitrepos.db')

In [23]:
#convert every value in the three dataframes to its string representation
##NOTE: this applies to all columns, so the start/end times and positions are text from here on
seg_df, call_df, combi_df = (
    frame.applymap(str) for frame in (seg_df, call_df, combi_df)
)

In [24]:
#write the string dataframes to the SQL database as the tables 'segs', 'calls' and 'combis'
##The database is a file on disk, so the tables persist between runs of this notebook.
##to_sql defaults to if_exists='fail', which raises a ValueError as soon as a table already
##exists; 'replace' drops and rewrites each table so the notebook can be run again.
seg_df.to_sql('segs', con, if_exists='replace')
call_df.to_sql('calls', con, if_exists='replace')
combi_df.to_sql('combis', con, if_exists='replace')

In [25]:
#dataframe holding only the identifying information of each segment
##(kept separate to reduce duplications when combining all data)
id_columns = ['New_ID', 'indv', 'gro', 'sex', 'wavloc', 'filename']

c = con.cursor()
c.execute("""
    SELECT DISTINCT New_ID, indv, gro, sex, wavloc, filename
    FROM segs
    """)
id_rows = c.fetchall()

IDdf = pd.DataFrame(id_rows, columns=id_columns)
print(IDdf)

     New_ID    indv  gro, sex  \
0         0    MGGY  BWYa   F   
1         1    MGGY  BWYa   F   
2         2    MGGY  BWYa   F   
3         3    MGGY  BWYa   F   
4         4    MGGY  BWYa   F   
...     ...     ...   ...  ..   
1642   1642  MXXGRY    SS   F   
1643   1643  MXXGRY    SS   F   
1644   1644  MXXGRY    SS   F   
1645   1645  MXXGRY    SS   F   
1646   1646  MXXGRY    SS   F   

                                                 wavloc  \
0     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
1     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
2     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
3     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
4     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
...                                                 ...   
1642  C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
1643  C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
1644  C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
1645  C:\User

In [26]:
#re-wrap the query result as a DataFrame and preview its first eight rows
IDdf = pd.DataFrame(data=IDdf)
IDdf.head(8)

Unnamed: 0,New_ID,indv,"gro,",sex,wavloc,filename
0,0,MGGY,BWYa,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM
1,1,MGGY,BWYa,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM
2,2,MGGY,BWYa,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM
3,3,MGGY,BWYa,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM
4,4,MGGY,BWYa,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM
5,5,MGGY,BWYa,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM
6,6,MGGY,BWYa,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 300719 AM
7,7,MGGY,BWYa,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 300719 AM


In [27]:
#number of rows in the ID dataframe - more than 1647 would mean there are duplicates
IDdf.shape[0]

1647

In [28]:
#dataframe containing start and end times
##Each segment is joined to the call and to the combination from the same WAV file whose
##start/end boundaries enclose it.
##The dataframes were converted to strings before being written to the database, so the
##time columns are stored as text. SQLite compares text character by character
##(e.g. '10.2' sorts before '9.5'), which gives wrong matches as soon as two times differ
##in their number of digits. Every boundary is therefore cast to REAL so that the
##comparisons are numeric. The selected columns themselves are returned unchanged.

c = con.cursor()

c.execute("""
    SELECT DISTINCT New_ID, segs.filename, seg_label, seg_start, seg_end, call_label, call_start, call_end, combi_label, combi_start, combi_end
    FROM segs
    LEFT JOIN calls ON (segs.wavloc == calls.wavloc
        AND CAST(segs.seg_start AS REAL) >= CAST(calls.call_start AS REAL)
        AND CAST(segs.seg_end AS REAL) <= CAST(calls.call_end AS REAL))
    LEFT JOIN combis ON (segs.wavloc == combis.wavloc
        AND CAST(segs.seg_start AS REAL) >= CAST(combis.combi_start AS REAL)
        AND CAST(segs.seg_end AS REAL) <= CAST(combis.combi_end AS REAL))
    """)

df = pd.DataFrame(c.fetchall(), columns = ['New_ID', 'filename', 'seg_label', 'seg_start', 'seg_end', 'call_label', 'call_start', 'call_end', 'combi_label', 'combi_start', 'combi_end'])
print(df)


     New_ID                                           filename seg_label  \
0         0                    BWY MGGY Call Combo 1 290719 PM        DS   
1         1                    BWY MGGY Call Combo 1 290719 PM        SH   
2         2                    BWY MGGY Call Combo 1 290719 PM        DS   
3         3                    BWY MGGY Call Combo 1 290719 PM        SH   
4         4                    BWY MGGY Call Combo 1 290719 PM        SH   
...     ...                                                ...       ...   
2515   1642  SS MXXGRY XXXX JUVENILE Call Combination Serie...        DS   
2516   1643  SS MXXGRY XXXX JUVENILE Call Combination Serie...        SH   
2517   1644  SS MXXGRY XXXX JUVENILE Call Combination Serie...        DS   
2518   1645  SS MXXGRY XXXX JUVENILE Call Combination Serie...        LH   
2519   1646  SS MXXGRY XXXX JUVENILE Call Combination Serie...        DS   

                seg_start              seg_end call_label  \
0      0.7536043330872654 

In [29]:
#keep the joined start/end time data under the name alldf and preview its first three rows
alldf = pd.DataFrame(data=df)
alldf.head(3)

Unnamed: 0,New_ID,filename,seg_label,seg_start,seg_end,call_label,call_start,call_end,combi_label,combi_start,combi_end
0,0,BWY MGGY Call Combo 1 290719 PM,DS,0.7536043330872654,0.7767728702775286,DSSHDS,0.7536043330872654,0.9211598748571118,DSSHDS SHSHLH,0.7536043330872654,1.3671299042471126
1,1,BWY MGGY Call Combo 1 290719 PM,SH,0.7868652657285787,0.8351645868157468,DSSHDS,0.7536043330872654,0.9211598748571118,DSSHDS SHSHLH,0.7536043330872654,1.3671299042471126
2,2,BWY MGGY Call Combo 1 290719 PM,DS,0.8559414646395095,0.9211598748571118,DSSHDS,0.7536043330872654,0.9211598748571118,DSSHDS SHSHLH,0.7536043330872654,1.3671299042471126


In [30]:
#number of rows in alldf - more than 1647 means there are duplicates, which can be fixed in the next step
alldf.shape[0]

2520

## Turn dataframes into csv for processing
Exporting to CSV makes it easy to combine the data from both dataframes and to check for any errors/duplicates that might have occurred during processing. 

In [31]:
#check data directory path
##DATA_DIR is imported from avgn.utils.paths; the CSV files below are saved underneath it
DATA_DIR

WindowsPath('C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-2/data')

In [32]:
#write the start/end time dataframe to DATA_DIR/<DATASET_ID>/ as a CSV file
save_loc = DATA_DIR.joinpath(DATASET_ID, 'startendtimes_df_to_csv.csv')
ensure_dir(save_loc.as_posix())
alldf.to_csv(save_loc)

In [33]:
#write the ID dataframe to DATA_DIR/<DATASET_ID>/ as a CSV file
save_loc = DATA_DIR.joinpath(DATASET_ID, 'ID_df_to_csv.csv')
ensure_dir(save_loc)
IDdf.to_csv(save_loc)