# Import CSV & Generate JSON

This notebook creates JSON files containing the information in the dataframes & csvs created in the previous step, which has now been combined into one file. Following checking of the csv file for any duplications/errors in processing, some data columns are created containing information on the position of each segment in the order of segments within their respective call and combination, and for each call's position in the order of calls within their respective combination. Columns are also created for unique identifiers for each call and combination and study site for the caller. It is then loaded into this notebook.

In [1]:
DATASET_ID = "git_repos"

In [2]:
import pdb
from joblib import Parallel, delayed
from tqdm.auto import tqdm
import pandas as pd
import librosa
from datetime import datetime
import json
import numpy as np
from pathlib2 import Path

In [3]:
import avgn

In [4]:
from avgn.utils.paths import DATA_DIR

In [5]:
# create a unique datetime identifier for the files output by this notebook 
##(they will be saved in a folder with today's date/time of output creation)
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2022-09-10_09-41-48'

In [6]:
#Defining Path to Raw Dataset
RAW_DATASET_LOC = DATA_DIR/"raw"/"magpie"/"individuals"
RAW_DATASET_LOC

WindowsPath('C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-2/data/raw/magpie/individuals')

### Import CSV

In [7]:
vocaldf = pd.read_csv (DATA_DIR/DATASET_ID/'alldata.csv')
print (vocaldf)

      New_ID    indv   gro   location sex  \
0          0    MGGY  BWYa    CRAWLEY   F   
1          1    MGGY  BWYa    CRAWLEY   F   
2          2    MGGY  BWYa    CRAWLEY   F   
3          3    MGGY  BWYa    CRAWLEY   F   
4          4    MGGY  BWYa    CRAWLEY   F   
...      ...     ...   ...        ...  ..   
1642    1642  MXXGRY    SS  GUILDFORD   F   
1643    1643  MXXGRY    SS  GUILDFORD   F   
1644    1644  MXXGRY    SS  GUILDFORD   F   
1645    1645  MXXGRY    SS  GUILDFORD   F   
1646    1646  MXXGRY    SS  GUILDFORD   F   

                                                 wavloc  \
0     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
1     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
2     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
3     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
4     C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...   
...                                                 ...   
1642  C:\Users\slwal\anaconda3\envs\PY36\avgn_

In [8]:
##create dataframe
vocdf = pd.DataFrame(vocaldf)

In [9]:
vocdf[:3]

Unnamed: 0,New_ID,indv,gro,location,sex,wavloc,filename,seg_label,seg_start,seg_end,...,call_start,call_end,combi_label,combi_start,combi_end,call_unique_num,seg_pos_call,combi_unique_num,seg_pos_combi,call_pos_combi
0,0,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,DS,0.753604,0.776773,...,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,0,0,0,0
1,1,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,SH,0.786865,0.835165,...,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,1,0,1,0
2,2,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,DS,0.855941,0.92116,...,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,2,0,2,0


In [10]:
#### Need to take WAV location paths from the segment pickle (made in previous step) as they no longer work after 
##being imported from the csv
seg_df = pd.read_pickle(DATA_DIR / DATASET_ID / 'walsh_magpie_segment_dataframe.pickle')
seg_df[:3]

Unnamed: 0,New_ID,indv,sex,gro,filename,wavloc,tgloc,seg_pos_combi,seg_start,seg_end,seg_label
0,0,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,0,0.753604,0.776773,DS
1,1,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,1,0.786865,0.835165,SH
2,2,MGGY,F,BWYa,BWY MGGY Call Combo 1 290719 PM,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,2,0.855941,0.92116,DS


#### Some checks just to ensure dataframes are the same

In [11]:
#Set index to be the same in both dataframes, as per unique identifiers previously created for segments
seg_df.set_index('New_ID', inplace=True)

In [12]:
vocdf.set_index('New_ID', inplace=True)

In [13]:
#check number of rows are the same
len(seg_df)

1647

In [14]:
len(vocdf)

1647

In [15]:
#check number of unique files is the same in both dataframes
len(vocdf.filename.unique())

153

In [16]:
len(seg_df.filename.unique())

153

In [17]:
#check 
vocdf.wavloc[20]

'C:\\Users\\slwal\\anaconda3\\envs\\PY36\\avgn_paper-2\\data\\raw\\magpie\\individuals\\BWYa_F_MGGY\\BWY MGGY Discrete 6 210519 PM.WAV'

In [18]:
#check
vocaldf.wavloc[20]

'C:\\Users\\slwal\\anaconda3\\envs\\PY36\\avgn_paper-2\\data\\raw\\magpie\\individuals\\BWYa_F_MGGY\\BWY MGGY Discrete 6 210519 PM.WAV'

In [19]:
#check
seg_df.wavloc[20]

WindowsPath('C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-2/data/raw/magpie/individuals/BWYa_F_MGGY/BWY MGGY Discrete 6 210519 PM.WAV')

In [20]:
#check
vocaldf.seg_label[1600]

'SH'

In [21]:
#check
seg_df.seg_label[1600]

'SH'

In [22]:
#check 
vocdf.seg_label[1600]

'SH'

In [23]:
#replace wavloc column in vocdf to be wavloc df
vocdf.loc[:,['wavloc']] = seg_df[['wavloc']]

In [24]:
vocdf[:3]

Unnamed: 0_level_0,indv,gro,location,sex,wavloc,filename,seg_label,seg_start,seg_end,call_label,call_start,call_end,combi_label,combi_start,combi_end,call_unique_num,seg_pos_call,combi_unique_num,seg_pos_combi,call_pos_combi
New_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,DS,0.753604,0.776773,DSSHDS,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,0,0,0,0
1,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,SH,0.786865,0.835165,DSSHDS,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,1,0,1,0
2,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,DS,0.855941,0.92116,DSSHDS,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,2,0,2,0


In [25]:
#convert some columns into integers 
vocdf[["seg_pos_combi", "seg_pos_call", "call_pos_combi", "combi_unique_num", "call_unique_num"]] = vocdf[["seg_pos_combi", "seg_pos_call", "call_pos_combi", "combi_unique_num", "call_unique_num"]].apply(pd.to_numeric)

In [26]:
vocdf[:3]

Unnamed: 0_level_0,indv,gro,location,sex,wavloc,filename,seg_label,seg_start,seg_end,call_label,call_start,call_end,combi_label,combi_start,combi_end,call_unique_num,seg_pos_call,combi_unique_num,seg_pos_combi,call_pos_combi
New_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,DS,0.753604,0.776773,DSSHDS,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,0,0,0,0
1,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,SH,0.786865,0.835165,DSSHDS,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,1,0,1,0
2,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,DS,0.855941,0.92116,DSSHDS,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,2,0,2,0


In [27]:
vocdf.reset_index(inplace=True)

In [28]:
vocdf[:3]

Unnamed: 0,New_ID,indv,gro,location,sex,wavloc,filename,seg_label,seg_start,seg_end,...,call_start,call_end,combi_label,combi_start,combi_end,call_unique_num,seg_pos_call,combi_unique_num,seg_pos_combi,call_pos_combi
0,0,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,DS,0.753604,0.776773,...,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,0,0,0,0
1,1,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,SH,0.786865,0.835165,...,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,1,0,1,0
2,2,MGGY,BWYa,CRAWLEY,F,C:\Users\slwal\anaconda3\envs\PY36\avgn_paper-...,BWY MGGY Call Combo 1 290719 PM,DS,0.855941,0.92116,...,0.753604,0.92116,DSSHDS SHSHLH,0.753604,1.36713,0,2,0,2,0


In [29]:
from avgn.utils.paths import ensure_dir

In [30]:
#save dataframe as pickle
save_loc = DATA_DIR / DATASET_ID / 'walsh_magpie_combined_dataframe.pickle'
ensure_dir(save_loc)
vocdf.to_pickle(save_loc)

# Generate JSON from dataframe

In [31]:
import avgn
from avgn.custom_parsing.magpie_for_repos import (
    gen_wav_json
)
from avgn.utils.paths import DATA_DIR

In [32]:
savewav=False # don't want to create new WAV files, just reference original ones

In [33]:
with Parallel(n_jobs=-1, verbose=10) as parallel:
    parallel(
        delayed(gen_wav_json)(
            wf,
            vocdf[vocdf.wavloc==wf],
            DT_ID)
        for wf in tqdm(vocdf.wavloc.unique())
    );

  0%|          | 0/153 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1847s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0986s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 146 out of 153 | elapsed:    4.9s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 153 out of 153 | elapsed:   

### Check it worked

In [34]:
from avgn.utils.hparams import HParams
from avgn.dataset import DataSet

In [35]:
# create a set of hyperparameters for processing this dataset.  
hparams = HParams(
    num_mel_bins = 32,
    mel_lower_edge_hertz=400,
    mel_upper_edge_hertz=15000,
    butter_lowcut = 400,
    butter_highcut = 15000,
    ref_level_db = 20,
    min_level_db = -30,
    mask_spec = True,
    win_length_ms = 10,
    hop_length_ms = 2,
    nex=-1,
    n_jobs=-1,
    verbosity = 1,
)

In [36]:
# create a dataset object, which
dataset = DataSet(DATASET_ID, hparams = hparams)

loading json:   0%|          | 0/152 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 152 out of 152 | elapsed:    0.0s finished


getting unique individuals:   0%|          | 0/152 [00:00<?, ?it/s]

In [37]:
# to make sure everything loaded correctly, lets look at a sample JSON
print(json.dumps(dataset.sample_json, indent=4, default=str)[0:15000] + '...')

{
    "filename": "BWY MGGY Call Combo 1 290719 PM",
    "samplerate_hz": 44100,
    "sex": "F",
    "group": "BWYa",
    "location": "CRAWLEY",
    "wav_loc": "C:/Users/slwal/anaconda3/envs/PY36/avgn_paper-2/data/raw/magpie/individuals/BWYa_F_MGGY/BWY MGGY Call Combo 1 290719 PM.WAV",
    "length_s": 2.1197732426303855,
    "species": "Gymnorhina tibicen dorsalis",
    "common_name": "Western Australian magpie",
    "indvs": {
        "MGGY": {
            "call": {
                "labels": [
                    "DSSHDS",
                    "DSSHDS",
                    "DSSHDS",
                    "SHSHLH",
                    "SHSHLH",
                    "SHSHLH"
                ],
                "start_times": [
                    0.753604333,
                    0.753604333,
                    0.753604333,
                    0.932017365,
                    0.932017365,
                    0.932017365
                ],
                "end_times": [
                    0.