In [None]:
import time

import numpy as np
import pandas as pd

from preprocessing.preprocessing import build_dataset

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook session. NOTE(review): this hides the warning globally rather
# than fixing the assignments that trigger it -- confirm it is still needed.
pd.options.mode.chained_assignment = None

In [None]:
def detect_group_leakage(df, group_name, split_name):
    """Tell whether any group is spread over more than one split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the grouping column and the split column.
    group_name : str
        Column whose values must each stay inside a single split.
    split_name : str
        Column holding the split assignment of each row.

    Returns
    -------
    bool
        True if at least one value of ``group_name`` is associated with more
        than one distinct value of ``split_name``, otherwise False.
    """
    # Number of distinct split values seen for every group value.
    splits_per_group = df.groupby([group_name])[split_name].nunique()
    spans_several_splits = splits_per_group > 1
    return bool(spans_several_splits.any())

def plot_split_size(df, split_name, labels_names):
    """Show how each label's total is distributed across the splits.

    The label columns are summed within every split and each sum is divided
    by that label's total over all splits, so the values of one label add up
    to 1 across splits. The result, rounded to two decimals, is rendered as
    a table; nothing is returned and no figure is drawn despite the name.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the split column and the label columns.
    split_name : str
        Column identifying the split of each row.
    labels_names : list of str
        Label columns to aggregate.

    Notes
    -----
    ``display`` is not imported here; it is expected to be supplied by the
    IPython/Jupyter session.
    """
    totals_per_split = df.groupby([split_name])[labels_names].sum()
    share_of_label = totals_per_split / totals_per_split.sum()
    display(share_of_label.round(2))

In [None]:
# Multiclass dataset built from two sites.
# --- settings forwarded to build_dataset ---
wl = 3                # presumably the window length -- TODO confirm unit
sr = 22050            # passed on as target_sr
flims = (1, 10000)    # presumably (low, high) frequency limits -- TODO confirm
site_list = ['INCT20955', 'INCT41']
path_save = 'data/BuildDataset/datasetv2-multiclass_1'
labels_cols = [
    'BOAFAB_F', 'BOAFAB_M', 'PHYCUV_F', 'PHYCUV_M', 'BOAFAB_C', 'DENCRU_M',
    'BOALUN_M', 'BOAALB_M', 'BOAALB_F', 'DENCRU_F', 'BOALUN_F', 'PHYMAR_F',
    'PHYMAR_M', 'PITAZU_M', 'PITAZU_F', 'BOALUN_C', 'PHYMAR_C',
]

# --- build the dataset and report the wall-clock time in minutes ---
start = time.time()
df_multiclass = build_dataset(
    wl=wl,
    site_list=site_list,
    target_sr=sr,
    flims=flims,
    path_save=path_save,
    labels_cols=labels_cols,
)
end = time.time()
elapsed_minutes = round((end - start) / 60, 1)
print(elapsed_minutes, ' minutes')

In [None]:
# Work on a copy so the frame returned by build_dataset is left untouched.
df_dataset_concat = df_multiclass.copy()

In [None]:
# (rows, columns) of the dataset table.
df_dataset_concat.shape

In [None]:
# Fraction of rows assigned to each fold (normalize=True yields proportions).
df_dataset_concat['fold'].value_counts(normalize=True)

In [None]:
# Row counts for every (subset, fold) combination.
pd.crosstab(df_dataset_concat['subset'], df_dataset_concat['fold'])

In [None]:
# Check whether any file name ('fname') appears in more than one fold.
has_group_leakage = detect_group_leakage(
    df=df_dataset_concat,
    group_name='fname',
    split_name='fold',
)
print('Group Leakage:', has_group_leakage)

In [None]:
# Table of row counts per class (rows) and fold (columns), extended with a
# 'Label count' column (total rows per class) and a 'Fold count' row (total
# rows per fold).
#
# A constant column of ones is summed so that every pivot cell holds the number
# of rows for that (class, fold) pair; pairs that never occur are left as NaN.
class_fold_rows = df_dataset_concat.copy()
class_fold_rows['dummy'] = 1

# aggfunc is given as the string 'sum': passing the np.sum callable is
# deprecated in recent pandas releases and emits a FutureWarning.
df_dataset_concat_D = class_fold_rows.pivot_table(
    index='class', columns='fold', values='dummy', aggfunc='sum'
)

# Right-hand margin: total number of rows per class.
label_totals = df_dataset_concat['class'].value_counts().rename('Label count')
df_dataset_concat_D = pd.concat([df_dataset_concat_D, label_totals], axis=1)

# Bottom margin: total number of rows per fold, appended as a single row.
fold_totals = df_dataset_concat['fold'].value_counts(normalize=False).rename('Fold count')
df_dataset_concat_D = pd.concat([df_dataset_concat_D, pd.DataFrame(fold_totals).T])

df_dataset_concat_D

In [None]:
# Number of rows per class label, most frequent first.
df_dataset_concat['class'].value_counts()

In [None]:
# Preview the first rows of the dataset table.
df_dataset_concat.head()

In [None]:
# Binary BOAFAB dataset (site INCT20955).
# NOTE(review): this call passes wav_path / annotation_path / site / prefix /
# verbose, whereas the first multiclass cell passes site_list -- confirm which
# build_dataset signature is current.

# --- signal settings ---
wl = 3
sr = 22050             # passed on as target_sr
flims = (100, 2000)
verbose = False        # author was unsure whether "verbose" is the right name

# --- input and output locations ---
recordings_folder = 'data/INCT20955/raw/recordings/'
annotation_path = 'data/INCT20955/raw/annotations/'
path_save = 'data/datasetv1/binary_boafab_1'
prefix = 'SAMPLE_'
site = ['INCT20955']

# Two label columns are listed although the dataset is binary; presumably
# build_dataset merges them into one positive class -- TODO confirm.
labels_cols = ['BOAFAB_M', 'BOAFAB_C']

# --- build the dataset and report the wall-clock time in minutes ---
start = time.time()
df_boafab_binary_dataset = build_dataset(
    wav_path=recordings_folder,
    annotation_path=annotation_path,
    wl=wl,
    site=site,
    target_sr=sr,
    flims=flims,
    path_save=path_save,
    prefix=prefix,
    labels_cols=labels_cols,
    verbose=verbose,
)
end = time.time()
elapsed_minutes = round((end - start) / 60, 1)
print(elapsed_minutes, ' minutes')

In [None]:
# Binary PHYCUV dataset (site INCT20955).
# NOTE(review): this call passes wav_path / annotation_path / site / prefix /
# verbose, whereas the first multiclass cell passes site_list -- confirm which
# build_dataset signature is current.

# --- signal settings ---
wl = 3
sr = 22050             # passed on as target_sr
flims = (300, 2000)
verbose = True

# --- input and output locations ---
recordings_folder = 'data/INCT20955/raw/recordings/'
annotation_path = 'data/INCT20955/raw/annotations/'
path_save = 'data/datasetv1/binary_phycuv_1'
prefix = 'SAMPLE_'
site = ['INCT20955']

# Single label column used as the target.
labels_cols = ['PHYCUV_M']

# --- build the dataset and report the wall-clock time in minutes ---
start = time.time()
df_phycuv_binary_dataset = build_dataset(
    wav_path=recordings_folder,
    annotation_path=annotation_path,
    wl=wl,
    site=site,
    target_sr=sr,
    flims=flims,
    path_save=path_save,
    prefix=prefix,
    labels_cols=labels_cols,
    verbose=verbose,
)
end = time.time()
elapsed_minutes = round((end - start) / 60, 1)
print(elapsed_minutes, ' minutes')

In [None]:
# Multiclass dataset (BOAFAB_M / BOAFAB_C / PHYCUV_M), datasetv2 layout.
# NOTE(review): this call passes wav_path / annotation_path / site / prefix /
# verbose, whereas the first multiclass cell passes site_list -- confirm which
# build_dataset signature is current.
wl = 3
sr = 22050  # passed on as target_sr
flims = (1, 10000)
verbose = False  # author was unsure whether "verbose" is the right name
# NOTE(review): both paths point at INCT41 only, although `site` lists two
# sites -- confirm how build_dataset combines them.
recordings_folder = 'data/INCT41/raw/recordings/'
annotation_path = 'data/INCT41/raw/annotations/'
path_save = 'data/datasetv2/multiclass_1'
prefix = 'SAMPLE_'
# NOTE(review): max_duration is defined but never handed to build_dataset
# below; kept because other cells may read it -- confirm whether it should be
# passed on.
max_duration = 60
labels_cols = ['BOAFAB_M', 'BOAFAB_C', 'PHYCUV_M']
# Fixed site id: was 'INCT4', which does not match the 'INCT41' identifier used
# by the data paths above and by the other multiclass cell.
site = ['INCT20955', 'INCT41']

# Build the dataset and report the wall-clock time in minutes.
start = time.time()
df_boafab_multiclass_1 = build_dataset(
    wav_path=recordings_folder,
    annotation_path=annotation_path,
    wl=wl,
    site=site,
    target_sr=sr,
    flims=flims,
    path_save=path_save,
    prefix=prefix,
    labels_cols=labels_cols,
    verbose=verbose,
)
end = time.time()
print(round((end - start)/60,1),' minutes')