# Create dataset

**Imports**

In [25]:
import glob
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
import random
import time

**Generate a roughly balanced dataset of ~10,000 dives and a similar number of non-dives**

In [31]:
# Build a reduced, roughly balanced dive / non-dive dataset from every bird's
# npy stack, shuffle it, split it ~80/20 into train/test, and persist both
# splits as npy stacks and as header-less CSV files.
birds = glob.glob('../Data/Acc_npy/*/')

arrs = []

t = time.time()

for bird in birds:
    
    print(f'PROCESSING BIRD: {bird}...')
    
    # Load data (lazy dask array backed by the bird's npy stack)
    data = da.from_npy_stack(bird)
    
    # Sample dives: rows whose last column equals 1
    dive_ix = da.where(data[:,-1] == 1)[0].compute()
    pos = data[da.random.choice(dive_ix, random.randint(1000, 1250), replace=False)]

    # Sample non-dives: every row index that is not a dive index
    no_dive_ix = np.setdiff1d(np.arange(data.shape[0]), dive_ix)
    neg = data[da.random.choice(no_dive_ix, random.randint(1150, 1400), replace=False)]

    # Stack positives on top of negatives and add to list
    data_add = da.vstack((pos, neg))
    arrs.append(data_add)
    
print(f'\nTime elapsed: {time.time() - t}')
    
print('\nStacking and shuffling...')
data_full = da.vstack(arrs)  # stack
ix = np.random.choice(data_full.shape[0], data_full.shape[0], replace=False)
data_full = da.slicing.shuffle_slice(data_full, ix)  # shuffle    

# split into train/test
print('\rSplitting into train/test...')
# One uniform draw per row of the stacked array; roughly 80% of rows go to train.
# The mask is sized from data_full itself so the cell does not depend on a
# variable left over from another cell or an earlier kernel session.
msk = np.random.rand(data_full.shape[0]) < 0.8
train = data_full[msk]
test = data_full[~msk]

# Save in npy stack
print('\rWriting to npy stack...')
da.to_npy_stack('../Data/Reduced/npy_train/', train, axis=0)
da.to_npy_stack('../Data/Reduced/npy_test/', test, axis=0)

# Convert to df and write to csv
print('\rBringing into memory...')
df_train = train.to_dask_dataframe().compute()
df_test = test.to_dask_dataframe().compute()

# Cast the label (last column) to int by assigning through the column label.
# Positional assignment (`df.iloc[:, -1] = ...`) writes into the existing
# column in recent pandas and would keep its original dtype.
train_label = df_train.columns[-1]
test_label = df_test.columns[-1]
df_train[train_label] = df_train[train_label].astype(int)
df_test[test_label] = df_test[test_label].astype(int)

# write to csv
print('\rWriting to csv...')
df_train.to_csv('../Data/Reduced/train_reduced_dset.csv', header=False, index=False)
df_test.to_csv('../Data/Reduced/test_reduced_dset.csv', header=False, index=False)

PROCESSING BIRD: ../Data/Acc_npy/ch_gps07_gv37846_20190206_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps11_gv37849_20190206_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps10_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps12_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps16_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps03_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps13_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps08_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps09_S1/...

Time elapsed: 2311.46582698822

Stacking and shuffling...
Bringing into memory...
Writing to csv...


## With bird ID

In [26]:
# Build one shuffled table of sampled dive / non-dive rows across all birds,
# tagging every row with the bird it came from, and write it to a single CSV.
birds = glob.glob('../Data/Acc_npy/*/')
dfs = []
t = time.time()

for bird in birds:

    print(f'PROCESSING BIRD: {bird}...')

    # The glob pattern ends in '/', so the directory name is the
    # second-to-last '/'-separated piece of the path.
    idd = bird.split('/')[-2]

    # Lazy dask array backed by this bird's npy stack
    data = da.from_npy_stack(bird)

    # Dives: rows whose last column equals 1
    is_dive = data[:, -1] == 1
    dive_ix = da.where(is_dive)[0].compute()
    n_pos = random.randint(1000, 1250)
    pos = data[da.random.choice(dive_ix, n_pos, replace=False)]

    # Non-dives: every remaining row index
    every_ix = np.arange(data.shape[0])
    no_dive_ix = np.setdiff1d(every_ix, dive_ix)
    n_neg = random.randint(1150, 1400)
    neg = data[da.random.choice(no_dive_ix, n_neg, replace=False)]

    # Materialise the stacked sample as a pandas frame
    stacked = da.vstack((pos, neg))
    data_add = dd.from_dask_array(stacked).compute()

    # Label the last column, make it integer, and record the bird
    data_add.columns = list(data_add.columns[:-1]) + ['Dive']
    data_add = data_add.astype({'Dive': int})
    data_add['BirdID'] = idd

    dfs.append(data_add)

elapsed = time.time() - t
print(f'\nTime elapsed: {elapsed}')

print('\nStacking and shuffling...')
# Concatenate all birds, then permute the rows
data_full = pd.concat(dfs, ignore_index=True).sample(frac=1)

print('Writing to csv...')
data_full.to_csv(
    '../Data/Reduced/reduced_dset_wnames.csv',
    header=True,
    index=False,
)

PROCESSING BIRD: ../Data/Acc_npy/ch_gps07_gv37846_20190206_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps11_gv37849_20190206_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps10_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps12_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps16_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps03_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps13_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps08_S1/...
PROCESSING BIRD: ../Data/Acc_npy/ch_gps09_S1/...

Time elapsed: 2292.1076719760895

Stacking and shuffling...
Writing to csv...
