Seems relevant: https://gist.github.com/arunaugustine/5551446

In [None]:
import aifc
import numpy as np
import h5py
from matplotlib import pyplot as plt
import os
%matplotlib notebook

## Examine distribution of data

In [None]:
# Open the two source archives read-only: one holds the FFT representation,
# the other the raw time series (as indicated by the file names).
fft_archive_path = "all_whale_training_fft_30000samples_shape2001x2.hdf5"
timeseries_archive_path = "all_whale_training_30000samples_shape4000.hdf5"

all_fft_data_file = h5py.File(fft_archive_path, mode='r')
all_timeseries_data_file = h5py.File(timeseries_archive_path, mode='r')

In [None]:
# Show the 'labels' dataset handle next to its contents loaded into a NumPy array.
all_fft_data_file['labels'], np.array(all_fft_data_file['labels'])

The data need to be randomly reordered before splitting.
This ensures that each split has about the same ratio of signal to no-signal samples.

In [None]:
# List the top-level keys stored in each of the two HDF5 files.
all_fft_data_file.keys(), all_timeseries_data_file.keys()

In [None]:
# Are all the labels in the same order?
# np.array_equal compares shape as well as values, so two label sets of different
# length give False instead of being broadcast against each other or raising.
np.array_equal(
    np.array(all_fft_data_file['labels']),
    np.array(all_timeseries_data_file['labels']),
)

In [None]:
# Ok, so we can combine these.
# Read each dataset into memory once; indexing an h5py dataset inside the loop
# would issue a separate read for every single sample.
timeseries_samples = all_timeseries_data_file['data'][...]
fft_samples = all_fft_data_file['data'][...]
sample_labels = all_timeseries_data_file['labels'][...]

# One row per sample: [time series, FFT, label]. The three entries are separate
# arrays (the file names suggest shapes 4000 and 2001x2 - TODO confirm), so the
# container is allocated explicitly with dtype=object. Calling np.array() on such
# ragged rows is deprecated and rejected by newer NumPy releases.
n_samples = len(all_fft_data_file['labels'])
timeseries_fft_labels = np.empty((n_samples, 3), dtype=object)
for i in range(n_samples):
    timeseries_fft_labels[i, 0] = timeseries_samples[i]
    timeseries_fft_labels[i, 1] = fft_samples[i]
    timeseries_fft_labels[i, 2] = sample_labels[i]

In [None]:
# Display the shape of the combined array.
timeseries_fft_labels.shape

In [None]:
# Reduce every label to a 0/1 flag (1 when its first entry equals 1) and plot the
# flags in their current row order.
flat_labels = [
    1 if label[0] == 1 else 0
    for label in timeseries_fft_labels.T[2]
]
plt.figure()
plt.plot(flat_labels, linewidth=0.05)
plt.title('Label distribution before shuffle')

In [None]:
# Shuffle the rows in place with a fixed seed so that the train/validation/test
# split can be reproduced after a kernel restart. A dedicated RandomState is used
# so NumPy's global random stream is left untouched. The seed value mirrors the
# "gen20171129" tag used in the output file names.
np.random.RandomState(20171129).shuffle(timeseries_fft_labels)

In [None]:
# Rebuild the 0/1 flags (1 when a label's first entry equals 1) from the current
# row order and plot them again for comparison.
label_column = timeseries_fft_labels.T[2]
flat_labels = [int(label_column[row][0] == 1) for row in range(len(label_column))]
plt.figure()
plt.plot(flat_labels, linewidth=0.05)
plt.title('Label distribution after shuffle')

Very good! It's much more evenly distributed now.

## Split into training/validation/test

In [None]:
# Cumulative end offsets used to slice the samples into consecutive ranges:
# rows [0, test_indicies) are test, [test_indicies, train_indicies) are training,
# and [train_indicies, validataion_indicies) are validation.
_n_total = 30000
_n_test = int(0.2 * _n_total)                  # 6000
_n_train = int(0.8 * (0.8 * _n_total))         # 19200
_n_validation = int(0.8 * (0.2 * _n_total))    # 4800

test_indicies = _n_test
train_indicies = test_indicies + _n_train
validataion_indicies = train_indicies + _n_validation

In [None]:
# Display the three cumulative end offsets computed for the split.
test_indicies, train_indicies, validataion_indicies

### Split up the timeseries data

In [None]:
# For each consecutive row range, collect column 0 (the time-series entry) as the
# data and column 2 as the labels, stacking the per-sample entries into arrays.

# Test data: rows [0, test_indicies)
test_data = np.array(
    [timeseries_fft_labels[row][0] for row in range(test_indicies)])
test_labels = np.array(
    [timeseries_fft_labels[row][2] for row in range(test_indicies)])

# Training data: rows [test_indicies, train_indicies)
training_data = np.array(
    [timeseries_fft_labels[row][0] for row in range(test_indicies, train_indicies)])
training_labels = np.array(
    [timeseries_fft_labels[row][2] for row in range(test_indicies, train_indicies)])

# Validation data: rows [train_indicies, validataion_indicies)
validation_data = np.array(
    [timeseries_fft_labels[row][0] for row in range(train_indicies, validataion_indicies)])
validation_labels = np.array(
    [timeseries_fft_labels[row][2] for row in range(train_indicies, validataion_indicies)])

In [None]:
# Display the shapes of the three time-series data splits.
test_data.shape, training_data.shape, validation_data.shape

In [None]:
# Inspect the type of a single value from the first test sample; the save step
# below stores this data with dtype np.short.
type(test_data[0][9])

In [None]:
# Save the split data

# (dataset name, on-disk dtype, array to store), written in this order.
timeseries_datasets = [
    ("training_data", np.short, training_data),
    ("training_labels", 'i', training_labels),
    ("validation_data", np.short, validation_data),
    ("validation_labels", 'i', validation_labels),
    ("testing_data", np.short, test_data),
    ("testing_labels", 'i', test_labels),
]

# Mode 'w-' refuses to overwrite an existing file. The context manager guarantees
# the handle is closed even if one of the writes raises, so a failed run does not
# leave the file open.
with h5py.File(
        "whale_training_timeseries_samples19200train4800val6000test_shape4000_gen20171129.hdf5",
        'w-') as split_data_file:
    for dataset_name, dataset_dtype, dataset_values in timeseries_datasets:
        split_data_file.create_dataset(
            dataset_name, dtype=dataset_dtype, shape=dataset_values.shape)
        split_data_file[dataset_name][...] = dataset_values
    split_data_file.flush()

### Split up the fft data

In [None]:
# For each consecutive row range, collect column 1 (the FFT entry) as the data
# and column 2 as the labels, stacking the per-sample entries into arrays.

# Test data: rows [0, test_indicies)
fft_test_data = np.array(
    [timeseries_fft_labels[row][1] for row in range(test_indicies)])
fft_test_labels = np.array(
    [timeseries_fft_labels[row][2] for row in range(test_indicies)])

# Training data: rows [test_indicies, train_indicies)
fft_training_data = np.array(
    [timeseries_fft_labels[row][1] for row in range(test_indicies, train_indicies)])
fft_training_labels = np.array(
    [timeseries_fft_labels[row][2] for row in range(test_indicies, train_indicies)])

# Validation data: rows [train_indicies, validataion_indicies)
fft_validation_data = np.array(
    [timeseries_fft_labels[row][1] for row in range(train_indicies, validataion_indicies)])
fft_validation_labels = np.array(
    [timeseries_fft_labels[row][2] for row in range(train_indicies, validataion_indicies)])

In [None]:
# Display the shapes of the three FFT data splits.
fft_test_data.shape, fft_training_data.shape, fft_validation_data.shape

In [None]:
# Inspect the type of a single value from the first FFT test sample.
# NOTE(review): the save step below stores FFT data as np.short - confirm this
# type is an integer type, otherwise values are truncated on write.
type(fft_test_data[0][9][0])

In [None]:
# Save the split data

# (dataset name, on-disk dtype, array to store), written in this order.
# NOTE(review): the FFT datasets were previously forced to np.short, which looks
# copied from the time-series save cell. They now keep the dtype of the in-memory
# array so that non-integer FFT values are not truncated on write. If the source
# data is already int16 the result is unchanged - confirm against the source file.
fft_datasets = [
    ("training_data", fft_training_data.dtype, fft_training_data),
    ("training_labels", 'i', fft_training_labels),
    ("validation_data", fft_validation_data.dtype, fft_validation_data),
    ("validation_labels", 'i', fft_validation_labels),
    ("testing_data", fft_test_data.dtype, fft_test_data),
    ("testing_labels", 'i', fft_test_labels),
]

# Mode 'w-' refuses to overwrite an existing file. The context manager guarantees
# the handle is closed even if one of the writes raises.
with h5py.File(
        "whale_training_fft_samples19200train4800val6000test_shape2001x2_gen20171129.hdf5",
        'w-') as split_fft_data_file:
    for dataset_name, dataset_dtype, dataset_values in fft_datasets:
        split_fft_data_file.create_dataset(
            dataset_name, dtype=dataset_dtype, shape=dataset_values.shape)
        split_fft_data_file[dataset_name][...] = dataset_values
    split_fft_data_file.flush()