Seems relevant: https://gist.github.com/arunaugustine/5551446

In [None]:
import aifc
import numpy as np
import h5py
from matplotlib import pyplot as plt
%matplotlib notebook

# Exploring Data Format

In [None]:
# Path pieces for one clip from the right_whale folder of the small sample
# set; the cells below use it to explore the AIFF format.
small_folder = "small_data_sample/right_whale/"
small_file_1 = "train12.aiff"

In [None]:
# Open the sample clip for reading; the following cells query this handle.
# NOTE(review): the handle is never closed anywhere in this notebook.
small_aifc_1 = aifc.open(small_folder + small_file_1)

In [None]:
# Number of audio channels in the clip.
small_aifc_1.getnchannels()

In [None]:
# Sampling rate of the clip, in frames per second.
small_aifc_1.getframerate()

In [None]:
# Total number of audio frames in the clip.
small_aifc_1.getnframes()

In [None]:
# Read every frame of the clip in one call; the result is the raw sample bytes.
frames = small_aifc_1.readframes(small_aifc_1.getnframes())

In [None]:
# Decode the raw bytes as big-endian signed 16-bit samples, then convert to
# the native int16 layout. Naming the byte order in the dtype (">i2") gives
# the right values on any host, whereas decoding natively and byte-swapping
# is only correct on little-endian machines. np.frombuffer is also the
# supported replacement for np.fromstring's deprecated binary mode.
# assumes 2-byte samples (the clip's sample width is not checked) — TODO confirm
array = np.frombuffer(frames, dtype=">i2").astype(np.int16)
array

In [None]:
#array = np.fromstring(frames, 'float64').byteswap()
#array

In [None]:
# Raw waveform of the sample clip: amplitude against sample index.
fig, ax = plt.subplots()
ax.plot(array)

In [None]:
# Magnitude spectrum of the sample clip. rfft returns complex coefficients;
# handing them to plot() directly makes matplotlib keep only the real part
# (with a ComplexWarning), so plot the magnitude instead.
spectrum = np.fft.rfft(array)
plt.figure()
plt.plot(np.abs(spectrum))
# The x axis is the FFT bin index, not a frequency in Hz.
plt.xlabel('Frequency bin')
plt.ylabel('|FFT|')

In [None]:
# Paths of the hand-picked sample clips, built from the clip numbers.
# Right-whale folder: clips 12, 28, 6, 7 and 9 (order is preserved).
whale_files = [
    "small_data_sample/right_whale/train%d.aiff" % clip_number
    for clip_number in (12, 28, 6, 7, 9)
]
# No-right-whale folder: clips 1 through 5.
no_whale_files = [
    "small_data_sample/no_right_whale/train%d.aiff" % clip_number
    for clip_number in range(1, 6)
]

In [None]:
def _read_aiff_samples(path):
    """Return the samples of the AIFF file at ``path`` as a native int16 array.

    The file handle is closed before returning, including when reading fails.
    assumes 2-byte samples (the sample width is not checked) -- TODO confirm
    """
    aifc_file = aifc.open(path)
    try:
        frames = aifc_file.readframes(aifc_file.getnframes())
    finally:
        aifc_file.close()
    # Big-endian 16-bit decode, then native int16: correct on any host,
    # and avoids np.fromstring's deprecated binary mode.
    return np.frombuffer(frames, dtype=">i2").astype(np.int16)


# One entry per sample clip, in the same order as the file lists.
# The open handles are no longer kept around: each file is read and closed
# in one step.
whale_data = np.array([_read_aiff_samples(file_name) for file_name in whale_files])
no_whale_data = np.array([_read_aiff_samples(file_name) for file_name in no_whale_files])

In [None]:
# Display both decoded sample arrays built above.
whale_data, no_whale_data

In [None]:
# Overlay the magnitude spectra of the right-whale sample clips.
# np.abs is required because rfft is complex and matplotlib would otherwise
# plot only the real part. The loop variable is named `clip` so it does not
# overwrite the `array` variable from the single-clip exploration.
plt.figure()
for clip in whale_data:
    plt.plot(np.abs(np.fft.rfft(clip)), linewidth=0.5)
# The x axis is the FFT bin index, not a frequency in Hz.
plt.xlabel('Frequency bin')
plt.ylabel('|FFT|')
plt.title('Whale data')

In [None]:
# Overlay the magnitude spectra of the no-right-whale sample clips.
# np.abs is required because rfft is complex and matplotlib would otherwise
# plot only the real part. The loop variable is named `clip` so it does not
# overwrite the `array` variable from the single-clip exploration.
plt.figure()
for clip in no_whale_data:
    plt.plot(np.abs(np.fft.rfft(clip)), linewidth=0.5)
# The x axis is the FFT bin index, not a frequency in Hz.
plt.xlabel('Frequency bin')
plt.ylabel('|FFT|')
plt.title('No Whale data')

It's not entirely clear where the whale frequency content is, so let's not downsample.

# Processing the Real Data

In [None]:
import os

In [None]:
# Root of the full training set. Defaults to the original checkout location;
# set the DEEP_WHALE_DATA environment variable to point at another copy
# without editing the notebook.
_data_root = os.environ.get("DEEP_WHALE_DATA", "/home/sarah/github/deep-whale/data").rstrip("/")
# Trailing slash is required: file names are appended to this string directly.
training_files_folder = _data_root + "/train/"
#training_aiff_filenames = os.listdir(training_files_folder)
label_csv_filename = _data_root + "/train.csv"

In [None]:
# Load the labels csv
# dtype=None lets genfromtxt infer each column's type and skip_header=1 drops
# the first line. The loading loop below indexes each row as
# (file name, label).
labels_csv = np.genfromtxt(label_csv_filename, dtype=None, delimiter=',', skip_header=1)

In [None]:
# Load the training files as numpy arrays, with a one-hot label per clip:
# [0, 1] when the csv label is 0, [1, 0] otherwise.
training_numpy_arrays = []
training_labels = []
for filename_tuple in labels_csv:
    filename = filename_tuple[0]
    label = filename_tuple[1]
    # genfromtxt(dtype=None) may return byte strings; decode them so the name
    # can be appended to the (text) folder path.
    if isinstance(filename, bytes):
        filename = filename.decode()
    training_labels.append([0, 1] if label == 0 else [1, 0])

    aifc_file = aifc.open(training_files_folder + filename)
    try:
        frames = aifc_file.readframes(aifc_file.getnframes())
    finally:
        # Close each clip as soon as it has been read instead of relying on
        # garbage collection to release the handle.
        aifc_file.close()
    # Big-endian 16-bit decode, then native int16: correct on any host, and
    # avoids np.fromstring's deprecated binary mode.
    # assumes 2-byte samples (sample width is not checked) — TODO confirm
    training_numpy_arrays.append(np.frombuffer(frames, dtype=">i2").astype(np.int16))
training_numpy_arrays = np.array(training_numpy_arrays)
training_labels = np.array(training_labels)

In [None]:
# Display the one-hot labels, the decoded clips, and the clip array's shape.
training_labels, training_numpy_arrays, training_numpy_arrays.shape

### Save Numpy Timeseries Data

In [None]:
# now save the data
# Mode 'w-' creates the file and fails if it already exists, so re-running
# this cell will not overwrite an earlier export.
all_data_file = h5py.File("all_whale_training_30000samples_shape4000.hdf5", 'w-')

In [None]:
# Allocate the "data" dataset with the in-memory array's shape, stored as
# np.short (the same type the clips were decoded as).
all_data_file.create_dataset("data", dtype=np.short, shape=training_numpy_arrays.shape)

In [None]:
# Copy the whole clip array into the dataset in a single assignment.
all_data_file['data'][...] = training_numpy_arrays

In [None]:
# Allocate the "labels" dataset as integers ('i') and fill it with the
# one-hot label array.
all_data_file.create_dataset("labels", dtype='i', shape=training_labels.shape)
all_data_file['labels'][...] = training_labels

In [None]:
# Push buffered writes to disk, then release the file handle.
all_data_file.flush()
all_data_file.close()

## Fourier Transform the Data

In [None]:
# Real and imaginary components of the first training clip's spectrum,
# taken from a single transform.
first_spectrum = np.fft.rfft(training_numpy_arrays[0])
real_part, imag_part = first_spectrum.real, first_spectrum.imag

In [None]:
# Draw both spectral components of the first clip on shared axes.
fig, ax = plt.subplots()
for component, component_name in ((real_part, 'real'), (imag_part, 'imag')):
    ax.plot(component, label=component_name)
ax.legend()

In [None]:
# Lay the two components end to end (real first, then imaginary) and plot
# the resulting single vector.
concat_parts = np.hstack((real_part, imag_part))
fig, ax = plt.subplots()
ax.plot(concat_parts)

Prof Z says NOT to concatenate the real and imaginary parts end to end. Instead, keep them as two separate channels. (She said "like tuples")

In [None]:
# Stack the two components as rows; the transpose displayed next to it has
# one (real, imag) pair per frequency bin.
real_imag_array = np.vstack((real_part, imag_part))
real_imag_array, real_imag_array.T

In [None]:
# Transform every clip in one call: rfft along the last axis yields one
# complex spectrum per row, and stacking real/imag on a new trailing axis
# gives shape (n_clips, n_bins, 2), i.e. one (real, imag) pair per bin.
# Doing this without a Python loop also leaves the `real_part` / `imag_part`
# variables from the single-clip exploration untouched.
# assumes training_numpy_arrays is 2-D (all clips the same length) — TODO confirm
all_spectra = np.fft.rfft(training_numpy_arrays, axis=-1)
training_numpy_ffts = np.stack([all_spectra.real, all_spectra.imag], axis=-1)

In [None]:
# Check the shape of the stacked spectra before saving.
training_numpy_ffts.shape

### Save Numpy FFT Data

In [None]:
# Persist the spectra and their one-hot labels.
# Mode 'w-' creates the file and fails if it already exists, so a re-run does
# not overwrite an earlier export. The `with` block closes the file even if
# a write fails.
with h5py.File("all_whale_training_fft_30000samples_shape2001x2.hdf5", 'w-') as all_fft_data_file:
    # Store the spectra with their own floating-point dtype. FFT coefficients
    # are non-integer and can fall outside the 16-bit range, so an integer
    # (np.short) dataset would lose information on write.
    # NOTE(review): this makes the file larger than a 16-bit export.
    all_fft_data_file.create_dataset("data", data=training_numpy_ffts)
    all_fft_data_file.create_dataset("labels", data=training_labels, dtype='i')

## Split into training/validation/test

In [None]:
# Scratch arithmetic: how 30000 clips divide for a 20% test hold-out
# (non-test count, test count).
test_frac = 0.2
(1-test_frac) * 30000, test_frac * 30000

In [None]:
# Scratch arithmetic: how the 24000 non-test clips divide for a 20%
# validation share (training count, validation count).
val_frac = 0.2
(1-val_frac) * (24000.0), val_frac * (24000.0)

In [None]:
import aifc
import numpy as np
import h5py
from matplotlib import pyplot as plt
%matplotlib notebook

In [None]:
# Re-open the combined FFT export read-only; the cells below slice it into
# test / training / validation sets.
all_fft_data_file = h5py.File("all_whale_training_fft_30000samples_shape2001x2.hdf5", 'r')

In [None]:
# Inspect the stored labels dataset.
all_fft_data_file['labels']

In [None]:
# Collapse the one-hot labels to a flat list of 0/1 ints: 1 where the first
# column equals 1, else 0.
# The dataset is read once into memory; indexing it row by row would issue a
# separate HDF5 read for every label.
one_hot_labels = all_fft_data_file['labels'][...]
flat_labels = (one_hot_labels[:, 0] == 1).astype(int).tolist()


In [None]:
# Label value against row index, drawn with a very thin line so the 0/1
# flips stay visible across the whole dataset.
fig, ax = plt.subplots()
ax.plot(flat_labels, '-', linewidth=0.05)

In [None]:
#Seems... close enough to randomly distributed

In [None]:
# Cumulative end indices of the contiguous test | training | validation
# slices of the combined file.
_N_SAMPLES = 30000   # total rows being split
_TEST_FRAC = 0.2     # share of all rows held out for testing
_VAL_FRAC = 0.2      # share of the remaining (non-test) rows used for validation

# round() before int() so a float product landing just below a whole number
# cannot silently drop a row.
_n_test = int(round(_TEST_FRAC * _N_SAMPLES))
_n_val = int(round(_VAL_FRAC * (_N_SAMPLES - _n_test)))

# NOTE: the spellings below are kept because later cells use these names.
test_indicies = _n_test                  # test rows:       [0, 6000)
train_indicies = _N_SAMPLES - _n_val     # training rows:   [6000, 25200)
validataion_indicies = _N_SAMPLES        # validation rows: [25200, 30000)

In [None]:
# Display the three cumulative end indices computed above.
test_indicies, train_indicies, validataion_indicies

In [None]:
# Now split up the data!
# Each split is a contiguous row range of the combined file. Slicing a
# dataset reads the whole range in one HDF5 call and returns a numpy array,
# instead of fetching rows one at a time into Python lists.
fft_dataset = all_fft_data_file['data']
label_dataset = all_fft_data_file['labels']

# Test data: rows [0, test_indicies)
test_data = fft_dataset[:test_indicies]
test_labels = label_dataset[:test_indicies]

# Training data: rows [test_indicies, train_indicies)
training_data = fft_dataset[test_indicies:train_indicies]
training_labels = label_dataset[test_indicies:train_indicies]

# Validation data: rows [train_indicies, validataion_indicies)
validation_data = fft_dataset[train_indicies:validataion_indicies]
validation_labels = label_dataset[train_indicies:validataion_indicies]

In [None]:
# Confirm the shapes of the three data splits.
test_data.shape, training_data.shape, validation_data.shape

In [None]:
# Save the split data
# Mode 'w-' creates the file and fails if it already exists, so a re-run does
# not overwrite an earlier export. The `with` block closes the file even if
# a write fails.
with h5py.File("whale_training_fft_samples19200train4800val6000test_shape2001x2.hdf5", 'w-') as split_fft_data_file:
    # Writes "<split>_data" and "<split>_labels" for each split, in the
    # order training, validation, testing.
    for split_name, split_data, split_labels in (
        ("training", training_data, training_labels),
        ("validation", validation_data, validation_labels),
        ("testing", test_data, test_labels),
    ):
        # Keep the dtype the spectra were loaded with rather than forcing
        # them to 16-bit integers, so no precision is lost at this step.
        split_fft_data_file.create_dataset(split_name + "_data", data=split_data)
        split_fft_data_file.create_dataset(split_name + "_labels", data=split_labels, dtype='i')