# Data Set

The aim is to create the dataset and split it into a test set and a training set.

https://cs231n.github.io/neural-networks-2/:
An important point to make about the preprocessing is that
any preprocessing statistics (e.g. the data mean)
must only be computed on the training data.

The data is imbalanced, so the training set is oversampled with
imbalanced-learn: http://contrib.scikit-learn.org/imbalanced-learn/index.html.

In [15]:
import h5py
import numpy as np
from sklearn.model_selection import train_test_split
import imblearn.over_sampling
import matplotlib.pyplot as plt
%matplotlib nbagg

In [2]:
# Open the HDF5 store explicitly in append mode ('a': read/write, create if
# missing). Later cells delete and create datasets in this file, so it must be
# writable; without a mode argument h5py >= 3.0 opens the file read-only.
f = h5py.File('data.hdf5', 'a')
# Handles to the 'resampled' group and its 'labels' and 'fluxes' entries.
spectra = f['resampled']
labels = spectra['labels']
fluxes = spectra['fluxes']

In [3]:
# Structure of the classes, including the not-labeled data (label -1):
# print one line per distinct label with the number of samples carrying it.
classes, counts = np.unique(labels, return_counts=True)
row_template = 'label {:2}: {:7} samples'
for class_id, n_found in zip(classes, counts):
    print(row_template.format(class_id, n_found))

label -1:   11487 samples
label  0:     176 samples
label  1:     172 samples
label  2:    1186 samples
label  3:      56 samples
label  4:     132 samples


In [4]:
# Boolean mask of the labeled samples: everything whose label is not -1.
labeled_idx = np.not_equal(labels[:], -1)
# Apply the same mask to the labels and to the rows of the flux matrix.
y = labels[labeled_idx]
X = fluxes[labeled_idx, :]
# Display the shapes and dtypes of the selected data as a sanity check.
X.shape, X.dtype, y.shape, y.dtype

((1722, 4000), dtype('>f8'), (1722,), dtype('int32'))

## Testing set

Split off the testing set:
35 samples from each class.
This gives roughly a 10/90 percent test/train split.

In [5]:
# Sizes of the split: a fixed number of test samples is held out per class,
# everything else goes to the training set.
n_classes = 5
n_test = 35
n_samples, n_features = X.shape
n_test_set = n_test * n_classes
n_train_set = n_samples - n_test_set

# Report both parts as a percentage of all labeled samples.
test_percent = n_test_set / n_samples * 100
train_percent = n_train_set / n_samples * 100
print('test set: {:.2f}%\ntrain set: {:.2f}%'.format(test_percent, train_percent))

test set: 10.16%
train set: 89.84%


In [6]:
# (Re)create the four datasets that will hold the split. A dataset is removed
# only if it already exists, so this cell also runs on a file that has never
# been split before (an unconditional `del f[name]` raises KeyError there).
for name in ('X_train', 'y_train', 'X_test', 'y_test'):
    if name in f:
        del f[name]

# Allocate empty datasets with the final shapes; they are filled class by
# class afterwards. dtypes are taken from the in-memory X and y.
X_train = f.create_dataset('X_train', (n_train_set, n_features), dtype=X.dtype)
y_train = f.create_dataset('y_train', (n_train_set,), dtype=y.dtype)
X_test = f.create_dataset('X_test', (n_test_set, n_features), dtype=X.dtype)
y_test = f.create_dataset('y_test', (n_test_set,), dtype=y.dtype)
X_train, y_train, X_test, y_test

(<HDF5 dataset "X_train": shape (1547, 4000), type ">f8">,
 <HDF5 dataset "y_train": shape (1547,), type "<i4">,
 <HDF5 dataset "X_test": shape (175, 4000), type ">f8">,
 <HDF5 dataset "y_test": shape (175,), type "<i4">)

In [7]:
# Fill the train/test datasets one class at a time: from every class exactly
# n_test samples go to the test set and the remainder to the training set.
# train_idx / test_idx are the write offsets into the respective datasets.
train_idx = 0
test_idx = 0
for label in range(n_classes):
    idx = y == label
    # Count the class from its own mask instead of reading a positionally
    # aligned entry of a global `counts` array: that array also contains the
    # unlabeled class and is rebound later in the notebook, so relying on it
    # breaks when this cell is re-run.
    count = np.count_nonzero(idx)
    n_train = count - n_test
    (X_train[train_idx:train_idx + n_train],
     X_test[test_idx:test_idx + n_test],
     y_train[train_idx:train_idx + n_train],
     y_test[test_idx:test_idx + n_test]) = train_test_split(
        X[idx], y[idx], test_size=n_test, random_state=0
    )
    train_idx += n_train
    test_idx += n_test

# Every row of both datasets must have been written exactly once.
assert train_idx == n_train_set, 'training set not completely filled'
assert test_idx == n_test_set, 'testing set not completely filled'

In [8]:
# Sanity check: which labels ended up in the test set, and how many of each.
test_labels, test_counts = np.unique(y_test, return_counts=True)
test_labels, test_counts

(array([0, 1, 2, 3, 4], dtype=int32), array([35, 35, 35, 35, 35]))

## Balancing Training Set

https://arxiv.org/pdf/1609.06570v1.pdf

In [9]:
# Read the whole training split into memory; these copies are the starting
# point for balancing.
X_balanced, y_balanced = X_train[:], y_train[:]
# Per-class sample counts of the (still imbalanced) training set.
_, counts = np.unique(y_train, return_counts=True)
counts

array([ 141,  137, 1151,   21,   97])

In [10]:
# call four times because each time only one class is balanced
for i in range(n_classes - 1):
    smote = imblearn.over_sampling.SMOTE()
    X_balanced, y_balanced = smote.fit_sample(X_balanced, y_balanced)



In [11]:
# Sanity check: labels present after balancing and the sample count of each.
balanced_labels, balanced_counts = np.unique(y_balanced, return_counts=True)
balanced_labels, balanced_counts

(array([0, 1, 2, 3, 4], dtype=int32), array([1151, 1151, 1151, 1151, 1151]))

In [16]:
# Visual check of one class of the balanced training set. The labels only run
# from 0 to n_classes - 1, so selecting `y_balanced == 5` matched no sample
# and produced an empty figure. Label 3 is plotted instead: it had the fewest
# original training samples, so it contains the most synthetic spectra.
plot_label = 3
fig, ax = plt.subplots()
for x in X_balanced[y_balanced == plot_label]:
    ax.plot(x)
ax.set_title('balanced training set, label {}'.format(plot_label))
ax.set_xlabel('feature index')
ax.set_ylabel('flux');

In [13]:
# Store the balanced training set in the HDF5 file. Existing datasets are
# replaced; the existence check keeps the cell working on a file that does not
# contain them yet (an unconditional `del f[name]` raises KeyError there).
for name in ('X_balanced', 'y_balanced'):
    if name in f:
        del f[name]

# Passing `data=` creates each dataset with the array's shape and dtype and
# writes the values in one step.
h5_X_balanced = f.create_dataset('X_balanced', data=X_balanced)
h5_y_balanced = f.create_dataset('y_balanced', data=y_balanced)

In [14]:
# Close the HDF5 file; the dataset handles obtained from `f` must not be used
# after this point.
f.close()