# Data Set

The aim is to create a dataset and split it into a test set and a training set.

https://cs231n.github.io/neural-networks-2/:
An important point to make about the preprocessing is that
any preprocessing statistics (e.g. the data mean)
must only be computed on the training data.

The data is imbalanced, so the training set is balanced
with imbalanced-learn: http://contrib.scikit-learn.org/imbalanced-learn/index.html.

In [1]:
# load the resampled data with labels
import os
import pickle

data_filename = 'ondrejov-halpha-labeled-resampled.pickle'
data_path = os.path.join('data', data_filename)
# NOTE: unpickling can execute arbitrary code, so only load files
# from a trusted source.
with open(data_path, 'rb') as pickle_file:
    spectra = pickle.load(pickle_file)

In [2]:
import numpy as np

In [3]:
# structure of the classes, including the data that is not labeled
all_labels = [spectrum['label'] for spectrum in spectra.values()]
# data_set_struc is a pair: (sorted unique labels, number of examples of each)
data_set_struc = np.unique(all_labels, return_counts=True)
row_template = 'label {:2}:\texamples: {:7}'
for label, count in zip(*data_set_struc):
    print(row_template.format(label, count))

label -1:	examples:   11486
label  0:	examples:     176
label  1:	examples:     172
label  2:	examples:    1186
label  3:	examples:      56
label  4:	examples:     132


In [4]:
# keep only the labeled spectra; the label -1 marks data without a label
data_set = {}
for ident, data in spectra.items():
    if data['label'] == -1:
        continue
    data_set[ident] = data

In [5]:
# build the data matrix X (one row per spectrum) and the label vector y;
# both passes walk data_set in the same order, so rows and labels line up
X_list = [data['resampled_flux'] for data in data_set.values()]
y_list = [data['label'] for data in data_set.values()]

X = np.array(X_list)
y = np.array(y_list)
X.shape, y.shape

((1722, 4000), (1722,))

## Testing set

Split off the testing set by taking 35 samples from each class.
This gives roughly a 10/90 percent test/train split.

In [6]:
# the test set takes n_test examples from each of the n_classes classes
n_test = 35
n_classes = 5
n_test_set = n_test * n_classes
total_samples = len(y)

# share of all labeled examples that ends up in each set, in percent
test_percent = n_test_set / total_samples * 100
train_percent = (total_samples - n_test_set) / total_samples * 100
print('test set: {:.2f}%\ntrain set: {:.2f}%'.format(test_percent, train_percent))

test set: 10.16%
train set: 89.84%


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out exactly n_test examples of every class so that the test set is
# balanced. The classes are read from the labels actually present in y
# rather than from a hard-coded range(5), so the loop cannot silently skip
# or miss a class if the label set changes.
X_train_parts, y_train_parts = [], []
X_test_parts, y_test_parts = [], []
for label in np.unique(y):
    in_class = y == label  # boolean mask selecting the examples of this class
    X_tr, X_te, y_tr, y_te = train_test_split(
        X[in_class], y[in_class],
        test_size=n_test,  # an integer test_size is an absolute number of examples
        random_state=0     # fixed seed keeps the split reproducible
    )
    X_test_parts.append(X_te)
    X_train_parts.append(X_tr)
    y_test_parts.append(y_te)
    y_train_parts.append(y_tr)

# stack the per-class pieces back into single arrays
X_test = np.concatenate(X_test_parts)
X_train = np.concatenate(X_train_parts)
y_test = np.concatenate(y_test_parts)
y_train = np.concatenate(y_train_parts)

In [9]:
# count the examples of each label in the test set
np.unique(y_test, return_counts=True)

(array([0, 1, 2, 3, 4]), array([35, 35, 35, 35, 35]))

## Balancing Training Set

https://arxiv.org/pdf/1609.06570v1.pdf

In [10]:
# count the examples of each label in the training set before balancing
np.unique(y_train, return_counts=True)

(array([0, 1, 2, 3, 4]), array([ 141,  137, 1151,   21,   97]))

In [11]:
import imblearn.over_sampling

In [12]:
# Balance the training set by randomly duplicating examples of the
# under-represented classes. The seed is fixed so that the balanced set is
# reproducible, matching the random_state=0 used for the train/test split.
sampler = imblearn.over_sampling.RandomOverSampler(random_state=0)

# Newer imbalanced-learn releases name the method fit_resample, older ones
# fit_sample; use whichever this installation provides.
resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample

# A single call may balance only one class (this is why the sampler was
# originally called four times), so repeat once per non-majority class.
# Calls made after the classes are already balanced add no examples.
for _ in range(np.unique(y_train).size - 1):
    X_train, y_train = resample(X_train, y_train)

In [13]:
# count the examples of each label in the training set after oversampling
np.unique(y_train, return_counts=True)

(array([0, 1, 2, 3, 4]), array([1151, 1151, 1151, 1151, 1151]))

## Pickle

In [14]:
# serialize the object for later use
def serialize_pickle(filename, data):
    """Pickle ``data`` into the file ``filename`` inside the ``data`` directory.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    target_path = os.path.join('data', filename)
    with open(target_path, 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

# store the training set and the test set, each as an (X, y) pair
for set_filename, set_arrays in (
    ('train_set.pickle', (X_train, y_train)),
    ('test_set.pickle', (X_test, y_test)),
):
    serialize_pickle(set_filename, set_arrays)