# Dataset

In [1]:
import h5py
import numpy as np
from sklearn.model_selection import train_test_split
import spectraldl.preprocessing as preprocessing

In [2]:
# Width of the dataset matrix, i.e. number of flux samples stored per spectrum.
# NOTE(review): presumably equals the output length of
# preprocessing.resample_spectrum -- confirm against that function.
N_FEATURES = 140

# Open read-only with an explicit mode: this cell never writes, and the
# default mode differs between h5py versions (append before 3.0, read-only
# since), so relying on it could silently create or modify the file.
with h5py.File('data/data.hdf5', 'r') as f:
    spectra = f['spectra']

    # create dataset matrix (one row per spectrum) and label vector
    X = np.zeros((len(spectra), N_FEATURES), dtype=np.float64)
    y = np.zeros((len(spectra), ), dtype=np.int8)

    for i, (name, data) in enumerate(spectra.items()):
        # row 0 holds the wavelengths, row 1 the fluxes
        waves = data[0]
        fluxes = data[1]

        # process waves and fluxes and store them
        vac_waves = preprocessing.air2vacuum(waves)
        conv_fluxes = preprocessing.convolve_spectrum(fluxes)
        X[i, :] = preprocessing.resample_spectrum(vac_waves, conv_fluxes)

        # store label (read from the dataset handle we already have instead
        # of looking the dataset up by name a second time)
        y[i] = data.attrs['label']

# remove unknown spectra (have label 2); compute the mask once so that X and y
# are guaranteed to be filtered with the same selection
known = y != 2
X = X[known]
y = y[known]
# change label of double-peak spectra (from 3 to 2) so labels are contiguous
y[y == 3] = 2

np.unique(y, return_counts=True)

(array([0, 1, 2], dtype=int8), array([5301, 6103, 1533]))

In [3]:
# Fixed seed so that the train/validation/test partition is identical on every
# run (Restart Kernel -> Run All reproduces the same split).
SEED = 42

# test and train split means 10% of all samples for the test set;
# stratify keeps the class proportions equal in both parts
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SEED
)

# validation and train split means 20% of the remaining 90% for validation
# (i.e. 18% of all samples), again stratified by class
X_tr, X_val, y_tr, y_val = train_test_split(
    X_tr, y_tr, test_size=0.2, stratify=y_tr, random_state=SEED
)

# per-class sample counts of each subset, to verify the stratification
train = np.unique(y_tr, return_counts=True)
valid = np.unique(y_val, return_counts=True)
test = np.unique(y_te, return_counts=True)

train, valid, test

((array([0, 1, 2], dtype=int8), array([3817, 4393, 1104])),
 (array([0, 1, 2], dtype=int8), array([ 954, 1099,  276])),
 (array([0, 1, 2], dtype=int8), array([530, 611, 153])))

In [4]:
# Arrays to persist, keyed by the HDF5 dataset name they are stored under.
arrays = {
    'X': X, 'y': y,
    'X_tr': X_tr, 'y_tr': y_tr,
    'X_val': X_val, 'y_val': y_val,
    'X_te': X_te, 'y_te': y_te,
}

# Open in append mode explicitly: the file must stay intact (it holds the
# 'spectra' group) but has to be writable. h5py >= 3.0 defaults to read-only,
# in which case create_dataset would fail.
with h5py.File('data/data.hdf5', 'a') as f:
    for name, array in arrays.items():
        # feature matrices are stored as float64, label vectors as int8
        dtype = np.int8 if name.startswith('y') else np.float64

        # create_dataset raises if the name already exists; drop the stale
        # dataset first so this cell can be re-run.
        # NOTE: a re-run therefore overwrites the previously stored arrays.
        if name in f:
            del f[name]

        f.create_dataset(name, data=array, dtype=dtype)