In [1]:
import math
import h5py
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn import preprocessing
import tensorflow as tf

In [22]:
f = h5py.File('data.hdf5')

X = f['X_balanced'][...]
y = f['y_balanced'][...]
N_FEATURES = X.shape[1]
N_CLASSES = 5

## Cross-validation

http://scikit-learn.org/stable/modules/cross_validation.html

Evaluating estimator performance. Maybe `cross_val_score`
can be used if TensorFlow supports estimator
with the right API.

In [29]:
def make_one_hot(y, n_classes=N_CLASSES):
    one_hot = np.zeros((y.shape[0], n_classes), dtype=y.dtype)
    one_hot[np.arange(y.shape[0]), y] = 1
    return one_hot

In [28]:
one_hot = make_one_hot(y)
one_hot

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ..., 
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]], dtype=int8)

### TensorFlow softmax graph

https://www.tensorflow.org/get_started/mnist/mechanics

In [7]:
# start interactive session
# session is connection to C++ backend
# typically construct graph and run it in session
sess = tf.InteractiveSession()

X_op = tf.placeholder(tf.float32, shape=[None, N_FEATURES])
y_op = tf.placeholder(tf.int8, shape=[None, N_CLASSES])

W = tf.Variable(tf.zeros([N_FEATURES, N_CLASSES]))
b = tf.Variable(tf.zeros([N_CLASSES]))

logits = tf.matmul(X_op, W) + b

cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
        labels=y_op, logits=logits
    )
)

train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

correct_prediction = tf.equal(
    tf.argmax(logits, 1), tf.argmax(y_op, 1)
)

accuracy = tf.reduce_mean(
    tf.cast(correct_prediction, tf.float32)
)

In [15]:
# stratified k-folds cross-validator
# the folds preserve the percentage for each class
skf = StratifiedKFold(n_splits=5)
skf

# shuffle the matrixes
X, y = shuffle(X, y, random_state=0)

accuracies = np.zeros((5,))
for iteration, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    # split to train and validation seet
    X_train, y_train = X[train_idx], one_hot[train_idx]
    X_valid, y_valid = X[valid_idx], one_hot[valid_idx]

    # preprocessing
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    
    # initialize variables
    sess.run(tf.global_variables_initializer())
    
    # fit model
    train_dict = {
        X_op: X_train_transformed,
        y_op: y_train,
    }
    for i in range(200):
        train_step.run(feed_dict=train_dict)
        if i % 10 == 0:
            train_acc, loss = sess.run([accuracy, cross_entropy], feed_dict=train_dict)
            print('epoch: {:3} train_accuracy: {:.2f} loss: {:.7f}'.format(
                i, train_acc * 100, loss
            ))
    
    # evalueta model
    X_valid_transformed = scaler.transform(X_valid)
    valid_dict={
        X_op: X_valid_transformed,
        y_op: y_valid,
    }
    accuracy_score = accuracy.eval(feed_dict=valid_dict)
    accuracies[iteration] = accuracy_score
    print('validation accuracy: {:.2f}'.format(accuracy_score * 100))

accuracies.mean(), accuracies.std()

epoch:   0 train_accuracy: 73.00 loss: 6.4849358
epoch:  10 train_accuracy: 88.67 loss: 1.1685427
epoch:  20 train_accuracy: 98.17 loss: 0.1135308
epoch:  30 train_accuracy: 99.04 loss: 0.0489655
epoch:  40 train_accuracy: 99.52 loss: 0.0273929
epoch:  50 train_accuracy: 99.67 loss: 0.0168916
epoch:  60 train_accuracy: 99.83 loss: 0.0113925
epoch:  70 train_accuracy: 99.89 loss: 0.0085677
epoch:  80 train_accuracy: 99.91 loss: 0.0066205
epoch:  90 train_accuracy: 99.91 loss: 0.0051121
epoch: 100 train_accuracy: 99.96 loss: 0.0039684
epoch: 110 train_accuracy: 99.96 loss: 0.0032695
epoch: 120 train_accuracy: 99.98 loss: 0.0028949
epoch: 130 train_accuracy: 100.00 loss: 0.0026396
epoch: 140 train_accuracy: 100.00 loss: 0.0024427
epoch: 150 train_accuracy: 100.00 loss: 0.0022824
epoch: 160 train_accuracy: 100.00 loss: 0.0021477
epoch: 170 train_accuracy: 100.00 loss: 0.0020323
epoch: 180 train_accuracy: 100.00 loss: 0.0019318
epoch: 190 train_accuracy: 100.00 loss: 0.0018432
validation ac

In [32]:
X_test = f['X_test'][...]
y_test = f['y_test'][...]

one_hot_test = make_one_hot(y_test)
X_test.shape, y_test.shape, one_hot_test.shape

((175, 4000), (175,), (175, 5))

In [31]:
test_dict = {
    X_op: scaler.transform(X_test),
    y_op: one_hot_test,
}
test_accuracy = accuracy.eval(feed_dict=test_dict)
test_accuracy



0.81714284

In [33]:
f.close()