In [1]:
import pickle
from timeit import default_timer as timer

import os
import platform
import tensorflow as tf
import keras;

# *print ("Current file path: {}".format(os.path.dirname(os.path.realpath(__file__))))
print ("Current Working Directory: {}".format(os.getcwd()))
print("Python: {}, TensorFlow:{}, Keras:{}".\
      format(platform.python_version(), tf.__version__, keras.__version__))

random_seed = 1



Using TensorFlow backend.


Current Working Directory: /home/endre/git/finance_ml/src
Python: 3.6.1, TensorFlow:1.2.1, Keras:2.0.5


# Load pickled Features, Labels and Name of each Feature-array

In [3]:
print ("De-pickling features, labels and feature/label names..")
load_start = timer()

# pickled = pickle.load(open("RangeNamesFeaturesAndLabels-mk1.pickle", "rb"))
# pickled = pickle.load(open("RangeNamesFeaturesAndLabels-smallSet-6stocks.pickle", "rb"))
# pickled = pickle.load(open("RangeNamesFeaturesAndLabels-smallSet-50stocks.pickle", "rb"))
pickled = pickle.load(open("RangeNamesFeaturesAndLabels-mediumset-unknownNumStocks.pickle", "rb"))

load_millis = (timer()-load_start) * 1000
print ("De-pickle took {} ms".format(load_millis))

# {'rangeNames': rangeNames, 'features': features, 'labels': labels}
rangeNames = pickled['rangeNames']
features = pickled['features']
labels = pickled['labels']
print("\nRangeNames/features/labels len: {}/{}/{}".\
      format(len(rangeNames), len(features), len(labels)))
numFeatures = len(features[0])
print("\nNumber of Features for each feature-array: {}".format(numFeatures))
print("Number of Labels for each label-array: {}".format(len(labels[0])))

De-pickling features, labels and feature/label names..
De-pickle took 25738.598705996992 ms

RangeNames/features/labels len: 1008922/1008922/1008922

Number of Features for each feature-array: 54
Number of Labels for each label-array: 4


# Split set into Train and Test

In [4]:
import resource
import gc
def mem():
    print('Memory usage: %2.2f MB' % round(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024.0,1))
    
mem()
trainRangeNames = []
trainFeatures = []
trainLabels = []

testRangeNames = []
testFeatures = []
testLabels = []
mem()

gc.collect()
mem()

length = len(rangeNames)
for i in range(length):
    names = rangeNames[i]
    if (names[1] < '2016-01-01'):
        trainRangeNames.append(names)
        trainFeatures.append(features[i])
        trainLabels.append(labels[i])
    else:
        testRangeNames.append(names)
        testFeatures.append(features[i])
        testLabels.append(labels[i])

lenTrainFeatures = len(trainFeatures)

print("TRAIN RangeNames/features/labels len: {}/{}/{} - {:.2f}% of total".\
      format(len(trainRangeNames), lenTrainFeatures, len(trainLabels), (lenTrainFeatures / len(features)) * 100 ))
print("TEST RangeNames/features/labels len: {}/{}/{} - {:.2f}% of total".\
      format(len(testRangeNames), len(testFeatures), len(testLabels), (len(testFeatures) / len(features)) * 100 ))

print("\nTotal loaded features: {}, trainFeatures + testFeatures:{} - {:.4f}%".\
      format(len(features), lenTrainFeatures + len(testFeatures), ((lenTrainFeatures + len(testFeatures)) / len(features)) * 100))

Memory usage: 7895.80 MB
Memory usage: 7895.80 MB
Memory usage: 7895.80 MB
TRAIN RangeNames/features/labels len: 939830/939830/939830 - 93.15% of total
TEST RangeNames/features/labels len: 69092/69092/69092 - 6.85% of total

Total loaded features: 1008922, trainFeatures + testFeatures:1008922 - 100.0000%


In [5]:
# One-hot the labels

# 0:  5 days
# 1: 10 days
# 2: 15 days
# 3: 20 days
label_to_use = 3

good = [1,0]
bad =  [0,1]

oh_trainLabels = [good if x[label_to_use] > 0 else bad  for x in trainLabels]
oh_testLabels = [good if x[label_to_use] > 0 else bad  for x in testLabels]
print("One-hotted ok.")

sum_trainLabels = [sum(i) for i in zip(*oh_trainLabels)]
sum_testLabels = [sum(i) for i in zip(*oh_testLabels)]
print("Train labels [good, bad]: {} -> [{:.2f}%, {:.2f}%]".\
      format(sum_trainLabels, (sum_trainLabels[0] / len(trainLabels)) * 100, (sum_trainLabels[1] / len(trainLabels)) * 100))
print("Test labels  [good, bad]: {} -> [{:.2f}%, {:.2f}%]".\
      format(sum_testLabels, (sum_testLabels[0] / len(testLabels)) * 100, (sum_testLabels[1] / len(testLabels)) * 100))

One-hotted ok.
Train labels [good, bad]: [505724, 434106] -> [53.81%, 46.19%]
Test labels  [good, bad]: [39999, 29093] -> [57.89%, 42.11%]


In [6]:
from sklearn.utils import shuffle

currentTrainPos = 0
def getTrainMiniBatch(miniBatchSize):
    global currentTrainPos
    global trainRangeNames, trainFeatures, trainLabels, oh_trainLabels
    if (currentTrainPos + miniBatchSize > lenTrainFeatures):
        currentTrainPos = 0
    if (currentTrainPos == 0):
        trainRangeNames, trainFeatures, trainLabels, oh_trainLabels =\
            shuffle(trainRangeNames, trainFeatures, trainLabels, oh_trainLabels, random_state=random_seed)
    start = currentTrainPos
    end = currentTrainPos + miniBatchSize
    currentTrainPos = end
    
    return trainFeatures[start:end], oh_trainLabels[start:end]

# Set up Neural Network

In [7]:
n1=800
n2=1500
n3=800
out=2
epochs = 15   # Not explicitly used, as we count mini_batches
mini_batch_size = 1024
learning_rate = 0.0001
transfer_function = tf.nn.relu

#========
mini_batches_per_epoch = int(lenTrainFeatures / mini_batch_size)
total_mini_batches = mini_batches_per_epoch * epochs

print("Mini batches per epoch: trainFeats {} / mini_batch_size {} = {}".format(lenTrainFeatures, mini_batch_size, mini_batches_per_epoch))
print("Total mini batches: mini_batches_per_epoch {} * epochs {} = {}".format(mini_batches_per_epoch, epochs, total_mini_batches))

# Reset the default graph, so as to chuck out existing variables
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=[None, numFeatures])
labels = tf.placeholder(tf.float32, shape=[None, out])

def weight_variable(name, shape):
    # return tf.get_variable(name, initializer=tf.glorot_uniform_initializer(seed=random_seed), shape=shape)
    return tf.get_variable(name, initializer=tf.contrib.layers.xavier_initializer(), shape=shape)
    
def bias_variable(name, shape):
    # return tf.get_variable(name, initializer=tf.glorot_uniform_initializer(seed=random_seed), shape=shape)
    return tf.get_variable(name, initializer=tf.contrib.layers.xavier_initializer(), shape=shape)

with tf.variable_scope("layer1"):
    w1 = weight_variable("weight", [numFeatures, n1])
    b1 = bias_variable("bias", [n1])
    l1 = tf.matmul(X, w1) + b1
    l1 = transfer_function(l1)

with tf.variable_scope("layer2"):
    w2 = weight_variable("weight", [n1, n2])
    b2 = bias_variable("bias", [n2])
    l2 = tf.matmul(l1, w2) + b2
    l2 = transfer_function(l2)

with tf.variable_scope("layer3"):
    w3 = weight_variable("weight", [n2, n3])
    b3 = bias_variable("bias", [n3])
    l3 = tf.matmul(l2, w3) + b3
    l3 = transfer_function(l3)

with tf.variable_scope("output"):
    wy = weight_variable("weight", [n3, out])
    by = bias_variable("bias", [out])
    Y = tf.matmul(l3, wy) + by

# Define Training: Loss/Cost and Optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=Y, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate)
# optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train = optimizer.minimize(cost)

# Define Test/Evaluate: Accuracy: Fraction right predictions
correct_prediction = tf.equal(tf.argmax(Y, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

print("Network set up.")

Mini batches per epoch: trainFeats 939830 / mini_batch_size 1024 = 917
Total mini batches: mini_batches_per_epoch 917 * epochs 15 = 13755
Network set up.


# Train Neural Network

In [8]:
sess = tf.Session()

# Initialize TensorFlow variables in Session
sess.run(tf.global_variables_initializer())

def printStats(what):
    check_features = 10000
    train_cost, train_accuracy = sess.run([cost, accuracy], 
                                       feed_dict={X: trainFeatures[:check_features], labels: oh_trainLabels[:check_features]})
    test_cost, test_accuracy = sess.run([cost, accuracy], 
                                     feed_dict={X: testFeatures, labels: oh_testLabels})
    print("{}\n  TRAIN cost/acc:{:.4f}/{:.4f}%, TEST cost/acc:{:.4f}/{:.4f}%"\
          .format(what, train_cost, train_accuracy*100, test_cost, test_accuracy*100))

printStats("BEFORE Training")

time_start = timer()
epoch = 0
for mini_batch in range(1, total_mini_batches+1):
    batch_inputs, batch_labels = getTrainMiniBatch(mini_batch_size)
    sess.run([train, accuracy], feed_dict={X: batch_inputs, 
                                           labels: batch_labels})
    if (mini_batch % 1000 == 0):
        # print(".. progress: mini_batch/total_mini_batches: {}/{}".format(mini_batch, total_mini_batches))
        printStats(' ... progress: In "Epoch" {}, mini batch {} of {}'.format(epoch, mini_batch, total_mini_batches))
        
    if (mini_batch % mini_batches_per_epoch == 0):
        epoch += 1
        printStats('\nAfter "Epoch" {}, mini batch {} of {}'.format(epoch, mini_batch, total_mini_batches))
        
training_time = timer()-time_start
printStats("\nFINISHED")
print("TRAINING took {} seconds.".format(training_time))

BEFORE Training
  TRAIN cost/acc:1.6980/53.5200%, TEST cost/acc:1.4199/57.1875%

After "Epoch" 1, mini batch 917 of 13755
  TRAIN cost/acc:0.7236/53.8100%, TEST cost/acc:0.7024/55.3769%
 ... progress: In "Epoch" 1, mini batch 1000 of 13755
  TRAIN cost/acc:0.7058/55.5600%, TEST cost/acc:0.7130/53.2898%

After "Epoch" 2, mini batch 1834 of 13755
  TRAIN cost/acc:0.6896/57.0100%, TEST cost/acc:0.7082/52.8976%
 ... progress: In "Epoch" 2, mini batch 2000 of 13755
  TRAIN cost/acc:0.7503/56.4600%, TEST cost/acc:0.7079/55.1395%

After "Epoch" 3, mini batch 2751 of 13755
  TRAIN cost/acc:0.7029/55.1500%, TEST cost/acc:0.7068/52.5271%
 ... progress: In "Epoch" 3, mini batch 3000 of 13755
  TRAIN cost/acc:0.6846/56.9000%, TEST cost/acc:0.6997/53.7182%

After "Epoch" 4, mini batch 3668 of 13755
  TRAIN cost/acc:0.6775/56.9300%, TEST cost/acc:0.6956/55.2075%
 ... progress: In "Epoch" 4, mini batch 4000 of 13755
  TRAIN cost/acc:0.6735/58.3300%, TEST cost/acc:0.6976/54.1394%

After "Epoch" 5, min