In [70]:
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn import preprocessing
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
import matplotlib.pyplot as plt
import GPy
import GPyOpt
import copy
import time


class Model:
    # Hyper-parameters searched by GPyOpt. get_answer() reads one candidate
    # positionally, in the same order as the entries of mixed_domain below.
    #
    # Network parameters
    # n_neurons, learning_rate, num_layers - cell width, Adam learning rate, number of stacked cells
    # rnn_type - 0: BasicLSTMCell, 1: LSTMCell, 2: LSTMCell with peepholes
    # Control parameters
    # learning_period - how many sequences the model trains on before each prediction window
    # prediction_period - how many sequences the model predicts after each training window
    # max_repeats - how many passes are made over each training window
    # beta, ema - select which cached file "np_ema{ema}_beta{beta}.npy" is loaded
    # time_input - non-zero adds column 0 ('stepofweek') as a second input feature
    #
    # Legacy parameters - still in the signature of run(), but not part of
    # mixed_domain and not read by get_answer():
    # risk_aversion - the margin added to the courtage that leads to a buy or sell operation
    # min_profit - the minimum average profit during the training phase; if it is not reached, the model should not predict
    # gamma - the gamma used when pre-processing the data
    
    # Class-level list (shared by every instance); it is only referenced from
    # commented-out code in opt_wrapper().
    step_profit_list = []
    # GPyOpt search space; every variable is discrete.
    # Note: tuple(range(5,10,5)) == (5,), so prediction_period has a single candidate value.
    mixed_domain = [{'name': 'n_neurons', 'type': 'discrete', 'domain': tuple(range(20,160,20))},
          {'name': 'learning_rate', 'type': 'discrete', 'domain': (0.001,0.002,0.003,0.004)},
          {'name': 'num_layers', 'type': 'discrete', 'domain': (1,2,3,4)},
          {'name': 'rnn_type', 'type': 'discrete', 'domain': (0,1,2)},
          {'name': 'learning_period', 'type': 'discrete', 'domain': tuple(range(10,40,10))},
          {'name': 'prediction_period', 'type': 'discrete', 'domain': tuple(range(5,10,5))},
          {'name': 'max_repeats', 'type': 'discrete', 'domain': tuple(range(1,10,2))},
          {'name': 'beta', 'type': 'discrete', 'domain': (99, 98)},
          {'name': 'ema', 'type': 'discrete', 'domain': (10,20)},
          {'name': 'time_input', 'type': 'discrete', 'domain': (0,1)},
         ]
    def __init__(self, regen):
        if regen == False:
            return
        def column_filter(x):
            if x == 'stepofweek':
                return True
            elif 'diff_ema' in x:
                return True
            elif 'volume' in x:
                return True
            elif 'value_ema' in x:
                return True
            else:
                return False
        for ema in (10, 20):
            for beta in (99, 98):
                filename = "data-prep-ema{}-beta{}.csv".format(ema, beta)
                print("pre-processing {}".format(filename))
                data = pd.read_csv(filename, parse_dates=["timestamp"])
                data['dayofweek'] = data['timestamp'].apply(lambda x: x.weekday())
                groups = data.set_index('timestamp').groupby(lambda x: x.date())
                
                # get maximum steps
                max_steps = 0
                for index, df in groups:
                    df_len = len(df)
                    if df_len > max_steps:
                        max_steps = df_len
                        
                np_data = np.zeros((len(groups), max_steps, 30*3+1))
                filtered_columns = list(filter(column_filter, data.columns))

                i = 0
                for index, df in groups:
                    df['stepofday'] = np.arange(0, max_steps)
                    df['stepofweek'] = df['dayofweek'] * max_steps + df['stepofday']
                    np_data[i] = df[['stepofweek'] + filtered_columns ].to_numpy()
                    print(np_data[i].shape)
                    i += 1
                    
                numpy_file_name = "np_ema{}_beta{}.npy".format(ema, beta)
                np.save(numpy_file_name, np_data)
                

        return
        
    def get_parameter_str(self, X):
        parameter_str = ""
        for i in range(len(self.mixed_domain)):
            parameter_str += self.mixed_domain[i]["name"]
            parameter_str += ':'
            parameter_str += str(X[i])
            parameter_str += ','
        return parameter_str
    
    def reset_graph(self, seed=42):
        tf.reset_default_graph()
        tf.set_random_seed(seed)
        np.random.seed(seed)
        
    
    def log(self, verbose, msg):
        if verbose:
            print(msg)

    def get_batch(self, seq_index, data_train_input, data_train_output):
        X_batch = data_train_input[seq_index:seq_index+1]
        y_batch = data_train_output[seq_index:seq_index+1]
        return X_batch, y_batch
    
    def transform(self, data_all, n_inputs, n_outputs):
        orig_shape = data_all.shape
        data_train_reshape = data_all.reshape((orig_shape[0] * orig_shape[1], orig_shape[2]))
        
        self.scaler_input = preprocessing.MinMaxScaler().fit(data_train_reshape[:,:n_inputs])
        data_train_input_scaled = self.scaler_input.transform(data_train_reshape[:,:n_inputs])
        
        # the invalid step, we change it to zero!
        data_train_input_scaled[~np.any(data_train_reshape, axis=1)] = 0
        data_train_input = data_train_input_scaled.reshape(orig_shape[0], orig_shape[1], n_inputs)
        
        self.scaler_output = preprocessing.MinMaxScaler().fit(data_train_reshape[:,-n_outputs:])
        data_train_output_scaled = self.scaler_output.transform(data_train_reshape[:,-n_outputs:])
        # the invalid step, we change it to zero!
        data_train_output_scaled[~np.any(data_train_reshape, axis=1)] = 0
        data_train_output = data_train_output_scaled.reshape(orig_shape[0], orig_shape[1], n_outputs)
        
        return data_train_input, data_train_output
    
    def inverse_transform_output(self, scaled_outputs):
        outputs_reshaped = scaled_outputs.reshape((scaled_outputs.shape[1], scaled_outputs.shape[2]))
        #outputs = np.exp(self.scaler_output.inverse_transform(outputs_reshaped)) - 1
        outputs = self.scaler_output.inverse_transform(outputs_reshaped)
        return outputs
    
    def inverse_transform_input(self, scaled_inputs):
        inputs_reshaped = scaled_inputs.reshape((scaled_inputs.shape[1], scaled_inputs.shape[2]))
        #inputs_reshaped[:,4:6] = np.exp(self.scaler_input.inverse_transform(inputs_reshaped)[:,4:6]) - 1
        inputs = self.scaler_input.inverse_transform(inputs_reshaped)
        # TODO: the volume and hold should be transformed back.
        return inputs
        
        
    def get_answer(self, features):
        n_neurons = int(features[0])
        learning_rate = features[1]
        num_layers = int(features[2])
        rnn_type = int(features[3])
        learning_period = int(features[4])
        prediction_period = int(features[5])
        max_repeats = int(features[6])
        beta = int(features[7])
        ema = int(features[8])
        time_input = int(features[9])

        # load data
        file_name = "np_ema{}_beta{}.npy".format(ema, beta)
        data_all = np.load(file_name)
        
        # pick the data for stock_id
        stock_index = 20
        n_inputs = 1
        if time_input != 0:
            n_inputs += 1
        n_outputs = 1
        
        # we must convert the array to 2D
        orig_shape = data_all.shape
        print("original shape: ")
        print(orig_shape)
        reshaped_data = data_all.reshape((orig_shape[0] * orig_shape[1], orig_shape[2]))
        
        input_column_list = [1 + 30 + stock_index]
        if time_input != 0:
            input_column_list = [0] + input_column_list
        output_column_list = [1 + 60 + stock_index]
        
        data_filtered = reshaped_data[:, input_column_list + output_column_list].reshape((orig_shape[0], orig_shape[1], n_inputs+n_outputs))
        

        batch_size = 1
        data_train_input, data_train_output = self.transform(data_filtered, n_inputs, n_outputs)

        # data_train_input in the shape [seq, steps, features]
        days = data_train_input.shape[0]
        max_steps = data_train_input.shape[1]

        self.reset_graph()
        
        X = tf.placeholder(tf.float32, [None, max_steps, n_inputs])
        y = tf.placeholder(tf.float32, [None, max_steps, n_outputs])
        
        layers = None
        if rnn_type == 0:
            layers = [tf.nn.rnn_cell.BasicLSTMCell(n_neurons) 
              for _ in range(num_layers)]
        elif rnn_type == 1:
            layers = [tf.nn.rnn_cell.LSTMCell(n_neurons, use_peepholes=False) 
              for _ in range(num_layers)]
        elif rnn_type == 2:
            layers = [tf.nn.rnn_cell.LSTMCell(n_neurons, use_peepholes=True) 
              for _ in range(num_layers)]
        else:
            print("WRONG")
        cell = tf.nn.rnn_cell.MultiRNNCell(layers)
        
        # For each layer, get the initial state. states will be a tuple of LSTMStateTuples.
        init_state = tf.placeholder(tf.float32, [num_layers, 2, batch_size, n_neurons])
        state_per_layer_list = tf.unstack(init_state, axis=0)
        rnn_tuple_state = tuple(
            [tf.nn.rnn_cell.LSTMStateTuple(state_per_layer_list[idx][0], state_per_layer_list[idx][1])
             for idx in range(num_layers)]
        )

        rnn_outputs, new_states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32, 
                                                    initial_state=rnn_tuple_state)
        
        stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
        stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs)
        outputs = tf.reshape(stacked_outputs, [-1, max_steps, n_outputs])
        
        
        loss = tf.reduce_mean(tf.square(outputs - y))
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)

        init = tf.global_variables_initializer()

        # now run the model to get answer:
        rnn_states_after_training = np.zeros((num_layers, 2, batch_size, n_neurons))
        asset = 1
        market_asset = 1
        graph_data = []
        my_loss_test_list = []
        with tf.Session() as sess:
            init.run()
            for learn_end_seq in range(learning_period, 
                                       days - prediction_period, 
                                       prediction_period):
                learning_start_seq = learn_end_seq - learning_period
                for repeat in range(max_repeats):
                    rnn_states = copy.deepcopy(rnn_states_after_training)
                    my_loss_train_list = []
                    train_asset = 1
                    for seq in range(learning_start_seq, learn_end_seq):
                        X_batch, y_batch = self.get_batch(seq, data_train_input, data_train_output)
                        feed_dict = {
                            X: X_batch,
                            y: y_batch,
                            init_state: rnn_states_after_training
                        }
                        
                        my_op, my_new_states, my_loss_train, my_outputs = sess.run([training_op, new_states, loss, outputs], feed_dict=feed_dict)
                        my_loss_train_list.append(my_loss_train)
                        rnn_states = my_new_states
                        
                    my_loss_train_avg = sum(my_loss_train_list) / len(my_loss_train_list)
                    print("sequence:{} - {} repeat={} training finished, training MSE={} training asset={}".format(learning_start_seq, learn_end_seq, repeat, my_loss_train_avg, train_asset))
                # backup the states after training.
                rnn_states_after_training = copy.deepcopy(rnn_states)
                
                
                for seq in range(learn_end_seq, learn_end_seq + prediction_period):
                    X_test, y_test = self.get_batch(seq, data_train_input, data_train_output)
                    feed_dict = {
                        X: X_test,
                        y: y_test,
                        init_state: rnn_states,
                    }
            
                    my_new_states, my_loss_test, my_outputs = sess.run([new_states, loss, outputs], feed_dict=feed_dict)
                    my_loss_test_list.append(my_loss_test)
                    
                    print("sequence:{} test finished, testing MSE={}".format(seq, my_loss_test))
                    rnn_states = my_new_states
            my_loss_test_avg = sum(my_loss_test_list)/len(my_loss_test_list)
            return my_loss_test_avg
                    
    def opt_wrapper(self, X_list):
        answer = np.zeros((X_list.shape[0], 1))
        for i in range(len(X_list)):
            print(self.get_parameter_str(X_list[i]))
            features = X_list[i]
            answer[i][0] = self.get_answer(features)
            #self.draw_step_profit_graph(self.step_profit_list, "step_profit_{}".format(answer[i][0]))
            #self.step_profit_list = []
            if answer[i][0] < self.min_answer:
                print("find new opt:{}, {}".format(answer[i][0], self.get_parameter_str(X_list[i])))
                self.min_answer = answer[i][0]
        return answer
                
        
    def optimize(self, max_iter=300):
        self.min_answer = 999
        myBopt = GPyOpt.methods.BayesianOptimization(f=self.opt_wrapper,  # Objective function       
                                             domain=self.mixed_domain,          # Box-constraints of the problem
                                             initial_design_numdata = 20,   # Number data initial design
                                             acquisition_type='EI',        # Expected Improvement
                                             exact_feval = True)           # True evaluations, no sample noise
        
        myBopt.run_optimization(max_iter,eps=0)
    
    
    # No optimisation: evaluate a single parameter set that is already known.
    def run(self, n_neurons, learning_rate, 
            num_layers, rnn_type, risk_aversion, 
            learning_period, prediction_period, 
            max_repeats, min_profit, gamma):
        features = [n_neurons, learning_rate, 
            num_layers, rnn_type, risk_aversion, 
            learning_period, prediction_period, 
            max_repeats, min_profit, gamma]
        
        answer = self.get_answer(features)
        print("Finished, result:{}".format(answer))
        return 

In [71]:
# regen=False: __init__ returns immediately, so the np_ema*_beta*.npy caches
# loaded later by get_answer() must already exist on disk.
model = Model(False)


In [None]:
# Long-running cell: runs the GPyOpt search with the default max_iter=300, and
# every evaluated candidate builds and trains a new network in get_answer().
model.optimize()

n_neurons:140.0,learning_rate:0.002,num_layers:4.0,rnn_type:1.0,learning_period:20.0,prediction_period:5.0,max_repeats:9.0,beta:99.0,ema:10.0,time_input:0.0,
original shape: 
(53, 504, 91)
sequence:0 - 20 repeat=0 training finished, training MSE=0.00802745613618754 training asset=1
sequence:0 - 20 repeat=1 training finished, training MSE=0.0014849287836113945 training asset=1
sequence:0 - 20 repeat=2 training finished, training MSE=0.0013470235950080677 training asset=1
sequence:0 - 20 repeat=3 training finished, training MSE=0.001325701709720306 training asset=1
sequence:0 - 20 repeat=4 training finished, training MSE=0.001321365506737493 training asset=1
sequence:0 - 20 repeat=5 training finished, training MSE=0.001318666324368678 training asset=1
sequence:0 - 20 repeat=6 training finished, training MSE=0.0013141536634066141 training asset=1
sequence:0 - 20 repeat=7 training finished, training MSE=0.0013079701384413056 training asset=1
sequence:0 - 20 repeat=8 training finished, trai

sequence:5 - 25 repeat=0 training finished, training MSE=0.0007765020578517579 training asset=1
sequence:5 - 25 repeat=1 training finished, training MSE=0.0007376859837677329 training asset=1
sequence:5 - 25 repeat=2 training finished, training MSE=0.0007386570199741982 training asset=1
sequence:25 test finished, testing MSE=0.0003547748492565006
sequence:26 test finished, testing MSE=0.0007787112845107913
sequence:27 test finished, testing MSE=0.0008003671537153423
sequence:28 test finished, testing MSE=0.0013665675651282072
sequence:29 test finished, testing MSE=0.0006619772175326943
sequence:10 - 30 repeat=0 training finished, training MSE=0.0007582401813124306 training asset=1
sequence:10 - 30 repeat=1 training finished, training MSE=0.0007239965227199719 training asset=1
sequence:10 - 30 repeat=2 training finished, training MSE=0.0007176086393883452 training asset=1
sequence:30 test finished, testing MSE=0.00041071587475016713
sequence:31 test finished, testing MSE=0.0005083691794

sequence:10 - 20 repeat=6 training finished, training MSE=0.0006213719228981063 training asset=1
sequence:20 test finished, testing MSE=0.0006295993225648999
sequence:21 test finished, testing MSE=0.0004518504429142922
sequence:22 test finished, testing MSE=0.0004918539780192077
sequence:23 test finished, testing MSE=0.0007153369951993227
sequence:24 test finished, testing MSE=0.0007415335858240724
sequence:15 - 25 repeat=0 training finished, training MSE=0.000539449622738175 training asset=1
sequence:15 - 25 repeat=1 training finished, training MSE=0.0005266480351565406 training asset=1
sequence:15 - 25 repeat=2 training finished, training MSE=0.0005337941489415243 training asset=1
sequence:15 - 25 repeat=3 training finished, training MSE=0.0005469925177749247 training asset=1
sequence:15 - 25 repeat=4 training finished, training MSE=0.0005513897311175242 training asset=1
sequence:15 - 25 repeat=5 training finished, training MSE=0.0005520351522136479 training asset=1
sequence:15 - 25 

sequence:10 - 40 repeat=6 training finished, training MSE=0.0005717837940513467 training asset=1
sequence:40 test finished, testing MSE=0.0009721717797219753
sequence:41 test finished, testing MSE=0.00045739675988443196
sequence:42 test finished, testing MSE=0.0014525187434628606
sequence:43 test finished, testing MSE=0.0010872086277231574
sequence:44 test finished, testing MSE=0.0004335075500421226
sequence:15 - 45 repeat=0 training finished, training MSE=0.0006026421692998459 training asset=1
sequence:15 - 45 repeat=1 training finished, training MSE=0.0005936068167405514 training asset=1
sequence:15 - 45 repeat=2 training finished, training MSE=0.0005923224564564104 training asset=1
sequence:15 - 45 repeat=3 training finished, training MSE=0.0005916210512320201 training asset=1
sequence:15 - 45 repeat=4 training finished, training MSE=0.0005916341120610014 training asset=1
sequence:15 - 45 repeat=5 training finished, training MSE=0.000591343284274141 training asset=1
sequence:15 - 45

In [19]:
# Inspection only: repeats the first pre-processing steps of Model.__init__
# for a single file (ema=10, beta=99) so the resulting frame can be examined.
filename = "data-prep-ema10-beta99.csv"
print("pre-processing {}".format(filename))
data = pd.read_csv(filename, parse_dates=["timestamp"])
# weekday() is 0 for Monday ... 6 for Sunday
data['dayofweek'] = data['timestamp'].map(lambda stamp: stamp.weekday())
# one group per calendar day
groups = data.set_index('timestamp').groupby(lambda stamp: stamp.date())



pre-processing data-prep-ema10-beta99.csv


In [21]:
# Inspect the CSV header: Model.__init__ keeps the columns whose names contain
# 'volume', 'diff_ema' or 'value_ema'.
data.columns.tolist()

['Unnamed: 0',
 'timestamp',
 'volume_0',
 'volume_1',
 'volume_2',
 'volume_3',
 'volume_4',
 'volume_5',
 'volume_6',
 'volume_7',
 'volume_8',
 'volume_9',
 'volume_10',
 'volume_11',
 'volume_12',
 'volume_13',
 'volume_14',
 'volume_15',
 'volume_16',
 'volume_17',
 'volume_18',
 'volume_19',
 'volume_20',
 'volume_21',
 'volume_22',
 'volume_23',
 'volume_24',
 'volume_25',
 'volume_26',
 'volume_27',
 'volume_28',
 'volume_29',
 'last_0',
 'last_1',
 'last_2',
 'last_3',
 'last_4',
 'last_5',
 'last_6',
 'last_7',
 'last_8',
 'last_9',
 'last_10',
 'last_11',
 'last_12',
 'last_13',
 'last_14',
 'last_15',
 'last_16',
 'last_17',
 'last_18',
 'last_19',
 'last_20',
 'last_21',
 'last_22',
 'last_23',
 'last_24',
 'last_25',
 'last_26',
 'last_27',
 'last_28',
 'last_29',
 'diff_ema_10_0',
 'diff_ema_10_1',
 'diff_ema_10_2',
 'diff_ema_10_3',
 'diff_ema_10_4',
 'diff_ema_10_5',
 'diff_ema_10_6',
 'diff_ema_10_7',
 'diff_ema_10_8',
 'diff_ema_10_9',
 'diff_ema_10_10',
 'diff_ema_1