In [37]:
import os
import pandas as pd
import numpy as np
import math

import tensorflow as tf
import keras_tuner as kt

# from tensorflow.python.keras.models import Sequential
# from tensorflow.python.keras.layers import Activation, Dense, Flatten
# import tensorflow.python.keras.optimizers
# import tensorflow.python.keras.metrics

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Flatten
import tensorflow.keras.optimizers
import tensorflow.keras.metrics

#%%capture
# Pass each path component separately so os.path.join picks the separator for
# the current OS; a hard-coded "Data\\returnsData.csv" only resolves on Windows.
data_file_path = os.path.join(os.getcwd(), "Data", "returnsData.csv")
returnsData = pd.read_csv(data_file_path)

# Long format: the three columns needed to build the returns panel.
longReturns = returnsData[["YearQuarter", "gvkey", "quarterlyReturns"]] #"quarterlyVolatility",

# Wide format: one row per YearQuarter, one column per gvkey. pd.pivot raises
# if a (YearQuarter, gvkey) pair occurs more than once.
wideReturns = pd.pivot(longReturns, index="YearQuarter", columns="gvkey", values="quarterlyReturns")

# Row positions marking the end of each split (50% / 75% / 100% of the rows).
trainSize = math.floor(len(wideReturns.index) * 0.5)
validationSize = math.floor(len(wideReturns.index) * 0.75)
testSize = len(wideReturns.index)
# trainSize = len(wideReturns["YearQuarter"]) * 0.5
# validationSize = len(wideReturns["YearQuarter"]) * 0.75
#wideReturns[:,"Split"] = 

class PreprocessData:
  """Turn the module-level ``wideReturns`` table into supervised LSTM samples.

  ``wideReturns`` (rows: YearQuarter, columns: gvkey) is cut into sliding
  windows of ``sample_length`` consecutive rows. The windows are split in
  order into train (first 50%), validation (next 25%) and test (last 25%),
  and every split is flattened into one univariate sample per
  (column, window) pair. Samples containing any NaN are dropped.

  Attributes:
    xTrain, xVal, xTest: arrays of shape (samples, sample_length - 1, 1)
      holding all but the last step of each sample.
    yTrain, yVal, yTest: arrays of shape (samples, 1) holding the last step.
    normlayer: ``tf.keras.layers.Normalization(axis=None)`` adapted on xTrain.
  """

  def __init__(self, sample_length=2):
    """Build the train/validation/test arrays.

    Args:
      sample_length: number of consecutive rows per window. The last row of
        a window is the target and the preceding ones are the inputs, so
        the value must be at least 2.

    Raises:
      ValueError: if ``sample_length`` is below 2, or if ``wideReturns`` is
        too short to produce a single window.
    """
    if sample_length < 2:
      raise ValueError(
          "sample_length must be >= 2 (at least one input step plus the target), "
          "got {}".format(sample_length))

    dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=wideReturns, targets=None,
        sequence_length=sample_length, sequence_stride=1, sampling_rate=1,
        batch_size=128, shuffle=False,
        start_index=None, end_index=None)

    # Collect every batch. Reading only the first one would silently drop
    # all windows beyond the first 128 when the table is long enough.
    batches = [batch.numpy() for batch in dataset]
    if not batches:
      raise ValueError(
          "wideReturns is too short to build a window of length {}".format(sample_length))
    data = np.concatenate(batches, axis=0)  # (windows, sample_length, columns)
    print(data.shape)

    # Ordered 50% / 25% / 25% split along the window axis.
    train_end = math.floor(data.shape[0] * 0.5)
    validation_end = math.floor(data.shape[0] * 0.75)

    train = self._to_samples(data[:train_end, :, :])
    validation = self._to_samples(data[train_end:validation_end, :, :])
    test = self._to_samples(data[validation_end:, :, :])

    # Inputs are all steps but the last; the target is the last step.
    xTrain, yTrain = train[:, :-1, :], train[:, -1, :]
    xVal, yVal = validation[:, :-1, :], validation[:, -1, :]
    xTest, yTest = test[:, :-1, :], test[:, -1, :]

    # Scalar mean/variance over all training inputs. Overlapping windows mean
    # some returns are counted more than once.
    normlayer = tf.keras.layers.Normalization(axis=None)
    normlayer.adapt(xTrain)

    self.xTrain, self.yTrain = xTrain, yTrain
    self.xVal, self.yVal = xVal, yVal
    self.xTest, self.yTest = xTest, yTest
    self.normlayer = normlayer

  @staticmethod
  def _to_samples(rawData):
    """Flatten (windows, steps, columns) into NaN-free (samples, steps, 1)."""
    # Move the column axis first so each column's windows stay contiguous.
    by_column = np.transpose(a=rawData, axes=(2, 0, 1))
    samples = np.reshape(by_column, (-1, rawData.shape[1]))
    # Drop every sample that has a missing value at any step.
    samples = samples[~np.isnan(samples).any(axis=1), :]
    # Trailing feature axis of size 1, as expected by recurrent layers.
    return np.expand_dims(samples, axis=2)

#normlayer.adapt(xTrain[:,0,:])
#normlayer.adapt(xTrain[:,[0, -1],:])
# normlayer.adapt(wideReturns[trainSize, :])


In [None]:
# model = Sequential()
# model.add(normlayer)
# model.add(tf.keras.layers.LSTM(units=5)) #, input_shape=(,sample_length-1)   #mask
# model.add(Dense(units=1))

# model.compile(optimizer="adam", #tf.keras.optimizers.Adam(learning_rate=0.001)
#               loss="MeanAbsoluteError",
#               metrics=[tf.keras.losses.MeanSquaredError(), tf.keras.losses.MeanAbsoluteError()])

# #model.fit(x=dataset, batch_size=8)#, validation_data=(xVal, yVal))
# model.fit(x=xTrain, y=yTrain, batch_size=32, validation_data=(xVal, yVal))

In [23]:
# Build the shortest window (sample_length=2: one input step plus the target)
# and show the resulting training inputs as the cell output.
processedData = PreprocessData(sample_length=2)
processedData.xTrain

(52, 2, 9668)


array([[[  5.79090556]],

       [[ 23.80602498]],

       [[ 16.08308605]],

       ...,

       [[-29.10334347]],

       [[-32.11861379]],

       [[ 52.63157895]]])

In [38]:

class MyHyperModel(kt.HyperModel):
    """Hypermodel for a single-layer LSTM regressor.

    Tuned hyperparameters: ``units``, ``dropout`` and ``learning_rate`` (in
    ``build``) and ``lags`` and ``batch_size`` (in ``fit``). The training data
    is rebuilt per trial because its window length is itself tuned.
    """

    def build(self, hp):
        """Create and compile the model for one trial.

        Args:
            hp: the ``kt.HyperParameters`` object for the current trial.

        Returns:
            A compiled ``Sequential`` model: LSTM -> Dense(1), trained on
            mean absolute error.
        """
        model = Sequential()
        #model.add(normlayer)
        model.add(tf.keras.layers.LSTM(
            units=hp.Int("units", min_value=1, max_value=10, step=1),
            dropout=hp.Float("dropout", min_value=0, max_value=0.9, step=0.1),
        ))
        model.add(Dense(units=1))

        optimizer = tf.keras.optimizers.Adam(
            learning_rate=hp.Choice("learning_rate", [0.01, 0.001, 0.0001]))
        model.compile(
            optimizer=optimizer,
            loss="MeanAbsoluteError",
            # Metric classes (not loss classes) belong here; their default
            # names are still "mean_squared_error" / "mean_absolute_error",
            # so the "val_mean_absolute_error" objective is unchanged.
            metrics=[tf.keras.metrics.MeanSquaredError(),
                     tf.keras.metrics.MeanAbsoluteError()])
        return model

    def fit(self, hp, model, *args, **kwargs):
        """Train ``model`` for one trial on data built with a tuned window length.

        Args:
            hp: the ``kt.HyperParameters`` object for the current trial.
            model: the compiled model returned by ``build``.
            *args: ignored; the training data is built here, not passed in.
            **kwargs: forwarded to ``model.fit`` (e.g. ``callbacks`` and
                ``epochs`` coming from ``tuner.search``). ``x``, ``y``,
                ``batch_size`` and ``validation_data`` are always overridden.

        Returns:
            The ``History`` object returned by ``model.fit``.
        """
        # "lags" is passed as the window length, so the model sees lags - 1
        # input steps and the final step is the target.
        lag_length = hp.Int("lags", min_value=2, max_value=15, step=1)
        dataForFitting = PreprocessData(lag_length)

        # Forward the tuner's keyword arguments instead of dropping them:
        # they carry the callbacks supplied through tuner.search.
        fit_kwargs = dict(kwargs)
        fit_kwargs.update(
            x=dataForFitting.xTrain,
            y=dataForFitting.yTrain,
            batch_size=hp.Choice("batch_size", [16, 32]),
            validation_data=(dataForFitting.xVal, dataForFitting.yVal),
        )
        return model.fit(**fit_kwargs)

# Random search over MyHyperModel's search space, scored on the validation
# mean absolute error. overwrite=True discards any earlier results stored
# under the same directory/project.
tuner = kt.RandomSearch(
    hypermodel=MyHyperModel(),
    objective="val_mean_absolute_error",
    max_trials=3,
    directory="LSTM_KerasTuner_ResultsDir",
    project_name="tune_hypermodel",
    overwrite=True,
)

# The training data is built inside MyHyperModel.fit, so nothing is passed here.
tuner.search()

# get_best_models returns a list; bestModel holds that one-element list.
bestModel = tuner.get_best_models(num_models=1)


Trial 1 Complete [00h 01m 16s]
val_mean_absolute_error: 14.042917251586914

Best val_mean_absolute_error So Far: 14.042917251586914
Total elapsed time: 00h 01m 16s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
1                 |1                 |units
0.7               |0.5               |dropout
0.0001            |0.01              |learning_rate
10                |2                 |lags
32                |16                |batch_size

(44, 10, 9668)