In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os


# Hypothesis

Each investment is a time series. Training a separate model for every investment is impractical, so we cluster the investments instead, using LSTM autoencoders to do so.
1. Visualize rank as a function of time for a few investments
2. Develop an LSTM autoencoder class 
3. For each investment, run the first autoencoder; if the reconstruction error is high, instantiate and train a new autoencoder.
4. Once the maximum of 50 autoencoders has been reached, assign each remaining investment to the autoencoder with the lowest reconstruction error.
5. After all investments have been assigned to an autoencoder, train one LSTM per cluster. There will be at most 50 LSTMs. A dictionary will maintain the mapping between LSTMs and investments.
6. During inference, use the investment ID to map to the right LSTM and then predict the rank

In [None]:
# Load the daily stock price history that the rest of the notebook works on.
df = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info(verbose=True)
print(df.describe())

# Visualize Rank as a Function of Time for a Few Investments

In [None]:
# Pick up to 10 distinct securities to inspect visually. Sampling from the
# de-duplicated codes (instead of from raw rows) guarantees no security is
# chosen twice, and the fixed random_state keeps the selection -- and
# therefore securities[0], which later cells train on -- stable across
# Restart & Run All.
unique_codes = df['SecuritiesCode'].drop_duplicates()
securities = unique_codes.sample(n=min(10, len(unique_codes)), random_state=0).tolist()

for security in securities:
    # One figure per security: Target against Date, titled with the code so
    # the individual plots can be told apart.
    df[df['SecuritiesCode'] == security].plot(
        'Date', 'Target', title=f'SecuritiesCode {security}'
    )

# Develop an LSTM AutoEncoder Class

In [None]:
# Based on Chitta Ranjan's example and text available here: https://towardsdatascience.com/step-by-step-understanding-lstm-autoencoder-layers-ffab055b6352
# lstm autoencoder to recreate a timeseries
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import BatchNormalization
from keras.layers import MaxPooling1D
from keras.layers import AveragePooling1D
def temporalize(X, y, lookback):
    """Slice a 2-D series into overlapping lookback windows for an LSTM.

    Window ``k`` is built from rows ``k + 2`` through ``k + lookback + 1`` of
    ``X`` and is paired with ``y[k + lookback + 1]``, i.e. the label belonging
    to the window's last row. Because every window starts at row ``k + 2``,
    rows 0 and 1 of ``X`` are never included in any window.

    Args:
        X: 2-D array supporting ``X[[row], :]`` indexing
            (rows are observations, columns are features).
        y: Sequence of labels indexable by integer position.
        lookback: Number of consecutive rows per window.

    Returns:
        A tuple ``(output_X, output_y)``. ``output_X`` is a list of
        ``len(X) - lookback - 1`` windows, each a list of ``lookback`` arrays
        of shape ``(1, n_features)``; ``output_y`` is the list of matching
        labels. Both lists are empty when ``X`` is too short for one window.
        Callers are expected to stack/reshape ``output_X`` into the 3-D
        array an LSTM layer consumes.
    """
    window_count = len(X) - lookback - 1
    windows, labels = [], []
    for start in range(window_count):
        last_row = start + lookback + 1
        # Keep each record as a (1, n_features) slice, exactly one per step.
        windows.append([X[[row], :] for row in range(start + 2, last_row + 1)])
        labels.append(y[last_row])
    return windows, labels

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the Target series of the first sampled security (zero mean,
# unit variance) before it is fed to the LSTM autoencoder.
scaler = StandardScaler()
security_target = (
    df.loc[df['SecuritiesCode'] == securities[0], 'Target']
    # StandardScaler disregards NaN in fit() but leaves it in place in
    # transform(), so any missing target would otherwise flow straight into
    # the network's training data.
    .dropna()
    .to_numpy()
    # scikit-learn expects a 2-D (n_samples, n_features) array.
    .reshape(-1, 1)
)
security_target = scaler.fit_transform(security_target)

In [None]:
# The scaled target column is the series the autoencoder will learn to rebuild.
timeseries = security_target

# Rows are observations, columns are features.
timesteps, n_features = timeseries.shape
timeseries

In [None]:
# Window geometry: 3 consecutive observations per sample, one feature each.
timesteps = 3
n_features = 1

# An autoencoder needs no labels, so a zero vector stands in for y.
X, y = temporalize(
    X=timeseries,
    y=np.zeros(len(timeseries)),
    lookback=timesteps,
)

# Stack the list of windows, then flatten it to (samples, timesteps, features).
X = np.array(X)
X = X.reshape((len(X), timesteps, n_features))

X

In [None]:
# Show the placeholder labels returned by temporalize (they come from the
# zero vector passed in as y and are not used for training).
y

In [None]:
# LSTM autoencoder: a 128 -> 64 encoder mirrored by a 64 -> 128 decoder,
# with batch normalisation after every recurrent layer.
model = Sequential([
    # --- encoder: the second LSTM keeps only its final state (64 values) ---
    LSTM(128, activation='relu', input_shape=(timesteps, n_features), return_sequences=True),
    BatchNormalization(),
    LSTM(64, activation='relu', return_sequences=False),
    BatchNormalization(),
    # --- decoder: repeat the encoded vector once per timestep and unroll ---
    RepeatVector(timesteps),
    LSTM(64, activation='relu', return_sequences=True),
    BatchNormalization(),
    LSTM(128, activation='relu', return_sequences=True),
    BatchNormalization(),
    # Map every timestep back to the original feature dimension.
    TimeDistributed(Dense(n_features)),
])
model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
# Train the autoencoder to reproduce its own input (target == input), holding
# out 20% of the windows for validation.
model.fit(
    X,
    X,
    epochs=300,
    batch_size=16,
    validation_split=0.2,
    verbose=1,
)

# Reconstruct the windows and print them next to the originals.
yhat = model.predict(X, verbose=0)
print(
    '---Predicted---',
    np.round(yhat, 3),
    '---Actual---',
    np.round(X, 3),
    sep='\n',
)

In [None]:
# Mean squared reconstruction error -- the same quantity the model is trained
# to minimise (loss='mse'). A plain mean of the signed residuals lets positive
# and negative errors cancel, so it can sit near zero even when the
# reconstruction is poor.
np.mean(np.square(yhat - X))