# LSTM Project 2

In this project, we modified the notebook kernel7da3d56615.ipynb to predict the prices of iShares ETFs. In addition to adapting the program, we added window normalization to the get_batch helper function in order to boost model performance. Finally, the performance of several neural network architectures was compared and analyzed.

Note: Since the models were evaluated only on validation sets, we do not present prediction plots, nor the Sharpe ratio, CAGR, or White's Reality Check.

In [None]:
# Module imported
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Conv1D, MaxPool1D, Dense, Activation, GlobalMaxPool1D, Flatten
from sklearn.model_selection import train_test_split
from keras.layers import SimpleRNN
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
%matplotlib inline

In [None]:
# Read data: load the ETF table and replace missing entries with 0
train = pd.read_csv('../input/ishareetf/iShareETF1.csv')
train = train.fillna(0)
# Convert all price to positive value: overwrite every numeric column,
# in place, with its absolute value
train.update(train.select_dtypes(include=[np.number]).abs())
train.head()

In [None]:
# Shape of data: print the (rows, columns) tuple of the loaded table
print(train.shape)

In [None]:
# Lag array function
def lag_arr(arr, lag, fill):
    """Shift every series in ``arr`` right by ``lag`` steps along axis 1.

    The first ``lag`` positions of each series are padded with ``fill`` and
    the result is cut back to the original series length, so the output has
    the same shape as ``arr``. If ``lag`` is at least the series length, the
    output consists of padding only.

    :param arr: array of shape (n_series, series_length, ...)
    :param lag: number of time steps to shift by (non-negative)
    :param fill: value placed where no lagged observation exists
    :return: lagged array with the same shape as ``arr``
    """
    # Honour ``fill`` (it used to be ignored in favour of a hard-coded -1)
    # and take the trailing dimensions from ``arr`` instead of assuming 1.
    filler_shape = (arr.shape[0], lag) + tuple(arr.shape[2:])
    filler = np.full(filler_shape, fill)
    comb = np.concatenate((filler, arr), axis=1)
    # Drop the overflow at the end so the time axis keeps its length.
    return comb[:, :arr.shape[1]]

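The next function calculates autocorrelations.
This is the formula to calculate autocorrelation (the letter tau, $\tau$, stands for the lag, $\mu$ stands for the mean and $\sigma$ stands for the standard deviation):

$$R(\tau) = \frac{\sum_{t} (X_t - \mu_t)\,(X_{t+\tau} - \mu_{t+\tau})}{\sigma_t \, \sigma_{t+\tau}}$$

Here $\mu_t$ and $\mu_{t+\tau}$ are the means of the original and the lagged segment of the series. In the code below, the denominator uses $\sigma = \sqrt{\sum (X - \mu)^2}$ computed separately for each of the two segments.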

This function checks for division by zero, which native Python autocorrelation functions do not do.

In [None]:
# Single autocorrelation function
def single_autocorr(series, lag):
    """
    Autocorrelation of a single data series at a given lag.

    Correlates the series with a copy of itself shifted by ``lag`` steps.

    :param series: array-like of observations ordered in time (sliced along
        its first axis)
    :param lag: non-negative shift, in time steps
    :return: correlation coefficient, or 0 when it is undefined (a segment
        with zero variance, or a lag that leaves no overlapping observations)
    """
    series = np.asarray(series)
    n = series.shape[0]
    if lag >= n:
        # No overlap between the series and its lagged copy; return the same
        # 0 as before, but without taking the mean of an empty slice.
        return 0
    s1 = series[lag:]
    # Slice with n - lag rather than -lag: series[:-0] would be empty and
    # make the lag-0 autocorrelation come out as 0 instead of 1.
    s2 = series[:n - lag]
    ds1 = s1 - np.mean(s1)
    ds2 = s2 - np.mean(s2)
    divider = np.sqrt(np.sum(ds1 * ds1)) * np.sqrt(np.sum(ds2 * ds2))
    # Guard against division by zero (constant segments).
    return np.sum(ds1 * ds2) / divider if divider != 0 else 0

The next function calculates the autocorrelations for each series in the batch. Then we
fuse the correlations together into one NumPy array. Since autocorrelations are a
global feature, we need to create a new dimension for the length of the series and
another new dimension to show that this is only one feature. We then repeat the
autocorrelations over the entire length of the series.

In [None]:
# Batch autocorrelation function
def batc_autocorr(data, lag, series_length):
    """Autocorrelation of every series in a batch, repeated over time.

    :param data: 2-D array with one series per row
    :param lag: lag, in time steps, passed to ``single_autocorr``
    :param series_length: number of time steps to repeat each value over
    :return: array of shape (n_series, series_length, 1) in which each
        series' autocorrelation is constant along the time axis
    """
    # Correlate each row on its own. Passing the whole batch (as the code
    # used to) made every iteration compute the same value.
    corrs = [single_autocorr(row, lag) for row in np.asarray(data)]
    # One value per series -> (n_series, 1, 1), then broadcast over time.
    corr = np.array(corrs).reshape(-1, 1, 1)
    return np.repeat(corr, series_length, axis=1)

The next four cells take care of one-hot encoding the categorical data: the 'Region', 'Sub Asset Class' and 'Asset Class' columns, and the day-of-week categories derived from the date columns:

In [None]:
# One hot encoding of 'region'
# Integer-code the 'Region' labels, then fit a dense one-hot encoder on
# those codes (as a single-column 2-D array). Only the fitted encoders are
# kept; get_batch applies them later.
region_int = LabelEncoder()
region_int.fit(train['Region'])
region_one_hot = OneHotEncoder(sparse=False)
region_one_hot.fit(region_int.transform(train['Region']).reshape(-1, 1))

type(region_one_hot)

In [None]:
# One hot encoding of date
# Parse the header at position 1 as a %m/%d/%Y date; the result is discarded,
# so this line only has an effect if parsing raises.
datetime.datetime.strptime(train.columns.values[1], '%m/%d/%Y').strftime('%a')

# Weekday abbreviation for every column header except the last four.
# NOTE(review): the line above reads header [1] while this slice starts at
# header [0] — confirm that column 0 is also a date column.
weekdays = list(map(
    lambda header: datetime.datetime.strptime(header, '%m/%d/%Y').strftime('%a'),
    train.columns.values[:-4],
))

# Integer-code the weekdays, one-hot encode them, and add a leading axis of
# length 1 so the result can be repeated per series in get_batch.
day_one_hot = LabelEncoder().fit_transform(weekdays).reshape(-1, 1)
day_one_hot = OneHotEncoder(sparse=False).fit_transform(day_one_hot)[np.newaxis, ...]

In [None]:
# One hot encoding of 'Sub Asset Class'
# Integer-code the labels, then fit a dense one-hot encoder on those codes
# (as a single-column 2-D array). Only the fitted encoders are kept.
sac_int = LabelEncoder()
sac_int.fit(train['Sub Asset Class'])
sac_one_hot = OneHotEncoder(sparse=False)
sac_one_hot.fit(sac_int.transform(train['Sub Asset Class']).reshape(-1, 1))

In [None]:
# One hot encoding of 'Asset Class'
# Integer-code the labels, then fit a dense one-hot encoder on those codes
# (as a single-column 2-D array). Only the fitted encoders are kept.
ac_int = LabelEncoder()
ac_int.fit(train['Asset Class'])
ac_one_hot = OneHotEncoder(sparse=False)
ac_one_hot.fit(ac_int.transform(train['Asset Class']).reshape(-1, 1))

This is what the function get_batch() below does:

1. Ensures there is enough data to create a lookback window and a target from
the given starting point.
2. Separates the lookback window from the training data.
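3. Separates the target (the value immediately after the lookback window), applies the window normalization to it, and then takes log(1 + x) of it.
4. Applies the window normalization to the lookback window, takes log(1 + x) of it, and adds a feature
dimension.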
5. Gets the days from the precomputed one-hot encoding of days and repeats
it for each time series in the batch.
6. Computes the lag features for year lag, half-year lag, and quarterly lag.
7. Encodes the global feature 'Region' using the previously defined
encoders and repeats it over the lookback window.
8. Does the same as step 7 for 'Sub Asset Class'.
9. Does the same as step 7 for 'Asset Class'.
10. Calculates the year, half-year, and quarterly autocorrelation.
11. Calculates the median for the lookback data.
12. Fuses all these features into one batch.

In [None]:
# Get batch helper function
def get_batch(train,start=1,lookback = 200):
    """Build the feature cube and the targets for one lookback window.

    Relies on the module-level encoders (``day_one_hot``, ``region_int`` /
    ``region_one_hot``, ``sac_int`` / ``sac_one_hot``, ``ac_int`` /
    ``ac_one_hot``) and on ``lag_arr`` and ``batc_autocorr``. The numbered
    comments (#1-#12) mark the processing steps.

    :param train: DataFrame with one series per row; must contain the
        'Region', 'Sub Asset Class' and 'Asset Class' columns and numeric
        values in positional columns ``start`` .. ``start + lookback``
    :param start: positional index of the first column of the window
    :param lookback: number of columns (time steps) in the window
    :return: ``(batch, target)``; ``batch`` has shape
        (n_series, lookback, n_features) and ``target`` has shape
        (n_series, 1)
    :raises AssertionError: if the window would run past the allowed columns
    """
    assert((start + lookback) <= (train.shape[1] - 3)) , 'End of lookback would be out of bounds' #1

    window = train.iloc[:, start:start + lookback].values  #2
    next_value = train.iloc[:, start + lookback].values  # one item beyond the window

    # Window normalization: express every value relative to the first value
    # of its own window, scaled by f. `small` keeps the denominator non-zero
    # when that first value is 0.
    f = 0.1
    small = .001
    base = window[:, :1] + small  # shape (n_series, 1)
    data = ((window / base) - 1) * f
    # The target must share the window's base. It used to be divided by the
    # first column of the *already normalized* window, which put the target
    # on a different scale from the inputs.
    target = ((next_value.reshape(-1, 1) / base) - 1) * f

    target = np.log1p(target)  #3

    log_view = np.expand_dims(np.log1p(data), axis=-1)  #4

    days = day_one_hot[:, start:start + lookback]
    days = np.repeat(days, repeats=train.shape[0], axis=0)  #5

    # NOTE: a lag that is >= lookback yields a feature made of filler only.
    year_lag = lag_arr(log_view, 365, -1)  #6
    halfyear_lag = lag_arr(log_view, 182, -1)
    quarter_lag = lag_arr(log_view, 91, -1)

    region_enc = _one_hot_over_window(train['Region'], region_int, region_one_hot, lookback)  #7
    sac_enc = _one_hot_over_window(train['Sub Asset Class'], sac_int, sac_one_hot, lookback)  #8
    ac_enc = _one_hot_over_window(train['Asset Class'], ac_int, ac_one_hot, lookback)  #9

    year_autocorr = batc_autocorr(data, lag=365, series_length=lookback)  #10
    halfyr_autocorr = batc_autocorr(data, lag=182, series_length=lookback)
    quarter_autocorr = batc_autocorr(data, lag=91, series_length=lookback)

    # Per-series median of the normalized window, repeated for every step.
    medians = np.median(data, axis=1).reshape(-1, 1, 1)  #11
    medians = np.repeat(medians, lookback, axis=1)

    # The order of the features along the last axis is kept unchanged.
    batch = np.concatenate((log_view,
                            days,
                            year_lag,
                            halfyear_lag,
                            quarter_lag,
                            sac_enc,
                            region_enc,
                            ac_enc,
                            year_autocorr,
                            halfyr_autocorr,
                            quarter_autocorr,
                            medians), axis=2)  #12

    return batch, target


def _one_hot_over_window(labels, label_encoder, one_hot_encoder, lookback):
    """One-hot encode per-series labels and repeat them for every time step."""
    codes = label_encoder.transform(labels).reshape(-1, 1)
    one_hot = np.expand_dims(one_hot_encoder.transform(codes), 1)
    return np.repeat(one_hot, lookback, axis=1)

GENERATOR FUNCTION


In [None]:
# Generate batches function
def generate_batches(train,batch_size = 1, lookback = 200):
    """Endlessly yield ``(X, y)`` batches taken from random lookback windows.

    Rows are visited in order, ``batch_size`` rows at a time; for every batch
    a new random window start is drawn. After the last full batch the
    generator starts over from the first row.

    :param train: DataFrame with one series per row, as expected by
        ``get_batch``
    :param batch_size: number of rows per batch
    :param lookback: window length handed to ``get_batch``
    :return: generator of ``(X, y)`` tuples as produced by ``get_batch``
    :raises ValueError: if ``train`` has fewer rows than ``batch_size``
    """
    num_samples = train.shape[0]
    # Leaves out the last five columns when choosing a window start
    # (presumably the non-price columns plus room for the target — confirm).
    num_steps = train.shape[1] - 5
    num_batches = num_samples // batch_size
    if num_batches < 1:
        # Without this guard the loop below would spin forever and never yield.
        raise ValueError('train has fewer rows than batch_size')

    while True:
        # Loop to create batches
        for i in range(num_batches):
            batch_start = i * batch_size
            batch_end = batch_start + batch_size

            # Random date (column number) at which the lookback window starts.
            # NOTE(review): this can be 0 while get_batch defaults to start=1 —
            # confirm that column 0 holds usable numeric data.
            seq_start = np.random.randint(num_steps - lookback)

            # Forward lookback so a non-default window length takes effect
            # (get_batch used to always fall back to its own default).
            X, y = get_batch(train.iloc[batch_start:batch_end],
                             start=seq_start,
                             lookback=lookback)

            # Yield iterators
            yield X, y

INITIALIZE VARIABLES:

In [None]:
#=timesteps=lookback
# Number of time steps per sample; used as the first entry of every model's
# input_shape and equal to the default lookback of get_batch.
max_len = 200 
# Size of the feature axis; used as the second entry of every model's
# input_shape, so it must equal the number of features get_batch concatenates.
# NOTE(review): 40 depends on how many categories the encoders found — confirm against the data.
n_features = 40

DATA INPUT START

In [None]:
# One series (row) per batch
batch_size = 1

# Data Split: hold out 10% of the rows for validation
train_df, val_df = train_test_split(train, test_size=0.1)
n_train_samples, n_val_samples = len(train_df), len(val_df)

# Training and validation iterators: endless batch generators over each split
train_gen = generate_batches(train_df, batch_size=batch_size)
val_gen = generate_batches(val_df, batch_size=batch_size)

In [None]:
# Shapes of training and validation data: (rows, columns) of each split
print(train_df.shape)
print(val_df.shape)

COMPILE convolutional:

In [None]:
# Model setting
# Two Conv1D (16 filters, kernel size 5) -> ReLU -> MaxPool (size 5) stages,
# flattened into a single linear output unit; trained on mean absolute error.
model = Sequential([
    Conv1D(16, 5, input_shape=(max_len, n_features)),
    Activation('relu'),
    MaxPool1D(5),
    Conv1D(16, 5),
    Activation('relu'),
    MaxPool1D(5),
    Flatten(),
    Dense(1),
])

model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
# Model fitting
# Train for a single epoch; the step counts are derived from the number of
# rows in each split so that one epoch draws one batch per group of rows.
hist = model.fit_generator(
    train_gen,
    steps_per_epoch=n_train_samples // batch_size,
    validation_data=val_gen,
    validation_steps=n_val_samples // batch_size,
    epochs=1,
)

RESULTS Convolutional

In [None]:
# Report the validation loss as-is. The model is compiled with
# mean_absolute_error, so a square root (an MSE -> RMSE conversion) would
# not give a meaningful number and would not be comparable with the losses
# printed for the other models.
print(hist.history['val_loss'])
# Validation loss of the first epoch, kept for the model comparison.
Convolutional_loss=hist.history['val_loss'][0]

COMPILE SimpleRNN

In [None]:
# Model setting
# A single 16-unit SimpleRNN layer followed by one linear output unit;
# trained on mean absolute error.
model = Sequential([
    SimpleRNN(16, input_shape=(max_len, n_features)),
    Dense(1),
])

model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
# Model fitting
# Train for a single epoch; the step counts are derived from the number of
# rows in each split so that one epoch draws one batch per group of rows.
hist = model.fit_generator(
    train_gen,
    steps_per_epoch=n_train_samples // batch_size,
    validation_data=val_gen,
    validation_steps=n_val_samples // batch_size,
    epochs=1,
)

RESULTS SimpleRNN

In [None]:
# Print the recorded validation losses of the fit above
print(hist.history['val_loss'])
# Keep the first entry (the model was fitted with epochs=1) for the comparison
SimpleRNN_loss=hist.history['val_loss'][0]

COMPILE ComplexRNN 

In [None]:
# Model setting
# Three stacked SimpleRNN layers (32, 16 and 16 units); the first two return
# full sequences so the next recurrent layer receives one vector per step.
# A single linear output unit follows; trained on mean absolute error.
model = Sequential([
    SimpleRNN(32, return_sequences=True, input_shape=(max_len, n_features)),
    SimpleRNN(16, return_sequences=True),
    SimpleRNN(16),
    Dense(1),
])

model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
# Model fitting
# Train for a single epoch; the step counts are derived from the number of
# rows in each split so that one epoch draws one batch per group of rows.
hist = model.fit_generator(
    train_gen,
    steps_per_epoch=n_train_samples // batch_size,
    validation_data=val_gen,
    validation_steps=n_val_samples // batch_size,
    epochs=1,
)

RESULTS ComplexRNN

In [None]:
# Print the recorded validation losses of the fit above
print(hist.history['val_loss'])
# Keep the first entry (the model was fitted with epochs=1) for the comparison;
# this is the stacked-SimpleRNN ("ComplexRNN") result
SimpleRNNagain_loss=hist.history['val_loss'][0]

COMPILE CuDNNLSTM

In [None]:
# Model setting
# A single 16-unit CuDNNLSTM layer followed by one linear output unit;
# trained on mean absolute error.
model = Sequential([
    CuDNNLSTM(16, input_shape=(max_len, n_features)),
    Dense(1),
])

model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
# Model fitting
# Train for a single epoch; the step counts are derived from the number of
# rows in each split so that one epoch draws one batch per group of rows.
hist = model.fit_generator(
    train_gen,
    steps_per_epoch=n_train_samples // batch_size,
    validation_data=val_gen,
    validation_steps=n_val_samples // batch_size,
    epochs=1,
)

RESULTS CuDNNLSTM

In [None]:
# Print the recorded validation losses of the fit above
print(hist.history['val_loss'])
# Keep the first entry (the model was fitted with epochs=1) for the comparison
CuDNNLSTM_loss=hist.history['val_loss'][0]

MODEL EVALUATION

Below we calculate the MAD/Mean ratios of our neural network results. We avoided using MAPE and instead use the mean absolute deviation (MAD), which is the validation loss of each model. The MAD/Mean ratio is then obtained by dividing the MAD by the mean.

In [None]:
# Summary of validation dataset
# log(1 + x) of the validation values in positional columns 1..1256.
# NOTE(review): the column range is hard-coded — confirm it matches the table.
df = np.log1p(val_df.iloc[:,1:1257])
# Per-column summary statistics, computed once and reused below.
dfd = df.describe().transpose()
print(dfd)

# Calculate Mad/MeanRatio
# Denominator shared by all models: the average of the per-column means.
# NOTE(review): the losses are measured on get_batch's window-normalized
# targets, while this mean is taken over un-normalized log values — confirm
# that the resulting ratio is on a meaningful scale.
mean_log_value = dfd['mean'].mean()
Convolutional_Mad_MeanRatio = Convolutional_loss / mean_log_value
SimpleRNN_Mad_MeanRatio = SimpleRNN_loss / mean_log_value
SimpleRNNagain_Mad_MeanRatio = SimpleRNNagain_loss / mean_log_value
CuDNNLSTM_Mad_MeanRatio = CuDNNLSTM_loss / mean_log_value

# Mad/MeanRatio results
print("Convolutional_Mad_MeanRatio", Convolutional_Mad_MeanRatio)
print("SimpleRNN_Mad/MeanRatio",SimpleRNN_Mad_MeanRatio)
print("SimpleRNNagain_Mad/MeanRatio", SimpleRNNagain_Mad_MeanRatio)
print("CuDNNLSTM_Mad/MeanRatio", CuDNNLSTM_Mad_MeanRatio)