In [1]:
import pandas as pd
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, BatchNormalization, Dropout
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [2]:
# Crypto Dataset Link https://pythonprogramming.net/static/downloads/machine-learning-data/crypto_data.zip

In [3]:
# Build one wide frame, indexed by "time", holding the close price and volume of every pair.
data_root = r"D:/Research/LSTM/Sample_Data/crypto_data"
ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"]
main_df = pd.DataFrame()

for ratio in ratios:
    close_col = f"{ratio}_close"
    volume_col = f"{ratio}_volume"

    # Column names are supplied explicitly; only close and volume are kept,
    # renamed with the pair as a prefix so the pairs can sit side by side.
    df = (
        pd.read_csv(
            f"{data_root}/{ratio}.csv",
            names=["time", "low", "high", "open", "close", "volume"],
        )
        .rename(columns={"close": close_col, "volume": volume_col})
        .set_index("time")
    )
    df = df[[close_col, volume_col]]

    # The first pair seeds the frame; every later pair is joined on the time index.
    main_df = df if len(main_df) == 0 else main_df.join(df)

print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968660    6489.549805        0.587100      96.580002        9.647200   
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   

            ETH-USD_close  ETH-USD_volume  BCH-USD_close  BCH-USD_volume  
time                                                                      
1528968660            NaN             NaN     871.719971        5.675361  
1528968720      486.01001       26.019083     870.859985       26.856577  
1528968780      486.00000        8.449400     870.099976        1.124300  
1528968840      485.75000       26.994646     870.789978        1.749862  
1528968900      4

In [4]:
# Starting constants
# Problem statement: use the last SEQ_LEN rows of pricing (one row per minute in this
# dataset) to predict whether the RATIO_TO_PREDICT close price will be higher
# FUTURE_PERIOD_PREDICT rows later.
SEQ_LEN = 60  # number of consecutive rows in each input sequence
FUTURE_PERIOD_PREDICT = 3 # how many rows ahead the label looks (3 minutes)
RATIO_TO_PREDICT = "LTC-USD"  # pair whose close price is being predicted

In [5]:
# Label data
def classify(current, future):
    """Return 1 when the future price is strictly above the current price, otherwise 0.

    Both arguments are converted with float() before comparing, so numeric
    strings are accepted. A NaN on either side compares as "not higher" and
    therefore yields 0.
    """
    went_up = float(future) > float(current)
    return 1 if went_up else 0

In [6]:
# The new "future" column holds the f"{RATIO_TO_PREDICT}_close" value found FUTURE_PERIOD_PREDICT rows further down.
# Example: 1st row of "future" == 4th row of f"{RATIO_TO_PREDICT}_close"
# Example: 2nd row of "future" == 5th row of f"{RATIO_TO_PREDICT}_close"
# The last FUTURE_PERIOD_PREDICT rows have no later value to copy, so their "future" is NaN.
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)

In [7]:
# Add the target label: classify() compares each current close with its "future" close.
main_df['target'] = [
    classify(now, later)
    for now, later in zip(main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future'])
]
# Discard every row that still contains a NaN in any column.
main_df = main_df.dropna()

In [8]:
# We cannot shuffle the data and hold out a random 10%: the sequences are 60 minutes long and the target is only 3 minutes ahead,
# so a randomly drawn sample would have near-identical neighbours left in the training set, making it easy for the RNN to overfit.
# Instead, separate a contiguous chunk of the data.
# For time series data, take that chunk from the end (the most recent data).
# In our data, the last 5% is held out as the out-of-sample data.

In [9]:
# Chronological train / validation split: the most recent ~5% of timestamps are held out.
times = sorted(main_df.index.values)
last_5_percent = times[-int(len(times) * 0.05)]  # first timestamp that belongs to the validation chunk
train_main_df = main_df.loc[main_df.index < last_5_percent]
validation_main_df = main_df.loc[main_df.index >= last_5_percent]

In [10]:
def preprocess_df(df):
    """Turn a labelled price frame into balanced, shuffled LSTM training sequences.

    Steps:
      1. Drop the "future" column (only needed to build "target") and any
         incomplete rows.
      2. Replace every feature column by its row-to-row percent change.
      3. Drop rows whose percent change is NaN or infinite (the first row, and
         rows that follow a zero value) in a single pass, so that all columns
         lose exactly the same rows.
      4. Standardise each feature column with preprocessing.scale (zero mean,
         unit variance, computed from this frame only).
      5. Slide a window of SEQ_LEN consecutive rows over the features; each
         window is labelled with the target of its last row.
      6. Balance the two classes by truncating the larger one, then shuffle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "future" column and a "target" column; every other
        column is treated as a numeric feature. The frame is not modified.

    Returns
    -------
    (numpy.ndarray, list of int)
        X holds the sequences, stacked along the first axis; y is a plain
        Python list with the matching 0/1 labels.
    """
    # "future" was only required for labelling; keeping it would leak the answer.
    df = df.drop(columns='future').dropna()
    feature_cols = [col for col in df.columns if col != 'target']

    # Percent change normalises the very different price / volume magnitudes.
    # A zero followed by a non-zero value yields +/-inf, which scale() rejects,
    # so infinities are treated like missing values.
    pct = df[feature_cols].pct_change().replace([np.inf, -np.inf], np.nan)
    complete = pct.notna().all(axis=1)
    features = pct.loc[complete].values
    targets = df.loc[complete, 'target'].values

    if features.size:
        # Column-wise standardisation (zero mean, unit variance); NOT a 0..1 range.
        features = preprocessing.scale(features)

    # Window ending at row `end - 1` -> label of that same row.
    sequential_data = [
        [features[end - SEQ_LEN:end], targets[end - 1]]
        for end in range(SEQ_LEN, len(features) + 1)
    ]

    # Balance the classes so the model cannot win by always predicting the majority.
    buys = [pair for pair in sequential_data if pair[1] != 0]
    sells = [pair for pair in sequential_data if pair[1] == 0]
    random.shuffle(buys)
    random.shuffle(sells)
    lower = min(len(buys), len(sells))
    balanced = buys[:lower] + sells[:lower]
    # Shuffle again so the model does not see all buys followed by all sells.
    random.shuffle(balanced)

    # Split into X (features) and y (labels).
    X = [seq for seq, _ in balanced]
    y = [int(target) for _, target in balanced]
    return np.array(X), y

In [11]:
# Build the model inputs. Each split is preprocessed on its own, so each call to
# preprocess_df() sees only the rows of the frame it is given.
train_x, train_y = preprocess_df(train_main_df)
val_x, val_y = preprocess_df(validation_main_df)

In [12]:
# Sanity check: sample counts and class balance of both splits.
# train_y / val_y are plain Python lists (see preprocess_df), hence list.count().
print(f"train data: {len(train_x)} validation: {len(val_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {val_y.count(0)}, buys: {val_y.count(1)}")

train data: 68836 validation: 3400
Dont buys: 34418, buys: 34418
VALIDATION Dont buys: 1700, buys: 1700


In [13]:
# Training hyper-parameters.
EPOCHS = 10  # passed to model.fit as `epochs`
BATCH_SIZE = 64  # passed to model.fit as `batch_size`
# Run name used for the TensorBoard log directory; the timestamp keeps runs apart.
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [16]:
# Three stacked LSTM blocks followed by a small dense classifier head.
# All layers must be added to ONE Sequential instance: re-creating the model
# between blocks would throw away every layer added before it.
model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())  # normalizes activation outputs, same reason you want to normalize your input data.

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))  # no return_sequences: the next layer is Dense and expects one vector per sample
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

# Two output units (0 = don't buy, 1 = buy); the labels are integer class ids,
# hence sparse_categorical_crossentropy below.
model.add(Dense(2, activation="softmax"))

opt = tf.keras.optimizers.Adam(lr=0.01, decay=1e-6)

model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# Forward slash works on every platform and avoids the invalid "\{" escape sequence.
tensorboard = TensorBoard(log_dir=f"logs/{NAME}")

# Unique file name that includes the epoch and the validation accuracy of that epoch.
# NOTE(review): the metric key is 'val_acc' on TF 1.x and 'val_accuracy' on TF 2.x -
# confirm against the installed version and adjust both occurrences together.
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"
# The keyword arguments belong to ModelCheckpoint, not to str.format(); otherwise
# they are silently ignored and a checkpoint is written after every epoch.
checkpoint = ModelCheckpoint(
    "models/{}.model".format(filepath),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,  # saves only the best ones
    mode='max',
)

history = model.fit(
    train_x, np.array(train_y),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    # Labels are converted to an array exactly like the training labels.
    validation_data=(val_x, np.array(val_y)),
    callbacks=[tensorboard, checkpoint],
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 68836 samples, validate on 3400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
