In [2]:
import pandas as pd 
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [3]:
# Column names are supplied explicitly, so pandas treats the first line of the
# file as data instead of as a header.
df = pd.read_csv(
    'LTC-USD.csv',
    names=['time', 'low', 'high', 'open', 'close', 'volume'],
)

In [4]:
# Preview the first rows of the LTC-USD frame loaded above.
df.head()

Unnamed: 0,time,low,high,open,close,volume
0,1528968660,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,96.279999,96.540001,96.5,96.389999,524.539978


In [5]:
# Number of consecutive rows in each input sequence (window length used by preprocess_df).
SEQ_LEN = 60
# How many rows ahead the "future" price is read from (used as the shift distance).
FUTURE_PERIOD_PREDICT = 3 
# Prefix of the "<ratio>_close" column whose movement is being predicted.
RATIO_TO_PREDICT = "BCH-USD"
# Number of epochs and mini-batch size passed to model.fit.
EPOCHS = 10
BATCH_SIZE = 64
# Run label used for the TensorBoard log directory; the trailing timestamp keeps runs distinct.
NAME = f"{SEQ_LEN}--{RATIO_TO_PREDICT}--{FUTURE_PERIOD_PREDICT}--PRED--{int(time.time())}"

In [6]:
def preprocess_df(df):
    """Turn a price/volume frame into balanced, shuffled training sequences.

    Steps:
      1. Discard rows whose ``future`` value is missing, then drop ``future``.
      2. Replace every column except ``target`` with its percent change,
         standardized by ``preprocessing.scale`` (zero mean, unit variance).
      3. Slide a window of ``SEQ_LEN`` consecutive rows over the frame; every
         full window is paired with the ``target`` of its last row.
      4. Keep the same number of target-1 ("buy") and target-0 ("sell")
         windows and shuffle them together.

    Args:
        df: DataFrame with a ``future`` column and a ``target`` column;
            ``target`` must be the last column once ``future`` is removed.
            The caller's frame is not modified.

    Returns:
        Tuple ``(X, y)``. ``X`` is a NumPy array of windows with shape
        ``(n_samples, SEQ_LEN, n_features)``; ``y`` is a plain Python list
        holding the matching targets.
    """
    # A label computed from a missing look-ahead price is not meaningful, so
    # those rows are removed while the column that reveals them still exists.
    # `columns=` replaces the positional axis argument rejected by pandas 2.x.
    df = df.dropna(subset=["future"]).drop(columns="future")

    for col in df.columns:
        if col == "target":
            continue  # the label itself must stay untouched
        # Percent change puts series with very different magnitudes on a
        # comparable footing.
        changes = df[col].pct_change()
        # A change measured from a zero value is +/-inf, which would make
        # preprocessing.scale raise; treat it like any other missing value.
        df[col] = changes.replace([np.inf, -np.inf], np.nan)
        df.dropna(inplace=True)  # also removes the NaN created by pct_change
        df[col] = preprocessing.scale(df[col].values)

    df.dropna(inplace=True)  # final safety net

    # Rolling window: the deque discards its oldest row once SEQ_LEN is reached.
    sequential_data = []
    window = deque(maxlen=SEQ_LEN)
    for row in df.values:
        window.append(list(row[:-1]))  # features only; the last column is the target
        if len(window) == SEQ_LEN:
            sequential_data.append([np.array(window), row[-1]])

    # Split by class. Windows whose target is neither 0 nor 1 are ignored.
    buys = [pair for pair in sequential_data if pair[1] == 1]
    sells = [pair for pair in sequential_data if pair[1] == 0]

    # Shuffle before truncating so the surplus removed from the larger class
    # is a random subset rather than the most recent rows.
    random.shuffle(buys)
    random.shuffle(sells)
    lower = min(len(buys), len(sells))

    balanced = buys[:lower] + sells[:lower]
    random.shuffle(balanced)  # interleave the two classes

    X = [seq for seq, _ in balanced]
    y = [target for _, target in balanced]

    return np.array(X), y

In [7]:
def classify(current, future):
    """Label a price move: 1 when ``future`` is strictly above ``current``, else 0.

    Both arguments are converted with ``float`` before they are compared.
    """
    went_up = float(future) > float(current)
    return 1 if went_up else 0

In [8]:
# Empty frame that the per-ratio close/volume columns are joined onto.
main_df = pd.DataFrame()

In [9]:
ratios = ['BTC-USD','LTC-USD','ETH-USD','BCH-USD']

# Start from an empty frame inside this cell so that it can be re-run safely:
# joining onto an already-populated main_df fails on the duplicate column names.
main_df = pd.DataFrame()

for ratio in ratios:
    dataset = f"{ratio}.csv"

    df = pd.read_csv(dataset,
                     names=['time','low','high','open','close','volume'])
    # Prefix the two columns we keep with the ratio name so they stay
    # distinguishable after the join, and align rows on the timestamp.
    df = (
        df.rename(columns={'close':f"{ratio}_close","volume":f"{ratio}_volume"})
          .set_index('time')
    )
    df = df[[f"{ratio}_close",f"{ratio}_volume"]]

    if len(main_df) == 0:
        main_df = df  # the first ratio defines the index every later join uses
    else:
        main_df = main_df.join(df)

main_df.head()

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,ETH-USD_close,ETH-USD_volume,BCH-USD_close,BCH-USD_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1528968660,6489.549805,0.5871,96.580002,9.6472,,,871.719971,5.675361
1528968720,6487.379883,7.706374,96.660004,314.387024,486.01001,26.019083,870.859985,26.856577
1528968780,6479.410156,3.088252,96.57,77.129799,486.0,8.4494,870.099976,1.1243
1528968840,6479.410156,1.4041,96.5,7.216067,485.75,26.994646,870.789978,1.749862
1528968900,6479.97998,0.753,96.389999,524.539978,486.0,77.355759,870.0,1.6805


In [10]:
# Look-ahead price: the close from FUTURE_PERIOD_PREDICT rows later, aligned to the current row.
# The last FUTURE_PERIOD_PREDICT rows have nothing to look ahead to and get NaN.
main_df['future'] = main_df[f"{RATIO_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)

In [11]:
# Inspect the first rows, including the 'future' column.
main_df.head()

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,ETH-USD_close,ETH-USD_volume,BCH-USD_close,BCH-USD_volume,future
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1528968660,6489.549805,0.5871,96.580002,9.6472,,,871.719971,5.675361,870.789978
1528968720,6487.379883,7.706374,96.660004,314.387024,486.01001,26.019083,870.859985,26.856577,870.0
1528968780,6479.410156,3.088252,96.57,77.129799,486.0,8.4494,870.099976,1.1243,869.98999
1528968840,6479.410156,1.4041,96.5,7.216067,485.75,26.994646,870.789978,1.749862,869.450012
1528968900,6479.97998,0.753,96.389999,524.539978,486.0,77.355759,870.0,1.6805,869.98999


In [12]:
# Label each row by comparing its current close with its look-ahead price.
main_df['target'] = [
    classify(current, future)
    for current, future in zip(main_df[f"{RATIO_TO_PREDICT}_close"], main_df['future'])
]

In [13]:
# Inspect the first rows, including the 'target' labels.
main_df.head()

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,ETH-USD_close,ETH-USD_volume,BCH-USD_close,BCH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968660,6489.549805,0.5871,96.580002,9.6472,,,871.719971,5.675361,870.789978,0
1528968720,6487.379883,7.706374,96.660004,314.387024,486.01001,26.019083,870.859985,26.856577,870.0,0
1528968780,6479.410156,3.088252,96.57,77.129799,486.0,8.4494,870.099976,1.1243,869.98999,0
1528968840,6479.410156,1.4041,96.5,7.216067,485.75,26.994646,870.789978,1.749862,869.450012,0
1528968900,6479.97998,0.753,96.389999,524.539978,486.0,77.355759,870.0,1.6805,869.98999,0


In [14]:
# Every index value (timestamp) of main_df, in ascending order.
times = list(main_df.index.values)
times.sort()

In [15]:
# Timestamp at which the final 5% of the sorted index begins.
# NOTE: with fewer than 20 timestamps the offset is 0, and index -0 selects the
# first timestamp instead.
last_5pt = times[-int(len(times) * 0.05)]
last_5pt

1534922100

In [16]:
# Out-of-sample split: rows at or after last_5pt are held out for validation,
# and everything earlier stays in main_df for training.
validation_main_df = main_df.loc[main_df.index >= last_5pt]
main_df = main_df.loc[main_df.index < last_5pt]

In [17]:
# Build the model inputs for both splits. Keep this order: preprocess_df shuffles with the
# shared `random` module state, so swapping the calls changes which shuffles each split gets.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

In [18]:
# Report the size of each split and its class balance, one message per line.
print(
    f"train data: {len(train_x)} validation: {len(validation_x)}",
    f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}",
    f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}",
    sep="\n",
)

train data: 65728 validation: 2484
Dont buys: 32864, buys: 32864
VALIDATION Dont buys: 1242, buys: 1242


In [19]:
model = Sequential()

# First recurrent layer. It is the only layer that needs input_shape; every
# later layer takes its input shape from the layer before it.
model.add(LSTM(128, input_shape=train_x.shape[1:], return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# return_sequences=True keeps the full sequence so another LSTM can follow.
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Last recurrent layer: emits only its final output, which feeds the dense head.
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

# Two output units, one per class (0 = don't buy, 1 = buy).
model.add(Dense(2, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 60, 128)           70144     
_________________________________________________________________
dropout (Dropout)            (None, 60, 128)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 60, 128)           512       
_________________________________________________________________
lstm_1 (LSTM)                (None, 60, 128)           131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 128)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 60, 128)           512       
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               1

In [20]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): newer TensorFlow/Keras releases may no longer accept `decay`
# on this optimizer class - confirm against the installed version.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

# Targets are integer class ids (0/1) rather than one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy',
             optimizer=opt,
             metrics=['accuracy'])

# Writes training logs under logs/<NAME> for TensorBoard.
tensorboard = TensorBoard(log_dir=f'logs/{NAME}')

# Checkpointing is currently disabled:
# filepath = "RNN_Final-{epoch:02d}"  
# checkpoint = ModelCheckpoint("models/{}.model".format(filepath,
#                         monitor='val_acc', verbose=1, save_best_only=True,
#                         mode='max')) # saves only the best ones

# model.fit needs array targets, so the label lists are converted here.
history = model.fit(train_x, np.array(train_y),
                   batch_size = BATCH_SIZE,
                   epochs=EPOCHS,
                   validation_data = (validation_x, np.array(validation_y)),
                   callbacks=[tensorboard])



Epoch 1/10
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
