In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint, ModelCheckpoint

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

import pandas as pd
from collections import deque
import random
import time
# from sklearn import preprocessing

In [2]:
from load_powerball_xls import load_xlsx

In [3]:
SEQ_LEN = 10  # how many preceding rows (draws) make up one input sequence for the RNN
FUTURE_PERIOD_PREDICT = 3  # how many rows ahead of the current one the value to predict lies
BALL_TO_PREDICT = 'Powerball'  # column name of the ball to predict; NOTE(review): not referenced by any visible cell

In [4]:
EPOCHS = 10  # how many passes through our data
BATCH_SIZE = 64  # samples per batch (passed to model.fit as batch_size). Try smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"  # run name, made unique by the Unix timestamp; used for the TensorBoard log dir and the saved-model path

In [5]:
# Load the draw history with the project-local helper from load_powerball_xls.
# NOTE(review): the helper's source is not visible here; the cells below rely on the
# columns 'Draw', 'Date', '1'-'6', 'Bonus', 'Bonus 2nd' and 'Powerball' being present.
df = load_xlsx()

In [6]:
# df.loc[df['Powerball'] > 0]

In [7]:
# Show only the draws whose Bonus ball is a positive number.
df[df['Bonus'].gt(0)]

Unnamed: 0,Draw,Date,1,2,3,4,5,6,Bonus,Bonus 2nd,Powerball
0,1,1987-08-01,4,29,16,40,8,32,30,,
1,2,1987-08-08,3,9,39,13,36,20,38,,
2,3,1987-08-15,11,26,18,39,22,5,38,,
3,4,1987-08-22,35,2,29,10,4,11,14,,
4,5,1987-08-29,23,7,3,8,12,11,15,,
...,...,...,...,...,...,...,...,...,...,...,...
1941,1942,2020-03-14,36,25,9,23,13,22,37,,2
1942,1943,2020-03-18,36,12,18,2,28,20,32,,10
1943,1944,2020-03-21,9,18,16,20,28,19,7,,5
1944,1945,2020-03-25,17,8,27,7,10,21,29,,7


In [8]:
def classify(current, future):
    """Label a (current, future) value pair for binary classification.

    Both arguments are converted with ``float()`` before being compared.

    Returns:
        1 if ``future`` is strictly greater than ``current``, otherwise 0
        (ties and comparisons involving NaN therefore yield 0).
    """
    return 1 if float(future) > float(current) else 0

In [20]:
# Inspect the full row of the draw with the largest 'Draw' number.
df.loc[df['Draw'].idxmax(), :]

Draw                        1946
Date         2020-03-28 00:00:00
1                              3
2                              2
3                             39
4                             19
5                             35
6                             37
Bonus                         23
Bonus 2nd                   None
Powerball                     10
Name: 1945, dtype: object

In [17]:
# Largest draw number in the frame, as a plain Python int.
last_draw = int(df.loc[df['Draw'].idxmax(), 'Draw'])
last_draw

1946

In [7]:
# Show the rows whose draw number is below 2.
df[df['Draw'].lt(2)]

Unnamed: 0,Draw,Date,1,2,3,4,5,6,Bonus,Bonus 2nd,Powerball
0,1,1987-08-01,4,29,16,40,8,32,30,,


In [31]:
# Column dtypes; a bare last expression lets the notebook render the result itself.
df.dtypes

Draw                  int64
Date         datetime64[ns]
1                     int64
2                     int64
3                     int64
4                     int64
5                     int64
6                     int64
Bonus                 int64
Bonus 2nd            object
Powerball            object
dtype: object


In [25]:
# The six main balls of the latest draw, selected in a single .loc call.
df.loc[df['Draw'] == last_draw, list("123456")]

Unnamed: 0,1,2,3,4,5,6
1945,3,2,39,19,35,37


In [30]:
# The six main balls of the most recent SEQ_LEN draws (draw number above last_draw - SEQ_LEN).
df.loc[df['Draw'] > last_draw - SEQ_LEN, list("123456")]

Unnamed: 0,1,2,3,4,5,6
1936,30,8,2,33,21,3
1937,4,18,27,25,32,28
1938,30,39,36,5,14,40
1939,26,39,20,7,14,23
1940,22,29,21,13,40,8
1941,36,25,9,23,13,22
1942,36,12,18,2,28,20
1943,9,18,16,20,28,19
1944,17,8,27,7,10,21
1945,3,2,39,19,35,37


In [22]:
# Bonus ball of the latest draw (a one-element Series).
df.loc[df['Draw'] == last_draw, "Bonus"]

1945    23
Name: Bonus, dtype: int64

In [90]:
# Arrange from future to past (unfinished).
# So far this cell only selects the six main ball columns; the commented-out lines
# are a sketch of a history_window(index, window) helper.

# def history_window(index, window):
now = df[list("123456")]
now
# past = df.index-window
# return now-past

Unnamed: 0,1,2,3,4,5,6
0,4,29,16,40,8,32
1,3,9,39,13,36,20
2,11,26,18,39,22,5
3,35,2,29,10,4,11
4,23,7,3,8,12,11
...,...,...,...,...,...,...
1941,36,25,9,23,13,22
1942,36,12,18,2,28,20
1943,9,18,16,20,28,19
1944,17,8,27,7,10,21


In [50]:
# NOTE(review): history_window is not defined in any visible cell of this notebook
# (only a commented-out sketch of it exists), so this call presumably relied on a
# definition that has since been removed. Confirm, or restore the function, before
# re-running on a fresh kernel.
history_window(df, 5)

Int64Index([5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
            ...
            5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
           dtype='int64', length=1946)

In [48]:
def preprocess_df(df, seq_len=None):
    """Turn a labelled frame into balanced, shuffled sequences for the RNN.

    Steps:
      1. Drop the helper column ``"future"``.
      2. Replace every feature column (everything except ``"target"``) by its
         percentage change, standardised to zero mean and unit variance.
      3. Slide a window of ``seq_len`` consecutive rows over the frame; each
         full window is paired with the ``"target"`` value of its last row.
      4. Balance the two classes by truncating the larger one, then shuffle.

    Args:
        df: DataFrame with a ``"future"`` column, a ``"target"`` column holding
            0/1 labels, and numeric feature columns. The caller's frame is not
            modified (all work happens on the copy returned by ``drop``).
        seq_len: Window length; defaults to the notebook-level ``SEQ_LEN``.

    Returns:
        ``(X, y)``: ``X`` is an array of shape
        ``(n_samples, seq_len, n_features)`` and ``y`` is a plain list of
        labels aligned with ``X``.

    Raises:
        KeyError: if ``df`` lacks the ``"future"`` or ``"target"`` column.
    """
    if seq_len is None:
        seq_len = SEQ_LEN

    # Keyword form: the positional ``axis`` argument of drop() was removed in pandas 2.0.
    df = df.drop(columns="future")  # don't need this anymore

    feature_cols = [col for col in df.columns if col != "target"]
    for col in feature_cols:  # normalise everything except the target itself
        # Percentage change puts columns of very different magnitude on a comparable footing.
        change = df[col].pct_change()
        # A previous value of 0 yields +/-inf; treat those as missing so they are dropped.
        df[col] = change.replace([np.inf, -np.inf], np.nan)
        df = df.dropna()  # removes the leading NaN created by pct_change
        values = df[col].to_numpy(dtype=float)
        if values.size:
            spread = values.std()
            # Standardise to zero mean / unit variance; a constant column is only
            # centred so there is no division by zero.
            df[col] = (values - values.mean()) / (spread if spread else 1.0)

    df = df.dropna()  # cleanup again, just in case

    # Select features and labels by name instead of assuming "target" is the last column.
    features = df[feature_cols].to_numpy()
    targets = df["target"].tolist()

    # The deque keeps only the latest seq_len rows, discarding older ones as new rows arrive.
    window = deque(maxlen=seq_len)
    sequential_data = []
    for row, target in zip(features, targets):
        window.append(row)
        if len(window) == seq_len:  # only emit complete windows
            sequential_data.append((np.array(window), target))

    # Shuffle first so the truncation below keeps a random subset of the larger class.
    random.shuffle(sequential_data)
    ones = [pair for pair in sequential_data if pair[1] == 1]
    zeros = [pair for pair in sequential_data if pair[1] == 0]

    lower = min(len(ones), len(zeros))  # size of the smaller class
    balanced = ones[:lower] + zeros[:lower]
    random.shuffle(balanced)  # mix the two classes so they are not seen as two solid runs

    if not balanced:
        # Keep the 3-D shape even when no balanced sample could be built.
        return np.empty((0, seq_len, len(feature_cols))), []

    X = np.array([seq for seq, _ in balanced])
    y = [target for _, target in balanced]
    return X, y

In [None]:
# Build the combined price/volume frame, label it, split off a validation tail and
# turn both parts into sequences.
# NOTE(review): this cell reads crypto price CSVs from training_datas/, not the
# Powerball frame loaded earlier in the notebook — confirm this is still the intended input.
main_df = pd.DataFrame()  # begin empty

ratios = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 ratios we want to consider

# RATIO_TO_PREDICT is used below but is not assigned in any visible cell, which is a
# NameError on a fresh kernel. An existing definition is kept; otherwise "LTC-USD"
# (one of `ratios`) is used as a placeholder — TODO confirm the ticker to predict.
RATIO_TO_PREDICT = globals().get("RATIO_TO_PREDICT", "LTC-USD")

for ratio in ratios:  # begin iteration

    ratio = ratio.split('.csv')[0]  # strips a '.csv' suffix if one is present
    print(ratio)
    dataset = f'training_datas/{ratio}.csv'  # get the full path to the file.

    # Use a dedicated name: reusing `df` here would overwrite the notebook-wide
    # Powerball frame that the earlier cells depend on.
    ratio_df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])

    # Prefix close/volume with the ticker so the joined columns stay distinguishable,
    # index by time so the frames can be joined on it, and keep only price and volume.
    ratio_df = ratio_df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"})
    ratio_df = ratio_df.set_index("time")
    ratio_df = ratio_df[[f"{ratio}_close", f"{ratio}_volume"]]

    # The first frame becomes the base; later ones are joined onto it.
    main_df = ratio_df if main_df.empty else main_df.join(ratio_df)

# Fill gaps with the previously known value, then drop rows that are still incomplete.
main_df = main_df.ffill().dropna()

# The value FUTURE_PERIOD_PREDICT rows ahead, and the 0/1 label derived from it.
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))

# The last FUTURE_PERIOD_PREDICT rows have no future value; drop them.
main_df = main_df.dropna()

## here, split away some slice of the future data from the main main_df.
times = sorted(main_df.index.values)
# Hold out at least one timestamp: with int(...) == 0, times[-0] would be the first
# timestamp and the whole frame would end up in the validation set.
holdout = max(1, int(0.05 * len(times)))
last_5pct = times[-holdout]

validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]

train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")

# Three stacked LSTM blocks (each followed by dropout and batch normalisation),
# then a small dense head ending in a 2-way softmax.
# LSTM (imported at the top of the notebook) replaces CuDNNLSTM, which is never imported.
# NOTE(review): in TF 2.x the plain LSTM layer is documented to pick the cuDNN
# kernel on GPU when left at its default arguments — confirm for the installed version.
model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))  # last recurrent layer returns only its final output
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))  # one probability per class (0 / 1)


# `learning_rate` is the current argument name; `lr` is a deprecated alias.
# NOTE(review): recent Keras releases no longer accept `decay` — confirm against
# the installed TensorFlow version.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)

# Compile model; the sparse loss takes integer class labels rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

# File name template filled in by Keras at save time with the epoch number and that
# epoch's validation accuracy.
# NOTE(review): with metrics=['accuracy'] TF 2.x logs the key 'val_accuracy'
# (older releases used 'val_acc') — confirm for the installed version.
filepath = "RNN_Final-{epoch:02d}-{val_accuracy:.3f}"

# The options must go to ModelCheckpoint itself. They used to sit inside the
# str.format(...) call, where they were silently ignored, so the callback ran with
# its defaults instead of keeping only the best model.
checkpoint = ModelCheckpoint(
    "models/{}.model".format(filepath),
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='max',
)  # saves only the best ones

# preprocess_df returns the labels as plain Python lists; TF 2.x fit()/evaluate()
# reject a list of ints next to ndarray inputs, so convert them to arrays first.
train_labels = np.asarray(train_y)
validation_labels = np.asarray(validation_y)

# Train model
history = model.fit(
    train_x, train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_labels),
    callbacks=[tensorboard, checkpoint],
)

# Score model (these "Test" figures are computed on the validation split)
score = model.evaluate(validation_x, validation_labels, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))