In [1]:
import numpy as np
import time
import csv
import pickle

In [2]:
# Release memory when possible due to large list sizes
def release_list(a):
   del a[:]
   del a

In [3]:
def prepare_dataset(path_to_dataset, sequence_length, max_values):
    
    # Read in data set from csv
    # Columns: DatTime, Open, High, Low, Close, Volume for 1 minute intervals
    with open(path_to_dataset) as f:
        reader = csv.reader(f, delimiter=",")
        data = []
        nb_of_values = 0
        for line in reader:
            try:
                data.append(line)
                nb_of_values += 1
            except ValueError:
                pass
            if max_values is not None and nb_of_values > max_values:
                break
    
    # Remove DateTime, convert from string to float
    data_ll = []
    for i in range(1,len(data)):
        data_ll.append([float(v) for j,v in enumerate(data[i]) if j > 0])
    release_list(data) 
    
    # Create the list of sliding window sequences 
    result = []
    for index in range(len(data_ll) - sequence_length):
        result.append(data_ll[index: index + sequence_length])
    release_list(data_ll)
    result = np.array(result, dtype=np.float32) 
    
    # Set mean to 0
    # mean is an  int due to np.array is dtype=np.int16
    open_mean = result[:,:,0].mean()
    close_mean = result[:,:,1].mean()
    high_mean = result[:,:,2].mean()
    low_mean = result[:,:,3].mean()
    volume_mean = result[:,:,4].mean()
    result[:,:,0] -= open_mean
    result[:,:,1] -= close_mean
    result[:,:,2] -= high_mean
    result[:,:,3] -= low_mean
    result[:,:,4] -= volume_mean
    print('----------')
    print('Open mean: ' + str(result[:,:,0].mean()))
    print('High mean: ' + str(result[:,:,1].mean()))
    print('Low mean: ' + str(result[:,:,2].mean()))
    print('Close mean: ' + str(result[:,:,3].mean()))
    print('Volume mean: ' + str(result[:,:,4].mean()))
    print('----------')
    
    # Shuffle training texamples
    # -- training occurs in no particular order
    # -- the distrubtion is uniform (for the batch calculation of the loss) 
    # -- not test set so that we can visualize our predictions with real signals.
    # 90% training, 10% testing
    row = int(round(0.9 * result.shape[0]))
    train = result[:row, :]
    np.random.shuffle(train)
    X_train = train[:, :-1] 
    y_train = train[:, -1, -2]  # want to predict the close in 1 minutes time
    X_test = result[row:, :-1]
    y_test = result[row:, -1, -2]  # want to predict the close in 1 minutes time
    
    print('----------')
    print('X_train shape: ' + str(X_train.shape))
    print('X_test shape: ' + str(X_test.shape))
    print('Y_train shape:' + str(y_train.shape))
    print('Y_test shape: ' + str(y_test.shape))
    print('----------')
    
    print('----------')
    print('Data, first sequence, "sequence length" minute, ohlc+v: ' + str(result[0][-1][:]))
    print('X_train first sequence, "sequence length - 1" minute, ohlc+v: ' + str(X_train[0][-1][:]))
    print('y_train first sequence, close at "sequence length" minute :' + str(y_train[0]))
    print('----------')
    
    return [X_train, y_train, X_test, y_test]

In [None]:
# Only variables you should need to  change
path_to_dataset = 'D:/bitfinexUSD_ohlc.csv'  # path to date, ohlc, volume csv file
filename = 'D:/bitfinexUSD_ready.pk'  # name of file to pickle model ready data to
sequence_length = 50  # size of sliding window
max_values = 500000  # maximum number number of values (minutes) to use

In [None]:
# data = [X_train, y_train, X_test, y_test]
data = prepare_dataset(path_to_dataset, sequence_length, max_values)

with open(filename, 'wb') as handle:
    pickle.dump(data, handle)

----------
Open mean: -0.000218632
High mean: -9.0254e-05
Low mean: 0.000625668
Close mean: -0.000237094
Volume mean: -1.96807e-05
----------
----------
X_train shape: (449955, 49, 5)
X_test shape: (49995, 49, 5)
Y_train shape:(449955,)
Y_test shape: (49995,)
----------
----------
Data, first sequence, "sequence length" minute, ohlc+v: [ 172.01077271  171.56350708  172.46130371  172.01812744  -17.13773537]
X_train first sequence, "sequence length - 1" minute, ohlc+v: [ 172.03079224  171.59353638  172.48132324  172.04815674  -17.09383583]
y_train first sequence, close at "sequence length" minute :172.018
----------
