In [1]:
import pandas as pd
import numpy as np

from keras import Input
from keras.engine import Model
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
from keras.layers import Concatenate, concatenate
from keras.callbacks import TensorBoard

Using TensorFlow backend.


In [11]:

# features is a list of strings of feature names 

def build_model(features, data_length):
    
    inputs_list = [] 
    for feature_name in features:
        inputs_list.append((Input(shape=(data_length,1), name=feature_name)))
    
    layers = [] 
    for i, input_name in enumerate(inputs_list): 
        layers.append(LSTM(64, return_sequences=False)(inputs_list[i]) )
        
    output = concatenate(layers) 
    output = Dense(1, activation='softmax', name='IsSpike')(output)
    
    model = Model(
        inputs = inputs_list,
        outputs = [output]
    )
    
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
    
    return model    

data_length = 10


In [12]:
from sklearn.preprocessing import MinMaxScaler

master_df = pd.read_csv('C:/Users/Shoya/surf/data/master_df.csv', encoding='latin1')
df = master_df[['Timestamp', 'Close', 'Volume_(BTC)', 'Volume_(Currency)', 'Date(UTC)', 'Bitcoin (Adj.Overlap)', 
               'Close Price % Change', 'Close Price % Change (Abs)', 'Is Spike']]

# lag inputs depending on data_length 
df['Price_lagged'] = df['Close'].shift(data_length)
df['Volume_BTC'] = df['Volume_(BTC)'].shift(data_length)
df['Bitcoin_Adj'] = df['Bitcoin (Adj.Overlap)'].shift(data_length)

df = df.dropna()
cols = ['Volume_BTC','Bitcoin_Adj', 'Close', 'Price_lagged']

# Stationalize Data by taking log differences
data_array = np.diff(np.log(df[cols]), axis=0)

# Min-Max Scale 

scalers = {}
datas = [] 

df_scaled = pd.DataFrame(columns=cols)

for i in range(len(cols)): 
    scalers[cols[i]] = MinMaxScaler()
    #print('data', data_array[:,i])
    
    col_data = data_array[:,i]
    col_data = np.reshape(col_data, (len(col_data), 1))
    
    data = scalers[cols[i]].fit_transform( col_data )  #:, np.newaxis
    #print('scaled', data)
    data = np.reshape(data, (1, len(data)))
    df_scaled[cols[i]] = data[0]
    
df_scaled['Is Spike'] = df['Is Spike']
df_scaled.dropna(inplace=True)
display(df_scaled.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Volume_BTC,Bitcoin_Adj,Close,Price_lagged,Is Spike
10,0.742111,0.422363,0.54954,0.463739,0.0
11,0.537603,0.484887,0.496233,0.54546,-1.0
12,0.557162,0.407654,0.574526,0.550962,0.0
13,0.18377,0.44331,0.533819,0.548729,0.0
14,0.754163,0.431608,0.54056,0.514868,0.0


In [10]:
# split and reshape data to feed into RNN

# X_timestamp = df_scaled['Timestamp'].values
X_volume = df_scaled['Volume_BTC'].values
X_trends = df_scaled['Bitcoin_Adj'].values
X_lagged_price = df_scaled['Price_lagged'].values

Y_is_spike = df_scaled['Is Spike'].values 

train_size = int(len(X_volume) * 0.85)
train_size = int(train_size/data_length) * data_length

test_size_index = int(len(X_volume)/data_length)*data_length

# X_train_timestamp, X_test_timestamp = X_timestamp[:train_size], X_timestamp[train_size:test_size_index ]
X_train_volume, X_test_volume = X_volume[:train_size], X_volume[train_size:test_size_index ]
X_train_trends, X_test_trends = X_trends[:train_size], X_trends[train_size:test_size_index ]
X_train_lagged_price, X_test_lagged_price = X_lagged_price[:train_size], X_lagged_price[train_size:test_size_index ]

Y_train_is_spike, Y_test_is_spike = Y_is_spike[:train_size], Y_is_spike[train_size:test_size_index ]


# X.shape is (samples, timesteps, dimension) 
# timestemps is 15, samples is just however many nobs there are (but it doesn't matter, so it should be None)



# X_train_timestamp = np.reshape(X_train_timestamp, (int(X_train_timestamp.shape[0]/data_length),data_length,1) ) 
X_train_volume = np.reshape(X_train_volume, (int(X_train_volume.shape[0]/data_length),data_length,1) ) 
X_train_trends = np.reshape(X_train_trends, (int(X_train_trends.shape[0]/data_length),data_length,1) ) 
X_train_lagged_price = np.reshape(X_train_lagged_price, (int(X_train_lagged_price.shape[0]/data_length), data_length, 1))

# X_test_timestamp = np.reshape(X_test_timestamp, (int(X_test_timestamp.shape[0]/data_length),data_length,1) ) 
X_test_volume = np.reshape(X_test_volume, (int(X_test_volume.shape[0]/data_length),data_length,1) ) 
X_test_trends = np.reshape(X_test_trends, (int(X_test_trends.shape[0]/data_length),data_length,1) )  
X_test_lagged_price = np.reshape(X_test_lagged_price, (int(X_test_lagged_price.shape[0]/data_length),data_length,1))


# Don't need the 1 for the third dimension for Y's??


Y_train_is_spike = np.reshape(Y_train_is_spike, (int(Y_train_is_spike.shape[0]/data_length),  data_length) ) 
Y_test_is_spike = np.reshape(Y_test_is_spike, (int(Y_test_is_spike.shape[0]/data_length),  data_length) )



# instead of using input 1,2,3,4,5,6,7,8,9,10 to predict output for 11,12,13,14,15,16,17,18,19,20
# I want to use input 1,2,3,4,5,6,7,8,9,10 to predict output for 11, then 2,3,4,5,6,7,8,9,10,11 to predict output for 12 

# right now I am actually feeding input 1,2,3,4,5,6,7,8,9,10 to predict output for 1,2,3,4,5,6,7,8,9,10. 
# instead I should at least feed 1,2,3..8,9,10 to predict 11,12,13,14,15,16,17,18,19,20 -> lag everything by data_length! 

array([[[ 0.77679111],
        [ 0.4633158 ],
        [ 0.72507941],
        ..., 
        [ 0.43692676],
        [ 0.7421114 ],
        [ 0.53760341]],

       [[ 0.55716235],
        [ 0.18376964],
        [ 0.75416316],
        ..., 
        [ 0.3839469 ],
        [ 0.50736037],
        [ 0.69401107]],

       [[ 0.55575319],
        [ 0.37927431],
        [ 0.67141699],
        ..., 
        [ 0.6548668 ],
        [ 0.62274773],
        [ 0.30933732]],

       ..., 
       [[ 0.25876546],
        [ 0.488586  ],
        [ 0.47574397],
        ..., 
        [ 0.76213578],
        [ 0.48108754],
        [ 0.41414922]],

       [[ 0.47271567],
        [ 0.70090827],
        [ 0.4060511 ],
        ..., 
        [ 0.74667333],
        [ 0.39332509],
        [ 0.59644994]],

       [[ 0.5602949 ],
        [ 0.46956629],
        [ 0.30546802],
        ..., 
        [ 0.45286573],
        [ 0.66948196],
        [ 0.63114748]]])

In [5]:
features = ['Volume_BTC', 'Bitcoin_Adj', 'Price_lagged']

rnn = build_model(features, 10) 

tensorboard_callback = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)

history = rnn.fit(
    [
        #X_train_timestamp,
        X_train_volume,
        X_train_trends,
        X_train_lagged_price
    ],
    [
        Y_train_is_spike
    ]
    ,
    validation_data=(
        [
            #X_test_timestamp,
            X_test_volume,
            X_test_trends,
            X_test_lagged_price
        ],
        [
            Y_test_is_spike
        ]),
    epochs=10,
    batch_size=32,
    callbacks=[
      tensorboard_callback
    ],
    verbose=1
)

Train on 2386 samples, validate on 421 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [6]:
score = rnn.evaluate(
    [
        #X_test_timestamp,
        X_test_volume,
        X_test_trends,
        X_test_lagged_price
    ],
    [
        Y_test_is_spike
    ])

print("Accuracy: %.2f%%" % (score[1]*100))

Accuracy: 4.75%


In [7]:
yhat = rnn.predict( 
    [
        #X_test_timestamp,
        X_test_volume,
        X_test_trends,
        X_test_lagged_price
    ],
    verbose=0
)

display(yhat)

array([[ 0.04118741,  0.04820677,  0.01170081, ..., -0.06489895,
        -0.12635459,  0.04566633],
       [ 0.04067064,  0.048839  ,  0.01867078, ..., -0.06471471,
        -0.11981186,  0.04568062],
       [ 0.04209199,  0.04844366,  0.01405586, ..., -0.06447315,
        -0.12522832,  0.04572744],
       ..., 
       [ 0.0394179 ,  0.04518746,  0.01890367, ..., -0.06555228,
        -0.1281513 ,  0.04599499],
       [ 0.0422116 ,  0.04837319,  0.01481877, ..., -0.06414334,
        -0.12189236,  0.04572504],
       [ 0.04029136,  0.04628353,  0.02012031, ..., -0.06370412,
        -0.11830761,  0.04467715]], dtype=float32)

In [None]:
# Need to actually categorize output