In [420]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
%matplotlib inline

In [421]:
combined_df = pd.read_csv('combined_df.csv', index_col='date', infer_datetime_format=True, parse_dates=True)

In [422]:
X = combined_df.drop(columns = 'class')
y = combined_df['class'].to_frame()

# standardize X values
X = (X - X.mean())/X.std()

In [423]:
# check the class balance of data
y.value_counts()

class
 0       1391
 1         36
-1         23
dtype: int64

In [424]:
# define function to chunk the features data up with a rolling window of n days
# returns dataframe
def features(df, columns, steps):
    features = pd.DataFrame()
    for c in range(len(columns)):
        feature = []
        temp = dict()
        for i in range(len(df) - steps):
            f = df.iloc[i : i + steps, columns[c]].values
            feature.append(f)
        temp.update({df.columns[c]:feature})
        features = pd.concat([features, pd.DataFrame(temp)], axis=1)
    return features

In [425]:
X = features(X, list(range(8)), 10)

In [426]:
X.columns = combined_df.columns[1:]

In [427]:
# define function to chunk the target data up with a rolling window of n days
# returns array
def target(df, column, steps):
    target = []
    for i in range(len(df) - steps):
        t = df.iloc[i + steps, column]
        target.append(t)
    return np.array(target).reshape(-1, 1)

In [428]:
y = target(combined_df, 0, 10)

In [429]:
# split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [394]:
X_train.head()

Unnamed: 0,volume,google trend,Reddit positive,Reddit negative,Google positive,Google negative,reddit buzzword score,google buzzword score
1041,"[0.28631886907554177, 0.13430486270149553, -0....","[0.7842117901993176, 0.7268151129980124, 0.999...","[0.12644754475407072, 0.4336114708624002, 0.68...","[-0.15289189318320845, 0.08370708773000805, -0...","[0.07968865790303666, -1.1749101907331594, -0....","[1.0924479725525673, 0.4417032883617931, -1.37...","[-0.03395561704335166, 0.23559295792508458, 0....","[-0.8811116254867974, 0.10052123742994877, 0.3..."
547,"[-0.8261373974593625, -0.6639459584818282, 0.3...","[-0.2728436815913889, -0.2561029840743415, 0.9...","[0.4615354641449756, 0.880395363383606, 0.8803...","[0.04427392424447197, -0.19232505666874453, -0...","[-0.23396105425601255, 0.3306084276302756, -1....","[0.8526999310085978, -0.24329111604954787, -1....","[-0.08436079252274047, 0.567672850743787, 0.56...","[-0.3522010386682803, -0.49387351728038265, -0..."
410,"[-0.5962650856500092, -0.5908052636763478, -0....","[1.2075122845589445, 1.269692018193692, 1.2433...","[0.3777634842972494, -0.152792388071683, -0.18...","[0.320306068643224, 0.7146377034985848, 0.2414...","[-0.6417056800627761, -1.6453847589717328, -0....","[0.7842004905674635, 2.1884390196107133, -0.03...","[0.16015163195422163, 0.14479491958549134, -0....","[0.498463488464835, -0.6317680631294965, 0.897..."
1096,"[0.6664179299762324, 0.6138834291436025, -0.12...","[0.4852707631091857, 0.066753325183001, -0.198...","[-0.09694440150653219, 0.23814351788437235, 0....","[0.7146377034985848, -0.46835720106749656, 0.3...","[-0.3907859103355367, -1.551289845324018, 1.14...","[0.5102027288029273, 0.6129518894646285, -0.38...","[0.37129683962521237, 0.7452023362243423, 0.62...","[0.9089988487096816, 1.302533511521078, 1.4964..."
649,"[-0.4470288446735289, 0.5195974867128577, -0.1...","[0.22698571570331172, 0.46135548094197515, 0.8...","[0.6011554305578519, 0.6011554305578519, 0.517...","[-0.5472235280385688, -0.5472235280385688, 0.2...","[2.243871671800474, 1.5224773338346618, -0.233...","[-1.6817793653133644, -0.894035800240322, 1.70...","[0.18955503447983632, 0.18955503447983632, 0.0...","[-1.8211872280106602, 1.1621203438299716, 1.33..."


In [395]:
y_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [396]:
# resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
X_resampled, y_resampled = RandomOverSampler(random_state=1).fit_resample(X_train, y_train)

# View the count of target classes with Counter
Counter(y_resampled)

Counter({0: 1038, -1: 1038, 1: 1038})

In [397]:
# define function to change the features dataframe to array
def re_features(df, columns):
    re_features = []
    for c in range(len(df.columns)):
        re_feature = []
        for i in range(len(df)):
            f = df.iloc[i, columns[c]]
            re_feature.append(f)
        re_features.append(re_feature)
    return np.array(re_features).T

In [398]:
X_resampled = re_features(X_resampled, list(range(8)))

In [399]:
X_resampled.shape

(10, 3114, 8)

In [401]:
# reshape the training features array to feed to a LSTM model
X_resampled = X_resampled.reshape(3114, 10, 8)

In [402]:
X_test = re_features(X_test, list(range(8)))

In [403]:
X_test.shape

(10, 360, 8)

In [404]:
# reshape the test features array to feed to a LSTM mode
X_test = X_test.reshape(360, 10, 8)

In [405]:
y_resampled.shape

(3114,)

In [406]:
# reshape the training target array
y_resampled = y_resampled.reshape(-1, 1)

In [407]:
# one-hot-encode y values to feed to a LSTM model
y_resampled = to_categorical(y_resampled, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

In [408]:
y_resampled.shape

(3114, 3)

In [409]:
y_test.shape

(360, 3)

In [410]:
# construct a LSTM model
model = Sequential()
neurons = 30
model.add(LSTM(units=neurons, return_sequences=True, input_shape=(X_resampled.shape[1], 8)))
model.add(Dropout(0.2))
model.add(LSTM(units=neurons))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))

In [411]:
# compile the model
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['categorical_accuracy'])

In [412]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_16 (LSTM)              (None, 10, 30)            4680      
                                                                 
 dropout_16 (Dropout)        (None, 10, 30)            0         
                                                                 
 lstm_17 (LSTM)              (None, 30)                7320      
                                                                 
 dropout_17 (Dropout)        (None, 30)                0         
                                                                 
 dense_8 (Dense)             (None, 3)                 93        
                                                                 
Total params: 12,093
Trainable params: 12,093
Non-trainable params: 0
_________________________________________________________________


In [413]:
# define earliystopping
stop = EarlyStopping(monitor='categorical_accuracy', mode='min', patience=10, verbose=1)

In [414]:
# train the model
model.fit(X_resampled, y_resampled, epochs=100, batch_size=5, verbose=1, shuffle=False, callbacks=[stop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 00017: early stopping


<keras.callbacks.History at 0x1c386622f88>

In [415]:
score = model.evaluate(X_test, y_test)[1]
print("%0.2f accuracy" % (score))

0.04 accuracy
