In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_data(dataset, label_col=17):
    """Load a 2-D ``.npy`` matrix and split it into features and labels.

    Parameters
    ----------
    dataset : str or file-like
        Anything accepted by ``np.load``; the stored array is indexed as a
        2-D matrix (rows = samples, columns = fields).
    label_col : int, optional
        Index of the column that holds the label. Defaults to 17, the column
        the rest of this notebook uses as the target.

    Returns
    -------
    data : np.ndarray of float32
        All columns except ``label_col``, in their original order.
    labels : np.ndarray of float32
        The 1-D contents of ``label_col``.

    Raises
    ------
    IndexError
        If the stored array is not 2-D or ``label_col`` is out of range.
    """
    raw = np.load(dataset)
    # Pull the label out first, then drop that same column from the features
    # so the target never leaks into the model input.
    labels = raw[:, label_col].astype(np.float32)
    data = np.delete(raw, label_col, axis=1).astype(np.float32)
    return data, labels

In [3]:
# Keep the raw matrices (label column still included) in memory;
# gen_labels() later reads the label column straight from train_data.
train_data = np.load('train_data.npy')
test_data = np.load('test_data.npy')

In [4]:
# Sanity check: (rows, columns) of the raw training matrix, label column included.
train_data.shape

(1898322, 22)

In [5]:
# Split each file into a float32 feature matrix and its float32 label vector.
train, train_label = load_data('train_data.npy')
test, test_label = load_data('test_data.npy')

In [6]:
# Number of consecutive rows (time steps) in each input window fed to the LSTM.
sequence_length = 5

In [7]:
# function to reshape features into (samples, time steps, features) 
def gen_sequence(id_df, seq_length):
    """Yield consecutive sliding windows of ``seq_length`` rows from ``id_df``.

    Window ``i`` is ``id_df[i:i + seq_length, :]`` for every ``i`` in
    ``range(n_rows - seq_length)``. The last possible window (the one ending
    on the final row) is therefore not produced. No padding is applied, so an
    input with ``n_rows <= seq_length`` yields nothing; such short inputs have
    to be dropped (or padded beforehand) by the caller.
    """
    n_rows = id_df.shape[0]
    for first in range(n_rows - seq_length):
        yield id_df[first:first + seq_length, :]

In [8]:
# Materialize every sliding window and stack them into a single float32
# array laid out as (samples, time steps, features).
seq_gen = list(gen_sequence(train, sequence_length))
seq_array = np.array(seq_gen, dtype=np.float32)
seq_array.shape

(1898317, 5, 21)

In [9]:
# function to generate labels
def gen_labels(id_df, seq_length, label_col=17):
    """Return the label column as an ``(n, 1)`` array, skipping the first rows.

    Parameters
    ----------
    id_df : np.ndarray
        2-D matrix whose column ``label_col`` holds the label.
    seq_length : int
        Number of leading rows to skip. Using the same value as for
        ``gen_sequence`` gives one label per window: the label of the row
        immediately following that window.
    label_col : int, optional
        Index of the label column. Defaults to 17, the column the rest of
        this notebook uses as the target.

    Returns
    -------
    np.ndarray
        Rows ``seq_length`` onward of the label column, shaped ``(n, 1)``.
        Empty (shape ``(0, 1)``) when ``seq_length`` is at least the number
        of rows.
    """
    label_column = id_df[:, label_col].reshape(-1, 1)
    return label_column[seq_length:]

In [10]:
# Build the label column that lines up one-to-one with the windows in
# seq_array, stored as float32.
label_gen = gen_labels(train_data, sequence_length)
label_array = np.array(label_gen, dtype=np.float32)
label_array.shape

(1898317, 1)

In [11]:
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Activation
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [12]:
# Network definition: a single LSTM layer -> dropout -> sigmoid output.
nb_features = seq_array.shape[2]  # size of the last axis of the windowed input
nb_out = label_array.shape[1]     # number of label columns

# A second LSTM(units=100) + Dropout(0.2) stage is deliberately left out
# (it was previously present only as commented-out code).
model = Sequential([
    LSTM(units=100,
         return_sequences=False,
         input_shape=(sequence_length, nb_features)),
    Dropout(0.2),
    Dense(units=nb_out, activation='sigmoid'),
])

# Binary target: sigmoid output paired with binary cross-entropy.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [None]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               48800     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 48,901
Trainable params: 48,901
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
%%time
# fit the network
model.fit(seq_array, label_array, epochs=1000, batch_size=200, validation_split=0.1, verbose=1)

Train on 1708485 samples, validate on 189832 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000