In [4]:
# import the libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from numpy import arange, sin, pi, random

# Global hyper-parameters
sequence_length = 1940
random_data_dup = 5  # each sample randomly duplicated between 0 and 9 times, see dropin function
epochs = 1
batch_size = 50

# consider delay threshold and missing segments
def get_range_proba(predict, label, delay=7):

    splits = np.where(label[1:] != label[:-1])[0] + 1
    is_anomaly = label[0] == 1
    new_predict = np.array(predict)
    pos = 0

    for sp in splits:
        if is_anomaly:
            if 1 in predict[pos:min(pos + delay + 1,sp)]:
                new_predict[pos: sp] = 1
            else:
                new_predict[pos: sp] = 0
        is_anomaly = not is_anomaly
        pos = sp
    sp = len(label)

    if is_anomaly:  #anomaly in the end
        if 1 in predict[pos: min(pos + delay+1,sp)]:
            new_predict[pos: sp] = 1
        else:
            new_predict[pos: sp] = 0
    return new_predict

# set missing = 0
def reconstruct_label(timestamp, label):
    timestamp = np.asarray(timestamp, np.int64)
    timestamp_sorted = np.asarray(timestamp[np.argsort(timestamp)])
    interval = np.min(np.diff(timestamp_sorted))
    if interval == 0:
        print(timestamp_sorted)
    idx = (timestamp_sorted - timestamp_sorted[0]) // interval
    new_label = np.zeros(shape=((timestamp_sorted[-1] - timestamp_sorted[0]) // interval + 1,), dtype=np.int)
    new_label[idx] = label
    return new_label

def label_evaluation(truth_df, result_df, delay=7):
    data = {'result': False, 'data': "", 'message': ""}
    kpi_names = truth_df['KPI ID'].values
    kpi_names = np.unique(kpi_names)
    y_true_list = []
    y_pred_list = []
    for kpi_name in kpi_names:
        truth = truth_df[truth_df["KPI ID"] == kpi_name]
        y_true = reconstruct_label(truth["timestamp"], truth["label"])
        try:
            result = result_df[result_df["KPI ID"] == kpi_name]
            y_pred = reconstruct_label(result["timestamp"], result["predict"])
        except:
            data['message'] = "The file you submitted need contain 'predict','timestamp' and  \
                             'KPI ID' columns"
            return json.dumps(data)
        try:
            assert np.array_equal(len(y_true),len(y_pred)) == True
        except:
            data['message'] = "The length of your submitted file is wrong"
            return json.dumps(data)

        y_pred = get_range_proba(y_pred, y_true, delay)
        y_true_list.append(y_true)
        y_pred_list.append(y_pred)

    fscore = f1_score(np.concatenate(y_true_list), np.concatenate(y_pred_list))
    data['result'] = True
    data['data'] = fscore
    data['message'] = 'success'
    print(json.dumps(data))
    return json.dumps(data)

def dropin(X, y):
    """ The name suggests the inverse of dropout, i.e. adding more samples. See Data Augmentation section at
    http://simaaron.github.io/Estimating-rainfall-from-weather-radar-readings-using-recurrent-neural-networks/
    :param X: Each row is a training sequence
    :param y: Tne target we train and will later predict
    :return: new augmented X, y
    """
    print("X shape:", X.shape)
    print("y shape:", y.shape)
    X_hat = []
    y_hat = []
    for i in range(0, len(X)):
        for j in range(0, np.random.random_integers(0, random_data_dup)):
            X_hat.append(X[i, :])
            y_hat.append(y[i, ])
    return np.asarray(X_hat), np.asarray(y_hat)

def make_samples(samples):
    data = samples.value
    print("Length of Data", len(data))
    # train data
    print("Creating train data...")
    result = []
    for index in range(0, len(data) - sequence_length):
        result.append(data[index: index + sequence_length])
    result = np.array(result)  # shape (samples, sequence_length)    
    print("Train data shape  : ", result.shape)
    np.random.shuffle(result)  # shuffles in-place
    X_train = result[:, :-1]
    y_train = result[:, -1]
    X_train,y_train = dropin(X_train,y_train)
    X_train = np.reshape(X_train, (X_train.shape[0],X_train.shape[1],1))
    return X_train,y_train

def make_submit_samples(samples):
    data = samples.value
    print("Length of Data", len(data))
    # train data
    print("Creating train data...")
    result = []
    for index in range(0, len(data) - 1939):
        result.append(data[index: index+1939])
    result = np.array(result)  # shape (samples, sequence_length)
    print("Train data shape  : ", result.shape)
    np.random.shuffle(result)  # shuffles in-place
    X_train = result
    X_train = np.reshape(X_train, (X_train.shape[0],X_train.shape[1],1))
    return X_train

In [2]:
def build_model():
    model = Sequential()
    layers = {'input': 1, 'hidden1': 64, 'hidden2': 256, 'hidden3': 100, 'output': 1}
    
    model.add(LSTM(
        input_length=sequence_length - 1,
        input_dim=1,
        #input_shape = (1,1440),
        output_dim=layers['hidden1'],
        return_sequences=True))
    
    model.add(Dropout(0.2))
    model.add(LSTM(
        layers['hidden2'],
        return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(
        layers['hidden3'],
        return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(
            output_dim=layers['output']))
    model.add(Activation("linear"))

    start = time.time()
    model.compile(loss="mse", optimizer="rmsprop")
    print("Compilation Time : ", time.time() - start)
    return model

def run_network(model=None, data=None):
    global_start_time = time.time()
    #X_train,y_train = make_samples(samples_train)
    #X_testt,y_testt = make_samples(samples_testt)
    print('\nData Loaded. Compiling...\n')
    if model is None:
        model = build_model()

    try:
        print("Training...")
        model.fit(
                X_train, y_train,
                batch_size=batch_size, nb_epoch=epochs, validation_split=0.05)
        print("Predicting...")
        predicted = model.predict(X_testt)
        print("Reshaping predicted")
        predicted = np.reshape(predicted, (predicted.size,))
    except KeyboardInterrupt:
        print("prediction exception")
        print('Training duration (s) : ', time.time() - global_start_time)
    return model, predicted

In [5]:
samples_train=pd.read_csv("./samples/samples_train.csv")
samples_testt=pd.read_csv("./samples/samples_test.csv")
X_train,y_train = make_samples(samples_train)
X_testt = make_submit_samples(samples_testt)
X_train.shape,y_train.shape

Length of Data 131795
Creating train data...
Train data shape  :  (129855, 1940)
X shape: (129855, 1939)
y shape: (129855,)




Length of Data 131795
Creating train data...
Train data shape  :  (129856, 1939)


((324412, 1939, 1), (324412,))

In [None]:
model, predicted = run_network()


Data Loaded. Compiling...



  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


Compilation Time :  0.06903505325317383
Training...
Train on 308191 samples, validate on 16221 samples
Epoch 1/1
   500/308191 [..............................] - ETA: 47:54:54 - loss: 1.7640

In [12]:
predicted = model.predict(X_testt)

In [15]:
predicted

array([[[ 1.897982  ,  1.89176536,  1.88228869, ...,  1.87918818,
          1.8881855 ,  1.86921906]],

       [[ 1.897982  ,  1.89176536,  1.88228869, ...,  1.87918818,
          1.8881855 ,  1.86921906]],

       [[ 1.897982  ,  1.89176536,  1.88228869, ...,  1.87918818,
          1.8881855 ,  1.86921906]],

       ..., 
       [[ 1.897982  ,  1.89176536,  1.88228869, ...,  1.87918818,
          1.8881855 ,  1.86921906]],

       [[ 1.897982  ,  1.89176536,  1.88228869, ...,  1.87918818,
          1.8881855 ,  1.86921906]],

       [[ 1.89798188,  1.89176524,  1.88228869, ...,  1.87918806,
          1.8881855 ,  1.86921906]]], dtype=float32)

In [22]:
len(predicted[19][0])

500