In [None]:
import pandas as pd
import numpy as np
import pytz
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [None]:
def preprocess_parquet_to_df(name):
    """Read one metric parquet file and return it cleaned and time-indexed.

    Steps performed:
      * values equal to -1 and missing values are both replaced with 0
        (presumably -1 marks an unavailable sample -- confirm with the data source),
      * the file's index is interpreted as Unix epoch seconds (UTC),
      * timestamps are converted to Europe/Amsterdam wall-clock time and the
        timezone information is then dropped,
      * the resulting naive timestamps become the index (named ``dt``) and the
        rows are sorted by it.

    Parameters
    ----------
    name : str
        Path handed to ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, indexed by ``dt`` in ascending order.
    """
    cleaned = pd.read_parquet(name).replace(-1, 0).fillna(0)

    amsterdam = pytz.timezone('Europe/Amsterdam')
    utc_stamps = pd.to_datetime(cleaned.index, utc=True, unit="s")
    # Assigning the DatetimeIndex as a column is positional, so row order is kept
    cleaned["dt"] = utc_stamps.tz_convert(amsterdam).tz_localize(None)

    return cleaned.set_index("dt").sort_index()

def resample_normalize_dataframe(df, window):
    """Average ``df`` into time bins of width ``window`` and z-score every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index (required by ``DataFrame.resample``).
    window : str
        Resampling rule passed to ``DataFrame.resample``, e.g. ``"5min"``.

    Returns
    -------
    pandas.DataFrame
        The bin means, with each column shifted by its own mean and divided by
        its own standard deviation (both computed on the resampled data).
    """
    binned = df.resample(window).mean()
    column_means = binned.mean()
    column_stds = binned.std()
    return (binned - column_means) / column_stds

In [None]:
# Build one parquet file per sample node, holding every available metric for that node.
# Each entry under "data" is one metric whose columns are node names; the output file for a
# node is named after the node and has one column per metric (named after the metric entry).
NODE_NAMES = ['r10n13', 'r11n18', 'r12n6', 'r13n23', 'r14n2', 'r15n23', 'r25n32',
              'r26n8', 'r27n16', 'r29n3', 'r30n6', 'r31n5', 'r38n1']  # or: index_df.columns

# Hidden entries (names starting with '.') are not metric data; sorting makes the column
# order of the output files reproducible, since os.listdir returns entries in arbitrary order.
folders = sorted(f for f in os.listdir("data") if not f.startswith('.'))

# node_load1 supplies the reference time index shared by all per-node frames
index_df = preprocess_parquet_to_df("data/node_load1")

# One fresh frame per node: a metric that lacks a given node must simply be absent from that
# node's file instead of silently carrying over the column written for the previous node.
node_frames = {col: pd.DataFrame(index=index_df.index) for col in NODE_NAMES}

for f in folders:
    # Read and preprocess each metric once, then fan its columns out to the nodes
    data = preprocess_parquet_to_df("data/" + f)
    for col in NODE_NAMES:
        if col in data.columns:
            # Assignment aligns on the timestamp index of the reference frame
            node_frames[col][f] = data[col]

for col, output_df in node_frames.items():
    print(col)
    output_df.to_parquet(col)

In [None]:
def process_node_df(file):
    """Load a per-node parquet file and return it z-scored at four time resolutions.

    Parameters
    ----------
    file : str
        Path of the per-node parquet file (one column per metric).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_15sec, df_1min, df_5min, df_10min)``: the data at its stored
        resolution (presumably 15-second samples, as the name suggests --
        confirm against the source data), followed by 1-, 5- and 10-minute bin
        means. Every frame is normalised per column with the mean and standard
        deviation of that same frame.
    """
    df = pd.read_parquet(file)

    # Stored resolution: only z-score, no resampling
    df_15sec = (df - df.mean()) / df.std()

    # "min" is the supported minute alias; the older "T" spelling is deprecated in
    # recent pandas releases, while "min" works on old and new versions alike.
    df_1min = resample_normalize_dataframe(df, "1min")
    df_5min = resample_normalize_dataframe(df, "5min")
    df_10min = resample_normalize_dataframe(df, "10min")
    return df_15sec, df_1min, df_5min, df_10min

In [None]:
# Load every sample node once; process_node_df returns four frames per node in the order
# (stored resolution "15sec", 1 min, 5 min, 10 min). The file for a node is named after it.
_frames_by_node = {
    node: process_node_df(node)
    for node in ('r10n13', 'r11n18', 'r12n6', 'r13n23', 'r14n2', 'r15n23', 'r25n32',
                 'r26n8', 'r27n16', 'r29n3', 'r30n6', 'r31n5', 'r38n1')
}

# Bind the frames to explicit <node>_<resolution> names used by the rest of the notebook
r10n13_15sec, r10n13_1min, r10n13_5min, r10n13_10min = _frames_by_node['r10n13']
r11n18_15sec, r11n18_1min, r11n18_5min, r11n18_10min = _frames_by_node['r11n18']
r12n6_15sec, r12n6_1min, r12n6_5min, r12n6_10min = _frames_by_node['r12n6']
r13n23_15sec, r13n23_1min, r13n23_5min, r13n23_10min = _frames_by_node['r13n23']
r14n2_15sec, r14n2_1min, r14n2_5min, r14n2_10min = _frames_by_node['r14n2']
r15n23_15sec, r15n23_1min, r15n23_5min, r15n23_10min = _frames_by_node['r15n23']
r25n32_15sec, r25n32_1min, r25n32_5min, r25n32_10min = _frames_by_node['r25n32']
r26n8_15sec, r26n8_1min, r26n8_5min, r26n8_10min = _frames_by_node['r26n8']
r27n16_15sec, r27n16_1min, r27n16_5min, r27n16_10min = _frames_by_node['r27n16']
r29n3_15sec, r29n3_1min, r29n3_5min, r29n3_10min = _frames_by_node['r29n3']
r30n6_15sec, r30n6_1min, r30n6_5min, r30n6_10min = _frames_by_node['r30n6']
r31n5_15sec, r31n5_1min, r31n5_5min, r31n5_10min = _frames_by_node['r31n5']
r38n1_15sec, r38n1_1min, r38n1_5min, r38n1_10min = _frames_by_node['r38n1']

In [None]:
def split_series(series, n_past, n_future):
    """Cut a 2-D series into overlapping (past, future) windows with stride 1.

    Window ``i`` uses rows ``[i, i + n_past)`` as input and rows
    ``[i + n_past, i + n_past + n_future)`` as target.

    Parameters
    ----------
    series : array-like, shape (n_rows, n_columns)
        Observations ordered in time, one row per time step.
    n_past : int
        Number of past observations per input window.
    n_future : int
        Number of future observations per target window.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, n_past, n_columns)
    y : numpy.ndarray, shape (n_windows, n_future, n_columns)
        ``n_windows`` is ``n_rows - n_past - n_future + 1``. When the series is
        too short for a single window, both arrays are empty but still
        three-dimensional, so shape-based code downstream keeps working.
    """
    series = np.asarray(series)
    n_rows = len(series)
    # Number of start positions for which the whole window fits inside the series
    n_windows = max(0, min(n_rows, n_rows - (n_past + n_future) + 1))

    if n_windows == 0:
        # np.array([]) would collapse to shape (0,); keep the window and column axes instead
        n_columns = series.shape[1] if series.ndim > 1 else 0
        empty_X = np.empty((0, max(n_past, 0), n_columns), dtype=series.dtype)
        empty_y = np.empty((0, max(n_future, 0), n_columns), dtype=series.dtype)
        return empty_X, empty_y

    X = [series[start:start + n_past, :] for start in range(n_windows)]
    y = [series[start + n_past:start + n_past + n_future, :] for start in range(n_windows)]
    return np.array(X), np.array(y)

In [None]:
# Experiment setup: 2 hours of history as input, forecast the next 20 minutes,
# then compare the loss values between time granularities.
# The window lengths below are expressed in samples of the 15-second series.
SAMPLE_PERIOD_SECONDS = 15
n_past = (2 * 60 * 60) // SAMPLE_PERIOD_SECONDS    # 2 hours    -> 480 samples
n_future = (20 * 60) // SAMPLE_PERIOD_SECONDS      # 20 minutes -> 80 samples
n_features = len(('node_sockstat_sockets_used', 'node_load1'))  # two metrics in and out

In [None]:
# Model features: two correlated metrics of node r10n13 at the 15-second resolution
feature_columns = ['node_sockstat_sockets_used', 'node_load1']
selection = r10n13_15sec.loc[:, feature_columns].copy()

# Chronological hold-out: with shuffle=False the first 90% of the rows form the
# training set and the final 10% the evaluation set.
train, test = train_test_split(selection, shuffle=False, test_size=0.10)

for subset in (train, test):
    print(subset)

In [None]:
# Build the supervised (past -> future) windows used to train the LSTM.
# nan_to_num replaces NaN with 0 and +/-inf with large finite numbers.
X_train, y_train = (
    np.nan_to_num(windows)
    for windows in split_series(train.values, n_past, n_future)
)

# Make the (windows, time steps, features) layout explicit for Keras
train_input_shape = (X_train.shape[0], X_train.shape[1], n_features)
train_target_shape = (y_train.shape[0], y_train.shape[1], n_features)
X_train = X_train.reshape(train_input_shape)
y_train = y_train.reshape(train_target_shape)

In [None]:
# Build the supervised (past -> future) windows used to evaluate the LSTM
X_test, y_test = split_series(test.values, n_past, n_future)

# Clean the held-out windows themselves. These two lines must read X_test / y_test:
# cleaning X_train / y_train here would replace the evaluation set with the training
# data and make the reported test loss meaningless.
X_test = np.nan_to_num(X_test)
y_test = np.nan_to_num(y_test)

# Make the (windows, time steps, features) layout explicit for Keras
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], n_features))
y_test = y_test.reshape((y_test.shape[0], y_test.shape[1], n_features))

In [None]:
# Encoder/decoder (sequence-to-sequence) LSTM, based on
# https://www.analyticsvidhya.com/blog/2020/10/multivariate-multi-step-time-series-forecasting-using-stacked-lstm-sequence-to-sequence-autoencoder-in-tensorflow-2-0-keras/

# Encoder: reads the n_past input steps; return_state=True yields the last output
# followed by the final LSTM states.
encoder_inputs = layers.Input(shape=(n_past, n_features))
encoder_l1 = layers.LSTM(100, return_state=True)
encoder_outputs1 = encoder_l1(encoder_inputs)
encoder_states1 = encoder_outputs1[1:]

# Decoder: the encoder's last output is repeated once per forecast step and decoded
# by a second LSTM that starts from the encoder's final states.
decoder_inputs = layers.RepeatVector(n_future)(encoder_outputs1[0])
decoder_l1 = layers.LSTM(100, return_sequences=True)(
    decoder_inputs,
    initial_state=encoder_states1,
)
# One dense projection, shared across time steps, back to n_features values per step
decoder_outputs1 = layers.TimeDistributed(layers.Dense(n_features))(decoder_l1)

model = keras.Model(inputs=encoder_inputs, outputs=decoder_outputs1)
model.summary()

# Save a diagram of the architecture next to the notebook
dot_img_file = 'LSTM_V2.png'
keras.utils.plot_model(model, show_shapes=True, to_file=dot_img_file)

In [None]:
# Learning-rate schedule: 1e-3 at epoch 0, multiplied by 0.90 for every further epoch
reduce_lr = tf.keras.callbacks.LearningRateScheduler(lambda x: 1e-3 * 0.90 ** x)
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.Huber())
# Stop once the validation loss has not improved by at least 1e-4 for 3 consecutive epochs
earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=3)

# Batch size used for each time granularity of the input data
batch_size_by_granularity = {'15sec': 5120, '1min': 1280, '5min': 256, '10min': 128}
# Must match the resolution of the frame from which X_train / y_train were built
granularity = '15sec'

# Fit the same `model` object that was compiled above and is evaluated afterwards
# (a differently named model would be undefined on a fresh kernel).
# validation_split holds out the last 20% of the training windows for val_loss.
model.fit(X_train, y_train, epochs=40, batch_size=batch_size_by_granularity[granularity], verbose=1,
          validation_split=0.20, callbacks=[reduce_lr, earlystop_callback])

In [None]:
# Score the LSTM on the held-out windows. The model is compiled with a single Huber
# loss and no extra metrics, so evaluate() yields that loss on (X_test, y_test);
# despite its name, `prediction` holds the loss value, not forecasts.
prediction = model.evaluate(x=X_test, y=y_test, batch_size=5120)