# Single sensor model
This notebook trains a single-sensor model for each Trafikverket sensor. For every sensor it writes the following to HDFS: the predictions and the corresponding real sensor values, the training and prediction times, and the training and validation errors.

In [1]:
import time
import math

from pyspark.sql.types import *
from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import *

import tensorflow as tf
import numpy as np

from hops import hdfs
from tempfile import TemporaryFile

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1231,application_1536227070932_0750,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
%%local
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
%matplotlib inline 

In [3]:
# Display the TensorFlow version the session is running, so the run can be reproduced.
tf.__version__

'1.8.0'

## Define Parameters

In [4]:
# Dataset Parameters
year = 2016
month = 11

# Every input and output location lives under the same HDFS folder and shares
# the same "<year>-<month>" prefix, so that prefix is built once.
root_path = "hdfs:///Projects/traffic_reginbald/processed_traffic_data/"
_dataset_prefix = root_path + str(year) + "-" + str(month)

sensor_file_path = _dataset_prefix + "_all-sensors-timeseries-parquet/*.parquet"
folder_path = _dataset_prefix + "_single-sensor-30-min-supervised-parquet/"
export_path = _dataset_prefix + "_single-sensor-30-min-output/"
export_gru_path = _dataset_prefix + "_single-sensor-gru-30-min-output/"

batch_size = 100      # samples per gradient update (passed to model.fit)
num_epochs = 100      # upper bound on training epochs (passed to model.fit)
dataset_split = 0.70  # leading fraction of the samples used for training
max_density = 200     # divisor used by normalize(): this value maps to 1.0

# Network Parameters
past_steps = 10       # time steps fed to the network per sample
future_steps = 30     # values predicted per sample
n_sensors = 1         # sensors per model
lstm_units = 50       # units in each LSTM layer of define_model()

# Training Parameters
# NOTE(review): neither value is referenced by the cells visible in this
# notebook; the models are compiled with the optimizer string 'adam'.
learning_rate = 0.001
display_step = 200

## Import Data

In [5]:
# The sensor identifiers are the column names of the combined time-series
# table, minus the first column (presumably the timestamp - confirm).
all_sensor_columns = spark.read.parquet(sensor_file_path).columns
sensors = all_sensor_columns[1:]

In [6]:
# Quick sanity check of the sensor list: how many there are and what an id looks like.
sensor_count = len(sensors)
first_sensor = sensors[0]
print("Number of sensors: " + str(sensor_count))
print("First sensor: " + first_sensor)

## Prepare Data For Supervised Learning

In [7]:
def normalize(data, columns):
    """Scale the selected fields of one record and return them as an array.

    Each value is divided by the module-level ``max_density`` and then mapped
    onto the ``scale_min``..``scale_max`` range (0 to 1). Values are not
    clipped, so inputs above ``max_density`` yield results above 1.

    Args:
        data: A record indexable by column name (e.g. a Spark ``Row``).
        columns: Names of the fields to scale, in output order.

    Returns:
        A NumPy array with one scaled value per entry of ``columns``.
    """
    scale_min, scale_max = 0, 1

    scaled_values = [
        ((data[name] - 0) / (max_density - 0)) * (scale_max - scale_min) + scale_min
        for name in columns
    ]
    return np.array(scaled_values)

In [8]:
def prepare_dataset(df):
    """Turn one sensor's supervised-learning table into train/test arrays.

    The first column of ``df`` is skipped. The remaining columns are scaled
    with ``normalize``; the first ``past_steps * n_sensors`` of them become
    the model input and the rest become the target.

    The train/test split is positional: the first ``dataset_split`` fraction
    of the rows is used for training and the remainder for testing, so the
    caller is responsible for the row order.

    Args:
        df: Spark DataFrame with one row per sample.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` where the ``x`` arrays
        have shape ``(samples, past_steps, n_sensors)`` and the ``y`` arrays
        have shape ``(samples, future_steps * n_sensors)``.
    """
    columns = df.columns[1:]

    # Width of the input window in columns. This was a hard-coded 10, which
    # silently went out of sync with the reshape below if past_steps changed.
    # NOTE(review): for n_sensors > 1 this assumes all input columns precede
    # all target columns - confirm against the dataset layout.
    n_input_columns = past_steps * n_sensors

    # Normalise on the executors and collect once; the previous version ran
    # the same Spark job twice (once for x and once for y).
    rows = df.rdd.map(lambda row: normalize(row, columns)).collect()
    dataset = np.array(rows) if rows else np.empty((0, len(columns)))

    x_dataset = np.reshape(dataset[:, :n_input_columns], (-1, past_steps, n_sensors))
    y_dataset = np.reshape(dataset[:, n_input_columns:], (-1, future_steps * n_sensors))

    train_size = int(len(x_dataset) * dataset_split)

    x_train = x_dataset[:train_size, :]
    x_test = x_dataset[train_size:, :]

    y_train = y_dataset[:train_size, :]
    y_test = y_dataset[train_size:, :]

    return x_train, y_train, x_test, y_test

## Define Neural Network

In [9]:
def define_model():
    """Build and compile the two-layer LSTM forecasting network.

    Architecture: input of shape ``(past_steps, n_sensors)``, two stacked LSTM
    layers with ``lstm_units`` units each, a 500-unit linear dense layer whose
    kernel is constrained to be non-negative, and a dense output layer with
    ``future_steps * n_sensors`` units.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer and mean squared
        error loss.
    """
    net_input = tf.keras.Input(shape=(past_steps, n_sensors))

    # The first LSTM returns the full sequence so the second one can consume it.
    first_lstm = tf.keras.layers.LSTM(units=lstm_units, return_sequences=True)
    second_lstm = tf.keras.layers.LSTM(units=lstm_units)
    hidden = second_lstm(first_lstm(net_input))

    projection = tf.keras.layers.Dense(
        units=500,
        activation='linear',
        kernel_constraint=tf.keras.constraints.NonNeg()
    )
    output_layer = tf.keras.layers.Dense(units=future_steps * n_sensors)
    forecast = output_layer(projection(hidden))

    network = tf.keras.Model(inputs=net_input, outputs=forecast)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

In [10]:
def define_gru_model():
    """Build and compile the two-layer GRU forecasting network.

    Mirrors ``define_model`` but uses GRU cells for the recurrent layers.
    The previous version instantiated ``tf.keras.layers.LSTM`` here, so the
    "GRU" experiment was in fact a second LSTM run.

    Architecture: input of shape ``(past_steps, n_sensors)``, two stacked GRU
    layers with 50 units each, a 500-unit linear dense layer whose kernel is
    constrained to be non-negative, and a dense output layer with
    ``future_steps * n_sensors`` units.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer and mean squared
        error loss.
    """
    inputs = tf.keras.Input(shape=(past_steps, n_sensors))

    # The first GRU returns the full sequence so the second one can consume it.
    gru_1 = tf.keras.layers.GRU(units=50, return_sequences=True)(inputs)
    gru_2 = tf.keras.layers.GRU(units=50)(gru_1)

    dense = tf.keras.layers.Dense(
        units=500,
        activation='linear',
        kernel_constraint=tf.keras.constraints.NonNeg()
    )(gru_2)

    outputs = tf.keras.layers.Dense(
        units=future_steps * n_sensors
    )(dense)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer='adam',         # Optimizer to use.
        loss='mean_squared_error' # Loss function to use.
    )
    return model

## Define Model Training

In [11]:
def train_model(model, x_train, y_train):
    """Fit ``model`` with early stopping and report losses and wall-clock time.

    Training uses the module-level ``batch_size`` and ``num_epochs``. 20% of
    the training data is held out as validation data, and training stops once
    ``val_loss`` has not improved by at least 0.0001 for 10 epochs.

    Args:
        model: Compiled Keras model to train (modified in place).
        x_train: Training inputs.
        y_train: Training targets.

    Returns:
        Tuple ``(loss, val_loss, training_time)``: the per-epoch training and
        validation losses as lists of floats, and the duration of the
        ``fit`` call in seconds.
    """
    stop_when_stalled = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0.0001,
        patience=10,
        verbose=0,
        mode='auto'
    )

    started_at = time.time()
    fit_result = model.fit(
        x=x_train,
        y=y_train,
        batch_size=batch_size,
        epochs=num_epochs,
        verbose=0,
        callbacks=[stop_when_stalled],
        validation_split=0.2
    )
    training_time = time.time() - started_at

    # Convert to plain Python floats, as the original code did.
    history = fit_result.history
    loss = list(map(float, history['loss']))
    val_loss = list(map(float, history['val_loss']))

    return loss, val_loss, training_time

## Define Evaluation of Model

In [12]:
def denormalize(data_scaled):
    """Invert ``normalize``: map scaled values back to the original units.

    ``normalize`` computes ``x / max_density * (scale_max - scale_min) +
    scale_min``, so the inverse first removes the ``scale_min`` offset,
    divides by the scale range and then multiplies by ``max_density``. The
    previous divisor, ``(scale_max - scale_min) + scale_min``, only gave the
    right answer because ``scale_min`` is 0.

    Args:
        data_scaled: Scalar or NumPy array of scaled values.

    Returns:
        The values rescaled by ``max_density``, same shape as the input.
    """
    scale_min = 0
    scale_max = 1

    std = (data_scaled - scale_min) / (scale_max - scale_min)
    data = (std * (max_density - 0)) + 0

    return data

In [13]:
def evaluate_model(model, x_test):
    """Run ``model.predict`` on ``x_test`` and time the call.

    Args:
        model: Object exposing a ``predict`` method (a trained Keras model).
        x_test: Inputs to predict on.

    Returns:
        Tuple ``(predictions, prediction_time)`` where ``prediction_time`` is
        the wall-clock duration of the ``predict`` call in seconds.
    """
    started_at = time.time()
    forecasts = model.predict(x_test)
    elapsed = time.time() - started_at

    return forecasts, elapsed

## Export Data

In [14]:
def _write_npy_to_hdfs(fs_handle, hdfs_path, values):
    """Serialise ``values`` in .npy format and write the bytes to ``hdfs_path``."""
    # Stage the .npy bytes in a local temporary file, then copy them to HDFS.
    # The context manager and try/finally make sure neither the temporary file
    # nor the HDFS handle is left open if a step fails.
    with TemporaryFile() as staging_file:
        np.save(staging_file, values, allow_pickle=False)
        staging_file.seek(0)

        fd = fs_handle.open_file(hdfs_path, mode='w')
        try:
            fd.write(staging_file.read())
        finally:
            fd.close()


def export_data(sensor, y_values, pred_values, loss, val_loss, training_time, prediction_time, path):
    """Write all results for one sensor to HDFS as .npy files.

    Six files are written under ``path + sensor``: ``true_values.npy``,
    ``pred_values.npy``, ``loss.npy``, ``val_loss.npy``,
    ``training_time.npy`` and ``prediction_time.npy``.

    Args:
        sensor: Sensor identifier, used as the output sub-folder name.
        y_values: Actual sensor values.
        pred_values: Predicted sensor values.
        loss: Per-epoch training losses.
        val_loss: Per-epoch validation losses.
        training_time: Training duration.
        prediction_time: Prediction duration.
        path: HDFS folder prefix the sensor sub-folder is appended to.
    """
    fs_handle = hdfs.get_fs()
    sensor_dir = path + sensor

    # (file name, payload) pairs, written in the same order as before.
    artifacts = [
        ("/true_values.npy", y_values),
        ("/pred_values.npy", pred_values),
        ("/loss.npy", loss),
        ("/val_loss.npy", val_loss),
        ("/training_time.npy", training_time),
        ("/prediction_time.npy", prediction_time),
    ]

    for file_name, values in artifacts:
        _write_npy_to_hdfs(fs_handle, sensor_dir + file_name, values)

## Run Training and Evaluation on all sensors

In [15]:
# Number of sensors from index 850 onward, i.e. the slice the GRU loop below iterates over.
len(sensors[850:])

1091

In [None]:
# Train, evaluate and export one LSTM model per sensor.
for sensor in sensors:
    # Each iteration builds a brand-new Keras model. Without resetting the
    # backend, every model stays in the same TensorFlow graph/session, so
    # memory use and per-model build time keep growing over the loop.
    tf.keras.backend.clear_session()

    # Order by time so that the positional train/test split in
    # prepare_dataset puts the earliest samples in the training set.
    sensor_df = spark.read.parquet(folder_path + sensor).orderBy('Timestamp')
    x_train, y_train, x_test, y_test = prepare_dataset(sensor_df)

    model = define_model()
    loss, val_loss, training_time = train_model(model, x_train, y_train)
    predictions, prediction_time = evaluate_model(model, x_test)

    # denormalize is plain element-wise arithmetic, so it can be applied to the
    # whole array at once instead of row by row.
    y_values = np.reshape(denormalize(y_test), (-1, future_steps, n_sensors))
    pred_values = np.reshape(denormalize(predictions), (-1, future_steps, n_sensors))

    export_data(sensor, y_values, pred_values, loss, val_loss, training_time, prediction_time, export_path)

In [None]:
# Train, evaluate and export one model from define_gru_model per sensor.
# NOTE(review): only sensors from index 850 onward are processed - presumably
# resuming an interrupted run; confirm before re-running from scratch.
for sensor in sensors[850:]:
    # Each iteration builds a brand-new Keras model. Without resetting the
    # backend, every model stays in the same TensorFlow graph/session, so
    # memory use and per-model build time keep growing over the loop.
    tf.keras.backend.clear_session()

    # Order by time so that the positional train/test split in
    # prepare_dataset puts the earliest samples in the training set.
    sensor_df = spark.read.parquet(folder_path + sensor).orderBy('Timestamp')
    x_train, y_train, x_test, y_test = prepare_dataset(sensor_df)

    model = define_gru_model()
    loss, val_loss, training_time = train_model(model, x_train, y_train)
    predictions, prediction_time = evaluate_model(model, x_test)

    # denormalize is plain element-wise arithmetic, so it can be applied to the
    # whole array at once instead of row by row.
    y_values = np.reshape(denormalize(y_test), (-1, future_steps, n_sensors))
    pred_values = np.reshape(denormalize(predictions), (-1, future_steps, n_sensors))

    export_data(sensor, y_values, pred_values, loss, val_loss, training_time, prediction_time, export_gru_path)

## Verify Export

In [17]:
# Read true values from HDFS

fs_handle = hdfs.get_fs()

fd = fs_handle.open_file(export_path + sensors[-1] + "/true_values.npy", mode='r')
try:
    payload = fd.read()
finally:
    fd.close()  # release the HDFS handle even if the read fails

# np.load is given a local file object, so stage the bytes in a temporary file
# that is closed again as soon as the array has been loaded.
with TemporaryFile() as temp_file:
    temp_file.write(payload)
    temp_file.seek(0) # important, set cursor to beginning of file
    np_array = np.load(temp_file)
np_array

array([[[16.27118644],
        [13.5483871 ],
        [ 7.38461538],
        ...,
        [11.80327869],
        [14.75409836],
        [ 7.27272727]],

       [[13.5483871 ],
        [ 7.38461538],
        [ 9.56521739],
        ...,
        [14.75409836],
        [ 7.27272727],
        [ 7.5       ]],

       [[ 7.38461538],
        [ 9.56521739],
        [ 9.        ],
        ...,
        [ 7.27272727],
        [ 7.5       ],
        [19.65517241]],

       ...,

       [[ 0.77922078],
        [ 0.83333333],
        [ 1.6       ],
        ...,
        [ 0.90909091],
        [ 0.        ],
        [ 0.89552239]],

       [[ 0.83333333],
        [ 1.6       ],
        [ 0.84507042],
        ...,
        [ 0.        ],
        [ 0.89552239],
        [ 0.        ]],

       [[ 1.6       ],
        [ 0.84507042],
        [ 0.        ],
        ...,
        [ 0.89552239],
        [ 0.        ],
        [ 0.        ]]])

In [18]:
# Read predicted values from HDFS

fs_handle = hdfs.get_fs()

fd = fs_handle.open_file(export_path + sensors[-1] + "/pred_values.npy", mode='r')
try:
    payload = fd.read()
finally:
    fd.close()  # release the HDFS handle even if the read fails

# np.load is given a local file object, so stage the bytes in a temporary file
# that is closed again as soon as the array has been loaded.
with TemporaryFile() as temp_file:
    temp_file.write(payload)
    temp_file.seek(0) # important, set cursor to beginning of file
    np_array = np.load(temp_file)
np_array

array([[[18.242292 ],
        [16.053202 ],
        [16.598661 ],
        ...,
        [16.533257 ],
        [15.520663 ],
        [15.887652 ]],

       [[15.959862 ],
        [15.015607 ],
        [15.699027 ],
        ...,
        [16.213816 ],
        [15.175654 ],
        [15.574648 ]],

       [[10.081504 ],
        [11.78884  ],
        [12.721829 ],
        ...,
        [14.524057 ],
        [13.390411 ],
        [13.874653 ]],

       ...,

       [[ 1.4464101],
        [ 1.1097412],
        [ 1.3072675],
        ...,
        [ 2.6973233],
        [ 1.0778595],
        [ 1.7493668]],

       [[ 1.2951547],
        [ 1.0152605],
        [ 1.2178056],
        ...,
        [ 2.6383572],
        [ 1.0159919],
        [ 1.6899004]],

       [[ 2.0018685],
        [ 1.4056709],
        [ 1.5765277],
        ...,
        [ 2.8382058],
        [ 1.2251537],
        [ 1.8891755]]], dtype=float32)

In [19]:
# Read prediction time from HDFS

fs_handle = hdfs.get_fs()

fd = fs_handle.open_file(export_path + sensors[-1] + "/prediction_time.npy", mode='r')
try:
    payload = fd.read()
finally:
    fd.close()  # release the HDFS handle even if the read fails

# np.load is given a local file object, so stage the bytes in a temporary file
# that is closed again as soon as the array has been loaded.
with TemporaryFile() as temp_file:
    temp_file.write(payload)
    temp_file.seek(0) # important, set cursor to beginning of file
    np_array = np.load(temp_file)
np_array

array(24.96166301)

In [20]:
# Read training time from HDFS

fs_handle = hdfs.get_fs()

fd = fs_handle.open_file(export_path + sensors[-1] + "/training_time.npy", mode='r')
try:
    payload = fd.read()
finally:
    fd.close()  # release the HDFS handle even if the read fails

# np.load is given a local file object, so stage the bytes in a temporary file
# that is closed again as soon as the array has been loaded.
with TemporaryFile() as temp_file:
    temp_file.write(payload)
    temp_file.seek(0) # important, set cursor to beginning of file
    np_array = np.load(temp_file)
np_array

array(135.72623706)

In [21]:
# Read validation loss from HDFS

fs_handle = hdfs.get_fs()

fd = fs_handle.open_file(export_path + sensors[-1] + "/val_loss.npy", mode='r')
try:
    payload = fd.read()
finally:
    fd.close()  # release the HDFS handle even if the read fails

# np.load is given a local file object, so stage the bytes in a temporary file
# that is closed again as soon as the array has been loaded.
with TemporaryFile() as temp_file:
    temp_file.write(payload)
    temp_file.seek(0) # important, set cursor to beginning of file
    np_array = np.load(temp_file)
np_array

array([0.00096028, 0.00090014, 0.00102293, 0.00093372, 0.00096108,
       0.00089569, 0.00095728, 0.00092306, 0.00089398, 0.00087433,
       0.00092681])

In [22]:
# Read loss from HDFS

fs_handle = hdfs.get_fs()

fd = fs_handle.open_file(export_path + sensors[-1] + "/loss.npy", mode='r')
try:
    payload = fd.read()
finally:
    fd.close()  # release the HDFS handle even if the read fails

# np.load is given a local file object, so stage the bytes in a temporary file
# that is closed again as soon as the array has been loaded.
with TemporaryFile() as temp_file:
    temp_file.write(payload)
    temp_file.seek(0) # important, set cursor to beginning of file
    np_array = np.load(temp_file)
np_array

array([0.00246245, 0.00203831, 0.00197797, 0.00194428, 0.00190776,
       0.00189846, 0.00190155, 0.00190121, 0.00190021, 0.00189787,
       0.00187787])