# Hyperparameter tuning - single sensor model
This notebook runs a hyperparameter search over the number of recurrent units (LSTM and GRU) for the single-sensor model.

In [1]:
import time
import math
import itertools

from pyspark.sql.types import *
from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import *

import tensorflow as tf
import numpy as np

from hops import hdfs, util
from tempfile import TemporaryFile

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1506,application_1544690131655_0130,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
%%local
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-darkgrid')
%matplotlib inline 

In [3]:
# Display the TensorFlow version used by this session (1.8.0 when the notebook was last run).
tf.__version__

'1.8.0'

## Define Parameters

In [4]:
# --- Dataset parameters -------------------------------------------------
# Year and month select which processed monthly dataset is used.
year = 2016
month = 11

# Every dataset lives below this HDFS project folder and is named
# "<year>-<month>_<suffix>".
root_path = "hdfs:///Projects/traffic_reginbald/processed_traffic_data/"
sensor_file_path = "".join(
    [root_path, str(year), "-", str(month), "_all-sensors-timeseries-parquet/*.parquet"]
)
folder_path = "".join(
    [root_path, str(year), "-", str(month), "_single-sensor-30-min-supervised-parquet/"]
)
export_path = "".join(
    [root_path, str(year), "-", str(month), "_single-sensor-30-min-output/"]
)

batch_size = 100      # samples per gradient update
num_epochs = 100      # upper bound on training epochs
dataset_split = 0.70  # fraction of rows used for training
max_density = 200     # divisor used when normalising sensor readings

# --- Network parameters -------------------------------------------------
past_steps = 10       # input time steps per sample
future_steps = 30     # predicted time steps per sample
n_sensors = 1         # sensors per sample

# --- Training parameters ------------------------------------------------
learning_rate = 0.001
display_step = 200

## Define Functions

In [5]:
def write_hdfs_file(file_path, data):
    """Serialise ``data`` in NumPy ``.npy`` format and write it to HDFS.

    The array is first saved into a local temporary file (``np.save`` needs a
    seekable file object), then the resulting bytes are copied to
    ``file_path`` through the Hops HDFS handle.

    Args:
        file_path: Destination path on HDFS; opened in write mode.
        data: Anything ``np.save`` accepts without pickling (``allow_pickle``
            is disabled, so object arrays are rejected by NumPy).
    """
    fs_handle = hdfs.get_fs()

    # The context manager guarantees the temporary file is closed (and thus
    # removed) even when serialisation fails.
    with TemporaryFile() as temp_file:
        np.save(temp_file, data, allow_pickle=False)
        temp_file.seek(0)
        payload = temp_file.read()

    fd = fs_handle.open_file(file_path, mode='w')
    try:
        fd.write(payload)
    finally:
        # Always release the HDFS handle, also when the write raises.
        fd.close()

## Import Data

In [6]:
# Open the all-sensors time-series dataset only to read its column names;
# every column after the first is treated as a sensor name
# (the first column is presumably the timestamp - TODO confirm).
sensors = spark.read.parquet(sensor_file_path).columns[1:]

In [7]:
# Sanity check: report how many sensor columns were found and the name of the first one.
print("Number of sensors: " + str(len(sensors)))
print("First sensor: " + sensors[0])

Number of sensors: 1941
First sensor: E182N-0005-1

## Prepare Data For Supervised Learning

In [8]:
def normalize(data, columns, max_value=None):
    """Scale the selected fields of one record from ``[0, max_value]`` to ``[0, 1]``.

    Args:
        data: Record supporting ``data[column]`` lookups (e.g. a Spark ``Row``
            or a dict) holding numeric values.
        columns: Names of the fields to scale, in output order.
        max_value: Value that maps to 1.0. When omitted, the notebook-level
            ``max_density`` setting is used, which is the original behaviour.

    Returns:
        ``np.ndarray`` with one scaled value per entry of ``columns``.
    """
    scale_min = 0
    scale_max = 1

    if max_value is None:
        # Resolved at call time so the notebook-level setting stays the default.
        max_value = max_density

    # A float divisor keeps the division exact for integer inputs as well
    # (plain ``/`` on two ints floors under Python 2).
    upper = float(max_value)

    out = []
    for column in columns:
        fraction = data[column] / upper
        out.append(fraction * (scale_max - scale_min) + scale_min)
    return np.array(out)

In [9]:
def prepare_dataset(df):
    """Turn a supervised-learning DataFrame into train/test NumPy arrays.

    Every column except the first is normalised with ``normalize``. The first
    ``past_steps`` values of each row become the model input and the remaining
    values become the target. Rows are split in the order the DataFrame yields
    them: the first ``dataset_split`` fraction is used for training, the rest
    for testing.

    Args:
        df: Spark DataFrame whose first column is skipped (presumably the
            timestamp - TODO confirm) and whose remaining columns hold the
            past steps followed by the future steps.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` where the ``x`` arrays have
        shape ``(samples, past_steps, n_sensors)`` and the ``y`` arrays have
        shape ``(samples, future_steps * n_sensors)``.
    """
    columns = df.columns[1:]

    # Collect once: calling collect() separately for inputs and targets would
    # run the Spark job (and the normalisation) twice.
    rows = df.rdd.map(lambda row: normalize(row, columns)).collect()

    # Split on past_steps rather than a literal so the slicing stays in sync
    # with the reshape below when the setting is changed.
    x_dataset = np.array([row[:past_steps] for row in rows])
    y_dataset = np.array([row[past_steps:] for row in rows])

    x_dataset = np.reshape(x_dataset, (-1, past_steps, n_sensors))
    y_dataset = np.reshape(y_dataset, (-1, future_steps * n_sensors))

    train_size = int(len(x_dataset) * dataset_split)

    x_train = x_dataset[:train_size, :]
    x_test = x_dataset[train_size:, :]

    y_train = y_dataset[:train_size, :]
    y_test = y_dataset[train_size:, :]

    return x_train, y_train, x_test, y_test

## Define Neural Network

In [10]:
def define_model(lstm_units):
    """Build and compile the stacked-LSTM forecasting network.

    Architecture: input of shape ``(past_steps, n_sensors)`` -> LSTM returning
    the full sequence -> LSTM returning its last output -> 500-unit linear
    dense layer with non-negative kernel -> dense output with
    ``future_steps * n_sensors`` units.

    Args:
        lstm_units: Number of units in each of the two LSTM layers.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer and MSE loss.
    """
    sequence_in = tf.keras.Input(shape=(past_steps, n_sensors))

    # The first recurrent layer must emit the whole sequence so that a second
    # recurrent layer can be stacked on top of it.
    first_lstm = tf.keras.layers.LSTM(units=lstm_units, return_sequences=True)
    hidden_sequence = first_lstm(sequence_in)

    second_lstm = tf.keras.layers.LSTM(units=lstm_units)
    hidden_state = second_lstm(hidden_sequence)

    hidden_dense = tf.keras.layers.Dense(
        units=500,
        activation='linear',
        kernel_constraint=tf.keras.constraints.NonNeg()
    )
    features = hidden_dense(hidden_state)

    # One output unit per predicted step and sensor.
    output_dense = tf.keras.layers.Dense(units=future_steps * n_sensors)
    forecast = output_dense(features)

    model = tf.keras.Model(inputs=sequence_in, outputs=forecast)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [11]:
def define_gru_model(units):
    """Build and compile the stacked-GRU forecasting network.

    Architecture: input of shape ``(past_steps, n_sensors)`` -> GRU returning
    the full sequence -> GRU returning its last output -> 500-unit linear
    dense layer with non-negative kernel -> dense output with
    ``future_steps * n_sensors`` units.

    Args:
        units: Number of units in each of the two GRU layers.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer and MSE loss.
    """
    sequence_in = tf.keras.Input(shape=(past_steps, n_sensors))

    # The first recurrent layer must emit the whole sequence so that a second
    # recurrent layer can be stacked on top of it.
    first_gru = tf.keras.layers.GRU(units=units, return_sequences=True)
    hidden_sequence = first_gru(sequence_in)

    second_gru = tf.keras.layers.GRU(units=units)
    hidden_state = second_gru(hidden_sequence)

    hidden_dense = tf.keras.layers.Dense(
        units=500,
        activation='linear',
        kernel_constraint=tf.keras.constraints.NonNeg()
    )
    features = hidden_dense(hidden_state)

    # One output unit per predicted step and sensor.
    output_dense = tf.keras.layers.Dense(units=future_steps * n_sensors)
    forecast = output_dense(features)

    model = tf.keras.Model(inputs=sequence_in, outputs=forecast)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

## Define Model Training

In [12]:
def train_model(model, x_train, y_train, min_delta=0.0001, patience=10):
    """Fit ``model`` with early stopping and report loss curves and duration.

    Training uses the notebook-level ``batch_size`` and ``num_epochs`` settings
    and holds out 20% of the training data for validation.

    Args:
        model: Compiled Keras model to train.
        x_train: Training inputs passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        min_delta: Smallest change of ``val_loss`` that early stopping counts
            as an improvement. The default keeps the original setting.
        patience: Number of epochs without improvement after which training
            stops. The default keeps the original setting.

    Returns:
        Tuple ``(loss, val_loss, training_time)``: the per-epoch training and
        validation losses as lists of Python floats, and the wall-clock
        duration of ``model.fit`` in seconds.
    """
    # NOTE(review): the training logs in this notebook show val_loss around
    # 1e-4, the same magnitude as the default min_delta, and both logged runs
    # stop at epoch 11 (patience + 1). It looks like no epoch can ever qualify
    # as an improvement with the defaults - consider passing a smaller
    # min_delta; confirm before comparing configurations.
    earlystop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',   # Quantity to be monitored.
        min_delta=min_delta,  # Minimum change to qualify as an improvement.
        patience=patience,    # Number of epochs with no improvement to stop training.
        verbose=0,            # Silent
        mode='auto'           # Direction of improvement is inferred.
    )

    # Start time
    t_start = time.time()

    # Train model
    model_info = model.fit(
        x=x_train,              # Training data
        y=y_train,              # Label data
        batch_size=batch_size,  # Number of samples per gradient update
        epochs=num_epochs,      # Maximum number of passes over the dataset
        verbose=2,              # One log line per epoch (no progress bar)
        callbacks=[earlystop],  # List of callbacks to apply during training
        validation_split=0.2    # Fraction of the training data used as validation data
    )

    # End time
    t_end = time.time()

    # Convert to plain Python floats so the histories can be saved without pickling.
    loss = [float(x) for x in model_info.history['loss']]
    val_loss = [float(x) for x in model_info.history['val_loss']]
    training_time = (t_end - t_start)

    return loss, val_loss, training_time

## Run hyperparameter tuning

In [13]:
# Load the supervised dataset of the first sensor, ordered by timestamp so
# that the train/test split in prepare_dataset follows row order in time.
sensor_df = spark.read.parquet(folder_path + sensors[0]).orderBy('Timestamp')
x_train, y_train, x_test, y_test = prepare_dataset(sensor_df)

# Train one stacked-LSTM model per layer width and store its learning curves
# and training duration on HDFS.
for lstm_units in [50, 100, 500, 1000]:
    model = define_model(lstm_units)
    loss, val_loss, training_time = train_model(model, x_train, y_train)

    # All results of one configuration share the same HDFS directory.
    run_dir = root_path + "hyperparameter-tuning/single_sensor-lstm_units_" + str(lstm_units)
    write_hdfs_file(run_dir + "/loss.npy", loss)
    write_hdfs_file(run_dir + "/val_loss.npy", val_loss)
    write_hdfs_file(run_dir + "/training_time.npy", training_time)

Train on 22556 samples, validate on 5640 samples
Epoch 1/100
 - 680s - loss: 3.0826e-04 - val_loss: 1.2323e-04
Epoch 2/100
 - 808s - loss: 2.6753e-04 - val_loss: 1.4952e-04
Epoch 3/100
 - 856s - loss: 2.4483e-04 - val_loss: 1.4767e-04
Epoch 4/100
 - 855s - loss: 2.4171e-04 - val_loss: 1.3934e-04
Epoch 5/100
 - 829s - loss: 2.3552e-04 - val_loss: 1.0876e-04
Epoch 6/100
 - 833s - loss: 2.2807e-04 - val_loss: 1.0361e-04
Epoch 7/100
 - 838s - loss: 2.2300e-04 - val_loss: 1.0627e-04
Epoch 8/100
 - 845s - loss: 2.2805e-04 - val_loss: 1.1954e-04
Epoch 9/100
 - 829s - loss: 2.2237e-04 - val_loss: 1.1009e-04
Epoch 10/100
 - 842s - loss: 2.2352e-04 - val_loss: 1.0974e-04
Epoch 11/100
 - 837s - loss: 2.2212e-04 - val_loss: 1.0982e-04

In [14]:
# Load the supervised dataset of the first sensor, ordered by timestamp so
# that the train/test split in prepare_dataset follows row order in time.
sensor_df = spark.read.parquet(folder_path + sensors[0]).orderBy('Timestamp')
x_train, y_train, x_test, y_test = prepare_dataset(sensor_df)

# Train one stacked-GRU model per layer width and store its learning curves
# and training duration on HDFS.
for units in [50, 100, 500, 1000]:
    model = define_gru_model(units)
    loss, val_loss, training_time = train_model(model, x_train, y_train)

    # All results of one configuration share the same HDFS directory.
    run_dir = root_path + "hyperparameter-tuning/single_sensor-gru_units_" + str(units)
    write_hdfs_file(run_dir + "/loss.npy", loss)
    write_hdfs_file(run_dir + "/val_loss.npy", val_loss)
    write_hdfs_file(run_dir + "/training_time.npy", training_time)

Train on 22556 samples, validate on 5640 samples
Epoch 1/100
 - 5s - loss: 2.6877e-04 - val_loss: 1.0693e-04
Epoch 2/100
 - 4s - loss: 2.4564e-04 - val_loss: 1.1004e-04
Epoch 3/100
 - 4s - loss: 2.4114e-04 - val_loss: 1.2057e-04
Epoch 4/100
 - 4s - loss: 2.3447e-04 - val_loss: 1.1731e-04
Epoch 5/100
 - 4s - loss: 2.3522e-04 - val_loss: 1.1054e-04
Epoch 6/100
 - 4s - loss: 2.3644e-04 - val_loss: 1.1921e-04
Epoch 7/100
 - 4s - loss: 2.3413e-04 - val_loss: 1.1009e-04
Epoch 8/100
 - 4s - loss: 2.3465e-04 - val_loss: 1.0538e-04
Epoch 9/100
 - 4s - loss: 2.3162e-04 - val_loss: 1.0568e-04
Epoch 10/100
 - 4s - loss: 2.3184e-04 - val_loss: 1.1066e-04
Epoch 11/100
 - 4s - loss: 2.2575e-04 - val_loss: 1.0637e-04
Train on 22556 samples, validate on 5640 samples
Epoch 1/100
 - 7s - loss: 2.6839e-04 - val_loss: 1.3260e-04
Epoch 2/100
 - 6s - loss: 2.4681e-04 - val_loss: 1.2910e-04
Epoch 3/100
 - 6s - loss: 2.4407e-04 - val_loss: 1.1503e-04
Epoch 4/100
 - 6s - loss: 2.4505e-04 - val_loss: 1.1469e-04
