In [1]:
import io
import math
import time
from tempfile import TemporaryFile

from pyspark.sql.types import *
from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import *

import tensorflow as tf
import numpy as np

from hops import hdfs

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
719,application_1535116440643_0118,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Display the TensorFlow version loaded by the kernel (the recorded run reported '1.8.0').
tf.__version__

'1.8.0'

## Define Parameters

In [3]:
# --- Dataset parameters ------------------------------------------------
# Which monthly extract to work on.
year = 2016
month = 11

# Every dataset-specific location shares the "<root><year>-<month>" prefix.
root_path = "hdfs:///Projects/traffic_reginbald/processed_traffic_data/"
dataset_prefix = "{}{}-{}".format(root_path, year, month)

file_path = dataset_prefix + "_all-sensors-30-min-supervised-parquet/*.parquet"
index_path = dataset_prefix + "_sensor-to-index-30-min/*.parquet"
export_path = dataset_prefix + "_all-sensor-model-output/"

batch_size = 100      # samples per gradient update passed to model.fit
num_epochs = 100      # upper bound on training epochs passed to model.fit
dataset_split = 0.70  # leading fraction of rows used for training
max_density = 200     # divisor used when scaling readings in normalize()

# --- Network parameters ------------------------------------------------
past_steps = 10    # input time steps per sample
future_steps = 5   # predicted time steps per sample
n_sensors = 1941   # values per time step

# --- Training parameters -----------------------------------------------
learning_rate = 0.001
display_step = 200

## Define Functions

In [4]:
def read_hdfs_file(file_path):
    """Load a NumPy array that was stored on HDFS with ``np.save``.

    Args:
        file_path: HDFS path of the file to read.

    Returns:
        Whatever ``np.load`` yields for the file's bytes (an ndarray for
        ``.npy`` files such as the ones ``write_hdfs_file`` produces).
    """
    fs_handle = hdfs.get_fs()
    fd = fs_handle.open_file(file_path, mode='r')
    try:
        payload = fd.read()
    finally:
        # Always release the HDFS handle, even when the read fails.
        fd.close()

    # An in-memory buffer gives np.load the seekable file object it needs
    # without leaving an unclosed temporary file behind.
    return np.load(io.BytesIO(payload))

In [5]:
def write_hdfs_file(file_path, data):
    """Serialise ``data`` with ``np.save`` and write the bytes to HDFS.

    Args:
        file_path: HDFS path of the file to create.
        data: Array-like value to store. Pickling is disabled, so values
            that NumPy could only store as pickled objects are rejected
            by ``np.save``.
    """
    # Serialise first, so a failing np.save never opens the HDFS file for writing.
    buffer = io.BytesIO()
    np.save(buffer, data, allow_pickle=False)

    fs_handle = hdfs.get_fs()
    fd = fs_handle.open_file(file_path, mode='w')
    try:
        fd.write(buffer.getvalue())
    finally:
        # Release the HDFS handle even if the write raises.
        fd.close()

## Import Data

In [6]:
# Read the parquet files matched by `file_path` and order the rows by
# their 'Timestamp' column; the row count doubles as a sanity check.
traffic_raw = spark.read.parquet(file_path)
df = traffic_raw.orderBy('Timestamp')
df.count()

40281

In [7]:
# Map each row's 'key' to its 'index', reading the rows in index order.
index_rows = spark.read.parquet(index_path).orderBy('index').collect()
sensor_dict = {row['key']: row['index'] for row in index_rows}

## Prepare Data For Supervised Learning

In [8]:
def normalize(data, columns, max_value=None):
    """Scale the selected columns of one record by a fixed upper bound.

    Each value ``v`` becomes ``v / max_value`` (the lower bound is 0 and
    the target range is [0, 1]). Values are not clipped, so inputs above
    ``max_value`` map to results above 1.

    Args:
        data: Mapping-like record (e.g. a Spark ``Row``) indexable by
            column name; each selected entry is array-like.
        columns: Names of the entries to scale, in output order.
        max_value: Value mapped to 1. Defaults to the notebook-level
            ``max_density`` when omitted.

    Returns:
        ``np.ndarray`` holding one scaled array per requested column.
    """
    if max_value is None:
        max_value = max_density

    scale_min = 0
    scale_max = 1
    # A float divisor guarantees true division even for integer input
    # under Python 2, where int / int would floor.
    upper = float(max_value)

    out = []
    for column in columns:
        std = np.array(data[column]) / upper
        out.append(std * (scale_max - scale_min) + scale_min)
    return np.array(out)

In [9]:
columns = ['t-9', 't-8', 't-7', 't-6', 't-5', 't-4', 't-3', 't-2', 't-1', 't', 't+3', 't+5', 't+10', 't+20', 't+30']

# Both collects below consume the same normalised RDD. Caching it means the
# parquet read, sort and normalisation are evaluated once where executor
# memory allows, and both slices are then taken from the same materialised rows.
df_normalized = df.select(columns).rdd.map(lambda row: normalize(row, columns)).cache()

# The first `past_steps` columns are the model inputs; the rest are the targets.
x_dataset = np.array(df_normalized.map(lambda row: row[:past_steps]).collect())
y_dataset = np.array(df_normalized.map(lambda row: row[past_steps:]).collect())

# Everything needed is on the driver now; release the executors' cached copy.
df_normalized = df_normalized.unpersist()

In [10]:
# Peek at the first input window and its target window ...
print(x_dataset[0])
print(y_dataset[0])

# ... and confirm the overall array shapes.
print(x_dataset.shape)
print(y_dataset.shape)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.01285714 0.00759494 0.00333333 ... 0.         0.         0.        ]
 [0.         0.00428571 0.00952381 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.009      0.0122449  0.0137931  ... 0.         0.         0.        ]
 [0.00689655 0.00731707 0.004      ... 0.         0.         0.        ]
 [0.00322581 0.00326087 0.00348837 ... 0.         0.         0.        ]]
(40281, 10, 1941)
(40281, 5, 1941)

In [11]:
# Inputs stay 3-D (sample, past step, sensor); each target is flattened
# into a single vector of future_steps * n_sensors values.
x_dataset = x_dataset.reshape(-1, past_steps, n_sensors)
y_dataset = y_dataset.reshape(-1, future_steps * n_sensors)

# Positional split: the leading `dataset_split` fraction of rows is used
# for training and the remainder for testing.
train_size = int(dataset_split * len(x_dataset))

x_train, x_test = x_dataset[:train_size], x_dataset[train_size:]
y_train, y_test = y_dataset[:train_size], y_dataset[train_size:]

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

((28196, 10, 1941), (28196, 9705), (12085, 10, 1941), (12085, 9705))

## Define LSTM Network

In [12]:
def define_model(lstm_units):
    """Build and compile a two-layer stacked LSTM network.

    The network takes inputs of shape ``(past_steps, n_sensors)`` and
    produces ``future_steps * n_sensors`` outputs.

    Args:
        lstm_units: Number of units in each of the two LSTM layers.

    Returns:
        A compiled ``tf.keras.Model`` using Adam with the notebook-level
        ``learning_rate`` and mean-squared-error loss.
    """
    inputs = tf.keras.Input(shape=(past_steps, n_sensors))

    # The first layer emits the whole sequence so the second LSTM can consume it.
    lstm_1 = tf.keras.layers.LSTM(units=lstm_units, return_sequences=True)(inputs)
    lstm_2 = tf.keras.layers.LSTM(units=lstm_units)(lstm_1)

    # Linear hidden layer whose kernel weights are constrained to be non-negative.
    dense = tf.keras.layers.Dense(
        units=500,
        activation='linear',
        kernel_constraint=tf.keras.constraints.NonNeg()
    )(lstm_2)

    # One output per (future step, sensor) pair.
    outputs = tf.keras.layers.Dense(
        units=future_steps * n_sensors
    )(dense)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        # Pass the configured learning rate explicitly instead of relying
        # on the optimizer's built-in default.
        optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
        loss='mean_squared_error'
    )
    return model

## Define GRU Network

In [13]:
def define_gru_model(gru_units):
    """Build and compile a two-layer stacked GRU network.

    The network takes inputs of shape ``(past_steps, n_sensors)`` and
    produces ``future_steps * n_sensors`` outputs.

    Args:
        gru_units: Number of units in each of the two GRU layers.

    Returns:
        A compiled ``tf.keras.Model`` using Adam with the notebook-level
        ``learning_rate`` and mean-squared-error loss.
    """
    inputs = tf.keras.Input(shape=(past_steps, n_sensors))

    # The first layer emits the whole sequence so the second GRU can consume it.
    gru_1 = tf.keras.layers.GRU(units=gru_units, return_sequences=True)(inputs)
    gru_2 = tf.keras.layers.GRU(units=gru_units)(gru_1)

    # Linear hidden layer whose kernel weights are constrained to be non-negative.
    dense = tf.keras.layers.Dense(
        units=500,
        activation='linear',
        kernel_constraint=tf.keras.constraints.NonNeg()
    )(gru_2)

    # One output per (future step, sensor) pair.
    outputs = tf.keras.layers.Dense(
        units=future_steps * n_sensors
    )(dense)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    model.compile(
        # Pass the configured learning rate explicitly instead of relying
        # on the optimizer's built-in default.
        optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
        loss='mean_squared_error'
    )
    return model

## Train Model

In [None]:
def train_model(model, x_train, y_train):
    """Fit ``model`` with early stopping and report its learning curves.

    Uses the notebook-level ``batch_size`` and ``num_epochs`` and holds
    out 20% of the supplied data for validation.

    Args:
        model: Compiled model exposing a Keras-style ``fit`` method.
        x_train: Training inputs.
        y_train: Training targets.

    Returns:
        Tuple ``(loss, val_loss, training_time)``: per-epoch training
        loss and validation loss as lists of Python floats, and the
        wall-clock duration of the ``fit`` call in seconds.
    """
    # Stop once val_loss has failed to improve by at least 1e-4 for
    # 10 consecutive epochs.
    stopper = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0.0001,
        patience=10,
        verbose=2,
        mode='auto'
    )

    fit_kwargs = dict(
        x=x_train,
        y=y_train,
        batch_size=batch_size,
        epochs=num_epochs,
        verbose=2,
        callbacks=[stopper],
        validation_split=0.2
    )

    # Time only the fit call itself.
    started_at = time.time()
    fit_result = model.fit(**fit_kwargs)
    training_time = time.time() - started_at

    # Convert to plain floats so the curves serialise without NumPy scalar types.
    curves = fit_result.history
    loss = [float(value) for value in curves['loss']]
    val_loss = [float(value) for value in curves['val_loss']]
    return loss, val_loss, training_time

## Hyperparameter Tuning

In [None]:
# Train one LSTM model per layer size and store its learning curves and
# training time under a per-size directory on HDFS.
for lstm_units in [50, 100, 500, 1000, 1500, 2000]:
    run_dir = root_path + "hyperparameter-tuning/all_sensors-lstm_units_" + str(lstm_units)
    loss, val_loss, training_time = train_model(define_model(lstm_units), x_train, y_train)
    write_hdfs_file(run_dir + "/loss.npy", loss)
    write_hdfs_file(run_dir + "/val_loss.npy", val_loss)
    write_hdfs_file(run_dir + "/training_time.npy", training_time)

In [None]:
# Only the 2000-unit GRU is trained by this cell. The plotting section
# reads GRU results for 50, 100, 500, 1000, 1500 and 2000 units, so the
# other sizes must already be present on HDFS.
for gru_units in [2000]:
    run_dir = root_path + "hyperparameter-tuning/all_sensors-gru_units_" + str(gru_units)
    loss, val_loss, training_time = train_model(define_gru_model(gru_units), x_train, y_train)
    write_hdfs_file(run_dir + "/loss.npy", loss)
    write_hdfs_file(run_dir + "/val_loss.npy", val_loss)
    write_hdfs_file(run_dir + "/training_time.npy", training_time)

## Plot Training Loss

In [None]:
losses = []
val_losses = []
training_times = []
labels = []

lstm_path = root_path + "hyperparameter-tuning/all_sensors-lstm_units_"
gru_path = root_path + "hyperparameter-tuning/all_sensors-gru_units_"

# Load the LSTM runs first and the GRU runs second: the plotting cells
# select them positionally with [:6] and [6:].
unit_grid = [50, 100, 500, 1000, 1500, 2000]
for cell_kind, base_path in [("lstm", lstm_path), ("gru", gru_path)]:
    for units in unit_grid:
        run_dir = base_path + str(units)
        losses.append(read_hdfs_file(run_dir + "/loss.npy"))
        val_losses.append(read_hdfs_file(run_dir + "/val_loss.npy"))
        training_times.append(read_hdfs_file(run_dir + "/training_time.npy"))
        labels.append(str(units) + " " + cell_kind + " units")

# Early stopping gives curves of different lengths; right-pad the shorter
# ones with None so they can be stacked into one rectangular table.
max_length = max([len(curve) for curve in losses] + [0])
for i in range(len(losses)):
    padding = [None] * (max_length - len(losses[i]))
    if padding:
        # One append per curve instead of one reallocation per missing epoch.
        losses[i] = np.append(losses[i], padding)
        val_losses[i] = np.append(val_losses[i], padding)

In [None]:
%%spark -o lstm_loss_df

lstm_loss_df = spark.createDataFrame(
    sc.parallelize(np.array(losses)[:6].transpose().tolist()), 
    np.array(labels)[:6].tolist()
)

In [None]:
%%local
plt.rc('font', **{'weight' : 'normal','size'   : 35})
lstm_loss_plot = lstm_loss_df.plot.line(figsize=(40, 22), fontsize=40, linewidth=10)
 
lstm_loss_plot.set_xlabel("Epoch", {'size':50})
lstm_loss_plot.set_ylabel("RMSE", {'size':50})

In [None]:
%%spark -o gru_loss_df

gru_loss_df = spark.createDataFrame(
    sc.parallelize(np.array(losses)[6:].transpose().tolist()), 
    np.array(labels)[6:].tolist()
)

In [None]:
%%local
plt.rc('font', **{'weight' : 'normal','size'   : 35})
gru_loss_plot = gru_loss_df.plot.line(figsize=(40, 22), fontsize=40, linewidth=10)
 
gru_loss_plot.set_xlabel("Epoch", {'size':50})
gru_loss_plot.set_ylabel("RMSE", {'size':50})

## Plot Validation Loss

In [None]:
%%spark -o lstm_val_loss_df

lstm_val_loss_df = spark.createDataFrame(
    sc.parallelize(np.array(val_losses)[:6].transpose().tolist()), 
    np.array(labels)[:6].tolist()
)

In [None]:
%%local
plt.rc('font', **{'weight' : 'normal','size'   : 35})
lstm_val_loss_plot = lstm_val_loss_df.plot.line(figsize=(40, 22), fontsize=40, linewidth=10)
 
lstm_val_loss_plot.set_xlabel("Epoch", {'size':50})
lstm_val_loss_plot.set_ylabel("RMSE", {'size':50})

In [None]:
%%spark -o gru_val_loss_df

gru_val_loss_df = spark.createDataFrame(
    sc.parallelize(np.array(val_losses)[6:].transpose().tolist()), 
    np.array(labels)[6:].tolist()
)

In [None]:
%%local
plt.rc('font', **{'weight' : 'normal','size'   : 35})
gru_val_loss_plot = gru_val_loss_df.plot.line(figsize=(40, 22), fontsize=40, linewidth=10)
 
gru_val_loss_plot.set_xlabel("Epoch", {'size':50})
gru_val_loss_plot.set_ylabel("RMSE", {'size':50})

### Distributed Hyperparameter Search

In [None]:
# TODO: implement the distributed hyperparameter search (not written yet).