# Hyperparameter tuning - Partition model

In [None]:
import time
import math

from pyspark.sql.types import *
from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import *

import tensorflow as tf
import numpy as np

from hops import hdfs
from tempfile import TemporaryFile

Starting Spark application


In [None]:
%%local
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# Apply the seaborn dark-grid look to every plot drawn in this notebook.
# NOTE(review): recent matplotlib releases renamed this style to
# 'seaborn-v0_8-darkgrid' — confirm against the installed version.
plt.style.use('seaborn-darkgrid')
%matplotlib inline 

In [None]:
# Show the installed TensorFlow version as the cell output.
tf.__version__

## Define Parameters

In [None]:
# --- Dataset selection ---
year, month = 2016, 11
partition_min = 3            # partition granularity used in the partition file name
partition_id_to_tune = 30    # the single partition whose model is tuned below
direction = "backward"

# --- HDFS locations derived from the selection above ---
root_path = "hdfs:///Projects/traffic_reginbald/processed_traffic_data/"
partition_file_path = "{}partitions/{}_partitions-{}min.csv".format(
    root_path, direction, partition_min
)
sensor_file_path = "{}{}-{}_all-sensors-timeseries-parquet/*.parquet".format(
    root_path, year, month
)
data_path = "{}{}-{}_single-sensor-30-min-supervised-parquet/".format(
    root_path, year, month
)

# --- Training setup ---
batch_size = 100
num_epochs = 100
dataset_split = 0.70   # fraction of the samples used for training
max_density = 200      # divisor used when scaling the data in normalize()

# --- Network window sizes ---
past_steps = 10
future_steps = 30

## Define functions

In [None]:
def read_hdfs_file(file_path):
    """Read a file from HDFS and deserialize it with ``np.load``.

    The remote content is read fully into memory and handed to NumPy through
    an in-memory buffer, so no local temporary file is left open and the HDFS
    handle is always released.

    Args:
        file_path: HDFS path of the file to read (as written by
            ``write_hdfs_file``).

    Returns:
        Whatever ``np.load`` returns for the file content.
    """
    import io

    fs_handle = hdfs.get_fs()
    fd = fs_handle.open_file(file_path, mode='r')
    try:
        payload = fd.read()
    finally:
        # Release the HDFS handle even when the read fails.
        fd.close()

    return np.load(io.BytesIO(payload))

In [None]:
def write_hdfs_file(file_path, data):
    """Serialize ``data`` with ``np.save`` and write the bytes to HDFS.

    Serialization happens in an in-memory buffer before the remote file is
    opened, so a serialization failure does not touch HDFS, and the HDFS
    handle is closed even if the write itself raises.

    Args:
        file_path: Destination HDFS path.
        data: Array-like value accepted by ``np.save``; pickling is disabled
            (``allow_pickle=False``).
    """
    import io

    buffer = io.BytesIO()
    np.save(buffer, data, allow_pickle=False)

    fs_handle = hdfs.get_fs()
    fd = fs_handle.open_file(file_path, mode='w')
    try:
        fd.write(buffer.getvalue())
    finally:
        # Always release the HDFS handle, including on a failed write.
        fd.close()

## Import Data

In [None]:
# Sensor identifiers are the column names of the all-sensors parquet dataset,
# skipping the first column (presumably the timestamp — confirm).
sensors = spark.read.parquet(sensor_file_path).columns[1:]

In [None]:
# Quick sanity check of the sensor list that was just loaded.
print("Number of sensors: {}".format(len(sensors)))
print("First sensor: {}".format(sensors[0]))

In [None]:
# Schema of the partition CSV: one row per graph node with its partition id
# and group label; none of the columns is nullable.
partition_schema = StructType([
    StructField('node', StringType(), False),
    StructField('partition', IntegerType(), False),
    StructField('group', StringType(), False),
])

In [None]:
# Read the semicolon-separated partition file with the explicit schema,
# trimming whitespace around values and skipping the header row.
partitions_raw_df = (
    spark.read
    .schema(partition_schema)
    .options(
        sep=';',
        ignoreLeadingWhiteSpace=True,
        ignoreTrailingWhiteSpace=True,
        header=True,
        timestampFormat='yyyy/MM/dd HH:mm:ss.SSS',
    )
    .csv(partition_file_path)
)

In [None]:
# UDF that drops the last two characters of a sensor identifier.
# Presumably this strips a suffix so the result matches the partition file's
# 'node' values — confirm against the data.
shot_identifier = udf(lambda identifier: identifier[:-2], StringType())

# One row per sensor: the full identifier plus its shortened join key.
sensors_df = (
    spark.createDataFrame(
        sc.parallelize([Row(identifier=name) for name in sensors]),
        ["identifier"],
    )
    .withColumn("identifier_alt", shot_identifier(col("identifier")))
)
sensors_df.count()

In [None]:
# Right outer join: every sensor is kept, and sensors without a matching
# partition node get nulls in the "p" columns.
partitions_df = partitions_raw_df.alias("p").join(
    other=sensors_df.alias("s"),
    on=(col("s.identifier_alt") == col("p.node")),
    how="rightouter",
)
partitions_df.count()

In [None]:
# Sensors with no matching partition node: they are not connected to the rest
# of the graph and should be removed.
partitions_df.filter(col('p.node').isNull()).show()

In [None]:
# Keep only sensors that matched a partition node and reduce the frame to the
# two columns used from here on.
partitions_df = (
    partitions_df
    .where(col('p.node').isNotNull())
    .select(
        col("s.identifier").alias("identifier"),
        col("p.partition").alias("partition"),
    )
)
partitions_df.count()

In [None]:
# Highest partition id present. `max` here is the Spark SQL aggregate pulled in
# by `from pyspark.sql.functions import *`, not the Python builtin.
max_partition_id = partitions_df.agg(max('partition')).first()[0]
max_partition_id

In [None]:
# Show how many sensors were assigned to each partition.
partitions_df.groupBy('partition').count().show()

In [None]:
def load_data(sensors):
    """Load the supervised windows of several sensors into one 3-D array.

    For every sensor the parquet dataset at ``data_path + sensor`` is read,
    ordered by ``Timestamp`` and reduced to the window columns
    ``t-(past_steps-1) ... t-1, t, t+1 ... t+future_steps``. The per-sensor
    tables are then stacked along a new last axis.

    Args:
        sensors: Sequence of sensor identifiers; each one names a parquet
            dataset under ``data_path``.

    Returns:
        Array of shape ``(rows, past_steps + future_steps, len(sensors))``.

    Raises:
        ValueError: If ``sensors`` is empty, or if the sensors do not all
            yield tables of the same shape.
    """
    if len(sensors) == 0:
        raise ValueError("load_data requires at least one sensor")

    # Derive the window columns from the configured step counts so they stay
    # consistent with the past/future split done in prepare_dataset.
    columns = (
        ["t-{}".format(step) for step in range(past_steps - 1, 0, -1)]
        + ["t"]
        + ["t+{}".format(step) for step in range(1, future_steps + 1)]
    )

    # Collect each sensor once and stack at the end; appending to a growing
    # array would re-copy everything loaded so far on every iteration.
    frames = [
        np.array(
            spark.read.parquet(data_path + sensor)
            .orderBy('Timestamp')
            .select(columns)
            .collect()
        )
        for sensor in sensors
    ]
    return np.stack(frames, axis=2)

## Prepare Data For Supervised Learning

In [None]:
def normalize(data):
    """Scale ``data`` linearly so that 0 maps to 0 and ``max_density`` maps to 1.

    Values are not clipped: inputs above ``max_density`` produce results
    above 1.

    Args:
        data: Numeric array (or scalar) of raw values.

    Returns:
        The rescaled values.
    """
    lower_bound, upper_bound = 0, 1

    # Min-max scaling with a fixed minimum of 0 and a maximum of max_density.
    fraction = (data - 0) / (max_density - 0)
    return fraction * (upper_bound - lower_bound) + lower_bound

In [None]:
def prepare_dataset(data, n_sensors):
    """Normalize the windows and split them into train/test inputs and targets.

    Each window is cut along its second axis at ``past_steps``: the first
    ``past_steps`` entries become the model input, the rest the target. The
    first ``dataset_split`` fraction of the samples (in their given order)
    forms the training set and the remainder the test set.

    Args:
        data: Array indexed as ``(sample, time step, sensor)``.
        n_sensors: Number of sensors on the last axis of ``data``.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` where the inputs have
        shape ``(samples, past_steps, n_sensors)`` and the targets are
        flattened to ``(samples, future_steps * n_sensors)``.
    """
    normalized = normalize(data)

    # Split at past_steps rather than a literal so the slice always agrees
    # with the reshape below.
    x_dataset = normalized[:, :past_steps, :]
    y_dataset = normalized[:, past_steps:, :]

    x_dataset = np.reshape(x_dataset, (-1, past_steps, n_sensors))
    y_dataset = np.reshape(y_dataset, (-1, future_steps * n_sensors))

    # Split in the given order, without shuffling.
    train_size = int(len(x_dataset) * dataset_split)

    x_train = x_dataset[:train_size, :]
    x_test = x_dataset[train_size:, :]

    y_train = y_dataset[:train_size, :]
    y_test = y_dataset[train_size:, :]

    return x_train, y_train, x_test, y_test

## Define Neural Network

In [None]:
def define_model(l_units, n_sensors):
    """Build and compile the two-layer LSTM forecasting network.

    Architecture: input ``(past_steps, n_sensors)`` -> LSTM -> LSTM ->
    Dense(500, linear, non-negative kernel) -> Dense output with
    ``future_steps * n_sensors`` units.

    Args:
        l_units: Number of units in each of the two LSTM layers.
        n_sensors: Number of sensors (features per time step).

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer and mean
        squared error loss.
    """
    net_input = tf.keras.Input(shape=(past_steps, n_sensors))

    # The first LSTM returns the whole sequence so the second one can consume it.
    first_lstm = tf.keras.layers.LSTM(units=l_units, return_sequences=True)
    hidden = first_lstm(net_input)
    second_lstm = tf.keras.layers.LSTM(units=l_units)
    hidden = second_lstm(hidden)

    # Linear projection whose kernel weights are constrained to be non-negative.
    projection = tf.keras.layers.Dense(
        units=500,
        activation='linear',
        kernel_constraint=tf.keras.constraints.NonNeg(),
    )
    hidden = projection(hidden)

    # One output per (future step, sensor) pair.
    head = tf.keras.layers.Dense(units=future_steps * n_sensors)
    net_output = head(hidden)

    network = tf.keras.Model(inputs=net_input, outputs=net_output)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

## Define Model Training

In [None]:
def train_model(model, x_train, y_train):
    """Fit ``model`` with early stopping and report the loss curves and duration.

    The last 20% of the training data is held out as validation data, and
    training stops early once ``val_loss`` has not improved by at least
    0.0001 for 10 consecutive epochs.

    Args:
        model: Compiled Keras model to train.
        x_train: Training inputs.
        y_train: Training targets.

    Returns:
        Tuple ``(loss, val_loss, training_time)``: per-epoch training and
        validation losses as lists of Python floats, and the wall-clock
        duration of ``model.fit`` in seconds.
    """
    stop_when_stalled = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        min_delta=0.0001,
        patience=10,
        verbose=0,
        mode='auto',
    )

    fit_options = dict(
        x=x_train,
        y=y_train,
        batch_size=batch_size,
        epochs=num_epochs,
        verbose=0,
        callbacks=[stop_when_stalled],
        validation_split=0.2,
    )

    # Time only the fit call itself.
    started_at = time.time()
    model_info = model.fit(**fit_options)
    training_time = time.time() - started_at

    # Convert to plain floats so the curves can be saved without pickling.
    history = model_info.history
    loss = list(map(float, history['loss']))
    val_loss = list(map(float, history['val_loss']))

    return loss, val_loss, training_time

## Run Hyperparameter Tuning on the Selected Partition

In [None]:
# Sensors that belong to the partition being tuned.
partition_sensors = [
    row["identifier"]
    for row in partitions_df.where(col("partition") == partition_id_to_tune).collect()
]
data = load_data(partition_sensors)
x_train, y_train, x_test, y_test = prepare_dataset(data, len(partition_sensors))

# Train one model per candidate LSTM size and store its training curves and
# duration on HDFS. Other candidate sizes: 200, 400, 600, 800, 1000.
for lstm_units in [100]:
    model = define_model(lstm_units, len(partition_sensors))
    loss, val_loss, training_time = train_model(model, x_train, y_train)

    # All artifacts of one run share the same output directory.
    output_dir = (
        root_path
        + "hyperparameter-tuning/partition_" + str(partition_min)
        + "min-lstm_units_" + str(lstm_units)
    )
    for file_name, values in (
        ("/loss.npy", loss),
        ("/val_loss.npy", val_loss),
        ("/training_time.npy", training_time),
    ):
        write_hdfs_file(output_dir + file_name, values)