In [1]:
# Standard Libraries
import os
import json
import datetime
import numpy as np
import tensorflow as tf

import time
import nvtx

import argparse

import math
from tensorflow.keras.utils import Progbar

# ------------------------------- CUSTOM FUNCTIONS ------------------------------------------------
# Custom Library
import sys
sys.path.append('../')
    
from proxy_apps.apps.timeseries_prediction import deepDMD, proxyDeepDMD, proxyDeepDMD_Backup

from proxy_apps.utils.tf import TimingCallback
from proxy_apps.utils.data.main import NpEncoder
from proxy_apps.utils import file_reader, path_handler
from proxy_apps.utils.data.grid import GridNetworkDataHandler, GridNetworkTFDataHandler

In [2]:
# System Setup
config = file_reader.read_config()

_N_EPOCHS = 2
_BATCH_SIZE = 64000
_APP_NAME = config["info"]["app_name"]
_NROWS = int(config["data"]["n_rows"])
_NCOLS = int(config["data"]["n_cols"])
_REPEAT_COLS = int(config["data"]["repeat_cols"])
_DTYPE = config["model"]["dtype"]

_LABEL = "Baseline"
_SUFFIX =  "gpu" + '_' + \
            "a100" + '_' + \
            'ng' + str(1) + '_' + \
            'nc' + str(-1) + '_' + \
            'e' + str(_N_EPOCHS) + '_' + \
            'b' + str(_BATCH_SIZE) + '_' + \
            'r' + str(_REPEAT_COLS) + '_' + _LABEL

performance_dict = dict()

tic = time.time()

# current directory
curr_dir = "./"

# output directory
output_dir = path_handler.get_absolute_path(curr_dir, config["info"]["output_dir"] + config["info"]["name"] + "/" + config["info"]["app_name"] + "/" + _DTYPE + "/R" + str(_REPEAT_COLS) + "/")
if not os.path.exists(output_dir): os.makedirs(output_dir)

# TensorFlow Setup
print("[INFO] Tensorflow version: ", tf.__version__)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

[INFO] Tensorflow version:  2.4.0


In [3]:
print("[INFO] Eager mode: ", tf.executing_eagerly()) # For easy reset of notebook state.

tf.keras.backend.clear_session()
tf.keras.backend.set_floatx('float64')
# tf.config.optimizer.set_jit(True) # Enable XLA.

# if _LABEL in ["Baseline"]: tf.keras.backend.set_floatx('float64')
# elif _LABEL in ["TFDataOpt", "TFDataOptMGPU"]:
#     tf.keras.backend.set_floatx(_DTYPE)
# elif _LABEL in ["TFDataOptMP", "TFDataOptMGPUMP"]:
#     _DTYPE = "float32"
#     policy = tf.keras.mixed_precision.Policy('mixed_float16')
#     tf.keras.mixed_precision.set_global_policy(policy)
#     # tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

# if _LABEL in ["TFDataOpt", "TFDataOptMP", "TFDataOptMGPU", "TFDataOptMGPUMP"]:
mirrored_strategy = tf.distribute.MirroredStrategy()

os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"

[INFO] Eager mode:  True
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


In [4]:
# ------------------------------- DATA LOADING ------------------------------------------------   

data_loading = nvtx.start_range("Data Loading")
l_start = time.time()
data_handler = GridNetworkDataHandler(scenario_dir=path_handler.get_absolute_path(curr_dir, config["info"]["input_dir"]),
                                        n_rows=_NROWS,
                                        n_cols=_NCOLS,
                                        repeat_cols=_REPEAT_COLS,
                                        dtype=_DTYPE
                                     ) 

scenario_data = data_handler.load_grid_data()
l_stop = time.time()
print('[INFO]: Time taken for loading datasets:', l_stop - l_start, 'seconds')
performance_dict['data_loading_time'] = l_stop-l_start
nvtx.end_range(data_loading)



[INFO]: Loading the datasets from the directory: /qfs/people/jain432/pacer/data/TrainingDataIEEE68bus
[INFO]: Loading data for 30 scenarios ...
[INFO]: Total number of scenarios loaded: 30
[INFO]: Shape of each scenario loaded:  (1400, 136)
[INFO]: Done ...
[INFO]: Time taken for loading datasets: 2.448899745941162 seconds


In [5]:
# ------------------------------- DATA PREPROCESSING ------------------------------------------------

data_processing = nvtx.start_range("Data Processing")
i_start = time.time()
X_data, Y_data, U_data, V_data, Yp, Yf = data_handler.create_windows(scenario_data)
i_stop = time.time()
print('[INFO]: Time taken for creating X datasets:', i_stop - i_start, 'seconds')
performance_dict['data_processing_time'] = i_stop-i_start
nvtx.end_range(data_processing)


[INFO]: Original dataset size: 1400
[INFO]: Chosen dataset size: 800
[INFO]: Length of X_data:  1800
[INFO]: Length of each window after down sampling:  (800, 136)
[INFO]: Time taken for creating X datasets: 3.1523194313049316 seconds


In [6]:
# ------------------------------- DATA NORMALIZATION ------------------------------------------------

data_norm = nvtx.start_range("Data Normalization")
n_start = time.time()
X_array, Y_array, U_array, V_array, Yp_array, Yf_array = data_handler.scale_data(X_data, Y_data, U_data, V_data, Yp, Yf)
n_stop = time.time()
print('[INFO]: Time taken for normalization:', n_stop - n_start, 'seconds')

performance_dict['data_scaling_time'] = n_stop-n_start
nvtx.end_range(data_norm)

[INFO]: Yp_array shape:  (41970, 136)
[INFO]: Yf_array shape:  (41970, 136)
[INFO]: X_array shape:  (1440000, 136)
[INFO]: Y_array shape:  (1440000, 136)
[INFO]: U_array shape:  (1440000, 136)
[INFO]: V_array shape:  (1440000, 136)
[INFO]: Time taken for normalization: 12.533467531204224 seconds


In [7]:
# ------------------------------- MODEL SETUP ------------------------------------------------
# timing callback
timing_cb = TimingCallback()

# Stopping criteria if the training loss doesn't go down by 1e-3
early_stop_cb = tf.keras.callbacks.EarlyStopping(
    monitor='loss', min_delta = 1e-3, verbose = 1, mode='min', patience = 3, 
    baseline=None, restore_best_weights=True)

# Create a TensorBoard Profiler
logs = path_handler.get_absolute_path(curr_dir, config["model"]["tb_log_dir"] + _APP_NAME + "/" + _DTYPE + "/R" + str(_REPEAT_COLS) + "/tensorboard/" + _SUFFIX)
# tb_callback = tf.keras.callbacks.TensorBoard(log_dir=logs, histogram_freq=1, embeddings_freq=1, profile_batch=(5,15))

# all callbacks
callbacks=[early_stop_cb, timing_cb]#, tb_callback]

In [8]:
# Initialize Hyperparameters - we can keep it as a dict instead of creating a separate class
hyper_param_dict = config["model"]["hyperparameters"]
hyper_param_dict['original_dim']       = _REPEAT_COLS * _NCOLS   # input data dimension
hyper_param_dict['num_epochs']         = _N_EPOCHS  # Number of epochs  
hyper_param_dict['batch_size']         = _BATCH_SIZE

In [9]:
# @tf.function
def train_step(X, Y):
    # Open a GradientTape to record the operations run
    # during the forward pass, which enables auto-differentiation.
    with tf.GradientTape() as tape:

        # Run the forward pass of the layer.
        # The operations that the layer applies
        # to its inputs are going to be recorded
        # on the GradientTape.
        Psi_X    = K_model(X, training=True)
        Psi_Y    = K_model(Y, training=True)    

        PSI_X    = tf.concat([X, Psi_X], 1)
        PSI_Y    = tf.concat([Y, Psi_Y], 1) 

        # 1-time step evolution on observable space:
        K_PSI_X  = tf.matmul(PSI_X, K_model.KO) 

        # 1-step Koopman loss on observable space:        
        K_loss   = tf.norm(PSI_Y - K_PSI_X, axis = [0,1], ord = 'fro')

        # Regularization loss on Koopman operator:
        Reg_loss= tf.math.scalar_mul(hp.rf, tf.norm(K_model.KO, axis = [0,1], ord = 'fro'))   

        # Compute the loss value for this minibatch.
        loss_value = K_loss + Reg_loss

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss.
    grads = tape.gradient(loss_value, K_model.trainable_weights)

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients(zip(grads, K_model.trainable_weights))
    
    return loss_value

@tf.function
def distributed_train_step(dist_inputs):
    per_replica_losses = mirrored_strategy.run(train_step, args=(dist_inputs[0],dist_inputs[1]))
    print(per_replica_losses)
    return mirrored_strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses,
                         axis=None)

In [10]:
test_size = Yp_array.shape[0]

Xd_array = tf.data.Dataset.from_tensor_slices(X_array)
Yd_array = tf.data.Dataset.from_tensor_slices(Y_array)
Ud_array = tf.data.Dataset.from_tensor_slices(U_array)
Vd_array = tf.data.Dataset.from_tensor_slices(V_array)
Ydp_array = tf.data.Dataset.from_tensor_slices(Yp_array)
Ydf_array = tf.data.Dataset.from_tensor_slices(Yf_array)

hyper_param_dict['dtype']         = _DTYPE
hp = proxyDeepDMD.HyperParameters(hyper_param_dict)
hp.model_name         = _LABEL

performance_dict["n_epochs"] = hp.ep
performance_dict["batch_size"] = hp.bs
# performance_dict["n_training_batches"] = n_batches_training
# performance_dict["n_val_batches"] = n_batches - n_batches_training

In [11]:
# ------------------------------- MODEL TRAINING ------------------------------------------------
# Initialize, build, and fit the model
# with mirrored_strategy.scope():
with mirrored_strategy.scope():
    K_model = proxyDeepDMD.Encoder(hp)
    optimizer = tf.optimizers.Adagrad(hp.lr)

BATCH_SIZE = hp.bs # * mirrored_strategy.num_replicas_in_sync
zip_data = tf.data.Dataset.zip((Xd_array, Yd_array)).batch(BATCH_SIZE)

training_dataset = zip_data.cache()
training_dataset = training_dataset.shuffle(buffer_size=BATCH_SIZE)
training_dataset = training_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
training_dataset = mirrored_strategy.experimental_distribute_dataset(training_dataset)#.cache()#.with_options(options)# .take(n_batches_training)
# training_dataset = training_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

K_model.compile(optimizer=optimizer)

In [12]:
m_start = time.time()
for epoch in range(hp.ep):
    print("\nepoch {}/{}".format(epoch+1, hp.ep))
    pb_i = Progbar(math.ceil(1440000//hp.bs)+1, stateful_metrics=['loss'])

    # Iterate over the batches of the dataset.
    total_loss = 0.0
    num_batches = 0
    for step, inp_data in enumerate(training_dataset):
        total_loss += distributed_train_step(inp_data)
        num_batches += 1
        
        loss_value = total_loss / num_batches

        # Log every 200 batches.
        pb_i.add(hp.bs//hp.bs, values=[('loss', loss_value)])

m_stop = time.time()
print(m_stop-m_start)


epoch 1/2
Tensor("add:0", shape=(), dtype=float64, device=/job:localhost/replica:0/task:0/device:CPU:0)
Tensor("add:0", shape=(), dtype=float64, device=/job:localhost/replica:0/task:0/device:CPU:0)

epoch 2/2
63.135339975357056


In [13]:
K_model.summary()

Model: "Encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_layer (DenseLayer)     multiple                  17536     
_________________________________________________________________
dense_layer_1 (DenseLayer)   multiple                  16512     
_________________________________________________________________
dropout (Dropout)            multiple                  0         
_________________________________________________________________
dense_layer_2 (DenseLayer)   multiple                  8256      
_________________________________________________________________
dropout_1 (Dropout)          multiple                  0         
_________________________________________________________________
dense_layer_3 (DenseLayer)   multiple                  4160      
_________________________________________________________________
dropout_2 (Dropout)          multiple                  0   

In [14]:
hp = deepDMD.HyperParameters(hyper_param_dict)
hp.model_name         = _LABEL

performance_dict["n_epochs"] = hp.ep
performance_dict["batch_size"] = hp.bs
performance_dict["n_training_batches"] = 1 - hp.vs
performance_dict["n_val_batches"] = hp.vs

# ------------------------------- MODEL TRAINING ------------------------------------------------
# Initialize, build, and fit the model
m_start = time.time()
BaselineModel = deepDMD.NeuralNetworkModel(hp)
BaselineModel.compile(optimizer=tf.optimizers.Adagrad(hp.lr))

history = BaselineModel.fit([X_array, Y_array], batch_size=hp.bs, 
                  epochs=hp.ep, 
                  callbacks=callbacks, 
                  shuffle=True)

m_stop = time.time()

# print info
print('[INFO]: Time taken for model training (time module):', m_stop - m_start, 'seconds')
print('[INFO]: Time taken for model training (Keras):', sum(timing_cb.logs), 'seconds')

Epoch 1/2
Epoch 2/2
[INFO]: Time taken for model training (time module): 51.75946640968323 seconds
[INFO]: Time taken for model training (Keras): 49.36934209614992 seconds


In [15]:
BaselineModel.summary()

Model: "neural_network_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder (Encoder)            multiple                  50624     
Total params: 90,624
Trainable params: 90,624
Non-trainable params: 0
_________________________________________________________________
