# Training Notebook

First, define the command-line arguments as dbutils widget parameters.

In [None]:
# Every tunable of the training run is exposed as a Databricks text widget.
# Widgets only carry strings, so each default is written as a string here and
# converted to its real type once it is read back.
_WIDGET_DEFAULTS = {
    # --- Data params ---
    "dataset": "system_1",
    "window_size": "100",
    "stride": "1",
    "train_start": "0",
    "train_end": "None",

    # --- Model params ---
    # 1D conv layer
    "kernel_size": "7",
    # GAT layers
    "use_gatv2": "True",
    "feat_gat_embed_dim": "None",
    "time_gat_embed_dim": "None",
    # GRU layer
    "gru_n_layers": "1",
    "gru_hid_dim": "150",
    # Forecasting Model
    "fc_n_layers": "3",
    "fc_hid_dim": "150",
    # Reconstruction Model
    "recon_n_layers": "1",
    "recon_hid_dim": "150",
    # Other
    "alpha": "0.2",
    "gamma": "1.0",

    # --- Train params ---
    "epochs": "30",
    "val_split": "0.1",
    "batch_size": "256",
    "init_lr": "0.001",
    "step_lr": "10",
    "gamma_lr": "0.9",
    "patience": "None",
    "shuffle_dataset": "True",
    "dropout": "0.2",
    "use_cuda": "True",
    "print_every": "1",
    # For epsilon
    "reg_level": "1",
    "use_mov_av": "False",
}

# Register the widgets in the order they are declared above.
for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)

Import the required modules.

In [None]:
from datetime import datetime
import torch.nn as nn
import torch
import pandas as pd
import numpy as np

from architecture import MTAD_GAT
from model import Handler
from utils import str2bool, str2type
from utils import get_data, SlidingWindowDataset, create_data_loader, find_epsilon, update_json

import mlflow

Read the parameter values and cast them to the correct types.

In [None]:
# Read every widget back and convert it from its string form to the type the
# training code expects. Plain numeric parameters go through int()/float();
# parameters whose widget default is "None", "True" or "False" go through
# str2type (presumably it maps those strings to Python objects -- confirm in utils).

# --- Data params ---
dataset = dbutils.widgets.get("dataset")
window_size = int(dbutils.widgets.get("window_size"))
stride = int(dbutils.widgets.get("stride"))
train_start = int(dbutils.widgets.get("train_start"))
train_end = str2type(dbutils.widgets.get("train_end"))

# --- Model params ---
kernel_size = int(dbutils.widgets.get("kernel_size"))
use_gatv2 = str2type(dbutils.widgets.get("use_gatv2"))
feat_gat_embed_dim = str2type(dbutils.widgets.get("feat_gat_embed_dim"))
time_gat_embed_dim = str2type(dbutils.widgets.get("time_gat_embed_dim"))
gru_n_layers = int(dbutils.widgets.get("gru_n_layers"))
gru_hid_dim = int(dbutils.widgets.get("gru_hid_dim"))
fc_n_layers = int(dbutils.widgets.get("fc_n_layers"))
fc_hid_dim = int(dbutils.widgets.get("fc_hid_dim"))
recon_n_layers = int(dbutils.widgets.get("recon_n_layers"))
recon_hid_dim = int(dbutils.widgets.get("recon_hid_dim"))
alpha = float(dbutils.widgets.get("alpha"))
gamma = float(dbutils.widgets.get("gamma"))

# --- Train params ---
epochs = int(dbutils.widgets.get("epochs"))
val_split = float(dbutils.widgets.get("val_split"))
batch_size = int(dbutils.widgets.get("batch_size"))
init_lr = float(dbutils.widgets.get("init_lr"))
step_lr = int(dbutils.widgets.get("step_lr"))
gamma_lr = float(dbutils.widgets.get("gamma_lr"))
patience = str2type(dbutils.widgets.get("patience"))
shuffle_dataset = str2type(dbutils.widgets.get("shuffle_dataset"))
dropout = float(dbutils.widgets.get("dropout"))
use_cuda = str2type(dbutils.widgets.get("use_cuda"))
print_every = int(dbutils.widgets.get("print_every"))

# --- Epsilon threshold params ---
# The training cell skips the epsilon threshold when reg_level is "None", so
# that value must survive parsing: a bare int() would raise ValueError on it.
_reg_level_raw = dbutils.widgets.get("reg_level").strip()
reg_level = None if _reg_level_raw.lower() == "none" else int(_reg_level_raw)
use_mov_av = str2type(dbutils.widgets.get("use_mov_av"))

Add some of these to a dictionary to be saved as a `config.txt` file, since they may need to be loaded by the evaluation/prediction script.

In [None]:
# Subset of the run parameters that is stored with the run as `config.txt`.
args = dict(
    window_size=window_size,
    gamma=gamma,
    batch_size=batch_size,
)

Make sure the proper container (to draw data from) is mounted.

In [None]:
# DBFS location where the datasets container is expected to be mounted.
_MOUNT_POINT = "/mnt/datasets"

# Mount the container only if it is not already among the current mounts.
mnt_exists = any(m.mountPoint == _MOUNT_POINT for m in dbutils.fs.mounts())

if mnt_exists:
    print("Mount already exists.")
else:
    # Storage account and container to mount
    account_name = "canopuslake"
    container = "datasets"

    # Service-principal credentials are read from the secret scope, never hard-coded
    client_secret = dbutils.secrets.get(scope="vault_scope", key="dbricks-to-lake-secret")
    client_id = dbutils.secrets.get(scope="vault_scope", key="dbricks-to-lake-client-ID")
    tenant_id = dbutils.secrets.get(scope="vault_scope", key="dbricks-to-lake-tenant-ID")

    # OAuth client-credentials configuration for the ABFS driver
    configs = {
        "fs.azure.account.auth.type": "OAuth",
        "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
        "fs.azure.account.oauth2.client.id": client_id,
        "fs.azure.account.oauth2.client.secret": client_secret,
        "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    }

    # Mount the storage container at the mount point
    dbutils.fs.mount(
        source=f"abfss://{container}@{account_name}.dfs.core.windows.net/",
        mount_point=_MOUNT_POINT,
        extra_configs=configs,
    )

Mount already exists.


Finally, run the training script.

In [None]:
# Timestamp-based name so every run is distinguishable in the MLflow UI.
# (Named `run_name` rather than `id` so the builtin `id()` is not shadowed.)
run_name = datetime.now().strftime("%d%m%Y_%H%M%S")

# One MLflow experiment per dataset
experiment = mlflow.set_experiment(experiment_name=f"/Experiments/{dataset}_training")
exp_id = experiment.experiment_id

with mlflow.start_run(experiment_id=exp_id, run_name=run_name):

    # --------------------------- START TRAINING -----------------------------
    # Get data from the dataset
    (x_train, _) = get_data(dataset, mode="train", start=train_start, end=train_end)

    # Cast data into tensor objects
    x_train = torch.from_numpy(x_train).float()
    n_features = x_train.shape[1]

    # We want to perform forecasting/reconstruction on all features
    out_dim = n_features
    print(f"Proceeding with forecasting and reconstruction of all {n_features} input features.")

    # Construct dataset from tensor object
    train_dataset = SlidingWindowDataset(x_train, window_size, stride)

    print("Training:")
    # Create the data loader(s); val_loader may be None (it is checked below)
    train_loader, val_loader = create_data_loader(train_dataset, batch_size,
                                                val_split, shuffle_dataset)

    # Initialize the model
    model = MTAD_GAT(
        n_features,
        window_size,
        out_dim,
        kernel_size=kernel_size,
        use_gatv2=use_gatv2,
        feat_gat_embed_dim=feat_gat_embed_dim,
        time_gat_embed_dim=time_gat_embed_dim,
        gru_n_layers=gru_n_layers,
        gru_hid_dim=gru_hid_dim,
        forecast_n_layers=fc_n_layers,
        forecast_hid_dim=fc_hid_dim,
        recon_n_layers=recon_n_layers,
        recon_hid_dim=recon_hid_dim,
        dropout=dropout,
        alpha=alpha
    )

    # Initialize the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)

    # Add a scheduler for variable learning rate:
    # the rate is multiplied by gamma_lr every step_lr epochs
    e_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_lr, gamma=gamma_lr)

    # Set the criterion for each process: forecasting & reconstruction
    forecast_criterion = nn.MSELoss()
    recon_criterion = nn.MSELoss()

    # Initialize the Handler module
    handler = Handler(
        model=model,
        optimizer=optimizer,
        scheduler=e_scheduler,
        window_size=window_size,
        n_features=n_features,
        batch_size=batch_size,
        n_epochs=epochs,
        patience=patience,
        forecast_criterion=forecast_criterion,
        recon_criterion=recon_criterion,
        use_cuda=use_cuda,
        print_every=print_every,
        gamma=gamma
    )

    # Start training
    handler.fit(train_loader, val_loader)

    # ---------------------------- END TRAINING ------------------------------

    art_uri = mlflow.get_artifact_uri()

    # Get scores for training data to be used for thresholds later on
    print("Calculating scores on training data to be used for thresholding...")
    anom_scores, _ = handler.score(loader=train_loader, details=False)
    # Also get the ones from the validation data
    if val_loader is not None:
        val_scores, _ = handler.score(loader=val_loader, details=False)
        anom_scores = np.concatenate((anom_scores, val_scores), axis=0)

    # Get threshold using the epsilon method, unless reg_level is None / "None"
    if str(reg_level).lower() != "none":

        if use_mov_av:
            # NOTE(review): if create_data_loader shuffles the loaders when
            # shuffle_dataset is True, these scores are not in temporal order
            # and an exponentially weighted moving average over them is of
            # doubtful meaning -- confirm against utils.create_data_loader.
            # EWM span is 5% of batch_size * window_size.
            smoothing_window = int(batch_size * window_size * 0.05)
            anom_scores = pd.DataFrame(anom_scores).ewm(span=smoothing_window).mean().values.flatten()

        e_thresh = find_epsilon(errors=anom_scores, reg_level=reg_level)
        update_json(art_uri, "thresholds.json", {"epsilon":e_thresh})

    # Store the configuration alongside the run
    mlflow.log_dict(args, "config.txt")

    # Scores are logged as they stand at this point (smoothed if use_mov_av was set)
    mlflow.log_dict({'anom_scores':anom_scores.tolist()}, "anom_scores.json")

    # Don't log all parameters, only some are relevant for tuning
    to_be_logged = {
        'window_size':window_size,
        'kernel_size':kernel_size,
        'gru_n_layers':gru_n_layers,
        'gru_hid_dim':gru_hid_dim,
        'batch_size':batch_size,
        'fc_n_layers':fc_n_layers,
        'fc_hid_dim':fc_hid_dim,
        'recon_n_layers':recon_n_layers,
        'recon_hid_dim':recon_hid_dim,
        'alpha':alpha,
        'gamma':gamma,
        'dropout':dropout,
    }
    # Log the whole batch in one call instead of one call per key
    mlflow.log_params(to_be_logged)

    mlflow.pytorch.log_model(
        pytorch_model=handler.model,
        artifact_path=f"{dataset}_model",
        # Registration is left disabled; uncomment the next line to enable it
        #registered_model_name=f"{dataset}_model"
    )

print("Finished.")

Proceeding with forecasting and reconstruction of all 38 input features.
Training:
The size of the dataset is: 25542 sample(s).
Reserved 2837 sample(s) for validation.
Training model for 5 epoch(s)...
[Epoch 1]
Elapsed time: 37.9s
Forecasting Loss: 0.07120,	Reconstruction Loss: 0.06059,	Total Training Loss: 0.13179.
Forecasting Loss: 0.02351,	Reconstruction Loss: 0.02822,	Total Validation Loss: 0.05173.
[Epoch 2]
Elapsed time: 37.9s
Forecasting Loss: 0.03149,	Reconstruction Loss: 0.02505,	Total Training Loss: 0.05654.
Forecasting Loss: 0.02114,	Reconstruction Loss: 0.02272,	Total Validation Loss: 0.04386.
[Epoch 3]
Elapsed time: 38.8s
Forecasting Loss: 0.02741,	Reconstruction Loss: 0.02237,	Total Training Loss: 0.04977.
Forecasting Loss: 0.02064,	Reconstruction Loss: 0.02200,	Total Validation Loss: 0.04264.
[Epoch 4]
Elapsed time: 39.2s
Forecasting Loss: 0.02551,	Reconstruction Loss: 0.02178,	Total Training Loss: 0.04729.
Forecasting Loss: 0.01951,	Reconstruction Loss: 0.02161,	Total V

  0%|          | 0/100 [00:00<?, ?it/s]  1%|          | 1/100 [00:00<00:16,  5.94it/s]  2%|▏         | 2/100 [00:00<00:16,  5.91it/s]  3%|▎         | 3/100 [00:00<00:16,  5.85it/s]  4%|▍         | 4/100 [00:00<00:16,  5.88it/s]  5%|▌         | 5/100 [00:00<00:16,  5.88it/s]  6%|▌         | 6/100 [00:01<00:16,  5.78it/s]  7%|▋         | 7/100 [00:01<00:16,  5.77it/s]  8%|▊         | 8/100 [00:01<00:15,  5.81it/s]  9%|▉         | 9/100 [00:01<00:15,  5.80it/s] 10%|█         | 10/100 [00:01<00:15,  5.82it/s] 11%|█         | 11/100 [00:01<00:15,  5.80it/s] 12%|█▏        | 12/100 [00:02<00:15,  5.80it/s] 13%|█▎        | 13/100 [00:02<00:15,  5.80it/s] 14%|█▍        | 14/100 [00:02<00:14,  5.82it/s] 15%|█▌        | 15/100 [00:02<00:14,  5.83it/s] 16%|█▌        | 16/100 [00:02<00:14,  5.82it/s] 17%|█▋        | 17/100 [00:02<00:14,  5.83it/s] 18%|█▊        | 18/100 [00:03<00:14,  5.81it/s] 19%|█▉        | 19/100 [00:03<00:13,  5.83it/s] 20%|██        | 20/100 [00:03<00:13,

Finished.
