<a href="https://colab.research.google.com/gist/sdwalker62/e80f2d2e31c8830c286faf9537853e31/longruntransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!pip install plotly==4.5
import plotly
plotly.__version__



'4.5.0'

In [None]:
%%capture
!pip install drain3
!pip install jupyter-dash
!pip install -Uqq ipdb

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Feel free to change

In [None]:
# -- Base -- #
import os
import joblib
import logging
import time
import re
import io
from datetime import datetime
from tqdm import tqdm
import ipdb
from copy import deepcopy
import sys

# -- Metrics -- #
import numpy as np
import pandas as pd
import sqlite3 as sql
import tensorboard

# -- Tensorflow -- #
import tensorflow as tf
from jax import jit
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model

# -- Misc Models -- #
import drain3
from gensim.models.phrases import Phrases, Phraser
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# -- Dash -- #
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State

!cp /content/drive/MyDrive/Work/drain3.ini /content/

In [None]:
%pdb on

Extensions

In [None]:
# Enable the %tensorboard magic used by the "view graph" cell.
%load_ext tensorboard

# Create a timestamped TensorBoard log directory and a summary writer for it.
# NOTE(review): SOURCE is only assigned in the later "Environmental Variables"
# cell, so on a fresh kernel this cell raises NameError unless that cell has
# been run first. Consider moving this cell below the configuration cell.
stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = SOURCE + 'logs/func/%s' % stamp
writer = tf.summary.create_file_writer(logdir)

view graph

In [None]:
%tensorboard --logdir /content/drive/MyDrive/Work/logs

Check if GPU is in use:

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

## Environmental Variables


---



In [None]:
SOURCE = '/content/drive/MyDrive/Work/'

# -- TRANSFORMER Pipeline -- #
BATCH_SIZE = 100
EPOCHS = 1
DROPOUT_RATE = 0.1
MAX_SEQ_LEN = 200

ACTIVATION = "elu"

TRANSFORMER_LAYERS = 4
TRANSFORMER_DFF = 2000
TRANSFORMER_HEADS = 8

TRAINING = True
CONTAINER = 'core.soaesb'

# -- WORD2VEC Pipeline -- #
WINDOW_SIZE = 10
GENERATE_NEW_DRAIN = True
NUM_NEGATIVE_SAMPLING = 10
W2V_BATCH_SIZE = 2048
BUFFER_SIZE = 10000
W2V_EPOCHS = 200
W2V_EMBED_SIZE = 64
MAX_VOCAB_SIZE = 2000

## Logging

In [None]:
logging.basicConfig(format='%(asctime)s %(levelname)s | %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# Define Dataset

## Define Database Functions

In [None]:
def database_builder(path: str) -> pd.DataFrame:
    """
    Load the ``logs`` table of every SQLite database in a directory
    :param path: str
        directory that directly contains the ``*.db`` files; sub-directories
        are not searched
    :return pd.DataFrame
        row-wise concatenation of every ``logs`` table that could be read
    :raises ValueError
        if no database file in ``path`` could be opened
    """
    logger.info('Building DataFrame ...')
    # Only the top level of `path` is listed; a missing directory yields no files
    # instead of an opaque StopIteration.
    (_, _, files) = next(os.walk(path), (path, [], []))
    sql_query = 'SELECT * FROM logs'
    data = []
    for f in files:
        # Match the extension, not a substring (skips e.g. "x.db-journal").
        if not f.endswith('.db'):
            continue
        conn = create_connection(os.path.join(path, f))
        if conn is None:
            # create_connection already logged the failure; skip this file.
            continue
        try:
            data.append(pd.read_sql_query(sql_query, conn))
        finally:
            # Release the file handle even if the query fails.
            conn.close()
    if not data:
        raise ValueError('No readable .db files found in ' + path)
    logger.info('...complete!')
    return pd.concat(data)


def create_connection(path: str) -> sql.Connection:
    """
    Open a SQLite connection to the database file at ``path``
    :param path: str
        path to database object
    :return sql.Connection
        the open connection; when ``sql.connect`` raises ``sql.Error`` the
        error is logged as a warning and ``None`` is returned implicitly
    """
    try:
        connection = sql.connect(path)
        logger.info('Connected to database ' + path)
    except sql.Error as err:
        # Best effort: report the problem and fall through (returns None).
        logger.warning(err)
    else:
        return connection

## Define Dataset Main

In [None]:
# Load every log row from the SQLite files under SOURCE/database/.
dataset = database_builder(SOURCE + 'database/')
# Keep only the rows emitted by the container selected in the config cell.
# NOTE(review): in the cells visible here the W2V pipeline is later run on
# `dataset`, not on `container_dataset` - confirm which one is intended.
container_dataset = dataset[dataset['container_name']==CONTAINER]

2021-04-29 17:25:43,168 INFO | Building DataFrame ...
2021-04-29 17:25:43,191 INFO | Connected to database /content/drive/MyDrive/Work/database/elastic_logs.db
2021-04-29 17:25:44,539 INFO | ...complete!


# W2V Pipeline

## Pipeline Objects

### W2V_Pipeline

In [None]:
class W2V_Pipeline(tf.keras.Model):
    """
    End-to-end word-embedding pipeline for raw logs.

    Chains log standardisation, phrase capture (PCL), template clustering
    (TCL), negative-sampling skip-gram generation (NSL) and Word2Vec training
    (W2V); `call` returns whatever the final W2V stage returns.
    """

    def __init__(self, 
                 save_model,
                 load_model):
        super().__init__()
        self.save_model = save_model
        self.load_model = load_model

        # Every stage that persists state shares the same load/save switches.
        persistence = dict(load_model=load_model, save_model=save_model)

        # min_count=5, threshold=7 are passed positionally in that order.
        self.PCL = PhraseCaptureLayer(5, 7, **persistence)
        self.TCL = TextClusteringLayer(**persistence)
        self.NSL = NegativeSkipgramLayer(W2V_EMBED_SIZE)
        self.W2V = Word2VecEmbeddingLayer(W2V_EMBED_SIZE, **persistence)

    def call(self, x):
        """Run `x` (a DataFrame with a 'log' column) through every stage in order."""
        out = standardize_logs(x)
        for stage in (self.PCL, self.TCL, self.NSL, self.W2V):
            out = stage(out)
        return out

### Standardize Logs

In [None]:
def standardize_logs(logs: pd.DataFrame) -> pd.DataFrame:
    """
    Strip timestamps and collapse whitespace runs in the 'log' column
    :param logs: pd.DataFrame
        frame with a 'log' column of raw log lines; it is NOT modified
    :return pd.DataFrame
        a new frame whose 'log' column has every
        ``YYYY-MM-DD[ T]HH:MM:SS`` timestamp (plus trailing milliseconds or
        one whitespace character) and every run of two or more whitespace
        characters replaced by a single space
    """
    cleaned = logs['log'].replace(
        to_replace=r'(?:\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}([.,]\d{3}|\s))|(?:\s{2,})',
        value=' ',
        regex=True)

    # assign() returns a copy, so callers holding the raw frame (or a filtered
    # slice of it) are not silently changed.
    return logs.assign(log=cleaned)

### PhraseCaptureLayer

In [None]:
class PhraseCaptureLayer(tf.keras.layers.Layer):
    """
    Detects multi-word phrases in logs with a gensim ``Phrases`` model.

    :param min_count: forwarded to ``Phrases(min_count=...)``
    :param threshold: forwarded to ``Phrases(threshold=...)``
    :param load_model: load a previously saved model from
        SOURCE/results/phrase_model.joblib instead of starting empty
    :param save_model: persist the model to that same file after training
    """

    def __init__(self,
                 min_count,
                 threshold,
                 load_model = False,
                 save_model = True):

        super(PhraseCaptureLayer, self).__init__()
        self.min_count = min_count
        self.threshold = threshold
        self.load_model = load_model
        self.save_model = save_model
        
        if self.load_model:
            self.phrase_model = joblib.load(SOURCE + '/results/phrase_model.joblib')
        else:
            self.phrase_model = Phrases(min_count=self.min_count, threshold=self.threshold)

    def call(self, corpus, training = True):
        """
        Rewrite each log so that detected phrases become single tokens.

        :param corpus: DataFrame with a 'log' column of strings
        :param training: when True the model learns from `corpus` (and is
            saved if `save_model` is set); when False a frozen copy is applied
            and the stored model is left untouched
        :return: list of strings, one per log, tokens joined by single spaces
        """
        split_corpus = [log.split(' ') for log in corpus['log']]

        if training:
            self.phrase_model.add_vocab(split_corpus)
            if self.save_model:
                joblib.dump(self.phrase_model, SOURCE + '/results/phrase_model.joblib')
            phrase_model = self.phrase_model
        else:
            # A frozen model is for phrase detection only, so it must not be
            # fed to add_vocab. Keeping it in a local also leaves
            # self.phrase_model trainable for later calls.
            phrase_model = self.phrase_model.freeze()

        return [' '.join(tokenized_log) for tokenized_log in phrase_model[split_corpus]]

### TextClusteringLayer

In [None]:
class TextClusteringLayer(tf.keras.layers.Layer):
    """
    Clusters logs into templates with a drain3 ``TemplateMiner``.

    :param load_model: load a previously saved miner from
        SOURCE/results/template_miner.joblib instead of creating a new one
    :param save_model: persist the miner to that same file after training
    """

    def __init__(self, 
                 load_model = False,
                 save_model = True):
        
        super(TextClusteringLayer, self).__init__()
        self.load_model = load_model
        self.save_model = save_model

        if load_model:
            self.template_miner = joblib.load(SOURCE + '/results/template_miner.joblib')
        else:
            self.template_miner = drain3.TemplateMiner()

    def call(self, corpus, training = True):
        """
        :param corpus: iterable of log strings
        :param training: when True every log is added to the miner and the
            templates of ALL clusters are returned (one per cluster); when
            False one template is returned per input log
        :return: list of template strings with runs of spaces collapsed
        """
        if training:
            for log in corpus:
                self.template_miner.add_log_message(log)
            if self.save_model:
                joblib.dump(self.template_miner, SOURCE + '/results/template_miner.joblib')
            
            print(len(self.template_miner.drain.clusters))

            templates = [cluster.get_template()
                         for cluster in self.template_miner.drain.clusters]
        else: 
            templates = []
            for log in corpus:
                match_cluster = self.template_miner.match(log)
                if match_cluster is not None:
                    templates.append(match_cluster.get_template())
                else:
                    # add_log_message returns a result dict, not a cluster
                    # object; the mined template is stored under 'template_mined'.
                    result = self.template_miner.add_log_message(log)
                    templates.append(result['template_mined'])

        return [re.sub(pattern=r' +', repl=' ', string=template)
                for template in templates]

### NegativeSkipgramLayer

In [None]:
from dataclasses import dataclass

@dataclass
class NSLBundle:
    """Container for the values NegativeSkipgramLayer.call hands to the next stage."""
    # token id -> token string; id 0 is the '<pad>' entry
    vocab: dict
    # target token id of each positive skip-gram pair
    targets: list
    # per pair: the true context id followed by the sampled negative ids
    contexts: list
    # per pair: 1 for the true context, then 0 for every negative sample
    labels: list

class NegativeSkipgramLayer(tf.keras.layers.Layer):
    """
    Builds negative-sampling skip-gram training data from a text corpus.

    Relies on the module-level `log_tokenizer` (a Keras Tokenizer) and the
    NUM_NEGATIVE_SAMPLING / SOURCE constants.

    :param embedding_dim: stored on the instance; not used by this layer itself
    :param window_size: skip-gram window passed to `skipgrams`
    :param save_data: dump vocab/targets/contexts/labels under SOURCE/results
    """

    def __init__(self,
                 embedding_dim,
                 window_size = 10,
                 save_data = True):
        
        super(NegativeSkipgramLayer, self).__init__()
        self.vocab_size = 0
        self.vectorized_logs, self.corpus = [], []
        self.targets, self.contexts, self.labels = [], [], []
        self.vocab = {}
        self.window_size = window_size
        self.embedding_dim = embedding_dim
        self.save_data = save_data

    def collect_vocabulary(self):
        """Fit the tokenizer on `self.corpus` and rebuild `self.vocab` (id -> token)."""
        log_tokenizer.fit_on_texts(self.corpus)
        self.vectorized_logs = log_tokenizer.texts_to_sequences(self.corpus)

        # Start from a fresh mapping so repeated calls do not keep stale entries.
        vocab = {0: '<pad>'}
        vocab.update({v: k for k, v in log_tokenizer.word_index.items()})
        self.vocab = vocab
        self.vocab_size = len(vocab)

    def find_word_context(self):
        """Fill targets/contexts/labels with one entry per positive skip-gram pair."""
        # Reset the accumulators: without this a second call would append a
        # duplicate copy of every training pair.
        self.targets, self.contexts, self.labels = [], [], []

        n_tokens = len(self.vocab)

        # Build the sampling table for vocab_size tokens.
        sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(n_tokens)

        # The label row is identical for every pair: one positive, then negatives.
        label = tf.constant([1] + [0] * NUM_NEGATIVE_SAMPLING, dtype='int64')

        for sequence in tqdm(self.vectorized_logs, position=0, leave=True):

            positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
                sequence,
                vocabulary_size=n_tokens,
                sampling_table=sampling_table,
                window_size=self.window_size,
                negative_samples=0)

            for target_word, context_word in positive_skip_grams:
                context_class = tf.expand_dims(
                    tf.constant([context_word], dtype='int64'), 1)

                negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                    true_classes=context_class,
                    num_true=1,
                    num_sampled=NUM_NEGATIVE_SAMPLING,
                    unique=True,
                    range_max=n_tokens,
                    seed=42,
                    name="negative_sampling")

                negative_sampling_candidates = tf.expand_dims(
                    negative_sampling_candidates, 1)

                # True context first, negatives after - matches the label layout.
                context = tf.concat([context_class, negative_sampling_candidates], 0)

                self.targets.append(target_word)
                self.contexts.append(context)
                self.labels.append(label)

    def call(self, corpus, training = True):
        """
        :param corpus: iterable of text strings to tokenize
        :param training: accepted for Keras compatibility; not used
        :return: NSLBundle(vocab, targets, contexts, labels) built from `corpus`
        """
        self.corpus = corpus
        self.collect_vocabulary()
        self.find_word_context()

        if self.save_data:
            joblib.dump(self.vocab, SOURCE + '/results/vocab.joblib')
            joblib.dump(self.targets, SOURCE + '/results/targets.joblib')
            joblib.dump(self.contexts, SOURCE + '/results/contexts.joblib')
            joblib.dump(self.labels, SOURCE + '/results/labels.joblib')

        return NSLBundle(self.vocab, self.targets, self.contexts, self.labels)

### Word2VecEmbeddingLayer

In [None]:
class Word2VecEmbeddingLayer(tf.keras.layers.Layer):
    """
    Trains (or reloads) the Word2Vec model and exposes token -> vector embeddings.

    :param embedding_dim: width of the embedding vectors
    :param load_model: load a saved model from SOURCE/results/word2vec
    :param save_model: save the model plus vectors.tsv / metadata.tsv after fitting
    """

    def __init__(self, 
                 embedding_dim,
                 load_model = False, 
                 save_model = True):

        super(Word2VecEmbeddingLayer, self).__init__()
        self.embeddings = {}
        self.embedding_dim = embedding_dim
        self.load_model = load_model
        self.save_model = save_model
        self.Optimizer = tf.keras.optimizers.Adam()

        if load_model:
            # The boolean `load_model` parameter shadows the Keras function of
            # the same name, so the loader is reached through tf.keras.models.
            self.Word2Vec = tf.keras.models.load_model(SOURCE + '/results/word2vec')
        else:
            self.Word2Vec = None

    def call(self, in_bundle, training = True):
        """
        Fit the model on the skip-gram bundle and return the learned embeddings.

        :param in_bundle: object with `vocab` (id -> token), `targets`,
            `contexts` and `labels` attributes (see NSLBundle)
        :param training: accepted for Keras compatibility; not used
        :return: dict mapping token string -> embedding vector
        """
        vocab = in_bundle.vocab
        targets = in_bundle.targets
        contexts = in_bundle.contexts
        labels = in_bundle.labels

        if not self.load_model and self.Word2Vec is None:
            self.Word2Vec = Word2Vec(len(vocab.keys()), self.embedding_dim)
            self.Word2Vec.compile(
                optimizer=self.Optimizer,
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])

        dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
        dataset = dataset.shuffle(BUFFER_SIZE).batch(W2V_BATCH_SIZE, drop_remainder=True)
        dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

        self.Word2Vec.fit(dataset, epochs=W2V_EPOCHS)
        weights = self.Word2Vec.get_layer('w2v_embedding').get_weights()[0]

        # Row `token_id` of the embedding matrix belongs to vocab[token_id].
        self.embeddings.update(
            {word: weights[token_id] for token_id, word in vocab.items()})

        if self.save_model:
            self.Word2Vec.save(SOURCE + '/results/word2vec')
            # Context managers close both files even if a write fails.
            with io.open(SOURCE + '/results/vectors.tsv', 'w', encoding='utf-8') as out_v, \
                 io.open(SOURCE + '/results/metadata.tsv', 'w', encoding='utf-8') as out_m:
                for token_id, word in vocab.items():
                    if token_id == 0:
                        continue  # skip 0, it's padding.
                    out_v.write('\t'.join([str(x) for x in weights[token_id]]) + "\n")
                    out_m.write(word + "\n")

        self.Word2Vec.summary()
        return self.embeddings

### Word2VecModel

In [None]:
class Word2Vec(tf.keras.models.Model):
    """
    Skip-gram model with separate target and context embedding tables.

    `call` returns the flattened dot products between each context embedding
    and the target embedding; these are used as logits by the caller's loss.
    """
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        # Named layer: Word2VecEmbeddingLayer reads the trained vectors back
        # with get_layer('w2v_embedding').
        self.target_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=1, # input length 1 since we are focusing on one token
            name="w2v_embedding")

        self.context_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=NUM_NEGATIVE_SAMPLING + 1) # one true context id plus
            # NUM_NEGATIVE_SAMPLING negative samples per target
        # NOTE(review): axes=(3, 2) assumes the context embedding is rank 4 and
        # the target embedding rank 3 - confirm against the shapes of the
        # tensors actually fed in by the dataset.
        self.dots = tf.keras.layers.Dot(axes=(3, 2))
        self.flatten = tf.keras.layers.Flatten()

    def call(self, pair):
        # pair is (target ids, context ids)
        target, context = pair
        we = self.target_embedding(target)
        ce = self.context_embedding(context)
        dots = self.dots([ce, we])
        return self.flatten(dots)

## W2V Pipeline Main

In [None]:
# ** Preprocessing **
'''
standardize_logs 
'''

# ** Model **
# 1. 
# LogTokenEmbedder
'''
Seq = [PCL
       TCL 
       NSL
       GT1: W2V] -> {embedding_matrix, vocab} 
'''
######

# 2.
# Transformer Stuff 
'''
{log, embedding_matrix, vocab} ->
GT2: Transformer -> prediction 
'''
#LOG_DIR = SOURCE + 'logs'
#metadata = os.path.join(LOG_DIR, 'metadata.tsv')
#config = projector.ProjectorConfig()

# Module-level tokenizer shared by NegativeSkipgramLayer and EmbeddingLayer;
# filters='' keeps punctuation inside tokens.
log_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
# Train from scratch and persist every intermediate model under SOURCE/results.
w2vp = W2V_Pipeline(load_model = False, save_model = True)
# embed_weights is the dict (token -> vector) returned by the final W2V stage.
# NOTE(review): this runs on the full `dataset`; `container_dataset` built
# earlier is not used here - confirm which is intended.
embed_weights = w2vp(dataset)

## W2V Dash 

### W2V Dash Functions

In [None]:
def tree_parser(node, l, m, root_node):
    """
    Collect every path below `node` and return the rows of the expected length.

    The tree is walked iteratively in post-order (children before their
    parent, siblings in dict order). Each visited node contributes one path -
    `l` followed by the keys leading to that node - which is appended to `m`.
    The tree itself is left unmodified.

    :param node: start node; must expose a `key_to_child_node` dict
    :param l: path prefix (list), normally ['root']
    :param m: master list that receives every path (mutated in place)
    :param root_node: kept for interface compatibility; not used
    :return: the rows of `m` for which len(row) == int(row[1]) + 1, i.e. the
        second element is read as the expected row length minus one. Always a
        list (possibly empty).
    """
    # Explicit stack instead of recursion: depth no longer depends on the
    # interpreter's recursion limit.
    stack = [(node, list(l), iter(list(node.key_to_child_node.items())))]
    while stack:
        current, path, pending_children = stack[-1]
        entry = next(pending_children, None)
        if entry is None:
            # All children handled: record this node (the start node itself
            # has no path of its own).
            stack.pop()
            if current is not node:
                m.append(path)
            continue
        token, child = entry
        stack.append((child,
                      path + [token],
                      iter(list(child.key_to_child_node.items()))))

    return [row for row in m
            if len(row) > 1 and len(row) == int(row[1]) + 1]

### W2V Dash Main

The W2V pipeline returns a dictionary that maps each vocabulary token to its embedding vector; stacked row-wise, these vectors form a matrix of size [vocab size x embedding size].

In [None]:
# -- W2V Dash Environmental Variables -- #

W2V_NEIGHBORS = 20
RECURSION_LIMIT = 10**6
N_PROJ_DIM = 3

In [None]:
# -- Generate Data for Word Embeddings Projector -- #

from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# Stack the embedding vectors into a float matrix: vocab size x embedding dim.
weights = np.array(list(embed_weights.values()), dtype=float).reshape(
    len(embed_weights), W2V_EMBED_SIZE)

# -- Project onto N_PROJ_DIM principal components -- #
pca = PCA(n_components=N_PROJ_DIM)
reduced_embeddings = pca.fit_transform(weights)

# -- Calculate Nearest Neighbors (fitted on the numeric projection) -- #
embeddings_knn = NearestNeighbors(n_neighbors=W2V_NEIGHBORS, algorithm='auto')
embeddings_knn_trained = embeddings_knn.fit(reduced_embeddings)

# Prepend the token as the first column so every projected vector can be
# identified. Mixing strings and floats turns the whole array into strings,
# which is why the coordinates are converted back to numbers below.
embedding_vocab_arr = np.array(list(embed_weights.keys()))[:, np.newaxis]
reduced_embeddings = np.hstack((embedding_vocab_arr, reduced_embeddings))

# This dataframe will be passed to the projection plot
plot_df = pd.DataFrame(data=reduced_embeddings,
                       columns=['token', 'x1', 'x2', 'x3'])

# Ensure the coordinate columns are numerical
for coord in ('x1', 'x2', 'x3'):
    plot_df[coord] = pd.to_numeric(plot_df[coord])

We will build our plot using the tree_parser function. This function walks the
drain3 node tree (`TemplateMiner.drain.root_node`) of our
**TextClusteringLayer** (TCL) and collects every path as a list of tokens. The
resulting list of paths is used to build a pandas dataframe, which the plotly
treemap accepts. A column appended to the tail of the dataframe counts the
number of `<*>` wildcard masks present in each row. This count is used to
define the colors shown.

In [None]:
# -- Generate Data for Treemap -- #

# CPython's default recursion limit is 1000; it is raised here because the
# tree walk may recurse deeply.
sys.setrecursionlimit(RECURSION_LIMIT)

# We start by defining the head of our tree which is root. The list m is our
# master list for recording the paths in the tree. Each path is another list
# stored here.
l = ['root']
m = []

# Work on a copy of the root node so the miner's own tree is never altered.
root_node = deepcopy(w2vp.TCL.template_miner.drain.root_node)

parsed_tree = tree_parser(root_node, l, m, root_node)
parsed_tree_df = pd.DataFrame(data=parsed_tree)

# The returned dataframe has generic columns so we will provide custom labels
n_cols = len(parsed_tree_df.columns)
col_name_list = ['level' + str(idx) for idx in range(n_cols)]
parsed_tree_df.columns = col_name_list

# Without a color column our treemap would just be plain. Counting the drain
# wildcard masks per row is an interesting way to color it; the result is
# stored in a new column named 'sum'.
# regex=False is required: as a regular expression '<*>' means "zero or more
# '<' followed by '>'" and would match any cell that merely contains '>'.
parsed_tree_df['sum'] = parsed_tree_df.apply(
    lambda x: x.str.contains('<*>', regex=False), axis=1).sum(axis=1)

In [None]:
import dash
import plotly.express as px
import plotly.graph_objects as go
from dash.dependencies import Input, Output, State
import dash_core_components as dcc
import dash_html_components as html
import json
import dash_table

# Stylesheet for the app, fetched from a remote URL when the page loads.
external_stylesheets = ['https://drive.google.com/uc?export=view&id=19OXGQ5iJIjRZD4VEZ-xiVChDmj0-SlSF']

app = JupyterDash(__name__, external_stylesheets=external_stylesheets)

# Initial figure: every token plotted at its three PCA coordinates.
f = px.scatter_3d(
    plot_df,
    x='x1', 
    y='x2', 
    z='x3',
    hover_name='token',
    height=750)

f.update_traces(marker=dict(size=5, line=dict(width=2, color='DarkSlateGrey')))

# -- TreeMap -- #
# f = px.treemap(parsed_tree_df, path=col_name_list, color='sum')

# One row per token, in the same order as the rows of plot_df, so a row
# position found by the nearest-neighbour search can be mapped to a token.
table = pd.DataFrame(data=list(embed_weights.keys()), columns=['token'])

# Layout: the 3-D scatter plot next to a data table that starts out empty.
app.layout = html.Div([
        html.Div(dcc.Graph(id = '3d_scat', figure=f, style={'height': 'auto'}), className='color1', id='graph_div'),
        html.Div(className='color2', id = 'data_table',
                children=[dash_table.DataTable(
                    id='table',
                    columns=[{"name": i, "id": i} for i in table.columns],
                    data=pd.DataFrame().to_dict('records'),
                    style_cell={'textAlign': 'left', 'overflow': 'hidden',
                                'textOverflow': 'ellipsis',
                                'maxWidth': 0})])], className='container')

# style_data={'height': 'auto', 'lineHeight': '15px'}
                    # style_table={'height': 'auto'},
                    # table.to_dict('records')
global_update = None
# The 'logger' Div below is commented out, so no component with id
# 'logger_text' exists in this layout.
# html.Div(className='color3', id='logger', children=[html.P(children="this is a temporary log", id='logger_text')])

@app.callback(Output("table", "data"),
              Output("3d_scat", "figure"),
              Input('3d_scat', 'clickData'))
def select_point(clickData):
    """
    Highlight the nearest neighbours of the clicked token.

    :param clickData: Dash click payload of the '3d_scat' graph (None until
        the first click)
    :return: (table records for the neighbouring tokens, updated figure with
        the neighbours in red and all other tokens faded)
    :raises PreventUpdate: when there is no clicked point, so both outputs
        keep their current value
    """
    from dash.exceptions import PreventUpdate

    if not clickData or not clickData.get('points'):
        raise PreventUpdate

    def scatter(frame, **marker_style):
        # Same 3-D scatter configuration for both traces; only markers differ.
        figure = px.scatter_3d(
            frame,
            x='x1',
            y='x2',
            z='x3',
            hover_name='token',
            height=750)
        return figure.update_traces(marker=dict(size=5, **marker_style))

    # Only the first clicked point is used.
    p = clickData['points'][0]
    query_arr = np.array([p['x'], p['y'], p['z']]).reshape(1, -1)

    # embeddings_knn_trained was fitted on the PCA coordinates, whose row
    # order matches both plot_df and table.
    _, neighbors = embeddings_knn_trained.kneighbors(X=query_arr)
    neighbors_list = neighbors.tolist()[0]

    update = table.iloc[neighbors_list]
    new_df = plot_df[plot_df.index.isin(neighbors_list)]
    old_df = plot_df.drop(index=neighbors_list)

    ff = scatter(new_df, color='red', line=dict(width=2, color='blue'))
    ff2 = scatter(old_df, color='purple', opacity=0.2,
                  line=dict(width=2, color='orange'))
    ff.add_trace(ff2.data[0])

    return update.to_dict('records'), ff

app.run_server(mode='inline')

# Transformer Pipeline

### Main (Initialization)

In [None]:
# NOTE: this cell uses process_all_batches, process_batch, Transformer and
# CustomSchedule, which are defined in cells further down the notebook; those
# cells must have been executed before this one.

# -- Data, Vocab, and Embedding -- #
word_embedding_matrix = joblib.load(SOURCE + "results/w2v_weights.joblib")
vocabulary = joblib.load(SOURCE + "results/vocab_dict.joblib")
dataset = database_builder(SOURCE + 'database/')
# Shuffle the rows and renumber the index 0..n-1.
dataset = dataset.sample(frac=1).reset_index(drop=True)
vocab_size = len(vocabulary)

# -- Labels -- #
label_unique = dataset['label'].unique()
lbp = LabelEncoder().fit(label_unique)
binary_labels = lbp.transform(label_unique)

# raw label value -> encoded integer class
log_labels = dict(zip(label_unique, binary_labels))

# -- Pipeline Info -- #
n_logs = len(dataset.index)
#n_iter = n_logs // BATCH_SIZE
n_iter = 5
remainder = n_logs % BATCH_SIZE
attns = []

# -- Data Batches -- #
# Built only now: process_all_batches reads the globals n_iter and log_labels,
# so calling it before they are assigned fails on a fresh kernel.
batched_dataset = process_all_batches()

# -- Transformer Model -- #
optimus_prime = Transformer(
    TRANSFORMER_LAYERS,
    W2V_EMBED_SIZE,
    TRANSFORMER_HEADS,
    TRANSFORMER_DFF,
    vocab_size,
    word_embedding_matrix,
    MAX_SEQ_LEN,
    DROPOUT_RATE)

# -- Model Metrics -- #
learning_rate = CustomSchedule(W2V_EMBED_SIZE)
epoch_loss = tf.keras.metrics.Mean(name='train_loss')
epoch_accuracy = tf.keras.metrics.Mean(name='train_accuracy')
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# -- Classification Step Layers -- #
add_att_layer = tf.keras.layers.AdditiveAttention()
softmax = tf.keras.layers.Softmax()

s1 = tf.keras.Sequential([
    tf.keras.layers.Dense(BATCH_SIZE, activation=ACTIVATION),
    tf.keras.layers.Dense(4, activation=ACTIVATION),
    tf.keras.layers.Softmax()
])

# -- Checkpoints -- #
checkpoint_path = SOURCE + "checkpoints/"
checkpoint = tf.train.Checkpoint(step=tf.Variable(1), transformer=optimus_prime, optimizer=optimizer)
checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_path, max_to_keep=5)

tf.debugging.set_log_device_placement(True)

In [None]:
def process_all_batches():
    """
    Pre-compute the training batches from the module-level globals.

    Calls process_batch for batch indices 0..n_iter (inclusive, i.e.
    n_iter + 1 batches) using `dataset`, `vocabulary` and `log_labels`.

    :return: list of (log_batch, labels) tuples, one per batch index
    """
    produced = (process_batch(dataset, vocabulary, batch_idx, log_labels)
                for batch_idx in range(n_iter + 1))
    return [(log_batch, labels) for log_batch, labels in produced]

In [None]:
    # Manual cleanup cell: stops a profiler session and summary tracing that a
    # training run started but did not get to stop.
    # NOTE(review): presumably only meant to be run after an interrupted
    # training loop - confirm, and consider excluding it from "Run All".
    tf.profiler.experimental.stop()
    tf.summary.trace_off()

### Main (Training)

In [None]:
# Training driver: for every epoch, feed the first n_iter pre-computed batches
# to train_step, profile/trace each step, and checkpoint periodically.
# NOTE: train_step is defined in the next cell, which must be run first.
for epoch in range(EPOCHS):

    # `start` is recorded but not read in this cell.
    start = time.time()
    # Metrics accumulate per epoch, so clear them at the start of each one.
    epoch_loss.reset_states()
    epoch_accuracy.reset_states()
    dataset_iter = iter(batched_dataset)

    t = tqdm(range(n_iter), desc="Epoch: {:03d}, Loss: {:.3f}, Accuracy: {:.3%}".format(0, 0, 0), position=0, leave=True)
    for _ in t:
        batch = next(dataset_iter)
        log_batch = batch[0]
        labels = batch[1]

        # Trace and profile this single training step; train_step updates the
        # epoch metrics itself and returns nothing.
        tf.summary.trace_on()
        tf.profiler.experimental.start(logdir)
        with writer.as_default():
          train_step(log_batch, labels)
          # with tf.summary.record_if(True):

          # NOTE(review): step is fixed at 0, so every batch exports its trace
          # under the same step - confirm this is intended.
          tf.summary.trace_export(
            name = "training_trace",
            step=0,
            profiler_outdir=logdir
          )

        tf.profiler.experimental.stop()
        tf.summary.trace_off()
        
        checkpoint.step.assign_add(1)

        # Save a checkpoint whenever the global step counter is a multiple of 10.
        if int(checkpoint.step) % 10 == 0:
            save_path = checkpoint_manager.save()

        # Show the running epoch averages in the progress bar.
        t.set_description(desc="Epoch: {:03d}, Loss: {:.3f}, Accuracy: {:.3%} ".format(epoch,
                                                                    epoch_loss.result(),
                                                                    epoch_accuracy.result()))
        t.refresh()

In [None]:
# Fixed input signature so tf.function traces train_step once: a float32
# (BATCH_SIZE, any length) batch of token ids and a float32 (BATCH_SIZE,)
# label vector.
train_step_signature = [
    tf.TensorSpec(shape=([BATCH_SIZE, None]), dtype=tf.float32),
    tf.TensorSpec(shape=([BATCH_SIZE]), dtype=tf.float32)
]

@tf.function(input_signature=train_step_signature)#, experimental_compile=True)
def train_step(log_batch: tf.Tensor, 
               labels: tf.Tensor):
    """
    Run one optimisation step on a batch and update the epoch metrics.

    Reads the module-level optimus_prime, add_att_layer, softmax, s1,
    optimizer, epoch_loss and epoch_accuracy objects. Returns nothing.
    """
    transformer_input = tf.tuple([
        log_batch,  # <tf.Tensor: shape=(batch_size, max_seq_len), dtype=float32>
        labels  # matches the signature above: shape=(batch_size,), dtype=float32
    ])
    with tf.GradientTape() as tape:
        Rs, _ = optimus_prime(transformer_input)
        a_s = add_att_layer([Rs, Rs])
        y = softmax(a_s * Rs)
        # Debug output; inside a tf.function this runs only while tracing.
        print(a_s.shape)
        # y = Rs
        # NOTE(review): y is already a softmax output but loss_object is
        # created with from_logits=True - confirm this is intended.
        loss = tf.py_function(loss_function, [labels, y], tf.float32)
        pred = s1(y)
        labels = tf.cast(labels, tf.int64)
    # Optimize the model
    # NOTE(review): gradients are taken only w.r.t. optimus_prime's variables;
    # any variables of add_att_layer and s1 are not updated here, although
    # `pred` (used for the accuracy) comes from s1 - confirm.
    grads = tape.gradient(loss, optimus_prime.trainable_variables)
    optimizer.apply_gradients(zip(grads, optimus_prime.trainable_variables))

    acc = accuracy_function(labels, pred)

    # Tracking Progress
    epoch_loss.update_state(loss)  # Adding Batch Loss
    epoch_accuracy.update_state(acc)

    # return loss, acc

## Metric Objects

### Loss Function

In [None]:
def loss_function(real, pred):
    """
    Masked loss: `loss_object(real, pred)` weighted so that entries whose true
    value is 0 do not contribute, normalised by the number of unmasked entries.

    :param real: true labels
    :param pred: model outputs passed to the module-level `loss_object`
    :return: scalar tensor; 0 when every entry is masked
    """
    # NOTE(review): value 0 is treated as "ignore". If 0 is also a real class
    # id produced by the label encoder, that class never contributes - confirm.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # divide_no_nan returns 0 instead of NaN when the whole batch is masked.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

### Accuracy Function

In [None]:
def accuracy_function(real, pred):
    """
    Masked accuracy: fraction of entries with a non-zero true label whose
    arg-max prediction (over axis 1) equals the label.

    :param real: true labels; must have the dtype returned by tf.argmax
    :param pred: per-class scores, classes along axis 1
    :return: scalar float32 tensor; 0 when every entry is masked
    """
    accuracies = tf.equal(real, tf.argmax(pred, axis=1))

    # Entries whose true label is 0 are excluded from both numerator and
    # denominator.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)

    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    # divide_no_nan returns 0 instead of NaN when the whole batch is masked.
    return tf.math.divide_no_nan(tf.reduce_sum(accuracies), tf.reduce_sum(mask))

### Custom Learning Rate Schedule

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    Warm-up learning-rate schedule:

        lr(step) = rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)

    i.e. the rate grows linearly for `warmup_steps` steps and then decays with
    the inverse square root of the step.

    :param d_model: model width used to scale the rate
    :param warmup_steps: number of linear warm-up steps
    """

    def __init__(self, d_model: int, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        # Stored as float32 so it can be combined with the float step below.
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # The optimizer may pass an integer step counter; rsqrt needs floats.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

## Pipeline Objects

### ProcessBatch (NEEDS UPDATE)

In [None]:
def process_batch(dataset: pd.DataFrame,
                  vocabulary: dict,
                  idx: int,
                  labels: dict) -> tuple:
    """
    Vectorise batch number `idx` of the dataset.

    :param dataset: frame with 'log' (text) and 'label' columns
    :param vocabulary: token -> integer id; unknown tokens map to 0
    :param idx: zero-based batch number; rows idx*BATCH_SIZE up to
        (idx+1)*BATCH_SIZE (exclusive) are used, selected by position
    :param labels: raw label value -> numeric class
    :return: (logs, y_true) float32 tensors of shape
        (BATCH_SIZE, MAX_SEQ_LEN) and (BATCH_SIZE,). Logs longer than
        MAX_SEQ_LEN are truncated; rows past the end of the dataset stay 0.
    """
    logs = np.zeros((BATCH_SIZE, MAX_SEQ_LEN))
    # zeros, not empty: a short final batch must not carry uninitialised labels.
    y_true = np.zeros((BATCH_SIZE,))

    start_window = idx * BATCH_SIZE
    end_window = (idx + 1) * BATCH_SIZE
    window = dataset.iloc[start_window:end_window]

    # Iterate logs together with THEIR labels; indexing the label column with
    # the batch-relative position would always read the first batch's labels.
    for log_idx, (log, label) in enumerate(zip(window['log'], window['label'])):
        for seq_idx, word in enumerate(log.split()[:MAX_SEQ_LEN]):
            logs[log_idx, seq_idx] = vocabulary.get(word, 0)
        y_true[log_idx] = labels[label]

    return tf.convert_to_tensor(logs, dtype=tf.float32), tf.convert_to_tensor(y_true, dtype=tf.float32)

### Transformer

In [None]:
class Transformer(tf.keras.Model):
    """
    Encoder-style model: embedding + positional encoding + dropout, followed
    by three TransformerBlock layers. `call` returns a mean-pooled
    representation together with the attention output of the last block.
    """

    def __init__(self,
                 num_layers,
                 d_model,
                 num_heads,
                 dff,
                 input_vocab_size,
                 embedding_matrix,
                 max_seq_len,
                 rate=0.1):
        super(Transformer, self).__init__()

        self.d_model = d_model

        # self.embedding = tf.keras.layers.Embedding(
        #     input_vocab_size,
        #     d_model,
        #     weights=[embedding_matrix],
        #     input_length=max_seq_len,
        #     trainable=False)
        
        self.embedding = EmbeddingLayer(input_vocab_size, d_model, embedding_matrix, max_seq_len)

        self.pos_encoding = PositionalEncoding(max_seq_len, d_model)

        # Exactly three blocks are built regardless of num_layers; num_layers
        # is forwarded to each TransformerBlock instead.
        self.transformer_blocks = [TransformerBlock(
                        num_layers,
                        d_model,
                        embedding_matrix,
                        num_heads,
                        dff,
                        input_vocab_size,
                        max_seq_len,
                        rate) for _ in range(3)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, input_tuple: tf.tuple, **kwargs):
        # Only the first element (the log batch) is used; no padding mask is
        # applied, None is passed to every block.
        log_batch = input_tuple[0]
        encoding_padding_mask = None # input_tuple[1]
        
        embedding_tensor = self.embedding(log_batch) # (batch_size, input_seq_len, d_model)
        embedding_tensor = self.pos_encoding(embedding_tensor)
        # Dropout mode follows the module-level TRAINING flag, not a call argument.
        embedding_tensor = self.dropout(embedding_tensor, training=TRAINING)

        # Transformer Block #1
        # (batch_size, inp_seq_len, d_model), (batch_size, class, inp_seq_len, inp_seq_len)
        enc_output, att = self.transformer_blocks[0](embedding_tensor, encoding_padding_mask)

        # Transformer Block #2 vv (takes the place of the Decoder)
        fin_output, att = self.transformer_blocks[1](enc_output, encoding_padding_mask)

        # Average over axis 1, then add a leading axis of size 1 so the
        # per-log vectors are fed to the third block as one sequence.
        # NOTE(review): assumes fin_output is (batch, seq, d_model) - confirm
        # against TransformerBlock.
        final_output = tf.reduce_mean(fin_output, axis=1)
        final_output = tf.expand_dims(final_output, axis=0)

        # Debug output of the pooled shape.
        print(final_output.shape)

        out, att = self.transformer_blocks[2](final_output, encoding_padding_mask)

        seq_representation = tf.reduce_mean(out, axis=1)
        return seq_representation, att

### EmbeddingLayer

In [None]:
class EmbeddingLayer(tf.keras.layers.Layer):
  """Tokenise raw log text and look it up in a frozen embedding table.

  The layer converts its input to integer sequences with the module-level
  ``log_tokenizer``, pads them at the end to ``max_seq_len``, embeds them with
  a non-trainable ``Embedding`` initialised from ``embedding_matrix``, and
  scales the result by ``sqrt(d_model)``.

  NOTE(review): tokenisation and padding run as ordinary Python/NumPy code, so
  this presumably only works under eager execution -- confirm before wrapping
  the model in ``tf.function``.

  Args:
    input_vocab_size: number of rows of the embedding table.
    d_model: embedding width; also used for the ``sqrt(d_model)`` scaling.
    embedding_matrix: initial (frozen) weights for the embedding table.
    max_seq_len: length every tokenised sequence is padded to.
  """

  def __init__(self, input_vocab_size, d_model, embedding_matrix, max_seq_len):
    # Keras refuses attribute assignment (in particular of sub-layers) on a
    # Layer whose base initialiser has not run yet, so this must come first.
    super().__init__()

    # call() scales by sqrt(d_model); without this attribute it would raise
    # AttributeError on the first forward pass.
    self.d_model = d_model
    self.max_seq_len = max_seq_len

    self.embedding = tf.keras.layers.Embedding(
      input_vocab_size,
      d_model,
      weights=[embedding_matrix],
      input_length=max_seq_len,
      trainable=False)

  def call(self, input):
    """Return the scaled embedding tensor for a batch of log texts.

    Args:
      input: batch passed verbatim to ``log_tokenizer.texts_to_sequences``.

    Returns:
      Tensor of embeddings multiplied by ``sqrt(d_model)``;
      (batch_size, input_seq_len, d_model).
    """
    input_sequences = log_tokenizer.texts_to_sequences(input)

    inputs = pad_sequences(input_sequences, maxlen=self.max_seq_len, padding='post')

    embedding_tensor = self.embedding(inputs)
    embedding_tensor *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # (batch_size, input_seq_len, d_model)

    return embedding_tensor

  # adding embedding and position encoding.
  # embedding_tensor = self.embedding(log_batch, training=TRAINING)  # (batch_size, input_seq_len, d_model)
  # embedding_tensor *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # (batch_size, input_seq_len, d_model)

### PositionalEncodingLayer

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sine/cosine positional table to its inputs.

    The table has shape ``(1, max_steps, max_dims)``: even feature indices
    hold the sine terms and odd feature indices hold the cosine terms. It is
    stored as a constant, so the layer has no trainable weights.

    Args:
        max_steps: number of positions the table covers.
        max_dims: feature width of the table; an odd value is rounded up by one.
        dtype: dtype forwarded to the Keras base layer and used for the table.
        **kwargs: forwarded to the Keras base layer.
    """

    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)

        # Sine and cosine terms come in pairs, so the width has to be even.
        if max_dims % 2 == 1:
            max_dims += 1

        positions, pair_index = np.meshgrid(np.arange(max_steps),
                                            np.arange(max_dims // 2))
        # One angle per (position, sin/cos pair); transposed to position-major.
        angles = (positions / 10000 ** (2 * pair_index / max_dims)).T

        table = np.empty((1, max_steps, max_dims))
        table[0, :, 0::2] = np.sin(angles)
        table[0, :, 1::2] = np.cos(angles)

        self.positional_embedding = tf.constant(table.astype(self.dtype))

    def call(self, inputs):
        """Return ``inputs`` plus the table cropped to the inputs' last two axes."""
        dims = tf.shape(inputs)
        n_steps, n_features = dims[-2], dims[-1]
        return inputs + self.positional_embedding[:, :n_steps, :n_features]

### TransformerBlock

In [None]:
class TransformerBlock(tf.keras.layers.Layer):
    """A stack of ``num_layers`` EncoderLayer instances applied in sequence.

    Args:
        num_layers: how many EncoderLayer instances to stack.
        d_model: model width, forwarded to every EncoderLayer.
        embedding_matrix: accepted but not used by this class.
        num_heads: attention head count, forwarded to every EncoderLayer.
        dff: feed-forward width, forwarded to every EncoderLayer.
        input_vocab_size: accepted but not used by this class.
        max_seq_len: accepted but not used by this class.
        rate: dropout rate, forwarded to every EncoderLayer.
    """

    def __init__(self,
                 num_layers,
                 d_model,
                 embedding_matrix,
                 num_heads,
                 dff,
                 input_vocab_size,
                 max_seq_len,
                 rate=0.1):
        super(TransformerBlock, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(d_model, num_heads, dff, rate))
        self.enc_layers = stack

    def call(self, x, mask):
        """Feed ``x`` through every encoder layer in order.

        Returns:
            ``(output, attention)`` where ``attention`` comes from the last
            encoder layer in the stack; both are passed through
            ``tf.convert_to_tensor``.
        """
        hidden = x
        last_attention = None
        for encoder in self.enc_layers:
            hidden, last_attention = encoder(hidden, mask)

        # (batch_size, input_seq_len, d_model)
        output = tf.convert_to_tensor(hidden)
        attention = tf.convert_to_tensor(last_attention)
        return output, attention

### EncoderLayer

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder layer: self-attention and a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalisation.

    Args:
        d_model: model width; each attention head gets ``d_model // num_heads``
            key dimensions and the feed-forward network projects back to it.
        num_heads: number of attention heads.
        dff: hidden width of the feed-forward network.
        rate: dropout rate used for the attention probabilities and for both
            residual branches.
    """

    def __init__(self,
                 d_model: int,
                 num_heads: int,
                 dff: int,
                 rate=0.1):
        super(EncoderLayer, self).__init__()

        # Attention dropout follows `rate` instead of a hard-coded 0.1, so the
        # constructor argument controls every dropout in the layer.
        self.multi_headed_attention = MultiHeadAttention(num_heads=num_heads,
                                                         key_dim=d_model // num_heads,
                                                         dropout=rate)

        self.feed_forward_network = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation=ACTIVATION),  # (batch_size, seq_len, dff)
            tf.keras.layers.Dense(d_model, activation=ACTIVATION)  # (batch_size, seq_len, d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, mask):
        """Apply self-attention and the feed-forward network to ``x``.

        Args:
            x: input tensor; used as both query and value of the attention.
            mask: optional attention mask forwarded to ``MultiHeadAttention``
                as ``attention_mask``; ``None`` disables masking. It must
                follow the Keras convention (1 / True = may attend).
                NOTE(review): confirm that any mask built elsewhere in this
                notebook uses that convention and not "1 = padding".

        Returns:
            ``(output, attention_weights)``, both passed through
            ``tf.convert_to_tensor``.
        """
        # (1) - Attention Score
        # The mask is forwarded here; previously it was accepted but ignored.
        attn_output, attn_weights = self.multi_headed_attention(
            x,
            x,
            attention_mask=mask,
            return_attention_scores=True)  # (batch_size, input_seq_len, d_model)

        # (2) - Add & Normalize
        # The dropout mode comes from the module-level TRAINING flag.
        attn_output = self.dropout1(attn_output, training=TRAINING)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        # (3) - Feed Forward NN
        feed_forward_output = self.feed_forward_network(out1)  # (batch_size, input_seq_len, d_model)

        # (4) - Add & Normalize
        feed_forward_output = self.dropout2(feed_forward_output, training=TRAINING)
        out2 = self.layernorm2(out1 + feed_forward_output)  # (batch_size, input_seq_len, d_model)

        return tf.convert_to_tensor(out2), tf.convert_to_tensor(attn_weights)