# Setup

Feel free to change

In [1]:
%%capture
!pip install sentencepiece
# !pip install spm_train
!pip install pyyaml
!pip install tensorflow_text

In [None]:
def convert_sqlite_to_csv(inputFolder, ext, tableName):
    """ inputFolder - Folder where sqlite files are located. 
        ext - Extension of your sqlite file (eg. db, sqlite, sqlite3 etc.)
        tableName - table name from which you want to select the data.
    """
    csvWriter = csv.writer(open(inputFolder+'/output.csv', 'w', newline=''))
    for file1 in os.listdir(inputFolder):
        if file1.endswith('.'+ext):
            conn = sqlite3.connect(inputFolder+'/'+file1)
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM "+tableName)
            rows = cursor.fetchall()
            for row in rows:
                csvWriter.writerow(row)
            continue
        else:
            continue

In [5]:
# -- Base -- #
import os
import joblib
import logging
import time
import re
import io
from datetime import datetime
from tqdm import tqdm
import ipdb
from copy import deepcopy
from dataclasses import dataclass
import sys, getopt
import json
from pathlib import Path
import yaml
import shutil
import csv
import sentencepiece as spm
import tensorflow_text as text

# -- Metrics -- #
import numpy as np
import pandas as pd
import sqlite3 as sql
import tensorboard

# -- Tensorflow -- #
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model

# -- Misc Models -- #
import drain3
from gensim.models.phrases import Phrases
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.neighbors import NearestNeighbors
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.random_projection import SparseRandomProjection
from sklearn.svm import SVC

# -- Dash -- #
import dash
import dash_table
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.io as pio
import plotly.express as px
from dash import no_update
from flask_caching import Cache

Extensions

## Environmental Variables


---



In [25]:
SOURCE = '/home/' + os.environ['USER']

# -- TRANSFORMER Pipeline -- #
BATCH_SIZE = 100
EPOCHS = 1
DROPOUT_RATE = 0.1
MAX_SEQ_LEN = 200

ACTIVATION = "elu"

TRANSFORMER_LAYERS = 4
TRANSFORMER_DFF = 2000
TRANSFORMER_HEADS = 8

TRAINING = True
CONTAINER = 'core.soaesb'

In [27]:
%load_ext tensorboard

# Set up logging.
stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = SOURCE + 'logs/func/%s' % stamp
writer = tf.summary.create_file_writer(logdir)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


PermissionDeniedError: /home/jovyanlogs; Permission denied [Op:CreateSummaryFileWriter]

view graph

In [28]:
# %tensorboard --logdir /content/drive/MyDrive/Work/logs

Check if GPU is in use:

In [29]:
'''
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)
'''

'\ngpu_info = !nvidia-smi\ngpu_info = \'\n\'.join(gpu_info)\nif gpu_info.find(\'failed\') >= 0:\n  print(\'Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, \')\n  print(\'and then re-execute this cell.\')\nelse:\n  print(gpu_info)\n'

## Logging

In [30]:
logging.basicConfig(format='%(asctime)s %(levelname)s | %(message)s',
                    level=logging.INFO,
                    stream=sys.stdout)
logger = logging.getLogger(__name__)

# Define Dataset

## Define Database Functions

In [31]:
def database_builder(path: str) -> pd.DataFrame():
    logger.info('Building DataFrame ...')
    (_, _, files) = next(os.walk(path))
    sql_query = 'SELECT * FROM logs'
    data = []
    for f in files:
        if '.db' in f:
            conn = create_connection(path + f)
            d = pd.read_sql_query(sql_query, conn)
            data.append(d)
    logger.info('...complete!')
    return pd.concat(data)


def create_connection(path: str) -> sql.Connection:
    """
    Creates a database connection
    :param path: str
        path to database object
    :return sql.Connection
        a connection to the database
    """
    try:
        conn = sql.connect(path)
        logger.info('Connected to database ' + path)
        return conn
    except sql.Error as e:
        logger.warning(e)

## Define Dataset Main

In [32]:
dataset = database_builder(SOURCE + '/data/')
container_dataset = dataset[dataset['container_name'] == CONTAINER]

2021-05-20 19:44:00,536 INFO | Building DataFrame ...
2021-05-20 19:44:00,538 INFO | Connected to database /home/jovyan/data/elastic_logs.db
2021-05-20 19:44:02,146 INFO | ...complete!


# W2V Pipeline

## Pipeline Objects

### Configuration

In [100]:
def set_attributes(self, config: dict):
    try:
        config = config[self.__class__.__name__]
    except Exception as e:
        logger.warning(e)
        logger.warning('No configuration found for ' +
                       self.__class__.__name__)

    for attr in config.keys():
        setattr(self, attr, config[attr])


@dataclass
class PreprocessingGlobalConfig:
    embed_size: int = 512
    max_vocab_size: int = 2000
    buffer_size: int = 10000
    global_training: bool = True
    path: str = '/results/'

    def load(self, config):
        set_attributes(self, config)


@dataclass
class PhraseCaptureLayerConfig:
    min_count: int = 5
    threshold: float = 7
    load_model: bool = True
    save_model: bool = False
    training: bool = True
    model_name: str = 'phrase_model.joblib'

    def load(self, config):
        set_attributes(self, config)


@dataclass
class TextClusteringLayerConfig:
    load_model: bool = True
    save_model: bool = False
    training: bool = True
    model_name: str = 'template_miner.joblib'

    def load(self, config):
        set_attributes(self, config)


@dataclass
class NegativeSkipgramLayerConfig:
    window_size: int = 2
    num_neg_sampling: int = 10
    load_model: bool = True
    save_model: bool = False
    training: bool = True

    def load(self, config):
        set_attributes(self, config)


@dataclass
class W2VLayerConfig:
    epochs: int = 25
    batch_size: int = 2048
    load_model: bool = True
    save_model: bool = False
    training: bool = True
    model_name: bool = 'word2vec'

    def load(self, config):
        set_attributes(self, config)


class PreprocessingPipelineConfig:
    def __init__(self):
        self.PreprocessingGlobalConfig = PreprocessingGlobalConfig()
        self.PhraseCaptureLayerConfig = PhraseCaptureLayerConfig()
        self.TextClusteringLayerConfig = TextClusteringLayerConfig()
        self.NegativeSkipgrameLayerConfig = NegativeSkipgramLayerConfig()
        self.W2VLayerConfig = W2VLayerConfig()

    def load(self, path):
        try:
            with open(path) as f:
                preprocessing_config = yaml.load(f, Loader=yaml.FullLoader)
        except FileNotFoundError as e:
            logger.warning(e)
            return None

        self.PreprocessingGlobalConfig.load(preprocessing_config)
        self.PhraseCaptureLayerConfig.load(preprocessing_config)
        self.TextClusteringLayerConfig.load(preprocessing_config)
        self.NegativeSkipgrameLayerConfig.load(preprocessing_config)
        self.W2VLayerConfig.load(preprocessing_config)

### Tokenizer

In [None]:
# SOURCE = '/home/' + os.environ['USER'] + '/app'

# convert_sqlite_to_csv(SOURCE + '/data', 'db', 'logs')

# dummy_data = standardize_logs(dataset)
# dummy_data['log'].to_csv("demofile2.txt", header=False, sep=',', index=False)

# arr_list = ["This is a test", "Test number 1", "Please work", "lol what is this"]
# arr_df = pd.DataFrame(arr_list, columns=["logs"])

# spm.SentencePieceTrainer.train(input=SOURCE + '/assets/notebooks/demofile2.txt',
#                        model_prefix=SOURCE + '/assets/notebooks/sentencepiece_model',
#                        vocab_size=2000)

In [102]:
class Tokenizer:
    def __init__(self, model_path):
        self.sentencepiece = spm.SentencePieceProcessor()
        self.sentencepiece.load(model_path)

    @staticmethod
    def initialize_trained_model(self, src_path, model_path, vocab_size):
        spm.SentencePieceTrainer.train(input=src_path,
                                       model_prefix=model_path,
                                       vocab_size=vocab_size)

    def tokenize(self, _input):
        return self.sentencepiece.encode_as_pieces(_input)

### Generic Save Model

In [103]:
def save_model(model, path):
#     if not os.path.exists(path):
#         return

    if os.path.isfile(path):
        os.remove(path)
#     elif os.path.isdir(path):
#         shutil.rmtree(path)
#         return

    joblib.dump(model, path)

### Standardize Logs

In [104]:
def standardize_logs(logs: pd.DataFrame) -> pd.DataFrame:

    # remove timestamps
    logs['log'] = logs['log'].replace(
        to_replace=r'(?:\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}([.,]\d{3}|\s))|(?:\s{2,})',
        value=' ',
        regex=True)

    return logs

### PhraseCaptureLayer

In [91]:
class PhraseCaptureLayer(tf.keras.layers.Layer):

    def __init__(self,
                 config: PhraseCaptureLayerConfig,
                 global_config: PreprocessingGlobalConfig):

        super(PhraseCaptureLayer, self).__init__()
        self.min_count = config.min_count
        self.threshold = config.threshold
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name

        if self.load_model:
            self.phrase_model = joblib.load(SOURCE +
                                            self.path +
                                            self.model_name)
        else:
            self.phrase_model = Phrases(min_count=self.min_count,
                                        threshold=self.threshold)

    def call(self, corpus, training):

        def clean_log(log):
            log = log.lower().strip()
            return re.sub(r'\s{2,}', ' ', log)

        def reorganize_return(corpus_with_phrases):
            log_list = []
            for tokenized_log in corpus_with_phrases:
                log_list.append(' '.join(tokenized_log))
            return log_list

        split_corpus = [log.split(' ') for log in corpus['log']]

        if not training:
            self.phrase_model = self.phrase_model.freeze()
        else:
            self.phrase_model.add_vocab(split_corpus)

        if self.save_model:
            save_model(self.phrase_model, SOURCE + self.path + self.model_name)

        corpus_with_phrases = self.phrase_model.__getitem__(split_corpus)
        return reorganize_return(corpus_with_phrases)

### TextClusteringLayer

In [92]:
class TextClusteringLayer(tf.keras.layers.Layer):

    def __init__(self,
                 config: TextClusteringLayerConfig,
                 global_config: PreprocessingGlobalConfig):

        super(TextClusteringLayer, self).__init__()
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name

        if self.load_model is True:
            self.template_miner = joblib.load(SOURCE +
                                              self.path +
                                              self.model_name)
        else:
            self.template_miner = drain3.TemplateMiner()

    def call(self, corpus, training):
        if training:
            for log in corpus:
                self.template_miner.add_log_message(log)
            if self.save_model:
                save_model(self.template_miner,
                           SOURCE + self.path + self.model_name)

            for idx, log in enumerate(corpus):
                template = self.template_miner.match(log).get_template()
                corpus[idx] = template

            return [re.sub(pattern=r' +',
                           repl=' ',
                           string=cluster) for cluster in corpus]
        else:
            log_list = []
            for log in corpus:
                match_cluster = self.template_miner.match(log)
                if match_cluster is None:
                    match_cluster = self.template_miner.add_log_message(log)
                log_list.append(match_cluster)
            return [re.sub(pattern=r' +',
                           repl=' ',
                           string=cluster.get_template()) for cluster in log_list]

### NegativeSkipgramLayer

In [93]:
@dataclass
class NSLBundle:
    vocab: dict
    targets: list
    contexts: list
    labels: list


class NegativeSkipgramLayer(tf.keras.layers.Layer):

    def __init__(self,
                 config: NegativeSkipgramLayerConfig,
                 global_config: PreprocessingGlobalConfig):

        super(NegativeSkipgramLayer, self).__init__()
        self.vocab_size = 0
        self.vectorized_logs, self.corpus = [], []
        self.targets, self.contexts, self.labels = [], [], []
        self.vocab = {}
        self.embedding_dim = global_config.embed_size
        self.window_size = config.window_size
        self.load_data = config.load_model
        self.save_data = config.save_model
        self.num_neg_sampling = global_config.num_neg_sampling
        self.path = global_config.path

    def collect_vocabulary(self):
        self.vocab[0] = '<pad>'

        # --- OLD --- No longer need to fit
        # log_tokenizer.fit_on_texts(self.corpus)
        # TODO: Need to add text to seqeuence methods (Instead of  Tokenize)
        self.vectorized_logs = log_tokenizer.texts_to_sequences(self.corpus)

        # TODO: Need to add word vocabulary dictionary options
        self.vocab.update({v: k for k, v in log_tokenizer.word_index.items()})
        self.vocab_size = len(self.vocab.keys())

    def find_word_context(self):

        # Build the sampling table for vocab_size tokens.
        sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(len(self.vocab))

        for sequence in tqdm(self.vectorized_logs, position=0, leave=True):

            positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
                sequence,
                vocabulary_size=len(self.vocab),
                sampling_table=sampling_table,
                window_size=self.window_size,
                negative_samples=0)

            for target_word, context_word in positive_skip_grams:
                context_class = tf.expand_dims(
                    tf.constant([context_word], dtype='int64'), 1)

                negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                    true_classes=context_class,
                    num_true=1,
                    num_sampled=self.num_neg_sampling,
                    unique=True,
                    range_max=len(self.vocab),
                    seed=42,
                    name="negative_sampling")

                negative_sampling_candidates = tf.expand_dims(
                    negative_sampling_candidates, 1)

                context = tf.concat([context_class, negative_sampling_candidates], 0)
                label = tf.constant([1] + [0] * self.num_neg_sampling, dtype='int64')

                self.targets.append(target_word)
                self.contexts.append(context)
                self.labels.append(label)

    def call(self, corpus, training):
        if self.load_data:
            print("WTF X 2")
            try:
                self.vocab = joblib.load(SOURCE + self.path + 'vocab.joblib')
                self.targets = joblib.load(SOURCE + self.path + 'targets.joblib')
                self.contexts = joblib.load(SOURCE + self.path + 'contexts.joblib')
                self.labels = joblib.load(SOURCE + self.path + 'labels.joblib')
            except Exception as e:
                print(e)
        else:
            self.corpus = corpus
            self.collect_vocabulary()
            self.find_word_context()

            if self.save_data:
                save_model(self.vocab,
                           SOURCE + self.path + 'vocab.joblib')
                save_model(self.targets,
                           SOURCE + self.path + 'targets.joblib')
                save_model(self.contexts,
                           SOURCE + self.path + 'contexts.joblib')
                save_model(self.labels,
                           SOURCE + self.path + 'labels.joblib')

        return NSLBundle(self.vocab, self.targets, self.contexts, self.labels)

### Word2VecEmbeddingLayer

In [94]:
class Word2VecEmbeddingLayer(tf.keras.layers.Layer):

    def __init__(self,
                 config: W2VLayerConfig,
                 global_config: PreprocessingGlobalConfig):

        super(Word2VecEmbeddingLayer, self).__init__()
        self.embeddings = {}
        self.embedding_dim = global_config.embed_size
        self.buffer_size = global_config.buffer_size
        self.num_neg_sampling = global_config.num_neg_sampling
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.batch_size = config.batch_size
        self.epochs = config.epochs
        self.Optimizer = tf.keras.optimizers.Adam()
        self.path = global_config.path
        self.model_name = config.model_name

        if self.load_model:
            self.Word2Vec = load_model(SOURCE + self.path + self.model_name)
        else:
            self.Word2Vec = None

    def call(self, in_bundle, training):

        vocab = in_bundle.vocab
        targets = in_bundle.targets
        contexts = in_bundle.contexts
        labels = in_bundle.labels

        if self.Word2Vec is None:
            self.Word2Vec = Word2Vec(len(vocab.keys()), self.embedding_dim, self.num_neg_sampling)
            self.Word2Vec.compile(
                optimizer=self.Optimizer,
                loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])

        dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
        dataset = dataset.shuffle(self.buffer_size).batch(self.batch_size, drop_remainder=True)
        dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

        if training:
            self.Word2Vec.fit(dataset, epochs=self.epochs)

        weights = self.Word2Vec.get_layer('w2v_embedding').get_weights()[0]

        for word in vocab.items():
            self.embeddings.update({
                word[1]: weights[word[0]]
                })

        if self.save_model:
            if os.path.exists(SOURCE + self.path + self.model_name):
                shutil.rmtree(SOURCE + self.path + self.model_name)
            self.Word2Vec.save(SOURCE + self.path + self.model_name)
            out_v = io.open(SOURCE + self.path + 'vectors.tsv', 'w', encoding='utf-8')
            out_m = io.open(SOURCE + self.path + 'metadata.tsv', 'w', encoding='utf-8')

            for index, word in enumerate(vocab.values()):
                if index == 0:
                    continue  # skip 0, it's padding.
                vec = weights[index]
                out_v.write('\t'.join([str(x) for x in vec]) + "\n")
                out_m.write(word + "\n")
            out_v.close()
            out_m.close()

        self.Word2Vec.summary()
        return self.embeddings

### Word2VecModel

In [95]:
class Word2Vec(tf.keras.models.Model):

    def __init__(self, vocab_size, embedding_dim, num_neg_sampling):
        super(Word2Vec, self).__init__()
        self.target_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=1, # input length 1 since we are focusing on one token
            name="w2v_embedding")

        self.context_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=num_neg_sampling + 1) # window size for contextual 
            # reasoning behind the sample token
        self.dots = tf.keras.layers.Dot(axes=(3, 2))
        self.flatten = tf.keras.layers.Flatten()

    def call(self, pair):
        target, context = pair
        we = self.target_embedding(target)
        ce = self.context_embedding(context)
        dots = self.dots([ce, we])
        return self.flatten(dots)

### W2V_Pipeline

In [96]:
class W2V_Pipeline(tf.keras.Model):
    def __init__(self, config: PreprocessingPipelineConfig):
        super(W2V_Pipeline, self).__init__()

        self.PCL = PhraseCaptureLayer(config.PhraseCaptureLayerConfig,
                                      config.PreprocessingGlobalConfig)

        self.TCL = TextClusteringLayer(config.TextClusteringLayerConfig,
                                       config.PreprocessingGlobalConfig)

        self.NSL = NegativeSkipgramLayer(config.NegativeSkipgrameLayerConfig,
                                         config.PreprocessingGlobalConfig)

        self.W2V = Word2VecEmbeddingLayer(config.W2VLayerConfig,
                                          config.PreprocessingGlobalConfig)

        self.global_train = config.PreprocessingGlobalConfig.global_training
        self.PCL_train = True if self.global_train else config.PhraseCaptureLayerConfig.training  # noqa
        self.TCL_train = True if self.global_train else config.TextClusteringLayerConfig.training  # noqa
        self.NSL_train = True if self.global_train else config.NegativeSkipgrameLayerConfig.training  # noqa
        self.W2V_train = True if self.global_train else config.W2VLayerConfig.training  # noqa

    def call(self, x):
        x = standardize_logs(x)
        x = self.PCL(x, self.PCL_train)
        x = self.TCL(x, self.TCL_train)
        x = self.NSL(x, self.NSL_train)
        return self.W2V(x, self.W2V_train)

In [97]:
container_dataset.head(25)

Unnamed: 0,timestamp,container_name,log,label
9,2021-01-29T18:07:13.134Z,core.soaesb,| INFO | aging/0-SNAPSHOT | CommandExtension ...,nitf-messaging-bundle-stopped
64,2021-01-29T18:07:13.134Z,core.soaesb,| INFO | aging/0-SNAPSHOT | CommandExtension ...,nitf-messaging-bundle-stopped
65,2021-01-29T18:07:13.134Z,core.soaesb,| INFO | ev.HealthMonitor | HealthMonitor | d...,nitf-messaging-bundle-stopped
182,2021-01-29T18:07:13.134Z,core.soaesb,| INFO | aging/0-SNAPSHOT | BlueprintExtender...,nitf-messaging-bundle-stopped
369,2021-01-29T18:07:13.134Z,core.soaesb,| INFO | ev.HealthMonitor | HealthMonitor | d...,nitf-messaging-bundle-stopped
370,2021-01-29T18:07:13.134Z,core.soaesb,| INFO | b]-nio2-thread-1 | ServerUserAuthSer...,nitf-messaging-bundle-stopped
435,2021-01-29T18:07:13.134Z,core.soaesb,| INFO | b]-nio2-thread-1 | ServerUserAuthSer...,nitf-messaging-bundle-stopped
487,2021-01-29T18:07:13.134Z,core.soaesb,| INFO | aging/0-SNAPSHOT | BlueprintExtender...,nitf-messaging-bundle-stopped
537,2021-01-29T18:07:38.143Z,core.soaesb,| INFO | ev.HealthMonitor | HealthMonitor | d...,nitf-messaging-bundle-stopped
589,2021-01-29T18:07:38.143Z,core.soaesb,| INFO | ev.HealthMonitor | HealthMonitor | d...,nitf-messaging-bundle-stopped


## W2V Pipeline Main

In [98]:
# ** Preprocessing **
'''
standardize_logs
'''

# ** Model **
# 1.
# LogTokenEmbedder
'''
Seq = [PCL
       TCL
       NSL
       GT1: W2V] -> {embedding_matrix, vocab}
'''
######

# 2.
# Transformer Stuff
'''
{log, embedding_matrix, vocab} ->
GT2: Transformer -> prediction
'''
# LOG_DIR = SOURCE + 'logs'
# metadata = os.path.join(LOG_DIR, 'metadata.tsv')
# config = projector.ProjectorConfig()

config_path = SOURCE + '/assets/notebooks/PreprocessingConfig.yaml'
preprocessing_config = PreprocessingPipelineConfig()
preprocessing_config.load(config_path)

# --- SUBWORD TOKENIZER IN WIP ---
log_tokenizer = Tokenizer(src_path=SOURCE + '/assets/notebooks/demofile2.txt',
                          model_path=SOURCE + '/assets/notebooks/sentencepiece_model',
                          num_words=preprocessing_config.PreprocessingGlobalConfig.max_vocab_size)
w2vp = W2V_Pipeline(preprocessing_config)
embed_weights = w2vp(container_dataset)

2021-05-20 20:15:29,390 INFO | Starting Drain3 template miner
2021-05-20 20:15:29,391 INFO | Loading configuration from drain3.ini
2021-05-20 20:15:31,099 INFO | collecting all words and their counts
2021-05-20 20:15:31,100 INFO | PROGRESS: at sentence #0, processed 0 words and 0 word types


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  logs['log'] = logs['log'].replace(


2021-05-20 20:15:31,740 INFO | PROGRESS: at sentence #10000, processed 282290 words and 7919 word types
2021-05-20 20:15:32,245 INFO | PROGRESS: at sentence #20000, processed 565850 words and 12007 word types
2021-05-20 20:15:32,673 INFO | collected 15999 token types (unigram + bigrams) from a corpus of 799297 words and 28226 sentences
2021-05-20 20:15:32,673 INFO | merged Phrases<15999 vocab, min_count=5, threshold=7, max_vocab_size=40000000>


100%|██████████| 28226/28226 [04:34<00:00, 102.69it/s]


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
INFO:tensorflow:Assets written to: /home/jovyan/results/word2vec/assets
2021-05-20 20:21:38,519 INFO | Assets written to: /home/jovyan/results/word2vec/assets
Model: "word2_vec_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
w2v_embedding (Embedding)    multiple                  43584     
_________________________________________________________________
embedding_8 (Embedding)      multiple                  43584     
_________________________________________________________________
dot_8 (Dot)                  multiple                  0         
_________________________________________________________________
f

## W2V Dash 

### Supporting Functions

In [None]:
def tree_parser(node, inner_list, outer_list, root_node, depth):
    d = node.key_to_child_node  # dict
    for token in list(d.keys()):
        if len(root_node.key_to_child_node.keys()) == 0:
            ret_list = []
            for row in outer_list:
                proper_len = int(row[1])
                if len(row) == proper_len+1 or len(row) + 1 == depth:
                    ret_list.append(row)
            return ret_list
        inner_list.append(token)
        child = d[token]
        if child.key_to_child_node:
            tree_parser(child, inner_list, outer_list, root_node, depth)
        else:
            d.pop(token)
            outer_list.append(inner_list)
            inner_list = ['root']
            tree_parser(root_node, inner_list, outer_list, root_node, depth)

In [None]:
def tree_to_list_parser(node):
    tree_df = []
    curr_path = []
    tree_dict = {}
    prev_root = [("root", node)]
    while len(prev_root) > 0:
        # Peek at last value
        curr_root = prev_root[-1]

        # Get the node element
        curr_node = curr_root[1].key_to_child_node

        # Follow path value if not already there
        if len(curr_path) <= 0 or curr_path[-1] != curr_root[0]:
            curr_path.append(curr_root[0])

        visited = False
        if curr_root[1] in tree_dict:
            visited = True
        else:
            tree_dict[curr_root[1]] = True

        # Check if value has any leaf nodes
        if not visited and len(curr_node.keys()) > 0:
            # Add those to the stack
            for nn in curr_node.items():
                prev_root.append((nn[0], nn[1]))
        else:
            # Remove previous node in the path
            prev_root.pop()

            # Record to the database if leaf
            if len(curr_node.keys()) <= 0:
                tree_df.append(deepcopy(curr_path))

            # Move back up tree
            curr_path.pop()
    return tree_df

In [None]:
def appendSpherical_np(xyz):
    ptsnew = np.hstack((xyz, np.zeros(xyz.shape)))
    xy = xyz[:, 0]**2 + xyz[:, 1]**2
    ptsnew[:, 3] = np.sqrt(xy + xyz[:, 2]**2)
    ptsnew[:, 4] = np.arctan2(np.sqrt(xy), xyz[:, 2])  # for elevation angle defined from Z-axis down
    # ptsnew[:,4] = np.arctan2(xyz[:,2], np.sqrt(xy)) # for elevation angle defined from XY-plane up
    ptsnew[:, 5] = np.arctan2(xyz[:, 1], xyz[:, 0])
    return ptsnew

In [None]:
def get_spherical_coords(xyz):
    sph = np.zeros(shape=xyz.shape)
    xy = xyz[:, 0]**2 + xyz[:, 1]**2
    sph[:, 0] = np.sqrt(xy + xyz[:, 2]**2)
    sph[:, 1] = np.arctan2(np.sqrt(xy), xyz[:, 2])
    sph[:, 2] = np.arctan2(xyz[:, 1], xyz[:, 0])
    return sph

The output of the W2V pipeline is a matrix of size [vocab size x embedding size] 

### Environmental Variables

In [None]:
# -- W2V Dash Environmental Variables -- #

W2V_NEIGHBORS = 20
RECURSION_LIMIT = 10**6
N_PROJ_DIM = 3
DASH_SEED = 0

### Generate Projection Data

In [None]:
# -- Generate Data for Word Embeddings Projector -- #

# shape = vocab size x embedding dim size
weights = np.ndarray(shape=(len(embed_weights), w2v_config["embed_size"]))

# -- Populate Matrix for PCA -- #
for idx, weight in enumerate(list(embed_weights.values())):
    weights[idx, :] = weight

# -- Dimensionality Reduction -- #
pca = PCA(n_components=N_PROJ_DIM, random_state=DASH_SEED).fit(weights)
ica = FastICA(n_components=N_PROJ_DIM, random_state=DASH_SEED).fit(weights)
srp = SparseRandomProjection(n_components=N_PROJ_DIM, random_state=DASH_SEED).fit(weights)
reduced_embeddings = pca.transform(weights)

# -- Calculate Nearest Neighbors -- #
model = NearestNeighbors(n_neighbors=W2V_NEIGHBORS, algorithm='auto')
trained_embeddings = model.fit(reduced_embeddings)

# Currently the array has a shape of vocab size x N_PROJ_DIM and contains
# the fitted PCA data. We need to add the vocab in the first column so
# we know which vectors are represented.
scatter_plot_3d_cols = ['token', 'x1', 'x2', 'x3']
embedding_vocab_arr = np.array(list(embed_weights.keys()))
embedding_vocab_arr = np.expand_dims(embedding_vocab_arr, 1)
named_reduced_embeddings = np.hstack((embedding_vocab_arr, reduced_embeddings))
scatter_plot_3d_df = pd.DataFrame(
    data=named_reduced_embeddings,
    columns=scatter_plot_3d_cols)
scatter_plot_3d_df['x1'] = pd.to_numeric(scatter_plot_3d_df['x1'])
scatter_plot_3d_df['x2'] = pd.to_numeric(scatter_plot_3d_df['x2'])
scatter_plot_3d_df['x3'] = pd.to_numeric(scatter_plot_3d_df['x3'])

We will build our plot using the tree_parser function. This function recursively
steps through the drain3.TemplateMiner.drain.Node structure of our 
**TextClusteringLayer** (TCL). The recursion populates a np.array which is then used
to build a pandas dataframe which the plotly treemap accepts. There is a column
appended to the tail of the dataframe which counts the number of stars 
(wild card masks) present in the row. This is used to define the colors shown.

### Generate Treemap Data

In [None]:
# By default python's recursion limit is 10**4 which is too small for our needs
sys.setrecursionlimit(RECURSION_LIMIT)

# The root node is the master node of the tree and will be our return point
root_node = deepcopy(w2vp.TCL.template_miner.drain.root_node)
parsed_tree = tree_to_list_parser(root_node)
parsed_tree_df = pd.DataFrame(data=parsed_tree)

# The returned dataframe has generic columns so we will provide custom labels
n_cols = len(parsed_tree_df.columns)
col_name_list = []
for idx in range(n_cols):
    col_name_list.append('level' + str(idx))
parsed_tree_df.columns = col_name_list

'''
Without a color column our treemap would just be plain. We thought that taking
the sum of the drain mask would be an interesting way to color the treemap.
This lambda function will sum those values in each row and return them to a new
columnn named 'sum'
'''
parsed_tree_df['sum'] = parsed_tree_df.apply(lambda x: x.str.contains('<*>'), axis=1).sum(axis=1)  # noqa

### Dash Variables

In [None]:
pio.templates.default = "plotly_dark"
external_stylesheets_url = 'https://drive.google.com/uc?export=view&id=19OXGQ5iJIjRZD4VEZ-xiVChDmj0-SlSF'  # noqa
external_stylesheets = [external_stylesheets_url]

CACHE_CONFIG = dict()
CACHE_CONFIG['CACHE_TYPE'] = 'filesystem'
CACHE_CONFIG['CACHE_DIR'] = SOURCE + '/results/dash_cache'

### Colors

In [None]:
color_d = dict()
color_d['blue'] = 'rgb(66, 133, 244)'
color_d['red'] = 'rgb(219, 68, 55)'
color_d['yellow'] = 'rgb(244, 180, 0)'
color_d['orange'] = 'rgb(255, 165, 0)'
color_d['green'] = 'rgb(15, 157, 88)'
color_d['mint'] = 'rgb(3, 218, 198)'
color_d['dark mint'] = 'rgb(1, 135, 134)'
color_d['dark purple'] = 'rgb(55, 0, 179)'
color_d['purple'] = 'rgb(98, 0, 238)'

### Dash Formatting

In [None]:
# ================= #
#  3d Scatter Plot  #
# ================= #

# Line formatting
scatter_plot_3d_line = dict()
scatter_plot_3d_line['width'] = 2
scatter_plot_3d_line['color'] = color_d['dark mint']

scatter_plot_3d_selected_line = dict()
scatter_plot_3d_selected_line['width'] = 2
scatter_plot_3d_selected_line['color'] = color_d['dark mint']

scatter_plot_3d_nonselected_line = dict()
scatter_plot_3d_nonselected_line['width'] = 2
scatter_plot_3d_nonselected_line['color'] = color_d['dark mint']

scatter_plot_3d_darker_line = dict()
scatter_plot_3d_darker_line['width'] = 2
scatter_plot_3d_darker_line['color'] = color_d['dark purple']


# Marker formatting
scatter_plot_3d_marker = dict()
scatter_plot_3d_marker['size'] = 5
scatter_plot_3d_marker['line'] = scatter_plot_3d_line
scatter_plot_3d_marker['color'] = color_d['mint']

scatter_plot_3d_selected_marker = dict()
scatter_plot_3d_selected_marker['size'] = 5
scatter_plot_3d_selected_marker['color'] = color_d['mint']
scatter_plot_3d_selected_marker['line'] = scatter_plot_3d_selected_line

scatter_plot_3d_nonselected_marker = dict()
scatter_plot_3d_nonselected_marker['size'] = 5
scatter_plot_3d_nonselected_marker['color'] = color_d['mint']
scatter_plot_3d_nonselected_marker['opacity'] = 0.15
scatter_plot_3d_nonselected_marker['line'] = scatter_plot_3d_nonselected_line

scatter_plot_3d_marker_no_color = dict()
scatter_plot_3d_marker_no_color['size'] = 5
scatter_plot_3d_marker_no_color['line'] = scatter_plot_3d_darker_line

scatter_plot_3d_marker_cluster_center = dict()
scatter_plot_3d_marker_cluster_center['size'] = 10
scatter_plot_3d_marker_cluster_center['color'] = color_d['orange']
scatter_plot_3d_marker_cluster_center['opacity'] = 0.5
scatter_plot_3d_marker_cluster_center['line'] = scatter_plot_3d_darker_line

scatter_plot_3d_selected_table_marker = dict()
scatter_plot_3d_selected_table_marker['size'] = 5
scatter_plot_3d_selected_table_marker['color'] = color_d['yellow']
scatter_plot_3d_selected_table_marker['line'] = scatter_plot_3d_darker_line


# Style
scatter_plot_3d_style = dict()
scatter_plot_3d_style['height'] = '100%'
scatter_plot_3d_style['width'] = '100%'


# ========= #
#  Treemap  #
# ========= #

# Style
treemap_style = dict()
treemap_style['height'] = '100%'
treemap_style['width'] = '100%'


# ============ #
#  Data Table  #
# ============ #

# Style
data_table_cell_style = dict()
data_table_cell_style['textAlign'] = 'left'
data_table_cell_style['overflow'] = 'hidden'
data_table_cell_style['textOverflow'] = 'ellipsis'
data_table_cell_style['maxWidth'] = 0
data_table_cell_style['backgroundColor'] = 'rgb(20, 20, 20)'
data_table_cell_style['color'] = 'white'

data_table_header_style = dict()
data_table_header_style['backgroundColor'] = color_d['purple']


# ======== #
#  Labels  #
# ======== #

# Style
clustering_alg_drop_down_label_style = dict()
clustering_alg_drop_down_label_style['color'] = 'white'

coordinate_space_drop_down_label_style = dict()
coordinate_space_drop_down_label_style['color'] = 'white'

dim_reduction_drop_down_label_style = dict()
dim_reduction_drop_down_label_style['color'] = 'white'

### Dash Configuration

In [None]:
# ================= #
#  3d Scatter Plot  #
# ================= #
scatter_plot_3d_config = dict()
scatter_plot_3d_config['responsive'] = True


# ========= #
#  Treemap  #
# ========= #
treemap_config = dict()
treemap_config['responsive'] = True

### Dash Dropdown Options

In [None]:
clustering_alg_drop_down_options = [
    {'label': 'KNN', 'value': 'KNN'},
    {'label': 'GMM', 'value': 'GMM'},
    {'label': 'Bayesian GMM', 'value': 'BGMM'},
    {'label': 'Affinity Prop.', 'value': 'AP'},
    {'label': 'KMEANS', 'value': 'KM'},
    {'label': 'SVM', 'value': 'SVM'},
]

coordinate_space_drop_down_options = [
    {'label': 'Cartesian', 'value': 'CT'},
    {'label': 'Spherical', 'value': 'SP'}
]

dim_reduction_drop_down_options = [
    {'label': 'PCA', 'value': 'PCA'},
    {'label': 'ICA', 'value': 'ICA'},
    {'label': 'LDA', 'value': 'LDA'},
    {'label': 'Sparse RP', 'value': 'SRP'},
    {'label': 'Gaussian RP', 'value': 'GRP'}
]

### Dash Main

In [None]:
app = JupyterDash(__name__, external_stylesheets=external_stylesheets)
cache = Cache()
cache.init_app(app.server, config=CACHE_CONFIG)


# =============== #
#  Cluster Table  #
# =============== #
table = pd.DataFrame(
    data=list(embed_weights.keys()),
    columns=['token'])

# ============= #
#  Scatterplot  #
# ============= #
scatter_plot_3d_fig = px.scatter_3d(
                      scatter_plot_3d_df,
                      x='x1',
                      y='x2',
                      z='x3',
                      hover_name='token')

scatter_plot_2d_fig = px.scatter(
                     scatter_plot_3d_df,
                     x='x1',
                     y='x2',
                     hover_name='token')

scatter_plot_3d_fig.update_traces(marker=scatter_plot_3d_marker)
scatter_plot_3d_fig['layout']['uirevision'] = 1

scatter_plot_2d_fig.update_traces(marker=scatter_plot_3d_marker)
scatter_plot_2d_fig['layout']['uirevision'] = 1


# ========= #
#  Treemap  #
# ========= #
treemap_fig = px.treemap(
    parsed_tree_df,
    path=col_name_list,
    color='sum')


# ============ #
#  App Layout  #
# ============ #
app.layout = html.Div([

        html.Div([

            # -- Clustering Technique Dropdown -- #
            html.Label(
                "Clustering Algorithm (TODO)",
                style=clustering_alg_drop_down_label_style),
            dcc.Dropdown(
                id='cluster-dropdown',
                options=clustering_alg_drop_down_options,
                value='KNN'),

            # -- Coordinate Space Dropdown -- #
            html.Label(
                "Coordinate Space",
                style=coordinate_space_drop_down_label_style),
            dcc.Dropdown(
                id='coord-dropdown',
                options=coordinate_space_drop_down_options,
                value='CT'),

            # -- Dimensionality Reduction Technique Dropdown -- #
            html.Label(
                "Dimensionality Reduction (TODO)",
                style=dim_reduction_drop_down_label_style),
            dcc.Dropdown(
                id='dr-dropdown',
                options=dim_reduction_drop_down_options,
                value='PCA'
            )
        ], className='options-graph-container'),

        # -- 3d Scatter Plot -- #
        html.Div(
            [dcc.Graph(
                id='3d_scat',
                figure=scatter_plot_3d_fig,
                config=scatter_plot_3d_config,
                style=scatter_plot_3d_style),
             dcc.Slider(
                id='my-slider',
                min=0.5,
                max=0.9,
                step=0.05,
                value=0.5)],
            className='main-graph-container',
            id='graph_div'),

        # -- Tree Map -- #
        html.Div(
            dcc.Graph(
                id='3d_tree',
                figure=treemap_fig,
                config=treemap_config,
                style=treemap_style),
            className='secondary-graph-container',
            id='tree_div'),

        # -- Neighbors Datatable -- #
        html.Div(
            children=[dash_table.DataTable(
                 id='table',
                 columns=[{"name": i, "id": i} for i in table.columns],
                 data=pd.DataFrame().to_dict('records'),
                 style_cell=data_table_cell_style,
                 style_header=data_table_header_style,
             )],
            className='related-graph',
            id='data_table'),

        # signal value to trigger callbacks
        dcc.Store(id='signal')],

    id='report-container')


# ============= #
#  Memoization  #
# ============= #

# Table of Contents:
# -----------------------------
# 1. Projection DataFrame
# 2. Coordinates
# 3. Dimensionality Reductions
# 4. Clustering Algorithms
# -----------------------------

# -- 1. Projection DataFrame -- #
@cache.memoize()
def dataframe_store(embeddings):
    new_df = pd.DataFrame(
        data=embeddings,
        columns=scatter_plot_3d_cols)
    new_df['x1'] = pd.to_numeric(new_df['x1'])
    new_df['x2'] = pd.to_numeric(new_df['x2'])
    new_df['x3'] = pd.to_numeric(new_df['x3'])
    return new_df


# -- 2. Coordinates -- #
@cache.memoize()
def coordinate_space_store(value, embeddings):
    # calculate new coordinate space
    if value == 'SP':
        spherical_embeddings = get_spherical_coords(embeddings)
        embeddings_stack_tup = (embedding_vocab_arr, spherical_embeddings)
        named_embeddings = np.hstack(embeddings_stack_tup)
    elif value == "CT":
        embeddings_stack_tup = (embedding_vocab_arr, embeddings)
        named_embeddings = np.hstack(embeddings_stack_tup)
    else:
        return no_update
    return named_embeddings


# -- 3. Dimensionality Reduction -- #
@cache.memoize()
def dimension_reduct_store(value):
    # calculate new dimensionality reduction algorithm
    if value == "PCA":
        dr_embeddings = pca.transform(weights)
    elif value == "ICA":
        dr_embeddings = ica.transform(weights)
    elif value == "SRP":
        dr_embeddings = srp.transform(weights)
    else:
        return no_update
    return dr_embeddings


# -- 4. Clustering Algorithms -- #
@cache.memoize()
def clustering_algo_store(value, damp_value):
    # calculate new clustering algorithm
    if value == "KNN":
        model = NearestNeighbors(n_neighbors=W2V_NEIGHBORS, algorithm='auto')
    elif value == "AP":
        model = AffinityPropagation(damping=damp_value, random_state=DASH_SEED)
    elif value == "KM":
        model = KMeans(n_clusters=4)
    elif value == "GMM":
        model = GaussianMixture(n_components=4)
    elif value == "SVM":
        model = SVC(kernel='poly', degree=3, probability=True, random_state=DASH_SEED)
    return model


# =========== #
#  Callbacks  #
# =========== #

# -- Calculate Projection Data -- #
@app.callback(Output('signal', 'data'),
              Input('dr-dropdown', 'value'),
              Input('cluster-dropdown', 'value'),
              Input('coord-dropdown', 'value'),
              Input('my-slider', 'value'))
def compute_coordinate_space(dr_val, cluster_val, coord_val, damp_value):
    return (dr_val, cluster_val, coord_val, damp_value)


# -- Point Selection Mechanics -- #
@app.callback(Output("table", "data"),
              Output("3d_scat", "figure"),
              Input('3d_scat', 'clickData'),
              Input("signal", "data"),
              Input("table", "selected_rows"))
def select_point(clickData, value, rows):
    ctx = dash.callback_context
    ids = [c['prop_id'] for c in ctx.triggered]

    embeddings = dimension_reduct_store(value[0])
    model = clustering_algo_store(value[1], value[3])
    named_embeddings = coordinate_space_store(value[2], embeddings)
    df = dataframe_store(named_embeddings)

    clustering_model = model.fit(named_embeddings[:, 1:4].astype(float))

    if '3d_scat.clickData' in ids:
        if clickData:
            for p in clickData['points']:
                if value[1] != "KNN":
                    return no_update, no_update

                coord_list = [p['x'], p['y'], p['z']]
                query_arr = np.array(coord_list).reshape(1, -1)

                _, neighbors = clustering_model.kneighbors(X=query_arr)
                neighbors_list = neighbors.tolist()[0]
                tokens = []
                for idx in neighbors_list:
                    tokens.append(table.iloc[idx])
                update = pd.DataFrame(data=tokens)

                selected_df = df[df.index.isin(neighbors_list)]
                nonselected_df = df.drop(index=neighbors_list)

                ff = px.scatter_3d(
                    selected_df,
                    x='x1',
                    y='x2',
                    z='x3',
                    hover_name='token')

                ff = ff.update_traces(marker=scatter_plot_3d_selected_marker)

                if rows is not None:
                    table_point = selected_df[selected_df['token'] == rows]
                    ff2_1 = px.scatter_3d(
                            table_point,
                            x='x1',
                            y='x2',
                            z='x3',
                            text='token')

                    ff2_1 = ff2_1.update_traces(marker=scatter_plot_3d_selected_table_marker)
                    ff.add_trace(ff2_1.data[0])

                ff2 = px.scatter_3d(
                    nonselected_df,
                    x='x1',
                    y='x2',
                    z='x3',
                    hover_name='token')

                ff2 = ff2.update_traces(marker=scatter_plot_3d_nonselected_marker)

                ff.add_trace(ff2.data[0])
                ff['layout']['uirevision'] = 1

                return update.to_dict('records'), ff
    elif 'signal.data' in ids:
        if value[1] != "KNN":
            y_pred = clustering_model.predict(embeddings)

            df.insert(0, "Label", y_pred, True)
            ff = px.scatter_3d(
                df,
                x='x1',
                y='x2',
                z='x3',
                color='Label',
                hover_name='token')

            ff.update_traces(marker=scatter_plot_3d_marker_no_color)

            if "GMM" not in value[1]:
                centers = pd.DataFrame(data=clustering_model.cluster_centers_, columns=["x1", "x2", "x3"])
                ff2 = px.scatter_3d(
                    centers,
                    x='x1',
                    y='x2',
                    z='x3')
                ff2.update_traces(marker=scatter_plot_3d_marker_cluster_center)

                ff.add_trace(ff2.data[0])
        else:
            ff = px.scatter_3d(
                df,
                x='x1',
                y='x2',
                z='x3',
                hover_name='token')

            ff.update_traces(marker=scatter_plot_3d_marker)

        ff['layout']['uirevision'] = 1

        return no_update, ff
    else:
        return no_update, no_update


app.run_server(host='0.0.0.0', mode='jupyterlab')

# Transformer Pipeline

### Main (Initialization)

In [None]:
# -- Data Batches, Vocab, and Embedding -- #
word_embedding_matrix = joblib.load(SOURCE + "results/w2v_weights.joblib")
vocabulary = joblib.load(SOURCE + "results/vocab_dict.joblib")
dataset = database_builder(SOURCE + 'database/')
dataset = dataset.sample(frac=1).reset_index(drop=True)
vocab_size = len(vocabulary)

batched_dataset = process_all_batches()

# -- Transformer Model -- #
optimus_prime = Transformer(
    TRANSFORMER_LAYERS,
    W2V_EMBED_SIZE,
    TRANSFORMER_HEADS,
    TRANSFORMER_DFF,
    vocab_size,
    word_embedding_matrix,
    MAX_SEQ_LEN,
    DROPOUT_RATE)

# -- Labels -- #
label_unique = dataset['label'].unique()
lbp = LabelEncoder().fit(label_unique)
binary_labels = lbp.transform(label_unique)

log_labels = {}
for idx, label in enumerate(label_unique):
    log_labels.update({
        label: binary_labels[idx]
    })

# -- Model Metrics -- #
learning_rate = CustomSchedule(W2V_EMBED_SIZE)
epoch_loss = tf.keras.metrics.Mean(name='train_loss')
epoch_accuracy = tf.keras.metrics.Mean(name='train_accuracy')
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# -- Classification Step Layers -- #
add_att_layer = tf.keras.layers.AdditiveAttention()
softmax = tf.keras.layers.Softmax()

s1 = tf.keras.Sequential([
    tf.keras.layers.Dense(BATCH_SIZE, activation=ACTIVATION),
    tf.keras.layers.Dense(4, activation=ACTIVATION),
    tf.keras.layers.Softmax()
])

# -- Pipeline Info -- #
n_logs = len(dataset.index)
#n_iter = n_logs // BATCH_SIZE
n_iter = 5
remainder = n_logs % BATCH_SIZE
attns = []


# -- Checkpoints -- #
checkpoint_path = SOURCE + "checkpoints/"
checkpoint = tf.train.Checkpoint(step=tf.Variable(1), transformer=optimus_prime, optimizer=optimizer)
checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_path, max_to_keep=5)

tf.debugging.set_log_device_placement(True)

In [None]:
def process_all_batches():
    batches = []

    for idx in range(n_iter + 1):
        log_batch, labels = process_batch(dataset, vocabulary, idx, log_labels)

        batches.append((log_batch, labels))

    return batches

In [None]:
    tf.profiler.experimental.stop()
    tf.summary.trace_off()

### Main (Training)

In [None]:
for epoch in range(EPOCHS):

    start = time.time()
    epoch_loss.reset_states()
    epoch_accuracy.reset_states()
    dataset_iter = iter(batched_dataset)

    t = tqdm(range(n_iter), desc="Epoch: {:03d}, Loss: {:.3f}, Accuracy: {:.3%}".format(0, 0, 0), position=0, leave=True)
    for _ in t:
        batch = next(dataset_iter)
        log_batch = batch[0]
        labels = batch[1]

        # Returns Eager Tensor for Predictions
        tf.summary.trace_on()
        tf.profiler.experimental.start(logdir)
        with writer.as_default():
          train_step(log_batch, labels)
          # with tf.summary.record_if(True):

          tf.summary.trace_export(
            name = "training_trace",
            step=0,
            profiler_outdir=logdir
          )

        tf.profiler.experimental.stop()
        tf.summary.trace_off()

        checkpoint.step.assign_add(1)

        if int(checkpoint.step) % 10 == 0:
            save_path = checkpoint_manager.save()

        t.set_description(desc="Epoch: {:03d}, Loss: {:.3f}, Accuracy: {:.3%} ".format(epoch,
                                                                    epoch_loss.result(),
                                                                    epoch_accuracy.result()))
        t.refresh()

In [None]:
train_step_signature = [
    tf.TensorSpec(shape=([BATCH_SIZE, None]), dtype=tf.float32),
    tf.TensorSpec(shape=([BATCH_SIZE]), dtype=tf.float32)
]

@tf.function(input_signature=train_step_signature)#, experimental_compile=True)
def train_step(log_batch: tf.Tensor, 
               labels: tf.Tensor):
    transformer_input = tf.tuple([
        log_batch,  # <tf.Tensor: shape=(batch_size, max_seq_len), dtype=float32>
        labels  # <tf.Tensor: shape=(batch_size, num_classes), dtype=float32>
    ])
    with tf.GradientTape() as tape:
        Rs, _ = optimus_prime(transformer_input)
        a_s = add_att_layer([Rs, Rs])
        y = softmax(a_s * Rs)
        print(a_s.shape)
        # y = Rs
        loss = tf.py_function(loss_function, [labels, y], tf.float32)
        pred = s1(y)
        labels = tf.cast(labels, tf.int64)
    # Optimize the model
    grads = tape.gradient(loss, optimus_prime.trainable_variables)
    optimizer.apply_gradients(zip(grads, optimus_prime.trainable_variables))

    acc = accuracy_function(labels, pred)

    # Tracking Progress
    epoch_loss.update_state(loss)  # Adding Batch Loss
    epoch_accuracy.update_state(acc)

    # return loss, acc

## Metric Objects

### Loss Function

In [None]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)

### Accuracy Function

In [None]:
def accuracy_function(real, pred):
    accuracies = tf.equal(real, tf.argmax(pred, axis=1))

    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)

    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracies) / tf.reduce_sum(mask)

### Custom Learning Rate Schedule

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model: int, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

## Pipeline Objects

### ProcessBatch (NEEDS UPDATE)

In [None]:
def process_batch(dataset: pd.DataFrame,
                  vocabulary: dict,
                  idx: int,
                  labels: dict) -> tuple:
    logs = np.zeros((BATCH_SIZE, MAX_SEQ_LEN))
    y_true = np.empty((BATCH_SIZE,))

    start_window = idx * BATCH_SIZE
    end_window = (idx + 1) * BATCH_SIZE
    for log_idx, log in enumerate(dataset['log'][start_window:end_window]):
        for seq_idx, word in enumerate(log.split()):
            if seq_idx >= MAX_SEQ_LEN:
                break
            logs[log_idx, seq_idx] = vocabulary[word] if word in vocabulary.keys() else 0
        y_true[log_idx] = labels[dataset['label'][log_idx]]

    return tf.convert_to_tensor(logs, dtype=tf.float32), tf.convert_to_tensor(y_true, dtype=tf.float32)

### Transformer

In [None]:
class Transformer(tf.keras.Model):

    def __init__(self,
                 num_layers,
                 d_model,
                 num_heads,
                 dff,
                 input_vocab_size,
                 embedding_matrix,
                 max_seq_len,
                 rate=0.1):
        super(Transformer, self).__init__()

        self.d_model = d_model

        # self.embedding = tf.keras.layers.Embedding(
        #     input_vocab_size,
        #     d_model,
        #     weights=[embedding_matrix],
        #     input_length=max_seq_len,
        #     trainable=False)

        self.embedding = EmbeddingLayer(input_vocab_size, d_model, embedding_matrix, max_seq_len)

        self.pos_encoding = PositionalEncoding(max_seq_len, d_model)

        self.transformer_blocks = [TransformerBlock(
                        num_layers,
                        d_model,
                        embedding_matrix,
                        num_heads,
                        dff,
                        input_vocab_size,
                        max_seq_len,
                        rate) for _ in range(3)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, input_tuple: tf.tuple, **kwargs):
        log_batch = input_tuple[0]
        encoding_padding_mask = None # input_tuple[1]

        embedding_tensor = self.embedding(log_batch) # (batch_size, input_seq_len, d_model)
        embedding_tensor = self.pos_encoding(embedding_tensor)
        embedding_tensor = self.dropout(embedding_tensor, training=TRAINING)

        # Transformer Block #1
        # (batch_size, inp_seq_len, d_model), (batch_size, class, inp_seq_len, inp_seq_len)
        enc_output, att = self.transformer_blocks[0](embedding_tensor, encoding_padding_mask)

        # Transformer Block #2 vv (takes the place of the Decoder)
        fin_output, att = self.transformer_blocks[1](enc_output, encoding_padding_mask)

        final_output = tf.reduce_mean(fin_output, axis=1)
        final_output = tf.expand_dims(final_output, axis=0)

        print(final_output.shape)

        out, att = self.transformer_blocks[2](final_output, encoding_padding_mask)

        seq_representation = tf.reduce_mean(out, axis=1)
        return seq_representation, att

### EmbeddingLayer

In [101]:
class EmbeddingLayer(tf.keras.layers.Layer):
  def __init__(self, input_vocab_size, d_model, embedding_matrix, max_seq_len):
    self.max_seq_len = max_seq_len

    self.embedding = tf.keras.layers.Embedding(
      input_vocab_size,
      d_model,
      weights=[embedding_matrix],
      input_length=max_seq_len,
      trainable=False)

  def call(self, input):
    input_sequences = log_tokenizer.texts_to_sequences(input)

    inputs = pad_sequences(input_sequences, maxlen=self.max_seq_len, padding='post')

    embedding_tensor = self.embedding(inputs)
    embedding_tensor *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # (batch_size, input_seq_len, d_model)

    return embedding_tensor

  # adding embedding and position encoding.
  # embedding_tensor = self.embedding(log_batch, training=TRAINING)  # (batch_size, input_seq_len, d_model)
  # embedding_tensor *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # (batch_size, input_seq_len, d_model)

### PositionalEncodingLayer

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        if max_dims % 2 == 1: max_dims += 1  # max_dims must be even
        p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
        pos_emb = np.empty((1, max_steps, max_dims))
        pos_emb[0, :, ::2] = np.sin(p / 10000 ** (2 * i / max_dims)).T
        pos_emb[0, :, 1::2] = np.cos(p / 10000 ** (2 * i / max_dims)).T
        self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))

    def call(self, inputs):
        shape = tf.shape(inputs)
        return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]

### TransformerBlock

In [None]:
class TransformerBlock(tf.keras.layers.Layer):

    def __init__(self,
                 num_layers,
                 d_model,
                 embedding_matrix,
                 num_heads,
                 dff,
                 input_vocab_size,
                 max_seq_len,
                 rate=0.1):
        super(TransformerBlock, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

    def call(self, x, mask):
        attn_weights = None
        for i in range(self.num_layers):
            x, attn_weights = self.enc_layers[i](x, mask)

        return tf.convert_to_tensor(x), tf.convert_to_tensor(attn_weights)  # (batch_size, input_seq_len, d_model)

### EncoderLayer

In [None]:
class EncoderLayer(tf.keras.layers.Layer):

    def __init__(self,
                 d_model: int,
                 num_heads: int,
                 dff: int,
                 rate=0.1):
        super(EncoderLayer, self).__init__()

        self.multi_headed_attention = MultiHeadAttention(num_heads=num_heads,
                                                         key_dim=d_model // num_heads,
                                                         dropout=0.1)

        self.feed_forward_network = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation=ACTIVATION),  # (batch_size, seq_len, dff)
            tf.keras.layers.Dense(d_model, activation=ACTIVATION)  # (batch_size, seq_len, d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, mask):
        # (1) - Attention Score
        attn_output, attn_weights = self.multi_headed_attention(x,
                                                                x,
                                                                return_attention_scores=True)  # (batch_size, input_seq_len, d_model)

        # (2) - Add & Normalize
        attn_output = self.dropout1(attn_output, training=TRAINING)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        # (3) - Feed Forward NN
        feed_forward_output = self.feed_forward_network(out1)  # (batch_size, input_seq_len, d_model)

        # (4) - Add & Normalize
        feed_forward_output = self.dropout2(feed_forward_output, training=TRAINING)
        out2 = self.layernorm2(out1 + feed_forward_output)  # (batch_size, input_seq_len, d_model)

        return tf.convert_to_tensor(out2), tf.convert_to_tensor(attn_weights)