# Setup

In [1]:
# Switch the kernel's working directory; the relative "../results/..." paths used
# further down are resolved from here.
cd /home/jovyan/assets

/home/jovyan/assets


In [2]:
# -- Base -- #
import os
import random
import joblib
import logging
import time
import re
from copy import deepcopy
from dataclasses import dataclass
import sys
import yaml
import csv
from typing import (
    List,
    Dict,
    Tuple
)
from yaspin import yaspin
from functools import partial

# -- Tokenizer -- #
import tokenizers
from tokenizers.models import WordPiece
from tokenizers import (
    Tokenizer,
    normalizers
)
from tokenizers.normalizers import (
    Lowercase,
    NFD,
    StripAccents
)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders

# -- PreTrained BERT -- #
from transformers import create_optimizer
from transformers import PreTrainedTokenizerFast
from datasets import Dataset

# -- Metrics -- #
import numpy as np
import pandas as pd
import sqlite3 as sql

# -- Tensorflow -- #
import tensorflow as tf

# -- Misc Models -- #
import drain3
from gensim.models.phrases import Phrases
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# -- Custom -- #
from libs.transformers.src.transformers.models.bert.modeling_tf_bert import TFBertForPreTraining
from libs.transformers.src.transformers.models.bert.configuration_bert import BertConfig
from libs.transformers.src.transformers.modeling_tf_utils import shape_list
from libs.transformers.src.transformers.data.data_tf_collator import TFDataCollatorForLanguageModeling

tf.__version__

2021-07-22 13:21:22.026859: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


'2.4.1'

In [3]:
def convert_sqlite_to_csv(input_folder, ext, table_name):
    """Dump one table from every SQLite file in a folder into a single CSV.

    Every entry of ``input_folder`` whose name ends with ``'.' + ext`` is
    opened as a SQLite database and all rows of ``table_name`` are appended
    (without a header row) to ``<input_folder>/output.csv``. The CSV is
    truncated at the start of each call.

    :param input_folder: folder where the sqlite files are located
    :param ext: extension of the sqlite files without the dot
        (e.g. ``db``, ``sqlite``, ``sqlite3``)
    :param table_name: table to select the data from. It is concatenated into
        the SQL text (identifiers cannot be bound as parameters), so it must
        come from a trusted source.
    """
    output_path = os.path.join(input_folder, 'output.csv')
    # The context manager guarantees the CSV is flushed and closed even when
    # one of the databases cannot be read.
    with open(output_path, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        for file_name in os.listdir(input_folder):
            if not file_name.endswith('.' + ext):
                continue
            conn = sql.connect(os.path.join(input_folder, file_name))
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT * FROM " + table_name)
                # Stream rows straight from the cursor instead of
                # materialising the whole table with fetchall().
                csv_writer.writerows(cursor)
            finally:
                conn.close()

# Extensions

In [4]:
# Turn on memory growth for every GPU device that TensorFlow lists.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

2021-07-22 13:21:24.575707: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-07-22 13:21:24.576658: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-07-22 13:21:24.624968: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-22 13:21:24.625380: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:0a:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.635GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2021-07-22 13:21:24.625395: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-07-22 13:21:24.626929: I tensorflow/stream_executor/platform/defau

In [5]:
# Sanity check: how many GPU devices TensorFlow reports.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


## Environment Variables

In [6]:
# Root directory under which data, results and config files are looked up.
# Built from the USER environment variable (KeyError when USER is not set).
SOURCE = '/home/' + os.environ['USER']
# NOTE(review): CONTAINER is not referenced by any visible cell; the dataset cell
# filters on the literal 'solr' instead - confirm which one is intended.
CONTAINER = 'core.soaesb'

## Logging

In [7]:
# Root logging configuration: DEBUG and above, written to stdout so records show
# up in the cell output, formatted as "<time> <level> | <message>".
logging.basicConfig(
    format='%(asctime)s %(levelname)s | %(message)s',
    level=logging.DEBUG,
    stream=sys.stdout
)

# Notebook-wide logger used by the helper functions and classes below.
logger = logging.getLogger(__name__)


def info_log(msg):
    """Emit ``msg`` on the notebook logger at INFO level."""
    logger.info(msg)

In [8]:
from typing import TypeVar


# Define Dataset

## Define Database Functions

In [9]:
def database_builder(path: str) -> pd.DataFrame:
    """Load the ``logs`` table of every ``*.db`` file in a directory.

    Only the top level of ``path`` is inspected (sub-directories are not
    searched). Each database is read with ``SELECT * FROM logs`` and the
    per-file frames are concatenated in directory-listing order; the original
    per-file row indices are kept.

    :param path: directory holding the SQLite files (with or without a
        trailing separator)
    :return: one DataFrame with the rows of all databases
    :raises ValueError: when no database could be read
    """
    logger.info('Building DataFrame ...')
    # A missing directory makes os.walk yield nothing; treat it as "no files".
    (_, _, files) = next(os.walk(path), (path, [], []))
    sql_query = 'SELECT * FROM logs'
    frames = []
    for file_name in files:
        # endswith (not "in") so SQLite side files such as "x.db-journal"
        # or backups like "x.db.bak" are not opened as databases.
        if not file_name.endswith('.db'):
            continue
        conn = create_connection(os.path.join(path, file_name))
        if conn is None:
            # create_connection logs a warning and returns None on failure.
            continue
        try:
            frames.append(pd.read_sql_query(sql_query, conn))
        finally:
            conn.close()
    if not frames:
        raise ValueError('No readable .db files found in ' + str(path))
    logger.info('...complete!')
    return pd.concat(frames)


def create_connection(path: str) -> sql.Connection:
    """
    Open a SQLite connection for the database file at ``path``.
    :param path: str
        path to database object
    :return sql.Connection
        the open connection; ``None`` when sqlite3 raises an error
        (the error is logged as a warning instead of being propagated)
    """
    try:
        connection = sql.connect(path)
    except sql.Error as err:
        logger.warning(err)
        return None

    logger.info('Connected to database ' + path)
    return connection

## Define Dataset Main

In [10]:
# Load every log database found under SOURCE/data/ into one frame.
dataset = database_builder(SOURCE + '/data/')
# Keep only the rows of the 'solr' container.
# NOTE(review): the CONTAINER constant defined earlier ('core.soaesb') is not used
# here - confirm whether the literal or the constant is intended.
dataset_container = dataset[dataset['container_name'] == 'solr']
# Renumber rows 0..n-1 so positional-looking lookups such as ["log"][0] work.
dataset_container.reset_index(drop=True, inplace=True)

2021-07-22 13:21:28,046 INFO | Building DataFrame ...
2021-07-22 13:21:28,047 INFO | Connected to database /home/jovyan/data/elastic_logs.db
2021-07-22 13:21:28,591 INFO | ...complete!


In [11]:
# Length in characters of the longest log message in the unfiltered dataset.
dataset["log"].str.len().max()

11642

In [12]:
# Total number of log rows in the unfiltered dataset.
len(dataset.index)

410131

# W2V Pipeline

## Pipeline Objects

### Configuration

In [13]:
def set_attributes(self, config: dict):
    """Copy one section of a parsed config onto ``self`` as attributes.

    The section is looked up under the class name of ``self`` (for example
    ``config['PhraserModelConfig']``) and every key/value pair in it is set
    with ``setattr``. When the section is missing, or is not a mapping, a
    warning is logged and ``self`` is left untouched so its defaults stay in
    effect.

    :param self: object to update (the config dataclasses pass themselves)
    :param config: parsed configuration, keyed by config class name
    """
    section_name = self.__class__.__name__
    try:
        section = config[section_name]
    except (KeyError, TypeError) as e:
        # KeyError: section missing; TypeError: config is None / not a mapping.
        logger.warning(e)
        logger.warning('No configuration found for ' + section_name)
        # Stop here: falling through would copy the *whole* config (keyed by
        # class names) onto self instead of this class's own section.
        return

    if not isinstance(section, dict):
        # e.g. a YAML key with an empty value parses to None.
        logger.warning('No configuration found for ' + section_name)
        return

    for attr, value in section.items():
        setattr(self, attr, value)


@dataclass
class UnsupervisedGlobalConfig:
    """Settings shared by the pipeline stages.

    Defaults can be overridden from the parsed config via :meth:`load`.
    """
    # NOTE(review): embed_size, max_vocab_size, buffer_size and global_training
    # are not read by any code visible in this notebook - confirm they are used.
    embed_size: int = 512
    max_vocab_size: int = 2000
    buffer_size: int = 10000
    global_training: bool = True
    # Appended to SOURCE (and followed by a model file name) by PhraserModel and
    # TextClustering to locate their joblib files.
    path: str = '/results/'

    def load(self, config):
        """Overwrite fields from the section of ``config`` named after this class."""
        set_attributes(self, config)


@dataclass
class PhraserModelConfig:
    """Settings for PhraserModel (the gensim ``Phrases`` stage)."""
    # Passed to gensim Phrases when a fresh model is created.
    min_count: int = 5
    threshold: float = 7
    # True: load the model file with joblib; False: start from a new Phrases model.
    load_model: bool = True
    # True: persist the model after its vocabulary was updated in training mode.
    save_model: bool = False
    # Default mode of PhraserModel.__call__ when no explicit `training` is given.
    training: bool = True
    # File name of the persisted model (joined as SOURCE + path + model_name).
    model_name: str = 'phrase_model.joblib'

    def load(self, config):
        """Overwrite fields from the section of ``config`` named after this class."""
        set_attributes(self, config)


@dataclass
class TextClusteringConfig:
    """Settings for TextClustering (the drain3 template-mining stage)."""
    # True: load the template miner with joblib; False: create a new TemplateMiner.
    load_model: bool = True
    # True: persist the template miner after it was fed logs in training mode.
    save_model: bool = False
    # Default mode of TextClustering.__call__ when no explicit `training` is given.
    training: bool = True
    # File name of the persisted miner (joined as SOURCE + path + model_name).
    model_name: str = 'template_miner.joblib'

    def load(self, config):
        """Overwrite fields from the section of ``config`` named after this class."""
        set_attributes(self, config)

        
@dataclass
class WordEmbeddingPreTrainingConfig:
    """Settings for BERT pretraining.

    A value of -1 marks a field that has no usable default and is expected to
    be supplied by the loaded configuration (batch_size, epochs,
    max_seq_length).
    """
    
    # BERT
    # Forwarded to BertConfig by UnsupervisedLearningPipeline.
    hidden_size: int = 512
    num_hidden_layers: int = 8
    num_attention_heads: int = 8
        
    # Save Paths
    # Relative paths: resolved against the kernel's working directory.
    pretrained_save_path: str = f"../results/PreTrainedModel/"
    checkpoint_save_path: str = f"../results/PreTrainedChkpts/"
    # Used as max_to_keep of the tf.train.CheckpointManager.
    checkpoint_max_saves: int = 3
    # A checkpoint is written whenever step % checkpoint_save_interval == 0.
    checkpoint_save_interval: int = 200
    
    # Learning rate schedule
    # Forwarded to transformers.create_optimizer.
    init_lr: float = 1e-4
    num_warmup_steps: int = 2
    
    # Optimizer
    # Forwarded to transformers.create_optimizer.
    # NOTE(review): adam_epsilon=0.1 is unusually large for Adam - confirm it is intended.
    adam_beta1: float = 0.9
    adam_beta2: float = 0.999
    adam_epsilon: float = 0.1
    weight_decay_rate: float = 0.1
    
    batch_size: int = -1
    epochs: int = -1
    max_seq_length: int = -1
    # Number of times the training tf.data.Dataset is repeated.
    repeat_size: int = 5
        
    # State 
    # NOTE(review): the four fields below are not read by any code visible in
    # this notebook - confirm they are used elsewhere.
    load_model: bool = True
    save_model: bool = False
    training: bool = True
    model_name: str = 'BERT'

    def load(self, config):
        """Overwrite fields from the section of ``config`` named after this class."""
        set_attributes(self, config)

        
class UnsupervisedPipelineConfig:
    """Container for the per-stage configuration objects of the pipeline.

    Each attribute is named after its config class; that same name is the key
    of the corresponding section in the YAML file.
    """

    def __init__(self):
        self.UnsupervisedGlobalConfig = UnsupervisedGlobalConfig()
        self.PhraserModelConfig = PhraserModelConfig()
        self.TextClusteringConfig = TextClusteringConfig()
        self.WordEmbeddingPreTrainingConfig = WordEmbeddingPreTrainingConfig()

    def load(self, path: str):
        """Override the stage defaults with the values found in a YAML file.

        A missing file is not an error: a warning is logged and every stage
        keeps its defaults.

        :param path: path of the YAML configuration file
        :return: None
        """
        try:
            with open(path) as f:
                # NOTE(review): FullLoader can construct more than plain data
                # types; prefer yaml.safe_load if the file may come from an
                # untrusted source.
                unsupervised_pipeline_config = yaml.load(f, Loader=yaml.FullLoader)
        except FileNotFoundError as e:
            logger.warning(e)
            return None

        # An empty YAML document parses to None; treat it as "no overrides"
        # instead of letting the per-section loaders fail on None.
        if unsupervised_pipeline_config is None:
            unsupervised_pipeline_config = {}

        for section in (
            self.UnsupervisedGlobalConfig,
            self.PhraserModelConfig,
            self.TextClusteringConfig,
            self.WordEmbeddingPreTrainingConfig,
        ):
            section.load(unsupervised_pipeline_config)

### Tokenizer

In [14]:
class PrimeTokenizer:
    """WordPiece tokenizer (Hugging Face ``tokenizers``) for log messages.

    Text is NFD-normalised, lower-cased and accent-stripped, split on
    whitespace, wrapped in ``[CLS] ... [SEP]`` and padded/truncated to
    ``max_seq_length``.

    :param max_seq_length: fixed length every encoding is padded/truncated to
    :param vocab_size: target vocabulary size for the WordPiece trainer
    """

    # Order matters: the trainer assigns ids to special tokens in list order,
    # so the position in this list IS the token id ([UNK]=0 ... [MASK]=4).
    SPECIAL_TOKENS = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    def __init__(self, max_seq_length: int, vocab_size: int = 153411):
        token_ids = {token: idx for idx, token in enumerate(self.SPECIAL_TOKENS)}

        self.prime_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        self.prime_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        self.prime_tokenizer.pre_tokenizer = Whitespace()
        self.prime_tokenizer.decoder = decoders.WordPiece()
        # pad_id must be given explicitly: the library default is 0, which in
        # this vocabulary is [UNK], not [PAD].
        self.prime_tokenizer.enable_padding(
            pad_id=token_ids["[PAD]"],
            pad_token="[PAD]",
            length=max_seq_length
        )
        self.prime_tokenizer.enable_truncation(max_seq_length)

        self.prime_tokenizer.post_processor = TemplateProcessing(
            single="[CLS] $A [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[
                ("[CLS]", token_ids["[CLS]"]),
                ("[SEP]", token_ids["[SEP]"]),
            ],
        )

        self.trainer = WordPieceTrainer(
            vocab_size=vocab_size,
            special_tokens=list(self.SPECIAL_TOKENS)
        )

    def text_to_sequence(self, input_) -> List[tokenizers.Encoding]:
        """Encode text.

        :param input_: a list of inputs (encoded as a batch) or a single input
        :return: a list of encodings for a list input, otherwise one encoding
        """
        if isinstance(input_, list):
            return self.prime_tokenizer.encode_batch(input_)
        return self.prime_tokenizer.encode(input_)

    def sequence_to_text(self, input_) -> List[str]:
        """Decode token ids.

        :param input_: a list passed to ``decode_batch``; anything else is
            passed to ``decode``
        :return: decoded text (a list for the batch case)
        """
        if isinstance(input_, list):
            return self.prime_tokenizer.decode_batch(input_)
        return self.prime_tokenizer.decode(input_)

    def train(self, data):
        """Train the vocabulary on an iterable of strings, then save it to disk."""
        self.prime_tokenizer.train_from_iterator(iter(data), self.trainer)
        self.save()

    def get_tokenizer(self) -> Tokenizer:
        """Return the underlying ``tokenizers.Tokenizer``."""
        return self.prime_tokenizer

    def get_vocab(self) -> Dict[str, int]:
        """Return the token -> id vocabulary mapping."""
        return self.prime_tokenizer.get_vocab()
    
    def get_vocab_size(self) -> int:
        """Return the number of entries in the vocabulary."""
        return self.prime_tokenizer.get_vocab_size()
    
    def save(self):
        """Serialise the tokenizer to SOURCE/results/prime_tokenizer.json."""
        target = SOURCE + "/results/prime_tokenizer.json"
        # Create the results directory on first use so saving cannot fail on
        # a fresh checkout.
        os.makedirs(os.path.dirname(target), exist_ok=True)
        self.prime_tokenizer.save(target)
        
    def load(self):
        """Replace the tokenizer with the one stored in SOURCE/results/prime_tokenizer.json."""
        self.prime_tokenizer = Tokenizer.from_file(SOURCE + "/results/prime_tokenizer.json")

### Generic Save Model

In [15]:
def save_model(model, path):
    """Persist ``model`` to ``path`` with joblib, replacing any existing file.

    The parent directory is created when it does not exist yet, so saving
    works on a fresh results folder.

    :param model: any object joblib can serialise
    :param path: destination file path
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Drop a stale file first so the dump always starts from a clean slate.
    if os.path.isfile(path):
        os.remove(path)

    joblib.dump(model, path)

### PhraseCaptureLayer

In [16]:
class PhraserModel:
    """Phrase-detection stage built on gensim ``Phrases``.

    Depending on ``config.load_model`` the underlying model is either loaded
    with joblib from ``SOURCE + global_config.path + config.model_name`` or
    created fresh from ``min_count`` / ``threshold``.
    """

    def __init__(self,
                 config: PhraserModelConfig,
                 global_config: UnsupervisedGlobalConfig):

        self.min_count = config.min_count
        self.threshold = config.threshold
        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name
        self.training = config.training

        if not self.load_model:
            self.phrase_model = Phrases(min_count=self.min_count,
                                        threshold=self.threshold)
        else:
            # NOTE(review): joblib.load unpickles the file; only load trusted files.
            model_file = SOURCE + self.path + self.model_name
            self.phrase_model = joblib.load(model_file)

    def __call__(self, corpus: pd.DataFrame, training=None) -> list:
        """Apply the phrase model to every message in ``corpus['log']``.

        In training mode the model vocabulary is first updated with the corpus
        (and the model saved when ``save_model`` is set); in both modes the
        corpus is then transformed.

        :param corpus: frame with a ``log`` column of strings
        :param training: overrides the configured mode when not None
        :return: one space-joined string per log message
        """
        is_training = self.training if training is None else training

        tokenized = [message.split(' ') for message in corpus['log']]

        if is_training:
            self.phrase_model.add_vocab(tokenized)
            if self.save_model:
                save_model(self.phrase_model, SOURCE + self.path + self.model_name)

        phrased = self.phrase_model[tokenized]
        return [' '.join(tokens) for tokens in phrased]

### TextClusteringLayer

In [17]:
class TextClustering:
    """Log-template mining stage built on drain3's ``TemplateMiner``.

    Depending on ``config.load_model`` the miner is either loaded with joblib
    from ``SOURCE + global_config.path + config.model_name`` or created fresh.
    """

    def __init__(self,
                 config: TextClusteringConfig,
                 global_config: UnsupervisedGlobalConfig):

        self.load_model = config.load_model
        self.save_model = config.save_model
        self.path = global_config.path
        self.model_name = config.model_name
        self.training = config.training

        # Truthiness check, consistent with PhraserModel.
        if self.load_model:
            # NOTE(review): joblib.load unpickles the file; only load trusted files.
            self.template_miner = joblib.load(SOURCE +
                                              self.path +
                                              self.model_name)
        else:
            self.template_miner = drain3.TemplateMiner()

    @staticmethod
    def _collapse_spaces(text: str) -> str:
        """Replace every run of spaces in ``text`` with a single space."""
        return re.sub(pattern=r' +', repl=' ', string=text)

    def __call__(self, corpus: list, training=None) -> list:
        """Convert log messages to their templates.

        Training mode: every message is added to the miner (which is saved
        when ``save_model`` is set), then each message is matched again so the
        result reflects the final templates. One space-collapsed template is
        returned per input message, in input order. A message the miner
        cannot match is passed through unchanged.

        Inference mode: returns the *unique* templates (unordered, spaces not
        collapsed) of the messages; a message without a matching cluster is
        added to the miner and its freshly mined template is used.

        The input list is never modified.

        :param corpus: list of log message strings
        :param training: overrides the configured mode when not None
        :return: list of template strings
        """
        if training is None:
            training = self.training
            
        if training:
            for log in corpus:
                self.template_miner.add_log_message(log)
            if self.save_model:
                save_model(self.template_miner,
                           SOURCE + self.path + self.model_name)

            # Build a new list instead of overwriting the caller's corpus.
            templates = []
            for log in corpus:
                cluster = self.template_miner.match(log)
                # match() may return None; fall back to the raw message
                # rather than failing on None.get_template().
                templates.append(log if cluster is None else cluster.get_template())

            return [self._collapse_spaces(template) for template in templates]

        log_set = set()
        for log in corpus:
            match_cluster = self.template_miner.match(log)
            if match_cluster is None:
                log_set.add(self.template_miner.add_log_message(log)['template_mined'])
            else:
                log_set.add(match_cluster.get_template())

        return list(log_set)
        
    def get_unique_templates(self) -> list:
        """Return the space-collapsed template of every cluster known to the miner."""
        return [self._collapse_spaces(cluster.get_template())
                for cluster in self.template_miner.drain.clusters]

### Preprocessing Pipeline

## BERT Model

### Custom Callback

In [18]:
# region Helper classes
class SavePretrainedCallback(tf.keras.callbacks.Callback):
    """Keras callback that writes the model with ``save_pretrained``.

    Hugging Face models have a save_pretrained() method that saves both the
    weights and the metadata needed to load them as a pretrained model later.
    This callback calls it at the end of every epoch.

    :param output_dir: directory handed to ``model.save_pretrained``
    :param model: object exposing ``save_pretrained(output_dir)``
    """

    def __init__(self, output_dir, model):
        super().__init__()
        self.output_dir = output_dir
        self.model = model

    def on_epoch_end(self, epoch=None, logs=None):
        """Save the model.

        ``epoch`` and ``logs`` are accepted (and ignored) so the callback
        works both when Keras invokes it as ``on_epoch_end(epoch, logs)`` and
        when it is called manually without arguments.
        """
        self.model.save_pretrained(self.output_dir)

### Model

In [19]:
# TODO: Figure out config/phobert config
# TODO: Add documentation LAST
# TODO: Run black formatter LAST
# TODO: Investigate class attributes
# TODO: Fix transform

from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.optimizers import Optimizer
from collections.abc import Iterable
from typing import Tuple


# Type Bounds
# Type variables bounded by the Keras base classes; used to annotate the schedule
# and optimizer handed to PhoBert.
ScheduleTypeBound = TypeVar('ScheduleTypeBound', bound=LearningRateSchedule)
OptimizerTypeBound = TypeVar('OptimizerTypeBound', bound=Optimizer)

# Type Alias
# Each alias is a pair of tensors. In PhoBert.compute_loss the pair returned is
# (filtered labels, filtered logits) for the respective head.
MLMHeadLoss = Tuple[tf.Tensor, tf.Tensor]
NSPHeadLoss = Tuple[tf.Tensor, tf.Tensor]


class PhoBert:
    """Pretrains a TFBertForPreTraining model with a custom training loop.

    PhoBert handles pretraining our contextual embeddings, the entities
    which form the input of our fine tuning model (PASCAL). Training runs
    both the masked-language-model (MLM) head and the next-sentence-prediction
    (NSP) head, writes TensorFlow checkpoints at a fixed step interval and
    calls ``save_pretrained`` after every epoch.

    :param config: A dataclass containing configuration settings loaded
        from UnsupervisedPipelineConfig.yaml
    :type config: WordEmbeddingPreTrainingConfig
    :param bert_config: architecture configuration used to build the model
    :type bert_config: BertConfig
    :param learning_rate_schedule: learning rate management (stored only;
        not read again inside this class)
    :type learning_rate_schedule: ScheduleTypeBound
    :param training_optimizer: optimizer used to apply the gradients
    :type training_optimizer: OptimizerTypeBound
    :param token_len: size the token embedding matrix is resized to
    :type token_len: int
    """
    
    def __init__(
        self, 
        config: WordEmbeddingPreTrainingConfig,
        bert_config: BertConfig,
        learning_rate_schedule: ScheduleTypeBound,
        training_optimizer: OptimizerTypeBound,
        token_len: int
    ) -> None:
        
        # State information 
        self.batch_size = config.batch_size
        self.epochs = config.epochs
        # Set by train_step to the model output of the most recent call.
        self.outputs = None
        
        self.checkpoint_max_saves = config.checkpoint_max_saves
        self.checkpoint_save_interval = config.checkpoint_save_interval
        
        # Save File Paths
        self.pretrained_path = config.pretrained_save_path
        self.checkpoint_path = config.checkpoint_save_path
        
        # One loss object is shared by the MLM and the NSP head.
        self.loss_fn = SparseCategoricalCrossentropy(from_logits=True)

        # Prepare the metrics
        self.train_acc_metric = SparseCategoricalAccuracy()
        self.val_acc_metric = SparseCategoricalAccuracy()
        
        # BERT
        self.BERT = TFBertForPreTraining(bert_config)
        self.optimizer = training_optimizer
        self.lr_schedule = learning_rate_schedule
        self.BERT.resize_token_embeddings(token_len)
        
        # Save Model Callback
        # Invoked manually at the end of each epoch in __call__.
        self.SavePretrainedModel = SavePretrainedCallback(self.pretrained_path, self.BERT)
        
        # Checkpoint System
        # Tracks optimizer and model; restores the newest checkpoint if one exists.
        self.chkpt = tf.train.Checkpoint(optimizer=training_optimizer, model=self.BERT)
        self.chkpt_manager = tf.train.CheckpointManager(
            self.chkpt, 
            self.checkpoint_path, 
            max_to_keep=self.checkpoint_max_saves
        )
        self.checkpoint_interval = self.checkpoint_save_interval
        self.chkpt.restore(self.chkpt_manager.latest_checkpoint)

        if self.chkpt_manager.latest_checkpoint:
            print("\nRestored from {}".format(self.chkpt_manager.latest_checkpoint))
        else:
            print("\nInitializing from scratch.")


    @staticmethod
    def compute_loss(labels: dict, logits: Iterable) -> Tuple[MLMHeadLoss, NSPHeadLoss]:
        """Select the label/logit entries that take part in the loss.

        Despite its name this method does not compute a loss value: for each
        head it flattens labels and logits, drops every position whose label
        equals -100 and returns the remaining (labels, logits) pair. Callers
        feed the pairs to the loss function and the accuracy metrics.

        :param labels: mapping with the keys "labels" (MLM targets) and
            "next_sentence_label" (NSP targets)
        :param logits: indexable where item 0 holds the MLM logits and item 1
            the NSP logits
        :return: ((mlm_labels, mlm_logits), (nsp_labels, nsp_logits))
        """
        
        # MLM calculation
        # Boolean mask over the flattened labels: True where the label is not -100.
        masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100)
        # Flatten the logits to (-1, size of their third axis) and keep the active rows.
        masked_lm_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])),
            mask=masked_lm_active_loss,
        )
        masked_lm_labels = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss
        )
        
        # NSP calculation
        # Same filtering for the NSP head; its logits are reshaped to two columns.
        next_sentence_active_loss = tf.not_equal(tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), -100)
        next_sentence_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=next_sentence_active_loss
        )
        next_sentence_label = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), mask=next_sentence_active_loss
        )

        return (masked_lm_labels, masked_lm_reduced_logits), (next_sentence_label, next_sentence_reduced_logits)
        
        
    @tf.function
    def train_step(self, train_set: Dataset, test_set: Dataset) -> tf.Tensor:
        """Run one optimisation step on a single batch.

        :param train_set: model inputs of the batch (the training loop passes
            the first element of each dataset item)
        :param test_set: labels of the batch (second element of each dataset
            item); indexed with "labels" and "next_sentence_label"
        :return: the summed MLM + NSP loss of the batch
        """
        
        with tf.GradientTape() as tape:
            logits = self.BERT(train_set, training=True)
            mlm, nsp = self.compute_loss(test_set, (logits["prediction_logits"], logits["seq_relationship_logits"]))
            masked_lm_loss = self.loss_fn(mlm[0], mlm[1])
            next_sentence_loss = self.loss_fn(nsp[0], nsp[1])
            loss_value = masked_lm_loss + next_sentence_loss

        grads = tape.gradient(loss_value, self.BERT.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.BERT.trainable_weights))

        # Update training metric.
        # Both heads feed the same metric object, so the reported accuracy mixes
        # MLM token predictions and NSP predictions.
        self.train_acc_metric.update_state(mlm[0], mlm[1])
        self.train_acc_metric.update_state(nsp[0], nsp[1])
        
        # NOTE(review): this assignment happens inside a tf.function; verify the
        # stored value is usable outside the traced function before relying on it.
        self.outputs = logits

        return loss_value


    @tf.function
    def test_step(self, train_set: Dataset, test_set: Dataset) -> None:
        """Evaluate one batch and update the validation accuracy metric.

        :param train_set: model inputs of the batch
        :param test_set: labels of the batch
        """
        
        val_logits = self.BERT(train_set, training=False)
        mlm, nsp = self.compute_loss(test_set, (val_logits["prediction_logits"], val_logits["seq_relationship_logits"]))
        # Update val metrics
        self.val_acc_metric.update_state(mlm[0], mlm[1])
        self.val_acc_metric.update_state(nsp[0], nsp[1])
        
        
    def __call__(self, train_set: Dataset, test_set: Dataset) -> None:
        """Training loop for BERT.

        For each of ``self.epochs`` epochs: iterate over every batch of
        ``train_set``, checkpointing and logging every ``checkpoint_interval``
        steps (including step 0); log and reset the training accuracy; run
        ``test_set`` through ``test_step``; log and reset the evaluation
        accuracy; finally save the model with ``save_pretrained``.

        With the config default ``epochs = -1`` the epoch loop body never runs.

        :param train_set: iterable of (inputs, labels) training batches
        :param test_set: iterable of (inputs, labels) evaluation batches
        """
        
        with yaspin(text="", color='blue') as sp:
            for epoch in range(self.epochs):
                sp.text = f"Epochs {epoch}/{self.epochs}"
                info_log("Start of epoch %d" % (epoch,))
                start_time = time.time()
                
                # Iterate over the batches of the dataset.
                for step, (x_batch_train, y_batch_train) in enumerate(train_set):
                    loss_value = self.train_step(x_batch_train, y_batch_train)
                    
                    # Log progress and Save checkpoint per interval amount.
                    if step % self.checkpoint_interval == 0:
                        save_path = self.chkpt_manager.save()
                        info_log("Saved checkpoint for step {}: {}".format(int(self.chkpt.save_counter), save_path))
                        info_log("Training loss (for one batch) at step %d: %.4f" % (step, float(loss_value)))
                        info_log("Seen so far: %d samples" % ((step + 1) * self.batch_size))

                # Display metrics at the end of each epoch.
                train_acc = self.train_acc_metric.result()
                info_log("Training acc over epoch: %.4f" % (float(train_acc),))

                # Reset training metrics at the end of each epoch
                self.train_acc_metric.reset_states()

                # Run an evaluation loop at the end of each epoch.
                for x_batch_test, y_batch_test in test_set:
                    self.test_step(x_batch_test, y_batch_test)
                eval_acc = self.val_acc_metric.result()
                self.val_acc_metric.reset_states()
                info_log("Testing acc: %.4f" % (float(eval_acc),))
                info_log("Time taken: %.2fs" % (time.time() - start_time))
                
                # Called directly (not through Keras fit), hence no arguments.
                self.SavePretrainedModel.on_epoch_end()
                
            sp.text = ""
            sp.ok('✔ Finished Training Epochs')

## Unsupervised Learning Pipeline

In [20]:
EncodedSeq = List[int]

def normalize_logs(logs: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``logs`` whose ``log`` column has been cleaned.

    In every log message each of the following is replaced by one space:
    a timestamp of the form ``YYYY-MM-DD[ T]HH:MM:SS`` followed by either
    milliseconds (``.mmm`` / ``,mmm``) or a whitespace character; a run of
    two or more whitespace characters; any single character that is not an
    ASCII letter, a digit or a colon. Because each match becomes its own
    space, the result can still contain consecutive spaces.

    The input frame is not modified.

    :param logs: frame with a ``log`` column of strings
    :return: deep copy of ``logs`` with the normalised ``log`` column
    """
    # remove timestamps and double spaces
    regexp = re.compile(
        r"""
        (?:               # Match all enclosed
        \d{4}-\d{2}-\d{2} # YYYY-MM-DD
        [\sT]             # Accept either a space or T
        \d{2}:\d{2}:\d{2} # HH:MM:SS
        ([.,]\d{3}|\s)    # Accept either a space or milliseconds
        )                 # End timestamp match
        | (?:\s{2,})      # Remove double spaces   
        | [^a-zA-Z\d:]    # Clean non-alphanumeric characters
        """, re.X)        # re.X enables comments and whitespace

    c_logs = logs.copy(deep=True)
    # Assign the result back instead of calling replace(inplace=True) on a
    # .loc slice: the in-place form mutates a temporary object and leaves
    # the frame untouched when pandas Copy-on-Write is active.
    c_logs['log'] = c_logs['log'].replace(
        to_replace=regexp,
        value=' ',
        regex=True
    )

    return c_logs


def extract_unique_labels(logs: pd.DataFrame) -> dict:
    """Map every distinct value of ``logs['label']`` to its LabelEncoder code.

    :param logs: frame with a ``label`` column
    :return: dict of label -> encoded integer, in order of first appearance
    """
    # -- Labels -- #
    distinct_labels = logs['label'].unique()

    encoder = LabelEncoder().fit(distinct_labels)
    codes = encoder.transform(distinct_labels)
    decoded = encoder.inverse_transform(codes)

    return dict(zip(decoded, codes))


def create_sentence_pairing(examples):
    """Build (log, second log) pairs and labels for the NSP head.

    Empty and whitespace-only messages are dropped first. Each remaining
    message is then paired, with equal probability, either with the message
    that follows it (wrapping around at the end; label 0 = IsNext) or with a
    message drawn uniformly at random from the batch (label 1 = IsNotNext).

    The input mapping is not modified.

    NOTE(review): the random draw can land on the true successor (or on the
    message itself), so a small share of label-1 pairs is mislabelled.

    :param examples: mapping with a "log" entry holding the batch of messages
    :return: {"log": list of (first, second) tuples,
              "next_sentence_label": list of 0/1 labels}
    """
    # Filter into a local list rather than writing back into the caller's batch.
    log_list = [
        line for line in examples["log"] if len(line) > 0 and not line.isspace()
    ]
    n_logs = len(log_list)

    sentence_pairs = []
    nsp_labels = []
    for idx, first_value in enumerate(log_list):
        if random.random() > 0.5:
            # Pair with proper following log sequence
            second_value = log_list[(idx + 1) % n_logs]

            # IsNext Label
            nsp_labels.append(0)
        else:
            # Pair with random log
            second_value = log_list[random.randint(0, n_logs - 1)]

            # IsNotNext Label
            nsp_labels.append(1)

        sentence_pairs.append((first_value, second_value))

    return {"log": sentence_pairs, "next_sentence_label": nsp_labels}


def generate_test_train_split(tokenized_datasets, test_size=.30, random_state=None):
    """Split a dataset into train and test subsets by row index.

    :param tokenized_datasets: object supporting ``len()`` and
        ``select(indices)``
    :param test_size: share of rows assigned to the test subset
    :param random_state: optional seed forwarded to ``train_test_split`` to
        make the split reproducible; ``None`` keeps it random
    :return: (train_dataset, test_dataset)
    """
    # Train - Test Split
    all_indices = list(range(len(tokenized_datasets)))
    train_indices, test_indices = train_test_split(
        all_indices, test_size=test_size, random_state=random_state
    )

    train_dataset = tokenized_datasets.select(train_indices)
    test_dataset = tokenized_datasets.select(test_indices)

    return train_dataset, test_dataset


class UnsupervisedLearningPipeline:
    """End-to-end pretraining pipeline for log data.

    ``fit`` runs, in order: log normalisation, phrase extraction
    (PhraserModel), template mining (TextClustering), label extraction,
    WordPiece tokenizer training, dataset preparation (sentence pairing,
    tokenisation, train/test split, tf.data pipelines) and BERT pretraining
    (PhoBert).

    :param config: per-stage configuration container
    """

    def __init__(
        self, 
        config: UnsupervisedPipelineConfig
    ) -> None:
        
        we_config = config.WordEmbeddingPreTrainingConfig
        
        # Hyperparameters
        self.max_seq_len = we_config.max_seq_length
        self.n_logs = 0
        self.n_iter = 0
        self.repeat_size = we_config.repeat_size
        self.epochs = we_config.epochs
        self.batch_size = we_config.batch_size
        self.batches_per_epoch = None
        self.logs_as_templates = None
        self.pretrained_save_path = we_config.pretrained_save_path

        # Logs
        self.normalized_logs = None
        self.log_labels = None
        self.logs_with_phrases = list()
        
        # Tokenizers
        self.bert_tokenizer = PrimeTokenizer(self.max_seq_len)
        self.fast_tokenizer = None
        
        # Dataset for Training/Evaluation
        self.data_collator = None
        self.tf_train_dataset = None
        self.tf_test_dataset = None
        self.token_logs = None
                
        # Models        
        self.phobert = None # Instantiated during runtime
        self.pm = PhraserModel(config.PhraserModelConfig, 
                               config.UnsupervisedGlobalConfig)
        self.tc = TextClustering(config.TextClusteringConfig,
                                 config.UnsupervisedGlobalConfig)
        
        # Factories with the static hyperparameters pre-bound; the remaining
        # arguments are only known once the data has been prepared.
        self.instantiate_optimizer = partial(
            create_optimizer,
            init_lr=we_config.init_lr,
            num_warmup_steps=we_config.num_warmup_steps,
            adam_beta1=we_config.adam_beta1,
            adam_beta2=we_config.adam_beta2,
            adam_epsilon=we_config.adam_epsilon,
            weight_decay_rate=we_config.weight_decay_rate
        )
        
        self.create_config = partial(
            BertConfig,
            hidden_size=we_config.hidden_size,
            num_hidden_layers=we_config.num_hidden_layers,
            num_attention_heads=we_config.num_attention_heads
        )
        
        self.phobert_init = partial(
            PhoBert,
            config=we_config
        )
        

    def initialize_fast_tokenizer(self):
        """Wrap the trained tokenizer in a PreTrainedTokenizerFast.

        The wrapper is given the maximum length and the special tokens, is
        stored on ``self.fast_tokenizer`` and returned.
        """
        tokenizer_obj = self.bert_tokenizer.get_tokenizer()
        fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer_obj)
        fast_tokenizer.model_max_length = self.max_seq_len
        fast_tokenizer.unk_token = "[UNK]"
        fast_tokenizer.sep_token = "[SEP]"
        fast_tokenizer.pad_token = "[PAD]"
        fast_tokenizer.cls_token = "[CLS]"
        fast_tokenizer.mask_token = "[MASK]"
        self.fast_tokenizer = fast_tokenizer
        return fast_tokenizer
    
    
    @staticmethod
    def _build_signature(dataset):
        """Build the (features, labels) TensorSpec signature for one dataset.

        Every feature except "special_tokens_mask" and "next_sentence_label"
        is a variable-length int64 vector; "next_sentence_label" is an int64
        scalar; "labels" shares the spec of "input_ids".
        """
        features_signature = {
            feature: tf.TensorSpec(shape=(None,), dtype=tf.int64)
            for feature in dataset.features
            if feature != "special_tokens_mask" and feature != "next_sentence_label"
        }
        features_signature["next_sentence_label"] = tf.TensorSpec(shape=(), dtype=tf.int64)
        features_signature["labels"] = features_signature["input_ids"]
        labels_signature = {
            "labels": features_signature["labels"],
            "next_sentence_label": features_signature["next_sentence_label"],
        }
        return features_signature, labels_signature


    @staticmethod
    def generate_data_signatures(train_dataset, test_dataset):
        """Return the output signatures for the train and test generators.

        :return: (train_signature, test_signature), each a
            (features spec dict, labels spec dict) tuple
        """
        train_signature = UnsupervisedLearningPipeline._build_signature(train_dataset)
        test_signature = UnsupervisedLearningPipeline._build_signature(test_dataset)

        return train_signature, test_signature
    

    def tokenize_function(self, examples):
        """Tokenize a batch of ``examples["log"]`` entries.

        Sequences are truncated to ``max_seq_len`` but not padded here; the
        special-tokens mask is included in the output.
        """
        return self.fast_tokenizer(
            examples["log"],
            padding=False,
            truncation=True,
            max_length=self.max_seq_len,
            return_special_tokens_mask=True,
        )


    def get_pre_training_data(self):
        """Prepare ``tf_train_dataset`` / ``tf_test_dataset`` from the normalised logs.

        Steps: build NSP sentence pairs, tokenize them, split into train and
        test, then wrap the data collator in ``tf.data`` pipelines that batch
        (dropping the remainder) and shuffle; the training pipeline is
        additionally repeated ``repeat_size`` times.
        """
        fast_tokenizer = self.initialize_fast_tokenizer()
        
        dt = self.normalized_logs.drop(["label", "container_name", "timestamp"], axis=1)
        data = Dataset.from_pandas(dt)
        
        tokenized_datasets = data.map(
            create_sentence_pairing,
            batched=True,
            num_proc=1,
            remove_columns=["log"],
            desc="Creating sentence pairings for NSP Head"
        )
        tokenized_datasets = tokenized_datasets.map(
            self.tokenize_function,
            batched=True,
            num_proc=1,
            remove_columns=["log"],
            desc="Running tokenizer on dataset line_by_line"
        )
        
        train_dataset, test_dataset = generate_test_train_split(tokenized_datasets)
        train_signature, test_signature = self.generate_data_signatures(train_dataset, test_dataset)
        
        self.batches_per_epoch = len(train_dataset) // self.batch_size
        
        data_collator = TFDataCollatorForLanguageModeling(
            tokenizer=self.fast_tokenizer,
            padding_length=self.max_seq_len,
            batch_size=self.batch_size
        )
        
        tokenized_generator = partial(data_collator, train_dataset, fast_tokenizer)
        test_generator = partial(data_collator, test_dataset, fast_tokenizer)
        
        self.tf_train_dataset = (
            tf.data.Dataset.from_generator(tokenized_generator, output_signature=train_signature)
            .batch(batch_size=self.batch_size, drop_remainder=True)
            .shuffle(len(train_dataset), reshuffle_each_iteration=True)
            .repeat(int(self.repeat_size))
        )
        
        self.tf_test_dataset = (
            tf.data.Dataset.from_generator(test_generator, output_signature=test_signature)
            .batch(batch_size=self.batch_size, drop_remainder=True)
            .shuffle(len(test_dataset), reshuffle_each_iteration=True)
        )


    def train_bert_tokenizer(self, load_model=False):
        """Load the saved tokenizer, or train a new one on the log templates."""
        if load_model:
            self.bert_tokenizer.load()
        else:
            self.bert_tokenizer.train(self.logs_as_templates)
            
            
    def pre_train_bert(self):
        """Execute BERT pretraining"""
        bert_config = self.create_config(self.bert_tokenizer.get_vocab_size())
        
        # The schedule must span every optimizer step: PhoBert iterates the
        # (repeated) training dataset once per epoch, so the total is
        # epochs * repeat_size * batches_per_epoch. Counting a single pass
        # only would end the decay after the first epoch.
        total_train_steps = (
            max(int(self.epochs), 1)
            * int(self.repeat_size)
            * self.batches_per_epoch
        )
        training_optimizer, learning_rate_schedule = self.instantiate_optimizer(
             num_train_steps=total_train_steps
        )

        # PhoBert Model instantiation
        self.phobert = self.phobert_init(
            bert_config=bert_config,
            learning_rate_schedule=learning_rate_schedule,
            training_optimizer=training_optimizer,
            token_len=len(self.fast_tokenizer.vocab)
        )
        
        # Prefetch Data
        self.tf_train_dataset = self.tf_train_dataset.prefetch(tf.data.AUTOTUNE)
        self.tf_test_dataset = self.tf_test_dataset.prefetch(tf.data.AUTOTUNE)
        
        # Start PreTraining
        self.phobert(self.tf_train_dataset, self.tf_test_dataset)
        
        # Save the PreTrainedTokenizer
        self.fast_tokenizer.save_pretrained(self.pretrained_save_path)
    

    def fit(self, logs: pd.DataFrame):
        """Run the whole pipeline on ``logs``.

        :param logs: non-empty frame with the columns used by the stages
            (``log``, ``label``, ``container_name``, ``timestamp``)
        :return: None - results are kept on the instance (``normalized_logs``,
            ``logs_with_phrases``, ``logs_as_templates``, ``log_labels``,
            ``phobert`` ...)
        """
        assert len(logs.index) > 0, 'process received an empty dataframe!'
        
        with yaspin(text="Normalizing Logs", color='green') as sp:
            self.normalized_logs = normalize_logs(logs)
            sp.text = ""
            sp.ok('✔ Completed log normalization')
            
            sp.text = "Extracting phrases"
            self.logs_with_phrases = self.pm(self.normalized_logs)
            sp.text = ""
            sp.ok('✔ Completed phrase extraction')
            
            sp.text = "Converting to log templates"
            # Hand over a copy so the template stage cannot overwrite the
            # phrased logs kept on this instance.
            self.logs_as_templates = np.array(self.tc(list(self.logs_with_phrases)))
            sp.text = ""
            sp.ok('✔ Completed log template conversion')
        
            sp.text = "Extracting Unique Labels"
            self.log_labels = extract_unique_labels(self.normalized_logs)
            sp.text = ""
            sp.ok('✔ Completed extracting unique labels')

            sp.text = "Training Tokenizer"
            self.train_bert_tokenizer()
            sp.text = ""
            sp.ok('✔ Completed training of custom tokenizer')
                        
            sp.text = "Processing training dataset"
            self.get_pre_training_data()
            sp.text = ""
            sp.ok('✔ Completed processing training dataset')
            
            sp.text = "Pretraining BERT"
            self.pre_train_bert()
            sp.text = ""
            sp.ok('✔ Completed BERT pretraining')

## W2V Pipeline Main

In [21]:
# Start from the built-in defaults, then apply the overrides found in the YAML file.
# If the file is missing, load() only logs a warning and the defaults stay in place.
config_path = SOURCE + '/assets/notebooks/PreprocessingConfig.yaml'
preprocessing_config = UnsupervisedPipelineConfig()
preprocessing_config.load(config_path)

In [22]:
# -- Unsupervised Learning Pipeline -- #

'''
Input: pd.DataFrame with batch_size number of rows 
Seq: 
    Normalize 
    Phraser
    Clustering
    Extract Unique Layers
    BERT
Returns: transformers.TFBertForPreTrainingOutput
'''

# --- SUBWORD TOKENIZER --
# Build the pipeline from the loaded configuration and run every stage on the
# container-filtered logs.
w2vp = UnsupervisedLearningPipeline(preprocessing_config)
# NOTE(review): UnsupervisedLearningPipeline.fit has no return statement, so
# training_outputs is None; results live on the w2vp instance.
training_outputs = w2vp.fit(dataset_container)

FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/results/phrase_model.joblib'

# BERT Visualization

In [23]:
%%capture
# NOTE(review): unpinned install in the middle of the notebook; consider pinning
# the version and using %pip so the package lands in the kernel's environment.
!pip install bertviz

In [24]:
from libs.transformers.src.transformers.models.bert.modeling_tf_bert import TFBertModel
import torch
from bertviz import head_view, model_view
from bertviz.neuron_view import show

# The model weights and the tokenizer are both read from the same local
# directory; `local_files_only=True` is passed to both loaders.
bertviz_model_dir = "../results/PreTrainedModel/"
model = TFBertModel.from_pretrained(bertviz_model_dir, local_files_only=True)
tokenizer = PreTrainedTokenizerFast.from_pretrained(bertviz_model_dir, local_files_only=True)

2021-07-22 13:21:42.922848: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-07-22 13:21:42.923984: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-22 13:21:42.924427: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:0a:00.0 name: GeForce RTX 2080 Ti computeCapability: 7.5
coreClock: 1.635GHz coreCount: 68 deviceMemorySize: 10.76GiB deviceMemoryBandwidth: 573.69GiB/s
2021-07-22 13:21:42.924460: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-0

In [25]:
# Pick the entry at key/index 0 of dataset_container["log"] as the example
# sentence for the attention visualizations, and display it.
sentence = dataset_container["log"][0]
sentence

'00000065:solr-1.srvcls[0022:adfd]'

In [26]:
# Tokenize the example sentence into TensorFlow tensors and unwrap the
# tokenizer output into a plain dict in one step.
x = dict(tokenizer(sentence, padding=True, return_tensors='tf'))

In [27]:
# Run the model in inference mode with attention outputs enabled and keep only
# the last element of what it returns.
# NOTE(review): presumably that last element is the collection of per-layer
# attention tensors (it is iterated tensor-by-tensor afterwards) -- confirm.
attention = model(x, training=False, output_attentions=True)[-1]

In [28]:
# Re-wrap every attention tensor as a torch tensor (going through NumPy) so the
# tuple handed to the bertviz views holds torch tensors. A generator expression
# is used so no throwaway accumulator or loop variable is left behind in the
# notebook's global namespace.
attention = tuple(
    torch.tensor(layer_attention.numpy()) for layer_attention in attention
)

In [29]:
# Map the ids of the first (and only) encoded sequence back to their token
# strings; these are passed to the bertviz views alongside the attention.
tokens = tokenizer.convert_ids_to_tokens(x["input_ids"][0]) 

In [30]:
# Render bertviz's head view for the example sentence from the converted
# attention tuple and its tokens.
head_view(attention, tokens)

<IPython.core.display.Javascript object>

In [31]:
# Render bertviz's model view from the same attention tuple and tokens.
model_view(attention, tokens)

<IPython.core.display.Javascript object>

In [None]:
# Needs pytorch model!
# `model` at this point is the TFBertModel loaded earlier in this section, which
# is why this neuron-view cell has been left unexecuted (no execution count).
# NOTE(review): presumably bertviz's neuron view only accepts PyTorch model
# classes -- confirm against the bertviz documentation before enabling this cell.
show(model, 'bert', tokenizer, sentence, layer=2, head=0)