In [None]:
import os
import sys
from typing import List, Dict, Any

import pandas as pd

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.pipeline import Pipeline

from logrca import extract_app_config
from config.resolvers import LogRCA_ArtifactLocalBucketResolver

from config.logrca_config import LogRCAConfig, LogFileConfig
from logfile_utils import LogFileLocator
from preprocessing.nltk_preproc import NltkTextPreprocessor
from feature_extraction.tfidf import DataFrameTfIdfTransformer, DataFrameWord2VecPersister
from feature_extraction.glove import GloveFeatureExtractor
from feature_extraction.fasttext import FastTextFeatureExtractor, FastTextFeatureLoader
from vulogrca_core.pipeline_utils import grid_parameters
from feature_extraction.utils import cross_validated_feature_extraction
from feature_extraction.fasttext_utils import write_word2vec, generate_word2vec_model_file_path

# Show the module search path: the config-loading cell below passes
# sys.path[1] as the ini location, so this makes that value visible.
print(sys.path)

In [None]:
# Load the application config (logrca_cfg) and the collection of per-log-file
# configs (logfile_cfgs) that the final cell iterates over.
# NOTE(review): sys.path[1] depends on how the kernel/interpreter was started;
# confirm it always points at the directory that holds the ini file and
# "logs.json" (presumably the per-log-file config listing — verify).
logrca_cfg, logfile_cfgs = extract_app_config(
    inilocation=sys.path[1], logconfigjson="logs.json"
)

In [None]:
# process config
# bucket_resolver is a notebook-level global: the feature-extraction functions
# defined below read bucket_resolver.featuresdir and bucket_resolver.vocabdir.
bucket_resolver = LogRCA_ArtifactLocalBucketResolver(logrca_cfg)
# Judging by its name, this creates any artifact directories ("buckets") that
# are missing — TODO confirm against the resolver implementation.
bucket_resolver.create_nonexisting_buckets()

In [None]:
def apply_nltk_processing(log_locator):
    """Preprocess the unique-inlier event templates and persist them for TF-IDF.

    Reads the unique-inliers CSV located by ``log_locator``, runs
    ``NltkTextPreprocessor`` over its ``event_template`` column (the
    preprocessed column is appended rather than replacing the original),
    writes the result to the TF-IDF CSV path and returns that file re-read
    from disk.

    Args:
        log_locator: Object providing ``get_unique_inliers_csv_path()`` and
            ``get_tfidf_csv_path()``.

    Returns:
        pandas.DataFrame loaded back from the TF-IDF CSV that was just written.
    """
    unique_inliers = pd.read_csv(log_locator.get_unique_inliers_csv_path())

    preprocessor = NltkTextPreprocessor(
        cols_to_preprocess=['event_template'],
        append_preprocessed_cols=True,
        preprocessed_col_names=None,
    )
    preprocessed = preprocessor.fit_transform(unique_inliers)

    # The input is the unique-inliers CSV, but the output goes to the separate
    # TF-IDF CSV. The frame is then re-read so the caller receives exactly what
    # is stored on disk (without the index column).
    tfidf_csv_path = log_locator.get_tfidf_csv_path()
    preprocessed.to_csv(tfidf_csv_path, index=False)
    return pd.read_csv(tfidf_csv_path)

In [None]:
def extract_tfidf_features(logfile_cfg: LogFileConfig, df_nltk: pd.DataFrame):
    """Fit the TF-IDF persistence pipeline once per candidate embedding size.

    A two-step pipeline (``DataFrameTfIdfTransformer`` followed by
    ``DataFrameWord2VecPersister``) is re-configured and fitted on ``df_nltk``
    for every value in ``logfile_cfg.tfidf["feature_embedding_dims"]``. The
    transformed output is discarded; only the transformer's
    ``explained_variance`` attribute is collected after each fit.

    Reads the notebook-level ``bucket_resolver`` for the features directory.

    Args:
        logfile_cfg: Per-log-file configuration; its ``drain``, ``tfidf``,
            ``logfile_for`` and ``version`` entries are read.
        df_nltk: Frame handed to the pipeline; the transformer is pointed at
            its ``event_template_preprocessed`` column.

    Returns:
        dict mapping each embedding dimension tried to the
        ``explained_variance`` reported by the TF-IDF step for that fit.
    """
    # The embedding dimension is the only searched parameter; it is chosen by
    # this custom sweep instead of being fixed in the base parameters.
    dim_key = "df_2_tfidf_transformer__feature_embedding_dim"

    pipeline = Pipeline(steps=[
        ('df_2_tfidf_transformer', DataFrameTfIdfTransformer()),
        ('word2vec_persister', DataFrameWord2VecPersister()),
    ])

    fixed_params = {
        'df_2_tfidf_transformer__eventid_colname': logfile_cfg.drain["eventid_colname"],
        'df_2_tfidf_transformer__preprocessed_colname': 'event_template_preprocessed',
        'df_2_tfidf_transformer__generate_dense_embeddings': True,
        'word2vec_persister__feature_root': bucket_resolver.featuresdir,
        'word2vec_persister__feature_for': logfile_cfg.logfile_for,
        'word2vec_persister__feature_version': logfile_cfg.version,
    }

    # TODO: Derive the vocabulary size by tokenizing the preprocessed event
    # templates, or fit a sparse TF-IDF vectorizer and measure sparsity
    # (typically above 85-90 percent). Rule of thumb: with that sparsity the
    # upper limit for dense dimensions is 15-20% of the maximum vocab size.
    search_space = {dim_key: logfile_cfg.tfidf["feature_embedding_dims"]}

    variance_by_dim = {}
    for grid_point in grid_parameters(search_space):
        pipeline.set_params(**(fixed_params | grid_point))
        # Fitting is done for its side effects; the transformed output is unused.
        pipeline.fit_transform(df_nltk)

        fitted_tfidf = pipeline.named_steps["df_2_tfidf_transformer"]
        variance_by_dim[grid_point[dim_key]] = fitted_tfidf.explained_variance

    return variance_by_dim

In [None]:
def extract_glove_features(logfile_cfg: LogFileConfig, df_inliers: pd.DataFrame):
    """Run GloVe feature extraction over a grid of candidate settings.

    Wraps a ``GloveFeatureExtractor`` in a single-step pipeline and hands it,
    together with the fixed parameters and the search grid, to
    ``cross_validated_feature_extraction``.

    Reads the notebook-level ``bucket_resolver`` for the features directory.

    Args:
        logfile_cfg: Per-log-file configuration; its ``glove``, ``drain``,
            ``logfile_for`` and ``version`` entries are read.
        df_inliers: Frame passed through as ``df`` to
            ``cross_validated_feature_extraction``.

    Returns:
        None. The helper's return value, if any, is not propagated.
    """
    pipeline = Pipeline(steps=[
        ('feature_extractor', GloveFeatureExtractor()),
    ])

    glove_cfg = logfile_cfg.glove

    # Parameters that stay constant for every grid point.
    fixed_params = {
        'feature_extractor__feature_for': logfile_cfg.logfile_for,
        'feature_extractor__feature_root': bucket_resolver.featuresdir,
        'feature_extractor__feature_version': logfile_cfg.version,
        'feature_extractor__training_iterations': glove_cfg["training_iterations"],
        'feature_extractor__eventid_colname': logfile_cfg.drain["eventid_colname"],
    }

    # Embedding size, training sequence length and window length are searched
    # rather than fixed; the candidates come from the config.
    search_space = {
        "feature_extractor__feature_embedding_dim": glove_cfg["feature_embedding_dims"],
        "feature_extractor__training_sequence_length": glove_cfg["training_sequence_length"],
        'feature_extractor__window_length': glove_cfg["training_window_length"],
    }

    cross_validated_feature_extraction(
        df=df_inliers,
        pipeline=pipeline,
        pipeline_cfg=fixed_params,
        cv_options=search_space,
    )

In [None]:
def extract_fastText_features(logfile_cfg: LogFileConfig, best_hyperparams: Dict[str, Any], df_inliers: pd.DataFrame):
    """Run fastText feature extraction over a grid of candidate settings.

    Builds a single-step pipeline around a ``FastTextFeatureExtractor`` and
    hands it, together with the fixed parameters and the search grid, to
    ``cross_validated_feature_extraction``.

    Reads the notebook-level ``bucket_resolver`` for the features and vocab
    directories.

    Args:
        logfile_cfg: Per-log-file configuration; its ``fastText``, ``drain``,
            ``logfile_for`` and ``version`` entries are read.
        best_hyperparams: Mapping with ``'depth'`` and ``'st'`` entries, which
            are forwarded to the extractor's constructor.
        df_inliers: Frame passed through as ``df`` to
            ``cross_validated_feature_extraction``.

    Returns:
        None. The helper's return value, if any, is not propagated.
    """
    extractor = FastTextFeatureExtractor(
        feature_for=logfile_cfg.logfile_for,
        feature_root=bucket_resolver.featuresdir,
        vocab_root=bucket_resolver.vocabdir,
        depth=best_hyperparams['depth'],
        st=best_hyperparams['st'],
    )
    pipeline = Pipeline(steps=[('feature_extractor', extractor)])

    # Parameters that stay constant for every grid point. Model type, word
    # n-grams, embedding size and sequence length are searched instead.
    fixed_params = {
        'feature_extractor__feature_version': logfile_cfg.version,
        'feature_extractor__vocab_version': logfile_cfg.version,
        'feature_extractor__eventid_colname': logfile_cfg.drain["eventid_colname"],
    }

    fasttext_cfg = logfile_cfg.fastText
    search_space = {
        'feature_extractor__feature_model': fasttext_cfg["feature_model"],
        "feature_extractor__feature_wordNgrams": fasttext_cfg["feature_wordNgrams"],
        "feature_extractor__feature_embedding_dim": fasttext_cfg["feature_embedding_dims"],
        # Original author's note: putting a 5 here causes fasttext to get confused.
        "feature_extractor__training_sequence_length": fasttext_cfg["training_sequence_length"],
    }

    cross_validated_feature_extraction(
        df=df_inliers,
        pipeline=pipeline,
        pipeline_cfg=fixed_params,
        cv_options=search_space,
    )

In [None]:
def fastText_bin_model_2_word2vec(logfile_cfg: LogFileConfig, best_hyperparams: Dict[str, Any], df_fasttext: pd.DataFrame):
    """Export fastText embeddings to word2vec files, one per grid point.

    For every combination of the fastText settings in ``logfile_cfg.fastText``
    a ``FastTextFeatureLoader`` pipeline is configured and run on
    ``df_fasttext``. The resulting embeddings are indexed by the loader's
    ``unique_event_ids`` and written with ``write_word2vec`` to the path
    produced by ``generate_word2vec_model_file_path``. A file already present
    at that path is removed first.

    Reads the notebook-level ``bucket_resolver`` for the features and vocab
    directories.

    Args:
        logfile_cfg: Per-log-file configuration; its ``fastText``, ``drain``,
            ``logfile_for`` and ``version`` entries are read.
        best_hyperparams: Mapping with ``'depth'`` and ``'st'`` entries, which
            are forwarded to the loader's constructor.
        df_fasttext: Frame passed to the pipeline's ``fit_transform``.

    Returns:
        None.
    """
    loader = FastTextFeatureLoader(
        feature_for=logfile_cfg.logfile_for,
        feature_root=bucket_resolver.featuresdir,
        vocab_root=bucket_resolver.vocabdir,
        feature_version=logfile_cfg.version,
        vocab_version=logfile_cfg.version,
        eventid_colname=logfile_cfg.drain["eventid_colname"],
        load_normalized_embeddings=False,
        depth=best_hyperparams['depth'],
        st=best_hyperparams['st'],
    )
    pipeline = Pipeline(steps=[('feature_loader', loader)])

    # No fixed pipeline parameters at present; everything is set in the
    # constructor above or comes from the search grid.
    fixed_params = {}

    fasttext_cfg = logfile_cfg.fastText
    search_space = {
        'feature_loader__feature_model': fasttext_cfg["feature_model"],
        "feature_loader__feature_wordNgrams": fasttext_cfg["feature_wordNgrams"],
        "feature_loader__feature_embedding_dim": fasttext_cfg["feature_embedding_dims"],
        # Original author's note: putting a 5 here causes fasttext to get confused.
        "feature_loader__training_sequence_length": fasttext_cfg["training_sequence_length"],
    }

    for grid_point in grid_parameters(search_space):
        pipeline.set_params(**(fixed_params | grid_point))
        embeddings = pipeline.fit_transform(df_fasttext)

        # One row per event id, in the order reported by the fitted loader.
        event_ids = pipeline.named_steps["feature_loader"].unique_event_ids
        embedding_frame = pd.DataFrame(data=embeddings, index=event_ids)

        target_path = generate_word2vec_model_file_path(
            embedding_root=bucket_resolver.featuresdir,
            embedding_for=logfile_cfg.logfile_for,
            embedding_model=grid_point['feature_loader__feature_model'],
            embedding_wordNgrams=grid_point['feature_loader__feature_wordNgrams'],
            embedding_dim=grid_point['feature_loader__feature_embedding_dim'],
            train_seq_len=grid_point['feature_loader__training_sequence_length'],
            embedding_version=logfile_cfg.version,
            embedding_type='fasttext',
        )

        # Remove any previous export before writing the new one.
        if os.path.exists(target_path):
            os.remove(target_path)
        write_word2vec(embedding_frame, target_path)

In [None]:
# RUN MAIN
# Driver: for every configured log file, run NLTK preprocessing, then the
# TF-IDF, GloVe and fastText feature extraction, and finally export the
# fastText embeddings in word2vec format.
for logfile_cfg in logfile_cfgs:
    # Hard-coded values shared by the locator ("depth", "st" -> similarity
    # threshold) and by the fastText extractor/loader.
    # NOTE(review): presumably the best Drain settings from an earlier tuning
    # run — confirm, and consider moving them into the config.
    best_hyperparams = { "depth": 4, "st": 0.3}

    # Resolves the paths of all CSV artifacts for this log file.
    log_locator = LogFileLocator(artifacts_root=logrca_cfg.general_config.artifactsroot, 
                                 logfile_for=logfile_cfg.logfile_for,
                                 depth=best_hyperparams["depth"],
                                 similarity_threshold=best_hyperparams["st"], 
                                 split_standard_deviation=logfile_cfg.drain["split_standard_deviations"],
                                 version=logfile_cfg.version)
    df_nltk = apply_nltk_processing(log_locator)
    # NOTE(review): explained_variances is overwritten on every iteration, so
    # only the last log file's result is available after the loop.
    explained_variances = extract_tfidf_features(logfile_cfg, df_nltk)

    df_inliers = pd.read_csv(log_locator.get_inliers_csv_path())
    df_unique_inliers = pd.read_csv(log_locator.get_unique_inliers_csv_path())
    
    extract_glove_features(logfile_cfg, df_inliers)
    # The unmodified unique-inliers frame is copied to the spectral and
    # normalized-GloVe CSV paths.
    df_unique_inliers.to_csv(log_locator.get_spectral_csv_path(), index=False)
    df_unique_inliers.to_csv(log_locator.get_normalized_glove_csv_path(), index=False)

    # after extracting fast text features, convert fastText bin format word embeddings into word2vec
    # The fastText CSV is written before extraction and re-read afterwards;
    # the frame loaded from it is what the word2vec export receives.
    df_unique_inliers.to_csv(log_locator.get_fasttext_csv_path(), index=False)
    extract_fastText_features(logfile_cfg, best_hyperparams, df_inliers)
    df_fasttext = pd.read_csv(log_locator.get_fasttext_csv_path())
    fastText_bin_model_2_word2vec(logfile_cfg, best_hyperparams, df_fasttext)