In [1]:
import numpy as np

from algorithms import aimodel
from algorithms.logseq.log2template import Log2Template
from common import constants
from common.timelogger import TimeLogger

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import TopKCategoricalAccuracy


from sklearn.model_selection import train_test_split
import os
import joblib
import time

In [2]:
class LogSeq(aimodel.AIModel):
    def __init__(self, config, logger):
        self.config = config
        self.logger = logger

        self.params_logseq = None
        self.window_size = self.params_logseq['window_size'] if self.params_logseq is not None else 30

        # self.test_perc = self.config['parameter']['train']['data_set']['test'] if 'data_set' in config['parameter']['train'].keys() else None
        self.batch_size = self.params_logseq['batch_size'] if self.params_logseq is not None else 512
        self.epochs = self.params_logseq['epochs'] if self.params_logseq is not None else 50
        self.top_k = self.params_logseq['top_k'] if self.params_logseq is not None else 10
        self.hidden_size = self.params_logseq['hidden_size'] if self.params_logseq is not None else 512

        # self.test_perc = self.config['parameter']['train']['data_set']['test'] if 'data_set' in config['parameter']['train'].keys() else None
        self.test_perc = 30

        # template model
        self.log2template = Log2Template(config, logger)

        # tf model
        self.model = None

In [3]:
    def set_top_k(self, parameter):
        update_val = parameter['parameter']['service'][constants.MODEL_S_LOGSEQ]['top_k']
        self.logger.info(f"[LogSeq] top_k changed : {self.top_k} => {update_val}")
        self.top_k = update_val

    def get_model(self):
        n_classes = self.log2template.n_templates

        inp = Input(shape=(self.window_size,))
        emb = Embedding(n_classes+1, self.hidden_size)(inp)
        lstm = LSTM(self.hidden_size, return_sequences=True)(emb)
        lstm = LSTM(self.hidden_size//2, return_sequences=True)(lstm)
        lstm = LSTM(self.hidden_size//4)(lstm)
        out = Dense(n_classes, activation='softmax')(lstm)
        model = Model(inp, out)
        model.compile(optimizer=Adam(learning_rate=0.005), loss='categorical_crossentropy',
                      metrics=[TopKCategoricalAccuracy(k, name=f"top_{k}") for k in [1, 3, 5, 10, 20]])

        return model

    def get_sequence_data(self, data):
        x_data, y_data = [], []
        for i in range(len(data) - self.window_size):
            x_data.append(data[i:i + self.window_size])
            y_data.append(data[i + self.window_size] - 1)

        x_data, y_data = np.array(x_data), to_categorical(y_data, num_classes=self.log2template.n_templates)
        return x_data, y_data

    def fit(self, log_df):
        if not os.path.exists(os.path.join(self.config['model_dir'], f"{constants.MODEL_S_LOGSEQ}")):
            os.makedirs(os.path.join(self.config['model_dir'], f"{constants.MODEL_S_LOGSEQ}"))
        with TimeLogger(f"[LogSeq] model training time :", self.logger):
            time_s = time.time()

            tidxs = self.log2template.log2tidx(log_df, fitting=True)

            x_data, y_data = self.get_sequence_data(tidxs)

            x_train, x_test = train_test_split(x_data, test_size=self.test_perc / 100, shuffle=False)
            y_train, y_test = train_test_split(y_data, test_size=self.test_perc / 100, shuffle=False)

            # Model
            self.logger.info(f"[LogSeq] tf_model training start")
            self.model = self.get_model()
            init_epoch = 0# 10
            callbacks = [LoggerCallback(self.epochs, self.logger)]
            # self.model.fit(x_train, y_train, batch_size=self.batch_size, epochs=init_epoch, validation_data=(x_test, y_test), callbacks=callbacks, verbose=0)

            time_fit_s = time.time()
            callbacks.append(EarlyStopping(min_delta=0.01, patience=5, restore_best_weights=True))
            hist = self.model.fit(x_train, y_train, batch_size=self.batch_size, epochs=self.epochs, initial_epoch=init_epoch, validation_data=(x_test, y_test), callbacks=callbacks, verbose=0)
            time_e = time.time()
            self.logger.info(f"[LogSeq] tf_model training end (elapsed = {time_e - time_fit_s:.3f}s)")

            # TODO - return template_clusters
            train_result = {"from_date": self.config['date'][0], "to_date": self.config['date'][-1], "accuracy": hist.history['val_top_1'][-1],
                            "train_metrics": {},
                            'mined_period': self.log2template.mined_period, 'n_templates': self.log2template.n_templates,
                            'templates': [c.get_template() for c in self.log2template.template_miner.drain.clusters],
                            "except_failure_date_list": None, "except_business_list": None,
                            "business_list": None, "train_business_status": None, "train_mode": -1, "outlier_mode": -1}

            return train_result

    def predict(self, serving_logs):
        tidxs = self.log2template.log2tidx(serving_logs[60-self.window_size:])

        x_data, y_data = self.get_sequence_data(tidxs)
        preds = list(map(lambda x: list(enumerate(x)), self.model.predict(x_data, batch_size=512)))
        preds = np.array(list(map(lambda p: sorted(p, key=lambda x: x[1], reverse=True), preds)))
        preds = preds[:, :self.top_k, :]
        '''
        # format
        (tidx_1, proba_1) => top1
        (tidx_2, proba_2) => top2
               ...
        (tidx_k-1, proba_k-1)
        (tidx_k, proba_k) => topk
        '''

        result = {}
        for i in range(len(preds)):
            is_anomaly = np.argmax(y_data[i]) not in preds[i][:, 0]
            # expected_templates = self.log2template.tidx2template(preds[i][:, 0])
            expected_tidxs = preds[i][:, 0] + 1
            probas = preds[i][:, 1]
            result[str(i)] = {'anomaly': is_anomaly,
                              'real': serving_logs['msg'].iloc[60+i],
                              'expected_tidxs': expected_tidxs,
                              # 'expected_templates': expected_templates,
                              'probabilities': probas}
            self.logger.debug(f"result_{i} = {result[str(i)]}")
        return result

    def save(self, model_dir):
        self.log2template.save(model_dir)

        model_path = os.path.join(model_dir, f"{constants.MODEL_S_LOGSEQ}/tf_model.h5")
        self.model.save(model_path)
        self.logger.info(f"[LogSeq] tf_model saved (3/4)")

        etc_path = os.path.join(model_dir, f"{constants.MODEL_S_LOGSEQ}/etc_info.pkl")
        etc_info = {'top_k': self.top_k, 'mined_period': self.log2template.mined_period}
        joblib.dump(etc_info, etc_path)
        self.logger.info(f"[LogSeq] etc_info saved (4/4)")

    def load(self, model_dir):
        try:
            self.log2template.load(model_dir)
            self.logger.info(f"[LogSeq] template model loaded")

            tf_model_path = os.path.join(model_dir, f"{constants.MODEL_S_LOGSEQ}/tf_model.h5")
            self.model = load_model(tf_model_path)
            self.logger.info(f"[LogSeq] tf model loaded")

            etc_path = os.path.join(model_dir, f"{constants.MODEL_S_LOGSEQ}/etc_info.pkl")
            etc_info = joblib.load(etc_path)
            self.top_k = etc_info['top_k']
            self.log2template.mined_period = etc_info['mined_period']

            return True
        except Exception as e:
            self.logger.info(f"[LogSeq] Error log while Load() : {e}")
            return False




In [5]:
from tensorflow.keras.callbacks import Callback

class LoggerCallback(Callback):
    def __init__(self, epochs, logger):
        super().__init__()
        self.logger = logger
        self.epochs = epochs

    def on_epoch_end(self, epoch, logs=None):
        self.logger.info(f"[LogSeq] epoch={epoch + 1}/{self.epochs}"
                         f", top_1=(train={logs['top_1']:.3f}/test={logs['val_top_1']:.3f})"
                         f", top_3=({logs['top_3']:.3f}/{logs['val_top_3']:.3f})"
                         f", top_5=({logs['top_5']:.3f}/{logs['val_top_5']:.3f})"
                         f", top_10=({logs['top_10']:.3f}/{logs['val_top_10']:.3f})"
                         f", top_20=({logs['top_20']:.3f}/{logs['val_top_20']:.3f})")

In [48]:
import pandas as pd 
import json
import csv

In [83]:
a = list()

with open("./log_txt.txt", "r", encoding="utf-8") as f:
    data = f.readlines()
    for d in data:
        a.append(eval(d))

In [85]:
pd.DataFrame(a)

Unnamed: 0,msg,_time
0,DEBUG 2022-07-06 11:59:53.153 [org.spri] ():...,20220706115953
1,DEBUG 2022-07-06 11:59:53.153 [org.spri] ():...,20220706115953
2,DEBUG 2022-07-06 11:59:53.153 [org.spri] ():...,20220706115953
3,DEBUG 2022-07-06 11:59:53.152 [org.spri] ():...,20220706115953
4,DEBUG 2022-07-06 11:59:53.152 [org.spri] ():...,20220706115953
...,...,...
29313,DEBUG 2022-07-06 11:59:01.737 [org.spri] ():...,20220706115901
29314,DEBUG 2022-07-06 11:59:01.737 [org.spri] ():...,20220706115901
29315,DEBUG 2022-07-06 11:59:01.737 [org.spri] ():...,20220706115901
29316,DEBUG 2022-07-06 11:59:01.737 [org.spri] ():...,20220706115901


In [86]:
class Logger():
    def debug(self, msg):
        print(msg)
    def info(self, msg):
        print(msg)
        

In [87]:
logger = Logger()

In [88]:
config = dict()

logseq = LogSeq(config, logger)

KeyError: 'model_dir'