In [2]:
import os

import pandas as pd
import numpy as np

from logai.algorithms.parsing_algo.drain import DrainParams
from logai.dataloader.data_loader import DataLoaderConfig, FileDataLoader
from logai.information_extraction.feature_extractor import FeatureExtractorConfig, FeatureExtractor
from logai.information_extraction.log_parser import LogParserConfig, LogParser
from logai.preprocess.preprocess import PreprocessorConfig, Preprocessor
from logai.utils import constants


In [None]:
# Directory holding the HDFS_1 open dataset (HDFS.log and anomaly_label.csv).
# Set the LOGAI_HDFS_DIR environment variable to point at a local copy; the
# original author's path is kept as the fallback so existing setups still work.
input_dir = os.environ.get(
    "LOGAI_HDFS_DIR",
    "/Users/qcheng/workspace/gitsoma/logai/logai/data/open_datasets/HDFS_1/",
)


In [None]:
# dataloader
# Describe the raw HDFS log file, then let the file loader read it.
log_type = "log"
filepath = os.path.join(input_dir, "HDFS.log")
file_config = DataLoaderConfig(filepath=filepath, log_type=log_type)

# `logrecord` is the loaded record; later cells read its `body`.
dataloader = FileDataLoader(file_config)
logrecord = dataloader.load_data()

In [None]:
# Take the entry stored under constants.LOGLINE_NAME in the loaded record's
# body and echo the item at key 0 as the cell output, as a quick check that
# the load produced log lines.
loglines = logrecord.body[constants.LOGLINE_NAME]
loglines[0]

In [None]:
# read labels
# The label CSV has one row per block with a 'BlockId' and a 'Label' column;
# collect the ids of every block labelled 'Anomaly' into a set for fast lookup.
blk_label_file = os.path.join(input_dir, "anomaly_label.csv")
blk_df = pd.read_csv(blk_label_file, header=0)

is_anomaly = blk_df['Label'] == 'Anomaly'
anomaly_blk = set(blk_df.loc[is_anomaly, 'BlockId'])

In [None]:
# preprocess
loglines = logrecord.body[constants.LOGLINE_NAME]

# Each entry is [regex, placeholder]. The '<block_id>' placeholder is also the
# key under which the extracted block ids are read back from `custom_patterns`
# in the parsing step.
custom_replace_list = [
    [r'(?<=blk_)[-\d]+', '<block_id>'],
    [r'\d+\.\d+\.\d+\.\d+',  '<IP>'],
    [r'(/[-\w]+)+', '<file_path>'],
]
preprocessor_config = PreprocessorConfig(custom_replace_list=custom_replace_list)
preprocessor = Preprocessor(preprocessor_config)

# clean_log returns the cleaned log lines together with the custom pattern matches.
clean_logs, custom_patterns = preprocessor.clean_log(loglines)

# parsing
# Run the Drain parser over the cleaned log lines.
parsing_algo_params = DrainParams(sim_th=0.5, depth=5)
log_parser_config = LogParserConfig(
    parsing_algorithm='drain',
    parsing_algo_params=parsing_algo_params,
)

parser = LogParser(log_parser_config)
parsed_result = parser.parse(clean_logs)


def _first_block_id(matches):
    """Rebuild the full 'blk_<id>' token from the first extracted block id."""
    return f"blk_{matches[0]}"


# Attach the block id of every log line so events can be grouped per block.
parsed_result['block_id'] = custom_patterns['<block_id>'].map(_first_block_id)

In [None]:
from logai.information_extraction.categorical_encoder import (
    CategoricalEncoder,
    CategoricalEncoderConfig,
)

# Encode the parsed log lines with the default categorical encoder and store
# the result as the 'event_id' column.
encoder_config = CategoricalEncoderConfig()
cat_encoder = CategoricalEncoder(encoder_config)

parsed_templates = parsed_result[[constants.PARSED_LOGLINE_NAME]]
parsed_result['event_id'] = cat_encoder.fit_transform(parsed_templates)

In [3]:
# Cache of the parsed log frame, so the parsing cells above can be skipped.
# Set the LOGAI_RESULTS_DIR environment variable to relocate the cache; the
# original author's path is kept as the fallback.
OUT_DIR = os.environ.get(
    "LOGAI_RESULTS_DIR",
    '/Users/qcheng/workspace/gitsoma/logai/logai/results',
)
parse_cache_path = os.path.join(OUT_DIR, 'parse_res')

if os.path.exists(parse_cache_path):
    # SECURITY: unpickling can execute arbitrary code -- only load a cache file
    # that you created yourself.
    parsed_result = pd.read_pickle(parse_cache_path)
else:
    # No cache yet: persist the frame produced by the parsing cells so the next
    # session can start from here. Requires those cells to have been run.
    os.makedirs(OUT_DIR, exist_ok=True)
    parsed_result.to_pickle(parse_cache_path)

In [4]:
# Turn the per-line event ids into one row per 'block_id'.
config = FeatureExtractorConfig(
    group_by_category=['block_id']
)

feature_extractor = FeatureExtractor(config)

block_list = feature_extractor.convert_to_sequence(parsed_result['event_id'], parsed_result['block_id'])

# Label each block: 1 if its id is in the anomaly set, 0 otherwise.
# `anomaly_blk` is created by the "read labels" cell. Loading `parsed_result`
# from the pickle cache does not define it, so that cell must be run first;
# skipping it raises "NameError: name 'anomaly_blk' is not defined".
block_list['label'] = block_list['block_id'].isin(anomaly_blk).astype('int64')

NameError: name 'anomaly_blk' is not defined

In [None]:
# gb = parsed_result[['event_id', 'block_id']].groupby(by=['block_id'])
#
# block_list = gb.size().to_frame(name='counts').join(gb.agg(event_ids=('event_id',list))).reset_index()
#
# block_list['event_seq'] = block_list['event_ids'].apply(lambda x: " ".join([str(c) for c in x]))
# block_list['label'] = block_list['block_id'].apply(lambda x: 1 if x in anomaly_blk else 0)
#
# block_list[block_list['label'] == 1].head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Down-sample before training: keep 10% of the normal blocks and 50% of the
# anomalous ones. Both draws are seeded so the subset is reproducible.
feature_columns = ['event_sequence', 'label']
neg_df = block_list.loc[block_list['label'] == 0, feature_columns].sample(frac=0.1, random_state=1)
pos_df = block_list.loc[block_list['label'] == 1, feature_columns].sample(frac=0.5, random_state=1)

# Split each class separately (80/20) so both classes appear in train and test.
# random_state pins the shuffle; without it every run produced a different split.
train_df_neg, test_df_neg = train_test_split(neg_df, test_size=.2, random_state=1)
train_df_pos, test_df_pos = train_test_split(pos_df, test_size=.2, random_state=1)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order (positives first, then negatives) and keeps the original index.
train_df = pd.concat([train_df_pos, train_df_neg])
test_df = pd.concat([test_df_pos, test_df_neg])

In [None]:
from logai.algorithms.nn_model.transformers import TransformerAlgo, TransformerAlgoConfig

# Build the transformer model with its default configuration.
# NOTE: this rebinds `config`, which the feature-extraction cell used for its
# FeatureExtractorConfig.
config = TransformerAlgoConfig()
transformer = TransformerAlgo(config)

# Train on the event sequences and their 0/1 anomaly labels.
train_sequences = train_df['event_sequence']
train_labels = train_df['label']
transformer.train(train_sequences, train_labels)

In [None]:
# Run the trained model on the held-out split.
# The column is 'event_sequence' (the name used when building and training on
# the split); the previous 'event_seq' key does not exist in `test_df` and
# raised a KeyError.
predictions = transformer.predict(test_df['event_sequence'], test_df['label'])


