# Demo to Build Log Representation Model for HDFS Dataset

## Load Data

Use `OpenSetDataLoader` to load HDFS logs.

In [1]:
%%time

import numpy as np
import pandas as pd

from logai.dataloader.openset_data_loader import OpenSetDataLoader, OpenSetDataLoaderConfig

# File configuration.
# NOTE(review): absolute, machine-specific path; point it at a local copy of
# HDFS_2000.log before running this notebook elsewhere.
dataset_name = "HDFS"
filepath = "/Users/qcheng/workspace/gitsoma/logai/logai/data/open_datasets/HDFS_1/HDFS_2000.log"

# Build the loader configuration first, then hand it to the loader.
loader_config = OpenSetDataLoaderConfig(dataset_name=dataset_name, filepath=filepath)
data_loader = OpenSetDataLoader(loader_config)

# `logrecord` is read by the preprocessing and partitioning cells below.
logrecord = data_loader.load_data()

Total size after encoding is 2001 2001
CPU times: user 1.05 s, sys: 363 ms, total: 1.42 s
Wall time: 576 ms


## Preprocessing

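Replace block IDs, IP addresses and file paths in the raw loglines with the tokens `<block_id>`, `<IP>` and `<file_path>` using `Preprocessor`, and keep the extracted values for later use.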

In [2]:
%%time

from logai.preprocess.preprocess import PreprocessorConfig, Preprocessor
from logai.utils import constants

# Raw log lines taken from the body of the loaded record.
loglines = logrecord.body[constants.LOGLINE_NAME]

# Each rule is [regex, replacement token]. The rules are listed in the same
# order as before: block id, then IP address, then file path.
replace_rules = [
    [r"(?<=blk_)[-\d]+", "<block_id>"],
    [r"\d+\.\d+\.\d+\.\d+", "<IP>"],
    [r"(/[-\w]+)+", "<file_path>"],
]

preprocessor_config = PreprocessorConfig(custom_replace_list=replace_rules)
preprocessor = Preprocessor(preprocessor_config)

# `custom_patterns` is indexed by replacement token in the partitioning cell
# (e.g. custom_patterns["<block_id>"]).
clean_logs, custom_patterns = preprocessor.clean_log(loglines)


CPU times: user 1.09 s, sys: 318 ms, total: 1.41 s
Wall time: 1.66 s


## Parsing

Parse logs using Drain.


In [3]:
%%time

from logai.information_extraction.log_parser import LogParser, LogParserConfig
from logai.algorithms.parsing_algo.drain import DrainParams

# Parsing with Drain: algorithm parameters -> parser config -> parser -> parse.
# NOTE(review): sim_th / depth are presumably Drain's similarity threshold and
# parse-tree depth; confirm against the DrainParams documentation.
drain_settings = {"sim_th": 0.5, "depth": 5}
parsing_algo_params = DrainParams(**drain_settings)

log_parser_config = LogParserConfig(
    parsing_algorithm="drain",
    parsing_algo_params=parsing_algo_params,
)
parser = LogParser(log_parser_config)

# Parse the cleaned (token-replaced) log lines from the preprocessing cell.
parsed_result = parser.parse(clean_logs)

CPU times: user 145 ms, sys: 3.23 ms, total: 148 ms
Wall time: 151 ms


## Partitioning

Partition the logs and generate sliding-window log sequences for each `block_id`.


In [4]:
%%time

from logai.preprocess.partition import PartitionerConfig, Partitioner
from logai.dataloader.data_model import LogRecordObject

n_gram = 10  # Length of log sequence.

# No copy is made here, so adding a column below likely also adds it to
# logrecord.attributes. NOTE(review): confirm whether that is intended.
attributes = logrecord.attributes

# Keep the first extracted block id of every log line.
attributes['block_id'] = custom_patterns["<block_id>"].apply(lambda matches: matches[0])

# Re-assemble a log record from the cleaned lines plus the enriched attributes
# and flatten it into a single dataframe.
new_log_record = LogRecordObject(body=pd.DataFrame(clean_logs), attributes=attributes)
new_log_df = new_log_record.to_dataframe()

# The window is one longer than n_gram; sequences are built per block_id and
# their entries are joined with the "[SEP]" token.
config = PartitionerConfig(
    sliding_window=n_gram + 1,
    group_by_category=['block_id'],
    sep_token="[SEP]",
)
partitioner = Partitioner(config)

# Group into sliding windows and store block_id as a string so it can be
# compared with the string ids from the label file.
partitioned_df = partitioner.group_sliding_window(
    new_log_df, constants.LOGLINE_NAME
).astype({'block_id': str})

CPU times: user 320 ms, sys: 53.7 ms, total: 374 ms
Wall time: 416 ms


## Read Log Anomaly Labels

Read the `anomaly_label.csv` file and assign an anomaly label to each `block_id`.

In [5]:
# NOTE(review): absolute, machine-specific path; point it at a local copy of
# anomaly_label.csv before running this notebook elsewhere.
labelpath = "/Users/qcheng/workspace/gitsoma/logai/logai/data/open_datasets/HDFS_1/anomaly_label.csv"

labels = pd.read_csv(labelpath, header=0)

# Keep the second underscore-separated field of BlockId, as a string.
labels['block_id'] = labels['BlockId'].map(lambda raw_id: raw_id.split("_")[1]).astype(str)

# Binary target: 1 for rows labelled "Anomaly", 0 for everything else.
labels['label'] = (labels['Label'] == "Anomaly").astype(int)

# Set of anomalous block ids, for fast membership tests.
pos = set(labels.loc[labels['label'] == 1, 'block_id'])

# Every sequence takes the label of the block it belongs to.
partitioned_df['label'] = np.array(
    [int(blk in pos) for blk in partitioned_df['block_id']]
)

In [6]:
# Preview the first five labelled sequences (head() defaults to 5 rows).
partitioned_df.head()

Unnamed: 0,block_id,logline,label
0,-1052739769153545987,dfs.DataNode$DataXceiver: Receiving block blk_...,0
1,-1052739769153545987,dfs.DataNode$DataXceiver: Receiving block blk_...,0
2,-1052739769153545987,dfs.DataNode$DataXceiver: Receiving block blk_...,0
3,-1052739769153545987,dfs.DataNode$DataXceiver: Receiving block blk_...,0
4,-105450231192318816,dfs.DataNode$DataXceiver: Receiving block blk_...,0


## Tokenization and Vectorization


In [None]:
from logai.information_extraction.log_vectorizer import VectorizerConfig, LogVectorizer

# The partitioned sequences are the vectorizer input.
# Note: this rebinds `loglines`, which the preprocessing cell used for the raw lines.
loglines = partitioned_df["logline"]

vectorizer_config = VectorizerConfig(algo_name="word2vec")
vectorizer = LogVectorizer(vectorizer_config)

# Fit on the sequences, then transform the very same sequences.
vectorizer.fit(loglines)
log_vectors = vectorizer.transform(loglines)

In [None]:
# Length of every entry in `log_vectors`, in order.
lens = [len(lv) for lv in log_vectors]

In [4]:
# File configuration.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
# NOTE(review): this rebinds `filepath`, which the load cell set to the HDFS
# log; re-running earlier cells after this one picks up the HDFS value again.
from itertools import islice

filepath = "/Users/qcheng/workspace/gitsoma/logai/logai/data/open_datasets/HealthApp.log"

outpath = "/Users/qcheng/workspace/gitsoma/logai/logai/data/open_datasets/HealthApp_20000.log"

# Number of leading lines to copy into the truncated file.
max_lines = 20000

# Copy the first `max_lines` lines of the source log into `outpath`.
# The source is opened first, so a missing input no longer leaves an empty
# output file behind. The file is streamed lazily with islice instead of
# being loaded in full with readlines().
with open(filepath, "r") as rf, open(outpath, "w") as of:
    of.writelines(islice(rf, max_lines))


## Inference