In [None]:
from logai.dataloader.data_loader import FileDataLoader, DataLoaderConfig
from logai.information_extraction.log_parser import LogParser, LogParserConfig
from logai.algorithms.parsing_algo.drain import DrainParams

# Data Loader

Load log data and store all data in @LogRecordObject. Currently only implemented FileDataLoader.

Please change the filepath correspondingly.
I've put the ./data dir in .gitignore to avoid checking in data unexpectedly


In [3]:

#File Configuration
filepath = "../data/default_data/mixed.csv"
log_type = 'csv'
dimensions = {'timestamp': ['timestamp'],
              'attributes': ['cluster_label',
                             'logRecordType'],
              'body': ['_raw']}
custom_delimeter_regex = [r"`+|\s+"]

file_config = DataLoaderConfig(
    filepath=filepath,
    log_type='csv',
    dimensions=dimensions,
    custom_delimeter_regex=custom_delimeter_regex,
    header=0
)

file_config

FileConfig(filepath='../data/default_data/mixed.csv', log_type='csv', dimensions={'timestamp': ['timestamp'], 'attributes': ['cluster_label', 'logRecordType'], 'body': ['_raw']}, custom_delimeter_regex=['`+|\\s+'], sep=None, header=0)

In [None]:


dataloader = FileDataLoader(file_config)
logrecord = dataloader.load_data()


# Preprocess

Do customer rules to initially parse the loglines. Add custom delimeters in a regex

Group log records by any attributes. Return grouped log index so follow up process can handle them separately.


In [3]:
from logai.preprocess.preprocess import Preprocessor, PreprocessorConfig

preprocessor = Preprocessor(PreprocessorConfig(custom_delimiters_regex=custom_delimeter_regex))
preprocessed_loglines = preprocessor.clean_log(logrecord.body['_raw'])

In [4]:
preprocessed_loglines.head(5)

0    ffgen 20210501000000.005 112100 0 0 acbcfa25dd...
1    phqry 20210501000000.035 5436 0 0 28e3e6101f0d...
2    phqry 20210501000000.121 1259 0 0 0 1o:035 232...
3    mqdbg 20210501000000.124 1411 0 0 0 Qr:021 232...
4    mqdbg 20210501000000.127 1411 0 0 0 fr:021 232...
Name: _raw, dtype: object

# Information Extraction


## Parsing

Currently only implemented DRAIN. IPLoM next but needs time to refactor code from LogPAI.


In [5]:

log_parser_config = LogParserConfig(
    parsing_algorithm='drain'
)

parser = LogParser(log_parser_config)
parsed_result = parser.parse(preprocessed_loglines)

In [6]:
parsed_result.head(5)

Unnamed: 0,logline,parsed_logline,parameter_list
0,ffgen 20210501000000.005 112100 0 0 acbcfa25dd...,ffgen * * 0 0 * * 0 * 232.3.3 INFO RollingHttp...,"[20210501000000.005, 112100, acbcfa25ddbbae4b,..."
1,phqry 20210501000000.035 5436 0 0 28e3e6101f0d...,phqry * * 0 0 * * 0 * 232.3.3 * * * asyncop,"[20210501000000.035, 5436, 28e3e6101f0dbba5, -..."
2,phqry 20210501000000.121 1259 0 0 0 1o:035 232...,phqry * * 0 0 0 * 232.3.3 RT * * * * * * * = *...,"[20210501000000.121, 1259, 1o:035, {qplo=0,tq=..."
3,mqdbg 20210501000000.124 1411 0 0 0 Qr:021 232...,mqdbg * * 0 0 0 * 232.3.3 sfdc.common.messagin...,"[20210501000000.124, 1411, Qr:021, Stopping, d..."
4,mqdbg 20210501000000.127 1411 0 0 0 fr:021 232...,mqdbg * * 0 0 0 * 232.3.3 sfdc.common.messagin...,"[20210501000000.127, 1411, fr:021, agent, for,..."


## Vectorization

### Vectorization using Word2Vec

In [7]:
from logai.information_extraction.log_vectorizer import VectorizerConfig, LogVectorizer
parsed_loglines = parsed_result['parsed_logline']



#params = Word2VecParams()
vectorizer_config = VectorizerConfig(algo_name="word2vec")
vectorizor = LogVectorizer(vectorizer_config)
vectorizor.fit(parsed_loglines)

#Log vector is a pandas.Series
log_vectors_w2v = vectorizor.transform(parsed_loglines)


In [8]:
log_vectors_w2v


0        [0.054382484, 0.0063305697, 0.1157879, -0.0178...
1        [-0.08794713, -0.29176697, -0.028551897, -0.01...
2        [-0.08794713, -0.29176697, -0.028551897, -0.01...
3        [-0.15993507, -0.103575386, 0.22658761, -0.017...
4        [-0.15993507, -0.103575386, 0.22658761, -0.017...
                               ...                        
86930    [-0.04845802, -0.30693808, 0.14568503, -0.0178...
86931    [-0.08794713, -0.29176697, -0.028551897, -0.01...
86932    [0.054382484, 0.0063305697, 0.1157879, -0.0178...
86933    [-0.08794713, -0.29176697, -0.028551897, -0.01...
86934    [-0.15993507, -0.103575386, 0.22658761, -0.017...
Length: 86935, dtype: object

## Vectorization

### Categorical Encoding for log attributes

In [9]:
from logai.information_extraction.categorical_encoder import CategoricalEncoderConfig, CategoricalEncoder

encoder_config = CategoricalEncoderConfig(name="label_encoder")

encoder = CategoricalEncoder(encoder_config)

attributes = encoder.fit_transform(logrecord.attributes)

attributes.head(5)


Unnamed: 0,cluster_label_categorical,logRecordType_categorical
0,43,3
1,205,7
2,327,7
3,16,6
4,15,6


## Feature extraction

### Implement log vector to feature

This will convert the vector metrics to a n dimensional feature. Implement simple 0 padding method.


In [10]:
from logai.information_extraction.feature_extractor import FeatureExtractor, FeatureExtractorConfig

import pandas as pd


timestamps = pd.to_datetime(logrecord.timestamp['timestamp'])


### convert to feature vector

In [12]:
config = FeatureExtractorConfig(
    max_feature_len=100,
    group_by_category=['logRecordType']
)

feature_extractor = FeatureExtractor(config)

feature_vector = feature_extractor.convert_to_feature_vector(log_vectors_w2v, logrecord.attributes, timestamps)

feature_vector.head(5)

Unnamed: 0,logRecordType,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_90,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99
0,ailtn,0.019059,0.248064,-0.027109,-0.017529,0.008031,0.169828,-0.018005,0.007744,0.169683,...,0.006279,0.005041505,0.000611,0.00183,0.003287,0.002173,0.00376,0.003077,0.001193,-0.001134
1,augen,-0.190104,0.061961,0.202962,-0.017823,0.00786,0.169471,-0.017835,0.008071,0.169711,...,-0.082377,0.3072485,0.232524,-0.065904,0.232332,0.151251,-0.004455,0.131363,0.179685,0.121703
2,cptsk,0.094219,0.180048,0.235089,-0.017557,0.00819,0.169343,-0.017779,0.008002,0.170041,...,-2e-06,7.881005e-07,1.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ffgen,0.054346,0.006331,0.115815,-0.017942,0.007932,0.169966,-0.017872,0.007908,0.170102,...,0.003419,0.002875315,0.000808,0.00093,-0.000963,0.000364,0.0033,0.003912,0.001366,0.001002
4,ksgen,0.092584,0.229095,0.203279,-0.098165,0.002991,0.12867,-0.017973,0.007998,0.169543,...,0.031865,0.06168883,-0.002998,0.025346,0.020107,-0.010445,0.028911,0.059404,-0.007812,0.028381


### convert to counter vector

Convert logline to vector

In [14]:
config = FeatureExtractorConfig(
    group_by_time="15min",
    group_by_category=['parsed_logline', 'logRecordType'],
    max_feature_len=100
)


feature_extractor = FeatureExtractor(config)

counter_vector_str = feature_extractor.convert_to_counter_vector(
    log_pattern=parsed_loglines,
    attributes=logrecord.attributes,
    timestamps=timestamps
)

counter_vector_str.head(5)

Unnamed: 0,parsed_logline,logRecordType,timestamp,Counts
0,* * * 0 0 0 * 232.3.3 INFO * * * for *,ffgen,2021-05-01 00:00:00+00:00,5
1,* * * 0 0 0 * 232.3.3 INFO * * * for *,ksgen,2021-05-01 00:00:00+00:00,15
2,ailtn * * * 0 0 * * * * 232.3.3 * * * * * * 23...,ailtn,2021-05-01 00:00:00+00:00,3995
3,ailtn * * * 0 0 * * * * 232.3.3 * * * * * * 23...,ailtn,2021-05-01 00:00:00+00:00,163
4,ailtn * * * 0 0 * * * * 232.3.3 * * * * 000000...,ailtn,2021-05-01 00:00:00+00:00,7


In [15]:
counter_vector_str = feature_extractor.convert_to_counter_vector(parsed_loglines, logrecord.attributes, timestamps)

counter_vector_str.head(5)

Unnamed: 0,parsed_logline,logRecordType,timestamp,Counts
0,* * * 0 0 0 * 232.3.3 INFO * * * for *,ffgen,2021-05-01 00:00:00+00:00,5
1,* * * 0 0 0 * 232.3.3 INFO * * * for *,ksgen,2021-05-01 00:00:00+00:00,15
2,ailtn * * * 0 0 * * * * 232.3.3 * * * * * * 23...,ailtn,2021-05-01 00:00:00+00:00,3995
3,ailtn * * * 0 0 * * * * 232.3.3 * * * * * * 23...,ailtn,2021-05-01 00:00:00+00:00,163
4,ailtn * * * 0 0 * * * * 232.3.3 * * * * 000000...,ailtn,2021-05-01 00:00:00+00:00,7


### Counter vector on log vector 

Convert np.array vector to counter vector

In [16]:
import numpy as np

log_vector_pattern = log_vectors_w2v.apply(lambda x: np.array2string(x, formatter={'float_kind':lambda x: "%.2f" % x})).rename('log_pattern')

config = FeatureExtractorConfig(
    group_by_time="15min",
    group_by_category=['log_pattern', 'logRecordType'],
    max_feature_len=100
)


feature_extractor = FeatureExtractor(config)


counter_vector_numeric = feature_extractor.convert_to_counter_vector(log_vector_pattern, logrecord.attributes, timestamps)

In [17]:
counter_vector_numeric

Unnamed: 0,log_pattern,logRecordType,timestamp,Counts
0,[-0.02 0.01 0.17 -0.02 0.01 0.17 -0.02 0.01 0....,ffgen,2021-05-01 00:00:00+00:00,5
1,[-0.02 0.01 0.17 -0.02 0.01 0.17 -0.02 0.01 0....,ksgen,2021-05-01 00:00:00+00:00,15
2,[-0.05 -0.31 0.15 -0.02 0.01 0.17 -0.02 0.01 0...,mlmul,2021-05-01 00:00:00+00:00,3
3,[-0.05 -0.31 0.15 -0.02 0.01 0.17 -0.02 0.01 0...,mlmul,2021-05-01 00:00:00+00:00,9997
4,[-0.09 -0.29 -0.03 -0.00 -0.21 0.19 -0.02 0.01...,phqry,2021-05-01 00:00:00+00:00,2
...,...,...,...,...
658,[0.12 0.25 0.27 0.27 -0.05 0.27 -0.02 0.01 0.1...,s,2021-05-01 00:00:00+00:00,1
659,[0.12 0.25 0.27 0.27 -0.05 0.27 -0.02 0.01 0.1...,s,2021-05-01 00:00:00+00:00,209
660,[0.12 0.25 0.27 0.28 0.08 -0.27 0.26 -0.32 -0....,s,2021-05-01 00:00:00+00:00,1
661,[0.12 0.25 0.27 0.30 -0.17 0.03 -0.02 0.01 0.1...,s,2021-05-01 00:00:00+00:00,7


## Clustering

### prepare feature

In [18]:

feature_for_clustering = feature_vector.loc[:, ~feature_vector.columns.isin(['timestamp', 'cluster_label', 'logRecordType'])]

In [19]:
feature_vector.head(10)

Unnamed: 0,logRecordType,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_90,feature_91,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99
0,ailtn,0.019059,0.248064,-0.027109,-0.017529,0.008031,0.169828,-0.018005,0.007744,0.169683,...,0.006279,0.005041505,0.000611,0.00183,0.003287,0.002173,0.00376,0.003077,0.001193,-0.001134
1,augen,-0.190104,0.061961,0.202962,-0.017823,0.00786,0.169471,-0.017835,0.008071,0.169711,...,-0.082377,0.3072485,0.232524,-0.065904,0.232332,0.151251,-0.004455,0.131363,0.179685,0.121703
2,cptsk,0.094219,0.180048,0.235089,-0.017557,0.00819,0.169343,-0.017779,0.008002,0.170041,...,-2e-06,7.881005e-07,1.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ffgen,0.054346,0.006331,0.115815,-0.017942,0.007932,0.169966,-0.017872,0.007908,0.170102,...,0.003419,0.002875315,0.000808,0.00093,-0.000963,0.000364,0.0033,0.003912,0.001366,0.001002
4,ksgen,0.092584,0.229095,0.203279,-0.098165,0.002991,0.12867,-0.017973,0.007998,0.169543,...,0.031865,0.06168883,-0.002998,0.025346,0.020107,-0.010445,0.028911,0.059404,-0.007812,0.028381
5,mlmul,-0.048458,-0.306938,0.145685,-0.017874,0.007881,0.170112,-0.017874,0.007881,0.170112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,mqdbg,-0.159935,-0.103575,0.226588,-0.017767,0.007967,0.169629,-0.017669,0.007947,0.169702,...,-0.047487,0.3595531,-0.425073,-0.012746,0.085932,-0.127878,0.007068,0.071849,-0.022037,-0.036018
7,phqry,-0.087947,-0.291767,-0.028552,-0.017905,0.007893,0.17005,-0.017801,0.00791,0.169836,...,0.135102,0.07805949,-0.043997,0.149282,0.079436,-0.038141,0.151809,0.092054,-0.030102,0.151883
8,ppcmi,0.091437,0.231332,0.202181,-0.017874,0.007881,0.170112,-0.017874,0.007881,0.170112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,s,0.120894,0.250615,0.27081,-0.021007,0.018393,0.134851,-0.017906,0.007863,0.169834,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Clustering using DBSCAN

In [20]:
from logai.analysis.clustering import ClusteringConfig, Clustering
from logai.algorithms.clustering_algo.dbscan import DbScanParams

algo_params = DbScanParams()
clustering_config = ClusteringConfig('DBSCAN', algo_params, None)

clustering = Clustering(clustering_config)
clustering.fit(feature_for_clustering)
dbscan_res = clustering.predict(feature_for_clustering)

In [21]:
dbscan_res.value_counts()

-1    10
dtype: int64

### Clustering using K-Means

In [22]:
from logai.algorithms.clustering_algo.kmeans import KMeansParams

algo_params = KMeansParams()
clustering_config = ClusteringConfig('KMeans', algo_params, None)

clustering = Clustering(clustering_config)
clustering.fit(feature_for_clustering)
kmeans_res = clustering.predict(feature_for_clustering)

In [23]:
kmeans_res.value_counts()

0    3
1    1
3    1
2    1
4    1
5    1
6    1
7    1
dtype: int64

## Anomaly Detection

### Prepare training and testing set


In [24]:
from sklearn.model_selection import train_test_split

feature_for_anomaly_detection = feature_vector.loc[:, ~feature_vector.columns.isin(['timestamp', 'cluster_label', 'logRecordType'])]
train, test = train_test_split(feature_for_anomaly_detection, train_size=0.7, test_size=0.3)

### Isolation forest for anomaly detection

In [25]:
from logai.algorithms.anomaly_detection_algo.isolation_forest import IsolationForestParams
from logai.analysis.anomaly_detector import AnomalyDetectionConfig, AnomalyDetector

algo_params = IsolationForestParams()
config = AnomalyDetectionConfig(algo_name='isolation_forest', algo_params=algo_params)

anomaly_detector = AnomalyDetector(config)
anomaly_detector.fit(train)
res = anomaly_detector.predict(test)

### One class SVM for anomaly detection

In [26]:
from logai.algorithms.anomaly_detection_algo.one_class_svm import OneClassSVMParams
from logai.analysis.anomaly_detector import AnomalyDetectionConfig, AnomalyDetector

algo_params = OneClassSVMParams()

config = AnomalyDetectionConfig(algo_name='one_class_svm')

anomaly_detector = AnomalyDetector(config)
anomaly_detector.fit(train)
res = anomaly_detector.predict(test)

[LibSVM].*.*
optimization finished, #iter = 16
obj = 2.917964, rho = 2.494658
nSV = 6, nBSV = 2


#### Anomalous loglines

In [27]:
pd.concat((parsed_result.iloc[res[res==1].index], res[res==1].rename('anomaly_label')), axis=1)


Unnamed: 0,logline,parsed_logline,parameter_list,anomaly_label
0,ffgen 20210501000000.005 112100 0 0 acbcfa25dd...,ffgen * * 0 0 * * 0 * 232.3.3 INFO RollingHttp...,"[20210501000000.005, 112100, acbcfa25ddbbae4b,...",1
6,ffgen 20210501000000.160 4bf7FOabZfbDZ4mt-SUBZ...,ffgen * * * 0 0 * * * 0 * 232.3.3 INFO end att...,"[20210501000000.160, 4bf7FOabZfbDZ4mt-SUBZ-, 1...",1
4,mqdbg 20210501000000.127 1411 0 0 0 fr:021 232...,mqdbg * * 0 0 0 * 232.3.3 sfdc.common.messagin...,"[20210501000000.127, 1411, fr:021, agent, for,...",1
