In [19]:
from os import listdir
from os.path import isfile, join, exists, dirname
import sys
import os

import pandas as pd

from logai.dataloader.data_loader import FileDataLoader
from logai.dataloader.data_loader import FileConfig
from logai.information_extraction.log_parser import LogParser, LogParserConfig
from logai.algorithms.parsing_algo.drain import DrainConfig
from logai.preprocess.preprocess import Preprocess

In [3]:
# Data layer
# Load the log data through FileDataLoader (currently the only loader implemented);
# the loaded records are returned as a LogRecordObject.

# Please change the filepath to match your environment.
# The ./data dir is listed in .gitignore to avoid checking in data unexpectedly.

# File configuration
filepath = "/Users/qcheng/workspace/gitsoma/logai/logai/data/sf_datasets/mixed.csv"
log_type = 'csv'
# Source columns mapped to log record fields: 'attributes' columns are used later
# for grouping and labelling, 'body' is the column holding the raw logline.
dimensions = {'attributes': ['cluster_label', 'logRecordType'],
              'body': ['_raw']}
# Matches runs of backticks or runs of whitespace; also reused by the preprocessing step.
custom_delimeter_regex = r"`+|\s+"

file_config = FileConfig(
    filepath=filepath,
    log_type=log_type,  # single source of truth: reuse the variable instead of repeating 'csv'
    dimensions=dimensions,
    custom_delimeter_regex=custom_delimeter_regex,
    header=0
)

dataloader = FileDataLoader(file_config)
logrecord = dataloader.load_data()


In [4]:
# Preprocess
# Step 1: clean the raw loglines with the custom delimiter regex from the file config.
# Step 2: bucket the log records by attribute so that each group can be processed
#         separately downstream; the result holds the row indices of every group.

delimiter_pattern = file_config.custom_delimeter_regex
preprocessed_loglines = Preprocess.clean_log(
    logrecord.body,
    delimiter_pattern,
)

# Attributes that define a group of loglines.
grouping_attributes = ['logRecordType']
index_groups = Preprocess.group_log_index(
    logrecord.attributes,
    by=grouping_attributes,
)

In [35]:
# Information Extraction
# For every log group: parse its loglines with Drain, record summary statistics
# about the discovered patterns, and write the parsed results plus the generated
# parse tree to disk under root_path.
root_path = '/Users/qcheng/workspace/gitsoma/logai/logai/results/drain_cluster/'

drain_config = DrainConfig(sim_th=0.9,
                           extra_delimiters=[])

log_parser_config = LogParserConfig(parsing_algorithm='drain',
                                    parsing_algo_params=drain_config)

# Maps logRecordType -> [number of unique cluster labels, number of unique patterns,
#                        longest pattern length (space-separated tokens),
#                        longest logline length (characters)]
num_patterns = dict()

tree_path = os.path.join(root_path, "tree/")

res_path = os.path.join(root_path, "parsed_results/")
print(tree_path)
print(res_path)

# exist_ok=True makes directory creation idempotent, so no separate existence
# check (and no race between the check and the creation) is needed.
os.makedirs(dirname(tree_path), exist_ok=True)
os.makedirs(dirname(res_path), exist_ok=True)

# Upper bound on the number of groups processed in one run.
max_groups = 500
# Name of the column that carries the logline text.
body_column = dimensions['body'][0]

for i in index_groups.index[:max_groups]:
    # `i` is an index *label*, so every lookup into index_groups is label-based (.loc).
    # Mixing in positional .iloc would pick a different row whenever the index is
    # not the default 0..n-1 range.
    indices = index_groups['group_index'].loc[i]
    record_type = index_groups['logRecordType'].loc[i]

    loglines_in_group = preprocessed_loglines.iloc[indices]
    cluster_labels = logrecord.attributes['cluster_label'].iloc[indices]

    group_body = loglines_in_group.dropna()[body_column]
    if group_body.empty:
        # Nothing to parse for this group; the max() calls below would raise
        # ValueError on an empty sequence.
        continue

    # A fresh parser per group keeps the parse trees of different groups independent.
    parser = LogParser(log_parser_config)
    parsed_result = parser.parse(group_body)

    longest_log_length = max(len(line) for line in parsed_result['logline'])
    uniq_patterns = parsed_result['parsed_logline'].unique()
    num_uniq_cluster = len(cluster_labels.unique())
    num_p = len(uniq_patterns)
    longest_p_length = max(len(p.split(" ")) for p in uniq_patterns)
    num_patterns[record_type] = [num_uniq_cluster, num_p, longest_p_length, longest_log_length]

    # write results to file (parsed output side by side with the cluster labels)
    pd.concat((parsed_result, cluster_labels), axis=1).to_csv(
        os.path.join(res_path, 'drain_cluster_{}.csv'.format(record_type)))

    # write generated tree to file
    write_path = join(tree_path, '{}.txt'.format(record_type))

    if parser.parser.clusters_counter > 1:
        os.makedirs(dirname(write_path), exist_ok=True)
        # The with-statement closes the file; no explicit close() is required.
        with open(write_path, 'w') as f:
            parser.parser.print_tree(max_clusters=100, file=f)



/Users/qcheng/workspace/gitsoma/logai/logai/results/drain_cluster/tree/
/Users/qcheng/workspace/gitsoma/logai/logai/results/drain_cluster/parsed_results/


In [36]:
# Turn the per-record-type statistics into a table (one row per logRecordType)
# and store it alongside the other clustering results.
summary_columns = [
    "num_clusters",
    "num_patterns",
    "longest_pattern_length",
    "longest_logline_length",
]
res_df = pd.DataFrame.from_dict(
    num_patterns,
    orient='index',
    columns=summary_columns,
)
summary_csv_path = os.path.join(root_path, "clustering_summary.csv")
res_df.to_csv(summary_csv_path)

In [37]:
# Sanity check: display the (rows, columns) shape of the cleaned loglines.
preprocessed_loglines.shape

(86935, 1)